{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.875, "eval_steps": 2000, "global_step": 28000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.25e-05, "grad_norm": 660.0, "learning_rate": 2.1600000000000003e-05, "loss": 89.3086, "loss/crossentropy": 9.156324863433838, "loss/hidden": 10.34375, "loss/jsd": 0.0, "loss/logits": 6.980849266052246, "step": 2 }, { "epoch": 0.000125, "grad_norm": 724.0, "learning_rate": 2.32e-05, "loss": 92.6541, "loss/crossentropy": 9.248075485229492, "loss/hidden": 10.25, "loss/jsd": 0.0, "loss/logits": 7.315607070922852, "step": 4 }, { "epoch": 0.0001875, "grad_norm": 652.0, "learning_rate": 2.48e-05, "loss": 90.2102, "loss/crossentropy": 9.289902210235596, "loss/hidden": 10.125, "loss/jsd": 0.0, "loss/logits": 7.079533338546753, "step": 6 }, { "epoch": 0.00025, "grad_norm": 398.0, "learning_rate": 2.64e-05, "loss": 86.3137, "loss/crossentropy": 8.973662853240967, "loss/hidden": 10.15625, "loss/jsd": 0.0, "loss/logits": 6.718380928039551, "step": 8 }, { "epoch": 0.0003125, "grad_norm": 246.0, "learning_rate": 2.8000000000000003e-05, "loss": 74.1221, "loss/crossentropy": 7.941339015960693, "loss/hidden": 9.875, "loss/jsd": 0.0, "loss/logits": 5.630578994750977, "step": 10 }, { "epoch": 0.000375, "grad_norm": 149.0, "learning_rate": 2.9600000000000005e-05, "loss": 67.799, "loss/crossentropy": 7.281719923019409, "loss/hidden": 9.84375, "loss/jsd": 0.0, "loss/logits": 5.067355394363403, "step": 12 }, { "epoch": 0.0004375, "grad_norm": 96.0, "learning_rate": 3.1200000000000006e-05, "loss": 59.7456, "loss/crossentropy": 6.664980173110962, "loss/hidden": 9.25, "loss/jsd": 0.0, "loss/logits": 4.383058547973633, "step": 14 }, { "epoch": 0.0005, "grad_norm": 82.0, "grad_norm_var": 65283.240625, "learning_rate": 3.2800000000000004e-05, "loss": 54.7434, "loss/crossentropy": 6.230701446533203, "loss/hidden": 8.96875, "loss/jsd": 0.0, "loss/logits": 3.954396367073059, "step": 16 }, { "epoch": 0.0005625, "grad_norm": 82.5, "grad_norm_var": 59929.1625, "learning_rate": 3.4399999999999996e-05, "loss": 49.011, "loss/crossentropy": 5.828659772872925, "loss/hidden": 8.75, "loss/jsd": 0.0, "loss/logits": 3.443238377571106, "step": 18 }, { "epoch": 0.000625, "grad_norm": 58.0, "grad_norm_var": 41848.00729166667, "learning_rate": 3.600000000000001e-05, "loss": 44.1889, "loss/crossentropy": 5.225403785705566, "loss/hidden": 8.21875, "loss/jsd": 0.0, "loss/logits": 3.074475884437561, "step": 20 }, { "epoch": 0.0006875, "grad_norm": 61.5, "grad_norm_var": 21039.383333333335, "learning_rate": 3.76e-05, "loss": 40.1213, "loss/crossentropy": 5.027252197265625, "loss/hidden": 7.984375, "loss/jsd": 0.0, "loss/logits": 2.710965871810913, "step": 22 }, { "epoch": 0.00075, "grad_norm": 58.5, "grad_norm_var": 9249.7625, "learning_rate": 3.9200000000000004e-05, "loss": 36.1313, "loss/crossentropy": 4.690935373306274, "loss/hidden": 7.578125, "loss/jsd": 0.0, "loss/logits": 2.386227607727051, "step": 24 }, { "epoch": 0.0008125, "grad_norm": 48.5, "grad_norm_var": 4270.605989583333, "learning_rate": 4.08e-05, "loss": 31.8483, "loss/crossentropy": 4.4289772510528564, "loss/hidden": 7.140625, "loss/jsd": 0.0, "loss/logits": 2.0278735160827637, "step": 26 }, { "epoch": 0.000875, "grad_norm": 61.5, "grad_norm_var": 340.2708333333333, "learning_rate": 4.240000000000001e-05, "loss": 32.7749, "loss/crossentropy": 4.644165277481079, "loss/hidden": 6.71875, "loss/jsd": 0.0, "loss/logits": 2.1411956548690796, "step": 28 }, { "epoch": 0.0009375, "grad_norm": 193.0, "grad_norm_var": 1174.7375, "learning_rate": 4.4000000000000006e-05, "loss": 28.865, "loss/crossentropy": 4.189300537109375, "loss/hidden": 6.546875, "loss/jsd": 0.0, "loss/logits": 1.81288480758667, "step": 30 }, { "epoch": 0.001, "grad_norm": 32.25, "grad_norm_var": 1289.6643229166666, "learning_rate": 4.5600000000000004e-05, "loss": 25.722, "loss/crossentropy": 3.7905068397521973, "loss/hidden": 6.078125, "loss/jsd": 0.0, "loss/logits": 1.5853379368782043, "step": 32 }, { "epoch": 0.0010625, "grad_norm": 48.75, "grad_norm_var": 1344.2759765625, "learning_rate": 4.72e-05, "loss": 24.645, "loss/crossentropy": 3.972483277320862, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 1.4781858325004578, "step": 34 }, { "epoch": 0.001125, "grad_norm": 28.5, "grad_norm_var": 1459.4322265625, "learning_rate": 4.88e-05, "loss": 21.5933, "loss/crossentropy": 3.3368273973464966, "loss/hidden": 5.59375, "loss/jsd": 0.0, "loss/logits": 1.2662731409072876, "step": 36 }, { "epoch": 0.0011875, "grad_norm": 23.75, "grad_norm_var": 1564.7108723958333, "learning_rate": 5.0400000000000005e-05, "loss": 21.9488, "loss/crossentropy": 3.6340177059173584, "loss/hidden": 5.484375, "loss/jsd": 0.0, "loss/logits": 1.2830361127853394, "step": 38 }, { "epoch": 0.00125, "grad_norm": 28.375, "grad_norm_var": 1630.9760416666666, "learning_rate": 5.2000000000000004e-05, "loss": 21.1924, "loss/crossentropy": 3.7127119302749634, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 1.2229673862457275, "step": 40 }, { "epoch": 0.0013125, "grad_norm": 27.875, "grad_norm_var": 1695.2822916666667, "learning_rate": 5.360000000000001e-05, "loss": 19.8704, "loss/crossentropy": 3.402773141860962, "loss/hidden": 5.203125, "loss/jsd": 0.0, "loss/logits": 1.1264490485191345, "step": 42 }, { "epoch": 0.001375, "grad_norm": 35.5, "grad_norm_var": 1718.1832682291667, "learning_rate": 5.520000000000001e-05, "loss": 18.1969, "loss/crossentropy": 3.2803523540496826, "loss/hidden": 4.9375, "loss/jsd": 0.0, "loss/logits": 0.9979034960269928, "step": 44 }, { "epoch": 0.0014375, "grad_norm": 23.5, "grad_norm_var": 72.73639322916667, "learning_rate": 5.680000000000001e-05, "loss": 20.0092, "loss/crossentropy": 3.6576796770095825, "loss/hidden": 4.984375, "loss/jsd": 0.0, "loss/logits": 1.136712908744812, "step": 46 }, { "epoch": 0.0015, "grad_norm": 18.125, "grad_norm_var": 58.280208333333334, "learning_rate": 5.840000000000001e-05, "loss": 17.1218, "loss/crossentropy": 3.304303526878357, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.9130024909973145, "step": 48 }, { "epoch": 0.0015625, "grad_norm": 26.75, "grad_norm_var": 63.002018229166666, "learning_rate": 6.0000000000000015e-05, "loss": 17.7318, "loss/crossentropy": 3.2923504114151, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.9658186733722687, "step": 50 }, { "epoch": 0.001625, "grad_norm": 35.0, "grad_norm_var": 67.05670572916667, "learning_rate": 6.16e-05, "loss": 16.8527, "loss/crossentropy": 3.1591343879699707, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.9068574905395508, "step": 52 }, { "epoch": 0.0016875, "grad_norm": 16.25, "grad_norm_var": 78.15983072916667, "learning_rate": 6.320000000000002e-05, "loss": 16.2787, "loss/crossentropy": 3.1270612478256226, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.8698541224002838, "step": 54 }, { "epoch": 0.00175, "grad_norm": 17.75, "grad_norm_var": 86.74993489583333, "learning_rate": 6.480000000000002e-05, "loss": 16.5175, "loss/crossentropy": 3.2500909566879272, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.8798635005950928, "step": 56 }, { "epoch": 0.0018125, "grad_norm": 23.0, "grad_norm_var": 93.07473958333334, "learning_rate": 6.64e-05, "loss": 16.7407, "loss/crossentropy": 3.3384851217269897, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.9042791426181793, "step": 58 }, { "epoch": 0.001875, "grad_norm": 19.0, "grad_norm_var": 88.33587239583333, "learning_rate": 6.8e-05, "loss": 15.3873, "loss/crossentropy": 2.9939886331558228, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.8158909380435944, "step": 60 }, { "epoch": 0.0019375, "grad_norm": 17.0, "grad_norm_var": 80.16223958333333, "learning_rate": 6.96e-05, "loss": 16.1478, "loss/crossentropy": 3.2266587018966675, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.8624304533004761, "step": 62 }, { "epoch": 0.002, "grad_norm": 16.5, "grad_norm_var": 79.83515625, "learning_rate": 7.12e-05, "loss": 13.9403, "loss/crossentropy": 2.7177222967147827, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.7081980109214783, "step": 64 }, { "epoch": 0.0020625, "grad_norm": 17.875, "grad_norm_var": 22.685791015625, "learning_rate": 7.280000000000001e-05, "loss": 14.943, "loss/crossentropy": 3.1492207050323486, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.7668733894824982, "step": 66 }, { "epoch": 0.002125, "grad_norm": 13.875, "grad_norm_var": 5.928369140625, "learning_rate": 7.44e-05, "loss": 14.6872, "loss/crossentropy": 3.163086175918579, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.758662760257721, "step": 68 }, { "epoch": 0.0021875, "grad_norm": 20.0, "grad_norm_var": 7.180712890625, "learning_rate": 7.6e-05, "loss": 15.1751, "loss/crossentropy": 3.189119815826416, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.7970321774482727, "step": 70 }, { "epoch": 0.00225, "grad_norm": 18.25, "grad_norm_var": 7.503059895833333, "learning_rate": 7.76e-05, "loss": 14.9421, "loss/crossentropy": 3.257962226867676, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.777004063129425, "step": 72 }, { "epoch": 0.0023125, "grad_norm": 18.5, "grad_norm_var": 6.022395833333333, "learning_rate": 7.920000000000001e-05, "loss": 15.0205, "loss/crossentropy": 3.289088249206543, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.7801713049411774, "step": 74 }, { "epoch": 0.002375, "grad_norm": 17.25, "grad_norm_var": 5.628059895833333, "learning_rate": 8.080000000000001e-05, "loss": 14.6665, "loss/crossentropy": 3.132324695587158, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.7659187614917755, "step": 76 }, { "epoch": 0.0024375, "grad_norm": 14.125, "grad_norm_var": 6.317643229166666, "learning_rate": 8.240000000000001e-05, "loss": 14.5497, "loss/crossentropy": 3.1414411067962646, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.7525473237037659, "step": 78 }, { "epoch": 0.0025, "grad_norm": 22.5, "grad_norm_var": 8.394205729166666, "learning_rate": 8.400000000000001e-05, "loss": 14.471, "loss/crossentropy": 3.19692599773407, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.7477201521396637, "step": 80 }, { "epoch": 0.0025625, "grad_norm": 16.5, "grad_norm_var": 8.338802083333333, "learning_rate": 8.560000000000001e-05, "loss": 13.6071, "loss/crossentropy": 2.9412546157836914, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.6884627342224121, "step": 82 }, { "epoch": 0.002625, "grad_norm": 21.875, "grad_norm_var": 8.0587890625, "learning_rate": 8.720000000000002e-05, "loss": 13.4559, "loss/crossentropy": 3.074158549308777, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.6702075600624084, "step": 84 }, { "epoch": 0.0026875, "grad_norm": 11.875, "grad_norm_var": 8.190104166666666, "learning_rate": 8.880000000000002e-05, "loss": 13.1482, "loss/crossentropy": 2.9633986949920654, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.6450382769107819, "step": 86 }, { "epoch": 0.00275, "grad_norm": 13.3125, "grad_norm_var": 8.740885416666666, "learning_rate": 9.040000000000002e-05, "loss": 13.2811, "loss/crossentropy": 3.0899221897125244, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.6597437262535095, "step": 88 }, { "epoch": 0.0028125, "grad_norm": 14.5, "grad_norm_var": 9.731770833333334, "learning_rate": 9.200000000000001e-05, "loss": 12.5622, "loss/crossentropy": 2.6711933612823486, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.6250402927398682, "step": 90 }, { "epoch": 0.002875, "grad_norm": 22.125, "grad_norm_var": 11.3453125, "learning_rate": 9.360000000000003e-05, "loss": 12.9117, "loss/crossentropy": 2.9529306888580322, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.6279078125953674, "step": 92 }, { "epoch": 0.0029375, "grad_norm": 15.75, "grad_norm_var": 11.020247395833334, "learning_rate": 9.52e-05, "loss": 13.1727, "loss/crossentropy": 3.0362765789031982, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.655828982591629, "step": 94 }, { "epoch": 0.003, "grad_norm": 14.8125, "grad_norm_var": 9.081103515625, "learning_rate": 9.680000000000001e-05, "loss": 13.2317, "loss/crossentropy": 3.050165295600891, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.6525256931781769, "step": 96 }, { "epoch": 0.0030625, "grad_norm": 16.125, "grad_norm_var": 10.2056640625, "learning_rate": 9.84e-05, "loss": 12.9596, "loss/crossentropy": 3.007634401321411, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.6436329782009125, "step": 98 }, { "epoch": 0.003125, "grad_norm": 12.6875, "grad_norm_var": 8.851546223958334, "learning_rate": 0.0001, "loss": 12.526, "loss/crossentropy": 3.0878303050994873, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.5985024273395538, "step": 100 }, { "epoch": 0.0031875, "grad_norm": 14.875, "grad_norm_var": 8.074934895833334, "learning_rate": 0.0001, "loss": 12.5772, "loss/crossentropy": 2.8073278665542603, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.6262076199054718, "step": 102 }, { "epoch": 0.00325, "grad_norm": 12.8125, "grad_norm_var": 8.747639973958334, "learning_rate": 0.0001, "loss": 12.8581, "loss/crossentropy": 2.9922837018966675, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.6381443738937378, "step": 104 }, { "epoch": 0.0033125, "grad_norm": 13.25, "grad_norm_var": 8.709358723958333, "learning_rate": 0.0001, "loss": 12.9596, "loss/crossentropy": 2.9500906467437744, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.6517307162284851, "step": 106 }, { "epoch": 0.003375, "grad_norm": 12.25, "grad_norm_var": 4.660660807291666, "learning_rate": 0.0001, "loss": 11.768, "loss/crossentropy": 2.8668397665023804, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.5580830276012421, "step": 108 }, { "epoch": 0.0034375, "grad_norm": 16.625, "grad_norm_var": 4.357666015625, "learning_rate": 0.0001, "loss": 12.1518, "loss/crossentropy": 2.9460701942443848, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.5986941456794739, "step": 110 }, { "epoch": 0.0035, "grad_norm": 13.5625, "grad_norm_var": 2.1166015625, "learning_rate": 0.0001, "loss": 11.8625, "loss/crossentropy": 2.9012417793273926, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.5570653975009918, "step": 112 }, { "epoch": 0.0035625, "grad_norm": 11.875, "grad_norm_var": 1.6338541666666666, "learning_rate": 0.0001, "loss": 11.8177, "loss/crossentropy": 2.9161791801452637, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.5620248317718506, "step": 114 }, { "epoch": 0.003625, "grad_norm": 15.5, "grad_norm_var": 2.4508951822916667, "learning_rate": 0.0001, "loss": 12.2833, "loss/crossentropy": 3.0185381174087524, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.596792608499527, "step": 116 }, { "epoch": 0.0036875, "grad_norm": 13.0625, "grad_norm_var": 2.618603515625, "learning_rate": 0.0001, "loss": 12.3115, "loss/crossentropy": 3.1687170267105103, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.5845893919467926, "step": 118 }, { "epoch": 0.00375, "grad_norm": 13.8125, "grad_norm_var": 2.6577962239583335, "learning_rate": 0.0001, "loss": 12.3601, "loss/crossentropy": 3.0240886211395264, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.6023522615432739, "step": 120 }, { "epoch": 0.0038125, "grad_norm": 10.9375, "grad_norm_var": 3.499462890625, "learning_rate": 0.0001, "loss": 10.8604, "loss/crossentropy": 2.6125407218933105, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.5075993537902832, "step": 122 }, { "epoch": 0.003875, "grad_norm": 12.0, "grad_norm_var": 3.5036295572916667, "learning_rate": 0.0001, "loss": 11.8795, "loss/crossentropy": 2.8815178871154785, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.581826388835907, "step": 124 }, { "epoch": 0.0039375, "grad_norm": 13.375, "grad_norm_var": 2.6025390625, "learning_rate": 0.0001, "loss": 11.8437, "loss/crossentropy": 2.966344118118286, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.5650805234909058, "step": 126 }, { "epoch": 0.004, "grad_norm": 10.9375, "grad_norm_var": 2.746875, "learning_rate": 0.0001, "loss": 11.7435, "loss/crossentropy": 2.978629946708679, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.5546101331710815, "step": 128 }, { "epoch": 0.0040625, "grad_norm": 9.875, "grad_norm_var": 3.224723307291667, "learning_rate": 0.0001, "loss": 11.4013, "loss/crossentropy": 2.8617138862609863, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.5406723916530609, "step": 130 }, { "epoch": 0.004125, "grad_norm": 11.125, "grad_norm_var": 2.3733723958333335, "learning_rate": 0.0001, "loss": 11.7582, "loss/crossentropy": 2.960026741027832, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.5595053732395172, "step": 132 }, { "epoch": 0.0041875, "grad_norm": 11.125, "grad_norm_var": 1.3651041666666666, "learning_rate": 0.0001, "loss": 11.0347, "loss/crossentropy": 2.877363085746765, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.5071391761302948, "step": 134 }, { "epoch": 0.00425, "grad_norm": 12.8125, "grad_norm_var": 1.1153483072916666, "learning_rate": 0.0001, "loss": 11.4875, "loss/crossentropy": 2.904009461402893, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.5450641810894012, "step": 136 }, { "epoch": 0.0043125, "grad_norm": 10.6875, "grad_norm_var": 1.0735514322916666, "learning_rate": 0.0001, "loss": 10.8593, "loss/crossentropy": 2.7577372789382935, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.5007785558700562, "step": 138 }, { "epoch": 0.004375, "grad_norm": 11.875, "grad_norm_var": 1.42734375, "learning_rate": 0.0001, "loss": 12.4117, "loss/crossentropy": 3.1381568908691406, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.6015740633010864, "step": 140 }, { "epoch": 0.0044375, "grad_norm": 12.8125, "grad_norm_var": 1.5671875, "learning_rate": 0.0001, "loss": 11.4659, "loss/crossentropy": 3.0004279613494873, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.537956103682518, "step": 142 }, { "epoch": 0.0045, "grad_norm": 11.5, "grad_norm_var": 1.3536295572916666, "learning_rate": 0.0001, "loss": 10.9772, "loss/crossentropy": 2.6849220991134644, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.5253205299377441, "step": 144 }, { "epoch": 0.0045625, "grad_norm": 13.0, "grad_norm_var": 1.42890625, "learning_rate": 0.0001, "loss": 10.991, "loss/crossentropy": 2.929854989051819, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.501423716545105, "step": 146 }, { "epoch": 0.004625, "grad_norm": 8.75, "grad_norm_var": 2.023697916666667, "learning_rate": 0.0001, "loss": 10.8928, "loss/crossentropy": 2.955693483352661, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.4944879859685898, "step": 148 }, { "epoch": 0.0046875, "grad_norm": 14.625, "grad_norm_var": 2.6907389322916666, "learning_rate": 0.0001, "loss": 10.9484, "loss/crossentropy": 2.745115876197815, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.5187694430351257, "step": 150 }, { "epoch": 0.00475, "grad_norm": 9.4375, "grad_norm_var": 3.390625, "learning_rate": 0.0001, "loss": 11.0682, "loss/crossentropy": 2.793348789215088, "loss/hidden": 3.046875, "loss/jsd": 0.0, "loss/logits": 0.5228001177310944, "step": 152 }, { "epoch": 0.0048125, "grad_norm": 9.0625, "grad_norm_var": 3.870556640625, "learning_rate": 0.0001, "loss": 9.9671, "loss/crossentropy": 2.544915556907654, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.45315225422382355, "step": 154 }, { "epoch": 0.004875, "grad_norm": 9.75, "grad_norm_var": 3.581103515625, "learning_rate": 0.0001, "loss": 10.2548, "loss/crossentropy": 2.7735856771469116, "loss/hidden": 2.984375, "loss/jsd": 0.0, "loss/logits": 0.44968172907829285, "step": 156 }, { "epoch": 0.0049375, "grad_norm": 9.5, "grad_norm_var": 3.4734212239583333, "learning_rate": 0.0001, "loss": 10.6978, "loss/crossentropy": 2.61746346950531, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.5010055005550385, "step": 158 }, { "epoch": 0.005, "grad_norm": 12.375, "grad_norm_var": 3.4423014322916665, "learning_rate": 0.0001, "loss": 11.1114, "loss/crossentropy": 3.0405282974243164, "loss/hidden": 3.015625, "loss/jsd": 0.0, "loss/logits": 0.5055254101753235, "step": 160 }, { "epoch": 0.0050625, "grad_norm": 13.5, "grad_norm_var": 3.6130045572916667, "learning_rate": 0.0001, "loss": 11.3564, "loss/crossentropy": 2.9557673931121826, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.5361583232879639, "step": 162 }, { "epoch": 0.005125, "grad_norm": 9.6875, "grad_norm_var": 3.3590983072916667, "learning_rate": 0.0001, "loss": 10.8173, "loss/crossentropy": 2.9696191549301147, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.49414533376693726, "step": 164 }, { "epoch": 0.0051875, "grad_norm": 10.9375, "grad_norm_var": 2.4331868489583335, "learning_rate": 0.0001, "loss": 10.7629, "loss/crossentropy": 2.870342493057251, "loss/hidden": 2.9921875, "loss/jsd": 0.0, "loss/logits": 0.4900369644165039, "step": 166 }, { "epoch": 0.00525, "grad_norm": 11.9375, "grad_norm_var": 1.7266764322916666, "learning_rate": 0.0001, "loss": 11.3727, "loss/crossentropy": 2.9022059440612793, "loss/hidden": 3.03125, "loss/jsd": 0.0, "loss/logits": 0.5439256131649017, "step": 168 }, { "epoch": 0.0053125, "grad_norm": 13.6875, "grad_norm_var": 1.939697265625, "learning_rate": 0.0001, "loss": 11.2794, "loss/crossentropy": 2.9904470443725586, "loss/hidden": 2.9765625, "loss/jsd": 0.0, "loss/logits": 0.5312376618385315, "step": 170 }, { "epoch": 0.005375, "grad_norm": 11.875, "grad_norm_var": 1.6367024739583333, "learning_rate": 0.0001, "loss": 11.0391, "loss/crossentropy": 2.9553390741348267, "loss/hidden": 3.0390625, "loss/jsd": 0.0, "loss/logits": 0.5044730305671692, "step": 172 }, { "epoch": 0.0054375, "grad_norm": 8.5, "grad_norm_var": 2.192822265625, "learning_rate": 0.0001, "loss": 10.5749, "loss/crossentropy": 2.886873483657837, "loss/hidden": 2.921875, "loss/jsd": 0.0, "loss/logits": 0.4766187369823456, "step": 174 }, { "epoch": 0.0055, "grad_norm": 10.625, "grad_norm_var": 2.0989420572916666, "learning_rate": 0.0001, "loss": 10.7418, "loss/crossentropy": 2.8771119117736816, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.5020954459905624, "step": 176 }, { "epoch": 0.0055625, "grad_norm": 9.5, "grad_norm_var": 1.9636555989583333, "learning_rate": 0.0001, "loss": 10.3281, "loss/crossentropy": 2.769477367401123, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.4746148884296417, "step": 178 }, { "epoch": 0.005625, "grad_norm": 9.4375, "grad_norm_var": 2.148177083333333, "learning_rate": 0.0001, "loss": 10.6448, "loss/crossentropy": 2.889192581176758, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.4950959086418152, "step": 180 }, { "epoch": 0.0056875, "grad_norm": 9.3125, "grad_norm_var": 2.2296712239583334, "learning_rate": 0.0001, "loss": 10.4231, "loss/crossentropy": 2.9166088104248047, "loss/hidden": 2.8203125, "loss/jsd": 0.0, "loss/logits": 0.4686211049556732, "step": 182 }, { "epoch": 0.00575, "grad_norm": 10.25, "grad_norm_var": 1.7809895833333333, "learning_rate": 0.0001, "loss": 10.4394, "loss/crossentropy": 2.962415933609009, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.4633220136165619, "step": 184 }, { "epoch": 0.0058125, "grad_norm": 9.625, "grad_norm_var": 0.896728515625, "learning_rate": 0.0001, "loss": 10.7424, "loss/crossentropy": 2.9772469997406006, "loss/hidden": 2.90625, "loss/jsd": 0.0, "loss/logits": 0.48589444160461426, "step": 186 }, { "epoch": 0.005875, "grad_norm": 11.25, "grad_norm_var": 0.6596354166666667, "learning_rate": 0.0001, "loss": 11.4098, "loss/crossentropy": 3.0638362169265747, "loss/hidden": 2.96875, "loss/jsd": 0.0, "loss/logits": 0.5377195775508881, "step": 188 }, { "epoch": 0.0059375, "grad_norm": 11.125, "grad_norm_var": 0.6338541666666667, "learning_rate": 0.0001, "loss": 10.2039, "loss/crossentropy": 2.698214054107666, "loss/hidden": 2.8671875, "loss/jsd": 0.0, "loss/logits": 0.46385255455970764, "step": 190 }, { "epoch": 0.006, "grad_norm": 13.4375, "grad_norm_var": 1.5541015625, "learning_rate": 0.0001, "loss": 10.3437, "loss/crossentropy": 2.7819976806640625, "loss/hidden": 2.84375, "loss/jsd": 0.0, "loss/logits": 0.47179484367370605, "step": 192 }, { "epoch": 0.0060625, "grad_norm": 9.625, "grad_norm_var": 1.4389973958333333, "learning_rate": 0.0001, "loss": 10.434, "loss/crossentropy": 2.776822566986084, "loss/hidden": 2.8046875, "loss/jsd": 0.0, "loss/logits": 0.48525065183639526, "step": 194 }, { "epoch": 0.006125, "grad_norm": 9.9375, "grad_norm_var": 1.3817057291666666, "learning_rate": 0.0001, "loss": 10.5311, "loss/crossentropy": 2.9022562503814697, "loss/hidden": 2.890625, "loss/jsd": 0.0, "loss/logits": 0.473824679851532, "step": 196 }, { "epoch": 0.0061875, "grad_norm": 9.0625, "grad_norm_var": 1.4453125, "learning_rate": 0.0001, "loss": 10.6488, "loss/crossentropy": 3.086912989616394, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.4749341607093811, "step": 198 }, { "epoch": 0.00625, "grad_norm": 9.9375, "grad_norm_var": 1.6442057291666667, "learning_rate": 0.0001, "loss": 10.5391, "loss/crossentropy": 2.9904398918151855, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.4751739054918289, "step": 200 }, { "epoch": 0.0063125, "grad_norm": 9.3125, "grad_norm_var": 1.6425618489583333, "learning_rate": 0.0001, "loss": 10.7217, "loss/crossentropy": 3.009773015975952, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.48994509875774384, "step": 202 }, { "epoch": 0.006375, "grad_norm": 9.1875, "grad_norm_var": 1.433447265625, "learning_rate": 0.0001, "loss": 10.2394, "loss/crossentropy": 2.7742444276809692, "loss/hidden": 2.8515625, "loss/jsd": 0.0, "loss/logits": 0.46136191487312317, "step": 204 }, { "epoch": 0.0064375, "grad_norm": 8.5625, "grad_norm_var": 1.3983723958333334, "learning_rate": 0.0001, "loss": 10.1427, "loss/crossentropy": 2.848304867744446, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.45365576446056366, "step": 206 }, { "epoch": 0.0065, "grad_norm": 8.4375, "grad_norm_var": 0.33151041666666664, "learning_rate": 0.0001, "loss": 10.3211, "loss/crossentropy": 2.9591645002365112, "loss/hidden": 2.765625, "loss/jsd": 0.0, "loss/logits": 0.4596277326345444, "step": 208 }, { "epoch": 0.0065625, "grad_norm": 8.8125, "grad_norm_var": 0.5629557291666667, "learning_rate": 0.0001, "loss": 10.0751, "loss/crossentropy": 2.704858183860779, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.4635818302631378, "step": 210 }, { "epoch": 0.006625, "grad_norm": 9.5, "grad_norm_var": 0.5410807291666667, "learning_rate": 0.0001, "loss": 10.6019, "loss/crossentropy": 2.992745518684387, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.48356935381889343, "step": 212 }, { "epoch": 0.0066875, "grad_norm": 9.625, "grad_norm_var": 0.6176920572916667, "learning_rate": 0.0001, "loss": 10.0108, "loss/crossentropy": 2.840528726577759, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.4396800994873047, "step": 214 }, { "epoch": 0.00675, "grad_norm": 10.4375, "grad_norm_var": 0.605322265625, "learning_rate": 0.0001, "loss": 9.8039, "loss/crossentropy": 2.64647901058197, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.43996211886405945, "step": 216 }, { "epoch": 0.0068125, "grad_norm": 8.1875, "grad_norm_var": 0.8450520833333334, "learning_rate": 0.0001, "loss": 10.1335, "loss/crossentropy": 2.840444326400757, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.4503984898328781, "step": 218 }, { "epoch": 0.006875, "grad_norm": 10.3125, "grad_norm_var": 0.9781087239583334, "learning_rate": 0.0001, "loss": 9.9466, "loss/crossentropy": 2.655266284942627, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.4494474083185196, "step": 220 }, { "epoch": 0.0069375, "grad_norm": 10.9375, "grad_norm_var": 1.1989583333333333, "learning_rate": 0.0001, "loss": 10.1344, "loss/crossentropy": 2.819695830345154, "loss/hidden": 2.7421875, "loss/jsd": 0.0, "loss/logits": 0.45725369453430176, "step": 222 }, { "epoch": 0.007, "grad_norm": 8.9375, "grad_norm_var": 1.1447916666666667, "learning_rate": 0.0001, "loss": 10.2209, "loss/crossentropy": 2.9145302772521973, "loss/hidden": 2.7734375, "loss/jsd": 0.0, "loss/logits": 0.4532930552959442, "step": 224 }, { "epoch": 0.0070625, "grad_norm": 8.4375, "grad_norm_var": 0.9332682291666666, "learning_rate": 0.0001, "loss": 10.0121, "loss/crossentropy": 2.8794121742248535, "loss/hidden": 2.75, "loss/jsd": 0.0, "loss/logits": 0.4382711499929428, "step": 226 }, { "epoch": 0.007125, "grad_norm": 10.125, "grad_norm_var": 1.0872395833333333, "learning_rate": 0.0001, "loss": 10.397, "loss/crossentropy": 2.894919991493225, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.4767727255821228, "step": 228 }, { "epoch": 0.0071875, "grad_norm": 8.9375, "grad_norm_var": 1.021728515625, "learning_rate": 0.0001, "loss": 10.1881, "loss/crossentropy": 2.923813581466675, "loss/hidden": 2.7578125, "loss/jsd": 0.0, "loss/logits": 0.4506445974111557, "step": 230 }, { "epoch": 0.00725, "grad_norm": 9.4375, "grad_norm_var": 0.9921875, "learning_rate": 0.0001, "loss": 10.1729, "loss/crossentropy": 2.806499481201172, "loss/hidden": 2.8125, "loss/jsd": 0.0, "loss/logits": 0.4553864002227783, "step": 232 }, { "epoch": 0.0073125, "grad_norm": 8.75, "grad_norm_var": 0.771728515625, "learning_rate": 0.0001, "loss": 10.0788, "loss/crossentropy": 2.8656084537506104, "loss/hidden": 2.6875, "loss/jsd": 0.0, "loss/logits": 0.4525725245475769, "step": 234 }, { "epoch": 0.007375, "grad_norm": 9.875, "grad_norm_var": 0.6760416666666667, "learning_rate": 0.0001, "loss": 10.0307, "loss/crossentropy": 2.7157695293426514, "loss/hidden": 2.796875, "loss/jsd": 0.0, "loss/logits": 0.45180511474609375, "step": 236 }, { "epoch": 0.0074375, "grad_norm": 8.8125, "grad_norm_var": 0.3675618489583333, "learning_rate": 0.0001, "loss": 10.218, "loss/crossentropy": 2.8563307523727417, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.46350690722465515, "step": 238 }, { "epoch": 0.0075, "grad_norm": 8.5, "grad_norm_var": 0.38795572916666665, "learning_rate": 0.0001, "loss": 10.0956, "loss/crossentropy": 2.801733136177063, "loss/hidden": 2.6953125, "loss/jsd": 0.0, "loss/logits": 0.45985159277915955, "step": 240 }, { "epoch": 0.0075625, "grad_norm": 9.25, "grad_norm_var": 0.372509765625, "learning_rate": 0.0001, "loss": 10.3281, "loss/crossentropy": 3.0221978425979614, "loss/hidden": 2.6875, "loss/jsd": 0.0, "loss/logits": 0.46184317767620087, "step": 242 }, { "epoch": 0.007625, "grad_norm": 8.5, "grad_norm_var": 0.2699055989583333, "learning_rate": 0.0001, "loss": 10.069, "loss/crossentropy": 2.868880271911621, "loss/hidden": 2.6875, "loss/jsd": 0.0, "loss/logits": 0.4512626975774765, "step": 244 }, { "epoch": 0.0076875, "grad_norm": 9.125, "grad_norm_var": 0.37405192057291664, "learning_rate": 0.0001, "loss": 10.1415, "loss/crossentropy": 3.017348051071167, "loss/hidden": 2.6875, "loss/jsd": 0.0, "loss/logits": 0.4436669647693634, "step": 246 }, { "epoch": 0.00775, "grad_norm": 7.5625, "grad_norm_var": 0.4295857747395833, "learning_rate": 0.0001, "loss": 9.7982, "loss/crossentropy": 2.8351858854293823, "loss/hidden": 2.640625, "loss/jsd": 0.0, "loss/logits": 0.432241827249527, "step": 248 }, { "epoch": 0.0078125, "grad_norm": 8.75, "grad_norm_var": 0.4040323893229167, "learning_rate": 0.0001, "loss": 10.0516, "loss/crossentropy": 2.9942902326583862, "loss/hidden": 2.625, "loss/jsd": 0.0, "loss/logits": 0.44322872161865234, "step": 250 }, { "epoch": 0.007875, "grad_norm": 7.0625, "grad_norm_var": 0.5571451822916667, "learning_rate": 0.0001, "loss": 9.6001, "loss/crossentropy": 2.8024221658706665, "loss/hidden": 2.5859375, "loss/jsd": 0.0, "loss/logits": 0.42117369174957275, "step": 252 }, { "epoch": 0.0079375, "grad_norm": 7.53125, "grad_norm_var": 0.6127237955729167, "learning_rate": 0.0001, "loss": 9.4012, "loss/crossentropy": 2.659646511077881, "loss/hidden": 2.6171875, "loss/jsd": 0.0, "loss/logits": 0.4124371409416199, "step": 254 }, { "epoch": 0.008, "grad_norm": 8.5, "grad_norm_var": 4.341044108072917, "learning_rate": 0.0001, "loss": 9.9692, "loss/crossentropy": 2.7675873041152954, "loss/hidden": 2.8359375, "loss/jsd": 0.0, "loss/logits": 0.43657153844833374, "step": 256 }, { "epoch": 0.0080625, "grad_norm": 7.375, "grad_norm_var": 4.434098307291666, "learning_rate": 0.0001, "loss": 9.6241, "loss/crossentropy": 2.752623677253723, "loss/hidden": 2.65625, "loss/jsd": 0.0, "loss/logits": 0.421526238322258, "step": 258 }, { "epoch": 0.008125, "grad_norm": 9.75, "grad_norm_var": 4.814322916666667, "learning_rate": 0.0001, "loss": 10.5617, "loss/crossentropy": 2.9343960285186768, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.4900728166103363, "step": 260 }, { "epoch": 0.0081875, "grad_norm": 8.5, "grad_norm_var": 4.757255045572917, "learning_rate": 0.0001, "loss": 9.813, "loss/crossentropy": 2.8956027030944824, "loss/hidden": 2.6171875, "loss/jsd": 0.0, "loss/logits": 0.43002206087112427, "step": 262 }, { "epoch": 0.00825, "grad_norm": 6.90625, "grad_norm_var": 4.924479166666667, "learning_rate": 0.0001, "loss": 9.1786, "loss/crossentropy": 2.5651010274887085, "loss/hidden": 2.6484375, "loss/jsd": 0.0, "loss/logits": 0.39650242030620575, "step": 264 }, { "epoch": 0.0083125, "grad_norm": 10.125, "grad_norm_var": 5.049674479166667, "learning_rate": 0.0001, "loss": 10.1403, "loss/crossentropy": 2.9591987133026123, "loss/hidden": 2.734375, "loss/jsd": 0.0, "loss/logits": 0.4446714520454407, "step": 266 }, { "epoch": 0.008375, "grad_norm": 10.4375, "grad_norm_var": 4.84810791015625, "learning_rate": 0.0001, "loss": 10.0619, "loss/crossentropy": 2.9117506742477417, "loss/hidden": 2.7109375, "loss/jsd": 0.0, "loss/logits": 0.4439200907945633, "step": 268 }, { "epoch": 0.0084375, "grad_norm": 12.8125, "grad_norm_var": 5.396858723958333, "learning_rate": 0.0001, "loss": 9.8102, "loss/crossentropy": 2.7316226959228516, "loss/hidden": 2.7265625, "loss/jsd": 0.0, "loss/logits": 0.435204416513443, "step": 270 }, { "epoch": 0.0085, "grad_norm": 11.625, "grad_norm_var": 2.6581380208333334, "learning_rate": 0.0001, "loss": 10.3313, "loss/crossentropy": 2.8637452125549316, "loss/hidden": 2.7890625, "loss/jsd": 0.0, "loss/logits": 0.4678504019975662, "step": 272 }, { "epoch": 0.0085625, "grad_norm": 8.25, "grad_norm_var": 2.36773681640625, "learning_rate": 0.0001, "loss": 10.0692, "loss/crossentropy": 2.905339241027832, "loss/hidden": 2.703125, "loss/jsd": 0.0, "loss/logits": 0.44607456028461456, "step": 274 }, { "epoch": 0.008625, "grad_norm": 8.875, "grad_norm_var": 2.241890462239583, "learning_rate": 0.0001, "loss": 9.7537, "loss/crossentropy": 2.7921040058135986, "loss/hidden": 2.6328125, "loss/jsd": 0.0, "loss/logits": 0.43287399411201477, "step": 276 }, { "epoch": 0.0086875, "grad_norm": 11.0, "grad_norm_var": 2.432059733072917, "learning_rate": 0.0001, "loss": 9.8559, "loss/crossentropy": 2.66982901096344, "loss/hidden": 2.671875, "loss/jsd": 0.0, "loss/logits": 0.45141659677028656, "step": 278 }, { "epoch": 0.00875, "grad_norm": 8.1875, "grad_norm_var": 2.079150390625, "learning_rate": 0.0001, "loss": 9.8994, "loss/crossentropy": 2.9137284755706787, "loss/hidden": 2.59375, "loss/jsd": 0.0, "loss/logits": 0.4391949772834778, "step": 280 }, { "epoch": 0.0088125, "grad_norm": 8.1875, "grad_norm_var": 2.11875, "learning_rate": 0.0001, "loss": 9.8661, "loss/crossentropy": 2.753507614135742, "loss/hidden": 2.6328125, "loss/jsd": 0.0, "loss/logits": 0.4479823410511017, "step": 282 }, { "epoch": 0.008875, "grad_norm": 8.25, "grad_norm_var": 2.0929524739583334, "learning_rate": 0.0001, "loss": 9.7656, "loss/crossentropy": 2.8703192472457886, "loss/hidden": 2.5703125, "loss/jsd": 0.0, "loss/logits": 0.4324973225593567, "step": 284 }, { "epoch": 0.0089375, "grad_norm": 8.1875, "grad_norm_var": 1.2997233072916667, "learning_rate": 0.0001, "loss": 9.2983, "loss/crossentropy": 2.6686055660247803, "loss/hidden": 2.609375, "loss/jsd": 0.0, "loss/logits": 0.4020322114229202, "step": 286 }, { "epoch": 0.009, "grad_norm": 7.28125, "grad_norm_var": 0.891796875, "learning_rate": 0.0001, "loss": 9.6798, "loss/crossentropy": 2.915258288383484, "loss/hidden": 2.546875, "loss/jsd": 0.0, "loss/logits": 0.42177151143550873, "step": 288 }, { "epoch": 0.0090625, "grad_norm": 7.78125, "grad_norm_var": 0.80250244140625, "learning_rate": 0.0001, "loss": 9.7686, "loss/crossentropy": 2.877256751060486, "loss/hidden": 2.609375, "loss/jsd": 0.0, "loss/logits": 0.42820094525814056, "step": 290 }, { "epoch": 0.009125, "grad_norm": 7.8125, "grad_norm_var": 0.90875244140625, "learning_rate": 0.0001, "loss": 9.3521, "loss/crossentropy": 2.6476125717163086, "loss/hidden": 2.5703125, "loss/jsd": 0.0, "loss/logits": 0.4134131520986557, "step": 292 }, { "epoch": 0.0091875, "grad_norm": 9.5, "grad_norm_var": 0.52418212890625, "learning_rate": 0.0001, "loss": 9.1739, "loss/crossentropy": 2.618640184402466, "loss/hidden": 2.640625, "loss/jsd": 0.0, "loss/logits": 0.391463965177536, "step": 294 }, { "epoch": 0.00925, "grad_norm": 7.90625, "grad_norm_var": 0.579541015625, "learning_rate": 0.0001, "loss": 9.3249, "loss/crossentropy": 2.5597482919692993, "loss/hidden": 2.6171875, "loss/jsd": 0.0, "loss/logits": 0.4147980213165283, "step": 296 }, { "epoch": 0.0093125, "grad_norm": 7.46875, "grad_norm_var": 0.4962076822916667, "learning_rate": 0.0001, "loss": 9.5687, "loss/crossentropy": 2.7897287607192993, "loss/hidden": 2.5546875, "loss/jsd": 0.0, "loss/logits": 0.4224274307489395, "step": 298 }, { "epoch": 0.009375, "grad_norm": 8.9375, "grad_norm_var": 0.53707275390625, "learning_rate": 0.0001, "loss": 9.7683, "loss/crossentropy": 2.743291974067688, "loss/hidden": 2.625, "loss/jsd": 0.0, "loss/logits": 0.43999941647052765, "step": 300 }, { "epoch": 0.0094375, "grad_norm": 9.625, "grad_norm_var": 0.9377888997395833, "learning_rate": 0.0001, "loss": 10.1141, "loss/crossentropy": 2.968402862548828, "loss/hidden": 2.65625, "loss/jsd": 0.0, "loss/logits": 0.4489475339651108, "step": 302 }, { "epoch": 0.0095, "grad_norm": 7.65625, "grad_norm_var": 1.1374837239583333, "learning_rate": 0.0001, "loss": 9.9672, "loss/crossentropy": 2.921197533607483, "loss/hidden": 2.609375, "loss/jsd": 0.0, "loss/logits": 0.4436652660369873, "step": 304 }, { "epoch": 0.0095625, "grad_norm": 9.125, "grad_norm_var": 1.0794230143229167, "learning_rate": 0.0001, "loss": 9.9847, "loss/crossentropy": 2.8543198108673096, "loss/hidden": 2.578125, "loss/jsd": 0.0, "loss/logits": 0.45522603392601013, "step": 306 }, { "epoch": 0.009625, "grad_norm": 7.96875, "grad_norm_var": 1.0253255208333334, "learning_rate": 0.0001, "loss": 9.478, "loss/crossentropy": 2.711910605430603, "loss/hidden": 2.625, "loss/jsd": 0.0, "loss/logits": 0.41411033272743225, "step": 308 }, { "epoch": 0.0096875, "grad_norm": 8.6875, "grad_norm_var": 0.957666015625, "learning_rate": 0.0001, "loss": 9.7925, "loss/crossentropy": 2.9207193851470947, "loss/hidden": 2.625, "loss/jsd": 0.0, "loss/logits": 0.42468030750751495, "step": 310 }, { "epoch": 0.00975, "grad_norm": 8.375, "grad_norm_var": 0.952197265625, "learning_rate": 0.0001, "loss": 9.4838, "loss/crossentropy": 2.861418128013611, "loss/hidden": 2.5390625, "loss/jsd": 0.0, "loss/logits": 0.40833115577697754, "step": 312 }, { "epoch": 0.0098125, "grad_norm": 8.8125, "grad_norm_var": 3.8309733072916665, "learning_rate": 0.0001, "loss": 9.8637, "loss/crossentropy": 2.8941240310668945, "loss/hidden": 2.5625, "loss/jsd": 0.0, "loss/logits": 0.44070352613925934, "step": 314 }, { "epoch": 0.009875, "grad_norm": 8.1875, "grad_norm_var": 3.8521443684895833, "learning_rate": 0.0001, "loss": 9.5803, "loss/crossentropy": 2.8630210161209106, "loss/hidden": 2.5625, "loss/jsd": 0.0, "loss/logits": 0.41548123955726624, "step": 316 }, { "epoch": 0.0099375, "grad_norm": 8.875, "grad_norm_var": 3.709403483072917, "learning_rate": 0.0001, "loss": 9.5554, "loss/crossentropy": 2.6127843856811523, "loss/hidden": 2.625, "loss/jsd": 0.0, "loss/logits": 0.43175867199897766, "step": 318 }, { "epoch": 0.01, "grad_norm": 7.9375, "grad_norm_var": 3.5029947916666666, "learning_rate": 0.0001, "loss": 9.2689, "loss/crossentropy": 2.6942365169525146, "loss/hidden": 2.5546875, "loss/jsd": 0.0, "loss/logits": 0.40199390053749084, "step": 320 }, { "epoch": 0.0100625, "grad_norm": 9.0, "grad_norm_var": 3.4977213541666665, "learning_rate": 0.0001, "loss": 9.6463, "loss/crossentropy": 2.8481950759887695, "loss/hidden": 2.5703125, "loss/jsd": 0.0, "loss/logits": 0.42277809977531433, "step": 322 }, { "epoch": 0.010125, "grad_norm": 8.3125, "grad_norm_var": 3.5474894205729166, "learning_rate": 0.0001, "loss": 9.4688, "loss/crossentropy": 2.792279362678528, "loss/hidden": 2.546875, "loss/jsd": 0.0, "loss/logits": 0.4129619151353836, "step": 324 }, { "epoch": 0.0101875, "grad_norm": 7.71875, "grad_norm_var": 3.8777180989583333, "learning_rate": 0.0001, "loss": 9.6284, "loss/crossentropy": 2.919487237930298, "loss/hidden": 2.5078125, "loss/jsd": 0.0, "loss/logits": 0.42011047899723053, "step": 326 }, { "epoch": 0.01025, "grad_norm": 11.6875, "grad_norm_var": 4.332255045572917, "learning_rate": 0.0001, "loss": 8.8365, "loss/crossentropy": 2.4436700344085693, "loss/hidden": 2.5625, "loss/jsd": 0.0, "loss/logits": 0.3830350488424301, "step": 328 }, { "epoch": 0.0103125, "grad_norm": 8.6875, "grad_norm_var": 1.2620402018229167, "learning_rate": 0.0001, "loss": 10.4608, "loss/crossentropy": 3.0949630737304688, "loss/hidden": 2.625, "loss/jsd": 0.0, "loss/logits": 0.4740859717130661, "step": 330 }, { "epoch": 0.010375, "grad_norm": 9.8125, "grad_norm_var": 1.4888631184895833, "learning_rate": 0.0001, "loss": 10.0661, "loss/crossentropy": 2.9477418661117554, "loss/hidden": 2.6015625, "loss/jsd": 0.0, "loss/logits": 0.45167967677116394, "step": 332 }, { "epoch": 0.0104375, "grad_norm": 7.1875, "grad_norm_var": 1.8235677083333333, "learning_rate": 0.0001, "loss": 9.2473, "loss/crossentropy": 2.713068962097168, "loss/hidden": 2.546875, "loss/jsd": 0.0, "loss/logits": 0.39873576164245605, "step": 334 }, { "epoch": 0.0105, "grad_norm": 8.4375, "grad_norm_var": 1.9116170247395834, "learning_rate": 0.0001, "loss": 9.017, "loss/crossentropy": 2.5754904747009277, "loss/hidden": 2.546875, "loss/jsd": 0.0, "loss/logits": 0.3894636482000351, "step": 336 }, { "epoch": 0.0105625, "grad_norm": 7.5625, "grad_norm_var": 1.9513020833333334, "learning_rate": 0.0001, "loss": 9.322, "loss/crossentropy": 2.7804969549179077, "loss/hidden": 2.5, "loss/jsd": 0.0, "loss/logits": 0.40414653718471527, "step": 338 }, { "epoch": 0.010625, "grad_norm": 7.28125, "grad_norm_var": 2.0599568684895835, "learning_rate": 0.0001, "loss": 9.4558, "loss/crossentropy": 2.9021013975143433, "loss/hidden": 2.4921875, "loss/jsd": 0.0, "loss/logits": 0.40615350008010864, "step": 340 }, { "epoch": 0.0106875, "grad_norm": 7.53125, "grad_norm_var": 1.9496053059895833, "learning_rate": 0.0001, "loss": 9.7347, "loss/crossentropy": 2.8895071744918823, "loss/hidden": 2.53125, "loss/jsd": 0.0, "loss/logits": 0.4313907325267792, "step": 342 }, { "epoch": 0.01075, "grad_norm": 7.5625, "grad_norm_var": 1.24234619140625, "learning_rate": 0.0001, "loss": 9.2208, "loss/crossentropy": 2.8096436262130737, "loss/hidden": 2.53125, "loss/jsd": 0.0, "loss/logits": 0.3879920691251755, "step": 344 }, { "epoch": 0.0108125, "grad_norm": 8.375, "grad_norm_var": 0.9823567708333333, "learning_rate": 0.0001, "loss": 8.3941, "loss/crossentropy": 2.444824457168579, "loss/hidden": 2.4765625, "loss/jsd": 0.0, "loss/logits": 0.3472696393728256, "step": 346 }, { "epoch": 0.010875, "grad_norm": 6.4375, "grad_norm_var": 0.27044270833333334, "learning_rate": 0.0001, "loss": 9.3588, "loss/crossentropy": 2.8576740026474, "loss/hidden": 2.5, "loss/jsd": 0.0, "loss/logits": 0.40011417865753174, "step": 348 }, { "epoch": 0.0109375, "grad_norm": 6.875, "grad_norm_var": 0.30201416015625, "learning_rate": 0.0001, "loss": 9.0438, "loss/crossentropy": 2.7489218711853027, "loss/hidden": 2.453125, "loss/jsd": 0.0, "loss/logits": 0.3841765522956848, "step": 350 }, { "epoch": 0.011, "grad_norm": 8.0, "grad_norm_var": 0.26008707682291665, "learning_rate": 0.0001, "loss": 9.2174, "loss/crossentropy": 2.693643808364868, "loss/hidden": 2.5390625, "loss/jsd": 0.0, "loss/logits": 0.398467093706131, "step": 352 }, { "epoch": 0.0110625, "grad_norm": 7.34375, "grad_norm_var": 0.2731404622395833, "learning_rate": 0.0001, "loss": 9.2476, "loss/crossentropy": 2.8319369554519653, "loss/hidden": 2.46875, "loss/jsd": 0.0, "loss/logits": 0.3946947753429413, "step": 354 }, { "epoch": 0.011125, "grad_norm": 7.4375, "grad_norm_var": 3.001936848958333, "learning_rate": 0.0001, "loss": 9.8609, "loss/crossentropy": 2.928380846977234, "loss/hidden": 2.515625, "loss/jsd": 0.0, "loss/logits": 0.44168923795223236, "step": 356 }, { "epoch": 0.0111875, "grad_norm": 7.5625, "grad_norm_var": 3.0012858072916666, "learning_rate": 0.0001, "loss": 9.7272, "loss/crossentropy": 2.978438377380371, "loss/hidden": 2.546875, "loss/jsd": 0.0, "loss/logits": 0.4201928675174713, "step": 358 }, { "epoch": 0.01125, "grad_norm": 7.9375, "grad_norm_var": 2.9525390625, "learning_rate": 0.0001, "loss": 9.7725, "loss/crossentropy": 2.9407122135162354, "loss/hidden": 2.515625, "loss/jsd": 0.0, "loss/logits": 0.4316175580024719, "step": 360 }, { "epoch": 0.0113125, "grad_norm": 7.40625, "grad_norm_var": 2.9946573893229167, "learning_rate": 0.0001, "loss": 9.1066, "loss/crossentropy": 2.8210840225219727, "loss/hidden": 2.4296875, "loss/jsd": 0.0, "loss/logits": 0.38558244705200195, "step": 362 }, { "epoch": 0.011375, "grad_norm": 6.84375, "grad_norm_var": 2.9395792643229166, "learning_rate": 0.0001, "loss": 9.1715, "loss/crossentropy": 2.683255434036255, "loss/hidden": 2.484375, "loss/jsd": 0.0, "loss/logits": 0.40038590133190155, "step": 364 }, { "epoch": 0.0114375, "grad_norm": 7.28125, "grad_norm_var": 2.906233723958333, "learning_rate": 0.0001, "loss": 8.6687, "loss/crossentropy": 2.61862576007843, "loss/hidden": 2.40625, "loss/jsd": 0.0, "loss/logits": 0.3643851727247238, "step": 366 }, { "epoch": 0.0115, "grad_norm": 7.71875, "grad_norm_var": 2.9218098958333334, "learning_rate": 0.0001, "loss": 8.9238, "loss/crossentropy": 2.6416503190994263, "loss/hidden": 2.4765625, "loss/jsd": 0.0, "loss/logits": 0.38056348264217377, "step": 368 }, { "epoch": 0.0115625, "grad_norm": 7.90625, "grad_norm_var": 2.856864420572917, "learning_rate": 0.0001, "loss": 9.4662, "loss/crossentropy": 2.924672245979309, "loss/hidden": 2.46875, "loss/jsd": 0.0, "loss/logits": 0.40727290511131287, "step": 370 }, { "epoch": 0.011625, "grad_norm": 7.625, "grad_norm_var": 0.15422770182291667, "learning_rate": 0.0001, "loss": 9.5455, "loss/crossentropy": 2.887393593788147, "loss/hidden": 2.484375, "loss/jsd": 0.0, "loss/logits": 0.4173741787672043, "step": 372 }, { "epoch": 0.0116875, "grad_norm": 7.09375, "grad_norm_var": 0.160009765625, "learning_rate": 0.0001, "loss": 9.3231, "loss/crossentropy": 2.940009832382202, "loss/hidden": 2.421875, "loss/jsd": 0.0, "loss/logits": 0.39612552523612976, "step": 374 }, { "epoch": 0.01175, "grad_norm": 7.4375, "grad_norm_var": 0.14763997395833334, "learning_rate": 0.0001, "loss": 8.7557, "loss/crossentropy": 2.558520793914795, "loss/hidden": 2.4375, "loss/jsd": 0.0, "loss/logits": 0.3759680688381195, "step": 376 }, { "epoch": 0.0118125, "grad_norm": 7.84375, "grad_norm_var": 0.17099202473958333, "learning_rate": 0.0001, "loss": 9.4308, "loss/crossentropy": 2.7902355194091797, "loss/hidden": 2.4453125, "loss/jsd": 0.0, "loss/logits": 0.4195282459259033, "step": 378 }, { "epoch": 0.011875, "grad_norm": 6.90625, "grad_norm_var": 0.19140625, "learning_rate": 0.0001, "loss": 8.8902, "loss/crossentropy": 2.752501130104065, "loss/hidden": 2.375, "loss/jsd": 0.0, "loss/logits": 0.37626585364341736, "step": 380 }, { "epoch": 0.0119375, "grad_norm": 8.125, "grad_norm_var": 0.71929931640625, "learning_rate": 0.0001, "loss": 9.3502, "loss/crossentropy": 2.772351384162903, "loss/hidden": 2.4921875, "loss/jsd": 0.0, "loss/logits": 0.40857018530368805, "step": 382 }, { "epoch": 0.012, "grad_norm": 7.21875, "grad_norm_var": 0.77066650390625, "learning_rate": 0.0001, "loss": 9.3194, "loss/crossentropy": 2.7217490673065186, "loss/hidden": 2.5078125, "loss/jsd": 0.0, "loss/logits": 0.4089791476726532, "step": 384 }, { "epoch": 0.0120625, "grad_norm": 7.65625, "grad_norm_var": 0.7696451822916667, "learning_rate": 0.0001, "loss": 9.2867, "loss/crossentropy": 2.7904086112976074, "loss/hidden": 2.46875, "loss/jsd": 0.0, "loss/logits": 0.402749627828598, "step": 386 }, { "epoch": 0.012125, "grad_norm": 7.84375, "grad_norm_var": 0.7747029622395833, "learning_rate": 0.0001, "loss": 9.4515, "loss/crossentropy": 2.9278364181518555, "loss/hidden": 2.4140625, "loss/jsd": 0.0, "loss/logits": 0.4109601080417633, "step": 388 }, { "epoch": 0.0121875, "grad_norm": 7.78125, "grad_norm_var": 0.775390625, "learning_rate": 0.0001, "loss": 9.0024, "loss/crossentropy": 2.808686137199402, "loss/hidden": 2.40625, "loss/jsd": 0.0, "loss/logits": 0.37874314188957214, "step": 390 }, { "epoch": 0.01225, "grad_norm": 7.40625, "grad_norm_var": 0.765869140625, "learning_rate": 0.0001, "loss": 8.8575, "loss/crossentropy": 2.651176333427429, "loss/hidden": 2.390625, "loss/jsd": 0.0, "loss/logits": 0.38156652450561523, "step": 392 }, { "epoch": 0.0123125, "grad_norm": 8.625, "grad_norm_var": 0.8224568684895833, "learning_rate": 0.0001, "loss": 10.1481, "loss/crossentropy": 3.143176555633545, "loss/hidden": 2.484375, "loss/jsd": 0.0, "loss/logits": 0.45205217599868774, "step": 394 }, { "epoch": 0.012375, "grad_norm": 7.03125, "grad_norm_var": 0.7639933268229167, "learning_rate": 0.0001, "loss": 8.8441, "loss/crossentropy": 2.7559027671813965, "loss/hidden": 2.390625, "loss/jsd": 0.0, "loss/logits": 0.36975668370723724, "step": 396 }, { "epoch": 0.0124375, "grad_norm": 7.03125, "grad_norm_var": 0.32005208333333335, "learning_rate": 0.0001, "loss": 9.0988, "loss/crossentropy": 2.8294628858566284, "loss/hidden": 2.4453125, "loss/jsd": 0.0, "loss/logits": 0.3824039399623871, "step": 398 }, { "epoch": 0.0125, "grad_norm": 7.125, "grad_norm_var": 0.24947509765625, "learning_rate": 0.0001, "loss": 9.2609, "loss/crossentropy": 2.8465192317962646, "loss/hidden": 2.453125, "loss/jsd": 0.0, "loss/logits": 0.39612245559692383, "step": 400 }, { "epoch": 0.0125625, "grad_norm": 7.375, "grad_norm_var": 0.25227457682291665, "learning_rate": 0.0001, "loss": 9.135, "loss/crossentropy": 2.829011917114258, "loss/hidden": 2.375, "loss/jsd": 0.0, "loss/logits": 0.39309877157211304, "step": 402 }, { "epoch": 0.012625, "grad_norm": 6.625, "grad_norm_var": 0.26951497395833335, "learning_rate": 0.0001, "loss": 8.5141, "loss/crossentropy": 2.407685399055481, "loss/hidden": 2.3671875, "loss/jsd": 0.0, "loss/logits": 0.37392735481262207, "step": 404 }, { "epoch": 0.0126875, "grad_norm": 6.96875, "grad_norm_var": 0.25638020833333336, "learning_rate": 0.0001, "loss": 9.0194, "loss/crossentropy": 2.782030463218689, "loss/hidden": 2.390625, "loss/jsd": 0.0, "loss/logits": 0.3846723139286041, "step": 406 }, { "epoch": 0.01275, "grad_norm": 6.84375, "grad_norm_var": 0.3097005208333333, "learning_rate": 0.0001, "loss": 7.9774, "loss/crossentropy": 2.263835072517395, "loss/hidden": 2.3515625, "loss/jsd": 0.0, "loss/logits": 0.336203470826149, "step": 408 }, { "epoch": 0.0128125, "grad_norm": 7.25, "grad_norm_var": 0.13372395833333334, "learning_rate": 0.0001, "loss": 9.1392, "loss/crossentropy": 2.795884609222412, "loss/hidden": 2.453125, "loss/jsd": 0.0, "loss/logits": 0.38901545107364655, "step": 410 }, { "epoch": 0.012875, "grad_norm": 7.25, "grad_norm_var": 0.24560139973958334, "learning_rate": 0.0001, "loss": 9.8796, "loss/crossentropy": 3.1521027088165283, "loss/hidden": 2.453125, "loss/jsd": 0.0, "loss/logits": 0.427437886595726, "step": 412 }, { "epoch": 0.0129375, "grad_norm": 7.125, "grad_norm_var": 0.24451497395833333, "learning_rate": 0.0001, "loss": 9.2833, "loss/crossentropy": 2.9037975072860718, "loss/hidden": 2.40625, "loss/jsd": 0.0, "loss/logits": 0.3973206430673599, "step": 414 }, { "epoch": 0.013, "grad_norm": 6.65625, "grad_norm_var": 0.2760701497395833, "learning_rate": 0.0001, "loss": 9.1047, "loss/crossentropy": 2.8156282901763916, "loss/hidden": 2.3984375, "loss/jsd": 0.0, "loss/logits": 0.38906630873680115, "step": 416 }, { "epoch": 0.0130625, "grad_norm": 8.75, "grad_norm_var": 0.44397379557291666, "learning_rate": 0.0001, "loss": 9.1789, "loss/crossentropy": 2.7314385175704956, "loss/hidden": 2.375, "loss/jsd": 0.0, "loss/logits": 0.40724538266658783, "step": 418 }, { "epoch": 0.013125, "grad_norm": 8.125, "grad_norm_var": 0.4571451822916667, "learning_rate": 0.0001, "loss": 9.4066, "loss/crossentropy": 2.7825275659561157, "loss/hidden": 2.4765625, "loss/jsd": 0.0, "loss/logits": 0.4147540330886841, "step": 420 }, { "epoch": 0.0131875, "grad_norm": 6.03125, "grad_norm_var": 0.5766276041666667, "learning_rate": 0.0001, "loss": 9.1609, "loss/crossentropy": 2.930117607116699, "loss/hidden": 2.4453125, "loss/jsd": 0.0, "loss/logits": 0.3785484880208969, "step": 422 }, { "epoch": 0.01325, "grad_norm": 5.90625, "grad_norm_var": 0.6495442708333333, "learning_rate": 0.0001, "loss": 8.6362, "loss/crossentropy": 2.676853656768799, "loss/hidden": 2.359375, "loss/jsd": 0.0, "loss/logits": 0.3599983751773834, "step": 424 }, { "epoch": 0.0133125, "grad_norm": 7.5, "grad_norm_var": 0.624853515625, "learning_rate": 0.0001, "loss": 8.7011, "loss/crossentropy": 2.5884207487106323, "loss/hidden": 2.4296875, "loss/jsd": 0.0, "loss/logits": 0.36829495429992676, "step": 426 }, { "epoch": 0.013375, "grad_norm": 7.53125, "grad_norm_var": 0.5376953125, "learning_rate": 0.0001, "loss": 9.7283, "loss/crossentropy": 2.9578946828842163, "loss/hidden": 2.5078125, "loss/jsd": 0.0, "loss/logits": 0.42625896632671356, "step": 428 }, { "epoch": 0.0134375, "grad_norm": 7.8125, "grad_norm_var": 0.5708943684895833, "learning_rate": 0.0001, "loss": 9.2129, "loss/crossentropy": 2.830846667289734, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.4038323014974594, "step": 430 }, { "epoch": 0.0135, "grad_norm": 7.15625, "grad_norm_var": 0.53961181640625, "learning_rate": 0.0001, "loss": 9.1391, "loss/crossentropy": 2.934818983078003, "loss/hidden": 2.3671875, "loss/jsd": 0.0, "loss/logits": 0.3837139457464218, "step": 432 }, { "epoch": 0.0135625, "grad_norm": 6.53125, "grad_norm_var": 0.42125244140625, "learning_rate": 0.0001, "loss": 8.8191, "loss/crossentropy": 2.789232611656189, "loss/hidden": 2.3203125, "loss/jsd": 0.0, "loss/logits": 0.3709518015384674, "step": 434 }, { "epoch": 0.013625, "grad_norm": 7.03125, "grad_norm_var": 0.33824462890625, "learning_rate": 0.0001, "loss": 8.8767, "loss/crossentropy": 2.723211646080017, "loss/hidden": 2.3828125, "loss/jsd": 0.0, "loss/logits": 0.3770655542612076, "step": 436 }, { "epoch": 0.0136875, "grad_norm": 7.15625, "grad_norm_var": 0.3138631184895833, "learning_rate": 0.0001, "loss": 8.6882, "loss/crossentropy": 2.7312934398651123, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.36132051050662994, "step": 438 }, { "epoch": 0.01375, "grad_norm": 7.40625, "grad_norm_var": 0.30740559895833336, "learning_rate": 0.0001, "loss": 9.2994, "loss/crossentropy": 3.003957748413086, "loss/hidden": 2.4609375, "loss/jsd": 0.0, "loss/logits": 0.38345402479171753, "step": 440 }, { "epoch": 0.0138125, "grad_norm": 7.1875, "grad_norm_var": 0.32190348307291666, "learning_rate": 0.0001, "loss": 9.5717, "loss/crossentropy": 2.9630134105682373, "loss/hidden": 2.421875, "loss/jsd": 0.0, "loss/logits": 0.4186822474002838, "step": 442 }, { "epoch": 0.013875, "grad_norm": 7.5, "grad_norm_var": 0.3284993489583333, "learning_rate": 0.0001, "loss": 8.9237, "loss/crossentropy": 2.695352554321289, "loss/hidden": 2.3828125, "loss/jsd": 0.0, "loss/logits": 0.3845515549182892, "step": 444 }, { "epoch": 0.0139375, "grad_norm": 12.75, "grad_norm_var": 2.301416015625, "learning_rate": 0.0001, "loss": 10.1005, "loss/crossentropy": 2.9550745487213135, "loss/hidden": 2.5, "loss/jsd": 0.0, "loss/logits": 0.4645442068576813, "step": 446 }, { "epoch": 0.014, "grad_norm": 7.125, "grad_norm_var": 2.3378865559895834, "learning_rate": 0.0001, "loss": 9.2894, "loss/crossentropy": 2.7871328592300415, "loss/hidden": 2.4765625, "loss/jsd": 0.0, "loss/logits": 0.40257345139980316, "step": 448 }, { "epoch": 0.0140625, "grad_norm": 6.59375, "grad_norm_var": 2.273270670572917, "learning_rate": 0.0001, "loss": 8.9407, "loss/crossentropy": 2.749815583229065, "loss/hidden": 2.328125, "loss/jsd": 0.0, "loss/logits": 0.38627225160598755, "step": 450 }, { "epoch": 0.014125, "grad_norm": 7.0, "grad_norm_var": 2.2490519205729167, "learning_rate": 0.0001, "loss": 9.0574, "loss/crossentropy": 2.733398675918579, "loss/hidden": 2.359375, "loss/jsd": 0.0, "loss/logits": 0.39646580815315247, "step": 452 }, { "epoch": 0.0141875, "grad_norm": 6.90625, "grad_norm_var": 2.060530598958333, "learning_rate": 0.0001, "loss": 9.1024, "loss/crossentropy": 2.782361626625061, "loss/hidden": 2.390625, "loss/jsd": 0.0, "loss/logits": 0.3929390609264374, "step": 454 }, { "epoch": 0.01425, "grad_norm": 7.125, "grad_norm_var": 2.107405598958333, "learning_rate": 0.0001, "loss": 8.9667, "loss/crossentropy": 2.81704843044281, "loss/hidden": 2.4140625, "loss/jsd": 0.0, "loss/logits": 0.3735543340444565, "step": 456 }, { "epoch": 0.0143125, "grad_norm": 7.40625, "grad_norm_var": 2.492041015625, "learning_rate": 0.0001, "loss": 8.9565, "loss/crossentropy": 2.850237250328064, "loss/hidden": 2.3984375, "loss/jsd": 0.0, "loss/logits": 0.3707829564809799, "step": 458 }, { "epoch": 0.014375, "grad_norm": 6.78125, "grad_norm_var": 2.554931640625, "learning_rate": 0.0001, "loss": 8.6868, "loss/crossentropy": 2.717839241027832, "loss/hidden": 2.3125, "loss/jsd": 0.0, "loss/logits": 0.3656424283981323, "step": 460 }, { "epoch": 0.0144375, "grad_norm": 6.96875, "grad_norm_var": 0.7952473958333334, "learning_rate": 0.0001, "loss": 9.2741, "loss/crossentropy": 2.896919012069702, "loss/hidden": 2.3828125, "loss/jsd": 0.0, "loss/logits": 0.3994409739971161, "step": 462 }, { "epoch": 0.0145, "grad_norm": 6.6875, "grad_norm_var": 0.8183430989583333, "learning_rate": 0.0001, "loss": 8.9309, "loss/crossentropy": 2.8172919750213623, "loss/hidden": 2.359375, "loss/jsd": 0.0, "loss/logits": 0.3754217326641083, "step": 464 }, { "epoch": 0.0145625, "grad_norm": 6.78125, "grad_norm_var": 0.8165323893229167, "learning_rate": 0.0001, "loss": 8.9581, "loss/crossentropy": 2.779883623123169, "loss/hidden": 2.375, "loss/jsd": 0.0, "loss/logits": 0.3803219199180603, "step": 466 }, { "epoch": 0.014625, "grad_norm": 7.1875, "grad_norm_var": 0.7966145833333333, "learning_rate": 0.0001, "loss": 8.8229, "loss/crossentropy": 2.738626718521118, "loss/hidden": 2.3515625, "loss/jsd": 0.0, "loss/logits": 0.3732661008834839, "step": 468 }, { "epoch": 0.0146875, "grad_norm": 10.875, "grad_norm_var": 1.6305338541666667, "learning_rate": 0.0001, "loss": 9.3225, "loss/crossentropy": 2.7142670154571533, "loss/hidden": 2.4765625, "loss/jsd": 0.0, "loss/logits": 0.4131620526313782, "step": 470 }, { "epoch": 0.01475, "grad_norm": 6.75, "grad_norm_var": 1.64146728515625, "learning_rate": 0.0001, "loss": 8.7388, "loss/crossentropy": 2.8026680946350098, "loss/hidden": 2.3515625, "loss/jsd": 0.0, "loss/logits": 0.35846084356307983, "step": 472 }, { "epoch": 0.0148125, "grad_norm": 6.3125, "grad_norm_var": 1.345556640625, "learning_rate": 0.0001, "loss": 9.0225, "loss/crossentropy": 2.877347230911255, "loss/hidden": 2.3359375, "loss/jsd": 0.0, "loss/logits": 0.38092535734176636, "step": 474 }, { "epoch": 0.014875, "grad_norm": 6.375, "grad_norm_var": 1.351416015625, "learning_rate": 0.0001, "loss": 9.2148, "loss/crossentropy": 2.9207284450531006, "loss/hidden": 2.3671875, "loss/jsd": 0.0, "loss/logits": 0.39269061386585236, "step": 476 }, { "epoch": 0.0149375, "grad_norm": 6.46875, "grad_norm_var": 1.3369140625, "learning_rate": 0.0001, "loss": 8.772, "loss/crossentropy": 2.7076833248138428, "loss/hidden": 2.421875, "loss/jsd": 0.0, "loss/logits": 0.3642459362745285, "step": 478 }, { "epoch": 0.015, "grad_norm": 6.21875, "grad_norm_var": 1.34088134765625, "learning_rate": 0.0001, "loss": 8.7427, "loss/crossentropy": 2.684576988220215, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.3769093304872513, "step": 480 }, { "epoch": 0.0150625, "grad_norm": 6.59375, "grad_norm_var": 1.45689697265625, "learning_rate": 0.0001, "loss": 8.6295, "loss/crossentropy": 2.735003113746643, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.36054036021232605, "step": 482 }, { "epoch": 0.015125, "grad_norm": 7.59375, "grad_norm_var": 1.4751139322916667, "learning_rate": 0.0001, "loss": 9.1039, "loss/crossentropy": 2.8056256771087646, "loss/hidden": 2.40625, "loss/jsd": 0.0, "loss/logits": 0.3891993761062622, "step": 484 }, { "epoch": 0.0151875, "grad_norm": 6.78125, "grad_norm_var": 0.46174723307291665, "learning_rate": 0.0001, "loss": 9.1391, "loss/crossentropy": 2.9603331089019775, "loss/hidden": 2.359375, "loss/jsd": 0.0, "loss/logits": 0.3819381147623062, "step": 486 }, { "epoch": 0.01525, "grad_norm": 7.84375, "grad_norm_var": 0.5404256184895834, "learning_rate": 0.0001, "loss": 9.0152, "loss/crossentropy": 2.802178978919983, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.3869243860244751, "step": 488 }, { "epoch": 0.0153125, "grad_norm": 6.28125, "grad_norm_var": 0.27545166015625, "learning_rate": 0.0001, "loss": 8.6984, "loss/crossentropy": 2.745243191719055, "loss/hidden": 2.3125, "loss/jsd": 0.0, "loss/logits": 0.3640620857477188, "step": 490 }, { "epoch": 0.015375, "grad_norm": 6.59375, "grad_norm_var": 0.27112223307291666, "learning_rate": 0.0001, "loss": 8.5364, "loss/crossentropy": 2.6457338333129883, "loss/hidden": 2.296875, "loss/jsd": 0.0, "loss/logits": 0.3593788295984268, "step": 492 }, { "epoch": 0.0154375, "grad_norm": 6.3125, "grad_norm_var": 0.2749837239583333, "learning_rate": 0.0001, "loss": 8.7473, "loss/crossentropy": 2.8219770193099976, "loss/hidden": 2.296875, "loss/jsd": 0.0, "loss/logits": 0.3628465384244919, "step": 494 }, { "epoch": 0.0155, "grad_norm": 6.46875, "grad_norm_var": 0.28863525390625, "learning_rate": 0.0001, "loss": 9.2873, "loss/crossentropy": 2.9155017137527466, "loss/hidden": 2.375, "loss/jsd": 0.0, "loss/logits": 0.3996797800064087, "step": 496 }, { "epoch": 0.0155625, "grad_norm": 7.9375, "grad_norm_var": 0.3087076822916667, "learning_rate": 0.0001, "loss": 8.8924, "loss/crossentropy": 2.916568875312805, "loss/hidden": 2.3125, "loss/jsd": 0.0, "loss/logits": 0.3663354367017746, "step": 498 }, { "epoch": 0.015625, "grad_norm": 6.96875, "grad_norm_var": 0.2665201822916667, "learning_rate": 0.0001, "loss": 8.9003, "loss/crossentropy": 2.785780906677246, "loss/hidden": 2.421875, "loss/jsd": 0.0, "loss/logits": 0.36926528811454773, "step": 500 }, { "epoch": 0.0156875, "grad_norm": 7.15625, "grad_norm_var": 0.276025390625, "learning_rate": 0.0001, "loss": 8.8767, "loss/crossentropy": 2.8471392393112183, "loss/hidden": 2.3046875, "loss/jsd": 0.0, "loss/logits": 0.3724879324436188, "step": 502 }, { "epoch": 0.01575, "grad_norm": 6.15625, "grad_norm_var": 0.22040608723958333, "learning_rate": 0.0001, "loss": 8.8554, "loss/crossentropy": 2.868078351020813, "loss/hidden": 2.3203125, "loss/jsd": 0.0, "loss/logits": 0.36670316755771637, "step": 504 }, { "epoch": 0.0158125, "grad_norm": 8.375, "grad_norm_var": 0.3575358072916667, "learning_rate": 0.0001, "loss": 8.9408, "loss/crossentropy": 2.7880598306655884, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.3809019774198532, "step": 506 }, { "epoch": 0.015875, "grad_norm": 7.21875, "grad_norm_var": 0.380712890625, "learning_rate": 0.0001, "loss": 8.7033, "loss/crossentropy": 2.674094319343567, "loss/hidden": 2.3125, "loss/jsd": 0.0, "loss/logits": 0.37166814506053925, "step": 508 }, { "epoch": 0.0159375, "grad_norm": 6.28125, "grad_norm_var": 0.3999837239583333, "learning_rate": 0.0001, "loss": 8.4149, "loss/crossentropy": 2.6351935863494873, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.34984463453292847, "step": 510 }, { "epoch": 0.016, "grad_norm": 5.875, "grad_norm_var": 0.46190999348958334, "learning_rate": 0.0001, "loss": 8.6473, "loss/crossentropy": 2.8746068477630615, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.3491448760032654, "step": 512 }, { "epoch": 0.0160625, "grad_norm": 9.0625, "grad_norm_var": 0.7147420247395834, "learning_rate": 0.0001, "loss": 9.3762, "loss/crossentropy": 2.979753851890564, "loss/hidden": 2.4375, "loss/jsd": 0.0, "loss/logits": 0.39589935541152954, "step": 514 }, { "epoch": 0.016125, "grad_norm": 6.28125, "grad_norm_var": 0.73521728515625, "learning_rate": 0.0001, "loss": 8.8009, "loss/crossentropy": 2.7439844608306885, "loss/hidden": 2.3359375, "loss/jsd": 0.0, "loss/logits": 0.3720976561307907, "step": 516 }, { "epoch": 0.0161875, "grad_norm": 7.3125, "grad_norm_var": 0.7470703125, "learning_rate": 0.0001, "loss": 9.0876, "loss/crossentropy": 2.9311214685440063, "loss/hidden": 2.34375, "loss/jsd": 0.0, "loss/logits": 0.38126952946186066, "step": 518 }, { "epoch": 0.01625, "grad_norm": 6.71875, "grad_norm_var": 0.74068603515625, "learning_rate": 0.0001, "loss": 8.3644, "loss/crossentropy": 2.749508023262024, "loss/hidden": 2.296875, "loss/jsd": 0.0, "loss/logits": 0.331801176071167, "step": 520 }, { "epoch": 0.0163125, "grad_norm": 5.875, "grad_norm_var": 0.6261555989583333, "learning_rate": 0.0001, "loss": 8.3462, "loss/crossentropy": 2.638471841812134, "loss/hidden": 2.296875, "loss/jsd": 0.0, "loss/logits": 0.34108367562294006, "step": 522 }, { "epoch": 0.016375, "grad_norm": 6.4375, "grad_norm_var": 0.5886555989583333, "learning_rate": 0.0001, "loss": 8.7263, "loss/crossentropy": 2.772747278213501, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.3680117428302765, "step": 524 }, { "epoch": 0.0164375, "grad_norm": 7.8125, "grad_norm_var": 0.6463826497395834, "learning_rate": 0.0001, "loss": 8.424, "loss/crossentropy": 2.672973871231079, "loss/hidden": 2.3359375, "loss/jsd": 0.0, "loss/logits": 0.3415112644433975, "step": 526 }, { "epoch": 0.0165, "grad_norm": 6.03125, "grad_norm_var": 0.6551432291666667, "learning_rate": 0.0001, "loss": 8.9393, "loss/crossentropy": 2.907612681388855, "loss/hidden": 2.3828125, "loss/jsd": 0.0, "loss/logits": 0.36489084362983704, "step": 528 }, { "epoch": 0.0165625, "grad_norm": 7.78125, "grad_norm_var": 0.3875, "learning_rate": 0.0001, "loss": 8.8, "loss/crossentropy": 2.7299684286117554, "loss/hidden": 2.3828125, "loss/jsd": 0.0, "loss/logits": 0.3687261939048767, "step": 530 }, { "epoch": 0.016625, "grad_norm": 6.90625, "grad_norm_var": 0.36607666015625, "learning_rate": 0.0001, "loss": 8.7147, "loss/crossentropy": 2.771029829978943, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.36624111235141754, "step": 532 }, { "epoch": 0.0166875, "grad_norm": 7.03125, "grad_norm_var": 0.36376546223958334, "learning_rate": 0.0001, "loss": 8.9628, "loss/crossentropy": 2.915344715118408, "loss/hidden": 2.3515625, "loss/jsd": 0.0, "loss/logits": 0.36958499252796173, "step": 534 }, { "epoch": 0.01675, "grad_norm": 8.6875, "grad_norm_var": 0.5572224934895833, "learning_rate": 0.0001, "loss": 8.6004, "loss/crossentropy": 2.746902346611023, "loss/hidden": 2.3203125, "loss/jsd": 0.0, "loss/logits": 0.3533162772655487, "step": 536 }, { "epoch": 0.0168125, "grad_norm": 6.21875, "grad_norm_var": 0.5417805989583333, "learning_rate": 0.0001, "loss": 8.8733, "loss/crossentropy": 2.7585846185684204, "loss/hidden": 2.3515625, "loss/jsd": 0.0, "loss/logits": 0.37631259858608246, "step": 538 }, { "epoch": 0.016875, "grad_norm": 6.625, "grad_norm_var": 0.5559529622395833, "learning_rate": 0.0001, "loss": 8.5898, "loss/crossentropy": 2.757652521133423, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.35664936900138855, "step": 540 }, { "epoch": 0.0169375, "grad_norm": 6.40625, "grad_norm_var": 0.5765909830729167, "learning_rate": 0.0001, "loss": 8.5212, "loss/crossentropy": 2.701608419418335, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.35539373755455017, "step": 542 }, { "epoch": 0.017, "grad_norm": 7.40625, "grad_norm_var": 0.5002888997395833, "learning_rate": 0.0001, "loss": 8.5642, "loss/crossentropy": 2.677858829498291, "loss/hidden": 2.328125, "loss/jsd": 0.0, "loss/logits": 0.35582463443279266, "step": 544 }, { "epoch": 0.0170625, "grad_norm": 6.375, "grad_norm_var": 0.53140869140625, "learning_rate": 0.0001, "loss": 8.5848, "loss/crossentropy": 2.7882959842681885, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.3507450073957443, "step": 546 }, { "epoch": 0.017125, "grad_norm": 6.65625, "grad_norm_var": 0.5334269205729166, "learning_rate": 0.0001, "loss": 8.2292, "loss/crossentropy": 2.616737723350525, "loss/hidden": 2.3046875, "loss/jsd": 0.0, "loss/logits": 0.33077819645404816, "step": 548 }, { "epoch": 0.0171875, "grad_norm": 7.0625, "grad_norm_var": 0.5574503580729167, "learning_rate": 0.0001, "loss": 8.6128, "loss/crossentropy": 2.8342689275741577, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.351292222738266, "step": 550 }, { "epoch": 0.01725, "grad_norm": 6.375, "grad_norm_var": 0.29099934895833335, "learning_rate": 0.0001, "loss": 9.0425, "loss/crossentropy": 2.837349534034729, "loss/hidden": 2.3671875, "loss/jsd": 0.0, "loss/logits": 0.38379451632499695, "step": 552 }, { "epoch": 0.0173125, "grad_norm": 9.75, "grad_norm_var": 0.8583943684895833, "learning_rate": 0.0001, "loss": 8.8276, "loss/crossentropy": 2.689685583114624, "loss/hidden": 2.328125, "loss/jsd": 0.0, "loss/logits": 0.38097959756851196, "step": 554 }, { "epoch": 0.017375, "grad_norm": 6.78125, "grad_norm_var": 0.87076416015625, "learning_rate": 0.0001, "loss": 8.5501, "loss/crossentropy": 2.777572512626648, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.35068848729133606, "step": 556 }, { "epoch": 0.0174375, "grad_norm": 6.21875, "grad_norm_var": 0.8652628580729167, "learning_rate": 0.0001, "loss": 8.6392, "loss/crossentropy": 2.783918261528015, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.3566194474697113, "step": 558 }, { "epoch": 0.0175, "grad_norm": 6.1875, "grad_norm_var": 0.8312337239583333, "learning_rate": 0.0001, "loss": 8.3345, "loss/crossentropy": 2.6030431985855103, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.3465864509344101, "step": 560 }, { "epoch": 0.0175625, "grad_norm": 6.46875, "grad_norm_var": 0.80621337890625, "learning_rate": 0.0001, "loss": 9.2577, "loss/crossentropy": 2.9951740503311157, "loss/hidden": 2.3046875, "loss/jsd": 0.0, "loss/logits": 0.39578162133693695, "step": 562 }, { "epoch": 0.017625, "grad_norm": 6.46875, "grad_norm_var": 0.83209228515625, "learning_rate": 0.0001, "loss": 8.6354, "loss/crossentropy": 2.8244831562042236, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.35530561208724976, "step": 564 }, { "epoch": 0.0176875, "grad_norm": 6.6875, "grad_norm_var": 0.7983357747395833, "learning_rate": 0.0001, "loss": 8.4636, "loss/crossentropy": 2.7996731996536255, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.34061114490032196, "step": 566 }, { "epoch": 0.01775, "grad_norm": 5.96875, "grad_norm_var": 0.854296875, "learning_rate": 0.0001, "loss": 7.9341, "loss/crossentropy": 2.5139763355255127, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.3248262405395508, "step": 568 }, { "epoch": 0.0178125, "grad_norm": 6.0625, "grad_norm_var": 0.12476806640625, "learning_rate": 0.0001, "loss": 8.2548, "loss/crossentropy": 2.5873286724090576, "loss/hidden": 2.3203125, "loss/jsd": 0.0, "loss/logits": 0.3347187936306, "step": 570 }, { "epoch": 0.017875, "grad_norm": 6.96875, "grad_norm_var": 0.17823893229166668, "learning_rate": 0.0001, "loss": 8.2163, "loss/crossentropy": 2.6500922441482544, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.3292814791202545, "step": 572 }, { "epoch": 0.0179375, "grad_norm": 6.25, "grad_norm_var": 0.6076456705729166, "learning_rate": 0.0001, "loss": 8.0095, "loss/crossentropy": 2.4520283937454224, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.3377826511859894, "step": 574 }, { "epoch": 0.018, "grad_norm": 5.96875, "grad_norm_var": 0.6046712239583333, "learning_rate": 0.0001, "loss": 8.5973, "loss/crossentropy": 2.8165611028671265, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.35541336238384247, "step": 576 }, { "epoch": 0.0180625, "grad_norm": 5.71875, "grad_norm_var": 0.6470052083333333, "learning_rate": 0.0001, "loss": 8.5303, "loss/crossentropy": 2.7367520332336426, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.3551349341869354, "step": 578 }, { "epoch": 0.018125, "grad_norm": 7.71875, "grad_norm_var": 0.722900390625, "learning_rate": 0.0001, "loss": 8.5377, "loss/crossentropy": 2.64441180229187, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.3627614676952362, "step": 580 }, { "epoch": 0.0181875, "grad_norm": 6.375, "grad_norm_var": 1.022119140625, "learning_rate": 0.0001, "loss": 8.7801, "loss/crossentropy": 2.859044909477234, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.36319929361343384, "step": 582 }, { "epoch": 0.01825, "grad_norm": 6.5625, "grad_norm_var": 0.9454264322916667, "learning_rate": 0.0001, "loss": 8.5919, "loss/crossentropy": 2.662962317466736, "loss/hidden": 2.3046875, "loss/jsd": 0.0, "loss/logits": 0.3624245524406433, "step": 584 }, { "epoch": 0.0183125, "grad_norm": 7.59375, "grad_norm_var": 1.02203369140625, "learning_rate": 0.0001, "loss": 8.4608, "loss/crossentropy": 2.702337145805359, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.3500698506832123, "step": 586 }, { "epoch": 0.018375, "grad_norm": 5.96875, "grad_norm_var": 0.9475260416666667, "learning_rate": 0.0001, "loss": 8.4998, "loss/crossentropy": 2.7028015851974487, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.3570452928543091, "step": 588 }, { "epoch": 0.0184375, "grad_norm": 7.40625, "grad_norm_var": 0.7027303059895833, "learning_rate": 0.0001, "loss": 8.7391, "loss/crossentropy": 2.8359057903289795, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.36766134202480316, "step": 590 }, { "epoch": 0.0185, "grad_norm": 6.03125, "grad_norm_var": 0.7144816080729167, "learning_rate": 0.0001, "loss": 8.4026, "loss/crossentropy": 2.7504743337631226, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.34099745750427246, "step": 592 }, { "epoch": 0.0185625, "grad_norm": 7.21875, "grad_norm_var": 0.63756103515625, "learning_rate": 0.0001, "loss": 8.8234, "loss/crossentropy": 2.694290280342102, "loss/hidden": 2.3515625, "loss/jsd": 0.0, "loss/logits": 0.3777501881122589, "step": 594 }, { "epoch": 0.018625, "grad_norm": 6.21875, "grad_norm_var": 0.6261067708333333, "learning_rate": 0.0001, "loss": 8.3699, "loss/crossentropy": 2.754095435142517, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.3334520310163498, "step": 596 }, { "epoch": 0.0186875, "grad_norm": 5.9375, "grad_norm_var": 0.3947224934895833, "learning_rate": 0.0001, "loss": 8.3719, "loss/crossentropy": 2.728385329246521, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.3416992574930191, "step": 598 }, { "epoch": 0.01875, "grad_norm": 6.0, "grad_norm_var": 0.4166666666666667, "learning_rate": 0.0001, "loss": 8.6437, "loss/crossentropy": 2.7940341234207153, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.36308759450912476, "step": 600 }, { "epoch": 0.0188125, "grad_norm": 6.125, "grad_norm_var": 0.23287760416666667, "learning_rate": 0.0001, "loss": 8.5931, "loss/crossentropy": 2.888337016105652, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.3462534695863724, "step": 602 }, { "epoch": 0.018875, "grad_norm": 7.21875, "grad_norm_var": 0.30959879557291664, "learning_rate": 0.0001, "loss": 8.4772, "loss/crossentropy": 2.8515857458114624, "loss/hidden": 2.234375, "loss/jsd": 0.0, "loss/logits": 0.33912205696105957, "step": 604 }, { "epoch": 0.0189375, "grad_norm": 6.5, "grad_norm_var": 0.23274332682291668, "learning_rate": 0.0001, "loss": 8.2908, "loss/crossentropy": 2.711584448814392, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.33683064579963684, "step": 606 }, { "epoch": 0.019, "grad_norm": 6.3125, "grad_norm_var": 0.23814697265625, "learning_rate": 0.0001, "loss": 8.6234, "loss/crossentropy": 2.8399088382720947, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.35256920754909515, "step": 608 }, { "epoch": 0.0190625, "grad_norm": 7.34375, "grad_norm_var": 0.23062744140625, "learning_rate": 0.0001, "loss": 8.5065, "loss/crossentropy": 2.7695130109786987, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.3463588356971741, "step": 610 }, { "epoch": 0.019125, "grad_norm": 7.09375, "grad_norm_var": 0.2593587239583333, "learning_rate": 0.0001, "loss": 8.7198, "loss/crossentropy": 2.824345588684082, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.36141711473464966, "step": 612 }, { "epoch": 0.0191875, "grad_norm": 5.5, "grad_norm_var": 0.29983317057291664, "learning_rate": 0.0001, "loss": 8.6518, "loss/crossentropy": 3.0193722248077393, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.3351168632507324, "step": 614 }, { "epoch": 0.01925, "grad_norm": 6.875, "grad_norm_var": 0.35605061848958336, "learning_rate": 0.0001, "loss": 8.4348, "loss/crossentropy": 2.788522481918335, "loss/hidden": 2.234375, "loss/jsd": 0.0, "loss/logits": 0.3411922752857208, "step": 616 }, { "epoch": 0.0193125, "grad_norm": 6.75, "grad_norm_var": 0.3453125, "learning_rate": 0.0001, "loss": 8.7769, "loss/crossentropy": 2.954568028450012, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.353326752781868, "step": 618 }, { "epoch": 0.019375, "grad_norm": 6.1875, "grad_norm_var": 0.25028889973958335, "learning_rate": 0.0001, "loss": 8.2421, "loss/crossentropy": 2.671786308288574, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.3343764841556549, "step": 620 }, { "epoch": 0.0194375, "grad_norm": 6.15625, "grad_norm_var": 0.27727864583333334, "learning_rate": 0.0001, "loss": 8.3061, "loss/crossentropy": 2.692282795906067, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.34419384598731995, "step": 622 }, { "epoch": 0.0195, "grad_norm": 6.875, "grad_norm_var": 0.5027180989583333, "learning_rate": 0.0001, "loss": 7.9189, "loss/crossentropy": 2.3546725511550903, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.32986071705818176, "step": 624 }, { "epoch": 0.0195625, "grad_norm": 6.15625, "grad_norm_var": 0.47144775390625, "learning_rate": 0.0001, "loss": 8.5574, "loss/crossentropy": 2.7879234552383423, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.3503849357366562, "step": 626 }, { "epoch": 0.019625, "grad_norm": 6.34375, "grad_norm_var": 0.47261962890625, "learning_rate": 0.0001, "loss": 8.3498, "loss/crossentropy": 2.6622077226638794, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.3421996682882309, "step": 628 }, { "epoch": 0.0196875, "grad_norm": 6.625, "grad_norm_var": 0.42604166666666665, "learning_rate": 0.0001, "loss": 8.5607, "loss/crossentropy": 2.743830442428589, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.3598093241453171, "step": 630 }, { "epoch": 0.01975, "grad_norm": 5.84375, "grad_norm_var": 0.3856404622395833, "learning_rate": 0.0001, "loss": 8.3227, "loss/crossentropy": 2.563997983932495, "loss/hidden": 2.234375, "loss/jsd": 0.0, "loss/logits": 0.35243353247642517, "step": 632 }, { "epoch": 0.0198125, "grad_norm": 5.46875, "grad_norm_var": 0.4461588541666667, "learning_rate": 0.0001, "loss": 8.1279, "loss/crossentropy": 2.6363645792007446, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.32962509989738464, "step": 634 }, { "epoch": 0.019875, "grad_norm": 12.0, "grad_norm_var": 3.256363932291667, "learning_rate": 0.0001, "loss": 9.5295, "loss/crossentropy": 3.0127965211868286, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.4227666109800339, "step": 636 }, { "epoch": 0.0199375, "grad_norm": 6.5, "grad_norm_var": 3.189322916666667, "learning_rate": 0.0001, "loss": 7.9371, "loss/crossentropy": 2.4866052865982056, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.32317017018795013, "step": 638 }, { "epoch": 0.02, "grad_norm": 6.03125, "grad_norm_var": 3.162369791666667, "learning_rate": 0.0001, "loss": 7.8481, "loss/crossentropy": 2.403463363647461, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.3249284029006958, "step": 640 }, { "epoch": 0.0200625, "grad_norm": 6.875, "grad_norm_var": 3.123763020833333, "learning_rate": 0.0001, "loss": 9.3288, "loss/crossentropy": 2.9746265411376953, "loss/hidden": 2.296875, "loss/jsd": 0.0, "loss/logits": 0.40572839975357056, "step": 642 }, { "epoch": 0.020125, "grad_norm": 7.625, "grad_norm_var": 3.1727701822916665, "learning_rate": 0.0001, "loss": 8.5278, "loss/crossentropy": 2.7414538860321045, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.35675469040870667, "step": 644 }, { "epoch": 0.0201875, "grad_norm": 5.53125, "grad_norm_var": 3.2625, "learning_rate": 0.0001, "loss": 8.8497, "loss/crossentropy": 2.9117285013198853, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.3711456209421158, "step": 646 }, { "epoch": 0.02025, "grad_norm": 8.0625, "grad_norm_var": 3.316259765625, "learning_rate": 0.0001, "loss": 8.5108, "loss/crossentropy": 2.7149282693862915, "loss/hidden": 2.234375, "loss/jsd": 0.0, "loss/logits": 0.35615289211273193, "step": 648 }, { "epoch": 0.0203125, "grad_norm": 7.34375, "grad_norm_var": 3.121903483072917, "learning_rate": 0.0001, "loss": 8.6546, "loss/crossentropy": 2.6386566162109375, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.37424588203430176, "step": 650 }, { "epoch": 0.020375, "grad_norm": 5.46875, "grad_norm_var": 0.7382120768229167, "learning_rate": 0.0001, "loss": 8.4152, "loss/crossentropy": 2.574658155441284, "loss/hidden": 2.3125, "loss/jsd": 0.0, "loss/logits": 0.3528069108724594, "step": 652 }, { "epoch": 0.0204375, "grad_norm": 6.1875, "grad_norm_var": 0.7291951497395833, "learning_rate": 0.0001, "loss": 8.6135, "loss/crossentropy": 2.879882335662842, "loss/hidden": 2.25, "loss/jsd": 0.0, "loss/logits": 0.3483603298664093, "step": 654 }, { "epoch": 0.0205, "grad_norm": 6.5, "grad_norm_var": 0.6912760416666667, "learning_rate": 0.0001, "loss": 8.6924, "loss/crossentropy": 2.702815890312195, "loss/hidden": 2.2890625, "loss/jsd": 0.0, "loss/logits": 0.3700554370880127, "step": 656 }, { "epoch": 0.0205625, "grad_norm": 5.90625, "grad_norm_var": 0.755322265625, "learning_rate": 0.0001, "loss": 8.514, "loss/crossentropy": 2.626859426498413, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.3629360496997833, "step": 658 }, { "epoch": 0.020625, "grad_norm": 6.125, "grad_norm_var": 0.7024739583333334, "learning_rate": 0.0001, "loss": 8.7634, "loss/crossentropy": 2.8795636892318726, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.3657243996858597, "step": 660 }, { "epoch": 0.0206875, "grad_norm": 7.15625, "grad_norm_var": 0.5881144205729166, "learning_rate": 0.0001, "loss": 8.3306, "loss/crossentropy": 2.7441413402557373, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.34223654866218567, "step": 662 }, { "epoch": 0.02075, "grad_norm": 5.1875, "grad_norm_var": 0.5191691080729167, "learning_rate": 0.0001, "loss": 7.7663, "loss/crossentropy": 2.452701687812805, "loss/hidden": 2.1875, "loss/jsd": 0.0, "loss/logits": 0.31261467933654785, "step": 664 }, { "epoch": 0.0208125, "grad_norm": 5.75, "grad_norm_var": 0.38619384765625, "learning_rate": 0.0001, "loss": 8.093, "loss/crossentropy": 2.546747088432312, "loss/hidden": 2.25, "loss/jsd": 0.0, "loss/logits": 0.32962463796138763, "step": 666 }, { "epoch": 0.020875, "grad_norm": 5.96875, "grad_norm_var": 0.306640625, "learning_rate": 0.0001, "loss": 8.5068, "loss/crossentropy": 2.8044780492782593, "loss/hidden": 2.1875, "loss/jsd": 0.0, "loss/logits": 0.351484552025795, "step": 668 }, { "epoch": 0.0209375, "grad_norm": 6.78125, "grad_norm_var": 0.29547119140625, "learning_rate": 0.0001, "loss": 8.7689, "loss/crossentropy": 3.0153743028640747, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.34957103431224823, "step": 670 }, { "epoch": 0.021, "grad_norm": 5.75, "grad_norm_var": 0.29178059895833336, "learning_rate": 0.0001, "loss": 8.4564, "loss/crossentropy": 2.712641477584839, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.3564087748527527, "step": 672 }, { "epoch": 0.0210625, "grad_norm": 5.90625, "grad_norm_var": 0.22476806640625, "learning_rate": 0.0001, "loss": 8.5781, "loss/crossentropy": 2.984122633934021, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.3383050411939621, "step": 674 }, { "epoch": 0.021125, "grad_norm": 5.84375, "grad_norm_var": 0.24133707682291666, "learning_rate": 0.0001, "loss": 8.5681, "loss/crossentropy": 2.6959644556045532, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.3653368502855301, "step": 676 }, { "epoch": 0.0211875, "grad_norm": 6.75, "grad_norm_var": 0.21951497395833333, "learning_rate": 0.0001, "loss": 8.3572, "loss/crossentropy": 2.661711096763611, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.34142591059207916, "step": 678 }, { "epoch": 0.02125, "grad_norm": 6.96875, "grad_norm_var": 0.23346354166666666, "learning_rate": 0.0001, "loss": 8.1775, "loss/crossentropy": 2.5807024240493774, "loss/hidden": 2.25, "loss/jsd": 0.0, "loss/logits": 0.3346793055534363, "step": 680 }, { "epoch": 0.0213125, "grad_norm": 6.6875, "grad_norm_var": 0.34814046223958334, "learning_rate": 0.0001, "loss": 9.2846, "loss/crossentropy": 3.140147089958191, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.38788214325904846, "step": 682 }, { "epoch": 0.021375, "grad_norm": 6.6875, "grad_norm_var": 0.3236328125, "learning_rate": 0.0001, "loss": 8.1472, "loss/crossentropy": 2.5489360094070435, "loss/hidden": 2.25, "loss/jsd": 0.0, "loss/logits": 0.33482369780540466, "step": 684 }, { "epoch": 0.0214375, "grad_norm": 5.9375, "grad_norm_var": 0.33414306640625, "learning_rate": 0.0001, "loss": 8.3896, "loss/crossentropy": 2.6653178930282593, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.34508879482746124, "step": 686 }, { "epoch": 0.0215, "grad_norm": 6.875, "grad_norm_var": 0.31392822265625, "learning_rate": 0.0001, "loss": 8.701, "loss/crossentropy": 2.9853092432022095, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.35047847032546997, "step": 688 }, { "epoch": 0.0215625, "grad_norm": 5.59375, "grad_norm_var": 0.32591145833333335, "learning_rate": 0.0001, "loss": 8.1641, "loss/crossentropy": 2.6723328828811646, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.3280813992023468, "step": 690 }, { "epoch": 0.021625, "grad_norm": 6.46875, "grad_norm_var": 0.305078125, "learning_rate": 0.0001, "loss": 8.4814, "loss/crossentropy": 2.7639589309692383, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.34440168738365173, "step": 692 }, { "epoch": 0.0216875, "grad_norm": 7.40625, "grad_norm_var": 0.35523681640625, "learning_rate": 0.0001, "loss": 8.0945, "loss/crossentropy": 2.505578875541687, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.3331110030412674, "step": 694 }, { "epoch": 0.02175, "grad_norm": 5.6875, "grad_norm_var": 0.3683553059895833, "learning_rate": 0.0001, "loss": 8.3149, "loss/crossentropy": 2.799286961555481, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.336715966463089, "step": 696 }, { "epoch": 0.0218125, "grad_norm": 6.3125, "grad_norm_var": 0.2619099934895833, "learning_rate": 0.0001, "loss": 8.1849, "loss/crossentropy": 2.5903667211532593, "loss/hidden": 2.203125, "loss/jsd": 0.0, "loss/logits": 0.3391364514827728, "step": 698 }, { "epoch": 0.021875, "grad_norm": 6.125, "grad_norm_var": 0.2579427083333333, "learning_rate": 0.0001, "loss": 8.5062, "loss/crossentropy": 2.7039811611175537, "loss/hidden": 2.28125, "loss/jsd": 0.0, "loss/logits": 0.3520972728729248, "step": 700 }, { "epoch": 0.0219375, "grad_norm": 10.375, "grad_norm_var": 1.3748982747395833, "learning_rate": 0.0001, "loss": 8.151, "loss/crossentropy": 2.5421234369277954, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.3444819301366806, "step": 702 }, { "epoch": 0.022, "grad_norm": 5.40625, "grad_norm_var": 1.465625, "learning_rate": 0.0001, "loss": 7.8739, "loss/crossentropy": 2.482094407081604, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.32745902240276337, "step": 704 }, { "epoch": 0.0220625, "grad_norm": 7.21875, "grad_norm_var": 1.5061848958333333, "learning_rate": 0.0001, "loss": 8.6827, "loss/crossentropy": 2.8615206480026245, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.3610275834798813, "step": 706 }, { "epoch": 0.022125, "grad_norm": 5.875, "grad_norm_var": 1.5166951497395833, "learning_rate": 0.0001, "loss": 8.1118, "loss/crossentropy": 2.5979604721069336, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.339664563536644, "step": 708 }, { "epoch": 0.0221875, "grad_norm": 7.125, "grad_norm_var": 1.48492431640625, "learning_rate": 0.0001, "loss": 8.3526, "loss/crossentropy": 2.7990275621414185, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.3373870253562927, "step": 710 }, { "epoch": 0.02225, "grad_norm": 6.0, "grad_norm_var": 1.4725545247395833, "learning_rate": 0.0001, "loss": 8.3694, "loss/crossentropy": 2.773742437362671, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.3400302082300186, "step": 712 }, { "epoch": 0.0223125, "grad_norm": 5.75, "grad_norm_var": 1.4877237955729166, "learning_rate": 0.0001, "loss": 8.5224, "loss/crossentropy": 2.9048824310302734, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.3406597226858139, "step": 714 }, { "epoch": 0.022375, "grad_norm": 6.21875, "grad_norm_var": 1.4764933268229166, "learning_rate": 0.0001, "loss": 8.2475, "loss/crossentropy": 2.597532033920288, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.3431176245212555, "step": 716 }, { "epoch": 0.0224375, "grad_norm": 5.6875, "grad_norm_var": 0.3088175455729167, "learning_rate": 0.0001, "loss": 7.8735, "loss/crossentropy": 2.5067347288131714, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.3210519254207611, "step": 718 }, { "epoch": 0.0225, "grad_norm": 6.5, "grad_norm_var": 0.29811197916666665, "learning_rate": 0.0001, "loss": 8.3045, "loss/crossentropy": 2.7840747833251953, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.3340781629085541, "step": 720 }, { "epoch": 0.0225625, "grad_norm": 6.9375, "grad_norm_var": 0.461962890625, "learning_rate": 0.0001, "loss": 8.5005, "loss/crossentropy": 2.7896593809127808, "loss/hidden": 2.265625, "loss/jsd": 0.0, "loss/logits": 0.34451836347579956, "step": 722 }, { "epoch": 0.022625, "grad_norm": 7.53125, "grad_norm_var": 0.5669108072916667, "learning_rate": 0.0001, "loss": 8.2398, "loss/crossentropy": 2.7475579977035522, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.3343813419342041, "step": 724 }, { "epoch": 0.0226875, "grad_norm": 6.75, "grad_norm_var": 0.4945271809895833, "learning_rate": 0.0001, "loss": 8.5406, "loss/crossentropy": 2.7411571741104126, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.35806816816329956, "step": 726 }, { "epoch": 0.02275, "grad_norm": 6.40625, "grad_norm_var": 0.47711181640625, "learning_rate": 0.0001, "loss": 8.5912, "loss/crossentropy": 2.766042709350586, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.36298026144504547, "step": 728 }, { "epoch": 0.0228125, "grad_norm": 6.34375, "grad_norm_var": 0.4973958333333333, "learning_rate": 0.0001, "loss": 7.9852, "loss/crossentropy": 2.5189281702041626, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.33256661891937256, "step": 730 }, { "epoch": 0.022875, "grad_norm": 9.9375, "grad_norm_var": 1.2870402018229166, "learning_rate": 0.0001, "loss": 8.3075, "loss/crossentropy": 2.7226722240448, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.33426010608673096, "step": 732 }, { "epoch": 0.0229375, "grad_norm": 5.625, "grad_norm_var": 1.3407389322916667, "learning_rate": 0.0001, "loss": 8.4679, "loss/crossentropy": 2.7558926343917847, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.3547917455434799, "step": 734 }, { "epoch": 0.023, "grad_norm": 6.34375, "grad_norm_var": 1.37720947265625, "learning_rate": 0.0001, "loss": 8.9548, "loss/crossentropy": 2.8543918132781982, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.3858264982700348, "step": 736 }, { "epoch": 0.0230625, "grad_norm": 6.125, "grad_norm_var": 1.3258951822916667, "learning_rate": 0.0001, "loss": 8.4039, "loss/crossentropy": 2.701572895050049, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.3491367846727371, "step": 738 }, { "epoch": 0.023125, "grad_norm": 5.46875, "grad_norm_var": 1.3380818684895834, "learning_rate": 0.0001, "loss": 8.2909, "loss/crossentropy": 2.7676188945770264, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.3367016613483429, "step": 740 }, { "epoch": 0.0231875, "grad_norm": 6.25, "grad_norm_var": 1.4073527018229166, "learning_rate": 0.0001, "loss": 7.9941, "loss/crossentropy": 2.589860200881958, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.32557548582553864, "step": 742 }, { "epoch": 0.02325, "grad_norm": 5.59375, "grad_norm_var": 1.5048787434895834, "learning_rate": 0.0001, "loss": 8.1793, "loss/crossentropy": 2.689063787460327, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.3365195244550705, "step": 744 }, { "epoch": 0.0233125, "grad_norm": 6.5, "grad_norm_var": 1.434619140625, "learning_rate": 0.0001, "loss": 8.2082, "loss/crossentropy": 2.6279042959213257, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.3338083028793335, "step": 746 }, { "epoch": 0.023375, "grad_norm": 5.375, "grad_norm_var": 0.6980305989583333, "learning_rate": 0.0001, "loss": 8.2409, "loss/crossentropy": 2.7593994140625, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.33018162846565247, "step": 748 }, { "epoch": 0.0234375, "grad_norm": 7.34375, "grad_norm_var": 0.6168253580729167, "learning_rate": 0.0001, "loss": 8.6918, "loss/crossentropy": 2.853471040725708, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.361956387758255, "step": 750 }, { "epoch": 0.0235, "grad_norm": 5.21875, "grad_norm_var": 0.31187744140625, "learning_rate": 0.0001, "loss": 7.8511, "loss/crossentropy": 2.6204041242599487, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.3121359795331955, "step": 752 }, { "epoch": 0.0235625, "grad_norm": 6.03125, "grad_norm_var": 0.308447265625, "learning_rate": 0.0001, "loss": 8.0742, "loss/crossentropy": 2.605458617210388, "loss/hidden": 2.1875, "loss/jsd": 0.0, "loss/logits": 0.3281271606683731, "step": 754 }, { "epoch": 0.023625, "grad_norm": 6.46875, "grad_norm_var": 0.30725504557291666, "learning_rate": 0.0001, "loss": 8.6584, "loss/crossentropy": 2.7429229021072388, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.37045322358608246, "step": 756 }, { "epoch": 0.0236875, "grad_norm": 6.0625, "grad_norm_var": 0.29225260416666665, "learning_rate": 0.0001, "loss": 8.1709, "loss/crossentropy": 2.6542168855667114, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.33917176723480225, "step": 758 }, { "epoch": 0.02375, "grad_norm": 5.71875, "grad_norm_var": 0.287890625, "learning_rate": 0.0001, "loss": 8.4722, "loss/crossentropy": 2.7812271118164062, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.349564865231514, "step": 760 }, { "epoch": 0.0238125, "grad_norm": 5.96875, "grad_norm_var": 0.27245686848958334, "learning_rate": 0.0001, "loss": 8.1857, "loss/crossentropy": 2.6698429584503174, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.33596329391002655, "step": 762 }, { "epoch": 0.023875, "grad_norm": 6.25, "grad_norm_var": 0.264306640625, "learning_rate": 0.0001, "loss": 8.2672, "loss/crossentropy": 2.8197702169418335, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.3306830823421478, "step": 764 }, { "epoch": 0.0239375, "grad_norm": 6.53125, "grad_norm_var": 0.14451497395833332, "learning_rate": 0.0001, "loss": 8.1106, "loss/crossentropy": 2.594325065612793, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.3320969194173813, "step": 766 }, { "epoch": 0.024, "grad_norm": 6.8125, "grad_norm_var": 0.18153889973958334, "learning_rate": 0.0001, "loss": 8.8065, "loss/crossentropy": 2.9510093927383423, "loss/hidden": 2.2265625, "loss/jsd": 0.0, "loss/logits": 0.36289557814598083, "step": 768 }, { "epoch": 0.0240625, "grad_norm": 5.5625, "grad_norm_var": 0.15584309895833334, "learning_rate": 0.0001, "loss": 8.0488, "loss/crossentropy": 2.677095890045166, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3270166665315628, "step": 770 }, { "epoch": 0.024125, "grad_norm": 5.4375, "grad_norm_var": 0.16698811848958334, "learning_rate": 0.0001, "loss": 8.2652, "loss/crossentropy": 2.7432464361190796, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.3373481184244156, "step": 772 }, { "epoch": 0.0241875, "grad_norm": 5.71875, "grad_norm_var": 0.20480143229166667, "learning_rate": 0.0001, "loss": 8.045, "loss/crossentropy": 2.7736769914627075, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.31229038536548615, "step": 774 }, { "epoch": 0.02425, "grad_norm": 5.53125, "grad_norm_var": 0.21330973307291667, "learning_rate": 0.0001, "loss": 7.8055, "loss/crossentropy": 2.4817134141921997, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.3206585496664047, "step": 776 }, { "epoch": 0.0243125, "grad_norm": 5.125, "grad_norm_var": 0.25689697265625, "learning_rate": 0.0001, "loss": 7.8784, "loss/crossentropy": 2.6201757192611694, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3156616538763046, "step": 778 }, { "epoch": 0.024375, "grad_norm": 5.3125, "grad_norm_var": 0.2800089518229167, "learning_rate": 0.0001, "loss": 8.0161, "loss/crossentropy": 2.720884919166565, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.320142462849617, "step": 780 }, { "epoch": 0.0244375, "grad_norm": 7.125, "grad_norm_var": 0.38752848307291665, "learning_rate": 0.0001, "loss": 8.5161, "loss/crossentropy": 2.83653724193573, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.34842240810394287, "step": 782 }, { "epoch": 0.0245, "grad_norm": 8.1875, "grad_norm_var": 0.666650390625, "learning_rate": 0.0001, "loss": 8.2143, "loss/crossentropy": 2.7162340879440308, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.3326167166233063, "step": 784 }, { "epoch": 0.0245625, "grad_norm": 6.03125, "grad_norm_var": 0.657275390625, "learning_rate": 0.0001, "loss": 8.0563, "loss/crossentropy": 2.6525591611862183, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.32709620893001556, "step": 786 }, { "epoch": 0.024625, "grad_norm": 5.71875, "grad_norm_var": 0.6537109375, "learning_rate": 0.0001, "loss": 8.2529, "loss/crossentropy": 2.7945055961608887, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.3333374410867691, "step": 788 }, { "epoch": 0.0246875, "grad_norm": 6.34375, "grad_norm_var": 0.6299112955729167, "learning_rate": 0.0001, "loss": 8.3067, "loss/crossentropy": 2.855311632156372, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.3295145481824875, "step": 790 }, { "epoch": 0.02475, "grad_norm": 6.65625, "grad_norm_var": 0.6583292643229167, "learning_rate": 0.0001, "loss": 8.5236, "loss/crossentropy": 2.885672926902771, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.34895357489585876, "step": 792 }, { "epoch": 0.0248125, "grad_norm": 6.125, "grad_norm_var": 0.58814697265625, "learning_rate": 0.0001, "loss": 8.077, "loss/crossentropy": 2.6696921586990356, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.327450156211853, "step": 794 }, { "epoch": 0.024875, "grad_norm": 5.3125, "grad_norm_var": 0.56900634765625, "learning_rate": 0.0001, "loss": 8.3296, "loss/crossentropy": 2.806466221809387, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.34059543907642365, "step": 796 }, { "epoch": 0.0249375, "grad_norm": 5.6875, "grad_norm_var": 1.035400390625, "learning_rate": 0.0001, "loss": 8.0681, "loss/crossentropy": 2.5905505418777466, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.33134762942790985, "step": 798 }, { "epoch": 0.025, "grad_norm": 6.28125, "grad_norm_var": 0.76422119140625, "learning_rate": 0.0001, "loss": 8.2224, "loss/crossentropy": 2.6438721418380737, "loss/hidden": 2.1875, "loss/jsd": 0.0, "loss/logits": 0.339097797870636, "step": 800 }, { "epoch": 0.0250625, "grad_norm": 5.6875, "grad_norm_var": 0.779931640625, "learning_rate": 0.0001, "loss": 7.9234, "loss/crossentropy": 2.5964959859848022, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.31862902641296387, "step": 802 }, { "epoch": 0.025125, "grad_norm": 6.21875, "grad_norm_var": 0.7956013997395833, "learning_rate": 0.0001, "loss": 8.1019, "loss/crossentropy": 2.6731587648391724, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.32959243655204773, "step": 804 }, { "epoch": 0.0251875, "grad_norm": 7.21875, "grad_norm_var": 0.8546223958333333, "learning_rate": 0.0001, "loss": 8.605, "loss/crossentropy": 2.872212290763855, "loss/hidden": 2.2109375, "loss/jsd": 0.0, "loss/logits": 0.352187842130661, "step": 806 }, { "epoch": 0.02525, "grad_norm": 6.375, "grad_norm_var": 0.8997395833333334, "learning_rate": 0.0001, "loss": 8.2991, "loss/crossentropy": 2.5583417415618896, "loss/hidden": 2.203125, "loss/jsd": 0.0, "loss/logits": 0.35376642644405365, "step": 808 }, { "epoch": 0.0253125, "grad_norm": 6.25, "grad_norm_var": 0.8658854166666666, "learning_rate": 0.0001, "loss": 8.3245, "loss/crossentropy": 2.7695552110671997, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.3383059650659561, "step": 810 }, { "epoch": 0.025375, "grad_norm": 6.28125, "grad_norm_var": 0.7844889322916667, "learning_rate": 0.0001, "loss": 8.36, "loss/crossentropy": 2.7573885917663574, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.3438550680875778, "step": 812 }, { "epoch": 0.0254375, "grad_norm": 5.46875, "grad_norm_var": 0.2918904622395833, "learning_rate": 0.0001, "loss": 7.7418, "loss/crossentropy": 2.553446650505066, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.30242519080638885, "step": 814 }, { "epoch": 0.0255, "grad_norm": 13.8125, "grad_norm_var": 3.9885050455729165, "learning_rate": 0.0001, "loss": 8.9974, "loss/crossentropy": 2.893692135810852, "loss/hidden": 2.2578125, "loss/jsd": 0.0, "loss/logits": 0.3845931142568588, "step": 816 }, { "epoch": 0.0255625, "grad_norm": 6.40625, "grad_norm_var": 3.9234212239583335, "learning_rate": 0.0001, "loss": 8.7496, "loss/crossentropy": 2.876673936843872, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.36541396379470825, "step": 818 }, { "epoch": 0.025625, "grad_norm": 7.28125, "grad_norm_var": 3.7567545572916665, "learning_rate": 0.0001, "loss": 8.6539, "loss/crossentropy": 2.7950828075408936, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.36635273694992065, "step": 820 }, { "epoch": 0.0256875, "grad_norm": 5.40625, "grad_norm_var": 3.863541666666667, "learning_rate": 0.0001, "loss": 8.036, "loss/crossentropy": 2.569133162498474, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.3326212763786316, "step": 822 }, { "epoch": 0.02575, "grad_norm": 5.75, "grad_norm_var": 4.04976806640625, "learning_rate": 0.0001, "loss": 8.4846, "loss/crossentropy": 2.9390757083892822, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.33267849683761597, "step": 824 }, { "epoch": 0.0258125, "grad_norm": 6.03125, "grad_norm_var": 4.074898274739583, "learning_rate": 0.0001, "loss": 8.1337, "loss/crossentropy": 2.7280073165893555, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.32494574785232544, "step": 826 }, { "epoch": 0.025875, "grad_norm": 6.3125, "grad_norm_var": 4.099833170572917, "learning_rate": 0.0001, "loss": 7.9933, "loss/crossentropy": 2.6511049270629883, "loss/hidden": 2.1875, "loss/jsd": 0.0, "loss/logits": 0.31547415256500244, "step": 828 }, { "epoch": 0.0259375, "grad_norm": 5.5625, "grad_norm_var": 4.104410807291667, "learning_rate": 0.0001, "loss": 8.1625, "loss/crossentropy": 2.761909246444702, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.3345913887023926, "step": 830 }, { "epoch": 0.026, "grad_norm": 5.5625, "grad_norm_var": 0.4061197916666667, "learning_rate": 0.0001, "loss": 7.8933, "loss/crossentropy": 2.57563316822052, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.3200434446334839, "step": 832 }, { "epoch": 0.0260625, "grad_norm": 5.28125, "grad_norm_var": 0.3856404622395833, "learning_rate": 0.0001, "loss": 7.7928, "loss/crossentropy": 2.5671942234039307, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.3155245780944824, "step": 834 }, { "epoch": 0.026125, "grad_norm": 5.1875, "grad_norm_var": 0.23222249348958332, "learning_rate": 0.0001, "loss": 7.8599, "loss/crossentropy": 2.665991187095642, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3115830421447754, "step": 836 }, { "epoch": 0.0261875, "grad_norm": 5.40625, "grad_norm_var": 0.11620686848958334, "learning_rate": 0.0001, "loss": 8.0253, "loss/crossentropy": 2.748578667640686, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.3167339563369751, "step": 838 }, { "epoch": 0.02625, "grad_norm": 5.34375, "grad_norm_var": 0.126416015625, "learning_rate": 0.0001, "loss": 7.9042, "loss/crossentropy": 2.693682074546814, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.31089678406715393, "step": 840 }, { "epoch": 0.0263125, "grad_norm": 6.34375, "grad_norm_var": 0.15662434895833333, "learning_rate": 0.0001, "loss": 8.5614, "loss/crossentropy": 2.8782442808151245, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.35191184282302856, "step": 842 }, { "epoch": 0.026375, "grad_norm": 6.46875, "grad_norm_var": 0.17574462890625, "learning_rate": 0.0001, "loss": 8.2453, "loss/crossentropy": 2.783571243286133, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.3313324302434921, "step": 844 }, { "epoch": 0.0264375, "grad_norm": 5.6875, "grad_norm_var": 0.16897379557291667, "learning_rate": 0.0001, "loss": 7.8925, "loss/crossentropy": 2.6488534212112427, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3165491074323654, "step": 846 }, { "epoch": 0.0265, "grad_norm": 5.71875, "grad_norm_var": 0.15937093098958333, "learning_rate": 0.0001, "loss": 8.2695, "loss/crossentropy": 2.808097720146179, "loss/hidden": 2.1484375, "loss/jsd": 0.0, "loss/logits": 0.33130063116550446, "step": 848 }, { "epoch": 0.0265625, "grad_norm": 5.40625, "grad_norm_var": 0.14685872395833333, "learning_rate": 0.0001, "loss": 8.1955, "loss/crossentropy": 2.867979645729065, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.3218100666999817, "step": 850 }, { "epoch": 0.026625, "grad_norm": 5.6875, "grad_norm_var": 0.134619140625, "learning_rate": 0.0001, "loss": 8.4039, "loss/crossentropy": 2.8698208332061768, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.3409119248390198, "step": 852 }, { "epoch": 0.0266875, "grad_norm": 6.53125, "grad_norm_var": 0.18111979166666667, "learning_rate": 0.0001, "loss": 8.0889, "loss/crossentropy": 2.7214276790618896, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.32424378395080566, "step": 854 }, { "epoch": 0.02675, "grad_norm": 6.125, "grad_norm_var": 0.17069905598958332, "learning_rate": 0.0001, "loss": 8.3495, "loss/crossentropy": 2.6523544788360596, "loss/hidden": 2.2734375, "loss/jsd": 0.0, "loss/logits": 0.34237538278102875, "step": 856 }, { "epoch": 0.0268125, "grad_norm": 6.25, "grad_norm_var": 0.18821614583333332, "learning_rate": 0.0001, "loss": 8.1074, "loss/crossentropy": 2.735729455947876, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.3285723030567169, "step": 858 }, { "epoch": 0.026875, "grad_norm": 5.03125, "grad_norm_var": 0.17589518229166667, "learning_rate": 0.0001, "loss": 7.7916, "loss/crossentropy": 2.623154044151306, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.3113795071840286, "step": 860 }, { "epoch": 0.0269375, "grad_norm": 5.65625, "grad_norm_var": 0.175244140625, "learning_rate": 0.0001, "loss": 8.0075, "loss/crossentropy": 2.6751527786254883, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.3199501931667328, "step": 862 }, { "epoch": 0.027, "grad_norm": 6.125, "grad_norm_var": 0.16978759765625, "learning_rate": 0.0001, "loss": 8.3152, "loss/crossentropy": 2.830706477165222, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.331265926361084, "step": 864 }, { "epoch": 0.0270625, "grad_norm": 5.0625, "grad_norm_var": 0.21604410807291666, "learning_rate": 0.0001, "loss": 7.6622, "loss/crossentropy": 2.5602376461029053, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.3070688992738724, "step": 866 }, { "epoch": 0.027125, "grad_norm": 5.21875, "grad_norm_var": 0.23385416666666667, "learning_rate": 0.0001, "loss": 7.5858, "loss/crossentropy": 2.4632397890090942, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.30210280418395996, "step": 868 }, { "epoch": 0.0271875, "grad_norm": 5.40625, "grad_norm_var": 0.19134114583333334, "learning_rate": 0.0001, "loss": 7.9338, "loss/crossentropy": 2.7243661880493164, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3131289482116699, "step": 870 }, { "epoch": 0.02725, "grad_norm": 6.46875, "grad_norm_var": 0.19524332682291667, "learning_rate": 0.0001, "loss": 8.6795, "loss/crossentropy": 3.0478591918945312, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.3490995764732361, "step": 872 }, { "epoch": 0.0273125, "grad_norm": 5.78125, "grad_norm_var": 0.16415608723958333, "learning_rate": 0.0001, "loss": 8.155, "loss/crossentropy": 2.7137235403060913, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.3347555547952652, "step": 874 }, { "epoch": 0.027375, "grad_norm": 5.09375, "grad_norm_var": 0.1630859375, "learning_rate": 0.0001, "loss": 7.9233, "loss/crossentropy": 2.6758487224578857, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.31693698465824127, "step": 876 }, { "epoch": 0.0274375, "grad_norm": 6.1875, "grad_norm_var": 0.17623697916666667, "learning_rate": 0.0001, "loss": 7.9173, "loss/crossentropy": 2.528697967529297, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.3302699476480484, "step": 878 }, { "epoch": 0.0275, "grad_norm": 5.875, "grad_norm_var": 0.28787434895833336, "learning_rate": 0.0001, "loss": 8.5773, "loss/crossentropy": 2.8210952281951904, "loss/hidden": 2.2421875, "loss/jsd": 0.0, "loss/logits": 0.35140474140644073, "step": 880 }, { "epoch": 0.0275625, "grad_norm": 5.40625, "grad_norm_var": 0.2482421875, "learning_rate": 0.0001, "loss": 8.0685, "loss/crossentropy": 2.7750041484832764, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3215404748916626, "step": 882 }, { "epoch": 0.027625, "grad_norm": 5.46875, "grad_norm_var": 0.23173421223958332, "learning_rate": 0.0001, "loss": 7.9534, "loss/crossentropy": 2.611912250518799, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.32476896047592163, "step": 884 }, { "epoch": 0.0276875, "grad_norm": 4.90625, "grad_norm_var": 0.2774739583333333, "learning_rate": 0.0001, "loss": 7.6661, "loss/crossentropy": 2.6392383575439453, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.2940940260887146, "step": 886 }, { "epoch": 0.02775, "grad_norm": 6.625, "grad_norm_var": 0.2930989583333333, "learning_rate": 0.0001, "loss": 8.6536, "loss/crossentropy": 3.054406762123108, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.34351395070552826, "step": 888 }, { "epoch": 0.0278125, "grad_norm": 5.40625, "grad_norm_var": 0.304541015625, "learning_rate": 0.0001, "loss": 7.8948, "loss/crossentropy": 2.5886287689208984, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3228069841861725, "step": 890 }, { "epoch": 0.027875, "grad_norm": 5.75, "grad_norm_var": 0.27063395182291666, "learning_rate": 0.0001, "loss": 7.9995, "loss/crossentropy": 2.773493528366089, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3124459385871887, "step": 892 }, { "epoch": 0.0279375, "grad_norm": 5.3125, "grad_norm_var": 0.273681640625, "learning_rate": 0.0001, "loss": 8.0724, "loss/crossentropy": 2.750393271446228, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3243854194879532, "step": 894 }, { "epoch": 0.028, "grad_norm": 6.15625, "grad_norm_var": 0.17213134765625, "learning_rate": 0.0001, "loss": 7.6506, "loss/crossentropy": 2.4986928701400757, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.30815713107585907, "step": 896 }, { "epoch": 0.0280625, "grad_norm": 5.46875, "grad_norm_var": 0.19250895182291666, "learning_rate": 0.0001, "loss": 7.845, "loss/crossentropy": 2.671201229095459, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3126888573169708, "step": 898 }, { "epoch": 0.028125, "grad_norm": 6.59375, "grad_norm_var": 0.25100504557291664, "learning_rate": 0.0001, "loss": 8.2131, "loss/crossentropy": 2.8476167917251587, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3287386894226074, "step": 900 }, { "epoch": 0.0281875, "grad_norm": 5.1875, "grad_norm_var": 0.24221598307291667, "learning_rate": 0.0001, "loss": 7.9984, "loss/crossentropy": 2.7201178073883057, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.31376519799232483, "step": 902 }, { "epoch": 0.02825, "grad_norm": 5.6875, "grad_norm_var": 0.22470296223958333, "learning_rate": 0.0001, "loss": 7.7041, "loss/crossentropy": 2.569111943244934, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.3080315589904785, "step": 904 }, { "epoch": 0.0283125, "grad_norm": 5.28125, "grad_norm_var": 0.3753743489583333, "learning_rate": 0.0001, "loss": 8.047, "loss/crossentropy": 2.642240047454834, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.3279738128185272, "step": 906 }, { "epoch": 0.028375, "grad_norm": 6.03125, "grad_norm_var": 0.38293863932291666, "learning_rate": 0.0001, "loss": 7.88, "loss/crossentropy": 2.5164895057678223, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.32775919139385223, "step": 908 }, { "epoch": 0.0284375, "grad_norm": 5.53125, "grad_norm_var": 0.38437093098958336, "learning_rate": 0.0001, "loss": 8.5437, "loss/crossentropy": 2.972300410270691, "loss/hidden": 2.1796875, "loss/jsd": 0.0, "loss/logits": 0.33917422592639923, "step": 910 }, { "epoch": 0.0285, "grad_norm": 5.09375, "grad_norm_var": 0.390625, "learning_rate": 0.0001, "loss": 7.7213, "loss/crossentropy": 2.5361626148223877, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.30757124722003937, "step": 912 }, { "epoch": 0.0285625, "grad_norm": 9.375, "grad_norm_var": 1.1869425455729166, "learning_rate": 0.0001, "loss": 8.2299, "loss/crossentropy": 2.709487557411194, "loss/hidden": 2.21875, "loss/jsd": 0.0, "loss/logits": 0.3301650881767273, "step": 914 }, { "epoch": 0.028625, "grad_norm": 6.5625, "grad_norm_var": 1.1613118489583334, "learning_rate": 0.0001, "loss": 7.923, "loss/crossentropy": 2.6102263927459717, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.32190655171871185, "step": 916 }, { "epoch": 0.0286875, "grad_norm": 6.875, "grad_norm_var": 1.3302042643229166, "learning_rate": 0.0001, "loss": 8.4869, "loss/crossentropy": 2.9106889963150024, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.3435541093349457, "step": 918 }, { "epoch": 0.02875, "grad_norm": 5.375, "grad_norm_var": 1.285009765625, "learning_rate": 0.0001, "loss": 8.1751, "loss/crossentropy": 2.721361994743347, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.33287379145622253, "step": 920 }, { "epoch": 0.0288125, "grad_norm": 6.09375, "grad_norm_var": 1.2374837239583334, "learning_rate": 0.0001, "loss": 8.1547, "loss/crossentropy": 2.76972234249115, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.32990050315856934, "step": 922 }, { "epoch": 0.028875, "grad_norm": 5.75, "grad_norm_var": 1.2248982747395833, "learning_rate": 0.0001, "loss": 8.235, "loss/crossentropy": 2.905160665512085, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.32516761124134064, "step": 924 }, { "epoch": 0.0289375, "grad_norm": 5.75, "grad_norm_var": 1.21920166015625, "learning_rate": 0.0001, "loss": 7.8167, "loss/crossentropy": 2.572208523750305, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.3150770515203476, "step": 926 }, { "epoch": 0.029, "grad_norm": 6.0, "grad_norm_var": 1.0982381184895833, "learning_rate": 0.0001, "loss": 7.6862, "loss/crossentropy": 2.576735019683838, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3031381666660309, "step": 928 }, { "epoch": 0.0290625, "grad_norm": 5.28125, "grad_norm_var": 0.45636393229166666, "learning_rate": 0.0001, "loss": 7.9539, "loss/crossentropy": 2.7378779649734497, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.3122238516807556, "step": 930 }, { "epoch": 0.029125, "grad_norm": 5.65625, "grad_norm_var": 0.47415364583333336, "learning_rate": 0.0001, "loss": 7.7871, "loss/crossentropy": 2.664810299873352, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.3130109906196594, "step": 932 }, { "epoch": 0.0291875, "grad_norm": 5.46875, "grad_norm_var": 0.12987874348958334, "learning_rate": 0.0001, "loss": 8.0233, "loss/crossentropy": 2.762031674385071, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.31909996271133423, "step": 934 }, { "epoch": 0.02925, "grad_norm": 11.125, "grad_norm_var": 1.95758056640625, "learning_rate": 0.0001, "loss": 8.1688, "loss/crossentropy": 2.6960248947143555, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.3339923918247223, "step": 936 }, { "epoch": 0.0293125, "grad_norm": 5.84375, "grad_norm_var": 1.9125651041666667, "learning_rate": 0.0001, "loss": 7.8335, "loss/crossentropy": 2.756159782409668, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.30227015912532806, "step": 938 }, { "epoch": 0.029375, "grad_norm": 5.53125, "grad_norm_var": 1.9476847330729166, "learning_rate": 0.0001, "loss": 7.8498, "loss/crossentropy": 2.552929639816284, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.3179690092802048, "step": 940 }, { "epoch": 0.0294375, "grad_norm": 6.0625, "grad_norm_var": 1.9318644205729167, "learning_rate": 0.0001, "loss": 8.2395, "loss/crossentropy": 2.7607001066207886, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.3361617773771286, "step": 942 }, { "epoch": 0.0295, "grad_norm": 5.71875, "grad_norm_var": 1.9405232747395833, "learning_rate": 0.0001, "loss": 7.8254, "loss/crossentropy": 2.6775211095809937, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.3077540248632431, "step": 944 }, { "epoch": 0.0295625, "grad_norm": 7.34375, "grad_norm_var": 2.0398274739583333, "learning_rate": 0.0001, "loss": 7.93, "loss/crossentropy": 2.6490660905838013, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.3210662305355072, "step": 946 }, { "epoch": 0.029625, "grad_norm": 5.375, "grad_norm_var": 2.0617024739583334, "learning_rate": 0.0001, "loss": 7.4648, "loss/crossentropy": 2.5306503772735596, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.29341667890548706, "step": 948 }, { "epoch": 0.0296875, "grad_norm": 5.3125, "grad_norm_var": 2.0951171875, "learning_rate": 0.0001, "loss": 7.4159, "loss/crossentropy": 2.4712361097335815, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.28509533405303955, "step": 950 }, { "epoch": 0.02975, "grad_norm": 5.375, "grad_norm_var": 0.3578084309895833, "learning_rate": 0.0001, "loss": 7.9328, "loss/crossentropy": 2.6684658527374268, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.3170586824417114, "step": 952 }, { "epoch": 0.0298125, "grad_norm": 5.0625, "grad_norm_var": 0.3912760416666667, "learning_rate": 0.0001, "loss": 7.7893, "loss/crossentropy": 2.586375594139099, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.31247901916503906, "step": 954 }, { "epoch": 0.029875, "grad_norm": 4.8125, "grad_norm_var": 0.41243082682291665, "learning_rate": 0.0001, "loss": 7.4818, "loss/crossentropy": 2.563793182373047, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.2921905666589737, "step": 956 }, { "epoch": 0.0299375, "grad_norm": 5.125, "grad_norm_var": 0.39143473307291665, "learning_rate": 0.0001, "loss": 7.8392, "loss/crossentropy": 2.7233054637908936, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3014303147792816, "step": 958 }, { "epoch": 0.03, "grad_norm": 12.3125, "grad_norm_var": 3.29986572265625, "learning_rate": 0.0001, "loss": 8.3685, "loss/crossentropy": 2.7161797285079956, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.345696359872818, "step": 960 }, { "epoch": 0.0300625, "grad_norm": 5.8125, "grad_norm_var": 3.100455729166667, "learning_rate": 0.0001, "loss": 7.8995, "loss/crossentropy": 2.6865986585617065, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.315825879573822, "step": 962 }, { "epoch": 0.030125, "grad_norm": 5.84375, "grad_norm_var": 3.0404947916666667, "learning_rate": 0.0001, "loss": 7.5698, "loss/crossentropy": 2.4539116621017456, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.30924659967422485, "step": 964 }, { "epoch": 0.0301875, "grad_norm": 5.09375, "grad_norm_var": 3.0578125, "learning_rate": 0.0001, "loss": 7.8823, "loss/crossentropy": 2.7271156311035156, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.3092683255672455, "step": 966 }, { "epoch": 0.03025, "grad_norm": 5.34375, "grad_norm_var": 3.09375, "learning_rate": 0.0001, "loss": 8.3393, "loss/crossentropy": 2.7491281032562256, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.3394854813814163, "step": 968 }, { "epoch": 0.0303125, "grad_norm": 5.40625, "grad_norm_var": 3.076806640625, "learning_rate": 0.0001, "loss": 8.0177, "loss/crossentropy": 2.7733538150787354, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3142744302749634, "step": 970 }, { "epoch": 0.030375, "grad_norm": 5.875, "grad_norm_var": 2.9033854166666666, "learning_rate": 0.0001, "loss": 8.4779, "loss/crossentropy": 3.035637617111206, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.33406516909599304, "step": 972 }, { "epoch": 0.0304375, "grad_norm": 6.21875, "grad_norm_var": 2.83970947265625, "learning_rate": 0.0001, "loss": 8.3789, "loss/crossentropy": 2.986885666847229, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.331390380859375, "step": 974 }, { "epoch": 0.0305, "grad_norm": 6.65625, "grad_norm_var": 0.195703125, "learning_rate": 0.0001, "loss": 8.4519, "loss/crossentropy": 2.8858373165130615, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3464466333389282, "step": 976 }, { "epoch": 0.0305625, "grad_norm": 6.4375, "grad_norm_var": 0.47063802083333334, "learning_rate": 0.0001, "loss": 8.3219, "loss/crossentropy": 2.7667852640151978, "loss/hidden": 2.171875, "loss/jsd": 0.0, "loss/logits": 0.3383200764656067, "step": 978 }, { "epoch": 0.030625, "grad_norm": 5.53125, "grad_norm_var": 0.5205362955729167, "learning_rate": 0.0001, "loss": 7.7302, "loss/crossentropy": 2.6153576374053955, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3067933917045593, "step": 980 }, { "epoch": 0.0306875, "grad_norm": 5.96875, "grad_norm_var": 0.510791015625, "learning_rate": 0.0001, "loss": 8.5684, "loss/crossentropy": 2.9319427013397217, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.34801939129829407, "step": 982 }, { "epoch": 0.03075, "grad_norm": 5.6875, "grad_norm_var": 0.4574178059895833, "learning_rate": 0.0001, "loss": 8.154, "loss/crossentropy": 2.73896861076355, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.3305632919073105, "step": 984 }, { "epoch": 0.0308125, "grad_norm": 5.375, "grad_norm_var": 0.44302978515625, "learning_rate": 0.0001, "loss": 7.9289, "loss/crossentropy": 2.6419018507003784, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.31620074808597565, "step": 986 }, { "epoch": 0.030875, "grad_norm": 6.03125, "grad_norm_var": 0.4825520833333333, "learning_rate": 0.0001, "loss": 7.8493, "loss/crossentropy": 2.6793285608291626, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.313095286488533, "step": 988 }, { "epoch": 0.0309375, "grad_norm": 4.96875, "grad_norm_var": 0.529541015625, "learning_rate": 0.0001, "loss": 7.8312, "loss/crossentropy": 2.6495360136032104, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.3142629563808441, "step": 990 }, { "epoch": 0.031, "grad_norm": 5.65625, "grad_norm_var": 0.5310506184895833, "learning_rate": 0.0001, "loss": 7.8569, "loss/crossentropy": 2.6996525526046753, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.3086983859539032, "step": 992 }, { "epoch": 0.0310625, "grad_norm": 5.78125, "grad_norm_var": 0.19765625, "learning_rate": 0.0001, "loss": 8.2429, "loss/crossentropy": 2.787962317466736, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.33689984679222107, "step": 994 }, { "epoch": 0.031125, "grad_norm": 6.21875, "grad_norm_var": 0.22366129557291667, "learning_rate": 0.0001, "loss": 7.8903, "loss/crossentropy": 2.657445192337036, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.31156356632709503, "step": 996 }, { "epoch": 0.0311875, "grad_norm": 5.5625, "grad_norm_var": 0.13487955729166667, "learning_rate": 0.0001, "loss": 7.927, "loss/crossentropy": 2.7342272996902466, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.3099045604467392, "step": 998 }, { "epoch": 0.03125, "grad_norm": 5.125, "grad_norm_var": 0.13541666666666666, "learning_rate": 0.0001, "loss": 7.7652, "loss/crossentropy": 2.6236950159072876, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.30945977568626404, "step": 1000 }, { "epoch": 0.0313125, "grad_norm": 5.8125, "grad_norm_var": 0.14263916015625, "learning_rate": 0.0001, "loss": 8.256, "loss/crossentropy": 2.8440134525299072, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.329479455947876, "step": 1002 }, { "epoch": 0.031375, "grad_norm": 5.78125, "grad_norm_var": 0.12047119140625, "learning_rate": 0.0001, "loss": 7.9484, "loss/crossentropy": 2.6811503171920776, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.313446044921875, "step": 1004 }, { "epoch": 0.0314375, "grad_norm": 5.15625, "grad_norm_var": 0.12053629557291666, "learning_rate": 0.0001, "loss": 7.8344, "loss/crossentropy": 2.676396131515503, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.3118928074836731, "step": 1006 }, { "epoch": 0.0315, "grad_norm": 5.28125, "grad_norm_var": 0.10519205729166667, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.5833935737609863, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2972829341888428, "step": 1008 }, { "epoch": 0.0315625, "grad_norm": 4.71875, "grad_norm_var": 0.14134114583333332, "learning_rate": 0.0001, "loss": 7.8421, "loss/crossentropy": 2.774709463119507, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.2981446832418442, "step": 1010 }, { "epoch": 0.031625, "grad_norm": 6.15625, "grad_norm_var": 0.57730712890625, "learning_rate": 0.0001, "loss": 7.9047, "loss/crossentropy": 2.4489853382110596, "loss/hidden": 2.1640625, "loss/jsd": 0.0, "loss/logits": 0.329169899225235, "step": 1012 }, { "epoch": 0.0316875, "grad_norm": 4.96875, "grad_norm_var": 0.6143513997395833, "learning_rate": 0.0001, "loss": 7.7484, "loss/crossentropy": 2.7602421045303345, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.2956867665052414, "step": 1014 }, { "epoch": 0.03175, "grad_norm": 5.28125, "grad_norm_var": 0.60299072265625, "learning_rate": 0.0001, "loss": 7.9042, "loss/crossentropy": 2.6339290142059326, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.32234111428260803, "step": 1016 }, { "epoch": 0.0318125, "grad_norm": 5.71875, "grad_norm_var": 0.6020833333333333, "learning_rate": 0.0001, "loss": 7.887, "loss/crossentropy": 2.5684269666671753, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3217056393623352, "step": 1018 }, { "epoch": 0.031875, "grad_norm": 5.8125, "grad_norm_var": 0.5924763997395833, "learning_rate": 0.0001, "loss": 7.8581, "loss/crossentropy": 2.7317564487457275, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.3056041747331619, "step": 1020 }, { "epoch": 0.0319375, "grad_norm": 9.3125, "grad_norm_var": 1.39703369140625, "learning_rate": 0.0001, "loss": 7.7259, "loss/crossentropy": 2.5896379947662354, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.30425289273262024, "step": 1022 }, { "epoch": 0.032, "grad_norm": 5.09375, "grad_norm_var": 1.40953369140625, "learning_rate": 0.0001, "loss": 7.7384, "loss/crossentropy": 2.5754462480545044, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3084825873374939, "step": 1024 }, { "epoch": 0.0320625, "grad_norm": 5.4375, "grad_norm_var": 1.3848958333333334, "learning_rate": 0.0001, "loss": 8.0083, "loss/crossentropy": 2.7684485912323, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.31617075204849243, "step": 1026 }, { "epoch": 0.032125, "grad_norm": 5.5, "grad_norm_var": 1.0278605143229167, "learning_rate": 0.0001, "loss": 7.4014, "loss/crossentropy": 2.4708162546157837, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.2899314910173416, "step": 1028 }, { "epoch": 0.0321875, "grad_norm": 5.8125, "grad_norm_var": 0.9800130208333333, "learning_rate": 0.0001, "loss": 8.1904, "loss/crossentropy": 2.7571115493774414, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.33083009719848633, "step": 1030 }, { "epoch": 0.03225, "grad_norm": 4.875, "grad_norm_var": 1.01529541015625, "learning_rate": 0.0001, "loss": 7.6514, "loss/crossentropy": 2.638901114463806, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.29890111088752747, "step": 1032 }, { "epoch": 0.0323125, "grad_norm": 7.90625, "grad_norm_var": 1.320947265625, "learning_rate": 0.0001, "loss": 8.1064, "loss/crossentropy": 2.8018993139266968, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.3218534588813782, "step": 1034 }, { "epoch": 0.032375, "grad_norm": 7.5, "grad_norm_var": 1.4863240559895834, "learning_rate": 0.0001, "loss": 8.4959, "loss/crossentropy": 2.9163622856140137, "loss/hidden": 2.1953125, "loss/jsd": 0.0, "loss/logits": 0.33842067420482635, "step": 1036 }, { "epoch": 0.0324375, "grad_norm": 5.71875, "grad_norm_var": 0.6952473958333333, "learning_rate": 0.0001, "loss": 7.8989, "loss/crossentropy": 2.7055130004882812, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.31543323397636414, "step": 1038 }, { "epoch": 0.0325, "grad_norm": 5.78125, "grad_norm_var": 0.6703084309895834, "learning_rate": 0.0001, "loss": 7.9723, "loss/crossentropy": 2.5736807584762573, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.3273621052503586, "step": 1040 }, { "epoch": 0.0325625, "grad_norm": 6.09375, "grad_norm_var": 1.0708333333333333, "learning_rate": 0.0001, "loss": 8.718, "loss/crossentropy": 2.940716505050659, "loss/hidden": 2.140625, "loss/jsd": 0.0, "loss/logits": 0.3636625409126282, "step": 1042 }, { "epoch": 0.032625, "grad_norm": 6.25, "grad_norm_var": 1.0049479166666666, "learning_rate": 0.0001, "loss": 8.4934, "loss/crossentropy": 2.8877869844436646, "loss/hidden": 2.1328125, "loss/jsd": 0.0, "loss/logits": 0.34728457033634186, "step": 1044 }, { "epoch": 0.0326875, "grad_norm": 5.28125, "grad_norm_var": 1.0175618489583333, "learning_rate": 0.0001, "loss": 8.099, "loss/crossentropy": 2.7789658308029175, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.3241891860961914, "step": 1046 }, { "epoch": 0.03275, "grad_norm": 6.1875, "grad_norm_var": 0.8885701497395834, "learning_rate": 0.0001, "loss": 8.1815, "loss/crossentropy": 2.7968313694000244, "loss/hidden": 2.15625, "loss/jsd": 0.0, "loss/logits": 0.32283732295036316, "step": 1048 }, { "epoch": 0.0328125, "grad_norm": 5.40625, "grad_norm_var": 0.7373046875, "learning_rate": 0.0001, "loss": 7.472, "loss/crossentropy": 2.44843852519989, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.2937634289264679, "step": 1050 }, { "epoch": 0.032875, "grad_norm": 5.9375, "grad_norm_var": 0.5847941080729167, "learning_rate": 0.0001, "loss": 8.6614, "loss/crossentropy": 3.1080269813537598, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.34440042078495026, "step": 1052 }, { "epoch": 0.0329375, "grad_norm": 5.28125, "grad_norm_var": 0.6066365559895833, "learning_rate": 0.0001, "loss": 7.9554, "loss/crossentropy": 2.694094657897949, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.322227418422699, "step": 1054 }, { "epoch": 0.033, "grad_norm": 4.9375, "grad_norm_var": 0.69879150390625, "learning_rate": 0.0001, "loss": 7.6204, "loss/crossentropy": 2.594877004623413, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.2990366369485855, "step": 1056 }, { "epoch": 0.0330625, "grad_norm": 5.6875, "grad_norm_var": 0.18345947265625, "learning_rate": 0.0001, "loss": 7.7801, "loss/crossentropy": 2.691964864730835, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.30256471037864685, "step": 1058 }, { "epoch": 0.033125, "grad_norm": 5.5, "grad_norm_var": 0.16809895833333333, "learning_rate": 0.0001, "loss": 7.9022, "loss/crossentropy": 2.747640609741211, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.3084237426519394, "step": 1060 }, { "epoch": 0.0331875, "grad_norm": 4.6875, "grad_norm_var": 0.1982421875, "learning_rate": 0.0001, "loss": 7.4803, "loss/crossentropy": 2.540893077850342, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.29471834003925323, "step": 1062 }, { "epoch": 0.03325, "grad_norm": 4.8125, "grad_norm_var": 0.18079020182291666, "learning_rate": 0.0001, "loss": 7.8368, "loss/crossentropy": 2.8419036865234375, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.2979314625263214, "step": 1064 }, { "epoch": 0.0333125, "grad_norm": 5.34375, "grad_norm_var": 0.19078369140625, "learning_rate": 0.0001, "loss": 7.6024, "loss/crossentropy": 2.562094211578369, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.3016863167285919, "step": 1066 }, { "epoch": 0.033375, "grad_norm": 4.875, "grad_norm_var": 0.13828125, "learning_rate": 0.0001, "loss": 7.4647, "loss/crossentropy": 2.46220201253891, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.29322096705436707, "step": 1068 }, { "epoch": 0.0334375, "grad_norm": 5.1875, "grad_norm_var": 0.14286702473958332, "learning_rate": 0.0001, "loss": 7.6626, "loss/crossentropy": 2.729177236557007, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.29100048542022705, "step": 1070 }, { "epoch": 0.0335, "grad_norm": 5.21875, "grad_norm_var": 0.14036458333333332, "learning_rate": 0.0001, "loss": 7.565, "loss/crossentropy": 2.565149426460266, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.29529547691345215, "step": 1072 }, { "epoch": 0.0335625, "grad_norm": 5.53125, "grad_norm_var": 0.13606363932291668, "learning_rate": 0.0001, "loss": 8.2564, "loss/crossentropy": 2.856488823890686, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.33218303322792053, "step": 1074 }, { "epoch": 0.033625, "grad_norm": 4.9375, "grad_norm_var": 0.13307291666666668, "learning_rate": 0.0001, "loss": 7.3967, "loss/crossentropy": 2.5067635774612427, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.29055845737457275, "step": 1076 }, { "epoch": 0.0336875, "grad_norm": 5.625, "grad_norm_var": 0.13527018229166668, "learning_rate": 0.0001, "loss": 8.2068, "loss/crossentropy": 2.904178261756897, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.32166482508182526, "step": 1078 }, { "epoch": 0.03375, "grad_norm": 5.84375, "grad_norm_var": 0.09478759765625, "learning_rate": 0.0001, "loss": 7.9513, "loss/crossentropy": 2.6391228437423706, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.3187223821878433, "step": 1080 }, { "epoch": 0.0338125, "grad_norm": 5.75, "grad_norm_var": 0.10234375, "learning_rate": 0.0001, "loss": 7.9395, "loss/crossentropy": 2.7211371660232544, "loss/hidden": 2.08984375, "loss/jsd": 0.0, "loss/logits": 0.31285470724105835, "step": 1082 }, { "epoch": 0.033875, "grad_norm": 5.25, "grad_norm_var": 0.093603515625, "learning_rate": 0.0001, "loss": 7.8391, "loss/crossentropy": 2.802746534347534, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.3001168519258499, "step": 1084 }, { "epoch": 0.0339375, "grad_norm": 5.34375, "grad_norm_var": 0.09425455729166667, "learning_rate": 0.0001, "loss": 7.5762, "loss/crossentropy": 2.604748845100403, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.2975347489118576, "step": 1086 }, { "epoch": 0.034, "grad_norm": 5.15625, "grad_norm_var": 0.09062093098958333, "learning_rate": 0.0001, "loss": 7.8203, "loss/crossentropy": 2.646591305732727, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.3134637773036957, "step": 1088 }, { "epoch": 0.0340625, "grad_norm": 4.96875, "grad_norm_var": 0.09803059895833334, "learning_rate": 0.0001, "loss": 7.6995, "loss/crossentropy": 2.7274059057235718, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.29486148059368134, "step": 1090 }, { "epoch": 0.034125, "grad_norm": 5.5625, "grad_norm_var": 0.19062093098958333, "learning_rate": 0.0001, "loss": 7.3454, "loss/crossentropy": 2.3077635765075684, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.3002438396215439, "step": 1092 }, { "epoch": 0.0341875, "grad_norm": 5.09375, "grad_norm_var": 0.19384358723958334, "learning_rate": 0.0001, "loss": 7.5719, "loss/crossentropy": 2.6316027641296387, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.29012222588062286, "step": 1094 }, { "epoch": 0.03425, "grad_norm": 5.03125, "grad_norm_var": 0.18843994140625, "learning_rate": 0.0001, "loss": 7.4854, "loss/crossentropy": 2.538474917411804, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.2974313497543335, "step": 1096 }, { "epoch": 0.0343125, "grad_norm": 5.40625, "grad_norm_var": 0.22355143229166666, "learning_rate": 0.0001, "loss": 7.938, "loss/crossentropy": 2.862139940261841, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.3060276210308075, "step": 1098 }, { "epoch": 0.034375, "grad_norm": 5.28125, "grad_norm_var": 0.21444905598958333, "learning_rate": 0.0001, "loss": 7.7295, "loss/crossentropy": 2.5745826959609985, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.31002719700336456, "step": 1100 }, { "epoch": 0.0344375, "grad_norm": 5.25, "grad_norm_var": 0.198291015625, "learning_rate": 0.0001, "loss": 8.1631, "loss/crossentropy": 2.921027660369873, "loss/hidden": 2.109375, "loss/jsd": 0.0, "loss/logits": 0.313272625207901, "step": 1102 }, { "epoch": 0.0345, "grad_norm": 4.96875, "grad_norm_var": 0.21783854166666666, "learning_rate": 0.0001, "loss": 7.2742, "loss/crossentropy": 2.430017828941345, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.28129300475120544, "step": 1104 }, { "epoch": 0.0345625, "grad_norm": 5.59375, "grad_norm_var": 0.20753580729166668, "learning_rate": 0.0001, "loss": 7.9823, "loss/crossentropy": 2.8249661922454834, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.31338855624198914, "step": 1106 }, { "epoch": 0.034625, "grad_norm": 6.0, "grad_norm_var": 0.123681640625, "learning_rate": 0.0001, "loss": 7.9694, "loss/crossentropy": 2.717615008354187, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3150188624858856, "step": 1108 }, { "epoch": 0.0346875, "grad_norm": 5.53125, "grad_norm_var": 0.12056884765625, "learning_rate": 0.0001, "loss": 7.7475, "loss/crossentropy": 2.7591657638549805, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.2918030321598053, "step": 1110 }, { "epoch": 0.03475, "grad_norm": 4.78125, "grad_norm_var": 0.13730061848958333, "learning_rate": 0.0001, "loss": 7.624, "loss/crossentropy": 2.5710976123809814, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3006017506122589, "step": 1112 }, { "epoch": 0.0348125, "grad_norm": 7.125, "grad_norm_var": 0.30250244140625, "learning_rate": 0.0001, "loss": 7.9458, "loss/crossentropy": 2.817541718482971, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.3065790385007858, "step": 1114 }, { "epoch": 0.034875, "grad_norm": 5.1875, "grad_norm_var": 0.30601806640625, "learning_rate": 0.0001, "loss": 8.3373, "loss/crossentropy": 3.012556791305542, "loss/hidden": 2.09375, "loss/jsd": 0.0, "loss/logits": 0.323103591799736, "step": 1116 }, { "epoch": 0.0349375, "grad_norm": 6.09375, "grad_norm_var": 0.33033447265625, "learning_rate": 0.0001, "loss": 8.2642, "loss/crossentropy": 2.9240721464157104, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.32698529958724976, "step": 1118 }, { "epoch": 0.035, "grad_norm": 4.96875, "grad_norm_var": 0.316015625, "learning_rate": 0.0001, "loss": 7.7638, "loss/crossentropy": 2.6127941608428955, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.306509867310524, "step": 1120 }, { "epoch": 0.0350625, "grad_norm": 5.0, "grad_norm_var": 0.32821858723958336, "learning_rate": 0.0001, "loss": 8.0152, "loss/crossentropy": 2.7948319911956787, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.31578800082206726, "step": 1122 }, { "epoch": 0.035125, "grad_norm": 6.15625, "grad_norm_var": 0.36256103515625, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.5288106203079224, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.301602378487587, "step": 1124 }, { "epoch": 0.0351875, "grad_norm": 5.5, "grad_norm_var": 0.3632120768229167, "learning_rate": 0.0001, "loss": 8.1987, "loss/crossentropy": 2.9701608419418335, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.31426164507865906, "step": 1126 }, { "epoch": 0.03525, "grad_norm": 5.125, "grad_norm_var": 0.33684895833333334, "learning_rate": 0.0001, "loss": 7.9655, "loss/crossentropy": 2.819477081298828, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3099192678928375, "step": 1128 }, { "epoch": 0.0353125, "grad_norm": 4.875, "grad_norm_var": 0.15735270182291666, "learning_rate": 0.0001, "loss": 7.3771, "loss/crossentropy": 2.5352863073349, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.28886666893959045, "step": 1130 }, { "epoch": 0.035375, "grad_norm": 5.5625, "grad_norm_var": 0.15623372395833332, "learning_rate": 0.0001, "loss": 7.9852, "loss/crossentropy": 2.8817960023880005, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.30878154933452606, "step": 1132 }, { "epoch": 0.0354375, "grad_norm": 5.75, "grad_norm_var": 0.13157145182291666, "learning_rate": 0.0001, "loss": 8.0863, "loss/crossentropy": 2.783313274383545, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.3240460753440857, "step": 1134 }, { "epoch": 0.0355, "grad_norm": 5.84375, "grad_norm_var": 0.14817708333333332, "learning_rate": 0.0001, "loss": 8.2244, "loss/crossentropy": 2.797518014907837, "loss/hidden": 2.1171875, "loss/jsd": 0.0, "loss/logits": 0.3309740275144577, "step": 1136 }, { "epoch": 0.0355625, "grad_norm": 4.90625, "grad_norm_var": 0.19295247395833334, "learning_rate": 0.0001, "loss": 7.3743, "loss/crossentropy": 2.5806562900543213, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.28640007972717285, "step": 1138 }, { "epoch": 0.035625, "grad_norm": 5.375, "grad_norm_var": 0.13605143229166666, "learning_rate": 0.0001, "loss": 7.825, "loss/crossentropy": 2.722290277481079, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.30167827010154724, "step": 1140 }, { "epoch": 0.0356875, "grad_norm": 5.65625, "grad_norm_var": 0.141259765625, "learning_rate": 0.0001, "loss": 7.7702, "loss/crossentropy": 2.702438473701477, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.30130916833877563, "step": 1142 }, { "epoch": 0.03575, "grad_norm": 4.84375, "grad_norm_var": 0.146875, "learning_rate": 0.0001, "loss": 7.8102, "loss/crossentropy": 2.712978720664978, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3050341159105301, "step": 1144 }, { "epoch": 0.0358125, "grad_norm": 5.625, "grad_norm_var": 0.1291015625, "learning_rate": 0.0001, "loss": 7.8463, "loss/crossentropy": 2.7124587297439575, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.3102594017982483, "step": 1146 }, { "epoch": 0.035875, "grad_norm": 5.40625, "grad_norm_var": 0.13014322916666668, "learning_rate": 0.0001, "loss": 7.6124, "loss/crossentropy": 2.6879937648773193, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.29478274285793304, "step": 1148 }, { "epoch": 0.0359375, "grad_norm": 4.71875, "grad_norm_var": 0.14752604166666666, "learning_rate": 0.0001, "loss": 7.7755, "loss/crossentropy": 2.7245869636535645, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.3039206862449646, "step": 1150 }, { "epoch": 0.036, "grad_norm": 5.5, "grad_norm_var": 0.10480143229166666, "learning_rate": 0.0001, "loss": 7.835, "loss/crossentropy": 2.744845390319824, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.3074570447206497, "step": 1152 }, { "epoch": 0.0360625, "grad_norm": 8.25, "grad_norm_var": 0.6361287434895834, "learning_rate": 0.0001, "loss": 7.9799, "loss/crossentropy": 2.694003462791443, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.32155656814575195, "step": 1154 }, { "epoch": 0.036125, "grad_norm": 5.28125, "grad_norm_var": 0.6376912434895833, "learning_rate": 0.0001, "loss": 7.9291, "loss/crossentropy": 2.8468209505081177, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.30744484066963196, "step": 1156 }, { "epoch": 0.0361875, "grad_norm": 4.9375, "grad_norm_var": 0.7028483072916667, "learning_rate": 0.0001, "loss": 7.8866, "loss/crossentropy": 2.731719493865967, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.3092362582683563, "step": 1158 }, { "epoch": 0.03625, "grad_norm": 5.71875, "grad_norm_var": 0.7010050455729167, "learning_rate": 0.0001, "loss": 7.7235, "loss/crossentropy": 2.7183526754379272, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.2985660433769226, "step": 1160 }, { "epoch": 0.0363125, "grad_norm": 5.21875, "grad_norm_var": 0.7028483072916667, "learning_rate": 0.0001, "loss": 7.5857, "loss/crossentropy": 2.5812745094299316, "loss/hidden": 2.078125, "loss/jsd": 0.0, "loss/logits": 0.2926321029663086, "step": 1162 }, { "epoch": 0.036375, "grad_norm": 5.53125, "grad_norm_var": 0.7484334309895834, "learning_rate": 0.0001, "loss": 7.7259, "loss/crossentropy": 2.694801926612854, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.3042774498462677, "step": 1164 }, { "epoch": 0.0364375, "grad_norm": 5.21875, "grad_norm_var": 0.7175130208333333, "learning_rate": 0.0001, "loss": 8.0344, "loss/crossentropy": 2.8037021160125732, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.3191618323326111, "step": 1166 }, { "epoch": 0.0365, "grad_norm": 4.625, "grad_norm_var": 0.7555989583333333, "learning_rate": 0.0001, "loss": 7.6207, "loss/crossentropy": 2.678989052772522, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.2961277812719345, "step": 1168 }, { "epoch": 0.0365625, "grad_norm": 4.9375, "grad_norm_var": 0.18553059895833332, "learning_rate": 0.0001, "loss": 7.8717, "loss/crossentropy": 2.750701904296875, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.3089783787727356, "step": 1170 }, { "epoch": 0.036625, "grad_norm": 4.875, "grad_norm_var": 0.19503580729166667, "learning_rate": 0.0001, "loss": 7.5382, "loss/crossentropy": 2.6573996543884277, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.2873009145259857, "step": 1172 }, { "epoch": 0.0366875, "grad_norm": 5.625, "grad_norm_var": 0.116796875, "learning_rate": 0.0001, "loss": 7.6217, "loss/crossentropy": 2.5741889476776123, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.2985018193721771, "step": 1174 }, { "epoch": 0.03675, "grad_norm": 4.875, "grad_norm_var": 0.20950113932291667, "learning_rate": 0.0001, "loss": 7.9366, "loss/crossentropy": 2.7274372577667236, "loss/hidden": 2.125, "loss/jsd": 0.0, "loss/logits": 0.308414101600647, "step": 1176 }, { "epoch": 0.0368125, "grad_norm": 4.84375, "grad_norm_var": 0.20976155598958332, "learning_rate": 0.0001, "loss": 7.4378, "loss/crossentropy": 2.578323483467102, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.29141393303871155, "step": 1178 }, { "epoch": 0.036875, "grad_norm": 5.78125, "grad_norm_var": 0.23632405598958334, "learning_rate": 0.0001, "loss": 7.782, "loss/crossentropy": 2.6867313385009766, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.3079614043235779, "step": 1180 }, { "epoch": 0.0369375, "grad_norm": 5.1875, "grad_norm_var": 0.23717447916666667, "learning_rate": 0.0001, "loss": 7.6463, "loss/crossentropy": 2.5722291469573975, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.31014107167720795, "step": 1182 }, { "epoch": 0.037, "grad_norm": 5.1875, "grad_norm_var": 0.21139322916666667, "learning_rate": 0.0001, "loss": 7.3991, "loss/crossentropy": 2.6571277379989624, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.27536794543266296, "step": 1184 }, { "epoch": 0.0370625, "grad_norm": 5.78125, "grad_norm_var": 0.22805582682291667, "learning_rate": 0.0001, "loss": 7.8376, "loss/crossentropy": 2.785367250442505, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3005323112010956, "step": 1186 }, { "epoch": 0.037125, "grad_norm": 4.6875, "grad_norm_var": 0.23267822265625, "learning_rate": 0.0001, "loss": 7.5326, "loss/crossentropy": 2.6078680753707886, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.292471781373024, "step": 1188 }, { "epoch": 0.0371875, "grad_norm": 4.84375, "grad_norm_var": 0.25006510416666666, "learning_rate": 0.0001, "loss": 7.6366, "loss/crossentropy": 2.6924532651901245, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.2920667827129364, "step": 1190 }, { "epoch": 0.03725, "grad_norm": 5.0625, "grad_norm_var": 0.14954020182291666, "learning_rate": 0.0001, "loss": 7.6194, "loss/crossentropy": 2.57063090801239, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.3052666634321213, "step": 1192 }, { "epoch": 0.0373125, "grad_norm": 4.90625, "grad_norm_var": 0.15653889973958332, "learning_rate": 0.0001, "loss": 7.9032, "loss/crossentropy": 2.867217540740967, "loss/hidden": 1.9609375, "loss/jsd": 0.0, "loss/logits": 0.30750520527362823, "step": 1194 }, { "epoch": 0.037375, "grad_norm": 5.3125, "grad_norm_var": 0.11092122395833333, "learning_rate": 0.0001, "loss": 7.5179, "loss/crossentropy": 2.601387858390808, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.292428120970726, "step": 1196 }, { "epoch": 0.0374375, "grad_norm": 5.6875, "grad_norm_var": 0.12561442057291666, "learning_rate": 0.0001, "loss": 8.2058, "loss/crossentropy": 2.918960690498352, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.32165729999542236, "step": 1198 }, { "epoch": 0.0375, "grad_norm": 4.8125, "grad_norm_var": 0.17003580729166667, "learning_rate": 0.0001, "loss": 7.6882, "loss/crossentropy": 2.5691174268722534, "loss/hidden": 2.1015625, "loss/jsd": 0.0, "loss/logits": 0.3017522841691971, "step": 1200 }, { "epoch": 0.0375625, "grad_norm": 4.875, "grad_norm_var": 0.16925455729166666, "learning_rate": 0.0001, "loss": 7.5997, "loss/crossentropy": 2.5958189964294434, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2956975996494293, "step": 1202 }, { "epoch": 0.037625, "grad_norm": 5.3125, "grad_norm_var": 0.14905192057291666, "learning_rate": 0.0001, "loss": 7.9452, "loss/crossentropy": 2.7279187440872192, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.31938889622688293, "step": 1204 }, { "epoch": 0.0376875, "grad_norm": 5.09375, "grad_norm_var": 0.13006184895833334, "learning_rate": 0.0001, "loss": 7.5916, "loss/crossentropy": 2.6716209650039673, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2951197922229767, "step": 1206 }, { "epoch": 0.03775, "grad_norm": 5.40625, "grad_norm_var": 0.150390625, "learning_rate": 0.0001, "loss": 7.7079, "loss/crossentropy": 2.808968424797058, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.29067111015319824, "step": 1208 }, { "epoch": 0.0378125, "grad_norm": 5.375, "grad_norm_var": 0.13433837890625, "learning_rate": 0.0001, "loss": 7.9013, "loss/crossentropy": 2.709269404411316, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3145200312137604, "step": 1210 }, { "epoch": 0.037875, "grad_norm": 5.09375, "grad_norm_var": 0.13411051432291668, "learning_rate": 0.0001, "loss": 7.62, "loss/crossentropy": 2.611648201942444, "loss/hidden": 2.02734375, "loss/jsd": 0.0, "loss/logits": 0.2980997562408447, "step": 1212 }, { "epoch": 0.0379375, "grad_norm": 4.78125, "grad_norm_var": 0.14256184895833332, "learning_rate": 0.0001, "loss": 7.4184, "loss/crossentropy": 2.5281219482421875, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.28746722638607025, "step": 1214 }, { "epoch": 0.038, "grad_norm": 5.4375, "grad_norm_var": 0.122509765625, "learning_rate": 0.0001, "loss": 7.7106, "loss/crossentropy": 2.614644765853882, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.30647440254688263, "step": 1216 }, { "epoch": 0.0380625, "grad_norm": 4.8125, "grad_norm_var": 0.09927978515625, "learning_rate": 0.0001, "loss": 7.4034, "loss/crossentropy": 2.629401683807373, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.2816943824291229, "step": 1218 }, { "epoch": 0.038125, "grad_norm": 4.9375, "grad_norm_var": 0.10702718098958333, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 2.6855201721191406, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.28180426359176636, "step": 1220 }, { "epoch": 0.0381875, "grad_norm": 5.09375, "grad_norm_var": 0.11998697916666666, "learning_rate": 0.0001, "loss": 7.6283, "loss/crossentropy": 2.6732563972473145, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.29901817440986633, "step": 1222 }, { "epoch": 0.03825, "grad_norm": 5.3125, "grad_norm_var": 0.10178629557291667, "learning_rate": 0.0001, "loss": 7.7806, "loss/crossentropy": 2.826427936553955, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.29463550448417664, "step": 1224 }, { "epoch": 0.0383125, "grad_norm": 4.875, "grad_norm_var": 0.08203125, "learning_rate": 0.0001, "loss": 7.5518, "loss/crossentropy": 2.679568648338318, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.2899537980556488, "step": 1226 }, { "epoch": 0.038375, "grad_norm": 10.9375, "grad_norm_var": 2.25299072265625, "learning_rate": 0.0001, "loss": 7.8777, "loss/crossentropy": 2.711169719696045, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.3150908648967743, "step": 1228 }, { "epoch": 0.0384375, "grad_norm": 5.15625, "grad_norm_var": 2.215234375, "learning_rate": 0.0001, "loss": 7.7375, "loss/crossentropy": 2.791654109954834, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.298099547624588, "step": 1230 }, { "epoch": 0.0385, "grad_norm": 5.40625, "grad_norm_var": 2.214774576822917, "learning_rate": 0.0001, "loss": 7.6766, "loss/crossentropy": 2.561646342277527, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.3044602572917938, "step": 1232 }, { "epoch": 0.0385625, "grad_norm": 4.65625, "grad_norm_var": 2.287239583333333, "learning_rate": 0.0001, "loss": 7.1393, "loss/crossentropy": 2.4994441270828247, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.26945771276950836, "step": 1234 }, { "epoch": 0.038625, "grad_norm": 5.0625, "grad_norm_var": 2.287040201822917, "learning_rate": 0.0001, "loss": 7.9031, "loss/crossentropy": 2.8577252626419067, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.3076618164777756, "step": 1236 }, { "epoch": 0.0386875, "grad_norm": 5.40625, "grad_norm_var": 2.2509765625, "learning_rate": 0.0001, "loss": 7.9173, "loss/crossentropy": 2.8302924633026123, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.3032368868589401, "step": 1238 }, { "epoch": 0.03875, "grad_norm": 5.78125, "grad_norm_var": 2.30924072265625, "learning_rate": 0.0001, "loss": 7.7384, "loss/crossentropy": 2.829968810081482, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.2927999347448349, "step": 1240 }, { "epoch": 0.0388125, "grad_norm": 5.25, "grad_norm_var": 2.2814453125, "learning_rate": 0.0001, "loss": 7.7618, "loss/crossentropy": 2.735278010368347, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2979622185230255, "step": 1242 }, { "epoch": 0.038875, "grad_norm": 5.4375, "grad_norm_var": 0.14737955729166666, "learning_rate": 0.0001, "loss": 7.7466, "loss/crossentropy": 2.716152548789978, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.29913707077503204, "step": 1244 }, { "epoch": 0.0389375, "grad_norm": 4.625, "grad_norm_var": 0.14700113932291667, "learning_rate": 0.0001, "loss": 7.3482, "loss/crossentropy": 2.594505190849304, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2816225290298462, "step": 1246 }, { "epoch": 0.039, "grad_norm": 5.40625, "grad_norm_var": 0.14719645182291666, "learning_rate": 0.0001, "loss": 7.7186, "loss/crossentropy": 2.7304205894470215, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.3003810793161392, "step": 1248 }, { "epoch": 0.0390625, "grad_norm": 5.0, "grad_norm_var": 0.113134765625, "learning_rate": 0.0001, "loss": 7.3257, "loss/crossentropy": 2.4609283208847046, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2872539907693863, "step": 1250 }, { "epoch": 0.039125, "grad_norm": 5.25, "grad_norm_var": 0.30038655598958336, "learning_rate": 0.0001, "loss": 7.8434, "loss/crossentropy": 2.7610113620758057, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.3074617087841034, "step": 1252 }, { "epoch": 0.0391875, "grad_norm": 5.09375, "grad_norm_var": 0.29940999348958336, "learning_rate": 0.0001, "loss": 7.8855, "loss/crossentropy": 2.828143000602722, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.3100292235612869, "step": 1254 }, { "epoch": 0.03925, "grad_norm": 5.625, "grad_norm_var": 0.2684733072916667, "learning_rate": 0.0001, "loss": 7.429, "loss/crossentropy": 2.530988335609436, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.2917501926422119, "step": 1256 }, { "epoch": 0.0393125, "grad_norm": 4.625, "grad_norm_var": 0.2939453125, "learning_rate": 0.0001, "loss": 7.4206, "loss/crossentropy": 2.5805318355560303, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2902548015117645, "step": 1258 }, { "epoch": 0.039375, "grad_norm": 5.0, "grad_norm_var": 0.29010416666666666, "learning_rate": 0.0001, "loss": 7.7127, "loss/crossentropy": 2.7116379737854004, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.3024456202983856, "step": 1260 }, { "epoch": 0.0394375, "grad_norm": 5.125, "grad_norm_var": 0.26038004557291666, "learning_rate": 0.0001, "loss": 7.9923, "loss/crossentropy": 2.866301417350769, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.3125974237918854, "step": 1262 }, { "epoch": 0.0395, "grad_norm": 5.34375, "grad_norm_var": 0.291650390625, "learning_rate": 0.0001, "loss": 7.584, "loss/crossentropy": 2.6004990339279175, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2991277277469635, "step": 1264 }, { "epoch": 0.0395625, "grad_norm": 5.21875, "grad_norm_var": 0.29542643229166665, "learning_rate": 0.0001, "loss": 7.8619, "loss/crossentropy": 2.7568334341049194, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.3050409257411957, "step": 1266 }, { "epoch": 0.039625, "grad_norm": 5.53125, "grad_norm_var": 0.11887613932291667, "learning_rate": 0.0001, "loss": 8.04, "loss/crossentropy": 2.8222177028656006, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.31982044875621796, "step": 1268 }, { "epoch": 0.0396875, "grad_norm": 4.625, "grad_norm_var": 0.144921875, "learning_rate": 0.0001, "loss": 7.9355, "loss/crossentropy": 2.8479357957839966, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.31032055616378784, "step": 1270 }, { "epoch": 0.03975, "grad_norm": 5.1875, "grad_norm_var": 2.83492431640625, "learning_rate": 0.0001, "loss": 7.5238, "loss/crossentropy": 2.5936185121536255, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2961418032646179, "step": 1272 }, { "epoch": 0.0398125, "grad_norm": 5.65625, "grad_norm_var": 2.822066243489583, "learning_rate": 0.0001, "loss": 8.0667, "loss/crossentropy": 2.8308135271072388, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.32046134769916534, "step": 1274 }, { "epoch": 0.039875, "grad_norm": 5.125, "grad_norm_var": 2.891259765625, "learning_rate": 0.0001, "loss": 7.4237, "loss/crossentropy": 2.5655672550201416, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.2912827283143997, "step": 1276 }, { "epoch": 0.0399375, "grad_norm": 5.0625, "grad_norm_var": 2.91734619140625, "learning_rate": 0.0001, "loss": 7.68, "loss/crossentropy": 2.714452862739563, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.2969430685043335, "step": 1278 }, { "epoch": 0.04, "grad_norm": 7.09375, "grad_norm_var": 2.90718994140625, "learning_rate": 0.0001, "loss": 7.5165, "loss/crossentropy": 2.4779699444770813, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2991633117198944, "step": 1280 }, { "epoch": 0.0400625, "grad_norm": 5.625, "grad_norm_var": 2.8809529622395833, "learning_rate": 0.0001, "loss": 7.6352, "loss/crossentropy": 2.7353172302246094, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2931157201528549, "step": 1282 }, { "epoch": 0.040125, "grad_norm": 6.3125, "grad_norm_var": 2.894462076822917, "learning_rate": 0.0001, "loss": 7.668, "loss/crossentropy": 2.70292329788208, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.295722633600235, "step": 1284 }, { "epoch": 0.0401875, "grad_norm": 4.875, "grad_norm_var": 2.886031087239583, "learning_rate": 0.0001, "loss": 7.5251, "loss/crossentropy": 2.6100287437438965, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.29073064029216766, "step": 1286 }, { "epoch": 0.04025, "grad_norm": 5.21875, "grad_norm_var": 0.698681640625, "learning_rate": 0.0001, "loss": 8.0385, "loss/crossentropy": 2.6995733976364136, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.3276410549879074, "step": 1288 }, { "epoch": 0.0403125, "grad_norm": 4.875, "grad_norm_var": 0.6340983072916667, "learning_rate": 0.0001, "loss": 7.5085, "loss/crossentropy": 2.571585774421692, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.29291462898254395, "step": 1290 }, { "epoch": 0.040375, "grad_norm": 6.03125, "grad_norm_var": 0.6041951497395833, "learning_rate": 0.0001, "loss": 7.5429, "loss/crossentropy": 2.6209115982055664, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.29141952097415924, "step": 1292 }, { "epoch": 0.0404375, "grad_norm": 5.75, "grad_norm_var": 0.5770792643229167, "learning_rate": 0.0001, "loss": 7.5796, "loss/crossentropy": 2.5977821350097656, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.299348846077919, "step": 1294 }, { "epoch": 0.0405, "grad_norm": 5.5, "grad_norm_var": 0.43292643229166666, "learning_rate": 0.0001, "loss": 7.8398, "loss/crossentropy": 2.8123137950897217, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.2988448143005371, "step": 1296 }, { "epoch": 0.0405625, "grad_norm": 6.0625, "grad_norm_var": 0.4564737955729167, "learning_rate": 0.0001, "loss": 8.0537, "loss/crossentropy": 2.8605915307998657, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.3138464093208313, "step": 1298 }, { "epoch": 0.040625, "grad_norm": 5.8125, "grad_norm_var": 0.41858317057291666, "learning_rate": 0.0001, "loss": 7.4912, "loss/crossentropy": 2.502464771270752, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.29965806007385254, "step": 1300 }, { "epoch": 0.0406875, "grad_norm": 4.84375, "grad_norm_var": 0.41389567057291665, "learning_rate": 0.0001, "loss": 7.9667, "loss/crossentropy": 2.812674403190613, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.3134448826313019, "step": 1302 }, { "epoch": 0.04075, "grad_norm": 5.0, "grad_norm_var": 0.16886393229166666, "learning_rate": 0.0001, "loss": 7.5189, "loss/crossentropy": 2.5777794122695923, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2894267141819, "step": 1304 }, { "epoch": 0.0408125, "grad_norm": 5.25, "grad_norm_var": 0.30690104166666665, "learning_rate": 0.0001, "loss": 7.9687, "loss/crossentropy": 2.8120635747909546, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3109755218029022, "step": 1306 }, { "epoch": 0.040875, "grad_norm": 4.96875, "grad_norm_var": 0.2977701822916667, "learning_rate": 0.0001, "loss": 7.529, "loss/crossentropy": 2.723876118659973, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2812942564487457, "step": 1308 }, { "epoch": 0.0409375, "grad_norm": 5.28125, "grad_norm_var": 0.2884724934895833, "learning_rate": 0.0001, "loss": 7.9555, "loss/crossentropy": 2.883132815361023, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.30488817393779755, "step": 1310 }, { "epoch": 0.041, "grad_norm": 5.3125, "grad_norm_var": 0.31438802083333334, "learning_rate": 0.0001, "loss": 7.2809, "loss/crossentropy": 2.370841383934021, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2933536022901535, "step": 1312 }, { "epoch": 0.0410625, "grad_norm": 5.46875, "grad_norm_var": 0.3129191080729167, "learning_rate": 0.0001, "loss": 7.2967, "loss/crossentropy": 2.5285329818725586, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.27837996184825897, "step": 1314 }, { "epoch": 0.041125, "grad_norm": 5.46875, "grad_norm_var": 0.29664306640625, "learning_rate": 0.0001, "loss": 7.8327, "loss/crossentropy": 2.757304310798645, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.30871395766735077, "step": 1316 }, { "epoch": 0.0411875, "grad_norm": 5.0625, "grad_norm_var": 0.27810872395833336, "learning_rate": 0.0001, "loss": 7.576, "loss/crossentropy": 2.7341228723526, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2865341305732727, "step": 1318 }, { "epoch": 0.04125, "grad_norm": 4.875, "grad_norm_var": 0.284765625, "learning_rate": 0.0001, "loss": 7.3078, "loss/crossentropy": 2.576223611831665, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.27549774944782257, "step": 1320 }, { "epoch": 0.0413125, "grad_norm": 4.59375, "grad_norm_var": 0.08121337890625, "learning_rate": 0.0001, "loss": 7.2132, "loss/crossentropy": 2.5026360750198364, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.2745731547474861, "step": 1322 }, { "epoch": 0.041375, "grad_norm": 5.3125, "grad_norm_var": 0.08599853515625, "learning_rate": 0.0001, "loss": 7.0476, "loss/crossentropy": 2.394889712333679, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.27269674837589264, "step": 1324 }, { "epoch": 0.0414375, "grad_norm": 4.59375, "grad_norm_var": 0.26248372395833336, "learning_rate": 0.0001, "loss": 7.624, "loss/crossentropy": 2.6320308446884155, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.29373273253440857, "step": 1326 }, { "epoch": 0.0415, "grad_norm": 5.21875, "grad_norm_var": 0.28046875, "learning_rate": 0.0001, "loss": 7.9352, "loss/crossentropy": 2.927828311920166, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.300733357667923, "step": 1328 }, { "epoch": 0.0415625, "grad_norm": 6.6875, "grad_norm_var": 0.39078369140625, "learning_rate": 0.0001, "loss": 7.9079, "loss/crossentropy": 2.6833643913269043, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.3212866932153702, "step": 1330 }, { "epoch": 0.041625, "grad_norm": 5.75, "grad_norm_var": 0.40312093098958335, "learning_rate": 0.0001, "loss": 8.1211, "loss/crossentropy": 2.897345185279846, "loss/hidden": 2.0703125, "loss/jsd": 0.0, "loss/logits": 0.31534482538700104, "step": 1332 }, { "epoch": 0.0416875, "grad_norm": 5.46875, "grad_norm_var": 0.38917643229166665, "learning_rate": 0.0001, "loss": 7.7456, "loss/crossentropy": 2.584378242492676, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.3176807314157486, "step": 1334 }, { "epoch": 0.04175, "grad_norm": 5.15625, "grad_norm_var": 0.3631510416666667, "learning_rate": 0.0001, "loss": 7.6088, "loss/crossentropy": 2.5694063901901245, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.3008142113685608, "step": 1336 }, { "epoch": 0.0418125, "grad_norm": 4.34375, "grad_norm_var": 0.405322265625, "learning_rate": 0.0001, "loss": 7.2625, "loss/crossentropy": 2.5970876216888428, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.27201347053050995, "step": 1338 }, { "epoch": 0.041875, "grad_norm": 5.03125, "grad_norm_var": 0.41627197265625, "learning_rate": 0.0001, "loss": 7.5859, "loss/crossentropy": 2.6498775482177734, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.2896941006183624, "step": 1340 }, { "epoch": 0.0419375, "grad_norm": 6.53125, "grad_norm_var": 0.4341796875, "learning_rate": 0.0001, "loss": 7.1471, "loss/crossentropy": 2.4250807762145996, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.2698608785867691, "step": 1342 }, { "epoch": 0.042, "grad_norm": 5.34375, "grad_norm_var": 0.4249959309895833, "learning_rate": 0.0001, "loss": 7.8104, "loss/crossentropy": 2.8312790393829346, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2994709312915802, "step": 1344 }, { "epoch": 0.0420625, "grad_norm": 4.78125, "grad_norm_var": 0.33548177083333336, "learning_rate": 0.0001, "loss": 7.5373, "loss/crossentropy": 2.687604069709778, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.28966057300567627, "step": 1346 }, { "epoch": 0.042125, "grad_norm": 4.84375, "grad_norm_var": 0.33264567057291666, "learning_rate": 0.0001, "loss": 7.5791, "loss/crossentropy": 2.6224461793899536, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.2933178097009659, "step": 1348 }, { "epoch": 0.0421875, "grad_norm": 5.0, "grad_norm_var": 0.350244140625, "learning_rate": 0.0001, "loss": 7.8897, "loss/crossentropy": 2.7649588584899902, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.311299666762352, "step": 1350 }, { "epoch": 0.04225, "grad_norm": 4.78125, "grad_norm_var": 0.31842041015625, "learning_rate": 0.0001, "loss": 7.5599, "loss/crossentropy": 2.6638940572738647, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.29546037316322327, "step": 1352 }, { "epoch": 0.0423125, "grad_norm": 5.15625, "grad_norm_var": 0.2779947916666667, "learning_rate": 0.0001, "loss": 7.7463, "loss/crossentropy": 2.7206530570983887, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.30256910622119904, "step": 1354 }, { "epoch": 0.042375, "grad_norm": 5.25, "grad_norm_var": 0.271728515625, "learning_rate": 0.0001, "loss": 7.5532, "loss/crossentropy": 2.679394841194153, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.29480086266994476, "step": 1356 }, { "epoch": 0.0424375, "grad_norm": 5.125, "grad_norm_var": 0.12278645833333333, "learning_rate": 0.0001, "loss": 7.4359, "loss/crossentropy": 2.6582494974136353, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.28323106467723846, "step": 1358 }, { "epoch": 0.0425, "grad_norm": 4.9375, "grad_norm_var": 0.13513997395833333, "learning_rate": 0.0001, "loss": 7.5235, "loss/crossentropy": 2.6920082569122314, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.2882232964038849, "step": 1360 }, { "epoch": 0.0425625, "grad_norm": 5.09375, "grad_norm_var": 0.19777018229166668, "learning_rate": 0.0001, "loss": 7.6063, "loss/crossentropy": 2.6489486694335938, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.2894832342863083, "step": 1362 }, { "epoch": 0.042625, "grad_norm": 5.1875, "grad_norm_var": 0.16953125, "learning_rate": 0.0001, "loss": 7.7816, "loss/crossentropy": 2.7566269636154175, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.3048381954431534, "step": 1364 }, { "epoch": 0.0426875, "grad_norm": 4.71875, "grad_norm_var": 0.14920247395833333, "learning_rate": 0.0001, "loss": 7.3469, "loss/crossentropy": 2.634415030479431, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.27750201523303986, "step": 1366 }, { "epoch": 0.04275, "grad_norm": 4.9375, "grad_norm_var": 0.15572509765625, "learning_rate": 0.0001, "loss": 7.5947, "loss/crossentropy": 2.707185387611389, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.2867947816848755, "step": 1368 }, { "epoch": 0.0428125, "grad_norm": 4.71875, "grad_norm_var": 0.15575764973958334, "learning_rate": 0.0001, "loss": 7.1529, "loss/crossentropy": 2.4458247423171997, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.27070252597332, "step": 1370 }, { "epoch": 0.042875, "grad_norm": 4.84375, "grad_norm_var": 0.16929931640625, "learning_rate": 0.0001, "loss": 7.482, "loss/crossentropy": 2.642867088317871, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.2776651754975319, "step": 1372 }, { "epoch": 0.0429375, "grad_norm": 4.59375, "grad_norm_var": 0.17170817057291668, "learning_rate": 0.0001, "loss": 7.7102, "loss/crossentropy": 2.722243070602417, "loss/hidden": 2.015625, "loss/jsd": 0.0, "loss/logits": 0.2972361445426941, "step": 1374 }, { "epoch": 0.043, "grad_norm": 4.84375, "grad_norm_var": 0.16534830729166666, "learning_rate": 0.0001, "loss": 7.3228, "loss/crossentropy": 2.5705126523971558, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.28303705155849457, "step": 1376 }, { "epoch": 0.0430625, "grad_norm": 5.0, "grad_norm_var": 0.06278889973958333, "learning_rate": 0.0001, "loss": 7.5588, "loss/crossentropy": 2.6257171630859375, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.295265793800354, "step": 1378 }, { "epoch": 0.043125, "grad_norm": 5.375, "grad_norm_var": 0.07203369140625, "learning_rate": 0.0001, "loss": 7.7361, "loss/crossentropy": 2.775424599647522, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2984114736318588, "step": 1380 }, { "epoch": 0.0431875, "grad_norm": 6.9375, "grad_norm_var": 0.311181640625, "learning_rate": 0.0001, "loss": 7.899, "loss/crossentropy": 2.7836259603500366, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.3084150403738022, "step": 1382 }, { "epoch": 0.04325, "grad_norm": 4.84375, "grad_norm_var": 0.29814046223958335, "learning_rate": 0.0001, "loss": 7.7099, "loss/crossentropy": 2.850728750228882, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2921678125858307, "step": 1384 }, { "epoch": 0.0433125, "grad_norm": 4.84375, "grad_norm_var": 0.29179280598958335, "learning_rate": 0.0001, "loss": 7.3731, "loss/crossentropy": 2.5699057579040527, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2834426909685135, "step": 1386 }, { "epoch": 0.043375, "grad_norm": 4.84375, "grad_norm_var": 0.28982747395833336, "learning_rate": 0.0001, "loss": 7.3414, "loss/crossentropy": 2.566841721534729, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.28018754720687866, "step": 1388 }, { "epoch": 0.0434375, "grad_norm": 4.90625, "grad_norm_var": 0.29019775390625, "learning_rate": 0.0001, "loss": 7.627, "loss/crossentropy": 2.803957223892212, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2846508026123047, "step": 1390 }, { "epoch": 0.0435, "grad_norm": 4.8125, "grad_norm_var": 0.28404541015625, "learning_rate": 0.0001, "loss": 7.3805, "loss/crossentropy": 2.4680495262145996, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.28734005987644196, "step": 1392 }, { "epoch": 0.0435625, "grad_norm": 9.5625, "grad_norm_var": 1.4954060872395833, "learning_rate": 0.0001, "loss": 7.6419, "loss/crossentropy": 2.6241623163223267, "loss/hidden": 2.0625, "loss/jsd": 0.0, "loss/logits": 0.29552070796489716, "step": 1394 }, { "epoch": 0.043625, "grad_norm": 4.84375, "grad_norm_var": 1.5164998372395833, "learning_rate": 0.0001, "loss": 7.0692, "loss/crossentropy": 2.2776577472686768, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2807151973247528, "step": 1396 }, { "epoch": 0.0436875, "grad_norm": 4.875, "grad_norm_var": 1.3636555989583334, "learning_rate": 0.0001, "loss": 7.6524, "loss/crossentropy": 2.6224864721298218, "loss/hidden": 2.0859375, "loss/jsd": 0.0, "loss/logits": 0.29439981281757355, "step": 1398 }, { "epoch": 0.04375, "grad_norm": 4.71875, "grad_norm_var": 1.3757771809895833, "learning_rate": 0.0001, "loss": 7.5251, "loss/crossentropy": 2.6127312183380127, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.29357995092868805, "step": 1400 }, { "epoch": 0.0438125, "grad_norm": 5.25, "grad_norm_var": 1.3815388997395834, "learning_rate": 0.0001, "loss": 7.6992, "loss/crossentropy": 2.692687511444092, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.2959606945514679, "step": 1402 }, { "epoch": 0.043875, "grad_norm": 4.75, "grad_norm_var": 1.3755818684895833, "learning_rate": 0.0001, "loss": 7.694, "loss/crossentropy": 2.7359989881515503, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.3028276115655899, "step": 1404 }, { "epoch": 0.0439375, "grad_norm": 5.125, "grad_norm_var": 1.3676920572916667, "learning_rate": 0.0001, "loss": 7.5041, "loss/crossentropy": 2.6671664714813232, "loss/hidden": 1.9609375, "loss/jsd": 0.0, "loss/logits": 0.2875981330871582, "step": 1406 }, { "epoch": 0.044, "grad_norm": 4.71875, "grad_norm_var": 1.4091145833333334, "learning_rate": 0.0001, "loss": 7.2373, "loss/crossentropy": 2.509815812110901, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2789994776248932, "step": 1408 }, { "epoch": 0.0440625, "grad_norm": 5.09375, "grad_norm_var": 0.06513264973958334, "learning_rate": 0.0001, "loss": 7.7282, "loss/crossentropy": 2.7103008031845093, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.3006156384944916, "step": 1410 }, { "epoch": 0.044125, "grad_norm": 5.15625, "grad_norm_var": 0.06676025390625, "learning_rate": 0.0001, "loss": 7.5621, "loss/crossentropy": 2.7707459926605225, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.2791343182325363, "step": 1412 }, { "epoch": 0.0441875, "grad_norm": 5.3125, "grad_norm_var": 0.07636311848958334, "learning_rate": 0.0001, "loss": 7.5706, "loss/crossentropy": 2.721479654312134, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.2868632972240448, "step": 1414 }, { "epoch": 0.04425, "grad_norm": 4.65625, "grad_norm_var": 0.08456624348958333, "learning_rate": 0.0001, "loss": 7.2394, "loss/crossentropy": 2.603902578353882, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2709691673517227, "step": 1416 }, { "epoch": 0.0443125, "grad_norm": 4.3125, "grad_norm_var": 0.10087483723958333, "learning_rate": 0.0001, "loss": 6.9187, "loss/crossentropy": 2.4536471366882324, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.25783771276474, "step": 1418 }, { "epoch": 0.044375, "grad_norm": 4.78125, "grad_norm_var": 0.08489176432291666, "learning_rate": 0.0001, "loss": 6.9662, "loss/crossentropy": 2.4370354413986206, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.25799560546875, "step": 1420 }, { "epoch": 0.0444375, "grad_norm": 4.84375, "grad_norm_var": 0.09192301432291666, "learning_rate": 0.0001, "loss": 7.5708, "loss/crossentropy": 2.74661386013031, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2886638045310974, "step": 1422 }, { "epoch": 0.0445, "grad_norm": 5.1875, "grad_norm_var": 0.13909098307291667, "learning_rate": 0.0001, "loss": 7.7308, "loss/crossentropy": 2.716087222099304, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.3022549897432327, "step": 1424 }, { "epoch": 0.0445625, "grad_norm": 4.625, "grad_norm_var": 0.14034830729166667, "learning_rate": 0.0001, "loss": 7.4306, "loss/crossentropy": 2.7329652309417725, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.27796949446201324, "step": 1426 }, { "epoch": 0.044625, "grad_norm": 4.875, "grad_norm_var": 0.15035400390625, "learning_rate": 0.0001, "loss": 7.4232, "loss/crossentropy": 2.7089322805404663, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.27493977546691895, "step": 1428 }, { "epoch": 0.0446875, "grad_norm": 4.375, "grad_norm_var": 0.162353515625, "learning_rate": 0.0001, "loss": 7.1642, "loss/crossentropy": 2.5490864515304565, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2638545036315918, "step": 1430 }, { "epoch": 0.04475, "grad_norm": 5.15625, "grad_norm_var": 0.22652587890625, "learning_rate": 0.0001, "loss": 8.1627, "loss/crossentropy": 2.870542287826538, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.32375185191631317, "step": 1432 }, { "epoch": 0.0448125, "grad_norm": 5.0, "grad_norm_var": 0.19529622395833332, "learning_rate": 0.0001, "loss": 7.4056, "loss/crossentropy": 2.5882151126861572, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.2860320508480072, "step": 1434 }, { "epoch": 0.044875, "grad_norm": 6.09375, "grad_norm_var": 0.2537760416666667, "learning_rate": 0.0001, "loss": 7.894, "loss/crossentropy": 2.7808961868286133, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.30818620324134827, "step": 1436 }, { "epoch": 0.0449375, "grad_norm": 5.9375, "grad_norm_var": 0.2899576822916667, "learning_rate": 0.0001, "loss": 7.9565, "loss/crossentropy": 2.932820439338684, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.30041807889938354, "step": 1438 }, { "epoch": 0.045, "grad_norm": 5.25, "grad_norm_var": 0.268212890625, "learning_rate": 0.0001, "loss": 7.5398, "loss/crossentropy": 2.500837206840515, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.30038216710090637, "step": 1440 }, { "epoch": 0.0450625, "grad_norm": 6.1875, "grad_norm_var": 0.2953409830729167, "learning_rate": 0.0001, "loss": 7.5575, "loss/crossentropy": 2.5460762977600098, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.29801732301712036, "step": 1442 }, { "epoch": 0.045125, "grad_norm": 5.0, "grad_norm_var": 0.23290608723958334, "learning_rate": 0.0001, "loss": 8.0496, "loss/crossentropy": 2.860416531562805, "loss/hidden": 2.0078125, "loss/jsd": 0.0, "loss/logits": 0.3181406408548355, "step": 1444 }, { "epoch": 0.0451875, "grad_norm": 5.5, "grad_norm_var": 0.19029947916666667, "learning_rate": 0.0001, "loss": 7.5155, "loss/crossentropy": 2.6714009046554565, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.29065585136413574, "step": 1446 }, { "epoch": 0.04525, "grad_norm": 5.125, "grad_norm_var": 0.17509358723958332, "learning_rate": 0.0001, "loss": 7.2614, "loss/crossentropy": 2.548215627670288, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.2760085165500641, "step": 1448 }, { "epoch": 0.0453125, "grad_norm": 4.96875, "grad_norm_var": 0.18644205729166666, "learning_rate": 0.0001, "loss": 7.7885, "loss/crossentropy": 2.8646936416625977, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2931659668684006, "step": 1450 }, { "epoch": 0.045375, "grad_norm": 5.25, "grad_norm_var": 0.17561442057291668, "learning_rate": 0.0001, "loss": 7.5222, "loss/crossentropy": 2.7001854181289673, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.28884437680244446, "step": 1452 }, { "epoch": 0.0454375, "grad_norm": 5.375, "grad_norm_var": 0.14674479166666668, "learning_rate": 0.0001, "loss": 7.3909, "loss/crossentropy": 2.587071418762207, "loss/hidden": 2.0234375, "loss/jsd": 0.0, "loss/logits": 0.2780349850654602, "step": 1454 }, { "epoch": 0.0455, "grad_norm": 4.96875, "grad_norm_var": 0.15011393229166667, "learning_rate": 0.0001, "loss": 7.6218, "loss/crossentropy": 2.7357864379882812, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2901657521724701, "step": 1456 }, { "epoch": 0.0455625, "grad_norm": 5.25, "grad_norm_var": 0.08912353515625, "learning_rate": 0.0001, "loss": 7.8304, "loss/crossentropy": 2.845029592514038, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2993154674768448, "step": 1458 }, { "epoch": 0.045625, "grad_norm": 4.9375, "grad_norm_var": 0.07277018229166667, "learning_rate": 0.0001, "loss": 7.627, "loss/crossentropy": 2.732051730155945, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.2914441227912903, "step": 1460 }, { "epoch": 0.0456875, "grad_norm": 4.84375, "grad_norm_var": 0.09303385416666667, "learning_rate": 0.0001, "loss": 7.2475, "loss/crossentropy": 2.6173033714294434, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.27278298139572144, "step": 1462 }, { "epoch": 0.04575, "grad_norm": 4.625, "grad_norm_var": 0.10012613932291667, "learning_rate": 0.0001, "loss": 7.2154, "loss/crossentropy": 2.5818458795547485, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.27155643701553345, "step": 1464 }, { "epoch": 0.0458125, "grad_norm": 4.875, "grad_norm_var": 0.10858968098958334, "learning_rate": 0.0001, "loss": 7.3674, "loss/crossentropy": 2.595680594444275, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.2849871814250946, "step": 1466 }, { "epoch": 0.045875, "grad_norm": 5.0625, "grad_norm_var": 0.09620768229166667, "learning_rate": 0.0001, "loss": 7.5837, "loss/crossentropy": 2.663433074951172, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.29280896484851837, "step": 1468 }, { "epoch": 0.0459375, "grad_norm": 5.21875, "grad_norm_var": 0.071728515625, "learning_rate": 0.0001, "loss": 7.2683, "loss/crossentropy": 2.5311508178710938, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2803528904914856, "step": 1470 }, { "epoch": 0.046, "grad_norm": 4.5625, "grad_norm_var": 0.07107747395833333, "learning_rate": 0.0001, "loss": 7.5451, "loss/crossentropy": 2.698567032814026, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.292463943362236, "step": 1472 }, { "epoch": 0.0460625, "grad_norm": 5.8125, "grad_norm_var": 0.12903238932291666, "learning_rate": 0.0001, "loss": 7.2625, "loss/crossentropy": 2.4582966566085815, "loss/hidden": 2.00390625, "loss/jsd": 0.0, "loss/logits": 0.2800302058458328, "step": 1474 }, { "epoch": 0.046125, "grad_norm": 4.90625, "grad_norm_var": 0.12496337890625, "learning_rate": 0.0001, "loss": 7.458, "loss/crossentropy": 2.7098604440689087, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2830130606889725, "step": 1476 }, { "epoch": 0.0461875, "grad_norm": 5.90625, "grad_norm_var": 0.2095703125, "learning_rate": 0.0001, "loss": 8.0177, "loss/crossentropy": 2.9519091844558716, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.30735622346401215, "step": 1478 }, { "epoch": 0.04625, "grad_norm": 4.78125, "grad_norm_var": 0.21249593098958333, "learning_rate": 0.0001, "loss": 7.5442, "loss/crossentropy": 2.6207966804504395, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2981995493173599, "step": 1480 }, { "epoch": 0.0463125, "grad_norm": 4.96875, "grad_norm_var": 0.26122639973958334, "learning_rate": 0.0001, "loss": 7.6388, "loss/crossentropy": 2.5771687030792236, "loss/hidden": 2.046875, "loss/jsd": 0.0, "loss/logits": 0.3014744073152542, "step": 1482 }, { "epoch": 0.046375, "grad_norm": 5.28125, "grad_norm_var": 0.26610921223958334, "learning_rate": 0.0001, "loss": 7.6108, "loss/crossentropy": 2.6574681997299194, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.29689261317253113, "step": 1484 }, { "epoch": 0.0464375, "grad_norm": 5.0, "grad_norm_var": 0.24306233723958334, "learning_rate": 0.0001, "loss": 7.9907, "loss/crossentropy": 2.8251935243606567, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.31264084577560425, "step": 1486 }, { "epoch": 0.0465, "grad_norm": 4.59375, "grad_norm_var": 0.26248372395833336, "learning_rate": 0.0001, "loss": 7.0941, "loss/crossentropy": 2.5712080001831055, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.26361557841300964, "step": 1488 }, { "epoch": 0.0465625, "grad_norm": 4.25, "grad_norm_var": 0.2503743489583333, "learning_rate": 0.0001, "loss": 7.742, "loss/crossentropy": 2.824060797691345, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.29452717304229736, "step": 1490 }, { "epoch": 0.046625, "grad_norm": 5.125, "grad_norm_var": 0.24933268229166666, "learning_rate": 0.0001, "loss": 7.5885, "loss/crossentropy": 2.7565609216690063, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.28592388331890106, "step": 1492 }, { "epoch": 0.0466875, "grad_norm": 4.6875, "grad_norm_var": 0.16730143229166666, "learning_rate": 0.0001, "loss": 7.2391, "loss/crossentropy": 2.4160306453704834, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2881673574447632, "step": 1494 }, { "epoch": 0.04675, "grad_norm": 4.65625, "grad_norm_var": 0.17081705729166666, "learning_rate": 0.0001, "loss": 7.0716, "loss/crossentropy": 2.5062583684921265, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.26747608184814453, "step": 1496 }, { "epoch": 0.0468125, "grad_norm": 4.84375, "grad_norm_var": 0.11243082682291666, "learning_rate": 0.0001, "loss": 7.6162, "loss/crossentropy": 2.732138156890869, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.29191985726356506, "step": 1498 }, { "epoch": 0.046875, "grad_norm": 4.4375, "grad_norm_var": 0.108447265625, "learning_rate": 0.0001, "loss": 7.2309, "loss/crossentropy": 2.6066497564315796, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2686777710914612, "step": 1500 }, { "epoch": 0.0469375, "grad_norm": 5.21875, "grad_norm_var": 0.12580973307291668, "learning_rate": 0.0001, "loss": 7.8604, "loss/crossentropy": 2.8097376823425293, "loss/hidden": 2.01953125, "loss/jsd": 0.0, "loss/logits": 0.30311088263988495, "step": 1502 }, { "epoch": 0.047, "grad_norm": 4.6875, "grad_norm_var": 0.10774332682291667, "learning_rate": 0.0001, "loss": 7.7376, "loss/crossentropy": 2.753267288208008, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.30116328597068787, "step": 1504 }, { "epoch": 0.0470625, "grad_norm": 4.96875, "grad_norm_var": 0.3541015625, "learning_rate": 0.0001, "loss": 7.3779, "loss/crossentropy": 2.5379709005355835, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.28985071182250977, "step": 1506 }, { "epoch": 0.047125, "grad_norm": 5.6875, "grad_norm_var": 0.37183837890625, "learning_rate": 0.0001, "loss": 7.4822, "loss/crossentropy": 2.641207695007324, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.28605230152606964, "step": 1508 }, { "epoch": 0.0471875, "grad_norm": 4.71875, "grad_norm_var": 0.372119140625, "learning_rate": 0.0001, "loss": 7.5333, "loss/crossentropy": 2.6981176137924194, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.28859612345695496, "step": 1510 }, { "epoch": 0.04725, "grad_norm": 4.59375, "grad_norm_var": 0.36682535807291666, "learning_rate": 0.0001, "loss": 7.5909, "loss/crossentropy": 2.744605541229248, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2854115813970566, "step": 1512 }, { "epoch": 0.0473125, "grad_norm": 4.28125, "grad_norm_var": 0.403369140625, "learning_rate": 0.0001, "loss": 7.3829, "loss/crossentropy": 2.692691683769226, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.27800965309143066, "step": 1514 }, { "epoch": 0.047375, "grad_norm": 5.125, "grad_norm_var": 0.37821858723958335, "learning_rate": 0.0001, "loss": 7.519, "loss/crossentropy": 2.72913920879364, "loss/hidden": 1.99609375, "loss/jsd": 0.0, "loss/logits": 0.27937404811382294, "step": 1516 }, { "epoch": 0.0474375, "grad_norm": 5.0, "grad_norm_var": 0.37154947916666664, "learning_rate": 0.0001, "loss": 7.5057, "loss/crossentropy": 2.726033926010132, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.28265441954135895, "step": 1518 }, { "epoch": 0.0475, "grad_norm": 4.5625, "grad_norm_var": 0.40818684895833335, "learning_rate": 0.0001, "loss": 7.2604, "loss/crossentropy": 2.455228567123413, "loss/hidden": 1.98828125, "loss/jsd": 0.0, "loss/logits": 0.2816852927207947, "step": 1520 }, { "epoch": 0.0475625, "grad_norm": 4.625, "grad_norm_var": 0.17095947265625, "learning_rate": 0.0001, "loss": 6.9734, "loss/crossentropy": 2.4943045377731323, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2604096084833145, "step": 1522 }, { "epoch": 0.047625, "grad_norm": 4.78125, "grad_norm_var": 0.13645833333333332, "learning_rate": 0.0001, "loss": 7.0076, "loss/crossentropy": 2.4553334712982178, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2626507952809334, "step": 1524 }, { "epoch": 0.0476875, "grad_norm": 4.625, "grad_norm_var": 0.12654622395833334, "learning_rate": 0.0001, "loss": 7.3117, "loss/crossentropy": 2.6219717264175415, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.27561162412166595, "step": 1526 }, { "epoch": 0.04775, "grad_norm": 5.21875, "grad_norm_var": 0.13336181640625, "learning_rate": 0.0001, "loss": 7.8256, "loss/crossentropy": 2.863209366798401, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.3028797209262848, "step": 1528 }, { "epoch": 0.0478125, "grad_norm": 4.8125, "grad_norm_var": 0.11070556640625, "learning_rate": 0.0001, "loss": 7.9175, "loss/crossentropy": 2.9292283058166504, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.30116941034793854, "step": 1530 }, { "epoch": 0.047875, "grad_norm": 4.875, "grad_norm_var": 0.10513916015625, "learning_rate": 0.0001, "loss": 7.1054, "loss/crossentropy": 2.462781071662903, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.2662145644426346, "step": 1532 }, { "epoch": 0.0479375, "grad_norm": 4.78125, "grad_norm_var": 0.10234375, "learning_rate": 0.0001, "loss": 7.2958, "loss/crossentropy": 2.6526283025741577, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2740848809480667, "step": 1534 }, { "epoch": 0.048, "grad_norm": 5.25, "grad_norm_var": 0.04882405598958333, "learning_rate": 0.0001, "loss": 7.1055, "loss/crossentropy": 2.424253225326538, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.27398423850536346, "step": 1536 }, { "epoch": 0.0480625, "grad_norm": 5.46875, "grad_norm_var": 0.06599934895833333, "learning_rate": 0.0001, "loss": 7.5641, "loss/crossentropy": 2.689168930053711, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.28827279806137085, "step": 1538 }, { "epoch": 0.048125, "grad_norm": 5.34375, "grad_norm_var": 0.07213541666666666, "learning_rate": 0.0001, "loss": 7.7088, "loss/crossentropy": 2.7261996269226074, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.30255410075187683, "step": 1540 }, { "epoch": 0.0481875, "grad_norm": 4.78125, "grad_norm_var": 0.08136393229166666, "learning_rate": 0.0001, "loss": 6.9039, "loss/crossentropy": 2.494332194328308, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.24916400015354156, "step": 1542 }, { "epoch": 0.04825, "grad_norm": 4.90625, "grad_norm_var": 0.06834309895833333, "learning_rate": 0.0001, "loss": 7.4646, "loss/crossentropy": 2.5675861835479736, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2904863655567169, "step": 1544 }, { "epoch": 0.0483125, "grad_norm": 5.03125, "grad_norm_var": 0.06894124348958333, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.6678693294525146, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.2820626497268677, "step": 1546 }, { "epoch": 0.048375, "grad_norm": 4.75, "grad_norm_var": 0.06946207682291666, "learning_rate": 0.0001, "loss": 7.3052, "loss/crossentropy": 2.563589096069336, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.27650561928749084, "step": 1548 }, { "epoch": 0.0484375, "grad_norm": 4.75, "grad_norm_var": 0.07548421223958333, "learning_rate": 0.0001, "loss": 7.7503, "loss/crossentropy": 2.8281824588775635, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2937776744365692, "step": 1550 }, { "epoch": 0.0485, "grad_norm": 4.625, "grad_norm_var": 0.06834309895833333, "learning_rate": 0.0001, "loss": 7.2171, "loss/crossentropy": 2.690904974937439, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.263553261756897, "step": 1552 }, { "epoch": 0.0485625, "grad_norm": 5.09375, "grad_norm_var": 0.05310872395833333, "learning_rate": 0.0001, "loss": 7.1013, "loss/crossentropy": 2.5671788454055786, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.26082948595285416, "step": 1554 }, { "epoch": 0.048625, "grad_norm": 5.21875, "grad_norm_var": 0.04674479166666667, "learning_rate": 0.0001, "loss": 7.3052, "loss/crossentropy": 2.6749120950698853, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.27279847860336304, "step": 1556 }, { "epoch": 0.0486875, "grad_norm": 4.625, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 7.3388, "loss/crossentropy": 2.7545392513275146, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2685810327529907, "step": 1558 }, { "epoch": 0.04875, "grad_norm": 6.875, "grad_norm_var": 0.2933553059895833, "learning_rate": 0.0001, "loss": 7.5524, "loss/crossentropy": 2.611665725708008, "loss/hidden": 2.00390625, "loss/jsd": 0.0, "loss/logits": 0.2936825156211853, "step": 1560 }, { "epoch": 0.0488125, "grad_norm": 5.4375, "grad_norm_var": 0.3044230143229167, "learning_rate": 0.0001, "loss": 7.5063, "loss/crossentropy": 2.619522213935852, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.29141077399253845, "step": 1562 }, { "epoch": 0.048875, "grad_norm": 6.25, "grad_norm_var": 0.4083170572916667, "learning_rate": 0.0001, "loss": 7.5491, "loss/crossentropy": 2.712641716003418, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.2856023460626602, "step": 1564 }, { "epoch": 0.0489375, "grad_norm": 5.46875, "grad_norm_var": 0.4775390625, "learning_rate": 0.0001, "loss": 7.9109, "loss/crossentropy": 2.710148334503174, "loss/hidden": 2.0390625, "loss/jsd": 0.0, "loss/logits": 0.31617045402526855, "step": 1566 }, { "epoch": 0.049, "grad_norm": 4.625, "grad_norm_var": 0.49078369140625, "learning_rate": 0.0001, "loss": 7.2407, "loss/crossentropy": 2.6541051864624023, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.27155420184135437, "step": 1568 }, { "epoch": 0.0490625, "grad_norm": 4.65625, "grad_norm_var": 0.4930948893229167, "learning_rate": 0.0001, "loss": 7.4394, "loss/crossentropy": 2.746522545814514, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.28022365272045135, "step": 1570 }, { "epoch": 0.049125, "grad_norm": 4.65625, "grad_norm_var": 0.49719645182291666, "learning_rate": 0.0001, "loss": 7.3557, "loss/crossentropy": 2.669800877571106, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.27991996705532074, "step": 1572 }, { "epoch": 0.0491875, "grad_norm": 5.03125, "grad_norm_var": 0.48879801432291664, "learning_rate": 0.0001, "loss": 7.5363, "loss/crossentropy": 2.7022202014923096, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.28497424721717834, "step": 1574 }, { "epoch": 0.04925, "grad_norm": 4.375, "grad_norm_var": 0.3167805989583333, "learning_rate": 0.0001, "loss": 7.1079, "loss/crossentropy": 2.5182780027389526, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.26834164559841156, "step": 1576 }, { "epoch": 0.0493125, "grad_norm": 5.5625, "grad_norm_var": 0.32958577473958334, "learning_rate": 0.0001, "loss": 7.4399, "loss/crossentropy": 2.5361671447753906, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2977970540523529, "step": 1578 }, { "epoch": 0.049375, "grad_norm": 5.0625, "grad_norm_var": 0.24976806640625, "learning_rate": 0.0001, "loss": 7.2944, "loss/crossentropy": 2.7187254428863525, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2677236646413803, "step": 1580 }, { "epoch": 0.0494375, "grad_norm": 4.96875, "grad_norm_var": 0.11907145182291666, "learning_rate": 0.0001, "loss": 6.5479, "loss/crossentropy": 2.1634461879730225, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.24937818944454193, "step": 1582 }, { "epoch": 0.0495, "grad_norm": 4.375, "grad_norm_var": 0.128759765625, "learning_rate": 0.0001, "loss": 7.1655, "loss/crossentropy": 2.545617699623108, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.27449171245098114, "step": 1584 }, { "epoch": 0.0495625, "grad_norm": 5.0, "grad_norm_var": 0.13020833333333334, "learning_rate": 0.0001, "loss": 7.8215, "loss/crossentropy": 2.8528627157211304, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2976495623588562, "step": 1586 }, { "epoch": 0.049625, "grad_norm": 4.78125, "grad_norm_var": 0.128759765625, "learning_rate": 0.0001, "loss": 7.2978, "loss/crossentropy": 2.632421135902405, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.27435266971588135, "step": 1588 }, { "epoch": 0.0496875, "grad_norm": 6.75, "grad_norm_var": 0.363671875, "learning_rate": 0.0001, "loss": 7.6244, "loss/crossentropy": 2.7735743522644043, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2917233109474182, "step": 1590 }, { "epoch": 0.04975, "grad_norm": 6.5625, "grad_norm_var": 0.49176025390625, "learning_rate": 0.0001, "loss": 7.995, "loss/crossentropy": 2.765578508377075, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.3252904415130615, "step": 1592 }, { "epoch": 0.0498125, "grad_norm": 5.75, "grad_norm_var": 0.51353759765625, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.6407854557037354, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.298612505197525, "step": 1594 }, { "epoch": 0.049875, "grad_norm": 5.5625, "grad_norm_var": 0.46640218098958336, "learning_rate": 0.0001, "loss": 7.3369, "loss/crossentropy": 2.433140516281128, "loss/hidden": 2.03515625, "loss/jsd": 0.0, "loss/logits": 0.2868652194738388, "step": 1596 }, { "epoch": 0.0499375, "grad_norm": 4.5625, "grad_norm_var": 0.4823527018229167, "learning_rate": 0.0001, "loss": 7.2654, "loss/crossentropy": 2.6077977418899536, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2731828987598419, "step": 1598 }, { "epoch": 0.05, "grad_norm": 5.3125, "grad_norm_var": 0.43136393229166664, "learning_rate": 0.0001, "loss": 7.3095, "loss/crossentropy": 2.6041181087493896, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2736643999814987, "step": 1600 }, { "epoch": 0.0500625, "grad_norm": 5.34375, "grad_norm_var": 0.40315348307291665, "learning_rate": 0.0001, "loss": 7.1878, "loss/crossentropy": 2.42449951171875, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2821933627128601, "step": 1602 }, { "epoch": 0.050125, "grad_norm": 5.46875, "grad_norm_var": 0.35279947916666665, "learning_rate": 0.0001, "loss": 7.665, "loss/crossentropy": 2.7596222162246704, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.29483360052108765, "step": 1604 }, { "epoch": 0.0501875, "grad_norm": 4.65625, "grad_norm_var": 0.26431884765625, "learning_rate": 0.0001, "loss": 7.2976, "loss/crossentropy": 2.743039131164551, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.26444223523139954, "step": 1606 }, { "epoch": 0.05025, "grad_norm": 5.34375, "grad_norm_var": 0.141015625, "learning_rate": 0.0001, "loss": 7.515, "loss/crossentropy": 2.707811117172241, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2830655127763748, "step": 1608 }, { "epoch": 0.0503125, "grad_norm": 4.53125, "grad_norm_var": 0.12550455729166668, "learning_rate": 0.0001, "loss": 7.175, "loss/crossentropy": 2.577424645423889, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2695184051990509, "step": 1610 }, { "epoch": 0.050375, "grad_norm": 5.4375, "grad_norm_var": 0.11591389973958334, "learning_rate": 0.0001, "loss": 7.1828, "loss/crossentropy": 2.5045968294143677, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2693790942430496, "step": 1612 }, { "epoch": 0.0504375, "grad_norm": 5.0, "grad_norm_var": 0.10076497395833334, "learning_rate": 0.0001, "loss": 7.6011, "loss/crossentropy": 2.6588661670684814, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.3004702776670456, "step": 1614 }, { "epoch": 0.0505, "grad_norm": 4.5, "grad_norm_var": 0.12099202473958333, "learning_rate": 0.0001, "loss": 7.3598, "loss/crossentropy": 2.62858247756958, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.28093548119068146, "step": 1616 }, { "epoch": 0.0505625, "grad_norm": 4.46875, "grad_norm_var": 0.13435872395833334, "learning_rate": 0.0001, "loss": 7.2105, "loss/crossentropy": 2.5444538593292236, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.27363789081573486, "step": 1618 }, { "epoch": 0.050625, "grad_norm": 5.28125, "grad_norm_var": 0.12854410807291666, "learning_rate": 0.0001, "loss": 7.3259, "loss/crossentropy": 2.6291420459747314, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2778768092393875, "step": 1620 }, { "epoch": 0.0506875, "grad_norm": 4.46875, "grad_norm_var": 0.15154622395833334, "learning_rate": 0.0001, "loss": 7.2225, "loss/crossentropy": 2.6978687047958374, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2661316245794296, "step": 1622 }, { "epoch": 0.05075, "grad_norm": 4.84375, "grad_norm_var": 0.14163004557291667, "learning_rate": 0.0001, "loss": 7.2325, "loss/crossentropy": 2.544032335281372, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.27861447632312775, "step": 1624 }, { "epoch": 0.0508125, "grad_norm": 5.03125, "grad_norm_var": 0.14329427083333332, "learning_rate": 0.0001, "loss": 7.3249, "loss/crossentropy": 2.662850856781006, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.275974377989769, "step": 1626 }, { "epoch": 0.050875, "grad_norm": 5.3125, "grad_norm_var": 0.13893229166666668, "learning_rate": 0.0001, "loss": 7.4051, "loss/crossentropy": 2.6046046018600464, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.2827852815389633, "step": 1628 }, { "epoch": 0.0509375, "grad_norm": 4.8125, "grad_norm_var": 0.107666015625, "learning_rate": 0.0001, "loss": 7.3616, "loss/crossentropy": 2.651221752166748, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.27963121235370636, "step": 1630 }, { "epoch": 0.051, "grad_norm": 4.6875, "grad_norm_var": 0.14420166015625, "learning_rate": 0.0001, "loss": 7.4135, "loss/crossentropy": 2.646748185157776, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.2794112116098404, "step": 1632 }, { "epoch": 0.0510625, "grad_norm": 5.15625, "grad_norm_var": 0.137744140625, "learning_rate": 0.0001, "loss": 7.5045, "loss/crossentropy": 2.708313465118408, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2858666926622391, "step": 1634 }, { "epoch": 0.051125, "grad_norm": 5.34375, "grad_norm_var": 0.13944905598958332, "learning_rate": 0.0001, "loss": 7.6362, "loss/crossentropy": 2.700982093811035, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.2978232800960541, "step": 1636 }, { "epoch": 0.0511875, "grad_norm": 5.34375, "grad_norm_var": 0.6510050455729167, "learning_rate": 0.0001, "loss": 7.2766, "loss/crossentropy": 2.491866707801819, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.28003498911857605, "step": 1638 }, { "epoch": 0.05125, "grad_norm": 4.40625, "grad_norm_var": 0.6700480143229167, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.732044219970703, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.2805637717247009, "step": 1640 }, { "epoch": 0.0513125, "grad_norm": 5.1875, "grad_norm_var": 0.6787109375, "learning_rate": 0.0001, "loss": 7.5031, "loss/crossentropy": 2.650493025779724, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.29072538018226624, "step": 1642 }, { "epoch": 0.051375, "grad_norm": 5.125, "grad_norm_var": 0.656494140625, "learning_rate": 0.0001, "loss": 7.6908, "loss/crossentropy": 2.7976644039154053, "loss/hidden": 2.03125, "loss/jsd": 0.0, "loss/logits": 0.28619284927845, "step": 1644 }, { "epoch": 0.0514375, "grad_norm": 4.75, "grad_norm_var": 0.6514933268229167, "learning_rate": 0.0001, "loss": 7.557, "loss/crossentropy": 2.8387088775634766, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.2788640707731247, "step": 1646 }, { "epoch": 0.0515, "grad_norm": 4.65625, "grad_norm_var": 0.649462890625, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.669367551803589, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2760390490293503, "step": 1648 }, { "epoch": 0.0515625, "grad_norm": 4.71875, "grad_norm_var": 0.6587198893229167, "learning_rate": 0.0001, "loss": 7.625, "loss/crossentropy": 2.7615777254104614, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.29259105026721954, "step": 1650 }, { "epoch": 0.051625, "grad_norm": 5.0, "grad_norm_var": 0.65562744140625, "learning_rate": 0.0001, "loss": 7.8787, "loss/crossentropy": 2.975517749786377, "loss/hidden": 1.98046875, "loss/jsd": 0.0, "loss/logits": 0.29227523505687714, "step": 1652 }, { "epoch": 0.0516875, "grad_norm": 4.46875, "grad_norm_var": 0.15979410807291666, "learning_rate": 0.0001, "loss": 7.3841, "loss/crossentropy": 2.7546948194503784, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2719228267669678, "step": 1654 }, { "epoch": 0.05175, "grad_norm": 4.40625, "grad_norm_var": 0.1611328125, "learning_rate": 0.0001, "loss": 7.5601, "loss/crossentropy": 2.7170467376708984, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2925069183111191, "step": 1656 }, { "epoch": 0.0518125, "grad_norm": 4.53125, "grad_norm_var": 0.07076416015625, "learning_rate": 0.0001, "loss": 7.4027, "loss/crossentropy": 2.742368459701538, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.274629682302475, "step": 1658 }, { "epoch": 0.051875, "grad_norm": 4.78125, "grad_norm_var": 0.06467692057291667, "learning_rate": 0.0001, "loss": 7.491, "loss/crossentropy": 2.882826805114746, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2697974443435669, "step": 1660 }, { "epoch": 0.0519375, "grad_norm": 5.03125, "grad_norm_var": 0.072119140625, "learning_rate": 0.0001, "loss": 7.0602, "loss/crossentropy": 2.6344101428985596, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2574244886636734, "step": 1662 }, { "epoch": 0.052, "grad_norm": 4.84375, "grad_norm_var": 0.07496337890625, "learning_rate": 0.0001, "loss": 7.5474, "loss/crossentropy": 2.7764055728912354, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2860814034938812, "step": 1664 }, { "epoch": 0.0520625, "grad_norm": 4.5, "grad_norm_var": 0.07120768229166667, "learning_rate": 0.0001, "loss": 7.333, "loss/crossentropy": 2.6493825912475586, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2773485779762268, "step": 1666 }, { "epoch": 0.052125, "grad_norm": 5.125, "grad_norm_var": 0.07226155598958334, "learning_rate": 0.0001, "loss": 7.801, "loss/crossentropy": 2.91958749294281, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.2924348711967468, "step": 1668 }, { "epoch": 0.0521875, "grad_norm": 4.75, "grad_norm_var": 0.09518229166666667, "learning_rate": 0.0001, "loss": 6.945, "loss/crossentropy": 2.4583855867385864, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.258427232503891, "step": 1670 }, { "epoch": 0.05225, "grad_norm": 4.90625, "grad_norm_var": 0.07610677083333334, "learning_rate": 0.0001, "loss": 7.5141, "loss/crossentropy": 2.660768151283264, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.2861160635948181, "step": 1672 }, { "epoch": 0.0523125, "grad_norm": 5.53125, "grad_norm_var": 0.10065104166666666, "learning_rate": 0.0001, "loss": 7.4877, "loss/crossentropy": 2.635893940925598, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2867467477917671, "step": 1674 }, { "epoch": 0.052375, "grad_norm": 4.75, "grad_norm_var": 0.096875, "learning_rate": 0.0001, "loss": 7.3388, "loss/crossentropy": 2.656780481338501, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.27952560782432556, "step": 1676 }, { "epoch": 0.0524375, "grad_norm": 4.34375, "grad_norm_var": 0.10279947916666667, "learning_rate": 0.0001, "loss": 7.4772, "loss/crossentropy": 2.795539140701294, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2771495431661606, "step": 1678 }, { "epoch": 0.0525, "grad_norm": 5.15625, "grad_norm_var": 0.11968994140625, "learning_rate": 0.0001, "loss": 7.3803, "loss/crossentropy": 2.764199137687683, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2713741958141327, "step": 1680 }, { "epoch": 0.0525625, "grad_norm": 5.6875, "grad_norm_var": 0.148046875, "learning_rate": 0.0001, "loss": 7.4873, "loss/crossentropy": 2.785406231880188, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.27917732298374176, "step": 1682 }, { "epoch": 0.052625, "grad_norm": 5.0625, "grad_norm_var": 0.17476806640625, "learning_rate": 0.0001, "loss": 7.2824, "loss/crossentropy": 2.638624429702759, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.27375538647174835, "step": 1684 }, { "epoch": 0.0526875, "grad_norm": 4.15625, "grad_norm_var": 0.18625895182291666, "learning_rate": 0.0001, "loss": 7.5929, "loss/crossentropy": 2.8473161458969116, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.2796381860971451, "step": 1686 }, { "epoch": 0.05275, "grad_norm": 5.46875, "grad_norm_var": 0.21200764973958333, "learning_rate": 0.0001, "loss": 7.5646, "loss/crossentropy": 2.6495203971862793, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2930660992860794, "step": 1688 }, { "epoch": 0.0528125, "grad_norm": 4.875, "grad_norm_var": 0.18631184895833333, "learning_rate": 0.0001, "loss": 7.16, "loss/crossentropy": 2.5938304662704468, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.26715944707393646, "step": 1690 }, { "epoch": 0.052875, "grad_norm": 4.90625, "grad_norm_var": 0.19464518229166666, "learning_rate": 0.0001, "loss": 7.2846, "loss/crossentropy": 2.5637892484664917, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.27832843363285065, "step": 1692 }, { "epoch": 0.0529375, "grad_norm": 4.75, "grad_norm_var": 0.19178059895833333, "learning_rate": 0.0001, "loss": 7.7704, "loss/crossentropy": 2.8994187116622925, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.29217809438705444, "step": 1694 }, { "epoch": 0.053, "grad_norm": 4.75, "grad_norm_var": 0.174853515625, "learning_rate": 0.0001, "loss": 7.0468, "loss/crossentropy": 2.444552779197693, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2692095637321472, "step": 1696 }, { "epoch": 0.0530625, "grad_norm": 4.96875, "grad_norm_var": 0.12893473307291667, "learning_rate": 0.0001, "loss": 7.7064, "loss/crossentropy": 2.796920657157898, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.29368677735328674, "step": 1698 }, { "epoch": 0.053125, "grad_norm": 5.28125, "grad_norm_var": 0.12107747395833333, "learning_rate": 0.0001, "loss": 7.6303, "loss/crossentropy": 2.813141107559204, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.2887490391731262, "step": 1700 }, { "epoch": 0.0531875, "grad_norm": 4.84375, "grad_norm_var": 0.09192301432291666, "learning_rate": 0.0001, "loss": 7.3593, "loss/crossentropy": 2.739987373352051, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.27130600810050964, "step": 1702 }, { "epoch": 0.05325, "grad_norm": 4.625, "grad_norm_var": 0.07307535807291667, "learning_rate": 0.0001, "loss": 7.1669, "loss/crossentropy": 2.6518582105636597, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2624403387308121, "step": 1704 }, { "epoch": 0.0533125, "grad_norm": 5.0625, "grad_norm_var": 0.06300455729166667, "learning_rate": 0.0001, "loss": 7.661, "loss/crossentropy": 2.716781973838806, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2975441813468933, "step": 1706 }, { "epoch": 0.053375, "grad_norm": 4.53125, "grad_norm_var": 0.05813802083333333, "learning_rate": 0.0001, "loss": 7.1827, "loss/crossentropy": 2.491211175918579, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.27813446521759033, "step": 1708 }, { "epoch": 0.0534375, "grad_norm": 4.78125, "grad_norm_var": 0.07604166666666666, "learning_rate": 0.0001, "loss": 7.0346, "loss/crossentropy": 2.6145761013031006, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.25840601325035095, "step": 1710 }, { "epoch": 0.0535, "grad_norm": 4.6875, "grad_norm_var": 0.07730712890625, "learning_rate": 0.0001, "loss": 7.2211, "loss/crossentropy": 2.608027696609497, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.27067790925502777, "step": 1712 }, { "epoch": 0.0535625, "grad_norm": 6.6875, "grad_norm_var": 0.30857747395833335, "learning_rate": 0.0001, "loss": 7.2801, "loss/crossentropy": 2.5958139896392822, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2766307219862938, "step": 1714 }, { "epoch": 0.053625, "grad_norm": 6.5625, "grad_norm_var": 0.4998982747395833, "learning_rate": 0.0001, "loss": 7.4305, "loss/crossentropy": 2.6899927854537964, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2803005874156952, "step": 1716 }, { "epoch": 0.0536875, "grad_norm": 5.4375, "grad_norm_var": 0.5143513997395833, "learning_rate": 0.0001, "loss": 7.3846, "loss/crossentropy": 2.563607931137085, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2852242588996887, "step": 1718 }, { "epoch": 0.05375, "grad_norm": 5.40625, "grad_norm_var": 0.5321451822916666, "learning_rate": 0.0001, "loss": 7.3026, "loss/crossentropy": 2.646055579185486, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.2734649181365967, "step": 1720 }, { "epoch": 0.0538125, "grad_norm": 4.3125, "grad_norm_var": 0.5678995768229167, "learning_rate": 0.0001, "loss": 6.968, "loss/crossentropy": 2.5653910636901855, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.25354039669036865, "step": 1722 }, { "epoch": 0.053875, "grad_norm": 5.28125, "grad_norm_var": 0.5819620768229167, "learning_rate": 0.0001, "loss": 7.4924, "loss/crossentropy": 2.834080696105957, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.27403783798217773, "step": 1724 }, { "epoch": 0.0539375, "grad_norm": 4.5, "grad_norm_var": 0.6571248372395834, "learning_rate": 0.0001, "loss": 7.7559, "loss/crossentropy": 2.8956801891326904, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2883637845516205, "step": 1726 }, { "epoch": 0.054, "grad_norm": 4.3125, "grad_norm_var": 0.7003743489583333, "learning_rate": 0.0001, "loss": 7.1868, "loss/crossentropy": 2.660465121269226, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.26670072972774506, "step": 1728 }, { "epoch": 0.0540625, "grad_norm": 4.90625, "grad_norm_var": 0.47978108723958335, "learning_rate": 0.0001, "loss": 7.3968, "loss/crossentropy": 2.6902763843536377, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.27768830955028534, "step": 1730 }, { "epoch": 0.054125, "grad_norm": 5.1875, "grad_norm_var": 0.2855428059895833, "learning_rate": 0.0001, "loss": 7.5651, "loss/crossentropy": 2.790587067604065, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.28643281757831573, "step": 1732 }, { "epoch": 0.0541875, "grad_norm": 4.8125, "grad_norm_var": 0.2678019205729167, "learning_rate": 0.0001, "loss": 7.267, "loss/crossentropy": 2.6578781604766846, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.27107058465480804, "step": 1734 }, { "epoch": 0.05425, "grad_norm": 5.03125, "grad_norm_var": 0.22975260416666668, "learning_rate": 0.0001, "loss": 7.4316, "loss/crossentropy": 2.717787742614746, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.2776351869106293, "step": 1736 }, { "epoch": 0.0543125, "grad_norm": 4.71875, "grad_norm_var": 0.20924072265625, "learning_rate": 0.0001, "loss": 7.4847, "loss/crossentropy": 2.774222254753113, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.27885735034942627, "step": 1738 }, { "epoch": 0.054375, "grad_norm": 5.03125, "grad_norm_var": 0.2652180989583333, "learning_rate": 0.0001, "loss": 7.2475, "loss/crossentropy": 2.5218154191970825, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.27608276903629303, "step": 1740 }, { "epoch": 0.0544375, "grad_norm": 4.65625, "grad_norm_var": 0.16047770182291668, "learning_rate": 0.0001, "loss": 7.3771, "loss/crossentropy": 2.7365787029266357, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.27381476759910583, "step": 1742 }, { "epoch": 0.0545, "grad_norm": 4.375, "grad_norm_var": 0.17029622395833333, "learning_rate": 0.0001, "loss": 7.1074, "loss/crossentropy": 2.650601625442505, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.2585696280002594, "step": 1744 }, { "epoch": 0.0545625, "grad_norm": 5.4375, "grad_norm_var": 0.19088134765625, "learning_rate": 0.0001, "loss": 7.2373, "loss/crossentropy": 2.4645010232925415, "loss/hidden": 1.9609375, "loss/jsd": 0.0, "loss/logits": 0.2811823785305023, "step": 1746 }, { "epoch": 0.054625, "grad_norm": 4.8125, "grad_norm_var": 0.18046875, "learning_rate": 0.0001, "loss": 7.0984, "loss/crossentropy": 2.5452595949172974, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2584358751773834, "step": 1748 }, { "epoch": 0.0546875, "grad_norm": 4.90625, "grad_norm_var": 0.183837890625, "learning_rate": 0.0001, "loss": 7.7758, "loss/crossentropy": 2.812344789505005, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.3026003837585449, "step": 1750 }, { "epoch": 0.05475, "grad_norm": 4.71875, "grad_norm_var": 0.18756103515625, "learning_rate": 0.0001, "loss": 7.1996, "loss/crossentropy": 2.7080670595169067, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2612666040658951, "step": 1752 }, { "epoch": 0.0548125, "grad_norm": 4.84375, "grad_norm_var": 0.19273681640625, "learning_rate": 0.0001, "loss": 7.3367, "loss/crossentropy": 2.656891942024231, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.2797033041715622, "step": 1754 }, { "epoch": 0.054875, "grad_norm": 4.5, "grad_norm_var": 0.11409098307291667, "learning_rate": 0.0001, "loss": 7.1449, "loss/crossentropy": 2.577438712120056, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2704222649335861, "step": 1756 }, { "epoch": 0.0549375, "grad_norm": 4.875, "grad_norm_var": 0.11855061848958333, "learning_rate": 0.0001, "loss": 7.4559, "loss/crossentropy": 2.8306283950805664, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2734677642583847, "step": 1758 }, { "epoch": 0.055, "grad_norm": 6.5625, "grad_norm_var": 0.266650390625, "learning_rate": 0.0001, "loss": 7.3112, "loss/crossentropy": 2.6345953941345215, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2743034064769745, "step": 1760 }, { "epoch": 0.0550625, "grad_norm": 4.6875, "grad_norm_var": 0.25494791666666666, "learning_rate": 0.0001, "loss": 7.4958, "loss/crossentropy": 2.742944121360779, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.2823123633861542, "step": 1762 }, { "epoch": 0.055125, "grad_norm": 4.96875, "grad_norm_var": 0.2669921875, "learning_rate": 0.0001, "loss": 7.3314, "loss/crossentropy": 2.6937159299850464, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2735290676355362, "step": 1764 }, { "epoch": 0.0551875, "grad_norm": 4.375, "grad_norm_var": 0.2847493489583333, "learning_rate": 0.0001, "loss": 7.0821, "loss/crossentropy": 2.6495813131332397, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2569257989525795, "step": 1766 }, { "epoch": 0.05525, "grad_norm": 5.0, "grad_norm_var": 0.2819295247395833, "learning_rate": 0.0001, "loss": 7.654, "loss/crossentropy": 2.885606050491333, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2842566519975662, "step": 1768 }, { "epoch": 0.0553125, "grad_norm": 4.59375, "grad_norm_var": 0.28938395182291665, "learning_rate": 0.0001, "loss": 7.2599, "loss/crossentropy": 2.683700203895569, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2662117928266525, "step": 1770 }, { "epoch": 0.055375, "grad_norm": 4.28125, "grad_norm_var": 0.297509765625, "learning_rate": 0.0001, "loss": 7.4357, "loss/crossentropy": 2.809111475944519, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2751620411872864, "step": 1772 }, { "epoch": 0.0554375, "grad_norm": 4.3125, "grad_norm_var": 0.29944254557291666, "learning_rate": 0.0001, "loss": 7.3595, "loss/crossentropy": 2.7361397743225098, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2728865295648575, "step": 1774 }, { "epoch": 0.0555, "grad_norm": 4.625, "grad_norm_var": 0.071484375, "learning_rate": 0.0001, "loss": 7.2885, "loss/crossentropy": 2.7563177347183228, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.26727694272994995, "step": 1776 }, { "epoch": 0.0555625, "grad_norm": 4.6875, "grad_norm_var": 0.06933186848958334, "learning_rate": 0.0001, "loss": 7.1418, "loss/crossentropy": 2.609483480453491, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2672937512397766, "step": 1778 }, { "epoch": 0.055625, "grad_norm": 4.71875, "grad_norm_var": 0.060009765625, "learning_rate": 0.0001, "loss": 7.373, "loss/crossentropy": 2.7315728664398193, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2750767469406128, "step": 1780 }, { "epoch": 0.0556875, "grad_norm": 4.875, "grad_norm_var": 0.05640869140625, "learning_rate": 0.0001, "loss": 7.1895, "loss/crossentropy": 2.5997053384780884, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2679608315229416, "step": 1782 }, { "epoch": 0.05575, "grad_norm": 4.25, "grad_norm_var": 0.047509765625, "learning_rate": 0.0001, "loss": 7.2201, "loss/crossentropy": 2.680380940437317, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2641296237707138, "step": 1784 }, { "epoch": 0.0558125, "grad_norm": 4.28125, "grad_norm_var": 0.053629557291666664, "learning_rate": 0.0001, "loss": 7.1739, "loss/crossentropy": 2.6020020246505737, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.27007773518562317, "step": 1786 }, { "epoch": 0.055875, "grad_norm": 4.78125, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 7.3154, "loss/crossentropy": 2.705802083015442, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.26838430762290955, "step": 1788 }, { "epoch": 0.0559375, "grad_norm": 7.71875, "grad_norm_var": 0.6591796875, "learning_rate": 0.0001, "loss": 7.5472, "loss/crossentropy": 2.700193166732788, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2913414239883423, "step": 1790 }, { "epoch": 0.056, "grad_norm": 5.34375, "grad_norm_var": 0.67265625, "learning_rate": 0.0001, "loss": 7.6119, "loss/crossentropy": 2.830447196960449, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.28361743688583374, "step": 1792 }, { "epoch": 0.0560625, "grad_norm": 4.5, "grad_norm_var": 0.660009765625, "learning_rate": 0.0001, "loss": 7.2971, "loss/crossentropy": 2.7628878355026245, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.26201970875263214, "step": 1794 }, { "epoch": 0.056125, "grad_norm": 5.03125, "grad_norm_var": 0.7589680989583333, "learning_rate": 0.0001, "loss": 7.3627, "loss/crossentropy": 2.752334475517273, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2747083008289337, "step": 1796 }, { "epoch": 0.0561875, "grad_norm": 5.65625, "grad_norm_var": 0.7759765625, "learning_rate": 0.0001, "loss": 7.6213, "loss/crossentropy": 2.984766721725464, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.27263370156288147, "step": 1798 }, { "epoch": 0.05625, "grad_norm": 4.625, "grad_norm_var": 0.732275390625, "learning_rate": 0.0001, "loss": 7.3488, "loss/crossentropy": 2.7198057174682617, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.2746191918849945, "step": 1800 }, { "epoch": 0.0563125, "grad_norm": 4.46875, "grad_norm_var": 0.7081868489583333, "learning_rate": 0.0001, "loss": 7.2877, "loss/crossentropy": 2.6790480613708496, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.27297329902648926, "step": 1802 }, { "epoch": 0.056375, "grad_norm": 5.21875, "grad_norm_var": 0.69107666015625, "learning_rate": 0.0001, "loss": 6.8517, "loss/crossentropy": 2.3501633405685425, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.26148542761802673, "step": 1804 }, { "epoch": 0.0564375, "grad_norm": 4.53125, "grad_norm_var": 0.22669270833333333, "learning_rate": 0.0001, "loss": 7.5052, "loss/crossentropy": 2.75057852268219, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.28718413412570953, "step": 1806 }, { "epoch": 0.0565, "grad_norm": 4.71875, "grad_norm_var": 0.22301025390625, "learning_rate": 0.0001, "loss": 7.2447, "loss/crossentropy": 2.6712071895599365, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.26633328199386597, "step": 1808 }, { "epoch": 0.0565625, "grad_norm": 5.3125, "grad_norm_var": 0.220947265625, "learning_rate": 0.0001, "loss": 7.5968, "loss/crossentropy": 2.8121140003204346, "loss/hidden": 1.9921875, "loss/jsd": 0.0, "loss/logits": 0.27925361692905426, "step": 1810 }, { "epoch": 0.056625, "grad_norm": 4.6875, "grad_norm_var": 0.1205078125, "learning_rate": 0.0001, "loss": 7.1545, "loss/crossentropy": 2.555493950843811, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.27317963540554047, "step": 1812 }, { "epoch": 0.0566875, "grad_norm": 4.15625, "grad_norm_var": 0.11138916015625, "learning_rate": 0.0001, "loss": 6.9804, "loss/crossentropy": 2.5900630950927734, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.25583045184612274, "step": 1814 }, { "epoch": 0.05675, "grad_norm": 5.4375, "grad_norm_var": 0.13201497395833334, "learning_rate": 0.0001, "loss": 7.5876, "loss/crossentropy": 2.8362863063812256, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2809862047433853, "step": 1816 }, { "epoch": 0.0568125, "grad_norm": 5.03125, "grad_norm_var": 0.11678059895833333, "learning_rate": 0.0001, "loss": 7.6393, "loss/crossentropy": 2.8320658206939697, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.29009483754634857, "step": 1818 }, { "epoch": 0.056875, "grad_norm": 4.5625, "grad_norm_var": 0.11044514973958333, "learning_rate": 0.0001, "loss": 7.4099, "loss/crossentropy": 2.7755016088485718, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.27359677851200104, "step": 1820 }, { "epoch": 0.0569375, "grad_norm": 4.1875, "grad_norm_var": 0.1587890625, "learning_rate": 0.0001, "loss": 6.747, "loss/crossentropy": 2.4457638263702393, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.24731196463108063, "step": 1822 }, { "epoch": 0.057, "grad_norm": 4.34375, "grad_norm_var": 0.158056640625, "learning_rate": 0.0001, "loss": 7.3074, "loss/crossentropy": 2.709230422973633, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.2711452543735504, "step": 1824 }, { "epoch": 0.0570625, "grad_norm": 5.15625, "grad_norm_var": 0.14768473307291666, "learning_rate": 0.0001, "loss": 7.6037, "loss/crossentropy": 2.705008387565613, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.29416972398757935, "step": 1826 }, { "epoch": 0.057125, "grad_norm": 4.34375, "grad_norm_var": 0.15325520833333334, "learning_rate": 0.0001, "loss": 6.9901, "loss/crossentropy": 2.60453999042511, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2533971518278122, "step": 1828 }, { "epoch": 0.0571875, "grad_norm": 4.625, "grad_norm_var": 0.13787434895833334, "learning_rate": 0.0001, "loss": 7.1899, "loss/crossentropy": 2.5849435329437256, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.2659648209810257, "step": 1830 }, { "epoch": 0.05725, "grad_norm": 4.625, "grad_norm_var": 0.08670247395833333, "learning_rate": 0.0001, "loss": 7.0161, "loss/crossentropy": 2.420079231262207, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2662433907389641, "step": 1832 }, { "epoch": 0.0573125, "grad_norm": 4.34375, "grad_norm_var": 0.08098958333333334, "learning_rate": 0.0001, "loss": 6.947, "loss/crossentropy": 2.5382049083709717, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2494724839925766, "step": 1834 }, { "epoch": 0.057375, "grad_norm": 4.59375, "grad_norm_var": 0.07668863932291667, "learning_rate": 0.0001, "loss": 7.4878, "loss/crossentropy": 2.8954910039901733, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.2709462493658066, "step": 1836 }, { "epoch": 0.0574375, "grad_norm": 4.1875, "grad_norm_var": 0.06139322916666667, "learning_rate": 0.0001, "loss": 7.0911, "loss/crossentropy": 2.6605488061904907, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2532087415456772, "step": 1838 }, { "epoch": 0.0575, "grad_norm": 4.6875, "grad_norm_var": 0.07224934895833333, "learning_rate": 0.0001, "loss": 7.4682, "loss/crossentropy": 2.7157968282699585, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.28500914573669434, "step": 1840 }, { "epoch": 0.0575625, "grad_norm": 4.0, "grad_norm_var": 0.07867431640625, "learning_rate": 0.0001, "loss": 6.9401, "loss/crossentropy": 2.514353036880493, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.26054561138153076, "step": 1842 }, { "epoch": 0.057625, "grad_norm": 4.75, "grad_norm_var": 0.11404622395833333, "learning_rate": 0.0001, "loss": 7.3218, "loss/crossentropy": 2.6728512048721313, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.27114808559417725, "step": 1844 }, { "epoch": 0.0576875, "grad_norm": 4.4375, "grad_norm_var": 0.11599934895833333, "learning_rate": 0.0001, "loss": 6.7175, "loss/crossentropy": 2.397219181060791, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.2374936044216156, "step": 1846 }, { "epoch": 0.05775, "grad_norm": 5.125, "grad_norm_var": 0.145556640625, "learning_rate": 0.0001, "loss": 6.8971, "loss/crossentropy": 2.435696005821228, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.25512565672397614, "step": 1848 }, { "epoch": 0.0578125, "grad_norm": 4.78125, "grad_norm_var": 0.14934895833333334, "learning_rate": 0.0001, "loss": 7.4901, "loss/crossentropy": 2.7314751148223877, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2840634733438492, "step": 1850 }, { "epoch": 0.057875, "grad_norm": 5.125, "grad_norm_var": 0.16090087890625, "learning_rate": 0.0001, "loss": 7.3878, "loss/crossentropy": 2.701223373413086, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.275688573718071, "step": 1852 }, { "epoch": 0.0579375, "grad_norm": 4.625, "grad_norm_var": 0.16480712890625, "learning_rate": 0.0001, "loss": 7.1744, "loss/crossentropy": 2.564082622528076, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.27274955809116364, "step": 1854 }, { "epoch": 0.058, "grad_norm": 4.375, "grad_norm_var": 0.16873372395833333, "learning_rate": 0.0001, "loss": 7.1284, "loss/crossentropy": 2.5915364027023315, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2642373740673065, "step": 1856 }, { "epoch": 0.0580625, "grad_norm": 5.3125, "grad_norm_var": 0.15331624348958334, "learning_rate": 0.0001, "loss": 7.493, "loss/crossentropy": 2.7819111347198486, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.27735432982444763, "step": 1858 }, { "epoch": 0.058125, "grad_norm": 5.96875, "grad_norm_var": 0.22011311848958334, "learning_rate": 0.0001, "loss": 7.3859, "loss/crossentropy": 2.679842948913574, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.27998194098472595, "step": 1860 }, { "epoch": 0.0581875, "grad_norm": 4.65625, "grad_norm_var": 0.20553385416666667, "learning_rate": 0.0001, "loss": 6.9421, "loss/crossentropy": 2.5038408041000366, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.2614058554172516, "step": 1862 }, { "epoch": 0.05825, "grad_norm": 4.71875, "grad_norm_var": 0.17626546223958334, "learning_rate": 0.0001, "loss": 7.2092, "loss/crossentropy": 2.6554737091064453, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.26396705210208893, "step": 1864 }, { "epoch": 0.0583125, "grad_norm": 4.34375, "grad_norm_var": 0.18982747395833333, "learning_rate": 0.0001, "loss": 7.1009, "loss/crossentropy": 2.6473816633224487, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.26058557629585266, "step": 1866 }, { "epoch": 0.058375, "grad_norm": 4.59375, "grad_norm_var": 0.1978515625, "learning_rate": 0.0001, "loss": 7.3526, "loss/crossentropy": 2.696037769317627, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.273857519030571, "step": 1868 }, { "epoch": 0.0584375, "grad_norm": 4.71875, "grad_norm_var": 0.18058268229166666, "learning_rate": 0.0001, "loss": 6.9596, "loss/crossentropy": 2.452818512916565, "loss/hidden": 1.94921875, "loss/jsd": 0.0, "loss/logits": 0.2557523772120476, "step": 1870 }, { "epoch": 0.0585, "grad_norm": 4.53125, "grad_norm_var": 0.17144775390625, "learning_rate": 0.0001, "loss": 7.3039, "loss/crossentropy": 2.809278964996338, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2631368637084961, "step": 1872 }, { "epoch": 0.0585625, "grad_norm": 6.53125, "grad_norm_var": 0.36578369140625, "learning_rate": 0.0001, "loss": 7.12, "loss/crossentropy": 2.583288073539734, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.26265595853328705, "step": 1874 }, { "epoch": 0.058625, "grad_norm": 4.125, "grad_norm_var": 0.3079427083333333, "learning_rate": 0.0001, "loss": 6.8005, "loss/crossentropy": 2.5182260274887085, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.23525764048099518, "step": 1876 }, { "epoch": 0.0586875, "grad_norm": 4.03125, "grad_norm_var": 0.3358072916666667, "learning_rate": 0.0001, "loss": 7.1627, "loss/crossentropy": 2.6038613319396973, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2633034437894821, "step": 1878 }, { "epoch": 0.05875, "grad_norm": 6.46875, "grad_norm_var": 0.55992431640625, "learning_rate": 0.0001, "loss": 7.0627, "loss/crossentropy": 2.5273290872573853, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2687753736972809, "step": 1880 }, { "epoch": 0.0588125, "grad_norm": 4.6875, "grad_norm_var": 0.5538899739583333, "learning_rate": 0.0001, "loss": 7.1662, "loss/crossentropy": 2.563568949699402, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.27159547805786133, "step": 1882 }, { "epoch": 0.058875, "grad_norm": 4.53125, "grad_norm_var": 0.540625, "learning_rate": 0.0001, "loss": 7.0803, "loss/crossentropy": 2.534513831138611, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2635657638311386, "step": 1884 }, { "epoch": 0.0589375, "grad_norm": 4.34375, "grad_norm_var": 0.56256103515625, "learning_rate": 0.0001, "loss": 6.81, "loss/crossentropy": 2.3854469060897827, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2530023232102394, "step": 1886 }, { "epoch": 0.059, "grad_norm": 4.9375, "grad_norm_var": 0.558447265625, "learning_rate": 0.0001, "loss": 7.5175, "loss/crossentropy": 2.8702194690704346, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.2760562151670456, "step": 1888 }, { "epoch": 0.0590625, "grad_norm": 4.5, "grad_norm_var": 0.33114827473958336, "learning_rate": 0.0001, "loss": 7.3912, "loss/crossentropy": 2.72122585773468, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2752014696598053, "step": 1890 }, { "epoch": 0.059125, "grad_norm": 4.96875, "grad_norm_var": 0.3114217122395833, "learning_rate": 0.0001, "loss": 7.2087, "loss/crossentropy": 2.669129967689514, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.26606956124305725, "step": 1892 }, { "epoch": 0.0591875, "grad_norm": 4.5625, "grad_norm_var": 0.29269205729166664, "learning_rate": 0.0001, "loss": 6.8762, "loss/crossentropy": 2.4340004920959473, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2563306391239166, "step": 1894 }, { "epoch": 0.05925, "grad_norm": 4.875, "grad_norm_var": 0.08118082682291666, "learning_rate": 0.0001, "loss": 6.9578, "loss/crossentropy": 2.363881826400757, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2695441246032715, "step": 1896 }, { "epoch": 0.0593125, "grad_norm": 4.875, "grad_norm_var": 0.14895833333333333, "learning_rate": 0.0001, "loss": 7.2834, "loss/crossentropy": 2.5548282861709595, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.2822292298078537, "step": 1898 }, { "epoch": 0.059375, "grad_norm": 4.75, "grad_norm_var": 0.15206705729166667, "learning_rate": 0.0001, "loss": 7.3495, "loss/crossentropy": 2.674235224723816, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.27845965325832367, "step": 1900 }, { "epoch": 0.0594375, "grad_norm": 7.15625, "grad_norm_var": 0.47903238932291664, "learning_rate": 0.0001, "loss": 7.4386, "loss/crossentropy": 2.8135560750961304, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.27578939497470856, "step": 1902 }, { "epoch": 0.0595, "grad_norm": 4.65625, "grad_norm_var": 0.47470296223958336, "learning_rate": 0.0001, "loss": 7.3103, "loss/crossentropy": 2.764998435974121, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.26663658022880554, "step": 1904 }, { "epoch": 0.0595625, "grad_norm": 4.4375, "grad_norm_var": 0.517041015625, "learning_rate": 0.0001, "loss": 7.5119, "loss/crossentropy": 2.6685441732406616, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2901953458786011, "step": 1906 }, { "epoch": 0.059625, "grad_norm": 5.1875, "grad_norm_var": 0.5104817708333333, "learning_rate": 0.0001, "loss": 7.0694, "loss/crossentropy": 2.5350207090377808, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2667199671268463, "step": 1908 }, { "epoch": 0.0596875, "grad_norm": 4.84375, "grad_norm_var": 0.49635416666666665, "learning_rate": 0.0001, "loss": 7.0891, "loss/crossentropy": 2.5018303394317627, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.27083566784858704, "step": 1910 }, { "epoch": 0.05975, "grad_norm": 4.75, "grad_norm_var": 0.518212890625, "learning_rate": 0.0001, "loss": 6.9844, "loss/crossentropy": 2.463659405708313, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.2649676352739334, "step": 1912 }, { "epoch": 0.0598125, "grad_norm": 4.78125, "grad_norm_var": 0.480859375, "learning_rate": 0.0001, "loss": 7.5348, "loss/crossentropy": 2.796781539916992, "loss/hidden": 1.9453125, "loss/jsd": 0.0, "loss/logits": 0.2792717218399048, "step": 1914 }, { "epoch": 0.059875, "grad_norm": 5.0, "grad_norm_var": 0.45636393229166666, "learning_rate": 0.0001, "loss": 7.4205, "loss/crossentropy": 2.717116117477417, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.2726837396621704, "step": 1916 }, { "epoch": 0.0599375, "grad_norm": 5.1875, "grad_norm_var": 0.14191080729166666, "learning_rate": 0.0001, "loss": 7.9693, "loss/crossentropy": 3.0037057399749756, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.30319735407829285, "step": 1918 }, { "epoch": 0.06, "grad_norm": 4.59375, "grad_norm_var": 0.14514567057291666, "learning_rate": 0.0001, "loss": 7.5668, "loss/crossentropy": 2.965430498123169, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2675633579492569, "step": 1920 }, { "epoch": 0.0600625, "grad_norm": 4.25, "grad_norm_var": 0.10286051432291667, "learning_rate": 0.0001, "loss": 6.6429, "loss/crossentropy": 2.376823306083679, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.24300982803106308, "step": 1922 }, { "epoch": 0.060125, "grad_norm": 4.96875, "grad_norm_var": 0.089697265625, "learning_rate": 0.0001, "loss": 6.9851, "loss/crossentropy": 2.5577648878097534, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2485908716917038, "step": 1924 }, { "epoch": 0.0601875, "grad_norm": 4.9375, "grad_norm_var": 0.0943359375, "learning_rate": 0.0001, "loss": 7.3381, "loss/crossentropy": 2.583045721054077, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.28253433108329773, "step": 1926 }, { "epoch": 0.06025, "grad_norm": 5.3125, "grad_norm_var": 0.37906494140625, "learning_rate": 0.0001, "loss": 7.8211, "loss/crossentropy": 3.004792094230652, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2890557497739792, "step": 1928 }, { "epoch": 0.0603125, "grad_norm": 4.625, "grad_norm_var": 0.374853515625, "learning_rate": 0.0001, "loss": 7.1874, "loss/crossentropy": 2.61250638961792, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.2653019577264786, "step": 1930 }, { "epoch": 0.060375, "grad_norm": 4.25, "grad_norm_var": 0.408447265625, "learning_rate": 0.0001, "loss": 7.2217, "loss/crossentropy": 2.6708651781082153, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2656322345137596, "step": 1932 }, { "epoch": 0.0604375, "grad_norm": 6.5, "grad_norm_var": 0.54166259765625, "learning_rate": 0.0001, "loss": 7.7014, "loss/crossentropy": 2.7332130670547485, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.3061896413564682, "step": 1934 }, { "epoch": 0.0605, "grad_norm": 4.65625, "grad_norm_var": 0.5347615559895833, "learning_rate": 0.0001, "loss": 7.3152, "loss/crossentropy": 2.6586803197860718, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.27776092290878296, "step": 1936 }, { "epoch": 0.0605625, "grad_norm": 6.28125, "grad_norm_var": 0.5535807291666667, "learning_rate": 0.0001, "loss": 7.6632, "loss/crossentropy": 2.7831838130950928, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2938612252473831, "step": 1938 }, { "epoch": 0.060625, "grad_norm": 4.875, "grad_norm_var": 0.5571573893229167, "learning_rate": 0.0001, "loss": 7.1143, "loss/crossentropy": 2.658159375190735, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.25889691710472107, "step": 1940 }, { "epoch": 0.0606875, "grad_norm": 9.3125, "grad_norm_var": 1.6083943684895834, "learning_rate": 0.0001, "loss": 7.8654, "loss/crossentropy": 2.923017978668213, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.30244556069374084, "step": 1942 }, { "epoch": 0.06075, "grad_norm": 18.5, "grad_norm_var": 12.256766764322917, "learning_rate": 0.0001, "loss": 7.4521, "loss/crossentropy": 2.684673309326172, "loss/hidden": 1.96875, "loss/jsd": 0.0, "loss/logits": 0.2798672914505005, "step": 1944 }, { "epoch": 0.0608125, "grad_norm": 4.625, "grad_norm_var": 12.309244791666666, "learning_rate": 0.0001, "loss": 7.3468, "loss/crossentropy": 2.713373899459839, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.27623049914836884, "step": 1946 }, { "epoch": 0.060875, "grad_norm": 5.65625, "grad_norm_var": 12.200907389322916, "learning_rate": 0.0001, "loss": 7.3181, "loss/crossentropy": 2.609257936477661, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2794778645038605, "step": 1948 }, { "epoch": 0.0609375, "grad_norm": 4.5, "grad_norm_var": 12.456343587239584, "learning_rate": 0.0001, "loss": 6.7497, "loss/crossentropy": 2.5331573486328125, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24157867580652237, "step": 1950 }, { "epoch": 0.061, "grad_norm": 4.96875, "grad_norm_var": 12.452197265625, "learning_rate": 0.0001, "loss": 7.6656, "loss/crossentropy": 2.8852624893188477, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.28624095022678375, "step": 1952 }, { "epoch": 0.0610625, "grad_norm": 4.71875, "grad_norm_var": 12.616402180989583, "learning_rate": 0.0001, "loss": 7.1288, "loss/crossentropy": 2.5084198713302612, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.26907259225845337, "step": 1954 }, { "epoch": 0.061125, "grad_norm": 4.34375, "grad_norm_var": 12.776590983072916, "learning_rate": 0.0001, "loss": 7.1834, "loss/crossentropy": 2.7280231714248657, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.25999484956264496, "step": 1956 }, { "epoch": 0.0611875, "grad_norm": 4.5625, "grad_norm_var": 12.005952962239583, "learning_rate": 0.0001, "loss": 7.1353, "loss/crossentropy": 2.5946719646453857, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.26773855835199356, "step": 1958 }, { "epoch": 0.06125, "grad_norm": 4.84375, "grad_norm_var": 0.11272379557291666, "learning_rate": 0.0001, "loss": 7.3448, "loss/crossentropy": 2.718293786048889, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.27085064351558685, "step": 1960 }, { "epoch": 0.0613125, "grad_norm": 4.8125, "grad_norm_var": 0.11471354166666667, "learning_rate": 0.0001, "loss": 7.2067, "loss/crossentropy": 2.6649069786071777, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2643337845802307, "step": 1962 }, { "epoch": 0.061375, "grad_norm": 5.03125, "grad_norm_var": 0.07021077473958333, "learning_rate": 0.0001, "loss": 7.0607, "loss/crossentropy": 2.586350440979004, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.26345033198595047, "step": 1964 }, { "epoch": 0.0614375, "grad_norm": 4.0625, "grad_norm_var": 0.09211832682291667, "learning_rate": 0.0001, "loss": 7.3764, "loss/crossentropy": 2.824537992477417, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2688605338335037, "step": 1966 }, { "epoch": 0.0615, "grad_norm": 4.9375, "grad_norm_var": 0.09081624348958334, "learning_rate": 0.0001, "loss": 7.2516, "loss/crossentropy": 2.66815447807312, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.27045372128486633, "step": 1968 }, { "epoch": 0.0615625, "grad_norm": 4.5, "grad_norm_var": 0.09013264973958333, "learning_rate": 0.0001, "loss": 7.1873, "loss/crossentropy": 2.714627504348755, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2621074616909027, "step": 1970 }, { "epoch": 0.061625, "grad_norm": 4.40625, "grad_norm_var": 0.083984375, "learning_rate": 0.0001, "loss": 7.148, "loss/crossentropy": 2.732495427131653, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.2583463042974472, "step": 1972 }, { "epoch": 0.0616875, "grad_norm": 6.875, "grad_norm_var": 0.403369140625, "learning_rate": 0.0001, "loss": 7.7964, "loss/crossentropy": 2.6379644870758057, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.31740613281726837, "step": 1974 }, { "epoch": 0.06175, "grad_norm": 5.125, "grad_norm_var": 0.7941243489583333, "learning_rate": 0.0001, "loss": 7.8055, "loss/crossentropy": 2.8083606958389282, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.29854556918144226, "step": 1976 }, { "epoch": 0.0618125, "grad_norm": 4.6875, "grad_norm_var": 0.7879191080729167, "learning_rate": 0.0001, "loss": 6.9407, "loss/crossentropy": 2.48198664188385, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.25602778792381287, "step": 1978 }, { "epoch": 0.061875, "grad_norm": 4.8125, "grad_norm_var": 0.75924072265625, "learning_rate": 0.0001, "loss": 7.4327, "loss/crossentropy": 2.723036050796509, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.2826816141605377, "step": 1980 }, { "epoch": 0.0619375, "grad_norm": 4.75, "grad_norm_var": 0.7576456705729167, "learning_rate": 0.0001, "loss": 6.9662, "loss/crossentropy": 2.535414457321167, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2540201246738434, "step": 1982 }, { "epoch": 0.062, "grad_norm": 4.40625, "grad_norm_var": 0.7925130208333333, "learning_rate": 0.0001, "loss": 7.1227, "loss/crossentropy": 2.690464735031128, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.25885075330734253, "step": 1984 }, { "epoch": 0.0620625, "grad_norm": 4.0625, "grad_norm_var": 0.82467041015625, "learning_rate": 0.0001, "loss": 7.2165, "loss/crossentropy": 2.705548882484436, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.26750563085079193, "step": 1986 }, { "epoch": 0.062125, "grad_norm": 4.59375, "grad_norm_var": 0.8031901041666667, "learning_rate": 0.0001, "loss": 7.3331, "loss/crossentropy": 2.8012691736221313, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.26646314561367035, "step": 1988 }, { "epoch": 0.0621875, "grad_norm": 4.34375, "grad_norm_var": 0.5442545572916667, "learning_rate": 0.0001, "loss": 6.8132, "loss/crossentropy": 2.3716607093811035, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2578277885913849, "step": 1990 }, { "epoch": 0.06225, "grad_norm": 4.625, "grad_norm_var": 0.9739217122395833, "learning_rate": 0.0001, "loss": 7.441, "loss/crossentropy": 2.5655524730682373, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.29887592792510986, "step": 1992 }, { "epoch": 0.0623125, "grad_norm": 5.875, "grad_norm_var": 1.05181884765625, "learning_rate": 0.0001, "loss": 7.5642, "loss/crossentropy": 2.728899836540222, "loss/hidden": 1.984375, "loss/jsd": 0.0, "loss/logits": 0.2850935161113739, "step": 1994 }, { "epoch": 0.062375, "grad_norm": 8.25, "grad_norm_var": 1.738134765625, "learning_rate": 0.0001, "loss": 7.8895, "loss/crossentropy": 2.779134511947632, "loss/hidden": 2.0, "loss/jsd": 0.0, "loss/logits": 0.31103692948818207, "step": 1996 }, { "epoch": 0.0624375, "grad_norm": 4.59375, "grad_norm_var": 1.6788899739583334, "learning_rate": 0.0001, "loss": 7.2849, "loss/crossentropy": 2.6748945713043213, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.27271807193756104, "step": 1998 }, { "epoch": 0.0625, "grad_norm": 4.875, "grad_norm_var": 2.008658854166667, "learning_rate": 0.0001, "loss": 7.8282, "loss/crossentropy": 2.801780104637146, "loss/hidden": 1.9765625, "loss/jsd": 0.0, "loss/logits": 0.30498483777046204, "step": 2000 }, { "epoch": 0.0625625, "grad_norm": 4.8125, "grad_norm_var": 1.9149576822916667, "learning_rate": 0.0001, "loss": 6.9057, "loss/crossentropy": 2.453532576560974, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2584947943687439, "step": 2002 }, { "epoch": 0.062625, "grad_norm": 4.71875, "grad_norm_var": 1.8465983072916667, "learning_rate": 0.0001, "loss": 6.7113, "loss/crossentropy": 2.292815327644348, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2477075159549713, "step": 2004 }, { "epoch": 0.0626875, "grad_norm": 4.6875, "grad_norm_var": 1.9022786458333334, "learning_rate": 0.0001, "loss": 7.4308, "loss/crossentropy": 2.834795117378235, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.27014750242233276, "step": 2006 }, { "epoch": 0.06275, "grad_norm": 4.875, "grad_norm_var": 1.36256103515625, "learning_rate": 0.0001, "loss": 7.6104, "loss/crossentropy": 2.861023187637329, "loss/hidden": 1.93359375, "loss/jsd": 0.0, "loss/logits": 0.2815757244825363, "step": 2008 }, { "epoch": 0.0628125, "grad_norm": 4.375, "grad_norm_var": 1.40963134765625, "learning_rate": 0.0001, "loss": 7.0763, "loss/crossentropy": 2.5962413549423218, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2616799771785736, "step": 2010 }, { "epoch": 0.062875, "grad_norm": 4.40625, "grad_norm_var": 0.7212239583333333, "learning_rate": 0.0001, "loss": 7.2025, "loss/crossentropy": 2.6350537538528442, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.26299260556697845, "step": 2012 }, { "epoch": 0.0629375, "grad_norm": 4.34375, "grad_norm_var": 0.7535441080729167, "learning_rate": 0.0001, "loss": 7.032, "loss/crossentropy": 2.6000778675079346, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25842562317848206, "step": 2014 }, { "epoch": 0.063, "grad_norm": 4.125, "grad_norm_var": 0.07913004557291667, "learning_rate": 0.0001, "loss": 7.0661, "loss/crossentropy": 2.6812326908111572, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2564576119184494, "step": 2016 }, { "epoch": 0.0630625, "grad_norm": 5.03125, "grad_norm_var": 0.093603515625, "learning_rate": 0.0001, "loss": 7.2916, "loss/crossentropy": 2.6755926609039307, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2705804705619812, "step": 2018 }, { "epoch": 0.063125, "grad_norm": 4.15625, "grad_norm_var": 0.12945556640625, "learning_rate": 0.0001, "loss": 7.1084, "loss/crossentropy": 2.57517671585083, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2654324173927307, "step": 2020 }, { "epoch": 0.0631875, "grad_norm": 5.4375, "grad_norm_var": 0.193212890625, "learning_rate": 0.0001, "loss": 7.435, "loss/crossentropy": 2.6259225606918335, "loss/hidden": 1.97265625, "loss/jsd": 0.0, "loss/logits": 0.28364163637161255, "step": 2022 }, { "epoch": 0.06325, "grad_norm": 4.78125, "grad_norm_var": 0.203759765625, "learning_rate": 0.0001, "loss": 7.0103, "loss/crossentropy": 2.5401690006256104, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2657654732465744, "step": 2024 }, { "epoch": 0.0633125, "grad_norm": 4.84375, "grad_norm_var": 0.201171875, "learning_rate": 0.0001, "loss": 6.892, "loss/crossentropy": 2.448048710823059, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.2557239532470703, "step": 2026 }, { "epoch": 0.063375, "grad_norm": 5.15625, "grad_norm_var": 0.210400390625, "learning_rate": 0.0001, "loss": 7.1379, "loss/crossentropy": 2.5381767749786377, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.26856717467308044, "step": 2028 }, { "epoch": 0.0634375, "grad_norm": 4.625, "grad_norm_var": 0.19138997395833332, "learning_rate": 0.0001, "loss": 7.3956, "loss/crossentropy": 2.7830891609191895, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2714068740606308, "step": 2030 }, { "epoch": 0.0635, "grad_norm": 5.96875, "grad_norm_var": 0.2626261393229167, "learning_rate": 0.0001, "loss": 7.0564, "loss/crossentropy": 2.473397970199585, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2668927237391472, "step": 2032 }, { "epoch": 0.0635625, "grad_norm": 4.6875, "grad_norm_var": 0.26495768229166666, "learning_rate": 0.0001, "loss": 7.0239, "loss/crossentropy": 2.6535342931747437, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.2526656314730644, "step": 2034 }, { "epoch": 0.063625, "grad_norm": 4.96875, "grad_norm_var": 0.21588541666666666, "learning_rate": 0.0001, "loss": 7.3198, "loss/crossentropy": 2.62338125705719, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.2770598828792572, "step": 2036 }, { "epoch": 0.0636875, "grad_norm": 4.3125, "grad_norm_var": 0.20494791666666667, "learning_rate": 0.0001, "loss": 6.834, "loss/crossentropy": 2.4843112230300903, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2544984146952629, "step": 2038 }, { "epoch": 0.06375, "grad_norm": 4.375, "grad_norm_var": 0.19140625, "learning_rate": 0.0001, "loss": 7.0979, "loss/crossentropy": 2.6469043493270874, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2599470168352127, "step": 2040 }, { "epoch": 0.0638125, "grad_norm": 4.5, "grad_norm_var": 0.19073893229166666, "learning_rate": 0.0001, "loss": 6.9246, "loss/crossentropy": 2.4736326932907104, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2576009929180145, "step": 2042 }, { "epoch": 0.063875, "grad_norm": 4.1875, "grad_norm_var": 0.19659830729166666, "learning_rate": 0.0001, "loss": 6.8111, "loss/crossentropy": 2.47759473323822, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25053443014621735, "step": 2044 }, { "epoch": 0.0639375, "grad_norm": 4.59375, "grad_norm_var": 0.20572916666666666, "learning_rate": 0.0001, "loss": 6.8598, "loss/crossentropy": 2.4240305423736572, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2576344683766365, "step": 2046 }, { "epoch": 0.064, "grad_norm": 4.78125, "grad_norm_var": 0.071875, "learning_rate": 0.0001, "loss": 7.2763, "loss/crossentropy": 2.636983275413513, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.27525630593299866, "step": 2048 }, { "epoch": 0.0640625, "grad_norm": 5.25, "grad_norm_var": 0.13248697916666666, "learning_rate": 0.0001, "loss": 7.3148, "loss/crossentropy": 2.602674722671509, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.2805839627981186, "step": 2050 }, { "epoch": 0.064125, "grad_norm": 4.78125, "grad_norm_var": 0.11145426432291666, "learning_rate": 0.0001, "loss": 7.5156, "loss/crossentropy": 2.8106677532196045, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2786969691514969, "step": 2052 }, { "epoch": 0.0641875, "grad_norm": 4.28125, "grad_norm_var": 0.11145426432291666, "learning_rate": 0.0001, "loss": 6.9285, "loss/crossentropy": 2.535367965698242, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.258063942193985, "step": 2054 }, { "epoch": 0.06425, "grad_norm": 4.125, "grad_norm_var": 0.14810282389322918, "learning_rate": 0.0001, "loss": 7.0355, "loss/crossentropy": 2.7271558046340942, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2480250746011734, "step": 2056 }, { "epoch": 0.0643125, "grad_norm": 4.71875, "grad_norm_var": 0.16648661295572917, "learning_rate": 0.0001, "loss": 7.6621, "loss/crossentropy": 2.8376705646514893, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.2918137311935425, "step": 2058 }, { "epoch": 0.064375, "grad_norm": 4.375, "grad_norm_var": 0.16048075358072916, "learning_rate": 0.0001, "loss": 7.4221, "loss/crossentropy": 2.738136410713196, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2769925594329834, "step": 2060 }, { "epoch": 0.0644375, "grad_norm": 5.21875, "grad_norm_var": 0.1754547119140625, "learning_rate": 0.0001, "loss": 7.5939, "loss/crossentropy": 2.849211812019348, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.283063068985939, "step": 2062 }, { "epoch": 0.0645, "grad_norm": 4.71875, "grad_norm_var": 0.17480367024739582, "learning_rate": 0.0001, "loss": 7.5665, "loss/crossentropy": 2.8930327892303467, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2798466980457306, "step": 2064 }, { "epoch": 0.0645625, "grad_norm": 4.15625, "grad_norm_var": 0.13145243326822917, "learning_rate": 0.0001, "loss": 6.9794, "loss/crossentropy": 2.5419009923934937, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.25585878640413284, "step": 2066 }, { "epoch": 0.064625, "grad_norm": 4.65625, "grad_norm_var": 0.1286773681640625, "learning_rate": 0.0001, "loss": 7.3639, "loss/crossentropy": 2.8139398097991943, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.2694520950317383, "step": 2068 }, { "epoch": 0.0646875, "grad_norm": 4.3125, "grad_norm_var": 0.1372955322265625, "learning_rate": 0.0001, "loss": 7.0513, "loss/crossentropy": 2.6727343797683716, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.25425978749990463, "step": 2070 }, { "epoch": 0.06475, "grad_norm": 4.65625, "grad_norm_var": 0.09823811848958333, "learning_rate": 0.0001, "loss": 7.0751, "loss/crossentropy": 2.564603090286255, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.26042503118515015, "step": 2072 }, { "epoch": 0.0648125, "grad_norm": 4.1875, "grad_norm_var": 0.09440104166666667, "learning_rate": 0.0001, "loss": 7.3307, "loss/crossentropy": 2.74035382270813, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.27035802602767944, "step": 2074 }, { "epoch": 0.064875, "grad_norm": 4.3125, "grad_norm_var": 0.08847249348958333, "learning_rate": 0.0001, "loss": 6.7706, "loss/crossentropy": 2.4303542375564575, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2539462745189667, "step": 2076 }, { "epoch": 0.0649375, "grad_norm": 4.59375, "grad_norm_var": 0.06291910807291666, "learning_rate": 0.0001, "loss": 7.1118, "loss/crossentropy": 2.539111375808716, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.26507870107889175, "step": 2078 }, { "epoch": 0.065, "grad_norm": 4.25, "grad_norm_var": 0.083203125, "learning_rate": 0.0001, "loss": 7.1723, "loss/crossentropy": 2.6623255014419556, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.26506394147872925, "step": 2080 }, { "epoch": 0.0650625, "grad_norm": 4.3125, "grad_norm_var": 0.079296875, "learning_rate": 0.0001, "loss": 6.9769, "loss/crossentropy": 2.581335186958313, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2504977136850357, "step": 2082 }, { "epoch": 0.065125, "grad_norm": 4.46875, "grad_norm_var": 0.077734375, "learning_rate": 0.0001, "loss": 6.7248, "loss/crossentropy": 2.428415536880493, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2433120310306549, "step": 2084 }, { "epoch": 0.0651875, "grad_norm": 5.6875, "grad_norm_var": 0.154541015625, "learning_rate": 0.0001, "loss": 7.3285, "loss/crossentropy": 2.767895817756653, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.26739276945590973, "step": 2086 }, { "epoch": 0.06525, "grad_norm": 4.6875, "grad_norm_var": 0.1654296875, "learning_rate": 0.0001, "loss": 7.5187, "loss/crossentropy": 2.7806981801986694, "loss/hidden": 1.9296875, "loss/jsd": 0.0, "loss/logits": 0.2808331549167633, "step": 2088 }, { "epoch": 0.0653125, "grad_norm": 4.65625, "grad_norm_var": 0.1556640625, "learning_rate": 0.0001, "loss": 6.844, "loss/crossentropy": 2.510632038116455, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.24700827151536942, "step": 2090 }, { "epoch": 0.065375, "grad_norm": 4.03125, "grad_norm_var": 0.1995513916015625, "learning_rate": 0.0001, "loss": 6.7551, "loss/crossentropy": 2.520743489265442, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2453150451183319, "step": 2092 }, { "epoch": 0.0654375, "grad_norm": 5.625, "grad_norm_var": 0.2886383056640625, "learning_rate": 0.0001, "loss": 7.2018, "loss/crossentropy": 2.580179214477539, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2703619748353958, "step": 2094 }, { "epoch": 0.0655, "grad_norm": 4.40625, "grad_norm_var": 0.2673980712890625, "learning_rate": 0.0001, "loss": 7.1838, "loss/crossentropy": 2.655561089515686, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.26454417407512665, "step": 2096 }, { "epoch": 0.0655625, "grad_norm": 4.46875, "grad_norm_var": 0.27241923014322916, "learning_rate": 0.0001, "loss": 7.056, "loss/crossentropy": 2.610250473022461, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2598103657364845, "step": 2098 }, { "epoch": 0.065625, "grad_norm": 4.625, "grad_norm_var": 0.2668853759765625, "learning_rate": 0.0001, "loss": 6.9496, "loss/crossentropy": 2.5838887691497803, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25180892646312714, "step": 2100 }, { "epoch": 0.0656875, "grad_norm": 4.5, "grad_norm_var": 0.1876373291015625, "learning_rate": 0.0001, "loss": 7.208, "loss/crossentropy": 2.717166543006897, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2639300525188446, "step": 2102 }, { "epoch": 0.06575, "grad_norm": 4.59375, "grad_norm_var": 0.1708160400390625, "learning_rate": 0.0001, "loss": 6.649, "loss/crossentropy": 2.3968560695648193, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.24005791544914246, "step": 2104 }, { "epoch": 0.0658125, "grad_norm": 4.78125, "grad_norm_var": 0.17692769368489583, "learning_rate": 0.0001, "loss": 7.0411, "loss/crossentropy": 2.5781623125076294, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.25996091961860657, "step": 2106 }, { "epoch": 0.065875, "grad_norm": 4.3125, "grad_norm_var": 0.14208577473958334, "learning_rate": 0.0001, "loss": 6.7822, "loss/crossentropy": 2.490805149078369, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.2506261169910431, "step": 2108 }, { "epoch": 0.0659375, "grad_norm": 4.75, "grad_norm_var": 0.04898681640625, "learning_rate": 0.0001, "loss": 7.1099, "loss/crossentropy": 2.639052987098694, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.26115116477012634, "step": 2110 }, { "epoch": 0.066, "grad_norm": 4.5625, "grad_norm_var": 0.04544270833333333, "learning_rate": 0.0001, "loss": 7.1626, "loss/crossentropy": 2.600106716156006, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.2621118575334549, "step": 2112 }, { "epoch": 0.0660625, "grad_norm": 4.65625, "grad_norm_var": 0.03209635416666667, "learning_rate": 0.0001, "loss": 7.4427, "loss/crossentropy": 2.763522505760193, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.27768468856811523, "step": 2114 }, { "epoch": 0.066125, "grad_norm": 4.53125, "grad_norm_var": 0.043473307291666666, "learning_rate": 0.0001, "loss": 7.2648, "loss/crossentropy": 2.662122130393982, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.27199098467826843, "step": 2116 }, { "epoch": 0.0661875, "grad_norm": 4.375, "grad_norm_var": 0.04260660807291667, "learning_rate": 0.0001, "loss": 6.6901, "loss/crossentropy": 2.4105567932128906, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2486593872308731, "step": 2118 }, { "epoch": 0.06625, "grad_norm": 4.8125, "grad_norm_var": 0.07623697916666666, "learning_rate": 0.0001, "loss": 7.4795, "loss/crossentropy": 2.7654476165771484, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.28194795548915863, "step": 2120 }, { "epoch": 0.0663125, "grad_norm": 4.625, "grad_norm_var": 0.06614583333333333, "learning_rate": 0.0001, "loss": 7.1431, "loss/crossentropy": 2.657179594039917, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2634367048740387, "step": 2122 }, { "epoch": 0.066375, "grad_norm": 6.03125, "grad_norm_var": 0.17248942057291666, "learning_rate": 0.0001, "loss": 7.4061, "loss/crossentropy": 2.7268136739730835, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2788669764995575, "step": 2124 }, { "epoch": 0.0664375, "grad_norm": 5.03125, "grad_norm_var": 0.17203369140625, "learning_rate": 0.0001, "loss": 7.2446, "loss/crossentropy": 2.6502318382263184, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.26764001697301865, "step": 2126 }, { "epoch": 0.0665, "grad_norm": 4.46875, "grad_norm_var": 0.18108317057291667, "learning_rate": 0.0001, "loss": 7.014, "loss/crossentropy": 2.633153796195984, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2576177716255188, "step": 2128 }, { "epoch": 0.0665625, "grad_norm": 4.21875, "grad_norm_var": 0.20506184895833332, "learning_rate": 0.0001, "loss": 7.1342, "loss/crossentropy": 2.7031787633895874, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.25443191826343536, "step": 2130 }, { "epoch": 0.066625, "grad_norm": 4.75, "grad_norm_var": 0.19849853515625, "learning_rate": 0.0001, "loss": 7.3439, "loss/crossentropy": 2.7594083547592163, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2709462195634842, "step": 2132 }, { "epoch": 0.0666875, "grad_norm": 4.75, "grad_norm_var": 0.19127197265625, "learning_rate": 0.0001, "loss": 6.8376, "loss/crossentropy": 2.4818824529647827, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.25159231573343277, "step": 2134 }, { "epoch": 0.06675, "grad_norm": 4.15625, "grad_norm_var": 0.20149332682291668, "learning_rate": 0.0001, "loss": 7.1332, "loss/crossentropy": 2.6554126739501953, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.2622327506542206, "step": 2136 }, { "epoch": 0.0668125, "grad_norm": 4.6875, "grad_norm_var": 0.20015869140625, "learning_rate": 0.0001, "loss": 7.2519, "loss/crossentropy": 2.650509238243103, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.26951250433921814, "step": 2138 }, { "epoch": 0.066875, "grad_norm": 5.28125, "grad_norm_var": 0.09947509765625, "learning_rate": 0.0001, "loss": 7.3845, "loss/crossentropy": 2.764272689819336, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2729562968015671, "step": 2140 }, { "epoch": 0.0669375, "grad_norm": 4.46875, "grad_norm_var": 0.43800455729166665, "learning_rate": 0.0001, "loss": 7.2234, "loss/crossentropy": 2.6682363748550415, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.26840950548648834, "step": 2142 }, { "epoch": 0.067, "grad_norm": 4.65625, "grad_norm_var": 0.4259114583333333, "learning_rate": 0.0001, "loss": 7.2398, "loss/crossentropy": 2.713719964027405, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2635442316532135, "step": 2144 }, { "epoch": 0.0670625, "grad_norm": 4.40625, "grad_norm_var": 0.42414957682291665, "learning_rate": 0.0001, "loss": 7.175, "loss/crossentropy": 2.7028512954711914, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.260101780295372, "step": 2146 }, { "epoch": 0.067125, "grad_norm": 7.0, "grad_norm_var": 1.5235026041666666, "learning_rate": 0.0001, "loss": 7.2175, "loss/crossentropy": 2.663016438484192, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.26442950963974, "step": 2148 }, { "epoch": 0.0671875, "grad_norm": 5.03125, "grad_norm_var": 1.506884765625, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.747779130935669, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.28923508524894714, "step": 2150 }, { "epoch": 0.06725, "grad_norm": 4.125, "grad_norm_var": 1.5227823893229167, "learning_rate": 0.0001, "loss": 6.7097, "loss/crossentropy": 2.3725602626800537, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.24621078372001648, "step": 2152 }, { "epoch": 0.0673125, "grad_norm": 4.75, "grad_norm_var": 1.5213541666666666, "learning_rate": 0.0001, "loss": 6.9537, "loss/crossentropy": 2.5790809392929077, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.24995902180671692, "step": 2154 }, { "epoch": 0.067375, "grad_norm": 4.46875, "grad_norm_var": 1.5811808268229166, "learning_rate": 0.0001, "loss": 7.2648, "loss/crossentropy": 2.7113585472106934, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.26862309873104095, "step": 2156 }, { "epoch": 0.0674375, "grad_norm": 4.0625, "grad_norm_var": 1.39107666015625, "learning_rate": 0.0001, "loss": 6.6646, "loss/crossentropy": 2.4880530834198, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.23874470591545105, "step": 2158 }, { "epoch": 0.0675, "grad_norm": 4.40625, "grad_norm_var": 1.3983357747395833, "learning_rate": 0.0001, "loss": 7.1278, "loss/crossentropy": 2.6453821659088135, "loss/hidden": 1.92578125, "loss/jsd": 0.0, "loss/logits": 0.25566530227661133, "step": 2160 }, { "epoch": 0.0675625, "grad_norm": 4.875, "grad_norm_var": 1.3705037434895833, "learning_rate": 0.0001, "loss": 7.3912, "loss/crossentropy": 2.793515920639038, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2687506824731827, "step": 2162 }, { "epoch": 0.067625, "grad_norm": 5.0, "grad_norm_var": 0.09607747395833334, "learning_rate": 0.0001, "loss": 7.5826, "loss/crossentropy": 2.8772777318954468, "loss/hidden": 1.96484375, "loss/jsd": 0.0, "loss/logits": 0.27405229210853577, "step": 2164 }, { "epoch": 0.0676875, "grad_norm": 4.03125, "grad_norm_var": 0.08372395833333333, "learning_rate": 0.0001, "loss": 6.822, "loss/crossentropy": 2.485366106033325, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2457720786333084, "step": 2166 }, { "epoch": 0.06775, "grad_norm": 4.96875, "grad_norm_var": 0.08866780598958333, "learning_rate": 0.0001, "loss": 6.6891, "loss/crossentropy": 2.4618345499038696, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2352314591407776, "step": 2168 }, { "epoch": 0.0678125, "grad_norm": 4.28125, "grad_norm_var": 0.09425455729166667, "learning_rate": 0.0001, "loss": 7.4168, "loss/crossentropy": 2.745675206184387, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2768822908401489, "step": 2170 }, { "epoch": 0.067875, "grad_norm": 4.28125, "grad_norm_var": 0.11060791015625, "learning_rate": 0.0001, "loss": 7.2711, "loss/crossentropy": 2.8049396276474, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.26107245683670044, "step": 2172 }, { "epoch": 0.0679375, "grad_norm": 4.0, "grad_norm_var": 0.11796468098958333, "learning_rate": 0.0001, "loss": 7.1348, "loss/crossentropy": 2.637243390083313, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2645973861217499, "step": 2174 }, { "epoch": 0.068, "grad_norm": 5.3125, "grad_norm_var": 0.1611328125, "learning_rate": 0.0001, "loss": 7.3615, "loss/crossentropy": 2.7797021865844727, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.27068354189395905, "step": 2176 }, { "epoch": 0.0680625, "grad_norm": 4.4375, "grad_norm_var": 0.15793863932291666, "learning_rate": 0.0001, "loss": 7.033, "loss/crossentropy": 2.639753580093384, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2553429752588272, "step": 2178 }, { "epoch": 0.068125, "grad_norm": 4.46875, "grad_norm_var": 0.13358968098958332, "learning_rate": 0.0001, "loss": 7.1443, "loss/crossentropy": 2.788546323776245, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.25041744858026505, "step": 2180 }, { "epoch": 0.0681875, "grad_norm": 4.375, "grad_norm_var": 0.13993733723958332, "learning_rate": 0.0001, "loss": 7.4631, "loss/crossentropy": 2.8808815479278564, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.2695489227771759, "step": 2182 }, { "epoch": 0.06825, "grad_norm": 4.375, "grad_norm_var": 0.13411051432291668, "learning_rate": 0.0001, "loss": 7.1354, "loss/crossentropy": 2.6089104413986206, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2631940394639969, "step": 2184 }, { "epoch": 0.0683125, "grad_norm": 4.5, "grad_norm_var": 0.12688802083333334, "learning_rate": 0.0001, "loss": 7.4842, "loss/crossentropy": 2.8750851154327393, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.2710645943880081, "step": 2186 }, { "epoch": 0.068375, "grad_norm": 4.375, "grad_norm_var": 0.14178059895833334, "learning_rate": 0.0001, "loss": 7.0685, "loss/crossentropy": 2.5792561769485474, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2637658417224884, "step": 2188 }, { "epoch": 0.0684375, "grad_norm": 4.0625, "grad_norm_var": 0.13993733723958332, "learning_rate": 0.0001, "loss": 6.7785, "loss/crossentropy": 2.4673629999160767, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2443905919790268, "step": 2190 }, { "epoch": 0.0685, "grad_norm": 3.96875, "grad_norm_var": 0.13014322916666668, "learning_rate": 0.0001, "loss": 6.7141, "loss/crossentropy": 2.534016251564026, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24066344648599625, "step": 2192 }, { "epoch": 0.0685625, "grad_norm": 4.78125, "grad_norm_var": 0.15142822265625, "learning_rate": 0.0001, "loss": 7.1777, "loss/crossentropy": 2.86741304397583, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.25017230212688446, "step": 2194 }, { "epoch": 0.068625, "grad_norm": 4.0625, "grad_norm_var": 0.16597900390625, "learning_rate": 0.0001, "loss": 6.9375, "loss/crossentropy": 2.5084309577941895, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.26087313890457153, "step": 2196 }, { "epoch": 0.0686875, "grad_norm": 4.3125, "grad_norm_var": 0.15260416666666668, "learning_rate": 0.0001, "loss": 7.1154, "loss/crossentropy": 2.6377168893814087, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.26222096383571625, "step": 2198 }, { "epoch": 0.06875, "grad_norm": 4.34375, "grad_norm_var": 0.13980712890625, "learning_rate": 0.0001, "loss": 6.8151, "loss/crossentropy": 2.562336802482605, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24793028831481934, "step": 2200 }, { "epoch": 0.0688125, "grad_norm": 5.3125, "grad_norm_var": 0.21320699055989584, "learning_rate": 0.0001, "loss": 7.058, "loss/crossentropy": 2.5885305404663086, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.25827546417713165, "step": 2202 }, { "epoch": 0.068875, "grad_norm": 4.34375, "grad_norm_var": 0.1595123291015625, "learning_rate": 0.0001, "loss": 6.9934, "loss/crossentropy": 2.6797332763671875, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.25050613284111023, "step": 2204 }, { "epoch": 0.0689375, "grad_norm": 4.3125, "grad_norm_var": 0.15498758951822916, "learning_rate": 0.0001, "loss": 6.7623, "loss/crossentropy": 2.4612139463424683, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.24612630903720856, "step": 2206 }, { "epoch": 0.069, "grad_norm": 4.4375, "grad_norm_var": 0.17112528483072917, "learning_rate": 0.0001, "loss": 7.2423, "loss/crossentropy": 2.674578070640564, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2665373533964157, "step": 2208 }, { "epoch": 0.0690625, "grad_norm": 4.40625, "grad_norm_var": 0.14546610514322916, "learning_rate": 0.0001, "loss": 7.1637, "loss/crossentropy": 2.640005946159363, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.26486851274967194, "step": 2210 }, { "epoch": 0.069125, "grad_norm": 4.90625, "grad_norm_var": 0.14099019368489582, "learning_rate": 0.0001, "loss": 7.202, "loss/crossentropy": 2.744121551513672, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.26024456322193146, "step": 2212 }, { "epoch": 0.0691875, "grad_norm": 4.375, "grad_norm_var": 0.13904520670572917, "learning_rate": 0.0001, "loss": 7.4197, "loss/crossentropy": 2.8771705627441406, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.26518765091896057, "step": 2214 }, { "epoch": 0.06925, "grad_norm": 3.84375, "grad_norm_var": 0.1639801025390625, "learning_rate": 0.0001, "loss": 6.836, "loss/crossentropy": 2.6073604822158813, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24161886423826218, "step": 2216 }, { "epoch": 0.0693125, "grad_norm": 4.34375, "grad_norm_var": 0.11125895182291666, "learning_rate": 0.0001, "loss": 6.8475, "loss/crossentropy": 2.4261631965637207, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.25580471009016037, "step": 2218 }, { "epoch": 0.069375, "grad_norm": 4.375, "grad_norm_var": 0.1138671875, "learning_rate": 0.0001, "loss": 7.3131, "loss/crossentropy": 2.8144900798797607, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2650986611843109, "step": 2220 }, { "epoch": 0.0694375, "grad_norm": 4.71875, "grad_norm_var": 0.1197265625, "learning_rate": 0.0001, "loss": 6.8706, "loss/crossentropy": 2.5216604471206665, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.25286050140857697, "step": 2222 }, { "epoch": 0.0695, "grad_norm": 4.28125, "grad_norm_var": 0.10403238932291667, "learning_rate": 0.0001, "loss": 6.7715, "loss/crossentropy": 2.4990915060043335, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.24404004216194153, "step": 2224 }, { "epoch": 0.0695625, "grad_norm": 4.5625, "grad_norm_var": 0.10402018229166667, "learning_rate": 0.0001, "loss": 7.2957, "loss/crossentropy": 2.741424560546875, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.26909762620925903, "step": 2226 }, { "epoch": 0.069625, "grad_norm": 4.46875, "grad_norm_var": 0.097509765625, "learning_rate": 0.0001, "loss": 6.9829, "loss/crossentropy": 2.6196401119232178, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.2527298480272293, "step": 2228 }, { "epoch": 0.0696875, "grad_norm": 4.25, "grad_norm_var": 0.10338541666666666, "learning_rate": 0.0001, "loss": 6.6484, "loss/crossentropy": 2.543282628059387, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23239174485206604, "step": 2230 }, { "epoch": 0.06975, "grad_norm": 4.28125, "grad_norm_var": 0.09036458333333333, "learning_rate": 0.0001, "loss": 7.2512, "loss/crossentropy": 2.6861058473587036, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.27017952501773834, "step": 2232 }, { "epoch": 0.0698125, "grad_norm": 4.78125, "grad_norm_var": 0.07317708333333334, "learning_rate": 0.0001, "loss": 7.216, "loss/crossentropy": 2.6648640632629395, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2672192007303238, "step": 2234 }, { "epoch": 0.069875, "grad_norm": 4.46875, "grad_norm_var": 0.06482747395833334, "learning_rate": 0.0001, "loss": 7.0849, "loss/crossentropy": 2.549190878868103, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2660679966211319, "step": 2236 }, { "epoch": 0.0699375, "grad_norm": 4.0, "grad_norm_var": 0.06874593098958333, "learning_rate": 0.0001, "loss": 7.0557, "loss/crossentropy": 2.736733317375183, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2502536326646805, "step": 2238 }, { "epoch": 0.07, "grad_norm": 4.53125, "grad_norm_var": 0.061962890625, "learning_rate": 0.0001, "loss": 7.019, "loss/crossentropy": 2.588713526725769, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2570897936820984, "step": 2240 }, { "epoch": 0.0700625, "grad_norm": 4.6875, "grad_norm_var": 0.07428385416666666, "learning_rate": 0.0001, "loss": 6.9907, "loss/crossentropy": 2.584397792816162, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.24961788207292557, "step": 2242 }, { "epoch": 0.070125, "grad_norm": 4.6875, "grad_norm_var": 0.07375895182291667, "learning_rate": 0.0001, "loss": 7.0313, "loss/crossentropy": 2.6812790632247925, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.24672441184520721, "step": 2244 }, { "epoch": 0.0701875, "grad_norm": 4.28125, "grad_norm_var": 0.07538655598958334, "learning_rate": 0.0001, "loss": 6.9258, "loss/crossentropy": 2.5161720514297485, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.256193146109581, "step": 2246 }, { "epoch": 0.07025, "grad_norm": 4.65625, "grad_norm_var": 0.06269124348958334, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.8025104999542236, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2689158171415329, "step": 2248 }, { "epoch": 0.0703125, "grad_norm": 4.3125, "grad_norm_var": 0.05755208333333333, "learning_rate": 0.0001, "loss": 7.0828, "loss/crossentropy": 2.700048565864563, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2578069567680359, "step": 2250 }, { "epoch": 0.070375, "grad_norm": 4.40625, "grad_norm_var": 0.605712890625, "learning_rate": 0.0001, "loss": 7.0709, "loss/crossentropy": 2.5707215070724487, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.26603344082832336, "step": 2252 }, { "epoch": 0.0704375, "grad_norm": 4.21875, "grad_norm_var": 0.5828125, "learning_rate": 0.0001, "loss": 7.0732, "loss/crossentropy": 2.6843878030776978, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.25528310239315033, "step": 2254 }, { "epoch": 0.0705, "grad_norm": 4.25, "grad_norm_var": 0.5912760416666667, "learning_rate": 0.0001, "loss": 7.1331, "loss/crossentropy": 2.6784332990646362, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.25835826992988586, "step": 2256 }, { "epoch": 0.0705625, "grad_norm": 3.9375, "grad_norm_var": 0.6138671875, "learning_rate": 0.0001, "loss": 6.7684, "loss/crossentropy": 2.484556198120117, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.24244347214698792, "step": 2258 }, { "epoch": 0.070625, "grad_norm": 4.59375, "grad_norm_var": 0.6217732747395833, "learning_rate": 0.0001, "loss": 7.0758, "loss/crossentropy": 2.5867775678634644, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.25866710394620895, "step": 2260 }, { "epoch": 0.0706875, "grad_norm": 3.859375, "grad_norm_var": 0.6412750244140625, "learning_rate": 0.0001, "loss": 6.9539, "loss/crossentropy": 2.667303681373596, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.2450612187385559, "step": 2262 }, { "epoch": 0.07075, "grad_norm": 4.53125, "grad_norm_var": 0.6617746988932292, "learning_rate": 0.0001, "loss": 7.1908, "loss/crossentropy": 2.8160548210144043, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.256610170006752, "step": 2264 }, { "epoch": 0.0708125, "grad_norm": 4.53125, "grad_norm_var": 0.6638661702473958, "learning_rate": 0.0001, "loss": 7.3165, "loss/crossentropy": 2.7610117197036743, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.2633662298321724, "step": 2266 }, { "epoch": 0.070875, "grad_norm": 4.4375, "grad_norm_var": 0.10606180826822917, "learning_rate": 0.0001, "loss": 6.9168, "loss/crossentropy": 2.6086331605911255, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.24604862928390503, "step": 2268 }, { "epoch": 0.0709375, "grad_norm": 4.4375, "grad_norm_var": 0.11240946451822917, "learning_rate": 0.0001, "loss": 6.8032, "loss/crossentropy": 2.4364657402038574, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25191039592027664, "step": 2270 }, { "epoch": 0.071, "grad_norm": 4.1875, "grad_norm_var": 0.11458231608072916, "learning_rate": 0.0001, "loss": 7.0964, "loss/crossentropy": 2.70850145816803, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.25089629739522934, "step": 2272 }, { "epoch": 0.0710625, "grad_norm": 5.15625, "grad_norm_var": 0.13074442545572917, "learning_rate": 0.0001, "loss": 6.8514, "loss/crossentropy": 2.5284905433654785, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.24713944643735886, "step": 2274 }, { "epoch": 0.071125, "grad_norm": 5.15625, "grad_norm_var": 0.15869038899739582, "learning_rate": 0.0001, "loss": 6.9768, "loss/crossentropy": 2.5898996591567993, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2539285346865654, "step": 2276 }, { "epoch": 0.0711875, "grad_norm": 4.25, "grad_norm_var": 0.13668212890625, "learning_rate": 0.0001, "loss": 6.8138, "loss/crossentropy": 2.4170562028884888, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2506166696548462, "step": 2278 }, { "epoch": 0.07125, "grad_norm": 4.21875, "grad_norm_var": 0.12237955729166666, "learning_rate": 0.0001, "loss": 7.1489, "loss/crossentropy": 2.708563208580017, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.25575482845306396, "step": 2280 }, { "epoch": 0.0713125, "grad_norm": 5.3125, "grad_norm_var": 0.15930582682291666, "learning_rate": 0.0001, "loss": 6.9591, "loss/crossentropy": 2.462288975715637, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2645288556814194, "step": 2282 }, { "epoch": 0.071375, "grad_norm": 5.4375, "grad_norm_var": 0.21477864583333334, "learning_rate": 0.0001, "loss": 7.2023, "loss/crossentropy": 2.656105160713196, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.26360785216093063, "step": 2284 }, { "epoch": 0.0714375, "grad_norm": 4.65625, "grad_norm_var": 0.20078125, "learning_rate": 0.0001, "loss": 7.2032, "loss/crossentropy": 2.6529760360717773, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.26947909593582153, "step": 2286 }, { "epoch": 0.0715, "grad_norm": 4.25, "grad_norm_var": 0.18899739583333333, "learning_rate": 0.0001, "loss": 7.1448, "loss/crossentropy": 2.6024595499038696, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2722066640853882, "step": 2288 }, { "epoch": 0.0715625, "grad_norm": 4.15625, "grad_norm_var": 0.18162434895833332, "learning_rate": 0.0001, "loss": 7.2628, "loss/crossentropy": 2.8925254344940186, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25421494245529175, "step": 2290 }, { "epoch": 0.071625, "grad_norm": 4.15625, "grad_norm_var": 0.18349202473958334, "learning_rate": 0.0001, "loss": 7.2124, "loss/crossentropy": 2.778253436088562, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.25826049596071243, "step": 2292 }, { "epoch": 0.0716875, "grad_norm": 4.6875, "grad_norm_var": 0.17978108723958333, "learning_rate": 0.0001, "loss": 7.4538, "loss/crossentropy": 2.843157172203064, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.27200615406036377, "step": 2294 }, { "epoch": 0.07175, "grad_norm": 4.46875, "grad_norm_var": 0.16291910807291668, "learning_rate": 0.0001, "loss": 7.2753, "loss/crossentropy": 2.7370086908340454, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.26828624308109283, "step": 2296 }, { "epoch": 0.0718125, "grad_norm": 4.0, "grad_norm_var": 0.146728515625, "learning_rate": 0.0001, "loss": 6.9942, "loss/crossentropy": 2.6206599473953247, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2533659115433693, "step": 2298 }, { "epoch": 0.071875, "grad_norm": 5.0625, "grad_norm_var": 0.15881754557291666, "learning_rate": 0.0001, "loss": 7.5766, "loss/crossentropy": 2.930529475212097, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.27671411633491516, "step": 2300 }, { "epoch": 0.0719375, "grad_norm": 4.53125, "grad_norm_var": 0.14914957682291666, "learning_rate": 0.0001, "loss": 7.2869, "loss/crossentropy": 2.8419981002807617, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2577664852142334, "step": 2302 }, { "epoch": 0.072, "grad_norm": 4.8125, "grad_norm_var": 0.14768473307291666, "learning_rate": 0.0001, "loss": 7.244, "loss/crossentropy": 2.6828211545944214, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.26588352024555206, "step": 2304 }, { "epoch": 0.0720625, "grad_norm": 5.3125, "grad_norm_var": 0.16634114583333334, "learning_rate": 0.0001, "loss": 7.2035, "loss/crossentropy": 2.6578863859176636, "loss/hidden": 1.90625, "loss/jsd": 0.0, "loss/logits": 0.26393402367830276, "step": 2306 }, { "epoch": 0.072125, "grad_norm": 4.1875, "grad_norm_var": 0.138671875, "learning_rate": 0.0001, "loss": 6.9004, "loss/crossentropy": 2.594657063484192, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.24971309304237366, "step": 2308 }, { "epoch": 0.0721875, "grad_norm": 4.28125, "grad_norm_var": 0.15597330729166667, "learning_rate": 0.0001, "loss": 6.9301, "loss/crossentropy": 2.602201819419861, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.2495882734656334, "step": 2310 }, { "epoch": 0.07225, "grad_norm": 4.40625, "grad_norm_var": 0.15909830729166666, "learning_rate": 0.0001, "loss": 7.218, "loss/crossentropy": 2.758484959602356, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.263138011097908, "step": 2312 }, { "epoch": 0.0723125, "grad_norm": 4.21875, "grad_norm_var": 0.14862874348958333, "learning_rate": 0.0001, "loss": 6.7638, "loss/crossentropy": 2.4779679775238037, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.24811531603336334, "step": 2314 }, { "epoch": 0.072375, "grad_norm": 4.125, "grad_norm_var": 0.10048421223958333, "learning_rate": 0.0001, "loss": 7.0218, "loss/crossentropy": 2.633275866508484, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.25565214455127716, "step": 2316 }, { "epoch": 0.0724375, "grad_norm": 4.84375, "grad_norm_var": 0.11900634765625, "learning_rate": 0.0001, "loss": 7.4657, "loss/crossentropy": 2.900314688682556, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.26982201635837555, "step": 2318 }, { "epoch": 0.0725, "grad_norm": 4.34375, "grad_norm_var": 0.12750244140625, "learning_rate": 0.0001, "loss": 7.4663, "loss/crossentropy": 2.815197467803955, "loss/hidden": 1.9140625, "loss/jsd": 0.0, "loss/logits": 0.2737061530351639, "step": 2320 }, { "epoch": 0.0725625, "grad_norm": 4.625, "grad_norm_var": 0.08837483723958334, "learning_rate": 0.0001, "loss": 7.0731, "loss/crossentropy": 2.589448571205139, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.25969552993774414, "step": 2322 }, { "epoch": 0.072625, "grad_norm": 4.125, "grad_norm_var": 0.09178059895833333, "learning_rate": 0.0001, "loss": 6.8812, "loss/crossentropy": 2.5958632230758667, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.25002047419548035, "step": 2324 }, { "epoch": 0.0726875, "grad_norm": 4.28125, "grad_norm_var": 0.08857014973958334, "learning_rate": 0.0001, "loss": 7.0842, "loss/crossentropy": 2.6766932010650635, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2548135668039322, "step": 2326 }, { "epoch": 0.07275, "grad_norm": 4.21875, "grad_norm_var": 0.10354410807291667, "learning_rate": 0.0001, "loss": 6.875, "loss/crossentropy": 2.6552330255508423, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.2434604912996292, "step": 2328 }, { "epoch": 0.0728125, "grad_norm": 4.625, "grad_norm_var": 0.090869140625, "learning_rate": 0.0001, "loss": 6.878, "loss/crossentropy": 2.5861902236938477, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24948906898498535, "step": 2330 }, { "epoch": 0.072875, "grad_norm": 4.8125, "grad_norm_var": 0.10104166666666667, "learning_rate": 0.0001, "loss": 6.9627, "loss/crossentropy": 2.5423258543014526, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25726811587810516, "step": 2332 }, { "epoch": 0.0729375, "grad_norm": 4.40625, "grad_norm_var": 0.07372639973958334, "learning_rate": 0.0001, "loss": 6.9905, "loss/crossentropy": 2.631394863128662, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2519259601831436, "step": 2334 }, { "epoch": 0.073, "grad_norm": 4.46875, "grad_norm_var": 0.05813802083333333, "learning_rate": 0.0001, "loss": 7.1794, "loss/crossentropy": 2.736093521118164, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25956377387046814, "step": 2336 }, { "epoch": 0.0730625, "grad_norm": 4.1875, "grad_norm_var": 0.059305826822916664, "learning_rate": 0.0001, "loss": 6.9842, "loss/crossentropy": 2.6525719165802, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.24722251296043396, "step": 2338 }, { "epoch": 0.073125, "grad_norm": 4.34375, "grad_norm_var": 0.056050618489583336, "learning_rate": 0.0001, "loss": 6.9389, "loss/crossentropy": 2.6012275218963623, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.2505672574043274, "step": 2340 }, { "epoch": 0.0731875, "grad_norm": 4.1875, "grad_norm_var": 0.05927327473958333, "learning_rate": 0.0001, "loss": 6.6983, "loss/crossentropy": 2.4778064489364624, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.23923813551664352, "step": 2342 }, { "epoch": 0.07325, "grad_norm": 5.25, "grad_norm_var": 0.9503255208333333, "learning_rate": 0.0001, "loss": 7.0442, "loss/crossentropy": 2.479038953781128, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.26549845933914185, "step": 2344 }, { "epoch": 0.0733125, "grad_norm": 4.71875, "grad_norm_var": 0.9283203125, "learning_rate": 0.0001, "loss": 7.0029, "loss/crossentropy": 2.595993757247925, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.25943995267152786, "step": 2346 }, { "epoch": 0.073375, "grad_norm": 4.84375, "grad_norm_var": 0.9596354166666666, "learning_rate": 0.0001, "loss": 7.6093, "loss/crossentropy": 2.858232021331787, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.27940085530281067, "step": 2348 }, { "epoch": 0.0734375, "grad_norm": 4.46875, "grad_norm_var": 0.95078125, "learning_rate": 0.0001, "loss": 7.2626, "loss/crossentropy": 2.632628321647644, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.2747160494327545, "step": 2350 }, { "epoch": 0.0735, "grad_norm": 4.1875, "grad_norm_var": 0.97877197265625, "learning_rate": 0.0001, "loss": 6.8277, "loss/crossentropy": 2.6192827224731445, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.24232970923185349, "step": 2352 }, { "epoch": 0.0735625, "grad_norm": 4.59375, "grad_norm_var": 0.9653645833333333, "learning_rate": 0.0001, "loss": 6.9852, "loss/crossentropy": 2.6112306118011475, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.2541925981640816, "step": 2354 }, { "epoch": 0.073625, "grad_norm": 3.890625, "grad_norm_var": 1.0204905192057292, "learning_rate": 0.0001, "loss": 6.8813, "loss/crossentropy": 2.5602935552597046, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2516297549009323, "step": 2356 }, { "epoch": 0.0736875, "grad_norm": 3.796875, "grad_norm_var": 1.06181640625, "learning_rate": 0.0001, "loss": 7.0254, "loss/crossentropy": 2.710487961769104, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.250630758702755, "step": 2358 }, { "epoch": 0.07375, "grad_norm": 4.84375, "grad_norm_var": 0.17678629557291667, "learning_rate": 0.0001, "loss": 6.9378, "loss/crossentropy": 2.587727904319763, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.24946476519107819, "step": 2360 }, { "epoch": 0.0738125, "grad_norm": 4.25, "grad_norm_var": 0.21761067708333334, "learning_rate": 0.0001, "loss": 7.5178, "loss/crossentropy": 2.8490744829177856, "loss/hidden": 1.95703125, "loss/jsd": 0.0, "loss/logits": 0.2711695656180382, "step": 2362 }, { "epoch": 0.073875, "grad_norm": 4.40625, "grad_norm_var": 0.17144775390625, "learning_rate": 0.0001, "loss": 6.7725, "loss/crossentropy": 2.4507436752319336, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2427205666899681, "step": 2364 }, { "epoch": 0.0739375, "grad_norm": 5.21875, "grad_norm_var": 0.19394124348958333, "learning_rate": 0.0001, "loss": 6.8787, "loss/crossentropy": 2.422315001487732, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.2589210420846939, "step": 2366 }, { "epoch": 0.074, "grad_norm": 4.90625, "grad_norm_var": 0.20370686848958333, "learning_rate": 0.0001, "loss": 6.9107, "loss/crossentropy": 2.5740681886672974, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.25202006101608276, "step": 2368 }, { "epoch": 0.0740625, "grad_norm": 4.65625, "grad_norm_var": 0.24055582682291668, "learning_rate": 0.0001, "loss": 6.6577, "loss/crossentropy": 2.347817301750183, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2505149394273758, "step": 2370 }, { "epoch": 0.074125, "grad_norm": 4.53125, "grad_norm_var": 0.21880594889322916, "learning_rate": 0.0001, "loss": 7.4111, "loss/crossentropy": 2.738089680671692, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.27940770983695984, "step": 2372 }, { "epoch": 0.0741875, "grad_norm": 4.625, "grad_norm_var": 0.18175455729166667, "learning_rate": 0.0001, "loss": 6.877, "loss/crossentropy": 2.653886318206787, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.24067474901676178, "step": 2374 }, { "epoch": 0.07425, "grad_norm": 4.21875, "grad_norm_var": 0.18502604166666667, "learning_rate": 0.0001, "loss": 7.034, "loss/crossentropy": 2.6042428016662598, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.26328346133232117, "step": 2376 }, { "epoch": 0.0743125, "grad_norm": 4.0, "grad_norm_var": 0.16568603515625, "learning_rate": 0.0001, "loss": 6.7866, "loss/crossentropy": 2.5228244066238403, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2451310157775879, "step": 2378 }, { "epoch": 0.074375, "grad_norm": 4.78125, "grad_norm_var": 0.15245768229166667, "learning_rate": 0.0001, "loss": 7.0086, "loss/crossentropy": 2.58968186378479, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2571246773004532, "step": 2380 }, { "epoch": 0.0744375, "grad_norm": 4.34375, "grad_norm_var": 0.106884765625, "learning_rate": 0.0001, "loss": 7.0466, "loss/crossentropy": 2.6445672512054443, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2562218904495239, "step": 2382 }, { "epoch": 0.0745, "grad_norm": 4.125, "grad_norm_var": 0.08958333333333333, "learning_rate": 0.0001, "loss": 6.81, "loss/crossentropy": 2.5752354860305786, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24613595008850098, "step": 2384 }, { "epoch": 0.0745625, "grad_norm": 3.78125, "grad_norm_var": 0.08033447265625, "learning_rate": 0.0001, "loss": 6.9493, "loss/crossentropy": 2.652430534362793, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2488238662481308, "step": 2386 }, { "epoch": 0.074625, "grad_norm": 4.46875, "grad_norm_var": 0.058186848958333336, "learning_rate": 0.0001, "loss": 7.2682, "loss/crossentropy": 2.7897156476974487, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2619110345840454, "step": 2388 }, { "epoch": 0.0746875, "grad_norm": 4.3125, "grad_norm_var": 0.053515625, "learning_rate": 0.0001, "loss": 7.0406, "loss/crossentropy": 2.68414843082428, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.25517675280570984, "step": 2390 }, { "epoch": 0.07475, "grad_norm": 4.96875, "grad_norm_var": 0.08912353515625, "learning_rate": 0.0001, "loss": 6.6606, "loss/crossentropy": 2.4219181537628174, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2418384850025177, "step": 2392 }, { "epoch": 0.0748125, "grad_norm": 4.0625, "grad_norm_var": 0.09413655598958333, "learning_rate": 0.0001, "loss": 6.7802, "loss/crossentropy": 2.5747569799423218, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.24475835263729095, "step": 2394 }, { "epoch": 0.074875, "grad_norm": 4.53125, "grad_norm_var": 0.07884114583333333, "learning_rate": 0.0001, "loss": 6.8577, "loss/crossentropy": 2.540056586265564, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.25363685190677643, "step": 2396 }, { "epoch": 0.0749375, "grad_norm": 3.8125, "grad_norm_var": 0.10328369140625, "learning_rate": 0.0001, "loss": 6.8691, "loss/crossentropy": 2.5701223611831665, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.24942508339881897, "step": 2398 }, { "epoch": 0.075, "grad_norm": 4.3125, "grad_norm_var": 0.10064697265625, "learning_rate": 0.0001, "loss": 7.2102, "loss/crossentropy": 2.895754337310791, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.25058774650096893, "step": 2400 }, { "epoch": 0.0750625, "grad_norm": 4.5625, "grad_norm_var": 0.08388264973958333, "learning_rate": 0.0001, "loss": 7.0321, "loss/crossentropy": 2.605707287788391, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.25826863944530487, "step": 2402 }, { "epoch": 0.075125, "grad_norm": 5.1875, "grad_norm_var": 0.12615559895833334, "learning_rate": 0.0001, "loss": 7.0141, "loss/crossentropy": 2.5209784507751465, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2602469325065613, "step": 2404 }, { "epoch": 0.0751875, "grad_norm": 4.4375, "grad_norm_var": 0.17769775390625, "learning_rate": 0.0001, "loss": 7.0747, "loss/crossentropy": 2.622338056564331, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.25968825817108154, "step": 2406 }, { "epoch": 0.07525, "grad_norm": 4.90625, "grad_norm_var": 0.17310791015625, "learning_rate": 0.0001, "loss": 7.479, "loss/crossentropy": 2.869504928588867, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.27423471212387085, "step": 2408 }, { "epoch": 0.0753125, "grad_norm": 4.0625, "grad_norm_var": 0.16378580729166667, "learning_rate": 0.0001, "loss": 7.0745, "loss/crossentropy": 2.7071975469589233, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.25040150433778763, "step": 2410 }, { "epoch": 0.075375, "grad_norm": 4.46875, "grad_norm_var": 0.2552083333333333, "learning_rate": 0.0001, "loss": 7.2737, "loss/crossentropy": 2.802687883377075, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.26350709050893784, "step": 2412 }, { "epoch": 0.0754375, "grad_norm": 4.0625, "grad_norm_var": 0.23303629557291666, "learning_rate": 0.0001, "loss": 6.7801, "loss/crossentropy": 2.491695761680603, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2499319687485695, "step": 2414 }, { "epoch": 0.0755, "grad_norm": 4.75, "grad_norm_var": 0.21808268229166666, "learning_rate": 0.0001, "loss": 7.4099, "loss/crossentropy": 2.7706817388534546, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2729024589061737, "step": 2416 }, { "epoch": 0.0755625, "grad_norm": 4.3125, "grad_norm_var": 0.2091796875, "learning_rate": 0.0001, "loss": 6.9253, "loss/crossentropy": 2.564241409301758, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.25094784796237946, "step": 2418 }, { "epoch": 0.075625, "grad_norm": 3.890625, "grad_norm_var": 1.3548329671223958, "learning_rate": 0.0001, "loss": 7.1162, "loss/crossentropy": 2.616005301475525, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.26642537117004395, "step": 2420 }, { "epoch": 0.0756875, "grad_norm": 4.09375, "grad_norm_var": 1.411766560872396, "learning_rate": 0.0001, "loss": 6.7817, "loss/crossentropy": 2.5218793153762817, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.245124451816082, "step": 2422 }, { "epoch": 0.07575, "grad_norm": 4.125, "grad_norm_var": 1.4401601155598958, "learning_rate": 0.0001, "loss": 6.9256, "loss/crossentropy": 2.559067964553833, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.25657251477241516, "step": 2424 }, { "epoch": 0.0758125, "grad_norm": 4.59375, "grad_norm_var": 1.4102203369140625, "learning_rate": 0.0001, "loss": 7.1871, "loss/crossentropy": 2.6583296060562134, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.26381099224090576, "step": 2426 }, { "epoch": 0.075875, "grad_norm": 4.03125, "grad_norm_var": 1.3658599853515625, "learning_rate": 0.0001, "loss": 6.7713, "loss/crossentropy": 2.5485886335372925, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24102021753787994, "step": 2428 }, { "epoch": 0.0759375, "grad_norm": 4.46875, "grad_norm_var": 1.3427073160807292, "learning_rate": 0.0001, "loss": 7.1977, "loss/crossentropy": 2.817766785621643, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.25596490502357483, "step": 2430 }, { "epoch": 0.076, "grad_norm": 25.5, "grad_norm_var": 28.454483032226562, "learning_rate": 0.0001, "loss": 7.4982, "loss/crossentropy": 2.580443024635315, "loss/hidden": 2.01171875, "loss/jsd": 0.0, "loss/logits": 0.2906050682067871, "step": 2432 }, { "epoch": 0.0760625, "grad_norm": 4.59375, "grad_norm_var": 28.318000284830728, "learning_rate": 0.0001, "loss": 7.4054, "loss/crossentropy": 2.8410911560058594, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2685410678386688, "step": 2434 }, { "epoch": 0.076125, "grad_norm": 4.75, "grad_norm_var": 27.567769368489582, "learning_rate": 0.0001, "loss": 7.0348, "loss/crossentropy": 2.5193880796432495, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2652158737182617, "step": 2436 }, { "epoch": 0.0761875, "grad_norm": 4.375, "grad_norm_var": 27.481148274739585, "learning_rate": 0.0001, "loss": 6.8777, "loss/crossentropy": 2.5700310468673706, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24951593577861786, "step": 2438 }, { "epoch": 0.07625, "grad_norm": 4.5625, "grad_norm_var": 27.446207682291668, "learning_rate": 0.0001, "loss": 6.4905, "loss/crossentropy": 2.379134774208069, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.23027870059013367, "step": 2440 }, { "epoch": 0.0763125, "grad_norm": 4.125, "grad_norm_var": 27.518229166666668, "learning_rate": 0.0001, "loss": 6.9303, "loss/crossentropy": 2.5619860887527466, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.24503721296787262, "step": 2442 }, { "epoch": 0.076375, "grad_norm": 4.1875, "grad_norm_var": 27.57097880045573, "learning_rate": 0.0001, "loss": 6.5573, "loss/crossentropy": 2.392979383468628, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.23713234812021255, "step": 2444 }, { "epoch": 0.0764375, "grad_norm": 5.125, "grad_norm_var": 27.42019755045573, "learning_rate": 0.0001, "loss": 7.196, "loss/crossentropy": 2.5500316619873047, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2727985829114914, "step": 2446 }, { "epoch": 0.0765, "grad_norm": 4.40625, "grad_norm_var": 0.3015696207682292, "learning_rate": 0.0001, "loss": 7.0044, "loss/crossentropy": 2.5839664936065674, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.257663831114769, "step": 2448 }, { "epoch": 0.0765625, "grad_norm": 4.59375, "grad_norm_var": 0.4693349202473958, "learning_rate": 0.0001, "loss": 6.9517, "loss/crossentropy": 2.5211989879608154, "loss/hidden": 1.953125, "loss/jsd": 0.0, "loss/logits": 0.24773336946964264, "step": 2450 }, { "epoch": 0.076625, "grad_norm": 3.75, "grad_norm_var": 0.37576395670572915, "learning_rate": 0.0001, "loss": 7.0608, "loss/crossentropy": 2.7452404499053955, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24913445115089417, "step": 2452 }, { "epoch": 0.0766875, "grad_norm": 3.953125, "grad_norm_var": 0.3881144205729167, "learning_rate": 0.0001, "loss": 7.0236, "loss/crossentropy": 2.7359414100646973, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2486885040998459, "step": 2454 }, { "epoch": 0.07675, "grad_norm": 4.46875, "grad_norm_var": 0.4288736979166667, "learning_rate": 0.0001, "loss": 6.6268, "loss/crossentropy": 2.423543334007263, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.24180874228477478, "step": 2456 }, { "epoch": 0.0768125, "grad_norm": 4.875, "grad_norm_var": 0.5277506510416666, "learning_rate": 0.0001, "loss": 7.222, "loss/crossentropy": 2.639382839202881, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.2664603292942047, "step": 2458 }, { "epoch": 0.076875, "grad_norm": 4.78125, "grad_norm_var": 0.5103830973307292, "learning_rate": 0.0001, "loss": 7.0783, "loss/crossentropy": 2.7202084064483643, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.24714156985282898, "step": 2460 }, { "epoch": 0.0769375, "grad_norm": 3.953125, "grad_norm_var": 0.5262532552083333, "learning_rate": 0.0001, "loss": 6.6897, "loss/crossentropy": 2.4970299005508423, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.236849345266819, "step": 2462 }, { "epoch": 0.077, "grad_norm": 4.4375, "grad_norm_var": 0.5183919270833334, "learning_rate": 0.0001, "loss": 7.4318, "loss/crossentropy": 2.875585675239563, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2696887403726578, "step": 2464 }, { "epoch": 0.0770625, "grad_norm": 4.28125, "grad_norm_var": 0.30789388020833336, "learning_rate": 0.0001, "loss": 7.137, "loss/crossentropy": 2.7693514823913574, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2551232725381851, "step": 2466 }, { "epoch": 0.077125, "grad_norm": 4.375, "grad_norm_var": 0.27701416015625, "learning_rate": 0.0001, "loss": 6.8556, "loss/crossentropy": 2.479888916015625, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2555390000343323, "step": 2468 }, { "epoch": 0.0771875, "grad_norm": 3.75, "grad_norm_var": 0.30461324055989586, "learning_rate": 0.0001, "loss": 6.6706, "loss/crossentropy": 2.5898760557174683, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2334597110748291, "step": 2470 }, { "epoch": 0.07725, "grad_norm": 5.0625, "grad_norm_var": 0.3222401936848958, "learning_rate": 0.0001, "loss": 7.4498, "loss/crossentropy": 2.784128785133362, "loss/hidden": 1.91796875, "loss/jsd": 0.0, "loss/logits": 0.27476924657821655, "step": 2472 }, { "epoch": 0.0773125, "grad_norm": 4.5625, "grad_norm_var": 0.2200592041015625, "learning_rate": 0.0001, "loss": 7.3398, "loss/crossentropy": 2.880878210067749, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2556593418121338, "step": 2474 }, { "epoch": 0.077375, "grad_norm": 4.21875, "grad_norm_var": 0.17563374837239584, "learning_rate": 0.0001, "loss": 7.2807, "loss/crossentropy": 2.877553939819336, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2574998736381531, "step": 2476 }, { "epoch": 0.0774375, "grad_norm": 4.65625, "grad_norm_var": 0.16825764973958332, "learning_rate": 0.0001, "loss": 6.9746, "loss/crossentropy": 2.594506025314331, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.25558608770370483, "step": 2478 }, { "epoch": 0.0775, "grad_norm": 4.78125, "grad_norm_var": 0.17343343098958333, "learning_rate": 0.0001, "loss": 7.2236, "loss/crossentropy": 2.823242425918579, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.25761541724205017, "step": 2480 }, { "epoch": 0.0775625, "grad_norm": 5.09375, "grad_norm_var": 0.19537353515625, "learning_rate": 0.0001, "loss": 7.0229, "loss/crossentropy": 2.6774028539657593, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2556390166282654, "step": 2482 }, { "epoch": 0.077625, "grad_norm": 4.375, "grad_norm_var": 0.1978515625, "learning_rate": 0.0001, "loss": 6.6963, "loss/crossentropy": 2.5514711141586304, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.23284562677145004, "step": 2484 }, { "epoch": 0.0776875, "grad_norm": 5.9375, "grad_norm_var": 0.25780843098958334, "learning_rate": 0.0001, "loss": 7.4183, "loss/crossentropy": 2.7475714683532715, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.2768399715423584, "step": 2486 }, { "epoch": 0.07775, "grad_norm": 3.953125, "grad_norm_var": 0.22437235514322917, "learning_rate": 0.0001, "loss": 6.837, "loss/crossentropy": 2.534694790840149, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.25562240928411484, "step": 2488 }, { "epoch": 0.0778125, "grad_norm": 3.921875, "grad_norm_var": 0.2560831705729167, "learning_rate": 0.0001, "loss": 6.665, "loss/crossentropy": 2.4949283599853516, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.23770590126514435, "step": 2490 }, { "epoch": 0.077875, "grad_norm": 4.90625, "grad_norm_var": 0.86773681640625, "learning_rate": 0.0001, "loss": 7.0473, "loss/crossentropy": 2.5547311305999756, "loss/hidden": 1.90234375, "loss/jsd": 0.0, "loss/logits": 0.25901947915554047, "step": 2492 }, { "epoch": 0.0779375, "grad_norm": 4.78125, "grad_norm_var": 0.895458984375, "learning_rate": 0.0001, "loss": 6.822, "loss/crossentropy": 2.5802226066589355, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.23980476707220078, "step": 2494 }, { "epoch": 0.078, "grad_norm": 3.71875, "grad_norm_var": 0.9597819010416667, "learning_rate": 0.0001, "loss": 6.2974, "loss/crossentropy": 2.323503851890564, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.21965843439102173, "step": 2496 }, { "epoch": 0.0780625, "grad_norm": 4.09375, "grad_norm_var": 0.96005859375, "learning_rate": 0.0001, "loss": 6.6043, "loss/crossentropy": 2.4191696643829346, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2384398654103279, "step": 2498 }, { "epoch": 0.078125, "grad_norm": 4.21875, "grad_norm_var": 0.9535441080729167, "learning_rate": 0.0001, "loss": 7.2297, "loss/crossentropy": 2.8721203804016113, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.251773476600647, "step": 2500 }, { "epoch": 0.0781875, "grad_norm": 4.1875, "grad_norm_var": 0.83916015625, "learning_rate": 0.0001, "loss": 6.9023, "loss/crossentropy": 2.6179966926574707, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.247569240629673, "step": 2502 }, { "epoch": 0.07825, "grad_norm": 4.6875, "grad_norm_var": 1.0491770426432292, "learning_rate": 0.0001, "loss": 7.201, "loss/crossentropy": 2.5599087476730347, "loss/hidden": 1.94140625, "loss/jsd": 0.0, "loss/logits": 0.26996414363384247, "step": 2504 }, { "epoch": 0.0783125, "grad_norm": 4.75, "grad_norm_var": 1.005322265625, "learning_rate": 0.0001, "loss": 7.369, "loss/crossentropy": 2.879120349884033, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.26656413078308105, "step": 2506 }, { "epoch": 0.078375, "grad_norm": 4.375, "grad_norm_var": 0.3843587239583333, "learning_rate": 0.0001, "loss": 7.1051, "loss/crossentropy": 2.634618043899536, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.26384034752845764, "step": 2508 }, { "epoch": 0.0784375, "grad_norm": 4.1875, "grad_norm_var": 0.36886393229166664, "learning_rate": 0.0001, "loss": 6.8611, "loss/crossentropy": 2.601563811302185, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.24236443638801575, "step": 2510 }, { "epoch": 0.0785, "grad_norm": 4.5, "grad_norm_var": 0.39342041015625, "learning_rate": 0.0001, "loss": 7.0797, "loss/crossentropy": 2.5156508684158325, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2685118168592453, "step": 2512 }, { "epoch": 0.0785625, "grad_norm": 4.125, "grad_norm_var": 0.398681640625, "learning_rate": 0.0001, "loss": 6.9509, "loss/crossentropy": 2.646916389465332, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.24993188679218292, "step": 2514 }, { "epoch": 0.078625, "grad_norm": 4.3125, "grad_norm_var": 0.3974609375, "learning_rate": 0.0001, "loss": 6.8179, "loss/crossentropy": 2.5790480375289917, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24420031160116196, "step": 2516 }, { "epoch": 0.0786875, "grad_norm": 4.3125, "grad_norm_var": 0.38648681640625, "learning_rate": 0.0001, "loss": 6.9183, "loss/crossentropy": 2.5294101238250732, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25607292354106903, "step": 2518 }, { "epoch": 0.07875, "grad_norm": 3.765625, "grad_norm_var": 0.18620503743489583, "learning_rate": 0.0001, "loss": 6.6685, "loss/crossentropy": 2.481043577194214, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2414042055606842, "step": 2520 }, { "epoch": 0.0788125, "grad_norm": 4.46875, "grad_norm_var": 0.1506500244140625, "learning_rate": 0.0001, "loss": 7.3269, "loss/crossentropy": 2.931631326675415, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2582782357931137, "step": 2522 }, { "epoch": 0.078875, "grad_norm": 4.09375, "grad_norm_var": 0.1521148681640625, "learning_rate": 0.0001, "loss": 6.8272, "loss/crossentropy": 2.5830577611923218, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24512076377868652, "step": 2524 }, { "epoch": 0.0789375, "grad_norm": 4.46875, "grad_norm_var": 0.14638570149739583, "learning_rate": 0.0001, "loss": 6.795, "loss/crossentropy": 2.4542908668518066, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.24617988616228104, "step": 2526 }, { "epoch": 0.079, "grad_norm": 3.96875, "grad_norm_var": 0.0508941650390625, "learning_rate": 0.0001, "loss": 6.8898, "loss/crossentropy": 2.6073638200759888, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2509048581123352, "step": 2528 }, { "epoch": 0.0790625, "grad_norm": 4.0, "grad_norm_var": 0.0536041259765625, "learning_rate": 0.0001, "loss": 6.7992, "loss/crossentropy": 2.7190089225769043, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.22716018557548523, "step": 2530 }, { "epoch": 0.079125, "grad_norm": 4.84375, "grad_norm_var": 0.09160054524739583, "learning_rate": 0.0001, "loss": 7.4126, "loss/crossentropy": 2.7276889085769653, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.27903781831264496, "step": 2532 }, { "epoch": 0.0791875, "grad_norm": 4.09375, "grad_norm_var": 0.09919331868489584, "learning_rate": 0.0001, "loss": 6.6975, "loss/crossentropy": 2.497929811477661, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2410556524991989, "step": 2534 }, { "epoch": 0.07925, "grad_norm": 4.3125, "grad_norm_var": 0.16210530598958334, "learning_rate": 0.0001, "loss": 7.0674, "loss/crossentropy": 2.7110267877578735, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25282643735408783, "step": 2536 }, { "epoch": 0.0793125, "grad_norm": 4.78125, "grad_norm_var": 0.16951497395833334, "learning_rate": 0.0001, "loss": 6.9377, "loss/crossentropy": 2.6172502040863037, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2523553669452667, "step": 2538 }, { "epoch": 0.079375, "grad_norm": 6.71875, "grad_norm_var": 0.47849934895833335, "learning_rate": 0.0001, "loss": 6.696, "loss/crossentropy": 2.4634408950805664, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.23614561557769775, "step": 2540 }, { "epoch": 0.0794375, "grad_norm": 4.0, "grad_norm_var": 0.5531209309895834, "learning_rate": 0.0001, "loss": 7.1948, "loss/crossentropy": 2.777570605278015, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.2573477476835251, "step": 2542 }, { "epoch": 0.0795, "grad_norm": 4.96875, "grad_norm_var": 0.5543619791666666, "learning_rate": 0.0001, "loss": 6.8532, "loss/crossentropy": 2.5255823135375977, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.24799229949712753, "step": 2544 }, { "epoch": 0.0795625, "grad_norm": 4.25, "grad_norm_var": 0.52720947265625, "learning_rate": 0.0001, "loss": 7.1704, "loss/crossentropy": 2.666976809501648, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.26518701016902924, "step": 2546 }, { "epoch": 0.079625, "grad_norm": 4.84375, "grad_norm_var": 0.5709869384765625, "learning_rate": 0.0001, "loss": 6.6822, "loss/crossentropy": 2.4179537296295166, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.24439681321382523, "step": 2548 }, { "epoch": 0.0796875, "grad_norm": 4.875, "grad_norm_var": 0.5432688395182291, "learning_rate": 0.0001, "loss": 7.5325, "loss/crossentropy": 2.8433114290237427, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2798580527305603, "step": 2550 }, { "epoch": 0.07975, "grad_norm": 5.1875, "grad_norm_var": 0.5427805582682291, "learning_rate": 0.0001, "loss": 6.5928, "loss/crossentropy": 2.402013421058655, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.23509907722473145, "step": 2552 }, { "epoch": 0.0798125, "grad_norm": 3.71875, "grad_norm_var": 0.5950429280598958, "learning_rate": 0.0001, "loss": 6.832, "loss/crossentropy": 2.5818487405776978, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24493704736232758, "step": 2554 }, { "epoch": 0.079875, "grad_norm": 5.09375, "grad_norm_var": 0.3174468994140625, "learning_rate": 0.0001, "loss": 7.0319, "loss/crossentropy": 2.6635189056396484, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.24973279982805252, "step": 2556 }, { "epoch": 0.0799375, "grad_norm": 4.9375, "grad_norm_var": 0.2541737874348958, "learning_rate": 0.0001, "loss": 7.4462, "loss/crossentropy": 2.94194233417511, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.2644868791103363, "step": 2558 }, { "epoch": 0.08, "grad_norm": 4.15625, "grad_norm_var": 0.24468485514322916, "learning_rate": 0.0001, "loss": 6.816, "loss/crossentropy": 2.615973472595215, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24071022123098373, "step": 2560 }, { "epoch": 0.0800625, "grad_norm": 4.53125, "grad_norm_var": 0.23405659993489583, "learning_rate": 0.0001, "loss": 7.2102, "loss/crossentropy": 2.626392126083374, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.27126961946487427, "step": 2562 }, { "epoch": 0.080125, "grad_norm": 4.15625, "grad_norm_var": 0.23209228515625, "learning_rate": 0.0001, "loss": 7.2899, "loss/crossentropy": 2.768537402153015, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.27244727313518524, "step": 2564 }, { "epoch": 0.0801875, "grad_norm": 4.84375, "grad_norm_var": 0.23557535807291666, "learning_rate": 0.0001, "loss": 6.807, "loss/crossentropy": 2.501596689224243, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.24968481063842773, "step": 2566 }, { "epoch": 0.08025, "grad_norm": 5.78125, "grad_norm_var": 0.3113433837890625, "learning_rate": 0.0001, "loss": 6.8229, "loss/crossentropy": 2.6043057441711426, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.24490661919116974, "step": 2568 }, { "epoch": 0.0803125, "grad_norm": 5.625, "grad_norm_var": 0.3254954020182292, "learning_rate": 0.0001, "loss": 6.9471, "loss/crossentropy": 2.533547282218933, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.26205888390541077, "step": 2570 }, { "epoch": 0.080375, "grad_norm": 4.34375, "grad_norm_var": 0.3199534098307292, "learning_rate": 0.0001, "loss": 7.1706, "loss/crossentropy": 2.7811293601989746, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.25652652978897095, "step": 2572 }, { "epoch": 0.0804375, "grad_norm": 4.53125, "grad_norm_var": 0.31607157389322915, "learning_rate": 0.0001, "loss": 7.0857, "loss/crossentropy": 2.6379505395889282, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.25727570056915283, "step": 2574 }, { "epoch": 0.0805, "grad_norm": 4.625, "grad_norm_var": 0.28181864420572916, "learning_rate": 0.0001, "loss": 7.2665, "loss/crossentropy": 2.77507221698761, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.2596895396709442, "step": 2576 }, { "epoch": 0.0805625, "grad_norm": 4.53125, "grad_norm_var": 0.2671132405598958, "learning_rate": 0.0001, "loss": 7.2025, "loss/crossentropy": 2.7237536907196045, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.26388655602931976, "step": 2578 }, { "epoch": 0.080625, "grad_norm": 4.46875, "grad_norm_var": 0.2418121337890625, "learning_rate": 0.0001, "loss": 6.6304, "loss/crossentropy": 2.4240094423294067, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.23938723653554916, "step": 2580 }, { "epoch": 0.0806875, "grad_norm": 3.984375, "grad_norm_var": 0.248388671875, "learning_rate": 0.0001, "loss": 6.8911, "loss/crossentropy": 2.5649718046188354, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2521471083164215, "step": 2582 }, { "epoch": 0.08075, "grad_norm": 4.125, "grad_norm_var": 0.1258209228515625, "learning_rate": 0.0001, "loss": 6.8676, "loss/crossentropy": 2.6262258291244507, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.24718241393566132, "step": 2584 }, { "epoch": 0.0808125, "grad_norm": 4.1875, "grad_norm_var": 0.040852864583333336, "learning_rate": 0.0001, "loss": 6.9075, "loss/crossentropy": 2.565857768058777, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2533073127269745, "step": 2586 }, { "epoch": 0.080875, "grad_norm": 4.03125, "grad_norm_var": 0.05105794270833333, "learning_rate": 0.0001, "loss": 7.0589, "loss/crossentropy": 2.7443935871124268, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.2537176012992859, "step": 2588 }, { "epoch": 0.0809375, "grad_norm": 4.625, "grad_norm_var": 0.05431315104166667, "learning_rate": 0.0001, "loss": 7.094, "loss/crossentropy": 2.6940484046936035, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.25601277500391006, "step": 2590 }, { "epoch": 0.081, "grad_norm": 5.15625, "grad_norm_var": 0.099755859375, "learning_rate": 0.0001, "loss": 6.7848, "loss/crossentropy": 2.6095337867736816, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.24252191185951233, "step": 2592 }, { "epoch": 0.0810625, "grad_norm": 4.875, "grad_norm_var": 0.11858723958333334, "learning_rate": 0.0001, "loss": 6.9664, "loss/crossentropy": 2.6425299644470215, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.24917972832918167, "step": 2594 }, { "epoch": 0.081125, "grad_norm": 3.9375, "grad_norm_var": 0.12763264973958333, "learning_rate": 0.0001, "loss": 6.6924, "loss/crossentropy": 2.5188519954681396, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.23844516277313232, "step": 2596 }, { "epoch": 0.0811875, "grad_norm": 4.40625, "grad_norm_var": 0.11960347493489583, "learning_rate": 0.0001, "loss": 7.2267, "loss/crossentropy": 2.77741277217865, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2597716152667999, "step": 2598 }, { "epoch": 0.08125, "grad_norm": 4.46875, "grad_norm_var": 0.11912333170572917, "learning_rate": 0.0001, "loss": 6.9949, "loss/crossentropy": 2.707468867301941, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2471056804060936, "step": 2600 }, { "epoch": 0.0813125, "grad_norm": 3.90625, "grad_norm_var": 0.11705729166666666, "learning_rate": 0.0001, "loss": 6.7894, "loss/crossentropy": 2.567119598388672, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24449165165424347, "step": 2602 }, { "epoch": 0.081375, "grad_norm": 4.25, "grad_norm_var": 0.10214436848958333, "learning_rate": 0.0001, "loss": 6.9362, "loss/crossentropy": 2.680039644241333, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2408483326435089, "step": 2604 }, { "epoch": 0.0814375, "grad_norm": 4.53125, "grad_norm_var": 0.10441080729166667, "learning_rate": 0.0001, "loss": 6.6155, "loss/crossentropy": 2.513710618019104, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23205246031284332, "step": 2606 }, { "epoch": 0.0815, "grad_norm": 4.46875, "grad_norm_var": 0.056473795572916666, "learning_rate": 0.0001, "loss": 6.9472, "loss/crossentropy": 2.6492717266082764, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24736837297677994, "step": 2608 }, { "epoch": 0.0815625, "grad_norm": 3.828125, "grad_norm_var": 0.05537821451822917, "learning_rate": 0.0001, "loss": 6.8619, "loss/crossentropy": 2.6352410316467285, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.24454277008771896, "step": 2610 }, { "epoch": 0.081625, "grad_norm": 4.3125, "grad_norm_var": 0.048460896809895834, "learning_rate": 0.0001, "loss": 7.0136, "loss/crossentropy": 2.6773797273635864, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.2512050122022629, "step": 2612 }, { "epoch": 0.0816875, "grad_norm": 3.9375, "grad_norm_var": 0.06180013020833333, "learning_rate": 0.0001, "loss": 6.8441, "loss/crossentropy": 2.6951568126678467, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2355944886803627, "step": 2614 }, { "epoch": 0.08175, "grad_norm": 3.703125, "grad_norm_var": 0.08315327962239584, "learning_rate": 0.0001, "loss": 7.0625, "loss/crossentropy": 2.8284648656845093, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2441062480211258, "step": 2616 }, { "epoch": 0.0818125, "grad_norm": 3.90625, "grad_norm_var": 0.08290608723958333, "learning_rate": 0.0001, "loss": 7.0338, "loss/crossentropy": 2.8697853088378906, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.23789025843143463, "step": 2618 }, { "epoch": 0.081875, "grad_norm": 4.03125, "grad_norm_var": 0.0837890625, "learning_rate": 0.0001, "loss": 6.5213, "loss/crossentropy": 2.4423439502716064, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23367558419704437, "step": 2620 }, { "epoch": 0.0819375, "grad_norm": 4.25, "grad_norm_var": 0.085400390625, "learning_rate": 0.0001, "loss": 6.9671, "loss/crossentropy": 2.7237391471862793, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24503568559885025, "step": 2622 }, { "epoch": 0.082, "grad_norm": 4.40625, "grad_norm_var": 0.07935791015625, "learning_rate": 0.0001, "loss": 6.7533, "loss/crossentropy": 2.7278497219085693, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.22793515026569366, "step": 2624 }, { "epoch": 0.0820625, "grad_norm": 4.46875, "grad_norm_var": 0.0682769775390625, "learning_rate": 0.0001, "loss": 6.983, "loss/crossentropy": 2.63441264629364, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2528279647231102, "step": 2626 }, { "epoch": 0.082125, "grad_norm": 4.28125, "grad_norm_var": 0.07888895670572917, "learning_rate": 0.0001, "loss": 7.0744, "loss/crossentropy": 2.7248504161834717, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.25292399525642395, "step": 2628 }, { "epoch": 0.0821875, "grad_norm": 4.28125, "grad_norm_var": 0.06802978515625, "learning_rate": 0.0001, "loss": 6.8822, "loss/crossentropy": 2.598918318748474, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.24317240715026855, "step": 2630 }, { "epoch": 0.08225, "grad_norm": 3.953125, "grad_norm_var": 0.06279296875, "learning_rate": 0.0001, "loss": 7.1108, "loss/crossentropy": 2.6071430444717407, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.2648216784000397, "step": 2632 }, { "epoch": 0.0823125, "grad_norm": 4.0625, "grad_norm_var": 0.05176493326822917, "learning_rate": 0.0001, "loss": 7.0101, "loss/crossentropy": 2.7162694931030273, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2465699315071106, "step": 2634 }, { "epoch": 0.082375, "grad_norm": 4.0625, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 7.1623, "loss/crossentropy": 2.8163429498672485, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2545183300971985, "step": 2636 }, { "epoch": 0.0824375, "grad_norm": 4.34375, "grad_norm_var": 0.06080322265625, "learning_rate": 0.0001, "loss": 7.2852, "loss/crossentropy": 2.8388519287109375, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2606465071439743, "step": 2638 }, { "epoch": 0.0825, "grad_norm": 4.5, "grad_norm_var": 0.06549072265625, "learning_rate": 0.0001, "loss": 6.9992, "loss/crossentropy": 2.734708309173584, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24676088988780975, "step": 2640 }, { "epoch": 0.0825625, "grad_norm": 4.5, "grad_norm_var": 0.06012369791666667, "learning_rate": 0.0001, "loss": 6.7951, "loss/crossentropy": 2.6085119247436523, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.24052976816892624, "step": 2642 }, { "epoch": 0.082625, "grad_norm": 4.40625, "grad_norm_var": 0.055615234375, "learning_rate": 0.0001, "loss": 6.8036, "loss/crossentropy": 2.632908344268799, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24050304293632507, "step": 2644 }, { "epoch": 0.0826875, "grad_norm": 4.1875, "grad_norm_var": 0.05563151041666667, "learning_rate": 0.0001, "loss": 7.0268, "loss/crossentropy": 2.7476726770401, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.24588297307491302, "step": 2646 }, { "epoch": 0.08275, "grad_norm": 4.5625, "grad_norm_var": 0.04209696451822917, "learning_rate": 0.0001, "loss": 6.8453, "loss/crossentropy": 2.6198912858963013, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.23934194445610046, "step": 2648 }, { "epoch": 0.0828125, "grad_norm": 4.09375, "grad_norm_var": 0.042455037434895836, "learning_rate": 0.0001, "loss": 7.0919, "loss/crossentropy": 2.7107146978378296, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.25687095522880554, "step": 2650 }, { "epoch": 0.082875, "grad_norm": 3.921875, "grad_norm_var": 0.04362691243489583, "learning_rate": 0.0001, "loss": 6.6233, "loss/crossentropy": 2.5405800342559814, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2360065057873726, "step": 2652 }, { "epoch": 0.0829375, "grad_norm": 4.0, "grad_norm_var": 0.0378326416015625, "learning_rate": 0.0001, "loss": 6.5801, "loss/crossentropy": 2.442927122116089, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.23364173620939255, "step": 2654 }, { "epoch": 0.083, "grad_norm": 4.46875, "grad_norm_var": 0.0396392822265625, "learning_rate": 0.0001, "loss": 6.8022, "loss/crossentropy": 2.5995049476623535, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.24175354093313217, "step": 2656 }, { "epoch": 0.0830625, "grad_norm": 4.59375, "grad_norm_var": 0.0486724853515625, "learning_rate": 0.0001, "loss": 7.1417, "loss/crossentropy": 2.7718758583068848, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25222010165452957, "step": 2658 }, { "epoch": 0.083125, "grad_norm": 4.71875, "grad_norm_var": 0.0679107666015625, "learning_rate": 0.0001, "loss": 6.9471, "loss/crossentropy": 2.580165386199951, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.25309784710407257, "step": 2660 }, { "epoch": 0.0831875, "grad_norm": 4.34375, "grad_norm_var": 0.0666412353515625, "learning_rate": 0.0001, "loss": 7.1886, "loss/crossentropy": 2.7995067834854126, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.25140853226184845, "step": 2662 }, { "epoch": 0.08325, "grad_norm": 4.28125, "grad_norm_var": 0.0606109619140625, "learning_rate": 0.0001, "loss": 6.9854, "loss/crossentropy": 2.60084867477417, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.255643293261528, "step": 2664 }, { "epoch": 0.0833125, "grad_norm": 4.40625, "grad_norm_var": 0.2248199462890625, "learning_rate": 0.0001, "loss": 7.6103, "loss/crossentropy": 2.9394861459732056, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2858267277479172, "step": 2666 }, { "epoch": 0.083375, "grad_norm": 5.53125, "grad_norm_var": 0.3035552978515625, "learning_rate": 0.0001, "loss": 6.9792, "loss/crossentropy": 2.661886215209961, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2497013434767723, "step": 2668 }, { "epoch": 0.0834375, "grad_norm": 4.09375, "grad_norm_var": 0.3045857747395833, "learning_rate": 0.0001, "loss": 6.5979, "loss/crossentropy": 2.470514416694641, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2314888834953308, "step": 2670 }, { "epoch": 0.0835, "grad_norm": 4.40625, "grad_norm_var": 0.28738606770833336, "learning_rate": 0.0001, "loss": 7.3821, "loss/crossentropy": 2.8701629638671875, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.2648680955171585, "step": 2672 }, { "epoch": 0.0835625, "grad_norm": 4.21875, "grad_norm_var": 0.29339192708333334, "learning_rate": 0.0001, "loss": 6.7866, "loss/crossentropy": 2.513500213623047, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24605898559093475, "step": 2674 }, { "epoch": 0.083625, "grad_norm": 5.75, "grad_norm_var": 0.43191731770833336, "learning_rate": 0.0001, "loss": 6.4642, "loss/crossentropy": 2.3846570253372192, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.22982840240001678, "step": 2676 }, { "epoch": 0.0836875, "grad_norm": 4.34375, "grad_norm_var": 0.43017171223958334, "learning_rate": 0.0001, "loss": 7.1463, "loss/crossentropy": 2.7192554473876953, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.2598957419395447, "step": 2678 }, { "epoch": 0.08375, "grad_norm": 4.375, "grad_norm_var": 0.4313435872395833, "learning_rate": 0.0001, "loss": 7.0543, "loss/crossentropy": 2.72619366645813, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.25624796748161316, "step": 2680 }, { "epoch": 0.0838125, "grad_norm": 4.28125, "grad_norm_var": 0.30953776041666664, "learning_rate": 0.0001, "loss": 6.8765, "loss/crossentropy": 2.620025634765625, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2451772838830948, "step": 2682 }, { "epoch": 0.083875, "grad_norm": 4.71875, "grad_norm_var": 0.20747782389322916, "learning_rate": 0.0001, "loss": 6.9777, "loss/crossentropy": 2.6200352907180786, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.2533438503742218, "step": 2684 }, { "epoch": 0.0839375, "grad_norm": 4.46875, "grad_norm_var": 0.19401041666666666, "learning_rate": 0.0001, "loss": 6.9249, "loss/crossentropy": 2.5884883403778076, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2531713396310806, "step": 2686 }, { "epoch": 0.084, "grad_norm": 4.46875, "grad_norm_var": 0.20788472493489582, "learning_rate": 0.0001, "loss": 6.4523, "loss/crossentropy": 2.364005446434021, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.22406137734651566, "step": 2688 }, { "epoch": 0.0840625, "grad_norm": 4.53125, "grad_norm_var": 0.20716044108072917, "learning_rate": 0.0001, "loss": 7.1637, "loss/crossentropy": 2.6903235912323, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.26413694024086, "step": 2690 }, { "epoch": 0.084125, "grad_norm": 4.15625, "grad_norm_var": 0.05725809733072917, "learning_rate": 0.0001, "loss": 7.0104, "loss/crossentropy": 2.6367440223693848, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2561148405075073, "step": 2692 }, { "epoch": 0.0841875, "grad_norm": 4.09375, "grad_norm_var": 0.0597320556640625, "learning_rate": 0.0001, "loss": 7.0027, "loss/crossentropy": 2.7296712398529053, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2472282499074936, "step": 2694 }, { "epoch": 0.08425, "grad_norm": 5.28125, "grad_norm_var": 0.11960347493489583, "learning_rate": 0.0001, "loss": 6.5956, "loss/crossentropy": 2.212746024131775, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2562513202428818, "step": 2696 }, { "epoch": 0.0843125, "grad_norm": 4.125, "grad_norm_var": 0.12383524576822917, "learning_rate": 0.0001, "loss": 6.456, "loss/crossentropy": 2.245554804801941, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.23628269135951996, "step": 2698 }, { "epoch": 0.084375, "grad_norm": 4.375, "grad_norm_var": 0.11800028483072916, "learning_rate": 0.0001, "loss": 7.0587, "loss/crossentropy": 2.7779324054718018, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24682226032018661, "step": 2700 }, { "epoch": 0.0844375, "grad_norm": 6.0625, "grad_norm_var": 0.2856597900390625, "learning_rate": 0.0001, "loss": 6.8239, "loss/crossentropy": 2.4644941091537476, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2566477209329605, "step": 2702 }, { "epoch": 0.0845, "grad_norm": 4.75, "grad_norm_var": 0.267822265625, "learning_rate": 0.0001, "loss": 7.1954, "loss/crossentropy": 2.770855665206909, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.2545660585165024, "step": 2704 }, { "epoch": 0.0845625, "grad_norm": 3.859375, "grad_norm_var": 0.3005116780598958, "learning_rate": 0.0001, "loss": 7.1604, "loss/crossentropy": 2.833829402923584, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.24945129454135895, "step": 2706 }, { "epoch": 0.084625, "grad_norm": 4.53125, "grad_norm_var": 0.2871734619140625, "learning_rate": 0.0001, "loss": 6.8071, "loss/crossentropy": 2.4861559867858887, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2508440986275673, "step": 2708 }, { "epoch": 0.0846875, "grad_norm": 4.21875, "grad_norm_var": 0.28145243326822916, "learning_rate": 0.0001, "loss": 7.2675, "loss/crossentropy": 2.794210195541382, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.2625606060028076, "step": 2710 }, { "epoch": 0.08475, "grad_norm": 3.875, "grad_norm_var": 0.2695465087890625, "learning_rate": 0.0001, "loss": 6.6533, "loss/crossentropy": 2.558227300643921, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23138165473937988, "step": 2712 }, { "epoch": 0.0848125, "grad_norm": 4.03125, "grad_norm_var": 0.27464090983072914, "learning_rate": 0.0001, "loss": 6.8092, "loss/crossentropy": 2.6484687328338623, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.24107811599969864, "step": 2714 }, { "epoch": 0.084875, "grad_norm": 4.53125, "grad_norm_var": 0.29113667805989585, "learning_rate": 0.0001, "loss": 6.639, "loss/crossentropy": 2.3600529432296753, "loss/hidden": 1.8984375, "loss/jsd": 0.0, "loss/logits": 0.23805497586727142, "step": 2716 }, { "epoch": 0.0849375, "grad_norm": 4.15625, "grad_norm_var": 0.13931884765625, "learning_rate": 0.0001, "loss": 6.7615, "loss/crossentropy": 2.654744267463684, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23645318299531937, "step": 2718 }, { "epoch": 0.085, "grad_norm": 4.65625, "grad_norm_var": 0.13863932291666667, "learning_rate": 0.0001, "loss": 6.7733, "loss/crossentropy": 2.5434645414352417, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2413441687822342, "step": 2720 }, { "epoch": 0.0850625, "grad_norm": 4.09375, "grad_norm_var": 0.12547098795572917, "learning_rate": 0.0001, "loss": 6.9629, "loss/crossentropy": 2.571885824203491, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2570737153291702, "step": 2722 }, { "epoch": 0.085125, "grad_norm": 4.46875, "grad_norm_var": 0.1234039306640625, "learning_rate": 0.0001, "loss": 7.1714, "loss/crossentropy": 2.752210855484009, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.2563716471195221, "step": 2724 }, { "epoch": 0.0851875, "grad_norm": 4.15625, "grad_norm_var": 0.12360738118489584, "learning_rate": 0.0001, "loss": 7.13, "loss/crossentropy": 2.7513015270233154, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2570132464170456, "step": 2726 }, { "epoch": 0.08525, "grad_norm": 5.125, "grad_norm_var": 0.15066731770833333, "learning_rate": 0.0001, "loss": 6.8562, "loss/crossentropy": 2.527231216430664, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.24617478996515274, "step": 2728 }, { "epoch": 0.0853125, "grad_norm": 7.53125, "grad_norm_var": 0.7095987955729167, "learning_rate": 0.0001, "loss": 7.6808, "loss/crossentropy": 2.98296856880188, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.2807210758328438, "step": 2730 }, { "epoch": 0.085375, "grad_norm": 4.09375, "grad_norm_var": 0.73336181640625, "learning_rate": 0.0001, "loss": 6.8353, "loss/crossentropy": 2.6061861515045166, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24556761980056763, "step": 2732 }, { "epoch": 0.0854375, "grad_norm": 4.15625, "grad_norm_var": 0.74990234375, "learning_rate": 0.0001, "loss": 6.569, "loss/crossentropy": 2.427661895751953, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.23678777366876602, "step": 2734 }, { "epoch": 0.0855, "grad_norm": 3.90625, "grad_norm_var": 0.8070220947265625, "learning_rate": 0.0001, "loss": 6.4683, "loss/crossentropy": 2.5308948755264282, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21951919049024582, "step": 2736 }, { "epoch": 0.0855625, "grad_norm": 4.59375, "grad_norm_var": 0.8096832275390625, "learning_rate": 0.0001, "loss": 7.1404, "loss/crossentropy": 2.737537384033203, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2563035860657692, "step": 2738 }, { "epoch": 0.085625, "grad_norm": 3.875, "grad_norm_var": 0.8395416259765625, "learning_rate": 0.0001, "loss": 6.6333, "loss/crossentropy": 2.508602023124695, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.23629501461982727, "step": 2740 }, { "epoch": 0.0856875, "grad_norm": 4.40625, "grad_norm_var": 0.8300201416015625, "learning_rate": 0.0001, "loss": 7.1187, "loss/crossentropy": 2.715458035469055, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.2579001486301422, "step": 2742 }, { "epoch": 0.08575, "grad_norm": 4.03125, "grad_norm_var": 0.802294921875, "learning_rate": 0.0001, "loss": 6.6738, "loss/crossentropy": 2.5113805532455444, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2396751567721367, "step": 2744 }, { "epoch": 0.0858125, "grad_norm": 4.03125, "grad_norm_var": 0.04358723958333333, "learning_rate": 0.0001, "loss": 6.8626, "loss/crossentropy": 2.678552985191345, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2434050738811493, "step": 2746 }, { "epoch": 0.085875, "grad_norm": 4.40625, "grad_norm_var": 0.048014322916666664, "learning_rate": 0.0001, "loss": 7.0644, "loss/crossentropy": 2.6373531818389893, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.25911282002925873, "step": 2748 }, { "epoch": 0.0859375, "grad_norm": 4.25, "grad_norm_var": 0.046808878580729164, "learning_rate": 0.0001, "loss": 6.9114, "loss/crossentropy": 2.703710198402405, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.24264460057020187, "step": 2750 }, { "epoch": 0.086, "grad_norm": 7.09375, "grad_norm_var": 0.60865478515625, "learning_rate": 0.0001, "loss": 7.2531, "loss/crossentropy": 2.7426042556762695, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2635515555739403, "step": 2752 }, { "epoch": 0.0860625, "grad_norm": 4.15625, "grad_norm_var": 0.6038899739583333, "learning_rate": 0.0001, "loss": 7.0144, "loss/crossentropy": 2.686529755592346, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.25348879396915436, "step": 2754 }, { "epoch": 0.086125, "grad_norm": 4.125, "grad_norm_var": 0.5939737955729166, "learning_rate": 0.0001, "loss": 6.8584, "loss/crossentropy": 2.621758818626404, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24397746473550797, "step": 2756 }, { "epoch": 0.0861875, "grad_norm": 3.84375, "grad_norm_var": 0.6249257405598958, "learning_rate": 0.0001, "loss": 6.4882, "loss/crossentropy": 2.4892083406448364, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.22528532147407532, "step": 2758 }, { "epoch": 0.08625, "grad_norm": 4.09375, "grad_norm_var": 0.6170155843098958, "learning_rate": 0.0001, "loss": 6.6869, "loss/crossentropy": 2.5306947231292725, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2367103397846222, "step": 2760 }, { "epoch": 0.0863125, "grad_norm": 4.125, "grad_norm_var": 0.6169260660807292, "learning_rate": 0.0001, "loss": 7.078, "loss/crossentropy": 2.7038986682891846, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25459785759449005, "step": 2762 }, { "epoch": 0.086375, "grad_norm": 4.03125, "grad_norm_var": 0.6237375895182292, "learning_rate": 0.0001, "loss": 6.8762, "loss/crossentropy": 2.646838665008545, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2440301477909088, "step": 2764 }, { "epoch": 0.0864375, "grad_norm": 4.03125, "grad_norm_var": 0.6159332275390625, "learning_rate": 0.0001, "loss": 7.0619, "loss/crossentropy": 2.8066179752349854, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.2477927803993225, "step": 2766 }, { "epoch": 0.0865, "grad_norm": 4.15625, "grad_norm_var": 0.0525787353515625, "learning_rate": 0.0001, "loss": 7.0585, "loss/crossentropy": 2.8025286197662354, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24552123248577118, "step": 2768 }, { "epoch": 0.0865625, "grad_norm": 3.90625, "grad_norm_var": 0.06516520182291667, "learning_rate": 0.0001, "loss": 6.6905, "loss/crossentropy": 2.5861109495162964, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2338722199201584, "step": 2770 }, { "epoch": 0.086625, "grad_norm": 4.09375, "grad_norm_var": 0.06992085774739583, "learning_rate": 0.0001, "loss": 6.5937, "loss/crossentropy": 2.4812984466552734, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23702051490545273, "step": 2772 }, { "epoch": 0.0866875, "grad_norm": 4.09375, "grad_norm_var": 0.08655598958333334, "learning_rate": 0.0001, "loss": 6.9514, "loss/crossentropy": 2.6603236198425293, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.25136835873126984, "step": 2774 }, { "epoch": 0.08675, "grad_norm": 3.828125, "grad_norm_var": 0.10271809895833334, "learning_rate": 0.0001, "loss": 6.6624, "loss/crossentropy": 2.646838068962097, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22812309861183167, "step": 2776 }, { "epoch": 0.0868125, "grad_norm": 4.0, "grad_norm_var": 0.08000895182291666, "learning_rate": 0.0001, "loss": 6.6274, "loss/crossentropy": 2.505067229270935, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.2345016449689865, "step": 2778 }, { "epoch": 0.086875, "grad_norm": 4.8125, "grad_norm_var": 0.10718485514322916, "learning_rate": 0.0001, "loss": 6.8672, "loss/crossentropy": 2.6379599571228027, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24636013805866241, "step": 2780 }, { "epoch": 0.0869375, "grad_norm": 3.859375, "grad_norm_var": 0.11272786458333334, "learning_rate": 0.0001, "loss": 6.9603, "loss/crossentropy": 2.7323312759399414, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24545498192310333, "step": 2782 }, { "epoch": 0.087, "grad_norm": 4.03125, "grad_norm_var": 0.09674072265625, "learning_rate": 0.0001, "loss": 6.5846, "loss/crossentropy": 2.439736485481262, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.23127836734056473, "step": 2784 }, { "epoch": 0.0870625, "grad_norm": 4.5625, "grad_norm_var": 0.1028472900390625, "learning_rate": 0.0001, "loss": 7.1149, "loss/crossentropy": 2.7336888313293457, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.25842948257923126, "step": 2786 }, { "epoch": 0.087125, "grad_norm": 3.890625, "grad_norm_var": 0.10173238118489583, "learning_rate": 0.0001, "loss": 6.6214, "loss/crossentropy": 2.5512136220932007, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23397225886583328, "step": 2788 }, { "epoch": 0.0871875, "grad_norm": 4.34375, "grad_norm_var": 0.07649637858072916, "learning_rate": 0.0001, "loss": 6.8521, "loss/crossentropy": 2.6787188053131104, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.23803870379924774, "step": 2790 }, { "epoch": 0.08725, "grad_norm": 5.09375, "grad_norm_var": 0.12491861979166667, "learning_rate": 0.0001, "loss": 6.6633, "loss/crossentropy": 2.4818060398101807, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.23885706812143326, "step": 2792 }, { "epoch": 0.0873125, "grad_norm": 5.21875, "grad_norm_var": 0.197265625, "learning_rate": 0.0001, "loss": 7.4022, "loss/crossentropy": 2.7372331619262695, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.2754848003387451, "step": 2794 }, { "epoch": 0.087375, "grad_norm": 4.125, "grad_norm_var": 0.17433980305989583, "learning_rate": 0.0001, "loss": 6.979, "loss/crossentropy": 2.7950183153152466, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.24418172985315323, "step": 2796 }, { "epoch": 0.0874375, "grad_norm": 4.0, "grad_norm_var": 0.17021077473958332, "learning_rate": 0.0001, "loss": 6.909, "loss/crossentropy": 2.6368402242660522, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24987629055976868, "step": 2798 }, { "epoch": 0.0875, "grad_norm": 4.15625, "grad_norm_var": 0.16259358723958334, "learning_rate": 0.0001, "loss": 6.9035, "loss/crossentropy": 2.65077543258667, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24793104082345963, "step": 2800 }, { "epoch": 0.0875625, "grad_norm": 4.25, "grad_norm_var": 0.15861002604166666, "learning_rate": 0.0001, "loss": 6.7637, "loss/crossentropy": 2.6732107400894165, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.23014727979898453, "step": 2802 }, { "epoch": 0.087625, "grad_norm": 4.1875, "grad_norm_var": 0.16157124837239584, "learning_rate": 0.0001, "loss": 7.1957, "loss/crossentropy": 2.7851150035858154, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.2586326599121094, "step": 2804 }, { "epoch": 0.0876875, "grad_norm": 4.25, "grad_norm_var": 0.1755279541015625, "learning_rate": 0.0001, "loss": 6.8912, "loss/crossentropy": 2.5576705932617188, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.25679074227809906, "step": 2806 }, { "epoch": 0.08775, "grad_norm": 4.03125, "grad_norm_var": 0.13225504557291667, "learning_rate": 0.0001, "loss": 6.9721, "loss/crossentropy": 2.7346194982528687, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.24757720530033112, "step": 2808 }, { "epoch": 0.0878125, "grad_norm": 3.796875, "grad_norm_var": 0.08034566243489584, "learning_rate": 0.0001, "loss": 7.0882, "loss/crossentropy": 2.774062395095825, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.25055310130119324, "step": 2810 }, { "epoch": 0.087875, "grad_norm": 4.15625, "grad_norm_var": 0.0814849853515625, "learning_rate": 0.0001, "loss": 6.645, "loss/crossentropy": 2.5014760494232178, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.24013769626617432, "step": 2812 }, { "epoch": 0.0879375, "grad_norm": 4.34375, "grad_norm_var": 0.08295796712239584, "learning_rate": 0.0001, "loss": 6.8454, "loss/crossentropy": 2.706041693687439, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2389346957206726, "step": 2814 }, { "epoch": 0.088, "grad_norm": 4.28125, "grad_norm_var": 0.0823883056640625, "learning_rate": 0.0001, "loss": 6.6613, "loss/crossentropy": 2.494887948036194, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2349967360496521, "step": 2816 }, { "epoch": 0.0880625, "grad_norm": 4.5625, "grad_norm_var": 5.473029581705729, "learning_rate": 0.0001, "loss": 7.6397, "loss/crossentropy": 2.7328039407730103, "loss/hidden": 1.87109375, "loss/jsd": 0.0, "loss/logits": 0.30358322709798813, "step": 2818 }, { "epoch": 0.088125, "grad_norm": 4.15625, "grad_norm_var": 5.509585571289063, "learning_rate": 0.0001, "loss": 6.719, "loss/crossentropy": 2.626744270324707, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2346126213669777, "step": 2820 }, { "epoch": 0.0881875, "grad_norm": 4.03125, "grad_norm_var": 5.575602213541667, "learning_rate": 0.0001, "loss": 6.7236, "loss/crossentropy": 2.6262214183807373, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.23395337909460068, "step": 2822 }, { "epoch": 0.08825, "grad_norm": 4.4375, "grad_norm_var": 5.55611572265625, "learning_rate": 0.0001, "loss": 6.95, "loss/crossentropy": 2.6433587074279785, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2505885064601898, "step": 2824 }, { "epoch": 0.0883125, "grad_norm": 4.15625, "grad_norm_var": 5.5215810139973955, "learning_rate": 0.0001, "loss": 6.9737, "loss/crossentropy": 2.6511499881744385, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2502279579639435, "step": 2826 }, { "epoch": 0.088375, "grad_norm": 4.5, "grad_norm_var": 5.497638956705729, "learning_rate": 0.0001, "loss": 7.2043, "loss/crossentropy": 2.8375182151794434, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2565983235836029, "step": 2828 }, { "epoch": 0.0884375, "grad_norm": 4.03125, "grad_norm_var": 5.498542277018229, "learning_rate": 0.0001, "loss": 6.836, "loss/crossentropy": 2.7026480436325073, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.23598749190568924, "step": 2830 }, { "epoch": 0.0885, "grad_norm": 5.8125, "grad_norm_var": 5.547028605143229, "learning_rate": 0.0001, "loss": 6.8172, "loss/crossentropy": 2.4725399017333984, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.25009439140558243, "step": 2832 }, { "epoch": 0.0885625, "grad_norm": 4.21875, "grad_norm_var": 0.1890777587890625, "learning_rate": 0.0001, "loss": 6.9316, "loss/crossentropy": 2.7388436794281006, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24193084985017776, "step": 2834 }, { "epoch": 0.088625, "grad_norm": 4.0, "grad_norm_var": 0.1932281494140625, "learning_rate": 0.0001, "loss": 6.8544, "loss/crossentropy": 2.6529780626296997, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.2424093559384346, "step": 2836 }, { "epoch": 0.0886875, "grad_norm": 3.984375, "grad_norm_var": 0.1971832275390625, "learning_rate": 0.0001, "loss": 6.4944, "loss/crossentropy": 2.533252477645874, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22384879738092422, "step": 2838 }, { "epoch": 0.08875, "grad_norm": 4.375, "grad_norm_var": 0.19909566243489582, "learning_rate": 0.0001, "loss": 6.6716, "loss/crossentropy": 2.474052309989929, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.23772280663251877, "step": 2840 }, { "epoch": 0.0888125, "grad_norm": 4.1875, "grad_norm_var": 0.20845438639322916, "learning_rate": 0.0001, "loss": 6.8851, "loss/crossentropy": 2.7143443822860718, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.2385583370923996, "step": 2842 }, { "epoch": 0.088875, "grad_norm": 4.46875, "grad_norm_var": 0.20620829264322918, "learning_rate": 0.0001, "loss": 7.1547, "loss/crossentropy": 2.82839298248291, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.24982231855392456, "step": 2844 }, { "epoch": 0.0889375, "grad_norm": 4.375, "grad_norm_var": 0.20735575358072916, "learning_rate": 0.0001, "loss": 6.9194, "loss/crossentropy": 2.689344048500061, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2440999150276184, "step": 2846 }, { "epoch": 0.089, "grad_norm": 4.0625, "grad_norm_var": 0.03247782389322917, "learning_rate": 0.0001, "loss": 7.0278, "loss/crossentropy": 2.8314419984817505, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.2458094358444214, "step": 2848 }, { "epoch": 0.0890625, "grad_norm": 4.15625, "grad_norm_var": 0.04157613118489583, "learning_rate": 0.0001, "loss": 6.9939, "loss/crossentropy": 2.6279104948043823, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25378216803073883, "step": 2850 }, { "epoch": 0.089125, "grad_norm": 4.0625, "grad_norm_var": 0.04472554524739583, "learning_rate": 0.0001, "loss": 6.8556, "loss/crossentropy": 2.684327483177185, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.24017417430877686, "step": 2852 }, { "epoch": 0.0891875, "grad_norm": 3.953125, "grad_norm_var": 0.06446024576822916, "learning_rate": 0.0001, "loss": 6.9153, "loss/crossentropy": 2.611162781715393, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2511187493801117, "step": 2854 }, { "epoch": 0.08925, "grad_norm": 3.828125, "grad_norm_var": 0.08336588541666666, "learning_rate": 0.0001, "loss": 7.3155, "loss/crossentropy": 2.903210401535034, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.25841856747865677, "step": 2856 }, { "epoch": 0.0893125, "grad_norm": 4.1875, "grad_norm_var": 0.07395426432291667, "learning_rate": 0.0001, "loss": 6.7204, "loss/crossentropy": 2.5703028440475464, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.23415035754442215, "step": 2858 }, { "epoch": 0.089375, "grad_norm": 3.96875, "grad_norm_var": 0.07742513020833333, "learning_rate": 0.0001, "loss": 6.6673, "loss/crossentropy": 2.515069603919983, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2394431233406067, "step": 2860 }, { "epoch": 0.0894375, "grad_norm": 3.96875, "grad_norm_var": 0.07705078125, "learning_rate": 0.0001, "loss": 6.7068, "loss/crossentropy": 2.5977174043655396, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23278646916151047, "step": 2862 }, { "epoch": 0.0895, "grad_norm": 3.84375, "grad_norm_var": 0.106982421875, "learning_rate": 0.0001, "loss": 7.1021, "loss/crossentropy": 2.7704328298568726, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.24995948374271393, "step": 2864 }, { "epoch": 0.0895625, "grad_norm": 4.125, "grad_norm_var": 0.11728108723958333, "learning_rate": 0.0001, "loss": 7.1029, "loss/crossentropy": 2.7944629192352295, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.25193439424037933, "step": 2866 }, { "epoch": 0.089625, "grad_norm": 4.15625, "grad_norm_var": 0.11506754557291667, "learning_rate": 0.0001, "loss": 6.8402, "loss/crossentropy": 2.615453004837036, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.24434728920459747, "step": 2868 }, { "epoch": 0.0896875, "grad_norm": 4.6875, "grad_norm_var": 0.11043294270833333, "learning_rate": 0.0001, "loss": 7.2103, "loss/crossentropy": 2.9622833728790283, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2447189837694168, "step": 2870 }, { "epoch": 0.08975, "grad_norm": 4.78125, "grad_norm_var": 0.10915425618489584, "learning_rate": 0.0001, "loss": 7.1553, "loss/crossentropy": 2.7944257259368896, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2548411712050438, "step": 2872 }, { "epoch": 0.0898125, "grad_norm": 4.46875, "grad_norm_var": 0.11044820149739583, "learning_rate": 0.0001, "loss": 6.7337, "loss/crossentropy": 2.5883902311325073, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.23601321130990982, "step": 2874 }, { "epoch": 0.089875, "grad_norm": 4.15625, "grad_norm_var": 0.1197662353515625, "learning_rate": 0.0001, "loss": 7.1886, "loss/crossentropy": 2.669734477996826, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2667340338230133, "step": 2876 }, { "epoch": 0.0899375, "grad_norm": 4.59375, "grad_norm_var": 0.1191070556640625, "learning_rate": 0.0001, "loss": 6.9796, "loss/crossentropy": 2.673854112625122, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.24659424275159836, "step": 2878 }, { "epoch": 0.09, "grad_norm": 3.84375, "grad_norm_var": 0.1076324462890625, "learning_rate": 0.0001, "loss": 6.6908, "loss/crossentropy": 2.657265067100525, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.22796591371297836, "step": 2880 }, { "epoch": 0.0900625, "grad_norm": 3.984375, "grad_norm_var": 0.10998942057291666, "learning_rate": 0.0001, "loss": 6.9527, "loss/crossentropy": 2.5288443565368652, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.2587915509939194, "step": 2882 }, { "epoch": 0.090125, "grad_norm": 4.375, "grad_norm_var": 0.10878499348958333, "learning_rate": 0.0001, "loss": 6.9951, "loss/crossentropy": 2.815529942512512, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.23827117681503296, "step": 2884 }, { "epoch": 0.0901875, "grad_norm": 4.84375, "grad_norm_var": 0.11975504557291666, "learning_rate": 0.0001, "loss": 6.8181, "loss/crossentropy": 2.495347023010254, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24985623359680176, "step": 2886 }, { "epoch": 0.09025, "grad_norm": 6.25, "grad_norm_var": 0.34085286458333336, "learning_rate": 0.0001, "loss": 7.1716, "loss/crossentropy": 2.736644744873047, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.26185525953769684, "step": 2888 }, { "epoch": 0.0903125, "grad_norm": 4.46875, "grad_norm_var": 0.3358683268229167, "learning_rate": 0.0001, "loss": 7.1126, "loss/crossentropy": 2.7053741216659546, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2606474459171295, "step": 2890 }, { "epoch": 0.090375, "grad_norm": 4.28125, "grad_norm_var": 0.31735026041666664, "learning_rate": 0.0001, "loss": 7.067, "loss/crossentropy": 2.699031949043274, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.25515763461589813, "step": 2892 }, { "epoch": 0.0904375, "grad_norm": 5.25, "grad_norm_var": 0.38806050618489585, "learning_rate": 0.0001, "loss": 6.9426, "loss/crossentropy": 2.710414171218872, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24353402853012085, "step": 2894 }, { "epoch": 0.0905, "grad_norm": 4.84375, "grad_norm_var": 0.3643951416015625, "learning_rate": 0.0001, "loss": 6.9086, "loss/crossentropy": 2.6190038919448853, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.25004892796278, "step": 2896 }, { "epoch": 0.0905625, "grad_norm": 5.46875, "grad_norm_var": 0.4073079427083333, "learning_rate": 0.0001, "loss": 7.2013, "loss/crossentropy": 2.7578283548355103, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.2591920793056488, "step": 2898 }, { "epoch": 0.090625, "grad_norm": 4.25, "grad_norm_var": 0.4041951497395833, "learning_rate": 0.0001, "loss": 6.7156, "loss/crossentropy": 2.523174524307251, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2383802831172943, "step": 2900 }, { "epoch": 0.0906875, "grad_norm": 4.09375, "grad_norm_var": 0.3777089436848958, "learning_rate": 0.0001, "loss": 6.6932, "loss/crossentropy": 2.539914608001709, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.23134269565343857, "step": 2902 }, { "epoch": 0.09075, "grad_norm": 4.0, "grad_norm_var": 0.20273335774739584, "learning_rate": 0.0001, "loss": 6.8114, "loss/crossentropy": 2.737942695617676, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.23117107152938843, "step": 2904 }, { "epoch": 0.0908125, "grad_norm": 4.21875, "grad_norm_var": 0.3624827067057292, "learning_rate": 0.0001, "loss": 7.1634, "loss/crossentropy": 2.7246967554092407, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2598849982023239, "step": 2906 }, { "epoch": 0.090875, "grad_norm": 4.625, "grad_norm_var": 0.39011942545572914, "learning_rate": 0.0001, "loss": 7.0006, "loss/crossentropy": 2.6638818979263306, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2555497959256172, "step": 2908 }, { "epoch": 0.0909375, "grad_norm": 3.859375, "grad_norm_var": 0.3447011311848958, "learning_rate": 0.0001, "loss": 6.7047, "loss/crossentropy": 2.5874160528182983, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23985768109560013, "step": 2910 }, { "epoch": 0.091, "grad_norm": 4.28125, "grad_norm_var": 0.3924763997395833, "learning_rate": 0.0001, "loss": 6.6062, "loss/crossentropy": 2.608031749725342, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.2236482873558998, "step": 2912 }, { "epoch": 0.0910625, "grad_norm": 4.25, "grad_norm_var": 0.34807942708333334, "learning_rate": 0.0001, "loss": 6.8533, "loss/crossentropy": 2.6706267595291138, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.24327071011066437, "step": 2914 }, { "epoch": 0.091125, "grad_norm": 4.0625, "grad_norm_var": 0.3446248372395833, "learning_rate": 0.0001, "loss": 6.8203, "loss/crossentropy": 2.655190944671631, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.23877842724323273, "step": 2916 }, { "epoch": 0.0911875, "grad_norm": 4.6875, "grad_norm_var": 0.3470865885416667, "learning_rate": 0.0001, "loss": 7.2147, "loss/crossentropy": 2.751248002052307, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.2627522945404053, "step": 2918 }, { "epoch": 0.09125, "grad_norm": 3.96875, "grad_norm_var": 0.34788411458333335, "learning_rate": 0.0001, "loss": 6.9592, "loss/crossentropy": 2.6001957654953003, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.25777673721313477, "step": 2920 }, { "epoch": 0.0913125, "grad_norm": 5.28125, "grad_norm_var": 0.22195638020833333, "learning_rate": 0.0001, "loss": 6.786, "loss/crossentropy": 2.5922285318374634, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.23969252407550812, "step": 2922 }, { "epoch": 0.091375, "grad_norm": 4.3125, "grad_norm_var": 0.14903971354166667, "learning_rate": 0.0001, "loss": 7.0038, "loss/crossentropy": 2.7293628454208374, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2457985281944275, "step": 2924 }, { "epoch": 0.0914375, "grad_norm": 4.125, "grad_norm_var": 0.15593159993489583, "learning_rate": 0.0001, "loss": 6.701, "loss/crossentropy": 2.60032856464386, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2342819646000862, "step": 2926 }, { "epoch": 0.0915, "grad_norm": 3.703125, "grad_norm_var": 0.15835673014322918, "learning_rate": 0.0001, "loss": 6.4028, "loss/crossentropy": 2.512592077255249, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21519330888986588, "step": 2928 }, { "epoch": 0.0915625, "grad_norm": 4.34375, "grad_norm_var": 0.15203348795572916, "learning_rate": 0.0001, "loss": 6.8907, "loss/crossentropy": 2.677201509475708, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24010415375232697, "step": 2930 }, { "epoch": 0.091625, "grad_norm": 4.4375, "grad_norm_var": 0.15335184733072918, "learning_rate": 0.0001, "loss": 7.0395, "loss/crossentropy": 2.7401055097579956, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.2522094398736954, "step": 2932 }, { "epoch": 0.0916875, "grad_norm": 4.21875, "grad_norm_var": 0.13443094889322918, "learning_rate": 0.0001, "loss": 6.786, "loss/crossentropy": 2.6013104915618896, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.24542643129825592, "step": 2934 }, { "epoch": 0.09175, "grad_norm": 4.59375, "grad_norm_var": 2.154735310872396, "learning_rate": 0.0001, "loss": 7.3931, "loss/crossentropy": 2.691000461578369, "loss/hidden": 1.921875, "loss/jsd": 0.0, "loss/logits": 0.278026819229126, "step": 2936 }, { "epoch": 0.0918125, "grad_norm": 4.40625, "grad_norm_var": 2.124315388997396, "learning_rate": 0.0001, "loss": 6.5102, "loss/crossentropy": 2.412446618080139, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2285284548997879, "step": 2938 }, { "epoch": 0.091875, "grad_norm": 4.21875, "grad_norm_var": 2.149030558268229, "learning_rate": 0.0001, "loss": 6.8509, "loss/crossentropy": 2.6718616485595703, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.24095501005649567, "step": 2940 }, { "epoch": 0.0919375, "grad_norm": 4.4375, "grad_norm_var": 2.1091461181640625, "learning_rate": 0.0001, "loss": 7.2243, "loss/crossentropy": 2.857651472091675, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.25229182839393616, "step": 2942 }, { "epoch": 0.092, "grad_norm": 3.796875, "grad_norm_var": 2.1014556884765625, "learning_rate": 0.0001, "loss": 6.4888, "loss/crossentropy": 2.57748019695282, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21808737516403198, "step": 2944 }, { "epoch": 0.0920625, "grad_norm": 3.953125, "grad_norm_var": 2.124755859375, "learning_rate": 0.0001, "loss": 6.6413, "loss/crossentropy": 2.599161982536316, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.2202320247888565, "step": 2946 }, { "epoch": 0.092125, "grad_norm": 4.3125, "grad_norm_var": 2.14498291015625, "learning_rate": 0.0001, "loss": 6.6873, "loss/crossentropy": 2.5599820613861084, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23929814249277115, "step": 2948 }, { "epoch": 0.0921875, "grad_norm": 4.03125, "grad_norm_var": 2.14400634765625, "learning_rate": 0.0001, "loss": 6.4769, "loss/crossentropy": 2.4159616231918335, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.22445113956928253, "step": 2950 }, { "epoch": 0.09225, "grad_norm": 5.65625, "grad_norm_var": 0.32291259765625, "learning_rate": 0.0001, "loss": 7.0604, "loss/crossentropy": 2.6208486557006836, "loss/hidden": 1.85546875, "loss/jsd": 0.0, "loss/logits": 0.2584053575992584, "step": 2952 }, { "epoch": 0.0923125, "grad_norm": 3.9375, "grad_norm_var": 0.33408203125, "learning_rate": 0.0001, "loss": 6.8853, "loss/crossentropy": 2.723706603050232, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.24116095155477524, "step": 2954 }, { "epoch": 0.092375, "grad_norm": 3.875, "grad_norm_var": 0.33986002604166665, "learning_rate": 0.0001, "loss": 6.808, "loss/crossentropy": 2.670522689819336, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23874707520008087, "step": 2956 }, { "epoch": 0.0924375, "grad_norm": 4.34375, "grad_norm_var": 0.62880859375, "learning_rate": 0.0001, "loss": 6.943, "loss/crossentropy": 2.642168879508972, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.24258745461702347, "step": 2958 }, { "epoch": 0.0925, "grad_norm": 4.28125, "grad_norm_var": 1.5287913004557292, "learning_rate": 0.0001, "loss": 6.9016, "loss/crossentropy": 2.5931923389434814, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2499857395887375, "step": 2960 }, { "epoch": 0.0925625, "grad_norm": 4.46875, "grad_norm_var": 1.5153483072916667, "learning_rate": 0.0001, "loss": 7.1575, "loss/crossentropy": 2.789669632911682, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2563166320323944, "step": 2962 }, { "epoch": 0.092625, "grad_norm": 4.4375, "grad_norm_var": 1.4841105143229167, "learning_rate": 0.0001, "loss": 6.93, "loss/crossentropy": 2.761322498321533, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.24265322089195251, "step": 2964 }, { "epoch": 0.0926875, "grad_norm": 4.375, "grad_norm_var": 1.4745076497395833, "learning_rate": 0.0001, "loss": 6.6479, "loss/crossentropy": 2.478193521499634, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2404085099697113, "step": 2966 }, { "epoch": 0.09275, "grad_norm": 4.53125, "grad_norm_var": 1.37994384765625, "learning_rate": 0.0001, "loss": 7.2748, "loss/crossentropy": 2.903393268585205, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2597960978746414, "step": 2968 }, { "epoch": 0.0928125, "grad_norm": 4.75, "grad_norm_var": 1.3373331705729166, "learning_rate": 0.0001, "loss": 7.0387, "loss/crossentropy": 2.710214138031006, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.24768862128257751, "step": 2970 }, { "epoch": 0.092875, "grad_norm": 4.21875, "grad_norm_var": 1.2830078125, "learning_rate": 0.0001, "loss": 6.5448, "loss/crossentropy": 2.471164584159851, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23392198234796524, "step": 2972 }, { "epoch": 0.0929375, "grad_norm": 4.46875, "grad_norm_var": 1.1928670247395834, "learning_rate": 0.0001, "loss": 7.2907, "loss/crossentropy": 2.745394468307495, "loss/hidden": 1.8515625, "loss/jsd": 0.0, "loss/logits": 0.26937687397003174, "step": 2974 }, { "epoch": 0.093, "grad_norm": 4.3125, "grad_norm_var": 0.21354166666666666, "learning_rate": 0.0001, "loss": 6.8719, "loss/crossentropy": 2.703699469566345, "loss/hidden": 1.828125, "loss/jsd": 0.0, "loss/logits": 0.23400261253118515, "step": 2976 }, { "epoch": 0.0930625, "grad_norm": 4.0, "grad_norm_var": 0.22515869140625, "learning_rate": 0.0001, "loss": 6.9013, "loss/crossentropy": 2.700608015060425, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2427232563495636, "step": 2978 }, { "epoch": 0.093125, "grad_norm": 3.859375, "grad_norm_var": 0.24124247233072918, "learning_rate": 0.0001, "loss": 6.305, "loss/crossentropy": 2.276304244995117, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.2192765772342682, "step": 2980 }, { "epoch": 0.0931875, "grad_norm": 4.09375, "grad_norm_var": 0.24733784993489583, "learning_rate": 0.0001, "loss": 6.8706, "loss/crossentropy": 2.6530104875564575, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2436370849609375, "step": 2982 }, { "epoch": 0.09325, "grad_norm": 4.40625, "grad_norm_var": 0.2544179280598958, "learning_rate": 0.0001, "loss": 6.9514, "loss/crossentropy": 2.6985493898391724, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.24325336515903473, "step": 2984 }, { "epoch": 0.0933125, "grad_norm": 3.90625, "grad_norm_var": 0.26167704264322916, "learning_rate": 0.0001, "loss": 6.5768, "loss/crossentropy": 2.4524112939834595, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2331458330154419, "step": 2986 }, { "epoch": 0.093375, "grad_norm": 4.4375, "grad_norm_var": 0.2630523681640625, "learning_rate": 0.0001, "loss": 6.5181, "loss/crossentropy": 2.4587045907974243, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.22274015843868256, "step": 2988 }, { "epoch": 0.0934375, "grad_norm": 4.5625, "grad_norm_var": 0.0508941650390625, "learning_rate": 0.0001, "loss": 6.4799, "loss/crossentropy": 2.4741140604019165, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2240130975842476, "step": 2990 }, { "epoch": 0.0935, "grad_norm": 3.734375, "grad_norm_var": 0.06552327473958333, "learning_rate": 0.0001, "loss": 6.5749, "loss/crossentropy": 2.5307430028915405, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2309805527329445, "step": 2992 }, { "epoch": 0.0935625, "grad_norm": 4.0, "grad_norm_var": 0.06594645182291667, "learning_rate": 0.0001, "loss": 6.7368, "loss/crossentropy": 2.6206862926483154, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23621946573257446, "step": 2994 }, { "epoch": 0.093625, "grad_norm": 3.84375, "grad_norm_var": 0.05030008951822917, "learning_rate": 0.0001, "loss": 6.4821, "loss/crossentropy": 2.4800742864608765, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2244247943162918, "step": 2996 }, { "epoch": 0.0936875, "grad_norm": 4.0625, "grad_norm_var": 0.05836181640625, "learning_rate": 0.0001, "loss": 6.9033, "loss/crossentropy": 2.7235101461410522, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.24532272666692734, "step": 2998 }, { "epoch": 0.09375, "grad_norm": 4.21875, "grad_norm_var": 0.289404296875, "learning_rate": 0.0001, "loss": 7.3286, "loss/crossentropy": 2.658680558204651, "loss/hidden": 1.91015625, "loss/jsd": 0.0, "loss/logits": 0.27597957849502563, "step": 3000 }, { "epoch": 0.0938125, "grad_norm": 3.90625, "grad_norm_var": 0.29117431640625, "learning_rate": 0.0001, "loss": 6.7126, "loss/crossentropy": 2.571900725364685, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23906809091567993, "step": 3002 }, { "epoch": 0.093875, "grad_norm": 3.984375, "grad_norm_var": 0.2908681233723958, "learning_rate": 0.0001, "loss": 6.8183, "loss/crossentropy": 2.616904139518738, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.24162866920232773, "step": 3004 }, { "epoch": 0.0939375, "grad_norm": 4.09375, "grad_norm_var": 0.28135477701822914, "learning_rate": 0.0001, "loss": 6.5828, "loss/crossentropy": 2.5343810319900513, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2294553965330124, "step": 3006 }, { "epoch": 0.094, "grad_norm": 4.21875, "grad_norm_var": 0.27568359375, "learning_rate": 0.0001, "loss": 6.8372, "loss/crossentropy": 2.6878076791763306, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2391548901796341, "step": 3008 }, { "epoch": 0.0940625, "grad_norm": 3.890625, "grad_norm_var": 0.28186442057291666, "learning_rate": 0.0001, "loss": 6.8996, "loss/crossentropy": 2.6858986616134644, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2455882728099823, "step": 3010 }, { "epoch": 0.094125, "grad_norm": 4.375, "grad_norm_var": 0.2776763916015625, "learning_rate": 0.0001, "loss": 6.9203, "loss/crossentropy": 2.6672648191452026, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24874596297740936, "step": 3012 }, { "epoch": 0.0941875, "grad_norm": 4.15625, "grad_norm_var": 0.25683186848958334, "learning_rate": 0.0001, "loss": 6.8733, "loss/crossentropy": 2.6610859632492065, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.2427053600549698, "step": 3014 }, { "epoch": 0.09425, "grad_norm": 4.625, "grad_norm_var": 0.04869791666666667, "learning_rate": 0.0001, "loss": 6.8369, "loss/crossentropy": 2.577809453010559, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.23919511586427689, "step": 3016 }, { "epoch": 0.0943125, "grad_norm": 6.75, "grad_norm_var": 0.531884765625, "learning_rate": 0.0001, "loss": 7.1416, "loss/crossentropy": 2.660222053527832, "loss/hidden": 1.8828125, "loss/jsd": 0.0, "loss/logits": 0.2598596066236496, "step": 3018 }, { "epoch": 0.094375, "grad_norm": 5.03125, "grad_norm_var": 0.5339182535807292, "learning_rate": 0.0001, "loss": 6.722, "loss/crossentropy": 2.464984178543091, "loss/hidden": 1.8671875, "loss/jsd": 0.0, "loss/logits": 0.23898254334926605, "step": 3020 }, { "epoch": 0.0944375, "grad_norm": 4.46875, "grad_norm_var": 0.5307607014973958, "learning_rate": 0.0001, "loss": 6.9155, "loss/crossentropy": 2.7366366386413574, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.23937073349952698, "step": 3022 }, { "epoch": 0.0945, "grad_norm": 4.21875, "grad_norm_var": 0.5398590087890625, "learning_rate": 0.0001, "loss": 7.1708, "loss/crossentropy": 2.825117588043213, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.2568327337503433, "step": 3024 }, { "epoch": 0.0945625, "grad_norm": 3.9375, "grad_norm_var": 0.55103759765625, "learning_rate": 0.0001, "loss": 6.7961, "loss/crossentropy": 2.6475404500961304, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.23751483112573624, "step": 3026 }, { "epoch": 0.094625, "grad_norm": 4.1875, "grad_norm_var": 0.5415191650390625, "learning_rate": 0.0001, "loss": 6.4474, "loss/crossentropy": 2.3609050512313843, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2356044054031372, "step": 3028 }, { "epoch": 0.0946875, "grad_norm": 3.921875, "grad_norm_var": 0.5520345052083333, "learning_rate": 0.0001, "loss": 6.6852, "loss/crossentropy": 2.574931025505066, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23563718795776367, "step": 3030 }, { "epoch": 0.09475, "grad_norm": 5.5, "grad_norm_var": 0.6260050455729167, "learning_rate": 0.0001, "loss": 6.8219, "loss/crossentropy": 2.615893244743347, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24130163341760635, "step": 3032 }, { "epoch": 0.0948125, "grad_norm": 4.03125, "grad_norm_var": 0.18860270182291666, "learning_rate": 0.0001, "loss": 6.5735, "loss/crossentropy": 2.4803576469421387, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23431752622127533, "step": 3034 }, { "epoch": 0.094875, "grad_norm": 3.890625, "grad_norm_var": 0.15572001139322916, "learning_rate": 0.0001, "loss": 6.739, "loss/crossentropy": 2.682898163795471, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23217643052339554, "step": 3036 }, { "epoch": 0.0949375, "grad_norm": 4.59375, "grad_norm_var": 0.16021219889322916, "learning_rate": 0.0001, "loss": 6.7686, "loss/crossentropy": 2.6070168018341064, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2411540225148201, "step": 3038 }, { "epoch": 0.095, "grad_norm": 4.28125, "grad_norm_var": 0.1994781494140625, "learning_rate": 0.0001, "loss": 7.0366, "loss/crossentropy": 2.666748523712158, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.25495024025440216, "step": 3040 }, { "epoch": 0.0950625, "grad_norm": 4.21875, "grad_norm_var": 0.1870269775390625, "learning_rate": 0.0001, "loss": 6.8307, "loss/crossentropy": 2.6568331718444824, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24082067608833313, "step": 3042 }, { "epoch": 0.095125, "grad_norm": 3.9375, "grad_norm_var": 0.1998046875, "learning_rate": 0.0001, "loss": 6.7003, "loss/crossentropy": 2.595793128013611, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2358388602733612, "step": 3044 }, { "epoch": 0.0951875, "grad_norm": 4.6875, "grad_norm_var": 0.20746968587239584, "learning_rate": 0.0001, "loss": 6.4576, "loss/crossentropy": 2.3486874103546143, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23745301365852356, "step": 3046 }, { "epoch": 0.09525, "grad_norm": 4.0625, "grad_norm_var": 0.1053131103515625, "learning_rate": 0.0001, "loss": 6.8214, "loss/crossentropy": 2.672357678413391, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.23716489970684052, "step": 3048 }, { "epoch": 0.0953125, "grad_norm": 3.859375, "grad_norm_var": 0.115087890625, "learning_rate": 0.0001, "loss": 6.7248, "loss/crossentropy": 2.5690304040908813, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.23588933050632477, "step": 3050 }, { "epoch": 0.095375, "grad_norm": 4.46875, "grad_norm_var": 0.11972249348958333, "learning_rate": 0.0001, "loss": 6.8619, "loss/crossentropy": 2.6711432933807373, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.2390008568763733, "step": 3052 }, { "epoch": 0.0954375, "grad_norm": 4.09375, "grad_norm_var": 0.10676676432291667, "learning_rate": 0.0001, "loss": 6.5898, "loss/crossentropy": 2.5515469312667847, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2288241982460022, "step": 3054 }, { "epoch": 0.0955, "grad_norm": 4.3125, "grad_norm_var": 0.0671539306640625, "learning_rate": 0.0001, "loss": 6.6707, "loss/crossentropy": 2.6111743450164795, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2317328155040741, "step": 3056 }, { "epoch": 0.0955625, "grad_norm": 4.0625, "grad_norm_var": 0.22987874348958334, "learning_rate": 0.0001, "loss": 7.0313, "loss/crossentropy": 2.750077962875366, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24569693207740784, "step": 3058 }, { "epoch": 0.095625, "grad_norm": 4.1875, "grad_norm_var": 0.2235015869140625, "learning_rate": 0.0001, "loss": 6.6612, "loss/crossentropy": 2.4988226890563965, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.23459364473819733, "step": 3060 }, { "epoch": 0.0956875, "grad_norm": 4.875, "grad_norm_var": 0.23723042805989583, "learning_rate": 0.0001, "loss": 6.9914, "loss/crossentropy": 2.6185423135757446, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.25251929461956024, "step": 3062 }, { "epoch": 0.09575, "grad_norm": 5.65625, "grad_norm_var": 0.35871480305989584, "learning_rate": 0.0001, "loss": 6.6228, "loss/crossentropy": 2.582352042198181, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2243526726961136, "step": 3064 }, { "epoch": 0.0958125, "grad_norm": 4.03125, "grad_norm_var": 0.33693033854166665, "learning_rate": 0.0001, "loss": 6.735, "loss/crossentropy": 2.521291971206665, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24168409407138824, "step": 3066 }, { "epoch": 0.095875, "grad_norm": 4.15625, "grad_norm_var": 0.3283762613932292, "learning_rate": 0.0001, "loss": 6.9965, "loss/crossentropy": 2.7500585317611694, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24456849694252014, "step": 3068 }, { "epoch": 0.0959375, "grad_norm": 4.03125, "grad_norm_var": 0.3553538004557292, "learning_rate": 0.0001, "loss": 7.0783, "loss/crossentropy": 2.7938302755355835, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.24797877669334412, "step": 3070 }, { "epoch": 0.096, "grad_norm": 3.4375, "grad_norm_var": 0.37916666666666665, "learning_rate": 0.0001, "loss": 6.7114, "loss/crossentropy": 2.6448922157287598, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.22930297255516052, "step": 3072 }, { "epoch": 0.0960625, "grad_norm": 3.890625, "grad_norm_var": 0.2966217041015625, "learning_rate": 0.0001, "loss": 6.7515, "loss/crossentropy": 2.675472378730774, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23376993834972382, "step": 3074 }, { "epoch": 0.096125, "grad_norm": 4.125, "grad_norm_var": 0.30192769368489586, "learning_rate": 0.0001, "loss": 6.5454, "loss/crossentropy": 2.5147024393081665, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22806604951620102, "step": 3076 }, { "epoch": 0.0961875, "grad_norm": 4.0, "grad_norm_var": 0.2820709228515625, "learning_rate": 0.0001, "loss": 6.6275, "loss/crossentropy": 2.5008946657180786, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.23336394876241684, "step": 3078 }, { "epoch": 0.09625, "grad_norm": 4.15625, "grad_norm_var": 0.1670806884765625, "learning_rate": 0.0001, "loss": 6.8559, "loss/crossentropy": 2.682113766670227, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.23885856568813324, "step": 3080 }, { "epoch": 0.0963125, "grad_norm": 4.21875, "grad_norm_var": 0.1740631103515625, "learning_rate": 0.0001, "loss": 6.5302, "loss/crossentropy": 2.4620113372802734, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.22791030257940292, "step": 3082 }, { "epoch": 0.096375, "grad_norm": 4.375, "grad_norm_var": 0.1577301025390625, "learning_rate": 0.0001, "loss": 7.1082, "loss/crossentropy": 2.7661988735198975, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.2533379793167114, "step": 3084 }, { "epoch": 0.0964375, "grad_norm": 3.828125, "grad_norm_var": 0.09364827473958333, "learning_rate": 0.0001, "loss": 6.6862, "loss/crossentropy": 2.629671812057495, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23182903230190277, "step": 3086 }, { "epoch": 0.0965, "grad_norm": 4.40625, "grad_norm_var": 0.057633463541666666, "learning_rate": 0.0001, "loss": 6.8724, "loss/crossentropy": 2.6740200519561768, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.24522805958986282, "step": 3088 }, { "epoch": 0.0965625, "grad_norm": 4.0625, "grad_norm_var": 0.0516510009765625, "learning_rate": 0.0001, "loss": 6.7875, "loss/crossentropy": 2.6869399547576904, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23505789041519165, "step": 3090 }, { "epoch": 0.096625, "grad_norm": 4.25, "grad_norm_var": 0.05257059733072917, "learning_rate": 0.0001, "loss": 6.9584, "loss/crossentropy": 2.700324773788452, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24651554971933365, "step": 3092 }, { "epoch": 0.0966875, "grad_norm": 3.84375, "grad_norm_var": 0.06670633951822917, "learning_rate": 0.0001, "loss": 6.6047, "loss/crossentropy": 2.5858383178710938, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2296229526400566, "step": 3094 }, { "epoch": 0.09675, "grad_norm": 4.375, "grad_norm_var": 0.0513092041015625, "learning_rate": 0.0001, "loss": 6.897, "loss/crossentropy": 2.5975048542022705, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.25104597210884094, "step": 3096 }, { "epoch": 0.0968125, "grad_norm": 3.6875, "grad_norm_var": 0.06236063639322917, "learning_rate": 0.0001, "loss": 6.4617, "loss/crossentropy": 2.5920755863189697, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21313737332820892, "step": 3098 }, { "epoch": 0.096875, "grad_norm": 4.0, "grad_norm_var": 0.0770660400390625, "learning_rate": 0.0001, "loss": 6.7644, "loss/crossentropy": 2.6767162084579468, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23689288645982742, "step": 3100 }, { "epoch": 0.0969375, "grad_norm": 4.0625, "grad_norm_var": 0.07552083333333333, "learning_rate": 0.0001, "loss": 6.9666, "loss/crossentropy": 2.70647394657135, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24476733803749084, "step": 3102 }, { "epoch": 0.097, "grad_norm": 7.8125, "grad_norm_var": 0.961328125, "learning_rate": 0.0001, "loss": 7.8022, "loss/crossentropy": 3.112025260925293, "loss/hidden": 1.88671875, "loss/jsd": 0.0, "loss/logits": 0.28034333884716034, "step": 3104 }, { "epoch": 0.0970625, "grad_norm": 4.375, "grad_norm_var": 0.9477864583333333, "learning_rate": 0.0001, "loss": 6.7555, "loss/crossentropy": 2.5812584161758423, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24008256942033768, "step": 3106 }, { "epoch": 0.097125, "grad_norm": 5.25, "grad_norm_var": 0.9876139322916667, "learning_rate": 0.0001, "loss": 6.916, "loss/crossentropy": 2.659627318382263, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.24321314692497253, "step": 3108 }, { "epoch": 0.0971875, "grad_norm": 4.03125, "grad_norm_var": 0.9808553059895834, "learning_rate": 0.0001, "loss": 6.8265, "loss/crossentropy": 2.709407329559326, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.23201733827590942, "step": 3110 }, { "epoch": 0.09725, "grad_norm": 4.4375, "grad_norm_var": 1.1321614583333333, "learning_rate": 0.0001, "loss": 7.1443, "loss/crossentropy": 2.7275675535202026, "loss/hidden": 1.875, "loss/jsd": 0.0, "loss/logits": 0.2541683614253998, "step": 3112 }, { "epoch": 0.0973125, "grad_norm": 5.125, "grad_norm_var": 1.0579386393229167, "learning_rate": 0.0001, "loss": 7.3439, "loss/crossentropy": 2.883936047554016, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.26474176347255707, "step": 3114 }, { "epoch": 0.097375, "grad_norm": 4.09375, "grad_norm_var": 0.9590494791666667, "learning_rate": 0.0001, "loss": 6.8974, "loss/crossentropy": 2.684443473815918, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24473382532596588, "step": 3116 }, { "epoch": 0.0974375, "grad_norm": 6.53125, "grad_norm_var": 1.1113240559895834, "learning_rate": 0.0001, "loss": 6.6607, "loss/crossentropy": 2.57452130317688, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.22854435443878174, "step": 3118 }, { "epoch": 0.0975, "grad_norm": 4.46875, "grad_norm_var": 0.5167805989583333, "learning_rate": 0.0001, "loss": 6.5166, "loss/crossentropy": 2.3872928619384766, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2332434430718422, "step": 3120 }, { "epoch": 0.0975625, "grad_norm": 4.4375, "grad_norm_var": 0.5370402018229167, "learning_rate": 0.0001, "loss": 7.0696, "loss/crossentropy": 2.7737441062927246, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.253020279109478, "step": 3122 }, { "epoch": 0.097625, "grad_norm": 3.9375, "grad_norm_var": 0.5593170166015625, "learning_rate": 0.0001, "loss": 6.8688, "loss/crossentropy": 2.7830101251602173, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.23279358446598053, "step": 3124 }, { "epoch": 0.0976875, "grad_norm": 5.34375, "grad_norm_var": 0.5707265218098958, "learning_rate": 0.0001, "loss": 7.207, "loss/crossentropy": 2.699463367462158, "loss/hidden": 1.859375, "loss/jsd": 0.0, "loss/logits": 0.26481975615024567, "step": 3126 }, { "epoch": 0.09775, "grad_norm": 4.78125, "grad_norm_var": 0.4343658447265625, "learning_rate": 0.0001, "loss": 7.1911, "loss/crossentropy": 2.7707037925720215, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.25765983760356903, "step": 3128 }, { "epoch": 0.0978125, "grad_norm": 4.28125, "grad_norm_var": 0.4162831624348958, "learning_rate": 0.0001, "loss": 7.0955, "loss/crossentropy": 2.7431172132492065, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.253596693277359, "step": 3130 }, { "epoch": 0.097875, "grad_norm": 3.96875, "grad_norm_var": 0.5329661051432292, "learning_rate": 0.0001, "loss": 6.8752, "loss/crossentropy": 2.6066911220550537, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.24325723201036453, "step": 3132 }, { "epoch": 0.0979375, "grad_norm": 4.53125, "grad_norm_var": 0.2732086181640625, "learning_rate": 0.0001, "loss": 6.5346, "loss/crossentropy": 2.51951003074646, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2269037440419197, "step": 3134 }, { "epoch": 0.098, "grad_norm": 4.25, "grad_norm_var": 0.2782379150390625, "learning_rate": 0.0001, "loss": 6.609, "loss/crossentropy": 2.4421186447143555, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.23504481464624405, "step": 3136 }, { "epoch": 0.0980625, "grad_norm": 4.3125, "grad_norm_var": 0.29846089680989585, "learning_rate": 0.0001, "loss": 6.7649, "loss/crossentropy": 2.7085916996002197, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2302408069372177, "step": 3138 }, { "epoch": 0.098125, "grad_norm": 4.78125, "grad_norm_var": 0.35513407389322915, "learning_rate": 0.0001, "loss": 6.7473, "loss/crossentropy": 2.6201740503311157, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2330276444554329, "step": 3140 }, { "epoch": 0.0981875, "grad_norm": 4.1875, "grad_norm_var": 0.2961578369140625, "learning_rate": 0.0001, "loss": 6.9348, "loss/crossentropy": 2.6920173168182373, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.24536843597888947, "step": 3142 }, { "epoch": 0.09825, "grad_norm": 3.75, "grad_norm_var": 0.3045074462890625, "learning_rate": 0.0001, "loss": 6.7441, "loss/crossentropy": 2.5798075199127197, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.24338270723819733, "step": 3144 }, { "epoch": 0.0983125, "grad_norm": 4.15625, "grad_norm_var": 0.3104329427083333, "learning_rate": 0.0001, "loss": 6.8746, "loss/crossentropy": 2.736800789833069, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.239170603454113, "step": 3146 }, { "epoch": 0.098375, "grad_norm": 4.53125, "grad_norm_var": 0.1495758056640625, "learning_rate": 0.0001, "loss": 6.8822, "loss/crossentropy": 2.7565410137176514, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23443719744682312, "step": 3148 }, { "epoch": 0.0984375, "grad_norm": 4.09375, "grad_norm_var": 0.13869527180989583, "learning_rate": 0.0001, "loss": 6.7574, "loss/crossentropy": 2.711781859397888, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23190922290086746, "step": 3150 }, { "epoch": 0.0985, "grad_norm": 4.21875, "grad_norm_var": 0.11704813639322917, "learning_rate": 0.0001, "loss": 6.5084, "loss/crossentropy": 2.5728635787963867, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22167624533176422, "step": 3152 }, { "epoch": 0.0985625, "grad_norm": 3.828125, "grad_norm_var": 0.11435139973958333, "learning_rate": 0.0001, "loss": 6.4095, "loss/crossentropy": 2.422892928123474, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.22132034599781036, "step": 3154 }, { "epoch": 0.098625, "grad_norm": 4.25, "grad_norm_var": 0.04759012858072917, "learning_rate": 0.0001, "loss": 6.8278, "loss/crossentropy": 2.6931371688842773, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23808039724826813, "step": 3156 }, { "epoch": 0.0986875, "grad_norm": 3.9375, "grad_norm_var": 0.04902242024739583, "learning_rate": 0.0001, "loss": 6.7769, "loss/crossentropy": 2.723364233970642, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2311325967311859, "step": 3158 }, { "epoch": 0.09875, "grad_norm": 3.578125, "grad_norm_var": 0.05366923014322917, "learning_rate": 0.0001, "loss": 6.0851, "loss/crossentropy": 2.2519463300704956, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21300450712442398, "step": 3160 }, { "epoch": 0.0988125, "grad_norm": 4.09375, "grad_norm_var": 0.05276285807291667, "learning_rate": 0.0001, "loss": 6.7297, "loss/crossentropy": 2.5974432229995728, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.23119933903217316, "step": 3162 }, { "epoch": 0.098875, "grad_norm": 4.21875, "grad_norm_var": 0.0400054931640625, "learning_rate": 0.0001, "loss": 7.0831, "loss/crossentropy": 2.8298484086990356, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.24407267570495605, "step": 3164 }, { "epoch": 0.0989375, "grad_norm": 4.21875, "grad_norm_var": 0.0450103759765625, "learning_rate": 0.0001, "loss": 6.7796, "loss/crossentropy": 2.7380000352859497, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22993908822536469, "step": 3166 }, { "epoch": 0.099, "grad_norm": 4.5625, "grad_norm_var": 0.0597808837890625, "learning_rate": 0.0001, "loss": 6.7739, "loss/crossentropy": 2.6205304861068726, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2395516186952591, "step": 3168 }, { "epoch": 0.0990625, "grad_norm": 3.796875, "grad_norm_var": 0.0579254150390625, "learning_rate": 0.0001, "loss": 7.0631, "loss/crossentropy": 2.8302695751190186, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24398455023765564, "step": 3170 }, { "epoch": 0.099125, "grad_norm": 5.0625, "grad_norm_var": 0.11843973795572917, "learning_rate": 0.0001, "loss": 7.3957, "loss/crossentropy": 2.925995945930481, "loss/hidden": 1.890625, "loss/jsd": 0.0, "loss/logits": 0.25790391862392426, "step": 3172 }, { "epoch": 0.0991875, "grad_norm": 4.1875, "grad_norm_var": 0.11941630045572917, "learning_rate": 0.0001, "loss": 6.684, "loss/crossentropy": 2.4603766202926636, "loss/hidden": 1.83203125, "loss/jsd": 0.0, "loss/logits": 0.23915697634220123, "step": 3174 }, { "epoch": 0.09925, "grad_norm": 3.828125, "grad_norm_var": 0.08981119791666667, "learning_rate": 0.0001, "loss": 6.8299, "loss/crossentropy": 2.674596667289734, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.24092095345258713, "step": 3176 }, { "epoch": 0.0993125, "grad_norm": 3.859375, "grad_norm_var": 0.1056549072265625, "learning_rate": 0.0001, "loss": 6.3702, "loss/crossentropy": 2.389748215675354, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22499437630176544, "step": 3178 }, { "epoch": 0.099375, "grad_norm": 4.25, "grad_norm_var": 0.3156077067057292, "learning_rate": 0.0001, "loss": 6.9551, "loss/crossentropy": 2.7173407077789307, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.24916420876979828, "step": 3180 }, { "epoch": 0.0994375, "grad_norm": 8.625, "grad_norm_var": 1.487555948893229, "learning_rate": 0.0001, "loss": 6.6407, "loss/crossentropy": 2.406462073326111, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24568749964237213, "step": 3182 }, { "epoch": 0.0995, "grad_norm": 4.46875, "grad_norm_var": 1.4655100504557292, "learning_rate": 0.0001, "loss": 6.8779, "loss/crossentropy": 2.5153772830963135, "loss/hidden": 1.86328125, "loss/jsd": 0.0, "loss/logits": 0.24992363154888153, "step": 3184 }, { "epoch": 0.0995625, "grad_norm": 4.8125, "grad_norm_var": 1.4021769205729167, "learning_rate": 0.0001, "loss": 6.5114, "loss/crossentropy": 2.387337803840637, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.23193595558404922, "step": 3186 }, { "epoch": 0.099625, "grad_norm": 4.0625, "grad_norm_var": 1.4149373372395833, "learning_rate": 0.0001, "loss": 6.844, "loss/crossentropy": 2.6651508808135986, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.23741832375526428, "step": 3188 }, { "epoch": 0.0996875, "grad_norm": 3.734375, "grad_norm_var": 1.458503214518229, "learning_rate": 0.0001, "loss": 6.4834, "loss/crossentropy": 2.3933539390563965, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23244161903858185, "step": 3190 }, { "epoch": 0.09975, "grad_norm": 4.96875, "grad_norm_var": 1.44927978515625, "learning_rate": 0.0001, "loss": 7.1836, "loss/crossentropy": 2.730854630470276, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.25581687688827515, "step": 3192 }, { "epoch": 0.0998125, "grad_norm": 4.28125, "grad_norm_var": 1.3643137613932292, "learning_rate": 0.0001, "loss": 6.8291, "loss/crossentropy": 2.6246731281280518, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24309448152780533, "step": 3194 }, { "epoch": 0.099875, "grad_norm": 4.21875, "grad_norm_var": 1.2745676676432292, "learning_rate": 0.0001, "loss": 7.1324, "loss/crossentropy": 2.785401940345764, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2550133019685745, "step": 3196 }, { "epoch": 0.0999375, "grad_norm": 4.75, "grad_norm_var": 0.16503804524739582, "learning_rate": 0.0001, "loss": 6.8579, "loss/crossentropy": 2.624993920326233, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.23969610035419464, "step": 3198 }, { "epoch": 0.1, "grad_norm": 4.4375, "grad_norm_var": 0.16879781087239584, "learning_rate": 0.0001, "loss": 6.7553, "loss/crossentropy": 2.605829358100891, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23798971623182297, "step": 3200 }, { "epoch": 0.1000625, "grad_norm": 3.703125, "grad_norm_var": 0.18420817057291666, "learning_rate": 0.0001, "loss": 6.8028, "loss/crossentropy": 2.650471568107605, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23710602521896362, "step": 3202 }, { "epoch": 0.100125, "grad_norm": 3.6875, "grad_norm_var": 0.21162109375, "learning_rate": 0.0001, "loss": 6.7042, "loss/crossentropy": 2.5433319807052612, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.23484085500240326, "step": 3204 }, { "epoch": 0.1001875, "grad_norm": 3.75, "grad_norm_var": 0.22713114420572916, "learning_rate": 0.0001, "loss": 6.457, "loss/crossentropy": 2.5292232036590576, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22207817435264587, "step": 3206 }, { "epoch": 0.10025, "grad_norm": 3.734375, "grad_norm_var": 0.09065755208333333, "learning_rate": 0.0001, "loss": 6.7258, "loss/crossentropy": 2.6866955757141113, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23047468811273575, "step": 3208 }, { "epoch": 0.1003125, "grad_norm": 4.1875, "grad_norm_var": 0.08564046223958334, "learning_rate": 0.0001, "loss": 6.8028, "loss/crossentropy": 2.612138509750366, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.24016397446393967, "step": 3210 }, { "epoch": 0.100375, "grad_norm": 3.90625, "grad_norm_var": 0.087255859375, "learning_rate": 0.0001, "loss": 6.9648, "loss/crossentropy": 2.7992119789123535, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.23726076632738113, "step": 3212 }, { "epoch": 0.1004375, "grad_norm": 4.3125, "grad_norm_var": 0.07073567708333334, "learning_rate": 0.0001, "loss": 6.7238, "loss/crossentropy": 2.5705056190490723, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2379813715815544, "step": 3214 }, { "epoch": 0.1005, "grad_norm": 4.4375, "grad_norm_var": 0.07086181640625, "learning_rate": 0.0001, "loss": 6.9679, "loss/crossentropy": 2.720636248588562, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.24972644448280334, "step": 3216 }, { "epoch": 0.1005625, "grad_norm": 4.4375, "grad_norm_var": 0.07771708170572916, "learning_rate": 0.0001, "loss": 6.9827, "loss/crossentropy": 2.7298405170440674, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24754898250102997, "step": 3218 }, { "epoch": 0.100625, "grad_norm": 4.71875, "grad_norm_var": 0.0835601806640625, "learning_rate": 0.0001, "loss": 6.7132, "loss/crossentropy": 2.5458946228027344, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2393861562013626, "step": 3220 }, { "epoch": 0.1006875, "grad_norm": 4.46875, "grad_norm_var": 0.10512593587239584, "learning_rate": 0.0001, "loss": 7.0824, "loss/crossentropy": 2.813864588737488, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2487269639968872, "step": 3222 }, { "epoch": 0.10075, "grad_norm": 4.375, "grad_norm_var": 0.08682352701822917, "learning_rate": 0.0001, "loss": 6.5424, "loss/crossentropy": 2.4539612531661987, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23228086531162262, "step": 3224 }, { "epoch": 0.1008125, "grad_norm": 4.0, "grad_norm_var": 0.10008138020833333, "learning_rate": 0.0001, "loss": 6.5922, "loss/crossentropy": 2.486897349357605, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2347516268491745, "step": 3226 }, { "epoch": 0.100875, "grad_norm": 4.375, "grad_norm_var": 0.11945699055989584, "learning_rate": 0.0001, "loss": 6.6798, "loss/crossentropy": 2.6234965324401855, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.22828619182109833, "step": 3228 }, { "epoch": 0.1009375, "grad_norm": 4.28125, "grad_norm_var": 0.1401275634765625, "learning_rate": 0.0001, "loss": 6.5299, "loss/crossentropy": 2.453442335128784, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.22756576538085938, "step": 3230 }, { "epoch": 0.101, "grad_norm": 4.03125, "grad_norm_var": 0.16022135416666666, "learning_rate": 0.0001, "loss": 6.2773, "loss/crossentropy": 2.4830336570739746, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20910926163196564, "step": 3232 }, { "epoch": 0.1010625, "grad_norm": 4.34375, "grad_norm_var": 0.15885416666666666, "learning_rate": 0.0001, "loss": 6.9401, "loss/crossentropy": 2.710699439048767, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24559374898672104, "step": 3234 }, { "epoch": 0.101125, "grad_norm": 4.0625, "grad_norm_var": 0.14816080729166667, "learning_rate": 0.0001, "loss": 6.8652, "loss/crossentropy": 2.6526342630386353, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24469351768493652, "step": 3236 }, { "epoch": 0.1011875, "grad_norm": 3.90625, "grad_norm_var": 0.10041910807291667, "learning_rate": 0.0001, "loss": 6.4651, "loss/crossentropy": 2.483692169189453, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22392398118972778, "step": 3238 }, { "epoch": 0.10125, "grad_norm": 3.78125, "grad_norm_var": 0.10042215983072916, "learning_rate": 0.0001, "loss": 6.5394, "loss/crossentropy": 2.5091440677642822, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.23388579487800598, "step": 3240 }, { "epoch": 0.1013125, "grad_norm": 4.90625, "grad_norm_var": 0.14345296223958334, "learning_rate": 0.0001, "loss": 6.6786, "loss/crossentropy": 2.4868510961532593, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24143566191196442, "step": 3242 }, { "epoch": 0.101375, "grad_norm": 3.765625, "grad_norm_var": 0.13547261555989584, "learning_rate": 0.0001, "loss": 6.4898, "loss/crossentropy": 2.5291796922683716, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22028034180402756, "step": 3244 }, { "epoch": 0.1014375, "grad_norm": 4.34375, "grad_norm_var": 0.0960601806640625, "learning_rate": 0.0001, "loss": 6.854, "loss/crossentropy": 2.7553012371063232, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.23409316688776016, "step": 3246 }, { "epoch": 0.1015, "grad_norm": 3.796875, "grad_norm_var": 0.10049540201822917, "learning_rate": 0.0001, "loss": 6.5228, "loss/crossentropy": 2.4410321712493896, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23552102595567703, "step": 3248 }, { "epoch": 0.1015625, "grad_norm": 3.84375, "grad_norm_var": 0.1018463134765625, "learning_rate": 0.0001, "loss": 6.7058, "loss/crossentropy": 2.555396318435669, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.23418189585208893, "step": 3250 }, { "epoch": 0.101625, "grad_norm": 3.828125, "grad_norm_var": 0.14159749348958334, "learning_rate": 0.0001, "loss": 6.7568, "loss/crossentropy": 2.5550049543380737, "loss/hidden": 1.83984375, "loss/jsd": 0.0, "loss/logits": 0.23619654774665833, "step": 3252 }, { "epoch": 0.1016875, "grad_norm": 4.53125, "grad_norm_var": 0.13186442057291667, "learning_rate": 0.0001, "loss": 6.6336, "loss/crossentropy": 2.5909934043884277, "loss/hidden": 1.8359375, "loss/jsd": 0.0, "loss/logits": 0.22066353261470795, "step": 3254 }, { "epoch": 0.10175, "grad_norm": 4.3125, "grad_norm_var": 0.12200113932291666, "learning_rate": 0.0001, "loss": 6.9677, "loss/crossentropy": 2.701515555381775, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.24731867015361786, "step": 3256 }, { "epoch": 0.1018125, "grad_norm": 4.34375, "grad_norm_var": 0.09191080729166666, "learning_rate": 0.0001, "loss": 6.6695, "loss/crossentropy": 2.6422771215438843, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23046043515205383, "step": 3258 }, { "epoch": 0.101875, "grad_norm": 4.09375, "grad_norm_var": 0.07873942057291666, "learning_rate": 0.0001, "loss": 6.2728, "loss/crossentropy": 2.242031455039978, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2257358878850937, "step": 3260 }, { "epoch": 0.1019375, "grad_norm": 3.953125, "grad_norm_var": 0.07946675618489583, "learning_rate": 0.0001, "loss": 6.5906, "loss/crossentropy": 2.5818673372268677, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22587288916110992, "step": 3262 }, { "epoch": 0.102, "grad_norm": 3.765625, "grad_norm_var": 0.0798736572265625, "learning_rate": 0.0001, "loss": 6.6192, "loss/crossentropy": 2.602246403694153, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.2247394099831581, "step": 3264 }, { "epoch": 0.1020625, "grad_norm": 3.796875, "grad_norm_var": 0.08364156087239584, "learning_rate": 0.0001, "loss": 6.7432, "loss/crossentropy": 2.6607872247695923, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23480576276779175, "step": 3266 }, { "epoch": 0.102125, "grad_norm": 5.03125, "grad_norm_var": 0.10357666015625, "learning_rate": 0.0001, "loss": 7.0483, "loss/crossentropy": 2.6793575286865234, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2564301863312721, "step": 3268 }, { "epoch": 0.1021875, "grad_norm": 4.03125, "grad_norm_var": 0.09527079264322917, "learning_rate": 0.0001, "loss": 6.6106, "loss/crossentropy": 2.5537089109420776, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23264338821172714, "step": 3270 }, { "epoch": 0.10225, "grad_norm": 3.984375, "grad_norm_var": 0.09326171875, "learning_rate": 0.0001, "loss": 6.6138, "loss/crossentropy": 2.5651522874832153, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23298664391040802, "step": 3272 }, { "epoch": 0.1023125, "grad_norm": 4.6875, "grad_norm_var": 0.12069905598958333, "learning_rate": 0.0001, "loss": 6.2125, "loss/crossentropy": 2.287251889705658, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.21595896035432816, "step": 3274 }, { "epoch": 0.102375, "grad_norm": 4.53125, "grad_norm_var": 0.12675374348958332, "learning_rate": 0.0001, "loss": 6.6801, "loss/crossentropy": 2.540787935256958, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.2358093112707138, "step": 3276 }, { "epoch": 0.1024375, "grad_norm": 4.0, "grad_norm_var": 0.1319732666015625, "learning_rate": 0.0001, "loss": 6.6532, "loss/crossentropy": 2.479809880256653, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.24272619932889938, "step": 3278 }, { "epoch": 0.1025, "grad_norm": 4.0625, "grad_norm_var": 0.11599934895833333, "learning_rate": 0.0001, "loss": 6.5609, "loss/crossentropy": 2.5186489820480347, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2311742752790451, "step": 3280 }, { "epoch": 0.1025625, "grad_norm": 4.125, "grad_norm_var": 0.09345703125, "learning_rate": 0.0001, "loss": 6.5359, "loss/crossentropy": 2.5733646154403687, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.2251606062054634, "step": 3282 }, { "epoch": 0.102625, "grad_norm": 4.4375, "grad_norm_var": 0.05789388020833333, "learning_rate": 0.0001, "loss": 6.8128, "loss/crossentropy": 2.6851168870925903, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23737867176532745, "step": 3284 }, { "epoch": 0.1026875, "grad_norm": 4.40625, "grad_norm_var": 0.2830149332682292, "learning_rate": 0.0001, "loss": 7.2477, "loss/crossentropy": 2.7112995386123657, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2723904848098755, "step": 3286 }, { "epoch": 0.10275, "grad_norm": 3.9375, "grad_norm_var": 0.27584635416666664, "learning_rate": 0.0001, "loss": 6.889, "loss/crossentropy": 2.689952254295349, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.24490958452224731, "step": 3288 }, { "epoch": 0.1028125, "grad_norm": 4.0, "grad_norm_var": 0.30777079264322915, "learning_rate": 0.0001, "loss": 6.6267, "loss/crossentropy": 2.6512755155563354, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22684409469366074, "step": 3290 }, { "epoch": 0.102875, "grad_norm": 3.8125, "grad_norm_var": 0.3424763997395833, "learning_rate": 0.0001, "loss": 6.3049, "loss/crossentropy": 2.470964550971985, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21112415194511414, "step": 3292 }, { "epoch": 0.1029375, "grad_norm": 4.03125, "grad_norm_var": 0.3399566650390625, "learning_rate": 0.0001, "loss": 6.601, "loss/crossentropy": 2.5517722368240356, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22992479801177979, "step": 3294 }, { "epoch": 0.103, "grad_norm": 3.953125, "grad_norm_var": 0.34455973307291665, "learning_rate": 0.0001, "loss": 6.9876, "loss/crossentropy": 2.772384524345398, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24378702044487, "step": 3296 }, { "epoch": 0.1030625, "grad_norm": 4.1875, "grad_norm_var": 0.3600870768229167, "learning_rate": 0.0001, "loss": 6.5065, "loss/crossentropy": 2.5865761041641235, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22168077528476715, "step": 3298 }, { "epoch": 0.103125, "grad_norm": 4.5, "grad_norm_var": 0.36243082682291666, "learning_rate": 0.0001, "loss": 6.7596, "loss/crossentropy": 2.6341527700424194, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23559296876192093, "step": 3300 }, { "epoch": 0.1031875, "grad_norm": 4.59375, "grad_norm_var": 0.08058268229166667, "learning_rate": 0.0001, "loss": 6.8533, "loss/crossentropy": 2.7177610397338867, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23659591376781464, "step": 3302 }, { "epoch": 0.10325, "grad_norm": 4.4375, "grad_norm_var": 0.083447265625, "learning_rate": 0.0001, "loss": 6.9423, "loss/crossentropy": 2.6986615657806396, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.24740910530090332, "step": 3304 }, { "epoch": 0.1033125, "grad_norm": 4.3125, "grad_norm_var": 0.0849761962890625, "learning_rate": 0.0001, "loss": 6.3193, "loss/crossentropy": 2.3887938261032104, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22156642377376556, "step": 3306 }, { "epoch": 0.103375, "grad_norm": 4.15625, "grad_norm_var": 0.06568603515625, "learning_rate": 0.0001, "loss": 7.2802, "loss/crossentropy": 3.0000956058502197, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.25066203624010086, "step": 3308 }, { "epoch": 0.1034375, "grad_norm": 55.25, "grad_norm_var": 163.27130432128905, "learning_rate": 0.0001, "loss": 7.7819, "loss/crossentropy": 2.6019656658172607, "loss/hidden": 1.9375, "loss/jsd": 0.0, "loss/logits": 0.3242449462413788, "step": 3310 }, { "epoch": 0.1035, "grad_norm": 4.28125, "grad_norm_var": 163.11725260416668, "learning_rate": 0.0001, "loss": 6.8478, "loss/crossentropy": 2.632767677307129, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.24064189940690994, "step": 3312 }, { "epoch": 0.1035625, "grad_norm": 6.65625, "grad_norm_var": 162.36551005045573, "learning_rate": 0.0001, "loss": 6.6017, "loss/crossentropy": 2.5260356664657593, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23256751894950867, "step": 3314 }, { "epoch": 0.103625, "grad_norm": 4.46875, "grad_norm_var": 162.21673075358072, "learning_rate": 0.0001, "loss": 6.9282, "loss/crossentropy": 2.644586205482483, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24828049540519714, "step": 3316 }, { "epoch": 0.1036875, "grad_norm": 4.21875, "grad_norm_var": 162.37351786295574, "learning_rate": 0.0001, "loss": 6.4069, "loss/crossentropy": 2.370494246482849, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.22707335650920868, "step": 3318 }, { "epoch": 0.10375, "grad_norm": 4.15625, "grad_norm_var": 162.66480204264323, "learning_rate": 0.0001, "loss": 6.7115, "loss/crossentropy": 2.620364189147949, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23216355592012405, "step": 3320 }, { "epoch": 0.1038125, "grad_norm": 3.921875, "grad_norm_var": 162.7859120686849, "learning_rate": 0.0001, "loss": 6.739, "loss/crossentropy": 2.599918246269226, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23734501004219055, "step": 3322 }, { "epoch": 0.103875, "grad_norm": 3.71875, "grad_norm_var": 163.06065165201824, "learning_rate": 0.0001, "loss": 6.007, "loss/crossentropy": 2.29317045211792, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20107153803110123, "step": 3324 }, { "epoch": 0.1039375, "grad_norm": 4.75, "grad_norm_var": 0.49283447265625, "learning_rate": 0.0001, "loss": 6.6523, "loss/crossentropy": 2.5400946140289307, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2369968742132187, "step": 3326 }, { "epoch": 0.104, "grad_norm": 5.34375, "grad_norm_var": 0.5761220296223958, "learning_rate": 0.0001, "loss": 6.9468, "loss/crossentropy": 2.67788302898407, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24720104038715363, "step": 3328 }, { "epoch": 0.1040625, "grad_norm": 4.03125, "grad_norm_var": 0.18349202473958334, "learning_rate": 0.0001, "loss": 6.7984, "loss/crossentropy": 2.7285367250442505, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2316000536084175, "step": 3330 }, { "epoch": 0.104125, "grad_norm": 4.3125, "grad_norm_var": 0.18943684895833332, "learning_rate": 0.0001, "loss": 7.0711, "loss/crossentropy": 2.7006884813308716, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.24914763867855072, "step": 3332 }, { "epoch": 0.1041875, "grad_norm": 4.15625, "grad_norm_var": 0.19184468587239584, "learning_rate": 0.0001, "loss": 6.2808, "loss/crossentropy": 2.314067244529724, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22089679539203644, "step": 3334 }, { "epoch": 0.10425, "grad_norm": 4.3125, "grad_norm_var": 0.18879292805989584, "learning_rate": 0.0001, "loss": 6.6084, "loss/crossentropy": 2.5610376596450806, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23208405077457428, "step": 3336 }, { "epoch": 0.1043125, "grad_norm": 3.921875, "grad_norm_var": 0.1903717041015625, "learning_rate": 0.0001, "loss": 6.4163, "loss/crossentropy": 2.499178647994995, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2198343127965927, "step": 3338 }, { "epoch": 0.104375, "grad_norm": 5.1875, "grad_norm_var": 0.2302398681640625, "learning_rate": 0.0001, "loss": 6.9003, "loss/crossentropy": 2.6422451734542847, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.24767768383026123, "step": 3340 }, { "epoch": 0.1044375, "grad_norm": 4.40625, "grad_norm_var": 0.20227864583333333, "learning_rate": 0.0001, "loss": 6.6569, "loss/crossentropy": 2.496048331260681, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23913077265024185, "step": 3342 }, { "epoch": 0.1045, "grad_norm": 4.25, "grad_norm_var": 0.11485087076822917, "learning_rate": 0.0001, "loss": 7.0398, "loss/crossentropy": 2.788357138633728, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24741224944591522, "step": 3344 }, { "epoch": 0.1045625, "grad_norm": 3.78125, "grad_norm_var": 0.12363993326822917, "learning_rate": 0.0001, "loss": 6.7648, "loss/crossentropy": 2.645902395248413, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23493220657110214, "step": 3346 }, { "epoch": 0.104625, "grad_norm": 4.84375, "grad_norm_var": 0.1349761962890625, "learning_rate": 0.0001, "loss": 6.6878, "loss/crossentropy": 2.617870330810547, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23355630785226822, "step": 3348 }, { "epoch": 0.1046875, "grad_norm": 3.953125, "grad_norm_var": 0.14269917805989582, "learning_rate": 0.0001, "loss": 6.9487, "loss/crossentropy": 2.728622317314148, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.2442745715379715, "step": 3350 }, { "epoch": 0.10475, "grad_norm": 4.15625, "grad_norm_var": 0.1415435791015625, "learning_rate": 0.0001, "loss": 6.8727, "loss/crossentropy": 2.638767719268799, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.24331195652484894, "step": 3352 }, { "epoch": 0.1048125, "grad_norm": 4.0625, "grad_norm_var": 0.12536519368489582, "learning_rate": 0.0001, "loss": 6.8102, "loss/crossentropy": 2.7050987482070923, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23511598259210587, "step": 3354 }, { "epoch": 0.104875, "grad_norm": 3.71875, "grad_norm_var": 0.07929585774739584, "learning_rate": 0.0001, "loss": 6.2211, "loss/crossentropy": 2.3178874254226685, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.21766043454408646, "step": 3356 }, { "epoch": 0.1049375, "grad_norm": 3.640625, "grad_norm_var": 0.09163309733072916, "learning_rate": 0.0001, "loss": 6.1729, "loss/crossentropy": 2.3861899375915527, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21108877658843994, "step": 3358 }, { "epoch": 0.105, "grad_norm": 4.09375, "grad_norm_var": 0.09419657389322916, "learning_rate": 0.0001, "loss": 6.5909, "loss/crossentropy": 2.560970664024353, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23072851449251175, "step": 3360 }, { "epoch": 0.1050625, "grad_norm": 4.0, "grad_norm_var": 0.09385477701822917, "learning_rate": 0.0001, "loss": 6.4854, "loss/crossentropy": 2.5368300676345825, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22220295667648315, "step": 3362 }, { "epoch": 0.105125, "grad_norm": 3.953125, "grad_norm_var": 0.053515625, "learning_rate": 0.0001, "loss": 6.4987, "loss/crossentropy": 2.570409417152405, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21977746486663818, "step": 3364 }, { "epoch": 0.1051875, "grad_norm": 3.875, "grad_norm_var": 0.03459370930989583, "learning_rate": 0.0001, "loss": 6.8575, "loss/crossentropy": 2.7380369901657104, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23538140952587128, "step": 3366 }, { "epoch": 0.10525, "grad_norm": 4.3125, "grad_norm_var": 0.031712849934895836, "learning_rate": 0.0001, "loss": 6.6548, "loss/crossentropy": 2.5999975204467773, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.23086804151535034, "step": 3368 }, { "epoch": 0.1053125, "grad_norm": 3.40625, "grad_norm_var": 0.0510894775390625, "learning_rate": 0.0001, "loss": 6.2373, "loss/crossentropy": 2.3586994409561157, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21403686702251434, "step": 3370 }, { "epoch": 0.105375, "grad_norm": 3.9375, "grad_norm_var": 0.0415679931640625, "learning_rate": 0.0001, "loss": 6.3877, "loss/crossentropy": 2.436392903327942, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22168905287981033, "step": 3372 }, { "epoch": 0.1054375, "grad_norm": 3.8125, "grad_norm_var": 0.046930948893229164, "learning_rate": 0.0001, "loss": 6.3903, "loss/crossentropy": 2.537071108818054, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21617799997329712, "step": 3374 }, { "epoch": 0.1055, "grad_norm": 3.734375, "grad_norm_var": 0.0477447509765625, "learning_rate": 0.0001, "loss": 6.3936, "loss/crossentropy": 2.44389808177948, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22348541766405106, "step": 3376 }, { "epoch": 0.1055625, "grad_norm": 3.828125, "grad_norm_var": 0.047265625, "learning_rate": 0.0001, "loss": 6.1489, "loss/crossentropy": 2.351946711540222, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2089892104268074, "step": 3378 }, { "epoch": 0.105625, "grad_norm": 3.640625, "grad_norm_var": 0.05133056640625, "learning_rate": 0.0001, "loss": 6.4778, "loss/crossentropy": 2.489010810852051, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22818031907081604, "step": 3380 }, { "epoch": 0.1056875, "grad_norm": 3.9375, "grad_norm_var": 0.05028889973958333, "learning_rate": 0.0001, "loss": 6.8609, "loss/crossentropy": 2.7186564207077026, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.24039897322654724, "step": 3382 }, { "epoch": 0.10575, "grad_norm": 4.25, "grad_norm_var": 0.11812744140625, "learning_rate": 0.0001, "loss": 7.0582, "loss/crossentropy": 2.758796215057373, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2510346248745918, "step": 3384 }, { "epoch": 0.1058125, "grad_norm": 3.953125, "grad_norm_var": 0.1041168212890625, "learning_rate": 0.0001, "loss": 6.4993, "loss/crossentropy": 2.5294731855392456, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22120419889688492, "step": 3386 }, { "epoch": 0.105875, "grad_norm": 3.9375, "grad_norm_var": 0.10358784993489584, "learning_rate": 0.0001, "loss": 6.5889, "loss/crossentropy": 2.5931564569473267, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.22339749336242676, "step": 3388 }, { "epoch": 0.1059375, "grad_norm": 4.34375, "grad_norm_var": 0.09595947265625, "learning_rate": 0.0001, "loss": 6.754, "loss/crossentropy": 2.637158513069153, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23903284966945648, "step": 3390 }, { "epoch": 0.106, "grad_norm": 4.0, "grad_norm_var": 0.08388264973958333, "learning_rate": 0.0001, "loss": 6.38, "loss/crossentropy": 2.390872836112976, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22664637118577957, "step": 3392 }, { "epoch": 0.1060625, "grad_norm": 3.84375, "grad_norm_var": 0.0847320556640625, "learning_rate": 0.0001, "loss": 6.859, "loss/crossentropy": 2.7824547290802, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23460885882377625, "step": 3394 }, { "epoch": 0.106125, "grad_norm": 3.859375, "grad_norm_var": 0.0758209228515625, "learning_rate": 0.0001, "loss": 6.4763, "loss/crossentropy": 2.5256487131118774, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.22709660977125168, "step": 3396 }, { "epoch": 0.1061875, "grad_norm": 3.78125, "grad_norm_var": 0.08244527180989583, "learning_rate": 0.0001, "loss": 6.7439, "loss/crossentropy": 2.5768805742263794, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.2381836324930191, "step": 3398 }, { "epoch": 0.10625, "grad_norm": 3.6875, "grad_norm_var": 0.04130757649739583, "learning_rate": 0.0001, "loss": 6.611, "loss/crossentropy": 2.593908429145813, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22827593237161636, "step": 3400 }, { "epoch": 0.1063125, "grad_norm": 3.921875, "grad_norm_var": 0.044066365559895834, "learning_rate": 0.0001, "loss": 6.2723, "loss/crossentropy": 2.3864080905914307, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.21359364688396454, "step": 3402 }, { "epoch": 0.106375, "grad_norm": 4.28125, "grad_norm_var": 0.05420633951822917, "learning_rate": 0.0001, "loss": 6.8791, "loss/crossentropy": 2.740124464035034, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.23929176479578018, "step": 3404 }, { "epoch": 0.1064375, "grad_norm": 3.96875, "grad_norm_var": 0.044733683268229164, "learning_rate": 0.0001, "loss": 6.8561, "loss/crossentropy": 2.72505784034729, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23966719955205917, "step": 3406 }, { "epoch": 0.1065, "grad_norm": 4.375, "grad_norm_var": 0.054541015625, "learning_rate": 0.0001, "loss": 7.0188, "loss/crossentropy": 2.7997384071350098, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.24338825047016144, "step": 3408 }, { "epoch": 0.1065625, "grad_norm": 3.828125, "grad_norm_var": 0.05488993326822917, "learning_rate": 0.0001, "loss": 6.9797, "loss/crossentropy": 2.819231390953064, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.24026915431022644, "step": 3410 }, { "epoch": 0.106625, "grad_norm": 4.96875, "grad_norm_var": 2.1745402018229165, "learning_rate": 0.0001, "loss": 7.0188, "loss/crossentropy": 2.718687415122986, "loss/hidden": 1.84375, "loss/jsd": 0.0, "loss/logits": 0.24564100801944733, "step": 3412 }, { "epoch": 0.1066875, "grad_norm": 4.21875, "grad_norm_var": 2.156966145833333, "learning_rate": 0.0001, "loss": 6.5127, "loss/crossentropy": 2.5084104537963867, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2297305017709732, "step": 3414 }, { "epoch": 0.10675, "grad_norm": 3.796875, "grad_norm_var": 2.158740234375, "learning_rate": 0.0001, "loss": 6.6456, "loss/crossentropy": 2.6552449464797974, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22754787653684616, "step": 3416 }, { "epoch": 0.1068125, "grad_norm": 4.40625, "grad_norm_var": 2.106273396809896, "learning_rate": 0.0001, "loss": 6.9007, "loss/crossentropy": 2.6753886938095093, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.24674660712480545, "step": 3418 }, { "epoch": 0.106875, "grad_norm": 4.21875, "grad_norm_var": 2.1166178385416665, "learning_rate": 0.0001, "loss": 6.4056, "loss/crossentropy": 2.5564688444137573, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21655535697937012, "step": 3420 }, { "epoch": 0.1069375, "grad_norm": 3.78125, "grad_norm_var": 2.1394195556640625, "learning_rate": 0.0001, "loss": 6.6441, "loss/crossentropy": 2.6539390087127686, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22596576809883118, "step": 3422 }, { "epoch": 0.107, "grad_norm": 3.484375, "grad_norm_var": 2.1920562744140626, "learning_rate": 0.0001, "loss": 6.0564, "loss/crossentropy": 2.279597282409668, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.20502237975597382, "step": 3424 }, { "epoch": 0.1070625, "grad_norm": 4.03125, "grad_norm_var": 2.3096964518229166, "learning_rate": 0.0001, "loss": 6.9587, "loss/crossentropy": 2.729211688041687, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.2440449818968773, "step": 3426 }, { "epoch": 0.107125, "grad_norm": 4.09375, "grad_norm_var": 0.2835774739583333, "learning_rate": 0.0001, "loss": 6.672, "loss/crossentropy": 2.5828728675842285, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.233519047498703, "step": 3428 }, { "epoch": 0.1071875, "grad_norm": 3.625, "grad_norm_var": 0.29830322265625, "learning_rate": 0.0001, "loss": 6.5412, "loss/crossentropy": 2.6584055423736572, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2175753340125084, "step": 3430 }, { "epoch": 0.10725, "grad_norm": 4.125, "grad_norm_var": 0.29244384765625, "learning_rate": 0.0001, "loss": 7.0144, "loss/crossentropy": 2.808635711669922, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.24245436489582062, "step": 3432 }, { "epoch": 0.1073125, "grad_norm": 4.28125, "grad_norm_var": 0.29611002604166664, "learning_rate": 0.0001, "loss": 6.582, "loss/crossentropy": 2.5630985498428345, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2292320355772972, "step": 3434 }, { "epoch": 0.107375, "grad_norm": 4.34375, "grad_norm_var": 0.29687398274739585, "learning_rate": 0.0001, "loss": 6.3284, "loss/crossentropy": 2.5125374794006348, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.20814663916826248, "step": 3436 }, { "epoch": 0.1074375, "grad_norm": 4.15625, "grad_norm_var": 0.288427734375, "learning_rate": 0.0001, "loss": 6.6832, "loss/crossentropy": 2.553908348083496, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23793192207813263, "step": 3438 }, { "epoch": 0.1075, "grad_norm": 3.890625, "grad_norm_var": 0.267138671875, "learning_rate": 0.0001, "loss": 6.6803, "loss/crossentropy": 2.5226951837539673, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23919696360826492, "step": 3440 }, { "epoch": 0.1075625, "grad_norm": 3.921875, "grad_norm_var": 0.04299214680989583, "learning_rate": 0.0001, "loss": 6.8391, "loss/crossentropy": 2.732064366340637, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2357020601630211, "step": 3442 }, { "epoch": 0.107625, "grad_norm": 4.09375, "grad_norm_var": 0.050455729166666664, "learning_rate": 0.0001, "loss": 6.4614, "loss/crossentropy": 2.5358023643493652, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22224289923906326, "step": 3444 }, { "epoch": 0.1076875, "grad_norm": 4.34375, "grad_norm_var": 0.09778645833333334, "learning_rate": 0.0001, "loss": 7.4274, "loss/crossentropy": 2.9860514402389526, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.26366259157657623, "step": 3446 }, { "epoch": 0.10775, "grad_norm": 3.953125, "grad_norm_var": 0.13871968587239583, "learning_rate": 0.0001, "loss": 6.7431, "loss/crossentropy": 2.5070163011550903, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24586967378854752, "step": 3448 }, { "epoch": 0.1078125, "grad_norm": 4.28125, "grad_norm_var": 0.12927144368489582, "learning_rate": 0.0001, "loss": 6.9973, "loss/crossentropy": 2.7542529106140137, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.24266376346349716, "step": 3450 }, { "epoch": 0.107875, "grad_norm": 3.640625, "grad_norm_var": 0.12939453125, "learning_rate": 0.0001, "loss": 6.5825, "loss/crossentropy": 2.6318721771240234, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22435743361711502, "step": 3452 }, { "epoch": 0.1079375, "grad_norm": 3.984375, "grad_norm_var": 0.12965494791666668, "learning_rate": 0.0001, "loss": 6.9525, "loss/crossentropy": 2.752131462097168, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2395711988210678, "step": 3454 }, { "epoch": 0.108, "grad_norm": 4.0625, "grad_norm_var": 0.12970377604166666, "learning_rate": 0.0001, "loss": 6.811, "loss/crossentropy": 2.6434799432754517, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23862747848033905, "step": 3456 }, { "epoch": 0.1080625, "grad_norm": 4.53125, "grad_norm_var": 0.13997294108072916, "learning_rate": 0.0001, "loss": 6.7528, "loss/crossentropy": 2.690545082092285, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23083413392305374, "step": 3458 }, { "epoch": 0.108125, "grad_norm": 3.96875, "grad_norm_var": 0.1258453369140625, "learning_rate": 0.0001, "loss": 6.6226, "loss/crossentropy": 2.639745831489563, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22484835237264633, "step": 3460 }, { "epoch": 0.1081875, "grad_norm": 4.0625, "grad_norm_var": 0.09070536295572916, "learning_rate": 0.0001, "loss": 6.2426, "loss/crossentropy": 2.396896004676819, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21152300387620926, "step": 3462 }, { "epoch": 0.10825, "grad_norm": 3.578125, "grad_norm_var": 0.06910807291666667, "learning_rate": 0.0001, "loss": 6.324, "loss/crossentropy": 2.4832128286361694, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21220114827156067, "step": 3464 }, { "epoch": 0.1083125, "grad_norm": 3.921875, "grad_norm_var": 0.05734049479166667, "learning_rate": 0.0001, "loss": 6.793, "loss/crossentropy": 2.7297017574310303, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23250501602888107, "step": 3466 }, { "epoch": 0.108375, "grad_norm": 4.1875, "grad_norm_var": 0.05925191243489583, "learning_rate": 0.0001, "loss": 6.7522, "loss/crossentropy": 2.6332927942276, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23767106980085373, "step": 3468 }, { "epoch": 0.1084375, "grad_norm": 4.0625, "grad_norm_var": 0.06487630208333334, "learning_rate": 0.0001, "loss": 7.205, "loss/crossentropy": 2.95207941532135, "loss/hidden": 1.80859375, "loss/jsd": 0.0, "loss/logits": 0.24442926049232483, "step": 3470 }, { "epoch": 0.1085, "grad_norm": 3.90625, "grad_norm_var": 0.06583658854166667, "learning_rate": 0.0001, "loss": 6.8472, "loss/crossentropy": 2.7336323261260986, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2383066490292549, "step": 3472 }, { "epoch": 0.1085625, "grad_norm": 3.78125, "grad_norm_var": 0.05103759765625, "learning_rate": 0.0001, "loss": 6.5448, "loss/crossentropy": 2.5316661596298218, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22787390649318695, "step": 3474 }, { "epoch": 0.108625, "grad_norm": 3.984375, "grad_norm_var": 0.30260009765625, "learning_rate": 0.0001, "loss": 6.6127, "loss/crossentropy": 2.57386314868927, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2292788252234459, "step": 3476 }, { "epoch": 0.1086875, "grad_norm": 4.375, "grad_norm_var": 0.30172119140625, "learning_rate": 0.0001, "loss": 6.4784, "loss/crossentropy": 2.4776185750961304, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22429370135068893, "step": 3478 }, { "epoch": 0.10875, "grad_norm": 4.40625, "grad_norm_var": 0.26549072265625, "learning_rate": 0.0001, "loss": 6.7101, "loss/crossentropy": 2.6190848350524902, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.23293334245681763, "step": 3480 }, { "epoch": 0.1088125, "grad_norm": 3.75, "grad_norm_var": 0.26448160807291665, "learning_rate": 0.0001, "loss": 6.7926, "loss/crossentropy": 2.765929102897644, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2296176701784134, "step": 3482 }, { "epoch": 0.108875, "grad_norm": 3.796875, "grad_norm_var": 0.27838541666666666, "learning_rate": 0.0001, "loss": 6.4387, "loss/crossentropy": 2.511491060256958, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2204545959830284, "step": 3484 }, { "epoch": 0.1089375, "grad_norm": 3.75, "grad_norm_var": 0.29277242024739586, "learning_rate": 0.0001, "loss": 6.5973, "loss/crossentropy": 2.638323187828064, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22637015581130981, "step": 3486 }, { "epoch": 0.109, "grad_norm": 4.25, "grad_norm_var": 0.2879384358723958, "learning_rate": 0.0001, "loss": 6.7602, "loss/crossentropy": 2.6970373392105103, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23366264253854752, "step": 3488 }, { "epoch": 0.1090625, "grad_norm": 4.65625, "grad_norm_var": 0.2937164306640625, "learning_rate": 0.0001, "loss": 6.8439, "loss/crossentropy": 2.669746994972229, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.24124659597873688, "step": 3490 }, { "epoch": 0.109125, "grad_norm": 3.9375, "grad_norm_var": 0.07747395833333333, "learning_rate": 0.0001, "loss": 6.8294, "loss/crossentropy": 2.710958242416382, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.23527763038873672, "step": 3492 }, { "epoch": 0.1091875, "grad_norm": 4.46875, "grad_norm_var": 0.08189697265625, "learning_rate": 0.0001, "loss": 6.9861, "loss/crossentropy": 2.617711901664734, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2555924579501152, "step": 3494 }, { "epoch": 0.10925, "grad_norm": 3.640625, "grad_norm_var": 0.0930816650390625, "learning_rate": 0.0001, "loss": 6.425, "loss/crossentropy": 2.4971325397491455, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22208131849765778, "step": 3496 }, { "epoch": 0.1093125, "grad_norm": 3.875, "grad_norm_var": 0.0868316650390625, "learning_rate": 0.0001, "loss": 6.8564, "loss/crossentropy": 2.7396827936172485, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23667167872190475, "step": 3498 }, { "epoch": 0.109375, "grad_norm": 3.984375, "grad_norm_var": 0.08381754557291667, "learning_rate": 0.0001, "loss": 6.3491, "loss/crossentropy": 2.466445207595825, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.219122052192688, "step": 3500 }, { "epoch": 0.1094375, "grad_norm": 3.9375, "grad_norm_var": 0.07215067545572916, "learning_rate": 0.0001, "loss": 6.7396, "loss/crossentropy": 2.6930452585220337, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2312178760766983, "step": 3502 }, { "epoch": 0.1095, "grad_norm": 3.984375, "grad_norm_var": 0.069189453125, "learning_rate": 0.0001, "loss": 7.0488, "loss/crossentropy": 2.8537940979003906, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.24645072221755981, "step": 3504 }, { "epoch": 0.1095625, "grad_norm": 4.03125, "grad_norm_var": 0.059789021809895836, "learning_rate": 0.0001, "loss": 6.2965, "loss/crossentropy": 2.4314013719558716, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.22010138630867004, "step": 3506 }, { "epoch": 0.109625, "grad_norm": 3.6875, "grad_norm_var": 0.05520426432291667, "learning_rate": 0.0001, "loss": 6.1617, "loss/crossentropy": 2.3627779483795166, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2111404687166214, "step": 3508 }, { "epoch": 0.1096875, "grad_norm": 4.28125, "grad_norm_var": 0.04342041015625, "learning_rate": 0.0001, "loss": 7.208, "loss/crossentropy": 2.9820592403411865, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.24798868596553802, "step": 3510 }, { "epoch": 0.10975, "grad_norm": 3.8125, "grad_norm_var": 0.030790201822916665, "learning_rate": 0.0001, "loss": 6.5128, "loss/crossentropy": 2.5333805084228516, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22724204510450363, "step": 3512 }, { "epoch": 0.1098125, "grad_norm": 3.703125, "grad_norm_var": 0.03193257649739583, "learning_rate": 0.0001, "loss": 6.6557, "loss/crossentropy": 2.628496289253235, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22694183886051178, "step": 3514 }, { "epoch": 0.109875, "grad_norm": 4.46875, "grad_norm_var": 0.85230712890625, "learning_rate": 0.0001, "loss": 6.923, "loss/crossentropy": 2.598778486251831, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.253519706428051, "step": 3516 }, { "epoch": 0.1099375, "grad_norm": 4.125, "grad_norm_var": 0.8498931884765625, "learning_rate": 0.0001, "loss": 6.7063, "loss/crossentropy": 2.7180824279785156, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22694354504346848, "step": 3518 }, { "epoch": 0.11, "grad_norm": 4.21875, "grad_norm_var": 0.8428944905598958, "learning_rate": 0.0001, "loss": 6.8915, "loss/crossentropy": 2.7829537391662598, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.2323419153690338, "step": 3520 }, { "epoch": 0.1100625, "grad_norm": 3.84375, "grad_norm_var": 0.8219685872395833, "learning_rate": 0.0001, "loss": 6.619, "loss/crossentropy": 2.5749711990356445, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23018094897270203, "step": 3522 }, { "epoch": 0.110125, "grad_norm": 3.71875, "grad_norm_var": 0.8130116780598958, "learning_rate": 0.0001, "loss": 6.526, "loss/crossentropy": 2.5394864082336426, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2267736867070198, "step": 3524 }, { "epoch": 0.1101875, "grad_norm": 4.0, "grad_norm_var": 0.83092041015625, "learning_rate": 0.0001, "loss": 6.6519, "loss/crossentropy": 2.6654670238494873, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2291162833571434, "step": 3526 }, { "epoch": 0.11025, "grad_norm": 4.3125, "grad_norm_var": 0.8125315348307292, "learning_rate": 0.0001, "loss": 6.7104, "loss/crossentropy": 2.6382076740264893, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2345614731311798, "step": 3528 }, { "epoch": 0.1103125, "grad_norm": 3.453125, "grad_norm_var": 0.8322336832682292, "learning_rate": 0.0001, "loss": 6.7222, "loss/crossentropy": 2.7326756715774536, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22864137589931488, "step": 3530 }, { "epoch": 0.110375, "grad_norm": 4.25, "grad_norm_var": 0.0852447509765625, "learning_rate": 0.0001, "loss": 7.1299, "loss/crossentropy": 2.849326014518738, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2483738362789154, "step": 3532 }, { "epoch": 0.1104375, "grad_norm": 4.09375, "grad_norm_var": 0.08559468587239584, "learning_rate": 0.0001, "loss": 6.6426, "loss/crossentropy": 2.622498631477356, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.227398082613945, "step": 3534 }, { "epoch": 0.1105, "grad_norm": 3.703125, "grad_norm_var": 0.08915608723958333, "learning_rate": 0.0001, "loss": 6.3708, "loss/crossentropy": 2.4668338298797607, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.21461939066648483, "step": 3536 }, { "epoch": 0.1105625, "grad_norm": 4.40625, "grad_norm_var": 0.09412434895833334, "learning_rate": 0.0001, "loss": 6.6304, "loss/crossentropy": 2.595758080482483, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.23197656869888306, "step": 3538 }, { "epoch": 0.110625, "grad_norm": 3.734375, "grad_norm_var": 0.09536031087239584, "learning_rate": 0.0001, "loss": 6.5308, "loss/crossentropy": 2.606462597846985, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2221251055598259, "step": 3540 }, { "epoch": 0.1106875, "grad_norm": 4.09375, "grad_norm_var": 0.10174153645833334, "learning_rate": 0.0001, "loss": 6.4257, "loss/crossentropy": 2.4909168481826782, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22277754545211792, "step": 3542 }, { "epoch": 0.11075, "grad_norm": 4.40625, "grad_norm_var": 0.10397135416666667, "learning_rate": 0.0001, "loss": 6.7788, "loss/crossentropy": 2.604660987854004, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.23694781959056854, "step": 3544 }, { "epoch": 0.1108125, "grad_norm": 4.125, "grad_norm_var": 0.077587890625, "learning_rate": 0.0001, "loss": 6.3997, "loss/crossentropy": 2.427910327911377, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.22608358412981033, "step": 3546 }, { "epoch": 0.110875, "grad_norm": 3.65625, "grad_norm_var": 0.06253153483072917, "learning_rate": 0.0001, "loss": 6.204, "loss/crossentropy": 2.4499400854110718, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.2117353156208992, "step": 3548 }, { "epoch": 0.1109375, "grad_norm": 4.5625, "grad_norm_var": 0.08551025390625, "learning_rate": 0.0001, "loss": 6.8613, "loss/crossentropy": 2.785742163658142, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23216184228658676, "step": 3550 }, { "epoch": 0.111, "grad_norm": 4.375, "grad_norm_var": 0.08940327962239583, "learning_rate": 0.0001, "loss": 6.8523, "loss/crossentropy": 2.646605968475342, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.24713413417339325, "step": 3552 }, { "epoch": 0.1110625, "grad_norm": 3.6875, "grad_norm_var": 0.08571675618489584, "learning_rate": 0.0001, "loss": 6.5563, "loss/crossentropy": 2.618689775466919, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22383636236190796, "step": 3554 }, { "epoch": 0.111125, "grad_norm": 4.09375, "grad_norm_var": 0.26513264973958334, "learning_rate": 0.0001, "loss": 6.6827, "loss/crossentropy": 2.5601232051849365, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.23452390730381012, "step": 3556 }, { "epoch": 0.1111875, "grad_norm": 3.640625, "grad_norm_var": 0.26907145182291664, "learning_rate": 0.0001, "loss": 6.2314, "loss/crossentropy": 2.4062451124191284, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21611201018095016, "step": 3558 }, { "epoch": 0.11125, "grad_norm": 3.578125, "grad_norm_var": 0.2773671468098958, "learning_rate": 0.0001, "loss": 6.465, "loss/crossentropy": 2.562098264694214, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22115087509155273, "step": 3560 }, { "epoch": 0.1113125, "grad_norm": 3.6875, "grad_norm_var": 0.2841796875, "learning_rate": 0.0001, "loss": 6.3455, "loss/crossentropy": 2.5443878173828125, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21370669454336166, "step": 3562 }, { "epoch": 0.111375, "grad_norm": 3.859375, "grad_norm_var": 0.27547098795572916, "learning_rate": 0.0001, "loss": 6.4085, "loss/crossentropy": 2.515444278717041, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.22134120762348175, "step": 3564 }, { "epoch": 0.1114375, "grad_norm": 4.78125, "grad_norm_var": 0.2936024983723958, "learning_rate": 0.0001, "loss": 6.2856, "loss/crossentropy": 2.3123891353607178, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2250596508383751, "step": 3566 }, { "epoch": 0.1115, "grad_norm": 4.40625, "grad_norm_var": 0.2957672119140625, "learning_rate": 0.0001, "loss": 6.6701, "loss/crossentropy": 2.613504409790039, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23065698146820068, "step": 3568 }, { "epoch": 0.1115625, "grad_norm": 4.5625, "grad_norm_var": 0.30191650390625, "learning_rate": 0.0001, "loss": 6.2952, "loss/crossentropy": 2.3294789791107178, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22352328151464462, "step": 3570 }, { "epoch": 0.111625, "grad_norm": 4.0625, "grad_norm_var": 0.11542561848958334, "learning_rate": 0.0001, "loss": 6.5475, "loss/crossentropy": 2.4532060623168945, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23247961699962616, "step": 3572 }, { "epoch": 0.1116875, "grad_norm": 3.875, "grad_norm_var": 0.10655008951822917, "learning_rate": 0.0001, "loss": 6.4323, "loss/crossentropy": 2.528500556945801, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21928536146879196, "step": 3574 }, { "epoch": 0.11175, "grad_norm": 4.5625, "grad_norm_var": 0.3978352864583333, "learning_rate": 0.0001, "loss": 7.1899, "loss/crossentropy": 2.9368622303009033, "loss/hidden": 1.84765625, "loss/jsd": 0.0, "loss/logits": 0.24054250866174698, "step": 3576 }, { "epoch": 0.1118125, "grad_norm": 4.1875, "grad_norm_var": 0.37014567057291664, "learning_rate": 0.0001, "loss": 6.6702, "loss/crossentropy": 2.626793384552002, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23051141202449799, "step": 3578 }, { "epoch": 0.111875, "grad_norm": 3.53125, "grad_norm_var": 0.37301025390625, "learning_rate": 0.0001, "loss": 6.3652, "loss/crossentropy": 2.3946304321289062, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.22322587668895721, "step": 3580 }, { "epoch": 0.1119375, "grad_norm": 3.875, "grad_norm_var": 0.38766276041666664, "learning_rate": 0.0001, "loss": 6.512, "loss/crossentropy": 2.6459895372390747, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.219414621591568, "step": 3582 }, { "epoch": 0.112, "grad_norm": 3.9375, "grad_norm_var": 0.3851236979166667, "learning_rate": 0.0001, "loss": 6.5002, "loss/crossentropy": 2.5649925470352173, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22086559236049652, "step": 3584 }, { "epoch": 0.1120625, "grad_norm": 3.984375, "grad_norm_var": 0.3741048177083333, "learning_rate": 0.0001, "loss": 6.636, "loss/crossentropy": 2.722179889678955, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21716240048408508, "step": 3586 }, { "epoch": 0.112125, "grad_norm": 4.5625, "grad_norm_var": 0.387255859375, "learning_rate": 0.0001, "loss": 6.7132, "loss/crossentropy": 2.6352118253707886, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.22889728099107742, "step": 3588 }, { "epoch": 0.1121875, "grad_norm": 3.671875, "grad_norm_var": 0.40722249348958334, "learning_rate": 0.0001, "loss": 6.5121, "loss/crossentropy": 2.532341957092285, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22219117730855942, "step": 3590 }, { "epoch": 0.11225, "grad_norm": 4.125, "grad_norm_var": 0.09814453125, "learning_rate": 0.0001, "loss": 6.7784, "loss/crossentropy": 2.6766886711120605, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23634084314107895, "step": 3592 }, { "epoch": 0.1123125, "grad_norm": 4.125, "grad_norm_var": 0.10041910807291667, "learning_rate": 0.0001, "loss": 6.839, "loss/crossentropy": 2.7825467586517334, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23181992769241333, "step": 3594 }, { "epoch": 0.112375, "grad_norm": 4.25, "grad_norm_var": 0.08924051920572916, "learning_rate": 0.0001, "loss": 6.8486, "loss/crossentropy": 2.769296169281006, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23527134954929352, "step": 3596 }, { "epoch": 0.1124375, "grad_norm": 3.96875, "grad_norm_var": 0.08483072916666666, "learning_rate": 0.0001, "loss": 6.3203, "loss/crossentropy": 2.4583276510238647, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.2151002734899521, "step": 3598 }, { "epoch": 0.1125, "grad_norm": 3.859375, "grad_norm_var": 0.0872955322265625, "learning_rate": 0.0001, "loss": 6.4537, "loss/crossentropy": 2.5872033834457397, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21555408835411072, "step": 3600 }, { "epoch": 0.1125625, "grad_norm": 3.65625, "grad_norm_var": 0.10201822916666667, "learning_rate": 0.0001, "loss": 6.635, "loss/crossentropy": 2.611227035522461, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22894278168678284, "step": 3602 }, { "epoch": 0.112625, "grad_norm": 3.765625, "grad_norm_var": 0.08616129557291667, "learning_rate": 0.0001, "loss": 6.8173, "loss/crossentropy": 2.761025547981262, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23336273431777954, "step": 3604 }, { "epoch": 0.1126875, "grad_norm": 3.640625, "grad_norm_var": 0.06401265462239583, "learning_rate": 0.0001, "loss": 6.1301, "loss/crossentropy": 2.4436473846435547, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.199115589261055, "step": 3606 }, { "epoch": 0.11275, "grad_norm": 4.40625, "grad_norm_var": 0.07748921712239583, "learning_rate": 0.0001, "loss": 6.5835, "loss/crossentropy": 2.56386399269104, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22930476069450378, "step": 3608 }, { "epoch": 0.1128125, "grad_norm": 3.890625, "grad_norm_var": 0.11940104166666667, "learning_rate": 0.0001, "loss": 6.7951, "loss/crossentropy": 2.6115182638168335, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.2386746183037758, "step": 3610 }, { "epoch": 0.112875, "grad_norm": 3.875, "grad_norm_var": 0.11263020833333333, "learning_rate": 0.0001, "loss": 6.3766, "loss/crossentropy": 2.4483206272125244, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2221250981092453, "step": 3612 }, { "epoch": 0.1129375, "grad_norm": 3.796875, "grad_norm_var": 0.10579427083333333, "learning_rate": 0.0001, "loss": 6.7492, "loss/crossentropy": 2.800732374191284, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2198447734117508, "step": 3614 }, { "epoch": 0.113, "grad_norm": 3.78125, "grad_norm_var": 0.10694071451822916, "learning_rate": 0.0001, "loss": 6.9306, "loss/crossentropy": 2.844248056411743, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.23246607184410095, "step": 3616 }, { "epoch": 0.1130625, "grad_norm": 4.5, "grad_norm_var": 0.11110738118489584, "learning_rate": 0.0001, "loss": 6.5568, "loss/crossentropy": 2.523680090904236, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.22713688760995865, "step": 3618 }, { "epoch": 0.113125, "grad_norm": 3.640625, "grad_norm_var": 0.1162017822265625, "learning_rate": 0.0001, "loss": 6.2764, "loss/crossentropy": 2.5017552375793457, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20753761380910873, "step": 3620 }, { "epoch": 0.1131875, "grad_norm": 4.03125, "grad_norm_var": 0.1085113525390625, "learning_rate": 0.0001, "loss": 6.5934, "loss/crossentropy": 2.6008695363998413, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2297266125679016, "step": 3622 }, { "epoch": 0.11325, "grad_norm": 4.34375, "grad_norm_var": 0.09004618326822916, "learning_rate": 0.0001, "loss": 6.6576, "loss/crossentropy": 2.6061915159225464, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23014119267463684, "step": 3624 }, { "epoch": 0.1133125, "grad_norm": 3.875, "grad_norm_var": 0.0517578125, "learning_rate": 0.0001, "loss": 6.828, "loss/crossentropy": 2.762625575065613, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.233097642660141, "step": 3626 }, { "epoch": 0.113375, "grad_norm": 3.984375, "grad_norm_var": 0.0546051025390625, "learning_rate": 0.0001, "loss": 6.4933, "loss/crossentropy": 2.5349791049957275, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2184930369257927, "step": 3628 }, { "epoch": 0.1134375, "grad_norm": 4.1875, "grad_norm_var": 0.06451416015625, "learning_rate": 0.0001, "loss": 6.8052, "loss/crossentropy": 2.711672782897949, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23591265082359314, "step": 3630 }, { "epoch": 0.1135, "grad_norm": 4.03125, "grad_norm_var": 0.0630523681640625, "learning_rate": 0.0001, "loss": 6.4884, "loss/crossentropy": 2.4987112283706665, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.22787494212388992, "step": 3632 }, { "epoch": 0.1135625, "grad_norm": 4.71875, "grad_norm_var": 0.07967020670572916, "learning_rate": 0.0001, "loss": 6.7434, "loss/crossentropy": 2.6017539501190186, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.2321312204003334, "step": 3634 }, { "epoch": 0.113625, "grad_norm": 3.828125, "grad_norm_var": 0.07819010416666666, "learning_rate": 0.0001, "loss": 6.7524, "loss/crossentropy": 2.789211392402649, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22600264102220535, "step": 3636 }, { "epoch": 0.1136875, "grad_norm": 4.1875, "grad_norm_var": 0.0706207275390625, "learning_rate": 0.0001, "loss": 6.9458, "loss/crossentropy": 2.745774269104004, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24265731871128082, "step": 3638 }, { "epoch": 0.11375, "grad_norm": 3.984375, "grad_norm_var": 0.07154541015625, "learning_rate": 0.0001, "loss": 7.0743, "loss/crossentropy": 2.8300269842147827, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.24474012106657028, "step": 3640 }, { "epoch": 0.1138125, "grad_norm": 3.859375, "grad_norm_var": 0.0783111572265625, "learning_rate": 0.0001, "loss": 6.295, "loss/crossentropy": 2.3331433534622192, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2207912728190422, "step": 3642 }, { "epoch": 0.113875, "grad_norm": 3.765625, "grad_norm_var": 0.08606363932291666, "learning_rate": 0.0001, "loss": 6.7028, "loss/crossentropy": 2.7445106506347656, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2243453711271286, "step": 3644 }, { "epoch": 0.1139375, "grad_norm": 4.125, "grad_norm_var": 0.08013916015625, "learning_rate": 0.0001, "loss": 6.8632, "loss/crossentropy": 2.7395485639572144, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2373623326420784, "step": 3646 }, { "epoch": 0.114, "grad_norm": 3.78125, "grad_norm_var": 0.08299051920572917, "learning_rate": 0.0001, "loss": 6.9405, "loss/crossentropy": 2.8632638454437256, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23233596980571747, "step": 3648 }, { "epoch": 0.1140625, "grad_norm": 10.625, "grad_norm_var": 2.787287394205729, "learning_rate": 0.0001, "loss": 7.184, "loss/crossentropy": 2.526134729385376, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2864900380373001, "step": 3650 }, { "epoch": 0.114125, "grad_norm": 4.15625, "grad_norm_var": 2.727408854166667, "learning_rate": 0.0001, "loss": 6.6539, "loss/crossentropy": 2.5402532815933228, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23714881390333176, "step": 3652 }, { "epoch": 0.1141875, "grad_norm": 4.21875, "grad_norm_var": 2.713703409830729, "learning_rate": 0.0001, "loss": 6.467, "loss/crossentropy": 2.4508432149887085, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2289644032716751, "step": 3654 }, { "epoch": 0.11425, "grad_norm": 4.625, "grad_norm_var": 2.7493967692057293, "learning_rate": 0.0001, "loss": 6.9745, "loss/crossentropy": 2.7536582946777344, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.24474085122346878, "step": 3656 }, { "epoch": 0.1143125, "grad_norm": 4.09375, "grad_norm_var": 2.7429850260416666, "learning_rate": 0.0001, "loss": 6.775, "loss/crossentropy": 2.79840886592865, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22461071610450745, "step": 3658 }, { "epoch": 0.114375, "grad_norm": 3.71875, "grad_norm_var": 2.7027659098307293, "learning_rate": 0.0001, "loss": 6.643, "loss/crossentropy": 2.691011071205139, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22332537174224854, "step": 3660 }, { "epoch": 0.1144375, "grad_norm": 3.671875, "grad_norm_var": 2.7574859619140626, "learning_rate": 0.0001, "loss": 6.3804, "loss/crossentropy": 2.5471014976501465, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2141846865415573, "step": 3662 }, { "epoch": 0.1145, "grad_norm": 4.375, "grad_norm_var": 2.7490875244140627, "learning_rate": 0.0001, "loss": 6.5308, "loss/crossentropy": 2.5367395877838135, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2294824793934822, "step": 3664 }, { "epoch": 0.1145625, "grad_norm": 4.4375, "grad_norm_var": 0.0964996337890625, "learning_rate": 0.0001, "loss": 6.7434, "loss/crossentropy": 2.6196374893188477, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23854342103004456, "step": 3666 }, { "epoch": 0.114625, "grad_norm": 3.625, "grad_norm_var": 0.1001373291015625, "learning_rate": 0.0001, "loss": 6.3404, "loss/crossentropy": 2.478997230529785, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21505115926265717, "step": 3668 }, { "epoch": 0.1146875, "grad_norm": 4.0, "grad_norm_var": 0.09562886555989583, "learning_rate": 0.0001, "loss": 6.8886, "loss/crossentropy": 2.7907516956329346, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23283251374959946, "step": 3670 }, { "epoch": 0.11475, "grad_norm": 4.15625, "grad_norm_var": 0.06789957682291667, "learning_rate": 0.0001, "loss": 6.8005, "loss/crossentropy": 2.735516667366028, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.23501239717006683, "step": 3672 }, { "epoch": 0.1148125, "grad_norm": 3.765625, "grad_norm_var": 0.07175191243489583, "learning_rate": 0.0001, "loss": 6.7163, "loss/crossentropy": 2.6947312355041504, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2294972613453865, "step": 3674 }, { "epoch": 0.114875, "grad_norm": 3.984375, "grad_norm_var": 0.05624593098958333, "learning_rate": 0.0001, "loss": 6.6487, "loss/crossentropy": 2.6631579399108887, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22668370604515076, "step": 3676 }, { "epoch": 0.1149375, "grad_norm": 3.75, "grad_norm_var": 0.05592041015625, "learning_rate": 0.0001, "loss": 6.8093, "loss/crossentropy": 2.752833366394043, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23220966756343842, "step": 3678 }, { "epoch": 0.115, "grad_norm": 4.375, "grad_norm_var": 0.053548177083333336, "learning_rate": 0.0001, "loss": 6.8694, "loss/crossentropy": 2.7636189460754395, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23674701154232025, "step": 3680 }, { "epoch": 0.1150625, "grad_norm": 4.03125, "grad_norm_var": 0.04224853515625, "learning_rate": 0.0001, "loss": 6.6997, "loss/crossentropy": 2.702791452407837, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.21960802376270294, "step": 3682 }, { "epoch": 0.115125, "grad_norm": 3.8125, "grad_norm_var": 0.03583984375, "learning_rate": 0.0001, "loss": 6.6096, "loss/crossentropy": 2.5904924869537354, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2319922372698784, "step": 3684 }, { "epoch": 0.1151875, "grad_norm": 4.03125, "grad_norm_var": 0.05891825358072917, "learning_rate": 0.0001, "loss": 6.2208, "loss/crossentropy": 2.417539119720459, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2107919454574585, "step": 3686 }, { "epoch": 0.11525, "grad_norm": 4.28125, "grad_norm_var": 0.05426025390625, "learning_rate": 0.0001, "loss": 6.5656, "loss/crossentropy": 2.5044217109680176, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23268309235572815, "step": 3688 }, { "epoch": 0.1153125, "grad_norm": 3.75, "grad_norm_var": 0.05659077962239583, "learning_rate": 0.0001, "loss": 6.5324, "loss/crossentropy": 2.5434666872024536, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2238949090242386, "step": 3690 }, { "epoch": 0.115375, "grad_norm": 3.953125, "grad_norm_var": 0.25015360514322915, "learning_rate": 0.0001, "loss": 6.4936, "loss/crossentropy": 2.470642566680908, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.22456327080726624, "step": 3692 }, { "epoch": 0.1154375, "grad_norm": 4.0, "grad_norm_var": 0.2470703125, "learning_rate": 0.0001, "loss": 6.7092, "loss/crossentropy": 2.697585701942444, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.23045646399259567, "step": 3694 }, { "epoch": 0.1155, "grad_norm": 3.9375, "grad_norm_var": 0.24149983723958332, "learning_rate": 0.0001, "loss": 6.7061, "loss/crossentropy": 2.6695618629455566, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23177966475486755, "step": 3696 }, { "epoch": 0.1155625, "grad_norm": 3.921875, "grad_norm_var": 0.2424224853515625, "learning_rate": 0.0001, "loss": 6.2392, "loss/crossentropy": 2.3701905012130737, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.21150615811347961, "step": 3698 }, { "epoch": 0.115625, "grad_norm": 4.0625, "grad_norm_var": 0.2453765869140625, "learning_rate": 0.0001, "loss": 6.4243, "loss/crossentropy": 2.4719810485839844, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2221815437078476, "step": 3700 }, { "epoch": 0.1156875, "grad_norm": 3.90625, "grad_norm_var": 0.22551167805989583, "learning_rate": 0.0001, "loss": 6.5181, "loss/crossentropy": 2.5747958421707153, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22245176136493683, "step": 3702 }, { "epoch": 0.11575, "grad_norm": 3.65625, "grad_norm_var": 0.23251546223958333, "learning_rate": 0.0001, "loss": 6.4551, "loss/crossentropy": 2.557185173034668, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2183024138212204, "step": 3704 }, { "epoch": 0.1158125, "grad_norm": 4.03125, "grad_norm_var": 0.23841145833333333, "learning_rate": 0.0001, "loss": 6.4098, "loss/crossentropy": 2.527738094329834, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21828729659318924, "step": 3706 }, { "epoch": 0.115875, "grad_norm": 3.765625, "grad_norm_var": 0.042708333333333334, "learning_rate": 0.0001, "loss": 6.6301, "loss/crossentropy": 2.637513756752014, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22776946425437927, "step": 3708 }, { "epoch": 0.1159375, "grad_norm": 4.5, "grad_norm_var": 0.062548828125, "learning_rate": 0.0001, "loss": 6.6892, "loss/crossentropy": 2.666839361190796, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23036272078752518, "step": 3710 }, { "epoch": 0.116, "grad_norm": 3.75, "grad_norm_var": 0.0642578125, "learning_rate": 0.0001, "loss": 6.757, "loss/crossentropy": 2.6890220642089844, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.23218737542629242, "step": 3712 }, { "epoch": 0.1160625, "grad_norm": 3.859375, "grad_norm_var": 0.06907552083333333, "learning_rate": 0.0001, "loss": 6.4918, "loss/crossentropy": 2.558487057685852, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22340869903564453, "step": 3714 }, { "epoch": 0.116125, "grad_norm": 4.25, "grad_norm_var": 0.07427978515625, "learning_rate": 0.0001, "loss": 6.694, "loss/crossentropy": 2.642730474472046, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.22778676450252533, "step": 3716 }, { "epoch": 0.1161875, "grad_norm": 3.765625, "grad_norm_var": 0.0743316650390625, "learning_rate": 0.0001, "loss": 6.6046, "loss/crossentropy": 2.582736015319824, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.22640958428382874, "step": 3718 }, { "epoch": 0.11625, "grad_norm": 4.0625, "grad_norm_var": 0.07993876139322917, "learning_rate": 0.0001, "loss": 6.8234, "loss/crossentropy": 2.6626198291778564, "loss/hidden": 1.80078125, "loss/jsd": 0.0, "loss/logits": 0.23600252717733383, "step": 3720 }, { "epoch": 0.1163125, "grad_norm": 3.953125, "grad_norm_var": 0.07737630208333333, "learning_rate": 0.0001, "loss": 6.7206, "loss/crossentropy": 2.554721474647522, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.23533476889133453, "step": 3722 }, { "epoch": 0.116375, "grad_norm": 4.90625, "grad_norm_var": 0.13069254557291668, "learning_rate": 0.0001, "loss": 6.7879, "loss/crossentropy": 2.7431427240371704, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2325975000858307, "step": 3724 }, { "epoch": 0.1164375, "grad_norm": 4.3125, "grad_norm_var": 21.46592508951823, "learning_rate": 0.0001, "loss": 6.9384, "loss/crossentropy": 2.5549099445343018, "loss/hidden": 1.87890625, "loss/jsd": 0.0, "loss/logits": 0.25046199560165405, "step": 3726 }, { "epoch": 0.1165, "grad_norm": 7.46875, "grad_norm_var": 21.633101399739584, "learning_rate": 0.0001, "loss": 7.0932, "loss/crossentropy": 2.690193295478821, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2629580646753311, "step": 3728 }, { "epoch": 0.1165625, "grad_norm": 3.5625, "grad_norm_var": 21.62051493326823, "learning_rate": 0.0001, "loss": 6.3362, "loss/crossentropy": 2.4794082641601562, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21380110830068588, "step": 3730 }, { "epoch": 0.116625, "grad_norm": 4.03125, "grad_norm_var": 21.58226623535156, "learning_rate": 0.0001, "loss": 7.0605, "loss/crossentropy": 2.8624175786972046, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2451971471309662, "step": 3732 }, { "epoch": 0.1166875, "grad_norm": 3.828125, "grad_norm_var": 21.607673136393228, "learning_rate": 0.0001, "loss": 6.5621, "loss/crossentropy": 2.6379172801971436, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22171756625175476, "step": 3734 }, { "epoch": 0.11675, "grad_norm": 6.0625, "grad_norm_var": 21.62181701660156, "learning_rate": 0.0001, "loss": 6.755, "loss/crossentropy": 2.536095380783081, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.24298104643821716, "step": 3736 }, { "epoch": 0.1168125, "grad_norm": 3.671875, "grad_norm_var": 21.738084920247395, "learning_rate": 0.0001, "loss": 6.6708, "loss/crossentropy": 2.7039905786514282, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22636428475379944, "step": 3738 }, { "epoch": 0.116875, "grad_norm": 3.890625, "grad_norm_var": 21.764842732747397, "learning_rate": 0.0001, "loss": 6.7239, "loss/crossentropy": 2.686025619506836, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23073802888393402, "step": 3740 }, { "epoch": 0.1169375, "grad_norm": 3.375, "grad_norm_var": 1.1045857747395833, "learning_rate": 0.0001, "loss": 5.9112, "loss/crossentropy": 2.2144126892089844, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.19663581252098083, "step": 3742 }, { "epoch": 0.117, "grad_norm": 3.765625, "grad_norm_var": 0.3494954427083333, "learning_rate": 0.0001, "loss": 6.4705, "loss/crossentropy": 2.5193395614624023, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2220740020275116, "step": 3744 }, { "epoch": 0.1170625, "grad_norm": 3.9375, "grad_norm_var": 0.33886311848958334, "learning_rate": 0.0001, "loss": 6.5074, "loss/crossentropy": 2.5095298290252686, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2279161512851715, "step": 3746 }, { "epoch": 0.117125, "grad_norm": 4.375, "grad_norm_var": 0.3591949462890625, "learning_rate": 0.0001, "loss": 6.3287, "loss/crossentropy": 2.5012372732162476, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21555764973163605, "step": 3748 }, { "epoch": 0.1171875, "grad_norm": 4.25, "grad_norm_var": 0.3813547770182292, "learning_rate": 0.0001, "loss": 6.5337, "loss/crossentropy": 2.591525673866272, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22234736382961273, "step": 3750 }, { "epoch": 0.11725, "grad_norm": 4.3125, "grad_norm_var": 0.10349833170572917, "learning_rate": 0.0001, "loss": 6.8932, "loss/crossentropy": 2.746386170387268, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2400682345032692, "step": 3752 }, { "epoch": 0.1173125, "grad_norm": 3.90625, "grad_norm_var": 0.10373942057291667, "learning_rate": 0.0001, "loss": 6.6784, "loss/crossentropy": 2.580140709877014, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23561152815818787, "step": 3754 }, { "epoch": 0.117375, "grad_norm": 4.28125, "grad_norm_var": 0.11262613932291667, "learning_rate": 0.0001, "loss": 6.4309, "loss/crossentropy": 2.4955955743789673, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.22243603318929672, "step": 3756 }, { "epoch": 0.1174375, "grad_norm": 3.96875, "grad_norm_var": 0.09172770182291666, "learning_rate": 0.0001, "loss": 6.5724, "loss/crossentropy": 2.580453872680664, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2273193672299385, "step": 3758 }, { "epoch": 0.1175, "grad_norm": 4.25, "grad_norm_var": 0.10486551920572916, "learning_rate": 0.0001, "loss": 7.1613, "loss/crossentropy": 2.834384322166443, "loss/hidden": 1.81640625, "loss/jsd": 0.0, "loss/logits": 0.2510491907596588, "step": 3760 }, { "epoch": 0.1175625, "grad_norm": 3.90625, "grad_norm_var": 0.10698954264322917, "learning_rate": 0.0001, "loss": 6.5418, "loss/crossentropy": 2.5571680068969727, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.220725879073143, "step": 3762 }, { "epoch": 0.117625, "grad_norm": 4.15625, "grad_norm_var": 0.10991109212239583, "learning_rate": 0.0001, "loss": 6.6508, "loss/crossentropy": 2.6110875606536865, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.22662819921970367, "step": 3764 }, { "epoch": 0.1176875, "grad_norm": 3.4375, "grad_norm_var": 0.10447489420572917, "learning_rate": 0.0001, "loss": 6.5068, "loss/crossentropy": 2.6286540031433105, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2217947244644165, "step": 3766 }, { "epoch": 0.11775, "grad_norm": 3.765625, "grad_norm_var": 0.0976959228515625, "learning_rate": 0.0001, "loss": 6.2205, "loss/crossentropy": 2.449827790260315, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.2059696912765503, "step": 3768 }, { "epoch": 0.1178125, "grad_norm": 3.90625, "grad_norm_var": 0.12255757649739583, "learning_rate": 0.0001, "loss": 6.7584, "loss/crossentropy": 2.619004487991333, "loss/hidden": 1.8203125, "loss/jsd": 0.0, "loss/logits": 0.23191242665052414, "step": 3770 }, { "epoch": 0.117875, "grad_norm": 3.890625, "grad_norm_var": 0.11604715983072916, "learning_rate": 0.0001, "loss": 6.4862, "loss/crossentropy": 2.4522976875305176, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2303444668650627, "step": 3772 }, { "epoch": 0.1179375, "grad_norm": 4.3125, "grad_norm_var": 0.12135009765625, "learning_rate": 0.0001, "loss": 6.9901, "loss/crossentropy": 2.853781223297119, "loss/hidden": 1.79296875, "loss/jsd": 0.0, "loss/logits": 0.2343396246433258, "step": 3774 }, { "epoch": 0.118, "grad_norm": 4.4375, "grad_norm_var": 0.11217041015625, "learning_rate": 0.0001, "loss": 6.8352, "loss/crossentropy": 2.7490190267562866, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2340097874403, "step": 3776 }, { "epoch": 0.1180625, "grad_norm": 3.4375, "grad_norm_var": 0.134814453125, "learning_rate": 0.0001, "loss": 6.4295, "loss/crossentropy": 2.532248616218567, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21746356040239334, "step": 3778 }, { "epoch": 0.118125, "grad_norm": 4.09375, "grad_norm_var": 0.11428120930989584, "learning_rate": 0.0001, "loss": 6.8923, "loss/crossentropy": 2.812334418296814, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2357269674539566, "step": 3780 }, { "epoch": 0.1181875, "grad_norm": 4.09375, "grad_norm_var": 0.09714253743489583, "learning_rate": 0.0001, "loss": 6.6251, "loss/crossentropy": 2.6684558391571045, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2257475033402443, "step": 3782 }, { "epoch": 0.11825, "grad_norm": 3.984375, "grad_norm_var": 0.07913004557291667, "learning_rate": 0.0001, "loss": 6.5944, "loss/crossentropy": 2.530303120613098, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.228675976395607, "step": 3784 }, { "epoch": 0.1183125, "grad_norm": 3.8125, "grad_norm_var": 0.06564127604166667, "learning_rate": 0.0001, "loss": 6.8646, "loss/crossentropy": 2.777267575263977, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23685341328382492, "step": 3786 }, { "epoch": 0.118375, "grad_norm": 4.03125, "grad_norm_var": 0.07919820149739583, "learning_rate": 0.0001, "loss": 6.6104, "loss/crossentropy": 2.720764994621277, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21825769543647766, "step": 3788 }, { "epoch": 0.1184375, "grad_norm": 3.71875, "grad_norm_var": 0.0782623291015625, "learning_rate": 0.0001, "loss": 6.5611, "loss/crossentropy": 2.617440700531006, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22248947620391846, "step": 3790 }, { "epoch": 0.1185, "grad_norm": 3.78125, "grad_norm_var": 0.06314697265625, "learning_rate": 0.0001, "loss": 6.5085, "loss/crossentropy": 2.5625646114349365, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22310566902160645, "step": 3792 }, { "epoch": 0.1185625, "grad_norm": 3.796875, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 6.8123, "loss/crossentropy": 2.7593398094177246, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.23342178761959076, "step": 3794 }, { "epoch": 0.118625, "grad_norm": 3.890625, "grad_norm_var": 0.0342926025390625, "learning_rate": 0.0001, "loss": 6.43, "loss/crossentropy": 2.5027835369110107, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22123384475708008, "step": 3796 }, { "epoch": 0.1186875, "grad_norm": 4.59375, "grad_norm_var": 0.06208394368489583, "learning_rate": 0.0001, "loss": 7.0069, "loss/crossentropy": 2.9035476446151733, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.23689740151166916, "step": 3798 }, { "epoch": 0.11875, "grad_norm": 3.84375, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 6.7092, "loss/crossentropy": 2.609599232673645, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23574373871088028, "step": 3800 }, { "epoch": 0.1188125, "grad_norm": 3.859375, "grad_norm_var": 0.14558817545572916, "learning_rate": 0.0001, "loss": 6.6645, "loss/crossentropy": 2.588888168334961, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23372826725244522, "step": 3802 }, { "epoch": 0.118875, "grad_norm": 3.40625, "grad_norm_var": 0.16487528483072916, "learning_rate": 0.0001, "loss": 6.3971, "loss/crossentropy": 2.6425379514694214, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21139515936374664, "step": 3804 }, { "epoch": 0.1189375, "grad_norm": 4.34375, "grad_norm_var": 0.18788960774739583, "learning_rate": 0.0001, "loss": 6.4841, "loss/crossentropy": 2.421231508255005, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.2293347418308258, "step": 3806 }, { "epoch": 0.119, "grad_norm": 5.28125, "grad_norm_var": 0.2775349934895833, "learning_rate": 0.0001, "loss": 6.4633, "loss/crossentropy": 2.3681161403656006, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2282676324248314, "step": 3808 }, { "epoch": 0.1190625, "grad_norm": 3.953125, "grad_norm_var": 0.2936197916666667, "learning_rate": 0.0001, "loss": 6.316, "loss/crossentropy": 2.4912995100021362, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21489524841308594, "step": 3810 }, { "epoch": 0.119125, "grad_norm": 3.796875, "grad_norm_var": 0.29673563639322914, "learning_rate": 0.0001, "loss": 6.6797, "loss/crossentropy": 2.7319802045822144, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22250615060329437, "step": 3812 }, { "epoch": 0.1191875, "grad_norm": 3.875, "grad_norm_var": 0.2789947509765625, "learning_rate": 0.0001, "loss": 6.9102, "loss/crossentropy": 2.805300712585449, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.2366633266210556, "step": 3814 }, { "epoch": 0.11925, "grad_norm": 4.1875, "grad_norm_var": 0.2746246337890625, "learning_rate": 0.0001, "loss": 6.5752, "loss/crossentropy": 2.6415826082229614, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.21875379979610443, "step": 3816 }, { "epoch": 0.1193125, "grad_norm": 4.5, "grad_norm_var": 0.21341145833333333, "learning_rate": 0.0001, "loss": 6.5314, "loss/crossentropy": 2.5881221294403076, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22089115530252457, "step": 3818 }, { "epoch": 0.119375, "grad_norm": 13.125, "grad_norm_var": 5.160139973958334, "learning_rate": 0.0001, "loss": 6.6154, "loss/crossentropy": 2.380750298500061, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2476859986782074, "step": 3820 }, { "epoch": 0.1194375, "grad_norm": 3.828125, "grad_norm_var": 5.204002888997396, "learning_rate": 0.0001, "loss": 6.6778, "loss/crossentropy": 2.7415353059768677, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22291918098926544, "step": 3822 }, { "epoch": 0.1195, "grad_norm": 4.71875, "grad_norm_var": 5.1928049723307295, "learning_rate": 0.0001, "loss": 7.1779, "loss/crossentropy": 2.8588478565216064, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.25300103425979614, "step": 3824 }, { "epoch": 0.1195625, "grad_norm": 4.3125, "grad_norm_var": 5.140648396809896, "learning_rate": 0.0001, "loss": 6.5702, "loss/crossentropy": 2.557085633277893, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.22669967263936996, "step": 3826 }, { "epoch": 0.119625, "grad_norm": 4.4375, "grad_norm_var": 5.0800120035807295, "learning_rate": 0.0001, "loss": 6.8565, "loss/crossentropy": 2.6817139387130737, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2444320172071457, "step": 3828 }, { "epoch": 0.1196875, "grad_norm": 3.5, "grad_norm_var": 5.15458984375, "learning_rate": 0.0001, "loss": 6.2591, "loss/crossentropy": 2.42851722240448, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21665531396865845, "step": 3830 }, { "epoch": 0.11975, "grad_norm": 4.3125, "grad_norm_var": 5.154011027018229, "learning_rate": 0.0001, "loss": 6.7774, "loss/crossentropy": 2.70755672454834, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23315252363681793, "step": 3832 }, { "epoch": 0.1198125, "grad_norm": 4.59375, "grad_norm_var": 5.165925089518229, "learning_rate": 0.0001, "loss": 6.8659, "loss/crossentropy": 2.693860173225403, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24063920229673386, "step": 3834 }, { "epoch": 0.119875, "grad_norm": 3.859375, "grad_norm_var": 0.14049072265625, "learning_rate": 0.0001, "loss": 6.7968, "loss/crossentropy": 2.7572206258773804, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2312999740242958, "step": 3836 }, { "epoch": 0.1199375, "grad_norm": 4.09375, "grad_norm_var": 0.10696512858072917, "learning_rate": 0.0001, "loss": 6.4984, "loss/crossentropy": 2.5449635982513428, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22463569045066833, "step": 3838 }, { "epoch": 0.12, "grad_norm": 3.65625, "grad_norm_var": 0.09641927083333333, "learning_rate": 0.0001, "loss": 6.4033, "loss/crossentropy": 2.559556007385254, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21288719773292542, "step": 3840 }, { "epoch": 0.1200625, "grad_norm": 4.0625, "grad_norm_var": 0.08531494140625, "learning_rate": 0.0001, "loss": 6.4479, "loss/crossentropy": 2.5646849870681763, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21644552052021027, "step": 3842 }, { "epoch": 0.120125, "grad_norm": 3.953125, "grad_norm_var": 0.07693583170572917, "learning_rate": 0.0001, "loss": 6.7152, "loss/crossentropy": 2.7804479598999023, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22355540096759796, "step": 3844 }, { "epoch": 0.1201875, "grad_norm": 3.984375, "grad_norm_var": 0.06265360514322917, "learning_rate": 0.0001, "loss": 6.7568, "loss/crossentropy": 2.7693997621536255, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2260872721672058, "step": 3846 }, { "epoch": 0.12025, "grad_norm": 5.8125, "grad_norm_var": 0.26751302083333334, "learning_rate": 0.0001, "loss": 6.9386, "loss/crossentropy": 2.737673044204712, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.24353280663490295, "step": 3848 }, { "epoch": 0.1203125, "grad_norm": 4.0, "grad_norm_var": 0.42926025390625, "learning_rate": 0.0001, "loss": 6.6092, "loss/crossentropy": 2.4701892137527466, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.23421593010425568, "step": 3850 }, { "epoch": 0.120375, "grad_norm": 4.1875, "grad_norm_var": 0.4547526041666667, "learning_rate": 0.0001, "loss": 6.4197, "loss/crossentropy": 2.488594174385071, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22005945444107056, "step": 3852 }, { "epoch": 0.1204375, "grad_norm": 4.15625, "grad_norm_var": 0.44224853515625, "learning_rate": 0.0001, "loss": 6.156, "loss/crossentropy": 2.3247268199920654, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21008112281560898, "step": 3854 }, { "epoch": 0.1205, "grad_norm": 3.75, "grad_norm_var": 0.4252675374348958, "learning_rate": 0.0001, "loss": 6.46, "loss/crossentropy": 2.516259789466858, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22366880625486374, "step": 3856 }, { "epoch": 0.1205625, "grad_norm": 4.09375, "grad_norm_var": 0.42454325358072914, "learning_rate": 0.0001, "loss": 6.6779, "loss/crossentropy": 2.6965575218200684, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22625887393951416, "step": 3858 }, { "epoch": 0.120625, "grad_norm": 3.671875, "grad_norm_var": 0.4255442301432292, "learning_rate": 0.0001, "loss": 6.2982, "loss/crossentropy": 2.446297526359558, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.21058575063943863, "step": 3860 }, { "epoch": 0.1206875, "grad_norm": 3.921875, "grad_norm_var": 0.4435373942057292, "learning_rate": 0.0001, "loss": 6.7492, "loss/crossentropy": 2.723504066467285, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.2279648631811142, "step": 3862 }, { "epoch": 0.12075, "grad_norm": 3.828125, "grad_norm_var": 0.2752604166666667, "learning_rate": 0.0001, "loss": 6.3156, "loss/crossentropy": 2.4643293619155884, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2163769155740738, "step": 3864 }, { "epoch": 0.1208125, "grad_norm": 3.984375, "grad_norm_var": 0.07724507649739583, "learning_rate": 0.0001, "loss": 6.578, "loss/crossentropy": 2.6052640676498413, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22696109861135483, "step": 3866 }, { "epoch": 0.120875, "grad_norm": 4.1875, "grad_norm_var": 0.0659088134765625, "learning_rate": 0.0001, "loss": 6.4002, "loss/crossentropy": 2.4768152236938477, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22280435264110565, "step": 3868 }, { "epoch": 0.1209375, "grad_norm": 3.609375, "grad_norm_var": 0.05943603515625, "learning_rate": 0.0001, "loss": 6.4668, "loss/crossentropy": 2.5721131563186646, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.22150424122810364, "step": 3870 }, { "epoch": 0.121, "grad_norm": 3.921875, "grad_norm_var": 0.058958943684895834, "learning_rate": 0.0001, "loss": 6.5498, "loss/crossentropy": 2.6573235988616943, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22010327875614166, "step": 3872 }, { "epoch": 0.1210625, "grad_norm": 4.28125, "grad_norm_var": 0.060400390625, "learning_rate": 0.0001, "loss": 6.1348, "loss/crossentropy": 2.4366344213485718, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20107021182775497, "step": 3874 }, { "epoch": 0.121125, "grad_norm": 3.984375, "grad_norm_var": 0.044905598958333334, "learning_rate": 0.0001, "loss": 6.6133, "loss/crossentropy": 2.625810146331787, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22570153325796127, "step": 3876 }, { "epoch": 0.1211875, "grad_norm": 4.1875, "grad_norm_var": 0.050699869791666664, "learning_rate": 0.0001, "loss": 6.3149, "loss/crossentropy": 2.4852793216705322, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2165529802441597, "step": 3878 }, { "epoch": 0.12125, "grad_norm": 3.9375, "grad_norm_var": 0.05750223795572917, "learning_rate": 0.0001, "loss": 6.4477, "loss/crossentropy": 2.55775785446167, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22141185402870178, "step": 3880 }, { "epoch": 0.1213125, "grad_norm": 4.0625, "grad_norm_var": 0.06702067057291666, "learning_rate": 0.0001, "loss": 6.6623, "loss/crossentropy": 2.6345800161361694, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22972697764635086, "step": 3882 }, { "epoch": 0.121375, "grad_norm": 4.25, "grad_norm_var": 0.06428934733072916, "learning_rate": 0.0001, "loss": 6.9427, "loss/crossentropy": 2.794018030166626, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.24182262271642685, "step": 3884 }, { "epoch": 0.1214375, "grad_norm": 3.71875, "grad_norm_var": 0.060445149739583336, "learning_rate": 0.0001, "loss": 6.6471, "loss/crossentropy": 2.7294814586639404, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22262265533208847, "step": 3886 }, { "epoch": 0.1215, "grad_norm": 3.53125, "grad_norm_var": 0.0771148681640625, "learning_rate": 0.0001, "loss": 6.5541, "loss/crossentropy": 2.7204082012176514, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21188200265169144, "step": 3888 }, { "epoch": 0.1215625, "grad_norm": 3.578125, "grad_norm_var": 0.09329427083333333, "learning_rate": 0.0001, "loss": 6.6385, "loss/crossentropy": 2.7236995697021484, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.22312351316213608, "step": 3890 }, { "epoch": 0.121625, "grad_norm": 3.96875, "grad_norm_var": 0.09780985514322917, "learning_rate": 0.0001, "loss": 6.6929, "loss/crossentropy": 2.778989553451538, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2195180281996727, "step": 3892 }, { "epoch": 0.1216875, "grad_norm": 3.59375, "grad_norm_var": 0.10478108723958333, "learning_rate": 0.0001, "loss": 6.6652, "loss/crossentropy": 2.6517540216445923, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22790998965501785, "step": 3894 }, { "epoch": 0.12175, "grad_norm": 3.765625, "grad_norm_var": 0.10888264973958334, "learning_rate": 0.0001, "loss": 6.5588, "loss/crossentropy": 2.6892133951187134, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21742477267980576, "step": 3896 }, { "epoch": 0.1218125, "grad_norm": 3.828125, "grad_norm_var": 0.096044921875, "learning_rate": 0.0001, "loss": 6.3872, "loss/crossentropy": 2.534321904182434, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21693114936351776, "step": 3898 }, { "epoch": 0.121875, "grad_norm": 3.71875, "grad_norm_var": 0.07714436848958334, "learning_rate": 0.0001, "loss": 6.4921, "loss/crossentropy": 2.593091368675232, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21763041615486145, "step": 3900 }, { "epoch": 0.1219375, "grad_norm": 4.1875, "grad_norm_var": 0.08701171875, "learning_rate": 0.0001, "loss": 6.5717, "loss/crossentropy": 2.5926719903945923, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22290125489234924, "step": 3902 }, { "epoch": 0.122, "grad_norm": 3.71875, "grad_norm_var": 0.06660868326822916, "learning_rate": 0.0001, "loss": 6.6787, "loss/crossentropy": 2.6835639476776123, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2291964739561081, "step": 3904 }, { "epoch": 0.1220625, "grad_norm": 4.3125, "grad_norm_var": 0.07808329264322916, "learning_rate": 0.0001, "loss": 6.485, "loss/crossentropy": 2.571964383125305, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22138310968875885, "step": 3906 }, { "epoch": 0.122125, "grad_norm": 4.0625, "grad_norm_var": 0.06379292805989584, "learning_rate": 0.0001, "loss": 6.6788, "loss/crossentropy": 2.6926345825195312, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.2247883677482605, "step": 3908 }, { "epoch": 0.1221875, "grad_norm": 5.03125, "grad_norm_var": 0.1368560791015625, "learning_rate": 0.0001, "loss": 5.9859, "loss/crossentropy": 2.3010092973709106, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.19700200110673904, "step": 3910 }, { "epoch": 0.12225, "grad_norm": 4.25, "grad_norm_var": 0.13918863932291667, "learning_rate": 0.0001, "loss": 6.426, "loss/crossentropy": 2.584962010383606, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2141772359609604, "step": 3912 }, { "epoch": 0.1223125, "grad_norm": 3.75, "grad_norm_var": 0.13753255208333334, "learning_rate": 0.0001, "loss": 6.4614, "loss/crossentropy": 2.509815812110901, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22484420239925385, "step": 3914 }, { "epoch": 0.122375, "grad_norm": 4.0, "grad_norm_var": 0.15249735514322918, "learning_rate": 0.0001, "loss": 6.3401, "loss/crossentropy": 2.4113447666168213, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21904617547988892, "step": 3916 }, { "epoch": 0.1224375, "grad_norm": 10.625, "grad_norm_var": 2.94146728515625, "learning_rate": 0.0001, "loss": 7.0081, "loss/crossentropy": 2.5848923921585083, "loss/hidden": 1.89453125, "loss/jsd": 0.0, "loss/logits": 0.25286970287561417, "step": 3918 }, { "epoch": 0.1225, "grad_norm": 5.5625, "grad_norm_var": 3.3592844645182294, "learning_rate": 0.0001, "loss": 7.1548, "loss/crossentropy": 2.74170184135437, "loss/hidden": 1.82421875, "loss/jsd": 0.0, "loss/logits": 0.2588881254196167, "step": 3920 }, { "epoch": 0.1225625, "grad_norm": 3.984375, "grad_norm_var": 3.308463541666667, "learning_rate": 0.0001, "loss": 6.5678, "loss/crossentropy": 2.5315656661987305, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.22511044144630432, "step": 3922 }, { "epoch": 0.122625, "grad_norm": 4.46875, "grad_norm_var": 3.26783447265625, "learning_rate": 0.0001, "loss": 6.649, "loss/crossentropy": 2.4766751527786255, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2430151253938675, "step": 3924 }, { "epoch": 0.1226875, "grad_norm": 3.53125, "grad_norm_var": 3.360798136393229, "learning_rate": 0.0001, "loss": 6.3786, "loss/crossentropy": 2.5347131490707397, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21212033182382584, "step": 3926 }, { "epoch": 0.12275, "grad_norm": 3.9375, "grad_norm_var": 3.3823232014973956, "learning_rate": 0.0001, "loss": 6.5134, "loss/crossentropy": 2.561252474784851, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22451211512088776, "step": 3928 }, { "epoch": 0.1228125, "grad_norm": 4.28125, "grad_norm_var": 3.322021484375, "learning_rate": 0.0001, "loss": 6.4125, "loss/crossentropy": 2.4684640169143677, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22292188555002213, "step": 3930 }, { "epoch": 0.122875, "grad_norm": 3.90625, "grad_norm_var": 3.2751617431640625, "learning_rate": 0.0001, "loss": 6.6115, "loss/crossentropy": 2.600993514060974, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22604826837778091, "step": 3932 }, { "epoch": 0.1229375, "grad_norm": 3.8125, "grad_norm_var": 0.7657389322916667, "learning_rate": 0.0001, "loss": 6.5081, "loss/crossentropy": 2.5906589031219482, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21791699528694153, "step": 3934 }, { "epoch": 0.123, "grad_norm": 3.640625, "grad_norm_var": 0.08106180826822916, "learning_rate": 0.0001, "loss": 6.5433, "loss/crossentropy": 2.596111297607422, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2240198478102684, "step": 3936 }, { "epoch": 0.1230625, "grad_norm": 4.84375, "grad_norm_var": 0.203173828125, "learning_rate": 0.0001, "loss": 7.1452, "loss/crossentropy": 2.843635082244873, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.24968495965003967, "step": 3938 }, { "epoch": 0.123125, "grad_norm": 3.6875, "grad_norm_var": 0.19171549479166666, "learning_rate": 0.0001, "loss": 6.8274, "loss/crossentropy": 2.7907726764678955, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23061522096395493, "step": 3940 }, { "epoch": 0.1231875, "grad_norm": 3.734375, "grad_norm_var": 0.17717997233072916, "learning_rate": 0.0001, "loss": 6.6786, "loss/crossentropy": 2.7477582693099976, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22550544887781143, "step": 3942 }, { "epoch": 0.12325, "grad_norm": 3.875, "grad_norm_var": 0.17346598307291666, "learning_rate": 0.0001, "loss": 6.4862, "loss/crossentropy": 2.572945475578308, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22218024730682373, "step": 3944 }, { "epoch": 0.1233125, "grad_norm": 3.65625, "grad_norm_var": 0.2100982666015625, "learning_rate": 0.0001, "loss": 6.6538, "loss/crossentropy": 2.5931992530822754, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23379892855882645, "step": 3946 }, { "epoch": 0.123375, "grad_norm": 4.71875, "grad_norm_var": 0.23860677083333334, "learning_rate": 0.0001, "loss": 6.928, "loss/crossentropy": 2.7630093097686768, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.23954720050096512, "step": 3948 }, { "epoch": 0.1234375, "grad_norm": 3.953125, "grad_norm_var": 0.23022359212239582, "learning_rate": 0.0001, "loss": 7.0825, "loss/crossentropy": 2.878159523010254, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.24269741028547287, "step": 3950 }, { "epoch": 0.1235, "grad_norm": 5.65625, "grad_norm_var": 0.4005045572916667, "learning_rate": 0.0001, "loss": 6.4622, "loss/crossentropy": 2.620681405067444, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21735333651304245, "step": 3952 }, { "epoch": 0.1235625, "grad_norm": 4.375, "grad_norm_var": 0.3162180582682292, "learning_rate": 0.0001, "loss": 6.6901, "loss/crossentropy": 2.6496429443359375, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23138756304979324, "step": 3954 }, { "epoch": 0.123625, "grad_norm": 4.1875, "grad_norm_var": 0.31428934733072916, "learning_rate": 0.0001, "loss": 6.4552, "loss/crossentropy": 2.529421091079712, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.219528928399086, "step": 3956 }, { "epoch": 0.1236875, "grad_norm": 4.15625, "grad_norm_var": 0.2940338134765625, "learning_rate": 0.0001, "loss": 6.5467, "loss/crossentropy": 2.5284109115600586, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.230732724070549, "step": 3958 }, { "epoch": 0.12375, "grad_norm": 5.125, "grad_norm_var": 0.3382639567057292, "learning_rate": 0.0001, "loss": 7.0126, "loss/crossentropy": 2.92387056350708, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.23621230572462082, "step": 3960 }, { "epoch": 0.1238125, "grad_norm": 3.734375, "grad_norm_var": 0.31686197916666664, "learning_rate": 0.0001, "loss": 6.4582, "loss/crossentropy": 2.526583433151245, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22324207425117493, "step": 3962 }, { "epoch": 0.123875, "grad_norm": 3.90625, "grad_norm_var": 0.2927805582682292, "learning_rate": 0.0001, "loss": 6.6438, "loss/crossentropy": 2.6276822090148926, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2281695306301117, "step": 3964 }, { "epoch": 0.1239375, "grad_norm": 3.828125, "grad_norm_var": 0.29655659993489586, "learning_rate": 0.0001, "loss": 6.905, "loss/crossentropy": 2.866371512413025, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23003342747688293, "step": 3966 }, { "epoch": 0.124, "grad_norm": 3.40625, "grad_norm_var": 0.14641927083333334, "learning_rate": 0.0001, "loss": 6.8189, "loss/crossentropy": 2.93355929851532, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21705003827810287, "step": 3968 }, { "epoch": 0.1240625, "grad_norm": 3.375, "grad_norm_var": 0.16176656087239583, "learning_rate": 0.0001, "loss": 6.5506, "loss/crossentropy": 2.607635974884033, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2200758457183838, "step": 3970 }, { "epoch": 0.124125, "grad_norm": 4.0625, "grad_norm_var": 0.1582916259765625, "learning_rate": 0.0001, "loss": 6.6297, "loss/crossentropy": 2.632441759109497, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22823940962553024, "step": 3972 }, { "epoch": 0.1241875, "grad_norm": 3.453125, "grad_norm_var": 0.1716461181640625, "learning_rate": 0.0001, "loss": 6.3758, "loss/crossentropy": 2.6422054767608643, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2046114206314087, "step": 3974 }, { "epoch": 0.12425, "grad_norm": 3.78125, "grad_norm_var": 0.06135152180989583, "learning_rate": 0.0001, "loss": 6.7175, "loss/crossentropy": 2.7409368753433228, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2250024750828743, "step": 3976 }, { "epoch": 0.1243125, "grad_norm": 3.734375, "grad_norm_var": 0.05761311848958333, "learning_rate": 0.0001, "loss": 6.096, "loss/crossentropy": 2.399104118347168, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2017187550663948, "step": 3978 }, { "epoch": 0.124375, "grad_norm": 3.859375, "grad_norm_var": 0.08145243326822917, "learning_rate": 0.0001, "loss": 6.5613, "loss/crossentropy": 2.7199209928512573, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.20992152392864227, "step": 3980 }, { "epoch": 0.1244375, "grad_norm": 4.1875, "grad_norm_var": 0.09019775390625, "learning_rate": 0.0001, "loss": 6.7587, "loss/crossentropy": 2.6355478763580322, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.23653244227170944, "step": 3982 }, { "epoch": 0.1245, "grad_norm": 4.0, "grad_norm_var": 0.0809478759765625, "learning_rate": 0.0001, "loss": 6.6411, "loss/crossentropy": 2.5781519412994385, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.23051701486110687, "step": 3984 }, { "epoch": 0.1245625, "grad_norm": 3.453125, "grad_norm_var": 0.06982014973958334, "learning_rate": 0.0001, "loss": 6.0452, "loss/crossentropy": 2.3460733890533447, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.2062443271279335, "step": 3986 }, { "epoch": 0.124625, "grad_norm": 3.71875, "grad_norm_var": 0.06868387858072916, "learning_rate": 0.0001, "loss": 6.2594, "loss/crossentropy": 2.419608950614929, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.20937252044677734, "step": 3988 }, { "epoch": 0.1246875, "grad_norm": 3.71875, "grad_norm_var": 0.0579986572265625, "learning_rate": 0.0001, "loss": 6.3339, "loss/crossentropy": 2.4288982152938843, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22058139741420746, "step": 3990 }, { "epoch": 0.12475, "grad_norm": 6.6875, "grad_norm_var": 0.5380279541015625, "learning_rate": 0.0001, "loss": 6.838, "loss/crossentropy": 2.5921448469161987, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.24646371603012085, "step": 3992 }, { "epoch": 0.1248125, "grad_norm": 4.1875, "grad_norm_var": 0.5374582926432292, "learning_rate": 0.0001, "loss": 6.9783, "loss/crossentropy": 2.6753528118133545, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.25334077328443527, "step": 3994 }, { "epoch": 0.124875, "grad_norm": 3.890625, "grad_norm_var": 0.5369781494140625, "learning_rate": 0.0001, "loss": 6.3665, "loss/crossentropy": 2.5173572301864624, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.20991378277540207, "step": 3996 }, { "epoch": 0.1249375, "grad_norm": 4.0, "grad_norm_var": 0.53902587890625, "learning_rate": 0.0001, "loss": 6.4783, "loss/crossentropy": 2.622048854827881, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21140175312757492, "step": 3998 }, { "epoch": 0.125, "grad_norm": 4.75, "grad_norm_var": 0.5801432291666667, "learning_rate": 0.0001, "loss": 6.6491, "loss/crossentropy": 2.7371960878372192, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2208828255534172, "step": 4000 }, { "epoch": 0.1250625, "grad_norm": 4.65625, "grad_norm_var": 0.5559804280598958, "learning_rate": 0.0001, "loss": 6.7283, "loss/crossentropy": 2.761434555053711, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22364237159490585, "step": 4002 }, { "epoch": 0.125125, "grad_norm": 4.40625, "grad_norm_var": 0.5261545817057292, "learning_rate": 0.0001, "loss": 6.6485, "loss/crossentropy": 2.512582302093506, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23859276622533798, "step": 4004 }, { "epoch": 0.1251875, "grad_norm": 4.0, "grad_norm_var": 0.49958394368489584, "learning_rate": 0.0001, "loss": 6.6833, "loss/crossentropy": 2.5966222286224365, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.23327559977769852, "step": 4006 }, { "epoch": 0.12525, "grad_norm": 3.890625, "grad_norm_var": 0.1353912353515625, "learning_rate": 0.0001, "loss": 6.2893, "loss/crossentropy": 2.4675627946853638, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2126445323228836, "step": 4008 }, { "epoch": 0.1253125, "grad_norm": 4.75, "grad_norm_var": 0.14102274576822918, "learning_rate": 0.0001, "loss": 6.5078, "loss/crossentropy": 2.4831652641296387, "loss/hidden": 1.78515625, "loss/jsd": 0.0, "loss/logits": 0.22395003587007523, "step": 4010 }, { "epoch": 0.125375, "grad_norm": 4.6875, "grad_norm_var": 0.15494384765625, "learning_rate": 0.0001, "loss": 6.6971, "loss/crossentropy": 2.6222360134124756, "loss/hidden": 1.796875, "loss/jsd": 0.0, "loss/logits": 0.22779580950737, "step": 4012 }, { "epoch": 0.1254375, "grad_norm": 3.90625, "grad_norm_var": 0.15995992024739583, "learning_rate": 0.0001, "loss": 6.6417, "loss/crossentropy": 2.7136658430099487, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22092662006616592, "step": 4014 }, { "epoch": 0.1255, "grad_norm": 3.859375, "grad_norm_var": 0.13006184895833334, "learning_rate": 0.0001, "loss": 6.5771, "loss/crossentropy": 2.63912570476532, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22387553751468658, "step": 4016 }, { "epoch": 0.1255625, "grad_norm": 4.34375, "grad_norm_var": 0.3907552083333333, "learning_rate": 0.0001, "loss": 7.2111, "loss/crossentropy": 2.8089661598205566, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2597440257668495, "step": 4018 }, { "epoch": 0.125625, "grad_norm": 4.21875, "grad_norm_var": 0.40019429524739586, "learning_rate": 0.0001, "loss": 6.7308, "loss/crossentropy": 2.6494024991989136, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23391643166542053, "step": 4020 }, { "epoch": 0.1256875, "grad_norm": 3.96875, "grad_norm_var": 0.406884765625, "learning_rate": 0.0001, "loss": 6.8903, "loss/crossentropy": 2.84161376953125, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23064526915550232, "step": 4022 }, { "epoch": 0.12575, "grad_norm": 5.15625, "grad_norm_var": 0.43072509765625, "learning_rate": 0.0001, "loss": 7.1108, "loss/crossentropy": 2.7824318408966064, "loss/hidden": 1.8125, "loss/jsd": 0.0, "loss/logits": 0.2515895813703537, "step": 4024 }, { "epoch": 0.1258125, "grad_norm": 3.921875, "grad_norm_var": 0.4175771077473958, "learning_rate": 0.0001, "loss": 6.5405, "loss/crossentropy": 2.5641770362854004, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22731705754995346, "step": 4026 }, { "epoch": 0.125875, "grad_norm": 3.5625, "grad_norm_var": 0.4420572916666667, "learning_rate": 0.0001, "loss": 6.4007, "loss/crossentropy": 2.6236324310302734, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2062263935804367, "step": 4028 }, { "epoch": 0.1259375, "grad_norm": 4.09375, "grad_norm_var": 0.44380594889322916, "learning_rate": 0.0001, "loss": 6.1651, "loss/crossentropy": 2.388764262199402, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2084936499595642, "step": 4030 }, { "epoch": 0.126, "grad_norm": 6.90625, "grad_norm_var": 0.8959299723307291, "learning_rate": 0.0001, "loss": 6.8036, "loss/crossentropy": 2.6765114068984985, "loss/hidden": 1.76953125, "loss/jsd": 0.0, "loss/logits": 0.2357548102736473, "step": 4032 }, { "epoch": 0.1260625, "grad_norm": 3.671875, "grad_norm_var": 0.6935831705729166, "learning_rate": 0.0001, "loss": 6.3616, "loss/crossentropy": 2.583994150161743, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21330882608890533, "step": 4034 }, { "epoch": 0.126125, "grad_norm": 3.9375, "grad_norm_var": 0.68369140625, "learning_rate": 0.0001, "loss": 6.6707, "loss/crossentropy": 2.6019418239593506, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23304607719182968, "step": 4036 }, { "epoch": 0.1261875, "grad_norm": 4.09375, "grad_norm_var": 0.6779612223307292, "learning_rate": 0.0001, "loss": 6.7854, "loss/crossentropy": 2.8110634088516235, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22712517529726028, "step": 4038 }, { "epoch": 0.12625, "grad_norm": 4.34375, "grad_norm_var": 0.61314697265625, "learning_rate": 0.0001, "loss": 7.036, "loss/crossentropy": 2.830755352973938, "loss/hidden": 1.76171875, "loss/jsd": 0.0, "loss/logits": 0.2443506345152855, "step": 4040 }, { "epoch": 0.1263125, "grad_norm": 3.984375, "grad_norm_var": 0.6087198893229167, "learning_rate": 0.0001, "loss": 6.6535, "loss/crossentropy": 2.6730138063430786, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2289079651236534, "step": 4042 }, { "epoch": 0.126375, "grad_norm": 3.5625, "grad_norm_var": 0.60465087890625, "learning_rate": 0.0001, "loss": 6.5962, "loss/crossentropy": 2.6363285779953003, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22567617148160934, "step": 4044 }, { "epoch": 0.1264375, "grad_norm": 3.6875, "grad_norm_var": 0.6362782796223958, "learning_rate": 0.0001, "loss": 6.6012, "loss/crossentropy": 2.7529706954956055, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2164599373936653, "step": 4046 }, { "epoch": 0.1265, "grad_norm": 3.71875, "grad_norm_var": 0.07694905598958333, "learning_rate": 0.0001, "loss": 6.2872, "loss/crossentropy": 2.568008065223694, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20707439631223679, "step": 4048 }, { "epoch": 0.1265625, "grad_norm": 4.03125, "grad_norm_var": 0.06897379557291666, "learning_rate": 0.0001, "loss": 6.7791, "loss/crossentropy": 2.7470881938934326, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.22820251435041428, "step": 4050 }, { "epoch": 0.126625, "grad_norm": 3.921875, "grad_norm_var": 0.06415913899739584, "learning_rate": 0.0001, "loss": 6.6669, "loss/crossentropy": 2.7223349809646606, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2229759618639946, "step": 4052 }, { "epoch": 0.1266875, "grad_norm": 4.0625, "grad_norm_var": 0.06628316243489583, "learning_rate": 0.0001, "loss": 6.6013, "loss/crossentropy": 2.7354965209960938, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21587659418582916, "step": 4054 }, { "epoch": 0.12675, "grad_norm": 4.09375, "grad_norm_var": 0.05424702962239583, "learning_rate": 0.0001, "loss": 6.8119, "loss/crossentropy": 2.8012603521347046, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22879604250192642, "step": 4056 }, { "epoch": 0.1268125, "grad_norm": 4.125, "grad_norm_var": 0.05943603515625, "learning_rate": 0.0001, "loss": 6.4312, "loss/crossentropy": 2.5267443656921387, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21817763149738312, "step": 4058 }, { "epoch": 0.126875, "grad_norm": 3.546875, "grad_norm_var": 0.0608306884765625, "learning_rate": 0.0001, "loss": 6.6004, "loss/crossentropy": 2.601567268371582, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.2241053283214569, "step": 4060 }, { "epoch": 0.1269375, "grad_norm": 4.21875, "grad_norm_var": 0.05134989420572917, "learning_rate": 0.0001, "loss": 6.7407, "loss/crossentropy": 2.706727147102356, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2307443767786026, "step": 4062 }, { "epoch": 0.127, "grad_norm": 3.578125, "grad_norm_var": 0.035868326822916664, "learning_rate": 0.0001, "loss": 6.2964, "loss/crossentropy": 2.4649651050567627, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21400236338377, "step": 4064 }, { "epoch": 0.1270625, "grad_norm": 3.703125, "grad_norm_var": 0.04338785807291667, "learning_rate": 0.0001, "loss": 6.561, "loss/crossentropy": 2.6405688524246216, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22134052217006683, "step": 4066 }, { "epoch": 0.127125, "grad_norm": 3.921875, "grad_norm_var": 0.04312744140625, "learning_rate": 0.0001, "loss": 6.7403, "loss/crossentropy": 2.7473138570785522, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.228990375995636, "step": 4068 }, { "epoch": 0.1271875, "grad_norm": 3.984375, "grad_norm_var": 0.04299214680989583, "learning_rate": 0.0001, "loss": 6.701, "loss/crossentropy": 2.588662028312683, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23818998783826828, "step": 4070 }, { "epoch": 0.12725, "grad_norm": 3.828125, "grad_norm_var": 0.050593058268229164, "learning_rate": 0.0001, "loss": 6.6683, "loss/crossentropy": 2.6175063848495483, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2296886220574379, "step": 4072 }, { "epoch": 0.1273125, "grad_norm": 3.765625, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 6.4641, "loss/crossentropy": 2.5841997861862183, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.216502346098423, "step": 4074 }, { "epoch": 0.127375, "grad_norm": 4.65625, "grad_norm_var": 0.07437744140625, "learning_rate": 0.0001, "loss": 6.6996, "loss/crossentropy": 2.647361397743225, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23296315222978592, "step": 4076 }, { "epoch": 0.1274375, "grad_norm": 3.625, "grad_norm_var": 0.07527669270833333, "learning_rate": 0.0001, "loss": 6.6153, "loss/crossentropy": 2.6356441974639893, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22570262849330902, "step": 4078 }, { "epoch": 0.1275, "grad_norm": 4.46875, "grad_norm_var": 0.0843658447265625, "learning_rate": 0.0001, "loss": 6.5986, "loss/crossentropy": 2.5087159872055054, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2359411045908928, "step": 4080 }, { "epoch": 0.1275625, "grad_norm": 3.921875, "grad_norm_var": 0.07222391764322916, "learning_rate": 0.0001, "loss": 6.2928, "loss/crossentropy": 2.4719302654266357, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.20865336060523987, "step": 4082 }, { "epoch": 0.127625, "grad_norm": 11.875, "grad_norm_var": 3.9582590738932293, "learning_rate": 0.0001, "loss": 6.6642, "loss/crossentropy": 2.618984341621399, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2365577220916748, "step": 4084 }, { "epoch": 0.1276875, "grad_norm": 4.03125, "grad_norm_var": 3.9661936442057293, "learning_rate": 0.0001, "loss": 6.4982, "loss/crossentropy": 2.5682222843170166, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2207360565662384, "step": 4086 }, { "epoch": 0.12775, "grad_norm": 3.828125, "grad_norm_var": 3.9713205973307293, "learning_rate": 0.0001, "loss": 6.3929, "loss/crossentropy": 2.5119932889938354, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21894660592079163, "step": 4088 }, { "epoch": 0.1278125, "grad_norm": 3.671875, "grad_norm_var": 3.978327433268229, "learning_rate": 0.0001, "loss": 6.4042, "loss/crossentropy": 2.5663267374038696, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2173830345273018, "step": 4090 }, { "epoch": 0.127875, "grad_norm": 4.1875, "grad_norm_var": 4.021939086914062, "learning_rate": 0.0001, "loss": 6.4591, "loss/crossentropy": 2.5778207778930664, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2201596274971962, "step": 4092 }, { "epoch": 0.1279375, "grad_norm": 3.484375, "grad_norm_var": 4.07906494140625, "learning_rate": 0.0001, "loss": 6.2625, "loss/crossentropy": 2.538253903388977, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2040649726986885, "step": 4094 }, { "epoch": 0.128, "grad_norm": 4.0625, "grad_norm_var": 4.089891560872396, "learning_rate": 0.0001, "loss": 6.6743, "loss/crossentropy": 2.7568410634994507, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22143083810806274, "step": 4096 }, { "epoch": 0.1280625, "grad_norm": 3.609375, "grad_norm_var": 4.11890869140625, "learning_rate": 0.0001, "loss": 6.4438, "loss/crossentropy": 2.6530957221984863, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21070635318756104, "step": 4098 }, { "epoch": 0.128125, "grad_norm": 3.90625, "grad_norm_var": 0.07829488118489583, "learning_rate": 0.0001, "loss": 6.6074, "loss/crossentropy": 2.625568151473999, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22631164640188217, "step": 4100 }, { "epoch": 0.1281875, "grad_norm": 4.125, "grad_norm_var": 0.08401285807291667, "learning_rate": 0.0001, "loss": 6.5135, "loss/crossentropy": 2.6141321659088135, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2192334309220314, "step": 4102 }, { "epoch": 0.12825, "grad_norm": 4.03125, "grad_norm_var": 0.0801422119140625, "learning_rate": 0.0001, "loss": 6.7171, "loss/crossentropy": 2.7333229780197144, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.23002268373966217, "step": 4104 }, { "epoch": 0.1283125, "grad_norm": 3.4375, "grad_norm_var": 0.09021809895833334, "learning_rate": 0.0001, "loss": 6.4842, "loss/crossentropy": 2.6517757177352905, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21566757559776306, "step": 4106 }, { "epoch": 0.128375, "grad_norm": 3.625, "grad_norm_var": 0.08058268229166667, "learning_rate": 0.0001, "loss": 6.4124, "loss/crossentropy": 2.5258562564849854, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21912791579961777, "step": 4108 }, { "epoch": 0.1284375, "grad_norm": 3.9375, "grad_norm_var": 0.07349344889322916, "learning_rate": 0.0001, "loss": 6.4231, "loss/crossentropy": 2.5245391130447388, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22032760083675385, "step": 4110 }, { "epoch": 0.1285, "grad_norm": 3.84375, "grad_norm_var": 0.06735026041666667, "learning_rate": 0.0001, "loss": 6.5448, "loss/crossentropy": 2.599327564239502, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2211119681596756, "step": 4112 }, { "epoch": 0.1285625, "grad_norm": 3.59375, "grad_norm_var": 0.06770426432291667, "learning_rate": 0.0001, "loss": 6.4937, "loss/crossentropy": 2.6408677101135254, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21536342799663544, "step": 4114 }, { "epoch": 0.128625, "grad_norm": 3.578125, "grad_norm_var": 0.0384674072265625, "learning_rate": 0.0001, "loss": 6.4221, "loss/crossentropy": 2.54049289226532, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.22175266593694687, "step": 4116 }, { "epoch": 0.1286875, "grad_norm": 3.5625, "grad_norm_var": 0.03052978515625, "learning_rate": 0.0001, "loss": 6.659, "loss/crossentropy": 2.755501627922058, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2204309031367302, "step": 4118 }, { "epoch": 0.12875, "grad_norm": 4.625, "grad_norm_var": 0.19879150390625, "learning_rate": 0.0001, "loss": 6.7089, "loss/crossentropy": 2.5334556102752686, "loss/hidden": 1.8046875, "loss/jsd": 0.0, "loss/logits": 0.2370723932981491, "step": 4120 }, { "epoch": 0.1288125, "grad_norm": 4.40625, "grad_norm_var": 0.2015533447265625, "learning_rate": 0.0001, "loss": 6.4647, "loss/crossentropy": 2.4602530002593994, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22739624977111816, "step": 4122 }, { "epoch": 0.128875, "grad_norm": 3.453125, "grad_norm_var": 0.20187886555989584, "learning_rate": 0.0001, "loss": 6.684, "loss/crossentropy": 2.7768908739089966, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22195608168840408, "step": 4124 }, { "epoch": 0.1289375, "grad_norm": 3.640625, "grad_norm_var": 0.20279541015625, "learning_rate": 0.0001, "loss": 6.5967, "loss/crossentropy": 2.624576449394226, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22690071165561676, "step": 4126 }, { "epoch": 0.129, "grad_norm": 4.15625, "grad_norm_var": 0.20142822265625, "learning_rate": 0.0001, "loss": 6.0872, "loss/crossentropy": 2.3337572813034058, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20541912317276, "step": 4128 }, { "epoch": 0.1290625, "grad_norm": 4.09375, "grad_norm_var": 0.5465128580729167, "learning_rate": 0.0001, "loss": 6.8517, "loss/crossentropy": 2.7883676290512085, "loss/hidden": 1.77734375, "loss/jsd": 0.0, "loss/logits": 0.22859449684619904, "step": 4130 }, { "epoch": 0.129125, "grad_norm": 3.796875, "grad_norm_var": 0.52691650390625, "learning_rate": 0.0001, "loss": 6.9007, "loss/crossentropy": 2.802171230316162, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23485445231199265, "step": 4132 }, { "epoch": 0.1291875, "grad_norm": 4.78125, "grad_norm_var": 0.5078521728515625, "learning_rate": 0.0001, "loss": 6.6574, "loss/crossentropy": 2.6423803567886353, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22728128731250763, "step": 4134 }, { "epoch": 0.12925, "grad_norm": 3.375, "grad_norm_var": 0.4954254150390625, "learning_rate": 0.0001, "loss": 6.2662, "loss/crossentropy": 2.5183045864105225, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20916834473609924, "step": 4136 }, { "epoch": 0.1293125, "grad_norm": 4.0625, "grad_norm_var": 0.48957417805989584, "learning_rate": 0.0001, "loss": 6.5832, "loss/crossentropy": 2.6430411338806152, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22097034752368927, "step": 4138 }, { "epoch": 0.129375, "grad_norm": 3.953125, "grad_norm_var": 0.4623687744140625, "learning_rate": 0.0001, "loss": 6.8817, "loss/crossentropy": 2.7385300397872925, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2416619285941124, "step": 4140 }, { "epoch": 0.1294375, "grad_norm": 3.671875, "grad_norm_var": 0.47001953125, "learning_rate": 0.0001, "loss": 6.2309, "loss/crossentropy": 2.5208064317703247, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20694401860237122, "step": 4142 }, { "epoch": 0.1295, "grad_norm": 3.734375, "grad_norm_var": 0.4915852864583333, "learning_rate": 0.0001, "loss": 6.3752, "loss/crossentropy": 2.5856963396072388, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21332371979951859, "step": 4144 }, { "epoch": 0.1295625, "grad_norm": 3.890625, "grad_norm_var": 0.10068257649739583, "learning_rate": 0.0001, "loss": 6.302, "loss/crossentropy": 2.464065670967102, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2130947783589363, "step": 4146 }, { "epoch": 0.129625, "grad_norm": 3.625, "grad_norm_var": 0.102978515625, "learning_rate": 0.0001, "loss": 6.0577, "loss/crossentropy": 2.259469985961914, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.20638281852006912, "step": 4148 }, { "epoch": 0.1296875, "grad_norm": 4.0625, "grad_norm_var": 0.0470367431640625, "learning_rate": 0.0001, "loss": 6.3444, "loss/crossentropy": 2.5508021116256714, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.208266943693161, "step": 4150 }, { "epoch": 0.12975, "grad_norm": 3.5625, "grad_norm_var": 0.0468414306640625, "learning_rate": 0.0001, "loss": 6.8325, "loss/crossentropy": 2.7907811403274536, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.23268505930900574, "step": 4152 }, { "epoch": 0.1298125, "grad_norm": 4.40625, "grad_norm_var": 0.0658203125, "learning_rate": 0.0001, "loss": 6.7278, "loss/crossentropy": 2.6292165517807007, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23759499937295914, "step": 4154 }, { "epoch": 0.129875, "grad_norm": 4.09375, "grad_norm_var": 0.07007548014322916, "learning_rate": 0.0001, "loss": 6.8438, "loss/crossentropy": 2.810207486152649, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.23031456023454666, "step": 4156 }, { "epoch": 0.1299375, "grad_norm": 3.625, "grad_norm_var": 0.07054036458333333, "learning_rate": 0.0001, "loss": 6.3715, "loss/crossentropy": 2.574985980987549, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21402358263731003, "step": 4158 }, { "epoch": 0.13, "grad_norm": 3.625, "grad_norm_var": 0.07649332682291667, "learning_rate": 0.0001, "loss": 6.3259, "loss/crossentropy": 2.5313199758529663, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21266181766986847, "step": 4160 }, { "epoch": 0.1300625, "grad_norm": 3.859375, "grad_norm_var": 0.07180887858072917, "learning_rate": 0.0001, "loss": 6.4371, "loss/crossentropy": 2.6102746725082397, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21354027092456818, "step": 4162 }, { "epoch": 0.130125, "grad_norm": 3.53125, "grad_norm_var": 0.07256571451822917, "learning_rate": 0.0001, "loss": 6.482, "loss/crossentropy": 2.6907159090042114, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.20959646999835968, "step": 4164 }, { "epoch": 0.1301875, "grad_norm": 4.125, "grad_norm_var": 0.07476806640625, "learning_rate": 0.0001, "loss": 6.3805, "loss/crossentropy": 2.4660550355911255, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.22347452491521835, "step": 4166 }, { "epoch": 0.13025, "grad_norm": 4.03125, "grad_norm_var": 0.07359619140625, "learning_rate": 0.0001, "loss": 6.7581, "loss/crossentropy": 2.7357442378997803, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22958087921142578, "step": 4168 }, { "epoch": 0.1303125, "grad_norm": 3.46875, "grad_norm_var": 0.12799479166666666, "learning_rate": 0.0001, "loss": 6.0945, "loss/crossentropy": 2.3649386167526245, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20498840510845184, "step": 4170 }, { "epoch": 0.130375, "grad_norm": 3.71875, "grad_norm_var": 0.13772684733072918, "learning_rate": 0.0001, "loss": 6.4307, "loss/crossentropy": 2.4999207258224487, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21886222064495087, "step": 4172 }, { "epoch": 0.1304375, "grad_norm": 6.28125, "grad_norm_var": 0.500537109375, "learning_rate": 0.0001, "loss": 6.2569, "loss/crossentropy": 2.4742400646209717, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2157633751630783, "step": 4174 }, { "epoch": 0.1305, "grad_norm": 3.640625, "grad_norm_var": 0.48405659993489586, "learning_rate": 0.0001, "loss": 6.3167, "loss/crossentropy": 2.4573535919189453, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2179667130112648, "step": 4176 }, { "epoch": 0.1305625, "grad_norm": 3.921875, "grad_norm_var": 0.47384440104166664, "learning_rate": 0.0001, "loss": 6.4765, "loss/crossentropy": 2.6034940481185913, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2177659273147583, "step": 4178 }, { "epoch": 0.130625, "grad_norm": 3.859375, "grad_norm_var": 0.4401326497395833, "learning_rate": 0.0001, "loss": 6.6969, "loss/crossentropy": 2.6809717416763306, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22854258120059967, "step": 4180 }, { "epoch": 0.1306875, "grad_norm": 3.96875, "grad_norm_var": 0.44468994140625, "learning_rate": 0.0001, "loss": 6.9014, "loss/crossentropy": 2.9240981340408325, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22624869644641876, "step": 4182 }, { "epoch": 0.13075, "grad_norm": 3.703125, "grad_norm_var": 0.4532704671223958, "learning_rate": 0.0001, "loss": 6.1262, "loss/crossentropy": 2.3521162271499634, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2070925459265709, "step": 4184 }, { "epoch": 0.1308125, "grad_norm": 3.546875, "grad_norm_var": 0.41184794108072914, "learning_rate": 0.0001, "loss": 6.2821, "loss/crossentropy": 2.519057512283325, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20872611552476883, "step": 4186 }, { "epoch": 0.130875, "grad_norm": 3.703125, "grad_norm_var": 0.4066243489583333, "learning_rate": 0.0001, "loss": 6.5832, "loss/crossentropy": 2.6278737783432007, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22521910816431046, "step": 4188 }, { "epoch": 0.1309375, "grad_norm": 4.0, "grad_norm_var": 0.034440104166666666, "learning_rate": 0.0001, "loss": 6.9223, "loss/crossentropy": 2.7981791496276855, "loss/hidden": 1.78125, "loss/jsd": 0.0, "loss/logits": 0.23429103940725327, "step": 4190 }, { "epoch": 0.131, "grad_norm": 3.984375, "grad_norm_var": 0.033919270833333334, "learning_rate": 0.0001, "loss": 6.48, "loss/crossentropy": 2.611246943473816, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2153891921043396, "step": 4192 }, { "epoch": 0.1310625, "grad_norm": 4.6875, "grad_norm_var": 0.07752278645833334, "learning_rate": 0.0001, "loss": 6.7144, "loss/crossentropy": 2.749796986579895, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.2253640741109848, "step": 4194 }, { "epoch": 0.131125, "grad_norm": 3.734375, "grad_norm_var": 0.08818257649739583, "learning_rate": 0.0001, "loss": 6.3536, "loss/crossentropy": 2.5561606884002686, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21177595853805542, "step": 4196 }, { "epoch": 0.1311875, "grad_norm": 4.03125, "grad_norm_var": 0.08901265462239584, "learning_rate": 0.0001, "loss": 6.313, "loss/crossentropy": 2.41589617729187, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21978683024644852, "step": 4198 }, { "epoch": 0.13125, "grad_norm": 3.859375, "grad_norm_var": 0.09361572265625, "learning_rate": 0.0001, "loss": 6.2272, "loss/crossentropy": 2.4662758111953735, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20538844913244247, "step": 4200 }, { "epoch": 0.1313125, "grad_norm": 3.796875, "grad_norm_var": 0.09602864583333333, "learning_rate": 0.0001, "loss": 6.4304, "loss/crossentropy": 2.605375051498413, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2141440585255623, "step": 4202 }, { "epoch": 0.131375, "grad_norm": 3.671875, "grad_norm_var": 0.09169514973958333, "learning_rate": 0.0001, "loss": 6.4819, "loss/crossentropy": 2.5840961933135986, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21673616021871567, "step": 4204 }, { "epoch": 0.1314375, "grad_norm": 3.6875, "grad_norm_var": 0.08472900390625, "learning_rate": 0.0001, "loss": 6.5872, "loss/crossentropy": 2.652251124382019, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21966180205345154, "step": 4206 }, { "epoch": 0.1315, "grad_norm": 3.359375, "grad_norm_var": 0.0946197509765625, "learning_rate": 0.0001, "loss": 6.3679, "loss/crossentropy": 2.7024388313293457, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.19545652717351913, "step": 4208 }, { "epoch": 0.1315625, "grad_norm": 3.78125, "grad_norm_var": 0.034012858072916666, "learning_rate": 0.0001, "loss": 6.2644, "loss/crossentropy": 2.55469286441803, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20300530642271042, "step": 4210 }, { "epoch": 0.131625, "grad_norm": 3.6875, "grad_norm_var": 0.036799112955729164, "learning_rate": 0.0001, "loss": 6.7124, "loss/crossentropy": 2.8051246404647827, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2215913087129593, "step": 4212 }, { "epoch": 0.1316875, "grad_norm": 4.9375, "grad_norm_var": 0.12801005045572916, "learning_rate": 0.0001, "loss": 6.2274, "loss/crossentropy": 2.3926981687545776, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21198853850364685, "step": 4214 }, { "epoch": 0.13175, "grad_norm": 3.953125, "grad_norm_var": 0.12809244791666666, "learning_rate": 0.0001, "loss": 6.4395, "loss/crossentropy": 2.504998803138733, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22040077298879623, "step": 4216 }, { "epoch": 0.1318125, "grad_norm": 3.921875, "grad_norm_var": 0.12309468587239583, "learning_rate": 0.0001, "loss": 6.483, "loss/crossentropy": 2.5757195949554443, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2208108976483345, "step": 4218 }, { "epoch": 0.131875, "grad_norm": 3.5, "grad_norm_var": 0.12935282389322916, "learning_rate": 0.0001, "loss": 6.3703, "loss/crossentropy": 2.5360913276672363, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2162364348769188, "step": 4220 }, { "epoch": 0.1319375, "grad_norm": 4.15625, "grad_norm_var": 0.13492838541666666, "learning_rate": 0.0001, "loss": 6.4147, "loss/crossentropy": 2.4870160818099976, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.21854856610298157, "step": 4222 }, { "epoch": 0.132, "grad_norm": 4.3125, "grad_norm_var": 0.1375, "learning_rate": 0.0001, "loss": 6.8119, "loss/crossentropy": 2.793803095817566, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2283768579363823, "step": 4224 }, { "epoch": 0.1320625, "grad_norm": 4.28125, "grad_norm_var": 0.14158528645833332, "learning_rate": 0.0001, "loss": 6.9475, "loss/crossentropy": 2.823140859603882, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2393893525004387, "step": 4226 }, { "epoch": 0.132125, "grad_norm": 4.25, "grad_norm_var": 0.14159749348958334, "learning_rate": 0.0001, "loss": 6.4212, "loss/crossentropy": 2.466855525970459, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22121071815490723, "step": 4228 }, { "epoch": 0.1321875, "grad_norm": 3.9375, "grad_norm_var": 0.06571858723958333, "learning_rate": 0.0001, "loss": 6.1528, "loss/crossentropy": 2.2941900491714478, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2139892429113388, "step": 4230 }, { "epoch": 0.13225, "grad_norm": 3.875, "grad_norm_var": 0.06431376139322917, "learning_rate": 0.0001, "loss": 6.3381, "loss/crossentropy": 2.562358021736145, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20882709324359894, "step": 4232 }, { "epoch": 0.1323125, "grad_norm": 3.96875, "grad_norm_var": 0.06397196451822916, "learning_rate": 0.0001, "loss": 6.0432, "loss/crossentropy": 2.2380497455596924, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20981623232364655, "step": 4234 }, { "epoch": 0.132375, "grad_norm": 4.1875, "grad_norm_var": 0.05945638020833333, "learning_rate": 0.0001, "loss": 6.8093, "loss/crossentropy": 2.7688835859298706, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.23021194338798523, "step": 4236 }, { "epoch": 0.1324375, "grad_norm": 3.484375, "grad_norm_var": 0.07353108723958333, "learning_rate": 0.0001, "loss": 6.1162, "loss/crossentropy": 2.437235474586487, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.2030561864376068, "step": 4238 }, { "epoch": 0.1325, "grad_norm": 3.703125, "grad_norm_var": 0.052718098958333334, "learning_rate": 0.0001, "loss": 6.4468, "loss/crossentropy": 2.6158939599990845, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21629557013511658, "step": 4240 }, { "epoch": 0.1325625, "grad_norm": 3.859375, "grad_norm_var": 0.04348551432291667, "learning_rate": 0.0001, "loss": 6.4783, "loss/crossentropy": 2.6150516271591187, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21835701167583466, "step": 4242 }, { "epoch": 0.132625, "grad_norm": 3.703125, "grad_norm_var": 0.037596638997395834, "learning_rate": 0.0001, "loss": 6.3232, "loss/crossentropy": 2.542102098464966, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21014492213726044, "step": 4244 }, { "epoch": 0.1326875, "grad_norm": 3.75, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 6.7724, "loss/crossentropy": 2.832777500152588, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22364888340234756, "step": 4246 }, { "epoch": 0.13275, "grad_norm": 3.671875, "grad_norm_var": 0.0381744384765625, "learning_rate": 0.0001, "loss": 6.4954, "loss/crossentropy": 2.6658281087875366, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21499013900756836, "step": 4248 }, { "epoch": 0.1328125, "grad_norm": 3.703125, "grad_norm_var": 0.03746337890625, "learning_rate": 0.0001, "loss": 6.6817, "loss/crossentropy": 2.795152187347412, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.22029437124729156, "step": 4250 }, { "epoch": 0.132875, "grad_norm": 3.71875, "grad_norm_var": 0.0105377197265625, "learning_rate": 0.0001, "loss": 6.5964, "loss/crossentropy": 2.6854653358459473, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22233900427818298, "step": 4252 }, { "epoch": 0.1329375, "grad_norm": 3.859375, "grad_norm_var": 0.0064117431640625, "learning_rate": 0.0001, "loss": 6.7142, "loss/crossentropy": 2.7894644737243652, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22490007430315018, "step": 4254 }, { "epoch": 0.133, "grad_norm": 3.84375, "grad_norm_var": 0.006965128580729166, "learning_rate": 0.0001, "loss": 6.5515, "loss/crossentropy": 2.6032466888427734, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22568415105342865, "step": 4256 }, { "epoch": 0.1330625, "grad_norm": 3.546875, "grad_norm_var": 0.009447224934895833, "learning_rate": 0.0001, "loss": 6.6607, "loss/crossentropy": 2.729295015335083, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22438761591911316, "step": 4258 }, { "epoch": 0.133125, "grad_norm": 4.0625, "grad_norm_var": 0.014842732747395834, "learning_rate": 0.0001, "loss": 6.6664, "loss/crossentropy": 2.7235976457595825, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2239716351032257, "step": 4260 }, { "epoch": 0.1331875, "grad_norm": 3.84375, "grad_norm_var": 0.018505859375, "learning_rate": 0.0001, "loss": 6.2934, "loss/crossentropy": 2.503963351249695, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2164425626397133, "step": 4262 }, { "epoch": 0.13325, "grad_norm": 3.890625, "grad_norm_var": 0.055028279622395836, "learning_rate": 0.0001, "loss": 6.5547, "loss/crossentropy": 2.598883867263794, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22526460140943527, "step": 4264 }, { "epoch": 0.1333125, "grad_norm": 3.890625, "grad_norm_var": 0.052000935872395834, "learning_rate": 0.0001, "loss": 6.4364, "loss/crossentropy": 2.55340576171875, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.22267234325408936, "step": 4266 }, { "epoch": 0.133375, "grad_norm": 3.59375, "grad_norm_var": 0.05603841145833333, "learning_rate": 0.0001, "loss": 6.4032, "loss/crossentropy": 2.63783061504364, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2109164074063301, "step": 4268 }, { "epoch": 0.1334375, "grad_norm": 4.09375, "grad_norm_var": 0.0647369384765625, "learning_rate": 0.0001, "loss": 6.8678, "loss/crossentropy": 2.767982602119446, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.23849761486053467, "step": 4270 }, { "epoch": 0.1335, "grad_norm": 3.78125, "grad_norm_var": 0.06555989583333334, "learning_rate": 0.0001, "loss": 6.5162, "loss/crossentropy": 2.573201060295105, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22203601151704788, "step": 4272 }, { "epoch": 0.1335625, "grad_norm": 3.453125, "grad_norm_var": 0.07156473795572917, "learning_rate": 0.0001, "loss": 6.5034, "loss/crossentropy": 2.6283360719680786, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21602189540863037, "step": 4274 }, { "epoch": 0.133625, "grad_norm": 4.0, "grad_norm_var": 0.071044921875, "learning_rate": 0.0001, "loss": 6.3222, "loss/crossentropy": 2.5181901454925537, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21126443147659302, "step": 4276 }, { "epoch": 0.1336875, "grad_norm": 3.984375, "grad_norm_var": 0.06546223958333333, "learning_rate": 0.0001, "loss": 6.9159, "loss/crossentropy": 2.8771623373031616, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2319938987493515, "step": 4278 }, { "epoch": 0.13375, "grad_norm": 4.0625, "grad_norm_var": 0.07353515625, "learning_rate": 0.0001, "loss": 6.1239, "loss/crossentropy": 2.4219969511032104, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.19753269106149673, "step": 4280 }, { "epoch": 0.1338125, "grad_norm": 3.921875, "grad_norm_var": 0.07343343098958334, "learning_rate": 0.0001, "loss": 6.3541, "loss/crossentropy": 2.4714077711105347, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21717225015163422, "step": 4282 }, { "epoch": 0.133875, "grad_norm": 3.984375, "grad_norm_var": 0.0681640625, "learning_rate": 0.0001, "loss": 6.5769, "loss/crossentropy": 2.685715436935425, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2188032567501068, "step": 4284 }, { "epoch": 0.1339375, "grad_norm": 4.28125, "grad_norm_var": 0.07177327473958334, "learning_rate": 0.0001, "loss": 6.4156, "loss/crossentropy": 2.4861207008361816, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22067828476428986, "step": 4286 }, { "epoch": 0.134, "grad_norm": 4.21875, "grad_norm_var": 0.0796539306640625, "learning_rate": 0.0001, "loss": 6.3744, "loss/crossentropy": 2.5076987743377686, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21362122148275375, "step": 4288 }, { "epoch": 0.1340625, "grad_norm": 4.09375, "grad_norm_var": 0.11110026041666667, "learning_rate": 0.0001, "loss": 6.7546, "loss/crossentropy": 2.7254503965377808, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.23260141164064407, "step": 4290 }, { "epoch": 0.134125, "grad_norm": 3.578125, "grad_norm_var": 0.13523661295572917, "learning_rate": 0.0001, "loss": 6.1764, "loss/crossentropy": 2.425421357154846, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20986061543226242, "step": 4292 }, { "epoch": 0.1341875, "grad_norm": 4.34375, "grad_norm_var": 0.14474283854166667, "learning_rate": 0.0001, "loss": 6.5565, "loss/crossentropy": 2.553962469100952, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22681234776973724, "step": 4294 }, { "epoch": 0.13425, "grad_norm": 3.734375, "grad_norm_var": 0.11599833170572917, "learning_rate": 0.0001, "loss": 6.498, "loss/crossentropy": 2.701680064201355, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21400703489780426, "step": 4296 }, { "epoch": 0.1343125, "grad_norm": 3.71875, "grad_norm_var": 0.1501617431640625, "learning_rate": 0.0001, "loss": 6.1799, "loss/crossentropy": 2.540942668914795, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20256894826889038, "step": 4298 }, { "epoch": 0.134375, "grad_norm": 4.0, "grad_norm_var": 0.15080973307291667, "learning_rate": 0.0001, "loss": 6.5185, "loss/crossentropy": 2.6064943075180054, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21971440315246582, "step": 4300 }, { "epoch": 0.1344375, "grad_norm": 3.53125, "grad_norm_var": 0.14550374348958334, "learning_rate": 0.0001, "loss": 6.4625, "loss/crossentropy": 2.6421563625335693, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21367350220680237, "step": 4302 }, { "epoch": 0.1345, "grad_norm": 4.4375, "grad_norm_var": 0.16135660807291666, "learning_rate": 0.0001, "loss": 6.559, "loss/crossentropy": 2.6035948991775513, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2248389720916748, "step": 4304 }, { "epoch": 0.1345625, "grad_norm": 3.890625, "grad_norm_var": 0.09832255045572917, "learning_rate": 0.0001, "loss": 6.5766, "loss/crossentropy": 2.636247396469116, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22215821593999863, "step": 4306 }, { "epoch": 0.134625, "grad_norm": 4.09375, "grad_norm_var": 0.09346415201822916, "learning_rate": 0.0001, "loss": 6.3825, "loss/crossentropy": 2.516153335571289, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2174898460507393, "step": 4308 }, { "epoch": 0.1346875, "grad_norm": 3.703125, "grad_norm_var": 0.07429911295572916, "learning_rate": 0.0001, "loss": 6.1787, "loss/crossentropy": 2.4977835416793823, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20247140526771545, "step": 4310 }, { "epoch": 0.13475, "grad_norm": 3.640625, "grad_norm_var": 0.07258707682291667, "learning_rate": 0.0001, "loss": 6.7702, "loss/crossentropy": 2.7508914470672607, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.2304483950138092, "step": 4312 }, { "epoch": 0.1348125, "grad_norm": 3.703125, "grad_norm_var": 0.07841695149739583, "learning_rate": 0.0001, "loss": 6.3305, "loss/crossentropy": 2.513437509536743, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21295832097530365, "step": 4314 }, { "epoch": 0.134875, "grad_norm": 3.71875, "grad_norm_var": 0.07786051432291667, "learning_rate": 0.0001, "loss": 6.3279, "loss/crossentropy": 2.533980965614319, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21298405528068542, "step": 4316 }, { "epoch": 0.1349375, "grad_norm": 3.765625, "grad_norm_var": 33.87757059733073, "learning_rate": 0.0001, "loss": 6.8462, "loss/crossentropy": 2.828759551048279, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22947552800178528, "step": 4318 }, { "epoch": 0.135, "grad_norm": 3.78125, "grad_norm_var": 33.94041341145833, "learning_rate": 0.0001, "loss": 6.5928, "loss/crossentropy": 2.7170915603637695, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21686632186174393, "step": 4320 }, { "epoch": 0.1350625, "grad_norm": 3.90625, "grad_norm_var": 34.03020426432292, "learning_rate": 0.0001, "loss": 6.2161, "loss/crossentropy": 2.4180595874786377, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2118338942527771, "step": 4322 }, { "epoch": 0.135125, "grad_norm": 3.984375, "grad_norm_var": 34.01892903645833, "learning_rate": 0.0001, "loss": 6.4675, "loss/crossentropy": 2.610173225402832, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2158115953207016, "step": 4324 }, { "epoch": 0.1351875, "grad_norm": 3.96875, "grad_norm_var": 33.88642578125, "learning_rate": 0.0001, "loss": 6.534, "loss/crossentropy": 2.64306640625, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21721620857715607, "step": 4326 }, { "epoch": 0.13525, "grad_norm": 3.5, "grad_norm_var": 33.971805826822916, "learning_rate": 0.0001, "loss": 6.4392, "loss/crossentropy": 2.6547141075134277, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21243591606616974, "step": 4328 }, { "epoch": 0.1353125, "grad_norm": 3.40625, "grad_norm_var": 34.10946858723958, "learning_rate": 0.0001, "loss": 6.368, "loss/crossentropy": 2.5513360500335693, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21799319982528687, "step": 4330 }, { "epoch": 0.135375, "grad_norm": 3.765625, "grad_norm_var": 34.01167704264323, "learning_rate": 0.0001, "loss": 6.8963, "loss/crossentropy": 2.9096730947494507, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22952109575271606, "step": 4332 }, { "epoch": 0.1354375, "grad_norm": 4.125, "grad_norm_var": 0.07390034993489583, "learning_rate": 0.0001, "loss": 6.5352, "loss/crossentropy": 2.6305580139160156, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.22366870939731598, "step": 4334 }, { "epoch": 0.1355, "grad_norm": 3.828125, "grad_norm_var": 0.0769927978515625, "learning_rate": 0.0001, "loss": 6.5557, "loss/crossentropy": 2.622426748275757, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.2198849767446518, "step": 4336 }, { "epoch": 0.1355625, "grad_norm": 4.125, "grad_norm_var": 0.0683013916015625, "learning_rate": 0.0001, "loss": 6.7091, "loss/crossentropy": 2.711685061454773, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.2290424406528473, "step": 4338 }, { "epoch": 0.135625, "grad_norm": 4.5625, "grad_norm_var": 0.09519856770833333, "learning_rate": 0.0001, "loss": 6.6465, "loss/crossentropy": 2.6272358894348145, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.22731468826532364, "step": 4340 }, { "epoch": 0.1356875, "grad_norm": 4.28125, "grad_norm_var": 0.102099609375, "learning_rate": 0.0001, "loss": 6.796, "loss/crossentropy": 2.6994833946228027, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.23465242981910706, "step": 4342 }, { "epoch": 0.13575, "grad_norm": 3.921875, "grad_norm_var": 0.08208719889322917, "learning_rate": 0.0001, "loss": 6.2886, "loss/crossentropy": 2.5074750185012817, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21014603972434998, "step": 4344 }, { "epoch": 0.1358125, "grad_norm": 4.0625, "grad_norm_var": 0.06747945149739583, "learning_rate": 0.0001, "loss": 6.5879, "loss/crossentropy": 2.709128260612488, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.22108222544193268, "step": 4346 }, { "epoch": 0.135875, "grad_norm": 3.734375, "grad_norm_var": 0.05568745930989583, "learning_rate": 0.0001, "loss": 6.3625, "loss/crossentropy": 2.6139813661575317, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21040183305740356, "step": 4348 }, { "epoch": 0.1359375, "grad_norm": 3.671875, "grad_norm_var": 0.0558502197265625, "learning_rate": 0.0001, "loss": 6.5649, "loss/crossentropy": 2.662161111831665, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21800871938467026, "step": 4350 }, { "epoch": 0.136, "grad_norm": 3.875, "grad_norm_var": 0.0582672119140625, "learning_rate": 0.0001, "loss": 6.2154, "loss/crossentropy": 2.428389310836792, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2119050994515419, "step": 4352 }, { "epoch": 0.1360625, "grad_norm": 4.0625, "grad_norm_var": 0.07548828125, "learning_rate": 0.0001, "loss": 6.7486, "loss/crossentropy": 2.7366960048675537, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2293202206492424, "step": 4354 }, { "epoch": 0.136125, "grad_norm": 3.9375, "grad_norm_var": 0.0656890869140625, "learning_rate": 0.0001, "loss": 6.259, "loss/crossentropy": 2.418584704399109, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21295125782489777, "step": 4356 }, { "epoch": 0.1361875, "grad_norm": 3.515625, "grad_norm_var": 0.06562093098958334, "learning_rate": 0.0001, "loss": 6.5373, "loss/crossentropy": 2.6187655925750732, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22232334315776825, "step": 4358 }, { "epoch": 0.13625, "grad_norm": 3.765625, "grad_norm_var": 0.07109375, "learning_rate": 0.0001, "loss": 6.6928, "loss/crossentropy": 2.7608814239501953, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2228756546974182, "step": 4360 }, { "epoch": 0.1363125, "grad_norm": 4.1875, "grad_norm_var": 0.07258199055989584, "learning_rate": 0.0001, "loss": 6.7529, "loss/crossentropy": 2.7711809873580933, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22668741643428802, "step": 4362 }, { "epoch": 0.136375, "grad_norm": 4.09375, "grad_norm_var": 0.0748931884765625, "learning_rate": 0.0001, "loss": 6.5502, "loss/crossentropy": 2.643489956855774, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2211436778306961, "step": 4364 }, { "epoch": 0.1364375, "grad_norm": 4.03125, "grad_norm_var": 0.06730855305989583, "learning_rate": 0.0001, "loss": 6.4432, "loss/crossentropy": 2.633753180503845, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.20984596014022827, "step": 4366 }, { "epoch": 0.1365, "grad_norm": 3.90625, "grad_norm_var": 0.0697906494140625, "learning_rate": 0.0001, "loss": 6.2885, "loss/crossentropy": 2.534524083137512, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2093779295682907, "step": 4368 }, { "epoch": 0.1365625, "grad_norm": 3.625, "grad_norm_var": 0.04158426920572917, "learning_rate": 0.0001, "loss": 6.0191, "loss/crossentropy": 2.3582180738449097, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20241554081439972, "step": 4370 }, { "epoch": 0.136625, "grad_norm": 3.5, "grad_norm_var": 0.04755452473958333, "learning_rate": 0.0001, "loss": 6.387, "loss/crossentropy": 2.5663875341415405, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2140897959470749, "step": 4372 }, { "epoch": 0.1366875, "grad_norm": 3.6875, "grad_norm_var": 0.04810282389322917, "learning_rate": 0.0001, "loss": 6.2828, "loss/crossentropy": 2.5431841611862183, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20833703875541687, "step": 4374 }, { "epoch": 0.13675, "grad_norm": 4.65625, "grad_norm_var": 0.10110270182291667, "learning_rate": 0.0001, "loss": 6.6385, "loss/crossentropy": 2.7397044897079468, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22112590074539185, "step": 4376 }, { "epoch": 0.1368125, "grad_norm": 4.0, "grad_norm_var": 0.24049479166666668, "learning_rate": 0.0001, "loss": 6.3941, "loss/crossentropy": 2.472692608833313, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22261205315589905, "step": 4378 }, { "epoch": 0.136875, "grad_norm": 4.03125, "grad_norm_var": 0.23801676432291666, "learning_rate": 0.0001, "loss": 6.4296, "loss/crossentropy": 2.52515184879303, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21974552422761917, "step": 4380 }, { "epoch": 0.1369375, "grad_norm": 3.796875, "grad_norm_var": 0.23893229166666666, "learning_rate": 0.0001, "loss": 6.2777, "loss/crossentropy": 2.4887278079986572, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2101452499628067, "step": 4382 }, { "epoch": 0.137, "grad_norm": 3.828125, "grad_norm_var": 0.23479410807291667, "learning_rate": 0.0001, "loss": 6.4066, "loss/crossentropy": 2.611871361732483, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2107202634215355, "step": 4384 }, { "epoch": 0.1370625, "grad_norm": 3.484375, "grad_norm_var": 0.2543690999348958, "learning_rate": 0.0001, "loss": 6.4682, "loss/crossentropy": 2.648727774620056, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2163223773241043, "step": 4386 }, { "epoch": 0.137125, "grad_norm": 3.875, "grad_norm_var": 0.23957417805989584, "learning_rate": 0.0001, "loss": 6.3062, "loss/crossentropy": 2.5285497903823853, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.20822982490062714, "step": 4388 }, { "epoch": 0.1371875, "grad_norm": 3.5, "grad_norm_var": 0.23983968098958333, "learning_rate": 0.0001, "loss": 6.2979, "loss/crossentropy": 2.506072998046875, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21238084137439728, "step": 4390 }, { "epoch": 0.13725, "grad_norm": 3.8125, "grad_norm_var": 0.19384765625, "learning_rate": 0.0001, "loss": 6.5237, "loss/crossentropy": 2.7000246047973633, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21361419558525085, "step": 4392 }, { "epoch": 0.1373125, "grad_norm": 3.71875, "grad_norm_var": 0.0458892822265625, "learning_rate": 0.0001, "loss": 6.4284, "loss/crossentropy": 2.589784860610962, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2162848636507988, "step": 4394 }, { "epoch": 0.137375, "grad_norm": 3.59375, "grad_norm_var": 0.044123331705729164, "learning_rate": 0.0001, "loss": 6.1978, "loss/crossentropy": 2.410344958305359, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20960881561040878, "step": 4396 }, { "epoch": 0.1374375, "grad_norm": 4.375, "grad_norm_var": 0.07614644368489583, "learning_rate": 0.0001, "loss": 6.5374, "loss/crossentropy": 2.6627117395401, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21676111966371536, "step": 4398 }, { "epoch": 0.1375, "grad_norm": 3.921875, "grad_norm_var": 0.07752278645833334, "learning_rate": 0.0001, "loss": 6.4063, "loss/crossentropy": 2.6314616203308105, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21381668746471405, "step": 4400 }, { "epoch": 0.1375625, "grad_norm": 3.828125, "grad_norm_var": 0.0466949462890625, "learning_rate": 0.0001, "loss": 6.4063, "loss/crossentropy": 2.54606032371521, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21532301604747772, "step": 4402 }, { "epoch": 0.137625, "grad_norm": 4.03125, "grad_norm_var": 0.06516825358072917, "learning_rate": 0.0001, "loss": 6.7664, "loss/crossentropy": 2.695158004760742, "loss/hidden": 1.74609375, "loss/jsd": 0.0, "loss/logits": 0.23251530528068542, "step": 4404 }, { "epoch": 0.1376875, "grad_norm": 3.671875, "grad_norm_var": 0.059178670247395836, "learning_rate": 0.0001, "loss": 6.5929, "loss/crossentropy": 2.7052929401397705, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21922975778579712, "step": 4406 }, { "epoch": 0.13775, "grad_norm": 3.640625, "grad_norm_var": 0.08063863118489584, "learning_rate": 0.0001, "loss": 6.2633, "loss/crossentropy": 2.559746026992798, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20707596093416214, "step": 4408 }, { "epoch": 0.1378125, "grad_norm": 4.59375, "grad_norm_var": 0.1199127197265625, "learning_rate": 0.0001, "loss": 6.0566, "loss/crossentropy": 2.4197880029678345, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20157083868980408, "step": 4410 }, { "epoch": 0.137875, "grad_norm": 3.796875, "grad_norm_var": 0.12034403483072917, "learning_rate": 0.0001, "loss": 6.3774, "loss/crossentropy": 2.6243380308151245, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20772488415241241, "step": 4412 }, { "epoch": 0.1379375, "grad_norm": 3.578125, "grad_norm_var": 0.11555887858072916, "learning_rate": 0.0001, "loss": 6.6581, "loss/crossentropy": 2.6697126626968384, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22696784883737564, "step": 4414 }, { "epoch": 0.138, "grad_norm": 3.921875, "grad_norm_var": 0.1158599853515625, "learning_rate": 0.0001, "loss": 6.5487, "loss/crossentropy": 2.6725574731826782, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21378755569458008, "step": 4416 }, { "epoch": 0.1380625, "grad_norm": 3.75, "grad_norm_var": 0.11702067057291667, "learning_rate": 0.0001, "loss": 6.6902, "loss/crossentropy": 2.743945837020874, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2274349480867386, "step": 4418 }, { "epoch": 0.138125, "grad_norm": 3.546875, "grad_norm_var": 0.13110249837239582, "learning_rate": 0.0001, "loss": 6.7319, "loss/crossentropy": 2.7539360523223877, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22630860656499863, "step": 4420 }, { "epoch": 0.1381875, "grad_norm": 3.828125, "grad_norm_var": 0.1296783447265625, "learning_rate": 0.0001, "loss": 6.4803, "loss/crossentropy": 2.604143738746643, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21613523364067078, "step": 4422 }, { "epoch": 0.13825, "grad_norm": 3.859375, "grad_norm_var": 0.109326171875, "learning_rate": 0.0001, "loss": 6.6886, "loss/crossentropy": 2.7972522974014282, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.22077278792858124, "step": 4424 }, { "epoch": 0.1383125, "grad_norm": 4.15625, "grad_norm_var": 0.080810546875, "learning_rate": 0.0001, "loss": 6.5401, "loss/crossentropy": 2.657148241996765, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.22345105558633804, "step": 4426 }, { "epoch": 0.138375, "grad_norm": 4.65625, "grad_norm_var": 0.120458984375, "learning_rate": 0.0001, "loss": 6.5473, "loss/crossentropy": 2.6493141651153564, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22065355628728867, "step": 4428 }, { "epoch": 0.1384375, "grad_norm": 3.65625, "grad_norm_var": 0.09996744791666666, "learning_rate": 0.0001, "loss": 6.2785, "loss/crossentropy": 2.490964412689209, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21000496298074722, "step": 4430 }, { "epoch": 0.1385, "grad_norm": 3.953125, "grad_norm_var": 0.11620992024739583, "learning_rate": 0.0001, "loss": 6.6311, "loss/crossentropy": 2.739372491836548, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22003226727247238, "step": 4432 }, { "epoch": 0.1385625, "grad_norm": 3.9375, "grad_norm_var": 0.11472066243489583, "learning_rate": 0.0001, "loss": 6.4536, "loss/crossentropy": 2.550296187400818, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2161109298467636, "step": 4434 }, { "epoch": 0.138625, "grad_norm": 3.828125, "grad_norm_var": 0.0882476806640625, "learning_rate": 0.0001, "loss": 6.554, "loss/crossentropy": 2.681362748146057, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21929988265037537, "step": 4436 }, { "epoch": 0.1386875, "grad_norm": 3.890625, "grad_norm_var": 0.09313151041666666, "learning_rate": 0.0001, "loss": 6.4458, "loss/crossentropy": 2.70820152759552, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20500586926937103, "step": 4438 }, { "epoch": 0.13875, "grad_norm": 4.15625, "grad_norm_var": 0.0947418212890625, "learning_rate": 0.0001, "loss": 6.2504, "loss/crossentropy": 2.5078264474868774, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20941196382045746, "step": 4440 }, { "epoch": 0.1388125, "grad_norm": 3.8125, "grad_norm_var": 0.09263407389322917, "learning_rate": 0.0001, "loss": 5.8633, "loss/crossentropy": 2.26975679397583, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19295161217451096, "step": 4442 }, { "epoch": 0.138875, "grad_norm": 3.59375, "grad_norm_var": 0.05187174479166667, "learning_rate": 0.0001, "loss": 6.3014, "loss/crossentropy": 2.5669431686401367, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2074338048696518, "step": 4444 }, { "epoch": 0.1389375, "grad_norm": 3.515625, "grad_norm_var": 0.056737263997395836, "learning_rate": 0.0001, "loss": 6.2993, "loss/crossentropy": 2.540955901145935, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21098945289850235, "step": 4446 }, { "epoch": 0.139, "grad_norm": 3.5, "grad_norm_var": 0.03548075358072917, "learning_rate": 0.0001, "loss": 6.4083, "loss/crossentropy": 2.60480535030365, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21004024147987366, "step": 4448 }, { "epoch": 0.1390625, "grad_norm": 3.796875, "grad_norm_var": 0.03355712890625, "learning_rate": 0.0001, "loss": 6.3533, "loss/crossentropy": 2.52720844745636, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.20994866639375687, "step": 4450 }, { "epoch": 0.139125, "grad_norm": 3.65625, "grad_norm_var": 0.0287506103515625, "learning_rate": 0.0001, "loss": 6.2883, "loss/crossentropy": 2.4919263124465942, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2108834832906723, "step": 4452 }, { "epoch": 0.1391875, "grad_norm": 3.40625, "grad_norm_var": 0.0337890625, "learning_rate": 0.0001, "loss": 6.3411, "loss/crossentropy": 2.643371105194092, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20687970519065857, "step": 4454 }, { "epoch": 0.13925, "grad_norm": 3.53125, "grad_norm_var": 0.0250152587890625, "learning_rate": 0.0001, "loss": 6.5259, "loss/crossentropy": 2.7266517877578735, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21351400017738342, "step": 4456 }, { "epoch": 0.1393125, "grad_norm": 3.53125, "grad_norm_var": 0.024833170572916667, "learning_rate": 0.0001, "loss": 6.1921, "loss/crossentropy": 2.515869140625, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2019999995827675, "step": 4458 }, { "epoch": 0.139375, "grad_norm": 3.6875, "grad_norm_var": 0.026520792643229166, "learning_rate": 0.0001, "loss": 6.0831, "loss/crossentropy": 2.4401614665985107, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20140717178583145, "step": 4460 }, { "epoch": 0.1394375, "grad_norm": 3.78125, "grad_norm_var": 0.026298014322916667, "learning_rate": 0.0001, "loss": 6.4829, "loss/crossentropy": 2.7296160459518433, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2124355137348175, "step": 4462 }, { "epoch": 0.1395, "grad_norm": 3.515625, "grad_norm_var": 0.025487263997395832, "learning_rate": 0.0001, "loss": 6.4402, "loss/crossentropy": 2.608121871948242, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21328146010637283, "step": 4464 }, { "epoch": 0.1395625, "grad_norm": 3.6875, "grad_norm_var": 0.024681599934895833, "learning_rate": 0.0001, "loss": 6.5292, "loss/crossentropy": 2.684123992919922, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2192751243710518, "step": 4466 }, { "epoch": 0.139625, "grad_norm": 3.765625, "grad_norm_var": 0.027620442708333335, "learning_rate": 0.0001, "loss": 6.6034, "loss/crossentropy": 2.69321072101593, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22227226942777634, "step": 4468 }, { "epoch": 0.1396875, "grad_norm": 4.28125, "grad_norm_var": 0.0414703369140625, "learning_rate": 0.0001, "loss": 6.5612, "loss/crossentropy": 2.6232060194015503, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22309193760156631, "step": 4470 }, { "epoch": 0.13975, "grad_norm": 3.375, "grad_norm_var": 0.044408162434895836, "learning_rate": 0.0001, "loss": 5.9948, "loss/crossentropy": 2.433348774909973, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19599098712205887, "step": 4472 }, { "epoch": 0.1398125, "grad_norm": 4.5, "grad_norm_var": 0.08082275390625, "learning_rate": 0.0001, "loss": 6.6228, "loss/crossentropy": 2.6278765201568604, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.22410084307193756, "step": 4474 }, { "epoch": 0.139875, "grad_norm": 3.78125, "grad_norm_var": 0.07467041015625, "learning_rate": 0.0001, "loss": 6.0025, "loss/crossentropy": 2.3112982511520386, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.1991952434182167, "step": 4476 }, { "epoch": 0.1399375, "grad_norm": 3.65625, "grad_norm_var": 0.080810546875, "learning_rate": 0.0001, "loss": 6.1531, "loss/crossentropy": 2.4080352783203125, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20654088258743286, "step": 4478 }, { "epoch": 0.14, "grad_norm": 3.671875, "grad_norm_var": 0.08090718587239583, "learning_rate": 0.0001, "loss": 6.3801, "loss/crossentropy": 2.57833468914032, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21142880618572235, "step": 4480 }, { "epoch": 0.1400625, "grad_norm": 4.28125, "grad_norm_var": 0.09670817057291667, "learning_rate": 0.0001, "loss": 6.3799, "loss/crossentropy": 2.578604817390442, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.20747198164463043, "step": 4482 }, { "epoch": 0.140125, "grad_norm": 3.625, "grad_norm_var": 0.11760965983072917, "learning_rate": 0.0001, "loss": 6.7525, "loss/crossentropy": 2.8470464944839478, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22023146599531174, "step": 4484 }, { "epoch": 0.1401875, "grad_norm": 4.1875, "grad_norm_var": 0.2569000244140625, "learning_rate": 0.0001, "loss": 6.3493, "loss/crossentropy": 2.4870035648345947, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21826012432575226, "step": 4486 }, { "epoch": 0.14025, "grad_norm": 3.71875, "grad_norm_var": 0.2294830322265625, "learning_rate": 0.0001, "loss": 6.3122, "loss/crossentropy": 2.494701623916626, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21456186473369598, "step": 4488 }, { "epoch": 0.1403125, "grad_norm": 4.09375, "grad_norm_var": 0.20968424479166667, "learning_rate": 0.0001, "loss": 6.4447, "loss/crossentropy": 2.5576690435409546, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.218392476439476, "step": 4490 }, { "epoch": 0.140375, "grad_norm": 3.890625, "grad_norm_var": 0.20572509765625, "learning_rate": 0.0001, "loss": 6.6909, "loss/crossentropy": 2.7482335567474365, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2278623878955841, "step": 4492 }, { "epoch": 0.1404375, "grad_norm": 3.71875, "grad_norm_var": 0.18975321451822916, "learning_rate": 0.0001, "loss": 6.8476, "loss/crossentropy": 2.8149173259735107, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23099815845489502, "step": 4494 }, { "epoch": 0.1405, "grad_norm": 3.828125, "grad_norm_var": 0.1708648681640625, "learning_rate": 0.0001, "loss": 6.3192, "loss/crossentropy": 2.4586331844329834, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21535467356443405, "step": 4496 }, { "epoch": 0.1405625, "grad_norm": 3.921875, "grad_norm_var": 0.16803385416666666, "learning_rate": 0.0001, "loss": 6.4127, "loss/crossentropy": 2.5607200860977173, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21137110888957977, "step": 4498 }, { "epoch": 0.140625, "grad_norm": 5.3125, "grad_norm_var": 0.261767578125, "learning_rate": 0.0001, "loss": 6.9379, "loss/crossentropy": 2.715990900993347, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.24641267955303192, "step": 4500 }, { "epoch": 0.1406875, "grad_norm": 3.46875, "grad_norm_var": 0.1669342041015625, "learning_rate": 0.0001, "loss": 6.0838, "loss/crossentropy": 2.3416292667388916, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20937299728393555, "step": 4502 }, { "epoch": 0.14075, "grad_norm": 3.8125, "grad_norm_var": 0.1842193603515625, "learning_rate": 0.0001, "loss": 6.1107, "loss/crossentropy": 2.399893641471863, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20779887586832047, "step": 4504 }, { "epoch": 0.1408125, "grad_norm": 3.578125, "grad_norm_var": 0.1912506103515625, "learning_rate": 0.0001, "loss": 6.4685, "loss/crossentropy": 2.614013910293579, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21787265688180923, "step": 4506 }, { "epoch": 0.140875, "grad_norm": 3.796875, "grad_norm_var": 0.2802398681640625, "learning_rate": 0.0001, "loss": 6.7776, "loss/crossentropy": 2.7103980779647827, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2344505339860916, "step": 4508 }, { "epoch": 0.1409375, "grad_norm": 3.90625, "grad_norm_var": 0.268994140625, "learning_rate": 0.0001, "loss": 6.4618, "loss/crossentropy": 2.5430240631103516, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22156068682670593, "step": 4510 }, { "epoch": 0.141, "grad_norm": 3.25, "grad_norm_var": 0.3018544514973958, "learning_rate": 0.0001, "loss": 6.0656, "loss/crossentropy": 2.3237478733062744, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20582488179206848, "step": 4512 }, { "epoch": 0.1410625, "grad_norm": 3.375, "grad_norm_var": 0.3292877197265625, "learning_rate": 0.0001, "loss": 6.3944, "loss/crossentropy": 2.6120007038116455, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21261148154735565, "step": 4514 }, { "epoch": 0.141125, "grad_norm": 4.03125, "grad_norm_var": 0.19192301432291667, "learning_rate": 0.0001, "loss": 6.3112, "loss/crossentropy": 2.4821159839630127, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21532998234033585, "step": 4516 }, { "epoch": 0.1411875, "grad_norm": 4.25, "grad_norm_var": 0.20214742024739582, "learning_rate": 0.0001, "loss": 6.6308, "loss/crossentropy": 2.641494393348694, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.22783225774765015, "step": 4518 }, { "epoch": 0.14125, "grad_norm": 3.890625, "grad_norm_var": 0.1877349853515625, "learning_rate": 0.0001, "loss": 6.083, "loss/crossentropy": 2.351473093032837, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20596114546060562, "step": 4520 }, { "epoch": 0.1413125, "grad_norm": 3.640625, "grad_norm_var": 0.18372294108072917, "learning_rate": 0.0001, "loss": 6.4145, "loss/crossentropy": 2.6024839878082275, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21128298342227936, "step": 4522 }, { "epoch": 0.141375, "grad_norm": 3.40625, "grad_norm_var": 0.09845377604166666, "learning_rate": 0.0001, "loss": 6.0818, "loss/crossentropy": 2.4369860887527466, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2015867829322815, "step": 4524 }, { "epoch": 0.1414375, "grad_norm": 4.125, "grad_norm_var": 0.10485026041666666, "learning_rate": 0.0001, "loss": 6.5281, "loss/crossentropy": 2.6377745866775513, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22028282284736633, "step": 4526 }, { "epoch": 0.1415, "grad_norm": 3.921875, "grad_norm_var": 0.075390625, "learning_rate": 0.0001, "loss": 6.1738, "loss/crossentropy": 2.4429298639297485, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2086314857006073, "step": 4528 }, { "epoch": 0.1415625, "grad_norm": 4.28125, "grad_norm_var": 0.08909098307291667, "learning_rate": 0.0001, "loss": 6.9688, "loss/crossentropy": 2.8351988792419434, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.23914079368114471, "step": 4530 }, { "epoch": 0.141625, "grad_norm": 3.703125, "grad_norm_var": 0.09009501139322916, "learning_rate": 0.0001, "loss": 6.333, "loss/crossentropy": 2.566150426864624, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21066495776176453, "step": 4532 }, { "epoch": 0.1416875, "grad_norm": 4.125, "grad_norm_var": 0.07337137858072916, "learning_rate": 0.0001, "loss": 6.6482, "loss/crossentropy": 2.7197688817977905, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.22448784857988358, "step": 4534 }, { "epoch": 0.14175, "grad_norm": 3.921875, "grad_norm_var": 0.0838531494140625, "learning_rate": 0.0001, "loss": 6.5358, "loss/crossentropy": 2.7000629901885986, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21873365342617035, "step": 4536 }, { "epoch": 0.1418125, "grad_norm": 3.421875, "grad_norm_var": 0.1019439697265625, "learning_rate": 0.0001, "loss": 6.2915, "loss/crossentropy": 2.5986807346343994, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20560834556818008, "step": 4538 }, { "epoch": 0.141875, "grad_norm": 3.546875, "grad_norm_var": 0.09635009765625, "learning_rate": 0.0001, "loss": 6.5066, "loss/crossentropy": 2.612900137901306, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22178903222084045, "step": 4540 }, { "epoch": 0.1419375, "grad_norm": 3.6875, "grad_norm_var": 0.09374898274739583, "learning_rate": 0.0001, "loss": 6.4313, "loss/crossentropy": 2.623916268348694, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21199119836091995, "step": 4542 }, { "epoch": 0.142, "grad_norm": 3.84375, "grad_norm_var": 0.10750223795572916, "learning_rate": 0.0001, "loss": 6.2772, "loss/crossentropy": 2.52363121509552, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20778058469295502, "step": 4544 }, { "epoch": 0.1420625, "grad_norm": 3.671875, "grad_norm_var": 0.06940104166666666, "learning_rate": 0.0001, "loss": 6.5497, "loss/crossentropy": 2.559391736984253, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22676189988851547, "step": 4546 }, { "epoch": 0.142125, "grad_norm": 3.859375, "grad_norm_var": 0.07356363932291667, "learning_rate": 0.0001, "loss": 6.2551, "loss/crossentropy": 2.5050313472747803, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2046976387500763, "step": 4548 }, { "epoch": 0.1421875, "grad_norm": 4.46875, "grad_norm_var": 0.10657552083333334, "learning_rate": 0.0001, "loss": 6.7595, "loss/crossentropy": 2.6757744550704956, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2353217825293541, "step": 4550 }, { "epoch": 0.14225, "grad_norm": 4.46875, "grad_norm_var": 0.12874247233072916, "learning_rate": 0.0001, "loss": 6.6681, "loss/crossentropy": 2.7467669248580933, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.2179144099354744, "step": 4552 }, { "epoch": 0.1423125, "grad_norm": 4.28125, "grad_norm_var": 0.13662109375, "learning_rate": 0.0001, "loss": 6.3387, "loss/crossentropy": 2.508995532989502, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21773157268762589, "step": 4554 }, { "epoch": 0.142375, "grad_norm": 3.59375, "grad_norm_var": 0.135107421875, "learning_rate": 0.0001, "loss": 6.3024, "loss/crossentropy": 2.5592448711395264, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21025492250919342, "step": 4556 }, { "epoch": 0.1424375, "grad_norm": 4.25, "grad_norm_var": 0.25068359375, "learning_rate": 0.0001, "loss": 6.3025, "loss/crossentropy": 2.3946605920791626, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.22242514044046402, "step": 4558 }, { "epoch": 0.1425, "grad_norm": 4.0, "grad_norm_var": 0.23028971354166666, "learning_rate": 0.0001, "loss": 6.4046, "loss/crossentropy": 2.659841537475586, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2076825425028801, "step": 4560 }, { "epoch": 0.1425625, "grad_norm": 3.75, "grad_norm_var": 0.2351226806640625, "learning_rate": 0.0001, "loss": 6.4888, "loss/crossentropy": 2.6625062227249146, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21622255444526672, "step": 4562 }, { "epoch": 0.142625, "grad_norm": 3.4375, "grad_norm_var": 0.25201416015625, "learning_rate": 0.0001, "loss": 6.1066, "loss/crossentropy": 2.4543986320495605, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20076348632574081, "step": 4564 }, { "epoch": 0.1426875, "grad_norm": 3.515625, "grad_norm_var": 0.2391998291015625, "learning_rate": 0.0001, "loss": 6.3698, "loss/crossentropy": 2.6311250925064087, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20980965346097946, "step": 4566 }, { "epoch": 0.14275, "grad_norm": 3.921875, "grad_norm_var": 0.21142171223958334, "learning_rate": 0.0001, "loss": 6.4228, "loss/crossentropy": 2.5519657135009766, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21833696961402893, "step": 4568 }, { "epoch": 0.1428125, "grad_norm": 3.75, "grad_norm_var": 0.1912506103515625, "learning_rate": 0.0001, "loss": 6.3472, "loss/crossentropy": 2.5183498859405518, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2157001942396164, "step": 4570 }, { "epoch": 0.142875, "grad_norm": 3.703125, "grad_norm_var": 0.19122721354166666, "learning_rate": 0.0001, "loss": 6.2351, "loss/crossentropy": 2.500234842300415, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2113747000694275, "step": 4572 }, { "epoch": 0.1429375, "grad_norm": 4.0, "grad_norm_var": 0.037385050455729166, "learning_rate": 0.0001, "loss": 6.6098, "loss/crossentropy": 2.7225894927978516, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2203599065542221, "step": 4574 }, { "epoch": 0.143, "grad_norm": 3.65625, "grad_norm_var": 0.046662394205729166, "learning_rate": 0.0001, "loss": 6.3647, "loss/crossentropy": 2.57330060005188, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21117094904184341, "step": 4576 }, { "epoch": 0.1430625, "grad_norm": 3.734375, "grad_norm_var": 0.0470123291015625, "learning_rate": 0.0001, "loss": 6.621, "loss/crossentropy": 2.7625339031219482, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21905124187469482, "step": 4578 }, { "epoch": 0.143125, "grad_norm": 3.890625, "grad_norm_var": 0.0507720947265625, "learning_rate": 0.0001, "loss": 6.6931, "loss/crossentropy": 2.7396358251571655, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22463934868574142, "step": 4580 }, { "epoch": 0.1431875, "grad_norm": 3.78125, "grad_norm_var": 0.05603841145833333, "learning_rate": 0.0001, "loss": 6.3751, "loss/crossentropy": 2.5076018571853638, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21995484083890915, "step": 4582 }, { "epoch": 0.14325, "grad_norm": 4.125, "grad_norm_var": 0.060774739583333334, "learning_rate": 0.0001, "loss": 6.302, "loss/crossentropy": 2.444732427597046, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21190303564071655, "step": 4584 }, { "epoch": 0.1433125, "grad_norm": 3.796875, "grad_norm_var": 0.055597941080729164, "learning_rate": 0.0001, "loss": 6.6817, "loss/crossentropy": 2.6860902309417725, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.22417040169239044, "step": 4586 }, { "epoch": 0.143375, "grad_norm": 3.859375, "grad_norm_var": 0.04543863932291667, "learning_rate": 0.0001, "loss": 6.3433, "loss/crossentropy": 2.6071120500564575, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20682349801063538, "step": 4588 }, { "epoch": 0.1434375, "grad_norm": 3.96875, "grad_norm_var": 0.04850972493489583, "learning_rate": 0.0001, "loss": 6.4867, "loss/crossentropy": 2.5918766260147095, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21916785836219788, "step": 4590 }, { "epoch": 0.1435, "grad_norm": 3.59375, "grad_norm_var": 0.04853413899739583, "learning_rate": 0.0001, "loss": 6.5322, "loss/crossentropy": 2.661565065383911, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21909621357917786, "step": 4592 }, { "epoch": 0.1435625, "grad_norm": 3.5, "grad_norm_var": 0.04719136555989583, "learning_rate": 0.0001, "loss": 6.1707, "loss/crossentropy": 2.3326449394226074, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21505358070135117, "step": 4594 }, { "epoch": 0.143625, "grad_norm": 3.640625, "grad_norm_var": 0.048876953125, "learning_rate": 0.0001, "loss": 6.1565, "loss/crossentropy": 2.409466505050659, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20829974859952927, "step": 4596 }, { "epoch": 0.1436875, "grad_norm": 3.765625, "grad_norm_var": 0.042210896809895836, "learning_rate": 0.0001, "loss": 6.5329, "loss/crossentropy": 2.6367892026901245, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21812911331653595, "step": 4598 }, { "epoch": 0.14375, "grad_norm": 3.5625, "grad_norm_var": 0.044896443684895836, "learning_rate": 0.0001, "loss": 6.4859, "loss/crossentropy": 2.6948471069335938, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21660198271274567, "step": 4600 }, { "epoch": 0.1438125, "grad_norm": 3.84375, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 6.0582, "loss/crossentropy": 2.3523894548416138, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20027178525924683, "step": 4602 }, { "epoch": 0.143875, "grad_norm": 3.6875, "grad_norm_var": 0.0413970947265625, "learning_rate": 0.0001, "loss": 6.6664, "loss/crossentropy": 2.769497036933899, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.22133028507232666, "step": 4604 }, { "epoch": 0.1439375, "grad_norm": 3.84375, "grad_norm_var": 0.032013956705729166, "learning_rate": 0.0001, "loss": 6.756, "loss/crossentropy": 2.7452114820480347, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.23233232647180557, "step": 4606 }, { "epoch": 0.144, "grad_norm": 4.5, "grad_norm_var": 0.06319071451822916, "learning_rate": 0.0001, "loss": 6.6783, "loss/crossentropy": 2.7978492975234985, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21811985969543457, "step": 4608 }, { "epoch": 0.1440625, "grad_norm": 3.90625, "grad_norm_var": 0.0533203125, "learning_rate": 0.0001, "loss": 6.1483, "loss/crossentropy": 2.4034503698349, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20768387615680695, "step": 4610 }, { "epoch": 0.144125, "grad_norm": 4.03125, "grad_norm_var": 0.05178629557291667, "learning_rate": 0.0001, "loss": 6.3053, "loss/crossentropy": 2.5535662174224854, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20603449642658234, "step": 4612 }, { "epoch": 0.1441875, "grad_norm": 3.90625, "grad_norm_var": 0.059691365559895834, "learning_rate": 0.0001, "loss": 6.2352, "loss/crossentropy": 2.566025137901306, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.19503897428512573, "step": 4614 }, { "epoch": 0.14425, "grad_norm": 4.03125, "grad_norm_var": 0.0542144775390625, "learning_rate": 0.0001, "loss": 6.5978, "loss/crossentropy": 2.6479650735855103, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22350239753723145, "step": 4616 }, { "epoch": 0.1443125, "grad_norm": 3.78125, "grad_norm_var": 0.0493560791015625, "learning_rate": 0.0001, "loss": 6.2562, "loss/crossentropy": 2.5567132234573364, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20432090759277344, "step": 4618 }, { "epoch": 0.144375, "grad_norm": 3.53125, "grad_norm_var": 0.05799051920572917, "learning_rate": 0.0001, "loss": 6.1748, "loss/crossentropy": 2.5087623596191406, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2056633159518242, "step": 4620 }, { "epoch": 0.1444375, "grad_norm": 3.90625, "grad_norm_var": 0.0584136962890625, "learning_rate": 0.0001, "loss": 6.6676, "loss/crossentropy": 2.757576823234558, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21951927244663239, "step": 4622 }, { "epoch": 0.1445, "grad_norm": 3.546875, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 6.4611, "loss/crossentropy": 2.717313051223755, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20914141833782196, "step": 4624 }, { "epoch": 0.1445625, "grad_norm": 3.921875, "grad_norm_var": 0.044722493489583334, "learning_rate": 0.0001, "loss": 6.5246, "loss/crossentropy": 2.6257013082504272, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22230887413024902, "step": 4626 }, { "epoch": 0.144625, "grad_norm": 3.609375, "grad_norm_var": 0.047652180989583334, "learning_rate": 0.0001, "loss": 6.5916, "loss/crossentropy": 2.752093195915222, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21911172568798065, "step": 4628 }, { "epoch": 0.1446875, "grad_norm": 4.09375, "grad_norm_var": 0.0403228759765625, "learning_rate": 0.0001, "loss": 6.4875, "loss/crossentropy": 2.518881678581238, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22381020337343216, "step": 4630 }, { "epoch": 0.14475, "grad_norm": 3.5, "grad_norm_var": 0.0376129150390625, "learning_rate": 0.0001, "loss": 6.2797, "loss/crossentropy": 2.5216615200042725, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2105671912431717, "step": 4632 }, { "epoch": 0.1448125, "grad_norm": 4.75, "grad_norm_var": 0.10459696451822917, "learning_rate": 0.0001, "loss": 6.6336, "loss/crossentropy": 2.7041332721710205, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.22068022191524506, "step": 4634 }, { "epoch": 0.144875, "grad_norm": 3.59375, "grad_norm_var": 0.11144917805989583, "learning_rate": 0.0001, "loss": 6.2327, "loss/crossentropy": 2.368388056755066, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.2153344452381134, "step": 4636 }, { "epoch": 0.1449375, "grad_norm": 3.484375, "grad_norm_var": 0.12156575520833333, "learning_rate": 0.0001, "loss": 5.9793, "loss/crossentropy": 2.3809818029403687, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19616173207759857, "step": 4638 }, { "epoch": 0.145, "grad_norm": 4.125, "grad_norm_var": 0.1231353759765625, "learning_rate": 0.0001, "loss": 6.7704, "loss/crossentropy": 2.7761400938034058, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22637692838907242, "step": 4640 }, { "epoch": 0.1450625, "grad_norm": 4.21875, "grad_norm_var": 0.1308990478515625, "learning_rate": 0.0001, "loss": 6.6359, "loss/crossentropy": 2.667805314064026, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2264966070652008, "step": 4642 }, { "epoch": 0.145125, "grad_norm": 3.828125, "grad_norm_var": 0.13297119140625, "learning_rate": 0.0001, "loss": 6.4757, "loss/crossentropy": 2.7138952016830444, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2097788080573082, "step": 4644 }, { "epoch": 0.1451875, "grad_norm": 5.6875, "grad_norm_var": 0.34869791666666666, "learning_rate": 0.0001, "loss": 6.4296, "loss/crossentropy": 2.5671643018722534, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2182699367403984, "step": 4646 }, { "epoch": 0.14525, "grad_norm": 4.0625, "grad_norm_var": 0.3304524739583333, "learning_rate": 0.0001, "loss": 6.3105, "loss/crossentropy": 2.389971971511841, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.22017350792884827, "step": 4648 }, { "epoch": 0.1453125, "grad_norm": 3.75, "grad_norm_var": 0.2860514322916667, "learning_rate": 0.0001, "loss": 6.1683, "loss/crossentropy": 2.426085114479065, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20703301578760147, "step": 4650 }, { "epoch": 0.145375, "grad_norm": 4.15625, "grad_norm_var": 0.2794586181640625, "learning_rate": 0.0001, "loss": 6.3578, "loss/crossentropy": 2.5354756116867065, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2158234342932701, "step": 4652 }, { "epoch": 0.1454375, "grad_norm": 3.921875, "grad_norm_var": 0.26105855305989584, "learning_rate": 0.0001, "loss": 6.7593, "loss/crossentropy": 2.630834221839905, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.2355065494775772, "step": 4654 }, { "epoch": 0.1455, "grad_norm": 4.03125, "grad_norm_var": 0.26048075358072914, "learning_rate": 0.0001, "loss": 6.8064, "loss/crossentropy": 2.7368087768554688, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.23664291948080063, "step": 4656 }, { "epoch": 0.1455625, "grad_norm": 4.6875, "grad_norm_var": 0.40924479166666666, "learning_rate": 0.0001, "loss": 7.0822, "loss/crossentropy": 2.8304264545440674, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.25134557485580444, "step": 4658 }, { "epoch": 0.145625, "grad_norm": 3.9375, "grad_norm_var": 0.4041015625, "learning_rate": 0.0001, "loss": 6.5154, "loss/crossentropy": 2.655316472053528, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21921440213918686, "step": 4660 }, { "epoch": 0.1456875, "grad_norm": 4.0, "grad_norm_var": 0.2482330322265625, "learning_rate": 0.0001, "loss": 6.4032, "loss/crossentropy": 2.604591965675354, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2158007100224495, "step": 4662 }, { "epoch": 0.14575, "grad_norm": 3.703125, "grad_norm_var": 0.2572550455729167, "learning_rate": 0.0001, "loss": 6.2784, "loss/crossentropy": 2.573136806488037, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20646653324365616, "step": 4664 }, { "epoch": 0.1458125, "grad_norm": 3.75, "grad_norm_var": 0.244140625, "learning_rate": 0.0001, "loss": 6.6643, "loss/crossentropy": 2.864218592643738, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21437915414571762, "step": 4666 }, { "epoch": 0.145875, "grad_norm": 3.796875, "grad_norm_var": 0.23665364583333334, "learning_rate": 0.0001, "loss": 6.3439, "loss/crossentropy": 2.5378594398498535, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21380609273910522, "step": 4668 }, { "epoch": 0.1459375, "grad_norm": 3.890625, "grad_norm_var": 0.22810872395833334, "learning_rate": 0.0001, "loss": 6.4716, "loss/crossentropy": 2.617180824279785, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21786151826381683, "step": 4670 }, { "epoch": 0.146, "grad_norm": 4.15625, "grad_norm_var": 0.23333333333333334, "learning_rate": 0.0001, "loss": 6.6927, "loss/crossentropy": 2.6598581075668335, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22984379529953003, "step": 4672 }, { "epoch": 0.1460625, "grad_norm": 4.03125, "grad_norm_var": 0.0334136962890625, "learning_rate": 0.0001, "loss": 6.6473, "loss/crossentropy": 2.679308295249939, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22726821154356003, "step": 4674 }, { "epoch": 0.146125, "grad_norm": 3.78125, "grad_norm_var": 0.028499348958333334, "learning_rate": 0.0001, "loss": 6.4195, "loss/crossentropy": 2.625803232192993, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21217992156744003, "step": 4676 }, { "epoch": 0.1461875, "grad_norm": 4.59375, "grad_norm_var": 0.04967041015625, "learning_rate": 0.0001, "loss": 6.5957, "loss/crossentropy": 2.7120320796966553, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.22117996215820312, "step": 4678 }, { "epoch": 0.14625, "grad_norm": 3.515625, "grad_norm_var": 0.0579254150390625, "learning_rate": 0.0001, "loss": 6.3614, "loss/crossentropy": 2.6168389320373535, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21117979288101196, "step": 4680 }, { "epoch": 0.1463125, "grad_norm": 3.671875, "grad_norm_var": 0.06334228515625, "learning_rate": 0.0001, "loss": 6.6885, "loss/crossentropy": 2.751344323158264, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2241879627108574, "step": 4682 }, { "epoch": 0.146375, "grad_norm": 4.125, "grad_norm_var": 0.08232421875, "learning_rate": 0.0001, "loss": 6.5906, "loss/crossentropy": 2.7465096712112427, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.21292072534561157, "step": 4684 }, { "epoch": 0.1464375, "grad_norm": 3.6875, "grad_norm_var": 0.0863189697265625, "learning_rate": 0.0001, "loss": 6.5358, "loss/crossentropy": 2.718324303627014, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2133835405111313, "step": 4686 }, { "epoch": 0.1465, "grad_norm": 3.40625, "grad_norm_var": 0.09709370930989583, "learning_rate": 0.0001, "loss": 6.1771, "loss/crossentropy": 2.439010500907898, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20662237703800201, "step": 4688 }, { "epoch": 0.1465625, "grad_norm": 3.90625, "grad_norm_var": 0.10308837890625, "learning_rate": 0.0001, "loss": 6.5201, "loss/crossentropy": 2.627174496650696, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22014875710010529, "step": 4690 }, { "epoch": 0.146625, "grad_norm": 3.765625, "grad_norm_var": 0.10435791015625, "learning_rate": 0.0001, "loss": 6.6709, "loss/crossentropy": 2.7912899255752563, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21842972934246063, "step": 4692 }, { "epoch": 0.1466875, "grad_norm": 3.65625, "grad_norm_var": 0.070849609375, "learning_rate": 0.0001, "loss": 6.4942, "loss/crossentropy": 2.630295991897583, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21568869799375534, "step": 4694 }, { "epoch": 0.14675, "grad_norm": 3.625, "grad_norm_var": 0.08316650390625, "learning_rate": 0.0001, "loss": 6.0061, "loss/crossentropy": 2.477609872817993, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19581755995750427, "step": 4696 }, { "epoch": 0.1468125, "grad_norm": 3.5625, "grad_norm_var": 0.07779541015625, "learning_rate": 0.0001, "loss": 6.6173, "loss/crossentropy": 2.7599306106567383, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21972202509641647, "step": 4698 }, { "epoch": 0.146875, "grad_norm": 4.0, "grad_norm_var": 0.06713765462239583, "learning_rate": 0.0001, "loss": 6.5485, "loss/crossentropy": 2.7502459287643433, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21458908915519714, "step": 4700 }, { "epoch": 0.1469375, "grad_norm": 3.625, "grad_norm_var": 0.0636871337890625, "learning_rate": 0.0001, "loss": 6.4994, "loss/crossentropy": 2.6590317487716675, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2160710096359253, "step": 4702 }, { "epoch": 0.147, "grad_norm": 3.515625, "grad_norm_var": 0.0592681884765625, "learning_rate": 0.0001, "loss": 6.2914, "loss/crossentropy": 2.5735844373703003, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20264513790607452, "step": 4704 }, { "epoch": 0.1470625, "grad_norm": 3.625, "grad_norm_var": 0.0452056884765625, "learning_rate": 0.0001, "loss": 6.3309, "loss/crossentropy": 2.5724921226501465, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20825912058353424, "step": 4706 }, { "epoch": 0.147125, "grad_norm": 3.671875, "grad_norm_var": 0.05728759765625, "learning_rate": 0.0001, "loss": 6.3636, "loss/crossentropy": 2.558912515640259, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.20976246148347855, "step": 4708 }, { "epoch": 0.1471875, "grad_norm": 3.96875, "grad_norm_var": 0.055908203125, "learning_rate": 0.0001, "loss": 6.3413, "loss/crossentropy": 2.588721752166748, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21159029752016068, "step": 4710 }, { "epoch": 0.14725, "grad_norm": 3.9375, "grad_norm_var": 0.048981730143229166, "learning_rate": 0.0001, "loss": 6.4642, "loss/crossentropy": 2.610605239868164, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21856311708688736, "step": 4712 }, { "epoch": 0.1473125, "grad_norm": 3.640625, "grad_norm_var": 0.054963175455729166, "learning_rate": 0.0001, "loss": 6.4486, "loss/crossentropy": 2.6900192499160767, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21257588267326355, "step": 4714 }, { "epoch": 0.147375, "grad_norm": 4.0, "grad_norm_var": 0.05174051920572917, "learning_rate": 0.0001, "loss": 6.4907, "loss/crossentropy": 2.6867319345474243, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21203771233558655, "step": 4716 }, { "epoch": 0.1474375, "grad_norm": 3.78125, "grad_norm_var": 0.05563151041666667, "learning_rate": 0.0001, "loss": 5.9098, "loss/crossentropy": 2.360137462615967, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1932504028081894, "step": 4718 }, { "epoch": 0.1475, "grad_norm": 3.671875, "grad_norm_var": 0.05534566243489583, "learning_rate": 0.0001, "loss": 6.7295, "loss/crossentropy": 2.795089840888977, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2258637547492981, "step": 4720 }, { "epoch": 0.1475625, "grad_norm": 4.53125, "grad_norm_var": 0.32356669108072916, "learning_rate": 0.0001, "loss": 6.5211, "loss/crossentropy": 2.617510437965393, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.21652990579605103, "step": 4722 }, { "epoch": 0.147625, "grad_norm": 4.875, "grad_norm_var": 0.36549072265625, "learning_rate": 0.0001, "loss": 6.7006, "loss/crossentropy": 2.6605865955352783, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.23368670791387558, "step": 4724 }, { "epoch": 0.1476875, "grad_norm": 4.125, "grad_norm_var": 0.33739827473958334, "learning_rate": 0.0001, "loss": 6.5533, "loss/crossentropy": 2.675244092941284, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21476037055253983, "step": 4726 }, { "epoch": 0.14775, "grad_norm": 3.625, "grad_norm_var": 0.3437164306640625, "learning_rate": 0.0001, "loss": 6.3036, "loss/crossentropy": 2.573203682899475, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.206244595348835, "step": 4728 }, { "epoch": 0.1478125, "grad_norm": 3.390625, "grad_norm_var": 0.3688629150390625, "learning_rate": 0.0001, "loss": 6.2489, "loss/crossentropy": 2.5649033784866333, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20082136988639832, "step": 4730 }, { "epoch": 0.147875, "grad_norm": 3.90625, "grad_norm_var": 0.3668853759765625, "learning_rate": 0.0001, "loss": 6.9884, "loss/crossentropy": 2.994609832763672, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.22906502336263657, "step": 4732 }, { "epoch": 0.1479375, "grad_norm": 3.78125, "grad_norm_var": 0.3581451416015625, "learning_rate": 0.0001, "loss": 6.3501, "loss/crossentropy": 2.52126944065094, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2137421816587448, "step": 4734 }, { "epoch": 0.148, "grad_norm": 3.828125, "grad_norm_var": 0.3618316650390625, "learning_rate": 0.0001, "loss": 6.4362, "loss/crossentropy": 2.6966384649276733, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20872260630130768, "step": 4736 }, { "epoch": 0.1480625, "grad_norm": 3.671875, "grad_norm_var": 0.1119537353515625, "learning_rate": 0.0001, "loss": 6.4629, "loss/crossentropy": 2.6376761198043823, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.214164137840271, "step": 4738 }, { "epoch": 0.148125, "grad_norm": 3.625, "grad_norm_var": 0.041792805989583334, "learning_rate": 0.0001, "loss": 6.1429, "loss/crossentropy": 2.5099347829818726, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19884376972913742, "step": 4740 }, { "epoch": 0.1481875, "grad_norm": 4.125, "grad_norm_var": 0.03687744140625, "learning_rate": 0.0001, "loss": 6.6726, "loss/crossentropy": 2.6728895902633667, "loss/hidden": 1.734375, "loss/jsd": 0.0, "loss/logits": 0.22653395682573318, "step": 4742 }, { "epoch": 0.14825, "grad_norm": 3.578125, "grad_norm_var": 0.03776753743489583, "learning_rate": 0.0001, "loss": 6.5611, "loss/crossentropy": 2.78212308883667, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21227127313613892, "step": 4744 }, { "epoch": 0.1483125, "grad_norm": 3.765625, "grad_norm_var": 0.029344685872395835, "learning_rate": 0.0001, "loss": 6.3416, "loss/crossentropy": 2.6213202476501465, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2087465226650238, "step": 4746 }, { "epoch": 0.148375, "grad_norm": 3.484375, "grad_norm_var": 0.030304972330729166, "learning_rate": 0.0001, "loss": 6.5042, "loss/crossentropy": 2.7428064346313477, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20972855389118195, "step": 4748 }, { "epoch": 0.1484375, "grad_norm": 3.671875, "grad_norm_var": 0.030304972330729166, "learning_rate": 0.0001, "loss": 6.6375, "loss/crossentropy": 2.8026511669158936, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2170819416642189, "step": 4750 }, { "epoch": 0.1485, "grad_norm": 3.765625, "grad_norm_var": 0.04433492024739583, "learning_rate": 0.0001, "loss": 6.5706, "loss/crossentropy": 2.613227605819702, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22503775358200073, "step": 4752 }, { "epoch": 0.1485625, "grad_norm": 3.8125, "grad_norm_var": 0.045491536458333336, "learning_rate": 0.0001, "loss": 6.5307, "loss/crossentropy": 2.7017834186553955, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2157081514596939, "step": 4754 }, { "epoch": 0.148625, "grad_norm": 3.796875, "grad_norm_var": 0.042822265625, "learning_rate": 0.0001, "loss": 6.6199, "loss/crossentropy": 2.764488458633423, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21756920218467712, "step": 4756 }, { "epoch": 0.1486875, "grad_norm": 4.21875, "grad_norm_var": 0.044921875, "learning_rate": 0.0001, "loss": 6.4397, "loss/crossentropy": 2.6155864000320435, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21835249662399292, "step": 4758 }, { "epoch": 0.14875, "grad_norm": 3.9375, "grad_norm_var": 0.06787821451822916, "learning_rate": 0.0001, "loss": 6.5694, "loss/crossentropy": 2.645212173461914, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2236703336238861, "step": 4760 }, { "epoch": 0.1488125, "grad_norm": 3.65625, "grad_norm_var": 0.148583984375, "learning_rate": 0.0001, "loss": 6.2737, "loss/crossentropy": 2.5080801248550415, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21133046597242355, "step": 4762 }, { "epoch": 0.148875, "grad_norm": 5.125, "grad_norm_var": 0.2266021728515625, "learning_rate": 0.0001, "loss": 6.5326, "loss/crossentropy": 2.6351619958877563, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.22450867295265198, "step": 4764 }, { "epoch": 0.1489375, "grad_norm": 4.21875, "grad_norm_var": 0.21855061848958332, "learning_rate": 0.0001, "loss": 6.4739, "loss/crossentropy": 2.570665121078491, "loss/hidden": 1.75, "loss/jsd": 0.0, "loss/logits": 0.2153189554810524, "step": 4766 }, { "epoch": 0.149, "grad_norm": 3.578125, "grad_norm_var": 0.22913309733072917, "learning_rate": 0.0001, "loss": 6.3232, "loss/crossentropy": 2.6073015928268433, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20831340551376343, "step": 4768 }, { "epoch": 0.1490625, "grad_norm": 3.4375, "grad_norm_var": 0.23277587890625, "learning_rate": 0.0001, "loss": 6.2837, "loss/crossentropy": 2.4850372076034546, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2072141021490097, "step": 4770 }, { "epoch": 0.149125, "grad_norm": 3.859375, "grad_norm_var": 0.22883707682291668, "learning_rate": 0.0001, "loss": 6.6157, "loss/crossentropy": 2.6990227699279785, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22292090207338333, "step": 4772 }, { "epoch": 0.1491875, "grad_norm": 3.609375, "grad_norm_var": 0.23316650390625, "learning_rate": 0.0001, "loss": 6.5108, "loss/crossentropy": 2.7127143144607544, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2157444953918457, "step": 4774 }, { "epoch": 0.14925, "grad_norm": 4.53125, "grad_norm_var": 0.25297749837239586, "learning_rate": 0.0001, "loss": 6.5796, "loss/crossentropy": 2.648136258125305, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2247898280620575, "step": 4776 }, { "epoch": 0.1493125, "grad_norm": 3.84375, "grad_norm_var": 0.19218648274739583, "learning_rate": 0.0001, "loss": 6.5274, "loss/crossentropy": 2.7297213077545166, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21101507544517517, "step": 4778 }, { "epoch": 0.149375, "grad_norm": 3.953125, "grad_norm_var": 0.10084635416666667, "learning_rate": 0.0001, "loss": 6.6353, "loss/crossentropy": 2.7655104398727417, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.22174807637929916, "step": 4780 }, { "epoch": 0.1494375, "grad_norm": 3.921875, "grad_norm_var": 0.09071858723958333, "learning_rate": 0.0001, "loss": 6.5437, "loss/crossentropy": 2.667617917060852, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2176843211054802, "step": 4782 }, { "epoch": 0.1495, "grad_norm": 4.0, "grad_norm_var": 0.08983968098958334, "learning_rate": 0.0001, "loss": 6.3786, "loss/crossentropy": 2.5850164890289307, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21295025944709778, "step": 4784 }, { "epoch": 0.1495625, "grad_norm": 3.703125, "grad_norm_var": 0.09162495930989584, "learning_rate": 0.0001, "loss": 6.3162, "loss/crossentropy": 2.6784101724624634, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19736792147159576, "step": 4786 }, { "epoch": 0.149625, "grad_norm": 4.4375, "grad_norm_var": 0.11328837076822916, "learning_rate": 0.0001, "loss": 5.9967, "loss/crossentropy": 2.2380075454711914, "loss/hidden": 1.7578125, "loss/jsd": 0.0, "loss/logits": 0.20008385181427002, "step": 4788 }, { "epoch": 0.1496875, "grad_norm": 3.8125, "grad_norm_var": 0.1090728759765625, "learning_rate": 0.0001, "loss": 6.2627, "loss/crossentropy": 2.4229001998901367, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2160077840089798, "step": 4790 }, { "epoch": 0.14975, "grad_norm": 3.765625, "grad_norm_var": 0.07546284993489584, "learning_rate": 0.0001, "loss": 6.5056, "loss/crossentropy": 2.6331783533096313, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21693290770053864, "step": 4792 }, { "epoch": 0.1498125, "grad_norm": 4.53125, "grad_norm_var": 0.0869049072265625, "learning_rate": 0.0001, "loss": 6.306, "loss/crossentropy": 2.5121694803237915, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21297980844974518, "step": 4794 }, { "epoch": 0.149875, "grad_norm": 4.03125, "grad_norm_var": 0.08339436848958333, "learning_rate": 0.0001, "loss": 6.8366, "loss/crossentropy": 2.779943823814392, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.22909872978925705, "step": 4796 }, { "epoch": 0.1499375, "grad_norm": 7.3125, "grad_norm_var": 0.8360677083333333, "learning_rate": 0.0001, "loss": 6.6624, "loss/crossentropy": 2.6996039152145386, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.22987814247608185, "step": 4798 }, { "epoch": 0.15, "grad_norm": 4.03125, "grad_norm_var": 0.823681640625, "learning_rate": 0.0001, "loss": 6.5988, "loss/crossentropy": 2.6779834032058716, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.2166868895292282, "step": 4800 }, { "epoch": 0.1500625, "grad_norm": 3.40625, "grad_norm_var": 0.8215159098307292, "learning_rate": 0.0001, "loss": 6.2327, "loss/crossentropy": 2.5419305562973022, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20501716434955597, "step": 4802 }, { "epoch": 0.150125, "grad_norm": 3.78125, "grad_norm_var": 0.8072174072265625, "learning_rate": 0.0001, "loss": 6.3111, "loss/crossentropy": 2.5460424423217773, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21243851631879807, "step": 4804 }, { "epoch": 0.1501875, "grad_norm": 3.875, "grad_norm_var": 0.8091868082682292, "learning_rate": 0.0001, "loss": 6.3154, "loss/crossentropy": 2.6478878259658813, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.19761145114898682, "step": 4806 }, { "epoch": 0.15025, "grad_norm": 3.78125, "grad_norm_var": 0.80947265625, "learning_rate": 0.0001, "loss": 6.159, "loss/crossentropy": 2.370529294013977, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2108783796429634, "step": 4808 }, { "epoch": 0.1503125, "grad_norm": 3.75, "grad_norm_var": 0.7909332275390625, "learning_rate": 0.0001, "loss": 6.3784, "loss/crossentropy": 2.6217751502990723, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.213553749024868, "step": 4810 }, { "epoch": 0.150375, "grad_norm": 4.34375, "grad_norm_var": 0.8113515218098958, "learning_rate": 0.0001, "loss": 6.5287, "loss/crossentropy": 2.6473978757858276, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2217206507921219, "step": 4812 }, { "epoch": 0.1504375, "grad_norm": 3.53125, "grad_norm_var": 0.056640625, "learning_rate": 0.0001, "loss": 6.2518, "loss/crossentropy": 2.496413826942444, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21147903054952621, "step": 4814 }, { "epoch": 0.1505, "grad_norm": 3.46875, "grad_norm_var": 0.0618804931640625, "learning_rate": 0.0001, "loss": 6.4329, "loss/crossentropy": 2.6252561807632446, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21475353837013245, "step": 4816 }, { "epoch": 0.1505625, "grad_norm": 3.75, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 6.3718, "loss/crossentropy": 2.5250319242477417, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2159259095788002, "step": 4818 }, { "epoch": 0.150625, "grad_norm": 3.8125, "grad_norm_var": 0.05698954264322917, "learning_rate": 0.0001, "loss": 6.0336, "loss/crossentropy": 2.400040030479431, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1992897242307663, "step": 4820 }, { "epoch": 0.1506875, "grad_norm": 3.59375, "grad_norm_var": 0.07403055826822917, "learning_rate": 0.0001, "loss": 6.4269, "loss/crossentropy": 2.708932042121887, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20616790652275085, "step": 4822 }, { "epoch": 0.15075, "grad_norm": 3.65625, "grad_norm_var": 0.05953369140625, "learning_rate": 0.0001, "loss": 6.6234, "loss/crossentropy": 2.6994398832321167, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22286559641361237, "step": 4824 }, { "epoch": 0.1508125, "grad_norm": 3.6875, "grad_norm_var": 0.057840983072916664, "learning_rate": 0.0001, "loss": 6.4284, "loss/crossentropy": 2.6506223678588867, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21254274994134903, "step": 4826 }, { "epoch": 0.150875, "grad_norm": 4.0625, "grad_norm_var": 0.038248697916666664, "learning_rate": 0.0001, "loss": 6.4538, "loss/crossentropy": 2.6030107736587524, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21633025258779526, "step": 4828 }, { "epoch": 0.1509375, "grad_norm": 4.125, "grad_norm_var": 0.05157877604166667, "learning_rate": 0.0001, "loss": 6.5578, "loss/crossentropy": 2.6313416957855225, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22272682189941406, "step": 4830 }, { "epoch": 0.151, "grad_norm": 4.3125, "grad_norm_var": 0.07156575520833333, "learning_rate": 0.0001, "loss": 6.7107, "loss/crossentropy": 2.744732975959778, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22667010128498077, "step": 4832 }, { "epoch": 0.1510625, "grad_norm": 3.84375, "grad_norm_var": 0.07504781087239583, "learning_rate": 0.0001, "loss": 6.2812, "loss/crossentropy": 2.538503885269165, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20746999979019165, "step": 4834 }, { "epoch": 0.151125, "grad_norm": 3.921875, "grad_norm_var": 0.07955322265625, "learning_rate": 0.0001, "loss": 6.6262, "loss/crossentropy": 2.7523258924484253, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2178577333688736, "step": 4836 }, { "epoch": 0.1511875, "grad_norm": 3.671875, "grad_norm_var": 0.05794169108072917, "learning_rate": 0.0001, "loss": 6.544, "loss/crossentropy": 2.660256505012512, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21962185949087143, "step": 4838 }, { "epoch": 0.15125, "grad_norm": 6.84375, "grad_norm_var": 0.6006022135416667, "learning_rate": 0.0001, "loss": 6.2568, "loss/crossentropy": 2.4511998891830444, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21102909743785858, "step": 4840 }, { "epoch": 0.1513125, "grad_norm": 3.5, "grad_norm_var": 0.5956858317057292, "learning_rate": 0.0001, "loss": 6.4881, "loss/crossentropy": 2.6805083751678467, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21435106545686722, "step": 4842 }, { "epoch": 0.151375, "grad_norm": 3.84375, "grad_norm_var": 0.5873046875, "learning_rate": 0.0001, "loss": 6.1579, "loss/crossentropy": 2.474646806716919, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20426137745380402, "step": 4844 }, { "epoch": 0.1514375, "grad_norm": 3.515625, "grad_norm_var": 0.6108561197916667, "learning_rate": 0.0001, "loss": 6.2489, "loss/crossentropy": 2.508758306503296, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2076030820608139, "step": 4846 }, { "epoch": 0.1515, "grad_norm": 3.71875, "grad_norm_var": 0.5954060872395833, "learning_rate": 0.0001, "loss": 6.6277, "loss/crossentropy": 2.7554014921188354, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2192564159631729, "step": 4848 }, { "epoch": 0.1515625, "grad_norm": 3.671875, "grad_norm_var": 0.6114491780598958, "learning_rate": 0.0001, "loss": 6.5457, "loss/crossentropy": 2.6733874082565308, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21731121093034744, "step": 4850 }, { "epoch": 0.151625, "grad_norm": 3.578125, "grad_norm_var": 0.6405181884765625, "learning_rate": 0.0001, "loss": 6.3359, "loss/crossentropy": 2.6059813499450684, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20932254940271378, "step": 4852 }, { "epoch": 0.1516875, "grad_norm": 3.640625, "grad_norm_var": 0.6582509358723958, "learning_rate": 0.0001, "loss": 6.4843, "loss/crossentropy": 2.637516736984253, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21631476283073425, "step": 4854 }, { "epoch": 0.15175, "grad_norm": 5.5, "grad_norm_var": 0.26500244140625, "learning_rate": 0.0001, "loss": 6.7869, "loss/crossentropy": 2.763762354850769, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2323966547846794, "step": 4856 }, { "epoch": 0.1518125, "grad_norm": 4.125, "grad_norm_var": 0.2546620686848958, "learning_rate": 0.0001, "loss": 6.5614, "loss/crossentropy": 2.6742827892303467, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.22269795089960098, "step": 4858 }, { "epoch": 0.151875, "grad_norm": 3.484375, "grad_norm_var": 0.2902303059895833, "learning_rate": 0.0001, "loss": 6.5024, "loss/crossentropy": 2.5040030479431152, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.22678908705711365, "step": 4860 }, { "epoch": 0.1519375, "grad_norm": 3.859375, "grad_norm_var": 0.28025716145833335, "learning_rate": 0.0001, "loss": 6.2587, "loss/crossentropy": 2.4296271800994873, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21181800961494446, "step": 4862 }, { "epoch": 0.152, "grad_norm": 3.453125, "grad_norm_var": 0.29153544108072915, "learning_rate": 0.0001, "loss": 6.3599, "loss/crossentropy": 2.5853668451309204, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.21612919867038727, "step": 4864 }, { "epoch": 0.1520625, "grad_norm": 4.09375, "grad_norm_var": 0.2879842122395833, "learning_rate": 0.0001, "loss": 6.4013, "loss/crossentropy": 2.6143546104431152, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21033377200365067, "step": 4866 }, { "epoch": 0.152125, "grad_norm": 3.6875, "grad_norm_var": 0.26902567545572914, "learning_rate": 0.0001, "loss": 6.7236, "loss/crossentropy": 2.8559582233428955, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21606221795082092, "step": 4868 }, { "epoch": 0.1521875, "grad_norm": 3.5, "grad_norm_var": 0.2936920166015625, "learning_rate": 0.0001, "loss": 6.0866, "loss/crossentropy": 2.5235257148742676, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1965421736240387, "step": 4870 }, { "epoch": 0.15225, "grad_norm": 3.734375, "grad_norm_var": 0.101416015625, "learning_rate": 0.0001, "loss": 6.4072, "loss/crossentropy": 2.604854464530945, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21265756338834763, "step": 4872 }, { "epoch": 0.1523125, "grad_norm": 3.734375, "grad_norm_var": 0.09153238932291667, "learning_rate": 0.0001, "loss": 6.3704, "loss/crossentropy": 2.5827693939208984, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21430625021457672, "step": 4874 }, { "epoch": 0.152375, "grad_norm": 3.78125, "grad_norm_var": 0.0570953369140625, "learning_rate": 0.0001, "loss": 6.623, "loss/crossentropy": 2.838082194328308, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21247129142284393, "step": 4876 }, { "epoch": 0.1524375, "grad_norm": 3.765625, "grad_norm_var": 0.05123291015625, "learning_rate": 0.0001, "loss": 6.3011, "loss/crossentropy": 2.579153895378113, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.21086709201335907, "step": 4878 }, { "epoch": 0.1525, "grad_norm": 3.53125, "grad_norm_var": 0.0462066650390625, "learning_rate": 0.0001, "loss": 6.1631, "loss/crossentropy": 2.5194358825683594, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20147526264190674, "step": 4880 }, { "epoch": 0.1525625, "grad_norm": 3.75, "grad_norm_var": 0.037333170572916664, "learning_rate": 0.0001, "loss": 6.4245, "loss/crossentropy": 2.5760059356689453, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21532009541988373, "step": 4882 }, { "epoch": 0.152625, "grad_norm": 3.6875, "grad_norm_var": 0.03874409993489583, "learning_rate": 0.0001, "loss": 6.3237, "loss/crossentropy": 2.5853145122528076, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2058662474155426, "step": 4884 }, { "epoch": 0.1526875, "grad_norm": 3.453125, "grad_norm_var": 0.023014322916666666, "learning_rate": 0.0001, "loss": 6.1755, "loss/crossentropy": 2.4950112104415894, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20789121091365814, "step": 4886 }, { "epoch": 0.15275, "grad_norm": 3.6875, "grad_norm_var": 0.025788370768229166, "learning_rate": 0.0001, "loss": 6.2782, "loss/crossentropy": 2.464964509010315, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2090589702129364, "step": 4888 }, { "epoch": 0.1528125, "grad_norm": 3.703125, "grad_norm_var": 0.027586873372395834, "learning_rate": 0.0001, "loss": 6.72, "loss/crossentropy": 2.7551660537719727, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.22772932052612305, "step": 4890 }, { "epoch": 0.152875, "grad_norm": 3.546875, "grad_norm_var": 0.020832316080729166, "learning_rate": 0.0001, "loss": 6.2148, "loss/crossentropy": 2.508195996284485, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2073819264769554, "step": 4892 }, { "epoch": 0.1529375, "grad_norm": 3.8125, "grad_norm_var": 0.022587076822916666, "learning_rate": 0.0001, "loss": 6.1928, "loss/crossentropy": 2.421853542327881, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21419940888881683, "step": 4894 }, { "epoch": 0.153, "grad_norm": 3.921875, "grad_norm_var": 0.0239654541015625, "learning_rate": 0.0001, "loss": 5.6587, "loss/crossentropy": 2.0499314069747925, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.18783257901668549, "step": 4896 }, { "epoch": 0.1530625, "grad_norm": 3.65625, "grad_norm_var": 0.0221343994140625, "learning_rate": 0.0001, "loss": 6.3923, "loss/crossentropy": 2.6016801595687866, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2122626230120659, "step": 4898 }, { "epoch": 0.153125, "grad_norm": 3.78125, "grad_norm_var": 0.018407185872395832, "learning_rate": 0.0001, "loss": 6.5129, "loss/crossentropy": 2.709871530532837, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.21819671988487244, "step": 4900 }, { "epoch": 0.1531875, "grad_norm": 4.03125, "grad_norm_var": 0.018163045247395832, "learning_rate": 0.0001, "loss": 6.4619, "loss/crossentropy": 2.638934850692749, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2151099443435669, "step": 4902 }, { "epoch": 0.15325, "grad_norm": 4.3125, "grad_norm_var": 0.03599853515625, "learning_rate": 0.0001, "loss": 6.5957, "loss/crossentropy": 2.6319879293441772, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22644442319869995, "step": 4904 }, { "epoch": 0.1533125, "grad_norm": 3.734375, "grad_norm_var": 0.07101236979166667, "learning_rate": 0.0001, "loss": 6.3658, "loss/crossentropy": 2.5488661527633667, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21215952187776566, "step": 4906 }, { "epoch": 0.153375, "grad_norm": 3.546875, "grad_norm_var": 0.0849761962890625, "learning_rate": 0.0001, "loss": 6.3781, "loss/crossentropy": 2.5926159620285034, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2129197046160698, "step": 4908 }, { "epoch": 0.1534375, "grad_norm": 3.8125, "grad_norm_var": 0.08209228515625, "learning_rate": 0.0001, "loss": 6.0521, "loss/crossentropy": 2.3596653938293457, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20322907716035843, "step": 4910 }, { "epoch": 0.1535, "grad_norm": 3.578125, "grad_norm_var": 0.09072265625, "learning_rate": 0.0001, "loss": 6.2831, "loss/crossentropy": 2.5288031101226807, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20980124175548553, "step": 4912 }, { "epoch": 0.1535625, "grad_norm": 3.90625, "grad_norm_var": 0.08870442708333333, "learning_rate": 0.0001, "loss": 6.9371, "loss/crossentropy": 2.849328398704529, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23651330918073654, "step": 4914 }, { "epoch": 0.153625, "grad_norm": 4.09375, "grad_norm_var": 0.09208577473958333, "learning_rate": 0.0001, "loss": 6.5117, "loss/crossentropy": 2.6502633094787598, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21622441709041595, "step": 4916 }, { "epoch": 0.1536875, "grad_norm": 4.0625, "grad_norm_var": 0.09516499837239584, "learning_rate": 0.0001, "loss": 6.632, "loss/crossentropy": 2.7815967798233032, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.21316128224134445, "step": 4918 }, { "epoch": 0.15375, "grad_norm": 4.09375, "grad_norm_var": 0.08580729166666666, "learning_rate": 0.0001, "loss": 6.5193, "loss/crossentropy": 2.6187355518341064, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.22287128120660782, "step": 4920 }, { "epoch": 0.1538125, "grad_norm": 3.890625, "grad_norm_var": 0.0621978759765625, "learning_rate": 0.0001, "loss": 6.3034, "loss/crossentropy": 2.581501007080078, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.21047167479991913, "step": 4922 }, { "epoch": 0.153875, "grad_norm": 3.640625, "grad_norm_var": 0.0472564697265625, "learning_rate": 0.0001, "loss": 6.4875, "loss/crossentropy": 2.6864192485809326, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21448803693056107, "step": 4924 }, { "epoch": 0.1539375, "grad_norm": 5.53125, "grad_norm_var": 0.24671223958333333, "learning_rate": 0.0001, "loss": 6.4715, "loss/crossentropy": 2.5524847507476807, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22275861352682114, "step": 4926 }, { "epoch": 0.154, "grad_norm": 4.125, "grad_norm_var": 0.2464019775390625, "learning_rate": 0.0001, "loss": 6.572, "loss/crossentropy": 2.698154926300049, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2194170281291008, "step": 4928 }, { "epoch": 0.1540625, "grad_norm": 3.90625, "grad_norm_var": 0.24670817057291666, "learning_rate": 0.0001, "loss": 6.699, "loss/crossentropy": 2.7247055768966675, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.23062902688980103, "step": 4930 }, { "epoch": 0.154125, "grad_norm": 3.578125, "grad_norm_var": 0.24951070149739582, "learning_rate": 0.0001, "loss": 6.404, "loss/crossentropy": 2.587713122367859, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21404717862606049, "step": 4932 }, { "epoch": 0.1541875, "grad_norm": 5.5, "grad_norm_var": 0.43082275390625, "learning_rate": 0.0001, "loss": 6.9871, "loss/crossentropy": 2.9728357791900635, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22994327545166016, "step": 4934 }, { "epoch": 0.15425, "grad_norm": 3.78125, "grad_norm_var": 0.42596028645833334, "learning_rate": 0.0001, "loss": 6.3586, "loss/crossentropy": 2.537160277366638, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21651742607355118, "step": 4936 }, { "epoch": 0.1543125, "grad_norm": 3.5, "grad_norm_var": 0.4275716145833333, "learning_rate": 0.0001, "loss": 6.2248, "loss/crossentropy": 2.5505727529525757, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20569919794797897, "step": 4938 }, { "epoch": 0.154375, "grad_norm": 3.6875, "grad_norm_var": 0.4137603759765625, "learning_rate": 0.0001, "loss": 6.3691, "loss/crossentropy": 2.5592960119247437, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21262313425540924, "step": 4940 }, { "epoch": 0.1544375, "grad_norm": 3.9375, "grad_norm_var": 0.23670247395833333, "learning_rate": 0.0001, "loss": 6.1737, "loss/crossentropy": 2.476612091064453, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20408708602190018, "step": 4942 }, { "epoch": 0.1545, "grad_norm": 3.8125, "grad_norm_var": 0.2263824462890625, "learning_rate": 0.0001, "loss": 6.6774, "loss/crossentropy": 2.7931333780288696, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2188974693417549, "step": 4944 }, { "epoch": 0.1545625, "grad_norm": 3.59375, "grad_norm_var": 0.23065999348958333, "learning_rate": 0.0001, "loss": 6.6406, "loss/crossentropy": 2.8075019121170044, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21455729007720947, "step": 4946 }, { "epoch": 0.154625, "grad_norm": 3.28125, "grad_norm_var": 0.2508697509765625, "learning_rate": 0.0001, "loss": 6.2334, "loss/crossentropy": 2.5928040742874146, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2019491121172905, "step": 4948 }, { "epoch": 0.1546875, "grad_norm": 3.59375, "grad_norm_var": 0.04607747395833333, "learning_rate": 0.0001, "loss": 6.4066, "loss/crossentropy": 2.6578075885772705, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21042943745851517, "step": 4950 }, { "epoch": 0.15475, "grad_norm": 3.765625, "grad_norm_var": 0.053564453125, "learning_rate": 0.0001, "loss": 6.5547, "loss/crossentropy": 2.6699572801589966, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2216789573431015, "step": 4952 }, { "epoch": 0.1548125, "grad_norm": 3.1875, "grad_norm_var": 0.069580078125, "learning_rate": 0.0001, "loss": 6.1558, "loss/crossentropy": 2.5289593935012817, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20018472522497177, "step": 4954 }, { "epoch": 0.154875, "grad_norm": 3.84375, "grad_norm_var": 0.061498006184895836, "learning_rate": 0.0001, "loss": 6.4415, "loss/crossentropy": 2.6500484943389893, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21507825702428818, "step": 4956 }, { "epoch": 0.1549375, "grad_norm": 3.953125, "grad_norm_var": 0.09381103515625, "learning_rate": 0.0001, "loss": 6.7442, "loss/crossentropy": 2.700336456298828, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2379828542470932, "step": 4958 }, { "epoch": 0.155, "grad_norm": 3.8125, "grad_norm_var": 0.088623046875, "learning_rate": 0.0001, "loss": 6.3364, "loss/crossentropy": 2.595617890357971, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20767300575971603, "step": 4960 }, { "epoch": 0.1550625, "grad_norm": 3.9375, "grad_norm_var": 0.0892974853515625, "learning_rate": 0.0001, "loss": 6.4777, "loss/crossentropy": 2.6588134765625, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21509408205747604, "step": 4962 }, { "epoch": 0.155125, "grad_norm": 4.875, "grad_norm_var": 0.15546468098958333, "learning_rate": 0.0001, "loss": 6.0125, "loss/crossentropy": 2.2977291345596313, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20546399056911469, "step": 4964 }, { "epoch": 0.1551875, "grad_norm": 3.578125, "grad_norm_var": 0.15156148274739584, "learning_rate": 0.0001, "loss": 6.2089, "loss/crossentropy": 2.4438865184783936, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2104882299900055, "step": 4966 }, { "epoch": 0.15525, "grad_norm": 3.84375, "grad_norm_var": 0.15627848307291667, "learning_rate": 0.0001, "loss": 6.6539, "loss/crossentropy": 2.769439458847046, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.22008167952299118, "step": 4968 }, { "epoch": 0.1553125, "grad_norm": 3.375, "grad_norm_var": 0.13815104166666667, "learning_rate": 0.0001, "loss": 6.3455, "loss/crossentropy": 2.6334996223449707, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.202054463326931, "step": 4970 }, { "epoch": 0.155375, "grad_norm": 3.515625, "grad_norm_var": 0.15764058430989583, "learning_rate": 0.0001, "loss": 6.1166, "loss/crossentropy": 2.5081863403320312, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19794577360153198, "step": 4972 }, { "epoch": 0.1554375, "grad_norm": 3.640625, "grad_norm_var": 0.13384501139322916, "learning_rate": 0.0001, "loss": 6.3389, "loss/crossentropy": 2.5470728874206543, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21004494279623032, "step": 4974 }, { "epoch": 0.1555, "grad_norm": 3.4375, "grad_norm_var": 0.13826395670572916, "learning_rate": 0.0001, "loss": 6.344, "loss/crossentropy": 2.6228668689727783, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20609334856271744, "step": 4976 }, { "epoch": 0.1555625, "grad_norm": 3.796875, "grad_norm_var": 0.13590494791666666, "learning_rate": 0.0001, "loss": 6.6796, "loss/crossentropy": 2.734183669090271, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22305253148078918, "step": 4978 }, { "epoch": 0.155625, "grad_norm": 3.515625, "grad_norm_var": 0.05121968587239583, "learning_rate": 0.0001, "loss": 6.2023, "loss/crossentropy": 2.4566444158554077, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21010775864124298, "step": 4980 }, { "epoch": 0.1556875, "grad_norm": 3.875, "grad_norm_var": 0.05627848307291667, "learning_rate": 0.0001, "loss": 6.1007, "loss/crossentropy": 2.3971216678619385, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20434706658124924, "step": 4982 }, { "epoch": 0.15575, "grad_norm": 4.25, "grad_norm_var": 0.062841796875, "learning_rate": 0.0001, "loss": 6.3932, "loss/crossentropy": 2.6078414916992188, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20979001373052597, "step": 4984 }, { "epoch": 0.1558125, "grad_norm": 3.6875, "grad_norm_var": 0.05384012858072917, "learning_rate": 0.0001, "loss": 6.1757, "loss/crossentropy": 2.5649302005767822, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19623598456382751, "step": 4986 }, { "epoch": 0.155875, "grad_norm": 3.5625, "grad_norm_var": 0.05051981608072917, "learning_rate": 0.0001, "loss": 6.3125, "loss/crossentropy": 2.5719149112701416, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21155717223882675, "step": 4988 }, { "epoch": 0.1559375, "grad_norm": 4.09375, "grad_norm_var": 0.058649698893229164, "learning_rate": 0.0001, "loss": 6.2429, "loss/crossentropy": 2.6092617511749268, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2024226039648056, "step": 4990 }, { "epoch": 0.156, "grad_norm": 4.3125, "grad_norm_var": 0.07617899576822916, "learning_rate": 0.0001, "loss": 6.5649, "loss/crossentropy": 2.692357659339905, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21616382896900177, "step": 4992 }, { "epoch": 0.1560625, "grad_norm": 3.53125, "grad_norm_var": 0.0810699462890625, "learning_rate": 0.0001, "loss": 6.5541, "loss/crossentropy": 2.740652918815613, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21416035294532776, "step": 4994 }, { "epoch": 0.156125, "grad_norm": 3.640625, "grad_norm_var": 0.0776763916015625, "learning_rate": 0.0001, "loss": 6.0529, "loss/crossentropy": 2.3745936155319214, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.1990818828344345, "step": 4996 }, { "epoch": 0.1561875, "grad_norm": 3.484375, "grad_norm_var": 0.07605794270833334, "learning_rate": 0.0001, "loss": 6.4075, "loss/crossentropy": 2.6630691289901733, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2076416164636612, "step": 4998 }, { "epoch": 0.15625, "grad_norm": 3.421875, "grad_norm_var": 0.061848958333333336, "learning_rate": 0.0001, "loss": 6.417, "loss/crossentropy": 2.7289209365844727, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20670168101787567, "step": 5000 }, { "epoch": 0.1563125, "grad_norm": 3.625, "grad_norm_var": 0.06197916666666667, "learning_rate": 0.0001, "loss": 6.3584, "loss/crossentropy": 2.621297597885132, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2092597633600235, "step": 5002 }, { "epoch": 0.156375, "grad_norm": 3.890625, "grad_norm_var": 0.12735087076822918, "learning_rate": 0.0001, "loss": 6.5332, "loss/crossentropy": 2.6060941219329834, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.2196609079837799, "step": 5004 }, { "epoch": 0.1564375, "grad_norm": 3.734375, "grad_norm_var": 0.12423502604166667, "learning_rate": 0.0001, "loss": 6.1773, "loss/crossentropy": 2.4523247480392456, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2115599289536476, "step": 5006 }, { "epoch": 0.1565, "grad_norm": 3.765625, "grad_norm_var": 0.09846903483072916, "learning_rate": 0.0001, "loss": 6.41, "loss/crossentropy": 2.568955659866333, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21886961162090302, "step": 5008 }, { "epoch": 0.1565625, "grad_norm": 3.484375, "grad_norm_var": 0.09791666666666667, "learning_rate": 0.0001, "loss": 6.6024, "loss/crossentropy": 2.722938656806946, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21763822436332703, "step": 5010 }, { "epoch": 0.156625, "grad_norm": 3.8125, "grad_norm_var": 0.10121968587239584, "learning_rate": 0.0001, "loss": 6.335, "loss/crossentropy": 2.561442732810974, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.207045778632164, "step": 5012 }, { "epoch": 0.1566875, "grad_norm": 4.625, "grad_norm_var": 0.14866129557291666, "learning_rate": 0.0001, "loss": 6.5044, "loss/crossentropy": 2.5820696353912354, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.22582750767469406, "step": 5014 }, { "epoch": 0.15675, "grad_norm": 6.03125, "grad_norm_var": 0.44075419108072916, "learning_rate": 0.0001, "loss": 6.296, "loss/crossentropy": 2.5115894079208374, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20851974934339523, "step": 5016 }, { "epoch": 0.1568125, "grad_norm": 3.9375, "grad_norm_var": 0.4282297770182292, "learning_rate": 0.0001, "loss": 6.2359, "loss/crossentropy": 2.5803922414779663, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.1971878930926323, "step": 5018 }, { "epoch": 0.156875, "grad_norm": 4.15625, "grad_norm_var": 1.7147420247395833, "learning_rate": 0.0001, "loss": 6.9218, "loss/crossentropy": 2.7316008806228638, "loss/hidden": 1.7890625, "loss/jsd": 0.0, "loss/logits": 0.24011238664388657, "step": 5020 }, { "epoch": 0.1569375, "grad_norm": 3.78125, "grad_norm_var": 1.6871907552083334, "learning_rate": 0.0001, "loss": 6.3541, "loss/crossentropy": 2.5209031105041504, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21574316918849945, "step": 5022 }, { "epoch": 0.157, "grad_norm": 3.734375, "grad_norm_var": 1.6809967041015625, "learning_rate": 0.0001, "loss": 6.4448, "loss/crossentropy": 2.6470454931259155, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21337257325649261, "step": 5024 }, { "epoch": 0.1570625, "grad_norm": 3.75, "grad_norm_var": 1.649006144205729, "learning_rate": 0.0001, "loss": 6.1475, "loss/crossentropy": 2.424108862876892, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2047611102461815, "step": 5026 }, { "epoch": 0.157125, "grad_norm": 3.546875, "grad_norm_var": 1.6721750895182292, "learning_rate": 0.0001, "loss": 6.2396, "loss/crossentropy": 2.609367609024048, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20130402594804764, "step": 5028 }, { "epoch": 0.1571875, "grad_norm": 3.65625, "grad_norm_var": 1.6649698893229166, "learning_rate": 0.0001, "loss": 6.5154, "loss/crossentropy": 2.649414300918579, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21628418564796448, "step": 5030 }, { "epoch": 0.15725, "grad_norm": 3.34375, "grad_norm_var": 1.4771443684895833, "learning_rate": 0.0001, "loss": 6.2335, "loss/crossentropy": 2.515960216522217, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20495706796646118, "step": 5032 }, { "epoch": 0.1573125, "grad_norm": 3.84375, "grad_norm_var": 1.50572509765625, "learning_rate": 0.0001, "loss": 6.6099, "loss/crossentropy": 2.7881150245666504, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2169434353709221, "step": 5034 }, { "epoch": 0.157375, "grad_norm": 3.40625, "grad_norm_var": 0.04423726399739583, "learning_rate": 0.0001, "loss": 6.3489, "loss/crossentropy": 2.5991029739379883, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20974069833755493, "step": 5036 }, { "epoch": 0.1574375, "grad_norm": 3.3125, "grad_norm_var": 0.05195210774739583, "learning_rate": 0.0001, "loss": 6.2484, "loss/crossentropy": 2.5991371870040894, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20281457901000977, "step": 5038 }, { "epoch": 0.1575, "grad_norm": 3.703125, "grad_norm_var": 0.048079427083333334, "learning_rate": 0.0001, "loss": 6.5411, "loss/crossentropy": 2.719546914100647, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21183981746435165, "step": 5040 }, { "epoch": 0.1575625, "grad_norm": 3.890625, "grad_norm_var": 0.043187459309895836, "learning_rate": 0.0001, "loss": 6.4863, "loss/crossentropy": 2.7351890802383423, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21221739053726196, "step": 5042 }, { "epoch": 0.157625, "grad_norm": 3.5625, "grad_norm_var": 0.0495758056640625, "learning_rate": 0.0001, "loss": 6.2946, "loss/crossentropy": 2.523933529853821, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2137833908200264, "step": 5044 }, { "epoch": 0.1576875, "grad_norm": 3.4375, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 6.0445, "loss/crossentropy": 2.4377795457839966, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19817539304494858, "step": 5046 }, { "epoch": 0.15775, "grad_norm": 3.96875, "grad_norm_var": 0.05283915201822917, "learning_rate": 0.0001, "loss": 5.9443, "loss/crossentropy": 2.4297016859054565, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.184275820851326, "step": 5048 }, { "epoch": 0.1578125, "grad_norm": 3.65625, "grad_norm_var": 0.04575907389322917, "learning_rate": 0.0001, "loss": 6.0538, "loss/crossentropy": 2.4386450052261353, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19511238485574722, "step": 5050 }, { "epoch": 0.157875, "grad_norm": 3.390625, "grad_norm_var": 0.06670633951822917, "learning_rate": 0.0001, "loss": 6.4905, "loss/crossentropy": 2.6591763496398926, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21672768890857697, "step": 5052 }, { "epoch": 0.1579375, "grad_norm": 3.71875, "grad_norm_var": 0.06217041015625, "learning_rate": 0.0001, "loss": 6.4167, "loss/crossentropy": 2.7084654569625854, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20637103915214539, "step": 5054 }, { "epoch": 0.158, "grad_norm": 3.375, "grad_norm_var": 0.0797027587890625, "learning_rate": 0.0001, "loss": 6.4931, "loss/crossentropy": 2.6809680461883545, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21324515342712402, "step": 5056 }, { "epoch": 0.1580625, "grad_norm": 4.0625, "grad_norm_var": 0.12431233723958333, "learning_rate": 0.0001, "loss": 6.6414, "loss/crossentropy": 2.652488112449646, "loss/hidden": 1.75390625, "loss/jsd": 0.0, "loss/logits": 0.22349800169467926, "step": 5058 }, { "epoch": 0.158125, "grad_norm": 3.46875, "grad_norm_var": 0.12433980305989584, "learning_rate": 0.0001, "loss": 6.3064, "loss/crossentropy": 2.605587124824524, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20680193603038788, "step": 5060 }, { "epoch": 0.1581875, "grad_norm": 3.5, "grad_norm_var": 0.12500712076822917, "learning_rate": 0.0001, "loss": 5.9345, "loss/crossentropy": 2.387421131134033, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19493824988603592, "step": 5062 }, { "epoch": 0.15825, "grad_norm": 3.84375, "grad_norm_var": 0.12653706868489584, "learning_rate": 0.0001, "loss": 6.6495, "loss/crossentropy": 2.7843059301376343, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.21698807924985886, "step": 5064 }, { "epoch": 0.1583125, "grad_norm": 3.6875, "grad_norm_var": 0.13125712076822918, "learning_rate": 0.0001, "loss": 6.1758, "loss/crossentropy": 2.5535526275634766, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19894598424434662, "step": 5066 }, { "epoch": 0.158375, "grad_norm": 3.9375, "grad_norm_var": 0.11034749348958334, "learning_rate": 0.0001, "loss": 6.1638, "loss/crossentropy": 2.449795961380005, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20772428065538406, "step": 5068 }, { "epoch": 0.1584375, "grad_norm": 4.125, "grad_norm_var": 0.10965067545572917, "learning_rate": 0.0001, "loss": 6.7174, "loss/crossentropy": 2.7422256469726562, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22837365418672562, "step": 5070 }, { "epoch": 0.1585, "grad_norm": 3.34375, "grad_norm_var": 0.10690104166666667, "learning_rate": 0.0001, "loss": 6.3429, "loss/crossentropy": 2.657220244407654, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20646213740110397, "step": 5072 }, { "epoch": 0.1585625, "grad_norm": 3.921875, "grad_norm_var": 0.074755859375, "learning_rate": 0.0001, "loss": 6.6273, "loss/crossentropy": 2.8043344020843506, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2178436666727066, "step": 5074 }, { "epoch": 0.158625, "grad_norm": 3.359375, "grad_norm_var": 0.07821858723958333, "learning_rate": 0.0001, "loss": 5.8211, "loss/crossentropy": 2.3485565185546875, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18475934863090515, "step": 5076 }, { "epoch": 0.1586875, "grad_norm": 3.28125, "grad_norm_var": 0.08338114420572916, "learning_rate": 0.0001, "loss": 6.1521, "loss/crossentropy": 2.469932198524475, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2014240026473999, "step": 5078 }, { "epoch": 0.15875, "grad_norm": 4.375, "grad_norm_var": 0.1011871337890625, "learning_rate": 0.0001, "loss": 6.8118, "loss/crossentropy": 2.7964015007019043, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.23201094567775726, "step": 5080 }, { "epoch": 0.1588125, "grad_norm": 3.609375, "grad_norm_var": 0.10099283854166667, "learning_rate": 0.0001, "loss": 6.3316, "loss/crossentropy": 2.656257152557373, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20347124338150024, "step": 5082 }, { "epoch": 0.158875, "grad_norm": 3.640625, "grad_norm_var": 0.10268452962239584, "learning_rate": 0.0001, "loss": 6.2924, "loss/crossentropy": 2.5780434608459473, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20854273438453674, "step": 5084 }, { "epoch": 0.1589375, "grad_norm": 3.9375, "grad_norm_var": 0.08918863932291667, "learning_rate": 0.0001, "loss": 6.2844, "loss/crossentropy": 2.5688424110412598, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2070987969636917, "step": 5086 }, { "epoch": 0.159, "grad_norm": 3.484375, "grad_norm_var": 0.081298828125, "learning_rate": 0.0001, "loss": 6.4102, "loss/crossentropy": 2.6852883100509644, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20686575770378113, "step": 5088 }, { "epoch": 0.1590625, "grad_norm": 3.75, "grad_norm_var": 0.28347981770833336, "learning_rate": 0.0001, "loss": 6.2168, "loss/crossentropy": 2.4014971256256104, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21551689505577087, "step": 5090 }, { "epoch": 0.159125, "grad_norm": 3.84375, "grad_norm_var": 0.2693684895833333, "learning_rate": 0.0001, "loss": 6.2445, "loss/crossentropy": 2.539799928665161, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2087467536330223, "step": 5092 }, { "epoch": 0.1591875, "grad_norm": 4.15625, "grad_norm_var": 0.25432942708333334, "learning_rate": 0.0001, "loss": 6.5523, "loss/crossentropy": 2.720983147621155, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21946312487125397, "step": 5094 }, { "epoch": 0.15925, "grad_norm": 3.625, "grad_norm_var": 0.23621419270833333, "learning_rate": 0.0001, "loss": 6.062, "loss/crossentropy": 2.464852809906006, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19800060987472534, "step": 5096 }, { "epoch": 0.1593125, "grad_norm": 3.921875, "grad_norm_var": 0.22683919270833333, "learning_rate": 0.0001, "loss": 6.3407, "loss/crossentropy": 2.5605088472366333, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21160940825939178, "step": 5098 }, { "epoch": 0.159375, "grad_norm": 4.0, "grad_norm_var": 0.22457275390625, "learning_rate": 0.0001, "loss": 6.0553, "loss/crossentropy": 2.394958972930908, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2047068253159523, "step": 5100 }, { "epoch": 0.1594375, "grad_norm": 3.515625, "grad_norm_var": 0.238427734375, "learning_rate": 0.0001, "loss": 6.1042, "loss/crossentropy": 2.592382788658142, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1890685334801674, "step": 5102 }, { "epoch": 0.1595, "grad_norm": 3.875, "grad_norm_var": 0.23775634765625, "learning_rate": 0.0001, "loss": 6.3744, "loss/crossentropy": 2.544821858406067, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2165486440062523, "step": 5104 }, { "epoch": 0.1595625, "grad_norm": 3.875, "grad_norm_var": 0.042023722330729166, "learning_rate": 0.0001, "loss": 6.4397, "loss/crossentropy": 2.6532318592071533, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20872486382722855, "step": 5106 }, { "epoch": 0.159625, "grad_norm": 3.625, "grad_norm_var": 0.042723592122395834, "learning_rate": 0.0001, "loss": 6.248, "loss/crossentropy": 2.629621386528015, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.1934749037027359, "step": 5108 }, { "epoch": 0.1596875, "grad_norm": 3.3125, "grad_norm_var": 0.0379791259765625, "learning_rate": 0.0001, "loss": 6.0829, "loss/crossentropy": 2.38829243183136, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20539657771587372, "step": 5110 }, { "epoch": 0.15975, "grad_norm": 3.96875, "grad_norm_var": 0.040934244791666664, "learning_rate": 0.0001, "loss": 6.3522, "loss/crossentropy": 2.6986857652664185, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.200894795358181, "step": 5112 }, { "epoch": 0.1598125, "grad_norm": 3.5, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 6.328, "loss/crossentropy": 2.610170006752014, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20928016304969788, "step": 5114 }, { "epoch": 0.159875, "grad_norm": 3.25, "grad_norm_var": 0.04309794108072917, "learning_rate": 0.0001, "loss": 6.1676, "loss/crossentropy": 2.5283877849578857, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2021999955177307, "step": 5116 }, { "epoch": 0.1599375, "grad_norm": 3.625, "grad_norm_var": 0.04107666015625, "learning_rate": 0.0001, "loss": 6.0732, "loss/crossentropy": 2.3941776752471924, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.1999344378709793, "step": 5118 }, { "epoch": 0.16, "grad_norm": 3.84375, "grad_norm_var": 0.04444071451822917, "learning_rate": 0.0001, "loss": 6.2364, "loss/crossentropy": 2.4882354736328125, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20841020345687866, "step": 5120 }, { "epoch": 0.1600625, "grad_norm": 3.9375, "grad_norm_var": 0.04726460774739583, "learning_rate": 0.0001, "loss": 6.0552, "loss/crossentropy": 2.391435742378235, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19997309893369675, "step": 5122 }, { "epoch": 0.160125, "grad_norm": 4.1875, "grad_norm_var": 0.06616109212239583, "learning_rate": 0.0001, "loss": 6.3829, "loss/crossentropy": 2.4768353700637817, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.218342125415802, "step": 5124 }, { "epoch": 0.1601875, "grad_norm": 3.953125, "grad_norm_var": 0.060301717122395834, "learning_rate": 0.0001, "loss": 6.5424, "loss/crossentropy": 2.626850724220276, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.22007066011428833, "step": 5126 }, { "epoch": 0.16025, "grad_norm": 4.0625, "grad_norm_var": 0.07681884765625, "learning_rate": 0.0001, "loss": 6.2302, "loss/crossentropy": 2.40298068523407, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21827350556850433, "step": 5128 }, { "epoch": 0.1603125, "grad_norm": 3.640625, "grad_norm_var": 0.0725494384765625, "learning_rate": 0.0001, "loss": 6.5084, "loss/crossentropy": 2.6761317253112793, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21799086779356003, "step": 5130 }, { "epoch": 0.160375, "grad_norm": 4.375, "grad_norm_var": 0.061432902018229166, "learning_rate": 0.0001, "loss": 6.4369, "loss/crossentropy": 2.5701708793640137, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.22065787762403488, "step": 5132 }, { "epoch": 0.1604375, "grad_norm": 3.640625, "grad_norm_var": 0.05025634765625, "learning_rate": 0.0001, "loss": 6.3835, "loss/crossentropy": 2.5875306129455566, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2104513794183731, "step": 5134 }, { "epoch": 0.1605, "grad_norm": 3.609375, "grad_norm_var": 0.05621337890625, "learning_rate": 0.0001, "loss": 6.4553, "loss/crossentropy": 2.675415873527527, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20962948352098465, "step": 5136 }, { "epoch": 0.1605625, "grad_norm": 4.0625, "grad_norm_var": 0.0486328125, "learning_rate": 0.0001, "loss": 6.6254, "loss/crossentropy": 2.7770951986312866, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2231074795126915, "step": 5138 }, { "epoch": 0.160625, "grad_norm": 4.15625, "grad_norm_var": 0.0473297119140625, "learning_rate": 0.0001, "loss": 6.4635, "loss/crossentropy": 2.665207266807556, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21224810183048248, "step": 5140 }, { "epoch": 0.1606875, "grad_norm": 3.921875, "grad_norm_var": 0.07489827473958334, "learning_rate": 0.0001, "loss": 6.5802, "loss/crossentropy": 2.6273059844970703, "loss/hidden": 1.765625, "loss/jsd": 0.0, "loss/logits": 0.2187241166830063, "step": 5142 }, { "epoch": 0.16075, "grad_norm": 3.625, "grad_norm_var": 0.07668863932291667, "learning_rate": 0.0001, "loss": 6.4066, "loss/crossentropy": 2.621706962585449, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.2089575156569481, "step": 5144 }, { "epoch": 0.1608125, "grad_norm": 3.390625, "grad_norm_var": 0.09207255045572917, "learning_rate": 0.0001, "loss": 6.0105, "loss/crossentropy": 2.5059956312179565, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18990148603916168, "step": 5146 }, { "epoch": 0.160875, "grad_norm": 4.0, "grad_norm_var": 0.08333231608072916, "learning_rate": 0.0001, "loss": 6.1947, "loss/crossentropy": 2.434074878692627, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20887398719787598, "step": 5148 }, { "epoch": 0.1609375, "grad_norm": 3.953125, "grad_norm_var": 0.11112874348958333, "learning_rate": 0.0001, "loss": 6.348, "loss/crossentropy": 2.608154535293579, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2048449069261551, "step": 5150 }, { "epoch": 0.161, "grad_norm": 3.578125, "grad_norm_var": 0.12499593098958334, "learning_rate": 0.0001, "loss": 6.1689, "loss/crossentropy": 2.5595574378967285, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1984364241361618, "step": 5152 }, { "epoch": 0.1610625, "grad_norm": 3.578125, "grad_norm_var": 0.1310699462890625, "learning_rate": 0.0001, "loss": 6.2864, "loss/crossentropy": 2.62160587310791, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2051507607102394, "step": 5154 }, { "epoch": 0.161125, "grad_norm": 4.09375, "grad_norm_var": 0.18584696451822916, "learning_rate": 0.0001, "loss": 6.5913, "loss/crossentropy": 2.7094404697418213, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.2155286818742752, "step": 5156 }, { "epoch": 0.1611875, "grad_norm": 3.75, "grad_norm_var": 0.1341217041015625, "learning_rate": 0.0001, "loss": 6.2557, "loss/crossentropy": 2.498765468597412, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21201671659946442, "step": 5158 }, { "epoch": 0.16125, "grad_norm": 3.6875, "grad_norm_var": 0.16846415201822917, "learning_rate": 0.0001, "loss": 6.4407, "loss/crossentropy": 2.5842690467834473, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21884340792894363, "step": 5160 }, { "epoch": 0.1613125, "grad_norm": 3.625, "grad_norm_var": 0.16340230305989584, "learning_rate": 0.0001, "loss": 6.2249, "loss/crossentropy": 2.6013705730438232, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2014119178056717, "step": 5162 }, { "epoch": 0.161375, "grad_norm": 3.890625, "grad_norm_var": 0.16494038899739583, "learning_rate": 0.0001, "loss": 6.4383, "loss/crossentropy": 2.6550703048706055, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21465323865413666, "step": 5164 }, { "epoch": 0.1614375, "grad_norm": 3.9375, "grad_norm_var": 0.1437896728515625, "learning_rate": 0.0001, "loss": 6.5286, "loss/crossentropy": 2.627516508102417, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2197948545217514, "step": 5166 }, { "epoch": 0.1615, "grad_norm": 4.25, "grad_norm_var": 0.13238525390625, "learning_rate": 0.0001, "loss": 6.5927, "loss/crossentropy": 2.7336513996124268, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21871402859687805, "step": 5168 }, { "epoch": 0.1615625, "grad_norm": 3.875, "grad_norm_var": 0.10852762858072916, "learning_rate": 0.0001, "loss": 6.4115, "loss/crossentropy": 2.640623927116394, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.20834068953990936, "step": 5170 }, { "epoch": 0.161625, "grad_norm": 3.96875, "grad_norm_var": 0.07009175618489584, "learning_rate": 0.0001, "loss": 6.6146, "loss/crossentropy": 2.7761088609695435, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21548492461442947, "step": 5172 }, { "epoch": 0.1616875, "grad_norm": 3.65625, "grad_norm_var": 0.07420247395833333, "learning_rate": 0.0001, "loss": 6.3679, "loss/crossentropy": 2.681834101676941, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20571385324001312, "step": 5174 }, { "epoch": 0.16175, "grad_norm": 3.640625, "grad_norm_var": 0.054743448893229164, "learning_rate": 0.0001, "loss": 6.4309, "loss/crossentropy": 2.570289731025696, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21769776940345764, "step": 5176 }, { "epoch": 0.1618125, "grad_norm": 3.625, "grad_norm_var": 0.06123758951822917, "learning_rate": 0.0001, "loss": 6.4081, "loss/crossentropy": 2.6298660039901733, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20868495106697083, "step": 5178 }, { "epoch": 0.161875, "grad_norm": 3.859375, "grad_norm_var": 0.05553385416666667, "learning_rate": 0.0001, "loss": 6.1179, "loss/crossentropy": 2.452068328857422, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20329831540584564, "step": 5180 }, { "epoch": 0.1619375, "grad_norm": 3.71875, "grad_norm_var": 0.05422770182291667, "learning_rate": 0.0001, "loss": 6.2349, "loss/crossentropy": 2.5371674299240112, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20415294915437698, "step": 5182 }, { "epoch": 0.162, "grad_norm": 3.984375, "grad_norm_var": 0.07447916666666667, "learning_rate": 0.0001, "loss": 6.5218, "loss/crossentropy": 2.6421661376953125, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21570029109716415, "step": 5184 }, { "epoch": 0.1620625, "grad_norm": 3.703125, "grad_norm_var": 0.07503153483072916, "learning_rate": 0.0001, "loss": 6.3243, "loss/crossentropy": 2.581998825073242, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.208997443318367, "step": 5186 }, { "epoch": 0.162125, "grad_norm": 3.703125, "grad_norm_var": 0.068115234375, "learning_rate": 0.0001, "loss": 6.564, "loss/crossentropy": 2.727616548538208, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21722905337810516, "step": 5188 }, { "epoch": 0.1621875, "grad_norm": 3.640625, "grad_norm_var": 0.0645172119140625, "learning_rate": 0.0001, "loss": 6.8096, "loss/crossentropy": 2.9005852937698364, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22176381200551987, "step": 5190 }, { "epoch": 0.16225, "grad_norm": 3.46875, "grad_norm_var": 0.07733968098958334, "learning_rate": 0.0001, "loss": 6.0591, "loss/crossentropy": 2.4665474891662598, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19909808039665222, "step": 5192 }, { "epoch": 0.1623125, "grad_norm": 3.703125, "grad_norm_var": 0.06539306640625, "learning_rate": 0.0001, "loss": 6.5161, "loss/crossentropy": 2.7396236658096313, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21124304085969925, "step": 5194 }, { "epoch": 0.162375, "grad_norm": 3.78125, "grad_norm_var": 0.06278889973958333, "learning_rate": 0.0001, "loss": 6.2922, "loss/crossentropy": 2.588027238845825, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20713649690151215, "step": 5196 }, { "epoch": 0.1624375, "grad_norm": 3.390625, "grad_norm_var": 0.21730855305989583, "learning_rate": 0.0001, "loss": 6.4155, "loss/crossentropy": 2.5007861852645874, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22232603281736374, "step": 5198 }, { "epoch": 0.1625, "grad_norm": 3.5, "grad_norm_var": 0.18430989583333332, "learning_rate": 0.0001, "loss": 6.3138, "loss/crossentropy": 2.7006181478500366, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19842666387557983, "step": 5200 }, { "epoch": 0.1625625, "grad_norm": 3.578125, "grad_norm_var": 0.37224019368489586, "learning_rate": 0.0001, "loss": 6.209, "loss/crossentropy": 2.4328094720840454, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.20613687485456467, "step": 5202 }, { "epoch": 0.162625, "grad_norm": 3.484375, "grad_norm_var": 0.37862955729166664, "learning_rate": 0.0001, "loss": 6.245, "loss/crossentropy": 2.5239570140838623, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20570280402898788, "step": 5204 }, { "epoch": 0.1626875, "grad_norm": 3.78125, "grad_norm_var": 0.3766103108723958, "learning_rate": 0.0001, "loss": 6.2682, "loss/crossentropy": 2.5990031957626343, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20051166415214539, "step": 5206 }, { "epoch": 0.16275, "grad_norm": 3.4375, "grad_norm_var": 0.3717447916666667, "learning_rate": 0.0001, "loss": 5.9836, "loss/crossentropy": 2.4379926919937134, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19362393021583557, "step": 5208 }, { "epoch": 0.1628125, "grad_norm": 3.9375, "grad_norm_var": 0.3742472330729167, "learning_rate": 0.0001, "loss": 6.0679, "loss/crossentropy": 2.390724301338196, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20013543963432312, "step": 5210 }, { "epoch": 0.162875, "grad_norm": 3.671875, "grad_norm_var": 0.3865559895833333, "learning_rate": 0.0001, "loss": 6.5876, "loss/crossentropy": 2.7420979738235474, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21463260054588318, "step": 5212 }, { "epoch": 0.1629375, "grad_norm": 3.984375, "grad_norm_var": 0.24397684733072916, "learning_rate": 0.0001, "loss": 6.2138, "loss/crossentropy": 2.4656132459640503, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20880325138568878, "step": 5214 }, { "epoch": 0.163, "grad_norm": 3.484375, "grad_norm_var": 0.25559895833333335, "learning_rate": 0.0001, "loss": 6.0856, "loss/crossentropy": 2.4168217182159424, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19968654215335846, "step": 5216 }, { "epoch": 0.1630625, "grad_norm": 3.875, "grad_norm_var": 0.08619791666666667, "learning_rate": 0.0001, "loss": 6.3584, "loss/crossentropy": 2.638114094734192, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20875079184770584, "step": 5218 }, { "epoch": 0.163125, "grad_norm": 3.703125, "grad_norm_var": 0.08138020833333333, "learning_rate": 0.0001, "loss": 6.441, "loss/crossentropy": 2.6618345975875854, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21033572405576706, "step": 5220 }, { "epoch": 0.1631875, "grad_norm": 3.921875, "grad_norm_var": 0.08289388020833334, "learning_rate": 0.0001, "loss": 6.6087, "loss/crossentropy": 2.7304168939590454, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.216347374022007, "step": 5222 }, { "epoch": 0.16325, "grad_norm": 3.3125, "grad_norm_var": 0.08680013020833334, "learning_rate": 0.0001, "loss": 5.8067, "loss/crossentropy": 2.2770395278930664, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19398200511932373, "step": 5224 }, { "epoch": 0.1633125, "grad_norm": 4.1875, "grad_norm_var": 0.16366780598958333, "learning_rate": 0.0001, "loss": 6.5923, "loss/crossentropy": 2.704433560371399, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22121261805295944, "step": 5226 }, { "epoch": 0.163375, "grad_norm": 3.984375, "grad_norm_var": 0.16862691243489583, "learning_rate": 0.0001, "loss": 6.3544, "loss/crossentropy": 2.5782910585403442, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2112007588148117, "step": 5228 }, { "epoch": 0.1634375, "grad_norm": 3.8125, "grad_norm_var": 0.17488606770833334, "learning_rate": 0.0001, "loss": 6.3675, "loss/crossentropy": 2.6073272228240967, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21234365552663803, "step": 5230 }, { "epoch": 0.1635, "grad_norm": 3.828125, "grad_norm_var": 0.15170796712239584, "learning_rate": 0.0001, "loss": 6.3838, "loss/crossentropy": 2.715991497039795, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20349497348070145, "step": 5232 }, { "epoch": 0.1635625, "grad_norm": 4.1875, "grad_norm_var": 0.15120442708333334, "learning_rate": 0.0001, "loss": 6.2745, "loss/crossentropy": 2.5514438152313232, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.205508753657341, "step": 5234 }, { "epoch": 0.163625, "grad_norm": 3.4375, "grad_norm_var": 0.1604156494140625, "learning_rate": 0.0001, "loss": 6.3004, "loss/crossentropy": 2.616993546485901, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20662517845630646, "step": 5236 }, { "epoch": 0.1636875, "grad_norm": 3.75, "grad_norm_var": 0.159228515625, "learning_rate": 0.0001, "loss": 6.4741, "loss/crossentropy": 2.6319591999053955, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21546736359596252, "step": 5238 }, { "epoch": 0.16375, "grad_norm": 3.59375, "grad_norm_var": 0.146484375, "learning_rate": 0.0001, "loss": 6.5345, "loss/crossentropy": 2.6985357999801636, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21602191030979156, "step": 5240 }, { "epoch": 0.1638125, "grad_norm": 4.21875, "grad_norm_var": 0.07890523274739583, "learning_rate": 0.0001, "loss": 6.5487, "loss/crossentropy": 2.69356632232666, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2179398387670517, "step": 5242 }, { "epoch": 0.163875, "grad_norm": 3.421875, "grad_norm_var": 0.06830952962239584, "learning_rate": 0.0001, "loss": 6.4129, "loss/crossentropy": 2.6797508001327515, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20768603682518005, "step": 5244 }, { "epoch": 0.1639375, "grad_norm": 3.65625, "grad_norm_var": 0.0629058837890625, "learning_rate": 0.0001, "loss": 6.3771, "loss/crossentropy": 2.670736074447632, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20345058292150497, "step": 5246 }, { "epoch": 0.164, "grad_norm": 3.4375, "grad_norm_var": 0.06568603515625, "learning_rate": 0.0001, "loss": 6.4829, "loss/crossentropy": 2.721261501312256, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21287819743156433, "step": 5248 }, { "epoch": 0.1640625, "grad_norm": 3.671875, "grad_norm_var": 0.04943033854166667, "learning_rate": 0.0001, "loss": 6.2365, "loss/crossentropy": 2.5345875024795532, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20651615411043167, "step": 5250 }, { "epoch": 0.164125, "grad_norm": 3.421875, "grad_norm_var": 0.0500152587890625, "learning_rate": 0.0001, "loss": 6.1669, "loss/crossentropy": 2.5575015544891357, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.198830708861351, "step": 5252 }, { "epoch": 0.1641875, "grad_norm": 3.6875, "grad_norm_var": 0.050446573893229166, "learning_rate": 0.0001, "loss": 6.2607, "loss/crossentropy": 2.505549192428589, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2075466513633728, "step": 5254 }, { "epoch": 0.16425, "grad_norm": 3.9375, "grad_norm_var": 0.051667277018229166, "learning_rate": 0.0001, "loss": 6.5641, "loss/crossentropy": 2.744532585144043, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2194598913192749, "step": 5256 }, { "epoch": 0.1643125, "grad_norm": 3.671875, "grad_norm_var": 0.028511555989583333, "learning_rate": 0.0001, "loss": 6.049, "loss/crossentropy": 2.42891788482666, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20029054582118988, "step": 5258 }, { "epoch": 0.164375, "grad_norm": 3.59375, "grad_norm_var": 0.027750651041666668, "learning_rate": 0.0001, "loss": 6.0197, "loss/crossentropy": 2.432560443878174, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19933748990297318, "step": 5260 }, { "epoch": 0.1644375, "grad_norm": 3.96875, "grad_norm_var": 0.03818359375, "learning_rate": 0.0001, "loss": 6.2531, "loss/crossentropy": 2.4191304445266724, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.21308691799640656, "step": 5262 }, { "epoch": 0.1645, "grad_norm": 3.75, "grad_norm_var": 0.038834635416666666, "learning_rate": 0.0001, "loss": 6.062, "loss/crossentropy": 2.460596442222595, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19295437633991241, "step": 5264 }, { "epoch": 0.1645625, "grad_norm": 3.4375, "grad_norm_var": 0.04149983723958333, "learning_rate": 0.0001, "loss": 6.1671, "loss/crossentropy": 2.5348631143569946, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20229025930166245, "step": 5266 }, { "epoch": 0.164625, "grad_norm": 4.03125, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 5.7182, "loss/crossentropy": 2.207819700241089, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18619763106107712, "step": 5268 }, { "epoch": 0.1646875, "grad_norm": 3.703125, "grad_norm_var": 0.04625651041666667, "learning_rate": 0.0001, "loss": 6.2923, "loss/crossentropy": 2.5302654504776, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21252676844596863, "step": 5270 }, { "epoch": 0.16475, "grad_norm": 3.453125, "grad_norm_var": 0.0417144775390625, "learning_rate": 0.0001, "loss": 5.8896, "loss/crossentropy": 2.3639479875564575, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.1896733045578003, "step": 5272 }, { "epoch": 0.1648125, "grad_norm": 3.78125, "grad_norm_var": 0.03972066243489583, "learning_rate": 0.0001, "loss": 6.4644, "loss/crossentropy": 2.6995153427124023, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.2116451859474182, "step": 5274 }, { "epoch": 0.164875, "grad_norm": 5.1875, "grad_norm_var": 0.18012593587239584, "learning_rate": 0.0001, "loss": 6.0782, "loss/crossentropy": 2.4377158880233765, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20232883840799332, "step": 5276 }, { "epoch": 0.1649375, "grad_norm": 3.953125, "grad_norm_var": 0.18073628743489584, "learning_rate": 0.0001, "loss": 6.1884, "loss/crossentropy": 2.6104766130447388, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19763468205928802, "step": 5278 }, { "epoch": 0.165, "grad_norm": 3.765625, "grad_norm_var": 0.16936442057291667, "learning_rate": 0.0001, "loss": 6.457, "loss/crossentropy": 2.747946619987488, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20879191160202026, "step": 5280 }, { "epoch": 0.1650625, "grad_norm": 3.765625, "grad_norm_var": 0.16105855305989583, "learning_rate": 0.0001, "loss": 6.2794, "loss/crossentropy": 2.5800029039382935, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20665637403726578, "step": 5282 }, { "epoch": 0.165125, "grad_norm": 3.40625, "grad_norm_var": 0.1702056884765625, "learning_rate": 0.0001, "loss": 6.0868, "loss/crossentropy": 2.5235323905944824, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19108830392360687, "step": 5284 }, { "epoch": 0.1651875, "grad_norm": 3.75, "grad_norm_var": 0.17115478515625, "learning_rate": 0.0001, "loss": 6.2818, "loss/crossentropy": 2.5826101303100586, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20624583214521408, "step": 5286 }, { "epoch": 0.16525, "grad_norm": 3.484375, "grad_norm_var": 0.17464192708333334, "learning_rate": 0.0001, "loss": 6.1176, "loss/crossentropy": 2.4823321104049683, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20063431560993195, "step": 5288 }, { "epoch": 0.1653125, "grad_norm": 3.515625, "grad_norm_var": 0.18733622233072916, "learning_rate": 0.0001, "loss": 6.4095, "loss/crossentropy": 2.626147985458374, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21544121205806732, "step": 5290 }, { "epoch": 0.165375, "grad_norm": 3.046875, "grad_norm_var": 0.06989644368489584, "learning_rate": 0.0001, "loss": 6.0643, "loss/crossentropy": 2.466831922531128, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19450916349887848, "step": 5292 }, { "epoch": 0.1654375, "grad_norm": 4.03125, "grad_norm_var": 0.074072265625, "learning_rate": 0.0001, "loss": 6.5792, "loss/crossentropy": 2.6792465448379517, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.22397875040769577, "step": 5294 }, { "epoch": 0.1655, "grad_norm": 3.78125, "grad_norm_var": 0.07503153483072916, "learning_rate": 0.0001, "loss": 6.2, "loss/crossentropy": 2.5096585750579834, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20536701381206512, "step": 5296 }, { "epoch": 0.1655625, "grad_norm": 4.0, "grad_norm_var": 0.08166910807291666, "learning_rate": 0.0001, "loss": 6.4394, "loss/crossentropy": 2.628980278968811, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2122943252325058, "step": 5298 }, { "epoch": 0.165625, "grad_norm": 3.28125, "grad_norm_var": 0.08350321451822916, "learning_rate": 0.0001, "loss": 5.8707, "loss/crossentropy": 2.3820395469665527, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18636663258075714, "step": 5300 }, { "epoch": 0.1656875, "grad_norm": 3.4375, "grad_norm_var": 0.08725484212239583, "learning_rate": 0.0001, "loss": 6.0521, "loss/crossentropy": 2.4212061166763306, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.19551438838243484, "step": 5302 }, { "epoch": 0.16575, "grad_norm": 3.765625, "grad_norm_var": 0.09051106770833334, "learning_rate": 0.0001, "loss": 6.2196, "loss/crossentropy": 2.6344507932662964, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19992156326770782, "step": 5304 }, { "epoch": 0.1658125, "grad_norm": 4.03125, "grad_norm_var": 0.08406575520833333, "learning_rate": 0.0001, "loss": 6.1123, "loss/crossentropy": 2.4072346687316895, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2044946402311325, "step": 5306 }, { "epoch": 0.165875, "grad_norm": 3.609375, "grad_norm_var": 0.06939697265625, "learning_rate": 0.0001, "loss": 5.9202, "loss/crossentropy": 2.3986281156539917, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18965810537338257, "step": 5308 }, { "epoch": 0.1659375, "grad_norm": 3.5625, "grad_norm_var": 0.05939839680989583, "learning_rate": 0.0001, "loss": 6.329, "loss/crossentropy": 2.6033272743225098, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.21045982837677002, "step": 5310 }, { "epoch": 0.166, "grad_norm": 3.75, "grad_norm_var": 0.05845947265625, "learning_rate": 0.0001, "loss": 6.139, "loss/crossentropy": 2.564116358757019, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1988985240459442, "step": 5312 }, { "epoch": 0.1660625, "grad_norm": 4.03125, "grad_norm_var": 0.06773173014322917, "learning_rate": 0.0001, "loss": 6.5924, "loss/crossentropy": 2.6761828660964966, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2252115160226822, "step": 5314 }, { "epoch": 0.166125, "grad_norm": 3.59375, "grad_norm_var": 0.06974283854166667, "learning_rate": 0.0001, "loss": 6.094, "loss/crossentropy": 2.426016926765442, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2011687308549881, "step": 5316 }, { "epoch": 0.1661875, "grad_norm": 3.9375, "grad_norm_var": 0.06330464680989584, "learning_rate": 0.0001, "loss": 6.3091, "loss/crossentropy": 2.5874863862991333, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.204196497797966, "step": 5318 }, { "epoch": 0.16625, "grad_norm": 5.84375, "grad_norm_var": 0.32942606608072916, "learning_rate": 0.0001, "loss": 6.6528, "loss/crossentropy": 2.6687638759613037, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2292676791548729, "step": 5320 }, { "epoch": 0.1663125, "grad_norm": 3.71875, "grad_norm_var": 0.35510660807291666, "learning_rate": 0.0001, "loss": 6.4089, "loss/crossentropy": 2.679196834564209, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20734111219644547, "step": 5322 }, { "epoch": 0.166375, "grad_norm": 3.546875, "grad_norm_var": 0.33088785807291665, "learning_rate": 0.0001, "loss": 6.3873, "loss/crossentropy": 2.6827419996261597, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20405340194702148, "step": 5324 }, { "epoch": 0.1664375, "grad_norm": 3.609375, "grad_norm_var": 0.35852457682291666, "learning_rate": 0.0001, "loss": 6.5107, "loss/crossentropy": 2.693418025970459, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21258943527936935, "step": 5326 }, { "epoch": 0.1665, "grad_norm": 3.59375, "grad_norm_var": 0.394287109375, "learning_rate": 0.0001, "loss": 6.0726, "loss/crossentropy": 2.532777190208435, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1934373378753662, "step": 5328 }, { "epoch": 0.1665625, "grad_norm": 3.734375, "grad_norm_var": 0.40081380208333334, "learning_rate": 0.0001, "loss": 6.4014, "loss/crossentropy": 2.661211609840393, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20996029675006866, "step": 5330 }, { "epoch": 0.166625, "grad_norm": 4.25, "grad_norm_var": 0.4056630452473958, "learning_rate": 0.0001, "loss": 6.4706, "loss/crossentropy": 2.697288155555725, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2132657915353775, "step": 5332 }, { "epoch": 0.1666875, "grad_norm": 4.0, "grad_norm_var": 0.415185546875, "learning_rate": 0.0001, "loss": 6.4788, "loss/crossentropy": 2.6174012422561646, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21699899435043335, "step": 5334 }, { "epoch": 0.16675, "grad_norm": 4.09375, "grad_norm_var": 0.1821197509765625, "learning_rate": 0.0001, "loss": 6.0464, "loss/crossentropy": 2.4147592782974243, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2037886083126068, "step": 5336 }, { "epoch": 0.1668125, "grad_norm": 3.546875, "grad_norm_var": 0.15432535807291667, "learning_rate": 0.0001, "loss": 6.4814, "loss/crossentropy": 2.7996264696121216, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20528528094291687, "step": 5338 }, { "epoch": 0.166875, "grad_norm": 5.625, "grad_norm_var": 0.34579671223958336, "learning_rate": 0.0001, "loss": 6.5194, "loss/crossentropy": 2.6324565410614014, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.220335453748703, "step": 5340 }, { "epoch": 0.1669375, "grad_norm": 3.703125, "grad_norm_var": 0.30339253743489586, "learning_rate": 0.0001, "loss": 6.2926, "loss/crossentropy": 2.5119271278381348, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20970982313156128, "step": 5342 }, { "epoch": 0.167, "grad_norm": 4.03125, "grad_norm_var": 0.2597564697265625, "learning_rate": 0.0001, "loss": 6.6131, "loss/crossentropy": 2.618421196937561, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.23266763985157013, "step": 5344 }, { "epoch": 0.1670625, "grad_norm": 4.0625, "grad_norm_var": 0.2544097900390625, "learning_rate": 0.0001, "loss": 6.7084, "loss/crossentropy": 2.812577247619629, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22004784643650055, "step": 5346 }, { "epoch": 0.167125, "grad_norm": 4.0625, "grad_norm_var": 0.2555491129557292, "learning_rate": 0.0001, "loss": 6.5053, "loss/crossentropy": 2.6363308429718018, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2201043888926506, "step": 5348 }, { "epoch": 0.1671875, "grad_norm": 4.03125, "grad_norm_var": 0.24338785807291666, "learning_rate": 0.0001, "loss": 6.0258, "loss/crossentropy": 2.3339359760284424, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20707960426807404, "step": 5350 }, { "epoch": 0.16725, "grad_norm": 3.671875, "grad_norm_var": 0.24612223307291667, "learning_rate": 0.0001, "loss": 6.2252, "loss/crossentropy": 2.5705204010009766, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20062048733234406, "step": 5352 }, { "epoch": 0.1673125, "grad_norm": 3.484375, "grad_norm_var": 0.2734039306640625, "learning_rate": 0.0001, "loss": 6.248, "loss/crossentropy": 2.613363027572632, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20096845924854279, "step": 5354 }, { "epoch": 0.167375, "grad_norm": 3.375, "grad_norm_var": 0.07454020182291667, "learning_rate": 0.0001, "loss": 6.1435, "loss/crossentropy": 2.6340157985687256, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19157454371452332, "step": 5356 }, { "epoch": 0.1674375, "grad_norm": 3.75, "grad_norm_var": 0.12720947265625, "learning_rate": 0.0001, "loss": 6.2857, "loss/crossentropy": 2.5772162675857544, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20639867335557938, "step": 5358 }, { "epoch": 0.1675, "grad_norm": 3.328125, "grad_norm_var": 0.12574462890625, "learning_rate": 0.0001, "loss": 6.0702, "loss/crossentropy": 2.4975491762161255, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19788971543312073, "step": 5360 }, { "epoch": 0.1675625, "grad_norm": 4.5625, "grad_norm_var": 0.1691802978515625, "learning_rate": 0.0001, "loss": 6.5493, "loss/crossentropy": 2.58541202545166, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2241220772266388, "step": 5362 }, { "epoch": 0.167625, "grad_norm": 3.875, "grad_norm_var": 0.16763916015625, "learning_rate": 0.0001, "loss": 6.3549, "loss/crossentropy": 2.555784583091736, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20998644083738327, "step": 5364 }, { "epoch": 0.1676875, "grad_norm": 3.28125, "grad_norm_var": 0.1792144775390625, "learning_rate": 0.0001, "loss": 6.1039, "loss/crossentropy": 2.540518641471863, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19383682310581207, "step": 5366 }, { "epoch": 0.16775, "grad_norm": 3.40625, "grad_norm_var": 0.18572489420572916, "learning_rate": 0.0001, "loss": 6.1902, "loss/crossentropy": 2.560936450958252, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2035495787858963, "step": 5368 }, { "epoch": 0.1678125, "grad_norm": 3.734375, "grad_norm_var": 0.17398173014322918, "learning_rate": 0.0001, "loss": 6.389, "loss/crossentropy": 2.6606903076171875, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20720500499010086, "step": 5370 }, { "epoch": 0.167875, "grad_norm": 3.59375, "grad_norm_var": 0.16502278645833332, "learning_rate": 0.0001, "loss": 6.5097, "loss/crossentropy": 2.69782817363739, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21634666621685028, "step": 5372 }, { "epoch": 0.1679375, "grad_norm": 3.640625, "grad_norm_var": 0.10647684733072917, "learning_rate": 0.0001, "loss": 6.4707, "loss/crossentropy": 2.744077205657959, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20859896391630173, "step": 5374 }, { "epoch": 0.168, "grad_norm": 3.65625, "grad_norm_var": 0.17427469889322916, "learning_rate": 0.0001, "loss": 6.645, "loss/crossentropy": 2.8004655838012695, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.21219252049922943, "step": 5376 }, { "epoch": 0.1680625, "grad_norm": 4.0, "grad_norm_var": 0.13902079264322917, "learning_rate": 0.0001, "loss": 6.3748, "loss/crossentropy": 2.616547465324402, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20785995572805405, "step": 5378 }, { "epoch": 0.168125, "grad_norm": 3.65625, "grad_norm_var": 0.12819010416666668, "learning_rate": 0.0001, "loss": 6.1065, "loss/crossentropy": 2.487843632698059, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19858013838529587, "step": 5380 }, { "epoch": 0.1681875, "grad_norm": 3.5, "grad_norm_var": 0.13079020182291667, "learning_rate": 0.0001, "loss": 6.2677, "loss/crossentropy": 2.6592295169830322, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20108597725629807, "step": 5382 }, { "epoch": 0.16825, "grad_norm": 3.65625, "grad_norm_var": 0.1249664306640625, "learning_rate": 0.0001, "loss": 6.2955, "loss/crossentropy": 2.5784114599227905, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20608504116535187, "step": 5384 }, { "epoch": 0.1683125, "grad_norm": 3.46875, "grad_norm_var": 0.13178609212239584, "learning_rate": 0.0001, "loss": 6.3715, "loss/crossentropy": 2.714709997177124, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20435550063848495, "step": 5386 }, { "epoch": 0.168375, "grad_norm": 3.453125, "grad_norm_var": 0.14839579264322916, "learning_rate": 0.0001, "loss": 6.3324, "loss/crossentropy": 2.7200835943222046, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20146213471889496, "step": 5388 }, { "epoch": 0.1684375, "grad_norm": 3.59375, "grad_norm_var": 0.14973958333333334, "learning_rate": 0.0001, "loss": 6.4809, "loss/crossentropy": 2.7606624364852905, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2099115252494812, "step": 5390 }, { "epoch": 0.1685, "grad_norm": 3.640625, "grad_norm_var": 0.03892822265625, "learning_rate": 0.0001, "loss": 6.3505, "loss/crossentropy": 2.65522837638855, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.203899547457695, "step": 5392 }, { "epoch": 0.1685625, "grad_norm": 3.765625, "grad_norm_var": 0.0453033447265625, "learning_rate": 0.0001, "loss": 6.5379, "loss/crossentropy": 2.6739238500595093, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2195974886417389, "step": 5394 }, { "epoch": 0.168625, "grad_norm": 3.4375, "grad_norm_var": 0.04299723307291667, "learning_rate": 0.0001, "loss": 6.0921, "loss/crossentropy": 2.4353628158569336, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2047354057431221, "step": 5396 }, { "epoch": 0.1686875, "grad_norm": 3.828125, "grad_norm_var": 0.05764058430989583, "learning_rate": 0.0001, "loss": 6.3689, "loss/crossentropy": 2.521478533744812, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2144312784075737, "step": 5398 }, { "epoch": 0.16875, "grad_norm": 3.625, "grad_norm_var": 0.05908915201822917, "learning_rate": 0.0001, "loss": 6.0522, "loss/crossentropy": 2.4441404342651367, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19596174359321594, "step": 5400 }, { "epoch": 0.1688125, "grad_norm": 3.5625, "grad_norm_var": 0.05773111979166667, "learning_rate": 0.0001, "loss": 6.3178, "loss/crossentropy": 2.64383864402771, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20528355240821838, "step": 5402 }, { "epoch": 0.168875, "grad_norm": 4.1875, "grad_norm_var": 0.06798502604166666, "learning_rate": 0.0001, "loss": 6.4363, "loss/crossentropy": 2.61694872379303, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21474573761224747, "step": 5404 }, { "epoch": 0.1689375, "grad_norm": 3.625, "grad_norm_var": 0.06555582682291666, "learning_rate": 0.0001, "loss": 6.3585, "loss/crossentropy": 2.6157970428466797, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.2141156867146492, "step": 5406 }, { "epoch": 0.169, "grad_norm": 3.734375, "grad_norm_var": 0.0596343994140625, "learning_rate": 0.0001, "loss": 6.7139, "loss/crossentropy": 2.8296098709106445, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.22124291211366653, "step": 5408 }, { "epoch": 0.1690625, "grad_norm": 3.78125, "grad_norm_var": 0.0520660400390625, "learning_rate": 0.0001, "loss": 6.5764, "loss/crossentropy": 2.770159125328064, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21304430067539215, "step": 5410 }, { "epoch": 0.169125, "grad_norm": 3.796875, "grad_norm_var": 0.05730794270833333, "learning_rate": 0.0001, "loss": 6.1421, "loss/crossentropy": 2.5159448385238647, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19933024793863297, "step": 5412 }, { "epoch": 0.1691875, "grad_norm": 3.5625, "grad_norm_var": 0.0895904541015625, "learning_rate": 0.0001, "loss": 6.8257, "loss/crossentropy": 2.857306122779846, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22925830632448196, "step": 5414 }, { "epoch": 0.16925, "grad_norm": 4.4375, "grad_norm_var": 0.1106842041015625, "learning_rate": 0.0001, "loss": 6.3079, "loss/crossentropy": 2.5683765411376953, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20754961669445038, "step": 5416 }, { "epoch": 0.1693125, "grad_norm": 3.984375, "grad_norm_var": 0.09905192057291666, "learning_rate": 0.0001, "loss": 6.2138, "loss/crossentropy": 2.4839487075805664, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20735560357570648, "step": 5418 }, { "epoch": 0.169375, "grad_norm": 3.4375, "grad_norm_var": 0.10004781087239584, "learning_rate": 0.0001, "loss": 6.2687, "loss/crossentropy": 2.6493914127349854, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1986526921391487, "step": 5420 }, { "epoch": 0.1694375, "grad_norm": 4.03125, "grad_norm_var": 0.10148111979166667, "learning_rate": 0.0001, "loss": 6.5061, "loss/crossentropy": 2.7358455657958984, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21296165138483047, "step": 5422 }, { "epoch": 0.1695, "grad_norm": 3.484375, "grad_norm_var": 0.10832926432291666, "learning_rate": 0.0001, "loss": 6.2646, "loss/crossentropy": 2.6034480333328247, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20518138259649277, "step": 5424 }, { "epoch": 0.1695625, "grad_norm": 3.46875, "grad_norm_var": 0.11578369140625, "learning_rate": 0.0001, "loss": 6.1431, "loss/crossentropy": 2.5414342880249023, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19923234730958939, "step": 5426 }, { "epoch": 0.169625, "grad_norm": 3.890625, "grad_norm_var": 0.10565999348958334, "learning_rate": 0.0001, "loss": 5.8812, "loss/crossentropy": 2.3314428329467773, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19013582915067673, "step": 5428 }, { "epoch": 0.1696875, "grad_norm": 3.8125, "grad_norm_var": 0.07024637858072917, "learning_rate": 0.0001, "loss": 6.293, "loss/crossentropy": 2.6051303148269653, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2031620442867279, "step": 5430 }, { "epoch": 0.16975, "grad_norm": 3.75, "grad_norm_var": 0.038655598958333336, "learning_rate": 0.0001, "loss": 6.2182, "loss/crossentropy": 2.517228841781616, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20330332219600677, "step": 5432 }, { "epoch": 0.1698125, "grad_norm": 3.9375, "grad_norm_var": 0.0454254150390625, "learning_rate": 0.0001, "loss": 6.0247, "loss/crossentropy": 2.445191740989685, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20052997767925262, "step": 5434 }, { "epoch": 0.169875, "grad_norm": 4.09375, "grad_norm_var": 0.05201416015625, "learning_rate": 0.0001, "loss": 6.6324, "loss/crossentropy": 2.6764878034591675, "loss/hidden": 1.7265625, "loss/jsd": 0.0, "loss/logits": 0.22293312847614288, "step": 5436 }, { "epoch": 0.1699375, "grad_norm": 3.515625, "grad_norm_var": 0.05042215983072917, "learning_rate": 0.0001, "loss": 6.4418, "loss/crossentropy": 2.747052788734436, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20541198551654816, "step": 5438 }, { "epoch": 0.17, "grad_norm": 3.609375, "grad_norm_var": 0.0805084228515625, "learning_rate": 0.0001, "loss": 5.9118, "loss/crossentropy": 2.369445323944092, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19603674113750458, "step": 5440 }, { "epoch": 0.1700625, "grad_norm": 3.296875, "grad_norm_var": 0.08769124348958333, "learning_rate": 0.0001, "loss": 5.7587, "loss/crossentropy": 2.2850255966186523, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19267499446868896, "step": 5442 }, { "epoch": 0.170125, "grad_norm": 3.625, "grad_norm_var": 0.08445536295572917, "learning_rate": 0.0001, "loss": 6.3702, "loss/crossentropy": 2.642918348312378, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20944388210773468, "step": 5444 }, { "epoch": 0.1701875, "grad_norm": 3.265625, "grad_norm_var": 0.0998931884765625, "learning_rate": 0.0001, "loss": 6.0806, "loss/crossentropy": 2.5002808570861816, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19435935467481613, "step": 5446 }, { "epoch": 0.17025, "grad_norm": 3.609375, "grad_norm_var": 0.10025634765625, "learning_rate": 0.0001, "loss": 6.2508, "loss/crossentropy": 2.54501211643219, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20495309680700302, "step": 5448 }, { "epoch": 0.1703125, "grad_norm": 3.703125, "grad_norm_var": 0.0860015869140625, "learning_rate": 0.0001, "loss": 6.3207, "loss/crossentropy": 2.633958101272583, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20305413007736206, "step": 5450 }, { "epoch": 0.170375, "grad_norm": 4.28125, "grad_norm_var": 0.10237223307291667, "learning_rate": 0.0001, "loss": 6.4857, "loss/crossentropy": 2.7109899520874023, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21419066935777664, "step": 5452 }, { "epoch": 0.1704375, "grad_norm": 3.40625, "grad_norm_var": 0.10229390462239583, "learning_rate": 0.0001, "loss": 5.651, "loss/crossentropy": 2.245222568511963, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.17690138518810272, "step": 5454 }, { "epoch": 0.1705, "grad_norm": 3.46875, "grad_norm_var": 0.05877278645833333, "learning_rate": 0.0001, "loss": 6.2159, "loss/crossentropy": 2.6513789892196655, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19668704271316528, "step": 5456 }, { "epoch": 0.1705625, "grad_norm": 3.578125, "grad_norm_var": 0.059992472330729164, "learning_rate": 0.0001, "loss": 6.543, "loss/crossentropy": 2.70147442817688, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21969977766275406, "step": 5458 }, { "epoch": 0.170625, "grad_norm": 3.96875, "grad_norm_var": 0.06984049479166667, "learning_rate": 0.0001, "loss": 6.4508, "loss/crossentropy": 2.6886109113693237, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21020089089870453, "step": 5460 }, { "epoch": 0.1706875, "grad_norm": 3.46875, "grad_norm_var": 0.061644490559895834, "learning_rate": 0.0001, "loss": 5.9927, "loss/crossentropy": 2.4410237073898315, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19228195399045944, "step": 5462 }, { "epoch": 0.17075, "grad_norm": 3.5625, "grad_norm_var": 0.0674957275390625, "learning_rate": 0.0001, "loss": 6.5238, "loss/crossentropy": 2.766111969947815, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.2109237089753151, "step": 5464 }, { "epoch": 0.1708125, "grad_norm": 3.5, "grad_norm_var": 0.06883138020833333, "learning_rate": 0.0001, "loss": 6.3399, "loss/crossentropy": 2.627722144126892, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20832887291908264, "step": 5466 }, { "epoch": 0.170875, "grad_norm": 3.5625, "grad_norm_var": 0.03599853515625, "learning_rate": 0.0001, "loss": 6.2697, "loss/crossentropy": 2.5772327184677124, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20675109326839447, "step": 5468 }, { "epoch": 0.1709375, "grad_norm": 3.375, "grad_norm_var": 0.03992513020833333, "learning_rate": 0.0001, "loss": 6.4087, "loss/crossentropy": 2.600441336631775, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21754159033298492, "step": 5470 }, { "epoch": 0.171, "grad_norm": 3.5, "grad_norm_var": 0.03908589680989583, "learning_rate": 0.0001, "loss": 6.1616, "loss/crossentropy": 2.4985276460647583, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20302552729845047, "step": 5472 }, { "epoch": 0.1710625, "grad_norm": 3.859375, "grad_norm_var": 0.036181640625, "learning_rate": 0.0001, "loss": 6.4449, "loss/crossentropy": 2.6403703689575195, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21521900594234467, "step": 5474 }, { "epoch": 0.171125, "grad_norm": 3.359375, "grad_norm_var": 0.0319732666015625, "learning_rate": 0.0001, "loss": 6.4397, "loss/crossentropy": 2.7072731256484985, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21074757725000381, "step": 5476 }, { "epoch": 0.1711875, "grad_norm": 4.09375, "grad_norm_var": 0.0464752197265625, "learning_rate": 0.0001, "loss": 6.5197, "loss/crossentropy": 2.655811905860901, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.21529193222522736, "step": 5478 }, { "epoch": 0.17125, "grad_norm": 3.4375, "grad_norm_var": 0.0478515625, "learning_rate": 0.0001, "loss": 5.8979, "loss/crossentropy": 2.374879837036133, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.18628836423158646, "step": 5480 }, { "epoch": 0.1713125, "grad_norm": 3.96875, "grad_norm_var": 0.058226521809895834, "learning_rate": 0.0001, "loss": 6.4394, "loss/crossentropy": 2.685877799987793, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21206961572170258, "step": 5482 }, { "epoch": 0.171375, "grad_norm": 3.6875, "grad_norm_var": 0.05786844889322917, "learning_rate": 0.0001, "loss": 6.1926, "loss/crossentropy": 2.487221598625183, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20686902850866318, "step": 5484 }, { "epoch": 0.1714375, "grad_norm": 4.15625, "grad_norm_var": 0.06420796712239583, "learning_rate": 0.0001, "loss": 6.6209, "loss/crossentropy": 2.7182233333587646, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2215162143111229, "step": 5486 }, { "epoch": 0.1715, "grad_norm": 3.890625, "grad_norm_var": 0.06687723795572917, "learning_rate": 0.0001, "loss": 6.2613, "loss/crossentropy": 2.6063839197158813, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19869530200958252, "step": 5488 }, { "epoch": 0.1715625, "grad_norm": 3.875, "grad_norm_var": 0.0705230712890625, "learning_rate": 0.0001, "loss": 6.401, "loss/crossentropy": 2.596443295478821, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21131177991628647, "step": 5490 }, { "epoch": 0.171625, "grad_norm": 3.75, "grad_norm_var": 0.06787821451822916, "learning_rate": 0.0001, "loss": 6.3697, "loss/crossentropy": 2.6126943826675415, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21008026599884033, "step": 5492 }, { "epoch": 0.1716875, "grad_norm": 3.734375, "grad_norm_var": 0.061701456705729164, "learning_rate": 0.0001, "loss": 6.1955, "loss/crossentropy": 2.5357481241226196, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20269111543893814, "step": 5494 }, { "epoch": 0.17175, "grad_norm": 3.890625, "grad_norm_var": 0.048173014322916666, "learning_rate": 0.0001, "loss": 6.3698, "loss/crossentropy": 2.6355448961257935, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2042831853032112, "step": 5496 }, { "epoch": 0.1718125, "grad_norm": 3.40625, "grad_norm_var": 0.04729410807291667, "learning_rate": 0.0001, "loss": 6.0, "loss/crossentropy": 2.480632781982422, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1952923834323883, "step": 5498 }, { "epoch": 0.171875, "grad_norm": 3.578125, "grad_norm_var": 0.057373046875, "learning_rate": 0.0001, "loss": 6.3511, "loss/crossentropy": 2.5731500387191772, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.20787642896175385, "step": 5500 }, { "epoch": 0.1719375, "grad_norm": 3.296875, "grad_norm_var": 0.0629791259765625, "learning_rate": 0.0001, "loss": 5.9312, "loss/crossentropy": 2.383143901824951, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.1895749568939209, "step": 5502 }, { "epoch": 0.172, "grad_norm": 3.65625, "grad_norm_var": 0.05706380208333333, "learning_rate": 0.0001, "loss": 6.5475, "loss/crossentropy": 2.9158960580825806, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20261601358652115, "step": 5504 }, { "epoch": 0.1720625, "grad_norm": 4.03125, "grad_norm_var": 0.05901692708333333, "learning_rate": 0.0001, "loss": 6.1549, "loss/crossentropy": 2.452110528945923, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.19996604323387146, "step": 5506 }, { "epoch": 0.172125, "grad_norm": 3.734375, "grad_norm_var": 0.04944559733072917, "learning_rate": 0.0001, "loss": 6.2734, "loss/crossentropy": 2.584115147590637, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20526093244552612, "step": 5508 }, { "epoch": 0.1721875, "grad_norm": 3.796875, "grad_norm_var": 0.04934488932291667, "learning_rate": 0.0001, "loss": 6.5578, "loss/crossentropy": 2.7004928588867188, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.2134695053100586, "step": 5510 }, { "epoch": 0.17225, "grad_norm": 3.734375, "grad_norm_var": 0.04513346354166667, "learning_rate": 0.0001, "loss": 5.8912, "loss/crossentropy": 2.4137511253356934, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18602566421031952, "step": 5512 }, { "epoch": 0.1723125, "grad_norm": 3.9375, "grad_norm_var": 0.04523824055989583, "learning_rate": 0.0001, "loss": 5.9679, "loss/crossentropy": 2.368557929992676, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19899508357048035, "step": 5514 }, { "epoch": 0.172375, "grad_norm": 3.703125, "grad_norm_var": 0.03235677083333333, "learning_rate": 0.0001, "loss": 6.2388, "loss/crossentropy": 2.5534307956695557, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20759540796279907, "step": 5516 }, { "epoch": 0.1724375, "grad_norm": 3.640625, "grad_norm_var": 0.021776326497395835, "learning_rate": 0.0001, "loss": 6.3551, "loss/crossentropy": 2.6560505628585815, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2089691162109375, "step": 5518 }, { "epoch": 0.1725, "grad_norm": 4.03125, "grad_norm_var": 0.031183878580729168, "learning_rate": 0.0001, "loss": 6.4113, "loss/crossentropy": 2.628575325012207, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20991557836532593, "step": 5520 }, { "epoch": 0.1725625, "grad_norm": 3.71875, "grad_norm_var": 0.025716145833333332, "learning_rate": 0.0001, "loss": 6.3448, "loss/crossentropy": 2.7013977766036987, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20144866406917572, "step": 5522 }, { "epoch": 0.172625, "grad_norm": 3.75, "grad_norm_var": 0.027242024739583332, "learning_rate": 0.0001, "loss": 6.1257, "loss/crossentropy": 2.425819158554077, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20592305809259415, "step": 5524 }, { "epoch": 0.1726875, "grad_norm": 3.71875, "grad_norm_var": 0.0305572509765625, "learning_rate": 0.0001, "loss": 6.1834, "loss/crossentropy": 2.5644673109054565, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20251353085041046, "step": 5526 }, { "epoch": 0.17275, "grad_norm": 3.34375, "grad_norm_var": 0.0370513916015625, "learning_rate": 0.0001, "loss": 6.5402, "loss/crossentropy": 2.778851270675659, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21363477408885956, "step": 5528 }, { "epoch": 0.1728125, "grad_norm": 3.84375, "grad_norm_var": 0.07545166015625, "learning_rate": 0.0001, "loss": 6.4929, "loss/crossentropy": 2.754807710647583, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20779237151145935, "step": 5530 }, { "epoch": 0.172875, "grad_norm": 3.609375, "grad_norm_var": 0.07896728515625, "learning_rate": 0.0001, "loss": 6.3329, "loss/crossentropy": 2.6094292402267456, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2078896090388298, "step": 5532 }, { "epoch": 0.1729375, "grad_norm": 3.734375, "grad_norm_var": 0.08041890462239583, "learning_rate": 0.0001, "loss": 6.5027, "loss/crossentropy": 2.8108277320861816, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20668402314186096, "step": 5534 }, { "epoch": 0.173, "grad_norm": 3.703125, "grad_norm_var": 0.07682291666666667, "learning_rate": 0.0001, "loss": 6.4419, "loss/crossentropy": 2.603666305541992, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21390238404273987, "step": 5536 }, { "epoch": 0.1730625, "grad_norm": 3.484375, "grad_norm_var": 0.08290608723958333, "learning_rate": 0.0001, "loss": 6.4747, "loss/crossentropy": 2.6804850101470947, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21692615002393723, "step": 5538 }, { "epoch": 0.173125, "grad_norm": 3.21875, "grad_norm_var": 0.09759012858072917, "learning_rate": 0.0001, "loss": 6.3413, "loss/crossentropy": 2.7084202766418457, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20000549405813217, "step": 5540 }, { "epoch": 0.1731875, "grad_norm": 3.4375, "grad_norm_var": 0.10080973307291667, "learning_rate": 0.0001, "loss": 6.3795, "loss/crossentropy": 2.7111209630966187, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2043348103761673, "step": 5542 }, { "epoch": 0.17325, "grad_norm": 3.328125, "grad_norm_var": 0.10380452473958333, "learning_rate": 0.0001, "loss": 5.9665, "loss/crossentropy": 2.4299083948135376, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19233398884534836, "step": 5544 }, { "epoch": 0.1733125, "grad_norm": 3.1875, "grad_norm_var": 0.0641021728515625, "learning_rate": 0.0001, "loss": 6.2057, "loss/crossentropy": 2.600328803062439, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19881782680749893, "step": 5546 }, { "epoch": 0.173375, "grad_norm": 3.34375, "grad_norm_var": 0.09570210774739583, "learning_rate": 0.0001, "loss": 6.2745, "loss/crossentropy": 2.575212240219116, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2023472636938095, "step": 5548 }, { "epoch": 0.1734375, "grad_norm": 4.1875, "grad_norm_var": 0.11731669108072916, "learning_rate": 0.0001, "loss": 6.2189, "loss/crossentropy": 2.5943928956985474, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.1976061761379242, "step": 5550 }, { "epoch": 0.1735, "grad_norm": 3.84375, "grad_norm_var": 0.10994364420572916, "learning_rate": 0.0001, "loss": 6.5701, "loss/crossentropy": 2.767845034599304, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2130359187722206, "step": 5552 }, { "epoch": 0.1735625, "grad_norm": 3.453125, "grad_norm_var": 0.11357014973958333, "learning_rate": 0.0001, "loss": 6.1909, "loss/crossentropy": 2.653822660446167, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19238030910491943, "step": 5554 }, { "epoch": 0.173625, "grad_norm": 3.78125, "grad_norm_var": 0.10829671223958333, "learning_rate": 0.0001, "loss": 6.2698, "loss/crossentropy": 2.593658685684204, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.1996469348669052, "step": 5556 }, { "epoch": 0.1736875, "grad_norm": 3.28125, "grad_norm_var": 0.11754150390625, "learning_rate": 0.0001, "loss": 6.1297, "loss/crossentropy": 2.5261536836624146, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19746138155460358, "step": 5558 }, { "epoch": 0.17375, "grad_norm": 3.828125, "grad_norm_var": 0.1182037353515625, "learning_rate": 0.0001, "loss": 6.0964, "loss/crossentropy": 2.4176089763641357, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.1987377628684044, "step": 5560 }, { "epoch": 0.1738125, "grad_norm": 3.765625, "grad_norm_var": 0.10011393229166667, "learning_rate": 0.0001, "loss": 6.3642, "loss/crossentropy": 2.759734869003296, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19989798218011856, "step": 5562 }, { "epoch": 0.173875, "grad_norm": 3.5, "grad_norm_var": 0.06797587076822917, "learning_rate": 0.0001, "loss": 6.0124, "loss/crossentropy": 2.4658912420272827, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19527988135814667, "step": 5564 }, { "epoch": 0.1739375, "grad_norm": 3.71875, "grad_norm_var": 0.05498046875, "learning_rate": 0.0001, "loss": 6.0655, "loss/crossentropy": 2.4361329078674316, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19965960085391998, "step": 5566 }, { "epoch": 0.174, "grad_norm": 3.640625, "grad_norm_var": 0.0514068603515625, "learning_rate": 0.0001, "loss": 6.3428, "loss/crossentropy": 2.624956250190735, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.208501935005188, "step": 5568 }, { "epoch": 0.1740625, "grad_norm": 3.859375, "grad_norm_var": 0.03737691243489583, "learning_rate": 0.0001, "loss": 6.4201, "loss/crossentropy": 2.6422702074050903, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21371586620807648, "step": 5570 }, { "epoch": 0.174125, "grad_norm": 4.125, "grad_norm_var": 0.0541015625, "learning_rate": 0.0001, "loss": 6.4237, "loss/crossentropy": 2.668924331665039, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.210244320333004, "step": 5572 }, { "epoch": 0.1741875, "grad_norm": 3.75, "grad_norm_var": 0.039697265625, "learning_rate": 0.0001, "loss": 6.43, "loss/crossentropy": 2.765849232673645, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20313312858343124, "step": 5574 }, { "epoch": 0.17425, "grad_norm": 3.421875, "grad_norm_var": 0.0411773681640625, "learning_rate": 0.0001, "loss": 6.4027, "loss/crossentropy": 2.7064108848571777, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20517880469560623, "step": 5576 }, { "epoch": 0.1743125, "grad_norm": 3.8125, "grad_norm_var": 0.042378743489583336, "learning_rate": 0.0001, "loss": 6.5003, "loss/crossentropy": 2.7742602825164795, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2101084440946579, "step": 5578 }, { "epoch": 0.174375, "grad_norm": 3.421875, "grad_norm_var": 0.05230712890625, "learning_rate": 0.0001, "loss": 6.4819, "loss/crossentropy": 2.7062528133392334, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21194252371788025, "step": 5580 }, { "epoch": 0.1744375, "grad_norm": 3.6875, "grad_norm_var": 0.047948201497395836, "learning_rate": 0.0001, "loss": 6.3452, "loss/crossentropy": 2.660622000694275, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20400027930736542, "step": 5582 }, { "epoch": 0.1745, "grad_norm": 4.09375, "grad_norm_var": 0.0604400634765625, "learning_rate": 0.0001, "loss": 6.6467, "loss/crossentropy": 2.7296407222747803, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.22529703378677368, "step": 5584 }, { "epoch": 0.1745625, "grad_norm": 3.671875, "grad_norm_var": 0.059912109375, "learning_rate": 0.0001, "loss": 6.3289, "loss/crossentropy": 2.6632198095321655, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2071952521800995, "step": 5586 }, { "epoch": 0.174625, "grad_norm": 3.359375, "grad_norm_var": 0.04794921875, "learning_rate": 0.0001, "loss": 6.2706, "loss/crossentropy": 2.6442915201187134, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20012595504522324, "step": 5588 }, { "epoch": 0.1746875, "grad_norm": 3.734375, "grad_norm_var": 0.055501302083333336, "learning_rate": 0.0001, "loss": 6.4411, "loss/crossentropy": 2.7457441091537476, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2035154029726982, "step": 5590 }, { "epoch": 0.17475, "grad_norm": 3.40625, "grad_norm_var": 0.07525634765625, "learning_rate": 0.0001, "loss": 5.8117, "loss/crossentropy": 2.4168955087661743, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17971421033143997, "step": 5592 }, { "epoch": 0.1748125, "grad_norm": 3.578125, "grad_norm_var": 0.07383524576822917, "learning_rate": 0.0001, "loss": 6.0586, "loss/crossentropy": 2.48819899559021, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19414632767438889, "step": 5594 }, { "epoch": 0.174875, "grad_norm": 3.75, "grad_norm_var": 0.22345377604166666, "learning_rate": 0.0001, "loss": 6.9743, "loss/crossentropy": 2.888335704803467, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.23632864654064178, "step": 5596 }, { "epoch": 0.1749375, "grad_norm": 3.4375, "grad_norm_var": 0.22860921223958333, "learning_rate": 0.0001, "loss": 6.0856, "loss/crossentropy": 2.4923453330993652, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19994857162237167, "step": 5598 }, { "epoch": 0.175, "grad_norm": 3.828125, "grad_norm_var": 0.21950581868489583, "learning_rate": 0.0001, "loss": 5.9782, "loss/crossentropy": 2.332379460334778, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20052067935466766, "step": 5600 }, { "epoch": 0.1750625, "grad_norm": 3.78125, "grad_norm_var": 0.218701171875, "learning_rate": 0.0001, "loss": 6.4891, "loss/crossentropy": 2.630122423171997, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21871009469032288, "step": 5602 }, { "epoch": 0.175125, "grad_norm": 3.546875, "grad_norm_var": 0.2118072509765625, "learning_rate": 0.0001, "loss": 6.1111, "loss/crossentropy": 2.5449728965759277, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19449857622385025, "step": 5604 }, { "epoch": 0.1751875, "grad_norm": 3.6875, "grad_norm_var": 0.20869140625, "learning_rate": 0.0001, "loss": 6.1819, "loss/crossentropy": 2.5370291471481323, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20042233169078827, "step": 5606 }, { "epoch": 0.17525, "grad_norm": 3.5625, "grad_norm_var": 0.17384440104166668, "learning_rate": 0.0001, "loss": 6.2695, "loss/crossentropy": 2.5881470441818237, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.19899416714906693, "step": 5608 }, { "epoch": 0.1753125, "grad_norm": 3.515625, "grad_norm_var": 0.17097066243489584, "learning_rate": 0.0001, "loss": 6.4462, "loss/crossentropy": 2.672497034072876, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.20510167628526688, "step": 5610 }, { "epoch": 0.175375, "grad_norm": 3.40625, "grad_norm_var": 0.0383941650390625, "learning_rate": 0.0001, "loss": 6.0375, "loss/crossentropy": 2.4991201162338257, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19368354231119156, "step": 5612 }, { "epoch": 0.1754375, "grad_norm": 3.875, "grad_norm_var": 0.03714192708333333, "learning_rate": 0.0001, "loss": 6.4138, "loss/crossentropy": 2.6649649143218994, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2076936662197113, "step": 5614 }, { "epoch": 0.1755, "grad_norm": 3.90625, "grad_norm_var": 0.0385406494140625, "learning_rate": 0.0001, "loss": 5.8119, "loss/crossentropy": 2.332767963409424, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19087842851877213, "step": 5616 }, { "epoch": 0.1755625, "grad_norm": 3.796875, "grad_norm_var": 0.0342926025390625, "learning_rate": 0.0001, "loss": 6.3032, "loss/crossentropy": 2.5783437490463257, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2084260806441307, "step": 5618 }, { "epoch": 0.175625, "grad_norm": 3.671875, "grad_norm_var": 0.049235026041666664, "learning_rate": 0.0001, "loss": 6.737, "loss/crossentropy": 2.8179370164871216, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.22589459270238876, "step": 5620 }, { "epoch": 0.1756875, "grad_norm": 3.96875, "grad_norm_var": 0.054833984375, "learning_rate": 0.0001, "loss": 6.3994, "loss/crossentropy": 2.7480050325393677, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20303140580654144, "step": 5622 }, { "epoch": 0.17575, "grad_norm": 3.828125, "grad_norm_var": 0.05921223958333333, "learning_rate": 0.0001, "loss": 6.2438, "loss/crossentropy": 2.5771526098251343, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.1998675912618637, "step": 5624 }, { "epoch": 0.1758125, "grad_norm": 3.921875, "grad_norm_var": 0.060114542643229164, "learning_rate": 0.0001, "loss": 6.4994, "loss/crossentropy": 2.608477830886841, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2187797725200653, "step": 5626 }, { "epoch": 0.175875, "grad_norm": 4.09375, "grad_norm_var": 0.04951070149739583, "learning_rate": 0.0001, "loss": 6.3209, "loss/crossentropy": 2.6283583641052246, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20871003717184067, "step": 5628 }, { "epoch": 0.1759375, "grad_norm": 3.96875, "grad_norm_var": 0.05357157389322917, "learning_rate": 0.0001, "loss": 6.298, "loss/crossentropy": 2.627431631088257, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20377814769744873, "step": 5630 }, { "epoch": 0.176, "grad_norm": 3.96875, "grad_norm_var": 0.0513671875, "learning_rate": 0.0001, "loss": 6.4348, "loss/crossentropy": 2.6045050621032715, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21545372158288956, "step": 5632 }, { "epoch": 0.1760625, "grad_norm": 3.734375, "grad_norm_var": 0.03843994140625, "learning_rate": 0.0001, "loss": 6.4334, "loss/crossentropy": 2.734872579574585, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2057913839817047, "step": 5634 }, { "epoch": 0.176125, "grad_norm": 3.875, "grad_norm_var": 0.032124837239583336, "learning_rate": 0.0001, "loss": 6.2559, "loss/crossentropy": 2.5179080963134766, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2097330242395401, "step": 5636 }, { "epoch": 0.1761875, "grad_norm": 4.375, "grad_norm_var": 0.04289957682291667, "learning_rate": 0.0001, "loss": 6.3522, "loss/crossentropy": 2.6638708114624023, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20399095118045807, "step": 5638 }, { "epoch": 0.17625, "grad_norm": 3.65625, "grad_norm_var": 0.047419230143229164, "learning_rate": 0.0001, "loss": 6.1607, "loss/crossentropy": 2.555658221244812, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20073561370372772, "step": 5640 }, { "epoch": 0.1763125, "grad_norm": 3.328125, "grad_norm_var": 0.06232808430989583, "learning_rate": 0.0001, "loss": 6.2548, "loss/crossentropy": 2.580946207046509, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20527862012386322, "step": 5642 }, { "epoch": 0.176375, "grad_norm": 3.90625, "grad_norm_var": 0.07954813639322916, "learning_rate": 0.0001, "loss": 5.9941, "loss/crossentropy": 2.4849324226379395, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19505522400140762, "step": 5644 }, { "epoch": 0.1764375, "grad_norm": 3.75, "grad_norm_var": 0.07453511555989584, "learning_rate": 0.0001, "loss": 6.4735, "loss/crossentropy": 2.701940655708313, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21113985031843185, "step": 5646 }, { "epoch": 0.1765, "grad_norm": 3.34375, "grad_norm_var": 0.077490234375, "learning_rate": 0.0001, "loss": 6.0421, "loss/crossentropy": 2.433351516723633, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1968122348189354, "step": 5648 }, { "epoch": 0.1765625, "grad_norm": 3.6875, "grad_norm_var": 0.0806060791015625, "learning_rate": 0.0001, "loss": 6.4999, "loss/crossentropy": 2.7361565828323364, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20840420573949814, "step": 5650 }, { "epoch": 0.176625, "grad_norm": 4.09375, "grad_norm_var": 0.0964263916015625, "learning_rate": 0.0001, "loss": 6.1257, "loss/crossentropy": 2.467520236968994, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20097361505031586, "step": 5652 }, { "epoch": 0.1766875, "grad_norm": 3.546875, "grad_norm_var": 0.09436442057291666, "learning_rate": 0.0001, "loss": 6.2714, "loss/crossentropy": 2.6138484477996826, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19973890483379364, "step": 5654 }, { "epoch": 0.17675, "grad_norm": 3.640625, "grad_norm_var": 0.09977925618489583, "learning_rate": 0.0001, "loss": 6.6365, "loss/crossentropy": 2.814689874649048, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21421615034341812, "step": 5656 }, { "epoch": 0.1768125, "grad_norm": 3.546875, "grad_norm_var": 0.0920318603515625, "learning_rate": 0.0001, "loss": 6.3915, "loss/crossentropy": 2.6593785285949707, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20915207266807556, "step": 5658 }, { "epoch": 0.176875, "grad_norm": 3.375, "grad_norm_var": 0.074462890625, "learning_rate": 0.0001, "loss": 6.0323, "loss/crossentropy": 2.47067928314209, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1921006441116333, "step": 5660 }, { "epoch": 0.1769375, "grad_norm": 3.71875, "grad_norm_var": 0.07395426432291667, "learning_rate": 0.0001, "loss": 6.0652, "loss/crossentropy": 2.50555682182312, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.197368785738945, "step": 5662 }, { "epoch": 0.177, "grad_norm": 2.9375, "grad_norm_var": 0.11296284993489583, "learning_rate": 0.0001, "loss": 5.5988, "loss/crossentropy": 2.294847249984741, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17531780153512955, "step": 5664 }, { "epoch": 0.1770625, "grad_norm": 3.375, "grad_norm_var": 0.1178863525390625, "learning_rate": 0.0001, "loss": 6.3314, "loss/crossentropy": 2.6526646614074707, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20341810584068298, "step": 5666 }, { "epoch": 0.177125, "grad_norm": 3.6875, "grad_norm_var": 0.1009674072265625, "learning_rate": 0.0001, "loss": 6.3754, "loss/crossentropy": 2.6954914331436157, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20822450518608093, "step": 5668 }, { "epoch": 0.1771875, "grad_norm": 3.96875, "grad_norm_var": 0.0833984375, "learning_rate": 0.0001, "loss": 6.4037, "loss/crossentropy": 2.563956379890442, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2171739861369133, "step": 5670 }, { "epoch": 0.17725, "grad_norm": 3.84375, "grad_norm_var": 0.09482320149739583, "learning_rate": 0.0001, "loss": 6.0265, "loss/crossentropy": 2.4230778217315674, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.19276423752307892, "step": 5672 }, { "epoch": 0.1773125, "grad_norm": 3.59375, "grad_norm_var": 0.10392964680989583, "learning_rate": 0.0001, "loss": 6.2188, "loss/crossentropy": 2.607333183288574, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20059751719236374, "step": 5674 }, { "epoch": 0.177375, "grad_norm": 3.515625, "grad_norm_var": 0.10070699055989583, "learning_rate": 0.0001, "loss": 5.9932, "loss/crossentropy": 2.5008485317230225, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.18634529411792755, "step": 5676 }, { "epoch": 0.1774375, "grad_norm": 3.765625, "grad_norm_var": 0.10190327962239583, "learning_rate": 0.0001, "loss": 6.1233, "loss/crossentropy": 2.4628396034240723, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20198483765125275, "step": 5678 }, { "epoch": 0.1775, "grad_norm": 3.390625, "grad_norm_var": 0.06603902180989583, "learning_rate": 0.0001, "loss": 5.9784, "loss/crossentropy": 2.521420478820801, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18866854161024094, "step": 5680 }, { "epoch": 0.1775625, "grad_norm": 3.609375, "grad_norm_var": 0.059065755208333334, "learning_rate": 0.0001, "loss": 6.1665, "loss/crossentropy": 2.536958336830139, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20045578479766846, "step": 5682 }, { "epoch": 0.177625, "grad_norm": 3.921875, "grad_norm_var": 0.06405843098958333, "learning_rate": 0.0001, "loss": 6.0364, "loss/crossentropy": 2.4570053815841675, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.1911417543888092, "step": 5684 }, { "epoch": 0.1776875, "grad_norm": 3.3125, "grad_norm_var": 0.05449117024739583, "learning_rate": 0.0001, "loss": 6.3008, "loss/crossentropy": 2.6369824409484863, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20388375967741013, "step": 5686 }, { "epoch": 0.17775, "grad_norm": 3.59375, "grad_norm_var": 0.0405670166015625, "learning_rate": 0.0001, "loss": 6.5723, "loss/crossentropy": 2.806073546409607, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21021592617034912, "step": 5688 }, { "epoch": 0.1778125, "grad_norm": 3.765625, "grad_norm_var": 0.03326822916666667, "learning_rate": 0.0001, "loss": 6.6962, "loss/crossentropy": 2.8453763723373413, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21906623244285583, "step": 5690 }, { "epoch": 0.177875, "grad_norm": 3.546875, "grad_norm_var": 0.034830729166666664, "learning_rate": 0.0001, "loss": 6.2838, "loss/crossentropy": 2.645614504814148, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20483769476413727, "step": 5692 }, { "epoch": 0.1779375, "grad_norm": 3.34375, "grad_norm_var": 0.035416666666666666, "learning_rate": 0.0001, "loss": 6.1371, "loss/crossentropy": 2.5817540884017944, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1961633712053299, "step": 5694 }, { "epoch": 0.178, "grad_norm": 3.5, "grad_norm_var": 0.04157613118489583, "learning_rate": 0.0001, "loss": 6.7531, "loss/crossentropy": 2.898721218109131, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21942297369241714, "step": 5696 }, { "epoch": 0.1780625, "grad_norm": 3.96875, "grad_norm_var": 0.0493560791015625, "learning_rate": 0.0001, "loss": 6.3258, "loss/crossentropy": 2.5990471839904785, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20939195156097412, "step": 5698 }, { "epoch": 0.178125, "grad_norm": 3.515625, "grad_norm_var": 0.0461090087890625, "learning_rate": 0.0001, "loss": 6.1863, "loss/crossentropy": 2.567986488342285, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1997183933854103, "step": 5700 }, { "epoch": 0.1781875, "grad_norm": 3.859375, "grad_norm_var": 0.041943359375, "learning_rate": 0.0001, "loss": 6.2774, "loss/crossentropy": 2.6630423069000244, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.194635771214962, "step": 5702 }, { "epoch": 0.17825, "grad_norm": 4.15625, "grad_norm_var": 0.05338134765625, "learning_rate": 0.0001, "loss": 6.0809, "loss/crossentropy": 2.3729244470596313, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20947062969207764, "step": 5704 }, { "epoch": 0.1783125, "grad_norm": 3.515625, "grad_norm_var": 0.058080037434895836, "learning_rate": 0.0001, "loss": 6.1394, "loss/crossentropy": 2.588572144508362, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19414159655570984, "step": 5706 }, { "epoch": 0.178375, "grad_norm": 3.265625, "grad_norm_var": 0.0641510009765625, "learning_rate": 0.0001, "loss": 6.1624, "loss/crossentropy": 2.5588879585266113, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.2029315009713173, "step": 5708 }, { "epoch": 0.1784375, "grad_norm": 3.546875, "grad_norm_var": 0.057835896809895836, "learning_rate": 0.0001, "loss": 6.1616, "loss/crossentropy": 2.531624436378479, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20166686177253723, "step": 5710 }, { "epoch": 0.1785, "grad_norm": 3.421875, "grad_norm_var": 0.0538970947265625, "learning_rate": 0.0001, "loss": 6.3368, "loss/crossentropy": 2.6661752462387085, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20690400898456573, "step": 5712 }, { "epoch": 0.1785625, "grad_norm": 3.78125, "grad_norm_var": 0.04736328125, "learning_rate": 0.0001, "loss": 6.4346, "loss/crossentropy": 2.690172791481018, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2088213488459587, "step": 5714 }, { "epoch": 0.178625, "grad_norm": 3.65625, "grad_norm_var": 0.051268513997395834, "learning_rate": 0.0001, "loss": 6.414, "loss/crossentropy": 2.7049120664596558, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20645985007286072, "step": 5716 }, { "epoch": 0.1786875, "grad_norm": 3.703125, "grad_norm_var": 0.04967447916666667, "learning_rate": 0.0001, "loss": 6.2708, "loss/crossentropy": 2.523758292198181, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.2079075500369072, "step": 5718 }, { "epoch": 0.17875, "grad_norm": 4.0, "grad_norm_var": 0.03925679524739583, "learning_rate": 0.0001, "loss": 6.4607, "loss/crossentropy": 2.631307601928711, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21418674290180206, "step": 5720 }, { "epoch": 0.1788125, "grad_norm": 3.625, "grad_norm_var": 0.03351949055989583, "learning_rate": 0.0001, "loss": 6.0575, "loss/crossentropy": 2.4578261375427246, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19473493099212646, "step": 5722 }, { "epoch": 0.178875, "grad_norm": 3.765625, "grad_norm_var": 0.022184244791666665, "learning_rate": 0.0001, "loss": 6.1044, "loss/crossentropy": 2.4846503734588623, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20025886595249176, "step": 5724 }, { "epoch": 0.1789375, "grad_norm": 3.578125, "grad_norm_var": 0.022459920247395834, "learning_rate": 0.0001, "loss": 6.212, "loss/crossentropy": 2.6433242559432983, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19319302588701248, "step": 5726 }, { "epoch": 0.179, "grad_norm": 3.546875, "grad_norm_var": 0.0189453125, "learning_rate": 0.0001, "loss": 5.9439, "loss/crossentropy": 2.4513763189315796, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18948928266763687, "step": 5728 }, { "epoch": 0.1790625, "grad_norm": 3.84375, "grad_norm_var": 0.027765909830729168, "learning_rate": 0.0001, "loss": 6.5101, "loss/crossentropy": 2.668804407119751, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21616463363170624, "step": 5730 }, { "epoch": 0.179125, "grad_norm": 4.28125, "grad_norm_var": 0.04680989583333333, "learning_rate": 0.0001, "loss": 6.1737, "loss/crossentropy": 2.4736764430999756, "loss/hidden": 1.72265625, "loss/jsd": 0.0, "loss/logits": 0.1977366879582405, "step": 5732 }, { "epoch": 0.1791875, "grad_norm": 3.671875, "grad_norm_var": 0.047652180989583334, "learning_rate": 0.0001, "loss": 6.4047, "loss/crossentropy": 2.67950177192688, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2072872519493103, "step": 5734 }, { "epoch": 0.17925, "grad_norm": 3.578125, "grad_norm_var": 0.06891988118489584, "learning_rate": 0.0001, "loss": 6.2035, "loss/crossentropy": 2.63262677192688, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19537082314491272, "step": 5736 }, { "epoch": 0.1793125, "grad_norm": 3.515625, "grad_norm_var": 0.07222391764322916, "learning_rate": 0.0001, "loss": 6.2492, "loss/crossentropy": 2.5669682025909424, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20493736863136292, "step": 5738 }, { "epoch": 0.179375, "grad_norm": 3.90625, "grad_norm_var": 0.07522786458333333, "learning_rate": 0.0001, "loss": 6.2924, "loss/crossentropy": 2.6394450664520264, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20279543846845627, "step": 5740 }, { "epoch": 0.1794375, "grad_norm": 3.5625, "grad_norm_var": 0.082666015625, "learning_rate": 0.0001, "loss": 6.4208, "loss/crossentropy": 2.7411283254623413, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2066434845328331, "step": 5742 }, { "epoch": 0.1795, "grad_norm": 3.4375, "grad_norm_var": 0.0856353759765625, "learning_rate": 0.0001, "loss": 6.3208, "loss/crossentropy": 2.672483444213867, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2019430547952652, "step": 5744 }, { "epoch": 0.1795625, "grad_norm": 3.859375, "grad_norm_var": 0.07745768229166666, "learning_rate": 0.0001, "loss": 6.4814, "loss/crossentropy": 2.6624244451522827, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21197732537984848, "step": 5746 }, { "epoch": 0.179625, "grad_norm": 3.5625, "grad_norm_var": 0.0529449462890625, "learning_rate": 0.0001, "loss": 6.3465, "loss/crossentropy": 2.6245813369750977, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20969115942716599, "step": 5748 }, { "epoch": 0.1796875, "grad_norm": 3.390625, "grad_norm_var": 0.0566802978515625, "learning_rate": 0.0001, "loss": 6.2782, "loss/crossentropy": 2.6000006198883057, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2045365795493126, "step": 5750 }, { "epoch": 0.17975, "grad_norm": 3.515625, "grad_norm_var": 0.038037109375, "learning_rate": 0.0001, "loss": 6.1203, "loss/crossentropy": 2.5339865684509277, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19495518505573273, "step": 5752 }, { "epoch": 0.1798125, "grad_norm": 3.75, "grad_norm_var": 0.03528238932291667, "learning_rate": 0.0001, "loss": 6.0857, "loss/crossentropy": 2.468671679496765, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1999809518456459, "step": 5754 }, { "epoch": 0.179875, "grad_norm": 3.28125, "grad_norm_var": 0.04231669108072917, "learning_rate": 0.0001, "loss": 6.4099, "loss/crossentropy": 2.736938714981079, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2055758759379387, "step": 5756 }, { "epoch": 0.1799375, "grad_norm": 3.875, "grad_norm_var": 0.03561197916666667, "learning_rate": 0.0001, "loss": 6.3897, "loss/crossentropy": 2.660236954689026, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2104458436369896, "step": 5758 }, { "epoch": 0.18, "grad_norm": 3.0625, "grad_norm_var": 0.0550445556640625, "learning_rate": 0.0001, "loss": 5.5766, "loss/crossentropy": 2.194609224796295, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17882372438907623, "step": 5760 }, { "epoch": 0.1800625, "grad_norm": 3.65625, "grad_norm_var": 0.05657145182291667, "learning_rate": 0.0001, "loss": 6.1178, "loss/crossentropy": 2.506523847579956, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20019135624170303, "step": 5762 }, { "epoch": 0.180125, "grad_norm": 4.0, "grad_norm_var": 0.06211649576822917, "learning_rate": 0.0001, "loss": 6.2018, "loss/crossentropy": 2.5049558877944946, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2103073075413704, "step": 5764 }, { "epoch": 0.1801875, "grad_norm": 4.34375, "grad_norm_var": 0.08708394368489583, "learning_rate": 0.0001, "loss": 6.4528, "loss/crossentropy": 2.705584406852722, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21144528687000275, "step": 5766 }, { "epoch": 0.18025, "grad_norm": 3.78125, "grad_norm_var": 0.16002604166666667, "learning_rate": 0.0001, "loss": 6.464, "loss/crossentropy": 2.6017357110977173, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.22060427069664001, "step": 5768 }, { "epoch": 0.1803125, "grad_norm": 3.8125, "grad_norm_var": 0.16955973307291666, "learning_rate": 0.0001, "loss": 6.2266, "loss/crossentropy": 2.54715359210968, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20583520829677582, "step": 5770 }, { "epoch": 0.180375, "grad_norm": 3.453125, "grad_norm_var": 0.15359700520833333, "learning_rate": 0.0001, "loss": 6.2142, "loss/crossentropy": 2.560963273048401, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.19970254600048065, "step": 5772 }, { "epoch": 0.1804375, "grad_norm": 3.59375, "grad_norm_var": 0.161083984375, "learning_rate": 0.0001, "loss": 6.3225, "loss/crossentropy": 2.578567624092102, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20603880286216736, "step": 5774 }, { "epoch": 0.1805, "grad_norm": 3.640625, "grad_norm_var": 1.7318033854166666, "learning_rate": 0.0001, "loss": 6.5207, "loss/crossentropy": 2.654413104057312, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2202201560139656, "step": 5776 }, { "epoch": 0.1805625, "grad_norm": 3.515625, "grad_norm_var": 1.7427642822265625, "learning_rate": 0.0001, "loss": 6.3629, "loss/crossentropy": 2.5971418619155884, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.208999365568161, "step": 5778 }, { "epoch": 0.180625, "grad_norm": 3.75, "grad_norm_var": 1.759130859375, "learning_rate": 0.0001, "loss": 6.1511, "loss/crossentropy": 2.503427267074585, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2022705376148224, "step": 5780 }, { "epoch": 0.1806875, "grad_norm": 3.71875, "grad_norm_var": 1.7509724934895834, "learning_rate": 0.0001, "loss": 6.2061, "loss/crossentropy": 2.415714144706726, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21223796904087067, "step": 5782 }, { "epoch": 0.18075, "grad_norm": 3.46875, "grad_norm_var": 1.7370402018229167, "learning_rate": 0.0001, "loss": 6.2435, "loss/crossentropy": 2.522855043411255, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2103436440229416, "step": 5784 }, { "epoch": 0.1808125, "grad_norm": 3.5, "grad_norm_var": 1.7345011393229166, "learning_rate": 0.0001, "loss": 6.2004, "loss/crossentropy": 2.580862522125244, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20140428841114044, "step": 5786 }, { "epoch": 0.180875, "grad_norm": 3.5, "grad_norm_var": 1.755329386393229, "learning_rate": 0.0001, "loss": 5.9218, "loss/crossentropy": 2.3795002698898315, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19134049117565155, "step": 5788 }, { "epoch": 0.1809375, "grad_norm": 4.375, "grad_norm_var": 1.745368448893229, "learning_rate": 0.0001, "loss": 6.3283, "loss/crossentropy": 2.644761562347412, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20507150143384933, "step": 5790 }, { "epoch": 0.181, "grad_norm": 3.40625, "grad_norm_var": 0.07665608723958334, "learning_rate": 0.0001, "loss": 6.5596, "loss/crossentropy": 2.7409855127334595, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21584705263376236, "step": 5792 }, { "epoch": 0.1810625, "grad_norm": 3.453125, "grad_norm_var": 0.0763671875, "learning_rate": 0.0001, "loss": 5.9313, "loss/crossentropy": 2.430976629257202, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19105058908462524, "step": 5794 }, { "epoch": 0.181125, "grad_norm": 3.515625, "grad_norm_var": 0.07681376139322917, "learning_rate": 0.0001, "loss": 6.2093, "loss/crossentropy": 2.5864611864089966, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19939538091421127, "step": 5796 }, { "epoch": 0.1811875, "grad_norm": 3.78125, "grad_norm_var": 0.07241109212239584, "learning_rate": 0.0001, "loss": 6.3301, "loss/crossentropy": 2.6111371517181396, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.203924298286438, "step": 5798 }, { "epoch": 0.18125, "grad_norm": 3.75, "grad_norm_var": 0.07019856770833334, "learning_rate": 0.0001, "loss": 6.1891, "loss/crossentropy": 2.4769375324249268, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2055877298116684, "step": 5800 }, { "epoch": 0.1813125, "grad_norm": 3.328125, "grad_norm_var": 0.07612202962239584, "learning_rate": 0.0001, "loss": 6.229, "loss/crossentropy": 2.6561849117279053, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19556637108325958, "step": 5802 }, { "epoch": 0.181375, "grad_norm": 3.546875, "grad_norm_var": 0.06879781087239584, "learning_rate": 0.0001, "loss": 6.2133, "loss/crossentropy": 2.5491530895233154, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20156937837600708, "step": 5804 }, { "epoch": 0.1814375, "grad_norm": 3.703125, "grad_norm_var": 0.037083943684895836, "learning_rate": 0.0001, "loss": 6.4403, "loss/crossentropy": 2.6249520778656006, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.2116159126162529, "step": 5806 }, { "epoch": 0.1815, "grad_norm": 3.65625, "grad_norm_var": 0.02398681640625, "learning_rate": 0.0001, "loss": 6.0739, "loss/crossentropy": 2.522126793861389, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.1922825202345848, "step": 5808 }, { "epoch": 0.1815625, "grad_norm": 4.46875, "grad_norm_var": 0.06580403645833334, "learning_rate": 0.0001, "loss": 6.8166, "loss/crossentropy": 2.8532909154891968, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22562817484140396, "step": 5810 }, { "epoch": 0.181625, "grad_norm": 3.6875, "grad_norm_var": 0.12453511555989584, "learning_rate": 0.0001, "loss": 6.6679, "loss/crossentropy": 2.8089990615844727, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21948668360710144, "step": 5812 }, { "epoch": 0.1816875, "grad_norm": 3.703125, "grad_norm_var": 1.460643513997396, "learning_rate": 0.0001, "loss": 6.3886, "loss/crossentropy": 2.6490007638931274, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2098982334136963, "step": 5814 }, { "epoch": 0.18175, "grad_norm": 3.671875, "grad_norm_var": 1.4552480061848958, "learning_rate": 0.0001, "loss": 6.1484, "loss/crossentropy": 2.4922189712524414, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.1999887079000473, "step": 5816 }, { "epoch": 0.1818125, "grad_norm": 3.828125, "grad_norm_var": 1.4140625, "learning_rate": 0.0001, "loss": 6.5733, "loss/crossentropy": 2.758327007293701, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21665561199188232, "step": 5818 }, { "epoch": 0.181875, "grad_norm": 3.5625, "grad_norm_var": 1.4061197916666666, "learning_rate": 0.0001, "loss": 6.2049, "loss/crossentropy": 2.554853320121765, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20055203139781952, "step": 5820 }, { "epoch": 0.1819375, "grad_norm": 3.734375, "grad_norm_var": 1.4133290608723958, "learning_rate": 0.0001, "loss": 6.5205, "loss/crossentropy": 2.6887972354888916, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21519909799098969, "step": 5822 }, { "epoch": 0.182, "grad_norm": 6.40625, "grad_norm_var": 1.7002237955729167, "learning_rate": 0.0001, "loss": 6.4641, "loss/crossentropy": 2.568132162094116, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.22943571954965591, "step": 5824 }, { "epoch": 0.1820625, "grad_norm": 3.5625, "grad_norm_var": 1.7548177083333334, "learning_rate": 0.0001, "loss": 6.3732, "loss/crossentropy": 2.7769051790237427, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19713202863931656, "step": 5826 }, { "epoch": 0.182125, "grad_norm": 3.96875, "grad_norm_var": 1.7350545247395834, "learning_rate": 0.0001, "loss": 6.5541, "loss/crossentropy": 2.8054513931274414, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.209628663957119, "step": 5828 }, { "epoch": 0.1821875, "grad_norm": 3.65625, "grad_norm_var": 0.49374593098958336, "learning_rate": 0.0001, "loss": 6.231, "loss/crossentropy": 2.5347152948379517, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20673973113298416, "step": 5830 }, { "epoch": 0.18225, "grad_norm": 3.6875, "grad_norm_var": 0.49521077473958336, "learning_rate": 0.0001, "loss": 6.0622, "loss/crossentropy": 2.431654691696167, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19938179105520248, "step": 5832 }, { "epoch": 0.1823125, "grad_norm": 3.890625, "grad_norm_var": 0.49771219889322915, "learning_rate": 0.0001, "loss": 6.4294, "loss/crossentropy": 2.6954740285873413, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20581210404634476, "step": 5834 }, { "epoch": 0.182375, "grad_norm": 3.5, "grad_norm_var": 0.5013417561848958, "learning_rate": 0.0001, "loss": 6.3685, "loss/crossentropy": 2.5816421508789062, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21266847103834152, "step": 5836 }, { "epoch": 0.1824375, "grad_norm": 3.484375, "grad_norm_var": 0.5062733968098958, "learning_rate": 0.0001, "loss": 6.3101, "loss/crossentropy": 2.661076307296753, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20084363222122192, "step": 5838 }, { "epoch": 0.1825, "grad_norm": 3.34375, "grad_norm_var": 0.031346638997395836, "learning_rate": 0.0001, "loss": 6.1729, "loss/crossentropy": 2.5334683656692505, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20027121901512146, "step": 5840 }, { "epoch": 0.1825625, "grad_norm": 3.71875, "grad_norm_var": 0.0342193603515625, "learning_rate": 0.0001, "loss": 5.9912, "loss/crossentropy": 2.4580332040786743, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18847763538360596, "step": 5842 }, { "epoch": 0.182625, "grad_norm": 3.625, "grad_norm_var": 0.022826131184895834, "learning_rate": 0.0001, "loss": 6.1967, "loss/crossentropy": 2.510142683982849, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20498262345790863, "step": 5844 }, { "epoch": 0.1826875, "grad_norm": 3.453125, "grad_norm_var": 0.02457275390625, "learning_rate": 0.0001, "loss": 5.8873, "loss/crossentropy": 2.379484176635742, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.192183256149292, "step": 5846 }, { "epoch": 0.18275, "grad_norm": 3.5625, "grad_norm_var": 0.05812886555989583, "learning_rate": 0.0001, "loss": 6.1376, "loss/crossentropy": 2.4847766160964966, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2043425291776657, "step": 5848 }, { "epoch": 0.1828125, "grad_norm": 3.484375, "grad_norm_var": 0.055052693684895834, "learning_rate": 0.0001, "loss": 6.0785, "loss/crossentropy": 2.466696262359619, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20063114166259766, "step": 5850 }, { "epoch": 0.182875, "grad_norm": 3.40625, "grad_norm_var": 0.05804036458333333, "learning_rate": 0.0001, "loss": 5.8681, "loss/crossentropy": 2.3853559494018555, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1912408024072647, "step": 5852 }, { "epoch": 0.1829375, "grad_norm": 3.59375, "grad_norm_var": 0.0567291259765625, "learning_rate": 0.0001, "loss": 6.2438, "loss/crossentropy": 2.5789082050323486, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.2028198093175888, "step": 5854 }, { "epoch": 0.183, "grad_norm": 3.203125, "grad_norm_var": 0.06461181640625, "learning_rate": 0.0001, "loss": 5.9171, "loss/crossentropy": 2.3911736011505127, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1955569088459015, "step": 5856 }, { "epoch": 0.1830625, "grad_norm": 3.4375, "grad_norm_var": 0.0631988525390625, "learning_rate": 0.0001, "loss": 5.8855, "loss/crossentropy": 2.3705540895462036, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18938453495502472, "step": 5858 }, { "epoch": 0.183125, "grad_norm": 3.734375, "grad_norm_var": 0.06767578125, "learning_rate": 0.0001, "loss": 6.2594, "loss/crossentropy": 2.577322840690613, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20453617721796036, "step": 5860 }, { "epoch": 0.1831875, "grad_norm": 3.421875, "grad_norm_var": 0.06760660807291667, "learning_rate": 0.0001, "loss": 5.9125, "loss/crossentropy": 2.3466345071792603, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19018106907606125, "step": 5862 }, { "epoch": 0.18325, "grad_norm": 3.875, "grad_norm_var": 0.0389312744140625, "learning_rate": 0.0001, "loss": 6.6228, "loss/crossentropy": 2.7842180728912354, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21862202137708664, "step": 5864 }, { "epoch": 0.1833125, "grad_norm": 3.359375, "grad_norm_var": 0.049592081705729166, "learning_rate": 0.0001, "loss": 6.4839, "loss/crossentropy": 2.7767481803894043, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2054857388138771, "step": 5866 }, { "epoch": 0.183375, "grad_norm": 4.21875, "grad_norm_var": 0.07108968098958333, "learning_rate": 0.0001, "loss": 6.4285, "loss/crossentropy": 2.7161409854888916, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2036587819457054, "step": 5868 }, { "epoch": 0.1834375, "grad_norm": 3.71875, "grad_norm_var": 0.07155659993489584, "learning_rate": 0.0001, "loss": 6.3881, "loss/crossentropy": 2.6537606716156006, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20976366102695465, "step": 5870 }, { "epoch": 0.1835, "grad_norm": 3.578125, "grad_norm_var": 0.06264546712239584, "learning_rate": 0.0001, "loss": 6.5392, "loss/crossentropy": 2.760679841041565, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21339457482099533, "step": 5872 }, { "epoch": 0.1835625, "grad_norm": 3.59375, "grad_norm_var": 0.05198465983072917, "learning_rate": 0.0001, "loss": 6.4479, "loss/crossentropy": 2.747857093811035, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20672143250703812, "step": 5874 }, { "epoch": 0.183625, "grad_norm": 3.46875, "grad_norm_var": 0.05926106770833333, "learning_rate": 0.0001, "loss": 6.1602, "loss/crossentropy": 2.582856297492981, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19680150598287582, "step": 5876 }, { "epoch": 0.1836875, "grad_norm": 3.65625, "grad_norm_var": 0.05678609212239583, "learning_rate": 0.0001, "loss": 6.4322, "loss/crossentropy": 2.755292773246765, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20363199710845947, "step": 5878 }, { "epoch": 0.18375, "grad_norm": 3.609375, "grad_norm_var": 0.0533599853515625, "learning_rate": 0.0001, "loss": 6.3255, "loss/crossentropy": 2.6330443620681763, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20635131001472473, "step": 5880 }, { "epoch": 0.1838125, "grad_norm": 3.453125, "grad_norm_var": 0.043701171875, "learning_rate": 0.0001, "loss": 6.3948, "loss/crossentropy": 2.711233615875244, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20507829636335373, "step": 5882 }, { "epoch": 0.183875, "grad_norm": 3.40625, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 6.1855, "loss/crossentropy": 2.6189242601394653, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19532468914985657, "step": 5884 }, { "epoch": 0.1839375, "grad_norm": 3.421875, "grad_norm_var": 0.022233072916666666, "learning_rate": 0.0001, "loss": 6.3397, "loss/crossentropy": 2.698093295097351, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20478975027799606, "step": 5886 }, { "epoch": 0.184, "grad_norm": 3.296875, "grad_norm_var": 0.018708292643229166, "learning_rate": 0.0001, "loss": 5.7069, "loss/crossentropy": 2.351386785507202, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.17383243143558502, "step": 5888 }, { "epoch": 0.1840625, "grad_norm": 3.296875, "grad_norm_var": 0.022728474934895833, "learning_rate": 0.0001, "loss": 5.966, "loss/crossentropy": 2.510135769844055, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18855902552604675, "step": 5890 }, { "epoch": 0.184125, "grad_norm": 3.765625, "grad_norm_var": 0.033935546875, "learning_rate": 0.0001, "loss": 6.1203, "loss/crossentropy": 2.476384997367859, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20696952939033508, "step": 5892 }, { "epoch": 0.1841875, "grad_norm": 3.28125, "grad_norm_var": 0.0362213134765625, "learning_rate": 0.0001, "loss": 5.9829, "loss/crossentropy": 2.4397428035736084, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.192990280687809, "step": 5894 }, { "epoch": 0.18425, "grad_norm": 4.34375, "grad_norm_var": 0.08364969889322917, "learning_rate": 0.0001, "loss": 5.778, "loss/crossentropy": 2.3311071395874023, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.17945268750190735, "step": 5896 }, { "epoch": 0.1843125, "grad_norm": 3.390625, "grad_norm_var": 0.0907135009765625, "learning_rate": 0.0001, "loss": 6.2392, "loss/crossentropy": 2.6726242303848267, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.2004098892211914, "step": 5898 }, { "epoch": 0.184375, "grad_norm": 3.734375, "grad_norm_var": 0.0914215087890625, "learning_rate": 0.0001, "loss": 6.2737, "loss/crossentropy": 2.6352880001068115, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20017390698194504, "step": 5900 }, { "epoch": 0.1844375, "grad_norm": 3.484375, "grad_norm_var": 0.09117431640625, "learning_rate": 0.0001, "loss": 6.3768, "loss/crossentropy": 2.702660083770752, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20491401851177216, "step": 5902 }, { "epoch": 0.1845, "grad_norm": 3.625, "grad_norm_var": 0.08385009765625, "learning_rate": 0.0001, "loss": 6.3654, "loss/crossentropy": 2.5936564207077026, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21232753992080688, "step": 5904 }, { "epoch": 0.1845625, "grad_norm": 3.296875, "grad_norm_var": 0.0859771728515625, "learning_rate": 0.0001, "loss": 5.9536, "loss/crossentropy": 2.4742748737335205, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1905067041516304, "step": 5906 }, { "epoch": 0.184625, "grad_norm": 3.609375, "grad_norm_var": 0.08317057291666667, "learning_rate": 0.0001, "loss": 6.1984, "loss/crossentropy": 2.583977222442627, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.2040199413895607, "step": 5908 }, { "epoch": 0.1846875, "grad_norm": 3.734375, "grad_norm_var": 0.0803375244140625, "learning_rate": 0.0001, "loss": 6.5871, "loss/crossentropy": 2.709283709526062, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2194249927997589, "step": 5910 }, { "epoch": 0.18475, "grad_norm": 3.5, "grad_norm_var": 0.04755859375, "learning_rate": 0.0001, "loss": 5.9032, "loss/crossentropy": 2.3166788816452026, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1953662857413292, "step": 5912 }, { "epoch": 0.1848125, "grad_norm": 3.859375, "grad_norm_var": 0.04663798014322917, "learning_rate": 0.0001, "loss": 6.475, "loss/crossentropy": 2.7360684871673584, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20944271236658096, "step": 5914 }, { "epoch": 0.184875, "grad_norm": 4.0625, "grad_norm_var": 0.0570465087890625, "learning_rate": 0.0001, "loss": 6.0926, "loss/crossentropy": 2.427862048149109, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2039765566587448, "step": 5916 }, { "epoch": 0.1849375, "grad_norm": 3.8125, "grad_norm_var": 0.05185546875, "learning_rate": 0.0001, "loss": 6.5126, "loss/crossentropy": 2.7930887937545776, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20515745878219604, "step": 5918 }, { "epoch": 0.185, "grad_norm": 4.8125, "grad_norm_var": 0.13144124348958333, "learning_rate": 0.0001, "loss": 6.4902, "loss/crossentropy": 2.752415418624878, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2085418626666069, "step": 5920 }, { "epoch": 0.1850625, "grad_norm": 3.5625, "grad_norm_var": 0.09431864420572916, "learning_rate": 0.0001, "loss": 6.0656, "loss/crossentropy": 2.5158064365386963, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19677375257015228, "step": 5922 }, { "epoch": 0.185125, "grad_norm": 4.09375, "grad_norm_var": 0.09625244140625, "learning_rate": 0.0001, "loss": 6.4938, "loss/crossentropy": 2.683521032333374, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21579697728157043, "step": 5924 }, { "epoch": 0.1851875, "grad_norm": 3.609375, "grad_norm_var": 0.1140045166015625, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.217368483543396, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.182216115295887, "step": 5926 }, { "epoch": 0.18525, "grad_norm": 3.34375, "grad_norm_var": 0.12662353515625, "learning_rate": 0.0001, "loss": 5.9032, "loss/crossentropy": 2.399585723876953, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1882522851228714, "step": 5928 }, { "epoch": 0.1853125, "grad_norm": 4.15625, "grad_norm_var": 0.1364654541015625, "learning_rate": 0.0001, "loss": 6.5129, "loss/crossentropy": 2.6554906368255615, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21972205489873886, "step": 5930 }, { "epoch": 0.185375, "grad_norm": 3.203125, "grad_norm_var": 0.1580718994140625, "learning_rate": 0.0001, "loss": 6.1766, "loss/crossentropy": 2.5827823877334595, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19922207295894623, "step": 5932 }, { "epoch": 0.1854375, "grad_norm": 3.921875, "grad_norm_var": 0.161376953125, "learning_rate": 0.0001, "loss": 6.298, "loss/crossentropy": 2.5829977989196777, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20508920401334763, "step": 5934 }, { "epoch": 0.1855, "grad_norm": 3.484375, "grad_norm_var": 0.13147684733072917, "learning_rate": 0.0001, "loss": 6.2714, "loss/crossentropy": 2.52670419216156, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21040621399879456, "step": 5936 }, { "epoch": 0.1855625, "grad_norm": 3.609375, "grad_norm_var": 0.14094645182291668, "learning_rate": 0.0001, "loss": 6.1793, "loss/crossentropy": 2.5923471450805664, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1989283263683319, "step": 5938 }, { "epoch": 0.185625, "grad_norm": 3.296875, "grad_norm_var": 0.15126546223958334, "learning_rate": 0.0001, "loss": 6.0703, "loss/crossentropy": 2.5271341800689697, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1937677189707756, "step": 5940 }, { "epoch": 0.1856875, "grad_norm": 3.625, "grad_norm_var": 0.1473541259765625, "learning_rate": 0.0001, "loss": 6.6569, "loss/crossentropy": 2.8995094299316406, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21206432580947876, "step": 5942 }, { "epoch": 0.18575, "grad_norm": 3.3125, "grad_norm_var": 0.16066080729166668, "learning_rate": 0.0001, "loss": 5.5161, "loss/crossentropy": 2.213390350341797, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17207244038581848, "step": 5944 }, { "epoch": 0.1858125, "grad_norm": 3.75, "grad_norm_var": 0.14329427083333332, "learning_rate": 0.0001, "loss": 6.0463, "loss/crossentropy": 2.470386028289795, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19392302632331848, "step": 5946 }, { "epoch": 0.185875, "grad_norm": 3.5625, "grad_norm_var": 0.17496744791666666, "learning_rate": 0.0001, "loss": 6.4909, "loss/crossentropy": 2.6137181520462036, "loss/hidden": 1.73046875, "loss/jsd": 0.0, "loss/logits": 0.21466894447803497, "step": 5948 }, { "epoch": 0.1859375, "grad_norm": 3.828125, "grad_norm_var": 0.17626546223958334, "learning_rate": 0.0001, "loss": 6.1844, "loss/crossentropy": 2.5273345708847046, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2047644555568695, "step": 5950 }, { "epoch": 0.186, "grad_norm": 3.65625, "grad_norm_var": 0.10972391764322917, "learning_rate": 0.0001, "loss": 6.133, "loss/crossentropy": 2.5080472230911255, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2011718973517418, "step": 5952 }, { "epoch": 0.1860625, "grad_norm": 3.890625, "grad_norm_var": 0.1380035400390625, "learning_rate": 0.0001, "loss": 6.3812, "loss/crossentropy": 2.6199456453323364, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20932674407958984, "step": 5954 }, { "epoch": 0.186125, "grad_norm": 3.625, "grad_norm_var": 0.12379150390625, "learning_rate": 0.0001, "loss": 6.1097, "loss/crossentropy": 2.495076298713684, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20013613998889923, "step": 5956 }, { "epoch": 0.1861875, "grad_norm": 3.421875, "grad_norm_var": 0.12910868326822916, "learning_rate": 0.0001, "loss": 6.1547, "loss/crossentropy": 2.533836007118225, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19919445365667343, "step": 5958 }, { "epoch": 0.18625, "grad_norm": 3.5, "grad_norm_var": 0.10904032389322917, "learning_rate": 0.0001, "loss": 6.2077, "loss/crossentropy": 2.6468350887298584, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19710253924131393, "step": 5960 }, { "epoch": 0.1863125, "grad_norm": 5.8125, "grad_norm_var": 0.39912821451822916, "learning_rate": 0.0001, "loss": 6.6428, "loss/crossentropy": 2.8047144412994385, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21544451266527176, "step": 5962 }, { "epoch": 0.186375, "grad_norm": 3.453125, "grad_norm_var": 0.38775634765625, "learning_rate": 0.0001, "loss": 5.9039, "loss/crossentropy": 2.4518396854400635, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1866111382842064, "step": 5964 }, { "epoch": 0.1864375, "grad_norm": 3.703125, "grad_norm_var": 0.4022125244140625, "learning_rate": 0.0001, "loss": 6.3249, "loss/crossentropy": 2.555612087249756, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20856502652168274, "step": 5966 }, { "epoch": 0.1865, "grad_norm": 3.4375, "grad_norm_var": 0.41238606770833336, "learning_rate": 0.0001, "loss": 6.0578, "loss/crossentropy": 2.496393322944641, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19246415048837662, "step": 5968 }, { "epoch": 0.1865625, "grad_norm": 3.625, "grad_norm_var": 0.3786417643229167, "learning_rate": 0.0001, "loss": 5.716, "loss/crossentropy": 2.2444005012512207, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.1823199763894081, "step": 5970 }, { "epoch": 0.186625, "grad_norm": 3.921875, "grad_norm_var": 0.3862050374348958, "learning_rate": 0.0001, "loss": 6.1808, "loss/crossentropy": 2.5879788398742676, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19756118953227997, "step": 5972 }, { "epoch": 0.1866875, "grad_norm": 3.515625, "grad_norm_var": 0.38194071451822914, "learning_rate": 0.0001, "loss": 6.16, "loss/crossentropy": 2.476559638977051, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20388895273208618, "step": 5974 }, { "epoch": 0.18675, "grad_norm": 3.703125, "grad_norm_var": 0.380712890625, "learning_rate": 0.0001, "loss": 6.3285, "loss/crossentropy": 2.6764557361602783, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20426716655492783, "step": 5976 }, { "epoch": 0.1868125, "grad_norm": 3.78125, "grad_norm_var": 0.06890869140625, "learning_rate": 0.0001, "loss": 6.1595, "loss/crossentropy": 2.5331965684890747, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.20560278743505478, "step": 5978 }, { "epoch": 0.186875, "grad_norm": 3.71875, "grad_norm_var": 0.061620076497395836, "learning_rate": 0.0001, "loss": 6.2729, "loss/crossentropy": 2.617649555206299, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20341891050338745, "step": 5980 }, { "epoch": 0.1869375, "grad_norm": 3.53125, "grad_norm_var": 0.031151326497395833, "learning_rate": 0.0001, "loss": 6.3184, "loss/crossentropy": 2.657251715660095, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20166247338056564, "step": 5982 }, { "epoch": 0.187, "grad_norm": 14.6875, "grad_norm_var": 7.671647135416666, "learning_rate": 0.0001, "loss": 6.4626, "loss/crossentropy": 2.511157751083374, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.22912870347499847, "step": 5984 }, { "epoch": 0.1870625, "grad_norm": 3.671875, "grad_norm_var": 7.643797810872396, "learning_rate": 0.0001, "loss": 6.1287, "loss/crossentropy": 2.533773183822632, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.19543274492025375, "step": 5986 }, { "epoch": 0.187125, "grad_norm": 3.75, "grad_norm_var": 7.625886027018229, "learning_rate": 0.0001, "loss": 6.2694, "loss/crossentropy": 2.6192766427993774, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20055712759494781, "step": 5988 }, { "epoch": 0.1871875, "grad_norm": 3.5625, "grad_norm_var": 7.604227701822917, "learning_rate": 0.0001, "loss": 6.4918, "loss/crossentropy": 2.6744190454483032, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.213771253824234, "step": 5990 }, { "epoch": 0.18725, "grad_norm": 3.78125, "grad_norm_var": 7.5575103759765625, "learning_rate": 0.0001, "loss": 6.3335, "loss/crossentropy": 2.694438338279724, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.19984393566846848, "step": 5992 }, { "epoch": 0.1873125, "grad_norm": 3.671875, "grad_norm_var": 7.572516886393229, "learning_rate": 0.0001, "loss": 6.2238, "loss/crossentropy": 2.52706241607666, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20600418001413345, "step": 5994 }, { "epoch": 0.187375, "grad_norm": 3.515625, "grad_norm_var": 7.656012980143229, "learning_rate": 0.0001, "loss": 5.9532, "loss/crossentropy": 2.426245331764221, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19214607775211334, "step": 5996 }, { "epoch": 0.1874375, "grad_norm": 3.796875, "grad_norm_var": 7.616178385416666, "learning_rate": 0.0001, "loss": 6.4061, "loss/crossentropy": 2.657272458076477, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20925995707511902, "step": 5998 }, { "epoch": 0.1875, "grad_norm": 3.5, "grad_norm_var": 0.03191731770833333, "learning_rate": 0.0001, "loss": 6.4515, "loss/crossentropy": 2.7046492099761963, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21218468993902206, "step": 6000 }, { "epoch": 0.1875625, "grad_norm": 3.328125, "grad_norm_var": 0.034521484375, "learning_rate": 0.0001, "loss": 6.2953, "loss/crossentropy": 2.7289458513259888, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1945292055606842, "step": 6002 }, { "epoch": 0.187625, "grad_norm": 3.90625, "grad_norm_var": 0.03873291015625, "learning_rate": 0.0001, "loss": 6.2382, "loss/crossentropy": 2.5910305976867676, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20260342955589294, "step": 6004 }, { "epoch": 0.1876875, "grad_norm": 3.703125, "grad_norm_var": 0.034407552083333334, "learning_rate": 0.0001, "loss": 6.4952, "loss/crossentropy": 2.707619071006775, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.21039412915706635, "step": 6006 }, { "epoch": 0.18775, "grad_norm": 3.734375, "grad_norm_var": 0.038863118489583334, "learning_rate": 0.0001, "loss": 6.1359, "loss/crossentropy": 2.426607131958008, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20881590247154236, "step": 6008 }, { "epoch": 0.1878125, "grad_norm": 3.703125, "grad_norm_var": 0.03825581868489583, "learning_rate": 0.0001, "loss": 6.6067, "loss/crossentropy": 2.7720266580581665, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2154972180724144, "step": 6010 }, { "epoch": 0.187875, "grad_norm": 3.84375, "grad_norm_var": 0.03859761555989583, "learning_rate": 0.0001, "loss": 6.7914, "loss/crossentropy": 2.8925548791885376, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.2219136357307434, "step": 6012 }, { "epoch": 0.1879375, "grad_norm": 3.453125, "grad_norm_var": 0.04293212890625, "learning_rate": 0.0001, "loss": 6.502, "loss/crossentropy": 2.8628028631210327, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2010301947593689, "step": 6014 }, { "epoch": 0.188, "grad_norm": 3.359375, "grad_norm_var": 0.05241597493489583, "learning_rate": 0.0001, "loss": 6.1368, "loss/crossentropy": 2.570828080177307, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19839782267808914, "step": 6016 }, { "epoch": 0.1880625, "grad_norm": 3.5, "grad_norm_var": 0.04946187337239583, "learning_rate": 0.0001, "loss": 6.0992, "loss/crossentropy": 2.5494972467422485, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19520193338394165, "step": 6018 }, { "epoch": 0.188125, "grad_norm": 3.703125, "grad_norm_var": 0.04431864420572917, "learning_rate": 0.0001, "loss": 6.274, "loss/crossentropy": 2.5650126934051514, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20722636580467224, "step": 6020 }, { "epoch": 0.1881875, "grad_norm": 4.0625, "grad_norm_var": 0.054076131184895834, "learning_rate": 0.0001, "loss": 6.8115, "loss/crossentropy": 2.8891549110412598, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2234850898385048, "step": 6022 }, { "epoch": 0.18825, "grad_norm": 3.4375, "grad_norm_var": 0.06729227701822917, "learning_rate": 0.0001, "loss": 5.7411, "loss/crossentropy": 2.296898603439331, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.1795775294303894, "step": 6024 }, { "epoch": 0.1883125, "grad_norm": 3.484375, "grad_norm_var": 0.06712137858072917, "learning_rate": 0.0001, "loss": 6.3372, "loss/crossentropy": 2.705489993095398, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2037922739982605, "step": 6026 }, { "epoch": 0.188375, "grad_norm": 3.5625, "grad_norm_var": 0.04289449055989583, "learning_rate": 0.0001, "loss": 6.5295, "loss/crossentropy": 2.7792210578918457, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20705441385507584, "step": 6028 }, { "epoch": 0.1884375, "grad_norm": 3.5625, "grad_norm_var": 0.049738566080729164, "learning_rate": 0.0001, "loss": 6.4451, "loss/crossentropy": 2.713812470436096, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2071167230606079, "step": 6030 }, { "epoch": 0.1885, "grad_norm": 3.578125, "grad_norm_var": 0.048493448893229166, "learning_rate": 0.0001, "loss": 6.602, "loss/crossentropy": 2.7990516424179077, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.21662269532680511, "step": 6032 }, { "epoch": 0.1885625, "grad_norm": 3.546875, "grad_norm_var": 0.046117146809895836, "learning_rate": 0.0001, "loss": 6.1163, "loss/crossentropy": 2.664048671722412, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18624288588762283, "step": 6034 }, { "epoch": 0.188625, "grad_norm": 6.375, "grad_norm_var": 0.5297688802083333, "learning_rate": 0.0001, "loss": 6.7929, "loss/crossentropy": 2.8639910221099854, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.22219152748584747, "step": 6036 }, { "epoch": 0.1886875, "grad_norm": 3.6875, "grad_norm_var": 0.5204823811848959, "learning_rate": 0.0001, "loss": 6.279, "loss/crossentropy": 2.6860432624816895, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20031289756298065, "step": 6038 }, { "epoch": 0.18875, "grad_norm": 3.421875, "grad_norm_var": 0.49435221354166664, "learning_rate": 0.0001, "loss": 6.1542, "loss/crossentropy": 2.5659446716308594, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19710734486579895, "step": 6040 }, { "epoch": 0.1888125, "grad_norm": 3.703125, "grad_norm_var": 0.48899637858072914, "learning_rate": 0.0001, "loss": 6.3601, "loss/crossentropy": 2.688350558280945, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20194531977176666, "step": 6042 }, { "epoch": 0.188875, "grad_norm": 3.578125, "grad_norm_var": 0.49302978515625, "learning_rate": 0.0001, "loss": 6.3056, "loss/crossentropy": 2.6706860065460205, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.204509399831295, "step": 6044 }, { "epoch": 0.1889375, "grad_norm": 3.734375, "grad_norm_var": 0.48799540201822916, "learning_rate": 0.0001, "loss": 6.5712, "loss/crossentropy": 2.746452808380127, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.21528411656618118, "step": 6046 }, { "epoch": 0.189, "grad_norm": 4.375, "grad_norm_var": 0.5116373697916666, "learning_rate": 0.0001, "loss": 6.3722, "loss/crossentropy": 2.6199615001678467, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21076759696006775, "step": 6048 }, { "epoch": 0.1890625, "grad_norm": 3.5625, "grad_norm_var": 0.5173248291015625, "learning_rate": 0.0001, "loss": 5.7291, "loss/crossentropy": 2.251525402069092, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.18135468661785126, "step": 6050 }, { "epoch": 0.189125, "grad_norm": 3.5, "grad_norm_var": 0.05324605305989583, "learning_rate": 0.0001, "loss": 6.3893, "loss/crossentropy": 2.6776301860809326, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2086646556854248, "step": 6052 }, { "epoch": 0.1891875, "grad_norm": 4.8125, "grad_norm_var": 0.18164774576822917, "learning_rate": 0.0001, "loss": 6.5803, "loss/crossentropy": 2.852060914039612, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2040734738111496, "step": 6054 }, { "epoch": 0.18925, "grad_norm": 3.625, "grad_norm_var": 0.17443033854166667, "learning_rate": 0.0001, "loss": 6.489, "loss/crossentropy": 2.769283890724182, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2090795561671257, "step": 6056 }, { "epoch": 0.1893125, "grad_norm": 3.421875, "grad_norm_var": 0.17888895670572916, "learning_rate": 0.0001, "loss": 5.935, "loss/crossentropy": 2.397734045982361, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1927870362997055, "step": 6058 }, { "epoch": 0.189375, "grad_norm": 3.84375, "grad_norm_var": 0.1787261962890625, "learning_rate": 0.0001, "loss": 6.323, "loss/crossentropy": 2.716283082962036, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19816960394382477, "step": 6060 }, { "epoch": 0.1894375, "grad_norm": 3.46875, "grad_norm_var": 0.1854888916015625, "learning_rate": 0.0001, "loss": 6.3036, "loss/crossentropy": 2.6464054584503174, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20244264602661133, "step": 6062 }, { "epoch": 0.1895, "grad_norm": 3.5625, "grad_norm_var": 0.16020406087239583, "learning_rate": 0.0001, "loss": 6.1906, "loss/crossentropy": 2.612483024597168, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1976574957370758, "step": 6064 }, { "epoch": 0.1895625, "grad_norm": 3.703125, "grad_norm_var": 0.1543121337890625, "learning_rate": 0.0001, "loss": 6.1622, "loss/crossentropy": 2.5031535625457764, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20418957620859146, "step": 6066 }, { "epoch": 0.189625, "grad_norm": 3.96875, "grad_norm_var": 0.1566802978515625, "learning_rate": 0.0001, "loss": 6.4214, "loss/crossentropy": 2.6766852140426636, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20415638387203217, "step": 6068 }, { "epoch": 0.1896875, "grad_norm": 3.546875, "grad_norm_var": 0.026349894205729165, "learning_rate": 0.0001, "loss": 6.3106, "loss/crossentropy": 2.7238508462905884, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19851642102003098, "step": 6070 }, { "epoch": 0.18975, "grad_norm": 4.0, "grad_norm_var": 0.038232421875, "learning_rate": 0.0001, "loss": 6.3525, "loss/crossentropy": 2.5249571800231934, "loss/hidden": 1.71875, "loss/jsd": 0.0, "loss/logits": 0.2108766883611679, "step": 6072 }, { "epoch": 0.1898125, "grad_norm": 4.09375, "grad_norm_var": 0.04902242024739583, "learning_rate": 0.0001, "loss": 6.2244, "loss/crossentropy": 2.585625648498535, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20138001441955566, "step": 6074 }, { "epoch": 0.189875, "grad_norm": 3.6875, "grad_norm_var": 0.043488566080729166, "learning_rate": 0.0001, "loss": 6.2864, "loss/crossentropy": 2.6281604766845703, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20293108373880386, "step": 6076 }, { "epoch": 0.1899375, "grad_norm": 4.375, "grad_norm_var": 0.0712554931640625, "learning_rate": 0.0001, "loss": 6.1245, "loss/crossentropy": 2.405423879623413, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20823708176612854, "step": 6078 }, { "epoch": 0.19, "grad_norm": 3.5, "grad_norm_var": 0.07091471354166666, "learning_rate": 0.0001, "loss": 5.8131, "loss/crossentropy": 2.343820095062256, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18950790911912918, "step": 6080 }, { "epoch": 0.1900625, "grad_norm": 3.734375, "grad_norm_var": 0.06970926920572916, "learning_rate": 0.0001, "loss": 6.6922, "loss/crossentropy": 2.852641463279724, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.2148171365261078, "step": 6082 }, { "epoch": 0.190125, "grad_norm": 3.796875, "grad_norm_var": 0.06372782389322916, "learning_rate": 0.0001, "loss": 6.4927, "loss/crossentropy": 2.714944005012512, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21137383580207825, "step": 6084 }, { "epoch": 0.1901875, "grad_norm": 3.765625, "grad_norm_var": 0.05829671223958333, "learning_rate": 0.0001, "loss": 5.8325, "loss/crossentropy": 2.2950823307037354, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.1877269148826599, "step": 6086 }, { "epoch": 0.19025, "grad_norm": 3.578125, "grad_norm_var": 0.06189676920572917, "learning_rate": 0.0001, "loss": 6.3545, "loss/crossentropy": 2.7253220081329346, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19611873477697372, "step": 6088 }, { "epoch": 0.1903125, "grad_norm": 4.125, "grad_norm_var": 0.0606109619140625, "learning_rate": 0.0001, "loss": 6.5246, "loss/crossentropy": 2.7045950889587402, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21950286626815796, "step": 6090 }, { "epoch": 0.190375, "grad_norm": 4.03125, "grad_norm_var": 0.06428934733072916, "learning_rate": 0.0001, "loss": 6.361, "loss/crossentropy": 2.581110715866089, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.20885251462459564, "step": 6092 }, { "epoch": 0.1904375, "grad_norm": 3.53125, "grad_norm_var": 0.03723856608072917, "learning_rate": 0.0001, "loss": 6.28, "loss/crossentropy": 2.647969961166382, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20030918717384338, "step": 6094 }, { "epoch": 0.1905, "grad_norm": 3.828125, "grad_norm_var": 0.03548075358072917, "learning_rate": 0.0001, "loss": 6.3454, "loss/crossentropy": 2.6669753789901733, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20261073857545853, "step": 6096 }, { "epoch": 0.1905625, "grad_norm": 3.53125, "grad_norm_var": 0.04225972493489583, "learning_rate": 0.0001, "loss": 6.2998, "loss/crossentropy": 2.621742844581604, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20764455944299698, "step": 6098 }, { "epoch": 0.190625, "grad_norm": 3.59375, "grad_norm_var": 0.04215087890625, "learning_rate": 0.0001, "loss": 6.0253, "loss/crossentropy": 2.3852285146713257, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19916538894176483, "step": 6100 }, { "epoch": 0.1906875, "grad_norm": 3.5625, "grad_norm_var": 0.04202473958333333, "learning_rate": 0.0001, "loss": 6.3239, "loss/crossentropy": 2.6609463691711426, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20379143953323364, "step": 6102 }, { "epoch": 0.19075, "grad_norm": 3.90625, "grad_norm_var": 0.04533589680989583, "learning_rate": 0.0001, "loss": 6.1169, "loss/crossentropy": 2.599799633026123, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19663503021001816, "step": 6104 }, { "epoch": 0.1908125, "grad_norm": 3.5, "grad_norm_var": 0.031201171875, "learning_rate": 0.0001, "loss": 6.0963, "loss/crossentropy": 2.561492085456848, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19332732260227203, "step": 6106 }, { "epoch": 0.190875, "grad_norm": 3.3125, "grad_norm_var": 0.022362263997395833, "learning_rate": 0.0001, "loss": 6.1243, "loss/crossentropy": 2.561802625656128, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19258107244968414, "step": 6108 }, { "epoch": 0.1909375, "grad_norm": 3.46875, "grad_norm_var": 0.04573465983072917, "learning_rate": 0.0001, "loss": 6.4606, "loss/crossentropy": 2.772518277168274, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20982690900564194, "step": 6110 }, { "epoch": 0.191, "grad_norm": 3.78125, "grad_norm_var": 0.05339253743489583, "learning_rate": 0.0001, "loss": 6.2575, "loss/crossentropy": 2.608256459236145, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2043812796473503, "step": 6112 }, { "epoch": 0.1910625, "grad_norm": 3.53125, "grad_norm_var": 0.054488118489583334, "learning_rate": 0.0001, "loss": 6.3426, "loss/crossentropy": 2.754101514816284, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.2022107094526291, "step": 6114 }, { "epoch": 0.191125, "grad_norm": 3.46875, "grad_norm_var": 0.05672098795572917, "learning_rate": 0.0001, "loss": 6.064, "loss/crossentropy": 2.5370287895202637, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1929335743188858, "step": 6116 }, { "epoch": 0.1911875, "grad_norm": 3.46875, "grad_norm_var": 0.07105712890625, "learning_rate": 0.0001, "loss": 6.3428, "loss/crossentropy": 2.7027775049209595, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20228593051433563, "step": 6118 }, { "epoch": 0.19125, "grad_norm": 4.15625, "grad_norm_var": 0.08713277180989583, "learning_rate": 0.0001, "loss": 6.389, "loss/crossentropy": 2.6226966381073, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.2102195918560028, "step": 6120 }, { "epoch": 0.1913125, "grad_norm": 3.46875, "grad_norm_var": 0.0928863525390625, "learning_rate": 0.0001, "loss": 6.2733, "loss/crossentropy": 2.6209421157836914, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2019570991396904, "step": 6122 }, { "epoch": 0.191375, "grad_norm": 3.265625, "grad_norm_var": 0.09683329264322917, "learning_rate": 0.0001, "loss": 6.0993, "loss/crossentropy": 2.5557941198349, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1977071538567543, "step": 6124 }, { "epoch": 0.1914375, "grad_norm": 4.125, "grad_norm_var": 0.11159566243489584, "learning_rate": 0.0001, "loss": 6.5564, "loss/crossentropy": 2.638274908065796, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.22267445921897888, "step": 6126 }, { "epoch": 0.1915, "grad_norm": 3.296875, "grad_norm_var": 0.10690816243489583, "learning_rate": 0.0001, "loss": 5.9419, "loss/crossentropy": 2.3865491151809692, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1918674185872078, "step": 6128 }, { "epoch": 0.1915625, "grad_norm": 3.890625, "grad_norm_var": 0.10657145182291666, "learning_rate": 0.0001, "loss": 6.2456, "loss/crossentropy": 2.4741748571395874, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2115134298801422, "step": 6130 }, { "epoch": 0.191625, "grad_norm": 3.515625, "grad_norm_var": 0.0994140625, "learning_rate": 0.0001, "loss": 6.3467, "loss/crossentropy": 2.683540105819702, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.203819178044796, "step": 6132 }, { "epoch": 0.1916875, "grad_norm": 3.546875, "grad_norm_var": 0.10526936848958333, "learning_rate": 0.0001, "loss": 6.2246, "loss/crossentropy": 2.635378360748291, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1968080848455429, "step": 6134 }, { "epoch": 0.19175, "grad_norm": 3.40625, "grad_norm_var": 0.08983968098958334, "learning_rate": 0.0001, "loss": 5.9543, "loss/crossentropy": 2.4498454332351685, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19184674322605133, "step": 6136 }, { "epoch": 0.1918125, "grad_norm": 4.0, "grad_norm_var": 0.09337565104166666, "learning_rate": 0.0001, "loss": 6.1546, "loss/crossentropy": 2.5229196548461914, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.2030087485909462, "step": 6138 }, { "epoch": 0.191875, "grad_norm": 4.71875, "grad_norm_var": 0.14919331868489583, "learning_rate": 0.0001, "loss": 6.5918, "loss/crossentropy": 2.786103367805481, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21572455763816833, "step": 6140 }, { "epoch": 0.1919375, "grad_norm": 4.4375, "grad_norm_var": 0.1832427978515625, "learning_rate": 0.0001, "loss": 6.7382, "loss/crossentropy": 2.689463257789612, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.23808030039072037, "step": 6142 }, { "epoch": 0.192, "grad_norm": 3.390625, "grad_norm_var": 0.18919169108072917, "learning_rate": 0.0001, "loss": 5.9059, "loss/crossentropy": 2.4961681365966797, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18550093472003937, "step": 6144 }, { "epoch": 0.1920625, "grad_norm": 3.71875, "grad_norm_var": 0.20533854166666668, "learning_rate": 0.0001, "loss": 5.8333, "loss/crossentropy": 2.3912068605422974, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18639901280403137, "step": 6146 }, { "epoch": 0.192125, "grad_norm": 3.21875, "grad_norm_var": 0.21966044108072916, "learning_rate": 0.0001, "loss": 6.0744, "loss/crossentropy": 2.5090500116348267, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19559796899557114, "step": 6148 }, { "epoch": 0.1921875, "grad_norm": 3.65625, "grad_norm_var": 0.20067952473958334, "learning_rate": 0.0001, "loss": 6.3968, "loss/crossentropy": 2.7094013690948486, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20545856654644012, "step": 6150 }, { "epoch": 0.19225, "grad_norm": 4.1875, "grad_norm_var": 0.1937164306640625, "learning_rate": 0.0001, "loss": 6.2988, "loss/crossentropy": 2.616774797439575, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20609114319086075, "step": 6152 }, { "epoch": 0.1923125, "grad_norm": 3.21875, "grad_norm_var": 0.214013671875, "learning_rate": 0.0001, "loss": 6.1391, "loss/crossentropy": 2.506002187728882, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20237066596746445, "step": 6154 }, { "epoch": 0.192375, "grad_norm": 5.25, "grad_norm_var": 0.3080963134765625, "learning_rate": 0.0001, "loss": 6.3327, "loss/crossentropy": 2.64454448223114, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20358256250619888, "step": 6156 }, { "epoch": 0.1924375, "grad_norm": 3.796875, "grad_norm_var": 0.25627339680989586, "learning_rate": 0.0001, "loss": 6.125, "loss/crossentropy": 2.4693968296051025, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2038370817899704, "step": 6158 }, { "epoch": 0.1925, "grad_norm": 3.859375, "grad_norm_var": 0.23990478515625, "learning_rate": 0.0001, "loss": 6.5249, "loss/crossentropy": 2.726282238960266, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21697460114955902, "step": 6160 }, { "epoch": 0.1925625, "grad_norm": 3.578125, "grad_norm_var": 0.22180989583333333, "learning_rate": 0.0001, "loss": 6.0866, "loss/crossentropy": 2.3971667289733887, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20370658487081528, "step": 6162 }, { "epoch": 0.192625, "grad_norm": 4.0, "grad_norm_var": 0.19635009765625, "learning_rate": 0.0001, "loss": 6.451, "loss/crossentropy": 2.5855096578598022, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21897225081920624, "step": 6164 }, { "epoch": 0.1926875, "grad_norm": 3.875, "grad_norm_var": 0.19466044108072916, "learning_rate": 0.0001, "loss": 6.4751, "loss/crossentropy": 2.7665354013442993, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.20328181236982346, "step": 6166 }, { "epoch": 0.19275, "grad_norm": 3.328125, "grad_norm_var": 0.20995992024739582, "learning_rate": 0.0001, "loss": 6.134, "loss/crossentropy": 2.5601810216903687, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19292522966861725, "step": 6168 }, { "epoch": 0.1928125, "grad_norm": 3.84375, "grad_norm_var": 0.20896809895833332, "learning_rate": 0.0001, "loss": 6.3857, "loss/crossentropy": 2.703435182571411, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20494025945663452, "step": 6170 }, { "epoch": 0.192875, "grad_norm": 4.3125, "grad_norm_var": 0.0761871337890625, "learning_rate": 0.0001, "loss": 6.5079, "loss/crossentropy": 2.6246687173843384, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2199653834104538, "step": 6172 }, { "epoch": 0.1929375, "grad_norm": 3.671875, "grad_norm_var": 0.07942301432291667, "learning_rate": 0.0001, "loss": 5.8911, "loss/crossentropy": 2.304368019104004, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19813121110200882, "step": 6174 }, { "epoch": 0.193, "grad_norm": 4.09375, "grad_norm_var": 0.09829813639322917, "learning_rate": 0.0001, "loss": 5.9767, "loss/crossentropy": 2.4628864526748657, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1896635890007019, "step": 6176 }, { "epoch": 0.1930625, "grad_norm": 3.84375, "grad_norm_var": 0.10165608723958333, "learning_rate": 0.0001, "loss": 6.2344, "loss/crossentropy": 2.622955799102783, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.198647640645504, "step": 6178 }, { "epoch": 0.193125, "grad_norm": 3.25, "grad_norm_var": 0.10637919108072917, "learning_rate": 0.0001, "loss": 6.2863, "loss/crossentropy": 2.7121388912200928, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19725629687309265, "step": 6180 }, { "epoch": 0.1931875, "grad_norm": 3.6875, "grad_norm_var": 0.10224507649739584, "learning_rate": 0.0001, "loss": 5.8827, "loss/crossentropy": 2.3777220249176025, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18956278264522552, "step": 6182 }, { "epoch": 0.19325, "grad_norm": 3.640625, "grad_norm_var": 0.104443359375, "learning_rate": 0.0001, "loss": 6.2903, "loss/crossentropy": 2.6881426572799683, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19927627593278885, "step": 6184 }, { "epoch": 0.1933125, "grad_norm": 3.890625, "grad_norm_var": 0.09348856608072917, "learning_rate": 0.0001, "loss": 6.3074, "loss/crossentropy": 2.5851895809173584, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20620650053024292, "step": 6186 }, { "epoch": 0.193375, "grad_norm": 3.609375, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 6.0688, "loss/crossentropy": 2.531674385070801, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1935586929321289, "step": 6188 }, { "epoch": 0.1934375, "grad_norm": 3.671875, "grad_norm_var": 0.0791015625, "learning_rate": 0.0001, "loss": 6.2447, "loss/crossentropy": 2.599445343017578, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20085646957159042, "step": 6190 }, { "epoch": 0.1935, "grad_norm": 3.609375, "grad_norm_var": 0.05747782389322917, "learning_rate": 0.0001, "loss": 6.2174, "loss/crossentropy": 2.591778039932251, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20044782757759094, "step": 6192 }, { "epoch": 0.1935625, "grad_norm": 3.453125, "grad_norm_var": 0.05000712076822917, "learning_rate": 0.0001, "loss": 5.9738, "loss/crossentropy": 2.4809694290161133, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19146671891212463, "step": 6194 }, { "epoch": 0.193625, "grad_norm": 3.4375, "grad_norm_var": 0.04241129557291667, "learning_rate": 0.0001, "loss": 5.8573, "loss/crossentropy": 2.316272735595703, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.1896507441997528, "step": 6196 }, { "epoch": 0.1936875, "grad_norm": 3.5, "grad_norm_var": 0.04509175618489583, "learning_rate": 0.0001, "loss": 5.8686, "loss/crossentropy": 2.430908203125, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1859612688422203, "step": 6198 }, { "epoch": 0.19375, "grad_norm": 3.359375, "grad_norm_var": 0.04146219889322917, "learning_rate": 0.0001, "loss": 6.1377, "loss/crossentropy": 2.570114254951477, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1985529363155365, "step": 6200 }, { "epoch": 0.1938125, "grad_norm": 3.59375, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 6.412, "loss/crossentropy": 2.7270954847335815, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20169344544410706, "step": 6202 }, { "epoch": 0.193875, "grad_norm": 3.703125, "grad_norm_var": 0.03767801920572917, "learning_rate": 0.0001, "loss": 5.8985, "loss/crossentropy": 2.3919687271118164, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1920596957206726, "step": 6204 }, { "epoch": 0.1939375, "grad_norm": 3.796875, "grad_norm_var": 0.017699178059895834, "learning_rate": 0.0001, "loss": 6.5405, "loss/crossentropy": 2.784037232398987, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20846372842788696, "step": 6206 }, { "epoch": 0.194, "grad_norm": 3.671875, "grad_norm_var": 0.018387858072916666, "learning_rate": 0.0001, "loss": 5.9835, "loss/crossentropy": 2.3724876642227173, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2005559653043747, "step": 6208 }, { "epoch": 0.1940625, "grad_norm": 3.515625, "grad_norm_var": 0.03406473795572917, "learning_rate": 0.0001, "loss": 6.2921, "loss/crossentropy": 2.662288188934326, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19813786447048187, "step": 6210 }, { "epoch": 0.194125, "grad_norm": 3.921875, "grad_norm_var": 0.03808186848958333, "learning_rate": 0.0001, "loss": 6.4928, "loss/crossentropy": 2.73208487033844, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21122856438159943, "step": 6212 }, { "epoch": 0.1941875, "grad_norm": 3.328125, "grad_norm_var": 0.0408843994140625, "learning_rate": 0.0001, "loss": 5.8068, "loss/crossentropy": 2.2671496868133545, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19107583165168762, "step": 6214 }, { "epoch": 0.19425, "grad_norm": 4.8125, "grad_norm_var": 0.1188140869140625, "learning_rate": 0.0001, "loss": 6.0706, "loss/crossentropy": 2.461466670036316, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19880161434412003, "step": 6216 }, { "epoch": 0.1943125, "grad_norm": 3.71875, "grad_norm_var": 0.11562093098958333, "learning_rate": 0.0001, "loss": 6.2875, "loss/crossentropy": 2.588753581047058, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20659077912569046, "step": 6218 }, { "epoch": 0.194375, "grad_norm": 4.09375, "grad_norm_var": 0.12668863932291666, "learning_rate": 0.0001, "loss": 6.0093, "loss/crossentropy": 2.498833656311035, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19205859303474426, "step": 6220 }, { "epoch": 0.1944375, "grad_norm": 3.796875, "grad_norm_var": 0.13118082682291668, "learning_rate": 0.0001, "loss": 5.934, "loss/crossentropy": 2.3816803693771362, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1927303969860077, "step": 6222 }, { "epoch": 0.1945, "grad_norm": 3.59375, "grad_norm_var": 0.13492431640625, "learning_rate": 0.0001, "loss": 6.3904, "loss/crossentropy": 2.7171014547348022, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.2071734443306923, "step": 6224 }, { "epoch": 0.1945625, "grad_norm": 3.484375, "grad_norm_var": 0.13240559895833334, "learning_rate": 0.0001, "loss": 6.0352, "loss/crossentropy": 2.5309075117111206, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18909665942192078, "step": 6226 }, { "epoch": 0.194625, "grad_norm": 4.21875, "grad_norm_var": 0.18277994791666666, "learning_rate": 0.0001, "loss": 6.4397, "loss/crossentropy": 2.879347324371338, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19080226868391037, "step": 6228 }, { "epoch": 0.1946875, "grad_norm": 3.53125, "grad_norm_var": 0.17288004557291667, "learning_rate": 0.0001, "loss": 6.0098, "loss/crossentropy": 2.470539927482605, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19103093445301056, "step": 6230 }, { "epoch": 0.19475, "grad_norm": 3.40625, "grad_norm_var": 0.09817301432291667, "learning_rate": 0.0001, "loss": 6.6131, "loss/crossentropy": 2.857435464859009, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21111512184143066, "step": 6232 }, { "epoch": 0.1948125, "grad_norm": 3.90625, "grad_norm_var": 0.11474507649739583, "learning_rate": 0.0001, "loss": 6.1691, "loss/crossentropy": 2.5498597621917725, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20059340447187424, "step": 6234 }, { "epoch": 0.194875, "grad_norm": 3.390625, "grad_norm_var": 0.11454671223958333, "learning_rate": 0.0001, "loss": 5.9912, "loss/crossentropy": 2.5641114711761475, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1864565759897232, "step": 6236 }, { "epoch": 0.1949375, "grad_norm": 3.421875, "grad_norm_var": 0.1144927978515625, "learning_rate": 0.0001, "loss": 6.1785, "loss/crossentropy": 2.6006263494491577, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19684504717588425, "step": 6238 }, { "epoch": 0.195, "grad_norm": 3.65625, "grad_norm_var": 0.11428629557291667, "learning_rate": 0.0001, "loss": 6.4466, "loss/crossentropy": 2.6832586526870728, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21032319962978363, "step": 6240 }, { "epoch": 0.1950625, "grad_norm": 3.765625, "grad_norm_var": 0.10955301920572917, "learning_rate": 0.0001, "loss": 6.2545, "loss/crossentropy": 2.583243250846863, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20423732697963715, "step": 6242 }, { "epoch": 0.195125, "grad_norm": 4.15625, "grad_norm_var": 0.05838216145833333, "learning_rate": 0.0001, "loss": 6.3258, "loss/crossentropy": 2.6092876195907593, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2032882571220398, "step": 6244 }, { "epoch": 0.1951875, "grad_norm": 3.40625, "grad_norm_var": 0.06337483723958333, "learning_rate": 0.0001, "loss": 6.0453, "loss/crossentropy": 2.4458929300308228, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19626961648464203, "step": 6246 }, { "epoch": 0.19525, "grad_norm": 3.40625, "grad_norm_var": 0.0630523681640625, "learning_rate": 0.0001, "loss": 6.0131, "loss/crossentropy": 2.524987578392029, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1898244246840477, "step": 6248 }, { "epoch": 0.1953125, "grad_norm": 3.671875, "grad_norm_var": 0.04722900390625, "learning_rate": 0.0001, "loss": 6.153, "loss/crossentropy": 2.53132426738739, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2008429765701294, "step": 6250 }, { "epoch": 0.195375, "grad_norm": 4.03125, "grad_norm_var": 0.0478424072265625, "learning_rate": 0.0001, "loss": 6.2777, "loss/crossentropy": 2.6375958919525146, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2046319842338562, "step": 6252 }, { "epoch": 0.1954375, "grad_norm": 3.5625, "grad_norm_var": 0.04595947265625, "learning_rate": 0.0001, "loss": 6.8056, "loss/crossentropy": 3.0053709745407104, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2128308191895485, "step": 6254 }, { "epoch": 0.1955, "grad_norm": 3.71875, "grad_norm_var": 0.04680989583333333, "learning_rate": 0.0001, "loss": 6.1446, "loss/crossentropy": 2.5195276737213135, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20000825077295303, "step": 6256 }, { "epoch": 0.1955625, "grad_norm": 3.484375, "grad_norm_var": 0.04716695149739583, "learning_rate": 0.0001, "loss": 6.2884, "loss/crossentropy": 2.652225613594055, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2030748426914215, "step": 6258 }, { "epoch": 0.195625, "grad_norm": 3.703125, "grad_norm_var": 0.0280181884765625, "learning_rate": 0.0001, "loss": 6.3109, "loss/crossentropy": 2.706356406211853, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19834348559379578, "step": 6260 }, { "epoch": 0.1956875, "grad_norm": 3.65625, "grad_norm_var": 0.0787506103515625, "learning_rate": 0.0001, "loss": 6.0637, "loss/crossentropy": 2.5506874322891235, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18645452708005905, "step": 6262 }, { "epoch": 0.19575, "grad_norm": 3.390625, "grad_norm_var": 0.090283203125, "learning_rate": 0.0001, "loss": 6.2095, "loss/crossentropy": 2.659890294075012, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19363542646169662, "step": 6264 }, { "epoch": 0.1958125, "grad_norm": 3.5625, "grad_norm_var": 0.16773681640625, "learning_rate": 0.0001, "loss": 6.1357, "loss/crossentropy": 2.4744845628738403, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.19814816117286682, "step": 6266 }, { "epoch": 0.195875, "grad_norm": 3.328125, "grad_norm_var": 0.1708404541015625, "learning_rate": 0.0001, "loss": 6.2998, "loss/crossentropy": 2.6317743062973022, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20274505764245987, "step": 6268 }, { "epoch": 0.1959375, "grad_norm": 3.640625, "grad_norm_var": 0.16910400390625, "learning_rate": 0.0001, "loss": 6.0523, "loss/crossentropy": 2.4604722261428833, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.1931646391749382, "step": 6270 }, { "epoch": 0.196, "grad_norm": 3.78125, "grad_norm_var": 0.16767171223958333, "learning_rate": 0.0001, "loss": 6.2577, "loss/crossentropy": 2.5092904567718506, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20765485614538193, "step": 6272 }, { "epoch": 0.1960625, "grad_norm": 3.625, "grad_norm_var": 0.16330973307291666, "learning_rate": 0.0001, "loss": 6.2526, "loss/crossentropy": 2.607978105545044, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20313771069049835, "step": 6274 }, { "epoch": 0.196125, "grad_norm": 3.265625, "grad_norm_var": 0.19286702473958334, "learning_rate": 0.0001, "loss": 6.0585, "loss/crossentropy": 2.600584626197815, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18876295536756516, "step": 6276 }, { "epoch": 0.1961875, "grad_norm": 3.390625, "grad_norm_var": 0.14591471354166666, "learning_rate": 0.0001, "loss": 6.0282, "loss/crossentropy": 2.3694067001342773, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19908495992422104, "step": 6278 }, { "epoch": 0.19625, "grad_norm": 3.4375, "grad_norm_var": 0.1331451416015625, "learning_rate": 0.0001, "loss": 6.2367, "loss/crossentropy": 2.596962571144104, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.2049897238612175, "step": 6280 }, { "epoch": 0.1963125, "grad_norm": 4.125, "grad_norm_var": 0.06413472493489583, "learning_rate": 0.0001, "loss": 6.5489, "loss/crossentropy": 2.7599679231643677, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21326513588428497, "step": 6282 }, { "epoch": 0.196375, "grad_norm": 3.53125, "grad_norm_var": 0.05855712890625, "learning_rate": 0.0001, "loss": 6.1641, "loss/crossentropy": 2.602334141731262, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1979694366455078, "step": 6284 }, { "epoch": 0.1964375, "grad_norm": 3.34375, "grad_norm_var": 0.062409464518229166, "learning_rate": 0.0001, "loss": 5.9995, "loss/crossentropy": 2.484236478805542, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1905846670269966, "step": 6286 }, { "epoch": 0.1965, "grad_norm": 4.0625, "grad_norm_var": 0.06709696451822916, "learning_rate": 0.0001, "loss": 5.9537, "loss/crossentropy": 2.2918028831481934, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20212748646736145, "step": 6288 }, { "epoch": 0.1965625, "grad_norm": 3.5625, "grad_norm_var": 0.06467997233072917, "learning_rate": 0.0001, "loss": 6.3358, "loss/crossentropy": 2.6319711208343506, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.21062206476926804, "step": 6290 }, { "epoch": 0.196625, "grad_norm": 3.375, "grad_norm_var": 0.054703776041666666, "learning_rate": 0.0001, "loss": 6.0362, "loss/crossentropy": 2.4948049783706665, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1951577365398407, "step": 6292 }, { "epoch": 0.1966875, "grad_norm": 3.578125, "grad_norm_var": 0.049714152018229166, "learning_rate": 0.0001, "loss": 6.3416, "loss/crossentropy": 2.634096145629883, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20981720089912415, "step": 6294 }, { "epoch": 0.19675, "grad_norm": 3.546875, "grad_norm_var": 0.05196024576822917, "learning_rate": 0.0001, "loss": 6.1901, "loss/crossentropy": 2.591532826423645, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19462481141090393, "step": 6296 }, { "epoch": 0.1968125, "grad_norm": 3.578125, "grad_norm_var": 0.10296223958333334, "learning_rate": 0.0001, "loss": 6.3022, "loss/crossentropy": 2.592622399330139, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20650336146354675, "step": 6298 }, { "epoch": 0.196875, "grad_norm": 3.671875, "grad_norm_var": 0.10568745930989583, "learning_rate": 0.0001, "loss": 6.5115, "loss/crossentropy": 2.7239500284194946, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.2103961780667305, "step": 6300 }, { "epoch": 0.1969375, "grad_norm": 3.46875, "grad_norm_var": 0.09684244791666667, "learning_rate": 0.0001, "loss": 6.5188, "loss/crossentropy": 2.784137725830078, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20627786219120026, "step": 6302 }, { "epoch": 0.197, "grad_norm": 3.640625, "grad_norm_var": 0.09217122395833334, "learning_rate": 0.0001, "loss": 6.2574, "loss/crossentropy": 2.6546131372451782, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20090806484222412, "step": 6304 }, { "epoch": 0.1970625, "grad_norm": 3.625, "grad_norm_var": 0.09218648274739584, "learning_rate": 0.0001, "loss": 6.459, "loss/crossentropy": 2.7477740049362183, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20822839438915253, "step": 6306 }, { "epoch": 0.197125, "grad_norm": 3.671875, "grad_norm_var": 0.10091044108072916, "learning_rate": 0.0001, "loss": 6.1576, "loss/crossentropy": 2.6425379514694214, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19252388179302216, "step": 6308 }, { "epoch": 0.1971875, "grad_norm": 3.5, "grad_norm_var": 0.101220703125, "learning_rate": 0.0001, "loss": 6.1944, "loss/crossentropy": 2.570142388343811, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20227423310279846, "step": 6310 }, { "epoch": 0.19725, "grad_norm": 3.8125, "grad_norm_var": 59.4859364827474, "learning_rate": 0.0001, "loss": 7.034, "loss/crossentropy": 2.753095507621765, "loss/hidden": 2.0546875, "loss/jsd": 0.0, "loss/logits": 0.22262004762887955, "step": 6312 }, { "epoch": 0.1973125, "grad_norm": 3.875, "grad_norm_var": 59.61048075358073, "learning_rate": 0.0001, "loss": 6.3162, "loss/crossentropy": 2.640398144721985, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20039722323417664, "step": 6314 }, { "epoch": 0.197375, "grad_norm": 3.765625, "grad_norm_var": 59.516988118489586, "learning_rate": 0.0001, "loss": 6.306, "loss/crossentropy": 2.629085659980774, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.207536019384861, "step": 6316 }, { "epoch": 0.1974375, "grad_norm": 3.796875, "grad_norm_var": 59.381900024414065, "learning_rate": 0.0001, "loss": 6.1296, "loss/crossentropy": 2.5182682275772095, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1978505775332451, "step": 6318 }, { "epoch": 0.1975, "grad_norm": 3.90625, "grad_norm_var": 59.28876953125, "learning_rate": 0.0001, "loss": 6.2356, "loss/crossentropy": 2.5801738500595093, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20303912460803986, "step": 6320 }, { "epoch": 0.1975625, "grad_norm": 3.765625, "grad_norm_var": 59.175113932291666, "learning_rate": 0.0001, "loss": 5.987, "loss/crossentropy": 2.393300414085388, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19686954468488693, "step": 6322 }, { "epoch": 0.197625, "grad_norm": 3.625, "grad_norm_var": 59.05861002604167, "learning_rate": 0.0001, "loss": 6.3308, "loss/crossentropy": 2.6749523878097534, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20504015684127808, "step": 6324 }, { "epoch": 0.1976875, "grad_norm": 3.734375, "grad_norm_var": 59.07056884765625, "learning_rate": 0.0001, "loss": 6.1818, "loss/crossentropy": 2.6099843978881836, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19741416722536087, "step": 6326 }, { "epoch": 0.19775, "grad_norm": 3.703125, "grad_norm_var": 0.048844401041666666, "learning_rate": 0.0001, "loss": 6.1622, "loss/crossentropy": 2.540986657142639, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19727765768766403, "step": 6328 }, { "epoch": 0.1978125, "grad_norm": 3.9375, "grad_norm_var": 0.04920145670572917, "learning_rate": 0.0001, "loss": 6.5252, "loss/crossentropy": 2.7225914001464844, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21346324682235718, "step": 6330 }, { "epoch": 0.197875, "grad_norm": 3.5, "grad_norm_var": 0.033447265625, "learning_rate": 0.0001, "loss": 6.5429, "loss/crossentropy": 2.8141279220581055, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.21155305206775665, "step": 6332 }, { "epoch": 0.1979375, "grad_norm": 3.640625, "grad_norm_var": 0.025861612955729165, "learning_rate": 0.0001, "loss": 6.1337, "loss/crossentropy": 2.5853216648101807, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19389677047729492, "step": 6334 }, { "epoch": 0.198, "grad_norm": 3.5, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 5.9631, "loss/crossentropy": 2.4103667736053467, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1916046440601349, "step": 6336 }, { "epoch": 0.1980625, "grad_norm": 3.71875, "grad_norm_var": 0.025324503580729168, "learning_rate": 0.0001, "loss": 6.2243, "loss/crossentropy": 2.612160801887512, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.20261822640895844, "step": 6338 }, { "epoch": 0.198125, "grad_norm": 3.328125, "grad_norm_var": 0.031103515625, "learning_rate": 0.0001, "loss": 6.0515, "loss/crossentropy": 2.571484923362732, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19018612802028656, "step": 6340 }, { "epoch": 0.1981875, "grad_norm": 3.328125, "grad_norm_var": 0.047200520833333336, "learning_rate": 0.0001, "loss": 5.8307, "loss/crossentropy": 2.4519015550613403, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18397627025842667, "step": 6342 }, { "epoch": 0.19825, "grad_norm": 3.6875, "grad_norm_var": 0.04722391764322917, "learning_rate": 0.0001, "loss": 6.2341, "loss/crossentropy": 2.633862018585205, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.2002565562725067, "step": 6344 }, { "epoch": 0.1983125, "grad_norm": 3.859375, "grad_norm_var": 0.03828125, "learning_rate": 0.0001, "loss": 6.365, "loss/crossentropy": 2.621291160583496, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20913610607385635, "step": 6346 }, { "epoch": 0.198375, "grad_norm": 3.296875, "grad_norm_var": 0.0399810791015625, "learning_rate": 0.0001, "loss": 6.2709, "loss/crossentropy": 2.7081961631774902, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19533439725637436, "step": 6348 }, { "epoch": 0.1984375, "grad_norm": 3.484375, "grad_norm_var": 0.0437408447265625, "learning_rate": 0.0001, "loss": 6.3223, "loss/crossentropy": 2.6249715089797974, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20723501592874527, "step": 6350 }, { "epoch": 0.1985, "grad_norm": 3.65625, "grad_norm_var": 0.049462890625, "learning_rate": 0.0001, "loss": 6.4, "loss/crossentropy": 2.7535150051116943, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20253852754831314, "step": 6352 }, { "epoch": 0.1985625, "grad_norm": 3.421875, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 6.5861, "loss/crossentropy": 2.8564473390579224, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.21241725236177444, "step": 6354 }, { "epoch": 0.198625, "grad_norm": 3.609375, "grad_norm_var": 0.044676717122395834, "learning_rate": 0.0001, "loss": 6.5185, "loss/crossentropy": 2.7504091262817383, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2150934562087059, "step": 6356 }, { "epoch": 0.1986875, "grad_norm": 3.484375, "grad_norm_var": 0.03127339680989583, "learning_rate": 0.0001, "loss": 6.1064, "loss/crossentropy": 2.5444202423095703, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19291561096906662, "step": 6358 }, { "epoch": 0.19875, "grad_norm": 4.75, "grad_norm_var": 0.11840718587239583, "learning_rate": 0.0001, "loss": 6.6099, "loss/crossentropy": 2.734977602958679, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.21952679008245468, "step": 6360 }, { "epoch": 0.1988125, "grad_norm": 4.46875, "grad_norm_var": 0.17399088541666666, "learning_rate": 0.0001, "loss": 6.2625, "loss/crossentropy": 2.4923490285873413, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21412266045808792, "step": 6362 }, { "epoch": 0.198875, "grad_norm": 3.578125, "grad_norm_var": 0.17430013020833332, "learning_rate": 0.0001, "loss": 6.154, "loss/crossentropy": 2.5593817234039307, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.2004818245768547, "step": 6364 }, { "epoch": 0.1989375, "grad_norm": 4.03125, "grad_norm_var": 0.17472330729166666, "learning_rate": 0.0001, "loss": 6.0191, "loss/crossentropy": 2.418843388557434, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19518518447875977, "step": 6366 }, { "epoch": 0.199, "grad_norm": 3.875, "grad_norm_var": 0.19568684895833333, "learning_rate": 0.0001, "loss": 6.4857, "loss/crossentropy": 2.751247763633728, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20782289654016495, "step": 6368 }, { "epoch": 0.1990625, "grad_norm": 3.59375, "grad_norm_var": 0.19297587076822917, "learning_rate": 0.0001, "loss": 6.0458, "loss/crossentropy": 2.521559238433838, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.196170873939991, "step": 6370 }, { "epoch": 0.199125, "grad_norm": 3.25, "grad_norm_var": 0.21787821451822917, "learning_rate": 0.0001, "loss": 5.827, "loss/crossentropy": 2.4283812046051025, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18360844254493713, "step": 6372 }, { "epoch": 0.1991875, "grad_norm": 3.4375, "grad_norm_var": 0.20514322916666666, "learning_rate": 0.0001, "loss": 5.9552, "loss/crossentropy": 2.408576011657715, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19568176567554474, "step": 6374 }, { "epoch": 0.19925, "grad_norm": 3.890625, "grad_norm_var": 0.13240559895833334, "learning_rate": 0.0001, "loss": 6.0052, "loss/crossentropy": 2.449387311935425, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.193084217607975, "step": 6376 }, { "epoch": 0.1993125, "grad_norm": 3.59375, "grad_norm_var": 0.079541015625, "learning_rate": 0.0001, "loss": 6.3974, "loss/crossentropy": 2.7193866968154907, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2068602815270424, "step": 6378 }, { "epoch": 0.199375, "grad_norm": 3.625, "grad_norm_var": 0.07008056640625, "learning_rate": 0.0001, "loss": 6.4284, "loss/crossentropy": 2.7452560663223267, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20386269688606262, "step": 6380 }, { "epoch": 0.1994375, "grad_norm": 3.5, "grad_norm_var": 0.0810455322265625, "learning_rate": 0.0001, "loss": 5.7566, "loss/crossentropy": 2.3827916383743286, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1819118708372116, "step": 6382 }, { "epoch": 0.1995, "grad_norm": 3.734375, "grad_norm_var": 0.04063212076822917, "learning_rate": 0.0001, "loss": 6.378, "loss/crossentropy": 2.6957045793533325, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20417062938213348, "step": 6384 }, { "epoch": 0.1995625, "grad_norm": 3.5625, "grad_norm_var": 0.0408355712890625, "learning_rate": 0.0001, "loss": 6.1671, "loss/crossentropy": 2.603260636329651, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19466431438922882, "step": 6386 }, { "epoch": 0.199625, "grad_norm": 3.65625, "grad_norm_var": 0.04121805826822917, "learning_rate": 0.0001, "loss": 6.5784, "loss/crossentropy": 2.73874568939209, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21717103570699692, "step": 6388 }, { "epoch": 0.1996875, "grad_norm": 3.28125, "grad_norm_var": 0.04345703125, "learning_rate": 0.0001, "loss": 6.2663, "loss/crossentropy": 2.6854538917541504, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19793018698692322, "step": 6390 }, { "epoch": 0.19975, "grad_norm": 6.25, "grad_norm_var": 0.6866770426432292, "learning_rate": 0.0001, "loss": 6.5006, "loss/crossentropy": 2.59263277053833, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.22321807593107224, "step": 6392 }, { "epoch": 0.1998125, "grad_norm": 3.578125, "grad_norm_var": 0.6817535400390625, "learning_rate": 0.0001, "loss": 6.0462, "loss/crossentropy": 2.4960540533065796, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19408106058835983, "step": 6394 }, { "epoch": 0.199875, "grad_norm": 3.625, "grad_norm_var": 0.696044921875, "learning_rate": 0.0001, "loss": 6.0056, "loss/crossentropy": 2.483731508255005, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19280751049518585, "step": 6396 }, { "epoch": 0.1999375, "grad_norm": 4.0, "grad_norm_var": 0.6753000895182292, "learning_rate": 0.0001, "loss": 6.3558, "loss/crossentropy": 2.6343945264816284, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20612553507089615, "step": 6398 }, { "epoch": 0.2, "grad_norm": 3.75, "grad_norm_var": 0.6785959879557292, "learning_rate": 0.0001, "loss": 6.0777, "loss/crossentropy": 2.5519295930862427, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19711043685674667, "step": 6400 }, { "epoch": 0.2000625, "grad_norm": 3.609375, "grad_norm_var": 0.687744140625, "learning_rate": 0.0001, "loss": 6.034, "loss/crossentropy": 2.49414598941803, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1938270404934883, "step": 6402 }, { "epoch": 0.200125, "grad_norm": 3.640625, "grad_norm_var": 0.6919911702473959, "learning_rate": 0.0001, "loss": 6.2959, "loss/crossentropy": 2.6305042505264282, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2020825818181038, "step": 6404 }, { "epoch": 0.2001875, "grad_norm": 6.0625, "grad_norm_var": 0.9452952067057292, "learning_rate": 0.0001, "loss": 6.4622, "loss/crossentropy": 2.616268277168274, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21389038115739822, "step": 6406 }, { "epoch": 0.20025, "grad_norm": 3.375, "grad_norm_var": 0.4216542561848958, "learning_rate": 0.0001, "loss": 6.1744, "loss/crossentropy": 2.6161030530929565, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19450364261865616, "step": 6408 }, { "epoch": 0.2003125, "grad_norm": 3.5625, "grad_norm_var": 0.43029683430989585, "learning_rate": 0.0001, "loss": 6.2095, "loss/crossentropy": 2.6182576417922974, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19779501855373383, "step": 6410 }, { "epoch": 0.200375, "grad_norm": 3.4375, "grad_norm_var": 0.4272369384765625, "learning_rate": 0.0001, "loss": 6.2618, "loss/crossentropy": 2.6078569889068604, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20093682408332825, "step": 6412 }, { "epoch": 0.2004375, "grad_norm": 3.46875, "grad_norm_var": 0.41306966145833335, "learning_rate": 0.0001, "loss": 5.821, "loss/crossentropy": 2.303962230682373, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19662676006555557, "step": 6414 }, { "epoch": 0.2005, "grad_norm": 3.328125, "grad_norm_var": 0.44433186848958334, "learning_rate": 0.0001, "loss": 5.5651, "loss/crossentropy": 2.257362961769104, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17413020133972168, "step": 6416 }, { "epoch": 0.2005625, "grad_norm": 3.546875, "grad_norm_var": 0.43925679524739586, "learning_rate": 0.0001, "loss": 6.4127, "loss/crossentropy": 2.721003532409668, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2070593237876892, "step": 6418 }, { "epoch": 0.200625, "grad_norm": 4.625, "grad_norm_var": 0.49986572265625, "learning_rate": 0.0001, "loss": 6.561, "loss/crossentropy": 2.606340169906616, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.22593571245670319, "step": 6420 }, { "epoch": 0.2006875, "grad_norm": 3.34375, "grad_norm_var": 0.1508697509765625, "learning_rate": 0.0001, "loss": 6.0928, "loss/crossentropy": 2.532419443130493, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19510090351104736, "step": 6422 }, { "epoch": 0.20075, "grad_norm": 3.609375, "grad_norm_var": 0.13479715983072918, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.1905429363250732, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18880540132522583, "step": 6424 }, { "epoch": 0.2008125, "grad_norm": 3.453125, "grad_norm_var": 0.13567301432291667, "learning_rate": 0.0001, "loss": 6.0427, "loss/crossentropy": 2.5184231996536255, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19227512180805206, "step": 6426 }, { "epoch": 0.200875, "grad_norm": 3.421875, "grad_norm_var": 0.137060546875, "learning_rate": 0.0001, "loss": 6.1531, "loss/crossentropy": 2.416158676147461, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20806624740362167, "step": 6428 }, { "epoch": 0.2009375, "grad_norm": 3.53125, "grad_norm_var": 0.13623758951822917, "learning_rate": 0.0001, "loss": 6.4346, "loss/crossentropy": 2.746420979499817, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20592492073774338, "step": 6430 }, { "epoch": 0.201, "grad_norm": 3.53125, "grad_norm_var": 0.11658426920572916, "learning_rate": 0.0001, "loss": 6.1585, "loss/crossentropy": 2.525734782218933, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19882269948720932, "step": 6432 }, { "epoch": 0.2010625, "grad_norm": 3.625, "grad_norm_var": 0.1169342041015625, "learning_rate": 0.0001, "loss": 5.9987, "loss/crossentropy": 2.3857542276382446, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19606362283229828, "step": 6434 }, { "epoch": 0.201125, "grad_norm": 3.765625, "grad_norm_var": 0.019449869791666668, "learning_rate": 0.0001, "loss": 6.494, "loss/crossentropy": 2.761850357055664, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.20289798080921173, "step": 6436 }, { "epoch": 0.2011875, "grad_norm": 3.875, "grad_norm_var": 0.025260416666666667, "learning_rate": 0.0001, "loss": 6.5977, "loss/crossentropy": 2.7616851329803467, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.21602296084165573, "step": 6438 }, { "epoch": 0.20125, "grad_norm": 3.5, "grad_norm_var": 0.028076171875, "learning_rate": 0.0001, "loss": 6.1706, "loss/crossentropy": 2.5454601049423218, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19962288439273834, "step": 6440 }, { "epoch": 0.2013125, "grad_norm": 3.5, "grad_norm_var": 0.03411051432291667, "learning_rate": 0.0001, "loss": 5.9664, "loss/crossentropy": 2.4520124197006226, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19050417840480804, "step": 6442 }, { "epoch": 0.201375, "grad_norm": 3.4375, "grad_norm_var": 0.03203023274739583, "learning_rate": 0.0001, "loss": 6.3257, "loss/crossentropy": 2.726597547531128, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.2028745710849762, "step": 6444 }, { "epoch": 0.2014375, "grad_norm": 3.046875, "grad_norm_var": 0.05325113932291667, "learning_rate": 0.0001, "loss": 5.8921, "loss/crossentropy": 2.405794858932495, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18925213813781738, "step": 6446 }, { "epoch": 0.2015, "grad_norm": 3.421875, "grad_norm_var": 0.05666910807291667, "learning_rate": 0.0001, "loss": 6.0421, "loss/crossentropy": 2.5933274030685425, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18394114822149277, "step": 6448 }, { "epoch": 0.2015625, "grad_norm": 4.25, "grad_norm_var": 0.08600260416666666, "learning_rate": 0.0001, "loss": 6.291, "loss/crossentropy": 2.6293972730636597, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2028760313987732, "step": 6450 }, { "epoch": 0.201625, "grad_norm": 3.484375, "grad_norm_var": 0.08571675618489584, "learning_rate": 0.0001, "loss": 6.3673, "loss/crossentropy": 2.68838632106781, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20578349381685257, "step": 6452 }, { "epoch": 0.2016875, "grad_norm": 3.53125, "grad_norm_var": 0.06887613932291667, "learning_rate": 0.0001, "loss": 6.2586, "loss/crossentropy": 2.695120930671692, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.1934536248445511, "step": 6454 }, { "epoch": 0.20175, "grad_norm": 3.34375, "grad_norm_var": 0.07627665201822917, "learning_rate": 0.0001, "loss": 5.6712, "loss/crossentropy": 2.2751861810684204, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18022499233484268, "step": 6456 }, { "epoch": 0.2018125, "grad_norm": 3.421875, "grad_norm_var": 0.07322489420572917, "learning_rate": 0.0001, "loss": 5.9988, "loss/crossentropy": 2.5843334197998047, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1875448003411293, "step": 6458 }, { "epoch": 0.201875, "grad_norm": 3.421875, "grad_norm_var": 0.07529195149739583, "learning_rate": 0.0001, "loss": 5.957, "loss/crossentropy": 2.453627586364746, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19135772436857224, "step": 6460 }, { "epoch": 0.2019375, "grad_norm": 3.53125, "grad_norm_var": 0.05598856608072917, "learning_rate": 0.0001, "loss": 5.9872, "loss/crossentropy": 2.428037643432617, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1969291865825653, "step": 6462 }, { "epoch": 0.202, "grad_norm": 3.34375, "grad_norm_var": 0.05963541666666667, "learning_rate": 0.0001, "loss": 6.4439, "loss/crossentropy": 2.763149619102478, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20479683578014374, "step": 6464 }, { "epoch": 0.2020625, "grad_norm": 3.484375, "grad_norm_var": 0.024706013997395835, "learning_rate": 0.0001, "loss": 6.3375, "loss/crossentropy": 2.755637764930725, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.198811337351799, "step": 6466 }, { "epoch": 0.202125, "grad_norm": 4.21875, "grad_norm_var": 0.1077056884765625, "learning_rate": 0.0001, "loss": 6.7012, "loss/crossentropy": 2.681010603904724, "loss/hidden": 1.7421875, "loss/jsd": 0.0, "loss/logits": 0.22779762744903564, "step": 6468 }, { "epoch": 0.2021875, "grad_norm": 3.609375, "grad_norm_var": 0.10625, "learning_rate": 0.0001, "loss": 5.7852, "loss/crossentropy": 2.30839204788208, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1871296763420105, "step": 6470 }, { "epoch": 0.20225, "grad_norm": 3.546875, "grad_norm_var": 0.09915262858072917, "learning_rate": 0.0001, "loss": 6.3085, "loss/crossentropy": 2.6761258840560913, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.2030799761414528, "step": 6472 }, { "epoch": 0.2023125, "grad_norm": 3.609375, "grad_norm_var": 0.10269266764322917, "learning_rate": 0.0001, "loss": 6.0303, "loss/crossentropy": 2.5599225759506226, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18922977149486542, "step": 6474 }, { "epoch": 0.202375, "grad_norm": 3.546875, "grad_norm_var": 0.1067779541015625, "learning_rate": 0.0001, "loss": 6.3538, "loss/crossentropy": 2.677946925163269, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.207425557076931, "step": 6476 }, { "epoch": 0.2024375, "grad_norm": 3.703125, "grad_norm_var": 0.1154937744140625, "learning_rate": 0.0001, "loss": 6.4047, "loss/crossentropy": 2.7720776796340942, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1999780759215355, "step": 6478 }, { "epoch": 0.2025, "grad_norm": 3.1875, "grad_norm_var": 0.12296549479166667, "learning_rate": 0.0001, "loss": 6.2042, "loss/crossentropy": 2.692052960395813, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19379380345344543, "step": 6480 }, { "epoch": 0.2025625, "grad_norm": 3.390625, "grad_norm_var": 0.12428385416666667, "learning_rate": 0.0001, "loss": 6.1168, "loss/crossentropy": 2.5561152696609497, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19669246673583984, "step": 6482 }, { "epoch": 0.202625, "grad_norm": 4.03125, "grad_norm_var": 0.063720703125, "learning_rate": 0.0001, "loss": 6.1133, "loss/crossentropy": 2.533989191055298, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19464991986751556, "step": 6484 }, { "epoch": 0.2026875, "grad_norm": 3.46875, "grad_norm_var": 0.06373291015625, "learning_rate": 0.0001, "loss": 6.3021, "loss/crossentropy": 2.685084342956543, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20037659257650375, "step": 6486 }, { "epoch": 0.20275, "grad_norm": 3.625, "grad_norm_var": 0.06504618326822917, "learning_rate": 0.0001, "loss": 6.115, "loss/crossentropy": 2.609553813934326, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19429339468479156, "step": 6488 }, { "epoch": 0.2028125, "grad_norm": 3.140625, "grad_norm_var": 0.06892801920572916, "learning_rate": 0.0001, "loss": 5.9493, "loss/crossentropy": 2.5031588077545166, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18836626410484314, "step": 6490 }, { "epoch": 0.202875, "grad_norm": 3.28125, "grad_norm_var": 0.051590983072916666, "learning_rate": 0.0001, "loss": 5.8351, "loss/crossentropy": 2.389310359954834, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18677020817995071, "step": 6492 }, { "epoch": 0.2029375, "grad_norm": 3.34375, "grad_norm_var": 0.04875895182291667, "learning_rate": 0.0001, "loss": 6.2967, "loss/crossentropy": 2.6834553480148315, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19530673325061798, "step": 6494 }, { "epoch": 0.203, "grad_norm": 3.40625, "grad_norm_var": 0.053544108072916666, "learning_rate": 0.0001, "loss": 6.0616, "loss/crossentropy": 2.5159674882888794, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19284315407276154, "step": 6496 }, { "epoch": 0.2030625, "grad_norm": 3.421875, "grad_norm_var": 0.05363667805989583, "learning_rate": 0.0001, "loss": 6.157, "loss/crossentropy": 2.5118918418884277, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20123399794101715, "step": 6498 }, { "epoch": 0.203125, "grad_norm": 3.671875, "grad_norm_var": 0.03837890625, "learning_rate": 0.0001, "loss": 6.1266, "loss/crossentropy": 2.590605854988098, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19656459987163544, "step": 6500 }, { "epoch": 0.2031875, "grad_norm": 3.453125, "grad_norm_var": 0.04551493326822917, "learning_rate": 0.0001, "loss": 6.005, "loss/crossentropy": 2.442551851272583, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1960894912481308, "step": 6502 }, { "epoch": 0.20325, "grad_norm": 3.875, "grad_norm_var": 0.056050618489583336, "learning_rate": 0.0001, "loss": 6.0774, "loss/crossentropy": 2.550527811050415, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19448236376047134, "step": 6504 }, { "epoch": 0.2033125, "grad_norm": 3.59375, "grad_norm_var": 0.05068359375, "learning_rate": 0.0001, "loss": 6.252, "loss/crossentropy": 2.6041879653930664, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20423421263694763, "step": 6506 }, { "epoch": 0.203375, "grad_norm": 3.359375, "grad_norm_var": 0.045947265625, "learning_rate": 0.0001, "loss": 6.4221, "loss/crossentropy": 2.733941078186035, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2027966007590294, "step": 6508 }, { "epoch": 0.2034375, "grad_norm": 3.3125, "grad_norm_var": 0.04585673014322917, "learning_rate": 0.0001, "loss": 6.0533, "loss/crossentropy": 2.4923083782196045, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1920376867055893, "step": 6510 }, { "epoch": 0.2035, "grad_norm": 4.09375, "grad_norm_var": 0.06238505045572917, "learning_rate": 0.0001, "loss": 6.2568, "loss/crossentropy": 2.6752275228500366, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19799719750881195, "step": 6512 }, { "epoch": 0.2035625, "grad_norm": 3.609375, "grad_norm_var": 0.06461181640625, "learning_rate": 0.0001, "loss": 6.1931, "loss/crossentropy": 2.5532758235931396, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.201869398355484, "step": 6514 }, { "epoch": 0.203625, "grad_norm": 3.578125, "grad_norm_var": 0.0533355712890625, "learning_rate": 0.0001, "loss": 6.2844, "loss/crossentropy": 2.6496084928512573, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19628706574440002, "step": 6516 }, { "epoch": 0.2036875, "grad_norm": 3.921875, "grad_norm_var": 0.0578765869140625, "learning_rate": 0.0001, "loss": 6.2643, "loss/crossentropy": 2.5256234407424927, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20784791558980942, "step": 6518 }, { "epoch": 0.20375, "grad_norm": 3.3125, "grad_norm_var": 0.052685546875, "learning_rate": 0.0001, "loss": 6.0125, "loss/crossentropy": 2.6128029823303223, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18567069619894028, "step": 6520 }, { "epoch": 0.2038125, "grad_norm": 3.46875, "grad_norm_var": 0.054850260416666664, "learning_rate": 0.0001, "loss": 6.0551, "loss/crossentropy": 2.523378372192383, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19458089023828506, "step": 6522 }, { "epoch": 0.203875, "grad_norm": 3.328125, "grad_norm_var": 0.06409505208333334, "learning_rate": 0.0001, "loss": 5.6911, "loss/crossentropy": 2.242918074131012, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.1819288358092308, "step": 6524 }, { "epoch": 0.2039375, "grad_norm": 3.375, "grad_norm_var": 0.06256103515625, "learning_rate": 0.0001, "loss": 6.0178, "loss/crossentropy": 2.5325701236724854, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1879783421754837, "step": 6526 }, { "epoch": 0.204, "grad_norm": 3.1875, "grad_norm_var": 0.0496246337890625, "learning_rate": 0.0001, "loss": 5.8456, "loss/crossentropy": 2.405721068382263, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18773876875638962, "step": 6528 }, { "epoch": 0.2040625, "grad_norm": 3.53125, "grad_norm_var": 0.0484039306640625, "learning_rate": 0.0001, "loss": 6.0804, "loss/crossentropy": 2.5674500465393066, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1942674219608307, "step": 6530 }, { "epoch": 0.204125, "grad_norm": 3.8125, "grad_norm_var": 0.0522125244140625, "learning_rate": 0.0001, "loss": 5.9899, "loss/crossentropy": 2.3522396087646484, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20165231823921204, "step": 6532 }, { "epoch": 0.2041875, "grad_norm": 3.640625, "grad_norm_var": 0.05276285807291667, "learning_rate": 0.0001, "loss": 5.9671, "loss/crossentropy": 2.5224697589874268, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1866544932126999, "step": 6534 }, { "epoch": 0.20425, "grad_norm": 3.46875, "grad_norm_var": 0.04951883951822917, "learning_rate": 0.0001, "loss": 6.3238, "loss/crossentropy": 2.7269598245620728, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.1967899575829506, "step": 6536 }, { "epoch": 0.2043125, "grad_norm": 3.46875, "grad_norm_var": 0.04924214680989583, "learning_rate": 0.0001, "loss": 6.0685, "loss/crossentropy": 2.5368363857269287, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19534919410943985, "step": 6538 }, { "epoch": 0.204375, "grad_norm": 3.953125, "grad_norm_var": 0.046873982747395834, "learning_rate": 0.0001, "loss": 6.089, "loss/crossentropy": 2.502684712409973, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19925415515899658, "step": 6540 }, { "epoch": 0.2044375, "grad_norm": 3.65625, "grad_norm_var": 0.0486480712890625, "learning_rate": 0.0001, "loss": 6.0269, "loss/crossentropy": 2.5080604553222656, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19367945939302444, "step": 6542 }, { "epoch": 0.2045, "grad_norm": 3.796875, "grad_norm_var": 0.04801025390625, "learning_rate": 0.0001, "loss": 6.2834, "loss/crossentropy": 2.530786395072937, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2108120620250702, "step": 6544 }, { "epoch": 0.2045625, "grad_norm": 3.40625, "grad_norm_var": 0.06304931640625, "learning_rate": 0.0001, "loss": 5.9298, "loss/crossentropy": 2.4919790029525757, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18440379947423935, "step": 6546 }, { "epoch": 0.204625, "grad_norm": 4.9375, "grad_norm_var": 0.20811258951822917, "learning_rate": 0.0001, "loss": 5.7784, "loss/crossentropy": 2.259275794029236, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.183159738779068, "step": 6548 }, { "epoch": 0.2046875, "grad_norm": 3.671875, "grad_norm_var": 0.20513916015625, "learning_rate": 0.0001, "loss": 6.3012, "loss/crossentropy": 2.639529585838318, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20600835978984833, "step": 6550 }, { "epoch": 0.20475, "grad_norm": 3.78125, "grad_norm_var": 0.2039947509765625, "learning_rate": 0.0001, "loss": 6.209, "loss/crossentropy": 2.7606594562530518, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18819257616996765, "step": 6552 }, { "epoch": 0.2048125, "grad_norm": 3.484375, "grad_norm_var": 0.19899800618489583, "learning_rate": 0.0001, "loss": 6.2039, "loss/crossentropy": 2.7157788276672363, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.191785030066967, "step": 6554 }, { "epoch": 0.204875, "grad_norm": 3.328125, "grad_norm_var": 0.2031158447265625, "learning_rate": 0.0001, "loss": 5.9576, "loss/crossentropy": 2.3542600870132446, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.2005726769566536, "step": 6556 }, { "epoch": 0.2049375, "grad_norm": 3.515625, "grad_norm_var": 0.19844462076822916, "learning_rate": 0.0001, "loss": 6.4571, "loss/crossentropy": 2.7662460803985596, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20501954853534698, "step": 6558 }, { "epoch": 0.205, "grad_norm": 3.984375, "grad_norm_var": 0.20293680826822916, "learning_rate": 0.0001, "loss": 6.4058, "loss/crossentropy": 2.7039828300476074, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20846319943666458, "step": 6560 }, { "epoch": 0.2050625, "grad_norm": 3.328125, "grad_norm_var": 0.1826171875, "learning_rate": 0.0001, "loss": 6.3746, "loss/crossentropy": 2.7657452821731567, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20150689780712128, "step": 6562 }, { "epoch": 0.205125, "grad_norm": 3.84375, "grad_norm_var": 0.045633951822916664, "learning_rate": 0.0001, "loss": 6.2148, "loss/crossentropy": 2.551281690597534, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20697413384914398, "step": 6564 }, { "epoch": 0.2051875, "grad_norm": 3.5, "grad_norm_var": 0.0443756103515625, "learning_rate": 0.0001, "loss": 5.8608, "loss/crossentropy": 2.430768132209778, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1867513507604599, "step": 6566 }, { "epoch": 0.20525, "grad_norm": 3.234375, "grad_norm_var": 0.05994466145833333, "learning_rate": 0.0001, "loss": 6.2358, "loss/crossentropy": 2.7113460302352905, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19267980754375458, "step": 6568 }, { "epoch": 0.2053125, "grad_norm": 3.515625, "grad_norm_var": 0.05836181640625, "learning_rate": 0.0001, "loss": 5.6412, "loss/crossentropy": 2.3279199600219727, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17859302461147308, "step": 6570 }, { "epoch": 0.205375, "grad_norm": 3.46875, "grad_norm_var": 0.04892578125, "learning_rate": 0.0001, "loss": 6.2538, "loss/crossentropy": 2.665827751159668, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19942322373390198, "step": 6572 }, { "epoch": 0.2054375, "grad_norm": 3.796875, "grad_norm_var": 0.05757548014322917, "learning_rate": 0.0001, "loss": 6.3836, "loss/crossentropy": 2.6853175163269043, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.21006380766630173, "step": 6574 }, { "epoch": 0.2055, "grad_norm": 3.5625, "grad_norm_var": 0.05373942057291667, "learning_rate": 0.0001, "loss": 6.2879, "loss/crossentropy": 2.569413185119629, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20466507971286774, "step": 6576 }, { "epoch": 0.2055625, "grad_norm": 3.6875, "grad_norm_var": 0.0574859619140625, "learning_rate": 0.0001, "loss": 6.2565, "loss/crossentropy": 2.6899293661117554, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19572219252586365, "step": 6578 }, { "epoch": 0.205625, "grad_norm": 3.796875, "grad_norm_var": 0.0571197509765625, "learning_rate": 0.0001, "loss": 6.3495, "loss/crossentropy": 2.7121264934539795, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20084993541240692, "step": 6580 }, { "epoch": 0.2056875, "grad_norm": 3.40625, "grad_norm_var": 0.052294921875, "learning_rate": 0.0001, "loss": 6.0029, "loss/crossentropy": 2.565826892852783, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18589099496603012, "step": 6582 }, { "epoch": 0.20575, "grad_norm": 3.1875, "grad_norm_var": 0.048173014322916666, "learning_rate": 0.0001, "loss": 6.0348, "loss/crossentropy": 2.5720603466033936, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.18338802456855774, "step": 6584 }, { "epoch": 0.2058125, "grad_norm": 3.765625, "grad_norm_var": 0.0484375, "learning_rate": 0.0001, "loss": 6.2553, "loss/crossentropy": 2.6761839389801025, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19697345048189163, "step": 6586 }, { "epoch": 0.205875, "grad_norm": 3.40625, "grad_norm_var": 0.05034077962239583, "learning_rate": 0.0001, "loss": 6.2078, "loss/crossentropy": 2.63966965675354, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1986144557595253, "step": 6588 }, { "epoch": 0.2059375, "grad_norm": 3.890625, "grad_norm_var": 0.04778238932291667, "learning_rate": 0.0001, "loss": 5.9383, "loss/crossentropy": 2.411689281463623, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19055413454771042, "step": 6590 }, { "epoch": 0.206, "grad_norm": 3.5625, "grad_norm_var": 0.0485260009765625, "learning_rate": 0.0001, "loss": 5.9474, "loss/crossentropy": 2.4322092533111572, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19136208295822144, "step": 6592 }, { "epoch": 0.2060625, "grad_norm": 3.5, "grad_norm_var": 0.04551493326822917, "learning_rate": 0.0001, "loss": 5.9522, "loss/crossentropy": 2.410311698913574, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1905139610171318, "step": 6594 }, { "epoch": 0.206125, "grad_norm": 3.453125, "grad_norm_var": 0.04211832682291667, "learning_rate": 0.0001, "loss": 6.1695, "loss/crossentropy": 2.6207791566848755, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19510557502508163, "step": 6596 }, { "epoch": 0.2061875, "grad_norm": 3.515625, "grad_norm_var": 0.040608723958333336, "learning_rate": 0.0001, "loss": 6.1398, "loss/crossentropy": 2.588352918624878, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1961626261472702, "step": 6598 }, { "epoch": 0.20625, "grad_norm": 3.328125, "grad_norm_var": 0.038557942708333334, "learning_rate": 0.0001, "loss": 6.0676, "loss/crossentropy": 2.5641703605651855, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18940068036317825, "step": 6600 }, { "epoch": 0.2063125, "grad_norm": 3.578125, "grad_norm_var": 0.03681640625, "learning_rate": 0.0001, "loss": 6.0762, "loss/crossentropy": 2.5501731634140015, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19166115671396255, "step": 6602 }, { "epoch": 0.206375, "grad_norm": 3.546875, "grad_norm_var": 0.03290913899739583, "learning_rate": 0.0001, "loss": 6.0994, "loss/crossentropy": 2.5292768478393555, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19958586245775223, "step": 6604 }, { "epoch": 0.2064375, "grad_norm": 3.578125, "grad_norm_var": 0.02412109375, "learning_rate": 0.0001, "loss": 6.0442, "loss/crossentropy": 2.487869143486023, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19586338847875595, "step": 6606 }, { "epoch": 0.2065, "grad_norm": 3.75, "grad_norm_var": 0.016877237955729166, "learning_rate": 0.0001, "loss": 6.1839, "loss/crossentropy": 2.5478591918945312, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20071035623550415, "step": 6608 }, { "epoch": 0.2065625, "grad_norm": 3.578125, "grad_norm_var": 0.014192708333333333, "learning_rate": 0.0001, "loss": 5.9695, "loss/crossentropy": 2.489307165145874, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1894274652004242, "step": 6610 }, { "epoch": 0.206625, "grad_norm": 3.484375, "grad_norm_var": 0.013570149739583334, "learning_rate": 0.0001, "loss": 5.9812, "loss/crossentropy": 2.469808340072632, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1948893666267395, "step": 6612 }, { "epoch": 0.2066875, "grad_norm": 3.546875, "grad_norm_var": 0.014777628580729167, "learning_rate": 0.0001, "loss": 5.9901, "loss/crossentropy": 2.4591652154922485, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1941061168909073, "step": 6614 }, { "epoch": 0.20675, "grad_norm": 3.546875, "grad_norm_var": 0.0080963134765625, "learning_rate": 0.0001, "loss": 6.3745, "loss/crossentropy": 2.7677459716796875, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20168904960155487, "step": 6616 }, { "epoch": 0.2068125, "grad_norm": 3.59375, "grad_norm_var": 0.009326171875, "learning_rate": 0.0001, "loss": 6.331, "loss/crossentropy": 2.570394515991211, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2116076424717903, "step": 6618 }, { "epoch": 0.206875, "grad_norm": 3.75, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 6.5496, "loss/crossentropy": 2.781446933746338, "loss/hidden": 1.6953125, "loss/jsd": 0.0, "loss/logits": 0.20728357881307602, "step": 6620 }, { "epoch": 0.2069375, "grad_norm": 3.546875, "grad_norm_var": 0.039957682291666664, "learning_rate": 0.0001, "loss": 6.1231, "loss/crossentropy": 2.6173206567764282, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.190808467566967, "step": 6622 }, { "epoch": 0.207, "grad_norm": 3.640625, "grad_norm_var": 0.042601521809895834, "learning_rate": 0.0001, "loss": 5.8668, "loss/crossentropy": 2.445883631706238, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18232329189777374, "step": 6624 }, { "epoch": 0.2070625, "grad_norm": 3.40625, "grad_norm_var": 0.0454742431640625, "learning_rate": 0.0001, "loss": 6.1287, "loss/crossentropy": 2.62809419631958, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19342082738876343, "step": 6626 }, { "epoch": 0.207125, "grad_norm": 3.5625, "grad_norm_var": 0.04563700358072917, "learning_rate": 0.0001, "loss": 6.4298, "loss/crossentropy": 2.7382125854492188, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20744048058986664, "step": 6628 }, { "epoch": 0.2071875, "grad_norm": 3.546875, "grad_norm_var": 0.04147135416666667, "learning_rate": 0.0001, "loss": 6.1929, "loss/crossentropy": 2.6155550479888916, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19679277390241623, "step": 6630 }, { "epoch": 0.20725, "grad_norm": 3.5, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 6.0664, "loss/crossentropy": 2.5981115102767944, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19096802175045013, "step": 6632 }, { "epoch": 0.2073125, "grad_norm": 3.53125, "grad_norm_var": 0.0436431884765625, "learning_rate": 0.0001, "loss": 6.5656, "loss/crossentropy": 2.8172494173049927, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20881778746843338, "step": 6634 }, { "epoch": 0.207375, "grad_norm": 3.609375, "grad_norm_var": 0.013671875, "learning_rate": 0.0001, "loss": 6.4479, "loss/crossentropy": 2.655908226966858, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2104528844356537, "step": 6636 }, { "epoch": 0.2074375, "grad_norm": 3.484375, "grad_norm_var": 0.024149576822916668, "learning_rate": 0.0001, "loss": 5.8728, "loss/crossentropy": 2.4703177213668823, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1851721778512001, "step": 6638 }, { "epoch": 0.2075, "grad_norm": 3.53125, "grad_norm_var": 0.022098795572916666, "learning_rate": 0.0001, "loss": 5.8662, "loss/crossentropy": 2.426244616508484, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1908668577671051, "step": 6640 }, { "epoch": 0.2075625, "grad_norm": 3.671875, "grad_norm_var": 0.0309478759765625, "learning_rate": 0.0001, "loss": 6.0628, "loss/crossentropy": 2.6135430335998535, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18984804302453995, "step": 6642 }, { "epoch": 0.207625, "grad_norm": 3.5625, "grad_norm_var": 0.03248291015625, "learning_rate": 0.0001, "loss": 5.9551, "loss/crossentropy": 2.4342517852783203, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19075235724449158, "step": 6644 }, { "epoch": 0.2076875, "grad_norm": 3.546875, "grad_norm_var": 0.03248291015625, "learning_rate": 0.0001, "loss": 6.0009, "loss/crossentropy": 2.528058171272278, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18986602127552032, "step": 6646 }, { "epoch": 0.20775, "grad_norm": 3.5625, "grad_norm_var": 0.03463134765625, "learning_rate": 0.0001, "loss": 5.8214, "loss/crossentropy": 2.3128613233566284, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19030648469924927, "step": 6648 }, { "epoch": 0.2078125, "grad_norm": 3.59375, "grad_norm_var": 0.0339019775390625, "learning_rate": 0.0001, "loss": 6.2097, "loss/crossentropy": 2.593241810798645, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19992893934249878, "step": 6650 }, { "epoch": 0.207875, "grad_norm": 3.375, "grad_norm_var": 0.0320465087890625, "learning_rate": 0.0001, "loss": 6.085, "loss/crossentropy": 2.586739182472229, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1924077644944191, "step": 6652 }, { "epoch": 0.2079375, "grad_norm": 3.453125, "grad_norm_var": 0.025275675455729167, "learning_rate": 0.0001, "loss": 6.1803, "loss/crossentropy": 2.6167575120925903, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19541189074516296, "step": 6654 }, { "epoch": 0.208, "grad_norm": 3.625, "grad_norm_var": 0.0313140869140625, "learning_rate": 0.0001, "loss": 5.8731, "loss/crossentropy": 2.3873835802078247, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19076339900493622, "step": 6656 }, { "epoch": 0.2080625, "grad_norm": 3.453125, "grad_norm_var": 0.021800740559895834, "learning_rate": 0.0001, "loss": 6.1023, "loss/crossentropy": 2.521125078201294, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19679167866706848, "step": 6658 }, { "epoch": 0.208125, "grad_norm": 3.625, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 5.9835, "loss/crossentropy": 2.5526716709136963, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1891724169254303, "step": 6660 }, { "epoch": 0.2081875, "grad_norm": 3.5625, "grad_norm_var": 0.022526041666666666, "learning_rate": 0.0001, "loss": 5.9984, "loss/crossentropy": 2.4385485649108887, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19465378671884537, "step": 6662 }, { "epoch": 0.20825, "grad_norm": 3.65625, "grad_norm_var": 0.023542277018229165, "learning_rate": 0.0001, "loss": 6.0496, "loss/crossentropy": 2.517141580581665, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1970003843307495, "step": 6664 }, { "epoch": 0.2083125, "grad_norm": 3.671875, "grad_norm_var": 0.029264322916666665, "learning_rate": 0.0001, "loss": 6.3474, "loss/crossentropy": 2.6856855154037476, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20327741652727127, "step": 6666 }, { "epoch": 0.208375, "grad_norm": 3.4375, "grad_norm_var": 0.027619425455729166, "learning_rate": 0.0001, "loss": 6.2244, "loss/crossentropy": 2.67505943775177, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1971169114112854, "step": 6668 }, { "epoch": 0.2084375, "grad_norm": 3.828125, "grad_norm_var": 0.031412760416666664, "learning_rate": 0.0001, "loss": 6.0871, "loss/crossentropy": 2.5097700357437134, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.2022606059908867, "step": 6670 }, { "epoch": 0.2085, "grad_norm": 4.90625, "grad_norm_var": 0.14741923014322916, "learning_rate": 0.0001, "loss": 6.5034, "loss/crossentropy": 2.736795425415039, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.21454696357250214, "step": 6672 }, { "epoch": 0.2085625, "grad_norm": 3.484375, "grad_norm_var": 0.14700113932291667, "learning_rate": 0.0001, "loss": 5.9703, "loss/crossentropy": 2.4670923948287964, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18821196258068085, "step": 6674 }, { "epoch": 0.208625, "grad_norm": 3.578125, "grad_norm_var": 0.14710184733072917, "learning_rate": 0.0001, "loss": 6.1057, "loss/crossentropy": 2.564454197883606, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19592129439115524, "step": 6676 }, { "epoch": 0.2086875, "grad_norm": 3.46875, "grad_norm_var": 0.14792378743489584, "learning_rate": 0.0001, "loss": 6.044, "loss/crossentropy": 2.5001375675201416, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1977410465478897, "step": 6678 }, { "epoch": 0.20875, "grad_norm": 4.53125, "grad_norm_var": 0.1970123291015625, "learning_rate": 0.0001, "loss": 6.3306, "loss/crossentropy": 2.623522400856018, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2101605385541916, "step": 6680 }, { "epoch": 0.2088125, "grad_norm": 3.453125, "grad_norm_var": 0.1873931884765625, "learning_rate": 0.0001, "loss": 6.2546, "loss/crossentropy": 2.6927119493484497, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1968143731355667, "step": 6682 }, { "epoch": 0.208875, "grad_norm": 3.65625, "grad_norm_var": 0.1829498291015625, "learning_rate": 0.0001, "loss": 6.3133, "loss/crossentropy": 2.665427803993225, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20346005260944366, "step": 6684 }, { "epoch": 0.2089375, "grad_norm": 4.125, "grad_norm_var": 0.18936258951822918, "learning_rate": 0.0001, "loss": 6.4817, "loss/crossentropy": 2.6795389652252197, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21693264693021774, "step": 6686 }, { "epoch": 0.209, "grad_norm": 3.703125, "grad_norm_var": 0.09228515625, "learning_rate": 0.0001, "loss": 6.2551, "loss/crossentropy": 2.697451591491699, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19599945843219757, "step": 6688 }, { "epoch": 0.2090625, "grad_norm": 3.671875, "grad_norm_var": 0.09664713541666667, "learning_rate": 0.0001, "loss": 6.3445, "loss/crossentropy": 2.641369342803955, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.20429710298776627, "step": 6690 }, { "epoch": 0.209125, "grad_norm": 3.25, "grad_norm_var": 0.09938151041666667, "learning_rate": 0.0001, "loss": 5.7877, "loss/crossentropy": 2.3258358240127563, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.1817334219813347, "step": 6692 }, { "epoch": 0.2091875, "grad_norm": 3.5625, "grad_norm_var": 0.10060933430989584, "learning_rate": 0.0001, "loss": 6.271, "loss/crossentropy": 2.6425410509109497, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2015206143260002, "step": 6694 }, { "epoch": 0.20925, "grad_norm": 3.359375, "grad_norm_var": 0.05495503743489583, "learning_rate": 0.0001, "loss": 6.0879, "loss/crossentropy": 2.579468846321106, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19107597321271896, "step": 6696 }, { "epoch": 0.2093125, "grad_norm": 3.453125, "grad_norm_var": 0.0579986572265625, "learning_rate": 0.0001, "loss": 6.3796, "loss/crossentropy": 2.787188410758972, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.2006485015153885, "step": 6698 }, { "epoch": 0.209375, "grad_norm": 3.734375, "grad_norm_var": 0.05845947265625, "learning_rate": 0.0001, "loss": 6.3861, "loss/crossentropy": 2.7306629419326782, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20460304617881775, "step": 6700 }, { "epoch": 0.2094375, "grad_norm": 3.765625, "grad_norm_var": 0.03974609375, "learning_rate": 0.0001, "loss": 6.3023, "loss/crossentropy": 2.5314093828201294, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21107427775859833, "step": 6702 }, { "epoch": 0.2095, "grad_norm": 3.25, "grad_norm_var": 0.044657389322916664, "learning_rate": 0.0001, "loss": 6.0491, "loss/crossentropy": 2.4721285104751587, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1963651329278946, "step": 6704 }, { "epoch": 0.2095625, "grad_norm": 3.8125, "grad_norm_var": 0.03984375, "learning_rate": 0.0001, "loss": 6.136, "loss/crossentropy": 2.4893136024475098, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.2010004222393036, "step": 6706 }, { "epoch": 0.209625, "grad_norm": 3.609375, "grad_norm_var": 0.0373046875, "learning_rate": 0.0001, "loss": 6.322, "loss/crossentropy": 2.671256422996521, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20062114298343658, "step": 6708 }, { "epoch": 0.2096875, "grad_norm": 3.875, "grad_norm_var": 0.0443511962890625, "learning_rate": 0.0001, "loss": 6.3426, "loss/crossentropy": 2.7482519149780273, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19967231899499893, "step": 6710 }, { "epoch": 0.20975, "grad_norm": 4.0, "grad_norm_var": 0.0500396728515625, "learning_rate": 0.0001, "loss": 6.3674, "loss/crossentropy": 2.688132405281067, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.19918164610862732, "step": 6712 }, { "epoch": 0.2098125, "grad_norm": 3.546875, "grad_norm_var": 0.041356404622395836, "learning_rate": 0.0001, "loss": 6.3835, "loss/crossentropy": 2.7763701677322388, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19860640913248062, "step": 6714 }, { "epoch": 0.209875, "grad_norm": 3.5, "grad_norm_var": 0.04519856770833333, "learning_rate": 0.0001, "loss": 6.347, "loss/crossentropy": 2.749025583267212, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19925252348184586, "step": 6716 }, { "epoch": 0.2099375, "grad_norm": 3.34375, "grad_norm_var": 0.05291341145833333, "learning_rate": 0.0001, "loss": 5.7526, "loss/crossentropy": 2.431371331214905, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17275211960077286, "step": 6718 }, { "epoch": 0.21, "grad_norm": 3.578125, "grad_norm_var": 0.050023396809895836, "learning_rate": 0.0001, "loss": 6.2584, "loss/crossentropy": 2.6580491065979004, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1998780071735382, "step": 6720 }, { "epoch": 0.2100625, "grad_norm": 3.3125, "grad_norm_var": 0.045882161458333334, "learning_rate": 0.0001, "loss": 5.9866, "loss/crossentropy": 2.57085919380188, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18298456072807312, "step": 6722 }, { "epoch": 0.210125, "grad_norm": 3.984375, "grad_norm_var": 0.05683492024739583, "learning_rate": 0.0001, "loss": 6.2644, "loss/crossentropy": 2.5806660652160645, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20430614054203033, "step": 6724 }, { "epoch": 0.2101875, "grad_norm": 3.4375, "grad_norm_var": 0.04755859375, "learning_rate": 0.0001, "loss": 6.1806, "loss/crossentropy": 2.649932026863098, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19525517523288727, "step": 6726 }, { "epoch": 0.21025, "grad_norm": 3.53125, "grad_norm_var": 0.034130859375, "learning_rate": 0.0001, "loss": 6.4122, "loss/crossentropy": 2.719170093536377, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.21109655499458313, "step": 6728 }, { "epoch": 0.2103125, "grad_norm": 3.90625, "grad_norm_var": 0.05331624348958333, "learning_rate": 0.0001, "loss": 6.2589, "loss/crossentropy": 2.509567618370056, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.21282700449228287, "step": 6730 }, { "epoch": 0.210375, "grad_norm": 3.765625, "grad_norm_var": 0.053515625, "learning_rate": 0.0001, "loss": 6.3033, "loss/crossentropy": 2.68427836894989, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1994030922651291, "step": 6732 }, { "epoch": 0.2104375, "grad_norm": 3.734375, "grad_norm_var": 3.7589508056640626, "learning_rate": 0.0001, "loss": 6.1585, "loss/crossentropy": 2.346200704574585, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21678119897842407, "step": 6734 }, { "epoch": 0.2105, "grad_norm": 3.546875, "grad_norm_var": 3.731297810872396, "learning_rate": 0.0001, "loss": 6.4917, "loss/crossentropy": 2.8597121238708496, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.19523271918296814, "step": 6736 }, { "epoch": 0.2105625, "grad_norm": 4.34375, "grad_norm_var": 3.691844685872396, "learning_rate": 0.0001, "loss": 6.3788, "loss/crossentropy": 2.7760846614837646, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2008993998169899, "step": 6738 }, { "epoch": 0.210625, "grad_norm": 3.75, "grad_norm_var": 3.670995076497396, "learning_rate": 0.0001, "loss": 6.342, "loss/crossentropy": 2.637627124786377, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.21184614300727844, "step": 6740 }, { "epoch": 0.2106875, "grad_norm": 3.796875, "grad_norm_var": 3.6266886393229165, "learning_rate": 0.0001, "loss": 6.2531, "loss/crossentropy": 2.6286277770996094, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2018960416316986, "step": 6742 }, { "epoch": 0.21075, "grad_norm": 3.8125, "grad_norm_var": 3.5985677083333334, "learning_rate": 0.0001, "loss": 6.3415, "loss/crossentropy": 2.7454168796539307, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19633187353610992, "step": 6744 }, { "epoch": 0.2108125, "grad_norm": 3.609375, "grad_norm_var": 3.647419230143229, "learning_rate": 0.0001, "loss": 6.4109, "loss/crossentropy": 2.7401084899902344, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20418954640626907, "step": 6746 }, { "epoch": 0.210875, "grad_norm": 3.609375, "grad_norm_var": 3.637495930989583, "learning_rate": 0.0001, "loss": 5.882, "loss/crossentropy": 2.24249267578125, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.1975444257259369, "step": 6748 }, { "epoch": 0.2109375, "grad_norm": 3.15625, "grad_norm_var": 0.09569905598958334, "learning_rate": 0.0001, "loss": 6.2418, "loss/crossentropy": 2.7148282527923584, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19527536630630493, "step": 6750 }, { "epoch": 0.211, "grad_norm": 3.125, "grad_norm_var": 0.12573140462239582, "learning_rate": 0.0001, "loss": 6.1224, "loss/crossentropy": 2.6875683069229126, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1876225247979164, "step": 6752 }, { "epoch": 0.2110625, "grad_norm": 4.0, "grad_norm_var": 0.09426676432291667, "learning_rate": 0.0001, "loss": 6.4024, "loss/crossentropy": 2.6420962810516357, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.213921919465065, "step": 6754 }, { "epoch": 0.211125, "grad_norm": 4.34375, "grad_norm_var": 0.12241923014322917, "learning_rate": 0.0001, "loss": 6.0016, "loss/crossentropy": 2.3534570932388306, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19879579544067383, "step": 6756 }, { "epoch": 0.2111875, "grad_norm": 4.21875, "grad_norm_var": 0.15220438639322917, "learning_rate": 0.0001, "loss": 6.4019, "loss/crossentropy": 2.689331889152527, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2079753801226616, "step": 6758 }, { "epoch": 0.21125, "grad_norm": 3.484375, "grad_norm_var": 0.15225321451822918, "learning_rate": 0.0001, "loss": 6.2664, "loss/crossentropy": 2.6862618923187256, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1982519030570984, "step": 6760 }, { "epoch": 0.2113125, "grad_norm": 3.21875, "grad_norm_var": 0.16969401041666668, "learning_rate": 0.0001, "loss": 5.9312, "loss/crossentropy": 2.5212395191192627, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1870882213115692, "step": 6762 }, { "epoch": 0.211375, "grad_norm": 4.1875, "grad_norm_var": 0.15653889973958332, "learning_rate": 0.0001, "loss": 6.5337, "loss/crossentropy": 2.787313938140869, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20979921519756317, "step": 6764 }, { "epoch": 0.2114375, "grad_norm": 3.609375, "grad_norm_var": 0.14332682291666668, "learning_rate": 0.0001, "loss": 6.1694, "loss/crossentropy": 2.5976446866989136, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19584590196609497, "step": 6766 }, { "epoch": 0.2115, "grad_norm": 3.203125, "grad_norm_var": 0.13141988118489584, "learning_rate": 0.0001, "loss": 6.1991, "loss/crossentropy": 2.71761953830719, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18799372017383575, "step": 6768 }, { "epoch": 0.2115625, "grad_norm": 3.734375, "grad_norm_var": 0.12600504557291667, "learning_rate": 0.0001, "loss": 6.2506, "loss/crossentropy": 2.704631805419922, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19678643345832825, "step": 6770 }, { "epoch": 0.211625, "grad_norm": 3.90625, "grad_norm_var": 0.10442708333333334, "learning_rate": 0.0001, "loss": 6.2673, "loss/crossentropy": 2.6605230569839478, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.1977902352809906, "step": 6772 }, { "epoch": 0.2116875, "grad_norm": 3.90625, "grad_norm_var": 0.07746480305989584, "learning_rate": 0.0001, "loss": 6.6312, "loss/crossentropy": 2.835526466369629, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21550609171390533, "step": 6774 }, { "epoch": 0.21175, "grad_norm": 3.78125, "grad_norm_var": 0.07994384765625, "learning_rate": 0.0001, "loss": 6.1223, "loss/crossentropy": 2.536879062652588, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19603867828845978, "step": 6776 }, { "epoch": 0.2118125, "grad_norm": 3.796875, "grad_norm_var": 0.11632486979166666, "learning_rate": 0.0001, "loss": 6.2197, "loss/crossentropy": 2.4974231719970703, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20777669548988342, "step": 6778 }, { "epoch": 0.211875, "grad_norm": 3.90625, "grad_norm_var": 0.1041015625, "learning_rate": 0.0001, "loss": 6.1775, "loss/crossentropy": 2.4850826263427734, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2067461460828781, "step": 6780 }, { "epoch": 0.2119375, "grad_norm": 3.875, "grad_norm_var": 0.1038970947265625, "learning_rate": 0.0001, "loss": 6.2258, "loss/crossentropy": 2.639601707458496, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19963591545820236, "step": 6782 }, { "epoch": 0.212, "grad_norm": 3.421875, "grad_norm_var": 0.09138895670572916, "learning_rate": 0.0001, "loss": 6.0989, "loss/crossentropy": 2.51748263835907, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19915585964918137, "step": 6784 }, { "epoch": 0.2120625, "grad_norm": 3.359375, "grad_norm_var": 0.10368550618489583, "learning_rate": 0.0001, "loss": 6.1392, "loss/crossentropy": 2.6376984119415283, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19507159292697906, "step": 6786 }, { "epoch": 0.212125, "grad_norm": 4.09375, "grad_norm_var": 0.10632222493489583, "learning_rate": 0.0001, "loss": 6.1884, "loss/crossentropy": 2.629759669303894, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1960998997092247, "step": 6788 }, { "epoch": 0.2121875, "grad_norm": 3.5625, "grad_norm_var": 0.10808817545572917, "learning_rate": 0.0001, "loss": 6.6151, "loss/crossentropy": 2.7972121238708496, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21615905314683914, "step": 6790 }, { "epoch": 0.21225, "grad_norm": 3.6875, "grad_norm_var": 0.10956929524739584, "learning_rate": 0.0001, "loss": 6.3118, "loss/crossentropy": 2.690866470336914, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19920150190591812, "step": 6792 }, { "epoch": 0.2123125, "grad_norm": 3.515625, "grad_norm_var": 0.06291910807291666, "learning_rate": 0.0001, "loss": 6.4656, "loss/crossentropy": 2.7472859621047974, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.2085518091917038, "step": 6794 }, { "epoch": 0.212375, "grad_norm": 3.421875, "grad_norm_var": 0.05852457682291667, "learning_rate": 0.0001, "loss": 6.8297, "loss/crossentropy": 3.120318293571472, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20766117423772812, "step": 6796 }, { "epoch": 0.2124375, "grad_norm": 3.140625, "grad_norm_var": 0.06950581868489583, "learning_rate": 0.0001, "loss": 6.0367, "loss/crossentropy": 2.587893486022949, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1901942640542984, "step": 6798 }, { "epoch": 0.2125, "grad_norm": 3.53125, "grad_norm_var": 0.083251953125, "learning_rate": 0.0001, "loss": 5.9106, "loss/crossentropy": 2.4920233488082886, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18404901027679443, "step": 6800 }, { "epoch": 0.2125625, "grad_norm": 4.09375, "grad_norm_var": 0.09627176920572916, "learning_rate": 0.0001, "loss": 6.1437, "loss/crossentropy": 2.584474802017212, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19498298317193985, "step": 6802 }, { "epoch": 0.212625, "grad_norm": 3.703125, "grad_norm_var": 0.19191792805989583, "learning_rate": 0.0001, "loss": 6.2582, "loss/crossentropy": 2.5794581174850464, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20303219556808472, "step": 6804 }, { "epoch": 0.2126875, "grad_norm": 3.4375, "grad_norm_var": 0.19745686848958333, "learning_rate": 0.0001, "loss": 6.0408, "loss/crossentropy": 2.5239760875701904, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1919121965765953, "step": 6806 }, { "epoch": 0.21275, "grad_norm": 3.734375, "grad_norm_var": 0.19284566243489584, "learning_rate": 0.0001, "loss": 6.6782, "loss/crossentropy": 2.947417378425598, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2113574743270874, "step": 6808 }, { "epoch": 0.2128125, "grad_norm": 3.34375, "grad_norm_var": 0.2046051025390625, "learning_rate": 0.0001, "loss": 5.7501, "loss/crossentropy": 2.3616881370544434, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17907945811748505, "step": 6810 }, { "epoch": 0.212875, "grad_norm": 3.375, "grad_norm_var": 0.2069488525390625, "learning_rate": 0.0001, "loss": 6.2424, "loss/crossentropy": 2.6910303831100464, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19224313646554947, "step": 6812 }, { "epoch": 0.2129375, "grad_norm": 4.15625, "grad_norm_var": 0.20847066243489584, "learning_rate": 0.0001, "loss": 6.5023, "loss/crossentropy": 2.777039647102356, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.21041592955589294, "step": 6814 }, { "epoch": 0.213, "grad_norm": 4.5625, "grad_norm_var": 0.23484700520833332, "learning_rate": 0.0001, "loss": 5.8413, "loss/crossentropy": 2.267647624015808, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19603300094604492, "step": 6816 }, { "epoch": 0.2130625, "grad_norm": 3.609375, "grad_norm_var": 0.23153889973958333, "learning_rate": 0.0001, "loss": 6.3894, "loss/crossentropy": 2.82620370388031, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19576887786388397, "step": 6818 }, { "epoch": 0.213125, "grad_norm": 3.53125, "grad_norm_var": 0.14072977701822917, "learning_rate": 0.0001, "loss": 6.4969, "loss/crossentropy": 2.78303325176239, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.2135787233710289, "step": 6820 }, { "epoch": 0.2131875, "grad_norm": 3.109375, "grad_norm_var": 0.15377197265625, "learning_rate": 0.0001, "loss": 6.1589, "loss/crossentropy": 2.6030240058898926, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19465148448944092, "step": 6822 }, { "epoch": 0.21325, "grad_norm": 3.359375, "grad_norm_var": 0.16599833170572917, "learning_rate": 0.0001, "loss": 6.0984, "loss/crossentropy": 2.5625205039978027, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19538921862840652, "step": 6824 }, { "epoch": 0.2133125, "grad_norm": 3.6875, "grad_norm_var": 0.14905192057291666, "learning_rate": 0.0001, "loss": 6.186, "loss/crossentropy": 2.593506693840027, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19987186789512634, "step": 6826 }, { "epoch": 0.213375, "grad_norm": 3.671875, "grad_norm_var": 0.1437164306640625, "learning_rate": 0.0001, "loss": 6.2935, "loss/crossentropy": 2.6629804372787476, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20250384509563446, "step": 6828 }, { "epoch": 0.2134375, "grad_norm": 3.203125, "grad_norm_var": 0.14336649576822916, "learning_rate": 0.0001, "loss": 6.0402, "loss/crossentropy": 2.5341413021087646, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18966825306415558, "step": 6830 }, { "epoch": 0.2135, "grad_norm": 4.03125, "grad_norm_var": 0.09088134765625, "learning_rate": 0.0001, "loss": 6.2665, "loss/crossentropy": 2.608793616294861, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20171182602643967, "step": 6832 }, { "epoch": 0.2135625, "grad_norm": 3.4375, "grad_norm_var": 0.08474934895833333, "learning_rate": 0.0001, "loss": 6.3257, "loss/crossentropy": 2.6409130096435547, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2075371891260147, "step": 6834 }, { "epoch": 0.213625, "grad_norm": 3.46875, "grad_norm_var": 0.06357014973958333, "learning_rate": 0.0001, "loss": 6.1442, "loss/crossentropy": 2.496389865875244, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2026677504181862, "step": 6836 }, { "epoch": 0.2136875, "grad_norm": 3.6875, "grad_norm_var": 0.0514801025390625, "learning_rate": 0.0001, "loss": 6.1189, "loss/crossentropy": 2.5822921991348267, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19311807304620743, "step": 6838 }, { "epoch": 0.21375, "grad_norm": 3.78125, "grad_norm_var": 0.0456939697265625, "learning_rate": 0.0001, "loss": 6.2426, "loss/crossentropy": 2.5685629844665527, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20763982832431793, "step": 6840 }, { "epoch": 0.2138125, "grad_norm": 3.359375, "grad_norm_var": 0.0572418212890625, "learning_rate": 0.0001, "loss": 6.0465, "loss/crossentropy": 2.61044180393219, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19047950953245163, "step": 6842 }, { "epoch": 0.213875, "grad_norm": 3.203125, "grad_norm_var": 0.0666015625, "learning_rate": 0.0001, "loss": 5.6172, "loss/crossentropy": 2.2282965183258057, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1775653213262558, "step": 6844 }, { "epoch": 0.2139375, "grad_norm": 3.609375, "grad_norm_var": 0.059375, "learning_rate": 0.0001, "loss": 6.2345, "loss/crossentropy": 2.593433976173401, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2019970864057541, "step": 6846 }, { "epoch": 0.214, "grad_norm": 3.515625, "grad_norm_var": 0.04716695149739583, "learning_rate": 0.0001, "loss": 6.2703, "loss/crossentropy": 2.7207247018814087, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.193631112575531, "step": 6848 }, { "epoch": 0.2140625, "grad_norm": 3.25, "grad_norm_var": 0.051350911458333336, "learning_rate": 0.0001, "loss": 5.868, "loss/crossentropy": 2.4845513105392456, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18209190666675568, "step": 6850 }, { "epoch": 0.214125, "grad_norm": 3.453125, "grad_norm_var": 0.051813761393229164, "learning_rate": 0.0001, "loss": 6.0807, "loss/crossentropy": 2.618214249610901, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19077570736408234, "step": 6852 }, { "epoch": 0.2141875, "grad_norm": 3.578125, "grad_norm_var": 0.03345947265625, "learning_rate": 0.0001, "loss": 6.0171, "loss/crossentropy": 2.489748954772949, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1910119280219078, "step": 6854 }, { "epoch": 0.21425, "grad_norm": 3.5, "grad_norm_var": 0.02760009765625, "learning_rate": 0.0001, "loss": 6.0291, "loss/crossentropy": 2.6054465770721436, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18924039602279663, "step": 6856 }, { "epoch": 0.2143125, "grad_norm": 3.515625, "grad_norm_var": 0.035676066080729166, "learning_rate": 0.0001, "loss": 6.2084, "loss/crossentropy": 2.5486491918563843, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.201907716691494, "step": 6858 }, { "epoch": 0.214375, "grad_norm": 3.9375, "grad_norm_var": 0.048981730143229166, "learning_rate": 0.0001, "loss": 6.027, "loss/crossentropy": 2.454265594482422, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19242677092552185, "step": 6860 }, { "epoch": 0.2144375, "grad_norm": 3.390625, "grad_norm_var": 0.0418365478515625, "learning_rate": 0.0001, "loss": 5.9576, "loss/crossentropy": 2.468759536743164, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19107331335544586, "step": 6862 }, { "epoch": 0.2145, "grad_norm": 3.34375, "grad_norm_var": 0.0423980712890625, "learning_rate": 0.0001, "loss": 5.91, "loss/crossentropy": 2.439316511154175, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18730410933494568, "step": 6864 }, { "epoch": 0.2145625, "grad_norm": 3.453125, "grad_norm_var": 0.03689778645833333, "learning_rate": 0.0001, "loss": 5.901, "loss/crossentropy": 2.439536452293396, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18872268497943878, "step": 6866 }, { "epoch": 0.214625, "grad_norm": 3.875, "grad_norm_var": 0.046549479166666664, "learning_rate": 0.0001, "loss": 6.036, "loss/crossentropy": 2.5354151725769043, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19497878849506378, "step": 6868 }, { "epoch": 0.2146875, "grad_norm": 3.921875, "grad_norm_var": 0.0598541259765625, "learning_rate": 0.0001, "loss": 6.3898, "loss/crossentropy": 2.7419973611831665, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20345129817724228, "step": 6870 }, { "epoch": 0.21475, "grad_norm": 6.0, "grad_norm_var": 0.43945210774739585, "learning_rate": 0.0001, "loss": 6.9018, "loss/crossentropy": 2.9109256267547607, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.22916782647371292, "step": 6872 }, { "epoch": 0.2148125, "grad_norm": 3.59375, "grad_norm_var": 0.436181640625, "learning_rate": 0.0001, "loss": 6.228, "loss/crossentropy": 2.65897274017334, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1994800567626953, "step": 6874 }, { "epoch": 0.214875, "grad_norm": 4.125, "grad_norm_var": 0.43635660807291665, "learning_rate": 0.0001, "loss": 6.2568, "loss/crossentropy": 2.5532950162887573, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2059016078710556, "step": 6876 }, { "epoch": 0.2149375, "grad_norm": 3.46875, "grad_norm_var": 0.43700764973958334, "learning_rate": 0.0001, "loss": 5.8958, "loss/crossentropy": 2.4443269968032837, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18655820935964584, "step": 6878 }, { "epoch": 0.215, "grad_norm": 3.28125, "grad_norm_var": 0.4430816650390625, "learning_rate": 0.0001, "loss": 6.0938, "loss/crossentropy": 2.662695527076721, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18373503535985947, "step": 6880 }, { "epoch": 0.2150625, "grad_norm": 3.34375, "grad_norm_var": 0.45083719889322915, "learning_rate": 0.0001, "loss": 6.3018, "loss/crossentropy": 2.7749582529067993, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19487353414297104, "step": 6882 }, { "epoch": 0.215125, "grad_norm": 3.671875, "grad_norm_var": 0.44015299479166664, "learning_rate": 0.0001, "loss": 6.3429, "loss/crossentropy": 2.737243890762329, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1984574869275093, "step": 6884 }, { "epoch": 0.2151875, "grad_norm": 3.578125, "grad_norm_var": 0.42986653645833334, "learning_rate": 0.0001, "loss": 6.3917, "loss/crossentropy": 2.7898651361465454, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20119955390691757, "step": 6886 }, { "epoch": 0.21525, "grad_norm": 3.46875, "grad_norm_var": 0.054032389322916666, "learning_rate": 0.0001, "loss": 5.9249, "loss/crossentropy": 2.4453309774398804, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.192485474050045, "step": 6888 }, { "epoch": 0.2153125, "grad_norm": 3.140625, "grad_norm_var": 0.057648722330729166, "learning_rate": 0.0001, "loss": 6.1452, "loss/crossentropy": 2.6766172647476196, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19138825684785843, "step": 6890 }, { "epoch": 0.215375, "grad_norm": 3.5625, "grad_norm_var": 0.027567545572916668, "learning_rate": 0.0001, "loss": 6.2641, "loss/crossentropy": 2.6406314373016357, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19906429201364517, "step": 6892 }, { "epoch": 0.2154375, "grad_norm": 3.90625, "grad_norm_var": 0.04077860514322917, "learning_rate": 0.0001, "loss": 6.4015, "loss/crossentropy": 2.7206833362579346, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2067534253001213, "step": 6894 }, { "epoch": 0.2155, "grad_norm": 3.546875, "grad_norm_var": 0.04058837890625, "learning_rate": 0.0001, "loss": 5.9721, "loss/crossentropy": 2.5484172105789185, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1861155927181244, "step": 6896 }, { "epoch": 0.2155625, "grad_norm": 3.59375, "grad_norm_var": 0.03752339680989583, "learning_rate": 0.0001, "loss": 6.0761, "loss/crossentropy": 2.551543116569519, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19502922147512436, "step": 6898 }, { "epoch": 0.215625, "grad_norm": 3.625, "grad_norm_var": 0.036942545572916666, "learning_rate": 0.0001, "loss": 6.0563, "loss/crossentropy": 2.5941959619522095, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18995870649814606, "step": 6900 }, { "epoch": 0.2156875, "grad_norm": 3.421875, "grad_norm_var": 0.03948567708333333, "learning_rate": 0.0001, "loss": 5.8376, "loss/crossentropy": 2.4501700401306152, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18366114050149918, "step": 6902 }, { "epoch": 0.21575, "grad_norm": 3.734375, "grad_norm_var": 0.0407135009765625, "learning_rate": 0.0001, "loss": 6.1904, "loss/crossentropy": 2.533998727798462, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.1996278166770935, "step": 6904 }, { "epoch": 0.2158125, "grad_norm": 3.890625, "grad_norm_var": 0.03933919270833333, "learning_rate": 0.0001, "loss": 6.2574, "loss/crossentropy": 2.6609431505203247, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19753922522068024, "step": 6906 }, { "epoch": 0.215875, "grad_norm": 3.71875, "grad_norm_var": 0.10784098307291666, "learning_rate": 0.0001, "loss": 6.211, "loss/crossentropy": 2.457728385925293, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.21009515970945358, "step": 6908 }, { "epoch": 0.2159375, "grad_norm": 4.5, "grad_norm_var": 0.15359598795572918, "learning_rate": 0.0001, "loss": 6.5915, "loss/crossentropy": 2.7991663217544556, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21477922797203064, "step": 6910 }, { "epoch": 0.216, "grad_norm": 3.453125, "grad_norm_var": 0.14563395182291666, "learning_rate": 0.0001, "loss": 6.4119, "loss/crossentropy": 2.751275420188904, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20707565546035767, "step": 6912 }, { "epoch": 0.2160625, "grad_norm": 3.421875, "grad_norm_var": 0.15192769368489584, "learning_rate": 0.0001, "loss": 6.0199, "loss/crossentropy": 2.5404539108276367, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1924763321876526, "step": 6914 }, { "epoch": 0.216125, "grad_norm": 3.4375, "grad_norm_var": 0.15031636555989583, "learning_rate": 0.0001, "loss": 6.4459, "loss/crossentropy": 2.7554389238357544, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20654761791229248, "step": 6916 }, { "epoch": 0.2161875, "grad_norm": 4.0, "grad_norm_var": 0.13528645833333333, "learning_rate": 0.0001, "loss": 6.662, "loss/crossentropy": 2.817233920097351, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.21885564178228378, "step": 6918 }, { "epoch": 0.21625, "grad_norm": 3.546875, "grad_norm_var": 0.13834635416666666, "learning_rate": 0.0001, "loss": 6.2551, "loss/crossentropy": 2.618025064468384, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20003525912761688, "step": 6920 }, { "epoch": 0.2163125, "grad_norm": 3.53125, "grad_norm_var": 0.15013020833333332, "learning_rate": 0.0001, "loss": 5.8261, "loss/crossentropy": 2.4295458793640137, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18496494740247726, "step": 6922 }, { "epoch": 0.216375, "grad_norm": 3.421875, "grad_norm_var": 0.0981353759765625, "learning_rate": 0.0001, "loss": 6.0448, "loss/crossentropy": 2.54641056060791, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19007324427366257, "step": 6924 }, { "epoch": 0.2164375, "grad_norm": 3.421875, "grad_norm_var": 0.0444488525390625, "learning_rate": 0.0001, "loss": 6.3752, "loss/crossentropy": 2.781325340270996, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1976737380027771, "step": 6926 }, { "epoch": 0.2165, "grad_norm": 3.65625, "grad_norm_var": 0.046923828125, "learning_rate": 0.0001, "loss": 6.2686, "loss/crossentropy": 2.6794657707214355, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19875822216272354, "step": 6928 }, { "epoch": 0.2165625, "grad_norm": 6.3125, "grad_norm_var": 0.52379150390625, "learning_rate": 0.0001, "loss": 6.0096, "loss/crossentropy": 2.478939652442932, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1917371302843094, "step": 6930 }, { "epoch": 0.216625, "grad_norm": 3.75, "grad_norm_var": 0.5166005452473958, "learning_rate": 0.0001, "loss": 6.2988, "loss/crossentropy": 2.561856746673584, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.20572340488433838, "step": 6932 }, { "epoch": 0.2166875, "grad_norm": 3.359375, "grad_norm_var": 0.5197550455729166, "learning_rate": 0.0001, "loss": 6.2968, "loss/crossentropy": 2.6517215967178345, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20200955122709274, "step": 6934 }, { "epoch": 0.21675, "grad_norm": 3.28125, "grad_norm_var": 0.5318023681640625, "learning_rate": 0.0001, "loss": 5.8895, "loss/crossentropy": 2.4591037034988403, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18561290949583054, "step": 6936 }, { "epoch": 0.2168125, "grad_norm": 3.359375, "grad_norm_var": 0.5356730143229167, "learning_rate": 0.0001, "loss": 5.8998, "loss/crossentropy": 2.508857011795044, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1859700232744217, "step": 6938 }, { "epoch": 0.216875, "grad_norm": 3.40625, "grad_norm_var": 0.5375935872395833, "learning_rate": 0.0001, "loss": 6.1059, "loss/crossentropy": 2.579146146774292, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19368895143270493, "step": 6940 }, { "epoch": 0.2169375, "grad_norm": 4.28125, "grad_norm_var": 0.5642649332682291, "learning_rate": 0.0001, "loss": 6.336, "loss/crossentropy": 2.79544997215271, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1978062242269516, "step": 6942 }, { "epoch": 0.217, "grad_norm": 4.5625, "grad_norm_var": 0.59810791015625, "learning_rate": 0.0001, "loss": 6.2227, "loss/crossentropy": 2.620998740196228, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19688712805509567, "step": 6944 }, { "epoch": 0.2170625, "grad_norm": 3.375, "grad_norm_var": 0.14003804524739583, "learning_rate": 0.0001, "loss": 6.1529, "loss/crossentropy": 2.6646175384521484, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19413743168115616, "step": 6946 }, { "epoch": 0.217125, "grad_norm": 3.859375, "grad_norm_var": 0.14371744791666666, "learning_rate": 0.0001, "loss": 6.4361, "loss/crossentropy": 2.7347337007522583, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2095884531736374, "step": 6948 }, { "epoch": 0.2171875, "grad_norm": 4.0, "grad_norm_var": 0.14880269368489582, "learning_rate": 0.0001, "loss": 6.2403, "loss/crossentropy": 2.5746344327926636, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20328227430582047, "step": 6950 }, { "epoch": 0.21725, "grad_norm": 3.484375, "grad_norm_var": 0.1431549072265625, "learning_rate": 0.0001, "loss": 6.3584, "loss/crossentropy": 2.7034659385681152, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20260456204414368, "step": 6952 }, { "epoch": 0.2173125, "grad_norm": 3.15625, "grad_norm_var": 0.15107014973958333, "learning_rate": 0.0001, "loss": 6.0041, "loss/crossentropy": 2.5409141778945923, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19006993621587753, "step": 6954 }, { "epoch": 0.217375, "grad_norm": 4.125, "grad_norm_var": 0.16077473958333333, "learning_rate": 0.0001, "loss": 6.1016, "loss/crossentropy": 2.4567782878875732, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20276756584644318, "step": 6956 }, { "epoch": 0.2174375, "grad_norm": 4.53125, "grad_norm_var": 0.172412109375, "learning_rate": 0.0001, "loss": 6.2712, "loss/crossentropy": 2.551652431488037, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.2063276246190071, "step": 6958 }, { "epoch": 0.2175, "grad_norm": 3.5625, "grad_norm_var": 0.1279937744140625, "learning_rate": 0.0001, "loss": 6.1276, "loss/crossentropy": 2.6114827394485474, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1934124454855919, "step": 6960 }, { "epoch": 0.2175625, "grad_norm": 4.03125, "grad_norm_var": 0.15374247233072916, "learning_rate": 0.0001, "loss": 6.3278, "loss/crossentropy": 2.5706037282943726, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20931678265333176, "step": 6962 }, { "epoch": 0.217625, "grad_norm": 3.25, "grad_norm_var": 0.173583984375, "learning_rate": 0.0001, "loss": 6.1378, "loss/crossentropy": 2.5814859867095947, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.20016635954380035, "step": 6964 }, { "epoch": 0.2176875, "grad_norm": 3.90625, "grad_norm_var": 0.1806060791015625, "learning_rate": 0.0001, "loss": 6.5318, "loss/crossentropy": 2.6615020036697388, "loss/hidden": 1.70703125, "loss/jsd": 0.0, "loss/logits": 0.21632423251867294, "step": 6966 }, { "epoch": 0.21775, "grad_norm": 3.671875, "grad_norm_var": 0.17083333333333334, "learning_rate": 0.0001, "loss": 6.1315, "loss/crossentropy": 2.544968843460083, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.20005465298891068, "step": 6968 }, { "epoch": 0.2178125, "grad_norm": 4.09375, "grad_norm_var": 0.13464253743489582, "learning_rate": 0.0001, "loss": 6.7397, "loss/crossentropy": 2.9408375024795532, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.21113675087690353, "step": 6970 }, { "epoch": 0.217875, "grad_norm": 3.390625, "grad_norm_var": 0.1431640625, "learning_rate": 0.0001, "loss": 6.1237, "loss/crossentropy": 2.596482753753662, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19256656616926193, "step": 6972 }, { "epoch": 0.2179375, "grad_norm": 3.484375, "grad_norm_var": 0.11965230305989584, "learning_rate": 0.0001, "loss": 6.2645, "loss/crossentropy": 2.7255892753601074, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19803231209516525, "step": 6974 }, { "epoch": 0.218, "grad_norm": 3.03125, "grad_norm_var": 0.1577301025390625, "learning_rate": 0.0001, "loss": 5.6588, "loss/crossentropy": 2.37584125995636, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17439134418964386, "step": 6976 }, { "epoch": 0.2180625, "grad_norm": 3.59375, "grad_norm_var": 0.11507161458333333, "learning_rate": 0.0001, "loss": 6.0221, "loss/crossentropy": 2.602111339569092, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18770533800125122, "step": 6978 }, { "epoch": 0.218125, "grad_norm": 3.34375, "grad_norm_var": 0.11194559733072916, "learning_rate": 0.0001, "loss": 5.8931, "loss/crossentropy": 2.467549204826355, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18161573261022568, "step": 6980 }, { "epoch": 0.2181875, "grad_norm": 3.484375, "grad_norm_var": 0.07280985514322917, "learning_rate": 0.0001, "loss": 6.1829, "loss/crossentropy": 2.5902645587921143, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19793671369552612, "step": 6982 }, { "epoch": 0.21825, "grad_norm": 3.59375, "grad_norm_var": 0.07154541015625, "learning_rate": 0.0001, "loss": 6.2253, "loss/crossentropy": 2.642678380012512, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19654599577188492, "step": 6984 }, { "epoch": 0.2183125, "grad_norm": 3.609375, "grad_norm_var": 0.030134073893229165, "learning_rate": 0.0001, "loss": 6.1163, "loss/crossentropy": 2.5914446115493774, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19467195123434067, "step": 6986 }, { "epoch": 0.218375, "grad_norm": 3.671875, "grad_norm_var": 0.03561197916666667, "learning_rate": 0.0001, "loss": 5.5533, "loss/crossentropy": 2.160017430782318, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18190325796604156, "step": 6988 }, { "epoch": 0.2184375, "grad_norm": 3.734375, "grad_norm_var": 0.043619791666666664, "learning_rate": 0.0001, "loss": 6.1725, "loss/crossentropy": 2.5211633443832397, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19989459216594696, "step": 6990 }, { "epoch": 0.2185, "grad_norm": 3.359375, "grad_norm_var": 0.031981404622395834, "learning_rate": 0.0001, "loss": 6.0217, "loss/crossentropy": 2.500148296356201, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19590522348880768, "step": 6992 }, { "epoch": 0.2185625, "grad_norm": 3.5, "grad_norm_var": 0.025764973958333333, "learning_rate": 0.0001, "loss": 6.2181, "loss/crossentropy": 2.735214591026306, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19008450955152512, "step": 6994 }, { "epoch": 0.218625, "grad_norm": 4.21875, "grad_norm_var": 0.04881083170572917, "learning_rate": 0.0001, "loss": 6.3318, "loss/crossentropy": 2.646474242210388, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.2083786502480507, "step": 6996 }, { "epoch": 0.2186875, "grad_norm": 3.484375, "grad_norm_var": 0.05103759765625, "learning_rate": 0.0001, "loss": 6.0873, "loss/crossentropy": 2.5000009536743164, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19584060460329056, "step": 6998 }, { "epoch": 0.21875, "grad_norm": 8.0625, "grad_norm_var": 1.3062825520833334, "learning_rate": 0.0001, "loss": 6.2512, "loss/crossentropy": 2.6957199573516846, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19656287878751755, "step": 7000 }, { "epoch": 0.2188125, "grad_norm": 4.34375, "grad_norm_var": 1.2825592041015625, "learning_rate": 0.0001, "loss": 6.5302, "loss/crossentropy": 2.7317248582839966, "loss/hidden": 1.6875, "loss/jsd": 0.0, "loss/logits": 0.2110992819070816, "step": 7002 }, { "epoch": 0.218875, "grad_norm": 3.40625, "grad_norm_var": 11.030924479166666, "learning_rate": 0.0001, "loss": 6.7224, "loss/crossentropy": 2.6649667024612427, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.24284875392913818, "step": 7004 }, { "epoch": 0.2189375, "grad_norm": 3.953125, "grad_norm_var": 11.0185546875, "learning_rate": 0.0001, "loss": 6.2481, "loss/crossentropy": 2.5178329944610596, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.21130909025669098, "step": 7006 }, { "epoch": 0.219, "grad_norm": 3.53125, "grad_norm_var": 10.85338134765625, "learning_rate": 0.0001, "loss": 6.3382, "loss/crossentropy": 2.545018792152405, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.21486765146255493, "step": 7008 }, { "epoch": 0.2190625, "grad_norm": 3.609375, "grad_norm_var": 10.790208943684895, "learning_rate": 0.0001, "loss": 6.0454, "loss/crossentropy": 2.5478323698043823, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19077441096305847, "step": 7010 }, { "epoch": 0.219125, "grad_norm": 3.796875, "grad_norm_var": 10.765526326497396, "learning_rate": 0.0001, "loss": 6.4756, "loss/crossentropy": 2.6919052600860596, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21547965705394745, "step": 7012 }, { "epoch": 0.2191875, "grad_norm": 3.359375, "grad_norm_var": 10.698778279622395, "learning_rate": 0.0001, "loss": 6.6294, "loss/crossentropy": 2.824127435684204, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21451301872730255, "step": 7014 }, { "epoch": 0.21925, "grad_norm": 3.1875, "grad_norm_var": 10.271480305989583, "learning_rate": 0.0001, "loss": 5.8119, "loss/crossentropy": 2.401167631149292, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18638114631175995, "step": 7016 }, { "epoch": 0.2193125, "grad_norm": 3.40625, "grad_norm_var": 10.422298177083333, "learning_rate": 0.0001, "loss": 6.2321, "loss/crossentropy": 2.6169735193252563, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20330799371004105, "step": 7018 }, { "epoch": 0.219375, "grad_norm": 3.53125, "grad_norm_var": 0.21865234375, "learning_rate": 0.0001, "loss": 6.2863, "loss/crossentropy": 2.6988236904144287, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19937185943126678, "step": 7020 }, { "epoch": 0.2194375, "grad_norm": 5.09375, "grad_norm_var": 0.33622639973958335, "learning_rate": 0.0001, "loss": 6.0942, "loss/crossentropy": 2.487640380859375, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20284540206193924, "step": 7022 }, { "epoch": 0.2195, "grad_norm": 3.953125, "grad_norm_var": 0.2670806884765625, "learning_rate": 0.0001, "loss": 6.4727, "loss/crossentropy": 2.6979743242263794, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.212629072368145, "step": 7024 }, { "epoch": 0.2195625, "grad_norm": 3.6875, "grad_norm_var": 0.26594645182291665, "learning_rate": 0.0001, "loss": 6.6105, "loss/crossentropy": 2.811803102493286, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.2138504534959793, "step": 7026 }, { "epoch": 0.219625, "grad_norm": 3.296875, "grad_norm_var": 0.27529296875, "learning_rate": 0.0001, "loss": 6.16, "loss/crossentropy": 2.5398292541503906, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20225611329078674, "step": 7028 }, { "epoch": 0.2196875, "grad_norm": 3.1875, "grad_norm_var": 0.2148101806640625, "learning_rate": 0.0001, "loss": 5.8806, "loss/crossentropy": 2.467816948890686, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18659129738807678, "step": 7030 }, { "epoch": 0.21975, "grad_norm": 3.59375, "grad_norm_var": 0.22088216145833334, "learning_rate": 0.0001, "loss": 6.1715, "loss/crossentropy": 2.4942638874053955, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.201707124710083, "step": 7032 }, { "epoch": 0.2198125, "grad_norm": 3.3125, "grad_norm_var": 0.22753499348958334, "learning_rate": 0.0001, "loss": 5.9798, "loss/crossentropy": 2.5174331665039062, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19038093090057373, "step": 7034 }, { "epoch": 0.219875, "grad_norm": 3.25, "grad_norm_var": 0.2456207275390625, "learning_rate": 0.0001, "loss": 6.161, "loss/crossentropy": 2.651417851448059, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1939275935292244, "step": 7036 }, { "epoch": 0.2199375, "grad_norm": 3.015625, "grad_norm_var": 0.1223297119140625, "learning_rate": 0.0001, "loss": 6.1559, "loss/crossentropy": 2.6518659591674805, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18985295295715332, "step": 7038 }, { "epoch": 0.22, "grad_norm": 3.671875, "grad_norm_var": 0.11574605305989584, "learning_rate": 0.0001, "loss": 5.7382, "loss/crossentropy": 2.3332555294036865, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.17877569794654846, "step": 7040 }, { "epoch": 0.2200625, "grad_norm": 3.390625, "grad_norm_var": 0.10338134765625, "learning_rate": 0.0001, "loss": 5.9545, "loss/crossentropy": 2.5254757404327393, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1839166134595871, "step": 7042 }, { "epoch": 0.220125, "grad_norm": 3.359375, "grad_norm_var": 0.09316304524739584, "learning_rate": 0.0001, "loss": 5.7698, "loss/crossentropy": 2.3414753675460815, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18501769751310349, "step": 7044 }, { "epoch": 0.2201875, "grad_norm": 3.203125, "grad_norm_var": 0.09169921875, "learning_rate": 0.0001, "loss": 6.0464, "loss/crossentropy": 2.503966808319092, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1929159015417099, "step": 7046 }, { "epoch": 0.22025, "grad_norm": 3.40625, "grad_norm_var": 0.0310943603515625, "learning_rate": 0.0001, "loss": 5.8648, "loss/crossentropy": 2.4673224687576294, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.178032785654068, "step": 7048 }, { "epoch": 0.2203125, "grad_norm": 3.703125, "grad_norm_var": 0.03764546712239583, "learning_rate": 0.0001, "loss": 6.4858, "loss/crossentropy": 2.8963719606399536, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19605228304862976, "step": 7050 }, { "epoch": 0.220375, "grad_norm": 3.390625, "grad_norm_var": 0.041291300455729166, "learning_rate": 0.0001, "loss": 6.3703, "loss/crossentropy": 2.7580485343933105, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20107314735651016, "step": 7052 }, { "epoch": 0.2204375, "grad_norm": 3.59375, "grad_norm_var": 0.03156636555989583, "learning_rate": 0.0001, "loss": 6.1383, "loss/crossentropy": 2.636568784713745, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19196929037570953, "step": 7054 }, { "epoch": 0.2205, "grad_norm": 3.890625, "grad_norm_var": 0.03916015625, "learning_rate": 0.0001, "loss": 6.3322, "loss/crossentropy": 2.6917834281921387, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.19997604191303253, "step": 7056 }, { "epoch": 0.2205625, "grad_norm": 3.609375, "grad_norm_var": 0.043366495768229166, "learning_rate": 0.0001, "loss": 6.0525, "loss/crossentropy": 2.5482877492904663, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.188310407102108, "step": 7058 }, { "epoch": 0.220625, "grad_norm": 3.53125, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 6.1149, "loss/crossentropy": 2.491082549095154, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20183787494897842, "step": 7060 }, { "epoch": 0.2206875, "grad_norm": 3.90625, "grad_norm_var": 0.06349283854166667, "learning_rate": 0.0001, "loss": 6.5998, "loss/crossentropy": 2.760351061820984, "loss/hidden": 1.69921875, "loss/jsd": 0.0, "loss/logits": 0.21402332931756973, "step": 7062 }, { "epoch": 0.22075, "grad_norm": 3.4375, "grad_norm_var": 0.05373433430989583, "learning_rate": 0.0001, "loss": 6.11, "loss/crossentropy": 2.5462125539779663, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19739294797182083, "step": 7064 }, { "epoch": 0.2208125, "grad_norm": 3.3125, "grad_norm_var": 0.061376953125, "learning_rate": 0.0001, "loss": 6.0224, "loss/crossentropy": 2.554285407066345, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18977734446525574, "step": 7066 }, { "epoch": 0.220875, "grad_norm": 3.546875, "grad_norm_var": 0.05858968098958333, "learning_rate": 0.0001, "loss": 6.373, "loss/crossentropy": 2.7668060064315796, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20319905132055283, "step": 7068 }, { "epoch": 0.2209375, "grad_norm": 3.265625, "grad_norm_var": 0.060301717122395834, "learning_rate": 0.0001, "loss": 6.1924, "loss/crossentropy": 2.679903984069824, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19226178526878357, "step": 7070 }, { "epoch": 0.221, "grad_norm": 3.25, "grad_norm_var": 0.054255167643229164, "learning_rate": 0.0001, "loss": 6.0998, "loss/crossentropy": 2.584157705307007, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19062745571136475, "step": 7072 }, { "epoch": 0.2210625, "grad_norm": 3.5, "grad_norm_var": 0.051935831705729164, "learning_rate": 0.0001, "loss": 6.153, "loss/crossentropy": 2.6092441082000732, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19539161026477814, "step": 7074 }, { "epoch": 0.221125, "grad_norm": 3.625, "grad_norm_var": 0.03181864420572917, "learning_rate": 0.0001, "loss": 6.1599, "loss/crossentropy": 2.556912899017334, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.19624081999063492, "step": 7076 }, { "epoch": 0.2211875, "grad_norm": 3.515625, "grad_norm_var": 0.018355305989583334, "learning_rate": 0.0001, "loss": 6.3478, "loss/crossentropy": 2.6942012310028076, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20754989236593246, "step": 7078 }, { "epoch": 0.22125, "grad_norm": 3.625, "grad_norm_var": 0.022102864583333333, "learning_rate": 0.0001, "loss": 6.1846, "loss/crossentropy": 2.581602454185486, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20092745125293732, "step": 7080 }, { "epoch": 0.2213125, "grad_norm": 3.546875, "grad_norm_var": 0.03290608723958333, "learning_rate": 0.0001, "loss": 5.9697, "loss/crossentropy": 2.4440085887908936, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19787990301847458, "step": 7082 }, { "epoch": 0.221375, "grad_norm": 3.96875, "grad_norm_var": 0.04449462890625, "learning_rate": 0.0001, "loss": 6.4231, "loss/crossentropy": 2.677451968193054, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.21167093515396118, "step": 7084 }, { "epoch": 0.2214375, "grad_norm": 3.921875, "grad_norm_var": 0.0378326416015625, "learning_rate": 0.0001, "loss": 6.5638, "loss/crossentropy": 2.8133574724197388, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.20863892138004303, "step": 7086 }, { "epoch": 0.2215, "grad_norm": 3.65625, "grad_norm_var": 0.0268218994140625, "learning_rate": 0.0001, "loss": 6.1669, "loss/crossentropy": 2.544797658920288, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20283634215593338, "step": 7088 }, { "epoch": 0.2215625, "grad_norm": 3.5625, "grad_norm_var": 0.032124837239583336, "learning_rate": 0.0001, "loss": 5.9956, "loss/crossentropy": 2.562207341194153, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1878679394721985, "step": 7090 }, { "epoch": 0.221625, "grad_norm": 3.65625, "grad_norm_var": 0.73599853515625, "learning_rate": 0.0001, "loss": 6.2003, "loss/crossentropy": 2.454803466796875, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20970892161130905, "step": 7092 }, { "epoch": 0.2216875, "grad_norm": 3.71875, "grad_norm_var": 0.7364217122395833, "learning_rate": 0.0001, "loss": 6.4424, "loss/crossentropy": 2.7469085454940796, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20665527135133743, "step": 7094 }, { "epoch": 0.22175, "grad_norm": 3.453125, "grad_norm_var": 0.7474843343098958, "learning_rate": 0.0001, "loss": 5.8923, "loss/crossentropy": 2.3767290115356445, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19647910445928574, "step": 7096 }, { "epoch": 0.2218125, "grad_norm": 3.40625, "grad_norm_var": 0.7739491780598958, "learning_rate": 0.0001, "loss": 5.7138, "loss/crossentropy": 2.3209710121154785, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18537429720163345, "step": 7098 }, { "epoch": 0.221875, "grad_norm": 3.953125, "grad_norm_var": 0.7713175455729167, "learning_rate": 0.0001, "loss": 6.4804, "loss/crossentropy": 2.6829233169555664, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.21334099769592285, "step": 7100 }, { "epoch": 0.2219375, "grad_norm": 3.6875, "grad_norm_var": 0.7812001546223958, "learning_rate": 0.0001, "loss": 6.408, "loss/crossentropy": 2.7748000621795654, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20159757882356644, "step": 7102 }, { "epoch": 0.222, "grad_norm": 3.328125, "grad_norm_var": 0.8007771809895833, "learning_rate": 0.0001, "loss": 6.3518, "loss/crossentropy": 2.754671096801758, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.19643227010965347, "step": 7104 }, { "epoch": 0.2220625, "grad_norm": 3.359375, "grad_norm_var": 0.795263671875, "learning_rate": 0.0001, "loss": 5.9993, "loss/crossentropy": 2.408530831336975, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19658169895410538, "step": 7106 }, { "epoch": 0.222125, "grad_norm": 3.453125, "grad_norm_var": 0.0819244384765625, "learning_rate": 0.0001, "loss": 6.3234, "loss/crossentropy": 2.7514824867248535, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19742165505886078, "step": 7108 }, { "epoch": 0.2221875, "grad_norm": 3.5, "grad_norm_var": 0.058381144205729166, "learning_rate": 0.0001, "loss": 5.6316, "loss/crossentropy": 2.305321216583252, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17950443178415298, "step": 7110 }, { "epoch": 0.22225, "grad_norm": 3.125, "grad_norm_var": 0.06800028483072916, "learning_rate": 0.0001, "loss": 5.7636, "loss/crossentropy": 2.4000433683395386, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1769847720861435, "step": 7112 }, { "epoch": 0.2223125, "grad_norm": 4.0, "grad_norm_var": 0.07986551920572917, "learning_rate": 0.0001, "loss": 6.4122, "loss/crossentropy": 2.672022819519043, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20683379471302032, "step": 7114 }, { "epoch": 0.222375, "grad_norm": 3.828125, "grad_norm_var": 0.0757720947265625, "learning_rate": 0.0001, "loss": 6.4042, "loss/crossentropy": 2.7689915895462036, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.2037556767463684, "step": 7116 }, { "epoch": 0.2224375, "grad_norm": 4.0625, "grad_norm_var": 0.091796875, "learning_rate": 0.0001, "loss": 6.0827, "loss/crossentropy": 2.5164963006973267, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19842083007097244, "step": 7118 }, { "epoch": 0.2225, "grad_norm": 3.578125, "grad_norm_var": 0.0732421875, "learning_rate": 0.0001, "loss": 5.9162, "loss/crossentropy": 2.453752040863037, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19038930535316467, "step": 7120 }, { "epoch": 0.2225625, "grad_norm": 3.578125, "grad_norm_var": 0.07088216145833333, "learning_rate": 0.0001, "loss": 6.3235, "loss/crossentropy": 2.7482842206954956, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1981423869729042, "step": 7122 }, { "epoch": 0.222625, "grad_norm": 4.125, "grad_norm_var": 0.09062093098958333, "learning_rate": 0.0001, "loss": 6.4008, "loss/crossentropy": 2.5867438316345215, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.21460527181625366, "step": 7124 }, { "epoch": 0.2226875, "grad_norm": 3.59375, "grad_norm_var": 0.08170166015625, "learning_rate": 0.0001, "loss": 5.7468, "loss/crossentropy": 2.335509777069092, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18683339655399323, "step": 7126 }, { "epoch": 0.22275, "grad_norm": 3.453125, "grad_norm_var": 0.06715087890625, "learning_rate": 0.0001, "loss": 5.9505, "loss/crossentropy": 2.484445810317993, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1860543116927147, "step": 7128 }, { "epoch": 0.2228125, "grad_norm": 3.859375, "grad_norm_var": 0.06148681640625, "learning_rate": 0.0001, "loss": 6.167, "loss/crossentropy": 2.599080204963684, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19624672830104828, "step": 7130 }, { "epoch": 0.222875, "grad_norm": 3.078125, "grad_norm_var": 0.0745269775390625, "learning_rate": 0.0001, "loss": 6.1566, "loss/crossentropy": 2.5842623710632324, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19941814243793488, "step": 7132 }, { "epoch": 0.2229375, "grad_norm": 3.421875, "grad_norm_var": 0.06288655598958333, "learning_rate": 0.0001, "loss": 5.9316, "loss/crossentropy": 2.4942493438720703, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18983107805252075, "step": 7134 }, { "epoch": 0.223, "grad_norm": 3.40625, "grad_norm_var": 0.058512369791666664, "learning_rate": 0.0001, "loss": 6.0632, "loss/crossentropy": 2.4725465774536133, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19735118746757507, "step": 7136 }, { "epoch": 0.2230625, "grad_norm": 3.390625, "grad_norm_var": 0.06085611979166667, "learning_rate": 0.0001, "loss": 6.1609, "loss/crossentropy": 2.6143925189971924, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19840151816606522, "step": 7138 }, { "epoch": 0.223125, "grad_norm": 5.5625, "grad_norm_var": 0.3148722330729167, "learning_rate": 0.0001, "loss": 6.2831, "loss/crossentropy": 2.7380794286727905, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19746720790863037, "step": 7140 }, { "epoch": 0.2231875, "grad_norm": 3.359375, "grad_norm_var": 0.31502176920572916, "learning_rate": 0.0001, "loss": 5.8714, "loss/crossentropy": 2.4498631954193115, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1855173036456108, "step": 7142 }, { "epoch": 0.22325, "grad_norm": 3.515625, "grad_norm_var": 0.3158925374348958, "learning_rate": 0.0001, "loss": 6.1672, "loss/crossentropy": 2.544526696205139, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19742655754089355, "step": 7144 }, { "epoch": 0.2233125, "grad_norm": 3.625, "grad_norm_var": 0.31091206868489585, "learning_rate": 0.0001, "loss": 6.4244, "loss/crossentropy": 2.779419422149658, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20395386964082718, "step": 7146 }, { "epoch": 0.223375, "grad_norm": 3.921875, "grad_norm_var": 0.30429280598958336, "learning_rate": 0.0001, "loss": 6.1574, "loss/crossentropy": 2.537745952606201, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2002440243959427, "step": 7148 }, { "epoch": 0.2234375, "grad_norm": 4.375, "grad_norm_var": 0.3318837483723958, "learning_rate": 0.0001, "loss": 6.4352, "loss/crossentropy": 2.7549798488616943, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.2063073366880417, "step": 7150 }, { "epoch": 0.2235, "grad_norm": 3.5, "grad_norm_var": 0.327783203125, "learning_rate": 0.0001, "loss": 6.1109, "loss/crossentropy": 2.532162070274353, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20044952630996704, "step": 7152 }, { "epoch": 0.2235625, "grad_norm": 3.65625, "grad_norm_var": 0.3145172119140625, "learning_rate": 0.0001, "loss": 6.241, "loss/crossentropy": 2.542296290397644, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20698314160108566, "step": 7154 }, { "epoch": 0.223625, "grad_norm": 3.5, "grad_norm_var": 0.09064127604166666, "learning_rate": 0.0001, "loss": 6.3842, "loss/crossentropy": 2.649972081184387, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20779529958963394, "step": 7156 }, { "epoch": 0.2236875, "grad_norm": 3.515625, "grad_norm_var": 0.07971598307291666, "learning_rate": 0.0001, "loss": 6.0456, "loss/crossentropy": 2.5300976037979126, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1949128583073616, "step": 7158 }, { "epoch": 0.22375, "grad_norm": 3.90625, "grad_norm_var": 0.0878082275390625, "learning_rate": 0.0001, "loss": 6.2176, "loss/crossentropy": 2.521161913871765, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20636696368455887, "step": 7160 }, { "epoch": 0.2238125, "grad_norm": 3.609375, "grad_norm_var": 0.08700764973958333, "learning_rate": 0.0001, "loss": 6.2207, "loss/crossentropy": 2.7186752557754517, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1919991374015808, "step": 7162 }, { "epoch": 0.223875, "grad_norm": 3.484375, "grad_norm_var": 0.07935791015625, "learning_rate": 0.0001, "loss": 6.0661, "loss/crossentropy": 2.516534924507141, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1987050324678421, "step": 7164 }, { "epoch": 0.2239375, "grad_norm": 3.375, "grad_norm_var": 0.05379130045572917, "learning_rate": 0.0001, "loss": 6.3041, "loss/crossentropy": 2.608791708946228, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20624708384275436, "step": 7166 }, { "epoch": 0.224, "grad_norm": 3.671875, "grad_norm_var": 0.0555816650390625, "learning_rate": 0.0001, "loss": 6.1641, "loss/crossentropy": 2.6959011554718018, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1878363937139511, "step": 7168 }, { "epoch": 0.2240625, "grad_norm": 3.40625, "grad_norm_var": 0.06803385416666667, "learning_rate": 0.0001, "loss": 6.0807, "loss/crossentropy": 2.5687514543533325, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1918240785598755, "step": 7170 }, { "epoch": 0.224125, "grad_norm": 3.453125, "grad_norm_var": 0.044169108072916664, "learning_rate": 0.0001, "loss": 6.3498, "loss/crossentropy": 2.717841625213623, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20069392770528793, "step": 7172 }, { "epoch": 0.2241875, "grad_norm": 3.40625, "grad_norm_var": 0.045849609375, "learning_rate": 0.0001, "loss": 5.9979, "loss/crossentropy": 2.548419952392578, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1879129856824875, "step": 7174 }, { "epoch": 0.22425, "grad_norm": 3.65625, "grad_norm_var": 0.043717447916666666, "learning_rate": 0.0001, "loss": 6.0759, "loss/crossentropy": 2.5514897108078003, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19267520308494568, "step": 7176 }, { "epoch": 0.2243125, "grad_norm": 3.484375, "grad_norm_var": 0.046126302083333334, "learning_rate": 0.0001, "loss": 6.05, "loss/crossentropy": 2.5378646850585938, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1945752575993538, "step": 7178 }, { "epoch": 0.224375, "grad_norm": 3.78125, "grad_norm_var": 0.04854227701822917, "learning_rate": 0.0001, "loss": 5.9998, "loss/crossentropy": 2.5784002542495728, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18316055834293365, "step": 7180 }, { "epoch": 0.2244375, "grad_norm": 3.25, "grad_norm_var": 0.047932942708333336, "learning_rate": 0.0001, "loss": 6.2681, "loss/crossentropy": 2.7137949466705322, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19644546508789062, "step": 7182 }, { "epoch": 0.2245, "grad_norm": 3.078125, "grad_norm_var": 0.0605133056640625, "learning_rate": 0.0001, "loss": 5.9173, "loss/crossentropy": 2.4534000158309937, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18466929346323013, "step": 7184 }, { "epoch": 0.2245625, "grad_norm": 3.921875, "grad_norm_var": 0.06454976399739583, "learning_rate": 0.0001, "loss": 6.1779, "loss/crossentropy": 2.5641443729400635, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20044228434562683, "step": 7186 }, { "epoch": 0.224625, "grad_norm": 4.1875, "grad_norm_var": 0.0912109375, "learning_rate": 0.0001, "loss": 5.9642, "loss/crossentropy": 2.539321780204773, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1811591386795044, "step": 7188 }, { "epoch": 0.2246875, "grad_norm": 3.3125, "grad_norm_var": 0.09397379557291667, "learning_rate": 0.0001, "loss": 5.7936, "loss/crossentropy": 2.4275606870651245, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1842569038271904, "step": 7190 }, { "epoch": 0.22475, "grad_norm": 3.46875, "grad_norm_var": 0.07342122395833334, "learning_rate": 0.0001, "loss": 6.1311, "loss/crossentropy": 2.6554477214813232, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19209395349025726, "step": 7192 }, { "epoch": 0.2248125, "grad_norm": 3.484375, "grad_norm_var": 0.073291015625, "learning_rate": 0.0001, "loss": 6.0223, "loss/crossentropy": 2.511997103691101, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19126948714256287, "step": 7194 }, { "epoch": 0.224875, "grad_norm": 3.359375, "grad_norm_var": 0.07537333170572917, "learning_rate": 0.0001, "loss": 6.327, "loss/crossentropy": 2.757696270942688, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1979505866765976, "step": 7196 }, { "epoch": 0.2249375, "grad_norm": 3.5, "grad_norm_var": 0.07089436848958333, "learning_rate": 0.0001, "loss": 5.9437, "loss/crossentropy": 2.507505178451538, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1858089566230774, "step": 7198 }, { "epoch": 0.225, "grad_norm": 4.0, "grad_norm_var": 0.08759663899739584, "learning_rate": 0.0001, "loss": 6.2874, "loss/crossentropy": 2.633690595626831, "loss/hidden": 1.7109375, "loss/jsd": 0.0, "loss/logits": 0.1942805051803589, "step": 7200 }, { "epoch": 0.2250625, "grad_norm": 3.296875, "grad_norm_var": 0.0906646728515625, "learning_rate": 0.0001, "loss": 6.237, "loss/crossentropy": 2.6839088201522827, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19710107892751694, "step": 7202 }, { "epoch": 0.225125, "grad_norm": 3.3125, "grad_norm_var": 0.06440327962239584, "learning_rate": 0.0001, "loss": 6.0258, "loss/crossentropy": 2.5818649530410767, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18736477941274643, "step": 7204 }, { "epoch": 0.2251875, "grad_norm": 3.296875, "grad_norm_var": 0.06496480305989584, "learning_rate": 0.0001, "loss": 6.2243, "loss/crossentropy": 2.69213604927063, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19540417194366455, "step": 7206 }, { "epoch": 0.22525, "grad_norm": 3.390625, "grad_norm_var": 0.06997782389322917, "learning_rate": 0.0001, "loss": 6.408, "loss/crossentropy": 2.735700845718384, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.20863628387451172, "step": 7208 }, { "epoch": 0.2253125, "grad_norm": 3.59375, "grad_norm_var": 0.10693257649739583, "learning_rate": 0.0001, "loss": 6.252, "loss/crossentropy": 2.6026411056518555, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20008762180805206, "step": 7210 }, { "epoch": 0.225375, "grad_norm": 3.625, "grad_norm_var": 0.10064697265625, "learning_rate": 0.0001, "loss": 5.8114, "loss/crossentropy": 2.366856098175049, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19054561108350754, "step": 7212 }, { "epoch": 0.2254375, "grad_norm": 3.59375, "grad_norm_var": 0.11060791015625, "learning_rate": 0.0001, "loss": 5.9851, "loss/crossentropy": 2.4928311109542847, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1918041706085205, "step": 7214 }, { "epoch": 0.2255, "grad_norm": 3.3125, "grad_norm_var": 0.08677469889322917, "learning_rate": 0.0001, "loss": 6.1356, "loss/crossentropy": 2.576792359352112, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1992373839020729, "step": 7216 }, { "epoch": 0.2255625, "grad_norm": 3.4375, "grad_norm_var": 0.08844401041666666, "learning_rate": 0.0001, "loss": 6.0178, "loss/crossentropy": 2.3962244987487793, "loss/hidden": 1.6796875, "loss/jsd": 0.0, "loss/logits": 0.1941882222890854, "step": 7218 }, { "epoch": 0.225625, "grad_norm": 3.765625, "grad_norm_var": 0.0850006103515625, "learning_rate": 0.0001, "loss": 6.0182, "loss/crossentropy": 2.485282301902771, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19274737685918808, "step": 7220 }, { "epoch": 0.2256875, "grad_norm": 3.5625, "grad_norm_var": 0.08580322265625, "learning_rate": 0.0001, "loss": 6.042, "loss/crossentropy": 2.4804413318634033, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19521985948085785, "step": 7222 }, { "epoch": 0.22575, "grad_norm": 3.53125, "grad_norm_var": 0.0804107666015625, "learning_rate": 0.0001, "loss": 6.2497, "loss/crossentropy": 2.649003744125366, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19991525262594223, "step": 7224 }, { "epoch": 0.2258125, "grad_norm": 3.375, "grad_norm_var": 0.04898681640625, "learning_rate": 0.0001, "loss": 6.1027, "loss/crossentropy": 2.5942927598953247, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19264158606529236, "step": 7226 }, { "epoch": 0.225875, "grad_norm": 3.53125, "grad_norm_var": 0.0519439697265625, "learning_rate": 0.0001, "loss": 6.1449, "loss/crossentropy": 2.6132737398147583, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1926158145070076, "step": 7228 }, { "epoch": 0.2259375, "grad_norm": 3.859375, "grad_norm_var": 0.04111328125, "learning_rate": 0.0001, "loss": 6.0757, "loss/crossentropy": 2.5855209827423096, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18846654891967773, "step": 7230 }, { "epoch": 0.226, "grad_norm": 3.546875, "grad_norm_var": 0.07340087890625, "learning_rate": 0.0001, "loss": 6.1571, "loss/crossentropy": 2.5730226039886475, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1947377249598503, "step": 7232 }, { "epoch": 0.2260625, "grad_norm": 3.53125, "grad_norm_var": 0.06357421875, "learning_rate": 0.0001, "loss": 6.0368, "loss/crossentropy": 2.53850257396698, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19045089930295944, "step": 7234 }, { "epoch": 0.226125, "grad_norm": 3.515625, "grad_norm_var": 0.057616170247395834, "learning_rate": 0.0001, "loss": 5.9541, "loss/crossentropy": 2.423492431640625, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19446807354688644, "step": 7236 }, { "epoch": 0.2261875, "grad_norm": 3.9375, "grad_norm_var": 0.060042317708333334, "learning_rate": 0.0001, "loss": 5.933, "loss/crossentropy": 2.3947328329086304, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.18937600404024124, "step": 7238 }, { "epoch": 0.22625, "grad_norm": 3.734375, "grad_norm_var": 0.0597808837890625, "learning_rate": 0.0001, "loss": 6.074, "loss/crossentropy": 2.458932042121887, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.19744067639112473, "step": 7240 }, { "epoch": 0.2263125, "grad_norm": 3.796875, "grad_norm_var": 0.0525299072265625, "learning_rate": 0.0001, "loss": 6.1446, "loss/crossentropy": 2.521939754486084, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20210901647806168, "step": 7242 }, { "epoch": 0.226375, "grad_norm": 3.625, "grad_norm_var": 0.05084228515625, "learning_rate": 0.0001, "loss": 6.3326, "loss/crossentropy": 2.70308518409729, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20357725769281387, "step": 7244 }, { "epoch": 0.2264375, "grad_norm": 3.265625, "grad_norm_var": 0.07063700358072916, "learning_rate": 0.0001, "loss": 6.008, "loss/crossentropy": 2.571186661720276, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.188599094748497, "step": 7246 }, { "epoch": 0.2265, "grad_norm": 3.4375, "grad_norm_var": 0.0380767822265625, "learning_rate": 0.0001, "loss": 6.2089, "loss/crossentropy": 2.70067298412323, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1918356642127037, "step": 7248 }, { "epoch": 0.2265625, "grad_norm": 3.734375, "grad_norm_var": 0.05286051432291667, "learning_rate": 0.0001, "loss": 5.8106, "loss/crossentropy": 2.393430471420288, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18429815769195557, "step": 7250 }, { "epoch": 0.226625, "grad_norm": 3.4375, "grad_norm_var": 0.05406494140625, "learning_rate": 0.0001, "loss": 6.1698, "loss/crossentropy": 2.624249815940857, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19557499140501022, "step": 7252 }, { "epoch": 0.2266875, "grad_norm": 3.3125, "grad_norm_var": 0.046296183268229166, "learning_rate": 0.0001, "loss": 5.6594, "loss/crossentropy": 2.2593475580215454, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18844523280858994, "step": 7254 }, { "epoch": 0.22675, "grad_norm": 3.453125, "grad_norm_var": 0.043473307291666666, "learning_rate": 0.0001, "loss": 6.0688, "loss/crossentropy": 2.633441209793091, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.188069187104702, "step": 7256 }, { "epoch": 0.2268125, "grad_norm": 3.75, "grad_norm_var": 0.03833719889322917, "learning_rate": 0.0001, "loss": 6.0438, "loss/crossentropy": 2.435559034347534, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19480594247579575, "step": 7258 }, { "epoch": 0.226875, "grad_norm": 3.375, "grad_norm_var": 0.03524983723958333, "learning_rate": 0.0001, "loss": 6.0048, "loss/crossentropy": 2.555377721786499, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18791264295578003, "step": 7260 }, { "epoch": 0.2269375, "grad_norm": 3.109375, "grad_norm_var": 0.0385162353515625, "learning_rate": 0.0001, "loss": 5.9651, "loss/crossentropy": 2.540740966796875, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18813879787921906, "step": 7262 }, { "epoch": 0.227, "grad_norm": 3.1875, "grad_norm_var": 0.0409088134765625, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.3397140502929688, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17908480763435364, "step": 7264 }, { "epoch": 0.2270625, "grad_norm": 3.265625, "grad_norm_var": 0.03850911458333333, "learning_rate": 0.0001, "loss": 6.601, "loss/crossentropy": 3.0177700519561768, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1997295320034027, "step": 7266 }, { "epoch": 0.227125, "grad_norm": 3.75, "grad_norm_var": 0.0453521728515625, "learning_rate": 0.0001, "loss": 6.1002, "loss/crossentropy": 2.540258288383484, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19309942424297333, "step": 7268 }, { "epoch": 0.2271875, "grad_norm": 3.8125, "grad_norm_var": 0.08054097493489583, "learning_rate": 0.0001, "loss": 6.1415, "loss/crossentropy": 2.4717822074890137, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20447231829166412, "step": 7270 }, { "epoch": 0.22725, "grad_norm": 3.328125, "grad_norm_var": 0.07685445149739584, "learning_rate": 0.0001, "loss": 6.0849, "loss/crossentropy": 2.57514226436615, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1939399093389511, "step": 7272 }, { "epoch": 0.2273125, "grad_norm": 3.578125, "grad_norm_var": 0.07429097493489584, "learning_rate": 0.0001, "loss": 5.7311, "loss/crossentropy": 2.333707571029663, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18543951958417892, "step": 7274 }, { "epoch": 0.227375, "grad_norm": 3.234375, "grad_norm_var": 0.07571614583333333, "learning_rate": 0.0001, "loss": 6.0251, "loss/crossentropy": 2.5862536430358887, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18607349693775177, "step": 7276 }, { "epoch": 0.2274375, "grad_norm": 3.0, "grad_norm_var": 0.10493876139322916, "learning_rate": 0.0001, "loss": 5.9968, "loss/crossentropy": 2.556631088256836, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1865924820303917, "step": 7278 }, { "epoch": 0.2275, "grad_norm": 3.6875, "grad_norm_var": 0.09657796223958333, "learning_rate": 0.0001, "loss": 6.216, "loss/crossentropy": 2.6764732599258423, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.194186270236969, "step": 7280 }, { "epoch": 0.2275625, "grad_norm": 3.40625, "grad_norm_var": 0.09251302083333333, "learning_rate": 0.0001, "loss": 6.2562, "loss/crossentropy": 2.716831684112549, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19495396316051483, "step": 7282 }, { "epoch": 0.227625, "grad_norm": 4.25, "grad_norm_var": 0.13001302083333333, "learning_rate": 0.0001, "loss": 5.9113, "loss/crossentropy": 2.297808527946472, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19689101725816727, "step": 7284 }, { "epoch": 0.2276875, "grad_norm": 3.359375, "grad_norm_var": 0.1010894775390625, "learning_rate": 0.0001, "loss": 6.109, "loss/crossentropy": 2.637886643409729, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19047501683235168, "step": 7286 }, { "epoch": 0.22775, "grad_norm": 3.421875, "grad_norm_var": 0.0988433837890625, "learning_rate": 0.0001, "loss": 6.1929, "loss/crossentropy": 2.7201133966445923, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19336767494678497, "step": 7288 }, { "epoch": 0.2278125, "grad_norm": 3.140625, "grad_norm_var": 0.10550028483072917, "learning_rate": 0.0001, "loss": 5.6638, "loss/crossentropy": 2.355941653251648, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1776605248451233, "step": 7290 }, { "epoch": 0.227875, "grad_norm": 3.21875, "grad_norm_var": 0.10982666015625, "learning_rate": 0.0001, "loss": 5.7562, "loss/crossentropy": 2.4231557846069336, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18056906759738922, "step": 7292 }, { "epoch": 0.2279375, "grad_norm": 3.515625, "grad_norm_var": 0.0697174072265625, "learning_rate": 0.0001, "loss": 6.1144, "loss/crossentropy": 2.5779067277908325, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19544732570648193, "step": 7294 }, { "epoch": 0.228, "grad_norm": 3.421875, "grad_norm_var": 0.06889546712239583, "learning_rate": 0.0001, "loss": 6.4275, "loss/crossentropy": 2.780030369758606, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20459024608135223, "step": 7296 }, { "epoch": 0.2280625, "grad_norm": 3.859375, "grad_norm_var": 0.08276265462239583, "learning_rate": 0.0001, "loss": 6.0204, "loss/crossentropy": 2.416700005531311, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.1931861937046051, "step": 7298 }, { "epoch": 0.228125, "grad_norm": 3.375, "grad_norm_var": 0.03961181640625, "learning_rate": 0.0001, "loss": 5.8088, "loss/crossentropy": 2.481619954109192, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.181547611951828, "step": 7300 }, { "epoch": 0.2281875, "grad_norm": 3.25, "grad_norm_var": 0.04090067545572917, "learning_rate": 0.0001, "loss": 5.8149, "loss/crossentropy": 2.395822048187256, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1879977211356163, "step": 7302 }, { "epoch": 0.22825, "grad_norm": 3.5625, "grad_norm_var": 0.04132486979166667, "learning_rate": 0.0001, "loss": 6.3389, "loss/crossentropy": 2.743680715560913, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20171266049146652, "step": 7304 }, { "epoch": 0.2283125, "grad_norm": 3.875, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 5.9588, "loss/crossentropy": 2.5179975032806396, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1870514154434204, "step": 7306 }, { "epoch": 0.228375, "grad_norm": 3.8125, "grad_norm_var": 0.0456207275390625, "learning_rate": 0.0001, "loss": 6.2987, "loss/crossentropy": 2.748790740966797, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19640059769153595, "step": 7308 }, { "epoch": 0.2284375, "grad_norm": 3.484375, "grad_norm_var": 0.0541900634765625, "learning_rate": 0.0001, "loss": 6.0278, "loss/crossentropy": 2.588564395904541, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1872790977358818, "step": 7310 }, { "epoch": 0.2285, "grad_norm": 3.40625, "grad_norm_var": 0.05211588541666667, "learning_rate": 0.0001, "loss": 6.1007, "loss/crossentropy": 2.5295733213424683, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19696111977100372, "step": 7312 }, { "epoch": 0.2285625, "grad_norm": 3.28125, "grad_norm_var": 0.039891560872395836, "learning_rate": 0.0001, "loss": 5.849, "loss/crossentropy": 2.4458121061325073, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18289253860712051, "step": 7314 }, { "epoch": 0.228625, "grad_norm": 3.5, "grad_norm_var": 0.04163004557291667, "learning_rate": 0.0001, "loss": 6.2824, "loss/crossentropy": 2.627694010734558, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20530951023101807, "step": 7316 }, { "epoch": 0.2286875, "grad_norm": 3.484375, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 5.8016, "loss/crossentropy": 2.4144967794418335, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18480688333511353, "step": 7318 }, { "epoch": 0.22875, "grad_norm": 3.546875, "grad_norm_var": 0.04283447265625, "learning_rate": 0.0001, "loss": 5.8909, "loss/crossentropy": 2.421323776245117, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19070785492658615, "step": 7320 }, { "epoch": 0.2288125, "grad_norm": 3.484375, "grad_norm_var": 0.029182942708333333, "learning_rate": 0.0001, "loss": 5.9303, "loss/crossentropy": 2.5196995735168457, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1859780177474022, "step": 7322 }, { "epoch": 0.228875, "grad_norm": 3.140625, "grad_norm_var": 0.023583984375, "learning_rate": 0.0001, "loss": 6.0889, "loss/crossentropy": 2.652615547180176, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18815699964761734, "step": 7324 }, { "epoch": 0.2289375, "grad_norm": 6.5, "grad_norm_var": 0.6192860921223958, "learning_rate": 0.0001, "loss": 6.1446, "loss/crossentropy": 2.5927183628082275, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.196204774081707, "step": 7326 }, { "epoch": 0.229, "grad_norm": 3.859375, "grad_norm_var": 0.6470611572265625, "learning_rate": 0.0001, "loss": 6.395, "loss/crossentropy": 2.6915029287338257, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20862893015146255, "step": 7328 }, { "epoch": 0.2290625, "grad_norm": 3.984375, "grad_norm_var": 0.6263580322265625, "learning_rate": 0.0001, "loss": 6.2523, "loss/crossentropy": 2.586848020553589, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20248162746429443, "step": 7330 }, { "epoch": 0.229125, "grad_norm": 3.65625, "grad_norm_var": 0.6242095947265625, "learning_rate": 0.0001, "loss": 6.3837, "loss/crossentropy": 2.72838294506073, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.203810915350914, "step": 7332 }, { "epoch": 0.2291875, "grad_norm": 3.78125, "grad_norm_var": 0.6142079671223958, "learning_rate": 0.0001, "loss": 6.22, "loss/crossentropy": 2.5156002044677734, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20325710624456406, "step": 7334 }, { "epoch": 0.22925, "grad_norm": 3.40625, "grad_norm_var": 0.6004628499348958, "learning_rate": 0.0001, "loss": 6.2168, "loss/crossentropy": 2.610624313354492, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19967524707317352, "step": 7336 }, { "epoch": 0.2293125, "grad_norm": 3.203125, "grad_norm_var": 0.6215810139973958, "learning_rate": 0.0001, "loss": 6.0493, "loss/crossentropy": 2.615975260734558, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1870839074254036, "step": 7338 }, { "epoch": 0.229375, "grad_norm": 3.546875, "grad_norm_var": 0.5929758707682292, "learning_rate": 0.0001, "loss": 6.0301, "loss/crossentropy": 2.5628278255462646, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19204376637935638, "step": 7340 }, { "epoch": 0.2294375, "grad_norm": 3.796875, "grad_norm_var": 0.09176432291666667, "learning_rate": 0.0001, "loss": 6.1977, "loss/crossentropy": 2.5626055002212524, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19866493344306946, "step": 7342 }, { "epoch": 0.2295, "grad_norm": 3.3125, "grad_norm_var": 0.06845296223958333, "learning_rate": 0.0001, "loss": 5.7414, "loss/crossentropy": 2.365588426589966, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18407006561756134, "step": 7344 }, { "epoch": 0.2295625, "grad_norm": 3.625, "grad_norm_var": 0.07990620930989584, "learning_rate": 0.0001, "loss": 6.1658, "loss/crossentropy": 2.662602186203003, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1960218995809555, "step": 7346 }, { "epoch": 0.229625, "grad_norm": 3.375, "grad_norm_var": 0.09516499837239584, "learning_rate": 0.0001, "loss": 5.9993, "loss/crossentropy": 2.550138235092163, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19100836664438248, "step": 7348 }, { "epoch": 0.2296875, "grad_norm": 3.71875, "grad_norm_var": 0.09342041015625, "learning_rate": 0.0001, "loss": 6.0154, "loss/crossentropy": 2.5097841024398804, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19235394895076752, "step": 7350 }, { "epoch": 0.22975, "grad_norm": 3.53125, "grad_norm_var": 0.09157613118489584, "learning_rate": 0.0001, "loss": 5.976, "loss/crossentropy": 2.540569543838501, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1810412034392357, "step": 7352 }, { "epoch": 0.2298125, "grad_norm": 3.515625, "grad_norm_var": 0.07981363932291667, "learning_rate": 0.0001, "loss": 6.1652, "loss/crossentropy": 2.666202664375305, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19325844943523407, "step": 7354 }, { "epoch": 0.229875, "grad_norm": 3.53125, "grad_norm_var": 0.0746490478515625, "learning_rate": 0.0001, "loss": 6.2743, "loss/crossentropy": 2.694416880607605, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1966572105884552, "step": 7356 }, { "epoch": 0.2299375, "grad_norm": 3.5625, "grad_norm_var": 0.0745269775390625, "learning_rate": 0.0001, "loss": 5.9192, "loss/crossentropy": 2.5005931854248047, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1879514679312706, "step": 7358 }, { "epoch": 0.23, "grad_norm": 3.6875, "grad_norm_var": 0.07662353515625, "learning_rate": 0.0001, "loss": 6.2571, "loss/crossentropy": 2.651940941810608, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.20192715525627136, "step": 7360 }, { "epoch": 0.2300625, "grad_norm": 5.03125, "grad_norm_var": 0.18541259765625, "learning_rate": 0.0001, "loss": 6.1134, "loss/crossentropy": 2.456774592399597, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20238465070724487, "step": 7362 }, { "epoch": 0.230125, "grad_norm": 6.8125, "grad_norm_var": 0.78668212890625, "learning_rate": 0.0001, "loss": 6.1916, "loss/crossentropy": 2.581822395324707, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.1941842883825302, "step": 7364 }, { "epoch": 0.2301875, "grad_norm": 3.453125, "grad_norm_var": 0.7692372639973958, "learning_rate": 0.0001, "loss": 6.2382, "loss/crossentropy": 2.6422585248947144, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.2045135721564293, "step": 7366 }, { "epoch": 0.23025, "grad_norm": 4.0625, "grad_norm_var": 0.7674112955729167, "learning_rate": 0.0001, "loss": 6.6114, "loss/crossentropy": 2.907786726951599, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.2078600376844406, "step": 7368 }, { "epoch": 0.2303125, "grad_norm": 3.28125, "grad_norm_var": 0.7768717447916667, "learning_rate": 0.0001, "loss": 6.1037, "loss/crossentropy": 2.5247615575790405, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19578395783901215, "step": 7370 }, { "epoch": 0.230375, "grad_norm": 3.390625, "grad_norm_var": 0.8033274332682292, "learning_rate": 0.0001, "loss": 6.1342, "loss/crossentropy": 2.605314254760742, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19156111776828766, "step": 7372 }, { "epoch": 0.2304375, "grad_norm": 3.390625, "grad_norm_var": 0.8084869384765625, "learning_rate": 0.0001, "loss": 5.7904, "loss/crossentropy": 2.444626212120056, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1775500327348709, "step": 7374 }, { "epoch": 0.2305, "grad_norm": 3.3125, "grad_norm_var": 0.85035400390625, "learning_rate": 0.0001, "loss": 5.8499, "loss/crossentropy": 2.4502416849136353, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18762332946062088, "step": 7376 }, { "epoch": 0.2305625, "grad_norm": 3.484375, "grad_norm_var": 0.744580078125, "learning_rate": 0.0001, "loss": 5.9014, "loss/crossentropy": 2.4991562366485596, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18710073083639145, "step": 7378 }, { "epoch": 0.230625, "grad_norm": 3.828125, "grad_norm_var": 0.062398274739583336, "learning_rate": 0.0001, "loss": 6.2592, "loss/crossentropy": 2.6415544748306274, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20395462214946747, "step": 7380 }, { "epoch": 0.2306875, "grad_norm": 4.1875, "grad_norm_var": 0.08961181640625, "learning_rate": 0.0001, "loss": 6.4175, "loss/crossentropy": 2.7125109434127808, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2076052576303482, "step": 7382 }, { "epoch": 0.23075, "grad_norm": 3.515625, "grad_norm_var": 0.07551167805989584, "learning_rate": 0.0001, "loss": 6.259, "loss/crossentropy": 2.6772814989089966, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19801807403564453, "step": 7384 }, { "epoch": 0.2308125, "grad_norm": 3.375, "grad_norm_var": 0.06308186848958333, "learning_rate": 0.0001, "loss": 5.962, "loss/crossentropy": 2.497159481048584, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1890670359134674, "step": 7386 }, { "epoch": 0.230875, "grad_norm": 3.671875, "grad_norm_var": 0.0703765869140625, "learning_rate": 0.0001, "loss": 6.1034, "loss/crossentropy": 2.4412357807159424, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20137407630681992, "step": 7388 }, { "epoch": 0.2309375, "grad_norm": 3.46875, "grad_norm_var": 0.0634674072265625, "learning_rate": 0.0001, "loss": 6.2355, "loss/crossentropy": 2.7218462228775024, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19511879980564117, "step": 7390 }, { "epoch": 0.231, "grad_norm": 3.703125, "grad_norm_var": 0.053376261393229166, "learning_rate": 0.0001, "loss": 5.8255, "loss/crossentropy": 2.4146286249160767, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18561378866434097, "step": 7392 }, { "epoch": 0.2310625, "grad_norm": 3.359375, "grad_norm_var": 0.06270243326822916, "learning_rate": 0.0001, "loss": 5.9418, "loss/crossentropy": 2.549205780029297, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18496041744947433, "step": 7394 }, { "epoch": 0.231125, "grad_norm": 3.390625, "grad_norm_var": 0.0640625, "learning_rate": 0.0001, "loss": 5.9342, "loss/crossentropy": 2.497208595275879, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18862471729516983, "step": 7396 }, { "epoch": 0.2311875, "grad_norm": 3.3125, "grad_norm_var": 0.046751912434895834, "learning_rate": 0.0001, "loss": 6.2351, "loss/crossentropy": 2.6447510719299316, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.1957508847117424, "step": 7398 }, { "epoch": 0.23125, "grad_norm": 3.171875, "grad_norm_var": 0.051123046875, "learning_rate": 0.0001, "loss": 5.8249, "loss/crossentropy": 2.4528943300247192, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18251455575227737, "step": 7400 }, { "epoch": 0.2313125, "grad_norm": 3.734375, "grad_norm_var": 0.05332743326822917, "learning_rate": 0.0001, "loss": 6.1517, "loss/crossentropy": 2.573247790336609, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1984696239233017, "step": 7402 }, { "epoch": 0.231375, "grad_norm": 3.390625, "grad_norm_var": 0.10120035807291666, "learning_rate": 0.0001, "loss": 6.4598, "loss/crossentropy": 2.763832688331604, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20474957674741745, "step": 7404 }, { "epoch": 0.2314375, "grad_norm": 3.484375, "grad_norm_var": 0.1046783447265625, "learning_rate": 0.0001, "loss": 6.0386, "loss/crossentropy": 2.639525532722473, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1840517669916153, "step": 7406 }, { "epoch": 0.2315, "grad_norm": 4.09375, "grad_norm_var": 0.12578837076822916, "learning_rate": 0.0001, "loss": 6.2328, "loss/crossentropy": 2.7072668075561523, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19512664526700974, "step": 7408 }, { "epoch": 0.2315625, "grad_norm": 3.296875, "grad_norm_var": 0.12578837076822916, "learning_rate": 0.0001, "loss": 5.8393, "loss/crossentropy": 2.417103886604309, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1875307485461235, "step": 7410 }, { "epoch": 0.231625, "grad_norm": 3.515625, "grad_norm_var": 0.12415364583333334, "learning_rate": 0.0001, "loss": 6.296, "loss/crossentropy": 2.6419787406921387, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2025139182806015, "step": 7412 }, { "epoch": 0.2316875, "grad_norm": 3.5, "grad_norm_var": 0.12319234212239584, "learning_rate": 0.0001, "loss": 5.9212, "loss/crossentropy": 2.5086781978607178, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1857873946428299, "step": 7414 }, { "epoch": 0.23175, "grad_norm": 3.625, "grad_norm_var": 0.12553609212239583, "learning_rate": 0.0001, "loss": 6.3267, "loss/crossentropy": 2.7113449573516846, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19669626653194427, "step": 7416 }, { "epoch": 0.2318125, "grad_norm": 3.625, "grad_norm_var": 0.14147135416666667, "learning_rate": 0.0001, "loss": 6.3766, "loss/crossentropy": 2.7021708488464355, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2061111181974411, "step": 7418 }, { "epoch": 0.231875, "grad_norm": 3.4375, "grad_norm_var": 0.09260965983072916, "learning_rate": 0.0001, "loss": 5.9801, "loss/crossentropy": 2.5316959619522095, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1854703575372696, "step": 7420 }, { "epoch": 0.2319375, "grad_norm": 3.421875, "grad_norm_var": 0.10178934733072917, "learning_rate": 0.0001, "loss": 5.8133, "loss/crossentropy": 2.450483798980713, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18081633746623993, "step": 7422 }, { "epoch": 0.232, "grad_norm": 3.65625, "grad_norm_var": 0.08230692545572917, "learning_rate": 0.0001, "loss": 5.9437, "loss/crossentropy": 2.411531090736389, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19189123809337616, "step": 7424 }, { "epoch": 0.2320625, "grad_norm": 3.71875, "grad_norm_var": 0.07639058430989583, "learning_rate": 0.0001, "loss": 5.9129, "loss/crossentropy": 2.5023571252822876, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18285023421049118, "step": 7426 }, { "epoch": 0.232125, "grad_norm": 3.71875, "grad_norm_var": 0.12104390462239584, "learning_rate": 0.0001, "loss": 6.1384, "loss/crossentropy": 2.497291088104248, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.2043498232960701, "step": 7428 }, { "epoch": 0.2321875, "grad_norm": 3.359375, "grad_norm_var": 0.10891927083333333, "learning_rate": 0.0001, "loss": 6.2714, "loss/crossentropy": 2.755240559577942, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19185104221105576, "step": 7430 }, { "epoch": 0.23225, "grad_norm": 3.6875, "grad_norm_var": 0.09778645833333334, "learning_rate": 0.0001, "loss": 6.3849, "loss/crossentropy": 2.709605097770691, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20268186926841736, "step": 7432 }, { "epoch": 0.2323125, "grad_norm": 3.5, "grad_norm_var": 0.0864166259765625, "learning_rate": 0.0001, "loss": 5.7756, "loss/crossentropy": 2.337099075317383, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18994461745023727, "step": 7434 }, { "epoch": 0.232375, "grad_norm": 3.9375, "grad_norm_var": 0.09278055826822916, "learning_rate": 0.0001, "loss": 6.1802, "loss/crossentropy": 2.614351749420166, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19760002940893173, "step": 7436 }, { "epoch": 0.2324375, "grad_norm": 3.5, "grad_norm_var": 0.07560933430989583, "learning_rate": 0.0001, "loss": 6.6144, "loss/crossentropy": 2.9351630210876465, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20620250701904297, "step": 7438 }, { "epoch": 0.2325, "grad_norm": 3.375, "grad_norm_var": 0.07434488932291666, "learning_rate": 0.0001, "loss": 6.2547, "loss/crossentropy": 2.697936177253723, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1986447423696518, "step": 7440 }, { "epoch": 0.2325625, "grad_norm": 3.78125, "grad_norm_var": 0.081005859375, "learning_rate": 0.0001, "loss": 6.3051, "loss/crossentropy": 2.68693745136261, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20361657440662384, "step": 7442 }, { "epoch": 0.232625, "grad_norm": 3.59375, "grad_norm_var": 0.04145406087239583, "learning_rate": 0.0001, "loss": 5.9403, "loss/crossentropy": 2.4055399894714355, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19800888746976852, "step": 7444 }, { "epoch": 0.2326875, "grad_norm": 3.171875, "grad_norm_var": 0.04772135416666667, "learning_rate": 0.0001, "loss": 6.2272, "loss/crossentropy": 2.7289352416992188, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19162369519472122, "step": 7446 }, { "epoch": 0.23275, "grad_norm": 3.515625, "grad_norm_var": 0.04728902180989583, "learning_rate": 0.0001, "loss": 6.1197, "loss/crossentropy": 2.5941803455352783, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19629882276058197, "step": 7448 }, { "epoch": 0.2328125, "grad_norm": 3.65625, "grad_norm_var": 0.04666341145833333, "learning_rate": 0.0001, "loss": 6.1391, "loss/crossentropy": 2.5506038665771484, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19713414460420609, "step": 7450 }, { "epoch": 0.232875, "grad_norm": 3.265625, "grad_norm_var": 0.0388336181640625, "learning_rate": 0.0001, "loss": 6.0142, "loss/crossentropy": 2.5255450010299683, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19105535745620728, "step": 7452 }, { "epoch": 0.2329375, "grad_norm": 3.59375, "grad_norm_var": 0.04072265625, "learning_rate": 0.0001, "loss": 6.4106, "loss/crossentropy": 2.724580407142639, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20023799687623978, "step": 7454 }, { "epoch": 0.233, "grad_norm": 3.109375, "grad_norm_var": 0.0539947509765625, "learning_rate": 0.0001, "loss": 5.9726, "loss/crossentropy": 2.5376689434051514, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18997538089752197, "step": 7456 }, { "epoch": 0.2330625, "grad_norm": 3.21875, "grad_norm_var": 0.04094645182291667, "learning_rate": 0.0001, "loss": 5.9383, "loss/crossentropy": 2.5750547647476196, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18320411443710327, "step": 7458 }, { "epoch": 0.233125, "grad_norm": 3.28125, "grad_norm_var": 0.0402740478515625, "learning_rate": 0.0001, "loss": 5.826, "loss/crossentropy": 2.477864980697632, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17816954106092453, "step": 7460 }, { "epoch": 0.2331875, "grad_norm": 3.359375, "grad_norm_var": 0.03504231770833333, "learning_rate": 0.0001, "loss": 5.7815, "loss/crossentropy": 2.437563419342041, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1797105148434639, "step": 7462 }, { "epoch": 0.23325, "grad_norm": 3.46875, "grad_norm_var": 0.03479817708333333, "learning_rate": 0.0001, "loss": 6.4018, "loss/crossentropy": 2.7117398977279663, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2080647200345993, "step": 7464 }, { "epoch": 0.2333125, "grad_norm": 3.765625, "grad_norm_var": 0.03961181640625, "learning_rate": 0.0001, "loss": 6.1329, "loss/crossentropy": 2.7160139083862305, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1838773414492607, "step": 7466 }, { "epoch": 0.233375, "grad_norm": 3.203125, "grad_norm_var": 0.04114583333333333, "learning_rate": 0.0001, "loss": 6.0136, "loss/crossentropy": 2.598839044570923, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18405775725841522, "step": 7468 }, { "epoch": 0.2334375, "grad_norm": 3.203125, "grad_norm_var": 0.03804423014322917, "learning_rate": 0.0001, "loss": 6.104, "loss/crossentropy": 2.6647852659225464, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18571925908327103, "step": 7470 }, { "epoch": 0.2335, "grad_norm": 3.15625, "grad_norm_var": 0.0279693603515625, "learning_rate": 0.0001, "loss": 5.6951, "loss/crossentropy": 2.384377956390381, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17794862389564514, "step": 7472 }, { "epoch": 0.2335625, "grad_norm": 3.5, "grad_norm_var": 0.028156534830729166, "learning_rate": 0.0001, "loss": 5.9524, "loss/crossentropy": 2.49960196018219, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18785890191793442, "step": 7474 }, { "epoch": 0.233625, "grad_norm": 3.546875, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 6.1427, "loss/crossentropy": 2.557058572769165, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19762682914733887, "step": 7476 }, { "epoch": 0.2336875, "grad_norm": 3.40625, "grad_norm_var": 0.03368733723958333, "learning_rate": 0.0001, "loss": 6.0381, "loss/crossentropy": 2.534654378890991, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19487891346216202, "step": 7478 }, { "epoch": 0.23375, "grad_norm": 3.359375, "grad_norm_var": 0.036188761393229164, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.4826525449752808, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18162458389997482, "step": 7480 }, { "epoch": 0.2338125, "grad_norm": 3.65625, "grad_norm_var": 0.033600870768229166, "learning_rate": 0.0001, "loss": 5.8905, "loss/crossentropy": 2.4829264879226685, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1868496686220169, "step": 7482 }, { "epoch": 0.233875, "grad_norm": 3.390625, "grad_norm_var": 0.04011942545572917, "learning_rate": 0.0001, "loss": 6.1066, "loss/crossentropy": 2.5453397035598755, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19479379057884216, "step": 7484 }, { "epoch": 0.2339375, "grad_norm": 3.46875, "grad_norm_var": 0.043229166666666666, "learning_rate": 0.0001, "loss": 5.7595, "loss/crossentropy": 2.438984513282776, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17931944131851196, "step": 7486 }, { "epoch": 0.234, "grad_norm": 3.625, "grad_norm_var": 0.04342447916666667, "learning_rate": 0.0001, "loss": 6.3522, "loss/crossentropy": 2.6873581409454346, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2055514082312584, "step": 7488 }, { "epoch": 0.2340625, "grad_norm": 3.40625, "grad_norm_var": 0.040848795572916666, "learning_rate": 0.0001, "loss": 6.0623, "loss/crossentropy": 2.6052039861679077, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18516145646572113, "step": 7490 }, { "epoch": 0.234125, "grad_norm": 3.390625, "grad_norm_var": 0.04080403645833333, "learning_rate": 0.0001, "loss": 6.2604, "loss/crossentropy": 2.753083825111389, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19213388115167618, "step": 7492 }, { "epoch": 0.2341875, "grad_norm": 3.625, "grad_norm_var": 0.03994038899739583, "learning_rate": 0.0001, "loss": 6.1461, "loss/crossentropy": 2.5053790807724, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.19688421487808228, "step": 7494 }, { "epoch": 0.23425, "grad_norm": 3.25, "grad_norm_var": 0.0356109619140625, "learning_rate": 0.0001, "loss": 6.1289, "loss/crossentropy": 2.672441601753235, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18939807265996933, "step": 7496 }, { "epoch": 0.2343125, "grad_norm": 3.25, "grad_norm_var": 0.0393218994140625, "learning_rate": 0.0001, "loss": 5.8123, "loss/crossentropy": 2.453073740005493, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1788911297917366, "step": 7498 }, { "epoch": 0.234375, "grad_norm": 3.234375, "grad_norm_var": 0.0416412353515625, "learning_rate": 0.0001, "loss": 6.2689, "loss/crossentropy": 2.692493438720703, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.198652982711792, "step": 7500 }, { "epoch": 0.2344375, "grad_norm": 3.21875, "grad_norm_var": 0.037158203125, "learning_rate": 0.0001, "loss": 6.0308, "loss/crossentropy": 2.57742440700531, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18869930505752563, "step": 7502 }, { "epoch": 0.2345, "grad_norm": 3.28125, "grad_norm_var": 0.030973307291666665, "learning_rate": 0.0001, "loss": 5.7385, "loss/crossentropy": 2.3825695514678955, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17816990613937378, "step": 7504 }, { "epoch": 0.2345625, "grad_norm": 3.515625, "grad_norm_var": 0.08776041666666666, "learning_rate": 0.0001, "loss": 6.2344, "loss/crossentropy": 2.630827307701111, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19785429537296295, "step": 7506 }, { "epoch": 0.234625, "grad_norm": 3.5625, "grad_norm_var": 0.1081207275390625, "learning_rate": 0.0001, "loss": 5.9919, "loss/crossentropy": 2.440385937690735, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19225788116455078, "step": 7508 }, { "epoch": 0.2346875, "grad_norm": 3.484375, "grad_norm_var": 0.10650634765625, "learning_rate": 0.0001, "loss": 5.8651, "loss/crossentropy": 2.441224455833435, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1896503046154976, "step": 7510 }, { "epoch": 0.23475, "grad_norm": 3.65625, "grad_norm_var": 0.10657145182291666, "learning_rate": 0.0001, "loss": 6.3991, "loss/crossentropy": 2.7317874431610107, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2073518931865692, "step": 7512 }, { "epoch": 0.2348125, "grad_norm": 3.453125, "grad_norm_var": 0.09326070149739583, "learning_rate": 0.0001, "loss": 6.1689, "loss/crossentropy": 2.6027718782424927, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1956755369901657, "step": 7514 }, { "epoch": 0.234875, "grad_norm": 3.546875, "grad_norm_var": 0.08765869140625, "learning_rate": 0.0001, "loss": 6.0401, "loss/crossentropy": 2.4928770065307617, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19182778894901276, "step": 7516 }, { "epoch": 0.2349375, "grad_norm": 3.390625, "grad_norm_var": 0.07895406087239583, "learning_rate": 0.0001, "loss": 6.2329, "loss/crossentropy": 2.7454384565353394, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19132785499095917, "step": 7518 }, { "epoch": 0.235, "grad_norm": 3.40625, "grad_norm_var": 0.07144775390625, "learning_rate": 0.0001, "loss": 5.9715, "loss/crossentropy": 2.5534324645996094, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18360119313001633, "step": 7520 }, { "epoch": 0.2350625, "grad_norm": 3.5, "grad_norm_var": 0.032933553059895836, "learning_rate": 0.0001, "loss": 6.2586, "loss/crossentropy": 2.6766849756240845, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.2007722333073616, "step": 7522 }, { "epoch": 0.235125, "grad_norm": 3.40625, "grad_norm_var": 0.0172271728515625, "learning_rate": 0.0001, "loss": 6.083, "loss/crossentropy": 2.5798628330230713, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18898801505565643, "step": 7524 }, { "epoch": 0.2351875, "grad_norm": 3.765625, "grad_norm_var": 0.021370442708333333, "learning_rate": 0.0001, "loss": 6.1985, "loss/crossentropy": 2.5955700874328613, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20208538323640823, "step": 7526 }, { "epoch": 0.23525, "grad_norm": 3.359375, "grad_norm_var": 0.023176066080729165, "learning_rate": 0.0001, "loss": 6.181, "loss/crossentropy": 2.667165517807007, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19552771747112274, "step": 7528 }, { "epoch": 0.2353125, "grad_norm": 3.234375, "grad_norm_var": 0.03821614583333333, "learning_rate": 0.0001, "loss": 5.8642, "loss/crossentropy": 2.4298404455184937, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18601436913013458, "step": 7530 }, { "epoch": 0.235375, "grad_norm": 3.78125, "grad_norm_var": 0.040327962239583334, "learning_rate": 0.0001, "loss": 6.4814, "loss/crossentropy": 2.8501453399658203, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.19828537851572037, "step": 7532 }, { "epoch": 0.2354375, "grad_norm": 3.5625, "grad_norm_var": 0.04738667805989583, "learning_rate": 0.0001, "loss": 6.0067, "loss/crossentropy": 2.5832111835479736, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18453294783830643, "step": 7534 }, { "epoch": 0.2355, "grad_norm": 3.25, "grad_norm_var": 0.04707743326822917, "learning_rate": 0.0001, "loss": 6.0754, "loss/crossentropy": 2.593501329421997, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19154705107212067, "step": 7536 }, { "epoch": 0.2355625, "grad_norm": 3.625, "grad_norm_var": 0.04840494791666667, "learning_rate": 0.0001, "loss": 6.1885, "loss/crossentropy": 2.6841200590133667, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1941901594400406, "step": 7538 }, { "epoch": 0.235625, "grad_norm": 3.6875, "grad_norm_var": 0.050226847330729164, "learning_rate": 0.0001, "loss": 5.701, "loss/crossentropy": 2.362775444984436, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17288699746131897, "step": 7540 }, { "epoch": 0.2356875, "grad_norm": 3.546875, "grad_norm_var": 0.046418253580729166, "learning_rate": 0.0001, "loss": 5.9492, "loss/crossentropy": 2.4750373363494873, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18921273946762085, "step": 7542 }, { "epoch": 0.23575, "grad_norm": 3.09375, "grad_norm_var": 0.05595601399739583, "learning_rate": 0.0001, "loss": 5.785, "loss/crossentropy": 2.453348994255066, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17652620375156403, "step": 7544 }, { "epoch": 0.2358125, "grad_norm": 5.84375, "grad_norm_var": 0.39527994791666665, "learning_rate": 0.0001, "loss": 5.9636, "loss/crossentropy": 2.440808653831482, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19055798649787903, "step": 7546 }, { "epoch": 0.235875, "grad_norm": 3.5625, "grad_norm_var": 0.3976715087890625, "learning_rate": 0.0001, "loss": 5.9528, "loss/crossentropy": 2.4879361391067505, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1886746659874916, "step": 7548 }, { "epoch": 0.2359375, "grad_norm": 3.390625, "grad_norm_var": 0.38782145182291666, "learning_rate": 0.0001, "loss": 6.2643, "loss/crossentropy": 2.6960970163345337, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19666466116905212, "step": 7550 }, { "epoch": 0.236, "grad_norm": 3.1875, "grad_norm_var": 0.3906402587890625, "learning_rate": 0.0001, "loss": 5.9808, "loss/crossentropy": 2.5641645193099976, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18463445454835892, "step": 7552 }, { "epoch": 0.2360625, "grad_norm": 3.9375, "grad_norm_var": 0.4043121337890625, "learning_rate": 0.0001, "loss": 6.465, "loss/crossentropy": 2.766203284263611, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20581817626953125, "step": 7554 }, { "epoch": 0.236125, "grad_norm": 3.609375, "grad_norm_var": 0.4017405192057292, "learning_rate": 0.0001, "loss": 6.0331, "loss/crossentropy": 2.4767733812332153, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19821280241012573, "step": 7556 }, { "epoch": 0.2361875, "grad_norm": 3.40625, "grad_norm_var": 0.4049763997395833, "learning_rate": 0.0001, "loss": 6.0414, "loss/crossentropy": 2.5478110313415527, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18841978907585144, "step": 7558 }, { "epoch": 0.23625, "grad_norm": 3.296875, "grad_norm_var": 0.3839670817057292, "learning_rate": 0.0001, "loss": 5.8884, "loss/crossentropy": 2.4817367792129517, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.17855586111545563, "step": 7560 }, { "epoch": 0.2363125, "grad_norm": 3.5625, "grad_norm_var": 0.03857014973958333, "learning_rate": 0.0001, "loss": 5.6971, "loss/crossentropy": 2.261039435863495, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18852870911359787, "step": 7562 }, { "epoch": 0.236375, "grad_norm": 3.171875, "grad_norm_var": 0.0419830322265625, "learning_rate": 0.0001, "loss": 5.9997, "loss/crossentropy": 2.6095385551452637, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18550091981887817, "step": 7564 }, { "epoch": 0.2364375, "grad_norm": 3.4375, "grad_norm_var": 0.0416015625, "learning_rate": 0.0001, "loss": 6.0966, "loss/crossentropy": 2.6820207834243774, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18715624511241913, "step": 7566 }, { "epoch": 0.2365, "grad_norm": 3.59375, "grad_norm_var": 0.043294270833333336, "learning_rate": 0.0001, "loss": 6.2628, "loss/crossentropy": 2.7443941831588745, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19442252069711685, "step": 7568 }, { "epoch": 0.2365625, "grad_norm": 3.53125, "grad_norm_var": 0.027440388997395832, "learning_rate": 0.0001, "loss": 6.3244, "loss/crossentropy": 2.7657259702682495, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19532336294651031, "step": 7570 }, { "epoch": 0.236625, "grad_norm": 3.15625, "grad_norm_var": 0.025423177083333335, "learning_rate": 0.0001, "loss": 5.5643, "loss/crossentropy": 2.2630890607833862, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1738731563091278, "step": 7572 }, { "epoch": 0.2366875, "grad_norm": 3.625, "grad_norm_var": 0.028180948893229165, "learning_rate": 0.0001, "loss": 6.4293, "loss/crossentropy": 2.828166961669922, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.2015225887298584, "step": 7574 }, { "epoch": 0.23675, "grad_norm": 3.265625, "grad_norm_var": 0.024153645833333334, "learning_rate": 0.0001, "loss": 5.8275, "loss/crossentropy": 2.4868907928466797, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18054601550102234, "step": 7576 }, { "epoch": 0.2368125, "grad_norm": 4.34375, "grad_norm_var": 0.08116861979166666, "learning_rate": 0.0001, "loss": 6.1035, "loss/crossentropy": 2.4918447732925415, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.1963195875287056, "step": 7578 }, { "epoch": 0.236875, "grad_norm": 3.734375, "grad_norm_var": 0.08313802083333334, "learning_rate": 0.0001, "loss": 5.715, "loss/crossentropy": 2.3960211277008057, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1799403429031372, "step": 7580 }, { "epoch": 0.2369375, "grad_norm": 3.28125, "grad_norm_var": 0.08834228515625, "learning_rate": 0.0001, "loss": 5.9184, "loss/crossentropy": 2.5109978914260864, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18644145131111145, "step": 7582 }, { "epoch": 0.237, "grad_norm": 3.109375, "grad_norm_var": 0.0928619384765625, "learning_rate": 0.0001, "loss": 5.6529, "loss/crossentropy": 2.3433854579925537, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1770445704460144, "step": 7584 }, { "epoch": 0.2370625, "grad_norm": 3.609375, "grad_norm_var": 0.09990234375, "learning_rate": 0.0001, "loss": 6.3634, "loss/crossentropy": 2.692499876022339, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20458614826202393, "step": 7586 }, { "epoch": 0.237125, "grad_norm": 3.625, "grad_norm_var": 0.10044657389322917, "learning_rate": 0.0001, "loss": 6.2058, "loss/crossentropy": 2.702503204345703, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1917399913072586, "step": 7588 }, { "epoch": 0.2371875, "grad_norm": 3.75, "grad_norm_var": 0.10458882649739583, "learning_rate": 0.0001, "loss": 6.1442, "loss/crossentropy": 2.5761271715164185, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19469749182462692, "step": 7590 }, { "epoch": 0.23725, "grad_norm": 3.78125, "grad_norm_var": 0.10366109212239584, "learning_rate": 0.0001, "loss": 6.0064, "loss/crossentropy": 2.5359818935394287, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1907963976264, "step": 7592 }, { "epoch": 0.2373125, "grad_norm": 3.390625, "grad_norm_var": 0.055475870768229164, "learning_rate": 0.0001, "loss": 6.1073, "loss/crossentropy": 2.6080822944641113, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1932821273803711, "step": 7594 }, { "epoch": 0.237375, "grad_norm": 3.4375, "grad_norm_var": 0.048371378580729166, "learning_rate": 0.0001, "loss": 5.8831, "loss/crossentropy": 2.4336295127868652, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18713627755641937, "step": 7596 }, { "epoch": 0.2374375, "grad_norm": 3.296875, "grad_norm_var": 0.04436442057291667, "learning_rate": 0.0001, "loss": 5.7833, "loss/crossentropy": 2.2970622777938843, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1935415342450142, "step": 7598 }, { "epoch": 0.2375, "grad_norm": 3.390625, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 5.8934, "loss/crossentropy": 2.5146186351776123, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18475797772407532, "step": 7600 }, { "epoch": 0.2375625, "grad_norm": 3.59375, "grad_norm_var": 0.03154195149739583, "learning_rate": 0.0001, "loss": 6.3939, "loss/crossentropy": 2.7156718969345093, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20805390179157257, "step": 7602 }, { "epoch": 0.237625, "grad_norm": 3.546875, "grad_norm_var": 0.0245025634765625, "learning_rate": 0.0001, "loss": 6.2358, "loss/crossentropy": 2.6067934036254883, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20508348941802979, "step": 7604 }, { "epoch": 0.2376875, "grad_norm": 3.71875, "grad_norm_var": 0.0274566650390625, "learning_rate": 0.0001, "loss": 6.0435, "loss/crossentropy": 2.4407546520233154, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19855981320142746, "step": 7606 }, { "epoch": 0.23775, "grad_norm": 3.34375, "grad_norm_var": 0.0262115478515625, "learning_rate": 0.0001, "loss": 6.3214, "loss/crossentropy": 2.7076494693756104, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1996530145406723, "step": 7608 }, { "epoch": 0.2378125, "grad_norm": 3.5, "grad_norm_var": 0.028197224934895834, "learning_rate": 0.0001, "loss": 6.2234, "loss/crossentropy": 2.6549901962280273, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1978522315621376, "step": 7610 }, { "epoch": 0.237875, "grad_norm": 3.421875, "grad_norm_var": 0.0282135009765625, "learning_rate": 0.0001, "loss": 6.0882, "loss/crossentropy": 2.6244239807128906, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1920793578028679, "step": 7612 }, { "epoch": 0.2379375, "grad_norm": 4.75, "grad_norm_var": 0.11799723307291667, "learning_rate": 0.0001, "loss": 6.0148, "loss/crossentropy": 2.5746607780456543, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1901114359498024, "step": 7614 }, { "epoch": 0.238, "grad_norm": 3.4375, "grad_norm_var": 0.11655985514322917, "learning_rate": 0.0001, "loss": 5.8119, "loss/crossentropy": 2.3741531372070312, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1875242292881012, "step": 7616 }, { "epoch": 0.2380625, "grad_norm": 3.265625, "grad_norm_var": 0.1244293212890625, "learning_rate": 0.0001, "loss": 6.1934, "loss/crossentropy": 2.69778573513031, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19097129255533218, "step": 7618 }, { "epoch": 0.238125, "grad_norm": 3.578125, "grad_norm_var": 0.12808329264322918, "learning_rate": 0.0001, "loss": 6.1297, "loss/crossentropy": 2.5959811210632324, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19751714169979095, "step": 7620 }, { "epoch": 0.2381875, "grad_norm": 5.75, "grad_norm_var": 0.43488667805989584, "learning_rate": 0.0001, "loss": 6.3296, "loss/crossentropy": 2.6300052404403687, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.20628874748945236, "step": 7622 }, { "epoch": 0.23825, "grad_norm": 3.390625, "grad_norm_var": 0.4568430582682292, "learning_rate": 0.0001, "loss": 5.5186, "loss/crossentropy": 2.2347458600997925, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17096569389104843, "step": 7624 }, { "epoch": 0.2383125, "grad_norm": 3.8125, "grad_norm_var": 0.4613596598307292, "learning_rate": 0.0001, "loss": 6.3385, "loss/crossentropy": 2.8292057514190674, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18921514600515366, "step": 7626 }, { "epoch": 0.238375, "grad_norm": 3.359375, "grad_norm_var": 0.46288655598958334, "learning_rate": 0.0001, "loss": 5.9493, "loss/crossentropy": 2.499138593673706, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18915607780218124, "step": 7628 }, { "epoch": 0.2384375, "grad_norm": 3.453125, "grad_norm_var": 0.5111480712890625, "learning_rate": 0.0001, "loss": 6.0143, "loss/crossentropy": 2.4007151126861572, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.19572950899600983, "step": 7630 }, { "epoch": 0.2385, "grad_norm": 3.1875, "grad_norm_var": 0.5228342692057292, "learning_rate": 0.0001, "loss": 6.1161, "loss/crossentropy": 2.6703180074691772, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1902800351381302, "step": 7632 }, { "epoch": 0.2385625, "grad_norm": 3.921875, "grad_norm_var": 0.539404296875, "learning_rate": 0.0001, "loss": 5.9397, "loss/crossentropy": 2.48610520362854, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18442490696907043, "step": 7634 }, { "epoch": 0.238625, "grad_norm": 3.90625, "grad_norm_var": 0.5344034830729166, "learning_rate": 0.0001, "loss": 6.1724, "loss/crossentropy": 2.52171790599823, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20334511995315552, "step": 7636 }, { "epoch": 0.2386875, "grad_norm": 3.515625, "grad_norm_var": 0.23290608723958334, "learning_rate": 0.0001, "loss": 5.7379, "loss/crossentropy": 2.3531194925308228, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18340447545051575, "step": 7638 }, { "epoch": 0.23875, "grad_norm": 3.453125, "grad_norm_var": 0.21437886555989583, "learning_rate": 0.0001, "loss": 6.1048, "loss/crossentropy": 2.5537298917770386, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19456487894058228, "step": 7640 }, { "epoch": 0.2388125, "grad_norm": 3.3125, "grad_norm_var": 0.21537984212239583, "learning_rate": 0.0001, "loss": 6.0609, "loss/crossentropy": 2.5692453384399414, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19096451997756958, "step": 7642 }, { "epoch": 0.238875, "grad_norm": 3.40625, "grad_norm_var": 0.21619466145833333, "learning_rate": 0.0001, "loss": 6.1539, "loss/crossentropy": 2.62418270111084, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19672146439552307, "step": 7644 }, { "epoch": 0.2389375, "grad_norm": 3.421875, "grad_norm_var": 0.057450358072916666, "learning_rate": 0.0001, "loss": 6.0482, "loss/crossentropy": 2.5824190378189087, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19149936735630035, "step": 7646 }, { "epoch": 0.239, "grad_norm": 3.078125, "grad_norm_var": 0.06647847493489584, "learning_rate": 0.0001, "loss": 5.9723, "loss/crossentropy": 2.5797115564346313, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17832079529762268, "step": 7648 }, { "epoch": 0.2390625, "grad_norm": 3.4375, "grad_norm_var": 0.04267171223958333, "learning_rate": 0.0001, "loss": 6.286, "loss/crossentropy": 2.747970700263977, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1936463937163353, "step": 7650 }, { "epoch": 0.239125, "grad_norm": 3.5625, "grad_norm_var": 0.030777994791666666, "learning_rate": 0.0001, "loss": 5.9269, "loss/crossentropy": 2.3935400247573853, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19278503954410553, "step": 7652 }, { "epoch": 0.2391875, "grad_norm": 3.796875, "grad_norm_var": 0.0376861572265625, "learning_rate": 0.0001, "loss": 6.1596, "loss/crossentropy": 2.6391823291778564, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19462397694587708, "step": 7654 }, { "epoch": 0.23925, "grad_norm": 3.4375, "grad_norm_var": 0.0478668212890625, "learning_rate": 0.0001, "loss": 5.9814, "loss/crossentropy": 2.4334522485733032, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19463550299406052, "step": 7656 }, { "epoch": 0.2393125, "grad_norm": 4.0625, "grad_norm_var": 0.06364644368489583, "learning_rate": 0.0001, "loss": 6.2801, "loss/crossentropy": 2.685059428215027, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.197001650929451, "step": 7658 }, { "epoch": 0.239375, "grad_norm": 3.140625, "grad_norm_var": 0.07219950358072917, "learning_rate": 0.0001, "loss": 6.1138, "loss/crossentropy": 2.6333361864089966, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19102010130882263, "step": 7660 }, { "epoch": 0.2394375, "grad_norm": 3.5, "grad_norm_var": 0.06955464680989583, "learning_rate": 0.0001, "loss": 6.0346, "loss/crossentropy": 2.51070237159729, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19575046002864838, "step": 7662 }, { "epoch": 0.2395, "grad_norm": 3.359375, "grad_norm_var": 0.05730794270833333, "learning_rate": 0.0001, "loss": 6.019, "loss/crossentropy": 2.5976483821868896, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1893959939479828, "step": 7664 }, { "epoch": 0.2395625, "grad_norm": 3.21875, "grad_norm_var": 0.06433919270833334, "learning_rate": 0.0001, "loss": 5.9062, "loss/crossentropy": 2.5433908700942993, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1796429306268692, "step": 7666 }, { "epoch": 0.239625, "grad_norm": 3.375, "grad_norm_var": 0.0742095947265625, "learning_rate": 0.0001, "loss": 6.44, "loss/crossentropy": 2.791746497154236, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.2038840353488922, "step": 7668 }, { "epoch": 0.2396875, "grad_norm": 3.59375, "grad_norm_var": 0.090673828125, "learning_rate": 0.0001, "loss": 5.9489, "loss/crossentropy": 2.4165494441986084, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19034140557050705, "step": 7670 }, { "epoch": 0.23975, "grad_norm": 3.5, "grad_norm_var": 0.0778961181640625, "learning_rate": 0.0001, "loss": 6.1395, "loss/crossentropy": 2.613227963447571, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18934263288974762, "step": 7672 }, { "epoch": 0.2398125, "grad_norm": 4.09375, "grad_norm_var": 0.08218994140625, "learning_rate": 0.0001, "loss": 6.4134, "loss/crossentropy": 2.7667269706726074, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.2033383771777153, "step": 7674 }, { "epoch": 0.239875, "grad_norm": 3.5, "grad_norm_var": 0.08207906087239583, "learning_rate": 0.0001, "loss": 5.8618, "loss/crossentropy": 2.4750369787216187, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17969341576099396, "step": 7676 }, { "epoch": 0.2399375, "grad_norm": 3.375, "grad_norm_var": 0.08401285807291667, "learning_rate": 0.0001, "loss": 6.0187, "loss/crossentropy": 2.508861541748047, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19200244545936584, "step": 7678 }, { "epoch": 0.24, "grad_norm": 3.328125, "grad_norm_var": 0.08294270833333334, "learning_rate": 0.0001, "loss": 6.365, "loss/crossentropy": 2.7623302936553955, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19815552234649658, "step": 7680 }, { "epoch": 0.2400625, "grad_norm": 3.625, "grad_norm_var": 0.07265523274739584, "learning_rate": 0.0001, "loss": 6.0794, "loss/crossentropy": 2.5649800300598145, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1936321035027504, "step": 7682 }, { "epoch": 0.240125, "grad_norm": 3.40625, "grad_norm_var": 0.07997945149739584, "learning_rate": 0.0001, "loss": 5.8903, "loss/crossentropy": 2.523725152015686, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18274738639593124, "step": 7684 }, { "epoch": 0.2401875, "grad_norm": 3.5, "grad_norm_var": 0.05571187337239583, "learning_rate": 0.0001, "loss": 6.3296, "loss/crossentropy": 2.7412075996398926, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19907841831445694, "step": 7686 }, { "epoch": 0.24025, "grad_norm": 3.828125, "grad_norm_var": 0.06308186848958333, "learning_rate": 0.0001, "loss": 6.2066, "loss/crossentropy": 2.6145856380462646, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20021675527095795, "step": 7688 }, { "epoch": 0.2403125, "grad_norm": 3.265625, "grad_norm_var": 0.03584696451822917, "learning_rate": 0.0001, "loss": 6.1771, "loss/crossentropy": 2.6702972650527954, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19169631600379944, "step": 7690 }, { "epoch": 0.240375, "grad_norm": 3.71875, "grad_norm_var": 0.03361714680989583, "learning_rate": 0.0001, "loss": 6.3533, "loss/crossentropy": 2.7222691774368286, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20255489647388458, "step": 7692 }, { "epoch": 0.2404375, "grad_norm": 3.1875, "grad_norm_var": 0.0396636962890625, "learning_rate": 0.0001, "loss": 5.9519, "loss/crossentropy": 2.548743963241577, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1848517507314682, "step": 7694 }, { "epoch": 0.2405, "grad_norm": 3.546875, "grad_norm_var": 0.04666341145833333, "learning_rate": 0.0001, "loss": 6.2107, "loss/crossentropy": 2.63839328289032, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19668738543987274, "step": 7696 }, { "epoch": 0.2405625, "grad_norm": 3.9375, "grad_norm_var": 0.05812886555989583, "learning_rate": 0.0001, "loss": 6.2819, "loss/crossentropy": 2.62613844871521, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20737700909376144, "step": 7698 }, { "epoch": 0.240625, "grad_norm": 4.03125, "grad_norm_var": 0.059056599934895836, "learning_rate": 0.0001, "loss": 6.3754, "loss/crossentropy": 2.713670492172241, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20484429597854614, "step": 7700 }, { "epoch": 0.2406875, "grad_norm": 3.3125, "grad_norm_var": 0.060595703125, "learning_rate": 0.0001, "loss": 6.2529, "loss/crossentropy": 2.7103840112686157, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19448387622833252, "step": 7702 }, { "epoch": 0.24075, "grad_norm": 3.328125, "grad_norm_var": 0.06695556640625, "learning_rate": 0.0001, "loss": 5.7694, "loss/crossentropy": 2.4169265031814575, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17977654933929443, "step": 7704 }, { "epoch": 0.2408125, "grad_norm": 4.28125, "grad_norm_var": 0.10196940104166667, "learning_rate": 0.0001, "loss": 6.2524, "loss/crossentropy": 2.4799392223358154, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.21396034210920334, "step": 7706 }, { "epoch": 0.240875, "grad_norm": 3.28125, "grad_norm_var": 0.11464436848958333, "learning_rate": 0.0001, "loss": 5.6104, "loss/crossentropy": 2.3208820819854736, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17504741251468658, "step": 7708 }, { "epoch": 0.2409375, "grad_norm": 3.078125, "grad_norm_var": 0.12100321451822917, "learning_rate": 0.0001, "loss": 5.7849, "loss/crossentropy": 2.4717557430267334, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17740823328495026, "step": 7710 }, { "epoch": 0.241, "grad_norm": 3.46875, "grad_norm_var": 0.12170308430989583, "learning_rate": 0.0001, "loss": 5.9905, "loss/crossentropy": 2.655658006668091, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1795746013522148, "step": 7712 }, { "epoch": 0.2410625, "grad_norm": 41.0, "grad_norm_var": 88.06312561035156, "learning_rate": 0.0001, "loss": 7.0637, "loss/crossentropy": 2.5289785861968994, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.2831588163971901, "step": 7714 }, { "epoch": 0.241125, "grad_norm": 3.890625, "grad_norm_var": 87.94371744791667, "learning_rate": 0.0001, "loss": 6.1867, "loss/crossentropy": 2.650020718574524, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19585387408733368, "step": 7716 }, { "epoch": 0.2411875, "grad_norm": 3.875, "grad_norm_var": 87.73944396972657, "learning_rate": 0.0001, "loss": 6.2797, "loss/crossentropy": 2.657504081726074, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20167139172554016, "step": 7718 }, { "epoch": 0.24125, "grad_norm": 3.625, "grad_norm_var": 87.48702799479166, "learning_rate": 0.0001, "loss": 6.3552, "loss/crossentropy": 2.6477073431015015, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20590269565582275, "step": 7720 }, { "epoch": 0.2413125, "grad_norm": 3.34375, "grad_norm_var": 87.8251454671224, "learning_rate": 0.0001, "loss": 6.2522, "loss/crossentropy": 2.693089723587036, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19692938029766083, "step": 7722 }, { "epoch": 0.241375, "grad_norm": 3.046875, "grad_norm_var": 87.7824208577474, "learning_rate": 0.0001, "loss": 5.8971, "loss/crossentropy": 2.472025990486145, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1870427280664444, "step": 7724 }, { "epoch": 0.2414375, "grad_norm": 3.328125, "grad_norm_var": 87.68208719889323, "learning_rate": 0.0001, "loss": 6.0263, "loss/crossentropy": 2.6007825136184692, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18083418905735016, "step": 7726 }, { "epoch": 0.2415, "grad_norm": 3.359375, "grad_norm_var": 87.63302408854166, "learning_rate": 0.0001, "loss": 6.2469, "loss/crossentropy": 2.689045548439026, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19797423481941223, "step": 7728 }, { "epoch": 0.2415625, "grad_norm": 3.171875, "grad_norm_var": 0.08767903645833333, "learning_rate": 0.0001, "loss": 5.662, "loss/crossentropy": 2.308548331260681, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17558036744594574, "step": 7730 }, { "epoch": 0.241625, "grad_norm": 3.5625, "grad_norm_var": 0.058568318684895836, "learning_rate": 0.0001, "loss": 6.1883, "loss/crossentropy": 2.5779424905776978, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.2024448812007904, "step": 7732 }, { "epoch": 0.2416875, "grad_norm": 3.359375, "grad_norm_var": 0.05073140462239583, "learning_rate": 0.0001, "loss": 6.0548, "loss/crossentropy": 2.5732924938201904, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1883901134133339, "step": 7734 }, { "epoch": 0.24175, "grad_norm": 3.65625, "grad_norm_var": 0.050455729166666664, "learning_rate": 0.0001, "loss": 5.964, "loss/crossentropy": 2.459815263748169, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19221822917461395, "step": 7736 }, { "epoch": 0.2418125, "grad_norm": 3.34375, "grad_norm_var": 0.0482330322265625, "learning_rate": 0.0001, "loss": 6.1748, "loss/crossentropy": 2.7405179738998413, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18718285858631134, "step": 7738 }, { "epoch": 0.241875, "grad_norm": 3.765625, "grad_norm_var": 0.04411519368489583, "learning_rate": 0.0001, "loss": 6.0505, "loss/crossentropy": 2.4628392457962036, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19783060252666473, "step": 7740 }, { "epoch": 0.2419375, "grad_norm": 3.609375, "grad_norm_var": 0.04273173014322917, "learning_rate": 0.0001, "loss": 6.2276, "loss/crossentropy": 2.6094053983688354, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20048777014017105, "step": 7742 }, { "epoch": 0.242, "grad_norm": 3.71875, "grad_norm_var": 0.04364827473958333, "learning_rate": 0.0001, "loss": 6.2317, "loss/crossentropy": 2.5628308057785034, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.2047729715704918, "step": 7744 }, { "epoch": 0.2420625, "grad_norm": 3.5, "grad_norm_var": 0.032486979166666666, "learning_rate": 0.0001, "loss": 6.142, "loss/crossentropy": 2.5770037174224854, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1943896859884262, "step": 7746 }, { "epoch": 0.242125, "grad_norm": 3.1875, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 5.9243, "loss/crossentropy": 2.5422744750976562, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18546995520591736, "step": 7748 }, { "epoch": 0.2421875, "grad_norm": 3.515625, "grad_norm_var": 0.028514607747395834, "learning_rate": 0.0001, "loss": 5.6252, "loss/crossentropy": 2.283834457397461, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17437529563903809, "step": 7750 }, { "epoch": 0.24225, "grad_norm": 3.578125, "grad_norm_var": 0.025634765625, "learning_rate": 0.0001, "loss": 6.4153, "loss/crossentropy": 2.743710160255432, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20465648919343948, "step": 7752 }, { "epoch": 0.2423125, "grad_norm": 3.484375, "grad_norm_var": 0.0237213134765625, "learning_rate": 0.0001, "loss": 6.0075, "loss/crossentropy": 2.5681275129318237, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1876872330904007, "step": 7754 }, { "epoch": 0.242375, "grad_norm": 3.828125, "grad_norm_var": 0.025690714518229168, "learning_rate": 0.0001, "loss": 6.4399, "loss/crossentropy": 2.7721768617630005, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20271114259958267, "step": 7756 }, { "epoch": 0.2424375, "grad_norm": 3.359375, "grad_norm_var": 0.026545206705729168, "learning_rate": 0.0001, "loss": 6.0202, "loss/crossentropy": 2.6353739500045776, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18223249912261963, "step": 7758 }, { "epoch": 0.2425, "grad_norm": 3.203125, "grad_norm_var": 0.026981608072916666, "learning_rate": 0.0001, "loss": 6.1179, "loss/crossentropy": 2.664836049079895, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1863185241818428, "step": 7760 }, { "epoch": 0.2425625, "grad_norm": 3.53125, "grad_norm_var": 0.027958170572916666, "learning_rate": 0.0001, "loss": 6.085, "loss/crossentropy": 2.597185730934143, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19331426173448563, "step": 7762 }, { "epoch": 0.242625, "grad_norm": 3.59375, "grad_norm_var": 0.04345703125, "learning_rate": 0.0001, "loss": 6.1179, "loss/crossentropy": 2.5563639402389526, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19325853884220123, "step": 7764 }, { "epoch": 0.2426875, "grad_norm": 3.25, "grad_norm_var": 0.053343709309895834, "learning_rate": 0.0001, "loss": 5.9444, "loss/crossentropy": 2.4918466806411743, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18510272353887558, "step": 7766 }, { "epoch": 0.24275, "grad_norm": 3.578125, "grad_norm_var": 0.05282796223958333, "learning_rate": 0.0001, "loss": 6.0905, "loss/crossentropy": 2.6600621938705444, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18601295351982117, "step": 7768 }, { "epoch": 0.2428125, "grad_norm": 3.453125, "grad_norm_var": 0.0508697509765625, "learning_rate": 0.0001, "loss": 5.9313, "loss/crossentropy": 2.4641988277435303, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19046224653720856, "step": 7770 }, { "epoch": 0.242875, "grad_norm": 3.484375, "grad_norm_var": 0.0777740478515625, "learning_rate": 0.0001, "loss": 6.0365, "loss/crossentropy": 2.5514219999313354, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18952420353889465, "step": 7772 }, { "epoch": 0.2429375, "grad_norm": 4.0, "grad_norm_var": 0.08321024576822916, "learning_rate": 0.0001, "loss": 6.1001, "loss/crossentropy": 2.650992274284363, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18944354355335236, "step": 7774 }, { "epoch": 0.243, "grad_norm": 3.453125, "grad_norm_var": 0.08057352701822916, "learning_rate": 0.0001, "loss": 6.1831, "loss/crossentropy": 2.6317412853240967, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19302934408187866, "step": 7776 }, { "epoch": 0.2430625, "grad_norm": 3.140625, "grad_norm_var": 0.10263570149739583, "learning_rate": 0.0001, "loss": 5.764, "loss/crossentropy": 2.448301076889038, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17492999881505966, "step": 7778 }, { "epoch": 0.243125, "grad_norm": 3.34375, "grad_norm_var": 0.09371744791666667, "learning_rate": 0.0001, "loss": 6.1999, "loss/crossentropy": 2.648719549179077, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19691617041826248, "step": 7780 }, { "epoch": 0.2431875, "grad_norm": 3.09375, "grad_norm_var": 0.09827473958333334, "learning_rate": 0.0001, "loss": 5.8564, "loss/crossentropy": 2.5009610652923584, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1824200451374054, "step": 7782 }, { "epoch": 0.24325, "grad_norm": 3.53125, "grad_norm_var": 0.10519917805989583, "learning_rate": 0.0001, "loss": 5.8692, "loss/crossentropy": 2.490937113761902, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18235694617033005, "step": 7784 }, { "epoch": 0.2433125, "grad_norm": 3.390625, "grad_norm_var": 0.10624593098958333, "learning_rate": 0.0001, "loss": 5.8675, "loss/crossentropy": 2.485131859779358, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1804227977991104, "step": 7786 }, { "epoch": 0.243375, "grad_norm": 3.515625, "grad_norm_var": 0.0654205322265625, "learning_rate": 0.0001, "loss": 6.1414, "loss/crossentropy": 2.639013171195984, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1885160133242607, "step": 7788 }, { "epoch": 0.2434375, "grad_norm": 3.53125, "grad_norm_var": 0.049046834309895836, "learning_rate": 0.0001, "loss": 5.9523, "loss/crossentropy": 2.5348676443099976, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18275688588619232, "step": 7790 }, { "epoch": 0.2435, "grad_norm": 4.5, "grad_norm_var": 0.11607666015625, "learning_rate": 0.0001, "loss": 5.8492, "loss/crossentropy": 2.330635905265808, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19521702826023102, "step": 7792 }, { "epoch": 0.2435625, "grad_norm": 3.609375, "grad_norm_var": 0.11142476399739583, "learning_rate": 0.0001, "loss": 6.1485, "loss/crossentropy": 2.628572463989258, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19769364595413208, "step": 7794 }, { "epoch": 0.243625, "grad_norm": 3.5625, "grad_norm_var": 0.11405843098958333, "learning_rate": 0.0001, "loss": 5.9074, "loss/crossentropy": 2.4764528274536133, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1891844943165779, "step": 7796 }, { "epoch": 0.2436875, "grad_norm": 3.203125, "grad_norm_var": 0.107421875, "learning_rate": 0.0001, "loss": 6.0289, "loss/crossentropy": 2.5855778455734253, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18730523437261581, "step": 7798 }, { "epoch": 0.24375, "grad_norm": 3.25, "grad_norm_var": 0.1046295166015625, "learning_rate": 0.0001, "loss": 6.4235, "loss/crossentropy": 2.844739079475403, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19888964295387268, "step": 7800 }, { "epoch": 0.2438125, "grad_norm": 3.265625, "grad_norm_var": 0.1085113525390625, "learning_rate": 0.0001, "loss": 5.7583, "loss/crossentropy": 2.369624972343445, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18457041680812836, "step": 7802 }, { "epoch": 0.243875, "grad_norm": 3.328125, "grad_norm_var": 0.119482421875, "learning_rate": 0.0001, "loss": 6.0802, "loss/crossentropy": 2.5831106901168823, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18916666507720947, "step": 7804 }, { "epoch": 0.2439375, "grad_norm": 3.296875, "grad_norm_var": 0.11793619791666667, "learning_rate": 0.0001, "loss": 6.0391, "loss/crossentropy": 2.6238601207733154, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18683332204818726, "step": 7806 }, { "epoch": 0.244, "grad_norm": 3.546875, "grad_norm_var": 0.0329254150390625, "learning_rate": 0.0001, "loss": 5.9501, "loss/crossentropy": 2.5450098514556885, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18465009331703186, "step": 7808 }, { "epoch": 0.2440625, "grad_norm": 3.71875, "grad_norm_var": 0.03805338541666667, "learning_rate": 0.0001, "loss": 6.1418, "loss/crossentropy": 2.5412479639053345, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19911602139472961, "step": 7810 }, { "epoch": 0.244125, "grad_norm": 3.015625, "grad_norm_var": 0.04783426920572917, "learning_rate": 0.0001, "loss": 6.091, "loss/crossentropy": 2.619031071662903, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19134248048067093, "step": 7812 }, { "epoch": 0.2441875, "grad_norm": 3.484375, "grad_norm_var": 0.059798177083333334, "learning_rate": 0.0001, "loss": 6.2009, "loss/crossentropy": 2.6912925243377686, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19471533596515656, "step": 7814 }, { "epoch": 0.24425, "grad_norm": 3.234375, "grad_norm_var": 0.06214090983072917, "learning_rate": 0.0001, "loss": 6.1676, "loss/crossentropy": 2.704859495162964, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19119124859571457, "step": 7816 }, { "epoch": 0.2443125, "grad_norm": 3.359375, "grad_norm_var": 0.06272684733072917, "learning_rate": 0.0001, "loss": 6.2454, "loss/crossentropy": 2.7171125411987305, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1989237368106842, "step": 7818 }, { "epoch": 0.244375, "grad_norm": 3.390625, "grad_norm_var": 0.0551910400390625, "learning_rate": 0.0001, "loss": 5.6902, "loss/crossentropy": 2.326253294944763, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1793610379099846, "step": 7820 }, { "epoch": 0.2444375, "grad_norm": 3.421875, "grad_norm_var": 0.06988525390625, "learning_rate": 0.0001, "loss": 6.2151, "loss/crossentropy": 2.7847646474838257, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18756647408008575, "step": 7822 }, { "epoch": 0.2445, "grad_norm": 3.421875, "grad_norm_var": 0.07024739583333334, "learning_rate": 0.0001, "loss": 5.9772, "loss/crossentropy": 2.487933397293091, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1883789598941803, "step": 7824 }, { "epoch": 0.2445625, "grad_norm": 3.34375, "grad_norm_var": 0.06451822916666666, "learning_rate": 0.0001, "loss": 6.0661, "loss/crossentropy": 2.6811541318893433, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1826338768005371, "step": 7826 }, { "epoch": 0.244625, "grad_norm": 3.640625, "grad_norm_var": 0.05382486979166667, "learning_rate": 0.0001, "loss": 6.3543, "loss/crossentropy": 2.798715114593506, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19578896462917328, "step": 7828 }, { "epoch": 0.2446875, "grad_norm": 3.46875, "grad_norm_var": 0.03632405598958333, "learning_rate": 0.0001, "loss": 5.4193, "loss/crossentropy": 2.237901747226715, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16267180442810059, "step": 7830 }, { "epoch": 0.24475, "grad_norm": 5.375, "grad_norm_var": 0.27489827473958334, "learning_rate": 0.0001, "loss": 6.3888, "loss/crossentropy": 2.6911579370498657, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.2025742381811142, "step": 7832 }, { "epoch": 0.2448125, "grad_norm": 3.359375, "grad_norm_var": 0.26678059895833334, "learning_rate": 0.0001, "loss": 5.8585, "loss/crossentropy": 2.5180585384368896, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1817043051123619, "step": 7834 }, { "epoch": 0.244875, "grad_norm": 3.71875, "grad_norm_var": 0.2519765218098958, "learning_rate": 0.0001, "loss": 6.376, "loss/crossentropy": 2.7115944623947144, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20745471864938736, "step": 7836 }, { "epoch": 0.2449375, "grad_norm": 3.359375, "grad_norm_var": 0.25478515625, "learning_rate": 0.0001, "loss": 6.0173, "loss/crossentropy": 2.4863085746765137, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19684798270463943, "step": 7838 }, { "epoch": 0.245, "grad_norm": 3.53125, "grad_norm_var": 0.25222880045572915, "learning_rate": 0.0001, "loss": 6.3938, "loss/crossentropy": 2.752587676048279, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.20552918314933777, "step": 7840 }, { "epoch": 0.2450625, "grad_norm": 3.34375, "grad_norm_var": 0.2506174723307292, "learning_rate": 0.0001, "loss": 6.0401, "loss/crossentropy": 2.57265305519104, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1920556202530861, "step": 7842 }, { "epoch": 0.245125, "grad_norm": 3.65625, "grad_norm_var": 0.2801920572916667, "learning_rate": 0.0001, "loss": 6.6359, "loss/crossentropy": 2.7755597829818726, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.22002330422401428, "step": 7844 }, { "epoch": 0.2451875, "grad_norm": 3.640625, "grad_norm_var": 0.272119140625, "learning_rate": 0.0001, "loss": 5.9087, "loss/crossentropy": 2.4002764225006104, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19029289484024048, "step": 7846 }, { "epoch": 0.24525, "grad_norm": 3.6875, "grad_norm_var": 0.07565816243489583, "learning_rate": 0.0001, "loss": 6.2504, "loss/crossentropy": 2.630833864212036, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20297091454267502, "step": 7848 }, { "epoch": 0.2453125, "grad_norm": 3.5625, "grad_norm_var": 0.0699127197265625, "learning_rate": 0.0001, "loss": 6.2238, "loss/crossentropy": 2.7208290100097656, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19326822459697723, "step": 7850 }, { "epoch": 0.245375, "grad_norm": 3.453125, "grad_norm_var": 0.07089742024739583, "learning_rate": 0.0001, "loss": 6.0403, "loss/crossentropy": 2.524513602256775, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19532479345798492, "step": 7852 }, { "epoch": 0.2454375, "grad_norm": 3.390625, "grad_norm_var": 0.06756083170572917, "learning_rate": 0.0001, "loss": 6.0687, "loss/crossentropy": 2.5335192680358887, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1957024410367012, "step": 7854 }, { "epoch": 0.2455, "grad_norm": 3.703125, "grad_norm_var": 0.07585347493489583, "learning_rate": 0.0001, "loss": 6.5999, "loss/crossentropy": 2.828696131706238, "loss/hidden": 1.68359375, "loss/jsd": 0.0, "loss/logits": 0.20876126736402512, "step": 7856 }, { "epoch": 0.2455625, "grad_norm": 3.25, "grad_norm_var": 0.0802886962890625, "learning_rate": 0.0001, "loss": 5.5055, "loss/crossentropy": 2.0957219004631042, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.17769645899534225, "step": 7858 }, { "epoch": 0.245625, "grad_norm": 3.65625, "grad_norm_var": 0.0358551025390625, "learning_rate": 0.0001, "loss": 6.1333, "loss/crossentropy": 2.5687731504440308, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19512902200222015, "step": 7860 }, { "epoch": 0.2456875, "grad_norm": 3.5, "grad_norm_var": 0.03453369140625, "learning_rate": 0.0001, "loss": 6.2233, "loss/crossentropy": 2.69309139251709, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19481310993433, "step": 7862 }, { "epoch": 0.24575, "grad_norm": 3.53125, "grad_norm_var": 0.036253865559895834, "learning_rate": 0.0001, "loss": 6.0009, "loss/crossentropy": 2.590876817703247, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18435797840356827, "step": 7864 }, { "epoch": 0.2458125, "grad_norm": 3.46875, "grad_norm_var": 0.0365142822265625, "learning_rate": 0.0001, "loss": 5.9163, "loss/crossentropy": 2.5039474964141846, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18577133119106293, "step": 7866 }, { "epoch": 0.245875, "grad_norm": 3.609375, "grad_norm_var": 0.033600870768229166, "learning_rate": 0.0001, "loss": 6.214, "loss/crossentropy": 2.5934181213378906, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.19603876769542694, "step": 7868 }, { "epoch": 0.2459375, "grad_norm": 4.09375, "grad_norm_var": 0.06715494791666667, "learning_rate": 0.0001, "loss": 6.5384, "loss/crossentropy": 2.8254576921463013, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.20566459000110626, "step": 7870 }, { "epoch": 0.246, "grad_norm": 3.234375, "grad_norm_var": 0.07030843098958334, "learning_rate": 0.0001, "loss": 6.0556, "loss/crossentropy": 2.5720643997192383, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1917138248682022, "step": 7872 }, { "epoch": 0.2460625, "grad_norm": 4.90625, "grad_norm_var": 0.17058003743489583, "learning_rate": 0.0001, "loss": 6.1044, "loss/crossentropy": 2.4462668895721436, "loss/hidden": 1.703125, "loss/jsd": 0.0, "loss/logits": 0.1955041065812111, "step": 7874 }, { "epoch": 0.246125, "grad_norm": 3.578125, "grad_norm_var": 0.1717193603515625, "learning_rate": 0.0001, "loss": 5.9914, "loss/crossentropy": 2.463874340057373, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.18713021278381348, "step": 7876 }, { "epoch": 0.2461875, "grad_norm": 3.828125, "grad_norm_var": 0.19169514973958332, "learning_rate": 0.0001, "loss": 5.8841, "loss/crossentropy": 2.4507832527160645, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18825646489858627, "step": 7878 }, { "epoch": 0.24625, "grad_norm": 3.9375, "grad_norm_var": 0.19039713541666667, "learning_rate": 0.0001, "loss": 6.4666, "loss/crossentropy": 2.8597030639648438, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20014318823814392, "step": 7880 }, { "epoch": 0.2463125, "grad_norm": 3.546875, "grad_norm_var": 0.18128153483072917, "learning_rate": 0.0001, "loss": 6.3076, "loss/crossentropy": 2.747925639152527, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1977677345275879, "step": 7882 }, { "epoch": 0.246375, "grad_norm": 3.265625, "grad_norm_var": 0.19488932291666666, "learning_rate": 0.0001, "loss": 6.0471, "loss/crossentropy": 2.5719133615493774, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18892011791467667, "step": 7884 }, { "epoch": 0.2464375, "grad_norm": 3.59375, "grad_norm_var": 0.18992513020833332, "learning_rate": 0.0001, "loss": 6.5645, "loss/crossentropy": 2.784387230873108, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.21199924498796463, "step": 7886 }, { "epoch": 0.2465, "grad_norm": 4.09375, "grad_norm_var": 0.17419331868489582, "learning_rate": 0.0001, "loss": 6.6613, "loss/crossentropy": 2.934768319129944, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.20742224156856537, "step": 7888 }, { "epoch": 0.2465625, "grad_norm": 3.703125, "grad_norm_var": 0.09788411458333333, "learning_rate": 0.0001, "loss": 5.9526, "loss/crossentropy": 2.491362690925598, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18752900511026382, "step": 7890 }, { "epoch": 0.246625, "grad_norm": 3.359375, "grad_norm_var": 0.11057535807291667, "learning_rate": 0.0001, "loss": 5.7986, "loss/crossentropy": 2.3962568044662476, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18476200103759766, "step": 7892 }, { "epoch": 0.2466875, "grad_norm": 3.21875, "grad_norm_var": 0.09982096354166667, "learning_rate": 0.0001, "loss": 5.9733, "loss/crossentropy": 2.5740636587142944, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18484096974134445, "step": 7894 }, { "epoch": 0.24675, "grad_norm": 3.609375, "grad_norm_var": 0.1060943603515625, "learning_rate": 0.0001, "loss": 5.8972, "loss/crossentropy": 2.5059421062469482, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1832689419388771, "step": 7896 }, { "epoch": 0.2468125, "grad_norm": 3.375, "grad_norm_var": 0.10416259765625, "learning_rate": 0.0001, "loss": 6.1949, "loss/crossentropy": 2.745757818222046, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18944872915744781, "step": 7898 }, { "epoch": 0.246875, "grad_norm": 3.21875, "grad_norm_var": 0.10508524576822917, "learning_rate": 0.0001, "loss": 6.2038, "loss/crossentropy": 2.656543731689453, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19612840563058853, "step": 7900 }, { "epoch": 0.2469375, "grad_norm": 3.140625, "grad_norm_var": 0.07021382649739584, "learning_rate": 0.0001, "loss": 6.0582, "loss/crossentropy": 2.552359104156494, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.18652494251728058, "step": 7902 }, { "epoch": 0.247, "grad_norm": 3.421875, "grad_norm_var": 0.03429361979166667, "learning_rate": 0.0001, "loss": 6.0395, "loss/crossentropy": 2.514822244644165, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.18957258015871048, "step": 7904 }, { "epoch": 0.2470625, "grad_norm": 3.765625, "grad_norm_var": 0.0343902587890625, "learning_rate": 0.0001, "loss": 5.9636, "loss/crossentropy": 2.5300296545028687, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.18164195120334625, "step": 7906 }, { "epoch": 0.247125, "grad_norm": 3.125, "grad_norm_var": 0.0420562744140625, "learning_rate": 0.0001, "loss": 5.3254, "loss/crossentropy": 2.123539924621582, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.16041796654462814, "step": 7908 }, { "epoch": 0.2471875, "grad_norm": 3.375, "grad_norm_var": 0.04210611979166667, "learning_rate": 0.0001, "loss": 6.2468, "loss/crossentropy": 2.684428095817566, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19803011417388916, "step": 7910 }, { "epoch": 0.24725, "grad_norm": 3.5625, "grad_norm_var": 0.03721415201822917, "learning_rate": 0.0001, "loss": 6.1013, "loss/crossentropy": 2.5559555292129517, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19750570505857468, "step": 7912 }, { "epoch": 0.2473125, "grad_norm": 3.359375, "grad_norm_var": 0.0376953125, "learning_rate": 0.0001, "loss": 6.2057, "loss/crossentropy": 2.7045116424560547, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1942594200372696, "step": 7914 }, { "epoch": 0.247375, "grad_norm": 4.0625, "grad_norm_var": 0.060986328125, "learning_rate": 0.0001, "loss": 6.1003, "loss/crossentropy": 2.5861334800720215, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19282116740942, "step": 7916 }, { "epoch": 0.2474375, "grad_norm": 3.984375, "grad_norm_var": 0.1520904541015625, "learning_rate": 0.0001, "loss": 5.808, "loss/crossentropy": 2.408286929130554, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1802079826593399, "step": 7918 }, { "epoch": 0.2475, "grad_norm": 3.59375, "grad_norm_var": 0.15461832682291668, "learning_rate": 0.0001, "loss": 6.177, "loss/crossentropy": 2.6627827882766724, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19244200736284256, "step": 7920 }, { "epoch": 0.2475625, "grad_norm": 3.0, "grad_norm_var": 0.17177327473958334, "learning_rate": 0.0001, "loss": 5.4008, "loss/crossentropy": 2.206899642944336, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16626481711864471, "step": 7922 }, { "epoch": 0.247625, "grad_norm": 3.328125, "grad_norm_var": 0.14933268229166666, "learning_rate": 0.0001, "loss": 6.0001, "loss/crossentropy": 2.5021921396255493, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19315312057733536, "step": 7924 }, { "epoch": 0.2476875, "grad_norm": 3.46875, "grad_norm_var": 0.15714518229166666, "learning_rate": 0.0001, "loss": 5.9853, "loss/crossentropy": 2.622257351875305, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18161460012197495, "step": 7926 }, { "epoch": 0.24775, "grad_norm": 3.578125, "grad_norm_var": 0.15847066243489583, "learning_rate": 0.0001, "loss": 5.9218, "loss/crossentropy": 2.493217349052429, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18348479270935059, "step": 7928 }, { "epoch": 0.2478125, "grad_norm": 3.640625, "grad_norm_var": 0.16197001139322917, "learning_rate": 0.0001, "loss": 5.9928, "loss/crossentropy": 2.5236148834228516, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19027671217918396, "step": 7930 }, { "epoch": 0.247875, "grad_norm": 3.671875, "grad_norm_var": 0.14403889973958334, "learning_rate": 0.0001, "loss": 5.8449, "loss/crossentropy": 2.3914425373077393, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18831348419189453, "step": 7932 }, { "epoch": 0.2479375, "grad_norm": 3.3125, "grad_norm_var": 0.0432281494140625, "learning_rate": 0.0001, "loss": 6.0737, "loss/crossentropy": 2.6326550245285034, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18941866606473923, "step": 7934 }, { "epoch": 0.248, "grad_norm": 3.546875, "grad_norm_var": 0.04403889973958333, "learning_rate": 0.0001, "loss": 6.5346, "loss/crossentropy": 2.8211774826049805, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20649944990873337, "step": 7936 }, { "epoch": 0.2480625, "grad_norm": 3.453125, "grad_norm_var": 0.02926025390625, "learning_rate": 0.0001, "loss": 6.2807, "loss/crossentropy": 2.684471607208252, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.2018081322312355, "step": 7938 }, { "epoch": 0.248125, "grad_norm": 3.546875, "grad_norm_var": 0.032567342122395836, "learning_rate": 0.0001, "loss": 6.2865, "loss/crossentropy": 2.6986790895462036, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1970643773674965, "step": 7940 }, { "epoch": 0.2481875, "grad_norm": 3.171875, "grad_norm_var": 0.039850870768229164, "learning_rate": 0.0001, "loss": 5.8474, "loss/crossentropy": 2.4854358434677124, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18385069072246552, "step": 7942 }, { "epoch": 0.24825, "grad_norm": 3.546875, "grad_norm_var": 0.039208984375, "learning_rate": 0.0001, "loss": 6.1029, "loss/crossentropy": 2.4998894929885864, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.1962408572435379, "step": 7944 }, { "epoch": 0.2483125, "grad_norm": 4.5625, "grad_norm_var": 0.10787353515625, "learning_rate": 0.0001, "loss": 6.2775, "loss/crossentropy": 2.5435431003570557, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.21441563218832016, "step": 7946 }, { "epoch": 0.248375, "grad_norm": 3.40625, "grad_norm_var": 0.10576070149739583, "learning_rate": 0.0001, "loss": 6.0459, "loss/crossentropy": 2.5619404315948486, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19292648136615753, "step": 7948 }, { "epoch": 0.2484375, "grad_norm": 3.671875, "grad_norm_var": 0.1029296875, "learning_rate": 0.0001, "loss": 6.0509, "loss/crossentropy": 2.510421395301819, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19545641541481018, "step": 7950 }, { "epoch": 0.2485, "grad_norm": 3.515625, "grad_norm_var": 0.10670166015625, "learning_rate": 0.0001, "loss": 6.0985, "loss/crossentropy": 2.620503783226013, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19310956448316574, "step": 7952 }, { "epoch": 0.2485625, "grad_norm": 3.375, "grad_norm_var": 0.10779622395833334, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.4311505556106567, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18715716898441315, "step": 7954 }, { "epoch": 0.248625, "grad_norm": 3.328125, "grad_norm_var": 0.10303446451822916, "learning_rate": 0.0001, "loss": 6.1145, "loss/crossentropy": 2.6362648010253906, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19001401215791702, "step": 7956 }, { "epoch": 0.2486875, "grad_norm": 4.5, "grad_norm_var": 0.15091145833333333, "learning_rate": 0.0001, "loss": 6.2873, "loss/crossentropy": 2.619986891746521, "loss/hidden": 1.7734375, "loss/jsd": 0.0, "loss/logits": 0.1893879696726799, "step": 7958 }, { "epoch": 0.24875, "grad_norm": 3.390625, "grad_norm_var": 0.15543212890625, "learning_rate": 0.0001, "loss": 5.8951, "loss/crossentropy": 2.4806383848190308, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1855819895863533, "step": 7960 }, { "epoch": 0.2488125, "grad_norm": 3.203125, "grad_norm_var": 0.08589579264322916, "learning_rate": 0.0001, "loss": 6.2061, "loss/crossentropy": 2.761041522026062, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18747299909591675, "step": 7962 }, { "epoch": 0.248875, "grad_norm": 3.453125, "grad_norm_var": 0.08677978515625, "learning_rate": 0.0001, "loss": 5.8637, "loss/crossentropy": 2.4276249408721924, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18891657888889313, "step": 7964 }, { "epoch": 0.2489375, "grad_norm": 3.5, "grad_norm_var": 0.08857014973958334, "learning_rate": 0.0001, "loss": 6.0556, "loss/crossentropy": 2.5081448554992676, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19771240651607513, "step": 7966 }, { "epoch": 0.249, "grad_norm": 3.765625, "grad_norm_var": 0.09461263020833334, "learning_rate": 0.0001, "loss": 6.067, "loss/crossentropy": 2.5315128564834595, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19651669263839722, "step": 7968 }, { "epoch": 0.2490625, "grad_norm": 3.140625, "grad_norm_var": 0.10575764973958333, "learning_rate": 0.0001, "loss": 5.6147, "loss/crossentropy": 2.4189276695251465, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16683895140886307, "step": 7970 }, { "epoch": 0.249125, "grad_norm": 3.359375, "grad_norm_var": 0.10559488932291666, "learning_rate": 0.0001, "loss": 6.2059, "loss/crossentropy": 2.5469354391098022, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.20105034857988358, "step": 7972 }, { "epoch": 0.2491875, "grad_norm": 7.96875, "grad_norm_var": 1.3269816080729167, "learning_rate": 0.0001, "loss": 6.3564, "loss/crossentropy": 2.734826445579529, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19575071334838867, "step": 7974 }, { "epoch": 0.24925, "grad_norm": 4.375, "grad_norm_var": 1.3368123372395833, "learning_rate": 0.0001, "loss": 6.09, "loss/crossentropy": 2.45013964176178, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19718587398529053, "step": 7976 }, { "epoch": 0.2493125, "grad_norm": 3.65625, "grad_norm_var": 1.3228505452473958, "learning_rate": 0.0001, "loss": 6.1342, "loss/crossentropy": 2.6453663110733032, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19185029715299606, "step": 7978 }, { "epoch": 0.249375, "grad_norm": 3.921875, "grad_norm_var": 1.3178670247395834, "learning_rate": 0.0001, "loss": 6.3624, "loss/crossentropy": 2.704474449157715, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20289774239063263, "step": 7980 }, { "epoch": 0.2494375, "grad_norm": 3.203125, "grad_norm_var": 1.3607584635416667, "learning_rate": 0.0001, "loss": 5.942, "loss/crossentropy": 2.601536989212036, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17857421189546585, "step": 7982 }, { "epoch": 0.2495, "grad_norm": 3.28125, "grad_norm_var": 1.383137003580729, "learning_rate": 0.0001, "loss": 5.8629, "loss/crossentropy": 2.492098808288574, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18082839250564575, "step": 7984 }, { "epoch": 0.2495625, "grad_norm": 3.28125, "grad_norm_var": 1.3790273030598958, "learning_rate": 0.0001, "loss": 5.709, "loss/crossentropy": 2.4098531007766724, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17835478484630585, "step": 7986 }, { "epoch": 0.249625, "grad_norm": 3.78125, "grad_norm_var": 1.3669759114583333, "learning_rate": 0.0001, "loss": 6.3246, "loss/crossentropy": 2.687322974205017, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20435378700494766, "step": 7988 }, { "epoch": 0.2496875, "grad_norm": 3.5625, "grad_norm_var": 0.13305562337239582, "learning_rate": 0.0001, "loss": 6.3689, "loss/crossentropy": 2.691186547279358, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2083970531821251, "step": 7990 }, { "epoch": 0.24975, "grad_norm": 3.578125, "grad_norm_var": 0.09887593587239583, "learning_rate": 0.0001, "loss": 6.2701, "loss/crossentropy": 2.7293598651885986, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19704613834619522, "step": 7992 }, { "epoch": 0.2498125, "grad_norm": 3.5, "grad_norm_var": 0.09706624348958333, "learning_rate": 0.0001, "loss": 5.952, "loss/crossentropy": 2.6070988178253174, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17863477021455765, "step": 7994 }, { "epoch": 0.249875, "grad_norm": 3.375, "grad_norm_var": 0.07883199055989583, "learning_rate": 0.0001, "loss": 5.6068, "loss/crossentropy": 2.2992024421691895, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17372917383909225, "step": 7996 }, { "epoch": 0.2499375, "grad_norm": 3.578125, "grad_norm_var": 0.07669270833333333, "learning_rate": 0.0001, "loss": 6.232, "loss/crossentropy": 2.7200305461883545, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19260654598474503, "step": 7998 }, { "epoch": 0.25, "grad_norm": 3.6875, "grad_norm_var": 0.0800445556640625, "learning_rate": 0.0001, "loss": 6.0694, "loss/crossentropy": 2.5788803100585938, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1892903596162796, "step": 8000 }, { "epoch": 0.2500625, "grad_norm": 3.640625, "grad_norm_var": 0.07529195149739583, "learning_rate": 0.0001, "loss": 6.1261, "loss/crossentropy": 2.6562572717666626, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18760987371206284, "step": 8002 }, { "epoch": 0.250125, "grad_norm": 3.328125, "grad_norm_var": 0.0545806884765625, "learning_rate": 0.0001, "loss": 5.9428, "loss/crossentropy": 2.517930746078491, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18740666657686234, "step": 8004 }, { "epoch": 0.2501875, "grad_norm": 3.765625, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 5.9677, "loss/crossentropy": 2.528059959411621, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1838068664073944, "step": 8006 }, { "epoch": 0.25025, "grad_norm": 3.3125, "grad_norm_var": 0.033055623372395836, "learning_rate": 0.0001, "loss": 6.0525, "loss/crossentropy": 2.603206753730774, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19024566560983658, "step": 8008 }, { "epoch": 0.2503125, "grad_norm": 3.484375, "grad_norm_var": 0.0303375244140625, "learning_rate": 0.0001, "loss": 6.0339, "loss/crossentropy": 2.5453351736068726, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19494757801294327, "step": 8010 }, { "epoch": 0.250375, "grad_norm": 3.421875, "grad_norm_var": 0.03229166666666667, "learning_rate": 0.0001, "loss": 5.7867, "loss/crossentropy": 2.459004521369934, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17808642238378525, "step": 8012 }, { "epoch": 0.2504375, "grad_norm": 3.578125, "grad_norm_var": 0.030403645833333333, "learning_rate": 0.0001, "loss": 5.9464, "loss/crossentropy": 2.4782962799072266, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18431393802165985, "step": 8014 }, { "epoch": 0.2505, "grad_norm": 3.609375, "grad_norm_var": 0.027360026041666666, "learning_rate": 0.0001, "loss": 6.2003, "loss/crossentropy": 2.765717029571533, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1879941076040268, "step": 8016 }, { "epoch": 0.2505625, "grad_norm": 3.25, "grad_norm_var": 0.024344889322916667, "learning_rate": 0.0001, "loss": 6.0431, "loss/crossentropy": 2.6034449338912964, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18967097252607346, "step": 8018 }, { "epoch": 0.250625, "grad_norm": 3.375, "grad_norm_var": 0.023661295572916668, "learning_rate": 0.0001, "loss": 5.9184, "loss/crossentropy": 2.5038918256759644, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1848083809018135, "step": 8020 }, { "epoch": 0.2506875, "grad_norm": 3.25, "grad_norm_var": 0.015787760416666668, "learning_rate": 0.0001, "loss": 5.6977, "loss/crossentropy": 2.339107394218445, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.17492011189460754, "step": 8022 }, { "epoch": 0.25075, "grad_norm": 3.28125, "grad_norm_var": 0.0177886962890625, "learning_rate": 0.0001, "loss": 5.8359, "loss/crossentropy": 2.505526900291443, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17522908002138138, "step": 8024 }, { "epoch": 0.2508125, "grad_norm": 3.015625, "grad_norm_var": 0.023828125, "learning_rate": 0.0001, "loss": 5.6513, "loss/crossentropy": 2.377779960632324, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17188209295272827, "step": 8026 }, { "epoch": 0.250875, "grad_norm": 3.53125, "grad_norm_var": 0.025243123372395832, "learning_rate": 0.0001, "loss": 6.1452, "loss/crossentropy": 2.6347309350967407, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1951827108860016, "step": 8028 }, { "epoch": 0.2509375, "grad_norm": 3.40625, "grad_norm_var": 0.021870930989583332, "learning_rate": 0.0001, "loss": 5.9928, "loss/crossentropy": 2.6351139545440674, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18107682466506958, "step": 8030 }, { "epoch": 0.251, "grad_norm": 3.484375, "grad_norm_var": 0.02720947265625, "learning_rate": 0.0001, "loss": 6.2226, "loss/crossentropy": 2.5201927423477173, "loss/hidden": 1.67578125, "loss/jsd": 0.0, "loss/logits": 0.2026629075407982, "step": 8032 }, { "epoch": 0.2510625, "grad_norm": 3.265625, "grad_norm_var": 0.0265045166015625, "learning_rate": 0.0001, "loss": 6.0213, "loss/crossentropy": 2.5705147981643677, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18687426298856735, "step": 8034 }, { "epoch": 0.251125, "grad_norm": 3.34375, "grad_norm_var": 0.029035441080729165, "learning_rate": 0.0001, "loss": 6.0548, "loss/crossentropy": 2.6310404539108276, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18886138498783112, "step": 8036 }, { "epoch": 0.2511875, "grad_norm": 3.484375, "grad_norm_var": 0.029255167643229166, "learning_rate": 0.0001, "loss": 5.8158, "loss/crossentropy": 2.3926368951797485, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18684671074151993, "step": 8038 }, { "epoch": 0.25125, "grad_norm": 3.609375, "grad_norm_var": 0.030908203125, "learning_rate": 0.0001, "loss": 6.2981, "loss/crossentropy": 2.7984174489974976, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19215591996908188, "step": 8040 }, { "epoch": 0.2513125, "grad_norm": 3.3125, "grad_norm_var": 0.020865885416666667, "learning_rate": 0.0001, "loss": 5.9793, "loss/crossentropy": 2.5516382455825806, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18690845370292664, "step": 8042 }, { "epoch": 0.251375, "grad_norm": 3.40625, "grad_norm_var": 0.030711873372395834, "learning_rate": 0.0001, "loss": 5.9134, "loss/crossentropy": 2.5644371509552, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1806023046374321, "step": 8044 }, { "epoch": 0.2514375, "grad_norm": 3.21875, "grad_norm_var": 0.03463541666666667, "learning_rate": 0.0001, "loss": 6.101, "loss/crossentropy": 2.622431755065918, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1927769035100937, "step": 8046 }, { "epoch": 0.2515, "grad_norm": 3.5, "grad_norm_var": 0.026590983072916668, "learning_rate": 0.0001, "loss": 6.0714, "loss/crossentropy": 2.4856817722320557, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19880184531211853, "step": 8048 }, { "epoch": 0.2515625, "grad_norm": 3.40625, "grad_norm_var": 0.025764973958333333, "learning_rate": 0.0001, "loss": 5.9678, "loss/crossentropy": 2.541865348815918, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18478521704673767, "step": 8050 }, { "epoch": 0.251625, "grad_norm": 3.296875, "grad_norm_var": 0.027757771809895835, "learning_rate": 0.0001, "loss": 5.6594, "loss/crossentropy": 2.3731764554977417, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1751052439212799, "step": 8052 }, { "epoch": 0.2516875, "grad_norm": 3.234375, "grad_norm_var": 0.03154296875, "learning_rate": 0.0001, "loss": 6.2624, "loss/crossentropy": 2.677454948425293, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19795053452253342, "step": 8054 }, { "epoch": 0.25175, "grad_norm": 3.46875, "grad_norm_var": 0.027692667643229165, "learning_rate": 0.0001, "loss": 6.2567, "loss/crossentropy": 2.74212908744812, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1936415657401085, "step": 8056 }, { "epoch": 0.2518125, "grad_norm": 3.421875, "grad_norm_var": 0.0286773681640625, "learning_rate": 0.0001, "loss": 6.046, "loss/crossentropy": 2.5982284545898438, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18891958892345428, "step": 8058 }, { "epoch": 0.251875, "grad_norm": 3.25, "grad_norm_var": 0.021480305989583334, "learning_rate": 0.0001, "loss": 6.0877, "loss/crossentropy": 2.6000083684921265, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19095581769943237, "step": 8060 }, { "epoch": 0.2519375, "grad_norm": 3.421875, "grad_norm_var": 0.017625935872395835, "learning_rate": 0.0001, "loss": 5.7538, "loss/crossentropy": 2.3270821571350098, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18134818971157074, "step": 8062 }, { "epoch": 0.252, "grad_norm": 3.703125, "grad_norm_var": 0.02535400390625, "learning_rate": 0.0001, "loss": 6.3668, "loss/crossentropy": 2.758496880531311, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.2030191347002983, "step": 8064 }, { "epoch": 0.2520625, "grad_norm": 3.46875, "grad_norm_var": 0.026707967122395832, "learning_rate": 0.0001, "loss": 6.171, "loss/crossentropy": 2.5974239110946655, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1999397724866867, "step": 8066 }, { "epoch": 0.252125, "grad_norm": 3.828125, "grad_norm_var": 0.028804524739583334, "learning_rate": 0.0001, "loss": 6.304, "loss/crossentropy": 2.7676830291748047, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19660098105669022, "step": 8068 }, { "epoch": 0.2521875, "grad_norm": 3.9375, "grad_norm_var": 0.04177144368489583, "learning_rate": 0.0001, "loss": 6.123, "loss/crossentropy": 2.5446542501449585, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19415876269340515, "step": 8070 }, { "epoch": 0.25225, "grad_norm": 3.078125, "grad_norm_var": 0.05226949055989583, "learning_rate": 0.0001, "loss": 5.8428, "loss/crossentropy": 2.4725788831710815, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17881622165441513, "step": 8072 }, { "epoch": 0.2523125, "grad_norm": 3.171875, "grad_norm_var": 0.059601847330729166, "learning_rate": 0.0001, "loss": 5.8556, "loss/crossentropy": 2.4939451217651367, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18382671475410461, "step": 8074 }, { "epoch": 0.252375, "grad_norm": 3.34375, "grad_norm_var": 0.058283487955729164, "learning_rate": 0.0001, "loss": 6.2737, "loss/crossentropy": 2.7139326333999634, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19855286180973053, "step": 8076 }, { "epoch": 0.2524375, "grad_norm": 3.140625, "grad_norm_var": 0.06663411458333333, "learning_rate": 0.0001, "loss": 5.8327, "loss/crossentropy": 2.42052161693573, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18262653052806854, "step": 8078 }, { "epoch": 0.2525, "grad_norm": 3.6875, "grad_norm_var": 0.0643463134765625, "learning_rate": 0.0001, "loss": 6.1268, "loss/crossentropy": 2.6724756956100464, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18879260122776031, "step": 8080 }, { "epoch": 0.2525625, "grad_norm": 3.328125, "grad_norm_var": 0.06493733723958334, "learning_rate": 0.0001, "loss": 6.1529, "loss/crossentropy": 2.6575675010681152, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19406761974096298, "step": 8082 }, { "epoch": 0.252625, "grad_norm": 3.15625, "grad_norm_var": 0.056982421875, "learning_rate": 0.0001, "loss": 6.2375, "loss/crossentropy": 2.7514657974243164, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19353008270263672, "step": 8084 }, { "epoch": 0.2526875, "grad_norm": 3.515625, "grad_norm_var": 0.0350738525390625, "learning_rate": 0.0001, "loss": 6.1594, "loss/crossentropy": 2.6690210103988647, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19200769811868668, "step": 8086 }, { "epoch": 0.25275, "grad_norm": 3.453125, "grad_norm_var": 0.029832967122395835, "learning_rate": 0.0001, "loss": 5.8653, "loss/crossentropy": 2.4479297399520874, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18587347120046616, "step": 8088 }, { "epoch": 0.2528125, "grad_norm": 4.03125, "grad_norm_var": 0.050414021809895834, "learning_rate": 0.0001, "loss": 6.3817, "loss/crossentropy": 2.7035313844680786, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20571190118789673, "step": 8090 }, { "epoch": 0.252875, "grad_norm": 3.15625, "grad_norm_var": 0.0542144775390625, "learning_rate": 0.0001, "loss": 6.0223, "loss/crossentropy": 2.5742950439453125, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18347414582967758, "step": 8092 }, { "epoch": 0.2529375, "grad_norm": 2.96875, "grad_norm_var": 0.06529032389322917, "learning_rate": 0.0001, "loss": 5.7376, "loss/crossentropy": 2.3434239625930786, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18160177022218704, "step": 8094 }, { "epoch": 0.253, "grad_norm": 4.125, "grad_norm_var": 0.0930328369140625, "learning_rate": 0.0001, "loss": 6.1727, "loss/crossentropy": 2.617506980895996, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1957532986998558, "step": 8096 }, { "epoch": 0.2530625, "grad_norm": 3.3125, "grad_norm_var": 0.09023030598958333, "learning_rate": 0.0001, "loss": 5.7503, "loss/crossentropy": 2.399343729019165, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17962299287319183, "step": 8098 }, { "epoch": 0.253125, "grad_norm": 3.3125, "grad_norm_var": 0.0849029541015625, "learning_rate": 0.0001, "loss": 6.1124, "loss/crossentropy": 2.61958384513855, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1902993693947792, "step": 8100 }, { "epoch": 0.2531875, "grad_norm": 3.296875, "grad_norm_var": 0.08745829264322917, "learning_rate": 0.0001, "loss": 5.8804, "loss/crossentropy": 2.463230609893799, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18859213590621948, "step": 8102 }, { "epoch": 0.25325, "grad_norm": 3.421875, "grad_norm_var": 0.09225972493489583, "learning_rate": 0.0001, "loss": 5.7748, "loss/crossentropy": 2.4880319833755493, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17516177147626877, "step": 8104 }, { "epoch": 0.2533125, "grad_norm": 3.421875, "grad_norm_var": 0.08961181640625, "learning_rate": 0.0001, "loss": 6.4198, "loss/crossentropy": 2.817080855369568, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1997276097536087, "step": 8106 }, { "epoch": 0.253375, "grad_norm": 3.578125, "grad_norm_var": 0.0907623291015625, "learning_rate": 0.0001, "loss": 6.4113, "loss/crossentropy": 2.7963486909866333, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.19743558764457703, "step": 8108 }, { "epoch": 0.2534375, "grad_norm": 3.84375, "grad_norm_var": 0.07965087890625, "learning_rate": 0.0001, "loss": 6.1079, "loss/crossentropy": 2.680260419845581, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1888602152466774, "step": 8110 }, { "epoch": 0.2535, "grad_norm": 3.203125, "grad_norm_var": 0.05621337890625, "learning_rate": 0.0001, "loss": 6.0002, "loss/crossentropy": 2.531370520591736, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1910264864563942, "step": 8112 }, { "epoch": 0.2535625, "grad_norm": 3.640625, "grad_norm_var": 0.05748291015625, "learning_rate": 0.0001, "loss": 6.0763, "loss/crossentropy": 2.5369983911514282, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19182298332452774, "step": 8114 }, { "epoch": 0.253625, "grad_norm": 3.71875, "grad_norm_var": 0.06272379557291667, "learning_rate": 0.0001, "loss": 5.6069, "loss/crossentropy": 2.2425425052642822, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18252773582935333, "step": 8116 }, { "epoch": 0.2536875, "grad_norm": 3.6875, "grad_norm_var": 0.0618804931640625, "learning_rate": 0.0001, "loss": 6.3295, "loss/crossentropy": 2.6767349243164062, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20473266392946243, "step": 8118 }, { "epoch": 0.25375, "grad_norm": 3.6875, "grad_norm_var": 0.05396219889322917, "learning_rate": 0.0001, "loss": 6.0133, "loss/crossentropy": 2.5724856853485107, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18705421686172485, "step": 8120 }, { "epoch": 0.2538125, "grad_norm": 3.28125, "grad_norm_var": 0.04700520833333333, "learning_rate": 0.0001, "loss": 5.8223, "loss/crossentropy": 2.4687927961349487, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17909876257181168, "step": 8122 }, { "epoch": 0.253875, "grad_norm": 3.53125, "grad_norm_var": 0.04430338541666667, "learning_rate": 0.0001, "loss": 5.9073, "loss/crossentropy": 2.532727599143982, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1847231239080429, "step": 8124 }, { "epoch": 0.2539375, "grad_norm": 3.578125, "grad_norm_var": 0.044759114583333336, "learning_rate": 0.0001, "loss": 6.082, "loss/crossentropy": 2.6451234817504883, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18236370384693146, "step": 8126 }, { "epoch": 0.254, "grad_norm": 3.5625, "grad_norm_var": 0.04051005045572917, "learning_rate": 0.0001, "loss": 6.1677, "loss/crossentropy": 2.591571807861328, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19628168642520905, "step": 8128 }, { "epoch": 0.2540625, "grad_norm": 3.453125, "grad_norm_var": 0.03752848307291667, "learning_rate": 0.0001, "loss": 6.2562, "loss/crossentropy": 2.6917589902877808, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.199021115899086, "step": 8130 }, { "epoch": 0.254125, "grad_norm": 3.453125, "grad_norm_var": 0.032124837239583336, "learning_rate": 0.0001, "loss": 5.9864, "loss/crossentropy": 2.5716795921325684, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18639299273490906, "step": 8132 }, { "epoch": 0.2541875, "grad_norm": 3.671875, "grad_norm_var": 0.029520670572916668, "learning_rate": 0.0001, "loss": 5.8128, "loss/crossentropy": 2.4251999855041504, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17977185547351837, "step": 8134 }, { "epoch": 0.25425, "grad_norm": 3.46875, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 5.9771, "loss/crossentropy": 2.446664810180664, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19327668100595474, "step": 8136 }, { "epoch": 0.2543125, "grad_norm": 3.3125, "grad_norm_var": 0.03144124348958333, "learning_rate": 0.0001, "loss": 6.1304, "loss/crossentropy": 2.6993672847747803, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18920022994279861, "step": 8138 }, { "epoch": 0.254375, "grad_norm": 3.9375, "grad_norm_var": 0.06031494140625, "learning_rate": 0.0001, "loss": 6.2245, "loss/crossentropy": 2.6362926959991455, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19749093800783157, "step": 8140 }, { "epoch": 0.2544375, "grad_norm": 4.0625, "grad_norm_var": 0.06389872233072917, "learning_rate": 0.0001, "loss": 6.1924, "loss/crossentropy": 2.6543272733688354, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19326184689998627, "step": 8142 }, { "epoch": 0.2545, "grad_norm": 3.65625, "grad_norm_var": 8.84273681640625, "learning_rate": 0.0001, "loss": 6.4958, "loss/crossentropy": 2.622679352760315, "loss/hidden": 1.69140625, "loss/jsd": 0.0, "loss/logits": 0.21816815435886383, "step": 8144 }, { "epoch": 0.2545625, "grad_norm": 3.15625, "grad_norm_var": 8.895556640625, "learning_rate": 0.0001, "loss": 5.8055, "loss/crossentropy": 2.5427088737487793, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17549703270196915, "step": 8146 }, { "epoch": 0.254625, "grad_norm": 3.1875, "grad_norm_var": 8.953531901041666, "learning_rate": 0.0001, "loss": 5.8557, "loss/crossentropy": 2.5127452611923218, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18038783222436905, "step": 8148 }, { "epoch": 0.2546875, "grad_norm": 3.34375, "grad_norm_var": 8.972587076822917, "learning_rate": 0.0001, "loss": 5.8703, "loss/crossentropy": 2.4678895473480225, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1859426498413086, "step": 8150 }, { "epoch": 0.25475, "grad_norm": 3.84375, "grad_norm_var": 8.950553385416667, "learning_rate": 0.0001, "loss": 6.3494, "loss/crossentropy": 2.7035274505615234, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.202867791056633, "step": 8152 }, { "epoch": 0.2548125, "grad_norm": 3.671875, "grad_norm_var": 8.850846354166666, "learning_rate": 0.0001, "loss": 6.0579, "loss/crossentropy": 2.4627203941345215, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20209801942110062, "step": 8154 }, { "epoch": 0.254875, "grad_norm": 3.59375, "grad_norm_var": 8.912621053059896, "learning_rate": 0.0001, "loss": 5.9049, "loss/crossentropy": 2.4870606660842896, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18358328938484192, "step": 8156 }, { "epoch": 0.2549375, "grad_norm": 3.75, "grad_norm_var": 8.94732666015625, "learning_rate": 0.0001, "loss": 5.8499, "loss/crossentropy": 2.37395977973938, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18860754370689392, "step": 8158 }, { "epoch": 0.255, "grad_norm": 3.546875, "grad_norm_var": 0.04077860514322917, "learning_rate": 0.0001, "loss": 5.7366, "loss/crossentropy": 2.353061556816101, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1805398240685463, "step": 8160 }, { "epoch": 0.2550625, "grad_norm": 3.671875, "grad_norm_var": 0.037760416666666664, "learning_rate": 0.0001, "loss": 6.0241, "loss/crossentropy": 2.4962236881256104, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19302287697792053, "step": 8162 }, { "epoch": 0.255125, "grad_norm": 3.609375, "grad_norm_var": 0.031208292643229166, "learning_rate": 0.0001, "loss": 5.9913, "loss/crossentropy": 2.4920365810394287, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19132770597934723, "step": 8164 }, { "epoch": 0.2551875, "grad_norm": 3.671875, "grad_norm_var": 0.029264322916666665, "learning_rate": 0.0001, "loss": 6.1592, "loss/crossentropy": 2.6284236907958984, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19605131447315216, "step": 8166 }, { "epoch": 0.25525, "grad_norm": 3.15625, "grad_norm_var": 0.04185791015625, "learning_rate": 0.0001, "loss": 5.895, "loss/crossentropy": 2.559125542640686, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17889637500047684, "step": 8168 }, { "epoch": 0.2553125, "grad_norm": 3.359375, "grad_norm_var": 0.03865458170572917, "learning_rate": 0.0001, "loss": 5.877, "loss/crossentropy": 2.508115768432617, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1837659329175949, "step": 8170 }, { "epoch": 0.255375, "grad_norm": 3.84375, "grad_norm_var": 0.047749837239583336, "learning_rate": 0.0001, "loss": 6.2866, "loss/crossentropy": 2.6549712419509888, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20144866406917572, "step": 8172 }, { "epoch": 0.2554375, "grad_norm": 3.4375, "grad_norm_var": 0.042041015625, "learning_rate": 0.0001, "loss": 5.8874, "loss/crossentropy": 2.5244874954223633, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18238522112369537, "step": 8174 }, { "epoch": 0.2555, "grad_norm": 3.359375, "grad_norm_var": 0.0411773681640625, "learning_rate": 0.0001, "loss": 6.2863, "loss/crossentropy": 2.7426899671554565, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19459190964698792, "step": 8176 }, { "epoch": 0.2555625, "grad_norm": 3.578125, "grad_norm_var": 0.03765869140625, "learning_rate": 0.0001, "loss": 6.5763, "loss/crossentropy": 2.919371247291565, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.2055361270904541, "step": 8178 }, { "epoch": 0.255625, "grad_norm": 3.671875, "grad_norm_var": 0.04023335774739583, "learning_rate": 0.0001, "loss": 5.7059, "loss/crossentropy": 2.3707518577575684, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17570346593856812, "step": 8180 }, { "epoch": 0.2556875, "grad_norm": 3.3125, "grad_norm_var": 0.0366119384765625, "learning_rate": 0.0001, "loss": 5.9558, "loss/crossentropy": 2.569222569465637, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18749002367258072, "step": 8182 }, { "epoch": 0.25575, "grad_norm": 3.546875, "grad_norm_var": 0.029911295572916666, "learning_rate": 0.0001, "loss": 6.395, "loss/crossentropy": 2.7556833028793335, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20065077394247055, "step": 8184 }, { "epoch": 0.2558125, "grad_norm": 3.234375, "grad_norm_var": 0.05172526041666667, "learning_rate": 0.0001, "loss": 6.2051, "loss/crossentropy": 2.6984463930130005, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19480326026678085, "step": 8186 }, { "epoch": 0.255875, "grad_norm": 3.1875, "grad_norm_var": 0.048371378580729166, "learning_rate": 0.0001, "loss": 6.2935, "loss/crossentropy": 2.731417417526245, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19526979327201843, "step": 8188 }, { "epoch": 0.2559375, "grad_norm": 3.75, "grad_norm_var": 0.0550445556640625, "learning_rate": 0.0001, "loss": 6.1322, "loss/crossentropy": 2.5600863695144653, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.19354426860809326, "step": 8190 }, { "epoch": 0.256, "grad_norm": 3.25, "grad_norm_var": 0.06477762858072916, "learning_rate": 0.0001, "loss": 5.8101, "loss/crossentropy": 2.5173254013061523, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1773252859711647, "step": 8192 }, { "epoch": 0.2560625, "grad_norm": 3.515625, "grad_norm_var": 0.0637847900390625, "learning_rate": 0.0001, "loss": 6.308, "loss/crossentropy": 2.842012047767639, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18917309492826462, "step": 8194 }, { "epoch": 0.256125, "grad_norm": 3.53125, "grad_norm_var": 0.05677083333333333, "learning_rate": 0.0001, "loss": 6.0664, "loss/crossentropy": 2.609717845916748, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1902015060186386, "step": 8196 }, { "epoch": 0.2561875, "grad_norm": 3.6875, "grad_norm_var": 0.057145182291666666, "learning_rate": 0.0001, "loss": 6.1134, "loss/crossentropy": 2.550622820854187, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1980718970298767, "step": 8198 }, { "epoch": 0.25625, "grad_norm": 3.25, "grad_norm_var": 0.0616363525390625, "learning_rate": 0.0001, "loss": 5.7814, "loss/crossentropy": 2.3334431648254395, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1811218336224556, "step": 8200 }, { "epoch": 0.2563125, "grad_norm": 3.46875, "grad_norm_var": 0.03873291015625, "learning_rate": 0.0001, "loss": 6.2308, "loss/crossentropy": 2.691083073616028, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19733324646949768, "step": 8202 }, { "epoch": 0.256375, "grad_norm": 3.421875, "grad_norm_var": 0.0377593994140625, "learning_rate": 0.0001, "loss": 5.8558, "loss/crossentropy": 2.4827888011932373, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18417687714099884, "step": 8204 }, { "epoch": 0.2564375, "grad_norm": 3.640625, "grad_norm_var": 0.03142903645833333, "learning_rate": 0.0001, "loss": 6.041, "loss/crossentropy": 2.623255968093872, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18552595376968384, "step": 8206 }, { "epoch": 0.2565, "grad_norm": 3.453125, "grad_norm_var": 0.020524088541666666, "learning_rate": 0.0001, "loss": 6.0687, "loss/crossentropy": 2.516445279121399, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19624106585979462, "step": 8208 }, { "epoch": 0.2565625, "grad_norm": 3.546875, "grad_norm_var": 0.020873006184895834, "learning_rate": 0.0001, "loss": 6.0083, "loss/crossentropy": 2.499703049659729, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19187762588262558, "step": 8210 }, { "epoch": 0.256625, "grad_norm": 3.21875, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 5.8727, "loss/crossentropy": 2.5297656059265137, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1792202666401863, "step": 8212 }, { "epoch": 0.2566875, "grad_norm": 3.34375, "grad_norm_var": 0.0221343994140625, "learning_rate": 0.0001, "loss": 5.938, "loss/crossentropy": 2.5417758226394653, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1868831068277359, "step": 8214 }, { "epoch": 0.25675, "grad_norm": 4.3125, "grad_norm_var": 0.09054361979166667, "learning_rate": 0.0001, "loss": 5.9652, "loss/crossentropy": 2.501801609992981, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18736010044813156, "step": 8216 }, { "epoch": 0.2568125, "grad_norm": 3.296875, "grad_norm_var": 0.09519755045572917, "learning_rate": 0.0001, "loss": 6.1688, "loss/crossentropy": 2.674699306488037, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1919931024312973, "step": 8218 }, { "epoch": 0.256875, "grad_norm": 3.765625, "grad_norm_var": 0.0919097900390625, "learning_rate": 0.0001, "loss": 6.0433, "loss/crossentropy": 2.5629888772964478, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19022127240896225, "step": 8220 }, { "epoch": 0.2569375, "grad_norm": 3.4375, "grad_norm_var": 0.0944000244140625, "learning_rate": 0.0001, "loss": 6.1382, "loss/crossentropy": 2.676474690437317, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18953000009059906, "step": 8222 }, { "epoch": 0.257, "grad_norm": 3.453125, "grad_norm_var": 0.0957427978515625, "learning_rate": 0.0001, "loss": 6.3055, "loss/crossentropy": 2.6694082021713257, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20383962988853455, "step": 8224 }, { "epoch": 0.2570625, "grad_norm": 3.421875, "grad_norm_var": 0.09700520833333333, "learning_rate": 0.0001, "loss": 6.3182, "loss/crossentropy": 2.7566806077957153, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19755762815475464, "step": 8226 }, { "epoch": 0.257125, "grad_norm": 3.59375, "grad_norm_var": 0.0902984619140625, "learning_rate": 0.0001, "loss": 6.2823, "loss/crossentropy": 2.7556703090667725, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19016651809215546, "step": 8228 }, { "epoch": 0.2571875, "grad_norm": 3.3125, "grad_norm_var": 0.08502604166666666, "learning_rate": 0.0001, "loss": 5.947, "loss/crossentropy": 2.4731982946395874, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18878469616174698, "step": 8230 }, { "epoch": 0.25725, "grad_norm": 3.28125, "grad_norm_var": 0.0430084228515625, "learning_rate": 0.0001, "loss": 5.9901, "loss/crossentropy": 2.4843109846115112, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1869049295783043, "step": 8232 }, { "epoch": 0.2573125, "grad_norm": 3.734375, "grad_norm_var": 0.04150390625, "learning_rate": 0.0001, "loss": 6.1095, "loss/crossentropy": 2.643258213996887, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1880340278148651, "step": 8234 }, { "epoch": 0.257375, "grad_norm": 3.53125, "grad_norm_var": 0.046875, "learning_rate": 0.0001, "loss": 6.121, "loss/crossentropy": 2.6990444660186768, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18829254806041718, "step": 8236 }, { "epoch": 0.2574375, "grad_norm": 3.296875, "grad_norm_var": 0.04801025390625, "learning_rate": 0.0001, "loss": 6.135, "loss/crossentropy": 2.7081637382507324, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18604490160942078, "step": 8238 }, { "epoch": 0.2575, "grad_norm": 3.484375, "grad_norm_var": 0.04849853515625, "learning_rate": 0.0001, "loss": 6.2395, "loss/crossentropy": 2.634418249130249, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.19644414633512497, "step": 8240 }, { "epoch": 0.2575625, "grad_norm": 3.59375, "grad_norm_var": 0.0493560791015625, "learning_rate": 0.0001, "loss": 6.2743, "loss/crossentropy": 2.7220499515533447, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19740818440914154, "step": 8242 }, { "epoch": 0.257625, "grad_norm": 3.375, "grad_norm_var": 0.04008687337239583, "learning_rate": 0.0001, "loss": 5.913, "loss/crossentropy": 2.4557689428329468, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18478545546531677, "step": 8244 }, { "epoch": 0.2576875, "grad_norm": 3.96875, "grad_norm_var": 0.0530181884765625, "learning_rate": 0.0001, "loss": 5.7819, "loss/crossentropy": 2.339992046356201, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1828642264008522, "step": 8246 }, { "epoch": 0.25775, "grad_norm": 3.265625, "grad_norm_var": 0.043375651041666664, "learning_rate": 0.0001, "loss": 5.7539, "loss/crossentropy": 2.4117339849472046, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17874816805124283, "step": 8248 }, { "epoch": 0.2578125, "grad_norm": 3.859375, "grad_norm_var": 0.048273722330729164, "learning_rate": 0.0001, "loss": 6.4533, "loss/crossentropy": 2.731229305267334, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.20892907679080963, "step": 8250 }, { "epoch": 0.257875, "grad_norm": 3.140625, "grad_norm_var": 0.04885152180989583, "learning_rate": 0.0001, "loss": 6.0605, "loss/crossentropy": 2.6448343992233276, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18882764875888824, "step": 8252 }, { "epoch": 0.2579375, "grad_norm": 3.671875, "grad_norm_var": 0.053792317708333336, "learning_rate": 0.0001, "loss": 6.0896, "loss/crossentropy": 2.6585968732833862, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18762676417827606, "step": 8254 }, { "epoch": 0.258, "grad_norm": 3.453125, "grad_norm_var": 0.056550089518229166, "learning_rate": 0.0001, "loss": 5.9864, "loss/crossentropy": 2.599988579750061, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18277861922979355, "step": 8256 }, { "epoch": 0.2580625, "grad_norm": 3.15625, "grad_norm_var": 0.059911092122395836, "learning_rate": 0.0001, "loss": 6.1234, "loss/crossentropy": 2.659175992012024, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1901729479432106, "step": 8258 }, { "epoch": 0.258125, "grad_norm": 3.140625, "grad_norm_var": 0.06591389973958334, "learning_rate": 0.0001, "loss": 5.8625, "loss/crossentropy": 2.4908353090286255, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18092092871665955, "step": 8260 }, { "epoch": 0.2581875, "grad_norm": 3.5, "grad_norm_var": 0.04632161458333333, "learning_rate": 0.0001, "loss": 6.1232, "loss/crossentropy": 2.56347119808197, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19855140149593353, "step": 8262 }, { "epoch": 0.25825, "grad_norm": 3.75, "grad_norm_var": 0.0597320556640625, "learning_rate": 0.0001, "loss": 6.3446, "loss/crossentropy": 2.7170121669769287, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.19752085208892822, "step": 8264 }, { "epoch": 0.2583125, "grad_norm": 3.71875, "grad_norm_var": 0.0683258056640625, "learning_rate": 0.0001, "loss": 6.0703, "loss/crossentropy": 2.4784377813339233, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.20058993995189667, "step": 8266 }, { "epoch": 0.258375, "grad_norm": 3.765625, "grad_norm_var": 0.06558837890625, "learning_rate": 0.0001, "loss": 6.2707, "loss/crossentropy": 2.6991453170776367, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.1950492411851883, "step": 8268 }, { "epoch": 0.2584375, "grad_norm": 3.15625, "grad_norm_var": 0.068017578125, "learning_rate": 0.0001, "loss": 5.8645, "loss/crossentropy": 2.5384750366210938, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1786932647228241, "step": 8270 }, { "epoch": 0.2585, "grad_norm": 3.234375, "grad_norm_var": 0.071875, "learning_rate": 0.0001, "loss": 5.983, "loss/crossentropy": 2.5552178621292114, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18692362308502197, "step": 8272 }, { "epoch": 0.2585625, "grad_norm": 3.453125, "grad_norm_var": 0.07221577962239584, "learning_rate": 0.0001, "loss": 6.3375, "loss/crossentropy": 2.727352499961853, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19929222017526627, "step": 8274 }, { "epoch": 0.258625, "grad_norm": 3.25, "grad_norm_var": 0.06682535807291666, "learning_rate": 0.0001, "loss": 6.0953, "loss/crossentropy": 2.6253907680511475, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18722385168075562, "step": 8276 }, { "epoch": 0.2586875, "grad_norm": 3.234375, "grad_norm_var": 0.07351888020833333, "learning_rate": 0.0001, "loss": 6.2028, "loss/crossentropy": 2.692871928215027, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19396165758371353, "step": 8278 }, { "epoch": 0.25875, "grad_norm": 3.59375, "grad_norm_var": 0.06799723307291666, "learning_rate": 0.0001, "loss": 6.2305, "loss/crossentropy": 2.7118791341781616, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19639146327972412, "step": 8280 }, { "epoch": 0.2588125, "grad_norm": 3.359375, "grad_norm_var": 0.0605133056640625, "learning_rate": 0.0001, "loss": 5.5454, "loss/crossentropy": 2.3294434547424316, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1684712991118431, "step": 8282 }, { "epoch": 0.258875, "grad_norm": 3.25, "grad_norm_var": 0.054442342122395834, "learning_rate": 0.0001, "loss": 5.8204, "loss/crossentropy": 2.476076364517212, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1820843368768692, "step": 8284 }, { "epoch": 0.2589375, "grad_norm": 3.875, "grad_norm_var": 0.071826171875, "learning_rate": 0.0001, "loss": 6.3151, "loss/crossentropy": 2.664148688316345, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.20064260065555573, "step": 8286 }, { "epoch": 0.259, "grad_norm": 3.109375, "grad_norm_var": 0.08062744140625, "learning_rate": 0.0001, "loss": 5.9039, "loss/crossentropy": 2.5980091094970703, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17824992537498474, "step": 8288 }, { "epoch": 0.2590625, "grad_norm": 3.3125, "grad_norm_var": 0.08085835774739583, "learning_rate": 0.0001, "loss": 5.8166, "loss/crossentropy": 2.4984443187713623, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17829637974500656, "step": 8290 }, { "epoch": 0.259125, "grad_norm": 3.53125, "grad_norm_var": 0.08088277180989584, "learning_rate": 0.0001, "loss": 6.2211, "loss/crossentropy": 2.633698344230652, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1970238834619522, "step": 8292 }, { "epoch": 0.2591875, "grad_norm": 3.59375, "grad_norm_var": 0.09090067545572916, "learning_rate": 0.0001, "loss": 6.4699, "loss/crossentropy": 2.7759658098220825, "loss/hidden": 1.65234375, "loss/jsd": 0.0, "loss/logits": 0.2041609138250351, "step": 8294 }, { "epoch": 0.25925, "grad_norm": 3.71875, "grad_norm_var": 0.09553629557291667, "learning_rate": 0.0001, "loss": 6.0969, "loss/crossentropy": 2.5544604063034058, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19408855587244034, "step": 8296 }, { "epoch": 0.2593125, "grad_norm": 3.359375, "grad_norm_var": 0.08961588541666667, "learning_rate": 0.0001, "loss": 5.9955, "loss/crossentropy": 2.543579578399658, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18816332519054413, "step": 8298 }, { "epoch": 0.259375, "grad_norm": 3.578125, "grad_norm_var": 0.08601888020833333, "learning_rate": 0.0001, "loss": 6.2063, "loss/crossentropy": 2.6519927978515625, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1952776163816452, "step": 8300 }, { "epoch": 0.2594375, "grad_norm": 3.109375, "grad_norm_var": 0.06621805826822917, "learning_rate": 0.0001, "loss": 5.6115, "loss/crossentropy": 2.3252283334732056, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.16924922168254852, "step": 8302 }, { "epoch": 0.2595, "grad_norm": 3.75, "grad_norm_var": 0.05898335774739583, "learning_rate": 0.0001, "loss": 6.0397, "loss/crossentropy": 2.5799871683120728, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18855176120996475, "step": 8304 }, { "epoch": 0.2595625, "grad_norm": 3.5, "grad_norm_var": 0.036844889322916664, "learning_rate": 0.0001, "loss": 5.945, "loss/crossentropy": 2.486521363258362, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19115658104419708, "step": 8306 }, { "epoch": 0.259625, "grad_norm": 3.4375, "grad_norm_var": 0.03909505208333333, "learning_rate": 0.0001, "loss": 5.9135, "loss/crossentropy": 2.5399292707443237, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1842341423034668, "step": 8308 }, { "epoch": 0.2596875, "grad_norm": 3.28125, "grad_norm_var": 0.028962198893229166, "learning_rate": 0.0001, "loss": 5.8404, "loss/crossentropy": 2.4623615741729736, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1831134930253029, "step": 8310 }, { "epoch": 0.25975, "grad_norm": 3.421875, "grad_norm_var": 0.0230865478515625, "learning_rate": 0.0001, "loss": 6.1338, "loss/crossentropy": 2.6255195140838623, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19340698421001434, "step": 8312 }, { "epoch": 0.2598125, "grad_norm": 3.59375, "grad_norm_var": 0.023681640625, "learning_rate": 0.0001, "loss": 6.0778, "loss/crossentropy": 2.566820979118347, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18976758420467377, "step": 8314 }, { "epoch": 0.259875, "grad_norm": 3.40625, "grad_norm_var": 0.022931925455729165, "learning_rate": 0.0001, "loss": 5.856, "loss/crossentropy": 2.480563521385193, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18402428925037384, "step": 8316 }, { "epoch": 0.2599375, "grad_norm": 3.265625, "grad_norm_var": 0.02086181640625, "learning_rate": 0.0001, "loss": 6.0715, "loss/crossentropy": 2.6022223234176636, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1906745731830597, "step": 8318 }, { "epoch": 0.26, "grad_norm": 4.53125, "grad_norm_var": 0.08355712890625, "learning_rate": 0.0001, "loss": 5.9294, "loss/crossentropy": 2.46099591255188, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18551427125930786, "step": 8320 }, { "epoch": 0.2600625, "grad_norm": 3.625, "grad_norm_var": 0.10201822916666667, "learning_rate": 0.0001, "loss": 6.2567, "loss/crossentropy": 2.639232039451599, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20159168541431427, "step": 8322 }, { "epoch": 0.260125, "grad_norm": 3.453125, "grad_norm_var": 0.09713134765625, "learning_rate": 0.0001, "loss": 6.1552, "loss/crossentropy": 2.626811981201172, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19072547554969788, "step": 8324 }, { "epoch": 0.2601875, "grad_norm": 3.4375, "grad_norm_var": 0.09269205729166667, "learning_rate": 0.0001, "loss": 6.1227, "loss/crossentropy": 2.5807541608810425, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19754991680383682, "step": 8326 }, { "epoch": 0.26025, "grad_norm": 3.3125, "grad_norm_var": 0.099755859375, "learning_rate": 0.0001, "loss": 6.0594, "loss/crossentropy": 2.5863711833953857, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1926172822713852, "step": 8328 }, { "epoch": 0.2603125, "grad_norm": 3.953125, "grad_norm_var": 0.10774637858072916, "learning_rate": 0.0001, "loss": 6.2198, "loss/crossentropy": 2.6828631162643433, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19509944319725037, "step": 8330 }, { "epoch": 0.260375, "grad_norm": 3.25, "grad_norm_var": 0.11334228515625, "learning_rate": 0.0001, "loss": 5.6538, "loss/crossentropy": 2.307045340538025, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17764659970998764, "step": 8332 }, { "epoch": 0.2604375, "grad_norm": 3.53125, "grad_norm_var": 0.10630594889322917, "learning_rate": 0.0001, "loss": 6.075, "loss/crossentropy": 2.556526780128479, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19755026698112488, "step": 8334 }, { "epoch": 0.2605, "grad_norm": 3.09375, "grad_norm_var": 0.060358683268229164, "learning_rate": 0.0001, "loss": 5.6987, "loss/crossentropy": 2.3782535791397095, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.17188557982444763, "step": 8336 }, { "epoch": 0.2605625, "grad_norm": 3.546875, "grad_norm_var": 0.079541015625, "learning_rate": 0.0001, "loss": 5.9846, "loss/crossentropy": 2.5467023849487305, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1851910725235939, "step": 8338 }, { "epoch": 0.260625, "grad_norm": 3.515625, "grad_norm_var": 0.08170572916666667, "learning_rate": 0.0001, "loss": 6.1645, "loss/crossentropy": 2.6724393367767334, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19217563420534134, "step": 8340 }, { "epoch": 0.2606875, "grad_norm": 3.40625, "grad_norm_var": 0.0935943603515625, "learning_rate": 0.0001, "loss": 5.9087, "loss/crossentropy": 2.4909695386886597, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1816122606396675, "step": 8342 }, { "epoch": 0.26075, "grad_norm": 3.296875, "grad_norm_var": 0.10429280598958333, "learning_rate": 0.0001, "loss": 5.9806, "loss/crossentropy": 2.591573119163513, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18617041409015656, "step": 8344 }, { "epoch": 0.2608125, "grad_norm": 3.390625, "grad_norm_var": 0.09718424479166667, "learning_rate": 0.0001, "loss": 6.2064, "loss/crossentropy": 2.6100746393203735, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19791245460510254, "step": 8346 }, { "epoch": 0.260875, "grad_norm": 3.421875, "grad_norm_var": 0.09492085774739584, "learning_rate": 0.0001, "loss": 6.1266, "loss/crossentropy": 2.613708972930908, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1946462318301201, "step": 8348 }, { "epoch": 0.2609375, "grad_norm": 3.640625, "grad_norm_var": 0.10991109212239583, "learning_rate": 0.0001, "loss": 5.5421, "loss/crossentropy": 2.3060216903686523, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17361095547676086, "step": 8350 }, { "epoch": 0.261, "grad_norm": 3.359375, "grad_norm_var": 0.10115458170572916, "learning_rate": 0.0001, "loss": 5.9834, "loss/crossentropy": 2.5177823305130005, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.189922496676445, "step": 8352 }, { "epoch": 0.2610625, "grad_norm": 3.34375, "grad_norm_var": 0.056494140625, "learning_rate": 0.0001, "loss": 6.0121, "loss/crossentropy": 2.5313172340393066, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18987558782100677, "step": 8354 }, { "epoch": 0.261125, "grad_norm": 3.375, "grad_norm_var": 0.062939453125, "learning_rate": 0.0001, "loss": 6.2673, "loss/crossentropy": 2.6749621629714966, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19947275519371033, "step": 8356 }, { "epoch": 0.2611875, "grad_norm": 3.515625, "grad_norm_var": 0.054833984375, "learning_rate": 0.0001, "loss": 5.7834, "loss/crossentropy": 2.427981734275818, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1761658638715744, "step": 8358 }, { "epoch": 0.26125, "grad_norm": 3.640625, "grad_norm_var": 0.04914449055989583, "learning_rate": 0.0001, "loss": 6.0362, "loss/crossentropy": 2.5841875076293945, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1877766028046608, "step": 8360 }, { "epoch": 0.2613125, "grad_norm": 3.328125, "grad_norm_var": 0.04364827473958333, "learning_rate": 0.0001, "loss": 6.4257, "loss/crossentropy": 2.832598328590393, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.2011021375656128, "step": 8362 }, { "epoch": 0.261375, "grad_norm": 3.40625, "grad_norm_var": 0.047379557291666666, "learning_rate": 0.0001, "loss": 6.0373, "loss/crossentropy": 2.547619581222534, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18919818103313446, "step": 8364 }, { "epoch": 0.2614375, "grad_norm": 3.140625, "grad_norm_var": 0.039567057291666666, "learning_rate": 0.0001, "loss": 5.865, "loss/crossentropy": 2.556501626968384, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17577163130044937, "step": 8366 }, { "epoch": 0.2615, "grad_norm": 3.171875, "grad_norm_var": 0.04397379557291667, "learning_rate": 0.0001, "loss": 5.9708, "loss/crossentropy": 2.5391035079956055, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18926027417182922, "step": 8368 }, { "epoch": 0.2615625, "grad_norm": 3.15625, "grad_norm_var": 0.04334309895833333, "learning_rate": 0.0001, "loss": 6.0405, "loss/crossentropy": 2.5592786073684692, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19070356339216232, "step": 8370 }, { "epoch": 0.261625, "grad_norm": 3.28125, "grad_norm_var": 0.03657938639322917, "learning_rate": 0.0001, "loss": 5.7534, "loss/crossentropy": 2.413809657096863, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18200141936540604, "step": 8372 }, { "epoch": 0.2616875, "grad_norm": 3.328125, "grad_norm_var": 0.03467508951822917, "learning_rate": 0.0001, "loss": 6.3139, "loss/crossentropy": 2.706895589828491, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19976375997066498, "step": 8374 }, { "epoch": 0.26175, "grad_norm": 3.5, "grad_norm_var": 0.029474894205729168, "learning_rate": 0.0001, "loss": 5.9654, "loss/crossentropy": 2.599493980407715, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18073485046625137, "step": 8376 }, { "epoch": 0.2618125, "grad_norm": 3.234375, "grad_norm_var": 0.0298004150390625, "learning_rate": 0.0001, "loss": 5.7526, "loss/crossentropy": 2.4448423385620117, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18077139556407928, "step": 8378 }, { "epoch": 0.261875, "grad_norm": 3.25, "grad_norm_var": 0.0318023681640625, "learning_rate": 0.0001, "loss": 6.0073, "loss/crossentropy": 2.585577368736267, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1867080181837082, "step": 8380 }, { "epoch": 0.2619375, "grad_norm": 3.1875, "grad_norm_var": 0.03072509765625, "learning_rate": 0.0001, "loss": 5.9465, "loss/crossentropy": 2.610453486442566, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18126288056373596, "step": 8382 }, { "epoch": 0.262, "grad_norm": 3.375, "grad_norm_var": 0.0347808837890625, "learning_rate": 0.0001, "loss": 6.1832, "loss/crossentropy": 2.772537350654602, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18403807282447815, "step": 8384 }, { "epoch": 0.2620625, "grad_norm": 3.1875, "grad_norm_var": 0.033056640625, "learning_rate": 0.0001, "loss": 6.0056, "loss/crossentropy": 2.648902654647827, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18293480575084686, "step": 8386 }, { "epoch": 0.262125, "grad_norm": 3.53125, "grad_norm_var": 0.0350982666015625, "learning_rate": 0.0001, "loss": 5.7087, "loss/crossentropy": 2.3789840936660767, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1751616671681404, "step": 8388 }, { "epoch": 0.2621875, "grad_norm": 3.4375, "grad_norm_var": 0.030304972330729166, "learning_rate": 0.0001, "loss": 5.8585, "loss/crossentropy": 2.4742425680160522, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18257129937410355, "step": 8390 }, { "epoch": 0.26225, "grad_norm": 3.34375, "grad_norm_var": 0.028392537434895834, "learning_rate": 0.0001, "loss": 5.8659, "loss/crossentropy": 2.4556645154953003, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18360427021980286, "step": 8392 }, { "epoch": 0.2623125, "grad_norm": 3.1875, "grad_norm_var": 0.0290924072265625, "learning_rate": 0.0001, "loss": 6.0201, "loss/crossentropy": 2.5338116884231567, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19159843772649765, "step": 8394 }, { "epoch": 0.262375, "grad_norm": 3.609375, "grad_norm_var": 0.025764973958333333, "learning_rate": 0.0001, "loss": 6.0123, "loss/crossentropy": 2.4817099571228027, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19290197640657425, "step": 8396 }, { "epoch": 0.2624375, "grad_norm": 3.515625, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 6.006, "loss/crossentropy": 2.598557710647583, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1852778196334839, "step": 8398 }, { "epoch": 0.2625, "grad_norm": 3.515625, "grad_norm_var": 0.018619791666666666, "learning_rate": 0.0001, "loss": 6.082, "loss/crossentropy": 2.567653179168701, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18932054936885834, "step": 8400 }, { "epoch": 0.2625625, "grad_norm": 3.296875, "grad_norm_var": 0.018773396809895832, "learning_rate": 0.0001, "loss": 5.8867, "loss/crossentropy": 2.5691102743148804, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1817583665251732, "step": 8402 }, { "epoch": 0.262625, "grad_norm": 3.453125, "grad_norm_var": 0.0141998291015625, "learning_rate": 0.0001, "loss": 6.1406, "loss/crossentropy": 2.613145351409912, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19610608369112015, "step": 8404 }, { "epoch": 0.2626875, "grad_norm": 3.359375, "grad_norm_var": 0.020018513997395834, "learning_rate": 0.0001, "loss": 6.4031, "loss/crossentropy": 2.8511613607406616, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.20050807297229767, "step": 8406 }, { "epoch": 0.26275, "grad_norm": 3.46875, "grad_norm_var": 0.027904256184895834, "learning_rate": 0.0001, "loss": 5.8355, "loss/crossentropy": 2.463037610054016, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1782667636871338, "step": 8408 }, { "epoch": 0.2628125, "grad_norm": 3.25, "grad_norm_var": 0.027958170572916666, "learning_rate": 0.0001, "loss": 6.201, "loss/crossentropy": 2.721705675125122, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19090045243501663, "step": 8410 }, { "epoch": 0.262875, "grad_norm": 3.28125, "grad_norm_var": 0.031647745768229166, "learning_rate": 0.0001, "loss": 5.5934, "loss/crossentropy": 2.385154128074646, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17199848592281342, "step": 8412 }, { "epoch": 0.2629375, "grad_norm": 3.671875, "grad_norm_var": 0.03550516764322917, "learning_rate": 0.0001, "loss": 6.2027, "loss/crossentropy": 2.6594830751419067, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1953330710530281, "step": 8414 }, { "epoch": 0.263, "grad_norm": 3.03125, "grad_norm_var": 0.04192301432291667, "learning_rate": 0.0001, "loss": 5.7108, "loss/crossentropy": 2.443622589111328, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17828471958637238, "step": 8416 }, { "epoch": 0.2630625, "grad_norm": 3.734375, "grad_norm_var": 0.0504302978515625, "learning_rate": 0.0001, "loss": 6.1936, "loss/crossentropy": 2.6475489139556885, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19522744417190552, "step": 8418 }, { "epoch": 0.263125, "grad_norm": 3.5625, "grad_norm_var": 0.0546295166015625, "learning_rate": 0.0001, "loss": 5.9959, "loss/crossentropy": 2.6391217708587646, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18294747918844223, "step": 8420 }, { "epoch": 0.2631875, "grad_norm": 3.21875, "grad_norm_var": 0.0515533447265625, "learning_rate": 0.0001, "loss": 6.053, "loss/crossentropy": 2.62371289730072, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18785394728183746, "step": 8422 }, { "epoch": 0.26325, "grad_norm": 3.28125, "grad_norm_var": 0.04903971354166667, "learning_rate": 0.0001, "loss": 6.0535, "loss/crossentropy": 2.6555440425872803, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18471328914165497, "step": 8424 }, { "epoch": 0.2633125, "grad_norm": 3.171875, "grad_norm_var": 0.04951070149739583, "learning_rate": 0.0001, "loss": 5.9066, "loss/crossentropy": 2.5810846090316772, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1805986762046814, "step": 8426 }, { "epoch": 0.263375, "grad_norm": 3.0625, "grad_norm_var": 0.0529296875, "learning_rate": 0.0001, "loss": 5.9898, "loss/crossentropy": 2.6199209690093994, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17878857254981995, "step": 8428 }, { "epoch": 0.2634375, "grad_norm": 4.1875, "grad_norm_var": 0.08944905598958333, "learning_rate": 0.0001, "loss": 6.1658, "loss/crossentropy": 2.6054818630218506, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19900210201740265, "step": 8430 }, { "epoch": 0.2635, "grad_norm": 3.140625, "grad_norm_var": 0.1032135009765625, "learning_rate": 0.0001, "loss": 6.1958, "loss/crossentropy": 2.714025855064392, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19154001772403717, "step": 8432 }, { "epoch": 0.2635625, "grad_norm": 3.46875, "grad_norm_var": 0.18465169270833334, "learning_rate": 0.0001, "loss": 6.0181, "loss/crossentropy": 2.5759806632995605, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18757275491952896, "step": 8434 }, { "epoch": 0.263625, "grad_norm": 3.40625, "grad_norm_var": 0.18899332682291667, "learning_rate": 0.0001, "loss": 5.8798, "loss/crossentropy": 2.510910749435425, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18219773471355438, "step": 8436 }, { "epoch": 0.2636875, "grad_norm": 3.390625, "grad_norm_var": 0.18684794108072916, "learning_rate": 0.0001, "loss": 5.8096, "loss/crossentropy": 2.4002357721328735, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18468758463859558, "step": 8438 }, { "epoch": 0.26375, "grad_norm": 3.21875, "grad_norm_var": 0.18179931640625, "learning_rate": 0.0001, "loss": 5.8787, "loss/crossentropy": 2.54973042011261, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17860250920057297, "step": 8440 }, { "epoch": 0.2638125, "grad_norm": 3.25, "grad_norm_var": 0.17497456868489583, "learning_rate": 0.0001, "loss": 5.9215, "loss/crossentropy": 2.4767297506332397, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1886201873421669, "step": 8442 }, { "epoch": 0.263875, "grad_norm": 3.453125, "grad_norm_var": 0.3054270426432292, "learning_rate": 0.0001, "loss": 6.3098, "loss/crossentropy": 2.6348717212677, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20577538013458252, "step": 8444 }, { "epoch": 0.2639375, "grad_norm": 3.453125, "grad_norm_var": 0.28345947265625, "learning_rate": 0.0001, "loss": 6.1107, "loss/crossentropy": 2.671947479248047, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1891855150461197, "step": 8446 }, { "epoch": 0.264, "grad_norm": 3.5, "grad_norm_var": 0.2637603759765625, "learning_rate": 0.0001, "loss": 5.9479, "loss/crossentropy": 2.5242542028427124, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1892443746328354, "step": 8448 }, { "epoch": 0.2640625, "grad_norm": 3.296875, "grad_norm_var": 0.19389546712239583, "learning_rate": 0.0001, "loss": 5.9163, "loss/crossentropy": 2.5203553438186646, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18491075187921524, "step": 8450 }, { "epoch": 0.264125, "grad_norm": 3.40625, "grad_norm_var": 0.19278055826822918, "learning_rate": 0.0001, "loss": 5.8552, "loss/crossentropy": 2.513572573661804, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18298938870429993, "step": 8452 }, { "epoch": 0.2641875, "grad_norm": 3.5, "grad_norm_var": 0.19021708170572918, "learning_rate": 0.0001, "loss": 5.97, "loss/crossentropy": 2.4746936559677124, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1944495290517807, "step": 8454 }, { "epoch": 0.26425, "grad_norm": 3.78125, "grad_norm_var": 0.18772684733072917, "learning_rate": 0.0001, "loss": 6.2641, "loss/crossentropy": 2.6473084688186646, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2023027092218399, "step": 8456 }, { "epoch": 0.2643125, "grad_norm": 3.234375, "grad_norm_var": 0.18791910807291667, "learning_rate": 0.0001, "loss": 5.9522, "loss/crossentropy": 2.6063324213027954, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18067849427461624, "step": 8458 }, { "epoch": 0.264375, "grad_norm": 3.46875, "grad_norm_var": 0.02222900390625, "learning_rate": 0.0001, "loss": 5.8842, "loss/crossentropy": 2.526416301727295, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18186820298433304, "step": 8460 }, { "epoch": 0.2644375, "grad_norm": 3.59375, "grad_norm_var": 0.0293609619140625, "learning_rate": 0.0001, "loss": 5.7632, "loss/crossentropy": 2.3774088621139526, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1862344667315483, "step": 8462 }, { "epoch": 0.2645, "grad_norm": 3.28125, "grad_norm_var": 0.03349609375, "learning_rate": 0.0001, "loss": 6.1811, "loss/crossentropy": 2.741239309310913, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18265872448682785, "step": 8464 }, { "epoch": 0.2645625, "grad_norm": 3.609375, "grad_norm_var": 0.035456339518229164, "learning_rate": 0.0001, "loss": 5.8993, "loss/crossentropy": 2.5158239603042603, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18209965527057648, "step": 8466 }, { "epoch": 0.264625, "grad_norm": 3.09375, "grad_norm_var": 0.040257771809895836, "learning_rate": 0.0001, "loss": 5.747, "loss/crossentropy": 2.390213966369629, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1805977299809456, "step": 8468 }, { "epoch": 0.2646875, "grad_norm": 3.484375, "grad_norm_var": 0.0401031494140625, "learning_rate": 0.0001, "loss": 6.2295, "loss/crossentropy": 2.6879630088806152, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19361046701669693, "step": 8470 }, { "epoch": 0.26475, "grad_norm": 3.328125, "grad_norm_var": 0.030631510416666667, "learning_rate": 0.0001, "loss": 6.0514, "loss/crossentropy": 2.625853419303894, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18865232914686203, "step": 8472 }, { "epoch": 0.2648125, "grad_norm": 3.515625, "grad_norm_var": 0.03129781087239583, "learning_rate": 0.0001, "loss": 6.0974, "loss/crossentropy": 2.589285969734192, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19416602700948715, "step": 8474 }, { "epoch": 0.264875, "grad_norm": 3.296875, "grad_norm_var": 0.029996744791666665, "learning_rate": 0.0001, "loss": 5.8653, "loss/crossentropy": 2.503763794898987, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1783452183008194, "step": 8476 }, { "epoch": 0.2649375, "grad_norm": 3.4375, "grad_norm_var": 0.0326171875, "learning_rate": 0.0001, "loss": 6.2003, "loss/crossentropy": 2.6542402505874634, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19601517915725708, "step": 8478 }, { "epoch": 0.265, "grad_norm": 3.734375, "grad_norm_var": 0.048094685872395834, "learning_rate": 0.0001, "loss": 5.7677, "loss/crossentropy": 2.405746579170227, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18384867906570435, "step": 8480 }, { "epoch": 0.2650625, "grad_norm": 3.40625, "grad_norm_var": 0.04792378743489583, "learning_rate": 0.0001, "loss": 6.0084, "loss/crossentropy": 2.685820698738098, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1779606118798256, "step": 8482 }, { "epoch": 0.265125, "grad_norm": 9.75, "grad_norm_var": 2.5621327718098956, "learning_rate": 0.0001, "loss": 6.2049, "loss/crossentropy": 2.587186574935913, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.20474126935005188, "step": 8484 }, { "epoch": 0.2651875, "grad_norm": 3.734375, "grad_norm_var": 2.559129842122396, "learning_rate": 0.0001, "loss": 5.8343, "loss/crossentropy": 2.442618250846863, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1833084225654602, "step": 8486 }, { "epoch": 0.26525, "grad_norm": 3.25, "grad_norm_var": 2.552848307291667, "learning_rate": 0.0001, "loss": 6.4387, "loss/crossentropy": 2.8887436389923096, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.19327892363071442, "step": 8488 }, { "epoch": 0.2653125, "grad_norm": 3.296875, "grad_norm_var": 2.5823964436848956, "learning_rate": 0.0001, "loss": 5.7949, "loss/crossentropy": 2.453776001930237, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1833271160721779, "step": 8490 }, { "epoch": 0.265375, "grad_norm": 3.296875, "grad_norm_var": 2.581656901041667, "learning_rate": 0.0001, "loss": 6.0074, "loss/crossentropy": 2.468347191810608, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.18945540487766266, "step": 8492 }, { "epoch": 0.2654375, "grad_norm": 3.609375, "grad_norm_var": 2.5796712239583335, "learning_rate": 0.0001, "loss": 6.1462, "loss/crossentropy": 2.676905393600464, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19067463278770447, "step": 8494 }, { "epoch": 0.2655, "grad_norm": 3.65625, "grad_norm_var": 2.5346832275390625, "learning_rate": 0.0001, "loss": 6.2797, "loss/crossentropy": 2.7150908708572388, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1978655308485031, "step": 8496 }, { "epoch": 0.2655625, "grad_norm": 3.671875, "grad_norm_var": 2.5151041666666667, "learning_rate": 0.0001, "loss": 6.0652, "loss/crossentropy": 2.5959692001342773, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18989138305187225, "step": 8498 }, { "epoch": 0.265625, "grad_norm": 3.734375, "grad_norm_var": 0.04973551432291667, "learning_rate": 0.0001, "loss": 6.1024, "loss/crossentropy": 2.5476499795913696, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19609753042459488, "step": 8500 }, { "epoch": 0.2656875, "grad_norm": 4.125, "grad_norm_var": 0.07014058430989584, "learning_rate": 0.0001, "loss": 6.267, "loss/crossentropy": 2.6953723430633545, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19583825767040253, "step": 8502 }, { "epoch": 0.26575, "grad_norm": 3.125, "grad_norm_var": 0.06415913899739584, "learning_rate": 0.0001, "loss": 5.935, "loss/crossentropy": 2.5268644094467163, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1853414699435234, "step": 8504 }, { "epoch": 0.2658125, "grad_norm": 4.875, "grad_norm_var": 0.16070048014322916, "learning_rate": 0.0001, "loss": 6.208, "loss/crossentropy": 2.6290266513824463, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.20164984464645386, "step": 8506 }, { "epoch": 0.265875, "grad_norm": 3.53125, "grad_norm_var": 0.1534820556640625, "learning_rate": 0.0001, "loss": 6.1913, "loss/crossentropy": 2.6603872776031494, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19371574372053146, "step": 8508 }, { "epoch": 0.2659375, "grad_norm": 3.78125, "grad_norm_var": 0.6705963134765625, "learning_rate": 0.0001, "loss": 6.1024, "loss/crossentropy": 2.6138943433761597, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1918160319328308, "step": 8510 }, { "epoch": 0.266, "grad_norm": 3.171875, "grad_norm_var": 0.7031534830729167, "learning_rate": 0.0001, "loss": 6.0797, "loss/crossentropy": 2.6256297826766968, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1915031224489212, "step": 8512 }, { "epoch": 0.2660625, "grad_norm": 3.4375, "grad_norm_var": 0.7205729166666667, "learning_rate": 0.0001, "loss": 6.1179, "loss/crossentropy": 2.6821320056915283, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18693740665912628, "step": 8514 }, { "epoch": 0.266125, "grad_norm": 3.5, "grad_norm_var": 0.7332102457682291, "learning_rate": 0.0001, "loss": 5.5921, "loss/crossentropy": 2.170780062675476, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.17963345348834991, "step": 8516 }, { "epoch": 0.2661875, "grad_norm": 3.71875, "grad_norm_var": 0.7375396728515625, "learning_rate": 0.0001, "loss": 6.2086, "loss/crossentropy": 2.6892930269241333, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19255327433347702, "step": 8518 }, { "epoch": 0.26625, "grad_norm": 3.390625, "grad_norm_var": 0.7311808268229166, "learning_rate": 0.0001, "loss": 5.8271, "loss/crossentropy": 2.4138920307159424, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1846773475408554, "step": 8520 }, { "epoch": 0.2663125, "grad_norm": 3.5625, "grad_norm_var": 0.6611979166666667, "learning_rate": 0.0001, "loss": 5.7981, "loss/crossentropy": 2.438520908355713, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18283716589212418, "step": 8522 }, { "epoch": 0.266375, "grad_norm": 3.421875, "grad_norm_var": 0.6628743489583333, "learning_rate": 0.0001, "loss": 5.8994, "loss/crossentropy": 2.4681202173233032, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18649118393659592, "step": 8524 }, { "epoch": 0.2664375, "grad_norm": 3.765625, "grad_norm_var": 0.07416890462239584, "learning_rate": 0.0001, "loss": 6.0627, "loss/crossentropy": 2.5536561012268066, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19855762273073196, "step": 8526 }, { "epoch": 0.2665, "grad_norm": 3.734375, "grad_norm_var": 0.07022196451822917, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.3667540550231934, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1755838543176651, "step": 8528 }, { "epoch": 0.2665625, "grad_norm": 3.359375, "grad_norm_var": 0.06329752604166666, "learning_rate": 0.0001, "loss": 6.0194, "loss/crossentropy": 2.491975426673889, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19297584146261215, "step": 8530 }, { "epoch": 0.266625, "grad_norm": 3.625, "grad_norm_var": 0.08082682291666667, "learning_rate": 0.0001, "loss": 5.9617, "loss/crossentropy": 2.532854199409485, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18819521367549896, "step": 8532 }, { "epoch": 0.2666875, "grad_norm": 3.515625, "grad_norm_var": 0.07795817057291667, "learning_rate": 0.0001, "loss": 6.2831, "loss/crossentropy": 2.7350999116897583, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19385851174592972, "step": 8534 }, { "epoch": 0.26675, "grad_norm": 3.9375, "grad_norm_var": 0.6849273681640625, "learning_rate": 0.0001, "loss": 5.9346, "loss/crossentropy": 2.4325190782546997, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.1838061511516571, "step": 8536 }, { "epoch": 0.2668125, "grad_norm": 3.5625, "grad_norm_var": 0.6939849853515625, "learning_rate": 0.0001, "loss": 5.8413, "loss/crossentropy": 2.4846941232681274, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17862877994775772, "step": 8538 }, { "epoch": 0.266875, "grad_norm": 3.875, "grad_norm_var": 0.68277587890625, "learning_rate": 0.0001, "loss": 6.2013, "loss/crossentropy": 2.639085292816162, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19528042525053024, "step": 8540 }, { "epoch": 0.2669375, "grad_norm": 3.375, "grad_norm_var": 0.7116282145182292, "learning_rate": 0.0001, "loss": 6.0734, "loss/crossentropy": 2.645695686340332, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18847399204969406, "step": 8542 }, { "epoch": 0.267, "grad_norm": 3.34375, "grad_norm_var": 0.7354237874348958, "learning_rate": 0.0001, "loss": 6.0328, "loss/crossentropy": 2.622828960418701, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1851336508989334, "step": 8544 }, { "epoch": 0.2670625, "grad_norm": 3.578125, "grad_norm_var": 0.7755849202473958, "learning_rate": 0.0001, "loss": 6.0321, "loss/crossentropy": 2.5674968957901, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19177460670471191, "step": 8546 }, { "epoch": 0.267125, "grad_norm": 3.421875, "grad_norm_var": 0.7624094645182292, "learning_rate": 0.0001, "loss": 6.1345, "loss/crossentropy": 2.6102665662765503, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19578158855438232, "step": 8548 }, { "epoch": 0.2671875, "grad_norm": 3.875, "grad_norm_var": 0.7626291910807291, "learning_rate": 0.0001, "loss": 5.8333, "loss/crossentropy": 2.478430151939392, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17806804925203323, "step": 8550 }, { "epoch": 0.26725, "grad_norm": 3.421875, "grad_norm_var": 0.13746337890625, "learning_rate": 0.0001, "loss": 5.9301, "loss/crossentropy": 2.470288872718811, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18699821829795837, "step": 8552 }, { "epoch": 0.2673125, "grad_norm": 3.359375, "grad_norm_var": 0.13751627604166666, "learning_rate": 0.0001, "loss": 5.7851, "loss/crossentropy": 2.3833465576171875, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18158571422100067, "step": 8554 }, { "epoch": 0.267375, "grad_norm": 3.5, "grad_norm_var": 0.13147684733072917, "learning_rate": 0.0001, "loss": 5.8517, "loss/crossentropy": 2.461923360824585, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18311707675457, "step": 8556 }, { "epoch": 0.2674375, "grad_norm": 3.203125, "grad_norm_var": 0.13958333333333334, "learning_rate": 0.0001, "loss": 5.9528, "loss/crossentropy": 2.553870439529419, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18715743720531464, "step": 8558 }, { "epoch": 0.2675, "grad_norm": 3.515625, "grad_norm_var": 0.13875223795572916, "learning_rate": 0.0001, "loss": 5.6426, "loss/crossentropy": 2.277732491493225, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17672587931156158, "step": 8560 }, { "epoch": 0.2675625, "grad_norm": 3.125, "grad_norm_var": 0.0509674072265625, "learning_rate": 0.0001, "loss": 6.0759, "loss/crossentropy": 2.677735924720764, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18357118964195251, "step": 8562 }, { "epoch": 0.267625, "grad_norm": 3.34375, "grad_norm_var": 0.24551493326822918, "learning_rate": 0.0001, "loss": 6.1003, "loss/crossentropy": 2.658802032470703, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1875048652291298, "step": 8564 }, { "epoch": 0.2676875, "grad_norm": 3.3125, "grad_norm_var": 0.23691304524739584, "learning_rate": 0.0001, "loss": 6.104, "loss/crossentropy": 2.683103084564209, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18818234652280807, "step": 8566 }, { "epoch": 0.26775, "grad_norm": 4.0625, "grad_norm_var": 0.260107421875, "learning_rate": 0.0001, "loss": 5.8088, "loss/crossentropy": 2.433120369911194, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18366563320159912, "step": 8568 }, { "epoch": 0.2678125, "grad_norm": 3.46875, "grad_norm_var": 0.26452534993489585, "learning_rate": 0.0001, "loss": 6.098, "loss/crossentropy": 2.537542939186096, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19158834964036942, "step": 8570 }, { "epoch": 0.267875, "grad_norm": 3.5, "grad_norm_var": 0.26738179524739586, "learning_rate": 0.0001, "loss": 6.4872, "loss/crossentropy": 2.7899457216262817, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20918139815330505, "step": 8572 }, { "epoch": 0.2679375, "grad_norm": 3.265625, "grad_norm_var": 0.2688385009765625, "learning_rate": 0.0001, "loss": 5.6195, "loss/crossentropy": 2.3636358976364136, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17441652715206146, "step": 8574 }, { "epoch": 0.268, "grad_norm": 3.90625, "grad_norm_var": 0.26971028645833334, "learning_rate": 0.0001, "loss": 5.9691, "loss/crossentropy": 2.4868744611740112, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18845638632774353, "step": 8576 }, { "epoch": 0.2680625, "grad_norm": 3.515625, "grad_norm_var": 0.8212076822916666, "learning_rate": 0.0001, "loss": 6.2685, "loss/crossentropy": 2.60793399810791, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20472849905490875, "step": 8578 }, { "epoch": 0.268125, "grad_norm": 3.234375, "grad_norm_var": 0.70054931640625, "learning_rate": 0.0001, "loss": 6.0464, "loss/crossentropy": 2.6345239877700806, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18219953030347824, "step": 8580 }, { "epoch": 0.2681875, "grad_norm": 3.453125, "grad_norm_var": 1.0117177327473958, "learning_rate": 0.0001, "loss": 6.4786, "loss/crossentropy": 2.73080575466156, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.211110420525074, "step": 8582 }, { "epoch": 0.26825, "grad_norm": 3.78125, "grad_norm_var": 1.0106404622395833, "learning_rate": 0.0001, "loss": 6.2299, "loss/crossentropy": 2.610520124435425, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19982445240020752, "step": 8584 }, { "epoch": 0.2683125, "grad_norm": 3.484375, "grad_norm_var": 1.0197499593098958, "learning_rate": 0.0001, "loss": 6.3412, "loss/crossentropy": 2.7998993396759033, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1978771984577179, "step": 8586 }, { "epoch": 0.268375, "grad_norm": 3.796875, "grad_norm_var": 1.0201812744140626, "learning_rate": 0.0001, "loss": 5.8564, "loss/crossentropy": 2.38976788520813, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18689440190792084, "step": 8588 }, { "epoch": 0.2684375, "grad_norm": 3.3125, "grad_norm_var": 0.9783162434895833, "learning_rate": 0.0001, "loss": 6.1383, "loss/crossentropy": 2.5660598278045654, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1982373148202896, "step": 8590 }, { "epoch": 0.2685, "grad_norm": 3.34375, "grad_norm_var": 1.0193644205729167, "learning_rate": 0.0001, "loss": 5.9395, "loss/crossentropy": 2.569070339202881, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18196505308151245, "step": 8592 }, { "epoch": 0.2685625, "grad_norm": 3.328125, "grad_norm_var": 0.485791015625, "learning_rate": 0.0001, "loss": 6.0061, "loss/crossentropy": 2.6035884618759155, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18477965891361237, "step": 8594 }, { "epoch": 0.268625, "grad_norm": 3.765625, "grad_norm_var": 0.4724609375, "learning_rate": 0.0001, "loss": 6.2822, "loss/crossentropy": 2.7133415937423706, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19594413042068481, "step": 8596 }, { "epoch": 0.2686875, "grad_norm": 3.234375, "grad_norm_var": 0.0524810791015625, "learning_rate": 0.0001, "loss": 5.9493, "loss/crossentropy": 2.4889795780181885, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18743963539600372, "step": 8598 }, { "epoch": 0.26875, "grad_norm": 3.375, "grad_norm_var": 0.0525299072265625, "learning_rate": 0.0001, "loss": 6.0522, "loss/crossentropy": 2.6539541482925415, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18514089286327362, "step": 8600 }, { "epoch": 0.2688125, "grad_norm": 3.234375, "grad_norm_var": 0.05427958170572917, "learning_rate": 0.0001, "loss": 5.7099, "loss/crossentropy": 2.2864630222320557, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.188829205930233, "step": 8602 }, { "epoch": 0.268875, "grad_norm": 3.109375, "grad_norm_var": 0.06525065104166666, "learning_rate": 0.0001, "loss": 6.2145, "loss/crossentropy": 2.717995524406433, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19378919154405594, "step": 8604 }, { "epoch": 0.2689375, "grad_norm": 3.125, "grad_norm_var": 0.0459625244140625, "learning_rate": 0.0001, "loss": 5.758, "loss/crossentropy": 2.4573293924331665, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17771954834461212, "step": 8606 }, { "epoch": 0.269, "grad_norm": 3.578125, "grad_norm_var": 0.05129801432291667, "learning_rate": 0.0001, "loss": 5.9259, "loss/crossentropy": 2.485554337501526, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18817304819822311, "step": 8608 }, { "epoch": 0.2690625, "grad_norm": 3.390625, "grad_norm_var": 0.07372639973958334, "learning_rate": 0.0001, "loss": 6.209, "loss/crossentropy": 2.6714975833892822, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19632639735937119, "step": 8610 }, { "epoch": 0.269125, "grad_norm": 3.046875, "grad_norm_var": 0.07276102701822916, "learning_rate": 0.0001, "loss": 5.8628, "loss/crossentropy": 2.6209064722061157, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17301487177610397, "step": 8612 }, { "epoch": 0.2691875, "grad_norm": 3.828125, "grad_norm_var": 0.09257405598958333, "learning_rate": 0.0001, "loss": 6.4746, "loss/crossentropy": 2.7915139198303223, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.20659378916025162, "step": 8614 }, { "epoch": 0.26925, "grad_norm": 3.5625, "grad_norm_var": 0.10926005045572916, "learning_rate": 0.0001, "loss": 6.1453, "loss/crossentropy": 2.718366861343384, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18370791524648666, "step": 8616 }, { "epoch": 0.2693125, "grad_norm": 4.3125, "grad_norm_var": 0.144970703125, "learning_rate": 0.0001, "loss": 6.0593, "loss/crossentropy": 2.50391161441803, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19264744967222214, "step": 8618 }, { "epoch": 0.269375, "grad_norm": 3.109375, "grad_norm_var": 0.13963114420572917, "learning_rate": 0.0001, "loss": 5.8813, "loss/crossentropy": 2.5312806367874146, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18343793600797653, "step": 8620 }, { "epoch": 0.2694375, "grad_norm": 3.421875, "grad_norm_var": 0.12897135416666666, "learning_rate": 0.0001, "loss": 6.264, "loss/crossentropy": 2.7507938146591187, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1954570934176445, "step": 8622 }, { "epoch": 0.2695, "grad_norm": 3.234375, "grad_norm_var": 0.1293853759765625, "learning_rate": 0.0001, "loss": 5.8537, "loss/crossentropy": 2.525124430656433, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1809079274535179, "step": 8624 }, { "epoch": 0.2695625, "grad_norm": 3.234375, "grad_norm_var": 0.12226460774739584, "learning_rate": 0.0001, "loss": 6.1066, "loss/crossentropy": 2.6248010396957397, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1934967190027237, "step": 8626 }, { "epoch": 0.269625, "grad_norm": 3.34375, "grad_norm_var": 0.10802408854166666, "learning_rate": 0.0001, "loss": 6.0722, "loss/crossentropy": 2.6180055141448975, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18917328119277954, "step": 8628 }, { "epoch": 0.2696875, "grad_norm": 3.421875, "grad_norm_var": 0.10091044108072916, "learning_rate": 0.0001, "loss": 5.994, "loss/crossentropy": 2.541751265525818, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18936219811439514, "step": 8630 }, { "epoch": 0.26975, "grad_norm": 3.46875, "grad_norm_var": 0.0783843994140625, "learning_rate": 0.0001, "loss": 6.3035, "loss/crossentropy": 2.8230782747268677, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18983669579029083, "step": 8632 }, { "epoch": 0.2698125, "grad_norm": 3.796875, "grad_norm_var": 0.0278717041015625, "learning_rate": 0.0001, "loss": 6.2003, "loss/crossentropy": 2.559078574180603, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2012312412261963, "step": 8634 }, { "epoch": 0.269875, "grad_norm": 3.453125, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 5.8106, "loss/crossentropy": 2.497360348701477, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17976141721010208, "step": 8636 }, { "epoch": 0.2699375, "grad_norm": 3.65625, "grad_norm_var": 0.0294830322265625, "learning_rate": 0.0001, "loss": 6.3131, "loss/crossentropy": 2.727493166923523, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1984044834971428, "step": 8638 }, { "epoch": 0.27, "grad_norm": 3.25, "grad_norm_var": 0.0282379150390625, "learning_rate": 0.0001, "loss": 5.8564, "loss/crossentropy": 2.4807841777801514, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1840466484427452, "step": 8640 }, { "epoch": 0.2700625, "grad_norm": 3.25, "grad_norm_var": 0.024470011393229168, "learning_rate": 0.0001, "loss": 5.8187, "loss/crossentropy": 2.4412097930908203, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18267560750246048, "step": 8642 }, { "epoch": 0.270125, "grad_norm": 3.703125, "grad_norm_var": 0.03683268229166667, "learning_rate": 0.0001, "loss": 6.5103, "loss/crossentropy": 2.903472065925598, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20131144672632217, "step": 8644 }, { "epoch": 0.2701875, "grad_norm": 3.21875, "grad_norm_var": 0.04053446451822917, "learning_rate": 0.0001, "loss": 6.2487, "loss/crossentropy": 2.7037285566329956, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18965797871351242, "step": 8646 }, { "epoch": 0.27025, "grad_norm": 3.203125, "grad_norm_var": 0.04619038899739583, "learning_rate": 0.0001, "loss": 5.8646, "loss/crossentropy": 2.4815542697906494, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18361923843622208, "step": 8648 }, { "epoch": 0.2703125, "grad_norm": 3.859375, "grad_norm_var": 0.0555572509765625, "learning_rate": 0.0001, "loss": 5.7107, "loss/crossentropy": 2.400591015815735, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17944909632205963, "step": 8650 }, { "epoch": 0.270375, "grad_norm": 3.5625, "grad_norm_var": 0.05408528645833333, "learning_rate": 0.0001, "loss": 6.0948, "loss/crossentropy": 2.658158540725708, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1862383633852005, "step": 8652 }, { "epoch": 0.2704375, "grad_norm": 3.234375, "grad_norm_var": 0.052144368489583336, "learning_rate": 0.0001, "loss": 5.8797, "loss/crossentropy": 2.499776244163513, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18056650459766388, "step": 8654 }, { "epoch": 0.2705, "grad_norm": 3.609375, "grad_norm_var": 0.05791015625, "learning_rate": 0.0001, "loss": 6.1491, "loss/crossentropy": 2.69377064704895, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18849724531173706, "step": 8656 }, { "epoch": 0.2705625, "grad_norm": 3.296875, "grad_norm_var": 0.057047526041666664, "learning_rate": 0.0001, "loss": 6.3627, "loss/crossentropy": 2.8331875801086426, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19318635761737823, "step": 8658 }, { "epoch": 0.270625, "grad_norm": 3.265625, "grad_norm_var": 0.04064127604166667, "learning_rate": 0.0001, "loss": 6.2608, "loss/crossentropy": 2.7417465448379517, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19448032975196838, "step": 8660 }, { "epoch": 0.2706875, "grad_norm": 3.4375, "grad_norm_var": 0.03798828125, "learning_rate": 0.0001, "loss": 6.0342, "loss/crossentropy": 2.577946186065674, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19172074645757675, "step": 8662 }, { "epoch": 0.27075, "grad_norm": 3.234375, "grad_norm_var": 0.035868326822916664, "learning_rate": 0.0001, "loss": 5.8113, "loss/crossentropy": 2.4585071802139282, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18137074261903763, "step": 8664 }, { "epoch": 0.2708125, "grad_norm": 3.03125, "grad_norm_var": 0.025972493489583335, "learning_rate": 0.0001, "loss": 5.7591, "loss/crossentropy": 2.4618247747421265, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1773819923400879, "step": 8666 }, { "epoch": 0.270875, "grad_norm": 3.625, "grad_norm_var": 0.028490193684895835, "learning_rate": 0.0001, "loss": 6.1608, "loss/crossentropy": 2.6863608360290527, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18884887546300888, "step": 8668 }, { "epoch": 0.2709375, "grad_norm": 3.5, "grad_norm_var": 0.02926025390625, "learning_rate": 0.0001, "loss": 5.778, "loss/crossentropy": 2.4676570892333984, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17751803994178772, "step": 8670 }, { "epoch": 0.271, "grad_norm": 3.1875, "grad_norm_var": 0.02369384765625, "learning_rate": 0.0001, "loss": 6.1098, "loss/crossentropy": 2.6976585388183594, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18300720304250717, "step": 8672 }, { "epoch": 0.2710625, "grad_norm": 3.625, "grad_norm_var": 0.0282867431640625, "learning_rate": 0.0001, "loss": 6.1249, "loss/crossentropy": 2.682486414909363, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19112010300159454, "step": 8674 }, { "epoch": 0.271125, "grad_norm": 3.78125, "grad_norm_var": 0.08658447265625, "learning_rate": 0.0001, "loss": 6.3226, "loss/crossentropy": 2.64150333404541, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20756685733795166, "step": 8676 }, { "epoch": 0.2711875, "grad_norm": 3.5625, "grad_norm_var": 0.08772786458333333, "learning_rate": 0.0001, "loss": 6.1881, "loss/crossentropy": 2.605919599533081, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19922953099012375, "step": 8678 }, { "epoch": 0.27125, "grad_norm": 6.375, "grad_norm_var": 0.6093251546223958, "learning_rate": 0.0001, "loss": 6.0575, "loss/crossentropy": 2.4938093423843384, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19347576051950455, "step": 8680 }, { "epoch": 0.2713125, "grad_norm": 3.625, "grad_norm_var": 0.5719716389973958, "learning_rate": 0.0001, "loss": 6.2066, "loss/crossentropy": 2.7196247577667236, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18736734986305237, "step": 8682 }, { "epoch": 0.271375, "grad_norm": 3.984375, "grad_norm_var": 0.561279296875, "learning_rate": 0.0001, "loss": 6.3675, "loss/crossentropy": 2.715699076652527, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20502400398254395, "step": 8684 }, { "epoch": 0.2714375, "grad_norm": 3.484375, "grad_norm_var": 0.5424550374348959, "learning_rate": 0.0001, "loss": 6.0175, "loss/crossentropy": 2.6258652210235596, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1836940422654152, "step": 8686 }, { "epoch": 0.2715, "grad_norm": 3.65625, "grad_norm_var": 0.530419921875, "learning_rate": 0.0001, "loss": 5.9491, "loss/crossentropy": 2.454306483268738, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19049323350191116, "step": 8688 }, { "epoch": 0.2715625, "grad_norm": 3.734375, "grad_norm_var": 0.5486979166666667, "learning_rate": 0.0001, "loss": 6.4043, "loss/crossentropy": 2.7677533626556396, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2042810544371605, "step": 8690 }, { "epoch": 0.271625, "grad_norm": 3.5625, "grad_norm_var": 0.5486165364583333, "learning_rate": 0.0001, "loss": 5.8867, "loss/crossentropy": 2.4385541677474976, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18543724715709686, "step": 8692 }, { "epoch": 0.2716875, "grad_norm": 4.125, "grad_norm_var": 0.5405588785807292, "learning_rate": 0.0001, "loss": 6.4892, "loss/crossentropy": 2.7224197387695312, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.20988348871469498, "step": 8694 }, { "epoch": 0.27175, "grad_norm": 3.6875, "grad_norm_var": 0.07668863932291667, "learning_rate": 0.0001, "loss": 6.1692, "loss/crossentropy": 2.6610034704208374, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18948668986558914, "step": 8696 }, { "epoch": 0.2718125, "grad_norm": 3.609375, "grad_norm_var": 0.07870686848958333, "learning_rate": 0.0001, "loss": 6.0613, "loss/crossentropy": 2.5653761625289917, "loss/hidden": 1.6171875, "loss/jsd": 0.0, "loss/logits": 0.1878705695271492, "step": 8698 }, { "epoch": 0.271875, "grad_norm": 3.546875, "grad_norm_var": 0.0612213134765625, "learning_rate": 0.0001, "loss": 5.9579, "loss/crossentropy": 2.5688871145248413, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18382221460342407, "step": 8700 }, { "epoch": 0.2719375, "grad_norm": 3.28125, "grad_norm_var": 0.05758056640625, "learning_rate": 0.0001, "loss": 5.8379, "loss/crossentropy": 2.452753782272339, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18109457194805145, "step": 8702 }, { "epoch": 0.272, "grad_norm": 3.3125, "grad_norm_var": 0.05315348307291667, "learning_rate": 0.0001, "loss": 5.9339, "loss/crossentropy": 2.5359824895858765, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18471135199069977, "step": 8704 }, { "epoch": 0.2720625, "grad_norm": 3.34375, "grad_norm_var": 0.04827067057291667, "learning_rate": 0.0001, "loss": 5.9526, "loss/crossentropy": 2.540789246559143, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18844721466302872, "step": 8706 }, { "epoch": 0.272125, "grad_norm": 3.671875, "grad_norm_var": 0.048949178059895834, "learning_rate": 0.0001, "loss": 6.377, "loss/crossentropy": 2.760253071784973, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20386086404323578, "step": 8708 }, { "epoch": 0.2721875, "grad_norm": 3.171875, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 5.8678, "loss/crossentropy": 2.484999179840088, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18554507941007614, "step": 8710 }, { "epoch": 0.27225, "grad_norm": 3.375, "grad_norm_var": 0.018375651041666666, "learning_rate": 0.0001, "loss": 5.7152, "loss/crossentropy": 2.4064172506332397, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17736196517944336, "step": 8712 }, { "epoch": 0.2723125, "grad_norm": 3.328125, "grad_norm_var": 0.016304524739583333, "learning_rate": 0.0001, "loss": 6.194, "loss/crossentropy": 2.6760975122451782, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1951456367969513, "step": 8714 }, { "epoch": 0.272375, "grad_norm": 3.3125, "grad_norm_var": 0.015327962239583333, "learning_rate": 0.0001, "loss": 6.0213, "loss/crossentropy": 2.6049975156784058, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18694640696048737, "step": 8716 }, { "epoch": 0.2724375, "grad_norm": 3.53125, "grad_norm_var": 0.01611328125, "learning_rate": 0.0001, "loss": 6.2175, "loss/crossentropy": 2.626902222633362, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.19265294820070267, "step": 8718 }, { "epoch": 0.2725, "grad_norm": 3.296875, "grad_norm_var": 0.019466145833333334, "learning_rate": 0.0001, "loss": 6.3469, "loss/crossentropy": 2.8148353099823, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1957806497812271, "step": 8720 }, { "epoch": 0.2725625, "grad_norm": 3.328125, "grad_norm_var": 0.020319620768229168, "learning_rate": 0.0001, "loss": 5.6954, "loss/crossentropy": 2.363893508911133, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17806947231292725, "step": 8722 }, { "epoch": 0.272625, "grad_norm": 3.359375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 5.9333, "loss/crossentropy": 2.547742247581482, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18308507651090622, "step": 8724 }, { "epoch": 0.2726875, "grad_norm": 3.8125, "grad_norm_var": 0.030744425455729165, "learning_rate": 0.0001, "loss": 6.2602, "loss/crossentropy": 2.6908657550811768, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19951613247394562, "step": 8726 }, { "epoch": 0.27275, "grad_norm": 3.5, "grad_norm_var": 0.029955037434895835, "learning_rate": 0.0001, "loss": 6.0924, "loss/crossentropy": 2.683358073234558, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18660413473844528, "step": 8728 }, { "epoch": 0.2728125, "grad_norm": 3.921875, "grad_norm_var": 0.04309794108072917, "learning_rate": 0.0001, "loss": 6.1644, "loss/crossentropy": 2.6175061464309692, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1972716897726059, "step": 8730 }, { "epoch": 0.272875, "grad_norm": 4.09375, "grad_norm_var": 0.06542561848958334, "learning_rate": 0.0001, "loss": 6.2474, "loss/crossentropy": 2.5593132972717285, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.2043544426560402, "step": 8732 }, { "epoch": 0.2729375, "grad_norm": 3.640625, "grad_norm_var": 0.06306864420572916, "learning_rate": 0.0001, "loss": 6.2188, "loss/crossentropy": 2.6943942308425903, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1946251168847084, "step": 8734 }, { "epoch": 0.273, "grad_norm": 3.359375, "grad_norm_var": 0.0703125, "learning_rate": 0.0001, "loss": 5.8123, "loss/crossentropy": 2.492477774620056, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17495298385620117, "step": 8736 }, { "epoch": 0.2730625, "grad_norm": 3.890625, "grad_norm_var": 0.06643778483072917, "learning_rate": 0.0001, "loss": 6.3913, "loss/crossentropy": 2.8301221132278442, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19869162142276764, "step": 8738 }, { "epoch": 0.273125, "grad_norm": 3.671875, "grad_norm_var": 0.0551177978515625, "learning_rate": 0.0001, "loss": 6.3365, "loss/crossentropy": 2.775020718574524, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19950658828020096, "step": 8740 }, { "epoch": 0.2731875, "grad_norm": 3.140625, "grad_norm_var": 0.06675516764322917, "learning_rate": 0.0001, "loss": 5.8037, "loss/crossentropy": 2.5151455402374268, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1757265403866768, "step": 8742 }, { "epoch": 0.27325, "grad_norm": 3.203125, "grad_norm_var": 0.1047027587890625, "learning_rate": 0.0001, "loss": 5.8188, "loss/crossentropy": 2.563611149787903, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17200451344251633, "step": 8744 }, { "epoch": 0.2733125, "grad_norm": 3.40625, "grad_norm_var": 0.1, "learning_rate": 0.0001, "loss": 6.1039, "loss/crossentropy": 2.6742992401123047, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18905840814113617, "step": 8746 }, { "epoch": 0.273375, "grad_norm": 3.5, "grad_norm_var": 0.070947265625, "learning_rate": 0.0001, "loss": 5.9399, "loss/crossentropy": 2.424111247062683, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19102928042411804, "step": 8748 }, { "epoch": 0.2734375, "grad_norm": 3.640625, "grad_norm_var": 0.0660797119140625, "learning_rate": 0.0001, "loss": 6.22, "loss/crossentropy": 2.6243419647216797, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.20409469306468964, "step": 8750 }, { "epoch": 0.2735, "grad_norm": 3.71875, "grad_norm_var": 0.0675201416015625, "learning_rate": 0.0001, "loss": 6.0798, "loss/crossentropy": 2.552013397216797, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19223305583000183, "step": 8752 }, { "epoch": 0.2735625, "grad_norm": 3.5, "grad_norm_var": 0.057103474934895836, "learning_rate": 0.0001, "loss": 6.2181, "loss/crossentropy": 2.641934394836426, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19745796918869019, "step": 8754 }, { "epoch": 0.273625, "grad_norm": 3.390625, "grad_norm_var": 0.052652994791666664, "learning_rate": 0.0001, "loss": 6.222, "loss/crossentropy": 2.711361527442932, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19051231443881989, "step": 8756 }, { "epoch": 0.2736875, "grad_norm": 3.421875, "grad_norm_var": 0.0475982666015625, "learning_rate": 0.0001, "loss": 6.0285, "loss/crossentropy": 2.663419246673584, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18182026594877243, "step": 8758 }, { "epoch": 0.27375, "grad_norm": 4.65625, "grad_norm_var": 0.10269775390625, "learning_rate": 0.0001, "loss": 6.1043, "loss/crossentropy": 2.649004578590393, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18811038881540298, "step": 8760 }, { "epoch": 0.2738125, "grad_norm": 3.4375, "grad_norm_var": 0.0944732666015625, "learning_rate": 0.0001, "loss": 6.08, "loss/crossentropy": 2.5890932083129883, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1920555904507637, "step": 8762 }, { "epoch": 0.273875, "grad_norm": 3.46875, "grad_norm_var": 0.097216796875, "learning_rate": 0.0001, "loss": 6.2831, "loss/crossentropy": 2.7520138025283813, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19490370899438858, "step": 8764 }, { "epoch": 0.2739375, "grad_norm": 3.390625, "grad_norm_var": 0.10485026041666666, "learning_rate": 0.0001, "loss": 5.9451, "loss/crossentropy": 2.5493834018707275, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18566906452178955, "step": 8766 }, { "epoch": 0.274, "grad_norm": 3.625, "grad_norm_var": 0.10478108723958333, "learning_rate": 0.0001, "loss": 6.0873, "loss/crossentropy": 2.5950188636779785, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19141746312379837, "step": 8768 }, { "epoch": 0.2740625, "grad_norm": 3.765625, "grad_norm_var": 0.1172027587890625, "learning_rate": 0.0001, "loss": 6.4435, "loss/crossentropy": 2.8003259897232056, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.19986136257648468, "step": 8770 }, { "epoch": 0.274125, "grad_norm": 3.328125, "grad_norm_var": 0.12132059733072917, "learning_rate": 0.0001, "loss": 5.8654, "loss/crossentropy": 2.447899103164673, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18784713745117188, "step": 8772 }, { "epoch": 0.2741875, "grad_norm": 3.4375, "grad_norm_var": 0.438037109375, "learning_rate": 0.0001, "loss": 6.2964, "loss/crossentropy": 2.7445706129074097, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19385666400194168, "step": 8774 }, { "epoch": 0.27425, "grad_norm": 3.53125, "grad_norm_var": 0.3990559895833333, "learning_rate": 0.0001, "loss": 5.6377, "loss/crossentropy": 2.3796085119247437, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17463870346546173, "step": 8776 }, { "epoch": 0.2743125, "grad_norm": 3.484375, "grad_norm_var": 0.40067952473958335, "learning_rate": 0.0001, "loss": 5.7103, "loss/crossentropy": 2.4123687744140625, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17666452378034592, "step": 8778 }, { "epoch": 0.274375, "grad_norm": 3.6875, "grad_norm_var": 0.404345703125, "learning_rate": 0.0001, "loss": 6.1997, "loss/crossentropy": 2.716852307319641, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19359584152698517, "step": 8780 }, { "epoch": 0.2744375, "grad_norm": 2.921875, "grad_norm_var": 0.42649637858072914, "learning_rate": 0.0001, "loss": 6.1726, "loss/crossentropy": 2.763651132583618, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1865966022014618, "step": 8782 }, { "epoch": 0.2745, "grad_norm": 3.0625, "grad_norm_var": 0.44247639973958336, "learning_rate": 0.0001, "loss": 5.8291, "loss/crossentropy": 2.481606960296631, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18435516953468323, "step": 8784 }, { "epoch": 0.2745625, "grad_norm": 3.21875, "grad_norm_var": 0.43795572916666664, "learning_rate": 0.0001, "loss": 5.8875, "loss/crossentropy": 2.5274109840393066, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18053767830133438, "step": 8786 }, { "epoch": 0.274625, "grad_norm": 3.28125, "grad_norm_var": 0.43502197265625, "learning_rate": 0.0001, "loss": 6.3651, "loss/crossentropy": 2.8572980165481567, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19218553602695465, "step": 8788 }, { "epoch": 0.2746875, "grad_norm": 3.421875, "grad_norm_var": 0.06050516764322917, "learning_rate": 0.0001, "loss": 5.8025, "loss/crossentropy": 2.4208855628967285, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1799629107117653, "step": 8790 }, { "epoch": 0.27475, "grad_norm": 3.828125, "grad_norm_var": 0.06599833170572916, "learning_rate": 0.0001, "loss": 5.9873, "loss/crossentropy": 2.390637755393982, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.19404463469982147, "step": 8792 }, { "epoch": 0.2748125, "grad_norm": 3.421875, "grad_norm_var": 0.06614481608072917, "learning_rate": 0.0001, "loss": 6.2452, "loss/crossentropy": 2.7461918592453003, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19520880281925201, "step": 8794 }, { "epoch": 0.274875, "grad_norm": 3.421875, "grad_norm_var": 0.059794108072916664, "learning_rate": 0.0001, "loss": 6.1468, "loss/crossentropy": 2.60414457321167, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1945016235113144, "step": 8796 }, { "epoch": 0.2749375, "grad_norm": 3.109375, "grad_norm_var": 0.06798502604166666, "learning_rate": 0.0001, "loss": 5.2521, "loss/crossentropy": 2.1276236176490784, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16166242957115173, "step": 8798 }, { "epoch": 0.275, "grad_norm": 3.265625, "grad_norm_var": 0.06199442545572917, "learning_rate": 0.0001, "loss": 5.9335, "loss/crossentropy": 2.6193655729293823, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17672650516033173, "step": 8800 }, { "epoch": 0.2750625, "grad_norm": 3.671875, "grad_norm_var": 0.0619140625, "learning_rate": 0.0001, "loss": 5.8253, "loss/crossentropy": 2.409619092941284, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18688050657510757, "step": 8802 }, { "epoch": 0.275125, "grad_norm": 3.328125, "grad_norm_var": 0.0607818603515625, "learning_rate": 0.0001, "loss": 5.8032, "loss/crossentropy": 2.3988406658172607, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18301154673099518, "step": 8804 }, { "epoch": 0.2751875, "grad_norm": 3.3125, "grad_norm_var": 0.05152079264322917, "learning_rate": 0.0001, "loss": 5.8565, "loss/crossentropy": 2.544641375541687, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17649392038583755, "step": 8806 }, { "epoch": 0.27525, "grad_norm": 3.203125, "grad_norm_var": 0.043578084309895834, "learning_rate": 0.0001, "loss": 5.813, "loss/crossentropy": 2.5152865648269653, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17820528894662857, "step": 8808 }, { "epoch": 0.2753125, "grad_norm": 3.15625, "grad_norm_var": 0.04828999837239583, "learning_rate": 0.0001, "loss": 5.7696, "loss/crossentropy": 2.489772319793701, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17680658400058746, "step": 8810 }, { "epoch": 0.275375, "grad_norm": 3.59375, "grad_norm_var": 0.051301066080729166, "learning_rate": 0.0001, "loss": 6.1193, "loss/crossentropy": 2.623705267906189, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18940023332834244, "step": 8812 }, { "epoch": 0.2754375, "grad_norm": 3.4375, "grad_norm_var": 0.024609375, "learning_rate": 0.0001, "loss": 6.0475, "loss/crossentropy": 2.624902129173279, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18718025833368301, "step": 8814 }, { "epoch": 0.2755, "grad_norm": 3.453125, "grad_norm_var": 0.034440104166666666, "learning_rate": 0.0001, "loss": 5.7325, "loss/crossentropy": 2.4753358364105225, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17297816276550293, "step": 8816 }, { "epoch": 0.2755625, "grad_norm": 3.421875, "grad_norm_var": 0.028645833333333332, "learning_rate": 0.0001, "loss": 5.5818, "loss/crossentropy": 2.259931802749634, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1712503433227539, "step": 8818 }, { "epoch": 0.275625, "grad_norm": 3.34375, "grad_norm_var": 0.027391560872395835, "learning_rate": 0.0001, "loss": 5.6535, "loss/crossentropy": 2.3533644676208496, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1788463592529297, "step": 8820 }, { "epoch": 0.2756875, "grad_norm": 3.546875, "grad_norm_var": 0.045308430989583336, "learning_rate": 0.0001, "loss": 6.2962, "loss/crossentropy": 2.6124351024627686, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.20549017190933228, "step": 8822 }, { "epoch": 0.27575, "grad_norm": 3.234375, "grad_norm_var": 0.043732706705729166, "learning_rate": 0.0001, "loss": 6.3325, "loss/crossentropy": 2.8760937452316284, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1901729702949524, "step": 8824 }, { "epoch": 0.2758125, "grad_norm": 7.46875, "grad_norm_var": 1.0603678385416666, "learning_rate": 0.0001, "loss": 6.0266, "loss/crossentropy": 2.5222177505493164, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.191059410572052, "step": 8826 }, { "epoch": 0.275875, "grad_norm": 3.515625, "grad_norm_var": 1.0563140869140626, "learning_rate": 0.0001, "loss": 6.0066, "loss/crossentropy": 2.600649833679199, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18512310087680817, "step": 8828 }, { "epoch": 0.2759375, "grad_norm": 3.671875, "grad_norm_var": 1.0502675374348958, "learning_rate": 0.0001, "loss": 6.071, "loss/crossentropy": 2.4670032262802124, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.20375903695821762, "step": 8830 }, { "epoch": 0.276, "grad_norm": 3.265625, "grad_norm_var": 1.0273834228515626, "learning_rate": 0.0001, "loss": 5.8144, "loss/crossentropy": 2.4422956705093384, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.183305524289608, "step": 8832 }, { "epoch": 0.2760625, "grad_norm": 3.265625, "grad_norm_var": 1.03642578125, "learning_rate": 0.0001, "loss": 6.1734, "loss/crossentropy": 2.7296916246414185, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1869504600763321, "step": 8834 }, { "epoch": 0.276125, "grad_norm": 3.53125, "grad_norm_var": 1.035163370768229, "learning_rate": 0.0001, "loss": 5.7405, "loss/crossentropy": 2.4141600131988525, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.179900124669075, "step": 8836 }, { "epoch": 0.2761875, "grad_norm": 3.296875, "grad_norm_var": 1.0578277587890625, "learning_rate": 0.0001, "loss": 5.9823, "loss/crossentropy": 2.6309428215026855, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18005498498678207, "step": 8838 }, { "epoch": 0.27625, "grad_norm": 3.734375, "grad_norm_var": 1.0456451416015624, "learning_rate": 0.0001, "loss": 6.1293, "loss/crossentropy": 2.5688467025756836, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19628392159938812, "step": 8840 }, { "epoch": 0.2763125, "grad_norm": 3.375, "grad_norm_var": 0.04659830729166667, "learning_rate": 0.0001, "loss": 6.0435, "loss/crossentropy": 2.640480637550354, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18639644235372543, "step": 8842 }, { "epoch": 0.276375, "grad_norm": 3.390625, "grad_norm_var": 0.047200520833333336, "learning_rate": 0.0001, "loss": 6.2727, "loss/crossentropy": 2.7400788068771362, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19193562865257263, "step": 8844 }, { "epoch": 0.2764375, "grad_norm": 3.46875, "grad_norm_var": 0.0207916259765625, "learning_rate": 0.0001, "loss": 6.1126, "loss/crossentropy": 2.6935667991638184, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18643878400325775, "step": 8846 }, { "epoch": 0.2765, "grad_norm": 4.0625, "grad_norm_var": 0.0489654541015625, "learning_rate": 0.0001, "loss": 6.1042, "loss/crossentropy": 2.595457911491394, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1918850839138031, "step": 8848 }, { "epoch": 0.2765625, "grad_norm": 3.125, "grad_norm_var": 0.05380452473958333, "learning_rate": 0.0001, "loss": 5.7562, "loss/crossentropy": 2.459462285041809, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1761576235294342, "step": 8850 }, { "epoch": 0.276625, "grad_norm": 3.25, "grad_norm_var": 0.055150349934895836, "learning_rate": 0.0001, "loss": 5.8443, "loss/crossentropy": 2.4820131063461304, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1799747496843338, "step": 8852 }, { "epoch": 0.2766875, "grad_norm": 3.1875, "grad_norm_var": 0.05754292805989583, "learning_rate": 0.0001, "loss": 5.8885, "loss/crossentropy": 2.467939019203186, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18658292293548584, "step": 8854 }, { "epoch": 0.27675, "grad_norm": 3.390625, "grad_norm_var": 0.04973551432291667, "learning_rate": 0.0001, "loss": 6.0669, "loss/crossentropy": 2.6163350343704224, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18998120725154877, "step": 8856 }, { "epoch": 0.2768125, "grad_norm": 3.140625, "grad_norm_var": 0.0527984619140625, "learning_rate": 0.0001, "loss": 5.7258, "loss/crossentropy": 2.452614665031433, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17732173204421997, "step": 8858 }, { "epoch": 0.276875, "grad_norm": 4.5, "grad_norm_var": 0.13489583333333333, "learning_rate": 0.0001, "loss": 6.393, "loss/crossentropy": 2.8383991718292236, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19726187735795975, "step": 8860 }, { "epoch": 0.2769375, "grad_norm": 3.6875, "grad_norm_var": 0.13736063639322918, "learning_rate": 0.0001, "loss": 6.1063, "loss/crossentropy": 2.6258177757263184, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19257892668247223, "step": 8862 }, { "epoch": 0.277, "grad_norm": 3.484375, "grad_norm_var": 0.11110026041666667, "learning_rate": 0.0001, "loss": 5.8231, "loss/crossentropy": 2.3822438716888428, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18197131156921387, "step": 8864 }, { "epoch": 0.2770625, "grad_norm": 3.390625, "grad_norm_var": 0.10546875, "learning_rate": 0.0001, "loss": 5.7566, "loss/crossentropy": 2.384195327758789, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18294084072113037, "step": 8866 }, { "epoch": 0.277125, "grad_norm": 3.21875, "grad_norm_var": 0.10819905598958333, "learning_rate": 0.0001, "loss": 5.9445, "loss/crossentropy": 2.611143112182617, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17903496325016022, "step": 8868 }, { "epoch": 0.2771875, "grad_norm": 3.375, "grad_norm_var": 0.1090972900390625, "learning_rate": 0.0001, "loss": 6.1364, "loss/crossentropy": 2.7580084800720215, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18236715346574783, "step": 8870 }, { "epoch": 0.27725, "grad_norm": 3.390625, "grad_norm_var": 0.10901285807291666, "learning_rate": 0.0001, "loss": 5.88, "loss/crossentropy": 2.5484120845794678, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18042747676372528, "step": 8872 }, { "epoch": 0.2773125, "grad_norm": 4.375, "grad_norm_var": 0.16637369791666667, "learning_rate": 0.0001, "loss": 6.0186, "loss/crossentropy": 2.544276237487793, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19157471507787704, "step": 8874 }, { "epoch": 0.277375, "grad_norm": 3.46875, "grad_norm_var": 0.09169514973958333, "learning_rate": 0.0001, "loss": 5.927, "loss/crossentropy": 2.526648759841919, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18495458364486694, "step": 8876 }, { "epoch": 0.2774375, "grad_norm": 3.234375, "grad_norm_var": 0.09654541015625, "learning_rate": 0.0001, "loss": 5.9058, "loss/crossentropy": 2.490079164505005, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18298199772834778, "step": 8878 }, { "epoch": 0.2775, "grad_norm": 3.140625, "grad_norm_var": 0.0988189697265625, "learning_rate": 0.0001, "loss": 5.924, "loss/crossentropy": 2.631035327911377, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17694874107837677, "step": 8880 }, { "epoch": 0.2775625, "grad_norm": 2.984375, "grad_norm_var": 0.14885660807291667, "learning_rate": 0.0001, "loss": 5.9504, "loss/crossentropy": 2.489716410636902, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.18356838822364807, "step": 8882 }, { "epoch": 0.277625, "grad_norm": 3.171875, "grad_norm_var": 0.15318603515625, "learning_rate": 0.0001, "loss": 6.0393, "loss/crossentropy": 2.6960405111312866, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17495335638523102, "step": 8884 }, { "epoch": 0.2776875, "grad_norm": 3.0, "grad_norm_var": 0.15896809895833333, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.581865906715393, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17394591867923737, "step": 8886 }, { "epoch": 0.27775, "grad_norm": 3.546875, "grad_norm_var": 0.16142476399739583, "learning_rate": 0.0001, "loss": 6.2924, "loss/crossentropy": 2.6841617822647095, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.20144400000572205, "step": 8888 }, { "epoch": 0.2778125, "grad_norm": 3.125, "grad_norm_var": 0.09556884765625, "learning_rate": 0.0001, "loss": 5.8561, "loss/crossentropy": 2.5051841735839844, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17806024849414825, "step": 8890 }, { "epoch": 0.277875, "grad_norm": 4.03125, "grad_norm_var": 0.1261383056640625, "learning_rate": 0.0001, "loss": 6.0474, "loss/crossentropy": 2.5743348598480225, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19379350543022156, "step": 8892 }, { "epoch": 0.2779375, "grad_norm": 3.15625, "grad_norm_var": 0.12121988932291666, "learning_rate": 0.0001, "loss": 6.1704, "loss/crossentropy": 2.7337318658828735, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18507727980613708, "step": 8894 }, { "epoch": 0.278, "grad_norm": 3.109375, "grad_norm_var": 0.1221343994140625, "learning_rate": 0.0001, "loss": 5.8408, "loss/crossentropy": 2.5032352209091187, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17828834056854248, "step": 8896 }, { "epoch": 0.2780625, "grad_norm": 3.21875, "grad_norm_var": 0.0725006103515625, "learning_rate": 0.0001, "loss": 5.7559, "loss/crossentropy": 2.4360796213150024, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1765114590525627, "step": 8898 }, { "epoch": 0.278125, "grad_norm": 3.765625, "grad_norm_var": 0.08477274576822917, "learning_rate": 0.0001, "loss": 6.2151, "loss/crossentropy": 2.651309370994568, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19661091268062592, "step": 8900 }, { "epoch": 0.2781875, "grad_norm": 3.453125, "grad_norm_var": 0.06966044108072916, "learning_rate": 0.0001, "loss": 5.8337, "loss/crossentropy": 2.427757143974304, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18590442836284637, "step": 8902 }, { "epoch": 0.27825, "grad_norm": 3.890625, "grad_norm_var": 0.08192952473958333, "learning_rate": 0.0001, "loss": 5.9187, "loss/crossentropy": 2.3886170387268066, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19714532792568207, "step": 8904 }, { "epoch": 0.2783125, "grad_norm": 3.625, "grad_norm_var": 0.06999409993489583, "learning_rate": 0.0001, "loss": 5.7418, "loss/crossentropy": 2.3733723163604736, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18254968523979187, "step": 8906 }, { "epoch": 0.278375, "grad_norm": 3.359375, "grad_norm_var": 0.0604400634765625, "learning_rate": 0.0001, "loss": 5.7892, "loss/crossentropy": 2.49438738822937, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1767505556344986, "step": 8908 }, { "epoch": 0.2784375, "grad_norm": 3.28125, "grad_norm_var": 0.061400349934895834, "learning_rate": 0.0001, "loss": 5.5431, "loss/crossentropy": 2.2899755239486694, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17140889167785645, "step": 8910 }, { "epoch": 0.2785, "grad_norm": 3.421875, "grad_norm_var": 0.05349019368489583, "learning_rate": 0.0001, "loss": 6.0516, "loss/crossentropy": 2.625260591506958, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18404033035039902, "step": 8912 }, { "epoch": 0.2785625, "grad_norm": 3.34375, "grad_norm_var": 0.051610310872395836, "learning_rate": 0.0001, "loss": 6.0681, "loss/crossentropy": 2.603209972381592, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19336361438035965, "step": 8914 }, { "epoch": 0.278625, "grad_norm": 4.125, "grad_norm_var": 0.069189453125, "learning_rate": 0.0001, "loss": 6.155, "loss/crossentropy": 2.629045248031616, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19204367697238922, "step": 8916 }, { "epoch": 0.2786875, "grad_norm": 3.078125, "grad_norm_var": 0.08277587890625, "learning_rate": 0.0001, "loss": 5.9272, "loss/crossentropy": 2.633598208427429, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17935702204704285, "step": 8918 }, { "epoch": 0.27875, "grad_norm": 3.734375, "grad_norm_var": 0.0746246337890625, "learning_rate": 0.0001, "loss": 5.9349, "loss/crossentropy": 2.489575743675232, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18788864463567734, "step": 8920 }, { "epoch": 0.2788125, "grad_norm": 4.59375, "grad_norm_var": 0.16741434733072916, "learning_rate": 0.0001, "loss": 5.6964, "loss/crossentropy": 2.37763512134552, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1783628612756729, "step": 8922 }, { "epoch": 0.278875, "grad_norm": 3.203125, "grad_norm_var": 0.16574605305989584, "learning_rate": 0.0001, "loss": 6.0553, "loss/crossentropy": 2.676714062690735, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18395595252513885, "step": 8924 }, { "epoch": 0.2789375, "grad_norm": 3.796875, "grad_norm_var": 0.16239827473958332, "learning_rate": 0.0001, "loss": 5.8915, "loss/crossentropy": 2.429540514945984, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18604381382465363, "step": 8926 }, { "epoch": 0.279, "grad_norm": 3.359375, "grad_norm_var": 0.16198628743489582, "learning_rate": 0.0001, "loss": 5.7803, "loss/crossentropy": 2.4657455682754517, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1752079725265503, "step": 8928 }, { "epoch": 0.2790625, "grad_norm": 3.5, "grad_norm_var": 0.16420796712239583, "learning_rate": 0.0001, "loss": 6.0748, "loss/crossentropy": 2.6903953552246094, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1825784221291542, "step": 8930 }, { "epoch": 0.279125, "grad_norm": 3.53125, "grad_norm_var": 0.13433837890625, "learning_rate": 0.0001, "loss": 6.1758, "loss/crossentropy": 2.631464958190918, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19739867746829987, "step": 8932 }, { "epoch": 0.2791875, "grad_norm": 3.171875, "grad_norm_var": 0.12526041666666668, "learning_rate": 0.0001, "loss": 6.0468, "loss/crossentropy": 2.5245360136032104, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1936342790722847, "step": 8934 }, { "epoch": 0.27925, "grad_norm": 3.078125, "grad_norm_var": 0.1358795166015625, "learning_rate": 0.0001, "loss": 5.7535, "loss/crossentropy": 2.496535301208496, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17335310578346252, "step": 8936 }, { "epoch": 0.2793125, "grad_norm": 3.1875, "grad_norm_var": 0.05370686848958333, "learning_rate": 0.0001, "loss": 5.8803, "loss/crossentropy": 2.5169789791107178, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18320399522781372, "step": 8938 }, { "epoch": 0.279375, "grad_norm": 3.484375, "grad_norm_var": 0.05538736979166667, "learning_rate": 0.0001, "loss": 5.5435, "loss/crossentropy": 2.28670871257782, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1678697019815445, "step": 8940 }, { "epoch": 0.2794375, "grad_norm": 3.578125, "grad_norm_var": 0.0518707275390625, "learning_rate": 0.0001, "loss": 5.9636, "loss/crossentropy": 2.5089820623397827, "loss/hidden": 1.6328125, "loss/jsd": 0.0, "loss/logits": 0.18218537420034409, "step": 8942 }, { "epoch": 0.2795, "grad_norm": 3.71875, "grad_norm_var": 0.05826416015625, "learning_rate": 0.0001, "loss": 6.0677, "loss/crossentropy": 2.6015191078186035, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1919344812631607, "step": 8944 }, { "epoch": 0.2795625, "grad_norm": 4.125, "grad_norm_var": 0.08690999348958334, "learning_rate": 0.0001, "loss": 6.2675, "loss/crossentropy": 2.6956677436828613, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19663256406784058, "step": 8946 }, { "epoch": 0.279625, "grad_norm": 3.328125, "grad_norm_var": 0.09688212076822916, "learning_rate": 0.0001, "loss": 5.8046, "loss/crossentropy": 2.519374966621399, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1773502230644226, "step": 8948 }, { "epoch": 0.2796875, "grad_norm": 3.671875, "grad_norm_var": 0.0864898681640625, "learning_rate": 0.0001, "loss": 6.0677, "loss/crossentropy": 2.6085387468338013, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18927831947803497, "step": 8950 }, { "epoch": 0.27975, "grad_norm": 3.234375, "grad_norm_var": 0.07963765462239583, "learning_rate": 0.0001, "loss": 5.9959, "loss/crossentropy": 2.5420197248458862, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18953076004981995, "step": 8952 }, { "epoch": 0.2798125, "grad_norm": 3.40625, "grad_norm_var": 0.07415262858072917, "learning_rate": 0.0001, "loss": 6.1113, "loss/crossentropy": 2.6709654331207275, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18778185546398163, "step": 8954 }, { "epoch": 0.279875, "grad_norm": 3.390625, "grad_norm_var": 0.07258707682291667, "learning_rate": 0.0001, "loss": 5.9649, "loss/crossentropy": 2.6230790615081787, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.179103784263134, "step": 8956 }, { "epoch": 0.2799375, "grad_norm": 3.328125, "grad_norm_var": 0.0714752197265625, "learning_rate": 0.0001, "loss": 5.973, "loss/crossentropy": 2.5649073123931885, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18572864681482315, "step": 8958 }, { "epoch": 0.28, "grad_norm": 3.53125, "grad_norm_var": 0.06720377604166666, "learning_rate": 0.0001, "loss": 5.8986, "loss/crossentropy": 2.5165066719055176, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18429826945066452, "step": 8960 }, { "epoch": 0.2800625, "grad_norm": 3.5, "grad_norm_var": 0.034501139322916666, "learning_rate": 0.0001, "loss": 6.0107, "loss/crossentropy": 2.5791239738464355, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1829991489648819, "step": 8962 }, { "epoch": 0.280125, "grad_norm": 3.390625, "grad_norm_var": 0.026688639322916666, "learning_rate": 0.0001, "loss": 6.2176, "loss/crossentropy": 2.7475950717926025, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1895764321088791, "step": 8964 }, { "epoch": 0.2801875, "grad_norm": 4.375, "grad_norm_var": 0.08095703125, "learning_rate": 0.0001, "loss": 5.7247, "loss/crossentropy": 2.4407023191452026, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1725417673587799, "step": 8966 }, { "epoch": 0.28025, "grad_norm": 3.46875, "grad_norm_var": 0.072265625, "learning_rate": 0.0001, "loss": 6.1318, "loss/crossentropy": 2.6310908794403076, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1903063729405403, "step": 8968 }, { "epoch": 0.2803125, "grad_norm": 3.109375, "grad_norm_var": 0.07995503743489583, "learning_rate": 0.0001, "loss": 5.8537, "loss/crossentropy": 2.465729832649231, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18567556142807007, "step": 8970 }, { "epoch": 0.280375, "grad_norm": 3.3125, "grad_norm_var": 0.07912495930989584, "learning_rate": 0.0001, "loss": 5.6157, "loss/crossentropy": 2.361966848373413, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17459086328744888, "step": 8972 }, { "epoch": 0.2804375, "grad_norm": 3.625, "grad_norm_var": 0.08082275390625, "learning_rate": 0.0001, "loss": 6.1712, "loss/crossentropy": 2.601422905921936, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.198383167386055, "step": 8974 }, { "epoch": 0.2805, "grad_norm": 3.84375, "grad_norm_var": 0.08948567708333334, "learning_rate": 0.0001, "loss": 6.1741, "loss/crossentropy": 2.692523241043091, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19307806342840195, "step": 8976 }, { "epoch": 0.2805625, "grad_norm": 3.46875, "grad_norm_var": 0.0899566650390625, "learning_rate": 0.0001, "loss": 5.9848, "loss/crossentropy": 2.4888601303100586, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19139544665813446, "step": 8978 }, { "epoch": 0.280625, "grad_norm": 3.546875, "grad_norm_var": 0.09114481608072916, "learning_rate": 0.0001, "loss": 6.094, "loss/crossentropy": 2.6560696363449097, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18871255218982697, "step": 8980 }, { "epoch": 0.2806875, "grad_norm": 3.4375, "grad_norm_var": 0.04195048014322917, "learning_rate": 0.0001, "loss": 6.1607, "loss/crossentropy": 2.6694579124450684, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19365518540143967, "step": 8982 }, { "epoch": 0.28075, "grad_norm": 3.375, "grad_norm_var": 0.03999735514322917, "learning_rate": 0.0001, "loss": 6.0601, "loss/crossentropy": 2.6100698709487915, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1926591694355011, "step": 8984 }, { "epoch": 0.2808125, "grad_norm": 3.703125, "grad_norm_var": 0.0390289306640625, "learning_rate": 0.0001, "loss": 5.95, "loss/crossentropy": 2.567933201789856, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1796116977930069, "step": 8986 }, { "epoch": 0.280875, "grad_norm": 3.328125, "grad_norm_var": 0.0328765869140625, "learning_rate": 0.0001, "loss": 6.0463, "loss/crossentropy": 2.65181303024292, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18242233991622925, "step": 8988 }, { "epoch": 0.2809375, "grad_norm": 3.09375, "grad_norm_var": 0.04934488932291667, "learning_rate": 0.0001, "loss": 5.4396, "loss/crossentropy": 2.2871644496917725, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16602488607168198, "step": 8990 }, { "epoch": 0.281, "grad_norm": 3.015625, "grad_norm_var": 0.04771728515625, "learning_rate": 0.0001, "loss": 6.077, "loss/crossentropy": 2.5936484336853027, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19403835386037827, "step": 8992 }, { "epoch": 0.2810625, "grad_norm": 3.578125, "grad_norm_var": 0.04986572265625, "learning_rate": 0.0001, "loss": 6.0082, "loss/crossentropy": 2.5718302726745605, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18933695554733276, "step": 8994 }, { "epoch": 0.281125, "grad_norm": 3.5, "grad_norm_var": 0.04856669108072917, "learning_rate": 0.0001, "loss": 6.1828, "loss/crossentropy": 2.682897448539734, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1925681158900261, "step": 8996 }, { "epoch": 0.2811875, "grad_norm": 3.671875, "grad_norm_var": 0.04409077962239583, "learning_rate": 0.0001, "loss": 5.9279, "loss/crossentropy": 2.5359108448028564, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18567855656147003, "step": 8998 }, { "epoch": 0.28125, "grad_norm": 3.234375, "grad_norm_var": 0.04519856770833333, "learning_rate": 0.0001, "loss": 6.0286, "loss/crossentropy": 2.624733328819275, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18140491843223572, "step": 9000 }, { "epoch": 0.2813125, "grad_norm": 3.28125, "grad_norm_var": 0.04487202962239583, "learning_rate": 0.0001, "loss": 6.2065, "loss/crossentropy": 2.7221689224243164, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19375034421682358, "step": 9002 }, { "epoch": 0.281375, "grad_norm": 3.5, "grad_norm_var": 0.0489898681640625, "learning_rate": 0.0001, "loss": 6.0372, "loss/crossentropy": 2.5447323322296143, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1953364461660385, "step": 9004 }, { "epoch": 0.2814375, "grad_norm": 5.0, "grad_norm_var": 0.19611002604166666, "learning_rate": 0.0001, "loss": 6.2313, "loss/crossentropy": 2.6949377059936523, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1993364840745926, "step": 9006 }, { "epoch": 0.2815, "grad_norm": 3.25, "grad_norm_var": 0.18479817708333332, "learning_rate": 0.0001, "loss": 6.228, "loss/crossentropy": 2.7894601821899414, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1883883774280548, "step": 9008 }, { "epoch": 0.2815625, "grad_norm": 3.625, "grad_norm_var": 0.18494466145833333, "learning_rate": 0.0001, "loss": 6.3061, "loss/crossentropy": 2.7123767137527466, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.20273320376873016, "step": 9010 }, { "epoch": 0.281625, "grad_norm": 3.09375, "grad_norm_var": 0.1956695556640625, "learning_rate": 0.0001, "loss": 5.9871, "loss/crossentropy": 2.632554531097412, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18233323842287064, "step": 9012 }, { "epoch": 0.2816875, "grad_norm": 3.171875, "grad_norm_var": 0.21155192057291666, "learning_rate": 0.0001, "loss": 5.5732, "loss/crossentropy": 2.3073713779449463, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17033526301383972, "step": 9014 }, { "epoch": 0.28175, "grad_norm": 3.984375, "grad_norm_var": 0.2232574462890625, "learning_rate": 0.0001, "loss": 6.193, "loss/crossentropy": 2.6605632305145264, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19542711228132248, "step": 9016 }, { "epoch": 0.2818125, "grad_norm": 3.40625, "grad_norm_var": 0.22009989420572917, "learning_rate": 0.0001, "loss": 5.9053, "loss/crossentropy": 2.5774831771850586, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18083104491233826, "step": 9018 }, { "epoch": 0.281875, "grad_norm": 3.421875, "grad_norm_var": 0.22069905598958334, "learning_rate": 0.0001, "loss": 6.3092, "loss/crossentropy": 2.8267698287963867, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1927720308303833, "step": 9020 }, { "epoch": 0.2819375, "grad_norm": 3.28125, "grad_norm_var": 0.06379292805989584, "learning_rate": 0.0001, "loss": 6.0883, "loss/crossentropy": 2.6888206005096436, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18642932176589966, "step": 9022 }, { "epoch": 0.282, "grad_norm": 3.109375, "grad_norm_var": 0.06591389973958334, "learning_rate": 0.0001, "loss": 5.9574, "loss/crossentropy": 2.5752652883529663, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18469508737325668, "step": 9024 }, { "epoch": 0.2820625, "grad_norm": 3.40625, "grad_norm_var": 0.06319986979166667, "learning_rate": 0.0001, "loss": 6.0361, "loss/crossentropy": 2.5644443035125732, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18895837664604187, "step": 9026 }, { "epoch": 0.282125, "grad_norm": 3.5, "grad_norm_var": 0.061986287434895836, "learning_rate": 0.0001, "loss": 6.1833, "loss/crossentropy": 2.6032562255859375, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20019075274467468, "step": 9028 }, { "epoch": 0.2821875, "grad_norm": 3.203125, "grad_norm_var": 0.046875, "learning_rate": 0.0001, "loss": 5.8594, "loss/crossentropy": 2.4787466526031494, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18571703881025314, "step": 9030 }, { "epoch": 0.28225, "grad_norm": 3.171875, "grad_norm_var": 0.024909464518229167, "learning_rate": 0.0001, "loss": 6.064, "loss/crossentropy": 2.6514443159103394, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1842246949672699, "step": 9032 }, { "epoch": 0.2823125, "grad_norm": 3.296875, "grad_norm_var": 0.024803670247395833, "learning_rate": 0.0001, "loss": 5.7463, "loss/crossentropy": 2.3479292392730713, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18398087471723557, "step": 9034 }, { "epoch": 0.282375, "grad_norm": 3.046875, "grad_norm_var": 0.024388631184895832, "learning_rate": 0.0001, "loss": 5.8337, "loss/crossentropy": 2.4997901916503906, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18104806542396545, "step": 9036 }, { "epoch": 0.2824375, "grad_norm": 2.984375, "grad_norm_var": 0.030485026041666665, "learning_rate": 0.0001, "loss": 5.6793, "loss/crossentropy": 2.4415680170059204, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17455822974443436, "step": 9038 }, { "epoch": 0.2825, "grad_norm": 3.15625, "grad_norm_var": 0.0307525634765625, "learning_rate": 0.0001, "loss": 6.0256, "loss/crossentropy": 2.7107661962509155, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17601224780082703, "step": 9040 }, { "epoch": 0.2825625, "grad_norm": 3.421875, "grad_norm_var": 0.030931599934895835, "learning_rate": 0.0001, "loss": 5.9807, "loss/crossentropy": 2.551282525062561, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18864601850509644, "step": 9042 }, { "epoch": 0.282625, "grad_norm": 3.203125, "grad_norm_var": 0.030248006184895832, "learning_rate": 0.0001, "loss": 5.8746, "loss/crossentropy": 2.461247444152832, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1889927163720131, "step": 9044 }, { "epoch": 0.2826875, "grad_norm": 3.65625, "grad_norm_var": 0.04267171223958333, "learning_rate": 0.0001, "loss": 6.2772, "loss/crossentropy": 2.6784560680389404, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.20088715851306915, "step": 9046 }, { "epoch": 0.28275, "grad_norm": 3.328125, "grad_norm_var": 0.041552734375, "learning_rate": 0.0001, "loss": 5.7114, "loss/crossentropy": 2.465242028236389, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17305027693510056, "step": 9048 }, { "epoch": 0.2828125, "grad_norm": 3.40625, "grad_norm_var": 0.04221903483072917, "learning_rate": 0.0001, "loss": 5.8207, "loss/crossentropy": 2.4869046211242676, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18103093653917313, "step": 9050 }, { "epoch": 0.282875, "grad_norm": 3.515625, "grad_norm_var": 0.040192667643229166, "learning_rate": 0.0001, "loss": 5.8688, "loss/crossentropy": 2.471032738685608, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18821820616722107, "step": 9052 }, { "epoch": 0.2829375, "grad_norm": 3.21875, "grad_norm_var": 0.06074930826822917, "learning_rate": 0.0001, "loss": 5.9442, "loss/crossentropy": 2.5340031385421753, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18828360736370087, "step": 9054 }, { "epoch": 0.283, "grad_norm": 3.25, "grad_norm_var": 0.05743815104166667, "learning_rate": 0.0001, "loss": 5.7985, "loss/crossentropy": 2.535214900970459, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17632829397916794, "step": 9056 }, { "epoch": 0.2830625, "grad_norm": 4.0625, "grad_norm_var": 0.08642171223958334, "learning_rate": 0.0001, "loss": 5.8255, "loss/crossentropy": 2.485495924949646, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17735742032527924, "step": 9058 }, { "epoch": 0.283125, "grad_norm": 3.21875, "grad_norm_var": 0.1017242431640625, "learning_rate": 0.0001, "loss": 6.0565, "loss/crossentropy": 2.626773238182068, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18710918724536896, "step": 9060 }, { "epoch": 0.2831875, "grad_norm": 3.40625, "grad_norm_var": 0.09986572265625, "learning_rate": 0.0001, "loss": 5.8211, "loss/crossentropy": 2.5458565950393677, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1759604886174202, "step": 9062 }, { "epoch": 0.28325, "grad_norm": 3.484375, "grad_norm_var": 0.09824930826822917, "learning_rate": 0.0001, "loss": 5.7519, "loss/crossentropy": 2.442685604095459, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17466949671506882, "step": 9064 }, { "epoch": 0.2833125, "grad_norm": 3.375, "grad_norm_var": 0.10879618326822917, "learning_rate": 0.0001, "loss": 6.2599, "loss/crossentropy": 2.6942391395568848, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19718880206346512, "step": 9066 }, { "epoch": 0.283375, "grad_norm": 3.359375, "grad_norm_var": 0.13450113932291666, "learning_rate": 0.0001, "loss": 5.7228, "loss/crossentropy": 2.4559353590011597, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17434197664260864, "step": 9068 }, { "epoch": 0.2834375, "grad_norm": 3.703125, "grad_norm_var": 0.16128641764322918, "learning_rate": 0.0001, "loss": 6.3024, "loss/crossentropy": 2.7098337411880493, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19870896637439728, "step": 9070 }, { "epoch": 0.2835, "grad_norm": 3.5, "grad_norm_var": 0.1585601806640625, "learning_rate": 0.0001, "loss": 5.9719, "loss/crossentropy": 2.5387319326400757, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1882343515753746, "step": 9072 }, { "epoch": 0.2835625, "grad_norm": 3.28125, "grad_norm_var": 0.1357818603515625, "learning_rate": 0.0001, "loss": 5.8944, "loss/crossentropy": 2.5297415256500244, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18216697871685028, "step": 9074 }, { "epoch": 0.283625, "grad_norm": 3.171875, "grad_norm_var": 0.11490478515625, "learning_rate": 0.0001, "loss": 5.7764, "loss/crossentropy": 2.4699500799179077, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.177521213889122, "step": 9076 }, { "epoch": 0.2836875, "grad_norm": 3.53125, "grad_norm_var": 0.1197265625, "learning_rate": 0.0001, "loss": 5.6463, "loss/crossentropy": 2.3882386684417725, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1707303747534752, "step": 9078 }, { "epoch": 0.28375, "grad_norm": 4.125, "grad_norm_var": 0.167724609375, "learning_rate": 0.0001, "loss": 6.1195, "loss/crossentropy": 2.611040472984314, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19068588316440582, "step": 9080 }, { "epoch": 0.2838125, "grad_norm": 3.40625, "grad_norm_var": 0.16392822265625, "learning_rate": 0.0001, "loss": 5.9801, "loss/crossentropy": 2.6328080892562866, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1812119111418724, "step": 9082 }, { "epoch": 0.283875, "grad_norm": 3.265625, "grad_norm_var": 0.14704488118489584, "learning_rate": 0.0001, "loss": 6.4703, "loss/crossentropy": 2.8104302883148193, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.20817676931619644, "step": 9084 }, { "epoch": 0.2839375, "grad_norm": 3.5, "grad_norm_var": 0.09387105305989583, "learning_rate": 0.0001, "loss": 6.104, "loss/crossentropy": 2.558880567550659, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19631367921829224, "step": 9086 }, { "epoch": 0.284, "grad_norm": 3.546875, "grad_norm_var": 0.09000651041666667, "learning_rate": 0.0001, "loss": 6.2526, "loss/crossentropy": 2.8148038387298584, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18752646446228027, "step": 9088 }, { "epoch": 0.2840625, "grad_norm": 3.546875, "grad_norm_var": 0.08503316243489584, "learning_rate": 0.0001, "loss": 5.5647, "loss/crossentropy": 2.253652572631836, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17329681664705276, "step": 9090 }, { "epoch": 0.284125, "grad_norm": 4.15625, "grad_norm_var": 0.11277669270833333, "learning_rate": 0.0001, "loss": 6.383, "loss/crossentropy": 2.6728352308273315, "loss/hidden": 1.671875, "loss/jsd": 0.0, "loss/logits": 0.20383010804653168, "step": 9092 }, { "epoch": 0.2841875, "grad_norm": 3.59375, "grad_norm_var": 0.1004302978515625, "learning_rate": 0.0001, "loss": 5.9191, "loss/crossentropy": 2.4468226432800293, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19253655523061752, "step": 9094 }, { "epoch": 0.28425, "grad_norm": 3.53125, "grad_norm_var": 0.06933186848958334, "learning_rate": 0.0001, "loss": 6.0211, "loss/crossentropy": 2.636566638946533, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18259137123823166, "step": 9096 }, { "epoch": 0.2843125, "grad_norm": 3.234375, "grad_norm_var": 0.06502176920572916, "learning_rate": 0.0001, "loss": 5.822, "loss/crossentropy": 2.5147953033447266, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18071867525577545, "step": 9098 }, { "epoch": 0.284375, "grad_norm": 3.484375, "grad_norm_var": 0.05845438639322917, "learning_rate": 0.0001, "loss": 6.1502, "loss/crossentropy": 2.6123939752578735, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19557856768369675, "step": 9100 }, { "epoch": 0.2844375, "grad_norm": 3.640625, "grad_norm_var": 0.0567291259765625, "learning_rate": 0.0001, "loss": 6.3179, "loss/crossentropy": 2.7712206840515137, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19568248838186264, "step": 9102 }, { "epoch": 0.2845, "grad_norm": 3.15625, "grad_norm_var": 0.0625152587890625, "learning_rate": 0.0001, "loss": 5.7568, "loss/crossentropy": 2.3733925819396973, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18052639067173004, "step": 9104 }, { "epoch": 0.2845625, "grad_norm": 3.53125, "grad_norm_var": 0.08746337890625, "learning_rate": 0.0001, "loss": 6.2083, "loss/crossentropy": 2.762609839439392, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18948964029550552, "step": 9106 }, { "epoch": 0.284625, "grad_norm": 3.859375, "grad_norm_var": 0.06048177083333333, "learning_rate": 0.0001, "loss": 5.7611, "loss/crossentropy": 2.3383185863494873, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18094955384731293, "step": 9108 }, { "epoch": 0.2846875, "grad_norm": 3.28125, "grad_norm_var": 0.0689605712890625, "learning_rate": 0.0001, "loss": 5.8825, "loss/crossentropy": 2.542226552963257, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1801212877035141, "step": 9110 }, { "epoch": 0.28475, "grad_norm": 3.109375, "grad_norm_var": 0.07508036295572916, "learning_rate": 0.0001, "loss": 5.6944, "loss/crossentropy": 2.4106396436691284, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1779869645833969, "step": 9112 }, { "epoch": 0.2848125, "grad_norm": 3.796875, "grad_norm_var": 0.0863922119140625, "learning_rate": 0.0001, "loss": 6.0255, "loss/crossentropy": 2.6151647567749023, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18673612922430038, "step": 9114 }, { "epoch": 0.284875, "grad_norm": 3.3125, "grad_norm_var": 0.0794921875, "learning_rate": 0.0001, "loss": 5.9964, "loss/crossentropy": 2.5844658613204956, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1861146315932274, "step": 9116 }, { "epoch": 0.2849375, "grad_norm": 3.375, "grad_norm_var": 0.07571207682291667, "learning_rate": 0.0001, "loss": 5.947, "loss/crossentropy": 2.5303101539611816, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18971359729766846, "step": 9118 }, { "epoch": 0.285, "grad_norm": 3.359375, "grad_norm_var": 0.07317301432291666, "learning_rate": 0.0001, "loss": 6.0363, "loss/crossentropy": 2.655127763748169, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18343449383974075, "step": 9120 }, { "epoch": 0.2850625, "grad_norm": 3.796875, "grad_norm_var": 0.09767964680989584, "learning_rate": 0.0001, "loss": 6.018, "loss/crossentropy": 2.493555426597595, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1926787868142128, "step": 9122 }, { "epoch": 0.285125, "grad_norm": 3.546875, "grad_norm_var": 0.075439453125, "learning_rate": 0.0001, "loss": 6.0001, "loss/crossentropy": 2.5280349254608154, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1901702806353569, "step": 9124 }, { "epoch": 0.2851875, "grad_norm": 4.59375, "grad_norm_var": 0.16013895670572917, "learning_rate": 0.0001, "loss": 6.3609, "loss/crossentropy": 2.7163779735565186, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.2078113555908203, "step": 9126 }, { "epoch": 0.28525, "grad_norm": 3.59375, "grad_norm_var": 0.14339192708333334, "learning_rate": 0.0001, "loss": 5.9869, "loss/crossentropy": 2.551315426826477, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18730486929416656, "step": 9128 }, { "epoch": 0.2853125, "grad_norm": 3.375, "grad_norm_var": 0.13235270182291667, "learning_rate": 0.0001, "loss": 5.9906, "loss/crossentropy": 2.602681517601013, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1860525906085968, "step": 9130 }, { "epoch": 0.285375, "grad_norm": 4.0, "grad_norm_var": 0.134619140625, "learning_rate": 0.0001, "loss": 6.3725, "loss/crossentropy": 2.7989598512649536, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19719856977462769, "step": 9132 }, { "epoch": 0.2854375, "grad_norm": 3.984375, "grad_norm_var": 35.59182535807292, "learning_rate": 0.0001, "loss": 7.1315, "loss/crossentropy": 2.71242094039917, "loss/hidden": 1.64453125, "loss/jsd": 0.0, "loss/logits": 0.27745749801397324, "step": 9134 }, { "epoch": 0.2855, "grad_norm": 4.125, "grad_norm_var": 35.21113993326823, "learning_rate": 0.0001, "loss": 6.6899, "loss/crossentropy": 2.899548292160034, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.21496783941984177, "step": 9136 }, { "epoch": 0.2855625, "grad_norm": 3.40625, "grad_norm_var": 35.45763346354167, "learning_rate": 0.0001, "loss": 5.8573, "loss/crossentropy": 2.4658334255218506, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18640770018100739, "step": 9138 }, { "epoch": 0.285625, "grad_norm": 3.375, "grad_norm_var": 35.547672526041666, "learning_rate": 0.0001, "loss": 5.8011, "loss/crossentropy": 2.4256393909454346, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1820792406797409, "step": 9140 }, { "epoch": 0.2856875, "grad_norm": 3.359375, "grad_norm_var": 35.784830729166664, "learning_rate": 0.0001, "loss": 6.0857, "loss/crossentropy": 2.64946711063385, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1889318972826004, "step": 9142 }, { "epoch": 0.28575, "grad_norm": 3.296875, "grad_norm_var": 35.78233947753906, "learning_rate": 0.0001, "loss": 5.8765, "loss/crossentropy": 2.4886363744735718, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.185664564371109, "step": 9144 }, { "epoch": 0.2858125, "grad_norm": 3.234375, "grad_norm_var": 35.79916076660156, "learning_rate": 0.0001, "loss": 5.7125, "loss/crossentropy": 2.361438512802124, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18354769051074982, "step": 9146 }, { "epoch": 0.285875, "grad_norm": 3.703125, "grad_norm_var": 35.96471252441406, "learning_rate": 0.0001, "loss": 5.9112, "loss/crossentropy": 2.548967123031616, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1823192909359932, "step": 9148 }, { "epoch": 0.2859375, "grad_norm": 7.34375, "grad_norm_var": 1.03726806640625, "learning_rate": 0.0001, "loss": 6.0803, "loss/crossentropy": 2.5867608785629272, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19037093222141266, "step": 9150 }, { "epoch": 0.286, "grad_norm": 3.875, "grad_norm_var": 0.9989217122395834, "learning_rate": 0.0001, "loss": 6.0382, "loss/crossentropy": 2.55404531955719, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19059911370277405, "step": 9152 }, { "epoch": 0.2860625, "grad_norm": 3.140625, "grad_norm_var": 1.0123931884765625, "learning_rate": 0.0001, "loss": 5.9966, "loss/crossentropy": 2.560911774635315, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18809889256954193, "step": 9154 }, { "epoch": 0.286125, "grad_norm": 3.53125, "grad_norm_var": 1.009968058268229, "learning_rate": 0.0001, "loss": 5.5887, "loss/crossentropy": 2.3073805570602417, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1726599782705307, "step": 9156 }, { "epoch": 0.2861875, "grad_norm": 3.390625, "grad_norm_var": 1.0194488525390626, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.393547296524048, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18527594208717346, "step": 9158 }, { "epoch": 0.28625, "grad_norm": 3.640625, "grad_norm_var": 1.0142242431640625, "learning_rate": 0.0001, "loss": 6.0094, "loss/crossentropy": 2.545302629470825, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18820396065711975, "step": 9160 }, { "epoch": 0.2863125, "grad_norm": 4.125, "grad_norm_var": 1.0132161458333333, "learning_rate": 0.0001, "loss": 6.3975, "loss/crossentropy": 2.7841343879699707, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.20117808878421783, "step": 9162 }, { "epoch": 0.286375, "grad_norm": 3.515625, "grad_norm_var": 0.9988840738932292, "learning_rate": 0.0001, "loss": 6.0892, "loss/crossentropy": 2.603915810585022, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18954448401927948, "step": 9164 }, { "epoch": 0.2864375, "grad_norm": 3.484375, "grad_norm_var": 0.07822977701822917, "learning_rate": 0.0001, "loss": 6.1461, "loss/crossentropy": 2.6508954763412476, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19170917570590973, "step": 9166 }, { "epoch": 0.2865, "grad_norm": 3.421875, "grad_norm_var": 0.065185546875, "learning_rate": 0.0001, "loss": 5.8764, "loss/crossentropy": 2.5029672384262085, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18344075232744217, "step": 9168 }, { "epoch": 0.2865625, "grad_norm": 3.3125, "grad_norm_var": 0.06448160807291667, "learning_rate": 0.0001, "loss": 6.1846, "loss/crossentropy": 2.7582833766937256, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1879480704665184, "step": 9170 }, { "epoch": 0.286625, "grad_norm": 3.28125, "grad_norm_var": 0.06555582682291666, "learning_rate": 0.0001, "loss": 5.7316, "loss/crossentropy": 2.3871891498565674, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18209929764270782, "step": 9172 }, { "epoch": 0.2866875, "grad_norm": 3.546875, "grad_norm_var": 0.0596588134765625, "learning_rate": 0.0001, "loss": 6.0071, "loss/crossentropy": 2.5322988033294678, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19279064238071442, "step": 9174 }, { "epoch": 0.28675, "grad_norm": 3.328125, "grad_norm_var": 0.06018778483072917, "learning_rate": 0.0001, "loss": 6.0731, "loss/crossentropy": 2.6345635652542114, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18916398286819458, "step": 9176 }, { "epoch": 0.2868125, "grad_norm": 3.03125, "grad_norm_var": 0.0293853759765625, "learning_rate": 0.0001, "loss": 5.8183, "loss/crossentropy": 2.5042537450790405, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17906411737203598, "step": 9178 }, { "epoch": 0.286875, "grad_norm": 3.390625, "grad_norm_var": 0.025397745768229167, "learning_rate": 0.0001, "loss": 5.807, "loss/crossentropy": 2.5237934589385986, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17675773054361343, "step": 9180 }, { "epoch": 0.2869375, "grad_norm": 4.9375, "grad_norm_var": 0.17666727701822918, "learning_rate": 0.0001, "loss": 5.7583, "loss/crossentropy": 2.3460735082626343, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18458520621061325, "step": 9182 }, { "epoch": 0.287, "grad_norm": 3.296875, "grad_norm_var": 0.18108317057291667, "learning_rate": 0.0001, "loss": 6.0849, "loss/crossentropy": 2.655609369277954, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1851135641336441, "step": 9184 }, { "epoch": 0.2870625, "grad_norm": 3.265625, "grad_norm_var": 0.17948811848958332, "learning_rate": 0.0001, "loss": 6.1004, "loss/crossentropy": 2.6337010860443115, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1904214471578598, "step": 9186 }, { "epoch": 0.287125, "grad_norm": 3.21875, "grad_norm_var": 0.18765869140625, "learning_rate": 0.0001, "loss": 5.7994, "loss/crossentropy": 2.459021210670471, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18091781437397003, "step": 9188 }, { "epoch": 0.2871875, "grad_norm": 3.25, "grad_norm_var": 0.19498697916666666, "learning_rate": 0.0001, "loss": 6.0276, "loss/crossentropy": 2.621256709098816, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18634165823459625, "step": 9190 }, { "epoch": 0.28725, "grad_norm": 3.328125, "grad_norm_var": 0.19714253743489582, "learning_rate": 0.0001, "loss": 6.3783, "loss/crossentropy": 2.853867769241333, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1969754844903946, "step": 9192 }, { "epoch": 0.2873125, "grad_norm": 3.15625, "grad_norm_var": 0.1912994384765625, "learning_rate": 0.0001, "loss": 5.6215, "loss/crossentropy": 2.3387062549591064, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17164014279842377, "step": 9194 }, { "epoch": 0.287375, "grad_norm": 3.375, "grad_norm_var": 0.19104410807291666, "learning_rate": 0.0001, "loss": 6.1026, "loss/crossentropy": 2.6131699085235596, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19152169674634933, "step": 9196 }, { "epoch": 0.2874375, "grad_norm": 3.6875, "grad_norm_var": 0.044709269205729166, "learning_rate": 0.0001, "loss": 5.9232, "loss/crossentropy": 2.5079824924468994, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1868300512433052, "step": 9198 }, { "epoch": 0.2875, "grad_norm": 3.71875, "grad_norm_var": 0.048111979166666666, "learning_rate": 0.0001, "loss": 6.3274, "loss/crossentropy": 2.782606601715088, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19588365405797958, "step": 9200 }, { "epoch": 0.2875625, "grad_norm": 3.671875, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 6.1545, "loss/crossentropy": 2.6817389726638794, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18751223385334015, "step": 9202 }, { "epoch": 0.287625, "grad_norm": 3.34375, "grad_norm_var": 0.03462626139322917, "learning_rate": 0.0001, "loss": 5.9146, "loss/crossentropy": 2.536959648132324, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18385609984397888, "step": 9204 }, { "epoch": 0.2876875, "grad_norm": 3.671875, "grad_norm_var": 0.031754557291666666, "learning_rate": 0.0001, "loss": 6.1421, "loss/crossentropy": 2.664365768432617, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1930903196334839, "step": 9206 }, { "epoch": 0.28775, "grad_norm": 3.40625, "grad_norm_var": 0.027253214518229166, "learning_rate": 0.0001, "loss": 5.6488, "loss/crossentropy": 2.3551712036132812, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17546115815639496, "step": 9208 }, { "epoch": 0.2878125, "grad_norm": 3.09375, "grad_norm_var": 0.03013916015625, "learning_rate": 0.0001, "loss": 5.8193, "loss/crossentropy": 2.4081833362579346, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18486540019512177, "step": 9210 }, { "epoch": 0.287875, "grad_norm": 3.609375, "grad_norm_var": 0.041666666666666664, "learning_rate": 0.0001, "loss": 5.6278, "loss/crossentropy": 2.3689277172088623, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1723714917898178, "step": 9212 }, { "epoch": 0.2879375, "grad_norm": 3.46875, "grad_norm_var": 0.046468098958333336, "learning_rate": 0.0001, "loss": 5.9088, "loss/crossentropy": 2.5543618202209473, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1830964982509613, "step": 9214 }, { "epoch": 0.288, "grad_norm": 3.453125, "grad_norm_var": 0.04487202962239583, "learning_rate": 0.0001, "loss": 6.1735, "loss/crossentropy": 2.5532714128494263, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20225465297698975, "step": 9216 }, { "epoch": 0.2880625, "grad_norm": 3.546875, "grad_norm_var": 0.05185139973958333, "learning_rate": 0.0001, "loss": 6.1338, "loss/crossentropy": 2.6418739557266235, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19567591696977615, "step": 9218 }, { "epoch": 0.288125, "grad_norm": 3.40625, "grad_norm_var": 0.0592437744140625, "learning_rate": 0.0001, "loss": 5.7424, "loss/crossentropy": 2.3867040872573853, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1824459806084633, "step": 9220 }, { "epoch": 0.2881875, "grad_norm": 3.703125, "grad_norm_var": 0.06051432291666667, "learning_rate": 0.0001, "loss": 6.0869, "loss/crossentropy": 2.5976165533065796, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1883801445364952, "step": 9222 }, { "epoch": 0.28825, "grad_norm": 3.546875, "grad_norm_var": 0.06036783854166667, "learning_rate": 0.0001, "loss": 6.3511, "loss/crossentropy": 2.787461757659912, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19698838144540787, "step": 9224 }, { "epoch": 0.2883125, "grad_norm": 3.21875, "grad_norm_var": 0.055419921875, "learning_rate": 0.0001, "loss": 5.9473, "loss/crossentropy": 2.564436197280884, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18554943799972534, "step": 9226 }, { "epoch": 0.288375, "grad_norm": 3.640625, "grad_norm_var": 0.055029296875, "learning_rate": 0.0001, "loss": 6.043, "loss/crossentropy": 2.5392333269119263, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19334959238767624, "step": 9228 }, { "epoch": 0.2884375, "grad_norm": 3.609375, "grad_norm_var": 0.04179585774739583, "learning_rate": 0.0001, "loss": 5.7817, "loss/crossentropy": 2.423188805580139, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18077274411916733, "step": 9230 }, { "epoch": 0.2885, "grad_norm": 3.15625, "grad_norm_var": 0.0576568603515625, "learning_rate": 0.0001, "loss": 5.9619, "loss/crossentropy": 2.6595394611358643, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17867840826511383, "step": 9232 }, { "epoch": 0.2885625, "grad_norm": 4.25, "grad_norm_var": 0.08697509765625, "learning_rate": 0.0001, "loss": 6.3437, "loss/crossentropy": 2.8047486543655396, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19607874006032944, "step": 9234 }, { "epoch": 0.288625, "grad_norm": 3.375, "grad_norm_var": 0.08189697265625, "learning_rate": 0.0001, "loss": 5.814, "loss/crossentropy": 2.499058723449707, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17759249359369278, "step": 9236 }, { "epoch": 0.2886875, "grad_norm": 4.09375, "grad_norm_var": 0.10056966145833333, "learning_rate": 0.0001, "loss": 6.0368, "loss/crossentropy": 2.552868962287903, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18627947568893433, "step": 9238 }, { "epoch": 0.28875, "grad_norm": 3.484375, "grad_norm_var": 0.10156962076822916, "learning_rate": 0.0001, "loss": 5.9558, "loss/crossentropy": 2.547224760055542, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1881256103515625, "step": 9240 }, { "epoch": 0.2888125, "grad_norm": 3.28125, "grad_norm_var": 0.09879557291666667, "learning_rate": 0.0001, "loss": 5.9994, "loss/crossentropy": 2.5107314586639404, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19340085983276367, "step": 9242 }, { "epoch": 0.288875, "grad_norm": 3.46875, "grad_norm_var": 0.09177958170572917, "learning_rate": 0.0001, "loss": 5.7584, "loss/crossentropy": 2.4198479652404785, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18385396897792816, "step": 9244 }, { "epoch": 0.2889375, "grad_norm": 3.03125, "grad_norm_var": 0.10918680826822917, "learning_rate": 0.0001, "loss": 5.8078, "loss/crossentropy": 2.5677201747894287, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17283860594034195, "step": 9246 }, { "epoch": 0.289, "grad_norm": 3.28125, "grad_norm_var": 0.10030924479166667, "learning_rate": 0.0001, "loss": 6.0421, "loss/crossentropy": 2.5909671783447266, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1880839392542839, "step": 9248 }, { "epoch": 0.2890625, "grad_norm": 3.671875, "grad_norm_var": 0.121337890625, "learning_rate": 0.0001, "loss": 6.1022, "loss/crossentropy": 2.5477112531661987, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19764020293951035, "step": 9250 }, { "epoch": 0.289125, "grad_norm": 3.28125, "grad_norm_var": 0.122705078125, "learning_rate": 0.0001, "loss": 5.8505, "loss/crossentropy": 2.532976508140564, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17784690856933594, "step": 9252 }, { "epoch": 0.2891875, "grad_norm": 3.625, "grad_norm_var": 0.105078125, "learning_rate": 0.0001, "loss": 6.1819, "loss/crossentropy": 2.5791308879852295, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20207322388887405, "step": 9254 }, { "epoch": 0.28925, "grad_norm": 3.515625, "grad_norm_var": 0.10579020182291667, "learning_rate": 0.0001, "loss": 5.9808, "loss/crossentropy": 2.5386255979537964, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18718945980072021, "step": 9256 }, { "epoch": 0.2893125, "grad_norm": 3.203125, "grad_norm_var": 0.11177978515625, "learning_rate": 0.0001, "loss": 5.9823, "loss/crossentropy": 2.648008704185486, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.179526224732399, "step": 9258 }, { "epoch": 0.289375, "grad_norm": 3.09375, "grad_norm_var": 0.11738179524739584, "learning_rate": 0.0001, "loss": 5.5612, "loss/crossentropy": 2.2775819301605225, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17680228501558304, "step": 9260 }, { "epoch": 0.2894375, "grad_norm": 3.21875, "grad_norm_var": 0.10325113932291667, "learning_rate": 0.0001, "loss": 5.8594, "loss/crossentropy": 2.468423366546631, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18128884583711624, "step": 9262 }, { "epoch": 0.2895, "grad_norm": 3.671875, "grad_norm_var": 0.1106109619140625, "learning_rate": 0.0001, "loss": 5.9303, "loss/crossentropy": 2.5634262561798096, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1831684410572052, "step": 9264 }, { "epoch": 0.2895625, "grad_norm": 3.28125, "grad_norm_var": 0.0459869384765625, "learning_rate": 0.0001, "loss": 5.953, "loss/crossentropy": 2.5027319192886353, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19189921766519547, "step": 9266 }, { "epoch": 0.289625, "grad_norm": 3.46875, "grad_norm_var": 0.04576416015625, "learning_rate": 0.0001, "loss": 5.9283, "loss/crossentropy": 2.5710513591766357, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17986471205949783, "step": 9268 }, { "epoch": 0.2896875, "grad_norm": 3.265625, "grad_norm_var": 0.03170166015625, "learning_rate": 0.0001, "loss": 5.9576, "loss/crossentropy": 2.578773617744446, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18319019675254822, "step": 9270 }, { "epoch": 0.28975, "grad_norm": 4.09375, "grad_norm_var": 0.06482645670572916, "learning_rate": 0.0001, "loss": 5.906, "loss/crossentropy": 2.3595499992370605, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19214613735675812, "step": 9272 }, { "epoch": 0.2898125, "grad_norm": 3.46875, "grad_norm_var": 0.05940653483072917, "learning_rate": 0.0001, "loss": 5.8633, "loss/crossentropy": 2.495652437210083, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17817071825265884, "step": 9274 }, { "epoch": 0.289875, "grad_norm": 3.53125, "grad_norm_var": 0.05322977701822917, "learning_rate": 0.0001, "loss": 5.7203, "loss/crossentropy": 2.4016173481941223, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17562279850244522, "step": 9276 }, { "epoch": 0.2899375, "grad_norm": 3.65625, "grad_norm_var": 0.05185139973958333, "learning_rate": 0.0001, "loss": 5.9694, "loss/crossentropy": 2.572734236717224, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18420225381851196, "step": 9278 }, { "epoch": 0.29, "grad_norm": 3.96875, "grad_norm_var": 0.06738993326822916, "learning_rate": 0.0001, "loss": 6.0559, "loss/crossentropy": 2.6163170337677, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19044649600982666, "step": 9280 }, { "epoch": 0.2900625, "grad_norm": 3.609375, "grad_norm_var": 0.06609700520833334, "learning_rate": 0.0001, "loss": 6.3712, "loss/crossentropy": 2.8518699407577515, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19099697470664978, "step": 9282 }, { "epoch": 0.290125, "grad_norm": 3.46875, "grad_norm_var": 0.06609700520833334, "learning_rate": 0.0001, "loss": 6.0108, "loss/crossentropy": 2.5253738164901733, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19151408970355988, "step": 9284 }, { "epoch": 0.2901875, "grad_norm": 3.671875, "grad_norm_var": 0.08382161458333333, "learning_rate": 0.0001, "loss": 6.1044, "loss/crossentropy": 2.62210476398468, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19041306525468826, "step": 9286 }, { "epoch": 0.29025, "grad_norm": 6.1875, "grad_norm_var": 0.5259928385416667, "learning_rate": 0.0001, "loss": 5.704, "loss/crossentropy": 2.3728272914886475, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17530959844589233, "step": 9288 }, { "epoch": 0.2903125, "grad_norm": 3.359375, "grad_norm_var": 0.5331044514973958, "learning_rate": 0.0001, "loss": 6.0333, "loss/crossentropy": 2.5850698947906494, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1870090439915657, "step": 9290 }, { "epoch": 0.290375, "grad_norm": 3.21875, "grad_norm_var": 0.5549763997395833, "learning_rate": 0.0001, "loss": 5.9617, "loss/crossentropy": 2.6056989431381226, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1824745461344719, "step": 9292 }, { "epoch": 0.2904375, "grad_norm": 3.578125, "grad_norm_var": 0.5628214518229167, "learning_rate": 0.0001, "loss": 6.0058, "loss/crossentropy": 2.5911813974380493, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18911613523960114, "step": 9294 }, { "epoch": 0.2905, "grad_norm": 4.5625, "grad_norm_var": 0.60439453125, "learning_rate": 0.0001, "loss": 6.025, "loss/crossentropy": 2.558809995651245, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1919320896267891, "step": 9296 }, { "epoch": 0.2905625, "grad_norm": 3.765625, "grad_norm_var": 0.6037923177083333, "learning_rate": 0.0001, "loss": 6.0465, "loss/crossentropy": 2.5537161827087402, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19029832631349564, "step": 9298 }, { "epoch": 0.290625, "grad_norm": 3.015625, "grad_norm_var": 0.6330800374348958, "learning_rate": 0.0001, "loss": 5.8275, "loss/crossentropy": 2.5661016702651978, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1761380210518837, "step": 9300 }, { "epoch": 0.2906875, "grad_norm": 6.5625, "grad_norm_var": 1.161327107747396, "learning_rate": 0.0001, "loss": 6.7729, "loss/crossentropy": 2.9322030544281006, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.22586390376091003, "step": 9302 }, { "epoch": 0.29075, "grad_norm": 3.46875, "grad_norm_var": 0.7327870686848958, "learning_rate": 0.0001, "loss": 6.2271, "loss/crossentropy": 2.728728771209717, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1916360929608345, "step": 9304 }, { "epoch": 0.2908125, "grad_norm": 4.3125, "grad_norm_var": 0.74677734375, "learning_rate": 0.0001, "loss": 6.2302, "loss/crossentropy": 2.59408700466156, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.20267153531312943, "step": 9306 }, { "epoch": 0.290875, "grad_norm": 3.390625, "grad_norm_var": 0.7129547119140625, "learning_rate": 0.0001, "loss": 5.8737, "loss/crossentropy": 2.5048424005508423, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18337111920118332, "step": 9308 }, { "epoch": 0.2909375, "grad_norm": 3.625, "grad_norm_var": 0.701904296875, "learning_rate": 0.0001, "loss": 6.0379, "loss/crossentropy": 2.601284146308899, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1866263747215271, "step": 9310 }, { "epoch": 0.291, "grad_norm": 3.203125, "grad_norm_var": 0.6597615559895833, "learning_rate": 0.0001, "loss": 6.0365, "loss/crossentropy": 2.6188138723373413, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18629661947488785, "step": 9312 }, { "epoch": 0.2910625, "grad_norm": 7.46875, "grad_norm_var": 1.5605753580729167, "learning_rate": 0.0001, "loss": 6.462, "loss/crossentropy": 2.6669247150421143, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.21700634062290192, "step": 9314 }, { "epoch": 0.291125, "grad_norm": 3.65625, "grad_norm_var": 1.5083160400390625, "learning_rate": 0.0001, "loss": 5.9701, "loss/crossentropy": 2.56630277633667, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18685922026634216, "step": 9316 }, { "epoch": 0.2911875, "grad_norm": 3.265625, "grad_norm_var": 1.0509592692057292, "learning_rate": 0.0001, "loss": 5.8683, "loss/crossentropy": 2.4687705039978027, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1833105981349945, "step": 9318 }, { "epoch": 0.29125, "grad_norm": 3.6875, "grad_norm_var": 1.0473917643229167, "learning_rate": 0.0001, "loss": 6.094, "loss/crossentropy": 2.5667370557785034, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19530849903821945, "step": 9320 }, { "epoch": 0.2913125, "grad_norm": 3.6875, "grad_norm_var": 1.0302154541015625, "learning_rate": 0.0001, "loss": 5.9883, "loss/crossentropy": 2.5635939836502075, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18778348714113235, "step": 9322 }, { "epoch": 0.291375, "grad_norm": 3.875, "grad_norm_var": 1.03828125, "learning_rate": 0.0001, "loss": 6.1287, "loss/crossentropy": 2.6045225858688354, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.19108834862709045, "step": 9324 }, { "epoch": 0.2914375, "grad_norm": 3.328125, "grad_norm_var": 1.0485514322916667, "learning_rate": 0.0001, "loss": 5.77, "loss/crossentropy": 2.5001951456069946, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17659078538417816, "step": 9326 }, { "epoch": 0.2915, "grad_norm": 3.65625, "grad_norm_var": 1.0400217692057292, "learning_rate": 0.0001, "loss": 5.6163, "loss/crossentropy": 2.339695930480957, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17688129842281342, "step": 9328 }, { "epoch": 0.2915625, "grad_norm": 3.015625, "grad_norm_var": 0.05122782389322917, "learning_rate": 0.0001, "loss": 5.4758, "loss/crossentropy": 2.3356382846832275, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1620657593011856, "step": 9330 }, { "epoch": 0.291625, "grad_norm": 7.5, "grad_norm_var": 1.0894694010416666, "learning_rate": 0.0001, "loss": 6.4588, "loss/crossentropy": 2.841578722000122, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20351407676935196, "step": 9332 }, { "epoch": 0.2916875, "grad_norm": 3.734375, "grad_norm_var": 1.0865397135416666, "learning_rate": 0.0001, "loss": 6.1103, "loss/crossentropy": 2.682422637939453, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.190447598695755, "step": 9334 }, { "epoch": 0.29175, "grad_norm": 3.1875, "grad_norm_var": 1.1200846354166667, "learning_rate": 0.0001, "loss": 5.6275, "loss/crossentropy": 2.3950765132904053, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1720711588859558, "step": 9336 }, { "epoch": 0.2918125, "grad_norm": 3.203125, "grad_norm_var": 1.1268229166666666, "learning_rate": 0.0001, "loss": 5.6068, "loss/crossentropy": 2.337676167488098, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17222560197114944, "step": 9338 }, { "epoch": 0.291875, "grad_norm": 3.34375, "grad_norm_var": 1.1230428059895834, "learning_rate": 0.0001, "loss": 6.1973, "loss/crossentropy": 2.795773148536682, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18624678999185562, "step": 9340 }, { "epoch": 0.2919375, "grad_norm": 2.859375, "grad_norm_var": 1.1498769124348958, "learning_rate": 0.0001, "loss": 5.594, "loss/crossentropy": 2.4307150840759277, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1643746793270111, "step": 9342 }, { "epoch": 0.292, "grad_norm": 3.484375, "grad_norm_var": 2.4121419270833333, "learning_rate": 0.0001, "loss": 6.8335, "loss/crossentropy": 2.7818437814712524, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.24501432478427887, "step": 9344 }, { "epoch": 0.2920625, "grad_norm": 3.28125, "grad_norm_var": 2.3807281494140624, "learning_rate": 0.0001, "loss": 5.7506, "loss/crossentropy": 2.4875333309173584, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17786619812250137, "step": 9346 }, { "epoch": 0.292125, "grad_norm": 3.359375, "grad_norm_var": 1.44351806640625, "learning_rate": 0.0001, "loss": 5.8438, "loss/crossentropy": 2.517126202583313, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18189110606908798, "step": 9348 }, { "epoch": 0.2921875, "grad_norm": 3.265625, "grad_norm_var": 1.450202433268229, "learning_rate": 0.0001, "loss": 5.8565, "loss/crossentropy": 2.475774884223938, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1790889948606491, "step": 9350 }, { "epoch": 0.29225, "grad_norm": 3.234375, "grad_norm_var": 1.4333892822265626, "learning_rate": 0.0001, "loss": 5.771, "loss/crossentropy": 2.3728095293045044, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18551874160766602, "step": 9352 }, { "epoch": 0.2923125, "grad_norm": 3.921875, "grad_norm_var": 1.4277628580729167, "learning_rate": 0.0001, "loss": 6.1051, "loss/crossentropy": 2.6263909339904785, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1924026682972908, "step": 9354 }, { "epoch": 0.292375, "grad_norm": 3.84375, "grad_norm_var": 1.4276692708333334, "learning_rate": 0.0001, "loss": 6.1152, "loss/crossentropy": 2.5191601514816284, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19905493408441544, "step": 9356 }, { "epoch": 0.2924375, "grad_norm": 3.265625, "grad_norm_var": 1.3986480712890625, "learning_rate": 0.0001, "loss": 5.7928, "loss/crossentropy": 2.454358458518982, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1815009042620659, "step": 9358 }, { "epoch": 0.2925, "grad_norm": 3.234375, "grad_norm_var": 0.05308837890625, "learning_rate": 0.0001, "loss": 5.6278, "loss/crossentropy": 2.4135148525238037, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16869229823350906, "step": 9360 }, { "epoch": 0.2925625, "grad_norm": 3.515625, "grad_norm_var": 0.05319010416666667, "learning_rate": 0.0001, "loss": 5.9325, "loss/crossentropy": 2.5693756341934204, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18279489129781723, "step": 9362 }, { "epoch": 0.292625, "grad_norm": 3.484375, "grad_norm_var": 0.07939351399739583, "learning_rate": 0.0001, "loss": 6.2678, "loss/crossentropy": 2.7371314764022827, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19564001262187958, "step": 9364 }, { "epoch": 0.2926875, "grad_norm": 3.1875, "grad_norm_var": 0.1599517822265625, "learning_rate": 0.0001, "loss": 6.1241, "loss/crossentropy": 2.58701753616333, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19745878875255585, "step": 9366 }, { "epoch": 0.29275, "grad_norm": 3.90625, "grad_norm_var": 0.1594390869140625, "learning_rate": 0.0001, "loss": 6.3299, "loss/crossentropy": 2.775208592414856, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19882483780384064, "step": 9368 }, { "epoch": 0.2928125, "grad_norm": 3.3125, "grad_norm_var": 0.15263570149739583, "learning_rate": 0.0001, "loss": 5.8915, "loss/crossentropy": 2.4790087938308716, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18656082451343536, "step": 9370 }, { "epoch": 0.292875, "grad_norm": 3.046875, "grad_norm_var": 0.16703999837239583, "learning_rate": 0.0001, "loss": 5.6431, "loss/crossentropy": 2.39720356464386, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1726379320025444, "step": 9372 }, { "epoch": 0.2929375, "grad_norm": 3.5, "grad_norm_var": 0.1646392822265625, "learning_rate": 0.0001, "loss": 6.112, "loss/crossentropy": 2.69320011138916, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18719422817230225, "step": 9374 }, { "epoch": 0.293, "grad_norm": 3.46875, "grad_norm_var": 0.166943359375, "learning_rate": 0.0001, "loss": 5.4966, "loss/crossentropy": 2.2177973985671997, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17085294425487518, "step": 9376 }, { "epoch": 0.2930625, "grad_norm": 3.28125, "grad_norm_var": 0.1681793212890625, "learning_rate": 0.0001, "loss": 6.1634, "loss/crossentropy": 2.6918656826019287, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18817061185836792, "step": 9378 }, { "epoch": 0.293125, "grad_norm": 3.546875, "grad_norm_var": 0.14599507649739582, "learning_rate": 0.0001, "loss": 6.1192, "loss/crossentropy": 2.6786845922470093, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18702397495508194, "step": 9380 }, { "epoch": 0.2931875, "grad_norm": 3.5, "grad_norm_var": 0.05211588541666667, "learning_rate": 0.0001, "loss": 5.9031, "loss/crossentropy": 2.4648101329803467, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1848449409008026, "step": 9382 }, { "epoch": 0.29325, "grad_norm": 3.390625, "grad_norm_var": 0.031672159830729164, "learning_rate": 0.0001, "loss": 5.9042, "loss/crossentropy": 2.516110062599182, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1833401843905449, "step": 9384 }, { "epoch": 0.2933125, "grad_norm": 3.75, "grad_norm_var": 0.06259358723958333, "learning_rate": 0.0001, "loss": 6.1422, "loss/crossentropy": 2.5483232736587524, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20118191838264465, "step": 9386 }, { "epoch": 0.293375, "grad_norm": 3.203125, "grad_norm_var": 0.053857421875, "learning_rate": 0.0001, "loss": 5.6446, "loss/crossentropy": 2.4846439361572266, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16365066170692444, "step": 9388 }, { "epoch": 0.2934375, "grad_norm": 3.34375, "grad_norm_var": 0.05217692057291667, "learning_rate": 0.0001, "loss": 5.8954, "loss/crossentropy": 2.557347536087036, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18145860731601715, "step": 9390 }, { "epoch": 0.2935, "grad_norm": 3.46875, "grad_norm_var": 0.04788411458333333, "learning_rate": 0.0001, "loss": 5.7785, "loss/crossentropy": 2.4624106884002686, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17770367860794067, "step": 9392 }, { "epoch": 0.2935625, "grad_norm": 3.265625, "grad_norm_var": 0.04924723307291667, "learning_rate": 0.0001, "loss": 6.0135, "loss/crossentropy": 2.5998064279556274, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18512319773435593, "step": 9394 }, { "epoch": 0.293625, "grad_norm": 3.265625, "grad_norm_var": 0.05273030598958333, "learning_rate": 0.0001, "loss": 6.0434, "loss/crossentropy": 2.7213690280914307, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17673882097005844, "step": 9396 }, { "epoch": 0.2936875, "grad_norm": 3.59375, "grad_norm_var": 0.055418904622395834, "learning_rate": 0.0001, "loss": 6.0422, "loss/crossentropy": 2.630928635597229, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1879998818039894, "step": 9398 }, { "epoch": 0.29375, "grad_norm": 3.140625, "grad_norm_var": 0.0619537353515625, "learning_rate": 0.0001, "loss": 5.794, "loss/crossentropy": 2.47506582736969, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17798515409231186, "step": 9400 }, { "epoch": 0.2938125, "grad_norm": 3.65625, "grad_norm_var": 0.02877197265625, "learning_rate": 0.0001, "loss": 5.7255, "loss/crossentropy": 2.420718193054199, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17852778732776642, "step": 9402 }, { "epoch": 0.293875, "grad_norm": 5.0625, "grad_norm_var": 0.20722249348958333, "learning_rate": 0.0001, "loss": 6.1786, "loss/crossentropy": 2.5539212226867676, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19997169077396393, "step": 9404 }, { "epoch": 0.2939375, "grad_norm": 4.125, "grad_norm_var": 0.23736572265625, "learning_rate": 0.0001, "loss": 6.1939, "loss/crossentropy": 2.6862411499023438, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19490952044725418, "step": 9406 }, { "epoch": 0.294, "grad_norm": 3.28125, "grad_norm_var": 0.23877665201822917, "learning_rate": 0.0001, "loss": 5.8797, "loss/crossentropy": 2.5766749382019043, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17756423354148865, "step": 9408 }, { "epoch": 0.2940625, "grad_norm": 3.0625, "grad_norm_var": 0.24751688639322916, "learning_rate": 0.0001, "loss": 5.9456, "loss/crossentropy": 2.650808095932007, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17869983613491058, "step": 9410 }, { "epoch": 0.294125, "grad_norm": 3.234375, "grad_norm_var": 0.24697977701822918, "learning_rate": 0.0001, "loss": 5.9725, "loss/crossentropy": 2.5413256883621216, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18765229731798172, "step": 9412 }, { "epoch": 0.2941875, "grad_norm": 3.25, "grad_norm_var": 0.24722900390625, "learning_rate": 0.0001, "loss": 5.8193, "loss/crossentropy": 2.443260073661804, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18018130958080292, "step": 9414 }, { "epoch": 0.29425, "grad_norm": 3.40625, "grad_norm_var": 0.24661051432291667, "learning_rate": 0.0001, "loss": 5.8545, "loss/crossentropy": 2.491973638534546, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18312648683786392, "step": 9416 }, { "epoch": 0.2943125, "grad_norm": 3.53125, "grad_norm_var": 0.23987630208333333, "learning_rate": 0.0001, "loss": 6.0687, "loss/crossentropy": 2.5530205965042114, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1918068453669548, "step": 9418 }, { "epoch": 0.294375, "grad_norm": 3.65625, "grad_norm_var": 0.07014973958333333, "learning_rate": 0.0001, "loss": 5.5433, "loss/crossentropy": 2.3218941688537598, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1686219573020935, "step": 9420 }, { "epoch": 0.2944375, "grad_norm": 3.15625, "grad_norm_var": 0.035741170247395836, "learning_rate": 0.0001, "loss": 5.8188, "loss/crossentropy": 2.5462976694107056, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17217408120632172, "step": 9422 }, { "epoch": 0.2945, "grad_norm": 3.359375, "grad_norm_var": 0.035319010416666664, "learning_rate": 0.0001, "loss": 5.9661, "loss/crossentropy": 2.5954443216323853, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18276644498109818, "step": 9424 }, { "epoch": 0.2945625, "grad_norm": 3.328125, "grad_norm_var": 0.03476155598958333, "learning_rate": 0.0001, "loss": 5.9103, "loss/crossentropy": 2.6398624181747437, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17587270587682724, "step": 9426 }, { "epoch": 0.294625, "grad_norm": 3.4375, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 5.9045, "loss/crossentropy": 2.5071613788604736, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18387354165315628, "step": 9428 }, { "epoch": 0.2946875, "grad_norm": 5.21875, "grad_norm_var": 0.25580952962239584, "learning_rate": 0.0001, "loss": 6.5772, "loss/crossentropy": 2.8044530153274536, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.21243342012166977, "step": 9430 }, { "epoch": 0.29475, "grad_norm": 3.625, "grad_norm_var": 0.24859619140625, "learning_rate": 0.0001, "loss": 5.8637, "loss/crossentropy": 2.4671213626861572, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18223374336957932, "step": 9432 }, { "epoch": 0.2948125, "grad_norm": 3.4375, "grad_norm_var": 0.24726460774739584, "learning_rate": 0.0001, "loss": 6.168, "loss/crossentropy": 2.750429153442383, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18667643517255783, "step": 9434 }, { "epoch": 0.294875, "grad_norm": 3.234375, "grad_norm_var": 0.24963277180989582, "learning_rate": 0.0001, "loss": 5.7225, "loss/crossentropy": 2.441996455192566, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1784442812204361, "step": 9436 }, { "epoch": 0.2949375, "grad_norm": 3.53125, "grad_norm_var": 0.24006245930989584, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.450874924659729, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1782272458076477, "step": 9438 }, { "epoch": 0.295, "grad_norm": 3.5625, "grad_norm_var": 0.26676025390625, "learning_rate": 0.0001, "loss": 6.0014, "loss/crossentropy": 2.551122784614563, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19034453481435776, "step": 9440 }, { "epoch": 0.2950625, "grad_norm": 3.4375, "grad_norm_var": 0.2520904541015625, "learning_rate": 0.0001, "loss": 5.8832, "loss/crossentropy": 2.4829729795455933, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18689806014299393, "step": 9442 }, { "epoch": 0.295125, "grad_norm": 3.46875, "grad_norm_var": 0.2420318603515625, "learning_rate": 0.0001, "loss": 6.3832, "loss/crossentropy": 2.75757896900177, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.20045582950115204, "step": 9444 }, { "epoch": 0.2951875, "grad_norm": 3.28125, "grad_norm_var": 0.064453125, "learning_rate": 0.0001, "loss": 6.0769, "loss/crossentropy": 2.629858613014221, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19157715141773224, "step": 9446 }, { "epoch": 0.29525, "grad_norm": 3.296875, "grad_norm_var": 0.0658111572265625, "learning_rate": 0.0001, "loss": 5.7368, "loss/crossentropy": 2.4494065046310425, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17952421307563782, "step": 9448 }, { "epoch": 0.2953125, "grad_norm": 3.515625, "grad_norm_var": 0.0683258056640625, "learning_rate": 0.0001, "loss": 6.2172, "loss/crossentropy": 2.6646794080734253, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1927492320537567, "step": 9450 }, { "epoch": 0.295375, "grad_norm": 3.09375, "grad_norm_var": 0.07465718587239584, "learning_rate": 0.0001, "loss": 5.9107, "loss/crossentropy": 2.565911889076233, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1833091378211975, "step": 9452 }, { "epoch": 0.2954375, "grad_norm": 3.171875, "grad_norm_var": 0.0834136962890625, "learning_rate": 0.0001, "loss": 5.9376, "loss/crossentropy": 2.5572386980056763, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18178445845842361, "step": 9454 }, { "epoch": 0.2955, "grad_norm": 3.28125, "grad_norm_var": 0.04449462890625, "learning_rate": 0.0001, "loss": 5.9452, "loss/crossentropy": 2.6706295013427734, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17628566920757294, "step": 9456 }, { "epoch": 0.2955625, "grad_norm": 3.5, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 6.1368, "loss/crossentropy": 2.710642457008362, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18793027102947235, "step": 9458 }, { "epoch": 0.295625, "grad_norm": 3.375, "grad_norm_var": 0.02685546875, "learning_rate": 0.0001, "loss": 6.0779, "loss/crossentropy": 2.7107990980148315, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18397362530231476, "step": 9460 }, { "epoch": 0.2956875, "grad_norm": 3.359375, "grad_norm_var": 0.02685546875, "learning_rate": 0.0001, "loss": 5.858, "loss/crossentropy": 2.5094329118728638, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18016769737005234, "step": 9462 }, { "epoch": 0.29575, "grad_norm": 3.234375, "grad_norm_var": 0.03330790201822917, "learning_rate": 0.0001, "loss": 5.9281, "loss/crossentropy": 2.5578246116638184, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18507373332977295, "step": 9464 }, { "epoch": 0.2958125, "grad_norm": 3.625, "grad_norm_var": 0.0293853759765625, "learning_rate": 0.0001, "loss": 6.177, "loss/crossentropy": 2.6653488874435425, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19569653272628784, "step": 9466 }, { "epoch": 0.295875, "grad_norm": 3.203125, "grad_norm_var": 0.0265777587890625, "learning_rate": 0.0001, "loss": 5.5547, "loss/crossentropy": 2.340910792350769, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16630034893751144, "step": 9468 }, { "epoch": 0.2959375, "grad_norm": 3.0625, "grad_norm_var": 0.028343709309895833, "learning_rate": 0.0001, "loss": 6.0088, "loss/crossentropy": 2.661539316177368, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18238188326358795, "step": 9470 }, { "epoch": 0.296, "grad_norm": 3.328125, "grad_norm_var": 0.024267578125, "learning_rate": 0.0001, "loss": 6.0804, "loss/crossentropy": 2.6197402477264404, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18668701499700546, "step": 9472 }, { "epoch": 0.2960625, "grad_norm": 3.953125, "grad_norm_var": 0.05462239583333333, "learning_rate": 0.0001, "loss": 5.6989, "loss/crossentropy": 2.4417498111724854, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17102348804473877, "step": 9474 }, { "epoch": 0.296125, "grad_norm": 3.203125, "grad_norm_var": 0.056441243489583334, "learning_rate": 0.0001, "loss": 5.9945, "loss/crossentropy": 2.5656116008758545, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19054538756608963, "step": 9476 }, { "epoch": 0.2961875, "grad_norm": 3.203125, "grad_norm_var": 0.0676177978515625, "learning_rate": 0.0001, "loss": 5.9758, "loss/crossentropy": 2.5559715032577515, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18690335750579834, "step": 9478 }, { "epoch": 0.29625, "grad_norm": 3.359375, "grad_norm_var": 0.06379292805989584, "learning_rate": 0.0001, "loss": 6.15, "loss/crossentropy": 2.7207692861557007, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18823497742414474, "step": 9480 }, { "epoch": 0.2963125, "grad_norm": 3.203125, "grad_norm_var": 0.06122639973958333, "learning_rate": 0.0001, "loss": 5.8091, "loss/crossentropy": 2.492751121520996, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17538242787122726, "step": 9482 }, { "epoch": 0.296375, "grad_norm": 3.078125, "grad_norm_var": 0.06492513020833333, "learning_rate": 0.0001, "loss": 5.736, "loss/crossentropy": 2.543042540550232, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16890213638544083, "step": 9484 }, { "epoch": 0.2964375, "grad_norm": 3.53125, "grad_norm_var": 0.061799112955729166, "learning_rate": 0.0001, "loss": 5.9448, "loss/crossentropy": 2.458292841911316, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1892717480659485, "step": 9486 }, { "epoch": 0.2965, "grad_norm": 3.46875, "grad_norm_var": 0.06148681640625, "learning_rate": 0.0001, "loss": 6.4874, "loss/crossentropy": 2.889983057975769, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20232471078634262, "step": 9488 }, { "epoch": 0.2965625, "grad_norm": 3.25, "grad_norm_var": 0.03206278483072917, "learning_rate": 0.0001, "loss": 6.2168, "loss/crossentropy": 2.751617908477783, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.19378162175416946, "step": 9490 }, { "epoch": 0.296625, "grad_norm": 3.375, "grad_norm_var": 0.027864583333333335, "learning_rate": 0.0001, "loss": 6.0141, "loss/crossentropy": 2.566988706588745, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18807435780763626, "step": 9492 }, { "epoch": 0.2966875, "grad_norm": 3.15625, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 5.8184, "loss/crossentropy": 2.528294086456299, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17744722217321396, "step": 9494 }, { "epoch": 0.29675, "grad_norm": 3.375, "grad_norm_var": 0.017529296875, "learning_rate": 0.0001, "loss": 6.3648, "loss/crossentropy": 2.832018256187439, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19468458741903305, "step": 9496 }, { "epoch": 0.2968125, "grad_norm": 3.25, "grad_norm_var": 0.0201568603515625, "learning_rate": 0.0001, "loss": 6.061, "loss/crossentropy": 2.7335082292556763, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18274761736392975, "step": 9498 }, { "epoch": 0.296875, "grad_norm": 3.046875, "grad_norm_var": 0.020699055989583333, "learning_rate": 0.0001, "loss": 6.0026, "loss/crossentropy": 2.644629120826721, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18111416697502136, "step": 9500 }, { "epoch": 0.2969375, "grad_norm": 3.40625, "grad_norm_var": 0.018387858072916666, "learning_rate": 0.0001, "loss": 5.9254, "loss/crossentropy": 2.5114939212799072, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18553601205348969, "step": 9502 }, { "epoch": 0.297, "grad_norm": 3.6875, "grad_norm_var": 0.025537109375, "learning_rate": 0.0001, "loss": 6.0737, "loss/crossentropy": 2.6292479038238525, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1885829046368599, "step": 9504 }, { "epoch": 0.2970625, "grad_norm": 3.171875, "grad_norm_var": 0.024217732747395835, "learning_rate": 0.0001, "loss": 5.829, "loss/crossentropy": 2.529231548309326, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17763758450746536, "step": 9506 }, { "epoch": 0.297125, "grad_norm": 3.75, "grad_norm_var": 0.03554585774739583, "learning_rate": 0.0001, "loss": 6.2838, "loss/crossentropy": 2.723105311393738, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19864670932292938, "step": 9508 }, { "epoch": 0.2971875, "grad_norm": 3.328125, "grad_norm_var": 0.033934529622395834, "learning_rate": 0.0001, "loss": 6.1941, "loss/crossentropy": 2.7853564023971558, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1842387244105339, "step": 9510 }, { "epoch": 0.29725, "grad_norm": 3.28125, "grad_norm_var": 0.033568318684895834, "learning_rate": 0.0001, "loss": 5.5186, "loss/crossentropy": 2.222517728805542, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1757029891014099, "step": 9512 }, { "epoch": 0.2973125, "grad_norm": 3.515625, "grad_norm_var": 0.03264058430989583, "learning_rate": 0.0001, "loss": 5.5155, "loss/crossentropy": 2.2605106830596924, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17237187176942825, "step": 9514 }, { "epoch": 0.297375, "grad_norm": 3.40625, "grad_norm_var": 0.027925618489583335, "learning_rate": 0.0001, "loss": 6.0786, "loss/crossentropy": 2.588716983795166, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19274143874645233, "step": 9516 }, { "epoch": 0.2974375, "grad_norm": 3.21875, "grad_norm_var": 0.04374898274739583, "learning_rate": 0.0001, "loss": 5.8546, "loss/crossentropy": 2.580670118331909, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17465391755104065, "step": 9518 }, { "epoch": 0.2975, "grad_norm": 3.859375, "grad_norm_var": 0.05211181640625, "learning_rate": 0.0001, "loss": 5.7883, "loss/crossentropy": 2.466141700744629, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17870307713747025, "step": 9520 }, { "epoch": 0.2975625, "grad_norm": 3.609375, "grad_norm_var": 0.04810791015625, "learning_rate": 0.0001, "loss": 5.8819, "loss/crossentropy": 2.5266683101654053, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1773185133934021, "step": 9522 }, { "epoch": 0.297625, "grad_norm": 4.6875, "grad_norm_var": 0.14081929524739584, "learning_rate": 0.0001, "loss": 5.888, "loss/crossentropy": 2.44830322265625, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1873292475938797, "step": 9524 }, { "epoch": 0.2976875, "grad_norm": 3.125, "grad_norm_var": 0.15488179524739584, "learning_rate": 0.0001, "loss": 5.7312, "loss/crossentropy": 2.435327410697937, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17880625277757645, "step": 9526 }, { "epoch": 0.29775, "grad_norm": 3.203125, "grad_norm_var": 0.16428120930989584, "learning_rate": 0.0001, "loss": 5.7866, "loss/crossentropy": 2.4521981477737427, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1818797066807747, "step": 9528 }, { "epoch": 0.2978125, "grad_norm": 3.796875, "grad_norm_var": 0.17026265462239584, "learning_rate": 0.0001, "loss": 6.0854, "loss/crossentropy": 2.559659242630005, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19632402807474136, "step": 9530 }, { "epoch": 0.297875, "grad_norm": 3.078125, "grad_norm_var": 0.17836812337239583, "learning_rate": 0.0001, "loss": 5.5406, "loss/crossentropy": 2.393701434135437, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1639113500714302, "step": 9532 }, { "epoch": 0.2979375, "grad_norm": 3.34375, "grad_norm_var": 0.16140848795572918, "learning_rate": 0.0001, "loss": 5.8569, "loss/crossentropy": 2.4648091793060303, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18452439457178116, "step": 9534 }, { "epoch": 0.298, "grad_norm": 3.296875, "grad_norm_var": 0.15240885416666666, "learning_rate": 0.0001, "loss": 5.9575, "loss/crossentropy": 2.516305923461914, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18670186400413513, "step": 9536 }, { "epoch": 0.2980625, "grad_norm": 3.015625, "grad_norm_var": 0.16092122395833333, "learning_rate": 0.0001, "loss": 5.7747, "loss/crossentropy": 2.584937572479248, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1666332334280014, "step": 9538 }, { "epoch": 0.298125, "grad_norm": 3.546875, "grad_norm_var": 0.04719136555989583, "learning_rate": 0.0001, "loss": 6.1296, "loss/crossentropy": 2.6019665002822876, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19651742279529572, "step": 9540 }, { "epoch": 0.2981875, "grad_norm": 3.6875, "grad_norm_var": 0.048140462239583334, "learning_rate": 0.0001, "loss": 5.9211, "loss/crossentropy": 2.519709587097168, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18545471131801605, "step": 9542 }, { "epoch": 0.29825, "grad_norm": 3.46875, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 6.135, "loss/crossentropy": 2.707599401473999, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1880495250225067, "step": 9544 }, { "epoch": 0.2983125, "grad_norm": 3.125, "grad_norm_var": 0.039891560872395836, "learning_rate": 0.0001, "loss": 5.8902, "loss/crossentropy": 2.584674119949341, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17938528209924698, "step": 9546 }, { "epoch": 0.298375, "grad_norm": 3.328125, "grad_norm_var": 0.07604878743489583, "learning_rate": 0.0001, "loss": 6.1498, "loss/crossentropy": 2.6279720067977905, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19554487615823746, "step": 9548 }, { "epoch": 0.2984375, "grad_norm": 4.0625, "grad_norm_var": 0.10044657389322917, "learning_rate": 0.0001, "loss": 6.1114, "loss/crossentropy": 2.615634799003601, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1941043734550476, "step": 9550 }, { "epoch": 0.2985, "grad_norm": 3.625, "grad_norm_var": 0.10454813639322917, "learning_rate": 0.0001, "loss": 5.8584, "loss/crossentropy": 2.473676919937134, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18105436861515045, "step": 9552 }, { "epoch": 0.2985625, "grad_norm": 3.40625, "grad_norm_var": 0.09113667805989584, "learning_rate": 0.0001, "loss": 5.8849, "loss/crossentropy": 2.549417018890381, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1772959977388382, "step": 9554 }, { "epoch": 0.298625, "grad_norm": 3.53125, "grad_norm_var": 0.09916890462239583, "learning_rate": 0.0001, "loss": 5.8984, "loss/crossentropy": 2.5247409343719482, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18267476558685303, "step": 9556 }, { "epoch": 0.2986875, "grad_norm": 3.5625, "grad_norm_var": 0.09558003743489583, "learning_rate": 0.0001, "loss": 6.3896, "loss/crossentropy": 2.9074044227600098, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19197405874729156, "step": 9558 }, { "epoch": 0.29875, "grad_norm": 3.640625, "grad_norm_var": 0.10263264973958333, "learning_rate": 0.0001, "loss": 5.951, "loss/crossentropy": 2.599033832550049, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18089497834444046, "step": 9560 }, { "epoch": 0.2988125, "grad_norm": 3.546875, "grad_norm_var": 0.08199869791666667, "learning_rate": 0.0001, "loss": 6.0215, "loss/crossentropy": 2.534602165222168, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18853811919689178, "step": 9562 }, { "epoch": 0.298875, "grad_norm": 3.3125, "grad_norm_var": 0.0502105712890625, "learning_rate": 0.0001, "loss": 5.9781, "loss/crossentropy": 2.5743662118911743, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18216802924871445, "step": 9564 }, { "epoch": 0.2989375, "grad_norm": 4.0, "grad_norm_var": 0.07532450358072916, "learning_rate": 0.0001, "loss": 6.0835, "loss/crossentropy": 2.565767288208008, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19083568453788757, "step": 9566 }, { "epoch": 0.299, "grad_norm": 3.46875, "grad_norm_var": 0.068994140625, "learning_rate": 0.0001, "loss": 5.9021, "loss/crossentropy": 2.493484139442444, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1830536648631096, "step": 9568 }, { "epoch": 0.2990625, "grad_norm": 3.671875, "grad_norm_var": 0.06985677083333333, "learning_rate": 0.0001, "loss": 6.2318, "loss/crossentropy": 2.683763265609741, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19386757910251617, "step": 9570 }, { "epoch": 0.299125, "grad_norm": 3.28125, "grad_norm_var": 0.06435445149739584, "learning_rate": 0.0001, "loss": 5.9197, "loss/crossentropy": 2.583989977836609, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18045029789209366, "step": 9572 }, { "epoch": 0.2991875, "grad_norm": 4.09375, "grad_norm_var": 0.08567301432291667, "learning_rate": 0.0001, "loss": 6.0451, "loss/crossentropy": 2.5369582176208496, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19339578598737717, "step": 9574 }, { "epoch": 0.29925, "grad_norm": 3.390625, "grad_norm_var": 0.07838134765625, "learning_rate": 0.0001, "loss": 5.6927, "loss/crossentropy": 2.391352415084839, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1742788478732109, "step": 9576 }, { "epoch": 0.2993125, "grad_norm": 3.453125, "grad_norm_var": 0.07647196451822917, "learning_rate": 0.0001, "loss": 6.1259, "loss/crossentropy": 2.701521873474121, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18853048235177994, "step": 9578 }, { "epoch": 0.299375, "grad_norm": 3.421875, "grad_norm_var": 0.07623291015625, "learning_rate": 0.0001, "loss": 6.2207, "loss/crossentropy": 2.770782947540283, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18952109664678574, "step": 9580 }, { "epoch": 0.2994375, "grad_norm": 3.453125, "grad_norm_var": 0.03580322265625, "learning_rate": 0.0001, "loss": 5.908, "loss/crossentropy": 2.5371055603027344, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18357133120298386, "step": 9582 }, { "epoch": 0.2995, "grad_norm": 3.625, "grad_norm_var": 0.0381988525390625, "learning_rate": 0.0001, "loss": 5.7893, "loss/crossentropy": 2.3868966102600098, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18242916464805603, "step": 9584 }, { "epoch": 0.2995625, "grad_norm": 3.640625, "grad_norm_var": 0.03821614583333333, "learning_rate": 0.0001, "loss": 5.8449, "loss/crossentropy": 2.508471131324768, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17895280569791794, "step": 9586 }, { "epoch": 0.299625, "grad_norm": 3.4375, "grad_norm_var": 0.03559468587239583, "learning_rate": 0.0001, "loss": 5.6706, "loss/crossentropy": 2.353018879890442, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17629433423280716, "step": 9588 }, { "epoch": 0.2996875, "grad_norm": 3.21875, "grad_norm_var": 0.017096964518229167, "learning_rate": 0.0001, "loss": 5.8188, "loss/crossentropy": 2.514328956604004, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17771629244089127, "step": 9590 }, { "epoch": 0.29975, "grad_norm": 3.453125, "grad_norm_var": 0.02197265625, "learning_rate": 0.0001, "loss": 5.9967, "loss/crossentropy": 2.6237692832946777, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18416789174079895, "step": 9592 }, { "epoch": 0.2998125, "grad_norm": 3.078125, "grad_norm_var": 0.0258941650390625, "learning_rate": 0.0001, "loss": 5.948, "loss/crossentropy": 2.5956236124038696, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18289864808321, "step": 9594 }, { "epoch": 0.299875, "grad_norm": 3.390625, "grad_norm_var": 0.02568359375, "learning_rate": 0.0001, "loss": 6.041, "loss/crossentropy": 2.6824913024902344, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18272147327661514, "step": 9596 }, { "epoch": 0.2999375, "grad_norm": 3.40625, "grad_norm_var": 0.0229888916015625, "learning_rate": 0.0001, "loss": 5.926, "loss/crossentropy": 2.5956075191497803, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17991703003644943, "step": 9598 }, { "epoch": 0.3, "grad_norm": 3.5, "grad_norm_var": 0.019660441080729167, "learning_rate": 0.0001, "loss": 6.218, "loss/crossentropy": 2.779041051864624, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1864701434969902, "step": 9600 }, { "epoch": 0.3000625, "grad_norm": 3.40625, "grad_norm_var": 0.016486612955729167, "learning_rate": 0.0001, "loss": 6.0231, "loss/crossentropy": 2.605224609375, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18710007518529892, "step": 9602 }, { "epoch": 0.300125, "grad_norm": 3.453125, "grad_norm_var": 0.020588175455729166, "learning_rate": 0.0001, "loss": 5.9393, "loss/crossentropy": 2.516131639480591, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.187236025929451, "step": 9604 }, { "epoch": 0.3001875, "grad_norm": 3.125, "grad_norm_var": 0.022297159830729166, "learning_rate": 0.0001, "loss": 5.7019, "loss/crossentropy": 2.4160910844802856, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17584580928087234, "step": 9606 }, { "epoch": 0.30025, "grad_norm": 3.34375, "grad_norm_var": 0.05022379557291667, "learning_rate": 0.0001, "loss": 6.1495, "loss/crossentropy": 2.7719966173171997, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18384820222854614, "step": 9608 }, { "epoch": 0.3003125, "grad_norm": 3.09375, "grad_norm_var": 0.049681599934895834, "learning_rate": 0.0001, "loss": 5.9063, "loss/crossentropy": 2.548715829849243, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18185021728277206, "step": 9610 }, { "epoch": 0.300375, "grad_norm": 3.171875, "grad_norm_var": 0.05271708170572917, "learning_rate": 0.0001, "loss": 5.9881, "loss/crossentropy": 2.6455163955688477, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17839647084474564, "step": 9612 }, { "epoch": 0.3004375, "grad_norm": 3.484375, "grad_norm_var": 0.05388081868489583, "learning_rate": 0.0001, "loss": 5.8598, "loss/crossentropy": 2.5083526372909546, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.178897887468338, "step": 9614 }, { "epoch": 0.3005, "grad_norm": 3.390625, "grad_norm_var": 0.05284830729166667, "learning_rate": 0.0001, "loss": 6.1027, "loss/crossentropy": 2.705984592437744, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18693287670612335, "step": 9616 }, { "epoch": 0.3005625, "grad_norm": 3.25, "grad_norm_var": 0.05530192057291667, "learning_rate": 0.0001, "loss": 6.0881, "loss/crossentropy": 2.582324504852295, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19158920645713806, "step": 9618 }, { "epoch": 0.300625, "grad_norm": 3.40625, "grad_norm_var": 0.05628255208333333, "learning_rate": 0.0001, "loss": 5.9898, "loss/crossentropy": 2.5595717430114746, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.189116433262825, "step": 9620 }, { "epoch": 0.3006875, "grad_norm": 3.28125, "grad_norm_var": 0.053644816080729164, "learning_rate": 0.0001, "loss": 5.9982, "loss/crossentropy": 2.6964563131332397, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17900234460830688, "step": 9622 }, { "epoch": 0.30075, "grad_norm": 3.046875, "grad_norm_var": 0.03076171875, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.5678685903549194, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17911336570978165, "step": 9624 }, { "epoch": 0.3008125, "grad_norm": 3.046875, "grad_norm_var": 0.03389383951822917, "learning_rate": 0.0001, "loss": 5.7959, "loss/crossentropy": 2.503232955932617, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17653512209653854, "step": 9626 }, { "epoch": 0.300875, "grad_norm": 3.359375, "grad_norm_var": 0.03497721354166667, "learning_rate": 0.0001, "loss": 5.7962, "loss/crossentropy": 2.52774715423584, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1745016649365425, "step": 9628 }, { "epoch": 0.3009375, "grad_norm": 3.421875, "grad_norm_var": 0.03589579264322917, "learning_rate": 0.0001, "loss": 5.8252, "loss/crossentropy": 2.4879744052886963, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17746925354003906, "step": 9630 }, { "epoch": 0.301, "grad_norm": 3.1875, "grad_norm_var": 0.0363922119140625, "learning_rate": 0.0001, "loss": 5.9173, "loss/crossentropy": 2.5328809022903442, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18727260828018188, "step": 9632 }, { "epoch": 0.3010625, "grad_norm": 3.59375, "grad_norm_var": 0.03642578125, "learning_rate": 0.0001, "loss": 5.9692, "loss/crossentropy": 2.6109448671340942, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18230891227722168, "step": 9634 }, { "epoch": 0.301125, "grad_norm": 3.25, "grad_norm_var": 0.023323567708333333, "learning_rate": 0.0001, "loss": 5.9591, "loss/crossentropy": 2.6024105548858643, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18058867007493973, "step": 9636 }, { "epoch": 0.3011875, "grad_norm": 3.484375, "grad_norm_var": 0.02896728515625, "learning_rate": 0.0001, "loss": 5.5176, "loss/crossentropy": 2.2790863513946533, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17345691472291946, "step": 9638 }, { "epoch": 0.30125, "grad_norm": 3.40625, "grad_norm_var": 0.030790201822916665, "learning_rate": 0.0001, "loss": 6.1989, "loss/crossentropy": 2.7583065032958984, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18858934193849564, "step": 9640 }, { "epoch": 0.3013125, "grad_norm": 3.078125, "grad_norm_var": 0.03411458333333333, "learning_rate": 0.0001, "loss": 5.7386, "loss/crossentropy": 2.461578130722046, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17535844445228577, "step": 9642 }, { "epoch": 0.301375, "grad_norm": 3.1875, "grad_norm_var": 0.03196512858072917, "learning_rate": 0.0001, "loss": 5.9512, "loss/crossentropy": 2.6277549266815186, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1807790920138359, "step": 9644 }, { "epoch": 0.3014375, "grad_norm": 3.21875, "grad_norm_var": 0.029076131184895833, "learning_rate": 0.0001, "loss": 5.4921, "loss/crossentropy": 2.221467673778534, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1723794862627983, "step": 9646 }, { "epoch": 0.3015, "grad_norm": 3.015625, "grad_norm_var": 0.03277994791666667, "learning_rate": 0.0001, "loss": 5.6586, "loss/crossentropy": 2.457500696182251, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17206186801195145, "step": 9648 }, { "epoch": 0.3015625, "grad_norm": 5.8125, "grad_norm_var": 0.4381022135416667, "learning_rate": 0.0001, "loss": 5.787, "loss/crossentropy": 2.481991767883301, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17971954494714737, "step": 9650 }, { "epoch": 0.301625, "grad_norm": 3.453125, "grad_norm_var": 0.4443023681640625, "learning_rate": 0.0001, "loss": 6.5569, "loss/crossentropy": 2.9203284978866577, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.20232653617858887, "step": 9652 }, { "epoch": 0.3016875, "grad_norm": 3.53125, "grad_norm_var": 0.4389556884765625, "learning_rate": 0.0001, "loss": 5.9993, "loss/crossentropy": 2.6400113105773926, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18006744980812073, "step": 9654 }, { "epoch": 0.30175, "grad_norm": 3.390625, "grad_norm_var": 0.4326985677083333, "learning_rate": 0.0001, "loss": 5.972, "loss/crossentropy": 2.6356629133224487, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18129239231348038, "step": 9656 }, { "epoch": 0.3018125, "grad_norm": 3.515625, "grad_norm_var": 0.42558186848958335, "learning_rate": 0.0001, "loss": 5.9715, "loss/crossentropy": 2.6084052324295044, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18435966968536377, "step": 9658 }, { "epoch": 0.301875, "grad_norm": 3.203125, "grad_norm_var": 0.42740478515625, "learning_rate": 0.0001, "loss": 5.5919, "loss/crossentropy": 2.396050214767456, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16958267986774445, "step": 9660 }, { "epoch": 0.3019375, "grad_norm": 3.46875, "grad_norm_var": 0.42538655598958336, "learning_rate": 0.0001, "loss": 6.0327, "loss/crossentropy": 2.6803042888641357, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18367833644151688, "step": 9662 }, { "epoch": 0.302, "grad_norm": 3.46875, "grad_norm_var": 0.40513916015625, "learning_rate": 0.0001, "loss": 5.9193, "loss/crossentropy": 2.616178512573242, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1775795891880989, "step": 9664 }, { "epoch": 0.3020625, "grad_norm": 3.125, "grad_norm_var": 0.03400777180989583, "learning_rate": 0.0001, "loss": 6.0945, "loss/crossentropy": 2.6753381490707397, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18918544054031372, "step": 9666 }, { "epoch": 0.302125, "grad_norm": 3.453125, "grad_norm_var": 0.023921712239583334, "learning_rate": 0.0001, "loss": 5.76, "loss/crossentropy": 2.3625649213790894, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18076029419898987, "step": 9668 }, { "epoch": 0.3021875, "grad_norm": 3.21875, "grad_norm_var": 0.021468098958333334, "learning_rate": 0.0001, "loss": 5.9876, "loss/crossentropy": 2.6000994443893433, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18445394933223724, "step": 9670 }, { "epoch": 0.30225, "grad_norm": 3.234375, "grad_norm_var": 0.019596354166666666, "learning_rate": 0.0001, "loss": 6.0433, "loss/crossentropy": 2.628748297691345, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18911347538232803, "step": 9672 }, { "epoch": 0.3023125, "grad_norm": 3.34375, "grad_norm_var": 0.018583170572916665, "learning_rate": 0.0001, "loss": 6.0157, "loss/crossentropy": 2.6911391019821167, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17972269654273987, "step": 9674 }, { "epoch": 0.302375, "grad_norm": 3.421875, "grad_norm_var": 0.018919881184895834, "learning_rate": 0.0001, "loss": 5.9976, "loss/crossentropy": 2.540882706642151, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19098274409770966, "step": 9676 }, { "epoch": 0.3024375, "grad_norm": 3.0, "grad_norm_var": 0.0226470947265625, "learning_rate": 0.0001, "loss": 5.8145, "loss/crossentropy": 2.524951934814453, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1727074533700943, "step": 9678 }, { "epoch": 0.3025, "grad_norm": 3.078125, "grad_norm_var": 0.024901326497395834, "learning_rate": 0.0001, "loss": 5.9706, "loss/crossentropy": 2.6175925731658936, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18256500363349915, "step": 9680 }, { "epoch": 0.3025625, "grad_norm": 3.328125, "grad_norm_var": 0.029931640625, "learning_rate": 0.0001, "loss": 6.1773, "loss/crossentropy": 2.6699429750442505, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19526726007461548, "step": 9682 }, { "epoch": 0.302625, "grad_norm": 3.21875, "grad_norm_var": 0.024828084309895835, "learning_rate": 0.0001, "loss": 5.7396, "loss/crossentropy": 2.4560199975967407, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17757853120565414, "step": 9684 }, { "epoch": 0.3026875, "grad_norm": 3.28125, "grad_norm_var": 0.0289459228515625, "learning_rate": 0.0001, "loss": 5.706, "loss/crossentropy": 2.4104011058807373, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17370350658893585, "step": 9686 }, { "epoch": 0.30275, "grad_norm": 3.21875, "grad_norm_var": 0.03212788899739583, "learning_rate": 0.0001, "loss": 6.059, "loss/crossentropy": 2.6281578540802, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18957317620515823, "step": 9688 }, { "epoch": 0.3028125, "grad_norm": 3.390625, "grad_norm_var": 0.030077107747395835, "learning_rate": 0.0001, "loss": 6.0764, "loss/crossentropy": 2.6625046730041504, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18787209689617157, "step": 9690 }, { "epoch": 0.302875, "grad_norm": 3.546875, "grad_norm_var": 0.031110636393229165, "learning_rate": 0.0001, "loss": 6.0875, "loss/crossentropy": 2.630816340446472, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18746767193078995, "step": 9692 }, { "epoch": 0.3029375, "grad_norm": 2.90625, "grad_norm_var": 0.03593648274739583, "learning_rate": 0.0001, "loss": 5.7976, "loss/crossentropy": 2.48416268825531, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1789965257048607, "step": 9694 }, { "epoch": 0.303, "grad_norm": 3.109375, "grad_norm_var": 0.037255859375, "learning_rate": 0.0001, "loss": 5.6082, "loss/crossentropy": 2.3885338306427, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17118359357118607, "step": 9696 }, { "epoch": 0.3030625, "grad_norm": 5.125, "grad_norm_var": 0.23954671223958332, "learning_rate": 0.0001, "loss": 5.8096, "loss/crossentropy": 2.481685996055603, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18044928461313248, "step": 9698 }, { "epoch": 0.303125, "grad_norm": 3.640625, "grad_norm_var": 0.24023335774739582, "learning_rate": 0.0001, "loss": 5.9725, "loss/crossentropy": 2.5380191802978516, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18485242128372192, "step": 9700 }, { "epoch": 0.3031875, "grad_norm": 4.21875, "grad_norm_var": 0.28283589680989585, "learning_rate": 0.0001, "loss": 6.1457, "loss/crossentropy": 2.648195743560791, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1931135207414627, "step": 9702 }, { "epoch": 0.30325, "grad_norm": 3.40625, "grad_norm_var": 0.31652730305989585, "learning_rate": 0.0001, "loss": 6.2557, "loss/crossentropy": 2.6659653186798096, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20076920092105865, "step": 9704 }, { "epoch": 0.3033125, "grad_norm": 3.359375, "grad_norm_var": 0.3236968994140625, "learning_rate": 0.0001, "loss": 5.774, "loss/crossentropy": 2.449228048324585, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17779207974672318, "step": 9706 }, { "epoch": 0.303375, "grad_norm": 3.328125, "grad_norm_var": 0.32242431640625, "learning_rate": 0.0001, "loss": 5.9802, "loss/crossentropy": 2.4985880851745605, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19269168376922607, "step": 9708 }, { "epoch": 0.3034375, "grad_norm": 3.265625, "grad_norm_var": 0.29934488932291664, "learning_rate": 0.0001, "loss": 5.7041, "loss/crossentropy": 2.4116644859313965, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17455655336380005, "step": 9710 }, { "epoch": 0.3035, "grad_norm": 3.5625, "grad_norm_var": 0.27971089680989586, "learning_rate": 0.0001, "loss": 5.8835, "loss/crossentropy": 2.5367835760116577, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17920752614736557, "step": 9712 }, { "epoch": 0.3035625, "grad_norm": 3.546875, "grad_norm_var": 0.1057525634765625, "learning_rate": 0.0001, "loss": 6.2456, "loss/crossentropy": 2.7077924013137817, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19480124861001968, "step": 9714 }, { "epoch": 0.303625, "grad_norm": 3.59375, "grad_norm_var": 0.10676676432291667, "learning_rate": 0.0001, "loss": 5.8065, "loss/crossentropy": 2.430699110031128, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18054787814617157, "step": 9716 }, { "epoch": 0.3036875, "grad_norm": 3.71875, "grad_norm_var": 0.06737874348958334, "learning_rate": 0.0001, "loss": 6.3493, "loss/crossentropy": 2.7686712741851807, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1971207708120346, "step": 9718 }, { "epoch": 0.30375, "grad_norm": 3.390625, "grad_norm_var": 0.024442545572916665, "learning_rate": 0.0001, "loss": 6.152, "loss/crossentropy": 2.6613842248916626, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19554419070482254, "step": 9720 }, { "epoch": 0.3038125, "grad_norm": 3.765625, "grad_norm_var": 0.026448567708333332, "learning_rate": 0.0001, "loss": 6.0501, "loss/crossentropy": 2.6510229110717773, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1817050278186798, "step": 9722 }, { "epoch": 0.303875, "grad_norm": 3.40625, "grad_norm_var": 0.025194295247395835, "learning_rate": 0.0001, "loss": 5.992, "loss/crossentropy": 2.553894519805908, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18639186024665833, "step": 9724 }, { "epoch": 0.3039375, "grad_norm": 3.265625, "grad_norm_var": 0.026276652018229166, "learning_rate": 0.0001, "loss": 5.9109, "loss/crossentropy": 2.565396308898926, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17986725270748138, "step": 9726 }, { "epoch": 0.304, "grad_norm": 3.265625, "grad_norm_var": 0.0246490478515625, "learning_rate": 0.0001, "loss": 5.7018, "loss/crossentropy": 2.4221383333206177, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17249853909015656, "step": 9728 }, { "epoch": 0.3040625, "grad_norm": 3.140625, "grad_norm_var": 0.02939453125, "learning_rate": 0.0001, "loss": 5.7047, "loss/crossentropy": 2.3192784786224365, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17955958098173141, "step": 9730 }, { "epoch": 0.304125, "grad_norm": 3.125, "grad_norm_var": 0.03310546875, "learning_rate": 0.0001, "loss": 5.7375, "loss/crossentropy": 2.4046072959899902, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1797727271914482, "step": 9732 }, { "epoch": 0.3041875, "grad_norm": 3.578125, "grad_norm_var": 0.0322174072265625, "learning_rate": 0.0001, "loss": 5.862, "loss/crossentropy": 2.5601195096969604, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17745760083198547, "step": 9734 }, { "epoch": 0.30425, "grad_norm": 4.15625, "grad_norm_var": 0.07295633951822916, "learning_rate": 0.0001, "loss": 6.1264, "loss/crossentropy": 2.6302343606948853, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19140934944152832, "step": 9736 }, { "epoch": 0.3043125, "grad_norm": 3.4375, "grad_norm_var": 0.06372782389322916, "learning_rate": 0.0001, "loss": 5.9925, "loss/crossentropy": 2.5693787336349487, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18528062105178833, "step": 9738 }, { "epoch": 0.304375, "grad_norm": 3.0, "grad_norm_var": 0.076953125, "learning_rate": 0.0001, "loss": 5.7311, "loss/crossentropy": 2.5311845541000366, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16843144595623016, "step": 9740 }, { "epoch": 0.3044375, "grad_norm": 3.296875, "grad_norm_var": 0.07720947265625, "learning_rate": 0.0001, "loss": 5.6622, "loss/crossentropy": 2.4350874423980713, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16606903076171875, "step": 9742 }, { "epoch": 0.3045, "grad_norm": 3.515625, "grad_norm_var": 0.07876688639322917, "learning_rate": 0.0001, "loss": 5.9038, "loss/crossentropy": 2.5257811546325684, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18389403074979782, "step": 9744 }, { "epoch": 0.3045625, "grad_norm": 3.21875, "grad_norm_var": 0.0848297119140625, "learning_rate": 0.0001, "loss": 6.0084, "loss/crossentropy": 2.6016011238098145, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18403909355401993, "step": 9746 }, { "epoch": 0.304625, "grad_norm": 3.5625, "grad_norm_var": 0.08259989420572916, "learning_rate": 0.0001, "loss": 5.8981, "loss/crossentropy": 2.4193954467773438, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.18771403282880783, "step": 9748 }, { "epoch": 0.3046875, "grad_norm": 3.25, "grad_norm_var": 0.07692057291666667, "learning_rate": 0.0001, "loss": 6.047, "loss/crossentropy": 2.6089885234832764, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19145718216896057, "step": 9750 }, { "epoch": 0.30475, "grad_norm": 3.359375, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 5.7729, "loss/crossentropy": 2.414872884750366, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17838361859321594, "step": 9752 }, { "epoch": 0.3048125, "grad_norm": 3.21875, "grad_norm_var": 0.034891764322916664, "learning_rate": 0.0001, "loss": 5.8067, "loss/crossentropy": 2.4757325649261475, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18075258284807205, "step": 9754 }, { "epoch": 0.304875, "grad_norm": 3.421875, "grad_norm_var": 0.024079386393229166, "learning_rate": 0.0001, "loss": 5.8794, "loss/crossentropy": 2.59158194065094, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17644038051366806, "step": 9756 }, { "epoch": 0.3049375, "grad_norm": 3.1875, "grad_norm_var": 0.025667317708333335, "learning_rate": 0.0001, "loss": 5.903, "loss/crossentropy": 2.588863968849182, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17750480771064758, "step": 9758 }, { "epoch": 0.305, "grad_norm": 3.40625, "grad_norm_var": 0.02877197265625, "learning_rate": 0.0001, "loss": 6.0948, "loss/crossentropy": 2.6731438636779785, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18747824430465698, "step": 9760 }, { "epoch": 0.3050625, "grad_norm": 3.1875, "grad_norm_var": 0.019896443684895834, "learning_rate": 0.0001, "loss": 6.1542, "loss/crossentropy": 2.7394288778305054, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1852274313569069, "step": 9762 }, { "epoch": 0.305125, "grad_norm": 4.15625, "grad_norm_var": 0.0641510009765625, "learning_rate": 0.0001, "loss": 6.1654, "loss/crossentropy": 2.600125551223755, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19441720843315125, "step": 9764 }, { "epoch": 0.3051875, "grad_norm": 3.15625, "grad_norm_var": 0.06757405598958334, "learning_rate": 0.0001, "loss": 5.8276, "loss/crossentropy": 2.552868127822876, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17473676800727844, "step": 9766 }, { "epoch": 0.30525, "grad_norm": 3.796875, "grad_norm_var": 0.08151753743489583, "learning_rate": 0.0001, "loss": 5.7432, "loss/crossentropy": 2.376441240310669, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17808697372674942, "step": 9768 }, { "epoch": 0.3053125, "grad_norm": 3.890625, "grad_norm_var": 0.10416259765625, "learning_rate": 0.0001, "loss": 6.2732, "loss/crossentropy": 2.801830768585205, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1897130012512207, "step": 9770 }, { "epoch": 0.305375, "grad_norm": 3.578125, "grad_norm_var": 0.10221354166666667, "learning_rate": 0.0001, "loss": 6.282, "loss/crossentropy": 2.7058520317077637, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20019535720348358, "step": 9772 }, { "epoch": 0.3054375, "grad_norm": 3.5625, "grad_norm_var": 0.09934895833333333, "learning_rate": 0.0001, "loss": 5.9044, "loss/crossentropy": 2.424149751663208, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1913842111825943, "step": 9774 }, { "epoch": 0.3055, "grad_norm": 3.59375, "grad_norm_var": 0.09735921223958334, "learning_rate": 0.0001, "loss": 5.8527, "loss/crossentropy": 2.4528133869171143, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.182962104678154, "step": 9776 }, { "epoch": 0.3055625, "grad_norm": 3.109375, "grad_norm_var": 0.09986063639322916, "learning_rate": 0.0001, "loss": 6.0973, "loss/crossentropy": 2.6982662677764893, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18638630211353302, "step": 9778 }, { "epoch": 0.305625, "grad_norm": 3.625, "grad_norm_var": 0.0629791259765625, "learning_rate": 0.0001, "loss": 6.3929, "loss/crossentropy": 2.798943281173706, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.2000209465622902, "step": 9780 }, { "epoch": 0.3056875, "grad_norm": 3.0, "grad_norm_var": 0.069580078125, "learning_rate": 0.0001, "loss": 5.7924, "loss/crossentropy": 2.5172841548919678, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1779046207666397, "step": 9782 }, { "epoch": 0.30575, "grad_norm": 3.40625, "grad_norm_var": 0.06213785807291667, "learning_rate": 0.0001, "loss": 5.8359, "loss/crossentropy": 2.506075620651245, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1810283660888672, "step": 9784 }, { "epoch": 0.3058125, "grad_norm": 3.484375, "grad_norm_var": 0.0404449462890625, "learning_rate": 0.0001, "loss": 5.9368, "loss/crossentropy": 2.524527072906494, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18770781904459, "step": 9786 }, { "epoch": 0.305875, "grad_norm": 3.234375, "grad_norm_var": 0.0729888916015625, "learning_rate": 0.0001, "loss": 5.6651, "loss/crossentropy": 2.3186328411102295, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1811298280954361, "step": 9788 }, { "epoch": 0.3059375, "grad_norm": 3.3125, "grad_norm_var": 0.09324544270833333, "learning_rate": 0.0001, "loss": 5.9769, "loss/crossentropy": 2.6126078367233276, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18212811648845673, "step": 9790 }, { "epoch": 0.306, "grad_norm": 3.40625, "grad_norm_var": 0.08733317057291666, "learning_rate": 0.0001, "loss": 5.843, "loss/crossentropy": 2.464438796043396, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18395350873470306, "step": 9792 }, { "epoch": 0.3060625, "grad_norm": 3.84375, "grad_norm_var": 0.0983306884765625, "learning_rate": 0.0001, "loss": 5.8246, "loss/crossentropy": 2.483186721801758, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17905943095684052, "step": 9794 }, { "epoch": 0.306125, "grad_norm": 3.15625, "grad_norm_var": 0.10074462890625, "learning_rate": 0.0001, "loss": 6.034, "loss/crossentropy": 2.705387234687805, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18129978328943253, "step": 9796 }, { "epoch": 0.3061875, "grad_norm": 3.421875, "grad_norm_var": 0.08580322265625, "learning_rate": 0.0001, "loss": 5.8784, "loss/crossentropy": 2.4730982780456543, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18428245186805725, "step": 9798 }, { "epoch": 0.30625, "grad_norm": 3.25, "grad_norm_var": 0.09385477701822917, "learning_rate": 0.0001, "loss": 5.5357, "loss/crossentropy": 2.307369589805603, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16892365366220474, "step": 9800 }, { "epoch": 0.3063125, "grad_norm": 3.3125, "grad_norm_var": 0.0942291259765625, "learning_rate": 0.0001, "loss": 5.7947, "loss/crossentropy": 2.520161747932434, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17433323711156845, "step": 9802 }, { "epoch": 0.306375, "grad_norm": 3.28125, "grad_norm_var": 0.0646484375, "learning_rate": 0.0001, "loss": 6.0379, "loss/crossentropy": 2.6151548624038696, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18641389906406403, "step": 9804 }, { "epoch": 0.3064375, "grad_norm": 3.421875, "grad_norm_var": 0.05175374348958333, "learning_rate": 0.0001, "loss": 5.8846, "loss/crossentropy": 2.3822693824768066, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.187337726354599, "step": 9806 }, { "epoch": 0.3065, "grad_norm": 3.59375, "grad_norm_var": 0.05625712076822917, "learning_rate": 0.0001, "loss": 6.2549, "loss/crossentropy": 2.7366974353790283, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19322381913661957, "step": 9808 }, { "epoch": 0.3065625, "grad_norm": 3.046875, "grad_norm_var": 0.06360270182291666, "learning_rate": 0.0001, "loss": 5.6028, "loss/crossentropy": 2.431782364845276, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16788671165704727, "step": 9810 }, { "epoch": 0.306625, "grad_norm": 3.421875, "grad_norm_var": 0.05806884765625, "learning_rate": 0.0001, "loss": 6.0663, "loss/crossentropy": 2.6174486875534058, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18863768130540848, "step": 9812 }, { "epoch": 0.3066875, "grad_norm": 3.421875, "grad_norm_var": 0.0593658447265625, "learning_rate": 0.0001, "loss": 5.8863, "loss/crossentropy": 2.484455108642578, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18393176048994064, "step": 9814 }, { "epoch": 0.30675, "grad_norm": 3.359375, "grad_norm_var": 0.061909993489583336, "learning_rate": 0.0001, "loss": 5.8589, "loss/crossentropy": 2.4574899673461914, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18271709978580475, "step": 9816 }, { "epoch": 0.3068125, "grad_norm": 3.609375, "grad_norm_var": 0.06285400390625, "learning_rate": 0.0001, "loss": 6.0258, "loss/crossentropy": 2.581871509552002, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18775159865617752, "step": 9818 }, { "epoch": 0.306875, "grad_norm": 3.53125, "grad_norm_var": 0.0616363525390625, "learning_rate": 0.0001, "loss": 6.1466, "loss/crossentropy": 2.6562764644622803, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19473157078027725, "step": 9820 }, { "epoch": 0.3069375, "grad_norm": 3.78125, "grad_norm_var": 0.06469624837239583, "learning_rate": 0.0001, "loss": 6.2948, "loss/crossentropy": 2.640057325363159, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.20141346007585526, "step": 9822 }, { "epoch": 0.307, "grad_norm": 3.484375, "grad_norm_var": 0.06384175618489583, "learning_rate": 0.0001, "loss": 6.0037, "loss/crossentropy": 2.6254125833511353, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18391861766576767, "step": 9824 }, { "epoch": 0.3070625, "grad_norm": 3.0, "grad_norm_var": 0.051732381184895836, "learning_rate": 0.0001, "loss": 6.1341, "loss/crossentropy": 2.7805886268615723, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1853499636054039, "step": 9826 }, { "epoch": 0.307125, "grad_norm": 4.09375, "grad_norm_var": 0.07996317545572916, "learning_rate": 0.0001, "loss": 6.1145, "loss/crossentropy": 2.549388289451599, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19869714230298996, "step": 9828 }, { "epoch": 0.3071875, "grad_norm": 3.25, "grad_norm_var": 0.08531901041666666, "learning_rate": 0.0001, "loss": 5.7362, "loss/crossentropy": 2.4683566093444824, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17131523042917252, "step": 9830 }, { "epoch": 0.30725, "grad_norm": 3.4375, "grad_norm_var": 0.0760162353515625, "learning_rate": 0.0001, "loss": 5.9847, "loss/crossentropy": 2.5591347217559814, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1886463165283203, "step": 9832 }, { "epoch": 0.3073125, "grad_norm": 3.34375, "grad_norm_var": 0.08001302083333334, "learning_rate": 0.0001, "loss": 6.272, "loss/crossentropy": 2.7357157468795776, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19659461081027985, "step": 9834 }, { "epoch": 0.307375, "grad_norm": 3.484375, "grad_norm_var": 0.079541015625, "learning_rate": 0.0001, "loss": 5.9386, "loss/crossentropy": 2.547179102897644, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18289538472890854, "step": 9836 }, { "epoch": 0.3074375, "grad_norm": 3.515625, "grad_norm_var": 0.07056884765625, "learning_rate": 0.0001, "loss": 6.2855, "loss/crossentropy": 2.701116442680359, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1998470351099968, "step": 9838 }, { "epoch": 0.3075, "grad_norm": 3.421875, "grad_norm_var": 0.07183837890625, "learning_rate": 0.0001, "loss": 6.0312, "loss/crossentropy": 2.6321617364883423, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1867804378271103, "step": 9840 }, { "epoch": 0.3075625, "grad_norm": 3.296875, "grad_norm_var": 0.06365458170572917, "learning_rate": 0.0001, "loss": 6.1738, "loss/crossentropy": 2.7372519969940186, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18662403523921967, "step": 9842 }, { "epoch": 0.307625, "grad_norm": 3.3125, "grad_norm_var": 0.03885091145833333, "learning_rate": 0.0001, "loss": 5.7245, "loss/crossentropy": 2.4566760063171387, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1756145879626274, "step": 9844 }, { "epoch": 0.3076875, "grad_norm": 3.4375, "grad_norm_var": 0.07322591145833333, "learning_rate": 0.0001, "loss": 6.1802, "loss/crossentropy": 2.640401840209961, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19421062618494034, "step": 9846 }, { "epoch": 0.30775, "grad_norm": 3.3125, "grad_norm_var": 0.07102457682291667, "learning_rate": 0.0001, "loss": 5.9244, "loss/crossentropy": 2.5438748598098755, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18297865241765976, "step": 9848 }, { "epoch": 0.3078125, "grad_norm": 3.765625, "grad_norm_var": 0.07532450358072916, "learning_rate": 0.0001, "loss": 6.1063, "loss/crossentropy": 2.6450347900390625, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18909262120723724, "step": 9850 }, { "epoch": 0.307875, "grad_norm": 3.53125, "grad_norm_var": 0.07706705729166667, "learning_rate": 0.0001, "loss": 6.0794, "loss/crossentropy": 2.6155248880386353, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18857572972774506, "step": 9852 }, { "epoch": 0.3079375, "grad_norm": 3.28125, "grad_norm_var": 0.0739898681640625, "learning_rate": 0.0001, "loss": 5.7991, "loss/crossentropy": 2.467707872390747, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17844978719949722, "step": 9854 }, { "epoch": 0.308, "grad_norm": 3.328125, "grad_norm_var": 0.0732574462890625, "learning_rate": 0.0001, "loss": 6.2447, "loss/crossentropy": 2.705179214477539, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1976974532008171, "step": 9856 }, { "epoch": 0.3080625, "grad_norm": 3.328125, "grad_norm_var": 0.06796773274739583, "learning_rate": 0.0001, "loss": 6.0257, "loss/crossentropy": 2.636121988296509, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18465761840343475, "step": 9858 }, { "epoch": 0.308125, "grad_norm": 3.34375, "grad_norm_var": 0.0575103759765625, "learning_rate": 0.0001, "loss": 6.0515, "loss/crossentropy": 2.5803788900375366, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19047530740499496, "step": 9860 }, { "epoch": 0.3081875, "grad_norm": 3.234375, "grad_norm_var": 0.05416666666666667, "learning_rate": 0.0001, "loss": 6.1671, "loss/crossentropy": 2.685998320579529, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19029683619737625, "step": 9862 }, { "epoch": 0.30825, "grad_norm": 3.28125, "grad_norm_var": 0.05315348307291667, "learning_rate": 0.0001, "loss": 6.1954, "loss/crossentropy": 2.747362732887268, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18738098442554474, "step": 9864 }, { "epoch": 0.3083125, "grad_norm": 3.375, "grad_norm_var": 0.0454254150390625, "learning_rate": 0.0001, "loss": 6.2392, "loss/crossentropy": 2.808570146560669, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18759103119373322, "step": 9866 }, { "epoch": 0.308375, "grad_norm": 3.171875, "grad_norm_var": 0.05003153483072917, "learning_rate": 0.0001, "loss": 6.049, "loss/crossentropy": 2.623955488204956, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18586328625679016, "step": 9868 }, { "epoch": 0.3084375, "grad_norm": 3.0625, "grad_norm_var": 0.056864420572916664, "learning_rate": 0.0001, "loss": 5.7197, "loss/crossentropy": 2.475876808166504, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1732119396328926, "step": 9870 }, { "epoch": 0.3085, "grad_norm": 3.515625, "grad_norm_var": 0.05852457682291667, "learning_rate": 0.0001, "loss": 5.8995, "loss/crossentropy": 2.5887688398361206, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17756131291389465, "step": 9872 }, { "epoch": 0.3085625, "grad_norm": 4.78125, "grad_norm_var": 0.18848368326822917, "learning_rate": 0.0001, "loss": 6.5997, "loss/crossentropy": 2.8490360975265503, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.21413087844848633, "step": 9874 }, { "epoch": 0.308625, "grad_norm": 3.3125, "grad_norm_var": 0.18660481770833334, "learning_rate": 0.0001, "loss": 6.2266, "loss/crossentropy": 2.7796058654785156, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18650047481060028, "step": 9876 }, { "epoch": 0.3086875, "grad_norm": 3.4375, "grad_norm_var": 0.1646148681640625, "learning_rate": 0.0001, "loss": 6.1877, "loss/crossentropy": 2.775872588157654, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18610799312591553, "step": 9878 }, { "epoch": 0.30875, "grad_norm": 3.734375, "grad_norm_var": 0.16564127604166667, "learning_rate": 0.0001, "loss": 5.9442, "loss/crossentropy": 2.5594289302825928, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.181057408452034, "step": 9880 }, { "epoch": 0.3088125, "grad_norm": 3.53125, "grad_norm_var": 0.16448465983072916, "learning_rate": 0.0001, "loss": 6.1045, "loss/crossentropy": 2.648719549179077, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19128568470478058, "step": 9882 }, { "epoch": 0.308875, "grad_norm": 3.15625, "grad_norm_var": 0.16286519368489583, "learning_rate": 0.0001, "loss": 5.9229, "loss/crossentropy": 2.567718982696533, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18161438405513763, "step": 9884 }, { "epoch": 0.3089375, "grad_norm": 3.359375, "grad_norm_var": 0.1539703369140625, "learning_rate": 0.0001, "loss": 6.0521, "loss/crossentropy": 2.6140952110290527, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18481512367725372, "step": 9886 }, { "epoch": 0.309, "grad_norm": 3.171875, "grad_norm_var": 0.1518218994140625, "learning_rate": 0.0001, "loss": 5.9709, "loss/crossentropy": 2.589396595954895, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18580633401870728, "step": 9888 }, { "epoch": 0.3090625, "grad_norm": 3.265625, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 5.9705, "loss/crossentropy": 2.5676733255386353, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1805213838815689, "step": 9890 }, { "epoch": 0.309125, "grad_norm": 3.203125, "grad_norm_var": 0.024120076497395834, "learning_rate": 0.0001, "loss": 6.1128, "loss/crossentropy": 2.677307605743408, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1904277577996254, "step": 9892 }, { "epoch": 0.3091875, "grad_norm": 3.109375, "grad_norm_var": 0.027083333333333334, "learning_rate": 0.0001, "loss": 5.3723, "loss/crossentropy": 2.256300449371338, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16081786900758743, "step": 9894 }, { "epoch": 0.30925, "grad_norm": 3.703125, "grad_norm_var": 0.0312408447265625, "learning_rate": 0.0001, "loss": 6.5875, "loss/crossentropy": 2.90229868888855, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.2087506353855133, "step": 9896 }, { "epoch": 0.3093125, "grad_norm": 3.421875, "grad_norm_var": 0.029133097330729166, "learning_rate": 0.0001, "loss": 5.5702, "loss/crossentropy": 2.324537992477417, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17299997806549072, "step": 9898 }, { "epoch": 0.309375, "grad_norm": 3.59375, "grad_norm_var": 0.032648722330729164, "learning_rate": 0.0001, "loss": 5.9227, "loss/crossentropy": 2.5770009756088257, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1810528114438057, "step": 9900 }, { "epoch": 0.3094375, "grad_norm": 2.890625, "grad_norm_var": 0.046930948893229164, "learning_rate": 0.0001, "loss": 5.6693, "loss/crossentropy": 2.446715235710144, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17264599353075027, "step": 9902 }, { "epoch": 0.3095, "grad_norm": 3.359375, "grad_norm_var": 0.04419657389322917, "learning_rate": 0.0001, "loss": 5.8134, "loss/crossentropy": 2.5081379413604736, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17661748826503754, "step": 9904 }, { "epoch": 0.3095625, "grad_norm": 3.109375, "grad_norm_var": 0.044352213541666664, "learning_rate": 0.0001, "loss": 5.749, "loss/crossentropy": 2.468726873397827, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17646163702011108, "step": 9906 }, { "epoch": 0.309625, "grad_norm": 3.953125, "grad_norm_var": 0.08619384765625, "learning_rate": 0.0001, "loss": 5.7259, "loss/crossentropy": 2.3587589263916016, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17968729883432388, "step": 9908 }, { "epoch": 0.3096875, "grad_norm": 3.75, "grad_norm_var": 0.09411519368489583, "learning_rate": 0.0001, "loss": 6.1154, "loss/crossentropy": 2.744936943054199, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18274830281734467, "step": 9910 }, { "epoch": 0.30975, "grad_norm": 3.453125, "grad_norm_var": 0.09482421875, "learning_rate": 0.0001, "loss": 5.834, "loss/crossentropy": 2.5946810245513916, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1715928465127945, "step": 9912 }, { "epoch": 0.3098125, "grad_norm": 3.609375, "grad_norm_var": 0.6234527587890625, "learning_rate": 0.0001, "loss": 6.0723, "loss/crossentropy": 2.575463891029358, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19421055167913437, "step": 9914 }, { "epoch": 0.309875, "grad_norm": 3.3125, "grad_norm_var": 0.6217437744140625, "learning_rate": 0.0001, "loss": 5.7882, "loss/crossentropy": 2.528978943824768, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1731920763850212, "step": 9916 }, { "epoch": 0.3099375, "grad_norm": 3.640625, "grad_norm_var": 0.5905914306640625, "learning_rate": 0.0001, "loss": 6.0089, "loss/crossentropy": 2.5800297260284424, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18546050786972046, "step": 9918 }, { "epoch": 0.31, "grad_norm": 3.921875, "grad_norm_var": 0.5864166259765625, "learning_rate": 0.0001, "loss": 6.1517, "loss/crossentropy": 2.6178401708602905, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.19283895194530487, "step": 9920 }, { "epoch": 0.3100625, "grad_norm": 3.359375, "grad_norm_var": 0.5754709879557292, "learning_rate": 0.0001, "loss": 6.3124, "loss/crossentropy": 2.8379873037338257, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19118830561637878, "step": 9922 }, { "epoch": 0.310125, "grad_norm": 3.46875, "grad_norm_var": 0.6920206705729167, "learning_rate": 0.0001, "loss": 5.9962, "loss/crossentropy": 2.508086919784546, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1960764080286026, "step": 9924 }, { "epoch": 0.3101875, "grad_norm": 3.515625, "grad_norm_var": 0.6780832926432292, "learning_rate": 0.0001, "loss": 6.1674, "loss/crossentropy": 2.7152515649795532, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18661946803331375, "step": 9926 }, { "epoch": 0.31025, "grad_norm": 3.40625, "grad_norm_var": 0.6502024332682291, "learning_rate": 0.0001, "loss": 5.6761, "loss/crossentropy": 2.3589260578155518, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17508075386285782, "step": 9928 }, { "epoch": 0.3103125, "grad_norm": 3.5625, "grad_norm_var": 0.2156646728515625, "learning_rate": 0.0001, "loss": 5.8203, "loss/crossentropy": 2.5374714136123657, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17515962570905685, "step": 9930 }, { "epoch": 0.310375, "grad_norm": 3.15625, "grad_norm_var": 0.219921875, "learning_rate": 0.0001, "loss": 6.037, "loss/crossentropy": 2.6640161275863647, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18416845053434372, "step": 9932 }, { "epoch": 0.3104375, "grad_norm": 3.609375, "grad_norm_var": 0.21574605305989583, "learning_rate": 0.0001, "loss": 6.198, "loss/crossentropy": 2.705440402030945, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19104811549186707, "step": 9934 }, { "epoch": 0.3105, "grad_norm": 3.453125, "grad_norm_var": 0.2122467041015625, "learning_rate": 0.0001, "loss": 5.8265, "loss/crossentropy": 2.546779990196228, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17953650653362274, "step": 9936 }, { "epoch": 0.3105625, "grad_norm": 3.4375, "grad_norm_var": 0.20606180826822917, "learning_rate": 0.0001, "loss": 6.1075, "loss/crossentropy": 2.6210367679595947, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1900544911623001, "step": 9938 }, { "epoch": 0.310625, "grad_norm": 3.3125, "grad_norm_var": 0.06210530598958333, "learning_rate": 0.0001, "loss": 5.8099, "loss/crossentropy": 2.5792685747146606, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1734534054994583, "step": 9940 }, { "epoch": 0.3106875, "grad_norm": 7.65625, "grad_norm_var": 1.1744466145833334, "learning_rate": 0.0001, "loss": 6.0235, "loss/crossentropy": 2.4303317070007324, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19994131475687027, "step": 9942 }, { "epoch": 0.31075, "grad_norm": 3.09375, "grad_norm_var": 1.1827799479166667, "learning_rate": 0.0001, "loss": 5.732, "loss/crossentropy": 2.526008129119873, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17098946869373322, "step": 9944 }, { "epoch": 0.3108125, "grad_norm": 3.65625, "grad_norm_var": 1.1689198811848958, "learning_rate": 0.0001, "loss": 6.0426, "loss/crossentropy": 2.6789000034332275, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18128886818885803, "step": 9946 }, { "epoch": 0.310875, "grad_norm": 3.0625, "grad_norm_var": 1.18121337890625, "learning_rate": 0.0001, "loss": 5.8011, "loss/crossentropy": 2.4889053106307983, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17770235240459442, "step": 9948 }, { "epoch": 0.3109375, "grad_norm": 3.390625, "grad_norm_var": 1.1945963541666667, "learning_rate": 0.0001, "loss": 5.963, "loss/crossentropy": 2.5693660974502563, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18780048191547394, "step": 9950 }, { "epoch": 0.311, "grad_norm": 3.515625, "grad_norm_var": 1.1877604166666667, "learning_rate": 0.0001, "loss": 6.3526, "loss/crossentropy": 2.8071502447128296, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1936059445142746, "step": 9952 }, { "epoch": 0.3110625, "grad_norm": 3.625, "grad_norm_var": 1.1951243082682292, "learning_rate": 0.0001, "loss": 6.1133, "loss/crossentropy": 2.6762869358062744, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18627797067165375, "step": 9954 }, { "epoch": 0.311125, "grad_norm": 3.296875, "grad_norm_var": 1.17275390625, "learning_rate": 0.0001, "loss": 6.275, "loss/crossentropy": 2.8558825254440308, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18410159647464752, "step": 9956 }, { "epoch": 0.3111875, "grad_norm": 3.5, "grad_norm_var": 0.055597941080729164, "learning_rate": 0.0001, "loss": 6.21, "loss/crossentropy": 2.735555052757263, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1915881484746933, "step": 9958 }, { "epoch": 0.31125, "grad_norm": 3.1875, "grad_norm_var": 0.05237223307291667, "learning_rate": 0.0001, "loss": 5.8453, "loss/crossentropy": 2.5318996906280518, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1762586608529091, "step": 9960 }, { "epoch": 0.3113125, "grad_norm": 3.328125, "grad_norm_var": 0.049051920572916664, "learning_rate": 0.0001, "loss": 6.0751, "loss/crossentropy": 2.6993168592453003, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1832793802022934, "step": 9962 }, { "epoch": 0.311375, "grad_norm": 3.40625, "grad_norm_var": 0.0489654541015625, "learning_rate": 0.0001, "loss": 5.998, "loss/crossentropy": 2.5605742931365967, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1874968707561493, "step": 9964 }, { "epoch": 0.3114375, "grad_norm": 3.859375, "grad_norm_var": 0.056396484375, "learning_rate": 0.0001, "loss": 6.1923, "loss/crossentropy": 2.628121256828308, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19743662327528, "step": 9966 }, { "epoch": 0.3115, "grad_norm": 3.296875, "grad_norm_var": 0.052294921875, "learning_rate": 0.0001, "loss": 5.9873, "loss/crossentropy": 2.6812620162963867, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17787063866853714, "step": 9968 }, { "epoch": 0.3115625, "grad_norm": 3.609375, "grad_norm_var": 0.04845377604166667, "learning_rate": 0.0001, "loss": 6.0452, "loss/crossentropy": 2.5701335668563843, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18813063204288483, "step": 9970 }, { "epoch": 0.311625, "grad_norm": 3.328125, "grad_norm_var": 0.0431060791015625, "learning_rate": 0.0001, "loss": 5.6299, "loss/crossentropy": 2.377366542816162, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17446747422218323, "step": 9972 }, { "epoch": 0.3116875, "grad_norm": 3.671875, "grad_norm_var": 0.048974609375, "learning_rate": 0.0001, "loss": 5.7221, "loss/crossentropy": 2.362430214881897, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17659056186676025, "step": 9974 }, { "epoch": 0.31175, "grad_norm": 3.75, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 5.8213, "loss/crossentropy": 2.462308406829834, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17808612436056137, "step": 9976 }, { "epoch": 0.3118125, "grad_norm": 3.3125, "grad_norm_var": 0.05877278645833333, "learning_rate": 0.0001, "loss": 6.1144, "loss/crossentropy": 2.7456939220428467, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18335268646478653, "step": 9978 }, { "epoch": 0.311875, "grad_norm": 3.390625, "grad_norm_var": 0.0531890869140625, "learning_rate": 0.0001, "loss": 5.8751, "loss/crossentropy": 2.599093437194824, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1733005940914154, "step": 9980 }, { "epoch": 0.3119375, "grad_norm": 3.390625, "grad_norm_var": 0.03828125, "learning_rate": 0.0001, "loss": 6.3225, "loss/crossentropy": 2.8240467309951782, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19437789916992188, "step": 9982 }, { "epoch": 0.312, "grad_norm": 3.609375, "grad_norm_var": 0.03931884765625, "learning_rate": 0.0001, "loss": 6.072, "loss/crossentropy": 2.614315390586853, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19069334864616394, "step": 9984 }, { "epoch": 0.3120625, "grad_norm": 3.453125, "grad_norm_var": 0.04952799479166667, "learning_rate": 0.0001, "loss": 6.1526, "loss/crossentropy": 2.637527585029602, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19369181990623474, "step": 9986 }, { "epoch": 0.312125, "grad_norm": 3.390625, "grad_norm_var": 0.04692281087239583, "learning_rate": 0.0001, "loss": 6.1406, "loss/crossentropy": 2.7079596519470215, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18857339769601822, "step": 9988 }, { "epoch": 0.3121875, "grad_norm": 3.578125, "grad_norm_var": 0.04166259765625, "learning_rate": 0.0001, "loss": 5.921, "loss/crossentropy": 2.5391063690185547, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18310847878456116, "step": 9990 }, { "epoch": 0.31225, "grad_norm": 3.1875, "grad_norm_var": 0.032242838541666666, "learning_rate": 0.0001, "loss": 6.4327, "loss/crossentropy": 2.9867708683013916, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1902955174446106, "step": 9992 }, { "epoch": 0.3123125, "grad_norm": 3.34375, "grad_norm_var": 0.03570963541666667, "learning_rate": 0.0001, "loss": 5.8569, "loss/crossentropy": 2.498446226119995, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18232624232769012, "step": 9994 }, { "epoch": 0.312375, "grad_norm": 3.34375, "grad_norm_var": 0.031245930989583334, "learning_rate": 0.0001, "loss": 6.0615, "loss/crossentropy": 2.6084243059158325, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19100607931613922, "step": 9996 }, { "epoch": 0.3124375, "grad_norm": 3.3125, "grad_norm_var": 0.0321441650390625, "learning_rate": 0.0001, "loss": 5.9178, "loss/crossentropy": 2.620732069015503, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17580141127109528, "step": 9998 }, { "epoch": 0.3125, "grad_norm": 3.421875, "grad_norm_var": 0.03232014973958333, "learning_rate": 0.0001, "loss": 5.748, "loss/crossentropy": 2.4111841917037964, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18055960536003113, "step": 10000 }, { "epoch": 0.3125625, "grad_norm": 3.359375, "grad_norm_var": 0.018407185872395832, "learning_rate": 0.0001, "loss": 5.8249, "loss/crossentropy": 2.4341362714767456, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1863442361354828, "step": 10002 }, { "epoch": 0.312625, "grad_norm": 3.921875, "grad_norm_var": 0.03772379557291667, "learning_rate": 0.0001, "loss": 5.9725, "loss/crossentropy": 2.524761438369751, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18384020030498505, "step": 10004 }, { "epoch": 0.3126875, "grad_norm": 3.359375, "grad_norm_var": 0.035481770833333336, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.4397616386413574, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1822672188282013, "step": 10006 }, { "epoch": 0.31275, "grad_norm": 3.359375, "grad_norm_var": 0.03580322265625, "learning_rate": 0.0001, "loss": 5.9844, "loss/crossentropy": 2.6379276514053345, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17995747178792953, "step": 10008 }, { "epoch": 0.3128125, "grad_norm": 3.609375, "grad_norm_var": 0.034521484375, "learning_rate": 0.0001, "loss": 5.9061, "loss/crossentropy": 2.4900271892547607, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18457267433404922, "step": 10010 }, { "epoch": 0.312875, "grad_norm": 3.015625, "grad_norm_var": 0.04472554524739583, "learning_rate": 0.0001, "loss": 6.0315, "loss/crossentropy": 2.5978981256484985, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1859375536441803, "step": 10012 }, { "epoch": 0.3129375, "grad_norm": 3.34375, "grad_norm_var": 0.04940999348958333, "learning_rate": 0.0001, "loss": 5.9851, "loss/crossentropy": 2.6621599197387695, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17995379120111465, "step": 10014 }, { "epoch": 0.313, "grad_norm": 3.546875, "grad_norm_var": 0.0474609375, "learning_rate": 0.0001, "loss": 6.3342, "loss/crossentropy": 2.760565996170044, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.19525590538978577, "step": 10016 }, { "epoch": 0.3130625, "grad_norm": 3.5, "grad_norm_var": 0.04726460774739583, "learning_rate": 0.0001, "loss": 5.8966, "loss/crossentropy": 2.5175243616104126, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18517116457223892, "step": 10018 }, { "epoch": 0.313125, "grad_norm": 3.484375, "grad_norm_var": 0.029548136393229167, "learning_rate": 0.0001, "loss": 6.05, "loss/crossentropy": 2.631057024002075, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1875997707247734, "step": 10020 }, { "epoch": 0.3131875, "grad_norm": 3.65625, "grad_norm_var": 0.0339263916015625, "learning_rate": 0.0001, "loss": 5.9978, "loss/crossentropy": 2.4847946166992188, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1930968165397644, "step": 10022 }, { "epoch": 0.31325, "grad_norm": 3.171875, "grad_norm_var": 0.0322418212890625, "learning_rate": 0.0001, "loss": 6.1395, "loss/crossentropy": 2.7118492126464844, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18964150547981262, "step": 10024 }, { "epoch": 0.3133125, "grad_norm": 3.515625, "grad_norm_var": 0.03418680826822917, "learning_rate": 0.0001, "loss": 6.3217, "loss/crossentropy": 2.7653316259384155, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19860395044088364, "step": 10026 }, { "epoch": 0.313375, "grad_norm": 3.265625, "grad_norm_var": 0.024592081705729168, "learning_rate": 0.0001, "loss": 6.0078, "loss/crossentropy": 2.6814316511154175, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18146147578954697, "step": 10028 }, { "epoch": 0.3134375, "grad_norm": 3.484375, "grad_norm_var": 0.02001953125, "learning_rate": 0.0001, "loss": 6.0967, "loss/crossentropy": 2.6082998514175415, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19298158586025238, "step": 10030 }, { "epoch": 0.3135, "grad_norm": 3.28125, "grad_norm_var": 0.020995076497395834, "learning_rate": 0.0001, "loss": 5.9659, "loss/crossentropy": 2.502140998840332, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18855996429920197, "step": 10032 }, { "epoch": 0.3135625, "grad_norm": 3.328125, "grad_norm_var": 0.02412109375, "learning_rate": 0.0001, "loss": 5.9479, "loss/crossentropy": 2.6646758317947388, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17714965343475342, "step": 10034 }, { "epoch": 0.313625, "grad_norm": 3.359375, "grad_norm_var": 0.02877197265625, "learning_rate": 0.0001, "loss": 5.7889, "loss/crossentropy": 2.542953610420227, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17225146293640137, "step": 10036 }, { "epoch": 0.3136875, "grad_norm": 3.109375, "grad_norm_var": 0.025446573893229168, "learning_rate": 0.0001, "loss": 6.0174, "loss/crossentropy": 2.647829055786133, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1834377571940422, "step": 10038 }, { "epoch": 0.31375, "grad_norm": 3.390625, "grad_norm_var": 0.023921712239583334, "learning_rate": 0.0001, "loss": 6.0348, "loss/crossentropy": 2.7294058799743652, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17897269129753113, "step": 10040 }, { "epoch": 0.3138125, "grad_norm": 3.421875, "grad_norm_var": 0.012523396809895834, "learning_rate": 0.0001, "loss": 5.7022, "loss/crossentropy": 2.3868244886398315, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1737213134765625, "step": 10042 }, { "epoch": 0.313875, "grad_norm": 3.34375, "grad_norm_var": 0.012723795572916667, "learning_rate": 0.0001, "loss": 5.6612, "loss/crossentropy": 2.4281164407730103, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17330902069807053, "step": 10044 }, { "epoch": 0.3139375, "grad_norm": 3.46875, "grad_norm_var": 0.020210774739583333, "learning_rate": 0.0001, "loss": 5.7424, "loss/crossentropy": 2.547224283218384, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1652233824133873, "step": 10046 }, { "epoch": 0.314, "grad_norm": 3.46875, "grad_norm_var": 0.024559529622395833, "learning_rate": 0.0001, "loss": 5.9676, "loss/crossentropy": 2.5773009061813354, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18824848532676697, "step": 10048 }, { "epoch": 0.3140625, "grad_norm": 4.03125, "grad_norm_var": 6.336457316080729, "learning_rate": 0.0001, "loss": 6.2463, "loss/crossentropy": 2.6703532934188843, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1993865668773651, "step": 10050 }, { "epoch": 0.314125, "grad_norm": 3.4375, "grad_norm_var": 6.288590494791666, "learning_rate": 0.0001, "loss": 6.0822, "loss/crossentropy": 2.74504292011261, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17824263870716095, "step": 10052 }, { "epoch": 0.3141875, "grad_norm": 3.359375, "grad_norm_var": 6.258088175455729, "learning_rate": 0.0001, "loss": 5.9835, "loss/crossentropy": 2.5916759967803955, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1848854348063469, "step": 10054 }, { "epoch": 0.31425, "grad_norm": 3.265625, "grad_norm_var": 6.292967732747396, "learning_rate": 0.0001, "loss": 5.7399, "loss/crossentropy": 2.497455358505249, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17229175567626953, "step": 10056 }, { "epoch": 0.3143125, "grad_norm": 3.625, "grad_norm_var": 6.293008422851562, "learning_rate": 0.0001, "loss": 5.9707, "loss/crossentropy": 2.6046427488327026, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1803584024310112, "step": 10058 }, { "epoch": 0.314375, "grad_norm": 3.75, "grad_norm_var": 6.237691243489583, "learning_rate": 0.0001, "loss": 5.9682, "loss/crossentropy": 2.5612787008285522, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18444325774908066, "step": 10060 }, { "epoch": 0.3144375, "grad_norm": 3.265625, "grad_norm_var": 6.182445271809896, "learning_rate": 0.0001, "loss": 5.8448, "loss/crossentropy": 2.4632593393325806, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18190263211727142, "step": 10062 }, { "epoch": 0.3145, "grad_norm": 3.203125, "grad_norm_var": 6.199735514322916, "learning_rate": 0.0001, "loss": 5.9259, "loss/crossentropy": 2.5135574340820312, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18303150683641434, "step": 10064 }, { "epoch": 0.3145625, "grad_norm": 3.609375, "grad_norm_var": 0.03591206868489583, "learning_rate": 0.0001, "loss": 5.9092, "loss/crossentropy": 2.597083568572998, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17964644730091095, "step": 10066 }, { "epoch": 0.314625, "grad_norm": 3.390625, "grad_norm_var": 0.03460286458333333, "learning_rate": 0.0001, "loss": 5.7752, "loss/crossentropy": 2.3850090503692627, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18081103265285492, "step": 10068 }, { "epoch": 0.3146875, "grad_norm": 3.8125, "grad_norm_var": 0.07229410807291667, "learning_rate": 0.0001, "loss": 6.113, "loss/crossentropy": 2.440373659133911, "loss/hidden": 1.71484375, "loss/jsd": 0.0, "loss/logits": 0.1957770213484764, "step": 10070 }, { "epoch": 0.31475, "grad_norm": 3.828125, "grad_norm_var": 0.06702067057291666, "learning_rate": 0.0001, "loss": 6.0065, "loss/crossentropy": 2.573858380317688, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1877976655960083, "step": 10072 }, { "epoch": 0.3148125, "grad_norm": 3.46875, "grad_norm_var": 0.07527567545572916, "learning_rate": 0.0001, "loss": 5.9498, "loss/crossentropy": 2.566790461540222, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18361780047416687, "step": 10074 }, { "epoch": 0.314875, "grad_norm": 4.0625, "grad_norm_var": 0.09072265625, "learning_rate": 0.0001, "loss": 6.2288, "loss/crossentropy": 2.6813313961029053, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19654232263565063, "step": 10076 }, { "epoch": 0.3149375, "grad_norm": 3.71875, "grad_norm_var": 0.0894927978515625, "learning_rate": 0.0001, "loss": 6.2026, "loss/crossentropy": 2.6554404497146606, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19768338650465012, "step": 10078 }, { "epoch": 0.315, "grad_norm": 3.453125, "grad_norm_var": 0.0810699462890625, "learning_rate": 0.0001, "loss": 5.9567, "loss/crossentropy": 2.5344364643096924, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18792706727981567, "step": 10080 }, { "epoch": 0.3150625, "grad_norm": 3.3125, "grad_norm_var": 0.08329671223958333, "learning_rate": 0.0001, "loss": 5.9447, "loss/crossentropy": 2.5786601305007935, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18503838777542114, "step": 10082 }, { "epoch": 0.315125, "grad_norm": 3.28125, "grad_norm_var": 0.08629150390625, "learning_rate": 0.0001, "loss": 6.0267, "loss/crossentropy": 2.615303635597229, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.188794806599617, "step": 10084 }, { "epoch": 0.3151875, "grad_norm": 3.03125, "grad_norm_var": 0.09729410807291666, "learning_rate": 0.0001, "loss": 5.6881, "loss/crossentropy": 2.497081160545349, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17105385661125183, "step": 10086 }, { "epoch": 0.31525, "grad_norm": 3.59375, "grad_norm_var": 0.09480692545572916, "learning_rate": 0.0001, "loss": 6.041, "loss/crossentropy": 2.6185524463653564, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18833978474140167, "step": 10088 }, { "epoch": 0.3153125, "grad_norm": 4.34375, "grad_norm_var": 0.1343414306640625, "learning_rate": 0.0001, "loss": 5.7858, "loss/crossentropy": 2.496425151824951, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17620240151882172, "step": 10090 }, { "epoch": 0.315375, "grad_norm": 3.484375, "grad_norm_var": 0.11458231608072916, "learning_rate": 0.0001, "loss": 5.837, "loss/crossentropy": 2.5144152641296387, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17874548584222794, "step": 10092 }, { "epoch": 0.3154375, "grad_norm": 3.59375, "grad_norm_var": 0.1048004150390625, "learning_rate": 0.0001, "loss": 5.8075, "loss/crossentropy": 2.473135471343994, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17914441972970963, "step": 10094 }, { "epoch": 0.3155, "grad_norm": 3.34375, "grad_norm_var": 0.10533854166666666, "learning_rate": 0.0001, "loss": 6.141, "loss/crossentropy": 2.6939806938171387, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1872761845588684, "step": 10096 }, { "epoch": 0.3155625, "grad_norm": 3.234375, "grad_norm_var": 0.0986968994140625, "learning_rate": 0.0001, "loss": 5.4468, "loss/crossentropy": 2.274045705795288, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16727469116449356, "step": 10098 }, { "epoch": 0.315625, "grad_norm": 3.03125, "grad_norm_var": 0.105078125, "learning_rate": 0.0001, "loss": 6.0835, "loss/crossentropy": 2.676869750022888, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18910247087478638, "step": 10100 }, { "epoch": 0.3156875, "grad_norm": 3.0625, "grad_norm_var": 0.0931640625, "learning_rate": 0.0001, "loss": 5.7976, "loss/crossentropy": 2.5403417348861694, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17455856502056122, "step": 10102 }, { "epoch": 0.31575, "grad_norm": 3.25, "grad_norm_var": 0.08815104166666667, "learning_rate": 0.0001, "loss": 5.8803, "loss/crossentropy": 2.5339393615722656, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17721383273601532, "step": 10104 }, { "epoch": 0.3158125, "grad_norm": 3.25, "grad_norm_var": 0.022705078125, "learning_rate": 0.0001, "loss": 5.8428, "loss/crossentropy": 2.582388997077942, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17447850108146667, "step": 10106 }, { "epoch": 0.315875, "grad_norm": 3.5, "grad_norm_var": 0.9008748372395833, "learning_rate": 0.0001, "loss": 6.0148, "loss/crossentropy": 2.483462333679199, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.19024056941270828, "step": 10108 }, { "epoch": 0.3159375, "grad_norm": 3.234375, "grad_norm_var": 0.9079264322916667, "learning_rate": 0.0001, "loss": 5.6382, "loss/crossentropy": 2.321375846862793, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17738066613674164, "step": 10110 }, { "epoch": 0.316, "grad_norm": 3.421875, "grad_norm_var": 0.9083292643229167, "learning_rate": 0.0001, "loss": 6.0126, "loss/crossentropy": 2.585088610649109, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18845763057470322, "step": 10112 }, { "epoch": 0.3160625, "grad_norm": 3.609375, "grad_norm_var": 0.8969390869140625, "learning_rate": 0.0001, "loss": 5.8656, "loss/crossentropy": 2.4846831560134888, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18340729176998138, "step": 10114 }, { "epoch": 0.316125, "grad_norm": 3.328125, "grad_norm_var": 0.8772420247395833, "learning_rate": 0.0001, "loss": 6.2419, "loss/crossentropy": 2.7272180318832397, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1928756982088089, "step": 10116 }, { "epoch": 0.3161875, "grad_norm": 3.3125, "grad_norm_var": 0.860009765625, "learning_rate": 0.0001, "loss": 6.0352, "loss/crossentropy": 2.6085673570632935, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18601831048727036, "step": 10118 }, { "epoch": 0.31625, "grad_norm": 3.625, "grad_norm_var": 0.8505930582682292, "learning_rate": 0.0001, "loss": 5.9706, "loss/crossentropy": 2.563066244125366, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18918965011835098, "step": 10120 }, { "epoch": 0.3163125, "grad_norm": 3.1875, "grad_norm_var": 0.86148681640625, "learning_rate": 0.0001, "loss": 5.9185, "loss/crossentropy": 2.5656826496124268, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18020518869161606, "step": 10122 }, { "epoch": 0.316375, "grad_norm": 3.28125, "grad_norm_var": 0.027269490559895835, "learning_rate": 0.0001, "loss": 6.2062, "loss/crossentropy": 2.751970887184143, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18839187920093536, "step": 10124 }, { "epoch": 0.3164375, "grad_norm": 3.53125, "grad_norm_var": 0.033492024739583334, "learning_rate": 0.0001, "loss": 6.0275, "loss/crossentropy": 2.6042853593826294, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18685470521450043, "step": 10126 }, { "epoch": 0.3165, "grad_norm": 3.734375, "grad_norm_var": 0.0392730712890625, "learning_rate": 0.0001, "loss": 6.4755, "loss/crossentropy": 2.923148512840271, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1942974030971527, "step": 10128 }, { "epoch": 0.3165625, "grad_norm": 3.421875, "grad_norm_var": 0.03551025390625, "learning_rate": 0.0001, "loss": 6.4908, "loss/crossentropy": 2.9329782724380493, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19796500355005264, "step": 10130 }, { "epoch": 0.316625, "grad_norm": 3.0625, "grad_norm_var": 0.04715169270833333, "learning_rate": 0.0001, "loss": 5.8301, "loss/crossentropy": 2.6023718118667603, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1727747619152069, "step": 10132 }, { "epoch": 0.3166875, "grad_norm": 2.984375, "grad_norm_var": 0.062333170572916666, "learning_rate": 0.0001, "loss": 5.7403, "loss/crossentropy": 2.5091053247451782, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17312300950288773, "step": 10134 }, { "epoch": 0.31675, "grad_norm": 3.546875, "grad_norm_var": 0.05915425618489583, "learning_rate": 0.0001, "loss": 5.4448, "loss/crossentropy": 2.1704102754592896, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1704123616218567, "step": 10136 }, { "epoch": 0.3168125, "grad_norm": 3.453125, "grad_norm_var": 0.05845947265625, "learning_rate": 0.0001, "loss": 6.0383, "loss/crossentropy": 2.6399617195129395, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18436075747013092, "step": 10138 }, { "epoch": 0.316875, "grad_norm": 3.3125, "grad_norm_var": 0.0577789306640625, "learning_rate": 0.0001, "loss": 6.2752, "loss/crossentropy": 2.7846546173095703, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1935882568359375, "step": 10140 }, { "epoch": 0.3169375, "grad_norm": 3.671875, "grad_norm_var": 0.06409098307291666, "learning_rate": 0.0001, "loss": 5.9632, "loss/crossentropy": 2.5568896532058716, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18750131130218506, "step": 10142 }, { "epoch": 0.317, "grad_norm": 3.578125, "grad_norm_var": 0.058470662434895834, "learning_rate": 0.0001, "loss": 6.0904, "loss/crossentropy": 2.6694942712783813, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18897011131048203, "step": 10144 }, { "epoch": 0.3170625, "grad_norm": 3.21875, "grad_norm_var": 0.08661702473958334, "learning_rate": 0.0001, "loss": 6.068, "loss/crossentropy": 2.649327874183655, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18718105554580688, "step": 10146 }, { "epoch": 0.317125, "grad_norm": 3.3125, "grad_norm_var": 0.07558492024739584, "learning_rate": 0.0001, "loss": 5.599, "loss/crossentropy": 2.344407320022583, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17507170885801315, "step": 10148 }, { "epoch": 0.3171875, "grad_norm": 3.46875, "grad_norm_var": 0.055078125, "learning_rate": 0.0001, "loss": 5.8624, "loss/crossentropy": 2.5185035467147827, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18165487796068192, "step": 10150 }, { "epoch": 0.31725, "grad_norm": 3.46875, "grad_norm_var": 0.050169881184895834, "learning_rate": 0.0001, "loss": 5.8706, "loss/crossentropy": 2.4989309310913086, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18326275050640106, "step": 10152 }, { "epoch": 0.3173125, "grad_norm": 3.609375, "grad_norm_var": 0.050553385416666666, "learning_rate": 0.0001, "loss": 6.2412, "loss/crossentropy": 2.77180016040802, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19381748884916306, "step": 10154 }, { "epoch": 0.317375, "grad_norm": 3.5625, "grad_norm_var": 0.05331624348958333, "learning_rate": 0.0001, "loss": 5.8568, "loss/crossentropy": 2.56472384929657, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17686709016561508, "step": 10156 }, { "epoch": 0.3174375, "grad_norm": 3.296875, "grad_norm_var": 0.04781494140625, "learning_rate": 0.0001, "loss": 5.5883, "loss/crossentropy": 2.3735530376434326, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17225316166877747, "step": 10158 }, { "epoch": 0.3175, "grad_norm": 3.6875, "grad_norm_var": 0.058447265625, "learning_rate": 0.0001, "loss": 6.2468, "loss/crossentropy": 2.710073947906494, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19585628807544708, "step": 10160 }, { "epoch": 0.3175625, "grad_norm": 3.359375, "grad_norm_var": 0.032938639322916664, "learning_rate": 0.0001, "loss": 5.4335, "loss/crossentropy": 2.1610593795776367, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1721634864807129, "step": 10162 }, { "epoch": 0.317625, "grad_norm": 3.109375, "grad_norm_var": 0.03828837076822917, "learning_rate": 0.0001, "loss": 6.2081, "loss/crossentropy": 2.792189121246338, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18768785893917084, "step": 10164 }, { "epoch": 0.3176875, "grad_norm": 3.265625, "grad_norm_var": 0.04657796223958333, "learning_rate": 0.0001, "loss": 5.8877, "loss/crossentropy": 2.6299341917037964, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17616848647594452, "step": 10166 }, { "epoch": 0.31775, "grad_norm": 3.546875, "grad_norm_var": 0.052708943684895836, "learning_rate": 0.0001, "loss": 6.1424, "loss/crossentropy": 2.760109543800354, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1827574521303177, "step": 10168 }, { "epoch": 0.3178125, "grad_norm": 3.671875, "grad_norm_var": 0.059992472330729164, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.468233346939087, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18216142058372498, "step": 10170 }, { "epoch": 0.317875, "grad_norm": 3.25, "grad_norm_var": 0.055516560872395836, "learning_rate": 0.0001, "loss": 6.0491, "loss/crossentropy": 2.6257340908050537, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1895984634757042, "step": 10172 }, { "epoch": 0.3179375, "grad_norm": 3.484375, "grad_norm_var": 0.053254191080729166, "learning_rate": 0.0001, "loss": 6.0024, "loss/crossentropy": 2.6600505113601685, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18228283524513245, "step": 10174 }, { "epoch": 0.318, "grad_norm": 4.4375, "grad_norm_var": 0.11450093587239583, "learning_rate": 0.0001, "loss": 5.8633, "loss/crossentropy": 2.492933988571167, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18430470675230026, "step": 10176 }, { "epoch": 0.3180625, "grad_norm": 3.265625, "grad_norm_var": 0.11482645670572916, "learning_rate": 0.0001, "loss": 5.48, "loss/crossentropy": 2.2273969650268555, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17291437089443207, "step": 10178 }, { "epoch": 0.318125, "grad_norm": 3.5, "grad_norm_var": 0.11145426432291666, "learning_rate": 0.0001, "loss": 6.0786, "loss/crossentropy": 2.6331560611724854, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1918104961514473, "step": 10180 }, { "epoch": 0.3181875, "grad_norm": 3.359375, "grad_norm_var": 0.10051676432291666, "learning_rate": 0.0001, "loss": 6.0399, "loss/crossentropy": 2.6284717321395874, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.185287743806839, "step": 10182 }, { "epoch": 0.31825, "grad_norm": 3.1875, "grad_norm_var": 0.10147196451822917, "learning_rate": 0.0001, "loss": 5.739, "loss/crossentropy": 2.525644898414612, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1740656942129135, "step": 10184 }, { "epoch": 0.3183125, "grad_norm": 3.296875, "grad_norm_var": 0.09661458333333334, "learning_rate": 0.0001, "loss": 5.705, "loss/crossentropy": 2.35819411277771, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17960207164287567, "step": 10186 }, { "epoch": 0.318375, "grad_norm": 3.1875, "grad_norm_var": 0.09866129557291667, "learning_rate": 0.0001, "loss": 5.9201, "loss/crossentropy": 2.6530667543411255, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1763104498386383, "step": 10188 }, { "epoch": 0.3184375, "grad_norm": 4.25, "grad_norm_var": 0.14165751139322916, "learning_rate": 0.0001, "loss": 5.7726, "loss/crossentropy": 2.3765740394592285, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18217645585536957, "step": 10190 }, { "epoch": 0.3185, "grad_norm": 3.3125, "grad_norm_var": 0.072509765625, "learning_rate": 0.0001, "loss": 5.9997, "loss/crossentropy": 2.617422342300415, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18666477501392365, "step": 10192 }, { "epoch": 0.3185625, "grad_norm": 3.515625, "grad_norm_var": 0.07060139973958333, "learning_rate": 0.0001, "loss": 6.0119, "loss/crossentropy": 2.6309128999710083, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18614888191223145, "step": 10194 }, { "epoch": 0.318625, "grad_norm": 3.3125, "grad_norm_var": 0.07162984212239583, "learning_rate": 0.0001, "loss": 6.0888, "loss/crossentropy": 2.712532877922058, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1876249685883522, "step": 10196 }, { "epoch": 0.3186875, "grad_norm": 3.28125, "grad_norm_var": 0.1028228759765625, "learning_rate": 0.0001, "loss": 6.3096, "loss/crossentropy": 2.7521666288375854, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1951950639486313, "step": 10198 }, { "epoch": 0.31875, "grad_norm": 3.453125, "grad_norm_var": 0.08837483723958334, "learning_rate": 0.0001, "loss": 5.8462, "loss/crossentropy": 2.4042731523513794, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19028404355049133, "step": 10200 }, { "epoch": 0.3188125, "grad_norm": 4.71875, "grad_norm_var": 0.18262430826822917, "learning_rate": 0.0001, "loss": 5.8477, "loss/crossentropy": 2.4556018114089966, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.17670895904302597, "step": 10202 }, { "epoch": 0.318875, "grad_norm": 3.9375, "grad_norm_var": 0.17320556640625, "learning_rate": 0.0001, "loss": 6.0425, "loss/crossentropy": 2.592575192451477, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18679186701774597, "step": 10204 }, { "epoch": 0.3189375, "grad_norm": 4.625, "grad_norm_var": 0.22554931640625, "learning_rate": 0.0001, "loss": 6.1813, "loss/crossentropy": 2.7186423540115356, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1892346814274788, "step": 10206 }, { "epoch": 0.319, "grad_norm": 3.265625, "grad_norm_var": 0.23151041666666666, "learning_rate": 0.0001, "loss": 5.8299, "loss/crossentropy": 2.483436942100525, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18112580478191376, "step": 10208 }, { "epoch": 0.3190625, "grad_norm": 3.71875, "grad_norm_var": 0.23720296223958334, "learning_rate": 0.0001, "loss": 6.1856, "loss/crossentropy": 2.73944354057312, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18875248730182648, "step": 10210 }, { "epoch": 0.319125, "grad_norm": 3.21875, "grad_norm_var": 0.24117431640625, "learning_rate": 0.0001, "loss": 5.8642, "loss/crossentropy": 2.4610995054244995, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18523633480072021, "step": 10212 }, { "epoch": 0.3191875, "grad_norm": 3.71875, "grad_norm_var": 0.22043355305989584, "learning_rate": 0.0001, "loss": 5.7804, "loss/crossentropy": 2.481137990951538, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17562881857156754, "step": 10214 }, { "epoch": 0.31925, "grad_norm": 3.53125, "grad_norm_var": 0.22139383951822916, "learning_rate": 0.0001, "loss": 6.2475, "loss/crossentropy": 2.704582691192627, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.1917915642261505, "step": 10216 }, { "epoch": 0.3193125, "grad_norm": 4.625, "grad_norm_var": 0.20972391764322917, "learning_rate": 0.0001, "loss": 5.7252, "loss/crossentropy": 2.357407569885254, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18209270387887955, "step": 10218 }, { "epoch": 0.319375, "grad_norm": 3.03125, "grad_norm_var": 0.223779296875, "learning_rate": 0.0001, "loss": 5.6514, "loss/crossentropy": 2.3752524852752686, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17409750074148178, "step": 10220 }, { "epoch": 0.3194375, "grad_norm": 3.96875, "grad_norm_var": 0.15097554524739584, "learning_rate": 0.0001, "loss": 6.2729, "loss/crossentropy": 2.6497918367385864, "loss/hidden": 1.66796875, "loss/jsd": 0.0, "loss/logits": 0.19551673531532288, "step": 10222 }, { "epoch": 0.3195, "grad_norm": 2.984375, "grad_norm_var": 0.17255757649739584, "learning_rate": 0.0001, "loss": 5.8602, "loss/crossentropy": 2.5426958799362183, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1809680014848709, "step": 10224 }, { "epoch": 0.3195625, "grad_norm": 3.3125, "grad_norm_var": 0.17149149576822917, "learning_rate": 0.0001, "loss": 5.8726, "loss/crossentropy": 2.609194755554199, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17633941769599915, "step": 10226 }, { "epoch": 0.319625, "grad_norm": 3.328125, "grad_norm_var": 0.16856180826822917, "learning_rate": 0.0001, "loss": 6.0416, "loss/crossentropy": 2.5963852405548096, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18749169260263443, "step": 10228 }, { "epoch": 0.3196875, "grad_norm": 3.5625, "grad_norm_var": 0.16494038899739583, "learning_rate": 0.0001, "loss": 6.0253, "loss/crossentropy": 2.5735563039779663, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18892797082662582, "step": 10230 }, { "epoch": 0.31975, "grad_norm": 3.0625, "grad_norm_var": 0.18153889973958334, "learning_rate": 0.0001, "loss": 5.8955, "loss/crossentropy": 2.6234103441238403, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17604166269302368, "step": 10232 }, { "epoch": 0.3198125, "grad_norm": 4.0625, "grad_norm_var": 0.10650126139322917, "learning_rate": 0.0001, "loss": 6.102, "loss/crossentropy": 2.589194655418396, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1950305849313736, "step": 10234 }, { "epoch": 0.319875, "grad_norm": 3.421875, "grad_norm_var": 0.09208882649739583, "learning_rate": 0.0001, "loss": 6.0808, "loss/crossentropy": 2.5738414525985718, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1963956654071808, "step": 10236 }, { "epoch": 0.3199375, "grad_norm": 3.1875, "grad_norm_var": 0.0952789306640625, "learning_rate": 0.0001, "loss": 5.8711, "loss/crossentropy": 2.5417840480804443, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1766820102930069, "step": 10238 }, { "epoch": 0.32, "grad_norm": 3.28125, "grad_norm_var": 0.08124593098958334, "learning_rate": 0.0001, "loss": 5.8375, "loss/crossentropy": 2.502975821495056, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17954809218645096, "step": 10240 }, { "epoch": 0.3200625, "grad_norm": 3.015625, "grad_norm_var": 0.09079488118489583, "learning_rate": 0.0001, "loss": 6.0292, "loss/crossentropy": 2.710159420967102, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1795569211244583, "step": 10242 }, { "epoch": 0.320125, "grad_norm": 3.390625, "grad_norm_var": 0.0902740478515625, "learning_rate": 0.0001, "loss": 6.0092, "loss/crossentropy": 2.625326156616211, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18369631469249725, "step": 10244 }, { "epoch": 0.3201875, "grad_norm": 3.390625, "grad_norm_var": 0.2173980712890625, "learning_rate": 0.0001, "loss": 5.8772, "loss/crossentropy": 2.528961658477783, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.17349748313426971, "step": 10246 }, { "epoch": 0.32025, "grad_norm": 3.375, "grad_norm_var": 0.1926177978515625, "learning_rate": 0.0001, "loss": 6.0081, "loss/crossentropy": 2.6666018962860107, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18024158477783203, "step": 10248 }, { "epoch": 0.3203125, "grad_norm": 3.921875, "grad_norm_var": 0.1835357666015625, "learning_rate": 0.0001, "loss": 6.2809, "loss/crossentropy": 2.6556284427642822, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.20627307891845703, "step": 10250 }, { "epoch": 0.320375, "grad_norm": 3.546875, "grad_norm_var": 0.18212890625, "learning_rate": 0.0001, "loss": 6.0027, "loss/crossentropy": 2.710022449493408, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17575644701719284, "step": 10252 }, { "epoch": 0.3204375, "grad_norm": 3.828125, "grad_norm_var": 0.172119140625, "learning_rate": 0.0001, "loss": 6.0304, "loss/crossentropy": 2.572240948677063, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18682856857776642, "step": 10254 }, { "epoch": 0.3205, "grad_norm": 4.28125, "grad_norm_var": 0.19068094889322917, "learning_rate": 0.0001, "loss": 5.9118, "loss/crossentropy": 2.43207323551178, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18937978148460388, "step": 10256 }, { "epoch": 0.3205625, "grad_norm": 3.125, "grad_norm_var": 0.18294270833333334, "learning_rate": 0.0001, "loss": 6.0053, "loss/crossentropy": 2.6684393882751465, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1805582493543625, "step": 10258 }, { "epoch": 0.320625, "grad_norm": 3.078125, "grad_norm_var": 0.19446614583333333, "learning_rate": 0.0001, "loss": 5.8534, "loss/crossentropy": 2.5321428775787354, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17939115315675735, "step": 10260 }, { "epoch": 0.3206875, "grad_norm": 3.21875, "grad_norm_var": 0.0946685791015625, "learning_rate": 0.0001, "loss": 5.935, "loss/crossentropy": 2.6070172786712646, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18084676563739777, "step": 10262 }, { "epoch": 0.32075, "grad_norm": 3.53125, "grad_norm_var": 0.10591532389322916, "learning_rate": 0.0001, "loss": 5.7859, "loss/crossentropy": 2.498133897781372, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17604473233222961, "step": 10264 }, { "epoch": 0.3208125, "grad_norm": 3.28125, "grad_norm_var": 0.09695638020833333, "learning_rate": 0.0001, "loss": 5.8425, "loss/crossentropy": 2.546578049659729, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1776386946439743, "step": 10266 }, { "epoch": 0.320875, "grad_norm": 3.25, "grad_norm_var": 0.10373433430989583, "learning_rate": 0.0001, "loss": 6.0768, "loss/crossentropy": 2.699702501296997, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18419396877288818, "step": 10268 }, { "epoch": 0.3209375, "grad_norm": 3.234375, "grad_norm_var": 0.09401041666666667, "learning_rate": 0.0001, "loss": 5.9828, "loss/crossentropy": 2.5811959505081177, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18703202903270721, "step": 10270 }, { "epoch": 0.321, "grad_norm": 3.125, "grad_norm_var": 0.043017578125, "learning_rate": 0.0001, "loss": 5.7115, "loss/crossentropy": 2.476984977722168, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17384564876556396, "step": 10272 }, { "epoch": 0.3210625, "grad_norm": 3.296875, "grad_norm_var": 0.0375396728515625, "learning_rate": 0.0001, "loss": 6.1687, "loss/crossentropy": 2.780236601829529, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18338096141815186, "step": 10274 }, { "epoch": 0.321125, "grad_norm": 3.296875, "grad_norm_var": 0.03824462890625, "learning_rate": 0.0001, "loss": 6.0086, "loss/crossentropy": 2.6080679893493652, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18693005293607712, "step": 10276 }, { "epoch": 0.3211875, "grad_norm": 3.40625, "grad_norm_var": 0.0389556884765625, "learning_rate": 0.0001, "loss": 6.034, "loss/crossentropy": 2.629909634590149, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1849387288093567, "step": 10278 }, { "epoch": 0.32125, "grad_norm": 3.234375, "grad_norm_var": 0.04850972493489583, "learning_rate": 0.0001, "loss": 6.0483, "loss/crossentropy": 2.610661268234253, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18985499441623688, "step": 10280 }, { "epoch": 0.3213125, "grad_norm": 3.65625, "grad_norm_var": 0.054784138997395836, "learning_rate": 0.0001, "loss": 5.9443, "loss/crossentropy": 2.598568916320801, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1841811239719391, "step": 10282 }, { "epoch": 0.321375, "grad_norm": 3.0, "grad_norm_var": 0.05699869791666667, "learning_rate": 0.0001, "loss": 5.8561, "loss/crossentropy": 2.504266142845154, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18322645872831345, "step": 10284 }, { "epoch": 0.3214375, "grad_norm": 3.78125, "grad_norm_var": 0.06689453125, "learning_rate": 0.0001, "loss": 5.8586, "loss/crossentropy": 2.497063636779785, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18068519979715347, "step": 10286 }, { "epoch": 0.3215, "grad_norm": 3.453125, "grad_norm_var": 0.0549713134765625, "learning_rate": 0.0001, "loss": 5.9398, "loss/crossentropy": 2.5412371158599854, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18595405668020248, "step": 10288 }, { "epoch": 0.3215625, "grad_norm": 3.4375, "grad_norm_var": 0.054833984375, "learning_rate": 0.0001, "loss": 6.2372, "loss/crossentropy": 2.7960572242736816, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18982186168432236, "step": 10290 }, { "epoch": 0.321625, "grad_norm": 3.390625, "grad_norm_var": 0.06021728515625, "learning_rate": 0.0001, "loss": 6.0493, "loss/crossentropy": 2.7268223762512207, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18107493221759796, "step": 10292 }, { "epoch": 0.3216875, "grad_norm": 3.3125, "grad_norm_var": 0.0586822509765625, "learning_rate": 0.0001, "loss": 5.9802, "loss/crossentropy": 2.5619630813598633, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1898685246706009, "step": 10294 }, { "epoch": 0.32175, "grad_norm": 3.125, "grad_norm_var": 0.04859619140625, "learning_rate": 0.0001, "loss": 5.959, "loss/crossentropy": 2.6295331716537476, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18021129816770554, "step": 10296 }, { "epoch": 0.3218125, "grad_norm": 3.3125, "grad_norm_var": 0.046484375, "learning_rate": 0.0001, "loss": 6.1336, "loss/crossentropy": 2.800679564476013, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18172965943813324, "step": 10298 }, { "epoch": 0.321875, "grad_norm": 3.140625, "grad_norm_var": 0.038309733072916664, "learning_rate": 0.0001, "loss": 5.9261, "loss/crossentropy": 2.616189479827881, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18059717118740082, "step": 10300 }, { "epoch": 0.3219375, "grad_norm": 3.46875, "grad_norm_var": 0.060179646809895834, "learning_rate": 0.0001, "loss": 6.3866, "loss/crossentropy": 2.903854727745056, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19163402915000916, "step": 10302 }, { "epoch": 0.322, "grad_norm": 3.21875, "grad_norm_var": 0.060933430989583336, "learning_rate": 0.0001, "loss": 5.8206, "loss/crossentropy": 2.4988759756088257, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17787177860736847, "step": 10304 }, { "epoch": 0.3220625, "grad_norm": 3.109375, "grad_norm_var": 0.06848042805989583, "learning_rate": 0.0001, "loss": 5.943, "loss/crossentropy": 2.5184680223464966, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18386346101760864, "step": 10306 }, { "epoch": 0.322125, "grad_norm": 3.21875, "grad_norm_var": 0.06252339680989584, "learning_rate": 0.0001, "loss": 5.9365, "loss/crossentropy": 2.560743570327759, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18444863706827164, "step": 10308 }, { "epoch": 0.3221875, "grad_norm": 3.171875, "grad_norm_var": 0.06266988118489583, "learning_rate": 0.0001, "loss": 5.896, "loss/crossentropy": 2.492717981338501, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1879885271191597, "step": 10310 }, { "epoch": 0.32225, "grad_norm": 3.359375, "grad_norm_var": 0.08522847493489584, "learning_rate": 0.0001, "loss": 6.0193, "loss/crossentropy": 2.629201889038086, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18197664618492126, "step": 10312 }, { "epoch": 0.3223125, "grad_norm": 3.296875, "grad_norm_var": 0.08351949055989584, "learning_rate": 0.0001, "loss": 5.6949, "loss/crossentropy": 2.4657797813415527, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17134519666433334, "step": 10314 }, { "epoch": 0.322375, "grad_norm": 3.5625, "grad_norm_var": 0.0806549072265625, "learning_rate": 0.0001, "loss": 6.1845, "loss/crossentropy": 2.742559552192688, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1852065622806549, "step": 10316 }, { "epoch": 0.3224375, "grad_norm": 3.53125, "grad_norm_var": 0.0503570556640625, "learning_rate": 0.0001, "loss": 5.5001, "loss/crossentropy": 2.2537004947662354, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1726863980293274, "step": 10318 }, { "epoch": 0.3225, "grad_norm": 3.453125, "grad_norm_var": 0.04840087890625, "learning_rate": 0.0001, "loss": 6.0665, "loss/crossentropy": 2.6084266901016235, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18799513578414917, "step": 10320 }, { "epoch": 0.3225625, "grad_norm": 4.15625, "grad_norm_var": 0.08440348307291666, "learning_rate": 0.0001, "loss": 5.7407, "loss/crossentropy": 2.47977352142334, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1698397845029831, "step": 10322 }, { "epoch": 0.322625, "grad_norm": 3.40625, "grad_norm_var": 0.08185933430989584, "learning_rate": 0.0001, "loss": 5.7587, "loss/crossentropy": 2.4150086641311646, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1800706535577774, "step": 10324 }, { "epoch": 0.3226875, "grad_norm": 3.5625, "grad_norm_var": 0.08221028645833334, "learning_rate": 0.0001, "loss": 5.5886, "loss/crossentropy": 2.318339467048645, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1758519634604454, "step": 10326 }, { "epoch": 0.32275, "grad_norm": 3.34375, "grad_norm_var": 0.0698883056640625, "learning_rate": 0.0001, "loss": 5.5275, "loss/crossentropy": 2.342996120452881, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1664949432015419, "step": 10328 }, { "epoch": 0.3228125, "grad_norm": 3.21875, "grad_norm_var": 0.06549072265625, "learning_rate": 0.0001, "loss": 6.0932, "loss/crossentropy": 2.6780357360839844, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18683306872844696, "step": 10330 }, { "epoch": 0.322875, "grad_norm": 3.53125, "grad_norm_var": 0.0680816650390625, "learning_rate": 0.0001, "loss": 5.9287, "loss/crossentropy": 2.5059865713119507, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1852380484342575, "step": 10332 }, { "epoch": 0.3229375, "grad_norm": 3.359375, "grad_norm_var": 0.06879781087239584, "learning_rate": 0.0001, "loss": 6.0371, "loss/crossentropy": 2.7091665267944336, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1816255897283554, "step": 10334 }, { "epoch": 0.323, "grad_norm": 3.140625, "grad_norm_var": 0.07222391764322916, "learning_rate": 0.0001, "loss": 6.0552, "loss/crossentropy": 2.6606714725494385, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1851576864719391, "step": 10336 }, { "epoch": 0.3230625, "grad_norm": 3.203125, "grad_norm_var": 0.028539021809895832, "learning_rate": 0.0001, "loss": 6.1826, "loss/crossentropy": 2.755558729171753, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18450582027435303, "step": 10338 }, { "epoch": 0.323125, "grad_norm": 3.484375, "grad_norm_var": 0.03092041015625, "learning_rate": 0.0001, "loss": 5.6844, "loss/crossentropy": 2.4298194646835327, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1707700565457344, "step": 10340 }, { "epoch": 0.3231875, "grad_norm": 3.171875, "grad_norm_var": 0.0276275634765625, "learning_rate": 0.0001, "loss": 6.0102, "loss/crossentropy": 2.654059410095215, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18131651729345322, "step": 10342 }, { "epoch": 0.32325, "grad_norm": 3.234375, "grad_norm_var": 0.025121053059895832, "learning_rate": 0.0001, "loss": 5.7932, "loss/crossentropy": 2.459676742553711, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.177101731300354, "step": 10344 }, { "epoch": 0.3233125, "grad_norm": 3.28125, "grad_norm_var": 0.025439453125, "learning_rate": 0.0001, "loss": 6.0264, "loss/crossentropy": 2.617383599281311, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18972976505756378, "step": 10346 }, { "epoch": 0.323375, "grad_norm": 3.46875, "grad_norm_var": 0.018062337239583334, "learning_rate": 0.0001, "loss": 5.8327, "loss/crossentropy": 2.4840651750564575, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18407892435789108, "step": 10348 }, { "epoch": 0.3234375, "grad_norm": 3.203125, "grad_norm_var": 0.0202301025390625, "learning_rate": 0.0001, "loss": 5.74, "loss/crossentropy": 2.3936723470687866, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18033472448587418, "step": 10350 }, { "epoch": 0.3235, "grad_norm": 3.390625, "grad_norm_var": 0.018260701497395834, "learning_rate": 0.0001, "loss": 6.2428, "loss/crossentropy": 2.749495029449463, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1915174424648285, "step": 10352 }, { "epoch": 0.3235625, "grad_norm": 3.328125, "grad_norm_var": 0.0174468994140625, "learning_rate": 0.0001, "loss": 5.4822, "loss/crossentropy": 2.28355073928833, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16634956747293472, "step": 10354 }, { "epoch": 0.323625, "grad_norm": 3.40625, "grad_norm_var": 0.016747029622395833, "learning_rate": 0.0001, "loss": 5.8807, "loss/crossentropy": 2.597448945045471, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17598330974578857, "step": 10356 }, { "epoch": 0.3236875, "grad_norm": 3.09375, "grad_norm_var": 0.018903605143229165, "learning_rate": 0.0001, "loss": 5.9361, "loss/crossentropy": 2.642319917678833, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1762532889842987, "step": 10358 }, { "epoch": 0.32375, "grad_norm": 3.125, "grad_norm_var": 0.020164998372395833, "learning_rate": 0.0001, "loss": 6.2062, "loss/crossentropy": 2.7625609636306763, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18889529258012772, "step": 10360 }, { "epoch": 0.3238125, "grad_norm": 3.3125, "grad_norm_var": 0.017606608072916665, "learning_rate": 0.0001, "loss": 5.6625, "loss/crossentropy": 2.37093186378479, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17876386642456055, "step": 10362 }, { "epoch": 0.323875, "grad_norm": 3.046875, "grad_norm_var": 0.020865885416666667, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.4135403633117676, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1698107272386551, "step": 10364 }, { "epoch": 0.3239375, "grad_norm": 3.21875, "grad_norm_var": 0.021122233072916666, "learning_rate": 0.0001, "loss": 6.103, "loss/crossentropy": 2.6791166067123413, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18613450974225998, "step": 10366 }, { "epoch": 0.324, "grad_norm": 3.546875, "grad_norm_var": 0.0278717041015625, "learning_rate": 0.0001, "loss": 6.0993, "loss/crossentropy": 2.6597228050231934, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18770888447761536, "step": 10368 }, { "epoch": 0.3240625, "grad_norm": 3.53125, "grad_norm_var": 0.032731119791666666, "learning_rate": 0.0001, "loss": 6.1056, "loss/crossentropy": 2.677014470100403, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1870013177394867, "step": 10370 }, { "epoch": 0.324125, "grad_norm": 3.390625, "grad_norm_var": 0.03764546712239583, "learning_rate": 0.0001, "loss": 5.7593, "loss/crossentropy": 2.435485005378723, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18003712594509125, "step": 10372 }, { "epoch": 0.3241875, "grad_norm": 3.25, "grad_norm_var": 0.036408487955729166, "learning_rate": 0.0001, "loss": 6.0285, "loss/crossentropy": 2.649515151977539, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18320664763450623, "step": 10374 }, { "epoch": 0.32425, "grad_norm": 3.125, "grad_norm_var": 0.043778483072916666, "learning_rate": 0.0001, "loss": 5.9564, "loss/crossentropy": 2.605233073234558, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1812061071395874, "step": 10376 }, { "epoch": 0.3243125, "grad_norm": 3.453125, "grad_norm_var": 0.0540191650390625, "learning_rate": 0.0001, "loss": 5.8441, "loss/crossentropy": 2.578033208847046, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17075205594301224, "step": 10378 }, { "epoch": 0.324375, "grad_norm": 3.109375, "grad_norm_var": 0.05322265625, "learning_rate": 0.0001, "loss": 5.6596, "loss/crossentropy": 2.445371389389038, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1725914552807808, "step": 10380 }, { "epoch": 0.3244375, "grad_norm": 3.15625, "grad_norm_var": 0.052229817708333334, "learning_rate": 0.0001, "loss": 5.9566, "loss/crossentropy": 2.6489113569259644, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17686673998832703, "step": 10382 }, { "epoch": 0.3245, "grad_norm": 3.15625, "grad_norm_var": 0.04602762858072917, "learning_rate": 0.0001, "loss": 5.9523, "loss/crossentropy": 2.565733313560486, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18435630202293396, "step": 10384 }, { "epoch": 0.3245625, "grad_norm": 4.15625, "grad_norm_var": 0.08870035807291667, "learning_rate": 0.0001, "loss": 5.6202, "loss/crossentropy": 2.3419684171676636, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17196565866470337, "step": 10386 }, { "epoch": 0.324625, "grad_norm": 3.34375, "grad_norm_var": 0.08421223958333333, "learning_rate": 0.0001, "loss": 5.8697, "loss/crossentropy": 2.485108971595764, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1865016222000122, "step": 10388 }, { "epoch": 0.3246875, "grad_norm": 3.5, "grad_norm_var": 0.08633524576822917, "learning_rate": 0.0001, "loss": 6.2263, "loss/crossentropy": 2.757042169570923, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1894996240735054, "step": 10390 }, { "epoch": 0.32475, "grad_norm": 3.390625, "grad_norm_var": 0.07402242024739583, "learning_rate": 0.0001, "loss": 5.8389, "loss/crossentropy": 2.444661259651184, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18629730492830276, "step": 10392 }, { "epoch": 0.3248125, "grad_norm": 3.609375, "grad_norm_var": 0.07128804524739583, "learning_rate": 0.0001, "loss": 6.3191, "loss/crossentropy": 2.7633039951324463, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1977638155221939, "step": 10394 }, { "epoch": 0.324875, "grad_norm": 3.234375, "grad_norm_var": 0.0710601806640625, "learning_rate": 0.0001, "loss": 5.8646, "loss/crossentropy": 2.640702486038208, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17316973209381104, "step": 10396 }, { "epoch": 0.3249375, "grad_norm": 3.4375, "grad_norm_var": 0.06949462890625, "learning_rate": 0.0001, "loss": 5.791, "loss/crossentropy": 2.443985104560852, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18001803755760193, "step": 10398 }, { "epoch": 0.325, "grad_norm": 3.21875, "grad_norm_var": 0.06363525390625, "learning_rate": 0.0001, "loss": 6.0377, "loss/crossentropy": 2.6358437538146973, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1823723390698433, "step": 10400 }, { "epoch": 0.3250625, "grad_norm": 3.296875, "grad_norm_var": 0.017528279622395834, "learning_rate": 0.0001, "loss": 5.4682, "loss/crossentropy": 2.22607159614563, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17265252023935318, "step": 10402 }, { "epoch": 0.325125, "grad_norm": 3.3125, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 6.1358, "loss/crossentropy": 2.7480798959732056, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18291470408439636, "step": 10404 }, { "epoch": 0.3251875, "grad_norm": 4.03125, "grad_norm_var": 0.051656087239583336, "learning_rate": 0.0001, "loss": 5.8878, "loss/crossentropy": 2.5838598012924194, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17922098189592361, "step": 10406 }, { "epoch": 0.32525, "grad_norm": 3.125, "grad_norm_var": 0.05635477701822917, "learning_rate": 0.0001, "loss": 5.8983, "loss/crossentropy": 2.529516100883484, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18218666315078735, "step": 10408 }, { "epoch": 0.3253125, "grad_norm": 2.984375, "grad_norm_var": 0.06767578125, "learning_rate": 0.0001, "loss": 5.6475, "loss/crossentropy": 2.3896443843841553, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17344091832637787, "step": 10410 }, { "epoch": 0.325375, "grad_norm": 3.09375, "grad_norm_var": 0.06689453125, "learning_rate": 0.0001, "loss": 5.9586, "loss/crossentropy": 2.6563864946365356, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1790490746498108, "step": 10412 }, { "epoch": 0.3254375, "grad_norm": 3.5, "grad_norm_var": 0.06845703125, "learning_rate": 0.0001, "loss": 6.1893, "loss/crossentropy": 2.7665140628814697, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18915075063705444, "step": 10414 }, { "epoch": 0.3255, "grad_norm": 3.484375, "grad_norm_var": 0.12932027180989583, "learning_rate": 0.0001, "loss": 6.155, "loss/crossentropy": 2.6996634006500244, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1861589252948761, "step": 10416 }, { "epoch": 0.3255625, "grad_norm": 4.28125, "grad_norm_var": 0.1626617431640625, "learning_rate": 0.0001, "loss": 5.4045, "loss/crossentropy": 2.205993890762329, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16594918072223663, "step": 10418 }, { "epoch": 0.325625, "grad_norm": 3.265625, "grad_norm_var": 0.16363525390625, "learning_rate": 0.0001, "loss": 6.1563, "loss/crossentropy": 2.702183246612549, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18955016881227493, "step": 10420 }, { "epoch": 0.3256875, "grad_norm": 3.0, "grad_norm_var": 0.1684722900390625, "learning_rate": 0.0001, "loss": 5.5715, "loss/crossentropy": 2.3634437322616577, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17510025948286057, "step": 10422 }, { "epoch": 0.32575, "grad_norm": 3.1875, "grad_norm_var": 0.16743876139322916, "learning_rate": 0.0001, "loss": 5.9613, "loss/crossentropy": 2.613492727279663, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17852627485990524, "step": 10424 }, { "epoch": 0.3258125, "grad_norm": 3.234375, "grad_norm_var": 0.159619140625, "learning_rate": 0.0001, "loss": 6.1792, "loss/crossentropy": 2.718513250350952, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19059736281633377, "step": 10426 }, { "epoch": 0.325875, "grad_norm": 3.796875, "grad_norm_var": 0.1602691650390625, "learning_rate": 0.0001, "loss": 5.9991, "loss/crossentropy": 2.6648871898651123, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1795169711112976, "step": 10428 }, { "epoch": 0.3259375, "grad_norm": 3.109375, "grad_norm_var": 0.1678131103515625, "learning_rate": 0.0001, "loss": 5.809, "loss/crossentropy": 2.539751172065735, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1761474683880806, "step": 10430 }, { "epoch": 0.326, "grad_norm": 3.328125, "grad_norm_var": 0.10966695149739583, "learning_rate": 0.0001, "loss": 6.1314, "loss/crossentropy": 2.715307593345642, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.19004476815462112, "step": 10432 }, { "epoch": 0.3260625, "grad_norm": 3.3125, "grad_norm_var": 0.05855204264322917, "learning_rate": 0.0001, "loss": 6.1432, "loss/crossentropy": 2.6461070775985718, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19385036826133728, "step": 10434 }, { "epoch": 0.326125, "grad_norm": 3.3125, "grad_norm_var": 0.05244140625, "learning_rate": 0.0001, "loss": 5.8139, "loss/crossentropy": 2.513677954673767, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17338525503873825, "step": 10436 }, { "epoch": 0.3261875, "grad_norm": 3.5625, "grad_norm_var": 0.04198811848958333, "learning_rate": 0.0001, "loss": 5.869, "loss/crossentropy": 2.5315924882888794, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18335412442684174, "step": 10438 }, { "epoch": 0.32625, "grad_norm": 3.328125, "grad_norm_var": 0.046418253580729166, "learning_rate": 0.0001, "loss": 5.8026, "loss/crossentropy": 2.502164602279663, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17965249717235565, "step": 10440 }, { "epoch": 0.3263125, "grad_norm": 3.25, "grad_norm_var": 0.037495930989583336, "learning_rate": 0.0001, "loss": 6.0665, "loss/crossentropy": 2.6249698400497437, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18907830864191055, "step": 10442 }, { "epoch": 0.326375, "grad_norm": 3.875, "grad_norm_var": 0.040257771809895836, "learning_rate": 0.0001, "loss": 6.3294, "loss/crossentropy": 2.8112963438034058, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1947823166847229, "step": 10444 }, { "epoch": 0.3264375, "grad_norm": 3.46875, "grad_norm_var": 0.03654683430989583, "learning_rate": 0.0001, "loss": 6.0778, "loss/crossentropy": 2.722153425216675, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18165677040815353, "step": 10446 }, { "epoch": 0.3265, "grad_norm": 3.21875, "grad_norm_var": 0.04531148274739583, "learning_rate": 0.0001, "loss": 5.9552, "loss/crossentropy": 2.666857600212097, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17804977297782898, "step": 10448 }, { "epoch": 0.3265625, "grad_norm": 3.703125, "grad_norm_var": 0.05487874348958333, "learning_rate": 0.0001, "loss": 6.0745, "loss/crossentropy": 2.7147200107574463, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18285037577152252, "step": 10450 }, { "epoch": 0.326625, "grad_norm": 3.25, "grad_norm_var": 0.055399576822916664, "learning_rate": 0.0001, "loss": 5.8654, "loss/crossentropy": 2.5688791275024414, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1765284687280655, "step": 10452 }, { "epoch": 0.3266875, "grad_norm": 4.46875, "grad_norm_var": 0.12888895670572917, "learning_rate": 0.0001, "loss": 6.2283, "loss/crossentropy": 2.7135192155838013, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1948346346616745, "step": 10454 }, { "epoch": 0.32675, "grad_norm": 3.1875, "grad_norm_var": 0.12759501139322918, "learning_rate": 0.0001, "loss": 5.8369, "loss/crossentropy": 2.5518386363983154, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17654848098754883, "step": 10456 }, { "epoch": 0.3268125, "grad_norm": 3.484375, "grad_norm_var": 0.12392171223958333, "learning_rate": 0.0001, "loss": 5.9143, "loss/crossentropy": 2.50605046749115, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18222897499799728, "step": 10458 }, { "epoch": 0.326875, "grad_norm": 3.34375, "grad_norm_var": 0.11468098958333334, "learning_rate": 0.0001, "loss": 5.886, "loss/crossentropy": 2.5006728172302246, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.183840811252594, "step": 10460 }, { "epoch": 0.3269375, "grad_norm": 3.015625, "grad_norm_var": 0.13113505045572918, "learning_rate": 0.0001, "loss": 5.5897, "loss/crossentropy": 2.428275942802429, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16692692786455154, "step": 10462 }, { "epoch": 0.327, "grad_norm": 3.328125, "grad_norm_var": 0.12127278645833334, "learning_rate": 0.0001, "loss": 5.9769, "loss/crossentropy": 2.618067502975464, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18197977542877197, "step": 10464 }, { "epoch": 0.3270625, "grad_norm": 2.9375, "grad_norm_var": 0.12903645833333333, "learning_rate": 0.0001, "loss": 5.5058, "loss/crossentropy": 2.329106092453003, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16766852140426636, "step": 10466 }, { "epoch": 0.327125, "grad_norm": 3.234375, "grad_norm_var": 0.13027242024739583, "learning_rate": 0.0001, "loss": 5.8744, "loss/crossentropy": 2.5502312183380127, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17851175367832184, "step": 10468 }, { "epoch": 0.3271875, "grad_norm": 3.09375, "grad_norm_var": 0.055322265625, "learning_rate": 0.0001, "loss": 5.9023, "loss/crossentropy": 2.62544047832489, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17808130383491516, "step": 10470 }, { "epoch": 0.32725, "grad_norm": 3.71875, "grad_norm_var": 0.05129801432291667, "learning_rate": 0.0001, "loss": 6.1697, "loss/crossentropy": 2.612215995788574, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19754952192306519, "step": 10472 }, { "epoch": 0.3273125, "grad_norm": 3.390625, "grad_norm_var": 0.06692301432291667, "learning_rate": 0.0001, "loss": 5.3695, "loss/crossentropy": 2.3111432790756226, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15778601169586182, "step": 10474 }, { "epoch": 0.327375, "grad_norm": 3.078125, "grad_norm_var": 0.069921875, "learning_rate": 0.0001, "loss": 6.0333, "loss/crossentropy": 2.6295695304870605, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18725257366895676, "step": 10476 }, { "epoch": 0.3274375, "grad_norm": 3.453125, "grad_norm_var": 0.06411844889322917, "learning_rate": 0.0001, "loss": 5.8327, "loss/crossentropy": 2.553055167198181, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1764000654220581, "step": 10478 }, { "epoch": 0.3275, "grad_norm": 3.25, "grad_norm_var": 0.0619140625, "learning_rate": 0.0001, "loss": 5.8046, "loss/crossentropy": 2.449642062187195, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1815905198454857, "step": 10480 }, { "epoch": 0.3275625, "grad_norm": 3.234375, "grad_norm_var": 0.11783447265625, "learning_rate": 0.0001, "loss": 6.0743, "loss/crossentropy": 2.6281604766845703, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19148899614810944, "step": 10482 }, { "epoch": 0.327625, "grad_norm": 3.40625, "grad_norm_var": 0.11571858723958334, "learning_rate": 0.0001, "loss": 5.8511, "loss/crossentropy": 2.5006006956100464, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18231221288442612, "step": 10484 }, { "epoch": 0.3276875, "grad_norm": 3.34375, "grad_norm_var": 0.110205078125, "learning_rate": 0.0001, "loss": 5.6347, "loss/crossentropy": 2.3496744632720947, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1730307787656784, "step": 10486 }, { "epoch": 0.32775, "grad_norm": 3.3125, "grad_norm_var": 0.10273030598958334, "learning_rate": 0.0001, "loss": 5.9948, "loss/crossentropy": 2.6307467222213745, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18444740772247314, "step": 10488 }, { "epoch": 0.3278125, "grad_norm": 3.40625, "grad_norm_var": 0.091015625, "learning_rate": 0.0001, "loss": 5.7624, "loss/crossentropy": 2.5877468585968018, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16746732592582703, "step": 10490 }, { "epoch": 0.327875, "grad_norm": 3.171875, "grad_norm_var": 0.089111328125, "learning_rate": 0.0001, "loss": 5.8677, "loss/crossentropy": 2.573415517807007, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1766989678144455, "step": 10492 }, { "epoch": 0.3279375, "grad_norm": 3.09375, "grad_norm_var": 0.0910797119140625, "learning_rate": 0.0001, "loss": 5.6794, "loss/crossentropy": 2.364428162574768, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17602647095918655, "step": 10494 }, { "epoch": 0.328, "grad_norm": 4.0625, "grad_norm_var": 0.12268473307291666, "learning_rate": 0.0001, "loss": 6.2771, "loss/crossentropy": 2.719959020614624, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19751346856355667, "step": 10496 }, { "epoch": 0.3280625, "grad_norm": 3.375, "grad_norm_var": 0.056428019205729166, "learning_rate": 0.0001, "loss": 5.7902, "loss/crossentropy": 2.4194256067276, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1851213350892067, "step": 10498 }, { "epoch": 0.328125, "grad_norm": 3.390625, "grad_norm_var": 0.08354390462239583, "learning_rate": 0.0001, "loss": 6.115, "loss/crossentropy": 2.686237335205078, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1870184689760208, "step": 10500 }, { "epoch": 0.3281875, "grad_norm": 3.484375, "grad_norm_var": 0.089990234375, "learning_rate": 0.0001, "loss": 5.9201, "loss/crossentropy": 2.5260117053985596, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18276835978031158, "step": 10502 }, { "epoch": 0.32825, "grad_norm": 3.390625, "grad_norm_var": 0.08671468098958333, "learning_rate": 0.0001, "loss": 5.8161, "loss/crossentropy": 2.5687613487243652, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17356525361537933, "step": 10504 }, { "epoch": 0.3283125, "grad_norm": 3.140625, "grad_norm_var": 0.07932942708333333, "learning_rate": 0.0001, "loss": 5.592, "loss/crossentropy": 2.33124041557312, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17099381238222122, "step": 10506 }, { "epoch": 0.328375, "grad_norm": 3.390625, "grad_norm_var": 0.07315165201822917, "learning_rate": 0.0001, "loss": 5.967, "loss/crossentropy": 2.6727588176727295, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17903530597686768, "step": 10508 }, { "epoch": 0.3284375, "grad_norm": 3.5625, "grad_norm_var": 0.06545817057291667, "learning_rate": 0.0001, "loss": 5.9038, "loss/crossentropy": 2.4932072162628174, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18363633006811142, "step": 10510 }, { "epoch": 0.3285, "grad_norm": 3.078125, "grad_norm_var": 0.053132120768229166, "learning_rate": 0.0001, "loss": 6.0411, "loss/crossentropy": 2.7488632202148438, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1776644065976143, "step": 10512 }, { "epoch": 0.3285625, "grad_norm": 3.1875, "grad_norm_var": 0.06106770833333333, "learning_rate": 0.0001, "loss": 6.4665, "loss/crossentropy": 2.885968565940857, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19789214432239532, "step": 10514 }, { "epoch": 0.328625, "grad_norm": 3.109375, "grad_norm_var": 0.03954671223958333, "learning_rate": 0.0001, "loss": 5.824, "loss/crossentropy": 2.562509536743164, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17380927503108978, "step": 10516 }, { "epoch": 0.3286875, "grad_norm": 3.046875, "grad_norm_var": 0.0358551025390625, "learning_rate": 0.0001, "loss": 5.5661, "loss/crossentropy": 2.369008183479309, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.170881487429142, "step": 10518 }, { "epoch": 0.32875, "grad_norm": 3.296875, "grad_norm_var": 0.040576171875, "learning_rate": 0.0001, "loss": 5.9116, "loss/crossentropy": 2.5701723098754883, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1794508770108223, "step": 10520 }, { "epoch": 0.3288125, "grad_norm": 3.609375, "grad_norm_var": 0.044677734375, "learning_rate": 0.0001, "loss": 6.1161, "loss/crossentropy": 2.674402117729187, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1894790381193161, "step": 10522 }, { "epoch": 0.328875, "grad_norm": 3.265625, "grad_norm_var": 0.044287109375, "learning_rate": 0.0001, "loss": 5.9051, "loss/crossentropy": 2.544066548347473, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17907274514436722, "step": 10524 }, { "epoch": 0.3289375, "grad_norm": 3.421875, "grad_norm_var": 0.0453125, "learning_rate": 0.0001, "loss": 5.8596, "loss/crossentropy": 2.5954864025115967, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17484527081251144, "step": 10526 }, { "epoch": 0.329, "grad_norm": 3.265625, "grad_norm_var": 0.04072977701822917, "learning_rate": 0.0001, "loss": 5.9282, "loss/crossentropy": 2.6108763217926025, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18172959238290787, "step": 10528 }, { "epoch": 0.3290625, "grad_norm": 3.546875, "grad_norm_var": 0.034928385416666666, "learning_rate": 0.0001, "loss": 5.976, "loss/crossentropy": 2.6122357845306396, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18520929664373398, "step": 10530 }, { "epoch": 0.329125, "grad_norm": 3.046875, "grad_norm_var": 0.03798828125, "learning_rate": 0.0001, "loss": 6.1419, "loss/crossentropy": 2.7869744300842285, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18510252982378006, "step": 10532 }, { "epoch": 0.3291875, "grad_norm": 3.1875, "grad_norm_var": 0.03351949055989583, "learning_rate": 0.0001, "loss": 6.0635, "loss/crossentropy": 2.7528659105300903, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1775524988770485, "step": 10534 }, { "epoch": 0.32925, "grad_norm": 3.140625, "grad_norm_var": 0.03192952473958333, "learning_rate": 0.0001, "loss": 5.9812, "loss/crossentropy": 2.60650634765625, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18083249777555466, "step": 10536 }, { "epoch": 0.3293125, "grad_norm": 3.0, "grad_norm_var": 0.030524698893229167, "learning_rate": 0.0001, "loss": 5.8681, "loss/crossentropy": 2.6220571994781494, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1742141991853714, "step": 10538 }, { "epoch": 0.329375, "grad_norm": 3.234375, "grad_norm_var": 0.037653605143229164, "learning_rate": 0.0001, "loss": 5.7579, "loss/crossentropy": 2.4170820713043213, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1786084622144699, "step": 10540 }, { "epoch": 0.3294375, "grad_norm": 3.171875, "grad_norm_var": 0.038361612955729166, "learning_rate": 0.0001, "loss": 5.5535, "loss/crossentropy": 2.4177643060684204, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16865624487400055, "step": 10542 }, { "epoch": 0.3295, "grad_norm": 4.21875, "grad_norm_var": 0.0998199462890625, "learning_rate": 0.0001, "loss": 5.5978, "loss/crossentropy": 2.239539384841919, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17879444360733032, "step": 10544 }, { "epoch": 0.3295625, "grad_norm": 3.28125, "grad_norm_var": 0.09480692545572916, "learning_rate": 0.0001, "loss": 5.6869, "loss/crossentropy": 2.414452075958252, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17412185668945312, "step": 10546 }, { "epoch": 0.329625, "grad_norm": 3.28125, "grad_norm_var": 0.09277242024739583, "learning_rate": 0.0001, "loss": 5.908, "loss/crossentropy": 2.638763189315796, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17496571689844131, "step": 10548 }, { "epoch": 0.3296875, "grad_norm": 3.09375, "grad_norm_var": 0.09472249348958334, "learning_rate": 0.0001, "loss": 5.8715, "loss/crossentropy": 2.533913731575012, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17712035030126572, "step": 10550 }, { "epoch": 0.32975, "grad_norm": 3.171875, "grad_norm_var": 0.09904683430989583, "learning_rate": 0.0001, "loss": 5.953, "loss/crossentropy": 2.5308799743652344, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1844041869044304, "step": 10552 }, { "epoch": 0.3298125, "grad_norm": 3.28125, "grad_norm_var": 0.09582926432291666, "learning_rate": 0.0001, "loss": 5.2914, "loss/crossentropy": 2.190713882446289, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1588962972164154, "step": 10554 }, { "epoch": 0.329875, "grad_norm": 3.1875, "grad_norm_var": 0.09474283854166667, "learning_rate": 0.0001, "loss": 5.7447, "loss/crossentropy": 2.421551823616028, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1780192330479622, "step": 10556 }, { "epoch": 0.3299375, "grad_norm": 3.765625, "grad_norm_var": 0.09556884765625, "learning_rate": 0.0001, "loss": 5.8952, "loss/crossentropy": 2.55362606048584, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17869000136852264, "step": 10558 }, { "epoch": 0.33, "grad_norm": 3.234375, "grad_norm_var": 0.047200520833333336, "learning_rate": 0.0001, "loss": 6.1143, "loss/crossentropy": 2.75285267829895, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18067169189453125, "step": 10560 }, { "epoch": 0.3300625, "grad_norm": 3.375, "grad_norm_var": 0.048140462239583334, "learning_rate": 0.0001, "loss": 5.7913, "loss/crossentropy": 2.5640674829483032, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17116037011146545, "step": 10562 }, { "epoch": 0.330125, "grad_norm": 3.296875, "grad_norm_var": 0.042708333333333334, "learning_rate": 0.0001, "loss": 5.7565, "loss/crossentropy": 2.4372029304504395, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17763124406337738, "step": 10564 }, { "epoch": 0.3301875, "grad_norm": 3.265625, "grad_norm_var": 0.09617411295572917, "learning_rate": 0.0001, "loss": 5.9864, "loss/crossentropy": 2.4982370138168335, "loss/hidden": 1.62109375, "loss/jsd": 0.0, "loss/logits": 0.18670214712619781, "step": 10566 }, { "epoch": 0.33025, "grad_norm": 3.265625, "grad_norm_var": 0.09566141764322916, "learning_rate": 0.0001, "loss": 6.2147, "loss/crossentropy": 2.682075023651123, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19662075489759445, "step": 10568 }, { "epoch": 0.3303125, "grad_norm": 3.515625, "grad_norm_var": 0.08302408854166667, "learning_rate": 0.0001, "loss": 6.034, "loss/crossentropy": 2.6168935298919678, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18624671548604965, "step": 10570 }, { "epoch": 0.330375, "grad_norm": 3.171875, "grad_norm_var": 0.08391927083333334, "learning_rate": 0.0001, "loss": 6.1638, "loss/crossentropy": 2.775294542312622, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.185729518532753, "step": 10572 }, { "epoch": 0.3304375, "grad_norm": 3.3125, "grad_norm_var": 0.07730712890625, "learning_rate": 0.0001, "loss": 5.9852, "loss/crossentropy": 2.5786255598068237, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18675360083580017, "step": 10574 }, { "epoch": 0.3305, "grad_norm": 3.140625, "grad_norm_var": 0.0783599853515625, "learning_rate": 0.0001, "loss": 5.3715, "loss/crossentropy": 2.2789233922958374, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15887174755334854, "step": 10576 }, { "epoch": 0.3305625, "grad_norm": 3.1875, "grad_norm_var": 0.07822265625, "learning_rate": 0.0001, "loss": 5.6954, "loss/crossentropy": 2.4184963703155518, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17612414807081223, "step": 10578 }, { "epoch": 0.330625, "grad_norm": 3.0, "grad_norm_var": 0.08936258951822916, "learning_rate": 0.0001, "loss": 6.0551, "loss/crossentropy": 2.6684863567352295, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18397579342126846, "step": 10580 }, { "epoch": 0.3306875, "grad_norm": 3.5625, "grad_norm_var": 0.05384114583333333, "learning_rate": 0.0001, "loss": 6.3382, "loss/crossentropy": 2.7625722885131836, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.2020929455757141, "step": 10582 }, { "epoch": 0.33075, "grad_norm": 3.234375, "grad_norm_var": 0.0497467041015625, "learning_rate": 0.0001, "loss": 5.7447, "loss/crossentropy": 2.435757040977478, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17191199213266373, "step": 10584 }, { "epoch": 0.3308125, "grad_norm": 4.03125, "grad_norm_var": 0.0761871337890625, "learning_rate": 0.0001, "loss": 6.0752, "loss/crossentropy": 2.595419764518738, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19095022231340408, "step": 10586 }, { "epoch": 0.330875, "grad_norm": 3.15625, "grad_norm_var": 0.07667643229166667, "learning_rate": 0.0001, "loss": 5.9915, "loss/crossentropy": 2.5594364404678345, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18657050281763077, "step": 10588 }, { "epoch": 0.3309375, "grad_norm": 3.15625, "grad_norm_var": 0.0802734375, "learning_rate": 0.0001, "loss": 5.8781, "loss/crossentropy": 2.589942455291748, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17803217470645905, "step": 10590 }, { "epoch": 0.331, "grad_norm": 2.859375, "grad_norm_var": 0.09575907389322917, "learning_rate": 0.0001, "loss": 6.007, "loss/crossentropy": 2.692532777786255, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1798829883337021, "step": 10592 }, { "epoch": 0.3310625, "grad_norm": 3.125, "grad_norm_var": 0.09853413899739584, "learning_rate": 0.0001, "loss": 5.9402, "loss/crossentropy": 2.6700247526168823, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1758483648300171, "step": 10594 }, { "epoch": 0.331125, "grad_norm": 3.5, "grad_norm_var": 0.09239908854166666, "learning_rate": 0.0001, "loss": 5.8785, "loss/crossentropy": 2.488546133041382, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.17845331132411957, "step": 10596 }, { "epoch": 0.3311875, "grad_norm": 3.515625, "grad_norm_var": 0.07979227701822916, "learning_rate": 0.0001, "loss": 6.2341, "loss/crossentropy": 2.7175732851028442, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19422803819179535, "step": 10598 }, { "epoch": 0.33125, "grad_norm": 3.203125, "grad_norm_var": 0.08053385416666667, "learning_rate": 0.0001, "loss": 5.8776, "loss/crossentropy": 2.5200968980789185, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18262987583875656, "step": 10600 }, { "epoch": 0.3313125, "grad_norm": 3.328125, "grad_norm_var": 0.05237223307291667, "learning_rate": 0.0001, "loss": 5.669, "loss/crossentropy": 2.410857081413269, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17425625771284103, "step": 10602 }, { "epoch": 0.331375, "grad_norm": 3.4375, "grad_norm_var": 0.05281473795572917, "learning_rate": 0.0001, "loss": 5.9284, "loss/crossentropy": 2.5856266021728516, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1834932491183281, "step": 10604 }, { "epoch": 0.3314375, "grad_norm": 3.171875, "grad_norm_var": 0.052229817708333334, "learning_rate": 0.0001, "loss": 5.7939, "loss/crossentropy": 2.5548208951950073, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17195909470319748, "step": 10606 }, { "epoch": 0.3315, "grad_norm": 3.265625, "grad_norm_var": 0.03737691243489583, "learning_rate": 0.0001, "loss": 5.943, "loss/crossentropy": 2.641461133956909, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17663607746362686, "step": 10608 }, { "epoch": 0.3315625, "grad_norm": 3.046875, "grad_norm_var": 0.03974202473958333, "learning_rate": 0.0001, "loss": 5.666, "loss/crossentropy": 2.4323424100875854, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16984786093235016, "step": 10610 }, { "epoch": 0.331625, "grad_norm": 3.09375, "grad_norm_var": 0.033935546875, "learning_rate": 0.0001, "loss": 5.4376, "loss/crossentropy": 2.272653341293335, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16844987869262695, "step": 10612 }, { "epoch": 0.3316875, "grad_norm": 3.125, "grad_norm_var": 0.011617024739583334, "learning_rate": 0.0001, "loss": 5.8562, "loss/crossentropy": 2.58320415019989, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1765137016773224, "step": 10614 }, { "epoch": 0.33175, "grad_norm": 3.0625, "grad_norm_var": 0.012035115559895834, "learning_rate": 0.0001, "loss": 5.8355, "loss/crossentropy": 2.6183416843414307, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17289245128631592, "step": 10616 }, { "epoch": 0.3318125, "grad_norm": 3.5625, "grad_norm_var": 0.019367472330729166, "learning_rate": 0.0001, "loss": 6.1285, "loss/crossentropy": 2.7610244750976562, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17893724143505096, "step": 10618 }, { "epoch": 0.331875, "grad_norm": 3.234375, "grad_norm_var": 0.017281087239583333, "learning_rate": 0.0001, "loss": 5.8985, "loss/crossentropy": 2.5375760793685913, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18101774901151657, "step": 10620 }, { "epoch": 0.3319375, "grad_norm": 4.09375, "grad_norm_var": 0.06529947916666666, "learning_rate": 0.0001, "loss": 5.8052, "loss/crossentropy": 2.487018585205078, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17596260458230972, "step": 10622 }, { "epoch": 0.332, "grad_norm": 3.203125, "grad_norm_var": 0.06450093587239583, "learning_rate": 0.0001, "loss": 5.8771, "loss/crossentropy": 2.591071367263794, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17821310460567474, "step": 10624 }, { "epoch": 0.3320625, "grad_norm": 3.484375, "grad_norm_var": 0.06321614583333333, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.428701877593994, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17056617140769958, "step": 10626 }, { "epoch": 0.332125, "grad_norm": 3.578125, "grad_norm_var": 0.06451822916666666, "learning_rate": 0.0001, "loss": 5.9604, "loss/crossentropy": 2.5602476596832275, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18844837695360184, "step": 10628 }, { "epoch": 0.3321875, "grad_norm": 3.34375, "grad_norm_var": 0.061823527018229164, "learning_rate": 0.0001, "loss": 5.652, "loss/crossentropy": 2.4678162336349487, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16881363093852997, "step": 10630 }, { "epoch": 0.33225, "grad_norm": 3.640625, "grad_norm_var": 0.0569732666015625, "learning_rate": 0.0001, "loss": 6.0892, "loss/crossentropy": 2.691482424736023, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18703413009643555, "step": 10632 }, { "epoch": 0.3323125, "grad_norm": 3.640625, "grad_norm_var": 0.058958943684895834, "learning_rate": 0.0001, "loss": 6.0169, "loss/crossentropy": 2.605046033859253, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18610990047454834, "step": 10634 }, { "epoch": 0.332375, "grad_norm": 4.1875, "grad_norm_var": 0.10077718098958334, "learning_rate": 0.0001, "loss": 6.2816, "loss/crossentropy": 2.7226985692977905, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19924483448266983, "step": 10636 }, { "epoch": 0.3324375, "grad_norm": 3.5625, "grad_norm_var": 0.0700836181640625, "learning_rate": 0.0001, "loss": 5.8493, "loss/crossentropy": 2.4939684867858887, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17928729951381683, "step": 10638 }, { "epoch": 0.3325, "grad_norm": 3.828125, "grad_norm_var": 0.10445963541666667, "learning_rate": 0.0001, "loss": 6.0624, "loss/crossentropy": 2.5568013191223145, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19626538455486298, "step": 10640 }, { "epoch": 0.3325625, "grad_norm": 3.265625, "grad_norm_var": 0.11070556640625, "learning_rate": 0.0001, "loss": 5.8763, "loss/crossentropy": 2.5293742418289185, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18391429632902145, "step": 10642 }, { "epoch": 0.332625, "grad_norm": 3.015625, "grad_norm_var": 0.12675374348958332, "learning_rate": 0.0001, "loss": 5.6411, "loss/crossentropy": 2.470715045928955, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16704195737838745, "step": 10644 }, { "epoch": 0.3326875, "grad_norm": 3.6875, "grad_norm_var": 0.12403971354166667, "learning_rate": 0.0001, "loss": 6.0903, "loss/crossentropy": 2.5943169593811035, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19100331515073776, "step": 10646 }, { "epoch": 0.33275, "grad_norm": 3.25, "grad_norm_var": 0.12360738118489584, "learning_rate": 0.0001, "loss": 5.8859, "loss/crossentropy": 2.4839388132095337, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18237964808940887, "step": 10648 }, { "epoch": 0.3328125, "grad_norm": 2.96875, "grad_norm_var": 0.13709309895833333, "learning_rate": 0.0001, "loss": 5.7542, "loss/crossentropy": 2.485421061515808, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17414316534996033, "step": 10650 }, { "epoch": 0.332875, "grad_norm": 3.8125, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 5.6255, "loss/crossentropy": 2.3566641807556152, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17219743877649307, "step": 10652 }, { "epoch": 0.3329375, "grad_norm": 3.59375, "grad_norm_var": 0.11064046223958333, "learning_rate": 0.0001, "loss": 5.9665, "loss/crossentropy": 2.5395361185073853, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18566076457500458, "step": 10654 }, { "epoch": 0.333, "grad_norm": 3.671875, "grad_norm_var": 0.08931884765625, "learning_rate": 0.0001, "loss": 6.1111, "loss/crossentropy": 2.56496000289917, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19485173374414444, "step": 10656 }, { "epoch": 0.3330625, "grad_norm": 3.40625, "grad_norm_var": 0.08658447265625, "learning_rate": 0.0001, "loss": 5.7782, "loss/crossentropy": 2.532205820083618, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17459736764431, "step": 10658 }, { "epoch": 0.333125, "grad_norm": 3.421875, "grad_norm_var": 0.06955973307291667, "learning_rate": 0.0001, "loss": 5.6623, "loss/crossentropy": 2.408676028251648, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1734139770269394, "step": 10660 }, { "epoch": 0.3331875, "grad_norm": 3.265625, "grad_norm_var": 0.06777242024739584, "learning_rate": 0.0001, "loss": 5.9523, "loss/crossentropy": 2.6525381803512573, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17763584852218628, "step": 10662 }, { "epoch": 0.33325, "grad_norm": 2.96875, "grad_norm_var": 0.08220113118489583, "learning_rate": 0.0001, "loss": 5.8732, "loss/crossentropy": 2.58855938911438, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1784590780735016, "step": 10664 }, { "epoch": 0.3333125, "grad_norm": 3.359375, "grad_norm_var": 0.08573811848958333, "learning_rate": 0.0001, "loss": 5.8772, "loss/crossentropy": 2.512405276298523, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1813981756567955, "step": 10666 }, { "epoch": 0.333375, "grad_norm": 3.25, "grad_norm_var": 0.07273763020833333, "learning_rate": 0.0001, "loss": 5.7278, "loss/crossentropy": 2.418931484222412, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1816667690873146, "step": 10668 }, { "epoch": 0.3334375, "grad_norm": 3.171875, "grad_norm_var": 0.0744140625, "learning_rate": 0.0001, "loss": 5.9827, "loss/crossentropy": 2.6172374486923218, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1802959442138672, "step": 10670 }, { "epoch": 0.3335, "grad_norm": 3.234375, "grad_norm_var": 0.0423980712890625, "learning_rate": 0.0001, "loss": 5.8085, "loss/crossentropy": 2.5476096868515015, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1753033846616745, "step": 10672 }, { "epoch": 0.3335625, "grad_norm": 3.59375, "grad_norm_var": 0.04898681640625, "learning_rate": 0.0001, "loss": 5.8135, "loss/crossentropy": 2.5202016830444336, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17893477529287338, "step": 10674 }, { "epoch": 0.333625, "grad_norm": 3.109375, "grad_norm_var": 0.051070149739583334, "learning_rate": 0.0001, "loss": 5.966, "loss/crossentropy": 2.663010001182556, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17600010335445404, "step": 10676 }, { "epoch": 0.3336875, "grad_norm": 3.421875, "grad_norm_var": 0.052408854166666664, "learning_rate": 0.0001, "loss": 5.8813, "loss/crossentropy": 2.593448281288147, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1764383167028427, "step": 10678 }, { "epoch": 0.33375, "grad_norm": 3.28125, "grad_norm_var": 0.04797770182291667, "learning_rate": 0.0001, "loss": 6.1149, "loss/crossentropy": 2.6398794651031494, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19281642884016037, "step": 10680 }, { "epoch": 0.3338125, "grad_norm": 3.71875, "grad_norm_var": 0.0342926025390625, "learning_rate": 0.0001, "loss": 5.8726, "loss/crossentropy": 2.5157653093338013, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1802167147397995, "step": 10682 }, { "epoch": 0.333875, "grad_norm": 3.296875, "grad_norm_var": 0.0352203369140625, "learning_rate": 0.0001, "loss": 5.8488, "loss/crossentropy": 2.5493035316467285, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17799723148345947, "step": 10684 }, { "epoch": 0.3339375, "grad_norm": 3.484375, "grad_norm_var": 0.0407867431640625, "learning_rate": 0.0001, "loss": 5.8222, "loss/crossentropy": 2.5019291639328003, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1777304857969284, "step": 10686 }, { "epoch": 0.334, "grad_norm": 3.4375, "grad_norm_var": 0.0457183837890625, "learning_rate": 0.0001, "loss": 6.1355, "loss/crossentropy": 2.6443079710006714, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19364753365516663, "step": 10688 }, { "epoch": 0.3340625, "grad_norm": 3.75, "grad_norm_var": 0.06055399576822917, "learning_rate": 0.0001, "loss": 5.6513, "loss/crossentropy": 2.3860379457473755, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17340559512376785, "step": 10690 }, { "epoch": 0.334125, "grad_norm": 3.640625, "grad_norm_var": 0.055859375, "learning_rate": 0.0001, "loss": 6.0432, "loss/crossentropy": 2.6361632347106934, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18523555994033813, "step": 10692 }, { "epoch": 0.3341875, "grad_norm": 3.84375, "grad_norm_var": 0.058592732747395834, "learning_rate": 0.0001, "loss": 5.6446, "loss/crossentropy": 2.3392274379730225, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17975673079490662, "step": 10694 }, { "epoch": 0.33425, "grad_norm": 3.515625, "grad_norm_var": 0.056103515625, "learning_rate": 0.0001, "loss": 6.1306, "loss/crossentropy": 2.662463068962097, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.19407868385314941, "step": 10696 }, { "epoch": 0.3343125, "grad_norm": 3.203125, "grad_norm_var": 0.06316630045572917, "learning_rate": 0.0001, "loss": 5.8663, "loss/crossentropy": 2.55662739276886, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18174849450588226, "step": 10698 }, { "epoch": 0.334375, "grad_norm": 3.453125, "grad_norm_var": 0.054423014322916664, "learning_rate": 0.0001, "loss": 6.0766, "loss/crossentropy": 2.595309257507324, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19148609787225723, "step": 10700 }, { "epoch": 0.3344375, "grad_norm": 3.125, "grad_norm_var": 0.0628570556640625, "learning_rate": 0.0001, "loss": 6.0151, "loss/crossentropy": 2.636073112487793, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18009155988693237, "step": 10702 }, { "epoch": 0.3345, "grad_norm": 3.453125, "grad_norm_var": 0.0578521728515625, "learning_rate": 0.0001, "loss": 5.9708, "loss/crossentropy": 2.5911970138549805, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18444080650806427, "step": 10704 }, { "epoch": 0.3345625, "grad_norm": 4.15625, "grad_norm_var": 0.07268778483072917, "learning_rate": 0.0001, "loss": 6.0049, "loss/crossentropy": 2.5935776233673096, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18839357793331146, "step": 10706 }, { "epoch": 0.334625, "grad_norm": 3.421875, "grad_norm_var": 0.07278645833333333, "learning_rate": 0.0001, "loss": 5.6842, "loss/crossentropy": 2.3627558946609497, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17394591867923737, "step": 10708 }, { "epoch": 0.3346875, "grad_norm": 3.21875, "grad_norm_var": 0.06808268229166667, "learning_rate": 0.0001, "loss": 6.0557, "loss/crossentropy": 2.6889145374298096, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1796456128358841, "step": 10710 }, { "epoch": 0.33475, "grad_norm": 3.390625, "grad_norm_var": 0.06531473795572916, "learning_rate": 0.0001, "loss": 5.8758, "loss/crossentropy": 2.5156023502349854, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.183290034532547, "step": 10712 }, { "epoch": 0.3348125, "grad_norm": 3.203125, "grad_norm_var": 0.0652496337890625, "learning_rate": 0.0001, "loss": 5.9407, "loss/crossentropy": 2.618725299835205, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18063638359308243, "step": 10714 }, { "epoch": 0.334875, "grad_norm": 3.609375, "grad_norm_var": 0.07017822265625, "learning_rate": 0.0001, "loss": 5.5902, "loss/crossentropy": 2.3678793907165527, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16559426486492157, "step": 10716 }, { "epoch": 0.3349375, "grad_norm": 3.234375, "grad_norm_var": 0.06812235514322916, "learning_rate": 0.0001, "loss": 5.7844, "loss/crossentropy": 2.527473211288452, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17490901052951813, "step": 10718 }, { "epoch": 0.335, "grad_norm": 3.234375, "grad_norm_var": 0.0790191650390625, "learning_rate": 0.0001, "loss": 5.5054, "loss/crossentropy": 2.298014998435974, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17073433101177216, "step": 10720 }, { "epoch": 0.3350625, "grad_norm": 3.515625, "grad_norm_var": 0.041015625, "learning_rate": 0.0001, "loss": 5.8837, "loss/crossentropy": 2.5489251613616943, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18308701366186142, "step": 10722 }, { "epoch": 0.335125, "grad_norm": 3.53125, "grad_norm_var": 0.03801167805989583, "learning_rate": 0.0001, "loss": 5.8824, "loss/crossentropy": 2.590053081512451, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17493685334920883, "step": 10724 }, { "epoch": 0.3351875, "grad_norm": 3.703125, "grad_norm_var": 0.04772135416666667, "learning_rate": 0.0001, "loss": 6.0159, "loss/crossentropy": 2.5686169862747192, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1876990720629692, "step": 10726 }, { "epoch": 0.33525, "grad_norm": 3.296875, "grad_norm_var": 0.05120035807291667, "learning_rate": 0.0001, "loss": 5.5758, "loss/crossentropy": 2.423282265663147, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16720261424779892, "step": 10728 }, { "epoch": 0.3353125, "grad_norm": 3.203125, "grad_norm_var": 0.041731770833333334, "learning_rate": 0.0001, "loss": 5.8786, "loss/crossentropy": 2.5165220499038696, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18073506653308868, "step": 10730 }, { "epoch": 0.335375, "grad_norm": 3.1875, "grad_norm_var": 0.03559468587239583, "learning_rate": 0.0001, "loss": 5.459, "loss/crossentropy": 2.332701325416565, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16380292922258377, "step": 10732 }, { "epoch": 0.3354375, "grad_norm": 3.25, "grad_norm_var": 0.035542805989583336, "learning_rate": 0.0001, "loss": 6.101, "loss/crossentropy": 2.673590302467346, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18766145408153534, "step": 10734 }, { "epoch": 0.3355, "grad_norm": 3.0625, "grad_norm_var": 0.03414306640625, "learning_rate": 0.0001, "loss": 5.8438, "loss/crossentropy": 2.531806230545044, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1784651130437851, "step": 10736 }, { "epoch": 0.3355625, "grad_norm": 3.515625, "grad_norm_var": 0.030517578125, "learning_rate": 0.0001, "loss": 5.9511, "loss/crossentropy": 2.5333372354507446, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1859154775738716, "step": 10738 }, { "epoch": 0.335625, "grad_norm": 3.390625, "grad_norm_var": 0.02564697265625, "learning_rate": 0.0001, "loss": 5.6016, "loss/crossentropy": 2.356564521789551, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17645235359668732, "step": 10740 }, { "epoch": 0.3356875, "grad_norm": 3.1875, "grad_norm_var": 0.03204752604166667, "learning_rate": 0.0001, "loss": 5.8296, "loss/crossentropy": 2.5035078525543213, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18104194849729538, "step": 10742 }, { "epoch": 0.33575, "grad_norm": 3.71875, "grad_norm_var": 0.045929972330729166, "learning_rate": 0.0001, "loss": 5.9766, "loss/crossentropy": 2.483887195587158, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1961456835269928, "step": 10744 }, { "epoch": 0.3358125, "grad_norm": 3.734375, "grad_norm_var": 0.08267822265625, "learning_rate": 0.0001, "loss": 6.4994, "loss/crossentropy": 2.8418978452682495, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2028629183769226, "step": 10746 }, { "epoch": 0.335875, "grad_norm": 3.109375, "grad_norm_var": 0.08565165201822916, "learning_rate": 0.0001, "loss": 5.9268, "loss/crossentropy": 2.621497869491577, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17896533757448196, "step": 10748 }, { "epoch": 0.3359375, "grad_norm": 2.9375, "grad_norm_var": 0.1017486572265625, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.494469165802002, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17855554819107056, "step": 10750 }, { "epoch": 0.336, "grad_norm": 3.625, "grad_norm_var": 0.10789388020833333, "learning_rate": 0.0001, "loss": 5.9852, "loss/crossentropy": 2.5950719118118286, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18471822887659073, "step": 10752 }, { "epoch": 0.3360625, "grad_norm": 3.625, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 5.991, "loss/crossentropy": 2.512368321418762, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18966437131166458, "step": 10754 }, { "epoch": 0.336125, "grad_norm": 3.3125, "grad_norm_var": 0.10845438639322917, "learning_rate": 0.0001, "loss": 6.11, "loss/crossentropy": 2.673190951347351, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18665232509374619, "step": 10756 }, { "epoch": 0.3361875, "grad_norm": 3.25, "grad_norm_var": 0.10006103515625, "learning_rate": 0.0001, "loss": 6.2482, "loss/crossentropy": 2.7868168354034424, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19027435779571533, "step": 10758 }, { "epoch": 0.33625, "grad_norm": 3.1875, "grad_norm_var": 0.0986480712890625, "learning_rate": 0.0001, "loss": 5.9804, "loss/crossentropy": 2.595295548439026, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18694473803043365, "step": 10760 }, { "epoch": 0.3363125, "grad_norm": 3.4375, "grad_norm_var": 0.067529296875, "learning_rate": 0.0001, "loss": 5.9568, "loss/crossentropy": 2.5836589336395264, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18340512365102768, "step": 10762 }, { "epoch": 0.336375, "grad_norm": 3.03125, "grad_norm_var": 0.06737874348958334, "learning_rate": 0.0001, "loss": 5.7095, "loss/crossentropy": 2.5286333560943604, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16848018765449524, "step": 10764 }, { "epoch": 0.3364375, "grad_norm": 3.546875, "grad_norm_var": 0.0657867431640625, "learning_rate": 0.0001, "loss": 5.6757, "loss/crossentropy": 2.3952642679214478, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17179232090711594, "step": 10766 }, { "epoch": 0.3365, "grad_norm": 3.296875, "grad_norm_var": 0.051318359375, "learning_rate": 0.0001, "loss": 5.8813, "loss/crossentropy": 2.6146652698516846, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1770579069852829, "step": 10768 }, { "epoch": 0.3365625, "grad_norm": 3.375, "grad_norm_var": 0.04949544270833333, "learning_rate": 0.0001, "loss": 6.2394, "loss/crossentropy": 2.711554527282715, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19184448570013046, "step": 10770 }, { "epoch": 0.336625, "grad_norm": 3.171875, "grad_norm_var": 0.05370686848958333, "learning_rate": 0.0001, "loss": 5.9321, "loss/crossentropy": 2.6609376668930054, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17711444944143295, "step": 10772 }, { "epoch": 0.3366875, "grad_norm": 3.546875, "grad_norm_var": 0.0553619384765625, "learning_rate": 0.0001, "loss": 6.0884, "loss/crossentropy": 2.6715911626815796, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1854308694601059, "step": 10774 }, { "epoch": 0.33675, "grad_norm": 3.28125, "grad_norm_var": 0.049609375, "learning_rate": 0.0001, "loss": 5.9226, "loss/crossentropy": 2.57106876373291, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1789025068283081, "step": 10776 }, { "epoch": 0.3368125, "grad_norm": 3.0625, "grad_norm_var": 0.06587626139322916, "learning_rate": 0.0001, "loss": 6.0601, "loss/crossentropy": 2.6637685298919678, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1865086406469345, "step": 10778 }, { "epoch": 0.336875, "grad_norm": 3.1875, "grad_norm_var": 0.06286519368489583, "learning_rate": 0.0001, "loss": 5.9621, "loss/crossentropy": 2.669995427131653, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17764340341091156, "step": 10780 }, { "epoch": 0.3369375, "grad_norm": 3.1875, "grad_norm_var": 0.0591796875, "learning_rate": 0.0001, "loss": 5.7788, "loss/crossentropy": 2.5435677766799927, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17664822190999985, "step": 10782 }, { "epoch": 0.337, "grad_norm": 3.34375, "grad_norm_var": 0.0614654541015625, "learning_rate": 0.0001, "loss": 5.8965, "loss/crossentropy": 2.6301380395889282, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1758502647280693, "step": 10784 }, { "epoch": 0.3370625, "grad_norm": 3.296875, "grad_norm_var": 0.054215494791666666, "learning_rate": 0.0001, "loss": 5.6843, "loss/crossentropy": 2.3977192640304565, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17084172368049622, "step": 10786 }, { "epoch": 0.337125, "grad_norm": 3.1875, "grad_norm_var": 0.0639068603515625, "learning_rate": 0.0001, "loss": 6.0451, "loss/crossentropy": 2.5708560943603516, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1923437863588333, "step": 10788 }, { "epoch": 0.3371875, "grad_norm": 3.453125, "grad_norm_var": 0.05845947265625, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.28852117061615, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17684955149888992, "step": 10790 }, { "epoch": 0.33725, "grad_norm": 3.296875, "grad_norm_var": 0.058394368489583334, "learning_rate": 0.0001, "loss": 5.7565, "loss/crossentropy": 2.5005563497543335, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1728557050228119, "step": 10792 }, { "epoch": 0.3373125, "grad_norm": 3.15625, "grad_norm_var": 0.0307037353515625, "learning_rate": 0.0001, "loss": 5.3686, "loss/crossentropy": 2.236561596393585, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1596895009279251, "step": 10794 }, { "epoch": 0.337375, "grad_norm": 3.546875, "grad_norm_var": 0.04013264973958333, "learning_rate": 0.0001, "loss": 5.9064, "loss/crossentropy": 2.5232553482055664, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18323633074760437, "step": 10796 }, { "epoch": 0.3374375, "grad_norm": 3.046875, "grad_norm_var": 0.03885091145833333, "learning_rate": 0.0001, "loss": 5.8346, "loss/crossentropy": 2.624903440475464, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17097139358520508, "step": 10798 }, { "epoch": 0.3375, "grad_norm": 3.375, "grad_norm_var": 0.037694295247395836, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.3961304426193237, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16797547787427902, "step": 10800 }, { "epoch": 0.3375625, "grad_norm": 3.34375, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 5.981, "loss/crossentropy": 2.683085799217224, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1786186248064041, "step": 10802 }, { "epoch": 0.337625, "grad_norm": 3.265625, "grad_norm_var": 0.02857666015625, "learning_rate": 0.0001, "loss": 5.9571, "loss/crossentropy": 2.624199628829956, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1821201890707016, "step": 10804 }, { "epoch": 0.3376875, "grad_norm": 3.28125, "grad_norm_var": 0.02808837890625, "learning_rate": 0.0001, "loss": 5.924, "loss/crossentropy": 2.570906400680542, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18257682770490646, "step": 10806 }, { "epoch": 0.33775, "grad_norm": 3.390625, "grad_norm_var": 0.028694661458333333, "learning_rate": 0.0001, "loss": 6.0505, "loss/crossentropy": 2.7018431425094604, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18291601538658142, "step": 10808 }, { "epoch": 0.3378125, "grad_norm": 4.21875, "grad_norm_var": 0.0766510009765625, "learning_rate": 0.0001, "loss": 5.7848, "loss/crossentropy": 2.3851877450942993, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18840241432189941, "step": 10810 }, { "epoch": 0.337875, "grad_norm": 3.8125, "grad_norm_var": 0.0820709228515625, "learning_rate": 0.0001, "loss": 6.1217, "loss/crossentropy": 2.7344181537628174, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18638356775045395, "step": 10812 }, { "epoch": 0.3379375, "grad_norm": 3.671875, "grad_norm_var": 0.07802632649739584, "learning_rate": 0.0001, "loss": 5.7403, "loss/crossentropy": 2.4325857162475586, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17608315497636795, "step": 10814 }, { "epoch": 0.338, "grad_norm": 3.21875, "grad_norm_var": 0.07598368326822917, "learning_rate": 0.0001, "loss": 5.9573, "loss/crossentropy": 2.6354763507843018, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17983748018741608, "step": 10816 }, { "epoch": 0.3380625, "grad_norm": 2.875, "grad_norm_var": 0.0956207275390625, "learning_rate": 0.0001, "loss": 5.738, "loss/crossentropy": 2.550377368927002, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1668103039264679, "step": 10818 }, { "epoch": 0.338125, "grad_norm": 3.25, "grad_norm_var": 0.09594624837239583, "learning_rate": 0.0001, "loss": 5.5867, "loss/crossentropy": 2.396926999092102, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16741415858268738, "step": 10820 }, { "epoch": 0.3381875, "grad_norm": 4.03125, "grad_norm_var": 0.11691792805989583, "learning_rate": 0.0001, "loss": 5.6129, "loss/crossentropy": 2.3473883867263794, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1749899983406067, "step": 10822 }, { "epoch": 0.33825, "grad_norm": 3.109375, "grad_norm_var": 0.12259012858072917, "learning_rate": 0.0001, "loss": 5.8787, "loss/crossentropy": 2.5520554780960083, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18071406334638596, "step": 10824 }, { "epoch": 0.3383125, "grad_norm": 3.28125, "grad_norm_var": 0.07906494140625, "learning_rate": 0.0001, "loss": 5.7556, "loss/crossentropy": 2.445315718650818, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1802482008934021, "step": 10826 }, { "epoch": 0.338375, "grad_norm": 3.25, "grad_norm_var": 0.061356608072916666, "learning_rate": 0.0001, "loss": 6.0558, "loss/crossentropy": 2.675871968269348, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18213185667991638, "step": 10828 }, { "epoch": 0.3384375, "grad_norm": 3.28125, "grad_norm_var": 0.05349934895833333, "learning_rate": 0.0001, "loss": 5.763, "loss/crossentropy": 2.541442632675171, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17332328110933304, "step": 10830 }, { "epoch": 0.3385, "grad_norm": 3.390625, "grad_norm_var": 0.057763671875, "learning_rate": 0.0001, "loss": 5.7538, "loss/crossentropy": 2.457796812057495, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17764590680599213, "step": 10832 }, { "epoch": 0.3385625, "grad_norm": 3.671875, "grad_norm_var": 0.04993082682291667, "learning_rate": 0.0001, "loss": 5.7388, "loss/crossentropy": 2.423627495765686, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17956066131591797, "step": 10834 }, { "epoch": 0.338625, "grad_norm": 3.40625, "grad_norm_var": 0.05520426432291667, "learning_rate": 0.0001, "loss": 5.8473, "loss/crossentropy": 2.5403659343719482, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1767827644944191, "step": 10836 }, { "epoch": 0.3386875, "grad_norm": 3.953125, "grad_norm_var": 0.04716796875, "learning_rate": 0.0001, "loss": 5.8963, "loss/crossentropy": 2.445417642593384, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19196118414402008, "step": 10838 }, { "epoch": 0.33875, "grad_norm": 3.34375, "grad_norm_var": 0.04111328125, "learning_rate": 0.0001, "loss": 6.0339, "loss/crossentropy": 2.66239595413208, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18362993001937866, "step": 10840 }, { "epoch": 0.3388125, "grad_norm": 3.25, "grad_norm_var": 0.042919921875, "learning_rate": 0.0001, "loss": 6.0567, "loss/crossentropy": 2.661539673805237, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1836584359407425, "step": 10842 }, { "epoch": 0.338875, "grad_norm": 3.3125, "grad_norm_var": 0.040583292643229164, "learning_rate": 0.0001, "loss": 5.7153, "loss/crossentropy": 2.4426932334899902, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17491213977336884, "step": 10844 }, { "epoch": 0.3389375, "grad_norm": 3.375, "grad_norm_var": 0.04104410807291667, "learning_rate": 0.0001, "loss": 6.1584, "loss/crossentropy": 2.711267828941345, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1880771815776825, "step": 10846 }, { "epoch": 0.339, "grad_norm": 3.609375, "grad_norm_var": 0.0428375244140625, "learning_rate": 0.0001, "loss": 6.1288, "loss/crossentropy": 2.648341417312622, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19062668830156326, "step": 10848 }, { "epoch": 0.3390625, "grad_norm": 3.078125, "grad_norm_var": 0.0487945556640625, "learning_rate": 0.0001, "loss": 5.5299, "loss/crossentropy": 2.352954149246216, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17082266509532928, "step": 10850 }, { "epoch": 0.339125, "grad_norm": 4.0625, "grad_norm_var": 0.0731597900390625, "learning_rate": 0.0001, "loss": 5.5417, "loss/crossentropy": 2.261208415031433, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17024008929729462, "step": 10852 }, { "epoch": 0.3391875, "grad_norm": 3.671875, "grad_norm_var": 2.5806925455729166, "learning_rate": 0.0001, "loss": 5.8908, "loss/crossentropy": 2.4909229278564453, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1868659257888794, "step": 10854 }, { "epoch": 0.33925, "grad_norm": 3.265625, "grad_norm_var": 2.587532552083333, "learning_rate": 0.0001, "loss": 6.0128, "loss/crossentropy": 2.71052086353302, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17983968555927277, "step": 10856 }, { "epoch": 0.3393125, "grad_norm": 3.171875, "grad_norm_var": 2.620417277018229, "learning_rate": 0.0001, "loss": 6.1523, "loss/crossentropy": 2.8265068531036377, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18179305642843246, "step": 10858 }, { "epoch": 0.339375, "grad_norm": 3.671875, "grad_norm_var": 2.6018300374348957, "learning_rate": 0.0001, "loss": 5.8658, "loss/crossentropy": 2.5263173580169678, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1804347187280655, "step": 10860 }, { "epoch": 0.3394375, "grad_norm": 3.40625, "grad_norm_var": 2.62666015625, "learning_rate": 0.0001, "loss": 6.0053, "loss/crossentropy": 2.6483066082000732, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17945124208927155, "step": 10862 }, { "epoch": 0.3395, "grad_norm": 3.28125, "grad_norm_var": 2.6412923177083334, "learning_rate": 0.0001, "loss": 5.9145, "loss/crossentropy": 2.5227582454681396, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18292731046676636, "step": 10864 }, { "epoch": 0.3395625, "grad_norm": 3.5625, "grad_norm_var": 2.609358723958333, "learning_rate": 0.0001, "loss": 5.919, "loss/crossentropy": 2.577438712120056, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18258970230817795, "step": 10866 }, { "epoch": 0.339625, "grad_norm": 3.140625, "grad_norm_var": 2.622119140625, "learning_rate": 0.0001, "loss": 5.9416, "loss/crossentropy": 2.5979727506637573, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17889202386140823, "step": 10868 }, { "epoch": 0.3396875, "grad_norm": 3.109375, "grad_norm_var": 0.0297760009765625, "learning_rate": 0.0001, "loss": 6.0314, "loss/crossentropy": 2.663851022720337, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17933289706707, "step": 10870 }, { "epoch": 0.33975, "grad_norm": 3.609375, "grad_norm_var": 0.09868062337239583, "learning_rate": 0.0001, "loss": 6.1448, "loss/crossentropy": 2.641571283340454, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1940707564353943, "step": 10872 }, { "epoch": 0.3398125, "grad_norm": 3.546875, "grad_norm_var": 0.09877827962239584, "learning_rate": 0.0001, "loss": 6.0617, "loss/crossentropy": 2.6150325536727905, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19153756648302078, "step": 10874 }, { "epoch": 0.339875, "grad_norm": 3.40625, "grad_norm_var": 0.10087483723958333, "learning_rate": 0.0001, "loss": 5.9674, "loss/crossentropy": 2.6188477277755737, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18329627811908722, "step": 10876 }, { "epoch": 0.3399375, "grad_norm": 3.390625, "grad_norm_var": 0.09728902180989583, "learning_rate": 0.0001, "loss": 6.1462, "loss/crossentropy": 2.661772608757019, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19141025841236115, "step": 10878 }, { "epoch": 0.34, "grad_norm": 2.9375, "grad_norm_var": 0.11197916666666667, "learning_rate": 0.0001, "loss": 5.636, "loss/crossentropy": 2.44494891166687, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16871945559978485, "step": 10880 }, { "epoch": 0.3400625, "grad_norm": 3.53125, "grad_norm_var": 0.10836181640625, "learning_rate": 0.0001, "loss": 6.0516, "loss/crossentropy": 2.627223014831543, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18775425106287003, "step": 10882 }, { "epoch": 0.340125, "grad_norm": 3.0625, "grad_norm_var": 0.1162261962890625, "learning_rate": 0.0001, "loss": 5.6793, "loss/crossentropy": 2.494239091873169, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1716337651014328, "step": 10884 }, { "epoch": 0.3401875, "grad_norm": 3.109375, "grad_norm_var": 0.11667378743489583, "learning_rate": 0.0001, "loss": 5.4698, "loss/crossentropy": 2.2702760696411133, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1652650088071823, "step": 10886 }, { "epoch": 0.34025, "grad_norm": 3.3125, "grad_norm_var": 0.04019775390625, "learning_rate": 0.0001, "loss": 5.7117, "loss/crossentropy": 2.4544767141342163, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17181452363729477, "step": 10888 }, { "epoch": 0.3403125, "grad_norm": 3.265625, "grad_norm_var": 0.03330078125, "learning_rate": 0.0001, "loss": 5.9917, "loss/crossentropy": 2.7106869220733643, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17380309849977493, "step": 10890 }, { "epoch": 0.340375, "grad_norm": 3.15625, "grad_norm_var": 0.031981404622395834, "learning_rate": 0.0001, "loss": 5.9661, "loss/crossentropy": 2.6236428022384644, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18268528580665588, "step": 10892 }, { "epoch": 0.3404375, "grad_norm": 3.0625, "grad_norm_var": 0.029124959309895834, "learning_rate": 0.0001, "loss": 5.6988, "loss/crossentropy": 2.512023091316223, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16985370963811874, "step": 10894 }, { "epoch": 0.3405, "grad_norm": 3.046875, "grad_norm_var": 0.0265289306640625, "learning_rate": 0.0001, "loss": 5.6424, "loss/crossentropy": 2.4638726711273193, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16511869430541992, "step": 10896 }, { "epoch": 0.3405625, "grad_norm": 3.53125, "grad_norm_var": 0.024372355143229166, "learning_rate": 0.0001, "loss": 6.0874, "loss/crossentropy": 2.6857768297195435, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18860473483800888, "step": 10898 }, { "epoch": 0.340625, "grad_norm": 3.46875, "grad_norm_var": 0.025516764322916666, "learning_rate": 0.0001, "loss": 5.4173, "loss/crossentropy": 2.2630616426467896, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16893823444843292, "step": 10900 }, { "epoch": 0.3406875, "grad_norm": 3.359375, "grad_norm_var": 0.03925374348958333, "learning_rate": 0.0001, "loss": 5.8756, "loss/crossentropy": 2.5280606746673584, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18162433803081512, "step": 10902 }, { "epoch": 0.34075, "grad_norm": 3.109375, "grad_norm_var": 0.0426910400390625, "learning_rate": 0.0001, "loss": 5.9101, "loss/crossentropy": 2.5396711826324463, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18352441489696503, "step": 10904 }, { "epoch": 0.3408125, "grad_norm": 3.0625, "grad_norm_var": 0.0494537353515625, "learning_rate": 0.0001, "loss": 5.6406, "loss/crossentropy": 2.461556911468506, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1694711446762085, "step": 10906 }, { "epoch": 0.340875, "grad_norm": 3.375, "grad_norm_var": 0.04871317545572917, "learning_rate": 0.0001, "loss": 5.9681, "loss/crossentropy": 2.568337917327881, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18645568192005157, "step": 10908 }, { "epoch": 0.3409375, "grad_norm": 3.375, "grad_norm_var": 0.05250244140625, "learning_rate": 0.0001, "loss": 5.7098, "loss/crossentropy": 2.4167133569717407, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1777435913681984, "step": 10910 }, { "epoch": 0.341, "grad_norm": 3.359375, "grad_norm_var": 0.04472554524739583, "learning_rate": 0.0001, "loss": 5.9249, "loss/crossentropy": 2.5556130409240723, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18380820006132126, "step": 10912 }, { "epoch": 0.3410625, "grad_norm": 3.296875, "grad_norm_var": 0.0411529541015625, "learning_rate": 0.0001, "loss": 5.6859, "loss/crossentropy": 2.3107681274414062, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18086906522512436, "step": 10914 }, { "epoch": 0.341125, "grad_norm": 3.25, "grad_norm_var": 0.040425618489583336, "learning_rate": 0.0001, "loss": 5.9311, "loss/crossentropy": 2.6373904943466187, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.179367333650589, "step": 10916 }, { "epoch": 0.3411875, "grad_norm": 3.453125, "grad_norm_var": 0.03131510416666667, "learning_rate": 0.0001, "loss": 5.9823, "loss/crossentropy": 2.6373625993728638, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.181370347738266, "step": 10918 }, { "epoch": 0.34125, "grad_norm": 3.078125, "grad_norm_var": 0.0240234375, "learning_rate": 0.0001, "loss": 5.7492, "loss/crossentropy": 2.4768972396850586, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1764535903930664, "step": 10920 }, { "epoch": 0.3413125, "grad_norm": 3.15625, "grad_norm_var": 0.0212890625, "learning_rate": 0.0001, "loss": 5.8238, "loss/crossentropy": 2.574054479598999, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1730186939239502, "step": 10922 }, { "epoch": 0.341375, "grad_norm": 3.296875, "grad_norm_var": 0.020719401041666665, "learning_rate": 0.0001, "loss": 5.8977, "loss/crossentropy": 2.562409281730652, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1811838299036026, "step": 10924 }, { "epoch": 0.3414375, "grad_norm": 3.359375, "grad_norm_var": 0.014557902018229167, "learning_rate": 0.0001, "loss": 5.9327, "loss/crossentropy": 2.5949437618255615, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18103672564029694, "step": 10926 }, { "epoch": 0.3415, "grad_norm": 3.375, "grad_norm_var": 0.022102864583333333, "learning_rate": 0.0001, "loss": 6.2892, "loss/crossentropy": 2.779433012008667, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.19238261878490448, "step": 10928 }, { "epoch": 0.3415625, "grad_norm": 3.125, "grad_norm_var": 0.024397786458333334, "learning_rate": 0.0001, "loss": 5.7626, "loss/crossentropy": 2.495705723762512, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17395368218421936, "step": 10930 }, { "epoch": 0.341625, "grad_norm": 3.390625, "grad_norm_var": 0.0246978759765625, "learning_rate": 0.0001, "loss": 6.0376, "loss/crossentropy": 2.7212945222854614, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18123944103717804, "step": 10932 }, { "epoch": 0.3416875, "grad_norm": 3.265625, "grad_norm_var": 0.020539347330729166, "learning_rate": 0.0001, "loss": 6.0254, "loss/crossentropy": 2.6789186000823975, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18347567319869995, "step": 10934 }, { "epoch": 0.34175, "grad_norm": 2.84375, "grad_norm_var": 0.0332672119140625, "learning_rate": 0.0001, "loss": 5.7685, "loss/crossentropy": 2.5864064693450928, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17016415297985077, "step": 10936 }, { "epoch": 0.3418125, "grad_norm": 3.328125, "grad_norm_var": 0.03037109375, "learning_rate": 0.0001, "loss": 5.7732, "loss/crossentropy": 2.477833867073059, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17836246639490128, "step": 10938 }, { "epoch": 0.341875, "grad_norm": 3.375, "grad_norm_var": 0.0384429931640625, "learning_rate": 0.0001, "loss": 5.9246, "loss/crossentropy": 2.5930657386779785, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17807789146900177, "step": 10940 }, { "epoch": 0.3419375, "grad_norm": 3.28125, "grad_norm_var": 0.3951080322265625, "learning_rate": 0.0001, "loss": 6.2484, "loss/crossentropy": 2.8314849138259888, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18895969539880753, "step": 10942 }, { "epoch": 0.342, "grad_norm": 3.265625, "grad_norm_var": 0.39583231608072916, "learning_rate": 0.0001, "loss": 5.6654, "loss/crossentropy": 2.41261088848114, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17371421307325363, "step": 10944 }, { "epoch": 0.3420625, "grad_norm": 3.21875, "grad_norm_var": 0.38682352701822914, "learning_rate": 0.0001, "loss": 6.0481, "loss/crossentropy": 2.647688627243042, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18301014602184296, "step": 10946 }, { "epoch": 0.342125, "grad_norm": 3.390625, "grad_norm_var": 0.38798828125, "learning_rate": 0.0001, "loss": 5.9545, "loss/crossentropy": 2.6143709421157837, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1832272633910179, "step": 10948 }, { "epoch": 0.3421875, "grad_norm": 3.234375, "grad_norm_var": 0.3848297119140625, "learning_rate": 0.0001, "loss": 6.2824, "loss/crossentropy": 2.8192338943481445, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18850116431713104, "step": 10950 }, { "epoch": 0.34225, "grad_norm": 4.03125, "grad_norm_var": 0.37916666666666665, "learning_rate": 0.0001, "loss": 5.6463, "loss/crossentropy": 2.3457504510879517, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17341547459363937, "step": 10952 }, { "epoch": 0.3423125, "grad_norm": 3.359375, "grad_norm_var": 0.38039957682291664, "learning_rate": 0.0001, "loss": 5.9078, "loss/crossentropy": 2.5608900785446167, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18312767893075943, "step": 10954 }, { "epoch": 0.342375, "grad_norm": 3.5, "grad_norm_var": 0.39982808430989586, "learning_rate": 0.0001, "loss": 5.9286, "loss/crossentropy": 2.5840686559677124, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18054546415805817, "step": 10956 }, { "epoch": 0.3424375, "grad_norm": 3.3125, "grad_norm_var": 0.052277628580729166, "learning_rate": 0.0001, "loss": 5.7118, "loss/crossentropy": 2.4785114526748657, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17332758009433746, "step": 10958 }, { "epoch": 0.3425, "grad_norm": 3.5, "grad_norm_var": 0.05308837890625, "learning_rate": 0.0001, "loss": 6.422, "loss/crossentropy": 2.9245123863220215, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19467034935951233, "step": 10960 }, { "epoch": 0.3425625, "grad_norm": 3.21875, "grad_norm_var": 0.05732320149739583, "learning_rate": 0.0001, "loss": 5.6495, "loss/crossentropy": 2.456456422805786, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16930365562438965, "step": 10962 }, { "epoch": 0.342625, "grad_norm": 3.34375, "grad_norm_var": 0.05842183430989583, "learning_rate": 0.0001, "loss": 5.9639, "loss/crossentropy": 2.673986077308655, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17586512863636017, "step": 10964 }, { "epoch": 0.3426875, "grad_norm": 3.34375, "grad_norm_var": 0.0572265625, "learning_rate": 0.0001, "loss": 5.6662, "loss/crossentropy": 2.3765084743499756, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17701995372772217, "step": 10966 }, { "epoch": 0.34275, "grad_norm": 2.984375, "grad_norm_var": 0.0366119384765625, "learning_rate": 0.0001, "loss": 5.7426, "loss/crossentropy": 2.4702529907226562, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17176322638988495, "step": 10968 }, { "epoch": 0.3428125, "grad_norm": 3.0, "grad_norm_var": 0.042496744791666666, "learning_rate": 0.0001, "loss": 6.0317, "loss/crossentropy": 2.765709638595581, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17386630922555923, "step": 10970 }, { "epoch": 0.342875, "grad_norm": 3.078125, "grad_norm_var": 0.036229451497395836, "learning_rate": 0.0001, "loss": 5.8305, "loss/crossentropy": 2.593336582183838, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17293524742126465, "step": 10972 }, { "epoch": 0.3429375, "grad_norm": 3.390625, "grad_norm_var": 0.037287394205729164, "learning_rate": 0.0001, "loss": 5.6782, "loss/crossentropy": 2.459639310836792, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17107924818992615, "step": 10974 }, { "epoch": 0.343, "grad_norm": 3.09375, "grad_norm_var": 0.03460286458333333, "learning_rate": 0.0001, "loss": 5.9007, "loss/crossentropy": 2.568490982055664, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1808793842792511, "step": 10976 }, { "epoch": 0.3430625, "grad_norm": 3.015625, "grad_norm_var": 0.0361968994140625, "learning_rate": 0.0001, "loss": 5.9303, "loss/crossentropy": 2.644846558570862, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1793234720826149, "step": 10978 }, { "epoch": 0.343125, "grad_norm": 2.96875, "grad_norm_var": 0.0394439697265625, "learning_rate": 0.0001, "loss": 5.7084, "loss/crossentropy": 2.4656275510787964, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17545323073863983, "step": 10980 }, { "epoch": 0.3431875, "grad_norm": 3.1875, "grad_norm_var": 0.03463134765625, "learning_rate": 0.0001, "loss": 5.4337, "loss/crossentropy": 2.3236976861953735, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16255810111761093, "step": 10982 }, { "epoch": 0.34325, "grad_norm": 3.75, "grad_norm_var": 0.0402008056640625, "learning_rate": 0.0001, "loss": 5.7879, "loss/crossentropy": 2.6042559146881104, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1687503159046173, "step": 10984 }, { "epoch": 0.3433125, "grad_norm": 3.328125, "grad_norm_var": 0.0443756103515625, "learning_rate": 0.0001, "loss": 5.9716, "loss/crossentropy": 2.5806045532226562, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1867561936378479, "step": 10986 }, { "epoch": 0.343375, "grad_norm": 3.34375, "grad_norm_var": 0.04302978515625, "learning_rate": 0.0001, "loss": 5.8281, "loss/crossentropy": 2.579994797706604, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17559535056352615, "step": 10988 }, { "epoch": 0.3434375, "grad_norm": 3.46875, "grad_norm_var": 0.0450836181640625, "learning_rate": 0.0001, "loss": 6.1501, "loss/crossentropy": 2.6620121002197266, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19216465950012207, "step": 10990 }, { "epoch": 0.3435, "grad_norm": 3.15625, "grad_norm_var": 0.0447662353515625, "learning_rate": 0.0001, "loss": 5.6643, "loss/crossentropy": 2.463992714881897, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1719801351428032, "step": 10992 }, { "epoch": 0.3435625, "grad_norm": 3.34375, "grad_norm_var": 0.042577107747395836, "learning_rate": 0.0001, "loss": 5.8199, "loss/crossentropy": 2.567610740661621, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1748414933681488, "step": 10994 }, { "epoch": 0.343625, "grad_norm": 3.5, "grad_norm_var": 0.04566650390625, "learning_rate": 0.0001, "loss": 6.0374, "loss/crossentropy": 2.708420515060425, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1813330352306366, "step": 10996 }, { "epoch": 0.3436875, "grad_norm": 3.3125, "grad_norm_var": 0.040314737955729166, "learning_rate": 0.0001, "loss": 6.0973, "loss/crossentropy": 2.697722911834717, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18683434277772903, "step": 10998 }, { "epoch": 0.34375, "grad_norm": 3.453125, "grad_norm_var": 0.023802693684895834, "learning_rate": 0.0001, "loss": 5.7794, "loss/crossentropy": 2.438982844352722, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18325923383235931, "step": 11000 }, { "epoch": 0.3438125, "grad_norm": 3.4375, "grad_norm_var": 0.0238922119140625, "learning_rate": 0.0001, "loss": 5.9612, "loss/crossentropy": 2.6043819189071655, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18177300691604614, "step": 11002 }, { "epoch": 0.343875, "grad_norm": 3.34375, "grad_norm_var": 0.02119140625, "learning_rate": 0.0001, "loss": 6.027, "loss/crossentropy": 2.7204278707504272, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1763620674610138, "step": 11004 }, { "epoch": 0.3439375, "grad_norm": 3.421875, "grad_norm_var": 0.021483357747395834, "learning_rate": 0.0001, "loss": 6.086, "loss/crossentropy": 2.653968095779419, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18539541214704514, "step": 11006 }, { "epoch": 0.344, "grad_norm": 2.953125, "grad_norm_var": 0.025951131184895834, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.4831135272979736, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17243118584156036, "step": 11008 }, { "epoch": 0.3440625, "grad_norm": 3.421875, "grad_norm_var": 0.026688639322916666, "learning_rate": 0.0001, "loss": 5.9452, "loss/crossentropy": 2.528921961784363, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18616057932376862, "step": 11010 }, { "epoch": 0.344125, "grad_norm": 3.21875, "grad_norm_var": 0.024833170572916667, "learning_rate": 0.0001, "loss": 5.7475, "loss/crossentropy": 2.4258735179901123, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18099413812160492, "step": 11012 }, { "epoch": 0.3441875, "grad_norm": 3.015625, "grad_norm_var": 0.031050618489583334, "learning_rate": 0.0001, "loss": 5.8515, "loss/crossentropy": 2.5265121459960938, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1821117326617241, "step": 11014 }, { "epoch": 0.34425, "grad_norm": 3.328125, "grad_norm_var": 0.0298248291015625, "learning_rate": 0.0001, "loss": 5.9604, "loss/crossentropy": 2.6057002544403076, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18156321346759796, "step": 11016 }, { "epoch": 0.3443125, "grad_norm": 2.875, "grad_norm_var": 0.038248697916666664, "learning_rate": 0.0001, "loss": 5.8424, "loss/crossentropy": 2.6125999689102173, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17493344843387604, "step": 11018 }, { "epoch": 0.344375, "grad_norm": 2.921875, "grad_norm_var": 0.045441691080729166, "learning_rate": 0.0001, "loss": 5.5205, "loss/crossentropy": 2.342848062515259, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16542082279920578, "step": 11020 }, { "epoch": 0.3444375, "grad_norm": 3.15625, "grad_norm_var": 0.04234619140625, "learning_rate": 0.0001, "loss": 5.702, "loss/crossentropy": 2.473252773284912, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17209185659885406, "step": 11022 }, { "epoch": 0.3445, "grad_norm": 3.484375, "grad_norm_var": 0.04439697265625, "learning_rate": 0.0001, "loss": 5.9239, "loss/crossentropy": 2.5366417169570923, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1824759989976883, "step": 11024 }, { "epoch": 0.3445625, "grad_norm": 3.140625, "grad_norm_var": 0.0422515869140625, "learning_rate": 0.0001, "loss": 5.8792, "loss/crossentropy": 2.5807461738586426, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17866993695497513, "step": 11026 }, { "epoch": 0.344625, "grad_norm": 3.234375, "grad_norm_var": 0.028511555989583333, "learning_rate": 0.0001, "loss": 5.5309, "loss/crossentropy": 2.296918511390686, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1718367487192154, "step": 11028 }, { "epoch": 0.3446875, "grad_norm": 3.34375, "grad_norm_var": 0.026463826497395832, "learning_rate": 0.0001, "loss": 6.0491, "loss/crossentropy": 2.704776167869568, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18326442688703537, "step": 11030 }, { "epoch": 0.34475, "grad_norm": 3.1875, "grad_norm_var": 0.025162760416666666, "learning_rate": 0.0001, "loss": 6.0549, "loss/crossentropy": 2.7294983863830566, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17941391468048096, "step": 11032 }, { "epoch": 0.3448125, "grad_norm": 3.203125, "grad_norm_var": 0.01793212890625, "learning_rate": 0.0001, "loss": 6.1078, "loss/crossentropy": 2.755513548851013, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18522819131612778, "step": 11034 }, { "epoch": 0.344875, "grad_norm": 3.0, "grad_norm_var": 0.013016764322916667, "learning_rate": 0.0001, "loss": 5.6999, "loss/crossentropy": 2.4434242248535156, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17330323159694672, "step": 11036 }, { "epoch": 0.3449375, "grad_norm": 3.71875, "grad_norm_var": 0.031217447916666665, "learning_rate": 0.0001, "loss": 6.0105, "loss/crossentropy": 2.668699264526367, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18301181495189667, "step": 11038 }, { "epoch": 0.345, "grad_norm": 3.046875, "grad_norm_var": 0.0318756103515625, "learning_rate": 0.0001, "loss": 5.3591, "loss/crossentropy": 2.254475712776184, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16046421229839325, "step": 11040 }, { "epoch": 0.3450625, "grad_norm": 3.09375, "grad_norm_var": 0.03238525390625, "learning_rate": 0.0001, "loss": 5.6728, "loss/crossentropy": 2.4203919172286987, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17914804816246033, "step": 11042 }, { "epoch": 0.345125, "grad_norm": 3.4375, "grad_norm_var": 0.30448811848958335, "learning_rate": 0.0001, "loss": 5.9763, "loss/crossentropy": 2.5770797729492188, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18367434293031693, "step": 11044 }, { "epoch": 0.3451875, "grad_norm": 3.078125, "grad_norm_var": 0.3071523030598958, "learning_rate": 0.0001, "loss": 5.7715, "loss/crossentropy": 2.4922112226486206, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1771462932229042, "step": 11046 }, { "epoch": 0.34525, "grad_norm": 3.515625, "grad_norm_var": 0.3077138264973958, "learning_rate": 0.0001, "loss": 5.9733, "loss/crossentropy": 2.598994731903076, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18039784580469131, "step": 11048 }, { "epoch": 0.3453125, "grad_norm": 6.21875, "grad_norm_var": 0.8429107666015625, "learning_rate": 0.0001, "loss": 6.5324, "loss/crossentropy": 2.894778251647949, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.20126450061798096, "step": 11050 }, { "epoch": 0.345375, "grad_norm": 3.390625, "grad_norm_var": 0.8036946614583333, "learning_rate": 0.0001, "loss": 6.0874, "loss/crossentropy": 2.665947198867798, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18980085104703903, "step": 11052 }, { "epoch": 0.3454375, "grad_norm": 3.40625, "grad_norm_var": 0.7823069254557292, "learning_rate": 0.0001, "loss": 5.9363, "loss/crossentropy": 2.5732173919677734, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18317849189043045, "step": 11054 }, { "epoch": 0.3455, "grad_norm": 3.28125, "grad_norm_var": 0.7324045817057292, "learning_rate": 0.0001, "loss": 5.8919, "loss/crossentropy": 2.570802092552185, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17547086626291275, "step": 11056 }, { "epoch": 0.3455625, "grad_norm": 3.40625, "grad_norm_var": 0.7168121337890625, "learning_rate": 0.0001, "loss": 6.2789, "loss/crossentropy": 2.7660707235336304, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19425665587186813, "step": 11058 }, { "epoch": 0.345625, "grad_norm": 4.03125, "grad_norm_var": 0.5719553629557291, "learning_rate": 0.0001, "loss": 6.3759, "loss/crossentropy": 2.8103519678115845, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19640257209539413, "step": 11060 }, { "epoch": 0.3456875, "grad_norm": 3.71875, "grad_norm_var": 0.5447336832682291, "learning_rate": 0.0001, "loss": 6.3307, "loss/crossentropy": 2.857063055038452, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18955209106206894, "step": 11062 }, { "epoch": 0.34575, "grad_norm": 3.203125, "grad_norm_var": 0.5674967447916667, "learning_rate": 0.0001, "loss": 6.0962, "loss/crossentropy": 2.7202454805374146, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18485981225967407, "step": 11064 }, { "epoch": 0.3458125, "grad_norm": 4.0, "grad_norm_var": 0.07200113932291667, "learning_rate": 0.0001, "loss": 5.8249, "loss/crossentropy": 2.524648666381836, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1776851788163185, "step": 11066 }, { "epoch": 0.345875, "grad_norm": 3.484375, "grad_norm_var": 0.06721903483072916, "learning_rate": 0.0001, "loss": 6.1936, "loss/crossentropy": 2.8223618268966675, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18243858963251114, "step": 11068 }, { "epoch": 0.3459375, "grad_norm": 3.5625, "grad_norm_var": 0.08352457682291667, "learning_rate": 0.0001, "loss": 5.9817, "loss/crossentropy": 2.6173232793807983, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18057696521282196, "step": 11070 }, { "epoch": 0.346, "grad_norm": 3.953125, "grad_norm_var": 0.1038970947265625, "learning_rate": 0.0001, "loss": 5.9832, "loss/crossentropy": 2.5125160217285156, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19120532274246216, "step": 11072 }, { "epoch": 0.3460625, "grad_norm": 3.46875, "grad_norm_var": 0.1014312744140625, "learning_rate": 0.0001, "loss": 5.9291, "loss/crossentropy": 2.5588573217391968, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1823352500796318, "step": 11074 }, { "epoch": 0.346125, "grad_norm": 3.53125, "grad_norm_var": 0.08059794108072917, "learning_rate": 0.0001, "loss": 5.9067, "loss/crossentropy": 2.497497797012329, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.1807660534977913, "step": 11076 }, { "epoch": 0.3461875, "grad_norm": 2.953125, "grad_norm_var": 0.09683329264322917, "learning_rate": 0.0001, "loss": 5.4389, "loss/crossentropy": 2.295078158378601, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1636056751012802, "step": 11078 }, { "epoch": 0.34625, "grad_norm": 3.40625, "grad_norm_var": 0.09576822916666666, "learning_rate": 0.0001, "loss": 6.0454, "loss/crossentropy": 2.6146689653396606, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18213783204555511, "step": 11080 }, { "epoch": 0.3463125, "grad_norm": 2.96875, "grad_norm_var": 0.08359273274739583, "learning_rate": 0.0001, "loss": 5.8011, "loss/crossentropy": 2.588598370552063, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16929897665977478, "step": 11082 }, { "epoch": 0.346375, "grad_norm": 3.296875, "grad_norm_var": 0.08227437337239583, "learning_rate": 0.0001, "loss": 5.735, "loss/crossentropy": 2.449593663215637, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17893542349338531, "step": 11084 }, { "epoch": 0.3464375, "grad_norm": 3.28125, "grad_norm_var": 0.07458394368489583, "learning_rate": 0.0001, "loss": 6.0398, "loss/crossentropy": 2.669768452644348, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1858329400420189, "step": 11086 }, { "epoch": 0.3465, "grad_norm": 3.28125, "grad_norm_var": 0.056639607747395834, "learning_rate": 0.0001, "loss": 5.6926, "loss/crossentropy": 2.4738898277282715, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16952742636203766, "step": 11088 }, { "epoch": 0.3465625, "grad_norm": 3.390625, "grad_norm_var": 0.056559244791666664, "learning_rate": 0.0001, "loss": 5.934, "loss/crossentropy": 2.592034935951233, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17989840358495712, "step": 11090 }, { "epoch": 0.346625, "grad_norm": 3.25, "grad_norm_var": 0.044189453125, "learning_rate": 0.0001, "loss": 6.0853, "loss/crossentropy": 2.683542847633362, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18665538728237152, "step": 11092 }, { "epoch": 0.3466875, "grad_norm": 3.03125, "grad_norm_var": 0.039403279622395836, "learning_rate": 0.0001, "loss": 5.8121, "loss/crossentropy": 2.5736337900161743, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17345602065324783, "step": 11094 }, { "epoch": 0.34675, "grad_norm": 3.09375, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 6.08, "loss/crossentropy": 2.7127881050109863, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18320278823375702, "step": 11096 }, { "epoch": 0.3468125, "grad_norm": 3.390625, "grad_norm_var": 0.02799072265625, "learning_rate": 0.0001, "loss": 5.8826, "loss/crossentropy": 2.57907497882843, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17957322299480438, "step": 11098 }, { "epoch": 0.346875, "grad_norm": 3.265625, "grad_norm_var": 0.027587890625, "learning_rate": 0.0001, "loss": 5.502, "loss/crossentropy": 2.2725518941879272, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16981849074363708, "step": 11100 }, { "epoch": 0.3469375, "grad_norm": 3.1875, "grad_norm_var": 0.016893513997395835, "learning_rate": 0.0001, "loss": 5.6568, "loss/crossentropy": 2.4335155487060547, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17311019450426102, "step": 11102 }, { "epoch": 0.347, "grad_norm": 3.296875, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 6.0785, "loss/crossentropy": 2.749638795852661, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18288739770650864, "step": 11104 }, { "epoch": 0.3470625, "grad_norm": 3.265625, "grad_norm_var": 0.0157623291015625, "learning_rate": 0.0001, "loss": 5.9349, "loss/crossentropy": 2.621717691421509, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17819301784038544, "step": 11106 }, { "epoch": 0.347125, "grad_norm": 3.140625, "grad_norm_var": 0.011872355143229167, "learning_rate": 0.0001, "loss": 5.6888, "loss/crossentropy": 2.4195992946624756, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17730767279863358, "step": 11108 }, { "epoch": 0.3471875, "grad_norm": 3.078125, "grad_norm_var": 0.010965983072916666, "learning_rate": 0.0001, "loss": 5.7349, "loss/crossentropy": 2.514296054840088, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17010922729969025, "step": 11110 }, { "epoch": 0.34725, "grad_norm": 3.125, "grad_norm_var": 0.0102691650390625, "learning_rate": 0.0001, "loss": 5.4622, "loss/crossentropy": 2.3277621269226074, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16422312706708908, "step": 11112 }, { "epoch": 0.3473125, "grad_norm": 3.53125, "grad_norm_var": 0.036530558268229166, "learning_rate": 0.0001, "loss": 5.9148, "loss/crossentropy": 2.5701433420181274, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18095263093709946, "step": 11114 }, { "epoch": 0.347375, "grad_norm": 3.421875, "grad_norm_var": 0.0496734619140625, "learning_rate": 0.0001, "loss": 5.6771, "loss/crossentropy": 2.4177253246307373, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17281214892864227, "step": 11116 }, { "epoch": 0.3474375, "grad_norm": 3.578125, "grad_norm_var": 0.057027180989583336, "learning_rate": 0.0001, "loss": 6.002, "loss/crossentropy": 2.5987123250961304, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1844726875424385, "step": 11118 }, { "epoch": 0.3475, "grad_norm": 2.953125, "grad_norm_var": 0.061295572916666666, "learning_rate": 0.0001, "loss": 5.6562, "loss/crossentropy": 2.4043514728546143, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17127804458141327, "step": 11120 }, { "epoch": 0.3475625, "grad_norm": 3.328125, "grad_norm_var": 0.05907796223958333, "learning_rate": 0.0001, "loss": 6.0375, "loss/crossentropy": 2.6463372707366943, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18403904139995575, "step": 11122 }, { "epoch": 0.347625, "grad_norm": 3.53125, "grad_norm_var": 0.0587554931640625, "learning_rate": 0.0001, "loss": 5.7592, "loss/crossentropy": 2.4847307205200195, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1758800819516182, "step": 11124 }, { "epoch": 0.3476875, "grad_norm": 3.25, "grad_norm_var": 0.0590240478515625, "learning_rate": 0.0001, "loss": 6.1016, "loss/crossentropy": 2.643786668777466, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1910940557718277, "step": 11126 }, { "epoch": 0.34775, "grad_norm": 3.015625, "grad_norm_var": 0.06352437337239583, "learning_rate": 0.0001, "loss": 5.7997, "loss/crossentropy": 2.6193262338638306, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17116473615169525, "step": 11128 }, { "epoch": 0.3478125, "grad_norm": 3.328125, "grad_norm_var": 0.05071207682291667, "learning_rate": 0.0001, "loss": 5.9549, "loss/crossentropy": 2.670639753341675, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17764153331518173, "step": 11130 }, { "epoch": 0.347875, "grad_norm": 3.21875, "grad_norm_var": 0.03535868326822917, "learning_rate": 0.0001, "loss": 5.7988, "loss/crossentropy": 2.5086302757263184, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17940445989370346, "step": 11132 }, { "epoch": 0.3479375, "grad_norm": 3.515625, "grad_norm_var": 0.03955790201822917, "learning_rate": 0.0001, "loss": 6.3464, "loss/crossentropy": 2.8006097078323364, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19793908298015594, "step": 11134 }, { "epoch": 0.348, "grad_norm": 3.078125, "grad_norm_var": 0.03473307291666667, "learning_rate": 0.0001, "loss": 5.4672, "loss/crossentropy": 2.351517081260681, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16547825932502747, "step": 11136 }, { "epoch": 0.3480625, "grad_norm": 3.03125, "grad_norm_var": 0.0445953369140625, "learning_rate": 0.0001, "loss": 5.7162, "loss/crossentropy": 2.5065797567367554, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17213500291109085, "step": 11138 }, { "epoch": 0.348125, "grad_norm": 3.234375, "grad_norm_var": 0.05287984212239583, "learning_rate": 0.0001, "loss": 5.7799, "loss/crossentropy": 2.4705816507339478, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17233440279960632, "step": 11140 }, { "epoch": 0.3481875, "grad_norm": 3.296875, "grad_norm_var": 0.04781901041666667, "learning_rate": 0.0001, "loss": 6.0974, "loss/crossentropy": 2.680031180381775, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18783296644687653, "step": 11142 }, { "epoch": 0.34825, "grad_norm": 3.5625, "grad_norm_var": 0.04947509765625, "learning_rate": 0.0001, "loss": 6.0563, "loss/crossentropy": 2.6336668729782104, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18405906111001968, "step": 11144 }, { "epoch": 0.3483125, "grad_norm": 2.96875, "grad_norm_var": 0.0525787353515625, "learning_rate": 0.0001, "loss": 5.5608, "loss/crossentropy": 2.3576338291168213, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1714879646897316, "step": 11146 }, { "epoch": 0.348375, "grad_norm": 3.296875, "grad_norm_var": 0.05322977701822917, "learning_rate": 0.0001, "loss": 5.7678, "loss/crossentropy": 2.5067286491394043, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17493688315153122, "step": 11148 }, { "epoch": 0.3484375, "grad_norm": 3.25, "grad_norm_var": 0.054459635416666666, "learning_rate": 0.0001, "loss": 5.9017, "loss/crossentropy": 2.5168557167053223, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18418489396572113, "step": 11150 }, { "epoch": 0.3485, "grad_norm": 3.546875, "grad_norm_var": 0.05573628743489583, "learning_rate": 0.0001, "loss": 5.9392, "loss/crossentropy": 2.6039544343948364, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18195894360542297, "step": 11152 }, { "epoch": 0.3485625, "grad_norm": 3.296875, "grad_norm_var": 0.04540608723958333, "learning_rate": 0.0001, "loss": 6.2064, "loss/crossentropy": 2.760994553565979, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18789827078580856, "step": 11154 }, { "epoch": 0.348625, "grad_norm": 3.828125, "grad_norm_var": 0.06337483723958333, "learning_rate": 0.0001, "loss": 6.1188, "loss/crossentropy": 2.54991352558136, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.19790344685316086, "step": 11156 }, { "epoch": 0.3486875, "grad_norm": 3.34375, "grad_norm_var": 0.0599761962890625, "learning_rate": 0.0001, "loss": 5.7801, "loss/crossentropy": 2.4646570682525635, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17724335938692093, "step": 11158 }, { "epoch": 0.34875, "grad_norm": 4.40625, "grad_norm_var": 0.114208984375, "learning_rate": 0.0001, "loss": 6.0655, "loss/crossentropy": 2.5859330892562866, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19014528393745422, "step": 11160 }, { "epoch": 0.3488125, "grad_norm": 3.96875, "grad_norm_var": 0.10943094889322917, "learning_rate": 0.0001, "loss": 6.1698, "loss/crossentropy": 2.6557672023773193, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.19163348525762558, "step": 11162 }, { "epoch": 0.348875, "grad_norm": 3.859375, "grad_norm_var": 0.11806233723958333, "learning_rate": 0.0001, "loss": 6.1783, "loss/crossentropy": 2.7903772592544556, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1844942718744278, "step": 11164 }, { "epoch": 0.3489375, "grad_norm": 4.15625, "grad_norm_var": 0.13076070149739583, "learning_rate": 0.0001, "loss": 5.9133, "loss/crossentropy": 2.4920116662979126, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1901763305068016, "step": 11166 }, { "epoch": 0.349, "grad_norm": 3.359375, "grad_norm_var": 0.13056233723958333, "learning_rate": 0.0001, "loss": 5.9955, "loss/crossentropy": 2.6160982847213745, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.17973970621824265, "step": 11168 }, { "epoch": 0.3490625, "grad_norm": 3.546875, "grad_norm_var": 0.14177958170572916, "learning_rate": 0.0001, "loss": 5.8252, "loss/crossentropy": 2.562760353088379, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1758510321378708, "step": 11170 }, { "epoch": 0.349125, "grad_norm": 3.109375, "grad_norm_var": 0.16461181640625, "learning_rate": 0.0001, "loss": 5.5451, "loss/crossentropy": 2.348025679588318, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17048463970422745, "step": 11172 }, { "epoch": 0.3491875, "grad_norm": 3.390625, "grad_norm_var": 0.172119140625, "learning_rate": 0.0001, "loss": 5.9302, "loss/crossentropy": 2.6501015424728394, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17722531408071518, "step": 11174 }, { "epoch": 0.34925, "grad_norm": 3.28125, "grad_norm_var": 0.1240142822265625, "learning_rate": 0.0001, "loss": 5.4685, "loss/crossentropy": 2.325492262840271, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16390928626060486, "step": 11176 }, { "epoch": 0.3493125, "grad_norm": 3.203125, "grad_norm_var": 0.11184794108072917, "learning_rate": 0.0001, "loss": 6.2105, "loss/crossentropy": 2.7702953815460205, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1889393851161003, "step": 11178 }, { "epoch": 0.349375, "grad_norm": 3.546875, "grad_norm_var": 0.0923980712890625, "learning_rate": 0.0001, "loss": 5.8614, "loss/crossentropy": 2.550577998161316, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1818677932024002, "step": 11180 }, { "epoch": 0.3494375, "grad_norm": 3.453125, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 5.9858, "loss/crossentropy": 2.600010871887207, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18467383831739426, "step": 11182 }, { "epoch": 0.3495, "grad_norm": 3.578125, "grad_norm_var": 0.06024983723958333, "learning_rate": 0.0001, "loss": 6.2648, "loss/crossentropy": 2.7478842735290527, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19387764483690262, "step": 11184 }, { "epoch": 0.3495625, "grad_norm": 3.5625, "grad_norm_var": 0.056864420572916664, "learning_rate": 0.0001, "loss": 5.759, "loss/crossentropy": 2.4200514554977417, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1753026694059372, "step": 11186 }, { "epoch": 0.349625, "grad_norm": 3.34375, "grad_norm_var": 0.047728474934895834, "learning_rate": 0.0001, "loss": 5.8962, "loss/crossentropy": 2.652923583984375, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17393695563077927, "step": 11188 }, { "epoch": 0.3496875, "grad_norm": 3.453125, "grad_norm_var": 0.04353739420572917, "learning_rate": 0.0001, "loss": 5.4828, "loss/crossentropy": 2.2370325326919556, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.16676458716392517, "step": 11190 }, { "epoch": 0.34975, "grad_norm": 3.234375, "grad_norm_var": 0.04783426920572917, "learning_rate": 0.0001, "loss": 5.9645, "loss/crossentropy": 2.518808126449585, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1894880086183548, "step": 11192 }, { "epoch": 0.3498125, "grad_norm": 3.46875, "grad_norm_var": 0.038426717122395836, "learning_rate": 0.0001, "loss": 5.8168, "loss/crossentropy": 2.4844318628311157, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17894314229488373, "step": 11194 }, { "epoch": 0.349875, "grad_norm": 3.578125, "grad_norm_var": 0.03642578125, "learning_rate": 0.0001, "loss": 6.2959, "loss/crossentropy": 2.748618245124817, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19730991125106812, "step": 11196 }, { "epoch": 0.3499375, "grad_norm": 5.78125, "grad_norm_var": 0.3701568603515625, "learning_rate": 0.0001, "loss": 5.8812, "loss/crossentropy": 2.458068370819092, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18528541177511215, "step": 11198 }, { "epoch": 0.35, "grad_norm": 3.484375, "grad_norm_var": 0.37374674479166664, "learning_rate": 0.0001, "loss": 5.9547, "loss/crossentropy": 2.5829252004623413, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1863955408334732, "step": 11200 }, { "epoch": 0.3500625, "grad_norm": 4.125, "grad_norm_var": 0.38059895833333335, "learning_rate": 0.0001, "loss": 6.2833, "loss/crossentropy": 2.7533146142959595, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19557969272136688, "step": 11202 }, { "epoch": 0.350125, "grad_norm": 3.4375, "grad_norm_var": 0.363525390625, "learning_rate": 0.0001, "loss": 5.8647, "loss/crossentropy": 2.588066577911377, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17375587671995163, "step": 11204 }, { "epoch": 0.3501875, "grad_norm": 3.25, "grad_norm_var": 0.37858784993489586, "learning_rate": 0.0001, "loss": 6.1644, "loss/crossentropy": 2.754800796508789, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18470770865678787, "step": 11206 }, { "epoch": 0.35025, "grad_norm": 3.515625, "grad_norm_var": 0.36643778483072914, "learning_rate": 0.0001, "loss": 5.9204, "loss/crossentropy": 2.622808814048767, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1793663278222084, "step": 11208 }, { "epoch": 0.3503125, "grad_norm": 3.265625, "grad_norm_var": 0.3826568603515625, "learning_rate": 0.0001, "loss": 5.8128, "loss/crossentropy": 2.534000873565674, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17632145434617996, "step": 11210 }, { "epoch": 0.350375, "grad_norm": 3.421875, "grad_norm_var": 0.38606363932291665, "learning_rate": 0.0001, "loss": 6.2683, "loss/crossentropy": 2.803617000579834, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19177913665771484, "step": 11212 }, { "epoch": 0.3504375, "grad_norm": 4.15625, "grad_norm_var": 0.07792561848958333, "learning_rate": 0.0001, "loss": 5.7733, "loss/crossentropy": 2.41003954410553, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.17499523609876633, "step": 11214 }, { "epoch": 0.3505, "grad_norm": 3.359375, "grad_norm_var": 0.08134663899739583, "learning_rate": 0.0001, "loss": 6.0462, "loss/crossentropy": 2.616556763648987, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18906167149543762, "step": 11216 }, { "epoch": 0.3505625, "grad_norm": 3.234375, "grad_norm_var": 0.05584309895833333, "learning_rate": 0.0001, "loss": 5.9943, "loss/crossentropy": 2.5787479877471924, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1853073462843895, "step": 11218 }, { "epoch": 0.350625, "grad_norm": 3.328125, "grad_norm_var": 0.0581451416015625, "learning_rate": 0.0001, "loss": 5.8301, "loss/crossentropy": 2.56949520111084, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17606385052204132, "step": 11220 }, { "epoch": 0.3506875, "grad_norm": 3.4375, "grad_norm_var": 0.062483723958333334, "learning_rate": 0.0001, "loss": 5.8622, "loss/crossentropy": 2.6186574697494507, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17240316420793533, "step": 11222 }, { "epoch": 0.35075, "grad_norm": 3.140625, "grad_norm_var": 0.06483968098958333, "learning_rate": 0.0001, "loss": 5.9472, "loss/crossentropy": 2.6057368516921997, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17946264147758484, "step": 11224 }, { "epoch": 0.3508125, "grad_norm": 3.078125, "grad_norm_var": 0.06871337890625, "learning_rate": 0.0001, "loss": 6.3552, "loss/crossentropy": 2.9567378759384155, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18516208976507187, "step": 11226 }, { "epoch": 0.350875, "grad_norm": 4.0625, "grad_norm_var": 0.09927978515625, "learning_rate": 0.0001, "loss": 5.7293, "loss/crossentropy": 2.478781580924988, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17505527287721634, "step": 11228 }, { "epoch": 0.3509375, "grad_norm": 3.234375, "grad_norm_var": 0.06197001139322917, "learning_rate": 0.0001, "loss": 5.6721, "loss/crossentropy": 2.4796438217163086, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16728852689266205, "step": 11230 }, { "epoch": 0.351, "grad_norm": 3.3125, "grad_norm_var": 0.0557281494140625, "learning_rate": 0.0001, "loss": 5.7428, "loss/crossentropy": 2.528485894203186, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1686977818608284, "step": 11232 }, { "epoch": 0.3510625, "grad_norm": 3.203125, "grad_norm_var": 0.0560211181640625, "learning_rate": 0.0001, "loss": 5.9358, "loss/crossentropy": 2.5955309867858887, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18168071657419205, "step": 11234 }, { "epoch": 0.351125, "grad_norm": 2.921875, "grad_norm_var": 0.06412353515625, "learning_rate": 0.0001, "loss": 5.6136, "loss/crossentropy": 2.4391366243362427, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16861621290445328, "step": 11236 }, { "epoch": 0.3511875, "grad_norm": 3.1875, "grad_norm_var": 0.06402587890625, "learning_rate": 0.0001, "loss": 5.7673, "loss/crossentropy": 2.608486533164978, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1686142310500145, "step": 11238 }, { "epoch": 0.35125, "grad_norm": 3.5625, "grad_norm_var": 0.06883036295572917, "learning_rate": 0.0001, "loss": 5.9049, "loss/crossentropy": 2.573819637298584, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17998070269823074, "step": 11240 }, { "epoch": 0.3513125, "grad_norm": 3.546875, "grad_norm_var": 0.07077534993489583, "learning_rate": 0.0001, "loss": 5.8188, "loss/crossentropy": 2.440657377243042, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18117434531450272, "step": 11242 }, { "epoch": 0.351375, "grad_norm": 3.546875, "grad_norm_var": 0.034830729166666664, "learning_rate": 0.0001, "loss": 6.0702, "loss/crossentropy": 2.7077648639678955, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18272359669208527, "step": 11244 }, { "epoch": 0.3514375, "grad_norm": 3.390625, "grad_norm_var": 0.0373199462890625, "learning_rate": 0.0001, "loss": 5.7257, "loss/crossentropy": 2.380261778831482, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18180926144123077, "step": 11246 }, { "epoch": 0.3515, "grad_norm": 3.234375, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 6.2033, "loss/crossentropy": 2.7489737272262573, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.189968541264534, "step": 11248 }, { "epoch": 0.3515625, "grad_norm": 3.40625, "grad_norm_var": 0.034154256184895836, "learning_rate": 0.0001, "loss": 5.8867, "loss/crossentropy": 2.6347585916519165, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1720653623342514, "step": 11250 }, { "epoch": 0.351625, "grad_norm": 3.65625, "grad_norm_var": 0.033610026041666664, "learning_rate": 0.0001, "loss": 5.5141, "loss/crossentropy": 2.294405937194824, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16962312906980515, "step": 11252 }, { "epoch": 0.3516875, "grad_norm": 3.46875, "grad_norm_var": 0.020588175455729166, "learning_rate": 0.0001, "loss": 5.6743, "loss/crossentropy": 2.416317582130432, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1695522665977478, "step": 11254 }, { "epoch": 0.35175, "grad_norm": 3.34375, "grad_norm_var": 0.0200347900390625, "learning_rate": 0.0001, "loss": 5.6698, "loss/crossentropy": 2.437352418899536, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17363091558218002, "step": 11256 }, { "epoch": 0.3518125, "grad_norm": 2.90625, "grad_norm_var": 0.04293212890625, "learning_rate": 0.0001, "loss": 5.5666, "loss/crossentropy": 2.447490692138672, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16581828147172928, "step": 11258 }, { "epoch": 0.351875, "grad_norm": 3.34375, "grad_norm_var": 0.04511311848958333, "learning_rate": 0.0001, "loss": 5.714, "loss/crossentropy": 2.4743552207946777, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17201179265975952, "step": 11260 }, { "epoch": 0.3519375, "grad_norm": 3.359375, "grad_norm_var": 0.042780558268229164, "learning_rate": 0.0001, "loss": 5.7087, "loss/crossentropy": 2.4358839988708496, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17533152550458908, "step": 11262 }, { "epoch": 0.352, "grad_norm": 3.40625, "grad_norm_var": 0.04302978515625, "learning_rate": 0.0001, "loss": 5.9783, "loss/crossentropy": 2.613458275794983, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18062026798725128, "step": 11264 }, { "epoch": 0.3520625, "grad_norm": 3.484375, "grad_norm_var": 0.0494781494140625, "learning_rate": 0.0001, "loss": 6.0169, "loss/crossentropy": 2.6738606691360474, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18235048651695251, "step": 11266 }, { "epoch": 0.352125, "grad_norm": 3.25, "grad_norm_var": 0.030757649739583334, "learning_rate": 0.0001, "loss": 5.932, "loss/crossentropy": 2.573430895805359, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17921525239944458, "step": 11268 }, { "epoch": 0.3521875, "grad_norm": 3.078125, "grad_norm_var": 0.02935791015625, "learning_rate": 0.0001, "loss": 6.0359, "loss/crossentropy": 2.6839070320129395, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1812937706708908, "step": 11270 }, { "epoch": 0.35225, "grad_norm": 3.421875, "grad_norm_var": 0.030907185872395833, "learning_rate": 0.0001, "loss": 6.0565, "loss/crossentropy": 2.701889157295227, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18155021220445633, "step": 11272 }, { "epoch": 0.3523125, "grad_norm": 3.21875, "grad_norm_var": 0.0376617431640625, "learning_rate": 0.0001, "loss": 5.5597, "loss/crossentropy": 2.294955849647522, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17335163056850433, "step": 11274 }, { "epoch": 0.352375, "grad_norm": 3.1875, "grad_norm_var": 0.03697509765625, "learning_rate": 0.0001, "loss": 5.8427, "loss/crossentropy": 2.4607044458389282, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18351703882217407, "step": 11276 }, { "epoch": 0.3524375, "grad_norm": 3.28125, "grad_norm_var": 0.036604817708333334, "learning_rate": 0.0001, "loss": 5.9594, "loss/crossentropy": 2.536848306655884, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18873773515224457, "step": 11278 }, { "epoch": 0.3525, "grad_norm": 3.46875, "grad_norm_var": 0.037385050455729166, "learning_rate": 0.0001, "loss": 6.1033, "loss/crossentropy": 2.7329437732696533, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1827412247657776, "step": 11280 }, { "epoch": 0.3525625, "grad_norm": 3.65625, "grad_norm_var": 0.0367095947265625, "learning_rate": 0.0001, "loss": 6.0466, "loss/crossentropy": 2.6824004650115967, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17860594391822815, "step": 11282 }, { "epoch": 0.352625, "grad_norm": 3.453125, "grad_norm_var": 0.0357574462890625, "learning_rate": 0.0001, "loss": 5.8863, "loss/crossentropy": 2.61265230178833, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1738537773489952, "step": 11284 }, { "epoch": 0.3526875, "grad_norm": 3.40625, "grad_norm_var": 0.04616597493489583, "learning_rate": 0.0001, "loss": 6.1326, "loss/crossentropy": 2.674401044845581, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18683121353387833, "step": 11286 }, { "epoch": 0.35275, "grad_norm": 3.359375, "grad_norm_var": 0.04273681640625, "learning_rate": 0.0001, "loss": 6.1229, "loss/crossentropy": 2.6818618774414062, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19020012766122818, "step": 11288 }, { "epoch": 0.3528125, "grad_norm": 3.140625, "grad_norm_var": 0.04355061848958333, "learning_rate": 0.0001, "loss": 5.7223, "loss/crossentropy": 2.535930633544922, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16863670200109482, "step": 11290 }, { "epoch": 0.352875, "grad_norm": 3.515625, "grad_norm_var": 0.05386962890625, "learning_rate": 0.0001, "loss": 5.8413, "loss/crossentropy": 2.5034278631210327, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1818310022354126, "step": 11292 }, { "epoch": 0.3529375, "grad_norm": 3.515625, "grad_norm_var": 0.05308329264322917, "learning_rate": 0.0001, "loss": 5.9111, "loss/crossentropy": 2.5392953157424927, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1813190057873726, "step": 11294 }, { "epoch": 0.353, "grad_norm": 3.359375, "grad_norm_var": 0.05386962890625, "learning_rate": 0.0001, "loss": 5.6253, "loss/crossentropy": 2.42446768283844, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17008384317159653, "step": 11296 }, { "epoch": 0.3530625, "grad_norm": 3.0625, "grad_norm_var": 0.05767822265625, "learning_rate": 0.0001, "loss": 5.7541, "loss/crossentropy": 2.5158231258392334, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17578310519456863, "step": 11298 }, { "epoch": 0.353125, "grad_norm": 3.171875, "grad_norm_var": 0.057417805989583334, "learning_rate": 0.0001, "loss": 6.1198, "loss/crossentropy": 2.7350213527679443, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18262308090925217, "step": 11300 }, { "epoch": 0.3531875, "grad_norm": 3.234375, "grad_norm_var": 0.02818603515625, "learning_rate": 0.0001, "loss": 5.8617, "loss/crossentropy": 2.554205536842346, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17722924798727036, "step": 11302 }, { "epoch": 0.35325, "grad_norm": 3.578125, "grad_norm_var": 0.03394775390625, "learning_rate": 0.0001, "loss": 6.1269, "loss/crossentropy": 2.652440905570984, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19080623984336853, "step": 11304 }, { "epoch": 0.3533125, "grad_norm": 3.234375, "grad_norm_var": 0.0314117431640625, "learning_rate": 0.0001, "loss": 5.8889, "loss/crossentropy": 2.5206410884857178, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1809711903333664, "step": 11306 }, { "epoch": 0.353375, "grad_norm": 3.296875, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 5.8617, "loss/crossentropy": 2.5516124963760376, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1786694973707199, "step": 11308 }, { "epoch": 0.3534375, "grad_norm": 3.328125, "grad_norm_var": 0.017545572916666665, "learning_rate": 0.0001, "loss": 5.9227, "loss/crossentropy": 2.5877593755722046, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18154260516166687, "step": 11310 }, { "epoch": 0.3535, "grad_norm": 3.25, "grad_norm_var": 0.0334136962890625, "learning_rate": 0.0001, "loss": 6.1391, "loss/crossentropy": 2.717615008354187, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18902809917926788, "step": 11312 }, { "epoch": 0.3535625, "grad_norm": 3.28125, "grad_norm_var": 0.0310699462890625, "learning_rate": 0.0001, "loss": 5.949, "loss/crossentropy": 2.5918458700180054, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18336959183216095, "step": 11314 }, { "epoch": 0.353625, "grad_norm": 3.3125, "grad_norm_var": 0.029752604166666665, "learning_rate": 0.0001, "loss": 5.7579, "loss/crossentropy": 2.500713586807251, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1753242462873459, "step": 11316 }, { "epoch": 0.3536875, "grad_norm": 3.4375, "grad_norm_var": 0.03313802083333333, "learning_rate": 0.0001, "loss": 5.6947, "loss/crossentropy": 2.4850414991378784, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.170186847448349, "step": 11318 }, { "epoch": 0.35375, "grad_norm": 3.703125, "grad_norm_var": 0.033919270833333334, "learning_rate": 0.0001, "loss": 5.7845, "loss/crossentropy": 2.454130530357361, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17952131479978561, "step": 11320 }, { "epoch": 0.3538125, "grad_norm": 3.5625, "grad_norm_var": 0.04283447265625, "learning_rate": 0.0001, "loss": 5.8565, "loss/crossentropy": 2.5047308206558228, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1820513680577278, "step": 11322 }, { "epoch": 0.353875, "grad_norm": 3.0625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 5.7061, "loss/crossentropy": 2.4678107500076294, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17343752086162567, "step": 11324 }, { "epoch": 0.3539375, "grad_norm": 3.140625, "grad_norm_var": 0.04939676920572917, "learning_rate": 0.0001, "loss": 5.8189, "loss/crossentropy": 2.5566846132278442, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1758304387331009, "step": 11326 }, { "epoch": 0.354, "grad_norm": 3.34375, "grad_norm_var": 0.03601786295572917, "learning_rate": 0.0001, "loss": 6.1119, "loss/crossentropy": 2.7197126150131226, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1864854320883751, "step": 11328 }, { "epoch": 0.3540625, "grad_norm": 3.296875, "grad_norm_var": 0.04797770182291667, "learning_rate": 0.0001, "loss": 5.9145, "loss/crossentropy": 2.634163022041321, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17569313943386078, "step": 11330 }, { "epoch": 0.354125, "grad_norm": 3.140625, "grad_norm_var": 0.050146484375, "learning_rate": 0.0001, "loss": 5.8728, "loss/crossentropy": 2.5763657093048096, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17691173404455185, "step": 11332 }, { "epoch": 0.3541875, "grad_norm": 3.234375, "grad_norm_var": 0.04676106770833333, "learning_rate": 0.0001, "loss": 6.14, "loss/crossentropy": 2.7341185808181763, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1866796761751175, "step": 11334 }, { "epoch": 0.35425, "grad_norm": 3.234375, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 5.9644, "loss/crossentropy": 2.674552321434021, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17703347653150558, "step": 11336 }, { "epoch": 0.3543125, "grad_norm": 3.328125, "grad_norm_var": 2.8357167561848957, "learning_rate": 0.0001, "loss": 5.8163, "loss/crossentropy": 2.3537384271621704, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18922463059425354, "step": 11338 }, { "epoch": 0.354375, "grad_norm": 3.140625, "grad_norm_var": 2.81578369140625, "learning_rate": 0.0001, "loss": 5.97, "loss/crossentropy": 2.622086763381958, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.18674640357494354, "step": 11340 }, { "epoch": 0.3544375, "grad_norm": 3.0625, "grad_norm_var": 2.811994425455729, "learning_rate": 0.0001, "loss": 5.8829, "loss/crossentropy": 2.6072442531585693, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17404572665691376, "step": 11342 }, { "epoch": 0.3545, "grad_norm": 3.765625, "grad_norm_var": 2.8266886393229167, "learning_rate": 0.0001, "loss": 5.7161, "loss/crossentropy": 2.516643524169922, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17112035304307938, "step": 11344 }, { "epoch": 0.3545625, "grad_norm": 3.046875, "grad_norm_var": 2.8582916259765625, "learning_rate": 0.0001, "loss": 5.7841, "loss/crossentropy": 2.5356485843658447, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17562797665596008, "step": 11346 }, { "epoch": 0.354625, "grad_norm": 3.15625, "grad_norm_var": 2.874381510416667, "learning_rate": 0.0001, "loss": 5.5878, "loss/crossentropy": 2.4020326137542725, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1681831330060959, "step": 11348 }, { "epoch": 0.3546875, "grad_norm": 3.453125, "grad_norm_var": 2.861034138997396, "learning_rate": 0.0001, "loss": 5.9719, "loss/crossentropy": 2.6108627319335938, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1853175312280655, "step": 11350 }, { "epoch": 0.35475, "grad_norm": 3.34375, "grad_norm_var": 2.86441650390625, "learning_rate": 0.0001, "loss": 5.8816, "loss/crossentropy": 2.545454263687134, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18048831820487976, "step": 11352 }, { "epoch": 0.3548125, "grad_norm": 3.828125, "grad_norm_var": 0.07869364420572916, "learning_rate": 0.0001, "loss": 6.3151, "loss/crossentropy": 2.843619704246521, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19011834263801575, "step": 11354 }, { "epoch": 0.354875, "grad_norm": 3.203125, "grad_norm_var": 0.05623270670572917, "learning_rate": 0.0001, "loss": 5.6727, "loss/crossentropy": 2.3813884258270264, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17522287368774414, "step": 11356 }, { "epoch": 0.3549375, "grad_norm": 3.296875, "grad_norm_var": 0.0600494384765625, "learning_rate": 0.0001, "loss": 5.7162, "loss/crossentropy": 2.4047796726226807, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1768423169851303, "step": 11358 }, { "epoch": 0.355, "grad_norm": 3.53125, "grad_norm_var": 0.0487945556640625, "learning_rate": 0.0001, "loss": 5.6889, "loss/crossentropy": 2.43475878238678, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17385368049144745, "step": 11360 }, { "epoch": 0.3550625, "grad_norm": 3.28125, "grad_norm_var": 0.04837239583333333, "learning_rate": 0.0001, "loss": 5.7149, "loss/crossentropy": 2.4121944904327393, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17792222648859024, "step": 11362 }, { "epoch": 0.355125, "grad_norm": 3.375, "grad_norm_var": 0.038309733072916664, "learning_rate": 0.0001, "loss": 5.8767, "loss/crossentropy": 2.5436421632766724, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1762741208076477, "step": 11364 }, { "epoch": 0.3551875, "grad_norm": 3.5625, "grad_norm_var": 0.44999593098958335, "learning_rate": 0.0001, "loss": 6.2411, "loss/crossentropy": 2.7405037879943848, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.19888553023338318, "step": 11366 }, { "epoch": 0.35525, "grad_norm": 3.234375, "grad_norm_var": 0.46381734212239584, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.51580548286438, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16928555816411972, "step": 11368 }, { "epoch": 0.3553125, "grad_norm": 3.421875, "grad_norm_var": 0.45828348795572915, "learning_rate": 0.0001, "loss": 5.8859, "loss/crossentropy": 2.5942893028259277, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17525933682918549, "step": 11370 }, { "epoch": 0.355375, "grad_norm": 3.25, "grad_norm_var": 0.46943359375, "learning_rate": 0.0001, "loss": 5.867, "loss/crossentropy": 2.6768126487731934, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17019033432006836, "step": 11372 }, { "epoch": 0.3554375, "grad_norm": 3.0625, "grad_norm_var": 0.47509765625, "learning_rate": 0.0001, "loss": 5.6296, "loss/crossentropy": 2.4368897676467896, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16771049797534943, "step": 11374 }, { "epoch": 0.3555, "grad_norm": 3.515625, "grad_norm_var": 0.47421875, "learning_rate": 0.0001, "loss": 5.7442, "loss/crossentropy": 2.4130361080169678, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1745217740535736, "step": 11376 }, { "epoch": 0.3555625, "grad_norm": 3.453125, "grad_norm_var": 0.4614410400390625, "learning_rate": 0.0001, "loss": 5.7711, "loss/crossentropy": 2.423428177833557, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17969273030757904, "step": 11378 }, { "epoch": 0.355625, "grad_norm": 3.796875, "grad_norm_var": 0.6179758707682291, "learning_rate": 0.0001, "loss": 5.9717, "loss/crossentropy": 2.619338870048523, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17977125942707062, "step": 11380 }, { "epoch": 0.3556875, "grad_norm": 3.203125, "grad_norm_var": 0.24495442708333334, "learning_rate": 0.0001, "loss": 6.052, "loss/crossentropy": 2.606391429901123, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18830808997154236, "step": 11382 }, { "epoch": 0.35575, "grad_norm": 3.21875, "grad_norm_var": 0.2530426025390625, "learning_rate": 0.0001, "loss": 5.6287, "loss/crossentropy": 2.441736340522766, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1694810837507248, "step": 11384 }, { "epoch": 0.3558125, "grad_norm": 3.4375, "grad_norm_var": 0.27371317545572915, "learning_rate": 0.0001, "loss": 5.8177, "loss/crossentropy": 2.563707709312439, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17344237864017487, "step": 11386 }, { "epoch": 0.355875, "grad_norm": 3.15625, "grad_norm_var": 0.2701334635416667, "learning_rate": 0.0001, "loss": 5.8076, "loss/crossentropy": 2.5102453231811523, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1742628961801529, "step": 11388 }, { "epoch": 0.3559375, "grad_norm": 3.390625, "grad_norm_var": 0.26374409993489584, "learning_rate": 0.0001, "loss": 5.8307, "loss/crossentropy": 2.5091181993484497, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17981696128845215, "step": 11390 }, { "epoch": 0.356, "grad_norm": 3.78125, "grad_norm_var": 0.2713704427083333, "learning_rate": 0.0001, "loss": 5.6833, "loss/crossentropy": 2.3582528829574585, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17821084707975388, "step": 11392 }, { "epoch": 0.3560625, "grad_norm": 3.640625, "grad_norm_var": 0.27932840983072915, "learning_rate": 0.0001, "loss": 5.9741, "loss/crossentropy": 2.55366849899292, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18618155270814896, "step": 11394 }, { "epoch": 0.356125, "grad_norm": 3.03125, "grad_norm_var": 0.07580464680989583, "learning_rate": 0.0001, "loss": 5.8154, "loss/crossentropy": 2.4842296838760376, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18037919700145721, "step": 11396 }, { "epoch": 0.3561875, "grad_norm": 3.4375, "grad_norm_var": 0.0642242431640625, "learning_rate": 0.0001, "loss": 6.0091, "loss/crossentropy": 2.615309715270996, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1885964423418045, "step": 11398 }, { "epoch": 0.35625, "grad_norm": 3.25, "grad_norm_var": 0.058568318684895836, "learning_rate": 0.0001, "loss": 5.5899, "loss/crossentropy": 2.429373621940613, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16722938418388367, "step": 11400 }, { "epoch": 0.3563125, "grad_norm": 4.5625, "grad_norm_var": 0.14453125, "learning_rate": 0.0001, "loss": 6.0887, "loss/crossentropy": 2.6522120237350464, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18935363739728928, "step": 11402 }, { "epoch": 0.356375, "grad_norm": 3.40625, "grad_norm_var": 0.13702799479166666, "learning_rate": 0.0001, "loss": 6.0122, "loss/crossentropy": 2.635712504386902, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18452216684818268, "step": 11404 }, { "epoch": 0.3564375, "grad_norm": 3.25, "grad_norm_var": 0.13648681640625, "learning_rate": 0.0001, "loss": 5.8178, "loss/crossentropy": 2.485180974006653, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1777970865368843, "step": 11406 }, { "epoch": 0.3565, "grad_norm": 3.3125, "grad_norm_var": 0.12939453125, "learning_rate": 0.0001, "loss": 5.8076, "loss/crossentropy": 2.5296874046325684, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17544680833816528, "step": 11408 }, { "epoch": 0.3565625, "grad_norm": 3.21875, "grad_norm_var": 0.12379150390625, "learning_rate": 0.0001, "loss": 5.9481, "loss/crossentropy": 2.6227307319641113, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1809762418270111, "step": 11410 }, { "epoch": 0.356625, "grad_norm": 3.0625, "grad_norm_var": 0.1189849853515625, "learning_rate": 0.0001, "loss": 5.8063, "loss/crossentropy": 2.5536731481552124, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17096972465515137, "step": 11412 }, { "epoch": 0.3566875, "grad_norm": 3.40625, "grad_norm_var": 0.1249908447265625, "learning_rate": 0.0001, "loss": 6.0507, "loss/crossentropy": 2.648218274116516, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18946433812379837, "step": 11414 }, { "epoch": 0.35675, "grad_norm": 3.078125, "grad_norm_var": 0.1256744384765625, "learning_rate": 0.0001, "loss": 5.9262, "loss/crossentropy": 2.59994637966156, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17950069904327393, "step": 11416 }, { "epoch": 0.3568125, "grad_norm": 3.296875, "grad_norm_var": 0.022294108072916666, "learning_rate": 0.0001, "loss": 6.0349, "loss/crossentropy": 2.742589235305786, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1776665523648262, "step": 11418 }, { "epoch": 0.356875, "grad_norm": 3.15625, "grad_norm_var": 0.020685831705729168, "learning_rate": 0.0001, "loss": 5.7908, "loss/crossentropy": 2.485624313354492, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1750485599040985, "step": 11420 }, { "epoch": 0.3569375, "grad_norm": 3.234375, "grad_norm_var": 0.0204498291015625, "learning_rate": 0.0001, "loss": 5.9228, "loss/crossentropy": 2.6894630193710327, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.172164686024189, "step": 11422 }, { "epoch": 0.357, "grad_norm": 3.171875, "grad_norm_var": 0.02359619140625, "learning_rate": 0.0001, "loss": 6.0369, "loss/crossentropy": 2.658891439437866, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18428927659988403, "step": 11424 }, { "epoch": 0.3570625, "grad_norm": 3.4375, "grad_norm_var": 0.0354400634765625, "learning_rate": 0.0001, "loss": 5.8742, "loss/crossentropy": 2.530714273452759, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18005235493183136, "step": 11426 }, { "epoch": 0.357125, "grad_norm": 3.40625, "grad_norm_var": 0.037890625, "learning_rate": 0.0001, "loss": 6.1503, "loss/crossentropy": 2.7310314178466797, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1864599734544754, "step": 11428 }, { "epoch": 0.3571875, "grad_norm": 3.0625, "grad_norm_var": 0.0415679931640625, "learning_rate": 0.0001, "loss": 5.6779, "loss/crossentropy": 2.4858238697052, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1688140332698822, "step": 11430 }, { "epoch": 0.35725, "grad_norm": 3.140625, "grad_norm_var": 0.040192667643229166, "learning_rate": 0.0001, "loss": 5.7823, "loss/crossentropy": 2.5231053829193115, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17475000023841858, "step": 11432 }, { "epoch": 0.3573125, "grad_norm": 3.1875, "grad_norm_var": 0.041890462239583336, "learning_rate": 0.0001, "loss": 5.9383, "loss/crossentropy": 2.6606061458587646, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.175423301756382, "step": 11434 }, { "epoch": 0.357375, "grad_norm": 3.734375, "grad_norm_var": 0.05413411458333333, "learning_rate": 0.0001, "loss": 5.7, "loss/crossentropy": 2.3909060955047607, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17543992400169373, "step": 11436 }, { "epoch": 0.3574375, "grad_norm": 3.0, "grad_norm_var": 0.06560872395833334, "learning_rate": 0.0001, "loss": 5.6324, "loss/crossentropy": 2.520796775817871, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16311749815940857, "step": 11438 }, { "epoch": 0.3575, "grad_norm": 3.28125, "grad_norm_var": 0.06467692057291667, "learning_rate": 0.0001, "loss": 5.521, "loss/crossentropy": 2.355831027030945, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16808204352855682, "step": 11440 }, { "epoch": 0.3575625, "grad_norm": 3.53125, "grad_norm_var": 0.05696512858072917, "learning_rate": 0.0001, "loss": 5.8167, "loss/crossentropy": 2.5602900981903076, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17251737415790558, "step": 11442 }, { "epoch": 0.357625, "grad_norm": 3.46875, "grad_norm_var": 0.0509765625, "learning_rate": 0.0001, "loss": 5.8388, "loss/crossentropy": 2.6111416816711426, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17237364500761032, "step": 11444 }, { "epoch": 0.3576875, "grad_norm": 3.609375, "grad_norm_var": 0.0578765869140625, "learning_rate": 0.0001, "loss": 6.075, "loss/crossentropy": 2.681327700614929, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18428415060043335, "step": 11446 }, { "epoch": 0.35775, "grad_norm": 3.09375, "grad_norm_var": 0.0592193603515625, "learning_rate": 0.0001, "loss": 5.8754, "loss/crossentropy": 2.6480273008346558, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1723475679755211, "step": 11448 }, { "epoch": 0.3578125, "grad_norm": 3.375, "grad_norm_var": 0.055501302083333336, "learning_rate": 0.0001, "loss": 6.0198, "loss/crossentropy": 2.7274798154830933, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1784529834985733, "step": 11450 }, { "epoch": 0.357875, "grad_norm": 3.484375, "grad_norm_var": 0.04488525390625, "learning_rate": 0.0001, "loss": 6.2585, "loss/crossentropy": 2.8555736541748047, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18521306663751602, "step": 11452 }, { "epoch": 0.3579375, "grad_norm": 3.203125, "grad_norm_var": 0.03599853515625, "learning_rate": 0.0001, "loss": 5.8051, "loss/crossentropy": 2.4731528759002686, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1800696775317192, "step": 11454 }, { "epoch": 0.358, "grad_norm": 3.4375, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 6.0278, "loss/crossentropy": 2.633829951286316, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18431830406188965, "step": 11456 }, { "epoch": 0.3580625, "grad_norm": 3.375, "grad_norm_var": 0.032450358072916664, "learning_rate": 0.0001, "loss": 6.1159, "loss/crossentropy": 2.7020633220672607, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1851382553577423, "step": 11458 }, { "epoch": 0.358125, "grad_norm": 3.046875, "grad_norm_var": 0.029523722330729165, "learning_rate": 0.0001, "loss": 5.5757, "loss/crossentropy": 2.4457303285598755, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1661197990179062, "step": 11460 }, { "epoch": 0.3581875, "grad_norm": 3.328125, "grad_norm_var": 0.021923828125, "learning_rate": 0.0001, "loss": 6.0214, "loss/crossentropy": 2.771443009376526, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17265425622463226, "step": 11462 }, { "epoch": 0.35825, "grad_norm": 3.15625, "grad_norm_var": 0.019331868489583334, "learning_rate": 0.0001, "loss": 5.6269, "loss/crossentropy": 2.448089599609375, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16787806153297424, "step": 11464 }, { "epoch": 0.3583125, "grad_norm": 3.71875, "grad_norm_var": 0.035481770833333336, "learning_rate": 0.0001, "loss": 5.9265, "loss/crossentropy": 2.513920307159424, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18305665999650955, "step": 11466 }, { "epoch": 0.358375, "grad_norm": 3.578125, "grad_norm_var": 0.333203125, "learning_rate": 0.0001, "loss": 5.9521, "loss/crossentropy": 2.5828335285186768, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18028418719768524, "step": 11468 }, { "epoch": 0.3584375, "grad_norm": 3.46875, "grad_norm_var": 0.33818359375, "learning_rate": 0.0001, "loss": 5.839, "loss/crossentropy": 2.5756973028182983, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17281711846590042, "step": 11470 }, { "epoch": 0.3585, "grad_norm": 3.46875, "grad_norm_var": 0.32499593098958335, "learning_rate": 0.0001, "loss": 5.5674, "loss/crossentropy": 2.346805453300476, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1712806150317192, "step": 11472 }, { "epoch": 0.3585625, "grad_norm": 3.890625, "grad_norm_var": 0.33482666015625, "learning_rate": 0.0001, "loss": 6.0567, "loss/crossentropy": 2.6429061889648438, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1859058141708374, "step": 11474 }, { "epoch": 0.358625, "grad_norm": 3.0625, "grad_norm_var": 0.3302154541015625, "learning_rate": 0.0001, "loss": 5.918, "loss/crossentropy": 2.584723472595215, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18176878988742828, "step": 11476 }, { "epoch": 0.3586875, "grad_norm": 3.328125, "grad_norm_var": 0.322705078125, "learning_rate": 0.0001, "loss": 6.003, "loss/crossentropy": 2.609718918800354, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1862044632434845, "step": 11478 }, { "epoch": 0.35875, "grad_norm": 3.375, "grad_norm_var": 0.30917561848958336, "learning_rate": 0.0001, "loss": 6.4142, "loss/crossentropy": 2.867684483528137, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1933259814977646, "step": 11480 }, { "epoch": 0.3588125, "grad_norm": 3.21875, "grad_norm_var": 0.330126953125, "learning_rate": 0.0001, "loss": 5.7085, "loss/crossentropy": 2.4196025133132935, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17732993513345718, "step": 11482 }, { "epoch": 0.358875, "grad_norm": 3.375, "grad_norm_var": 0.06323140462239583, "learning_rate": 0.0001, "loss": 5.9215, "loss/crossentropy": 2.6263914108276367, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17716874182224274, "step": 11484 }, { "epoch": 0.3589375, "grad_norm": 3.578125, "grad_norm_var": 0.05761311848958333, "learning_rate": 0.0001, "loss": 5.6976, "loss/crossentropy": 2.416483521461487, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17655231058597565, "step": 11486 }, { "epoch": 0.359, "grad_norm": 7.46875, "grad_norm_var": 1.0815714518229167, "learning_rate": 0.0001, "loss": 6.3502, "loss/crossentropy": 2.7684093713760376, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.1987995207309723, "step": 11488 }, { "epoch": 0.3590625, "grad_norm": 3.65625, "grad_norm_var": 1.0758453369140626, "learning_rate": 0.0001, "loss": 6.0211, "loss/crossentropy": 2.5561872720718384, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1890682429075241, "step": 11490 }, { "epoch": 0.359125, "grad_norm": 3.375, "grad_norm_var": 1.0589752197265625, "learning_rate": 0.0001, "loss": 5.9378, "loss/crossentropy": 2.5617023706436157, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18253500759601593, "step": 11492 }, { "epoch": 0.3591875, "grad_norm": 3.328125, "grad_norm_var": 1.06138916015625, "learning_rate": 0.0001, "loss": 6.02, "loss/crossentropy": 2.614338517189026, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18744519352912903, "step": 11494 }, { "epoch": 0.35925, "grad_norm": 3.09375, "grad_norm_var": 1.0776763916015626, "learning_rate": 0.0001, "loss": 6.0142, "loss/crossentropy": 2.7154823541641235, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17830651253461838, "step": 11496 }, { "epoch": 0.3593125, "grad_norm": 3.6875, "grad_norm_var": 1.07730712890625, "learning_rate": 0.0001, "loss": 5.9975, "loss/crossentropy": 2.6306092739105225, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18395738303661346, "step": 11498 }, { "epoch": 0.359375, "grad_norm": 3.265625, "grad_norm_var": 1.0700510660807292, "learning_rate": 0.0001, "loss": 5.5256, "loss/crossentropy": 2.3245222568511963, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16854803264141083, "step": 11500 }, { "epoch": 0.3594375, "grad_norm": 3.28125, "grad_norm_var": 1.07838134765625, "learning_rate": 0.0001, "loss": 5.9712, "loss/crossentropy": 2.6609901189804077, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1724277064204216, "step": 11502 }, { "epoch": 0.3595, "grad_norm": 3.15625, "grad_norm_var": 0.0346343994140625, "learning_rate": 0.0001, "loss": 5.9798, "loss/crossentropy": 2.6193424463272095, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18292488902807236, "step": 11504 }, { "epoch": 0.3595625, "grad_norm": 3.65625, "grad_norm_var": 0.03511962890625, "learning_rate": 0.0001, "loss": 6.0716, "loss/crossentropy": 2.686578392982483, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18185945600271225, "step": 11506 }, { "epoch": 0.359625, "grad_norm": 3.09375, "grad_norm_var": 0.0396881103515625, "learning_rate": 0.0001, "loss": 5.779, "loss/crossentropy": 2.5633209943771362, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17195884883403778, "step": 11508 }, { "epoch": 0.3596875, "grad_norm": 3.28125, "grad_norm_var": 0.03990478515625, "learning_rate": 0.0001, "loss": 6.0202, "loss/crossentropy": 2.6731693744659424, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18157903850078583, "step": 11510 }, { "epoch": 0.35975, "grad_norm": 3.109375, "grad_norm_var": 0.04692281087239583, "learning_rate": 0.0001, "loss": 5.5234, "loss/crossentropy": 2.385671377182007, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16103439033031464, "step": 11512 }, { "epoch": 0.3598125, "grad_norm": 3.1875, "grad_norm_var": 0.032201131184895836, "learning_rate": 0.0001, "loss": 6.1492, "loss/crossentropy": 2.7553439140319824, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18274714052677155, "step": 11514 }, { "epoch": 0.359875, "grad_norm": 3.390625, "grad_norm_var": 0.03298238118489583, "learning_rate": 0.0001, "loss": 5.9644, "loss/crossentropy": 2.6135441064834595, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17961689829826355, "step": 11516 }, { "epoch": 0.3599375, "grad_norm": 3.015625, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 5.5116, "loss/crossentropy": 2.3820388317108154, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16452037543058395, "step": 11518 }, { "epoch": 0.36, "grad_norm": 3.140625, "grad_norm_var": 0.040827433268229164, "learning_rate": 0.0001, "loss": 5.9463, "loss/crossentropy": 2.649800658226013, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.178085595369339, "step": 11520 }, { "epoch": 0.3600625, "grad_norm": 3.578125, "grad_norm_var": 0.03927408854166667, "learning_rate": 0.0001, "loss": 6.2177, "loss/crossentropy": 2.772137999534607, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19104167073965073, "step": 11522 }, { "epoch": 0.360125, "grad_norm": 3.140625, "grad_norm_var": 0.037821451822916664, "learning_rate": 0.0001, "loss": 6.0252, "loss/crossentropy": 2.7171050310134888, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18002324551343918, "step": 11524 }, { "epoch": 0.3601875, "grad_norm": 3.234375, "grad_norm_var": 0.03765869140625, "learning_rate": 0.0001, "loss": 6.0706, "loss/crossentropy": 2.757015585899353, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18174485862255096, "step": 11526 }, { "epoch": 0.36025, "grad_norm": 3.203125, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 5.4559, "loss/crossentropy": 2.3074045181274414, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16524454951286316, "step": 11528 }, { "epoch": 0.3603125, "grad_norm": 3.40625, "grad_norm_var": 0.03972066243489583, "learning_rate": 0.0001, "loss": 5.7828, "loss/crossentropy": 2.4054633378982544, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18382655084133148, "step": 11530 }, { "epoch": 0.360375, "grad_norm": 3.296875, "grad_norm_var": 0.03899332682291667, "learning_rate": 0.0001, "loss": 5.729, "loss/crossentropy": 2.433566451072693, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1756378412246704, "step": 11532 }, { "epoch": 0.3604375, "grad_norm": 3.203125, "grad_norm_var": 0.03413798014322917, "learning_rate": 0.0001, "loss": 6.1159, "loss/crossentropy": 2.7514678239822388, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1833144798874855, "step": 11534 }, { "epoch": 0.3605, "grad_norm": 3.21875, "grad_norm_var": 0.0281158447265625, "learning_rate": 0.0001, "loss": 6.1491, "loss/crossentropy": 2.819941759109497, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1821325495839119, "step": 11536 }, { "epoch": 0.3605625, "grad_norm": 3.234375, "grad_norm_var": 0.053742472330729166, "learning_rate": 0.0001, "loss": 5.5643, "loss/crossentropy": 2.311951160430908, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1717149093747139, "step": 11538 }, { "epoch": 0.360625, "grad_norm": 3.03125, "grad_norm_var": 0.061644490559895834, "learning_rate": 0.0001, "loss": 5.3732, "loss/crossentropy": 2.313949942588806, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1574876829981804, "step": 11540 }, { "epoch": 0.3606875, "grad_norm": 3.25, "grad_norm_var": 0.0590240478515625, "learning_rate": 0.0001, "loss": 5.8614, "loss/crossentropy": 2.5384668111801147, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1783868819475174, "step": 11542 }, { "epoch": 0.36075, "grad_norm": 3.015625, "grad_norm_var": 0.059000651041666664, "learning_rate": 0.0001, "loss": 5.7281, "loss/crossentropy": 2.412795901298523, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18231303244829178, "step": 11544 }, { "epoch": 0.3608125, "grad_norm": 3.75, "grad_norm_var": 0.08050130208333334, "learning_rate": 0.0001, "loss": 6.1654, "loss/crossentropy": 2.6934750080108643, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18977203965187073, "step": 11546 }, { "epoch": 0.360875, "grad_norm": 3.390625, "grad_norm_var": 0.08046468098958333, "learning_rate": 0.0001, "loss": 6.0536, "loss/crossentropy": 2.67894446849823, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18395385891199112, "step": 11548 }, { "epoch": 0.3609375, "grad_norm": 3.390625, "grad_norm_var": 0.12551167805989583, "learning_rate": 0.0001, "loss": 5.9916, "loss/crossentropy": 2.415665626525879, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.20173435658216476, "step": 11550 }, { "epoch": 0.361, "grad_norm": 3.375, "grad_norm_var": 0.11824442545572916, "learning_rate": 0.0001, "loss": 5.992, "loss/crossentropy": 2.6645971536636353, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18039774894714355, "step": 11552 }, { "epoch": 0.3610625, "grad_norm": 3.1875, "grad_norm_var": 0.0986724853515625, "learning_rate": 0.0001, "loss": 6.0679, "loss/crossentropy": 2.702301025390625, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18304389715194702, "step": 11554 }, { "epoch": 0.361125, "grad_norm": 3.484375, "grad_norm_var": 0.07206624348958333, "learning_rate": 0.0001, "loss": 6.174, "loss/crossentropy": 2.7713425159454346, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18792008608579636, "step": 11556 }, { "epoch": 0.3611875, "grad_norm": 3.734375, "grad_norm_var": 0.07440999348958334, "learning_rate": 0.0001, "loss": 5.9285, "loss/crossentropy": 2.554310441017151, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18585404008626938, "step": 11558 }, { "epoch": 0.36125, "grad_norm": 3.40625, "grad_norm_var": 0.06679280598958333, "learning_rate": 0.0001, "loss": 6.3119, "loss/crossentropy": 2.803977608680725, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19298001378774643, "step": 11560 }, { "epoch": 0.3613125, "grad_norm": 3.203125, "grad_norm_var": 0.06648661295572916, "learning_rate": 0.0001, "loss": 5.8028, "loss/crossentropy": 2.4768543243408203, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17946702241897583, "step": 11562 }, { "epoch": 0.361375, "grad_norm": 3.0625, "grad_norm_var": 0.07428385416666666, "learning_rate": 0.0001, "loss": 5.909, "loss/crossentropy": 2.573415517807007, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1796480119228363, "step": 11564 }, { "epoch": 0.3614375, "grad_norm": 3.421875, "grad_norm_var": 0.04191792805989583, "learning_rate": 0.0001, "loss": 6.1032, "loss/crossentropy": 2.708019971847534, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18404622375965118, "step": 11566 }, { "epoch": 0.3615, "grad_norm": 3.34375, "grad_norm_var": 0.04479166666666667, "learning_rate": 0.0001, "loss": 5.5103, "loss/crossentropy": 2.3572477102279663, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16803553700447083, "step": 11568 }, { "epoch": 0.3615625, "grad_norm": 3.421875, "grad_norm_var": 0.044234212239583334, "learning_rate": 0.0001, "loss": 5.8691, "loss/crossentropy": 2.554656982421875, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18222244083881378, "step": 11570 }, { "epoch": 0.361625, "grad_norm": 3.234375, "grad_norm_var": 0.04781901041666667, "learning_rate": 0.0001, "loss": 5.8138, "loss/crossentropy": 2.56459379196167, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1741422712802887, "step": 11572 }, { "epoch": 0.3616875, "grad_norm": 3.046875, "grad_norm_var": 0.04560546875, "learning_rate": 0.0001, "loss": 5.7767, "loss/crossentropy": 2.5425853729248047, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1753685027360916, "step": 11574 }, { "epoch": 0.36175, "grad_norm": 3.15625, "grad_norm_var": 0.028831990559895833, "learning_rate": 0.0001, "loss": 5.6042, "loss/crossentropy": 2.3334556818008423, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1731710433959961, "step": 11576 }, { "epoch": 0.3618125, "grad_norm": 3.421875, "grad_norm_var": 0.028678385416666667, "learning_rate": 0.0001, "loss": 6.0177, "loss/crossentropy": 2.638232946395874, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1813022494316101, "step": 11578 }, { "epoch": 0.361875, "grad_norm": 2.96875, "grad_norm_var": 0.03369140625, "learning_rate": 0.0001, "loss": 5.3485, "loss/crossentropy": 2.2655311822891235, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15712259709835052, "step": 11580 }, { "epoch": 0.3619375, "grad_norm": 3.375, "grad_norm_var": 0.0190582275390625, "learning_rate": 0.0001, "loss": 6.0837, "loss/crossentropy": 2.672060251235962, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18803488463163376, "step": 11582 }, { "epoch": 0.362, "grad_norm": 2.984375, "grad_norm_var": 0.0309722900390625, "learning_rate": 0.0001, "loss": 5.4147, "loss/crossentropy": 2.373745918273926, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15409798175096512, "step": 11584 }, { "epoch": 0.3620625, "grad_norm": 3.421875, "grad_norm_var": 0.031050618489583334, "learning_rate": 0.0001, "loss": 6.0394, "loss/crossentropy": 2.607384204864502, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.19164405763149261, "step": 11586 }, { "epoch": 0.362125, "grad_norm": 3.375, "grad_norm_var": 0.034601847330729164, "learning_rate": 0.0001, "loss": 5.8228, "loss/crossentropy": 2.6524970531463623, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16819876432418823, "step": 11588 }, { "epoch": 0.3621875, "grad_norm": 3.078125, "grad_norm_var": 0.0370758056640625, "learning_rate": 0.0001, "loss": 5.7826, "loss/crossentropy": 2.526271104812622, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1760205775499344, "step": 11590 }, { "epoch": 0.36225, "grad_norm": 3.1875, "grad_norm_var": 0.04140523274739583, "learning_rate": 0.0001, "loss": 5.73, "loss/crossentropy": 2.4259228706359863, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.174160897731781, "step": 11592 }, { "epoch": 0.3623125, "grad_norm": 3.28125, "grad_norm_var": 0.03748372395833333, "learning_rate": 0.0001, "loss": 5.8936, "loss/crossentropy": 2.581678628921509, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17494133859872818, "step": 11594 }, { "epoch": 0.362375, "grad_norm": 3.390625, "grad_norm_var": 0.043553670247395836, "learning_rate": 0.0001, "loss": 5.814, "loss/crossentropy": 2.466444492340088, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1824108511209488, "step": 11596 }, { "epoch": 0.3624375, "grad_norm": 3.78125, "grad_norm_var": 0.0593902587890625, "learning_rate": 0.0001, "loss": 5.8607, "loss/crossentropy": 2.518047571182251, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.181922048330307, "step": 11598 }, { "epoch": 0.3625, "grad_norm": 3.375, "grad_norm_var": 0.034228515625, "learning_rate": 0.0001, "loss": 5.5955, "loss/crossentropy": 2.3988256454467773, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17396844923496246, "step": 11600 }, { "epoch": 0.3625625, "grad_norm": 3.203125, "grad_norm_var": 0.0350250244140625, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.633981227874756, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17647858709096909, "step": 11602 }, { "epoch": 0.362625, "grad_norm": 3.1875, "grad_norm_var": 0.042578125, "learning_rate": 0.0001, "loss": 5.5269, "loss/crossentropy": 2.3514821529388428, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17027592658996582, "step": 11604 }, { "epoch": 0.3626875, "grad_norm": 3.578125, "grad_norm_var": 0.04189351399739583, "learning_rate": 0.0001, "loss": 5.5546, "loss/crossentropy": 2.203986644744873, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1788102239370346, "step": 11606 }, { "epoch": 0.36275, "grad_norm": 3.609375, "grad_norm_var": 0.05056050618489583, "learning_rate": 0.0001, "loss": 5.7688, "loss/crossentropy": 2.4843231439590454, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17649301886558533, "step": 11608 }, { "epoch": 0.3628125, "grad_norm": 3.078125, "grad_norm_var": 0.059260050455729164, "learning_rate": 0.0001, "loss": 6.075, "loss/crossentropy": 2.6833078861236572, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1864389181137085, "step": 11610 }, { "epoch": 0.362875, "grad_norm": 3.4375, "grad_norm_var": 0.05190327962239583, "learning_rate": 0.0001, "loss": 6.0866, "loss/crossentropy": 2.5845061540603638, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.19200821965932846, "step": 11612 }, { "epoch": 0.3629375, "grad_norm": 3.171875, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 5.9458, "loss/crossentropy": 2.663218140602112, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1790422722697258, "step": 11614 }, { "epoch": 0.363, "grad_norm": 3.359375, "grad_norm_var": 0.03864644368489583, "learning_rate": 0.0001, "loss": 6.1841, "loss/crossentropy": 2.820657968521118, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18126174062490463, "step": 11616 }, { "epoch": 0.3630625, "grad_norm": 3.375, "grad_norm_var": 0.041764322916666666, "learning_rate": 0.0001, "loss": 5.7947, "loss/crossentropy": 2.549742341041565, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17098423838615417, "step": 11618 }, { "epoch": 0.363125, "grad_norm": 3.546875, "grad_norm_var": 0.04560445149739583, "learning_rate": 0.0001, "loss": 5.6737, "loss/crossentropy": 2.3581948280334473, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17491018027067184, "step": 11620 }, { "epoch": 0.3631875, "grad_norm": 3.296875, "grad_norm_var": 0.045210774739583334, "learning_rate": 0.0001, "loss": 5.8093, "loss/crossentropy": 2.5261915922164917, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1783144325017929, "step": 11622 }, { "epoch": 0.36325, "grad_norm": 3.25, "grad_norm_var": 0.0349273681640625, "learning_rate": 0.0001, "loss": 5.8163, "loss/crossentropy": 2.544545292854309, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17678914219141006, "step": 11624 }, { "epoch": 0.3633125, "grad_norm": 3.234375, "grad_norm_var": 0.026252237955729167, "learning_rate": 0.0001, "loss": 5.7587, "loss/crossentropy": 2.469446539878845, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17267972975969315, "step": 11626 }, { "epoch": 0.363375, "grad_norm": 3.640625, "grad_norm_var": 0.03076171875, "learning_rate": 0.0001, "loss": 6.3727, "loss/crossentropy": 2.7683225870132446, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.1991046443581581, "step": 11628 }, { "epoch": 0.3634375, "grad_norm": 3.5, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 6.1533, "loss/crossentropy": 2.645849823951721, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19449244439601898, "step": 11630 }, { "epoch": 0.3635, "grad_norm": 3.1875, "grad_norm_var": 0.031571451822916666, "learning_rate": 0.0001, "loss": 5.7111, "loss/crossentropy": 2.488775372505188, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17223234474658966, "step": 11632 }, { "epoch": 0.3635625, "grad_norm": 3.71875, "grad_norm_var": 1.595905558268229, "learning_rate": 0.0001, "loss": 6.3605, "loss/crossentropy": 2.544732093811035, "loss/hidden": 1.62890625, "loss/jsd": 0.0, "loss/logits": 0.2186906337738037, "step": 11634 }, { "epoch": 0.363625, "grad_norm": 3.203125, "grad_norm_var": 1.64381103515625, "learning_rate": 0.0001, "loss": 5.8801, "loss/crossentropy": 2.657926321029663, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17182911932468414, "step": 11636 }, { "epoch": 0.3636875, "grad_norm": 3.34375, "grad_norm_var": 1.6316691080729167, "learning_rate": 0.0001, "loss": 5.957, "loss/crossentropy": 2.5820019245147705, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18203197419643402, "step": 11638 }, { "epoch": 0.36375, "grad_norm": 3.078125, "grad_norm_var": 1.6466471354166667, "learning_rate": 0.0001, "loss": 5.8765, "loss/crossentropy": 2.6228911876678467, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17418856918811798, "step": 11640 }, { "epoch": 0.3638125, "grad_norm": 3.078125, "grad_norm_var": 1.6611480712890625, "learning_rate": 0.0001, "loss": 5.718, "loss/crossentropy": 2.4857271909713745, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17166081070899963, "step": 11642 }, { "epoch": 0.363875, "grad_norm": 3.546875, "grad_norm_var": 1.7013092041015625, "learning_rate": 0.0001, "loss": 6.1816, "loss/crossentropy": 2.5950502157211304, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19928032159805298, "step": 11644 }, { "epoch": 0.3639375, "grad_norm": 3.078125, "grad_norm_var": 1.7250315348307292, "learning_rate": 0.0001, "loss": 6.0275, "loss/crossentropy": 2.6586222648620605, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1841554269194603, "step": 11646 }, { "epoch": 0.364, "grad_norm": 3.359375, "grad_norm_var": 1.7169748942057292, "learning_rate": 0.0001, "loss": 6.0681, "loss/crossentropy": 2.6864718198776245, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18503419309854507, "step": 11648 }, { "epoch": 0.3640625, "grad_norm": 3.140625, "grad_norm_var": 0.13170572916666667, "learning_rate": 0.0001, "loss": 5.9131, "loss/crossentropy": 2.6158591508865356, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17894399911165237, "step": 11650 }, { "epoch": 0.364125, "grad_norm": 3.78125, "grad_norm_var": 0.13697509765625, "learning_rate": 0.0001, "loss": 5.522, "loss/crossentropy": 2.3324047327041626, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1646653339266777, "step": 11652 }, { "epoch": 0.3641875, "grad_norm": 3.40625, "grad_norm_var": 0.14096577962239584, "learning_rate": 0.0001, "loss": 5.791, "loss/crossentropy": 2.4745148420333862, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18203992396593094, "step": 11654 }, { "epoch": 0.36425, "grad_norm": 3.0625, "grad_norm_var": 0.14129130045572916, "learning_rate": 0.0001, "loss": 5.5313, "loss/crossentropy": 2.3341383934020996, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17166809737682343, "step": 11656 }, { "epoch": 0.3643125, "grad_norm": 3.328125, "grad_norm_var": 0.1370025634765625, "learning_rate": 0.0001, "loss": 5.5383, "loss/crossentropy": 2.3614786863327026, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1692415550351143, "step": 11658 }, { "epoch": 0.364375, "grad_norm": 2.890625, "grad_norm_var": 0.05548502604166667, "learning_rate": 0.0001, "loss": 5.4776, "loss/crossentropy": 2.327902913093567, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1641886830329895, "step": 11660 }, { "epoch": 0.3644375, "grad_norm": 4.125, "grad_norm_var": 0.0938140869140625, "learning_rate": 0.0001, "loss": 5.8242, "loss/crossentropy": 2.3535810708999634, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.190810889005661, "step": 11662 }, { "epoch": 0.3645, "grad_norm": 3.25, "grad_norm_var": 0.09388020833333334, "learning_rate": 0.0001, "loss": 5.8349, "loss/crossentropy": 2.6038748025894165, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17388267070055008, "step": 11664 }, { "epoch": 0.3645625, "grad_norm": 3.28125, "grad_norm_var": 0.09176025390625, "learning_rate": 0.0001, "loss": 5.7459, "loss/crossentropy": 2.4753239154815674, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1743217408657074, "step": 11666 }, { "epoch": 0.364625, "grad_norm": 3.59375, "grad_norm_var": 0.0791412353515625, "learning_rate": 0.0001, "loss": 6.2747, "loss/crossentropy": 2.79960834980011, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19282344728708267, "step": 11668 }, { "epoch": 0.3646875, "grad_norm": 3.828125, "grad_norm_var": 0.09381103515625, "learning_rate": 0.0001, "loss": 5.9164, "loss/crossentropy": 2.572916865348816, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17809413373470306, "step": 11670 }, { "epoch": 0.36475, "grad_norm": 3.1875, "grad_norm_var": 0.08993733723958333, "learning_rate": 0.0001, "loss": 5.8124, "loss/crossentropy": 2.5034350156784058, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17386088520288467, "step": 11672 }, { "epoch": 0.3648125, "grad_norm": 3.171875, "grad_norm_var": 0.0907379150390625, "learning_rate": 0.0001, "loss": 6.0899, "loss/crossentropy": 2.7098684310913086, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18527349829673767, "step": 11674 }, { "epoch": 0.364875, "grad_norm": 3.328125, "grad_norm_var": 0.07219136555989583, "learning_rate": 0.0001, "loss": 5.8037, "loss/crossentropy": 2.4899182319641113, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17356415838003159, "step": 11676 }, { "epoch": 0.3649375, "grad_norm": 3.375, "grad_norm_var": 0.030013020833333334, "learning_rate": 0.0001, "loss": 6.0627, "loss/crossentropy": 2.7277345657348633, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18271364271640778, "step": 11678 }, { "epoch": 0.365, "grad_norm": 3.734375, "grad_norm_var": 0.040299479166666666, "learning_rate": 0.0001, "loss": 6.0736, "loss/crossentropy": 2.6390098333358765, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18916141241788864, "step": 11680 }, { "epoch": 0.3650625, "grad_norm": 3.0, "grad_norm_var": 0.04865620930989583, "learning_rate": 0.0001, "loss": 5.5582, "loss/crossentropy": 2.3566737174987793, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1681957244873047, "step": 11682 }, { "epoch": 0.365125, "grad_norm": 3.25, "grad_norm_var": 0.0457672119140625, "learning_rate": 0.0001, "loss": 5.7941, "loss/crossentropy": 2.5121636390686035, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17780755460262299, "step": 11684 }, { "epoch": 0.3651875, "grad_norm": 3.96875, "grad_norm_var": 0.0543853759765625, "learning_rate": 0.0001, "loss": 6.0005, "loss/crossentropy": 2.5540707111358643, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18605080246925354, "step": 11686 }, { "epoch": 0.36525, "grad_norm": 3.03125, "grad_norm_var": 0.059691365559895834, "learning_rate": 0.0001, "loss": 5.9282, "loss/crossentropy": 2.586628556251526, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17712734639644623, "step": 11688 }, { "epoch": 0.3653125, "grad_norm": 3.5, "grad_norm_var": 0.05976155598958333, "learning_rate": 0.0001, "loss": 5.9325, "loss/crossentropy": 2.536140561103821, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1876792460680008, "step": 11690 }, { "epoch": 0.365375, "grad_norm": 3.34375, "grad_norm_var": 0.06415608723958334, "learning_rate": 0.0001, "loss": 5.9249, "loss/crossentropy": 2.6262542009353638, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17908621579408646, "step": 11692 }, { "epoch": 0.3654375, "grad_norm": 3.53125, "grad_norm_var": 0.0690093994140625, "learning_rate": 0.0001, "loss": 5.7679, "loss/crossentropy": 2.517289161682129, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17428414523601532, "step": 11694 }, { "epoch": 0.3655, "grad_norm": 3.21875, "grad_norm_var": 0.06822916666666666, "learning_rate": 0.0001, "loss": 5.8793, "loss/crossentropy": 2.5547901391983032, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18010495603084564, "step": 11696 }, { "epoch": 0.3655625, "grad_norm": 3.359375, "grad_norm_var": 0.060774739583333334, "learning_rate": 0.0001, "loss": 5.7699, "loss/crossentropy": 2.498196601867676, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17561272531747818, "step": 11698 }, { "epoch": 0.365625, "grad_norm": 3.5, "grad_norm_var": 0.07692057291666667, "learning_rate": 0.0001, "loss": 6.0204, "loss/crossentropy": 2.573630452156067, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18920981138944626, "step": 11700 }, { "epoch": 0.3656875, "grad_norm": 2.9375, "grad_norm_var": 0.07570699055989584, "learning_rate": 0.0001, "loss": 5.524, "loss/crossentropy": 2.4320229291915894, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1638863906264305, "step": 11702 }, { "epoch": 0.36575, "grad_norm": 3.0625, "grad_norm_var": 0.074658203125, "learning_rate": 0.0001, "loss": 5.3621, "loss/crossentropy": 2.2419254183769226, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1627953201532364, "step": 11704 }, { "epoch": 0.3658125, "grad_norm": 3.03125, "grad_norm_var": 0.07830403645833334, "learning_rate": 0.0001, "loss": 5.7887, "loss/crossentropy": 2.5559518337249756, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17327089607715607, "step": 11706 }, { "epoch": 0.365875, "grad_norm": 3.09375, "grad_norm_var": 0.07842508951822917, "learning_rate": 0.0001, "loss": 5.7768, "loss/crossentropy": 2.544148802757263, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17405012249946594, "step": 11708 }, { "epoch": 0.3659375, "grad_norm": 3.265625, "grad_norm_var": 0.07327473958333333, "learning_rate": 0.0001, "loss": 5.8207, "loss/crossentropy": 2.5839486122131348, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17094473540782928, "step": 11710 }, { "epoch": 0.366, "grad_norm": 3.359375, "grad_norm_var": 0.06852925618489583, "learning_rate": 0.0001, "loss": 6.0067, "loss/crossentropy": 2.642910599708557, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18208573758602142, "step": 11712 }, { "epoch": 0.3660625, "grad_norm": 3.359375, "grad_norm_var": 0.06663411458333333, "learning_rate": 0.0001, "loss": 6.038, "loss/crossentropy": 2.672016978263855, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18269260227680206, "step": 11714 }, { "epoch": 0.366125, "grad_norm": 3.3125, "grad_norm_var": 0.028392537434895834, "learning_rate": 0.0001, "loss": 5.82, "loss/crossentropy": 2.5538461208343506, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17856648564338684, "step": 11716 }, { "epoch": 0.3661875, "grad_norm": 3.15625, "grad_norm_var": 0.02291259765625, "learning_rate": 0.0001, "loss": 5.836, "loss/crossentropy": 2.6154959201812744, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1712726205587387, "step": 11718 }, { "epoch": 0.36625, "grad_norm": 3.203125, "grad_norm_var": 0.0215728759765625, "learning_rate": 0.0001, "loss": 5.932, "loss/crossentropy": 2.6341400146484375, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1809549182653427, "step": 11720 }, { "epoch": 0.3663125, "grad_norm": 3.3125, "grad_norm_var": 0.019205729166666668, "learning_rate": 0.0001, "loss": 6.1397, "loss/crossentropy": 2.791115164756775, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18173208832740784, "step": 11722 }, { "epoch": 0.366375, "grad_norm": 3.265625, "grad_norm_var": 0.019921875, "learning_rate": 0.0001, "loss": 5.9471, "loss/crossentropy": 2.646299362182617, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17812715470790863, "step": 11724 }, { "epoch": 0.3664375, "grad_norm": 3.3125, "grad_norm_var": 0.0202056884765625, "learning_rate": 0.0001, "loss": 5.9564, "loss/crossentropy": 2.6312177181243896, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18056128174066544, "step": 11726 }, { "epoch": 0.3665, "grad_norm": 3.421875, "grad_norm_var": 0.0116363525390625, "learning_rate": 0.0001, "loss": 5.8562, "loss/crossentropy": 2.532650113105774, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18156994879245758, "step": 11728 }, { "epoch": 0.3665625, "grad_norm": 3.28125, "grad_norm_var": 0.009566243489583333, "learning_rate": 0.0001, "loss": 6.1817, "loss/crossentropy": 2.808872938156128, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18416258692741394, "step": 11730 }, { "epoch": 0.366625, "grad_norm": 3.109375, "grad_norm_var": 0.009358723958333334, "learning_rate": 0.0001, "loss": 5.7934, "loss/crossentropy": 2.5109959840774536, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1743355542421341, "step": 11732 }, { "epoch": 0.3666875, "grad_norm": 3.265625, "grad_norm_var": 0.030859375, "learning_rate": 0.0001, "loss": 6.2513, "loss/crossentropy": 2.783187747001648, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19368354231119156, "step": 11734 }, { "epoch": 0.36675, "grad_norm": 2.984375, "grad_norm_var": 0.0400543212890625, "learning_rate": 0.0001, "loss": 5.7834, "loss/crossentropy": 2.5999754667282104, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16912198066711426, "step": 11736 }, { "epoch": 0.3668125, "grad_norm": 3.390625, "grad_norm_var": 0.04117431640625, "learning_rate": 0.0001, "loss": 5.9766, "loss/crossentropy": 2.683970332145691, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17809562385082245, "step": 11738 }, { "epoch": 0.366875, "grad_norm": 3.265625, "grad_norm_var": 0.054182942708333334, "learning_rate": 0.0001, "loss": 6.0108, "loss/crossentropy": 2.56476891040802, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.18366805464029312, "step": 11740 }, { "epoch": 0.3669375, "grad_norm": 3.15625, "grad_norm_var": 0.057763671875, "learning_rate": 0.0001, "loss": 5.6899, "loss/crossentropy": 2.467342257499695, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17264722287654877, "step": 11742 }, { "epoch": 0.367, "grad_norm": 3.296875, "grad_norm_var": 0.0559234619140625, "learning_rate": 0.0001, "loss": 5.7186, "loss/crossentropy": 2.435905694961548, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17710177600383759, "step": 11744 }, { "epoch": 0.3670625, "grad_norm": 3.296875, "grad_norm_var": 0.055924479166666666, "learning_rate": 0.0001, "loss": 5.6426, "loss/crossentropy": 2.3441178798675537, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17633193731307983, "step": 11746 }, { "epoch": 0.367125, "grad_norm": 3.265625, "grad_norm_var": 0.05445556640625, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.5711851119995117, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17189423739910126, "step": 11748 }, { "epoch": 0.3671875, "grad_norm": 3.34375, "grad_norm_var": 0.03928934733072917, "learning_rate": 0.0001, "loss": 5.7056, "loss/crossentropy": 2.3876869678497314, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17944584786891937, "step": 11750 }, { "epoch": 0.36725, "grad_norm": 3.484375, "grad_norm_var": 0.027912394205729166, "learning_rate": 0.0001, "loss": 6.0769, "loss/crossentropy": 2.682102680206299, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18322941660881042, "step": 11752 }, { "epoch": 0.3673125, "grad_norm": 3.15625, "grad_norm_var": 0.029850260416666666, "learning_rate": 0.0001, "loss": 6.0325, "loss/crossentropy": 2.6718112230300903, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18177196383476257, "step": 11754 }, { "epoch": 0.367375, "grad_norm": 3.21875, "grad_norm_var": 0.01611328125, "learning_rate": 0.0001, "loss": 5.8179, "loss/crossentropy": 2.515088200569153, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1787201538681984, "step": 11756 }, { "epoch": 0.3674375, "grad_norm": 3.40625, "grad_norm_var": 0.013036092122395834, "learning_rate": 0.0001, "loss": 5.9501, "loss/crossentropy": 2.6708627939224243, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17753348499536514, "step": 11758 }, { "epoch": 0.3675, "grad_norm": 3.328125, "grad_norm_var": 0.015851847330729165, "learning_rate": 0.0001, "loss": 5.7554, "loss/crossentropy": 2.5031509399414062, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17796175181865692, "step": 11760 }, { "epoch": 0.3675625, "grad_norm": 2.90625, "grad_norm_var": 0.026005045572916666, "learning_rate": 0.0001, "loss": 5.5039, "loss/crossentropy": 2.3185973167419434, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1685332953929901, "step": 11762 }, { "epoch": 0.367625, "grad_norm": 3.4375, "grad_norm_var": 0.02691650390625, "learning_rate": 0.0001, "loss": 6.066, "loss/crossentropy": 2.6969223022460938, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18534033745527267, "step": 11764 }, { "epoch": 0.3676875, "grad_norm": 3.0, "grad_norm_var": 0.026204427083333332, "learning_rate": 0.0001, "loss": 6.0345, "loss/crossentropy": 2.7123241424560547, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.181439608335495, "step": 11766 }, { "epoch": 0.36775, "grad_norm": 3.3125, "grad_norm_var": 0.021773274739583334, "learning_rate": 0.0001, "loss": 5.5845, "loss/crossentropy": 2.3817098140716553, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17106405645608902, "step": 11768 }, { "epoch": 0.3678125, "grad_norm": 3.328125, "grad_norm_var": 0.029520670572916668, "learning_rate": 0.0001, "loss": 5.9256, "loss/crossentropy": 2.521042823791504, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18108432739973068, "step": 11770 }, { "epoch": 0.367875, "grad_norm": 3.265625, "grad_norm_var": 0.029255167643229166, "learning_rate": 0.0001, "loss": 5.6482, "loss/crossentropy": 2.3615509271621704, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17514477670192719, "step": 11772 }, { "epoch": 0.3679375, "grad_norm": 3.1875, "grad_norm_var": 0.027880859375, "learning_rate": 0.0001, "loss": 6.0203, "loss/crossentropy": 2.6792795658111572, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1809818148612976, "step": 11774 }, { "epoch": 0.368, "grad_norm": 3.234375, "grad_norm_var": 0.02574462890625, "learning_rate": 0.0001, "loss": 5.7908, "loss/crossentropy": 2.4552351236343384, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17691315710544586, "step": 11776 }, { "epoch": 0.3680625, "grad_norm": 3.09375, "grad_norm_var": 0.018879191080729166, "learning_rate": 0.0001, "loss": 5.8327, "loss/crossentropy": 2.4948712587356567, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1775280386209488, "step": 11778 }, { "epoch": 0.368125, "grad_norm": 3.5, "grad_norm_var": 0.020116170247395832, "learning_rate": 0.0001, "loss": 5.9142, "loss/crossentropy": 2.597373604774475, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17777980864048004, "step": 11780 }, { "epoch": 0.3681875, "grad_norm": 3.40625, "grad_norm_var": 0.014567057291666666, "learning_rate": 0.0001, "loss": 5.594, "loss/crossentropy": 2.3891193866729736, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17009887844324112, "step": 11782 }, { "epoch": 0.36825, "grad_norm": 3.171875, "grad_norm_var": 0.015511067708333333, "learning_rate": 0.0001, "loss": 5.81, "loss/crossentropy": 2.645570397377014, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16409894824028015, "step": 11784 }, { "epoch": 0.3683125, "grad_norm": 3.296875, "grad_norm_var": 0.0119049072265625, "learning_rate": 0.0001, "loss": 5.6802, "loss/crossentropy": 2.476786255836487, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.168389730155468, "step": 11786 }, { "epoch": 0.368375, "grad_norm": 3.703125, "grad_norm_var": 0.370654296875, "learning_rate": 0.0001, "loss": 5.9516, "loss/crossentropy": 2.423331618309021, "loss/hidden": 1.65625, "loss/jsd": 0.0, "loss/logits": 0.1872054487466812, "step": 11788 }, { "epoch": 0.3684375, "grad_norm": 3.125, "grad_norm_var": 0.3765289306640625, "learning_rate": 0.0001, "loss": 5.709, "loss/crossentropy": 2.542636752128601, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1697589010000229, "step": 11790 }, { "epoch": 0.3685, "grad_norm": 3.625, "grad_norm_var": 0.37255452473958334, "learning_rate": 0.0001, "loss": 6.1027, "loss/crossentropy": 2.6770007610321045, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1878843605518341, "step": 11792 }, { "epoch": 0.3685625, "grad_norm": 2.875, "grad_norm_var": 0.3907389322916667, "learning_rate": 0.0001, "loss": 5.5742, "loss/crossentropy": 2.499484658241272, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16294343024492264, "step": 11794 }, { "epoch": 0.368625, "grad_norm": 3.40625, "grad_norm_var": 0.3915201822916667, "learning_rate": 0.0001, "loss": 5.8344, "loss/crossentropy": 2.538533926010132, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17880485951900482, "step": 11796 }, { "epoch": 0.3686875, "grad_norm": 3.40625, "grad_norm_var": 0.39090067545572915, "learning_rate": 0.0001, "loss": 6.1114, "loss/crossentropy": 2.6758971214294434, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1861327812075615, "step": 11798 }, { "epoch": 0.36875, "grad_norm": 3.515625, "grad_norm_var": 0.3773834228515625, "learning_rate": 0.0001, "loss": 6.3307, "loss/crossentropy": 2.873810648918152, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19256100058555603, "step": 11800 }, { "epoch": 0.3688125, "grad_norm": 3.234375, "grad_norm_var": 0.3807525634765625, "learning_rate": 0.0001, "loss": 5.6037, "loss/crossentropy": 2.340093493461609, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17205937206745148, "step": 11802 }, { "epoch": 0.368875, "grad_norm": 3.359375, "grad_norm_var": 0.046174112955729166, "learning_rate": 0.0001, "loss": 5.8077, "loss/crossentropy": 2.5863677263259888, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17096296697854996, "step": 11804 }, { "epoch": 0.3689375, "grad_norm": 3.1875, "grad_norm_var": 0.04405924479166667, "learning_rate": 0.0001, "loss": 5.8759, "loss/crossentropy": 2.5780467987060547, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1790032461285591, "step": 11806 }, { "epoch": 0.369, "grad_norm": 3.484375, "grad_norm_var": 0.07014567057291667, "learning_rate": 0.0001, "loss": 5.6983, "loss/crossentropy": 2.3743369579315186, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17927594482898712, "step": 11808 }, { "epoch": 0.3690625, "grad_norm": 3.078125, "grad_norm_var": 0.0919921875, "learning_rate": 0.0001, "loss": 5.9331, "loss/crossentropy": 2.5354260206222534, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18741943687200546, "step": 11810 }, { "epoch": 0.369125, "grad_norm": 3.40625, "grad_norm_var": 0.09038798014322917, "learning_rate": 0.0001, "loss": 5.9963, "loss/crossentropy": 2.649405837059021, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17805065214633942, "step": 11812 }, { "epoch": 0.3691875, "grad_norm": 3.4375, "grad_norm_var": 0.10051676432291666, "learning_rate": 0.0001, "loss": 5.9888, "loss/crossentropy": 2.676456570625305, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1777212768793106, "step": 11814 }, { "epoch": 0.36925, "grad_norm": 3.296875, "grad_norm_var": 0.10239156087239583, "learning_rate": 0.0001, "loss": 5.9971, "loss/crossentropy": 2.6781569719314575, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18072041869163513, "step": 11816 }, { "epoch": 0.3693125, "grad_norm": 3.09375, "grad_norm_var": 0.10413004557291666, "learning_rate": 0.0001, "loss": 6.1823, "loss/crossentropy": 2.8319051265716553, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1807391345500946, "step": 11818 }, { "epoch": 0.369375, "grad_norm": 3.03125, "grad_norm_var": 0.11757405598958333, "learning_rate": 0.0001, "loss": 5.6943, "loss/crossentropy": 2.5074106454849243, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16947129368782043, "step": 11820 }, { "epoch": 0.3694375, "grad_norm": 3.1875, "grad_norm_var": 0.12092692057291667, "learning_rate": 0.0001, "loss": 6.0688, "loss/crossentropy": 2.6527081727981567, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1896558776497841, "step": 11822 }, { "epoch": 0.3695, "grad_norm": 3.5625, "grad_norm_var": 0.10622456868489584, "learning_rate": 0.0001, "loss": 6.4309, "loss/crossentropy": 2.8148432970046997, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.20184143632650375, "step": 11824 }, { "epoch": 0.3695625, "grad_norm": 3.15625, "grad_norm_var": 0.05650634765625, "learning_rate": 0.0001, "loss": 5.8168, "loss/crossentropy": 2.520147204399109, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18044225126504898, "step": 11826 }, { "epoch": 0.369625, "grad_norm": 3.359375, "grad_norm_var": 0.05172119140625, "learning_rate": 0.0001, "loss": 5.8837, "loss/crossentropy": 2.5915223360061646, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1780419945716858, "step": 11828 }, { "epoch": 0.3696875, "grad_norm": 3.296875, "grad_norm_var": 0.0499908447265625, "learning_rate": 0.0001, "loss": 5.6589, "loss/crossentropy": 2.4262943267822266, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16857632994651794, "step": 11830 }, { "epoch": 0.36975, "grad_norm": 3.203125, "grad_norm_var": 0.0506988525390625, "learning_rate": 0.0001, "loss": 6.0, "loss/crossentropy": 2.713708758354187, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1762854903936386, "step": 11832 }, { "epoch": 0.3698125, "grad_norm": 3.203125, "grad_norm_var": 0.04698893229166667, "learning_rate": 0.0001, "loss": 5.6122, "loss/crossentropy": 2.363717198371887, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17445889115333557, "step": 11834 }, { "epoch": 0.369875, "grad_norm": 3.328125, "grad_norm_var": 0.0415191650390625, "learning_rate": 0.0001, "loss": 5.7302, "loss/crossentropy": 2.459938406944275, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17624631524085999, "step": 11836 }, { "epoch": 0.3699375, "grad_norm": 3.421875, "grad_norm_var": 0.0331451416015625, "learning_rate": 0.0001, "loss": 5.8738, "loss/crossentropy": 2.522003412246704, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18127425760030746, "step": 11838 }, { "epoch": 0.37, "grad_norm": 3.078125, "grad_norm_var": 0.009891764322916666, "learning_rate": 0.0001, "loss": 5.664, "loss/crossentropy": 2.4958502054214478, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1675918996334076, "step": 11840 }, { "epoch": 0.3700625, "grad_norm": 3.578125, "grad_norm_var": 0.016437784830729166, "learning_rate": 0.0001, "loss": 6.1671, "loss/crossentropy": 2.7934341430664062, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1811162754893303, "step": 11842 }, { "epoch": 0.370125, "grad_norm": 3.25, "grad_norm_var": 0.016893513997395835, "learning_rate": 0.0001, "loss": 5.8931, "loss/crossentropy": 2.596327543258667, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17889384925365448, "step": 11844 }, { "epoch": 0.3701875, "grad_norm": 3.15625, "grad_norm_var": 0.016304524739583333, "learning_rate": 0.0001, "loss": 5.938, "loss/crossentropy": 2.633580446243286, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17887794226408005, "step": 11846 }, { "epoch": 0.37025, "grad_norm": 3.046875, "grad_norm_var": 0.021187337239583333, "learning_rate": 0.0001, "loss": 5.9855, "loss/crossentropy": 2.6642227172851562, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18134354054927826, "step": 11848 }, { "epoch": 0.3703125, "grad_norm": 3.140625, "grad_norm_var": 0.022294108072916666, "learning_rate": 0.0001, "loss": 5.7663, "loss/crossentropy": 2.5717811584472656, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16749581694602966, "step": 11850 }, { "epoch": 0.370375, "grad_norm": 3.140625, "grad_norm_var": 0.021451822916666665, "learning_rate": 0.0001, "loss": 5.8715, "loss/crossentropy": 2.627884268760681, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17474838346242905, "step": 11852 }, { "epoch": 0.3704375, "grad_norm": 3.234375, "grad_norm_var": 0.0182281494140625, "learning_rate": 0.0001, "loss": 5.5614, "loss/crossentropy": 2.3951979875564575, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16310697048902512, "step": 11854 }, { "epoch": 0.3705, "grad_norm": 3.21875, "grad_norm_var": 0.017281087239583333, "learning_rate": 0.0001, "loss": 5.9406, "loss/crossentropy": 2.6456116437911987, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17950201779603958, "step": 11856 }, { "epoch": 0.3705625, "grad_norm": 3.3125, "grad_norm_var": 0.008918253580729167, "learning_rate": 0.0001, "loss": 5.8216, "loss/crossentropy": 2.5380886793136597, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17678741365671158, "step": 11858 }, { "epoch": 0.370625, "grad_norm": 3.3125, "grad_norm_var": 0.0146484375, "learning_rate": 0.0001, "loss": 5.8239, "loss/crossentropy": 2.5890220403671265, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17348285764455795, "step": 11860 }, { "epoch": 0.3706875, "grad_norm": 3.03125, "grad_norm_var": 0.01617431640625, "learning_rate": 0.0001, "loss": 5.8439, "loss/crossentropy": 2.6098623275756836, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17457354068756104, "step": 11862 }, { "epoch": 0.37075, "grad_norm": 3.296875, "grad_norm_var": 0.0123199462890625, "learning_rate": 0.0001, "loss": 5.7248, "loss/crossentropy": 2.4863415956497192, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16994335502386093, "step": 11864 }, { "epoch": 0.3708125, "grad_norm": 3.234375, "grad_norm_var": 0.012548828125, "learning_rate": 0.0001, "loss": 5.8885, "loss/crossentropy": 2.595168352127075, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1777665838599205, "step": 11866 }, { "epoch": 0.370875, "grad_norm": 3.59375, "grad_norm_var": 0.021187337239583333, "learning_rate": 0.0001, "loss": 5.8157, "loss/crossentropy": 2.5132588148117065, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18023964762687683, "step": 11868 }, { "epoch": 0.3709375, "grad_norm": 3.515625, "grad_norm_var": 0.026244099934895834, "learning_rate": 0.0001, "loss": 5.5236, "loss/crossentropy": 2.3587870597839355, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1688261479139328, "step": 11870 }, { "epoch": 0.371, "grad_norm": 3.15625, "grad_norm_var": 0.028929646809895834, "learning_rate": 0.0001, "loss": 5.6618, "loss/crossentropy": 2.5152169466018677, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1634834259748459, "step": 11872 }, { "epoch": 0.3710625, "grad_norm": 3.65625, "grad_norm_var": 0.04599202473958333, "learning_rate": 0.0001, "loss": 5.8343, "loss/crossentropy": 2.5197051763534546, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1779429167509079, "step": 11874 }, { "epoch": 0.371125, "grad_norm": 3.40625, "grad_norm_var": 0.0421783447265625, "learning_rate": 0.0001, "loss": 5.9015, "loss/crossentropy": 2.619465470314026, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17663691192865372, "step": 11876 }, { "epoch": 0.3711875, "grad_norm": 3.28125, "grad_norm_var": 0.046826171875, "learning_rate": 0.0001, "loss": 5.7655, "loss/crossentropy": 2.504304528236389, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1761169210076332, "step": 11878 }, { "epoch": 0.37125, "grad_norm": 3.53125, "grad_norm_var": 0.04921773274739583, "learning_rate": 0.0001, "loss": 6.141, "loss/crossentropy": 2.733517646789551, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18567375093698502, "step": 11880 }, { "epoch": 0.3713125, "grad_norm": 3.4375, "grad_norm_var": 0.0548736572265625, "learning_rate": 0.0001, "loss": 5.9319, "loss/crossentropy": 2.5222357511520386, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18354719132184982, "step": 11882 }, { "epoch": 0.371375, "grad_norm": 3.1875, "grad_norm_var": 0.050104777018229164, "learning_rate": 0.0001, "loss": 6.1632, "loss/crossentropy": 2.722404360771179, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19056753814220428, "step": 11884 }, { "epoch": 0.3714375, "grad_norm": 3.390625, "grad_norm_var": 0.046052042643229166, "learning_rate": 0.0001, "loss": 6.281, "loss/crossentropy": 2.844220280647278, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1893763542175293, "step": 11886 }, { "epoch": 0.3715, "grad_norm": 3.359375, "grad_norm_var": 0.038426717122395836, "learning_rate": 0.0001, "loss": 5.9486, "loss/crossentropy": 2.6783348321914673, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17663230001926422, "step": 11888 }, { "epoch": 0.3715625, "grad_norm": 3.28125, "grad_norm_var": 0.03242085774739583, "learning_rate": 0.0001, "loss": 5.7972, "loss/crossentropy": 2.472720503807068, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1793181523680687, "step": 11890 }, { "epoch": 0.371625, "grad_norm": 3.390625, "grad_norm_var": 0.024995930989583335, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.5683202743530273, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17515835911035538, "step": 11892 }, { "epoch": 0.3716875, "grad_norm": 3.09375, "grad_norm_var": 0.019505818684895832, "learning_rate": 0.0001, "loss": 5.8766, "loss/crossentropy": 2.563071131706238, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18018251657485962, "step": 11894 }, { "epoch": 0.37175, "grad_norm": 3.40625, "grad_norm_var": 0.016893513997395835, "learning_rate": 0.0001, "loss": 5.803, "loss/crossentropy": 2.605095624923706, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17174078524112701, "step": 11896 }, { "epoch": 0.3718125, "grad_norm": 3.328125, "grad_norm_var": 0.01591796875, "learning_rate": 0.0001, "loss": 5.4955, "loss/crossentropy": 2.3684515953063965, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16699732095003128, "step": 11898 }, { "epoch": 0.371875, "grad_norm": 3.0625, "grad_norm_var": 0.021678670247395834, "learning_rate": 0.0001, "loss": 5.8702, "loss/crossentropy": 2.6153483390808105, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1739197000861168, "step": 11900 }, { "epoch": 0.3719375, "grad_norm": 3.34375, "grad_norm_var": 0.02447509765625, "learning_rate": 0.0001, "loss": 5.5087, "loss/crossentropy": 2.327476739883423, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1657761111855507, "step": 11902 }, { "epoch": 0.372, "grad_norm": 3.8125, "grad_norm_var": 0.04108072916666667, "learning_rate": 0.0001, "loss": 6.0579, "loss/crossentropy": 2.683075189590454, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1831829622387886, "step": 11904 }, { "epoch": 0.3720625, "grad_norm": 3.1875, "grad_norm_var": 0.03892822265625, "learning_rate": 0.0001, "loss": 5.8588, "loss/crossentropy": 2.5925426483154297, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17467505484819412, "step": 11906 }, { "epoch": 0.372125, "grad_norm": 3.171875, "grad_norm_var": 0.0461822509765625, "learning_rate": 0.0001, "loss": 5.984, "loss/crossentropy": 2.646338939666748, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18103424459695816, "step": 11908 }, { "epoch": 0.3721875, "grad_norm": 3.828125, "grad_norm_var": 0.0583404541015625, "learning_rate": 0.0001, "loss": 5.8657, "loss/crossentropy": 2.5094738006591797, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18406447768211365, "step": 11910 }, { "epoch": 0.37225, "grad_norm": 3.375, "grad_norm_var": 0.06013081868489583, "learning_rate": 0.0001, "loss": 5.5741, "loss/crossentropy": 2.288855195045471, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17070835828781128, "step": 11912 }, { "epoch": 0.3723125, "grad_norm": 3.296875, "grad_norm_var": 0.07337137858072916, "learning_rate": 0.0001, "loss": 5.9294, "loss/crossentropy": 2.5085290670394897, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18505970388650894, "step": 11914 }, { "epoch": 0.372375, "grad_norm": 3.40625, "grad_norm_var": 0.06606343587239584, "learning_rate": 0.0001, "loss": 6.045, "loss/crossentropy": 2.6599591970443726, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18576952815055847, "step": 11916 }, { "epoch": 0.3724375, "grad_norm": 3.0, "grad_norm_var": 0.07512105305989583, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.6257758140563965, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17527589201927185, "step": 11918 }, { "epoch": 0.3725, "grad_norm": 3.1875, "grad_norm_var": 0.06702067057291666, "learning_rate": 0.0001, "loss": 5.622, "loss/crossentropy": 2.420231819152832, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17290856689214706, "step": 11920 }, { "epoch": 0.3725625, "grad_norm": 3.234375, "grad_norm_var": 0.06565348307291667, "learning_rate": 0.0001, "loss": 5.8418, "loss/crossentropy": 2.585193395614624, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.175659641623497, "step": 11922 }, { "epoch": 0.372625, "grad_norm": 3.515625, "grad_norm_var": 1.6833821614583333, "learning_rate": 0.0001, "loss": 6.0385, "loss/crossentropy": 2.539443612098694, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19248707592487335, "step": 11924 }, { "epoch": 0.3726875, "grad_norm": 3.15625, "grad_norm_var": 1.6886138916015625, "learning_rate": 0.0001, "loss": 6.0132, "loss/crossentropy": 2.6566983461380005, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18252985924482346, "step": 11926 }, { "epoch": 0.37275, "grad_norm": 3.15625, "grad_norm_var": 1.7012115478515626, "learning_rate": 0.0001, "loss": 5.6475, "loss/crossentropy": 2.4480196237564087, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1722874492406845, "step": 11928 }, { "epoch": 0.3728125, "grad_norm": 3.171875, "grad_norm_var": 1.701123046875, "learning_rate": 0.0001, "loss": 5.7602, "loss/crossentropy": 2.413305878639221, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18078355491161346, "step": 11930 }, { "epoch": 0.372875, "grad_norm": 3.609375, "grad_norm_var": 1.6905344645182292, "learning_rate": 0.0001, "loss": 5.789, "loss/crossentropy": 2.392745018005371, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18180835992097855, "step": 11932 }, { "epoch": 0.3729375, "grad_norm": 3.015625, "grad_norm_var": 1.6818196614583334, "learning_rate": 0.0001, "loss": 5.7455, "loss/crossentropy": 2.50241756439209, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17000866681337357, "step": 11934 }, { "epoch": 0.373, "grad_norm": 3.265625, "grad_norm_var": 1.6605133056640624, "learning_rate": 0.0001, "loss": 5.9344, "loss/crossentropy": 2.5978814363479614, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18013402074575424, "step": 11936 }, { "epoch": 0.3730625, "grad_norm": 3.21875, "grad_norm_var": 1.6770579020182292, "learning_rate": 0.0001, "loss": 5.7267, "loss/crossentropy": 2.4426097869873047, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17684456706047058, "step": 11938 }, { "epoch": 0.373125, "grad_norm": 3.34375, "grad_norm_var": 0.0436676025390625, "learning_rate": 0.0001, "loss": 6.013, "loss/crossentropy": 2.625849723815918, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18558523058891296, "step": 11940 }, { "epoch": 0.3731875, "grad_norm": 3.921875, "grad_norm_var": 0.0610015869140625, "learning_rate": 0.0001, "loss": 6.1125, "loss/crossentropy": 2.7566566467285156, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1808934286236763, "step": 11942 }, { "epoch": 0.37325, "grad_norm": 3.296875, "grad_norm_var": 0.05533447265625, "learning_rate": 0.0001, "loss": 5.7133, "loss/crossentropy": 2.4668716192245483, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17229923605918884, "step": 11944 }, { "epoch": 0.3733125, "grad_norm": 3.234375, "grad_norm_var": 0.05533447265625, "learning_rate": 0.0001, "loss": 5.5595, "loss/crossentropy": 2.3742669820785522, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17047683149576187, "step": 11946 }, { "epoch": 0.373375, "grad_norm": 3.453125, "grad_norm_var": 0.04828999837239583, "learning_rate": 0.0001, "loss": 5.9406, "loss/crossentropy": 2.5744495391845703, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18388355523347855, "step": 11948 }, { "epoch": 0.3734375, "grad_norm": 3.09375, "grad_norm_var": 0.0493072509765625, "learning_rate": 0.0001, "loss": 5.7761, "loss/crossentropy": 2.529895782470703, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17344459891319275, "step": 11950 }, { "epoch": 0.3735, "grad_norm": 3.421875, "grad_norm_var": 0.05340169270833333, "learning_rate": 0.0001, "loss": 5.783, "loss/crossentropy": 2.5650538206100464, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17218929529190063, "step": 11952 }, { "epoch": 0.3735625, "grad_norm": 3.484375, "grad_norm_var": 0.050618489583333336, "learning_rate": 0.0001, "loss": 5.6553, "loss/crossentropy": 2.396847724914551, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17623884230852127, "step": 11954 }, { "epoch": 0.373625, "grad_norm": 2.96875, "grad_norm_var": 0.06588134765625, "learning_rate": 0.0001, "loss": 5.986, "loss/crossentropy": 2.6252458095550537, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18216659128665924, "step": 11956 }, { "epoch": 0.3736875, "grad_norm": 3.296875, "grad_norm_var": 0.041844685872395836, "learning_rate": 0.0001, "loss": 5.8041, "loss/crossentropy": 2.5305240154266357, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17228445410728455, "step": 11958 }, { "epoch": 0.37375, "grad_norm": 3.046875, "grad_norm_var": 0.049637858072916666, "learning_rate": 0.0001, "loss": 5.6102, "loss/crossentropy": 2.4852755069732666, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16288577020168304, "step": 11960 }, { "epoch": 0.3738125, "grad_norm": 3.234375, "grad_norm_var": 0.29588216145833335, "learning_rate": 0.0001, "loss": 5.9325, "loss/crossentropy": 2.562421679496765, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18740053474903107, "step": 11962 }, { "epoch": 0.373875, "grad_norm": 4.59375, "grad_norm_var": 0.4058990478515625, "learning_rate": 0.0001, "loss": 6.2446, "loss/crossentropy": 2.6552056074142456, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.20503675192594528, "step": 11964 }, { "epoch": 0.3739375, "grad_norm": 3.34375, "grad_norm_var": 0.39443359375, "learning_rate": 0.0001, "loss": 5.7876, "loss/crossentropy": 2.4652920961380005, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17988931387662888, "step": 11966 }, { "epoch": 0.374, "grad_norm": 2.984375, "grad_norm_var": 0.39947509765625, "learning_rate": 0.0001, "loss": 5.7098, "loss/crossentropy": 2.407015323638916, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1748071238398552, "step": 11968 }, { "epoch": 0.3740625, "grad_norm": 3.546875, "grad_norm_var": 0.39192301432291665, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.541646957397461, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17861248552799225, "step": 11970 }, { "epoch": 0.374125, "grad_norm": 3.34375, "grad_norm_var": 0.37522786458333335, "learning_rate": 0.0001, "loss": 6.0544, "loss/crossentropy": 2.697180986404419, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1841549649834633, "step": 11972 }, { "epoch": 0.3741875, "grad_norm": 3.65625, "grad_norm_var": 0.363623046875, "learning_rate": 0.0001, "loss": 6.1215, "loss/crossentropy": 2.690246343612671, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1841374933719635, "step": 11974 }, { "epoch": 0.37425, "grad_norm": 2.796875, "grad_norm_var": 0.39421284993489586, "learning_rate": 0.0001, "loss": 5.6795, "loss/crossentropy": 2.4888908863067627, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17101595550775528, "step": 11976 }, { "epoch": 0.3743125, "grad_norm": 3.3125, "grad_norm_var": 0.19384358723958334, "learning_rate": 0.0001, "loss": 5.9827, "loss/crossentropy": 2.6535253524780273, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18213209509849548, "step": 11978 }, { "epoch": 0.374375, "grad_norm": 3.390625, "grad_norm_var": 0.062027994791666666, "learning_rate": 0.0001, "loss": 6.015, "loss/crossentropy": 2.711008906364441, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17922794818878174, "step": 11980 }, { "epoch": 0.3744375, "grad_norm": 3.015625, "grad_norm_var": 0.06271158854166667, "learning_rate": 0.0001, "loss": 5.7833, "loss/crossentropy": 2.5675182342529297, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17314188182353973, "step": 11982 }, { "epoch": 0.3745, "grad_norm": 3.203125, "grad_norm_var": 0.04506734212239583, "learning_rate": 0.0001, "loss": 5.6987, "loss/crossentropy": 2.4805713891983032, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16751296073198318, "step": 11984 }, { "epoch": 0.3745625, "grad_norm": 3.359375, "grad_norm_var": 0.04267171223958333, "learning_rate": 0.0001, "loss": 5.9659, "loss/crossentropy": 2.6091278791427612, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18177400529384613, "step": 11986 }, { "epoch": 0.374625, "grad_norm": 3.296875, "grad_norm_var": 0.043017578125, "learning_rate": 0.0001, "loss": 5.8874, "loss/crossentropy": 2.5380669832229614, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1849372684955597, "step": 11988 }, { "epoch": 0.3746875, "grad_norm": 3.1875, "grad_norm_var": 0.0345123291015625, "learning_rate": 0.0001, "loss": 5.87, "loss/crossentropy": 2.6897950172424316, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16996879875659943, "step": 11990 }, { "epoch": 0.37475, "grad_norm": 3.78125, "grad_norm_var": 0.03989969889322917, "learning_rate": 0.0001, "loss": 5.6236, "loss/crossentropy": 2.408652186393738, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1664126068353653, "step": 11992 }, { "epoch": 0.3748125, "grad_norm": 3.3125, "grad_norm_var": 0.04254150390625, "learning_rate": 0.0001, "loss": 5.955, "loss/crossentropy": 2.6086472272872925, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1850292831659317, "step": 11994 }, { "epoch": 0.374875, "grad_norm": 3.34375, "grad_norm_var": 0.0419830322265625, "learning_rate": 0.0001, "loss": 5.8619, "loss/crossentropy": 2.5171536207199097, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17978684604167938, "step": 11996 }, { "epoch": 0.3749375, "grad_norm": 3.09375, "grad_norm_var": 0.048563639322916664, "learning_rate": 0.0001, "loss": 6.1212, "loss/crossentropy": 2.7672228813171387, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18227529525756836, "step": 11998 }, { "epoch": 0.375, "grad_norm": 3.328125, "grad_norm_var": 0.0486328125, "learning_rate": 0.0001, "loss": 5.5444, "loss/crossentropy": 2.3668113946914673, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16619888693094254, "step": 12000 }, { "epoch": 0.3750625, "grad_norm": 3.515625, "grad_norm_var": 0.05091044108072917, "learning_rate": 0.0001, "loss": 5.8255, "loss/crossentropy": 2.539885640144348, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1730951964855194, "step": 12002 }, { "epoch": 0.375125, "grad_norm": 3.546875, "grad_norm_var": 0.053304036458333336, "learning_rate": 0.0001, "loss": 5.977, "loss/crossentropy": 2.6519832611083984, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17547233402729034, "step": 12004 }, { "epoch": 0.3751875, "grad_norm": 3.296875, "grad_norm_var": 0.0476959228515625, "learning_rate": 0.0001, "loss": 6.1005, "loss/crossentropy": 2.726112723350525, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.185099795460701, "step": 12006 }, { "epoch": 0.37525, "grad_norm": 3.5625, "grad_norm_var": 0.0339019775390625, "learning_rate": 0.0001, "loss": 5.9259, "loss/crossentropy": 2.5760785341262817, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18108032643795013, "step": 12008 }, { "epoch": 0.3753125, "grad_norm": 3.1875, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 5.536, "loss/crossentropy": 2.247123122215271, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.177718847990036, "step": 12010 }, { "epoch": 0.375375, "grad_norm": 3.1875, "grad_norm_var": 0.028986612955729168, "learning_rate": 0.0001, "loss": 5.6847, "loss/crossentropy": 2.4444741010665894, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17246388643980026, "step": 12012 }, { "epoch": 0.3754375, "grad_norm": 3.125, "grad_norm_var": 0.022191365559895832, "learning_rate": 0.0001, "loss": 5.6309, "loss/crossentropy": 2.492947816848755, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16419032961130142, "step": 12014 }, { "epoch": 0.3755, "grad_norm": 3.34375, "grad_norm_var": 0.021024576822916665, "learning_rate": 0.0001, "loss": 5.9138, "loss/crossentropy": 2.5885088443756104, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18174774199724197, "step": 12016 }, { "epoch": 0.3755625, "grad_norm": 3.3125, "grad_norm_var": 0.020555623372395835, "learning_rate": 0.0001, "loss": 6.0689, "loss/crossentropy": 2.5820603370666504, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1932116001844406, "step": 12018 }, { "epoch": 0.375625, "grad_norm": 3.203125, "grad_norm_var": 0.021272786458333335, "learning_rate": 0.0001, "loss": 5.7684, "loss/crossentropy": 2.509916305541992, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1778009980916977, "step": 12020 }, { "epoch": 0.3756875, "grad_norm": 3.390625, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 5.6891, "loss/crossentropy": 2.4291462898254395, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.174431212246418, "step": 12022 }, { "epoch": 0.37575, "grad_norm": 3.0625, "grad_norm_var": 0.0208404541015625, "learning_rate": 0.0001, "loss": 5.9232, "loss/crossentropy": 2.6144766807556152, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18009580671787262, "step": 12024 }, { "epoch": 0.3758125, "grad_norm": 3.234375, "grad_norm_var": 0.0223541259765625, "learning_rate": 0.0001, "loss": 5.6589, "loss/crossentropy": 2.4320074319839478, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17034152150154114, "step": 12026 }, { "epoch": 0.375875, "grad_norm": 3.640625, "grad_norm_var": 0.028694661458333333, "learning_rate": 0.0001, "loss": 5.7392, "loss/crossentropy": 2.4564120769500732, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17866551876068115, "step": 12028 }, { "epoch": 0.3759375, "grad_norm": 3.1875, "grad_norm_var": 0.02779541015625, "learning_rate": 0.0001, "loss": 5.8896, "loss/crossentropy": 2.589442491531372, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17806587368249893, "step": 12030 }, { "epoch": 0.376, "grad_norm": 3.3125, "grad_norm_var": 0.0276519775390625, "learning_rate": 0.0001, "loss": 6.0676, "loss/crossentropy": 2.648952603340149, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18990715593099594, "step": 12032 }, { "epoch": 0.3760625, "grad_norm": 3.296875, "grad_norm_var": 0.023762003580729166, "learning_rate": 0.0001, "loss": 6.137, "loss/crossentropy": 2.6550445556640625, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19233429431915283, "step": 12034 }, { "epoch": 0.376125, "grad_norm": 3.28125, "grad_norm_var": 0.02525634765625, "learning_rate": 0.0001, "loss": 5.4149, "loss/crossentropy": 2.3415744304656982, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15967915207147598, "step": 12036 }, { "epoch": 0.3761875, "grad_norm": 3.34375, "grad_norm_var": 0.03355712890625, "learning_rate": 0.0001, "loss": 5.77, "loss/crossentropy": 2.580352306365967, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16701454669237137, "step": 12038 }, { "epoch": 0.37625, "grad_norm": 3.140625, "grad_norm_var": 0.03664957682291667, "learning_rate": 0.0001, "loss": 5.5584, "loss/crossentropy": 2.40367329120636, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16391241550445557, "step": 12040 }, { "epoch": 0.3763125, "grad_norm": 3.296875, "grad_norm_var": 0.0327056884765625, "learning_rate": 0.0001, "loss": 6.1674, "loss/crossentropy": 2.7630926370620728, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18300750851631165, "step": 12042 }, { "epoch": 0.376375, "grad_norm": 3.328125, "grad_norm_var": 0.0356353759765625, "learning_rate": 0.0001, "loss": 6.0534, "loss/crossentropy": 2.6944353580474854, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18199177831411362, "step": 12044 }, { "epoch": 0.3764375, "grad_norm": 3.046875, "grad_norm_var": 0.0447174072265625, "learning_rate": 0.0001, "loss": 5.8027, "loss/crossentropy": 2.645112633705139, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16810278594493866, "step": 12046 }, { "epoch": 0.3765, "grad_norm": 3.078125, "grad_norm_var": 0.05136311848958333, "learning_rate": 0.0001, "loss": 5.7176, "loss/crossentropy": 2.5695072412490845, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16676388680934906, "step": 12048 }, { "epoch": 0.3765625, "grad_norm": 3.015625, "grad_norm_var": 0.05413004557291667, "learning_rate": 0.0001, "loss": 6.038, "loss/crossentropy": 2.7702596187591553, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17638515681028366, "step": 12050 }, { "epoch": 0.376625, "grad_norm": 3.140625, "grad_norm_var": 0.05302632649739583, "learning_rate": 0.0001, "loss": 5.9717, "loss/crossentropy": 2.59650194644928, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18243810534477234, "step": 12052 }, { "epoch": 0.3766875, "grad_norm": 3.0625, "grad_norm_var": 0.05816650390625, "learning_rate": 0.0001, "loss": 5.5906, "loss/crossentropy": 2.4401824474334717, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16699735075235367, "step": 12054 }, { "epoch": 0.37675, "grad_norm": 3.84375, "grad_norm_var": 0.07298177083333333, "learning_rate": 0.0001, "loss": 5.9168, "loss/crossentropy": 2.5050649642944336, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1868806630373001, "step": 12056 }, { "epoch": 0.3768125, "grad_norm": 4.9375, "grad_norm_var": 0.24763081868489584, "learning_rate": 0.0001, "loss": 5.6702, "loss/crossentropy": 2.490068197250366, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1684013083577156, "step": 12058 }, { "epoch": 0.376875, "grad_norm": 3.203125, "grad_norm_var": 0.24306233723958334, "learning_rate": 0.0001, "loss": 6.1612, "loss/crossentropy": 2.7433427572250366, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18982917815446854, "step": 12060 }, { "epoch": 0.3769375, "grad_norm": 3.328125, "grad_norm_var": 0.23471577962239584, "learning_rate": 0.0001, "loss": 5.4492, "loss/crossentropy": 2.3737224340438843, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16184395551681519, "step": 12062 }, { "epoch": 0.377, "grad_norm": 3.421875, "grad_norm_var": 0.22488606770833333, "learning_rate": 0.0001, "loss": 5.816, "loss/crossentropy": 2.5514299869537354, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17684289067983627, "step": 12064 }, { "epoch": 0.3770625, "grad_norm": 3.4375, "grad_norm_var": 0.20998433430989583, "learning_rate": 0.0001, "loss": 6.035, "loss/crossentropy": 2.5798012018203735, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19005408138036728, "step": 12066 }, { "epoch": 0.377125, "grad_norm": 3.3125, "grad_norm_var": 0.20474853515625, "learning_rate": 0.0001, "loss": 5.929, "loss/crossentropy": 2.568007707595825, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18258086591959, "step": 12068 }, { "epoch": 0.3771875, "grad_norm": 3.078125, "grad_norm_var": 0.2058746337890625, "learning_rate": 0.0001, "loss": 5.6861, "loss/crossentropy": 2.453046202659607, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1748638078570366, "step": 12070 }, { "epoch": 0.37725, "grad_norm": 3.328125, "grad_norm_var": 0.19586588541666666, "learning_rate": 0.0001, "loss": 5.5062, "loss/crossentropy": 2.371487021446228, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1623035967350006, "step": 12072 }, { "epoch": 0.3773125, "grad_norm": 3.0, "grad_norm_var": 0.028929646809895834, "learning_rate": 0.0001, "loss": 5.6704, "loss/crossentropy": 2.464169502258301, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17023424059152603, "step": 12074 }, { "epoch": 0.377375, "grad_norm": 3.25, "grad_norm_var": 0.025536092122395833, "learning_rate": 0.0001, "loss": 5.7449, "loss/crossentropy": 2.4666662216186523, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17704083770513535, "step": 12076 }, { "epoch": 0.3774375, "grad_norm": 3.171875, "grad_norm_var": 0.025223795572916666, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.421693801879883, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17212750762701035, "step": 12078 }, { "epoch": 0.3775, "grad_norm": 3.359375, "grad_norm_var": 0.025544230143229166, "learning_rate": 0.0001, "loss": 5.6774, "loss/crossentropy": 2.476585626602173, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16890597343444824, "step": 12080 }, { "epoch": 0.3775625, "grad_norm": 3.375, "grad_norm_var": 0.017406209309895834, "learning_rate": 0.0001, "loss": 5.5387, "loss/crossentropy": 2.3387359380722046, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16921505331993103, "step": 12082 }, { "epoch": 0.377625, "grad_norm": 3.21875, "grad_norm_var": 0.015485636393229167, "learning_rate": 0.0001, "loss": 5.5181, "loss/crossentropy": 2.3846631050109863, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16529268771409988, "step": 12084 }, { "epoch": 0.3776875, "grad_norm": 3.421875, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 5.8004, "loss/crossentropy": 2.52835750579834, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17485986649990082, "step": 12086 }, { "epoch": 0.37775, "grad_norm": 3.140625, "grad_norm_var": 0.018131510416666666, "learning_rate": 0.0001, "loss": 5.7243, "loss/crossentropy": 2.447743535041809, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17452579736709595, "step": 12088 }, { "epoch": 0.3778125, "grad_norm": 3.125, "grad_norm_var": 0.014839680989583333, "learning_rate": 0.0001, "loss": 5.776, "loss/crossentropy": 2.5936180353164673, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16941224038600922, "step": 12090 }, { "epoch": 0.377875, "grad_norm": 3.421875, "grad_norm_var": 0.01910400390625, "learning_rate": 0.0001, "loss": 5.9988, "loss/crossentropy": 2.608113169670105, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1851622387766838, "step": 12092 }, { "epoch": 0.3779375, "grad_norm": 3.3125, "grad_norm_var": 0.021540323893229168, "learning_rate": 0.0001, "loss": 5.6652, "loss/crossentropy": 2.426708698272705, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16915695369243622, "step": 12094 }, { "epoch": 0.378, "grad_norm": 3.0, "grad_norm_var": 0.025316365559895835, "learning_rate": 0.0001, "loss": 5.6704, "loss/crossentropy": 2.3982508182525635, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17408864200115204, "step": 12096 }, { "epoch": 0.3780625, "grad_norm": 3.296875, "grad_norm_var": 0.0919586181640625, "learning_rate": 0.0001, "loss": 6.2858, "loss/crossentropy": 2.751635193824768, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19716878235340118, "step": 12098 }, { "epoch": 0.378125, "grad_norm": 3.078125, "grad_norm_var": 0.0958984375, "learning_rate": 0.0001, "loss": 5.8983, "loss/crossentropy": 2.6188371181488037, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17834113538265228, "step": 12100 }, { "epoch": 0.3781875, "grad_norm": 2.90625, "grad_norm_var": 0.10329488118489584, "learning_rate": 0.0001, "loss": 5.251, "loss/crossentropy": 2.2201396226882935, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15269850939512253, "step": 12102 }, { "epoch": 0.37825, "grad_norm": 3.3125, "grad_norm_var": 0.10086263020833333, "learning_rate": 0.0001, "loss": 5.7502, "loss/crossentropy": 2.4805989265441895, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1765652298927307, "step": 12104 }, { "epoch": 0.3783125, "grad_norm": 3.28125, "grad_norm_var": 0.12730204264322917, "learning_rate": 0.0001, "loss": 6.1588, "loss/crossentropy": 2.686942934989929, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18976256251335144, "step": 12106 }, { "epoch": 0.378375, "grad_norm": 3.359375, "grad_norm_var": 0.12505594889322916, "learning_rate": 0.0001, "loss": 5.7162, "loss/crossentropy": 2.430199146270752, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17469412833452225, "step": 12108 }, { "epoch": 0.3784375, "grad_norm": 3.3125, "grad_norm_var": 0.12639872233072916, "learning_rate": 0.0001, "loss": 5.9854, "loss/crossentropy": 2.623427152633667, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18189743161201477, "step": 12110 }, { "epoch": 0.3785, "grad_norm": 3.203125, "grad_norm_var": 0.11840718587239583, "learning_rate": 0.0001, "loss": 5.8371, "loss/crossentropy": 2.538592576980591, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17945991456508636, "step": 12112 }, { "epoch": 0.3785625, "grad_norm": 3.15625, "grad_norm_var": 0.06402079264322917, "learning_rate": 0.0001, "loss": 6.0307, "loss/crossentropy": 2.703576922416687, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18192638456821442, "step": 12114 }, { "epoch": 0.378625, "grad_norm": 3.359375, "grad_norm_var": 0.06735026041666667, "learning_rate": 0.0001, "loss": 5.8, "loss/crossentropy": 2.606787919998169, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17439626157283783, "step": 12116 }, { "epoch": 0.3786875, "grad_norm": 3.328125, "grad_norm_var": 0.059015909830729164, "learning_rate": 0.0001, "loss": 6.1803, "loss/crossentropy": 2.748610496520996, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1880939081311226, "step": 12118 }, { "epoch": 0.37875, "grad_norm": 3.453125, "grad_norm_var": 0.06272684733072917, "learning_rate": 0.0001, "loss": 6.186, "loss/crossentropy": 2.7390146255493164, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18962519615888596, "step": 12120 }, { "epoch": 0.3788125, "grad_norm": 2.875, "grad_norm_var": 0.0482330322265625, "learning_rate": 0.0001, "loss": 5.7026, "loss/crossentropy": 2.450590491294861, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1736360266804695, "step": 12122 }, { "epoch": 0.378875, "grad_norm": 3.265625, "grad_norm_var": 0.0556060791015625, "learning_rate": 0.0001, "loss": 5.9319, "loss/crossentropy": 2.6200908422470093, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17961391806602478, "step": 12124 }, { "epoch": 0.3789375, "grad_norm": 3.140625, "grad_norm_var": 0.05188395182291667, "learning_rate": 0.0001, "loss": 5.8957, "loss/crossentropy": 2.6624832153320312, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1721498966217041, "step": 12126 }, { "epoch": 0.379, "grad_norm": 3.640625, "grad_norm_var": 0.06915690104166666, "learning_rate": 0.0001, "loss": 5.8722, "loss/crossentropy": 2.6058125495910645, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17468710243701935, "step": 12128 }, { "epoch": 0.3790625, "grad_norm": 3.21875, "grad_norm_var": 0.06897786458333334, "learning_rate": 0.0001, "loss": 5.9679, "loss/crossentropy": 2.654877543449402, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17896226793527603, "step": 12130 }, { "epoch": 0.379125, "grad_norm": 3.4375, "grad_norm_var": 0.06750386555989583, "learning_rate": 0.0001, "loss": 5.7398, "loss/crossentropy": 2.4703577756881714, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1749955117702484, "step": 12132 }, { "epoch": 0.3791875, "grad_norm": 3.125, "grad_norm_var": 0.05586649576822917, "learning_rate": 0.0001, "loss": 5.7071, "loss/crossentropy": 2.4364266395568848, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17589226365089417, "step": 12134 }, { "epoch": 0.37925, "grad_norm": 3.0625, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 5.7527, "loss/crossentropy": 2.5519464015960693, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17007731646299362, "step": 12136 }, { "epoch": 0.3793125, "grad_norm": 3.53125, "grad_norm_var": 0.06360677083333334, "learning_rate": 0.0001, "loss": 5.9415, "loss/crossentropy": 2.526289939880371, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18800681084394455, "step": 12138 }, { "epoch": 0.379375, "grad_norm": 3.265625, "grad_norm_var": 0.0621246337890625, "learning_rate": 0.0001, "loss": 5.956, "loss/crossentropy": 2.6125783920288086, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18316560983657837, "step": 12140 }, { "epoch": 0.3794375, "grad_norm": 3.5, "grad_norm_var": 0.056722005208333336, "learning_rate": 0.0001, "loss": 5.7223, "loss/crossentropy": 2.5177252292633057, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17241264134645462, "step": 12142 }, { "epoch": 0.3795, "grad_norm": 4.5, "grad_norm_var": 0.1311187744140625, "learning_rate": 0.0001, "loss": 6.0227, "loss/crossentropy": 2.5970641374588013, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1909969076514244, "step": 12144 }, { "epoch": 0.3795625, "grad_norm": 3.625, "grad_norm_var": 0.13193257649739584, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.5664255619049072, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18322807550430298, "step": 12146 }, { "epoch": 0.379625, "grad_norm": 3.375, "grad_norm_var": 0.13154195149739584, "learning_rate": 0.0001, "loss": 6.1476, "loss/crossentropy": 2.797860860824585, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17989974468946457, "step": 12148 }, { "epoch": 0.3796875, "grad_norm": 3.171875, "grad_norm_var": 0.13027242024739583, "learning_rate": 0.0001, "loss": 5.6635, "loss/crossentropy": 2.4914366006851196, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1683766320347786, "step": 12150 }, { "epoch": 0.37975, "grad_norm": 3.21875, "grad_norm_var": 0.10907796223958334, "learning_rate": 0.0001, "loss": 5.8303, "loss/crossentropy": 2.5784735679626465, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1728389412164688, "step": 12152 }, { "epoch": 0.3798125, "grad_norm": 3.765625, "grad_norm_var": 0.11542561848958334, "learning_rate": 0.0001, "loss": 6.2311, "loss/crossentropy": 2.6795923709869385, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1969432532787323, "step": 12154 }, { "epoch": 0.379875, "grad_norm": 3.34375, "grad_norm_var": 0.11448160807291667, "learning_rate": 0.0001, "loss": 5.8681, "loss/crossentropy": 2.557464122772217, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1759844273328781, "step": 12156 }, { "epoch": 0.3799375, "grad_norm": 3.4375, "grad_norm_var": 0.11199442545572917, "learning_rate": 0.0001, "loss": 5.7149, "loss/crossentropy": 2.522140145301819, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.169661745429039, "step": 12158 }, { "epoch": 0.38, "grad_norm": 4.1875, "grad_norm_var": 0.0751861572265625, "learning_rate": 0.0001, "loss": 5.8308, "loss/crossentropy": 2.591986656188965, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1762208491563797, "step": 12160 }, { "epoch": 0.3800625, "grad_norm": 3.078125, "grad_norm_var": 0.07493489583333333, "learning_rate": 0.0001, "loss": 5.7254, "loss/crossentropy": 2.4571229219436646, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17604468762874603, "step": 12162 }, { "epoch": 0.380125, "grad_norm": 3.171875, "grad_norm_var": 0.08089090983072916, "learning_rate": 0.0001, "loss": 5.9118, "loss/crossentropy": 2.6147637367248535, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17970652133226395, "step": 12164 }, { "epoch": 0.3801875, "grad_norm": 3.125, "grad_norm_var": 0.08205973307291667, "learning_rate": 0.0001, "loss": 5.7955, "loss/crossentropy": 2.5248990058898926, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17549820244312286, "step": 12166 }, { "epoch": 0.38025, "grad_norm": 3.078125, "grad_norm_var": 0.08730061848958333, "learning_rate": 0.0001, "loss": 5.4255, "loss/crossentropy": 2.3403602838516235, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1608593687415123, "step": 12168 }, { "epoch": 0.3803125, "grad_norm": 3.234375, "grad_norm_var": 0.07319234212239584, "learning_rate": 0.0001, "loss": 5.9311, "loss/crossentropy": 2.554044485092163, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18340571224689484, "step": 12170 }, { "epoch": 0.380375, "grad_norm": 3.703125, "grad_norm_var": 0.08367411295572917, "learning_rate": 0.0001, "loss": 5.8228, "loss/crossentropy": 2.5055052042007446, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18290476500988007, "step": 12172 }, { "epoch": 0.3804375, "grad_norm": 3.40625, "grad_norm_var": 0.08108622233072917, "learning_rate": 0.0001, "loss": 6.1499, "loss/crossentropy": 2.7790268659591675, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1839602366089821, "step": 12174 }, { "epoch": 0.3805, "grad_norm": 3.578125, "grad_norm_var": 0.03209228515625, "learning_rate": 0.0001, "loss": 6.0808, "loss/crossentropy": 2.6841509342193604, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18732508271932602, "step": 12176 }, { "epoch": 0.3805625, "grad_norm": 3.421875, "grad_norm_var": 0.0468170166015625, "learning_rate": 0.0001, "loss": 5.9809, "loss/crossentropy": 2.6901607513427734, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17555983364582062, "step": 12178 }, { "epoch": 0.380625, "grad_norm": 3.15625, "grad_norm_var": 0.05035400390625, "learning_rate": 0.0001, "loss": 5.7499, "loss/crossentropy": 2.541025757789612, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1728431135416031, "step": 12180 }, { "epoch": 0.3806875, "grad_norm": 2.953125, "grad_norm_var": 0.05526936848958333, "learning_rate": 0.0001, "loss": 5.6726, "loss/crossentropy": 2.44266676902771, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16752055287361145, "step": 12182 }, { "epoch": 0.38075, "grad_norm": 3.75, "grad_norm_var": 0.07165425618489583, "learning_rate": 0.0001, "loss": 5.8804, "loss/crossentropy": 2.521677017211914, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17649737745523453, "step": 12184 }, { "epoch": 0.3808125, "grad_norm": 3.4375, "grad_norm_var": 0.0736328125, "learning_rate": 0.0001, "loss": 5.8637, "loss/crossentropy": 2.5367971658706665, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18073730170726776, "step": 12186 }, { "epoch": 0.380875, "grad_norm": 3.390625, "grad_norm_var": 0.06292215983072917, "learning_rate": 0.0001, "loss": 5.9977, "loss/crossentropy": 2.5861297845840454, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.186076782643795, "step": 12188 }, { "epoch": 0.3809375, "grad_norm": 3.28125, "grad_norm_var": 0.06717122395833333, "learning_rate": 0.0001, "loss": 5.9649, "loss/crossentropy": 2.567418694496155, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1842784881591797, "step": 12190 }, { "epoch": 0.381, "grad_norm": 3.125, "grad_norm_var": 0.061375935872395836, "learning_rate": 0.0001, "loss": 5.7625, "loss/crossentropy": 2.532058596611023, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16874880343675613, "step": 12192 }, { "epoch": 0.3810625, "grad_norm": 3.21875, "grad_norm_var": 0.04898681640625, "learning_rate": 0.0001, "loss": 5.867, "loss/crossentropy": 2.5616434812545776, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17741339653730392, "step": 12194 }, { "epoch": 0.381125, "grad_norm": 3.0625, "grad_norm_var": 0.046263631184895834, "learning_rate": 0.0001, "loss": 5.9946, "loss/crossentropy": 2.7013275623321533, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17893613129854202, "step": 12196 }, { "epoch": 0.3811875, "grad_norm": 3.40625, "grad_norm_var": 0.039362589518229164, "learning_rate": 0.0001, "loss": 6.0828, "loss/crossentropy": 2.731987476348877, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1835208311676979, "step": 12198 }, { "epoch": 0.38125, "grad_norm": 3.3125, "grad_norm_var": 0.02398681640625, "learning_rate": 0.0001, "loss": 5.8494, "loss/crossentropy": 2.5146725177764893, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1838630512356758, "step": 12200 }, { "epoch": 0.3813125, "grad_norm": 3.09375, "grad_norm_var": 0.027701822916666667, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.570136785507202, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17073464393615723, "step": 12202 }, { "epoch": 0.381375, "grad_norm": 3.609375, "grad_norm_var": 0.032103474934895834, "learning_rate": 0.0001, "loss": 5.9705, "loss/crossentropy": 2.6945348978042603, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17642514407634735, "step": 12204 }, { "epoch": 0.3814375, "grad_norm": 3.015625, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 5.908, "loss/crossentropy": 2.632769465446472, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17478998005390167, "step": 12206 }, { "epoch": 0.3815, "grad_norm": 2.890625, "grad_norm_var": 0.04537353515625, "learning_rate": 0.0001, "loss": 5.46, "loss/crossentropy": 2.406898617744446, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1568731889128685, "step": 12208 }, { "epoch": 0.3815625, "grad_norm": 3.421875, "grad_norm_var": 0.0423248291015625, "learning_rate": 0.0001, "loss": 6.0608, "loss/crossentropy": 2.70103120803833, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18206676840782166, "step": 12210 }, { "epoch": 0.381625, "grad_norm": 3.03125, "grad_norm_var": 0.043257649739583334, "learning_rate": 0.0001, "loss": 5.6974, "loss/crossentropy": 2.4659173488616943, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17080692946910858, "step": 12212 }, { "epoch": 0.3816875, "grad_norm": 3.203125, "grad_norm_var": 0.0397125244140625, "learning_rate": 0.0001, "loss": 5.9125, "loss/crossentropy": 2.652961015701294, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17556791007518768, "step": 12214 }, { "epoch": 0.38175, "grad_norm": 8.9375, "grad_norm_var": 2.0692535400390626, "learning_rate": 0.0001, "loss": 5.9109, "loss/crossentropy": 2.546905755996704, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1785893440246582, "step": 12216 }, { "epoch": 0.3818125, "grad_norm": 3.015625, "grad_norm_var": 2.058552042643229, "learning_rate": 0.0001, "loss": 5.6968, "loss/crossentropy": 2.577237844467163, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16312456130981445, "step": 12218 }, { "epoch": 0.381875, "grad_norm": 3.203125, "grad_norm_var": 2.06337890625, "learning_rate": 0.0001, "loss": 5.8515, "loss/crossentropy": 2.5166709423065186, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17996849119663239, "step": 12220 }, { "epoch": 0.3819375, "grad_norm": 3.234375, "grad_norm_var": 2.081761678059896, "learning_rate": 0.0001, "loss": 5.7943, "loss/crossentropy": 2.5752391815185547, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17229846864938736, "step": 12222 }, { "epoch": 0.382, "grad_norm": 3.515625, "grad_norm_var": 2.059928385416667, "learning_rate": 0.0001, "loss": 5.6823, "loss/crossentropy": 2.489313840866089, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16851232945919037, "step": 12224 }, { "epoch": 0.3820625, "grad_norm": 3.484375, "grad_norm_var": 2.0624664306640623, "learning_rate": 0.0001, "loss": 5.9289, "loss/crossentropy": 2.5973278284072876, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18120825290679932, "step": 12226 }, { "epoch": 0.382125, "grad_norm": 3.234375, "grad_norm_var": 2.0595987955729167, "learning_rate": 0.0001, "loss": 5.9501, "loss/crossentropy": 2.6300965547561646, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17809592932462692, "step": 12228 }, { "epoch": 0.3821875, "grad_norm": 3.5625, "grad_norm_var": 2.0440388997395833, "learning_rate": 0.0001, "loss": 5.8267, "loss/crossentropy": 2.6522780656814575, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16744433343410492, "step": 12230 }, { "epoch": 0.38225, "grad_norm": 3.421875, "grad_norm_var": 0.07775065104166666, "learning_rate": 0.0001, "loss": 5.998, "loss/crossentropy": 2.5721473693847656, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18477017432451248, "step": 12232 }, { "epoch": 0.3823125, "grad_norm": 3.078125, "grad_norm_var": 0.078662109375, "learning_rate": 0.0001, "loss": 5.6932, "loss/crossentropy": 2.521741032600403, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16909783333539963, "step": 12234 }, { "epoch": 0.382375, "grad_norm": 3.140625, "grad_norm_var": 0.0667388916015625, "learning_rate": 0.0001, "loss": 6.0465, "loss/crossentropy": 2.657966375350952, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18377594649791718, "step": 12236 }, { "epoch": 0.3824375, "grad_norm": 3.890625, "grad_norm_var": 0.0851959228515625, "learning_rate": 0.0001, "loss": 5.3831, "loss/crossentropy": 2.246523380279541, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1605309322476387, "step": 12238 }, { "epoch": 0.3825, "grad_norm": 3.5625, "grad_norm_var": 0.0819000244140625, "learning_rate": 0.0001, "loss": 5.9532, "loss/crossentropy": 2.6257444620132446, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18274663388729095, "step": 12240 }, { "epoch": 0.3825625, "grad_norm": 3.390625, "grad_norm_var": 0.079443359375, "learning_rate": 0.0001, "loss": 5.8184, "loss/crossentropy": 2.523483991622925, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17519061267375946, "step": 12242 }, { "epoch": 0.382625, "grad_norm": 3.40625, "grad_norm_var": 0.07519124348958334, "learning_rate": 0.0001, "loss": 5.916, "loss/crossentropy": 2.5331852436065674, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1808634251356125, "step": 12244 }, { "epoch": 0.3826875, "grad_norm": 3.046875, "grad_norm_var": 0.08105061848958334, "learning_rate": 0.0001, "loss": 5.8447, "loss/crossentropy": 2.6363556385040283, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16927223652601242, "step": 12246 }, { "epoch": 0.38275, "grad_norm": 3.328125, "grad_norm_var": 0.05364583333333333, "learning_rate": 0.0001, "loss": 5.4834, "loss/crossentropy": 2.31623113155365, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16554533690214157, "step": 12248 }, { "epoch": 0.3828125, "grad_norm": 3.390625, "grad_norm_var": 0.050959269205729164, "learning_rate": 0.0001, "loss": 5.7755, "loss/crossentropy": 2.4709372520446777, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17888887971639633, "step": 12250 }, { "epoch": 0.382875, "grad_norm": 3.625, "grad_norm_var": 0.0599761962890625, "learning_rate": 0.0001, "loss": 5.9009, "loss/crossentropy": 2.617013931274414, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17682691663503647, "step": 12252 }, { "epoch": 0.3829375, "grad_norm": 3.6875, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 6.0086, "loss/crossentropy": 2.671997904777527, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1836603283882141, "step": 12254 }, { "epoch": 0.383, "grad_norm": 3.390625, "grad_norm_var": 0.038914998372395836, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.6281083822250366, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1720341593027115, "step": 12256 }, { "epoch": 0.3830625, "grad_norm": 3.421875, "grad_norm_var": 0.04355061848958333, "learning_rate": 0.0001, "loss": 5.7652, "loss/crossentropy": 2.478159785270691, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1751842424273491, "step": 12258 }, { "epoch": 0.383125, "grad_norm": 3.328125, "grad_norm_var": 0.04267578125, "learning_rate": 0.0001, "loss": 5.8508, "loss/crossentropy": 2.61394727230072, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17290370166301727, "step": 12260 }, { "epoch": 0.3831875, "grad_norm": 3.1875, "grad_norm_var": 0.0467681884765625, "learning_rate": 0.0001, "loss": 6.0011, "loss/crossentropy": 2.6651010513305664, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17969100177288055, "step": 12262 }, { "epoch": 0.38325, "grad_norm": 3.015625, "grad_norm_var": 0.05367431640625, "learning_rate": 0.0001, "loss": 5.8139, "loss/crossentropy": 2.531054735183716, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17555169016122818, "step": 12264 }, { "epoch": 0.3833125, "grad_norm": 2.96875, "grad_norm_var": 0.06116434733072917, "learning_rate": 0.0001, "loss": 5.7743, "loss/crossentropy": 2.5854746103286743, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17044856399297714, "step": 12266 }, { "epoch": 0.383375, "grad_norm": 3.359375, "grad_norm_var": 0.05109049479166667, "learning_rate": 0.0001, "loss": 5.6732, "loss/crossentropy": 2.37837016582489, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1732364371418953, "step": 12268 }, { "epoch": 0.3834375, "grad_norm": 3.140625, "grad_norm_var": 0.04208984375, "learning_rate": 0.0001, "loss": 5.9049, "loss/crossentropy": 2.6091073751449585, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17567677795886993, "step": 12270 }, { "epoch": 0.3835, "grad_norm": 2.90625, "grad_norm_var": 0.04729817708333333, "learning_rate": 0.0001, "loss": 5.5178, "loss/crossentropy": 2.4287716150283813, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16397615522146225, "step": 12272 }, { "epoch": 0.3835625, "grad_norm": 3.140625, "grad_norm_var": 0.042464192708333334, "learning_rate": 0.0001, "loss": 5.8446, "loss/crossentropy": 2.570552706718445, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1766263246536255, "step": 12274 }, { "epoch": 0.383625, "grad_norm": 3.34375, "grad_norm_var": 0.041845703125, "learning_rate": 0.0001, "loss": 6.1636, "loss/crossentropy": 2.717664122581482, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19225004315376282, "step": 12276 }, { "epoch": 0.3836875, "grad_norm": 3.640625, "grad_norm_var": 0.04117431640625, "learning_rate": 0.0001, "loss": 5.6497, "loss/crossentropy": 2.3928192853927612, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17412814497947693, "step": 12278 }, { "epoch": 0.38375, "grad_norm": 3.609375, "grad_norm_var": 0.04153238932291667, "learning_rate": 0.0001, "loss": 5.757, "loss/crossentropy": 2.556597352027893, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1719910204410553, "step": 12280 }, { "epoch": 0.3838125, "grad_norm": 3.421875, "grad_norm_var": 0.035791015625, "learning_rate": 0.0001, "loss": 6.1173, "loss/crossentropy": 2.8058717250823975, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17684178799390793, "step": 12282 }, { "epoch": 0.383875, "grad_norm": 3.453125, "grad_norm_var": 0.037886555989583334, "learning_rate": 0.0001, "loss": 5.7644, "loss/crossentropy": 2.4213374853134155, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1819588765501976, "step": 12284 }, { "epoch": 0.3839375, "grad_norm": 3.484375, "grad_norm_var": 0.03873291015625, "learning_rate": 0.0001, "loss": 5.9828, "loss/crossentropy": 2.639641284942627, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18197622895240784, "step": 12286 }, { "epoch": 0.384, "grad_norm": 4.65625, "grad_norm_var": 0.13775634765625, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.542542815208435, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18063607811927795, "step": 12288 }, { "epoch": 0.3840625, "grad_norm": 3.3125, "grad_norm_var": 0.13239644368489584, "learning_rate": 0.0001, "loss": 5.9213, "loss/crossentropy": 2.6110751628875732, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17906443774700165, "step": 12290 }, { "epoch": 0.384125, "grad_norm": 3.15625, "grad_norm_var": 0.14096577962239584, "learning_rate": 0.0001, "loss": 5.5237, "loss/crossentropy": 2.42322838306427, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16083312034606934, "step": 12292 }, { "epoch": 0.3841875, "grad_norm": 3.65625, "grad_norm_var": 0.1385406494140625, "learning_rate": 0.0001, "loss": 5.839, "loss/crossentropy": 2.524837851524353, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17750893533229828, "step": 12294 }, { "epoch": 0.38425, "grad_norm": 3.359375, "grad_norm_var": 0.1338531494140625, "learning_rate": 0.0001, "loss": 6.0998, "loss/crossentropy": 2.6787259578704834, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1889820396900177, "step": 12296 }, { "epoch": 0.3843125, "grad_norm": 3.140625, "grad_norm_var": 0.1303375244140625, "learning_rate": 0.0001, "loss": 5.4848, "loss/crossentropy": 2.3015941381454468, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16558706760406494, "step": 12298 }, { "epoch": 0.384375, "grad_norm": 3.453125, "grad_norm_var": 0.13454488118489583, "learning_rate": 0.0001, "loss": 5.5566, "loss/crossentropy": 2.385546326637268, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16671882569789886, "step": 12300 }, { "epoch": 0.3844375, "grad_norm": 3.109375, "grad_norm_var": 0.13991597493489583, "learning_rate": 0.0001, "loss": 6.0241, "loss/crossentropy": 2.754414200782776, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17579607665538788, "step": 12302 }, { "epoch": 0.3845, "grad_norm": 3.3125, "grad_norm_var": 0.03919270833333333, "learning_rate": 0.0001, "loss": 5.9473, "loss/crossentropy": 2.5974284410476685, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1799042969942093, "step": 12304 }, { "epoch": 0.3845625, "grad_norm": 3.53125, "grad_norm_var": 0.041991170247395834, "learning_rate": 0.0001, "loss": 5.6848, "loss/crossentropy": 2.4203197956085205, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17567094415426254, "step": 12306 }, { "epoch": 0.384625, "grad_norm": 3.421875, "grad_norm_var": 0.03603413899739583, "learning_rate": 0.0001, "loss": 5.9538, "loss/crossentropy": 2.602110505104065, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17930720001459122, "step": 12308 }, { "epoch": 0.3846875, "grad_norm": 4.9375, "grad_norm_var": 0.17834879557291666, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.3790918588638306, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1777629777789116, "step": 12310 }, { "epoch": 0.38475, "grad_norm": 3.921875, "grad_norm_var": 0.19153238932291666, "learning_rate": 0.0001, "loss": 5.7206, "loss/crossentropy": 2.4854986667633057, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.173123300075531, "step": 12312 }, { "epoch": 0.3848125, "grad_norm": 3.75, "grad_norm_var": 0.28059488932291665, "learning_rate": 0.0001, "loss": 6.2935, "loss/crossentropy": 2.72293484210968, "loss/hidden": 1.63671875, "loss/jsd": 0.0, "loss/logits": 0.1933843046426773, "step": 12314 }, { "epoch": 0.384875, "grad_norm": 3.796875, "grad_norm_var": 0.29862874348958335, "learning_rate": 0.0001, "loss": 5.9117, "loss/crossentropy": 2.5335875749588013, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18077735602855682, "step": 12316 }, { "epoch": 0.3849375, "grad_norm": 3.34375, "grad_norm_var": 0.28352762858072916, "learning_rate": 0.0001, "loss": 6.0196, "loss/crossentropy": 2.642233729362488, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18461138755083084, "step": 12318 }, { "epoch": 0.385, "grad_norm": 3.203125, "grad_norm_var": 0.2954915364583333, "learning_rate": 0.0001, "loss": 5.907, "loss/crossentropy": 2.630492329597473, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17648132890462875, "step": 12320 }, { "epoch": 0.3850625, "grad_norm": 3.453125, "grad_norm_var": 0.28582356770833334, "learning_rate": 0.0001, "loss": 5.9095, "loss/crossentropy": 2.679234027862549, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1726335808634758, "step": 12322 }, { "epoch": 0.385125, "grad_norm": 2.984375, "grad_norm_var": 0.343896484375, "learning_rate": 0.0001, "loss": 5.3557, "loss/crossentropy": 2.304378032684326, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1559125781059265, "step": 12324 }, { "epoch": 0.3851875, "grad_norm": 3.1875, "grad_norm_var": 0.21656901041666668, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.606326460838318, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17003396153450012, "step": 12326 }, { "epoch": 0.38525, "grad_norm": 3.328125, "grad_norm_var": 0.20341796875, "learning_rate": 0.0001, "loss": 5.5562, "loss/crossentropy": 2.372411847114563, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16916058957576752, "step": 12328 }, { "epoch": 0.3853125, "grad_norm": 3.359375, "grad_norm_var": 0.05090738932291667, "learning_rate": 0.0001, "loss": 5.7291, "loss/crossentropy": 2.5212844610214233, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17000123113393784, "step": 12330 }, { "epoch": 0.385375, "grad_norm": 3.46875, "grad_norm_var": 0.034468587239583334, "learning_rate": 0.0001, "loss": 5.8163, "loss/crossentropy": 2.578205704689026, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17654545605182648, "step": 12332 }, { "epoch": 0.3854375, "grad_norm": 3.5625, "grad_norm_var": 0.0312896728515625, "learning_rate": 0.0001, "loss": 5.9139, "loss/crossentropy": 2.620287299156189, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17623500525951385, "step": 12334 }, { "epoch": 0.3855, "grad_norm": 3.703125, "grad_norm_var": 0.04426167805989583, "learning_rate": 0.0001, "loss": 5.6922, "loss/crossentropy": 2.4255547523498535, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17588192969560623, "step": 12336 }, { "epoch": 0.3855625, "grad_norm": 3.453125, "grad_norm_var": 0.04471028645833333, "learning_rate": 0.0001, "loss": 5.6718, "loss/crossentropy": 2.5147881507873535, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1641385480761528, "step": 12338 }, { "epoch": 0.385625, "grad_norm": 3.71875, "grad_norm_var": 0.040120442708333336, "learning_rate": 0.0001, "loss": 5.4351, "loss/crossentropy": 2.3117510080337524, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16584917902946472, "step": 12340 }, { "epoch": 0.3856875, "grad_norm": 3.6875, "grad_norm_var": 0.05225321451822917, "learning_rate": 0.0001, "loss": 5.3687, "loss/crossentropy": 2.2538561820983887, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1583578959107399, "step": 12342 }, { "epoch": 0.38575, "grad_norm": 3.546875, "grad_norm_var": 0.053206380208333334, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.422629475593567, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1827506199479103, "step": 12344 }, { "epoch": 0.3858125, "grad_norm": 3.296875, "grad_norm_var": 0.06763407389322916, "learning_rate": 0.0001, "loss": 5.7576, "loss/crossentropy": 2.3514147996902466, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18398155272006989, "step": 12346 }, { "epoch": 0.385875, "grad_norm": 3.609375, "grad_norm_var": 0.07009175618489584, "learning_rate": 0.0001, "loss": 5.8983, "loss/crossentropy": 2.591655969619751, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17832036316394806, "step": 12348 }, { "epoch": 0.3859375, "grad_norm": 3.109375, "grad_norm_var": 0.08994852701822917, "learning_rate": 0.0001, "loss": 5.9475, "loss/crossentropy": 2.7168256044387817, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1722814068198204, "step": 12350 }, { "epoch": 0.386, "grad_norm": 3.078125, "grad_norm_var": 0.0876861572265625, "learning_rate": 0.0001, "loss": 5.7991, "loss/crossentropy": 2.5579906702041626, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17254658043384552, "step": 12352 }, { "epoch": 0.3860625, "grad_norm": 4.78125, "grad_norm_var": 0.2190338134765625, "learning_rate": 0.0001, "loss": 6.342, "loss/crossentropy": 2.74264657497406, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.2029033899307251, "step": 12354 }, { "epoch": 0.386125, "grad_norm": 3.15625, "grad_norm_var": 0.2100982666015625, "learning_rate": 0.0001, "loss": 5.7182, "loss/crossentropy": 2.5077556371688843, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17377915978431702, "step": 12356 }, { "epoch": 0.3861875, "grad_norm": 3.15625, "grad_norm_var": 0.19944254557291666, "learning_rate": 0.0001, "loss": 6.0767, "loss/crossentropy": 2.7694251537323, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1795564666390419, "step": 12358 }, { "epoch": 0.38625, "grad_norm": 3.515625, "grad_norm_var": 0.19931640625, "learning_rate": 0.0001, "loss": 5.6196, "loss/crossentropy": 2.38739013671875, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17126502841711044, "step": 12360 }, { "epoch": 0.3863125, "grad_norm": 3.046875, "grad_norm_var": 0.19107666015625, "learning_rate": 0.0001, "loss": 5.7025, "loss/crossentropy": 2.5682406425476074, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16732843965291977, "step": 12362 }, { "epoch": 0.386375, "grad_norm": 3.203125, "grad_norm_var": 0.1846099853515625, "learning_rate": 0.0001, "loss": 5.7853, "loss/crossentropy": 2.5417308807373047, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17239879816770554, "step": 12364 }, { "epoch": 0.3864375, "grad_norm": 3.109375, "grad_norm_var": 0.17350972493489583, "learning_rate": 0.0001, "loss": 5.945, "loss/crossentropy": 2.6591559648513794, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17819470912218094, "step": 12366 }, { "epoch": 0.3865, "grad_norm": 3.125, "grad_norm_var": 0.17265523274739583, "learning_rate": 0.0001, "loss": 5.8818, "loss/crossentropy": 2.5998687744140625, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1742846518754959, "step": 12368 }, { "epoch": 0.3865625, "grad_norm": 3.375, "grad_norm_var": 0.022508748372395835, "learning_rate": 0.0001, "loss": 6.0897, "loss/crossentropy": 2.7611477375030518, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18207243829965591, "step": 12370 }, { "epoch": 0.386625, "grad_norm": 3.40625, "grad_norm_var": 0.0267974853515625, "learning_rate": 0.0001, "loss": 5.6273, "loss/crossentropy": 2.4154202938079834, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17314515262842178, "step": 12372 }, { "epoch": 0.3866875, "grad_norm": 3.59375, "grad_norm_var": 0.033568318684895834, "learning_rate": 0.0001, "loss": 6.3509, "loss/crossentropy": 2.8199591636657715, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1952815130352974, "step": 12374 }, { "epoch": 0.38675, "grad_norm": 3.53125, "grad_norm_var": 0.03404032389322917, "learning_rate": 0.0001, "loss": 6.0306, "loss/crossentropy": 2.5840632915496826, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18879874050617218, "step": 12376 }, { "epoch": 0.3868125, "grad_norm": 2.984375, "grad_norm_var": 0.030497233072916668, "learning_rate": 0.0001, "loss": 5.7335, "loss/crossentropy": 2.499807357788086, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17219502478837967, "step": 12378 }, { "epoch": 0.386875, "grad_norm": 2.984375, "grad_norm_var": 0.040608723958333336, "learning_rate": 0.0001, "loss": 5.4897, "loss/crossentropy": 2.382007360458374, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16232822835445404, "step": 12380 }, { "epoch": 0.3869375, "grad_norm": 3.203125, "grad_norm_var": 0.0439849853515625, "learning_rate": 0.0001, "loss": 6.0852, "loss/crossentropy": 2.6731148958206177, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18730565905570984, "step": 12382 }, { "epoch": 0.387, "grad_norm": 2.8125, "grad_norm_var": 0.0530426025390625, "learning_rate": 0.0001, "loss": 5.3265, "loss/crossentropy": 2.2830283641815186, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15513179451227188, "step": 12384 }, { "epoch": 0.3870625, "grad_norm": 3.71875, "grad_norm_var": 0.0671051025390625, "learning_rate": 0.0001, "loss": 6.0105, "loss/crossentropy": 2.5999969244003296, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18284514546394348, "step": 12386 }, { "epoch": 0.387125, "grad_norm": 3.453125, "grad_norm_var": 0.066015625, "learning_rate": 0.0001, "loss": 5.861, "loss/crossentropy": 2.5838682651519775, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17653851211071014, "step": 12388 }, { "epoch": 0.3871875, "grad_norm": 3.140625, "grad_norm_var": 0.06604410807291666, "learning_rate": 0.0001, "loss": 6.0443, "loss/crossentropy": 2.650669813156128, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1823267638683319, "step": 12390 }, { "epoch": 0.38725, "grad_norm": 3.5, "grad_norm_var": 0.06659749348958334, "learning_rate": 0.0001, "loss": 6.1211, "loss/crossentropy": 2.624216675758362, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.1907046064734459, "step": 12392 }, { "epoch": 0.3873125, "grad_norm": 3.453125, "grad_norm_var": 0.06513264973958334, "learning_rate": 0.0001, "loss": 5.732, "loss/crossentropy": 2.4991012811660767, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17524397373199463, "step": 12394 }, { "epoch": 0.387375, "grad_norm": 5.21875, "grad_norm_var": 0.2813639322916667, "learning_rate": 0.0001, "loss": 6.0215, "loss/crossentropy": 2.6776946783065796, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18281908333301544, "step": 12396 }, { "epoch": 0.3874375, "grad_norm": 3.53125, "grad_norm_var": 0.27920633951822915, "learning_rate": 0.0001, "loss": 5.7708, "loss/crossentropy": 2.5114076137542725, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17359807342290878, "step": 12398 }, { "epoch": 0.3875, "grad_norm": 3.046875, "grad_norm_var": 0.26319986979166665, "learning_rate": 0.0001, "loss": 5.7417, "loss/crossentropy": 2.5564173460006714, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17009221762418747, "step": 12400 }, { "epoch": 0.3875625, "grad_norm": 3.28125, "grad_norm_var": 0.2624664306640625, "learning_rate": 0.0001, "loss": 6.2902, "loss/crossentropy": 2.812481641769409, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19152524322271347, "step": 12402 }, { "epoch": 0.387625, "grad_norm": 3.421875, "grad_norm_var": 0.25524800618489585, "learning_rate": 0.0001, "loss": 6.0524, "loss/crossentropy": 2.710444688796997, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1826329007744789, "step": 12404 }, { "epoch": 0.3876875, "grad_norm": 3.125, "grad_norm_var": 0.25578511555989586, "learning_rate": 0.0001, "loss": 5.929, "loss/crossentropy": 2.6516133546829224, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17812985181808472, "step": 12406 }, { "epoch": 0.38775, "grad_norm": 3.515625, "grad_norm_var": 0.25614827473958335, "learning_rate": 0.0001, "loss": 5.8416, "loss/crossentropy": 2.535792112350464, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.175498329102993, "step": 12408 }, { "epoch": 0.3878125, "grad_norm": 3.9375, "grad_norm_var": 0.26713765462239586, "learning_rate": 0.0001, "loss": 5.8101, "loss/crossentropy": 2.5195395946502686, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1802251636981964, "step": 12410 }, { "epoch": 0.387875, "grad_norm": 5.6875, "grad_norm_var": 0.38483784993489584, "learning_rate": 0.0001, "loss": 6.088, "loss/crossentropy": 2.5359402894973755, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19778630137443542, "step": 12412 }, { "epoch": 0.3879375, "grad_norm": 3.484375, "grad_norm_var": 0.38273824055989586, "learning_rate": 0.0001, "loss": 5.8851, "loss/crossentropy": 2.563393473625183, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18021686375141144, "step": 12414 }, { "epoch": 0.388, "grad_norm": 3.40625, "grad_norm_var": 0.36657613118489585, "learning_rate": 0.0001, "loss": 5.6905, "loss/crossentropy": 2.400371789932251, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17510996758937836, "step": 12416 }, { "epoch": 0.3880625, "grad_norm": 3.0, "grad_norm_var": 0.3878814697265625, "learning_rate": 0.0001, "loss": 5.8078, "loss/crossentropy": 2.5770890712738037, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17150652408599854, "step": 12418 }, { "epoch": 0.388125, "grad_norm": 2.9375, "grad_norm_var": 0.41765950520833334, "learning_rate": 0.0001, "loss": 5.6815, "loss/crossentropy": 2.564165711402893, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1679873764514923, "step": 12420 }, { "epoch": 0.3881875, "grad_norm": 14.6875, "grad_norm_var": 8.300706990559895, "learning_rate": 0.0001, "loss": 6.268, "loss/crossentropy": 2.609584927558899, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.21232140809297562, "step": 12422 }, { "epoch": 0.38825, "grad_norm": 3.453125, "grad_norm_var": 8.298563639322916, "learning_rate": 0.0001, "loss": 6.045, "loss/crossentropy": 2.7234071493148804, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17942225188016891, "step": 12424 }, { "epoch": 0.3883125, "grad_norm": 3.625, "grad_norm_var": 8.25523681640625, "learning_rate": 0.0001, "loss": 6.2657, "loss/crossentropy": 2.8496131896972656, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1857447624206543, "step": 12426 }, { "epoch": 0.388375, "grad_norm": 3.234375, "grad_norm_var": 8.137954711914062, "learning_rate": 0.0001, "loss": 6.019, "loss/crossentropy": 2.729101300239563, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17938295006752014, "step": 12428 }, { "epoch": 0.3884375, "grad_norm": 3.484375, "grad_norm_var": 8.197554524739584, "learning_rate": 0.0001, "loss": 6.0574, "loss/crossentropy": 2.804273843765259, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17414409667253494, "step": 12430 }, { "epoch": 0.3885, "grad_norm": 3.328125, "grad_norm_var": 8.200804646809896, "learning_rate": 0.0001, "loss": 5.9274, "loss/crossentropy": 2.677392601966858, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17461467534303665, "step": 12432 }, { "epoch": 0.3885625, "grad_norm": 3.328125, "grad_norm_var": 8.151171875, "learning_rate": 0.0001, "loss": 6.0823, "loss/crossentropy": 2.739627718925476, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17919403314590454, "step": 12434 }, { "epoch": 0.388625, "grad_norm": 3.1875, "grad_norm_var": 8.069559733072916, "learning_rate": 0.0001, "loss": 5.5287, "loss/crossentropy": 2.3251763582229614, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16918141394853592, "step": 12436 }, { "epoch": 0.3886875, "grad_norm": 3.234375, "grad_norm_var": 0.051985677083333334, "learning_rate": 0.0001, "loss": 5.8059, "loss/crossentropy": 2.5358848571777344, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17544230818748474, "step": 12438 }, { "epoch": 0.38875, "grad_norm": 3.40625, "grad_norm_var": 0.062093098958333336, "learning_rate": 0.0001, "loss": 5.8158, "loss/crossentropy": 2.5627267360687256, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17413048446178436, "step": 12440 }, { "epoch": 0.3888125, "grad_norm": 3.078125, "grad_norm_var": 0.028913370768229165, "learning_rate": 0.0001, "loss": 5.9131, "loss/crossentropy": 2.66182541847229, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17434658855199814, "step": 12442 }, { "epoch": 0.388875, "grad_norm": 2.953125, "grad_norm_var": 0.0419830322265625, "learning_rate": 0.0001, "loss": 5.5846, "loss/crossentropy": 2.480395793914795, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16433017700910568, "step": 12444 }, { "epoch": 0.3889375, "grad_norm": 3.40625, "grad_norm_var": 0.04179585774739583, "learning_rate": 0.0001, "loss": 5.9598, "loss/crossentropy": 2.5799955129623413, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1829030141234398, "step": 12446 }, { "epoch": 0.389, "grad_norm": 3.671875, "grad_norm_var": 0.05797119140625, "learning_rate": 0.0001, "loss": 5.8662, "loss/crossentropy": 2.4446297883987427, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18551705032587051, "step": 12448 }, { "epoch": 0.3890625, "grad_norm": 3.15625, "grad_norm_var": 0.0632232666015625, "learning_rate": 0.0001, "loss": 5.5757, "loss/crossentropy": 2.4247324466705322, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16666096448898315, "step": 12450 }, { "epoch": 0.389125, "grad_norm": 3.15625, "grad_norm_var": 0.054976399739583334, "learning_rate": 0.0001, "loss": 5.6293, "loss/crossentropy": 2.4501971006393433, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1667427271604538, "step": 12452 }, { "epoch": 0.3891875, "grad_norm": 3.375, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 5.9373, "loss/crossentropy": 2.6698057651519775, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1779187098145485, "step": 12454 }, { "epoch": 0.38925, "grad_norm": 3.296875, "grad_norm_var": 0.05213114420572917, "learning_rate": 0.0001, "loss": 6.0682, "loss/crossentropy": 2.7355445623397827, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18404972553253174, "step": 12456 }, { "epoch": 0.3893125, "grad_norm": 3.203125, "grad_norm_var": 0.05105794270833333, "learning_rate": 0.0001, "loss": 5.8755, "loss/crossentropy": 2.621545433998108, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17344574630260468, "step": 12458 }, { "epoch": 0.389375, "grad_norm": 3.984375, "grad_norm_var": 0.082177734375, "learning_rate": 0.0001, "loss": 5.8256, "loss/crossentropy": 2.565748691558838, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17285622656345367, "step": 12460 }, { "epoch": 0.3894375, "grad_norm": 4.0625, "grad_norm_var": 0.11726888020833333, "learning_rate": 0.0001, "loss": 6.4138, "loss/crossentropy": 2.834993362426758, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.20397429913282394, "step": 12462 }, { "epoch": 0.3895, "grad_norm": 3.171875, "grad_norm_var": 0.17669169108072916, "learning_rate": 0.0001, "loss": 5.9481, "loss/crossentropy": 2.6729438304901123, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1786840558052063, "step": 12464 }, { "epoch": 0.3895625, "grad_norm": 3.640625, "grad_norm_var": 0.17158203125, "learning_rate": 0.0001, "loss": 6.2031, "loss/crossentropy": 2.658798575401306, "loss/hidden": 1.6484375, "loss/jsd": 0.0, "loss/logits": 0.18958371877670288, "step": 12466 }, { "epoch": 0.389625, "grad_norm": 3.1875, "grad_norm_var": 0.17887369791666666, "learning_rate": 0.0001, "loss": 5.4703, "loss/crossentropy": 2.3624590635299683, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1603964865207672, "step": 12468 }, { "epoch": 0.3896875, "grad_norm": 3.109375, "grad_norm_var": 0.17981363932291666, "learning_rate": 0.0001, "loss": 5.5185, "loss/crossentropy": 2.313608765602112, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16970573365688324, "step": 12470 }, { "epoch": 0.38975, "grad_norm": 3.390625, "grad_norm_var": 0.17998758951822916, "learning_rate": 0.0001, "loss": 5.3998, "loss/crossentropy": 2.1840202808380127, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16533156484365463, "step": 12472 }, { "epoch": 0.3898125, "grad_norm": 3.28125, "grad_norm_var": 0.1732330322265625, "learning_rate": 0.0001, "loss": 5.7885, "loss/crossentropy": 2.514266848564148, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17312682420015335, "step": 12474 }, { "epoch": 0.389875, "grad_norm": 3.03125, "grad_norm_var": 0.14644775390625, "learning_rate": 0.0001, "loss": 5.6845, "loss/crossentropy": 2.5720447301864624, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1604689359664917, "step": 12476 }, { "epoch": 0.3899375, "grad_norm": 3.1875, "grad_norm_var": 0.117431640625, "learning_rate": 0.0001, "loss": 5.9237, "loss/crossentropy": 2.688705086708069, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17271683365106583, "step": 12478 }, { "epoch": 0.39, "grad_norm": 3.28125, "grad_norm_var": 0.045287068684895834, "learning_rate": 0.0001, "loss": 6.0534, "loss/crossentropy": 2.643076777458191, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18556687235832214, "step": 12480 }, { "epoch": 0.3900625, "grad_norm": 3.078125, "grad_norm_var": 0.3818105061848958, "learning_rate": 0.0001, "loss": 5.7111, "loss/crossentropy": 2.463478922843933, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1739843562245369, "step": 12482 }, { "epoch": 0.390125, "grad_norm": 3.46875, "grad_norm_var": 0.3636627197265625, "learning_rate": 0.0001, "loss": 5.9466, "loss/crossentropy": 2.5659433603286743, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18260101974010468, "step": 12484 }, { "epoch": 0.3901875, "grad_norm": 3.34375, "grad_norm_var": 0.35659891764322915, "learning_rate": 0.0001, "loss": 5.9332, "loss/crossentropy": 2.6430987119674683, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17940440773963928, "step": 12486 }, { "epoch": 0.39025, "grad_norm": 3.390625, "grad_norm_var": 0.3578114827473958, "learning_rate": 0.0001, "loss": 5.8222, "loss/crossentropy": 2.4864721298217773, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17732400447130203, "step": 12488 }, { "epoch": 0.3903125, "grad_norm": 3.421875, "grad_norm_var": 0.3553700764973958, "learning_rate": 0.0001, "loss": 5.7043, "loss/crossentropy": 2.460012197494507, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1709115281701088, "step": 12490 }, { "epoch": 0.390375, "grad_norm": 3.640625, "grad_norm_var": 0.3351064046223958, "learning_rate": 0.0001, "loss": 5.8912, "loss/crossentropy": 2.4892873764038086, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18393903970718384, "step": 12492 }, { "epoch": 0.3904375, "grad_norm": 4.84375, "grad_norm_var": 0.4280181884765625, "learning_rate": 0.0001, "loss": 5.8677, "loss/crossentropy": 2.4914904832839966, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18332628905773163, "step": 12494 }, { "epoch": 0.3905, "grad_norm": 3.59375, "grad_norm_var": 0.44505208333333335, "learning_rate": 0.0001, "loss": 6.0272, "loss/crossentropy": 2.6169928312301636, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1859443485736847, "step": 12496 }, { "epoch": 0.3905625, "grad_norm": 4.3125, "grad_norm_var": 0.19702860514322917, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.447673797607422, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17634569108486176, "step": 12498 }, { "epoch": 0.390625, "grad_norm": 3.25, "grad_norm_var": 0.2016754150390625, "learning_rate": 0.0001, "loss": 6.1153, "loss/crossentropy": 2.724039673805237, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1859992891550064, "step": 12500 }, { "epoch": 0.3906875, "grad_norm": 3.46875, "grad_norm_var": 0.1912017822265625, "learning_rate": 0.0001, "loss": 5.985, "loss/crossentropy": 2.615624785423279, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1845911666750908, "step": 12502 }, { "epoch": 0.39075, "grad_norm": 3.796875, "grad_norm_var": 0.20871988932291666, "learning_rate": 0.0001, "loss": 5.9715, "loss/crossentropy": 2.6195785999298096, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1808907762169838, "step": 12504 }, { "epoch": 0.3908125, "grad_norm": 3.28125, "grad_norm_var": 0.21756184895833333, "learning_rate": 0.0001, "loss": 5.8925, "loss/crossentropy": 2.5929884910583496, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17487546801567078, "step": 12506 }, { "epoch": 0.390875, "grad_norm": 3.421875, "grad_norm_var": 0.24308980305989583, "learning_rate": 0.0001, "loss": 5.6525, "loss/crossentropy": 2.4756404161453247, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1673000156879425, "step": 12508 }, { "epoch": 0.3909375, "grad_norm": 3.484375, "grad_norm_var": 0.12542317708333334, "learning_rate": 0.0001, "loss": 5.937, "loss/crossentropy": 2.653487205505371, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17600330710411072, "step": 12510 }, { "epoch": 0.391, "grad_norm": 3.28125, "grad_norm_var": 0.12543843587239584, "learning_rate": 0.0001, "loss": 5.4535, "loss/crossentropy": 2.368133783340454, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1561921015381813, "step": 12512 }, { "epoch": 0.3910625, "grad_norm": 3.515625, "grad_norm_var": 0.07339579264322917, "learning_rate": 0.0001, "loss": 5.4631, "loss/crossentropy": 2.319524049758911, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16044697165489197, "step": 12514 }, { "epoch": 0.391125, "grad_norm": 3.125, "grad_norm_var": 0.07093098958333334, "learning_rate": 0.0001, "loss": 5.8847, "loss/crossentropy": 2.6295299530029297, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17668668925762177, "step": 12516 }, { "epoch": 0.3911875, "grad_norm": 3.0, "grad_norm_var": 0.07001546223958334, "learning_rate": 0.0001, "loss": 5.7337, "loss/crossentropy": 2.4934409856796265, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17051269114017487, "step": 12518 }, { "epoch": 0.39125, "grad_norm": 3.234375, "grad_norm_var": 0.04545796712239583, "learning_rate": 0.0001, "loss": 5.9634, "loss/crossentropy": 2.6827075481414795, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1765061467885971, "step": 12520 }, { "epoch": 0.3913125, "grad_norm": 3.46875, "grad_norm_var": 0.05082906087239583, "learning_rate": 0.0001, "loss": 5.8172, "loss/crossentropy": 2.511690855026245, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1789904236793518, "step": 12522 }, { "epoch": 0.391375, "grad_norm": 3.0625, "grad_norm_var": 0.044188435872395834, "learning_rate": 0.0001, "loss": 5.6367, "loss/crossentropy": 2.4518083333969116, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17044229060411453, "step": 12524 }, { "epoch": 0.3914375, "grad_norm": 3.125, "grad_norm_var": 0.03808186848958333, "learning_rate": 0.0001, "loss": 5.77, "loss/crossentropy": 2.6033570766448975, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16315263509750366, "step": 12526 }, { "epoch": 0.3915, "grad_norm": 2.875, "grad_norm_var": 0.038386027018229164, "learning_rate": 0.0001, "loss": 5.6457, "loss/crossentropy": 2.487342357635498, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16974316537380219, "step": 12528 }, { "epoch": 0.3915625, "grad_norm": 2.84375, "grad_norm_var": 0.032225545247395834, "learning_rate": 0.0001, "loss": 5.8537, "loss/crossentropy": 2.6897761821746826, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16716928780078888, "step": 12530 }, { "epoch": 0.391625, "grad_norm": 3.015625, "grad_norm_var": 0.03245340983072917, "learning_rate": 0.0001, "loss": 5.6876, "loss/crossentropy": 2.4564136266708374, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17116591334342957, "step": 12532 }, { "epoch": 0.3916875, "grad_norm": 3.125, "grad_norm_var": 0.02880859375, "learning_rate": 0.0001, "loss": 6.1001, "loss/crossentropy": 2.800950288772583, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17913557589054108, "step": 12534 }, { "epoch": 0.39175, "grad_norm": 3.578125, "grad_norm_var": 0.053954060872395834, "learning_rate": 0.0001, "loss": 5.9839, "loss/crossentropy": 2.5637909173965454, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1884925439953804, "step": 12536 }, { "epoch": 0.3918125, "grad_norm": 3.125, "grad_norm_var": 0.04442952473958333, "learning_rate": 0.0001, "loss": 5.656, "loss/crossentropy": 2.512777805328369, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1674426794052124, "step": 12538 }, { "epoch": 0.391875, "grad_norm": 3.109375, "grad_norm_var": 0.04358622233072917, "learning_rate": 0.0001, "loss": 5.2536, "loss/crossentropy": 2.2287850379943848, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15521272271871567, "step": 12540 }, { "epoch": 0.3919375, "grad_norm": 3.515625, "grad_norm_var": 0.05670166015625, "learning_rate": 0.0001, "loss": 6.172, "loss/crossentropy": 2.7422330379486084, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18789854645729065, "step": 12542 }, { "epoch": 0.392, "grad_norm": 3.203125, "grad_norm_var": 0.0508453369140625, "learning_rate": 0.0001, "loss": 5.7974, "loss/crossentropy": 2.490816831588745, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17753690481185913, "step": 12544 }, { "epoch": 0.3920625, "grad_norm": 3.34375, "grad_norm_var": 0.03681640625, "learning_rate": 0.0001, "loss": 5.9525, "loss/crossentropy": 2.657665252685547, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1783137023448944, "step": 12546 }, { "epoch": 0.392125, "grad_norm": 3.25, "grad_norm_var": 0.030387369791666667, "learning_rate": 0.0001, "loss": 6.3394, "loss/crossentropy": 2.913390874862671, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18947432935237885, "step": 12548 }, { "epoch": 0.3921875, "grad_norm": 2.9375, "grad_norm_var": 0.038605753580729166, "learning_rate": 0.0001, "loss": 5.7545, "loss/crossentropy": 2.521004557609558, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17413241416215897, "step": 12550 }, { "epoch": 0.39225, "grad_norm": 3.0625, "grad_norm_var": 0.027176920572916666, "learning_rate": 0.0001, "loss": 5.6198, "loss/crossentropy": 2.4493943452835083, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1689920276403427, "step": 12552 }, { "epoch": 0.3923125, "grad_norm": 3.359375, "grad_norm_var": 0.02763671875, "learning_rate": 0.0001, "loss": 6.0448, "loss/crossentropy": 2.728785991668701, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18042971938848495, "step": 12554 }, { "epoch": 0.392375, "grad_norm": 3.890625, "grad_norm_var": 0.06608784993489583, "learning_rate": 0.0001, "loss": 6.0623, "loss/crossentropy": 2.7216707468032837, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17937306314706802, "step": 12556 }, { "epoch": 0.3924375, "grad_norm": 4.4375, "grad_norm_var": 0.14260660807291667, "learning_rate": 0.0001, "loss": 6.114, "loss/crossentropy": 2.7082772254943848, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18705706298351288, "step": 12558 }, { "epoch": 0.3925, "grad_norm": 3.4375, "grad_norm_var": 0.14175516764322918, "learning_rate": 0.0001, "loss": 6.0443, "loss/crossentropy": 2.6535454988479614, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1836022362112999, "step": 12560 }, { "epoch": 0.3925625, "grad_norm": 3.296875, "grad_norm_var": 0.14643452962239584, "learning_rate": 0.0001, "loss": 5.8386, "loss/crossentropy": 2.5863006114959717, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17288491874933243, "step": 12562 }, { "epoch": 0.392625, "grad_norm": 3.46875, "grad_norm_var": 0.14233296712239582, "learning_rate": 0.0001, "loss": 6.2355, "loss/crossentropy": 2.804616689682007, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1868431344628334, "step": 12564 }, { "epoch": 0.3926875, "grad_norm": 3.796875, "grad_norm_var": 0.12805074055989582, "learning_rate": 0.0001, "loss": 5.9621, "loss/crossentropy": 2.629786491394043, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18401210010051727, "step": 12566 }, { "epoch": 0.39275, "grad_norm": 3.28125, "grad_norm_var": 0.11767171223958334, "learning_rate": 0.0001, "loss": 5.7638, "loss/crossentropy": 2.5153621435165405, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17366864532232285, "step": 12568 }, { "epoch": 0.3928125, "grad_norm": 3.078125, "grad_norm_var": 0.12672119140625, "learning_rate": 0.0001, "loss": 5.7299, "loss/crossentropy": 2.584002733230591, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16575895249843597, "step": 12570 }, { "epoch": 0.392875, "grad_norm": 3.25, "grad_norm_var": 0.11220296223958333, "learning_rate": 0.0001, "loss": 5.9635, "loss/crossentropy": 2.6303478479385376, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18175004422664642, "step": 12572 }, { "epoch": 0.3929375, "grad_norm": 3.15625, "grad_norm_var": 0.0424468994140625, "learning_rate": 0.0001, "loss": 5.6407, "loss/crossentropy": 2.4312771558761597, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17328844219446182, "step": 12574 }, { "epoch": 0.393, "grad_norm": 3.28125, "grad_norm_var": 0.07740478515625, "learning_rate": 0.0001, "loss": 6.1336, "loss/crossentropy": 2.7003642320632935, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18629637360572815, "step": 12576 }, { "epoch": 0.3930625, "grad_norm": 2.890625, "grad_norm_var": 0.08684895833333334, "learning_rate": 0.0001, "loss": 5.4049, "loss/crossentropy": 2.3360401391983032, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15961584448814392, "step": 12578 }, { "epoch": 0.393125, "grad_norm": 3.203125, "grad_norm_var": 0.08376363118489584, "learning_rate": 0.0001, "loss": 5.8556, "loss/crossentropy": 2.511664032936096, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17892113327980042, "step": 12580 }, { "epoch": 0.3931875, "grad_norm": 3.3125, "grad_norm_var": 0.06633199055989583, "learning_rate": 0.0001, "loss": 5.8833, "loss/crossentropy": 2.581635594367981, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18055399507284164, "step": 12582 }, { "epoch": 0.39325, "grad_norm": 3.375, "grad_norm_var": 0.06669514973958333, "learning_rate": 0.0001, "loss": 5.5154, "loss/crossentropy": 2.326522707939148, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1700570210814476, "step": 12584 }, { "epoch": 0.3933125, "grad_norm": 3.296875, "grad_norm_var": 0.06213785807291667, "learning_rate": 0.0001, "loss": 5.904, "loss/crossentropy": 2.644391179084778, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17518238723278046, "step": 12586 }, { "epoch": 0.393375, "grad_norm": 3.625, "grad_norm_var": 0.06856180826822916, "learning_rate": 0.0001, "loss": 6.2582, "loss/crossentropy": 2.8179484605789185, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18856094777584076, "step": 12588 }, { "epoch": 0.3934375, "grad_norm": 3.328125, "grad_norm_var": 0.0644439697265625, "learning_rate": 0.0001, "loss": 5.8571, "loss/crossentropy": 2.6284478902816772, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17482053488492966, "step": 12590 }, { "epoch": 0.3935, "grad_norm": 3.140625, "grad_norm_var": 0.028218587239583332, "learning_rate": 0.0001, "loss": 5.86, "loss/crossentropy": 2.5996965169906616, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1752540022134781, "step": 12592 }, { "epoch": 0.3935625, "grad_norm": 3.375, "grad_norm_var": 0.015771484375, "learning_rate": 0.0001, "loss": 5.6939, "loss/crossentropy": 2.366678476333618, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17842572182416916, "step": 12594 }, { "epoch": 0.393625, "grad_norm": 2.984375, "grad_norm_var": 0.02666015625, "learning_rate": 0.0001, "loss": 5.558, "loss/crossentropy": 2.4497467279434204, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16512507945299149, "step": 12596 }, { "epoch": 0.3936875, "grad_norm": 3.078125, "grad_norm_var": 0.028587849934895833, "learning_rate": 0.0001, "loss": 6.0952, "loss/crossentropy": 2.6856324672698975, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18509763479232788, "step": 12598 }, { "epoch": 0.39375, "grad_norm": 3.09375, "grad_norm_var": 0.0327545166015625, "learning_rate": 0.0001, "loss": 5.5648, "loss/crossentropy": 2.4074909687042236, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1653430312871933, "step": 12600 }, { "epoch": 0.3938125, "grad_norm": 3.046875, "grad_norm_var": 0.031636555989583336, "learning_rate": 0.0001, "loss": 5.7825, "loss/crossentropy": 2.528868794441223, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1745782196521759, "step": 12602 }, { "epoch": 0.393875, "grad_norm": 3.3125, "grad_norm_var": 0.020466105143229166, "learning_rate": 0.0001, "loss": 5.9446, "loss/crossentropy": 2.685313105583191, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1767057254910469, "step": 12604 }, { "epoch": 0.3939375, "grad_norm": 3.46875, "grad_norm_var": 0.025651041666666666, "learning_rate": 0.0001, "loss": 6.0904, "loss/crossentropy": 2.719091296195984, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1847868636250496, "step": 12606 }, { "epoch": 0.394, "grad_norm": 3.453125, "grad_norm_var": 0.032013956705729166, "learning_rate": 0.0001, "loss": 5.8556, "loss/crossentropy": 2.664482593536377, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16872593015432358, "step": 12608 }, { "epoch": 0.3940625, "grad_norm": 3.25, "grad_norm_var": 0.0301910400390625, "learning_rate": 0.0001, "loss": 5.7742, "loss/crossentropy": 2.536603331565857, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17298102378845215, "step": 12610 }, { "epoch": 0.394125, "grad_norm": 3.21875, "grad_norm_var": 0.02486572265625, "learning_rate": 0.0001, "loss": 5.664, "loss/crossentropy": 2.4420113563537598, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1690772995352745, "step": 12612 }, { "epoch": 0.3941875, "grad_norm": 3.5, "grad_norm_var": 0.027708943684895834, "learning_rate": 0.0001, "loss": 6.0206, "loss/crossentropy": 2.643946409225464, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18648922443389893, "step": 12614 }, { "epoch": 0.39425, "grad_norm": 3.390625, "grad_norm_var": 0.03137613932291667, "learning_rate": 0.0001, "loss": 6.2392, "loss/crossentropy": 2.7503719329833984, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19185277819633484, "step": 12616 }, { "epoch": 0.3943125, "grad_norm": 3.203125, "grad_norm_var": 0.040999348958333334, "learning_rate": 0.0001, "loss": 6.1137, "loss/crossentropy": 2.66534686088562, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18741349875926971, "step": 12618 }, { "epoch": 0.394375, "grad_norm": 3.125, "grad_norm_var": 0.037398274739583334, "learning_rate": 0.0001, "loss": 5.9342, "loss/crossentropy": 2.627368450164795, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17638636380434036, "step": 12620 }, { "epoch": 0.3944375, "grad_norm": 3.84375, "grad_norm_var": 0.06957906087239583, "learning_rate": 0.0001, "loss": 5.4455, "loss/crossentropy": 2.38720965385437, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15699703991413116, "step": 12622 }, { "epoch": 0.3945, "grad_norm": 3.15625, "grad_norm_var": 0.0674224853515625, "learning_rate": 0.0001, "loss": 5.952, "loss/crossentropy": 2.627716302871704, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1789139211177826, "step": 12624 }, { "epoch": 0.3945625, "grad_norm": 3.0625, "grad_norm_var": 0.0775787353515625, "learning_rate": 0.0001, "loss": 5.6675, "loss/crossentropy": 2.4872137308120728, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16880901902914047, "step": 12626 }, { "epoch": 0.394625, "grad_norm": 3.4375, "grad_norm_var": 0.07729390462239584, "learning_rate": 0.0001, "loss": 5.8755, "loss/crossentropy": 2.627228617668152, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17365680634975433, "step": 12628 }, { "epoch": 0.3946875, "grad_norm": 3.5625, "grad_norm_var": 0.07883199055989583, "learning_rate": 0.0001, "loss": 6.046, "loss/crossentropy": 2.7390414476394653, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1763944998383522, "step": 12630 }, { "epoch": 0.39475, "grad_norm": 4.59375, "grad_norm_var": 0.1789215087890625, "learning_rate": 0.0001, "loss": 5.8115, "loss/crossentropy": 2.4511373043060303, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17939364165067673, "step": 12632 }, { "epoch": 0.3948125, "grad_norm": 3.625, "grad_norm_var": 0.170458984375, "learning_rate": 0.0001, "loss": 5.8712, "loss/crossentropy": 2.5700401067733765, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17777566611766815, "step": 12634 }, { "epoch": 0.394875, "grad_norm": 3.234375, "grad_norm_var": 0.16705729166666666, "learning_rate": 0.0001, "loss": 5.5774, "loss/crossentropy": 2.346840262413025, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16876373440027237, "step": 12636 }, { "epoch": 0.3949375, "grad_norm": 3.046875, "grad_norm_var": 0.14108784993489584, "learning_rate": 0.0001, "loss": 5.632, "loss/crossentropy": 2.4182028770446777, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16825632005929947, "step": 12638 }, { "epoch": 0.395, "grad_norm": 2.984375, "grad_norm_var": 0.14585673014322917, "learning_rate": 0.0001, "loss": 5.8294, "loss/crossentropy": 2.660908341407776, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16997504979372025, "step": 12640 }, { "epoch": 0.3950625, "grad_norm": 3.40625, "grad_norm_var": 0.1327545166015625, "learning_rate": 0.0001, "loss": 5.6972, "loss/crossentropy": 2.4525365829467773, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17602672427892685, "step": 12642 }, { "epoch": 0.395125, "grad_norm": 3.28125, "grad_norm_var": 0.13359375, "learning_rate": 0.0001, "loss": 5.8437, "loss/crossentropy": 2.584797978401184, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17628318071365356, "step": 12644 }, { "epoch": 0.3951875, "grad_norm": 3.125, "grad_norm_var": 0.13804423014322917, "learning_rate": 0.0001, "loss": 5.8704, "loss/crossentropy": 2.6155022382736206, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17392294108867645, "step": 12646 }, { "epoch": 0.39525, "grad_norm": 2.96875, "grad_norm_var": 0.03209228515625, "learning_rate": 0.0001, "loss": 5.3547, "loss/crossentropy": 2.2829012870788574, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16147667169570923, "step": 12648 }, { "epoch": 0.3953125, "grad_norm": 3.21875, "grad_norm_var": 0.02261962890625, "learning_rate": 0.0001, "loss": 5.5823, "loss/crossentropy": 2.4431118965148926, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16587400436401367, "step": 12650 }, { "epoch": 0.395375, "grad_norm": 3.359375, "grad_norm_var": 0.021577962239583335, "learning_rate": 0.0001, "loss": 5.9593, "loss/crossentropy": 2.669771671295166, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17777632921934128, "step": 12652 }, { "epoch": 0.3954375, "grad_norm": 3.140625, "grad_norm_var": 0.020699055989583333, "learning_rate": 0.0001, "loss": 5.5884, "loss/crossentropy": 2.387265920639038, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1724553257226944, "step": 12654 }, { "epoch": 0.3955, "grad_norm": 3.4375, "grad_norm_var": 0.02095947265625, "learning_rate": 0.0001, "loss": 5.5649, "loss/crossentropy": 2.4451918601989746, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15923383086919785, "step": 12656 }, { "epoch": 0.3955625, "grad_norm": 2.921875, "grad_norm_var": 0.02203369140625, "learning_rate": 0.0001, "loss": 5.5627, "loss/crossentropy": 2.475408911705017, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16341856867074966, "step": 12658 }, { "epoch": 0.395625, "grad_norm": 3.390625, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 5.9212, "loss/crossentropy": 2.67451274394989, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17388500273227692, "step": 12660 }, { "epoch": 0.3956875, "grad_norm": 3.140625, "grad_norm_var": 0.02467041015625, "learning_rate": 0.0001, "loss": 5.924, "loss/crossentropy": 2.6317321062088013, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.18118055164813995, "step": 12662 }, { "epoch": 0.39575, "grad_norm": 3.546875, "grad_norm_var": 0.028776041666666665, "learning_rate": 0.0001, "loss": 5.7894, "loss/crossentropy": 2.5545856952667236, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17348483204841614, "step": 12664 }, { "epoch": 0.3958125, "grad_norm": 3.4375, "grad_norm_var": 0.030692545572916667, "learning_rate": 0.0001, "loss": 5.8091, "loss/crossentropy": 2.508827805519104, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1796346753835678, "step": 12666 }, { "epoch": 0.395875, "grad_norm": 3.40625, "grad_norm_var": 0.03242899576822917, "learning_rate": 0.0001, "loss": 5.7779, "loss/crossentropy": 2.5303611755371094, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17280302941799164, "step": 12668 }, { "epoch": 0.3959375, "grad_norm": 3.640625, "grad_norm_var": 0.03943583170572917, "learning_rate": 0.0001, "loss": 6.0169, "loss/crossentropy": 2.6776299476623535, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1831415742635727, "step": 12670 }, { "epoch": 0.396, "grad_norm": 3.125, "grad_norm_var": 0.03638407389322917, "learning_rate": 0.0001, "loss": 5.5852, "loss/crossentropy": 2.3899872303009033, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17030571401119232, "step": 12672 }, { "epoch": 0.3960625, "grad_norm": 3.40625, "grad_norm_var": 0.025016276041666667, "learning_rate": 0.0001, "loss": 5.9688, "loss/crossentropy": 2.6896661520004272, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17908131331205368, "step": 12674 }, { "epoch": 0.396125, "grad_norm": 3.21875, "grad_norm_var": 0.0358062744140625, "learning_rate": 0.0001, "loss": 6.1225, "loss/crossentropy": 2.7160264253616333, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18322856724262238, "step": 12676 }, { "epoch": 0.3961875, "grad_norm": 3.078125, "grad_norm_var": 0.038374837239583334, "learning_rate": 0.0001, "loss": 6.0282, "loss/crossentropy": 2.6944602727890015, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18063852190971375, "step": 12678 }, { "epoch": 0.39625, "grad_norm": 3.21875, "grad_norm_var": 0.03828023274739583, "learning_rate": 0.0001, "loss": 5.5322, "loss/crossentropy": 2.3815308809280396, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16272065788507462, "step": 12680 }, { "epoch": 0.3963125, "grad_norm": 2.953125, "grad_norm_var": 0.049072265625, "learning_rate": 0.0001, "loss": 5.7624, "loss/crossentropy": 2.572110176086426, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17059581726789474, "step": 12682 }, { "epoch": 0.396375, "grad_norm": 3.171875, "grad_norm_var": 0.0523834228515625, "learning_rate": 0.0001, "loss": 5.963, "loss/crossentropy": 2.7088600397109985, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17540912330150604, "step": 12684 }, { "epoch": 0.3964375, "grad_norm": 3.09375, "grad_norm_var": 0.041337076822916666, "learning_rate": 0.0001, "loss": 5.7898, "loss/crossentropy": 2.542935609817505, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17547084391117096, "step": 12686 }, { "epoch": 0.3965, "grad_norm": 3.328125, "grad_norm_var": 0.0449859619140625, "learning_rate": 0.0001, "loss": 5.9827, "loss/crossentropy": 2.6294325590133667, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1821979507803917, "step": 12688 }, { "epoch": 0.3965625, "grad_norm": 3.25, "grad_norm_var": 0.042529296875, "learning_rate": 0.0001, "loss": 5.5309, "loss/crossentropy": 2.3679721355438232, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16746362298727036, "step": 12690 }, { "epoch": 0.396625, "grad_norm": 2.96875, "grad_norm_var": 0.022977701822916665, "learning_rate": 0.0001, "loss": 5.819, "loss/crossentropy": 2.605017900466919, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17178434878587723, "step": 12692 }, { "epoch": 0.3966875, "grad_norm": 3.296875, "grad_norm_var": 0.019759114583333334, "learning_rate": 0.0001, "loss": 5.899, "loss/crossentropy": 2.6208107471466064, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17821407318115234, "step": 12694 }, { "epoch": 0.39675, "grad_norm": 3.0, "grad_norm_var": 0.020279947916666666, "learning_rate": 0.0001, "loss": 5.5962, "loss/crossentropy": 2.4298471212387085, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16663925349712372, "step": 12696 }, { "epoch": 0.3968125, "grad_norm": 3.234375, "grad_norm_var": 0.020051066080729166, "learning_rate": 0.0001, "loss": 5.482, "loss/crossentropy": 2.359502673149109, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16342318803071976, "step": 12698 }, { "epoch": 0.396875, "grad_norm": 3.40625, "grad_norm_var": 0.023102823893229166, "learning_rate": 0.0001, "loss": 6.0644, "loss/crossentropy": 2.720403790473938, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17932403087615967, "step": 12700 }, { "epoch": 0.3969375, "grad_norm": 3.1875, "grad_norm_var": 0.023680623372395834, "learning_rate": 0.0001, "loss": 5.8073, "loss/crossentropy": 2.5173072814941406, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1747017800807953, "step": 12702 }, { "epoch": 0.397, "grad_norm": 3.078125, "grad_norm_var": 0.018001302083333334, "learning_rate": 0.0001, "loss": 5.785, "loss/crossentropy": 2.5444486141204834, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17483435571193695, "step": 12704 }, { "epoch": 0.3970625, "grad_norm": 3.234375, "grad_norm_var": 0.019334920247395835, "learning_rate": 0.0001, "loss": 5.9081, "loss/crossentropy": 2.5757994651794434, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18128123879432678, "step": 12706 }, { "epoch": 0.397125, "grad_norm": 3.125, "grad_norm_var": 0.017365519205729166, "learning_rate": 0.0001, "loss": 5.716, "loss/crossentropy": 2.5148154497146606, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17207133769989014, "step": 12708 }, { "epoch": 0.3971875, "grad_norm": 3.171875, "grad_norm_var": 0.025504557291666667, "learning_rate": 0.0001, "loss": 6.4185, "loss/crossentropy": 2.932085871696472, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1908249631524086, "step": 12710 }, { "epoch": 0.39725, "grad_norm": 3.203125, "grad_norm_var": 0.0242095947265625, "learning_rate": 0.0001, "loss": 5.9251, "loss/crossentropy": 2.657052993774414, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17719236016273499, "step": 12712 }, { "epoch": 0.3973125, "grad_norm": 3.484375, "grad_norm_var": 0.03722330729166667, "learning_rate": 0.0001, "loss": 6.0949, "loss/crossentropy": 2.6813771724700928, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18510235846042633, "step": 12714 }, { "epoch": 0.397375, "grad_norm": 3.375, "grad_norm_var": 0.0348297119140625, "learning_rate": 0.0001, "loss": 5.9062, "loss/crossentropy": 2.5811485052108765, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17742551118135452, "step": 12716 }, { "epoch": 0.3974375, "grad_norm": 3.078125, "grad_norm_var": 0.04065755208333333, "learning_rate": 0.0001, "loss": 5.4382, "loss/crossentropy": 2.3304866552352905, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1631132811307907, "step": 12718 }, { "epoch": 0.3975, "grad_norm": 3.09375, "grad_norm_var": 0.0389068603515625, "learning_rate": 0.0001, "loss": 5.613, "loss/crossentropy": 2.435179114341736, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16504327952861786, "step": 12720 }, { "epoch": 0.3975625, "grad_norm": 3.171875, "grad_norm_var": 0.040755208333333334, "learning_rate": 0.0001, "loss": 5.7467, "loss/crossentropy": 2.530525803565979, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17318245768547058, "step": 12722 }, { "epoch": 0.397625, "grad_norm": 3.328125, "grad_norm_var": 0.04049072265625, "learning_rate": 0.0001, "loss": 5.8345, "loss/crossentropy": 2.524799942970276, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17823348939418793, "step": 12724 }, { "epoch": 0.3976875, "grad_norm": 3.171875, "grad_norm_var": 0.0399078369140625, "learning_rate": 0.0001, "loss": 5.778, "loss/crossentropy": 2.4941515922546387, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17877789586782455, "step": 12726 }, { "epoch": 0.39775, "grad_norm": 3.0625, "grad_norm_var": 0.0445709228515625, "learning_rate": 0.0001, "loss": 5.7333, "loss/crossentropy": 2.5802040100097656, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1684320867061615, "step": 12728 }, { "epoch": 0.3978125, "grad_norm": 3.890625, "grad_norm_var": 0.05390218098958333, "learning_rate": 0.0001, "loss": 5.8855, "loss/crossentropy": 2.5856465101242065, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17763860523700714, "step": 12730 }, { "epoch": 0.397875, "grad_norm": 3.234375, "grad_norm_var": 0.06889546712239583, "learning_rate": 0.0001, "loss": 5.846, "loss/crossentropy": 2.570215344429016, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1724957674741745, "step": 12732 }, { "epoch": 0.3979375, "grad_norm": 2.984375, "grad_norm_var": 0.06931966145833333, "learning_rate": 0.0001, "loss": 5.7952, "loss/crossentropy": 2.5533066987991333, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17457794398069382, "step": 12734 }, { "epoch": 0.398, "grad_norm": 3.203125, "grad_norm_var": 0.06868082682291667, "learning_rate": 0.0001, "loss": 5.5556, "loss/crossentropy": 2.3609848022460938, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16985399276018143, "step": 12736 }, { "epoch": 0.3980625, "grad_norm": 3.125, "grad_norm_var": 0.06868489583333333, "learning_rate": 0.0001, "loss": 5.675, "loss/crossentropy": 2.453263998031616, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17256765067577362, "step": 12738 }, { "epoch": 0.398125, "grad_norm": 3.203125, "grad_norm_var": 0.06577860514322917, "learning_rate": 0.0001, "loss": 5.8025, "loss/crossentropy": 2.5349299907684326, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17637009173631668, "step": 12740 }, { "epoch": 0.3981875, "grad_norm": 3.25, "grad_norm_var": 0.06015218098958333, "learning_rate": 0.0001, "loss": 5.8391, "loss/crossentropy": 2.591480016708374, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17437315732240677, "step": 12742 }, { "epoch": 0.39825, "grad_norm": 3.109375, "grad_norm_var": 0.05719401041666667, "learning_rate": 0.0001, "loss": 5.7709, "loss/crossentropy": 2.547766089439392, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17231663316488266, "step": 12744 }, { "epoch": 0.3983125, "grad_norm": 3.21875, "grad_norm_var": 0.029832967122395835, "learning_rate": 0.0001, "loss": 5.6614, "loss/crossentropy": 2.5061652660369873, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16434767842292786, "step": 12746 }, { "epoch": 0.398375, "grad_norm": 3.109375, "grad_norm_var": 0.00699462890625, "learning_rate": 0.0001, "loss": 5.6085, "loss/crossentropy": 2.4205719232559204, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17113681882619858, "step": 12748 }, { "epoch": 0.3984375, "grad_norm": 3.125, "grad_norm_var": 0.005760701497395834, "learning_rate": 0.0001, "loss": 5.6337, "loss/crossentropy": 2.4922345876693726, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16922497749328613, "step": 12750 }, { "epoch": 0.3985, "grad_norm": 3.265625, "grad_norm_var": 0.006734212239583333, "learning_rate": 0.0001, "loss": 5.8577, "loss/crossentropy": 2.6219236850738525, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17319178581237793, "step": 12752 }, { "epoch": 0.3985625, "grad_norm": 3.3125, "grad_norm_var": 0.00982666015625, "learning_rate": 0.0001, "loss": 5.772, "loss/crossentropy": 2.5100075006484985, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1738506257534027, "step": 12754 }, { "epoch": 0.398625, "grad_norm": 3.328125, "grad_norm_var": 0.0154296875, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.6387826204299927, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17135730385780334, "step": 12756 }, { "epoch": 0.3986875, "grad_norm": 3.109375, "grad_norm_var": 0.020601399739583335, "learning_rate": 0.0001, "loss": 5.8364, "loss/crossentropy": 2.608380675315857, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17358743399381638, "step": 12758 }, { "epoch": 0.39875, "grad_norm": 3.046875, "grad_norm_var": 0.021873982747395833, "learning_rate": 0.0001, "loss": 5.8293, "loss/crossentropy": 2.6063863039016724, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17150786519050598, "step": 12760 }, { "epoch": 0.3988125, "grad_norm": 3.375, "grad_norm_var": 0.044041951497395836, "learning_rate": 0.0001, "loss": 6.2794, "loss/crossentropy": 2.8802947998046875, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18639102578163147, "step": 12762 }, { "epoch": 0.398875, "grad_norm": 3.265625, "grad_norm_var": 0.044188435872395834, "learning_rate": 0.0001, "loss": 5.4637, "loss/crossentropy": 2.2918306589126587, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1683613732457161, "step": 12764 }, { "epoch": 0.3989375, "grad_norm": 3.078125, "grad_norm_var": 0.04299214680989583, "learning_rate": 0.0001, "loss": 5.8856, "loss/crossentropy": 2.683797240257263, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17095889896154404, "step": 12766 }, { "epoch": 0.399, "grad_norm": 3.078125, "grad_norm_var": 0.044352213541666664, "learning_rate": 0.0001, "loss": 5.9134, "loss/crossentropy": 2.6380010843276978, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17714772373437881, "step": 12768 }, { "epoch": 0.3990625, "grad_norm": 3.5, "grad_norm_var": 0.049820963541666666, "learning_rate": 0.0001, "loss": 5.9807, "loss/crossentropy": 2.6644222736358643, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1804550215601921, "step": 12770 }, { "epoch": 0.399125, "grad_norm": 3.078125, "grad_norm_var": 0.042704264322916664, "learning_rate": 0.0001, "loss": 5.4594, "loss/crossentropy": 2.3213056325912476, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1661573201417923, "step": 12772 }, { "epoch": 0.3991875, "grad_norm": 2.96875, "grad_norm_var": 0.043024698893229164, "learning_rate": 0.0001, "loss": 5.6452, "loss/crossentropy": 2.530721426010132, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1630120500922203, "step": 12774 }, { "epoch": 0.39925, "grad_norm": 3.125, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 5.8038, "loss/crossentropy": 2.4875781536102295, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1788884699344635, "step": 12776 }, { "epoch": 0.3993125, "grad_norm": 3.125, "grad_norm_var": 0.026292928059895835, "learning_rate": 0.0001, "loss": 5.6844, "loss/crossentropy": 2.4839333295822144, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16848218441009521, "step": 12778 }, { "epoch": 0.399375, "grad_norm": 2.9375, "grad_norm_var": 0.028058878580729165, "learning_rate": 0.0001, "loss": 5.8555, "loss/crossentropy": 2.6576311588287354, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17173823714256287, "step": 12780 }, { "epoch": 0.3994375, "grad_norm": 3.46875, "grad_norm_var": 0.03469645182291667, "learning_rate": 0.0001, "loss": 5.7478, "loss/crossentropy": 2.508821964263916, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17663374543190002, "step": 12782 }, { "epoch": 0.3995, "grad_norm": 3.28125, "grad_norm_var": 0.03271077473958333, "learning_rate": 0.0001, "loss": 5.9926, "loss/crossentropy": 2.6725634336471558, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1788768768310547, "step": 12784 }, { "epoch": 0.3995625, "grad_norm": 3.265625, "grad_norm_var": 0.025731404622395832, "learning_rate": 0.0001, "loss": 5.7898, "loss/crossentropy": 2.5669726133346558, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1730625331401825, "step": 12786 }, { "epoch": 0.399625, "grad_norm": 3.171875, "grad_norm_var": 0.028571573893229167, "learning_rate": 0.0001, "loss": 6.1232, "loss/crossentropy": 2.745548963546753, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.184643916785717, "step": 12788 }, { "epoch": 0.3996875, "grad_norm": 3.90625, "grad_norm_var": 0.04778238932291667, "learning_rate": 0.0001, "loss": 6.1705, "loss/crossentropy": 2.769607663154602, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18853065371513367, "step": 12790 }, { "epoch": 0.39975, "grad_norm": 3.0, "grad_norm_var": 0.0531646728515625, "learning_rate": 0.0001, "loss": 5.6306, "loss/crossentropy": 2.5244375467300415, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16374072432518005, "step": 12792 }, { "epoch": 0.3998125, "grad_norm": 2.90625, "grad_norm_var": 0.06271158854166667, "learning_rate": 0.0001, "loss": 5.5619, "loss/crossentropy": 2.397638440132141, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16681698709726334, "step": 12794 }, { "epoch": 0.399875, "grad_norm": 3.03125, "grad_norm_var": 0.05999348958333333, "learning_rate": 0.0001, "loss": 5.6963, "loss/crossentropy": 2.5068334341049194, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16504260897636414, "step": 12796 }, { "epoch": 0.3999375, "grad_norm": 3.171875, "grad_norm_var": 0.0570953369140625, "learning_rate": 0.0001, "loss": 5.694, "loss/crossentropy": 2.4424139261245728, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1739911586046219, "step": 12798 }, { "epoch": 0.4, "grad_norm": 3.359375, "grad_norm_var": 0.07810872395833333, "learning_rate": 0.0001, "loss": 6.0521, "loss/crossentropy": 2.644740581512451, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18995042890310287, "step": 12800 }, { "epoch": 0.4000625, "grad_norm": 3.0625, "grad_norm_var": 0.08280843098958333, "learning_rate": 0.0001, "loss": 6.062, "loss/crossentropy": 2.752319812774658, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17979149520397186, "step": 12802 }, { "epoch": 0.400125, "grad_norm": 3.796875, "grad_norm_var": 0.09868062337239583, "learning_rate": 0.0001, "loss": 5.8939, "loss/crossentropy": 2.7276742458343506, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16466771811246872, "step": 12804 }, { "epoch": 0.4001875, "grad_norm": 3.53125, "grad_norm_var": 0.08255208333333333, "learning_rate": 0.0001, "loss": 5.9977, "loss/crossentropy": 2.6202635765075684, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1830565556883812, "step": 12806 }, { "epoch": 0.40025, "grad_norm": 3.078125, "grad_norm_var": 0.07848307291666666, "learning_rate": 0.0001, "loss": 5.8544, "loss/crossentropy": 2.6176319122314453, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17563194781541824, "step": 12808 }, { "epoch": 0.4003125, "grad_norm": 3.59375, "grad_norm_var": 0.06526692708333333, "learning_rate": 0.0001, "loss": 6.0423, "loss/crossentropy": 2.7191877365112305, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18075303733348846, "step": 12810 }, { "epoch": 0.400375, "grad_norm": 3.28125, "grad_norm_var": 0.05510660807291667, "learning_rate": 0.0001, "loss": 6.155, "loss/crossentropy": 2.797845959663391, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18414809554815292, "step": 12812 }, { "epoch": 0.4004375, "grad_norm": 3.28125, "grad_norm_var": 0.05431315104166667, "learning_rate": 0.0001, "loss": 5.5796, "loss/crossentropy": 2.437540650367737, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16694405674934387, "step": 12814 }, { "epoch": 0.4005, "grad_norm": 3.0, "grad_norm_var": 0.052057902018229164, "learning_rate": 0.0001, "loss": 5.9384, "loss/crossentropy": 2.726389765739441, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17237205058336258, "step": 12816 }, { "epoch": 0.4005625, "grad_norm": 3.1875, "grad_norm_var": 0.05452372233072917, "learning_rate": 0.0001, "loss": 6.0321, "loss/crossentropy": 2.7516602277755737, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17999598383903503, "step": 12818 }, { "epoch": 0.400625, "grad_norm": 3.046875, "grad_norm_var": 0.04016520182291667, "learning_rate": 0.0001, "loss": 5.9035, "loss/crossentropy": 2.5985175371170044, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17971674352884293, "step": 12820 }, { "epoch": 0.4006875, "grad_norm": 3.515625, "grad_norm_var": 0.03284403483072917, "learning_rate": 0.0001, "loss": 6.0141, "loss/crossentropy": 2.634552836418152, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18131476640701294, "step": 12822 }, { "epoch": 0.40075, "grad_norm": 3.34375, "grad_norm_var": 0.0257232666015625, "learning_rate": 0.0001, "loss": 5.7685, "loss/crossentropy": 2.4657139778137207, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17871953547000885, "step": 12824 }, { "epoch": 0.4008125, "grad_norm": 3.46875, "grad_norm_var": 0.025519816080729167, "learning_rate": 0.0001, "loss": 5.9018, "loss/crossentropy": 2.6673959493637085, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17422594875097275, "step": 12826 }, { "epoch": 0.400875, "grad_norm": 3.4375, "grad_norm_var": 0.042878214518229166, "learning_rate": 0.0001, "loss": 5.9004, "loss/crossentropy": 2.428491711616516, "loss/hidden": 1.66015625, "loss/jsd": 0.0, "loss/logits": 0.1811741143465042, "step": 12828 }, { "epoch": 0.4009375, "grad_norm": 3.34375, "grad_norm_var": 0.0453521728515625, "learning_rate": 0.0001, "loss": 5.9111, "loss/crossentropy": 2.6243066787719727, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17711268365383148, "step": 12830 }, { "epoch": 0.401, "grad_norm": 3.296875, "grad_norm_var": 0.041291300455729166, "learning_rate": 0.0001, "loss": 5.9383, "loss/crossentropy": 2.6939502954483032, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17169564962387085, "step": 12832 }, { "epoch": 0.4010625, "grad_norm": 3.015625, "grad_norm_var": 0.04114176432291667, "learning_rate": 0.0001, "loss": 5.6311, "loss/crossentropy": 2.4560784101486206, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16984651237726212, "step": 12834 }, { "epoch": 0.401125, "grad_norm": 3.84375, "grad_norm_var": 0.0587554931640625, "learning_rate": 0.0001, "loss": 5.9381, "loss/crossentropy": 2.6205928325653076, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18097268044948578, "step": 12836 }, { "epoch": 0.4011875, "grad_norm": 3.125, "grad_norm_var": 0.06116434733072917, "learning_rate": 0.0001, "loss": 5.8675, "loss/crossentropy": 2.715549349784851, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16793277859687805, "step": 12838 }, { "epoch": 0.40125, "grad_norm": 3.21875, "grad_norm_var": 0.060628255208333336, "learning_rate": 0.0001, "loss": 5.7584, "loss/crossentropy": 2.447048544883728, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18074850738048553, "step": 12840 }, { "epoch": 0.4013125, "grad_norm": 3.125, "grad_norm_var": 0.055817667643229166, "learning_rate": 0.0001, "loss": 5.7538, "loss/crossentropy": 2.580907702445984, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17040975391864777, "step": 12842 }, { "epoch": 0.401375, "grad_norm": 3.1875, "grad_norm_var": 0.03987528483072917, "learning_rate": 0.0001, "loss": 5.7061, "loss/crossentropy": 2.4553611278533936, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16803889721632004, "step": 12844 }, { "epoch": 0.4014375, "grad_norm": 3.234375, "grad_norm_var": 0.03857014973958333, "learning_rate": 0.0001, "loss": 5.7958, "loss/crossentropy": 2.4673973321914673, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18089058995246887, "step": 12846 }, { "epoch": 0.4015, "grad_norm": 3.765625, "grad_norm_var": 0.058003743489583336, "learning_rate": 0.0001, "loss": 6.0188, "loss/crossentropy": 2.632445812225342, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18121369928121567, "step": 12848 }, { "epoch": 0.4015625, "grad_norm": 3.125, "grad_norm_var": 0.05601806640625, "learning_rate": 0.0001, "loss": 5.7952, "loss/crossentropy": 2.564656615257263, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17383144050836563, "step": 12850 }, { "epoch": 0.401625, "grad_norm": 3.421875, "grad_norm_var": 0.0319488525390625, "learning_rate": 0.0001, "loss": 5.7028, "loss/crossentropy": 2.5033124685287476, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16721860319375992, "step": 12852 }, { "epoch": 0.4016875, "grad_norm": 3.28125, "grad_norm_var": 0.032770792643229164, "learning_rate": 0.0001, "loss": 6.0856, "loss/crossentropy": 2.7508046627044678, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1823054924607277, "step": 12854 }, { "epoch": 0.40175, "grad_norm": 2.90625, "grad_norm_var": 0.040379842122395836, "learning_rate": 0.0001, "loss": 5.7225, "loss/crossentropy": 2.5294690132141113, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16851916164159775, "step": 12856 }, { "epoch": 0.4018125, "grad_norm": 3.203125, "grad_norm_var": 0.25660400390625, "learning_rate": 0.0001, "loss": 6.2473, "loss/crossentropy": 2.7770293951034546, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.19663351774215698, "step": 12858 }, { "epoch": 0.401875, "grad_norm": 3.046875, "grad_norm_var": 0.262939453125, "learning_rate": 0.0001, "loss": 5.4946, "loss/crossentropy": 2.327062249183655, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1667497232556343, "step": 12860 }, { "epoch": 0.4019375, "grad_norm": 3.1875, "grad_norm_var": 0.26521708170572916, "learning_rate": 0.0001, "loss": 5.7023, "loss/crossentropy": 2.474923253059387, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.16531268507242203, "step": 12862 }, { "epoch": 0.402, "grad_norm": 3.328125, "grad_norm_var": 0.26431376139322915, "learning_rate": 0.0001, "loss": 6.5668, "loss/crossentropy": 2.958469271659851, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.20262643694877625, "step": 12864 }, { "epoch": 0.4020625, "grad_norm": 3.59375, "grad_norm_var": 0.26106363932291665, "learning_rate": 0.0001, "loss": 6.1842, "loss/crossentropy": 2.7213059663772583, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19003939628601074, "step": 12866 }, { "epoch": 0.402125, "grad_norm": 3.109375, "grad_norm_var": 0.2629191080729167, "learning_rate": 0.0001, "loss": 6.1548, "loss/crossentropy": 2.7594075202941895, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18680787086486816, "step": 12868 }, { "epoch": 0.4021875, "grad_norm": 3.078125, "grad_norm_var": 0.269140625, "learning_rate": 0.0001, "loss": 5.7794, "loss/crossentropy": 2.5408273935317993, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17268949002027512, "step": 12870 }, { "epoch": 0.40225, "grad_norm": 3.3125, "grad_norm_var": 0.25554097493489586, "learning_rate": 0.0001, "loss": 5.8066, "loss/crossentropy": 2.6045267581939697, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1721557453274727, "step": 12872 }, { "epoch": 0.4023125, "grad_norm": 3.1875, "grad_norm_var": 0.7239908854166667, "learning_rate": 0.0001, "loss": 6.1699, "loss/crossentropy": 2.70488178730011, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1879124790430069, "step": 12874 }, { "epoch": 0.402375, "grad_norm": 3.140625, "grad_norm_var": 0.71588134765625, "learning_rate": 0.0001, "loss": 5.6145, "loss/crossentropy": 2.437716007232666, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1688496172428131, "step": 12876 }, { "epoch": 0.4024375, "grad_norm": 3.578125, "grad_norm_var": 0.7010091145833334, "learning_rate": 0.0001, "loss": 6.0098, "loss/crossentropy": 2.6502280235290527, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18361777067184448, "step": 12878 }, { "epoch": 0.4025, "grad_norm": 3.4375, "grad_norm_var": 0.7035115559895834, "learning_rate": 0.0001, "loss": 5.7489, "loss/crossentropy": 2.478968024253845, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17621035873889923, "step": 12880 }, { "epoch": 0.4025625, "grad_norm": 3.0625, "grad_norm_var": 0.7146148681640625, "learning_rate": 0.0001, "loss": 5.8506, "loss/crossentropy": 2.605592966079712, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17332439869642258, "step": 12882 }, { "epoch": 0.402625, "grad_norm": 3.171875, "grad_norm_var": 0.7188639322916667, "learning_rate": 0.0001, "loss": 5.5365, "loss/crossentropy": 2.4114911556243896, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16211073100566864, "step": 12884 }, { "epoch": 0.4026875, "grad_norm": 3.3125, "grad_norm_var": 0.712255859375, "learning_rate": 0.0001, "loss": 5.7428, "loss/crossentropy": 2.4702759981155396, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17334362864494324, "step": 12886 }, { "epoch": 0.40275, "grad_norm": 3.421875, "grad_norm_var": 0.7189280192057291, "learning_rate": 0.0001, "loss": 5.6523, "loss/crossentropy": 2.398773670196533, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17222969233989716, "step": 12888 }, { "epoch": 0.4028125, "grad_norm": 3.0, "grad_norm_var": 0.030171712239583332, "learning_rate": 0.0001, "loss": 5.6355, "loss/crossentropy": 2.4634926319122314, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16485454887151718, "step": 12890 }, { "epoch": 0.402875, "grad_norm": 3.6875, "grad_norm_var": 0.04164937337239583, "learning_rate": 0.0001, "loss": 6.3441, "loss/crossentropy": 2.9000874757766724, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1873697191476822, "step": 12892 }, { "epoch": 0.4029375, "grad_norm": 3.421875, "grad_norm_var": 0.035807291666666664, "learning_rate": 0.0001, "loss": 6.0954, "loss/crossentropy": 2.7477036714553833, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1839851438999176, "step": 12894 }, { "epoch": 0.403, "grad_norm": 2.984375, "grad_norm_var": 0.04275716145833333, "learning_rate": 0.0001, "loss": 5.8582, "loss/crossentropy": 2.6724019050598145, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1674080416560173, "step": 12896 }, { "epoch": 0.4030625, "grad_norm": 3.28125, "grad_norm_var": 0.040608723958333336, "learning_rate": 0.0001, "loss": 5.2831, "loss/crossentropy": 2.202645778656006, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15726906061172485, "step": 12898 }, { "epoch": 0.403125, "grad_norm": 3.03125, "grad_norm_var": 0.04267578125, "learning_rate": 0.0001, "loss": 5.6854, "loss/crossentropy": 2.4447492361068726, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17523667216300964, "step": 12900 }, { "epoch": 0.4031875, "grad_norm": 3.109375, "grad_norm_var": 0.04366861979166667, "learning_rate": 0.0001, "loss": 5.9955, "loss/crossentropy": 2.6723486185073853, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17958541214466095, "step": 12902 }, { "epoch": 0.40325, "grad_norm": 3.0625, "grad_norm_var": 0.04013264973958333, "learning_rate": 0.0001, "loss": 5.924, "loss/crossentropy": 2.620428204536438, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17957250773906708, "step": 12904 }, { "epoch": 0.4033125, "grad_norm": 3.125, "grad_norm_var": 0.0362945556640625, "learning_rate": 0.0001, "loss": 5.8185, "loss/crossentropy": 2.5143805742263794, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.178853839635849, "step": 12906 }, { "epoch": 0.403375, "grad_norm": 3.0, "grad_norm_var": 0.024925740559895833, "learning_rate": 0.0001, "loss": 5.6459, "loss/crossentropy": 2.4812533855438232, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16880392283201218, "step": 12908 }, { "epoch": 0.4034375, "grad_norm": 3.0625, "grad_norm_var": 0.021581013997395832, "learning_rate": 0.0001, "loss": 5.7469, "loss/crossentropy": 2.550856828689575, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17467769235372543, "step": 12910 }, { "epoch": 0.4035, "grad_norm": 2.859375, "grad_norm_var": 0.016630045572916665, "learning_rate": 0.0001, "loss": 5.5039, "loss/crossentropy": 2.4234888553619385, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1615532785654068, "step": 12912 }, { "epoch": 0.4035625, "grad_norm": 3.09375, "grad_norm_var": 0.023639933268229166, "learning_rate": 0.0001, "loss": 5.9952, "loss/crossentropy": 2.6943411827087402, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1781327947974205, "step": 12914 }, { "epoch": 0.403625, "grad_norm": 3.5625, "grad_norm_var": 0.0354644775390625, "learning_rate": 0.0001, "loss": 5.8373, "loss/crossentropy": 2.5322425365448, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17660397291183472, "step": 12916 }, { "epoch": 0.4036875, "grad_norm": 2.953125, "grad_norm_var": 0.038004557291666664, "learning_rate": 0.0001, "loss": 5.5832, "loss/crossentropy": 2.4476633071899414, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16394364833831787, "step": 12918 }, { "epoch": 0.40375, "grad_norm": 3.390625, "grad_norm_var": 0.049332682291666666, "learning_rate": 0.0001, "loss": 5.9082, "loss/crossentropy": 2.593148946762085, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17876767367124557, "step": 12920 }, { "epoch": 0.4038125, "grad_norm": 3.234375, "grad_norm_var": 0.0494293212890625, "learning_rate": 0.0001, "loss": 5.9072, "loss/crossentropy": 2.629413366317749, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17738816887140274, "step": 12922 }, { "epoch": 0.403875, "grad_norm": 3.375, "grad_norm_var": 0.048779296875, "learning_rate": 0.0001, "loss": 5.9481, "loss/crossentropy": 2.5518592596054077, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18571743369102478, "step": 12924 }, { "epoch": 0.4039375, "grad_norm": 3.125, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 5.6959, "loss/crossentropy": 2.442676544189453, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.16985361278057098, "step": 12926 }, { "epoch": 0.404, "grad_norm": 3.171875, "grad_norm_var": 0.05259501139322917, "learning_rate": 0.0001, "loss": 5.7413, "loss/crossentropy": 2.4280115365982056, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18210766464471817, "step": 12928 }, { "epoch": 0.4040625, "grad_norm": 2.90625, "grad_norm_var": 0.057860310872395834, "learning_rate": 0.0001, "loss": 5.5295, "loss/crossentropy": 2.4299877882003784, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.163080595433712, "step": 12930 }, { "epoch": 0.404125, "grad_norm": 3.390625, "grad_norm_var": 0.046483357747395836, "learning_rate": 0.0001, "loss": 5.8573, "loss/crossentropy": 2.609510898590088, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17087089270353317, "step": 12932 }, { "epoch": 0.4041875, "grad_norm": 3.625, "grad_norm_var": 0.04343973795572917, "learning_rate": 0.0001, "loss": 6.0956, "loss/crossentropy": 2.6327909231185913, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19081494212150574, "step": 12934 }, { "epoch": 0.40425, "grad_norm": 3.46875, "grad_norm_var": 0.042985026041666666, "learning_rate": 0.0001, "loss": 5.8734, "loss/crossentropy": 2.5685410499572754, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1773630529642105, "step": 12936 }, { "epoch": 0.4043125, "grad_norm": 3.203125, "grad_norm_var": 0.0441314697265625, "learning_rate": 0.0001, "loss": 6.2464, "loss/crossentropy": 2.921871542930603, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1812841147184372, "step": 12938 }, { "epoch": 0.404375, "grad_norm": 3.234375, "grad_norm_var": 0.04390360514322917, "learning_rate": 0.0001, "loss": 5.889, "loss/crossentropy": 2.590018391609192, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17560270428657532, "step": 12940 }, { "epoch": 0.4044375, "grad_norm": 3.171875, "grad_norm_var": 0.043290201822916666, "learning_rate": 0.0001, "loss": 5.9632, "loss/crossentropy": 2.6677803993225098, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17719509452581406, "step": 12942 }, { "epoch": 0.4045, "grad_norm": 3.359375, "grad_norm_var": 0.030338541666666666, "learning_rate": 0.0001, "loss": 5.5436, "loss/crossentropy": 2.4174834489822388, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16221749037504196, "step": 12944 }, { "epoch": 0.4045625, "grad_norm": 3.359375, "grad_norm_var": 0.020149739583333333, "learning_rate": 0.0001, "loss": 5.697, "loss/crossentropy": 2.472835898399353, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1708502173423767, "step": 12946 }, { "epoch": 0.404625, "grad_norm": 3.359375, "grad_norm_var": 0.019928995768229166, "learning_rate": 0.0001, "loss": 5.8782, "loss/crossentropy": 2.58474600315094, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17582834511995316, "step": 12948 }, { "epoch": 0.4046875, "grad_norm": 3.234375, "grad_norm_var": 0.011881510416666666, "learning_rate": 0.0001, "loss": 5.87, "loss/crossentropy": 2.5309667587280273, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18155810236930847, "step": 12950 }, { "epoch": 0.40475, "grad_norm": 3.0625, "grad_norm_var": 0.012528483072916667, "learning_rate": 0.0001, "loss": 5.9152, "loss/crossentropy": 2.634434938430786, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17729829996824265, "step": 12952 }, { "epoch": 0.4048125, "grad_norm": 3.15625, "grad_norm_var": 0.013963826497395833, "learning_rate": 0.0001, "loss": 5.9207, "loss/crossentropy": 2.6336101293563843, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17558201402425766, "step": 12954 }, { "epoch": 0.404875, "grad_norm": 3.296875, "grad_norm_var": 0.013996378580729166, "learning_rate": 0.0001, "loss": 5.6376, "loss/crossentropy": 2.3832263946533203, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17309315502643585, "step": 12956 }, { "epoch": 0.4049375, "grad_norm": 3.328125, "grad_norm_var": 0.0151275634765625, "learning_rate": 0.0001, "loss": 5.7511, "loss/crossentropy": 2.565544009208679, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17011910676956177, "step": 12958 }, { "epoch": 0.405, "grad_norm": 3.453125, "grad_norm_var": 0.0159332275390625, "learning_rate": 0.0001, "loss": 5.6195, "loss/crossentropy": 2.4714592695236206, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1644178256392479, "step": 12960 }, { "epoch": 0.4050625, "grad_norm": 3.234375, "grad_norm_var": 0.01646728515625, "learning_rate": 0.0001, "loss": 5.9175, "loss/crossentropy": 2.6698936223983765, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17437398433685303, "step": 12962 }, { "epoch": 0.405125, "grad_norm": 3.375, "grad_norm_var": 0.0189117431640625, "learning_rate": 0.0001, "loss": 5.839, "loss/crossentropy": 2.6037967205047607, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17118076980113983, "step": 12964 }, { "epoch": 0.4051875, "grad_norm": 3.46875, "grad_norm_var": 0.020524088541666666, "learning_rate": 0.0001, "loss": 6.0013, "loss/crossentropy": 2.6123119592666626, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17991715669631958, "step": 12966 }, { "epoch": 0.40525, "grad_norm": 3.171875, "grad_norm_var": 0.017561848958333334, "learning_rate": 0.0001, "loss": 5.9125, "loss/crossentropy": 2.702871561050415, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16783539205789566, "step": 12968 }, { "epoch": 0.4053125, "grad_norm": 3.328125, "grad_norm_var": 0.021393839518229166, "learning_rate": 0.0001, "loss": 5.6707, "loss/crossentropy": 2.442600131034851, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17046774923801422, "step": 12970 }, { "epoch": 0.405375, "grad_norm": 4.40625, "grad_norm_var": 0.11380106608072917, "learning_rate": 0.0001, "loss": 6.1768, "loss/crossentropy": 2.6450765132904053, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.19223621487617493, "step": 12972 }, { "epoch": 0.4054375, "grad_norm": 3.25, "grad_norm_var": 0.11139322916666666, "learning_rate": 0.0001, "loss": 5.9965, "loss/crossentropy": 2.73725688457489, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1770923212170601, "step": 12974 }, { "epoch": 0.4055, "grad_norm": 3.40625, "grad_norm_var": 0.11028238932291666, "learning_rate": 0.0001, "loss": 6.0203, "loss/crossentropy": 2.6498407125473022, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1819629818201065, "step": 12976 }, { "epoch": 0.4055625, "grad_norm": 3.609375, "grad_norm_var": 0.11162821451822917, "learning_rate": 0.0001, "loss": 5.9902, "loss/crossentropy": 2.645597815513611, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17977623641490936, "step": 12978 }, { "epoch": 0.405625, "grad_norm": 3.046875, "grad_norm_var": 0.12702534993489584, "learning_rate": 0.0001, "loss": 5.3293, "loss/crossentropy": 2.323233962059021, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1560777723789215, "step": 12980 }, { "epoch": 0.4056875, "grad_norm": 3.53125, "grad_norm_var": 0.13032938639322916, "learning_rate": 0.0001, "loss": 5.9522, "loss/crossentropy": 2.6342997550964355, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17944678664207458, "step": 12982 }, { "epoch": 0.40575, "grad_norm": 3.234375, "grad_norm_var": 0.12976888020833333, "learning_rate": 0.0001, "loss": 6.1011, "loss/crossentropy": 2.750393033027649, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18545960634946823, "step": 12984 }, { "epoch": 0.4058125, "grad_norm": 3.375, "grad_norm_var": 0.12395833333333334, "learning_rate": 0.0001, "loss": 6.0156, "loss/crossentropy": 2.686494469642639, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1805633381009102, "step": 12986 }, { "epoch": 0.405875, "grad_norm": 3.03125, "grad_norm_var": 0.03619791666666667, "learning_rate": 0.0001, "loss": 5.7684, "loss/crossentropy": 2.537803530693054, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17345496267080307, "step": 12988 }, { "epoch": 0.4059375, "grad_norm": 3.15625, "grad_norm_var": 0.044596354166666664, "learning_rate": 0.0001, "loss": 5.7914, "loss/crossentropy": 2.5131624937057495, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17626352608203888, "step": 12990 }, { "epoch": 0.406, "grad_norm": 3.265625, "grad_norm_var": 0.041657511393229166, "learning_rate": 0.0001, "loss": 5.7201, "loss/crossentropy": 2.47638475894928, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17163395136594772, "step": 12992 }, { "epoch": 0.4060625, "grad_norm": 3.359375, "grad_norm_var": 0.034566243489583336, "learning_rate": 0.0001, "loss": 5.7366, "loss/crossentropy": 2.4082270860671997, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18049810826778412, "step": 12994 }, { "epoch": 0.406125, "grad_norm": 3.15625, "grad_norm_var": 0.048258463541666664, "learning_rate": 0.0001, "loss": 5.2884, "loss/crossentropy": 2.2167803049087524, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15638333559036255, "step": 12996 }, { "epoch": 0.4061875, "grad_norm": 3.25, "grad_norm_var": 0.04304911295572917, "learning_rate": 0.0001, "loss": 5.9594, "loss/crossentropy": 2.6436110734939575, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.177668496966362, "step": 12998 }, { "epoch": 0.40625, "grad_norm": 2.9375, "grad_norm_var": 0.04846903483072917, "learning_rate": 0.0001, "loss": 5.648, "loss/crossentropy": 2.5511425733566284, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1643727794289589, "step": 13000 }, { "epoch": 0.4063125, "grad_norm": 3.1875, "grad_norm_var": 0.04716796875, "learning_rate": 0.0001, "loss": 6.078, "loss/crossentropy": 2.709423780441284, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1856864020228386, "step": 13002 }, { "epoch": 0.406375, "grad_norm": 3.140625, "grad_norm_var": 0.043146769205729164, "learning_rate": 0.0001, "loss": 5.9485, "loss/crossentropy": 2.7473580837249756, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17089996486902237, "step": 13004 }, { "epoch": 0.4064375, "grad_norm": 3.234375, "grad_norm_var": 0.0306793212890625, "learning_rate": 0.0001, "loss": 6.011, "loss/crossentropy": 2.718340754508972, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17848655581474304, "step": 13006 }, { "epoch": 0.4065, "grad_norm": 3.3125, "grad_norm_var": 0.03271077473958333, "learning_rate": 0.0001, "loss": 5.9108, "loss/crossentropy": 2.67184054851532, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1735036000609398, "step": 13008 }, { "epoch": 0.4065625, "grad_norm": 3.671875, "grad_norm_var": 0.049658203125, "learning_rate": 0.0001, "loss": 5.9981, "loss/crossentropy": 2.624893307685852, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1834184154868126, "step": 13010 }, { "epoch": 0.406625, "grad_norm": 3.140625, "grad_norm_var": 0.036942545572916666, "learning_rate": 0.0001, "loss": 5.404, "loss/crossentropy": 2.3047443628311157, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1599244475364685, "step": 13012 }, { "epoch": 0.4066875, "grad_norm": 4.34375, "grad_norm_var": 0.12011311848958334, "learning_rate": 0.0001, "loss": 6.179, "loss/crossentropy": 2.733541250228882, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18907804042100906, "step": 13014 }, { "epoch": 0.40675, "grad_norm": 3.484375, "grad_norm_var": 0.11612040201822917, "learning_rate": 0.0001, "loss": 5.8605, "loss/crossentropy": 2.537650942802429, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18189211934804916, "step": 13016 }, { "epoch": 0.4068125, "grad_norm": 4.1875, "grad_norm_var": 0.55986328125, "learning_rate": 0.0001, "loss": 5.989, "loss/crossentropy": 2.51317822933197, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.19601739943027496, "step": 13018 }, { "epoch": 0.406875, "grad_norm": 3.328125, "grad_norm_var": 0.5401519775390625, "learning_rate": 0.0001, "loss": 6.1557, "loss/crossentropy": 2.7317371368408203, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18848735094070435, "step": 13020 }, { "epoch": 0.4069375, "grad_norm": 3.34375, "grad_norm_var": 0.5370107014973958, "learning_rate": 0.0001, "loss": 5.7906, "loss/crossentropy": 2.585814952850342, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17086688429117203, "step": 13022 }, { "epoch": 0.407, "grad_norm": 3.3125, "grad_norm_var": 0.55888671875, "learning_rate": 0.0001, "loss": 5.6192, "loss/crossentropy": 2.5052181482315063, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16140136122703552, "step": 13024 }, { "epoch": 0.4070625, "grad_norm": 3.078125, "grad_norm_var": 0.5541341145833333, "learning_rate": 0.0001, "loss": 5.8651, "loss/crossentropy": 2.6415683031082153, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17234836518764496, "step": 13026 }, { "epoch": 0.407125, "grad_norm": 3.109375, "grad_norm_var": 0.5558878580729166, "learning_rate": 0.0001, "loss": 5.8399, "loss/crossentropy": 2.6774927377700806, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1658504456281662, "step": 13028 }, { "epoch": 0.4071875, "grad_norm": 3.328125, "grad_norm_var": 0.5191884358723958, "learning_rate": 0.0001, "loss": 5.7787, "loss/crossentropy": 2.552108645439148, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1714826375246048, "step": 13030 }, { "epoch": 0.40725, "grad_norm": 3.359375, "grad_norm_var": 0.52529296875, "learning_rate": 0.0001, "loss": 5.8789, "loss/crossentropy": 2.6156972646713257, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17476215958595276, "step": 13032 }, { "epoch": 0.4073125, "grad_norm": 3.296875, "grad_norm_var": 0.03331705729166667, "learning_rate": 0.0001, "loss": 5.8981, "loss/crossentropy": 2.586290955543518, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1811782419681549, "step": 13034 }, { "epoch": 0.407375, "grad_norm": 3.296875, "grad_norm_var": 0.024925740559895833, "learning_rate": 0.0001, "loss": 5.954, "loss/crossentropy": 2.6261991262435913, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18082567304372787, "step": 13036 }, { "epoch": 0.4074375, "grad_norm": 3.625, "grad_norm_var": 0.03329671223958333, "learning_rate": 0.0001, "loss": 6.3336, "loss/crossentropy": 2.8870203495025635, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18645723164081573, "step": 13038 }, { "epoch": 0.4075, "grad_norm": 3.5625, "grad_norm_var": 0.052179972330729164, "learning_rate": 0.0001, "loss": 6.3924, "loss/crossentropy": 2.899696469306946, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19497000426054, "step": 13040 }, { "epoch": 0.4075625, "grad_norm": 3.4375, "grad_norm_var": 0.0502349853515625, "learning_rate": 0.0001, "loss": 6.004, "loss/crossentropy": 2.6903072595596313, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17902319878339767, "step": 13042 }, { "epoch": 0.407625, "grad_norm": 3.09375, "grad_norm_var": 0.04241536458333333, "learning_rate": 0.0001, "loss": 5.4394, "loss/crossentropy": 2.2938272953033447, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16417094320058823, "step": 13044 }, { "epoch": 0.4076875, "grad_norm": 2.953125, "grad_norm_var": 0.05730794270833333, "learning_rate": 0.0001, "loss": 5.5693, "loss/crossentropy": 2.452745795249939, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16634270548820496, "step": 13046 }, { "epoch": 0.40775, "grad_norm": 3.8125, "grad_norm_var": 0.073779296875, "learning_rate": 0.0001, "loss": 5.7333, "loss/crossentropy": 2.4640876054763794, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17496556788682938, "step": 13048 }, { "epoch": 0.4078125, "grad_norm": 3.28125, "grad_norm_var": 0.12746988932291667, "learning_rate": 0.0001, "loss": 6.1964, "loss/crossentropy": 2.707811117172241, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19143328070640564, "step": 13050 }, { "epoch": 0.407875, "grad_norm": 3.171875, "grad_norm_var": 0.13976236979166667, "learning_rate": 0.0001, "loss": 5.8796, "loss/crossentropy": 2.644796133041382, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17309431731700897, "step": 13052 }, { "epoch": 0.4079375, "grad_norm": 3.21875, "grad_norm_var": 0.13677978515625, "learning_rate": 0.0001, "loss": 5.5999, "loss/crossentropy": 2.475613832473755, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16711590439081192, "step": 13054 }, { "epoch": 0.408, "grad_norm": 3.140625, "grad_norm_var": 0.11959228515625, "learning_rate": 0.0001, "loss": 6.0233, "loss/crossentropy": 2.7193907499313354, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17452675104141235, "step": 13056 }, { "epoch": 0.4080625, "grad_norm": 3.03125, "grad_norm_var": 0.151953125, "learning_rate": 0.0001, "loss": 5.8764, "loss/crossentropy": 2.55754816532135, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18032243102788925, "step": 13058 }, { "epoch": 0.408125, "grad_norm": 3.28125, "grad_norm_var": 0.14645894368489584, "learning_rate": 0.0001, "loss": 5.8501, "loss/crossentropy": 2.5090755224227905, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18020044267177582, "step": 13060 }, { "epoch": 0.4081875, "grad_norm": 3.25, "grad_norm_var": 0.1266998291015625, "learning_rate": 0.0001, "loss": 5.6616, "loss/crossentropy": 2.4366965293884277, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17405778169631958, "step": 13062 }, { "epoch": 0.40825, "grad_norm": 3.453125, "grad_norm_var": 0.11357014973958333, "learning_rate": 0.0001, "loss": 5.9129, "loss/crossentropy": 2.611645460128784, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18051308393478394, "step": 13064 }, { "epoch": 0.4083125, "grad_norm": 3.15625, "grad_norm_var": 0.053511555989583334, "learning_rate": 0.0001, "loss": 5.7801, "loss/crossentropy": 2.5562463998794556, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17199376970529556, "step": 13066 }, { "epoch": 0.408375, "grad_norm": 2.90625, "grad_norm_var": 0.05780843098958333, "learning_rate": 0.0001, "loss": 5.6068, "loss/crossentropy": 2.465224266052246, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1637713611125946, "step": 13068 }, { "epoch": 0.4084375, "grad_norm": 3.203125, "grad_norm_var": 0.0585601806640625, "learning_rate": 0.0001, "loss": 5.959, "loss/crossentropy": 2.712372303009033, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.175831601023674, "step": 13070 }, { "epoch": 0.4085, "grad_norm": 3.40625, "grad_norm_var": 0.056428019205729166, "learning_rate": 0.0001, "loss": 6.1658, "loss/crossentropy": 2.6972309350967407, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1906116008758545, "step": 13072 }, { "epoch": 0.4085625, "grad_norm": 3.3125, "grad_norm_var": 0.023014322916666666, "learning_rate": 0.0001, "loss": 6.0673, "loss/crossentropy": 2.7165971994400024, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1835119128227234, "step": 13074 }, { "epoch": 0.408625, "grad_norm": 3.390625, "grad_norm_var": 0.0250640869140625, "learning_rate": 0.0001, "loss": 5.8489, "loss/crossentropy": 2.573718786239624, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17438985407352448, "step": 13076 }, { "epoch": 0.4086875, "grad_norm": 3.359375, "grad_norm_var": 0.025275675455729167, "learning_rate": 0.0001, "loss": 6.0729, "loss/crossentropy": 2.7444902658462524, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18361777812242508, "step": 13078 }, { "epoch": 0.40875, "grad_norm": 3.0, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 5.6211, "loss/crossentropy": 2.511841297149658, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16405534744262695, "step": 13080 }, { "epoch": 0.4088125, "grad_norm": 4.59375, "grad_norm_var": 0.14172770182291666, "learning_rate": 0.0001, "loss": 6.2488, "loss/crossentropy": 2.8049396276474, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1885310783982277, "step": 13082 }, { "epoch": 0.408875, "grad_norm": 3.484375, "grad_norm_var": 0.12698160807291667, "learning_rate": 0.0001, "loss": 5.9688, "loss/crossentropy": 2.6219193935394287, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17960664629936218, "step": 13084 }, { "epoch": 0.4089375, "grad_norm": 3.484375, "grad_norm_var": 0.12097880045572916, "learning_rate": 0.0001, "loss": 6.251, "loss/crossentropy": 2.896095871925354, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1819765493273735, "step": 13086 }, { "epoch": 0.409, "grad_norm": 3.265625, "grad_norm_var": 0.13064676920572918, "learning_rate": 0.0001, "loss": 6.0631, "loss/crossentropy": 2.682563066482544, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.183364599943161, "step": 13088 }, { "epoch": 0.4090625, "grad_norm": 3.25, "grad_norm_var": 0.13582255045572916, "learning_rate": 0.0001, "loss": 5.8131, "loss/crossentropy": 2.4506473541259766, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1792103797197342, "step": 13090 }, { "epoch": 0.409125, "grad_norm": 3.140625, "grad_norm_var": 0.14191080729166666, "learning_rate": 0.0001, "loss": 6.05, "loss/crossentropy": 2.672413468360901, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18463655561208725, "step": 13092 }, { "epoch": 0.4091875, "grad_norm": 3.078125, "grad_norm_var": 0.15699462890625, "learning_rate": 0.0001, "loss": 5.4334, "loss/crossentropy": 2.3502501249313354, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16143521666526794, "step": 13094 }, { "epoch": 0.40925, "grad_norm": 3.28125, "grad_norm_var": 0.1360504150390625, "learning_rate": 0.0001, "loss": 5.8177, "loss/crossentropy": 2.548230767250061, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1777261570096016, "step": 13096 }, { "epoch": 0.4093125, "grad_norm": 3.375, "grad_norm_var": 0.05133056640625, "learning_rate": 0.0001, "loss": 5.6474, "loss/crossentropy": 2.5038857460021973, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16512799263000488, "step": 13098 }, { "epoch": 0.409375, "grad_norm": 3.265625, "grad_norm_var": 0.04692281087239583, "learning_rate": 0.0001, "loss": 5.8479, "loss/crossentropy": 2.573588013648987, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17430279403924942, "step": 13100 }, { "epoch": 0.4094375, "grad_norm": 3.828125, "grad_norm_var": 0.06464436848958334, "learning_rate": 0.0001, "loss": 5.7626, "loss/crossentropy": 2.4751532077789307, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17444677650928497, "step": 13102 }, { "epoch": 0.4095, "grad_norm": 3.421875, "grad_norm_var": 0.050837198893229164, "learning_rate": 0.0001, "loss": 5.7975, "loss/crossentropy": 2.5101910829544067, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17795388400554657, "step": 13104 }, { "epoch": 0.4095625, "grad_norm": 3.59375, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 5.7223, "loss/crossentropy": 2.4912636280059814, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1719299554824829, "step": 13106 }, { "epoch": 0.409625, "grad_norm": 3.03125, "grad_norm_var": 0.05935872395833333, "learning_rate": 0.0001, "loss": 5.8821, "loss/crossentropy": 2.6286439895629883, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17495766282081604, "step": 13108 }, { "epoch": 0.4096875, "grad_norm": 2.875, "grad_norm_var": 0.0646484375, "learning_rate": 0.0001, "loss": 5.6972, "loss/crossentropy": 2.5340648889541626, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17100301384925842, "step": 13110 }, { "epoch": 0.40975, "grad_norm": 3.15625, "grad_norm_var": 0.06648661295572916, "learning_rate": 0.0001, "loss": 5.6622, "loss/crossentropy": 2.534527063369751, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1682344302535057, "step": 13112 }, { "epoch": 0.4098125, "grad_norm": 3.234375, "grad_norm_var": 0.07119140625, "learning_rate": 0.0001, "loss": 5.444, "loss/crossentropy": 2.3773000240325928, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15901631116867065, "step": 13114 }, { "epoch": 0.409875, "grad_norm": 3.265625, "grad_norm_var": 0.06913655598958333, "learning_rate": 0.0001, "loss": 5.7806, "loss/crossentropy": 2.476312279701233, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18160535395145416, "step": 13116 }, { "epoch": 0.4099375, "grad_norm": 2.875, "grad_norm_var": 0.051122029622395836, "learning_rate": 0.0001, "loss": 5.6636, "loss/crossentropy": 2.5202556848526, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1658995822072029, "step": 13118 }, { "epoch": 0.41, "grad_norm": 3.34375, "grad_norm_var": 0.048746744791666664, "learning_rate": 0.0001, "loss": 5.9608, "loss/crossentropy": 2.648427128791809, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18084564805030823, "step": 13120 }, { "epoch": 0.4100625, "grad_norm": 3.359375, "grad_norm_var": 0.034619140625, "learning_rate": 0.0001, "loss": 5.8285, "loss/crossentropy": 2.50955867767334, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18032754957675934, "step": 13122 }, { "epoch": 0.410125, "grad_norm": 3.4375, "grad_norm_var": 0.03951416015625, "learning_rate": 0.0001, "loss": 5.8366, "loss/crossentropy": 2.528256058692932, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.18278292566537857, "step": 13124 }, { "epoch": 0.4101875, "grad_norm": 2.953125, "grad_norm_var": 0.0361236572265625, "learning_rate": 0.0001, "loss": 5.7716, "loss/crossentropy": 2.579535961151123, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17076754570007324, "step": 13126 }, { "epoch": 0.41025, "grad_norm": 3.46875, "grad_norm_var": 0.04003499348958333, "learning_rate": 0.0001, "loss": 5.9524, "loss/crossentropy": 2.6309748888015747, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1805766299366951, "step": 13128 }, { "epoch": 0.4103125, "grad_norm": 5.34375, "grad_norm_var": 0.3024648030598958, "learning_rate": 0.0001, "loss": 6.2001, "loss/crossentropy": 2.7218735218048096, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19196175783872604, "step": 13130 }, { "epoch": 0.410375, "grad_norm": 3.40625, "grad_norm_var": 0.30048828125, "learning_rate": 0.0001, "loss": 5.9113, "loss/crossentropy": 2.632519841194153, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17670893669128418, "step": 13132 }, { "epoch": 0.4104375, "grad_norm": 2.953125, "grad_norm_var": 0.30227457682291664, "learning_rate": 0.0001, "loss": 5.7421, "loss/crossentropy": 2.5946797132492065, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17098701000213623, "step": 13134 }, { "epoch": 0.4105, "grad_norm": 3.234375, "grad_norm_var": 0.31245829264322916, "learning_rate": 0.0001, "loss": 5.7763, "loss/crossentropy": 2.5710290670394897, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17091825604438782, "step": 13136 }, { "epoch": 0.4105625, "grad_norm": 3.59375, "grad_norm_var": 0.32151590983072914, "learning_rate": 0.0001, "loss": 6.1374, "loss/crossentropy": 2.688127636909485, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19023503363132477, "step": 13138 }, { "epoch": 0.410625, "grad_norm": 3.40625, "grad_norm_var": 0.32194722493489586, "learning_rate": 0.0001, "loss": 5.8991, "loss/crossentropy": 2.6207317113876343, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17588258534669876, "step": 13140 }, { "epoch": 0.4106875, "grad_norm": 3.5625, "grad_norm_var": 0.30554097493489585, "learning_rate": 0.0001, "loss": 5.9693, "loss/crossentropy": 2.6185660362243652, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1819438636302948, "step": 13142 }, { "epoch": 0.41075, "grad_norm": 3.484375, "grad_norm_var": 0.3094716389973958, "learning_rate": 0.0001, "loss": 5.6599, "loss/crossentropy": 2.396067976951599, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17325875163078308, "step": 13144 }, { "epoch": 0.4108125, "grad_norm": 3.296875, "grad_norm_var": 0.0578765869140625, "learning_rate": 0.0001, "loss": 6.0223, "loss/crossentropy": 2.7240883111953735, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17708660662174225, "step": 13146 }, { "epoch": 0.410875, "grad_norm": 3.015625, "grad_norm_var": 0.06599934895833333, "learning_rate": 0.0001, "loss": 5.8849, "loss/crossentropy": 2.707707166671753, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1681085005402565, "step": 13148 }, { "epoch": 0.4109375, "grad_norm": 3.359375, "grad_norm_var": 0.05093994140625, "learning_rate": 0.0001, "loss": 5.9366, "loss/crossentropy": 2.665693759918213, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17747745662927628, "step": 13150 }, { "epoch": 0.411, "grad_norm": 3.078125, "grad_norm_var": 0.058137003580729166, "learning_rate": 0.0001, "loss": 5.3523, "loss/crossentropy": 2.3129165172576904, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16018399596214294, "step": 13152 }, { "epoch": 0.4110625, "grad_norm": 3.34375, "grad_norm_var": 0.164404296875, "learning_rate": 0.0001, "loss": 5.7427, "loss/crossentropy": 2.3883581161499023, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18270309269428253, "step": 13154 }, { "epoch": 0.411125, "grad_norm": 3.1875, "grad_norm_var": 0.16544596354166666, "learning_rate": 0.0001, "loss": 5.5893, "loss/crossentropy": 2.4184921979904175, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1682557836174965, "step": 13156 }, { "epoch": 0.4111875, "grad_norm": 4.0, "grad_norm_var": 0.19062398274739584, "learning_rate": 0.0001, "loss": 5.793, "loss/crossentropy": 2.558933734893799, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1710628792643547, "step": 13158 }, { "epoch": 0.41125, "grad_norm": 3.34375, "grad_norm_var": 0.18707275390625, "learning_rate": 0.0001, "loss": 5.912, "loss/crossentropy": 2.6660226583480835, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17577102780342102, "step": 13160 }, { "epoch": 0.4113125, "grad_norm": 3.390625, "grad_norm_var": 0.19097900390625, "learning_rate": 0.0001, "loss": 5.8179, "loss/crossentropy": 2.5855531692504883, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17089534550905228, "step": 13162 }, { "epoch": 0.411375, "grad_norm": 3.359375, "grad_norm_var": 0.18017578125, "learning_rate": 0.0001, "loss": 6.1325, "loss/crossentropy": 2.7981022596359253, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1810910701751709, "step": 13164 }, { "epoch": 0.4114375, "grad_norm": 4.75, "grad_norm_var": 0.2921549479166667, "learning_rate": 0.0001, "loss": 6.0312, "loss/crossentropy": 2.654681921005249, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1841331049799919, "step": 13166 }, { "epoch": 0.4115, "grad_norm": 3.203125, "grad_norm_var": 0.2732086181640625, "learning_rate": 0.0001, "loss": 5.6075, "loss/crossentropy": 2.474108576774597, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16568316519260406, "step": 13168 }, { "epoch": 0.4115625, "grad_norm": 2.96875, "grad_norm_var": 0.18518473307291666, "learning_rate": 0.0001, "loss": 5.9398, "loss/crossentropy": 2.6023415327072144, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17827922105789185, "step": 13170 }, { "epoch": 0.411625, "grad_norm": 3.328125, "grad_norm_var": 0.18005269368489582, "learning_rate": 0.0001, "loss": 6.2196, "loss/crossentropy": 2.844622850418091, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1832045540213585, "step": 13172 }, { "epoch": 0.4116875, "grad_norm": 3.265625, "grad_norm_var": 0.15696512858072917, "learning_rate": 0.0001, "loss": 6.0967, "loss/crossentropy": 2.647633671760559, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19255923479795456, "step": 13174 }, { "epoch": 0.41175, "grad_norm": 3.3125, "grad_norm_var": 0.15987955729166667, "learning_rate": 0.0001, "loss": 5.9396, "loss/crossentropy": 2.614232897758484, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18292462080717087, "step": 13176 }, { "epoch": 0.4118125, "grad_norm": 3.140625, "grad_norm_var": 0.1589019775390625, "learning_rate": 0.0001, "loss": 5.5491, "loss/crossentropy": 2.373845338821411, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16595923900604248, "step": 13178 }, { "epoch": 0.411875, "grad_norm": 3.234375, "grad_norm_var": 0.16457417805989583, "learning_rate": 0.0001, "loss": 5.872, "loss/crossentropy": 2.5679699182510376, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17571911960840225, "step": 13180 }, { "epoch": 0.4119375, "grad_norm": 3.328125, "grad_norm_var": 0.023075358072916666, "learning_rate": 0.0001, "loss": 6.1259, "loss/crossentropy": 2.7394293546676636, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18317710608243942, "step": 13182 }, { "epoch": 0.412, "grad_norm": 3.328125, "grad_norm_var": 0.022587076822916666, "learning_rate": 0.0001, "loss": 5.9327, "loss/crossentropy": 2.6078226566314697, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17662370204925537, "step": 13184 }, { "epoch": 0.4120625, "grad_norm": 3.15625, "grad_norm_var": 0.015913899739583334, "learning_rate": 0.0001, "loss": 5.8175, "loss/crossentropy": 2.5503119230270386, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17554441094398499, "step": 13186 }, { "epoch": 0.412125, "grad_norm": 3.515625, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 6.1154, "loss/crossentropy": 2.7444063425064087, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18436866998672485, "step": 13188 }, { "epoch": 0.4121875, "grad_norm": 3.3125, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 5.5464, "loss/crossentropy": 2.332979202270508, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17055802047252655, "step": 13190 }, { "epoch": 0.41225, "grad_norm": 3.09375, "grad_norm_var": 0.01793212890625, "learning_rate": 0.0001, "loss": 5.7799, "loss/crossentropy": 2.530150532722473, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17536704242229462, "step": 13192 }, { "epoch": 0.4123125, "grad_norm": 3.1875, "grad_norm_var": 0.017317708333333334, "learning_rate": 0.0001, "loss": 5.5503, "loss/crossentropy": 2.3143259286880493, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17476805299520493, "step": 13194 }, { "epoch": 0.412375, "grad_norm": 4.53125, "grad_norm_var": 0.114111328125, "learning_rate": 0.0001, "loss": 6.0064, "loss/crossentropy": 2.6239311695098877, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18238778412342072, "step": 13196 }, { "epoch": 0.4124375, "grad_norm": 3.359375, "grad_norm_var": 0.11399637858072917, "learning_rate": 0.0001, "loss": 6.1141, "loss/crossentropy": 2.669206738471985, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19214162975549698, "step": 13198 }, { "epoch": 0.4125, "grad_norm": 3.078125, "grad_norm_var": 0.11969401041666666, "learning_rate": 0.0001, "loss": 5.491, "loss/crossentropy": 2.3696892261505127, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1629132702946663, "step": 13200 }, { "epoch": 0.4125625, "grad_norm": 3.046875, "grad_norm_var": 0.11653544108072916, "learning_rate": 0.0001, "loss": 5.5503, "loss/crossentropy": 2.4425567388534546, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16076939553022385, "step": 13202 }, { "epoch": 0.412625, "grad_norm": 3.125, "grad_norm_var": 0.1280181884765625, "learning_rate": 0.0001, "loss": 5.6681, "loss/crossentropy": 2.4140864610671997, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17501512169837952, "step": 13204 }, { "epoch": 0.4126875, "grad_norm": 3.3125, "grad_norm_var": 0.12833658854166666, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.585092306137085, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16854554414749146, "step": 13206 }, { "epoch": 0.41275, "grad_norm": 3.015625, "grad_norm_var": 0.1355621337890625, "learning_rate": 0.0001, "loss": 5.9727, "loss/crossentropy": 2.6454169750213623, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18116343766450882, "step": 13208 }, { "epoch": 0.4128125, "grad_norm": 3.0625, "grad_norm_var": 0.15442301432291666, "learning_rate": 0.0001, "loss": 5.6759, "loss/crossentropy": 2.4061704874038696, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1718936413526535, "step": 13210 }, { "epoch": 0.412875, "grad_norm": 3.453125, "grad_norm_var": 0.06181538899739583, "learning_rate": 0.0001, "loss": 6.1976, "loss/crossentropy": 2.7651203870773315, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19011814892292023, "step": 13212 }, { "epoch": 0.4129375, "grad_norm": 3.296875, "grad_norm_var": 0.061930338541666664, "learning_rate": 0.0001, "loss": 5.996, "loss/crossentropy": 2.701950192451477, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17784158140420914, "step": 13214 }, { "epoch": 0.413, "grad_norm": 3.109375, "grad_norm_var": 0.06635640462239584, "learning_rate": 0.0001, "loss": 5.6419, "loss/crossentropy": 2.512939691543579, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1621171534061432, "step": 13216 }, { "epoch": 0.4130625, "grad_norm": 3.59375, "grad_norm_var": 0.07097066243489583, "learning_rate": 0.0001, "loss": 5.8349, "loss/crossentropy": 2.527301073074341, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17724359780550003, "step": 13218 }, { "epoch": 0.413125, "grad_norm": 3.40625, "grad_norm_var": 0.057249959309895834, "learning_rate": 0.0001, "loss": 6.2121, "loss/crossentropy": 2.728484869003296, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19289369881153107, "step": 13220 }, { "epoch": 0.4131875, "grad_norm": 3.484375, "grad_norm_var": 0.060770670572916664, "learning_rate": 0.0001, "loss": 5.9931, "loss/crossentropy": 2.6184688806533813, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18395137786865234, "step": 13222 }, { "epoch": 0.41325, "grad_norm": 3.1875, "grad_norm_var": 0.06220296223958333, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.5066728591918945, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16772903501987457, "step": 13224 }, { "epoch": 0.4133125, "grad_norm": 3.21875, "grad_norm_var": 0.04133199055989583, "learning_rate": 0.0001, "loss": 5.925, "loss/crossentropy": 2.6199041604995728, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1742590218782425, "step": 13226 }, { "epoch": 0.413375, "grad_norm": 3.484375, "grad_norm_var": 0.03858133951822917, "learning_rate": 0.0001, "loss": 5.825, "loss/crossentropy": 2.541438341140747, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17405883967876434, "step": 13228 }, { "epoch": 0.4134375, "grad_norm": 3.171875, "grad_norm_var": 0.04065348307291667, "learning_rate": 0.0001, "loss": 5.7935, "loss/crossentropy": 2.605634570121765, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17074382305145264, "step": 13230 }, { "epoch": 0.4135, "grad_norm": 3.1875, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 6.3648, "loss/crossentropy": 2.9099632501602173, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.19235942512750626, "step": 13232 }, { "epoch": 0.4135625, "grad_norm": 3.03125, "grad_norm_var": 0.03572489420572917, "learning_rate": 0.0001, "loss": 5.8881, "loss/crossentropy": 2.6659469604492188, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17104150354862213, "step": 13234 }, { "epoch": 0.413625, "grad_norm": 2.96875, "grad_norm_var": 0.04006245930989583, "learning_rate": 0.0001, "loss": 5.4022, "loss/crossentropy": 2.2921340465545654, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1602272465825081, "step": 13236 }, { "epoch": 0.4136875, "grad_norm": 3.125, "grad_norm_var": 0.03375244140625, "learning_rate": 0.0001, "loss": 5.7986, "loss/crossentropy": 2.5155210494995117, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17713217437267303, "step": 13238 }, { "epoch": 0.41375, "grad_norm": 3.15625, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 5.7667, "loss/crossentropy": 2.5459882020950317, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17519887536764145, "step": 13240 }, { "epoch": 0.4138125, "grad_norm": 3.40625, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 5.9134, "loss/crossentropy": 2.598848342895508, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17911456525325775, "step": 13242 }, { "epoch": 0.413875, "grad_norm": 3.359375, "grad_norm_var": 0.026204427083333332, "learning_rate": 0.0001, "loss": 5.7157, "loss/crossentropy": 2.5025155544281006, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1736644208431244, "step": 13244 }, { "epoch": 0.4139375, "grad_norm": 3.0625, "grad_norm_var": 0.0280426025390625, "learning_rate": 0.0001, "loss": 5.8131, "loss/crossentropy": 2.613903760910034, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17187371850013733, "step": 13246 }, { "epoch": 0.414, "grad_norm": 3.0625, "grad_norm_var": 0.022150675455729168, "learning_rate": 0.0001, "loss": 5.5436, "loss/crossentropy": 2.4482144117355347, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.162273108959198, "step": 13248 }, { "epoch": 0.4140625, "grad_norm": 3.015625, "grad_norm_var": 0.022411092122395834, "learning_rate": 0.0001, "loss": 5.9598, "loss/crossentropy": 2.663753390312195, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17960526049137115, "step": 13250 }, { "epoch": 0.414125, "grad_norm": 2.96875, "grad_norm_var": 0.0251953125, "learning_rate": 0.0001, "loss": 5.511, "loss/crossentropy": 2.4860671758651733, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15561538934707642, "step": 13252 }, { "epoch": 0.4141875, "grad_norm": 3.078125, "grad_norm_var": 0.020979817708333334, "learning_rate": 0.0001, "loss": 5.5277, "loss/crossentropy": 2.4098732471466064, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1629585400223732, "step": 13254 }, { "epoch": 0.41425, "grad_norm": 3.203125, "grad_norm_var": 0.022998046875, "learning_rate": 0.0001, "loss": 6.0683, "loss/crossentropy": 2.7509347200393677, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18095697462558746, "step": 13256 }, { "epoch": 0.4143125, "grad_norm": 3.265625, "grad_norm_var": 0.017463175455729167, "learning_rate": 0.0001, "loss": 5.9674, "loss/crossentropy": 2.6672979593276978, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17844686657190323, "step": 13258 }, { "epoch": 0.414375, "grad_norm": 3.265625, "grad_norm_var": 0.015331013997395834, "learning_rate": 0.0001, "loss": 5.6512, "loss/crossentropy": 2.4591602087020874, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16842012852430344, "step": 13260 }, { "epoch": 0.4144375, "grad_norm": 3.265625, "grad_norm_var": 0.019791666666666666, "learning_rate": 0.0001, "loss": 5.7192, "loss/crossentropy": 2.5459213256835938, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16615993529558182, "step": 13262 }, { "epoch": 0.4145, "grad_norm": 3.203125, "grad_norm_var": 0.017039998372395834, "learning_rate": 0.0001, "loss": 5.7789, "loss/crossentropy": 2.60288667678833, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1707262471318245, "step": 13264 }, { "epoch": 0.4145625, "grad_norm": 3.046875, "grad_norm_var": 0.017113240559895833, "learning_rate": 0.0001, "loss": 5.8823, "loss/crossentropy": 2.6379607915878296, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1728760376572609, "step": 13266 }, { "epoch": 0.414625, "grad_norm": 3.5625, "grad_norm_var": 0.017464192708333333, "learning_rate": 0.0001, "loss": 6.1463, "loss/crossentropy": 2.7377195358276367, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1869480088353157, "step": 13268 }, { "epoch": 0.4146875, "grad_norm": 221249536.0, "grad_norm_var": 3059459732353980.5, "learning_rate": 0.0001, "loss": 8.1439, "loss/crossentropy": 2.4181610345840454, "loss/hidden": 2.40234375, "loss/jsd": 0.0, "loss/logits": 0.332338847219944, "step": 13270 }, { "epoch": 0.41475, "grad_norm": 3.953125, "grad_norm_var": 3059459730020489.5, "learning_rate": 0.0001, "loss": 6.1215, "loss/crossentropy": 2.6964603662490845, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18469253927469254, "step": 13272 }, { "epoch": 0.4148125, "grad_norm": 2.96875, "grad_norm_var": 3059459729905255.0, "learning_rate": 0.0001, "loss": 5.6707, "loss/crossentropy": 2.3926303386688232, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1719427853822708, "step": 13274 }, { "epoch": 0.414875, "grad_norm": 3.1875, "grad_norm_var": 3059459729501935.5, "learning_rate": 0.0001, "loss": 5.7719, "loss/crossentropy": 2.4425740242004395, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17785116285085678, "step": 13276 }, { "epoch": 0.4149375, "grad_norm": 3.109375, "grad_norm_var": 3059459729790021.0, "learning_rate": 0.0001, "loss": 5.8638, "loss/crossentropy": 2.6428544521331787, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17248062789440155, "step": 13278 }, { "epoch": 0.415, "grad_norm": 3.5, "grad_norm_var": 3059459729645978.5, "learning_rate": 0.0001, "loss": 6.1344, "loss/crossentropy": 2.7529033422470093, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18854503333568573, "step": 13280 }, { "epoch": 0.4150625, "grad_norm": 3.296875, "grad_norm_var": 3059459728954573.5, "learning_rate": 0.0001, "loss": 5.7005, "loss/crossentropy": 2.411180257797241, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17502257227897644, "step": 13282 }, { "epoch": 0.415125, "grad_norm": 5.09375, "grad_norm_var": 3059459726966785.0, "learning_rate": 0.0001, "loss": 5.3624, "loss/crossentropy": 2.238004684448242, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1600944846868515, "step": 13284 }, { "epoch": 0.4151875, "grad_norm": 3.328125, "grad_norm_var": 0.27325846354166666, "learning_rate": 0.0001, "loss": 5.8273, "loss/crossentropy": 2.4858503341674805, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1786753088235855, "step": 13286 }, { "epoch": 0.41525, "grad_norm": 3.125, "grad_norm_var": 0.2527496337890625, "learning_rate": 0.0001, "loss": 5.8173, "loss/crossentropy": 2.5401535034179688, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1765388399362564, "step": 13288 }, { "epoch": 0.4153125, "grad_norm": 3.40625, "grad_norm_var": 0.23998921712239582, "learning_rate": 0.0001, "loss": 5.8668, "loss/crossentropy": 2.60434091091156, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17507880926132202, "step": 13290 }, { "epoch": 0.415375, "grad_norm": 3.40625, "grad_norm_var": 0.2409088134765625, "learning_rate": 0.0001, "loss": 5.8486, "loss/crossentropy": 2.604510545730591, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17284392565488815, "step": 13292 }, { "epoch": 0.4154375, "grad_norm": 4.53125, "grad_norm_var": 0.3153635660807292, "learning_rate": 0.0001, "loss": 5.9829, "loss/crossentropy": 2.625877618789673, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1810181587934494, "step": 13294 }, { "epoch": 0.4155, "grad_norm": 3.703125, "grad_norm_var": 0.3130279541015625, "learning_rate": 0.0001, "loss": 5.9803, "loss/crossentropy": 2.6242889165878296, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18247459828853607, "step": 13296 }, { "epoch": 0.4155625, "grad_norm": 3.046875, "grad_norm_var": 0.3250071207682292, "learning_rate": 0.0001, "loss": 5.6778, "loss/crossentropy": 2.4567201137542725, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17054059356451035, "step": 13298 }, { "epoch": 0.415625, "grad_norm": 3.359375, "grad_norm_var": 0.142041015625, "learning_rate": 0.0001, "loss": 5.6596, "loss/crossentropy": 2.473236560821533, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16746355593204498, "step": 13300 }, { "epoch": 0.4156875, "grad_norm": 3.234375, "grad_norm_var": 0.14356180826822917, "learning_rate": 0.0001, "loss": 5.7374, "loss/crossentropy": 2.4581050872802734, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17090317606925964, "step": 13302 }, { "epoch": 0.41575, "grad_norm": 4.09375, "grad_norm_var": 0.333984375, "learning_rate": 0.0001, "loss": 6.5727, "loss/crossentropy": 2.818481206893921, "loss/hidden": 1.640625, "loss/jsd": 0.0, "loss/logits": 0.2113558128476143, "step": 13304 }, { "epoch": 0.4158125, "grad_norm": 3.390625, "grad_norm_var": 0.374267578125, "learning_rate": 0.0001, "loss": 5.9868, "loss/crossentropy": 2.625677466392517, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18337451666593552, "step": 13306 }, { "epoch": 0.415875, "grad_norm": 3.15625, "grad_norm_var": 0.3784739176432292, "learning_rate": 0.0001, "loss": 5.8802, "loss/crossentropy": 2.577824592590332, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17985156923532486, "step": 13308 }, { "epoch": 0.4159375, "grad_norm": 3.046875, "grad_norm_var": 0.34845377604166666, "learning_rate": 0.0001, "loss": 5.8672, "loss/crossentropy": 2.6495500802993774, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17372021824121475, "step": 13310 }, { "epoch": 0.416, "grad_norm": 3.265625, "grad_norm_var": 0.34324544270833335, "learning_rate": 0.0001, "loss": 5.8939, "loss/crossentropy": 2.5945329666137695, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17720364034175873, "step": 13312 }, { "epoch": 0.4160625, "grad_norm": 3.265625, "grad_norm_var": 0.3314127604166667, "learning_rate": 0.0001, "loss": 6.0797, "loss/crossentropy": 2.772068738937378, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17959268391132355, "step": 13314 }, { "epoch": 0.416125, "grad_norm": 3.578125, "grad_norm_var": 0.32737528483072914, "learning_rate": 0.0001, "loss": 5.9208, "loss/crossentropy": 2.5992554426193237, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17902611941099167, "step": 13316 }, { "epoch": 0.4161875, "grad_norm": 3.234375, "grad_norm_var": 0.3333160400390625, "learning_rate": 0.0001, "loss": 6.0433, "loss/crossentropy": 2.6992595195770264, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18244589865207672, "step": 13318 }, { "epoch": 0.41625, "grad_norm": 3.078125, "grad_norm_var": 0.12014058430989584, "learning_rate": 0.0001, "loss": 6.0013, "loss/crossentropy": 2.6596258878707886, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17987225949764252, "step": 13320 }, { "epoch": 0.4163125, "grad_norm": 3.3125, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 5.7675, "loss/crossentropy": 2.4878780841827393, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17562340945005417, "step": 13322 }, { "epoch": 0.416375, "grad_norm": 3.09375, "grad_norm_var": 0.024117024739583333, "learning_rate": 0.0001, "loss": 5.6204, "loss/crossentropy": 2.3957300186157227, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1724710911512375, "step": 13324 }, { "epoch": 0.4164375, "grad_norm": 3.609375, "grad_norm_var": 0.030855305989583335, "learning_rate": 0.0001, "loss": 5.8249, "loss/crossentropy": 2.5164222717285156, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17537520080804825, "step": 13326 }, { "epoch": 0.4165, "grad_norm": 3.09375, "grad_norm_var": 0.03662109375, "learning_rate": 0.0001, "loss": 5.6166, "loss/crossentropy": 2.4366323947906494, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1652638539671898, "step": 13328 }, { "epoch": 0.4165625, "grad_norm": 3.125, "grad_norm_var": 0.04049072265625, "learning_rate": 0.0001, "loss": 5.9916, "loss/crossentropy": 2.7603050470352173, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17508035898208618, "step": 13330 }, { "epoch": 0.416625, "grad_norm": 3.375, "grad_norm_var": 0.03472900390625, "learning_rate": 0.0001, "loss": 5.9878, "loss/crossentropy": 2.6948909759521484, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1773415356874466, "step": 13332 }, { "epoch": 0.4166875, "grad_norm": 3.078125, "grad_norm_var": 0.0405426025390625, "learning_rate": 0.0001, "loss": 5.7725, "loss/crossentropy": 2.5620919466018677, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1694791242480278, "step": 13334 }, { "epoch": 0.41675, "grad_norm": 3.296875, "grad_norm_var": 0.0394439697265625, "learning_rate": 0.0001, "loss": 5.5998, "loss/crossentropy": 2.45719838142395, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16699153184890747, "step": 13336 }, { "epoch": 0.4168125, "grad_norm": 3.703125, "grad_norm_var": 0.053221638997395834, "learning_rate": 0.0001, "loss": 6.1197, "loss/crossentropy": 2.694188714027405, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.188251793384552, "step": 13338 }, { "epoch": 0.416875, "grad_norm": 3.1875, "grad_norm_var": 0.05523681640625, "learning_rate": 0.0001, "loss": 6.0335, "loss/crossentropy": 2.7140097618103027, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18156076222658157, "step": 13340 }, { "epoch": 0.4169375, "grad_norm": 3.234375, "grad_norm_var": 0.0443511962890625, "learning_rate": 0.0001, "loss": 6.0226, "loss/crossentropy": 2.661650538444519, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18414033949375153, "step": 13342 }, { "epoch": 0.417, "grad_norm": 3.046875, "grad_norm_var": 0.0461334228515625, "learning_rate": 0.0001, "loss": 5.7235, "loss/crossentropy": 2.478771209716797, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1717337816953659, "step": 13344 }, { "epoch": 0.4170625, "grad_norm": 3.4375, "grad_norm_var": 0.0412506103515625, "learning_rate": 0.0001, "loss": 5.8778, "loss/crossentropy": 2.632124662399292, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17495673149824142, "step": 13346 }, { "epoch": 0.417125, "grad_norm": 3.0625, "grad_norm_var": 0.043115234375, "learning_rate": 0.0001, "loss": 6.0772, "loss/crossentropy": 2.775555968284607, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17899131774902344, "step": 13348 }, { "epoch": 0.4171875, "grad_norm": 3.1875, "grad_norm_var": 0.042601521809895834, "learning_rate": 0.0001, "loss": 6.0198, "loss/crossentropy": 2.7797752618789673, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1736101284623146, "step": 13350 }, { "epoch": 0.41725, "grad_norm": 3.0625, "grad_norm_var": 0.038863118489583334, "learning_rate": 0.0001, "loss": 5.4021, "loss/crossentropy": 2.3128799200057983, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1608761101961136, "step": 13352 }, { "epoch": 0.4173125, "grad_norm": 2.90625, "grad_norm_var": 0.0328033447265625, "learning_rate": 0.0001, "loss": 5.488, "loss/crossentropy": 2.4243158102035522, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15753886848688126, "step": 13354 }, { "epoch": 0.417375, "grad_norm": 3.28125, "grad_norm_var": 1.1357981363932292, "learning_rate": 0.0001, "loss": 6.0172, "loss/crossentropy": 2.6375356912612915, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18289029598236084, "step": 13356 }, { "epoch": 0.4174375, "grad_norm": 3.15625, "grad_norm_var": 1.153076171875, "learning_rate": 0.0001, "loss": 5.5547, "loss/crossentropy": 2.4211915731430054, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16296488046646118, "step": 13358 }, { "epoch": 0.4175, "grad_norm": 3.125, "grad_norm_var": 1.1563873291015625, "learning_rate": 0.0001, "loss": 5.7951, "loss/crossentropy": 2.5717642307281494, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17428386211395264, "step": 13360 }, { "epoch": 0.4175625, "grad_norm": 3.609375, "grad_norm_var": 1.15572509765625, "learning_rate": 0.0001, "loss": 5.9661, "loss/crossentropy": 2.6595340967178345, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.18261125683784485, "step": 13362 }, { "epoch": 0.417625, "grad_norm": 3.015625, "grad_norm_var": 1.1632232666015625, "learning_rate": 0.0001, "loss": 5.9179, "loss/crossentropy": 2.6462502479553223, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17364852130413055, "step": 13364 }, { "epoch": 0.4176875, "grad_norm": 3.25, "grad_norm_var": 1.153343709309896, "learning_rate": 0.0001, "loss": 5.8498, "loss/crossentropy": 2.6918177604675293, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16501691937446594, "step": 13366 }, { "epoch": 0.41775, "grad_norm": 3.3125, "grad_norm_var": 1.14615478515625, "learning_rate": 0.0001, "loss": 5.804, "loss/crossentropy": 2.536333441734314, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17598675191402435, "step": 13368 }, { "epoch": 0.4178125, "grad_norm": 3.234375, "grad_norm_var": 1.122021484375, "learning_rate": 0.0001, "loss": 5.3244, "loss/crossentropy": 2.227112650871277, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.15777958929538727, "step": 13370 }, { "epoch": 0.417875, "grad_norm": 4.71875, "grad_norm_var": 0.17509765625, "learning_rate": 0.0001, "loss": 5.9836, "loss/crossentropy": 2.636080026626587, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18553803116083145, "step": 13372 }, { "epoch": 0.4179375, "grad_norm": 3.328125, "grad_norm_var": 0.16098531087239584, "learning_rate": 0.0001, "loss": 5.9206, "loss/crossentropy": 2.621453285217285, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17913124710321426, "step": 13374 }, { "epoch": 0.418, "grad_norm": 3.265625, "grad_norm_var": 0.15097249348958333, "learning_rate": 0.0001, "loss": 6.2312, "loss/crossentropy": 2.796077847480774, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18843501061201096, "step": 13376 }, { "epoch": 0.4180625, "grad_norm": 3.171875, "grad_norm_var": 0.15077718098958334, "learning_rate": 0.0001, "loss": 5.9163, "loss/crossentropy": 2.6079468727111816, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17770981043577194, "step": 13378 }, { "epoch": 0.418125, "grad_norm": 3.390625, "grad_norm_var": 0.13793843587239582, "learning_rate": 0.0001, "loss": 6.1217, "loss/crossentropy": 2.783895492553711, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.181822769343853, "step": 13380 }, { "epoch": 0.4181875, "grad_norm": 3.328125, "grad_norm_var": 0.13452860514322917, "learning_rate": 0.0001, "loss": 5.8822, "loss/crossentropy": 2.6041117906570435, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17976077646017075, "step": 13382 }, { "epoch": 0.41825, "grad_norm": 3.1875, "grad_norm_var": 0.1415679931640625, "learning_rate": 0.0001, "loss": 5.7405, "loss/crossentropy": 2.4632670879364014, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17733129858970642, "step": 13384 }, { "epoch": 0.4183125, "grad_norm": 3.0625, "grad_norm_var": 0.14594624837239584, "learning_rate": 0.0001, "loss": 6.1735, "loss/crossentropy": 2.747955083847046, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18903613835573196, "step": 13386 }, { "epoch": 0.418375, "grad_norm": 3.578125, "grad_norm_var": 0.034891764322916664, "learning_rate": 0.0001, "loss": 5.926, "loss/crossentropy": 2.628903031349182, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1824403628706932, "step": 13388 }, { "epoch": 0.4184375, "grad_norm": 6.25, "grad_norm_var": 0.5953196207682292, "learning_rate": 0.0001, "loss": 5.9103, "loss/crossentropy": 2.48799729347229, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1840263083577156, "step": 13390 }, { "epoch": 0.4185, "grad_norm": 3.078125, "grad_norm_var": 0.6135243733723958, "learning_rate": 0.0001, "loss": 5.9143, "loss/crossentropy": 2.6407530307769775, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17696483433246613, "step": 13392 }, { "epoch": 0.4185625, "grad_norm": 3.8125, "grad_norm_var": 0.61724853515625, "learning_rate": 0.0001, "loss": 5.8641, "loss/crossentropy": 2.538280963897705, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17945598810911179, "step": 13394 }, { "epoch": 0.418625, "grad_norm": 3.578125, "grad_norm_var": 0.6085245768229167, "learning_rate": 0.0001, "loss": 5.8976, "loss/crossentropy": 2.600240468978882, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17778422683477402, "step": 13396 }, { "epoch": 0.4186875, "grad_norm": 3.296875, "grad_norm_var": 0.6098866780598958, "learning_rate": 0.0001, "loss": 5.8737, "loss/crossentropy": 2.6107362508773804, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17668740451335907, "step": 13398 }, { "epoch": 0.41875, "grad_norm": 8.625, "grad_norm_var": 2.196703084309896, "learning_rate": 0.0001, "loss": 5.7575, "loss/crossentropy": 2.415872573852539, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17674195766448975, "step": 13400 }, { "epoch": 0.4188125, "grad_norm": 3.390625, "grad_norm_var": 2.171947224934896, "learning_rate": 0.0001, "loss": 5.953, "loss/crossentropy": 2.647102952003479, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17707321792840958, "step": 13402 }, { "epoch": 0.418875, "grad_norm": 3.015625, "grad_norm_var": 2.161229451497396, "learning_rate": 0.0001, "loss": 5.7712, "loss/crossentropy": 2.5194567441940308, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17673862725496292, "step": 13404 }, { "epoch": 0.4189375, "grad_norm": 3.046875, "grad_norm_var": 1.7971588134765626, "learning_rate": 0.0001, "loss": 6.0203, "loss/crossentropy": 2.7287405729293823, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17564278095960617, "step": 13406 }, { "epoch": 0.419, "grad_norm": 3.234375, "grad_norm_var": 1.8368448893229166, "learning_rate": 0.0001, "loss": 5.8314, "loss/crossentropy": 2.470117211341858, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18105123937129974, "step": 13408 }, { "epoch": 0.4190625, "grad_norm": 3.328125, "grad_norm_var": 1.8598917643229167, "learning_rate": 0.0001, "loss": 5.6467, "loss/crossentropy": 2.48598575592041, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1656765565276146, "step": 13410 }, { "epoch": 0.419125, "grad_norm": 3.109375, "grad_norm_var": 1.8889638264973958, "learning_rate": 0.0001, "loss": 5.4564, "loss/crossentropy": 2.2907443046569824, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16578485071659088, "step": 13412 }, { "epoch": 0.4191875, "grad_norm": 3.078125, "grad_norm_var": 1.9190012613932292, "learning_rate": 0.0001, "loss": 5.713, "loss/crossentropy": 2.5517455339431763, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16808252781629562, "step": 13414 }, { "epoch": 0.41925, "grad_norm": 3.078125, "grad_norm_var": 0.14417317708333333, "learning_rate": 0.0001, "loss": 5.7764, "loss/crossentropy": 2.537129521369934, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17587868124246597, "step": 13416 }, { "epoch": 0.4193125, "grad_norm": 3.25, "grad_norm_var": 0.14276936848958333, "learning_rate": 0.0001, "loss": 5.7669, "loss/crossentropy": 2.5112040042877197, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1700960397720337, "step": 13418 }, { "epoch": 0.419375, "grad_norm": 3.25, "grad_norm_var": 0.13907877604166666, "learning_rate": 0.0001, "loss": 5.8926, "loss/crossentropy": 2.6581804752349854, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17187707126140594, "step": 13420 }, { "epoch": 0.4194375, "grad_norm": 3.734375, "grad_norm_var": 0.15826416015625, "learning_rate": 0.0001, "loss": 5.9235, "loss/crossentropy": 2.537745952606201, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.17998237162828445, "step": 13422 }, { "epoch": 0.4195, "grad_norm": 3.421875, "grad_norm_var": 0.0461578369140625, "learning_rate": 0.0001, "loss": 5.7226, "loss/crossentropy": 2.4406378269195557, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1789797767996788, "step": 13424 }, { "epoch": 0.4195625, "grad_norm": 3.09375, "grad_norm_var": 0.045849609375, "learning_rate": 0.0001, "loss": 5.6016, "loss/crossentropy": 2.383212089538574, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1726173758506775, "step": 13426 }, { "epoch": 0.419625, "grad_norm": 3.25, "grad_norm_var": 0.04413655598958333, "learning_rate": 0.0001, "loss": 6.0146, "loss/crossentropy": 2.7026796340942383, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1796249970793724, "step": 13428 }, { "epoch": 0.4196875, "grad_norm": 3.4375, "grad_norm_var": 0.0380767822265625, "learning_rate": 0.0001, "loss": 5.8797, "loss/crossentropy": 2.6174843311309814, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17661139369010925, "step": 13430 }, { "epoch": 0.41975, "grad_norm": 3.4375, "grad_norm_var": 0.0279693603515625, "learning_rate": 0.0001, "loss": 5.9371, "loss/crossentropy": 2.576013684272766, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18376551568508148, "step": 13432 }, { "epoch": 0.4198125, "grad_norm": 3.09375, "grad_norm_var": 0.03082275390625, "learning_rate": 0.0001, "loss": 5.7509, "loss/crossentropy": 2.544191360473633, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17184053361415863, "step": 13434 }, { "epoch": 0.419875, "grad_norm": 3.421875, "grad_norm_var": 0.03209228515625, "learning_rate": 0.0001, "loss": 5.5981, "loss/crossentropy": 2.3723961114883423, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17061317712068558, "step": 13436 }, { "epoch": 0.4199375, "grad_norm": 3.46875, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 5.8003, "loss/crossentropy": 2.6000717878341675, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1711946427822113, "step": 13438 }, { "epoch": 0.42, "grad_norm": 3.328125, "grad_norm_var": 0.017513020833333334, "learning_rate": 0.0001, "loss": 6.0101, "loss/crossentropy": 2.6064292192459106, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18763728439807892, "step": 13440 }, { "epoch": 0.4200625, "grad_norm": 3.703125, "grad_norm_var": 0.024828084309895835, "learning_rate": 0.0001, "loss": 5.8854, "loss/crossentropy": 2.6590187549591064, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17458895593881607, "step": 13442 }, { "epoch": 0.420125, "grad_norm": 3.46875, "grad_norm_var": 0.025191243489583334, "learning_rate": 0.0001, "loss": 5.918, "loss/crossentropy": 2.607555866241455, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17752625793218613, "step": 13444 }, { "epoch": 0.4201875, "grad_norm": 3.234375, "grad_norm_var": 0.026195271809895834, "learning_rate": 0.0001, "loss": 5.7567, "loss/crossentropy": 2.554731249809265, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1725441962480545, "step": 13446 }, { "epoch": 0.42025, "grad_norm": 3.203125, "grad_norm_var": 0.08779195149739584, "learning_rate": 0.0001, "loss": 5.9279, "loss/crossentropy": 2.6013470888137817, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18148837983608246, "step": 13448 }, { "epoch": 0.4203125, "grad_norm": 3.25, "grad_norm_var": 0.08702799479166666, "learning_rate": 0.0001, "loss": 5.9229, "loss/crossentropy": 2.625003695487976, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17979059368371964, "step": 13450 }, { "epoch": 0.420375, "grad_norm": 2.921875, "grad_norm_var": 0.1002593994140625, "learning_rate": 0.0001, "loss": 5.879, "loss/crossentropy": 2.624898314476013, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16915686428546906, "step": 13452 }, { "epoch": 0.4204375, "grad_norm": 3.1875, "grad_norm_var": 0.1020172119140625, "learning_rate": 0.0001, "loss": 5.7025, "loss/crossentropy": 2.572711944580078, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1661021187901497, "step": 13454 }, { "epoch": 0.4205, "grad_norm": 3.15625, "grad_norm_var": 0.11406962076822917, "learning_rate": 0.0001, "loss": 5.6847, "loss/crossentropy": 2.509779691696167, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1663188487291336, "step": 13456 }, { "epoch": 0.4205625, "grad_norm": 2.953125, "grad_norm_var": 0.11355692545572917, "learning_rate": 0.0001, "loss": 5.5047, "loss/crossentropy": 2.4106862545013428, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1613505482673645, "step": 13458 }, { "epoch": 0.420625, "grad_norm": 3.25, "grad_norm_var": 0.11013895670572917, "learning_rate": 0.0001, "loss": 5.619, "loss/crossentropy": 2.399704098701477, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1633327305316925, "step": 13460 }, { "epoch": 0.4206875, "grad_norm": 3.28125, "grad_norm_var": 0.10957743326822916, "learning_rate": 0.0001, "loss": 6.0299, "loss/crossentropy": 2.7502713203430176, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17757584154605865, "step": 13462 }, { "epoch": 0.42075, "grad_norm": 3.265625, "grad_norm_var": 0.04217122395833333, "learning_rate": 0.0001, "loss": 5.9929, "loss/crossentropy": 2.682234048843384, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18028826266527176, "step": 13464 }, { "epoch": 0.4208125, "grad_norm": 3.28125, "grad_norm_var": 0.0327789306640625, "learning_rate": 0.0001, "loss": 5.8913, "loss/crossentropy": 2.646457552909851, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17252662032842636, "step": 13466 }, { "epoch": 0.420875, "grad_norm": 3.859375, "grad_norm_var": 0.0522857666015625, "learning_rate": 0.0001, "loss": 5.9075, "loss/crossentropy": 2.5467541217803955, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17982478439807892, "step": 13468 }, { "epoch": 0.4209375, "grad_norm": 3.578125, "grad_norm_var": 0.0559478759765625, "learning_rate": 0.0001, "loss": 6.1718, "loss/crossentropy": 2.7906744480133057, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18498337268829346, "step": 13470 }, { "epoch": 0.421, "grad_norm": 3.78125, "grad_norm_var": 0.059723917643229166, "learning_rate": 0.0001, "loss": 6.0537, "loss/crossentropy": 2.6973766088485718, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18602560460567474, "step": 13472 }, { "epoch": 0.4210625, "grad_norm": 3.359375, "grad_norm_var": 0.044169108072916664, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.57953679561615, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17145118862390518, "step": 13474 }, { "epoch": 0.421125, "grad_norm": 3.109375, "grad_norm_var": 0.05371805826822917, "learning_rate": 0.0001, "loss": 5.7416, "loss/crossentropy": 2.556352138519287, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16735686361789703, "step": 13476 }, { "epoch": 0.4211875, "grad_norm": 3.515625, "grad_norm_var": 0.05446675618489583, "learning_rate": 0.0001, "loss": 6.0581, "loss/crossentropy": 2.733359932899475, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18012653291225433, "step": 13478 }, { "epoch": 0.42125, "grad_norm": 3.203125, "grad_norm_var": 0.06277669270833333, "learning_rate": 0.0001, "loss": 6.0782, "loss/crossentropy": 2.7694530487060547, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17970608174800873, "step": 13480 }, { "epoch": 0.4213125, "grad_norm": 3.125, "grad_norm_var": 0.06507161458333334, "learning_rate": 0.0001, "loss": 5.9773, "loss/crossentropy": 2.702518939971924, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1763092428445816, "step": 13482 }, { "epoch": 0.421375, "grad_norm": 3.5, "grad_norm_var": 0.04702046712239583, "learning_rate": 0.0001, "loss": 5.7623, "loss/crossentropy": 2.4824819564819336, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17329394072294235, "step": 13484 }, { "epoch": 0.4214375, "grad_norm": 3.34375, "grad_norm_var": 0.04290364583333333, "learning_rate": 0.0001, "loss": 5.6789, "loss/crossentropy": 2.4971306324005127, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1693483293056488, "step": 13486 }, { "epoch": 0.4215, "grad_norm": 3.390625, "grad_norm_var": 0.02711181640625, "learning_rate": 0.0001, "loss": 6.0181, "loss/crossentropy": 2.667072296142578, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18236906826496124, "step": 13488 }, { "epoch": 0.4215625, "grad_norm": 3.375, "grad_norm_var": 0.356689453125, "learning_rate": 0.0001, "loss": 5.7587, "loss/crossentropy": 2.497342586517334, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17339970916509628, "step": 13490 }, { "epoch": 0.421625, "grad_norm": 3.484375, "grad_norm_var": 0.40539449055989585, "learning_rate": 0.0001, "loss": 5.9602, "loss/crossentropy": 2.5742835998535156, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18351581692695618, "step": 13492 }, { "epoch": 0.4216875, "grad_norm": 3.40625, "grad_norm_var": 0.41343994140625, "learning_rate": 0.0001, "loss": 5.6152, "loss/crossentropy": 2.4358010292053223, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.169895239174366, "step": 13494 }, { "epoch": 0.42175, "grad_norm": 3.53125, "grad_norm_var": 0.3828938802083333, "learning_rate": 0.0001, "loss": 6.0907, "loss/crossentropy": 2.7139753103256226, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18298854678869247, "step": 13496 }, { "epoch": 0.4218125, "grad_norm": 2.984375, "grad_norm_var": 0.39010416666666664, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.626868724822998, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16609349101781845, "step": 13498 }, { "epoch": 0.421875, "grad_norm": 3.0625, "grad_norm_var": 0.4120076497395833, "learning_rate": 0.0001, "loss": 5.9938, "loss/crossentropy": 2.759929895401001, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17456068098545074, "step": 13500 }, { "epoch": 0.4219375, "grad_norm": 3.46875, "grad_norm_var": 0.4039998372395833, "learning_rate": 0.0001, "loss": 5.9093, "loss/crossentropy": 2.6714051961898804, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17417631298303604, "step": 13502 }, { "epoch": 0.422, "grad_norm": 3.15625, "grad_norm_var": 0.41015218098958334, "learning_rate": 0.0001, "loss": 5.9514, "loss/crossentropy": 2.6693791151046753, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17742042243480682, "step": 13504 }, { "epoch": 0.4220625, "grad_norm": 3.28125, "grad_norm_var": 0.11609700520833334, "learning_rate": 0.0001, "loss": 6.0173, "loss/crossentropy": 2.7742620706558228, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17352618277072906, "step": 13506 }, { "epoch": 0.422125, "grad_norm": 2.9375, "grad_norm_var": 0.04129231770833333, "learning_rate": 0.0001, "loss": 5.8235, "loss/crossentropy": 2.619171619415283, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17238953709602356, "step": 13508 }, { "epoch": 0.4221875, "grad_norm": 3.0625, "grad_norm_var": 0.04148661295572917, "learning_rate": 0.0001, "loss": 5.8389, "loss/crossentropy": 2.6312637329101562, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17115691304206848, "step": 13510 }, { "epoch": 0.42225, "grad_norm": 3.234375, "grad_norm_var": 0.0308013916015625, "learning_rate": 0.0001, "loss": 5.9009, "loss/crossentropy": 2.6351054906845093, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17657799273729324, "step": 13512 }, { "epoch": 0.4223125, "grad_norm": 3.34375, "grad_norm_var": 0.024430338541666666, "learning_rate": 0.0001, "loss": 5.832, "loss/crossentropy": 2.595235824584961, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1748461276292801, "step": 13514 }, { "epoch": 0.422375, "grad_norm": 3.421875, "grad_norm_var": 0.026188151041666666, "learning_rate": 0.0001, "loss": 5.8888, "loss/crossentropy": 2.6085236072540283, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1764683797955513, "step": 13516 }, { "epoch": 0.4224375, "grad_norm": 3.125, "grad_norm_var": 0.023779296875, "learning_rate": 0.0001, "loss": 5.5742, "loss/crossentropy": 2.4499523639678955, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16671674698591232, "step": 13518 }, { "epoch": 0.4225, "grad_norm": 2.875, "grad_norm_var": 0.03146158854166667, "learning_rate": 0.0001, "loss": 5.4037, "loss/crossentropy": 2.3098702430725098, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1605541855096817, "step": 13520 }, { "epoch": 0.4225625, "grad_norm": 3.390625, "grad_norm_var": 0.03340555826822917, "learning_rate": 0.0001, "loss": 5.9726, "loss/crossentropy": 2.64448082447052, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1781274378299713, "step": 13522 }, { "epoch": 0.422625, "grad_norm": 3.03125, "grad_norm_var": 0.034821573893229166, "learning_rate": 0.0001, "loss": 5.6807, "loss/crossentropy": 2.45465350151062, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17338386923074722, "step": 13524 }, { "epoch": 0.4226875, "grad_norm": 3.390625, "grad_norm_var": 0.032957967122395834, "learning_rate": 0.0001, "loss": 6.0195, "loss/crossentropy": 2.72080659866333, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17518477141857147, "step": 13526 }, { "epoch": 0.42275, "grad_norm": 3.09375, "grad_norm_var": 0.03427734375, "learning_rate": 0.0001, "loss": 5.3524, "loss/crossentropy": 2.244433879852295, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16235922276973724, "step": 13528 }, { "epoch": 0.4228125, "grad_norm": 3.046875, "grad_norm_var": 0.03691304524739583, "learning_rate": 0.0001, "loss": 6.0795, "loss/crossentropy": 2.715553045272827, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1817065104842186, "step": 13530 }, { "epoch": 0.422875, "grad_norm": 3.03125, "grad_norm_var": 0.03583984375, "learning_rate": 0.0001, "loss": 5.6236, "loss/crossentropy": 2.480729341506958, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16858422756195068, "step": 13532 }, { "epoch": 0.4229375, "grad_norm": 3.421875, "grad_norm_var": 0.04533589680989583, "learning_rate": 0.0001, "loss": 5.8396, "loss/crossentropy": 2.5761055946350098, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17518050968647003, "step": 13534 }, { "epoch": 0.423, "grad_norm": 3.1875, "grad_norm_var": 0.044140625, "learning_rate": 0.0001, "loss": 5.7705, "loss/crossentropy": 2.5773677825927734, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1689225435256958, "step": 13536 }, { "epoch": 0.4230625, "grad_norm": 3.0625, "grad_norm_var": 0.04312744140625, "learning_rate": 0.0001, "loss": 5.5529, "loss/crossentropy": 2.4116233587265015, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16373953968286514, "step": 13538 }, { "epoch": 0.423125, "grad_norm": 3.4375, "grad_norm_var": 0.047118123372395834, "learning_rate": 0.0001, "loss": 6.0036, "loss/crossentropy": 2.6386624574661255, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18571265786886215, "step": 13540 }, { "epoch": 0.4231875, "grad_norm": 3.25, "grad_norm_var": 0.04478759765625, "learning_rate": 0.0001, "loss": 5.9893, "loss/crossentropy": 2.6201233863830566, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18535902351140976, "step": 13542 }, { "epoch": 0.42325, "grad_norm": 3.5625, "grad_norm_var": 0.05511067708333333, "learning_rate": 0.0001, "loss": 6.3525, "loss/crossentropy": 2.8163682222366333, "loss/hidden": 1.609375, "loss/jsd": 0.0, "loss/logits": 0.1926744133234024, "step": 13544 }, { "epoch": 0.4233125, "grad_norm": 3.796875, "grad_norm_var": 0.06813151041666667, "learning_rate": 0.0001, "loss": 6.1174, "loss/crossentropy": 2.648329496383667, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19027109444141388, "step": 13546 }, { "epoch": 0.423375, "grad_norm": 3.734375, "grad_norm_var": 0.07858784993489583, "learning_rate": 0.0001, "loss": 5.7509, "loss/crossentropy": 2.5591460466384888, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17151930928230286, "step": 13548 }, { "epoch": 0.4234375, "grad_norm": 3.28125, "grad_norm_var": 0.06431884765625, "learning_rate": 0.0001, "loss": 5.9321, "loss/crossentropy": 2.6350157260894775, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17932235449552536, "step": 13550 }, { "epoch": 0.4235, "grad_norm": 3.046875, "grad_norm_var": 0.05556640625, "learning_rate": 0.0001, "loss": 5.7378, "loss/crossentropy": 2.443291425704956, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17905839532613754, "step": 13552 }, { "epoch": 0.4235625, "grad_norm": 3.65625, "grad_norm_var": 0.11454976399739583, "learning_rate": 0.0001, "loss": 5.7831, "loss/crossentropy": 2.4910799264907837, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17451247572898865, "step": 13554 }, { "epoch": 0.423625, "grad_norm": 3.15625, "grad_norm_var": 0.13596598307291666, "learning_rate": 0.0001, "loss": 5.5742, "loss/crossentropy": 2.452877163887024, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1644795536994934, "step": 13556 }, { "epoch": 0.4236875, "grad_norm": 2.953125, "grad_norm_var": 0.15886942545572916, "learning_rate": 0.0001, "loss": 5.665, "loss/crossentropy": 2.4376211166381836, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1696086823940277, "step": 13558 }, { "epoch": 0.42375, "grad_norm": 3.265625, "grad_norm_var": 0.15921122233072918, "learning_rate": 0.0001, "loss": 5.8832, "loss/crossentropy": 2.618662476539612, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17449791729450226, "step": 13560 }, { "epoch": 0.4238125, "grad_norm": 3.265625, "grad_norm_var": 0.14875386555989584, "learning_rate": 0.0001, "loss": 5.8942, "loss/crossentropy": 2.562021017074585, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17422987520694733, "step": 13562 }, { "epoch": 0.423875, "grad_norm": 3.625, "grad_norm_var": 0.13917643229166668, "learning_rate": 0.0001, "loss": 6.0478, "loss/crossentropy": 2.6333521604537964, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18987825512886047, "step": 13564 }, { "epoch": 0.4239375, "grad_norm": 3.125, "grad_norm_var": 0.14609375, "learning_rate": 0.0001, "loss": 5.9004, "loss/crossentropy": 2.6945817470550537, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17018862068653107, "step": 13566 }, { "epoch": 0.424, "grad_norm": 3.6875, "grad_norm_var": 0.14438374837239584, "learning_rate": 0.0001, "loss": 5.7261, "loss/crossentropy": 2.4810198545455933, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17333311587572098, "step": 13568 }, { "epoch": 0.4240625, "grad_norm": 3.234375, "grad_norm_var": 0.0650390625, "learning_rate": 0.0001, "loss": 5.8039, "loss/crossentropy": 2.582778811454773, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17132743448019028, "step": 13570 }, { "epoch": 0.424125, "grad_norm": 3.3125, "grad_norm_var": 0.053132120768229166, "learning_rate": 0.0001, "loss": 5.9895, "loss/crossentropy": 2.6580461263656616, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1827528029680252, "step": 13572 }, { "epoch": 0.4241875, "grad_norm": 3.21875, "grad_norm_var": 0.0322418212890625, "learning_rate": 0.0001, "loss": 5.5213, "loss/crossentropy": 2.3348710536956787, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1674690619111061, "step": 13574 }, { "epoch": 0.42425, "grad_norm": 2.734375, "grad_norm_var": 0.05172526041666667, "learning_rate": 0.0001, "loss": 5.2827, "loss/crossentropy": 2.324481725692749, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1469968482851982, "step": 13576 }, { "epoch": 0.4243125, "grad_norm": 3.171875, "grad_norm_var": 0.0509674072265625, "learning_rate": 0.0001, "loss": 5.7213, "loss/crossentropy": 2.6081173419952393, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1660090982913971, "step": 13578 }, { "epoch": 0.424375, "grad_norm": 3.203125, "grad_norm_var": 0.04109598795572917, "learning_rate": 0.0001, "loss": 5.5337, "loss/crossentropy": 2.4117451906204224, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16219858080148697, "step": 13580 }, { "epoch": 0.4244375, "grad_norm": 3.828125, "grad_norm_var": 0.06424051920572917, "learning_rate": 0.0001, "loss": 5.3408, "loss/crossentropy": 2.2714133262634277, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15850047767162323, "step": 13582 }, { "epoch": 0.4245, "grad_norm": 3.203125, "grad_norm_var": 0.0598052978515625, "learning_rate": 0.0001, "loss": 5.9355, "loss/crossentropy": 2.5937576293945312, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1818263828754425, "step": 13584 }, { "epoch": 0.4245625, "grad_norm": 4.875, "grad_norm_var": 0.2268951416015625, "learning_rate": 0.0001, "loss": 6.0587, "loss/crossentropy": 2.6613736152648926, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1854352280497551, "step": 13586 }, { "epoch": 0.424625, "grad_norm": 3.234375, "grad_norm_var": 0.22893880208333334, "learning_rate": 0.0001, "loss": 5.7986, "loss/crossentropy": 2.533471941947937, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17534331232309341, "step": 13588 }, { "epoch": 0.4246875, "grad_norm": 3.171875, "grad_norm_var": 0.22678629557291666, "learning_rate": 0.0001, "loss": 5.7789, "loss/crossentropy": 2.5065789222717285, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1737145110964775, "step": 13590 }, { "epoch": 0.42475, "grad_norm": 3.171875, "grad_norm_var": 0.2024810791015625, "learning_rate": 0.0001, "loss": 5.8493, "loss/crossentropy": 2.5877480506896973, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1745925396680832, "step": 13592 }, { "epoch": 0.4248125, "grad_norm": 3.53125, "grad_norm_var": 0.19434305826822917, "learning_rate": 0.0001, "loss": 5.9073, "loss/crossentropy": 2.6121160984039307, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1759987249970436, "step": 13594 }, { "epoch": 0.424875, "grad_norm": 4.59375, "grad_norm_var": 0.2897420247395833, "learning_rate": 0.0001, "loss": 5.9084, "loss/crossentropy": 2.6313605308532715, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17730838060379028, "step": 13596 }, { "epoch": 0.4249375, "grad_norm": 3.125, "grad_norm_var": 0.28772379557291666, "learning_rate": 0.0001, "loss": 5.9081, "loss/crossentropy": 2.660796046257019, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17355673015117645, "step": 13598 }, { "epoch": 0.425, "grad_norm": 3.578125, "grad_norm_var": 0.28989969889322914, "learning_rate": 0.0001, "loss": 5.8516, "loss/crossentropy": 2.5417560338974, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17981155961751938, "step": 13600 }, { "epoch": 0.4250625, "grad_norm": 3.4375, "grad_norm_var": 0.14089253743489583, "learning_rate": 0.0001, "loss": 5.9886, "loss/crossentropy": 2.612521529197693, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18447883427143097, "step": 13602 }, { "epoch": 0.425125, "grad_norm": 3.453125, "grad_norm_var": 0.14060770670572917, "learning_rate": 0.0001, "loss": 5.7486, "loss/crossentropy": 2.529898762702942, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16757732629776, "step": 13604 }, { "epoch": 0.4251875, "grad_norm": 3.359375, "grad_norm_var": 0.13560791015625, "learning_rate": 0.0001, "loss": 5.9274, "loss/crossentropy": 2.588602662086487, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17997673153877258, "step": 13606 }, { "epoch": 0.42525, "grad_norm": 3.265625, "grad_norm_var": 0.15074869791666667, "learning_rate": 0.0001, "loss": 5.9698, "loss/crossentropy": 2.599562168121338, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18311382085084915, "step": 13608 }, { "epoch": 0.4253125, "grad_norm": 3.1875, "grad_norm_var": 0.14986063639322916, "learning_rate": 0.0001, "loss": 5.7757, "loss/crossentropy": 2.4895899295806885, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17665836960077286, "step": 13610 }, { "epoch": 0.425375, "grad_norm": 3.421875, "grad_norm_var": 0.04376627604166667, "learning_rate": 0.0001, "loss": 5.9007, "loss/crossentropy": 2.615707755088806, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17654824256896973, "step": 13612 }, { "epoch": 0.4254375, "grad_norm": 3.359375, "grad_norm_var": 0.0394683837890625, "learning_rate": 0.0001, "loss": 5.9093, "loss/crossentropy": 2.6024194955825806, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17755922675132751, "step": 13614 }, { "epoch": 0.4255, "grad_norm": 3.34375, "grad_norm_var": 0.037007649739583336, "learning_rate": 0.0001, "loss": 5.3853, "loss/crossentropy": 2.3523961305618286, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15563051402568817, "step": 13616 }, { "epoch": 0.4255625, "grad_norm": 3.453125, "grad_norm_var": 0.03515218098958333, "learning_rate": 0.0001, "loss": 5.8712, "loss/crossentropy": 2.51068115234375, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18449315428733826, "step": 13618 }, { "epoch": 0.425625, "grad_norm": 3.140625, "grad_norm_var": 0.03655598958333333, "learning_rate": 0.0001, "loss": 5.9174, "loss/crossentropy": 2.7071765661239624, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17102541029453278, "step": 13620 }, { "epoch": 0.4256875, "grad_norm": 3.546875, "grad_norm_var": 0.03996988932291667, "learning_rate": 0.0001, "loss": 6.1444, "loss/crossentropy": 2.7265379428863525, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18710123747587204, "step": 13622 }, { "epoch": 0.42575, "grad_norm": 3.078125, "grad_norm_var": 0.020068359375, "learning_rate": 0.0001, "loss": 5.9351, "loss/crossentropy": 2.638815402984619, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17767088115215302, "step": 13624 }, { "epoch": 0.4258125, "grad_norm": 3.21875, "grad_norm_var": 0.0225006103515625, "learning_rate": 0.0001, "loss": 5.8705, "loss/crossentropy": 2.6077572107315063, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17237266153097153, "step": 13626 }, { "epoch": 0.425875, "grad_norm": 3.078125, "grad_norm_var": 0.026334635416666665, "learning_rate": 0.0001, "loss": 5.6404, "loss/crossentropy": 2.4760019779205322, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16761603951454163, "step": 13628 }, { "epoch": 0.4259375, "grad_norm": 3.390625, "grad_norm_var": 0.0270660400390625, "learning_rate": 0.0001, "loss": 5.8097, "loss/crossentropy": 2.5245094299316406, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17343804985284805, "step": 13630 }, { "epoch": 0.426, "grad_norm": 3.109375, "grad_norm_var": 0.026764933268229166, "learning_rate": 0.0001, "loss": 5.7181, "loss/crossentropy": 2.5386550426483154, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1714557781815529, "step": 13632 }, { "epoch": 0.4260625, "grad_norm": 3.34375, "grad_norm_var": 0.027123006184895833, "learning_rate": 0.0001, "loss": 5.4877, "loss/crossentropy": 2.36991286277771, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1641227975487709, "step": 13634 }, { "epoch": 0.426125, "grad_norm": 3.25, "grad_norm_var": 0.0275787353515625, "learning_rate": 0.0001, "loss": 5.488, "loss/crossentropy": 2.3451461791992188, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1635008379817009, "step": 13636 }, { "epoch": 0.4261875, "grad_norm": 3.046875, "grad_norm_var": 0.023021443684895834, "learning_rate": 0.0001, "loss": 5.7511, "loss/crossentropy": 2.6217762231826782, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16527977585792542, "step": 13638 }, { "epoch": 0.42625, "grad_norm": 3.0625, "grad_norm_var": 0.02301025390625, "learning_rate": 0.0001, "loss": 5.3194, "loss/crossentropy": 2.2426563501358032, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16235796362161636, "step": 13640 }, { "epoch": 0.4263125, "grad_norm": 3.09375, "grad_norm_var": 0.01685791015625, "learning_rate": 0.0001, "loss": 5.8931, "loss/crossentropy": 2.6649584770202637, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17476899176836014, "step": 13642 }, { "epoch": 0.426375, "grad_norm": 3.5625, "grad_norm_var": 0.0351226806640625, "learning_rate": 0.0001, "loss": 5.7322, "loss/crossentropy": 2.4845662117004395, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17242056876420975, "step": 13644 }, { "epoch": 0.4264375, "grad_norm": 3.375, "grad_norm_var": 0.03449605305989583, "learning_rate": 0.0001, "loss": 5.7222, "loss/crossentropy": 2.536791205406189, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16971704363822937, "step": 13646 }, { "epoch": 0.4265, "grad_norm": 3.09375, "grad_norm_var": 0.034566243489583336, "learning_rate": 0.0001, "loss": 5.5455, "loss/crossentropy": 2.3979681730270386, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1678829789161682, "step": 13648 }, { "epoch": 0.4265625, "grad_norm": 3.109375, "grad_norm_var": 0.03430989583333333, "learning_rate": 0.0001, "loss": 5.6397, "loss/crossentropy": 2.4746124744415283, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1704118475317955, "step": 13650 }, { "epoch": 0.426625, "grad_norm": 3.25, "grad_norm_var": 0.03469645182291667, "learning_rate": 0.0001, "loss": 5.5564, "loss/crossentropy": 2.4087108373641968, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16281092911958694, "step": 13652 }, { "epoch": 0.4266875, "grad_norm": 3.328125, "grad_norm_var": 0.03352762858072917, "learning_rate": 0.0001, "loss": 5.5027, "loss/crossentropy": 2.326142907142639, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1660897359251976, "step": 13654 }, { "epoch": 0.42675, "grad_norm": 3.390625, "grad_norm_var": 0.033421834309895836, "learning_rate": 0.0001, "loss": 5.7803, "loss/crossentropy": 2.5783458948135376, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16863028705120087, "step": 13656 }, { "epoch": 0.4268125, "grad_norm": 3.125, "grad_norm_var": 0.06815999348958333, "learning_rate": 0.0001, "loss": 5.732, "loss/crossentropy": 2.4865012168884277, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17220691591501236, "step": 13658 }, { "epoch": 0.426875, "grad_norm": 3.0625, "grad_norm_var": 0.060212198893229166, "learning_rate": 0.0001, "loss": 5.7956, "loss/crossentropy": 2.5870524644851685, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16968437284231186, "step": 13660 }, { "epoch": 0.4269375, "grad_norm": 3.6875, "grad_norm_var": 0.10939127604166667, "learning_rate": 0.0001, "loss": 6.2719, "loss/crossentropy": 2.813697934150696, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18722982704639435, "step": 13662 }, { "epoch": 0.427, "grad_norm": 3.40625, "grad_norm_var": 0.10535380045572916, "learning_rate": 0.0001, "loss": 5.8451, "loss/crossentropy": 2.5655994415283203, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17716550827026367, "step": 13664 }, { "epoch": 0.4270625, "grad_norm": 3.625, "grad_norm_var": 0.1028961181640625, "learning_rate": 0.0001, "loss": 5.9091, "loss/crossentropy": 2.5500409603118896, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1835578829050064, "step": 13666 }, { "epoch": 0.427125, "grad_norm": 3.53125, "grad_norm_var": 0.09569905598958334, "learning_rate": 0.0001, "loss": 6.0471, "loss/crossentropy": 2.759128451347351, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1768459901213646, "step": 13668 }, { "epoch": 0.4271875, "grad_norm": 3.234375, "grad_norm_var": 0.09733072916666667, "learning_rate": 0.0001, "loss": 5.8049, "loss/crossentropy": 2.5630141496658325, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17458105832338333, "step": 13670 }, { "epoch": 0.42725, "grad_norm": 3.171875, "grad_norm_var": 0.10494384765625, "learning_rate": 0.0001, "loss": 5.4886, "loss/crossentropy": 2.2683770656585693, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16693967580795288, "step": 13672 }, { "epoch": 0.4273125, "grad_norm": 3.921875, "grad_norm_var": 0.1036529541015625, "learning_rate": 0.0001, "loss": 5.9961, "loss/crossentropy": 2.6692570447921753, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18072886765003204, "step": 13674 }, { "epoch": 0.427375, "grad_norm": 3.4375, "grad_norm_var": 0.0935455322265625, "learning_rate": 0.0001, "loss": 5.93, "loss/crossentropy": 2.6348601579666138, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17873502522706985, "step": 13676 }, { "epoch": 0.4274375, "grad_norm": 2.875, "grad_norm_var": 0.08173421223958334, "learning_rate": 0.0001, "loss": 5.6902, "loss/crossentropy": 2.5685064792633057, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16686025261878967, "step": 13678 }, { "epoch": 0.4275, "grad_norm": 3.515625, "grad_norm_var": 0.0789459228515625, "learning_rate": 0.0001, "loss": 5.6396, "loss/crossentropy": 2.4428157806396484, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.168898805975914, "step": 13680 }, { "epoch": 0.4275625, "grad_norm": 3.171875, "grad_norm_var": 0.07375386555989584, "learning_rate": 0.0001, "loss": 6.0415, "loss/crossentropy": 2.750417947769165, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17598801851272583, "step": 13682 }, { "epoch": 0.427625, "grad_norm": 3.140625, "grad_norm_var": 0.07376302083333333, "learning_rate": 0.0001, "loss": 5.6226, "loss/crossentropy": 2.4173182249069214, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17052601277828217, "step": 13684 }, { "epoch": 0.4276875, "grad_norm": 3.125, "grad_norm_var": 0.0793609619140625, "learning_rate": 0.0001, "loss": 5.9497, "loss/crossentropy": 2.714414358139038, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17587494105100632, "step": 13686 }, { "epoch": 0.42775, "grad_norm": 3.09375, "grad_norm_var": 0.060302734375, "learning_rate": 0.0001, "loss": 5.7912, "loss/crossentropy": 2.5054643154144287, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17935198545455933, "step": 13688 }, { "epoch": 0.4278125, "grad_norm": 3.125, "grad_norm_var": 0.22333882649739584, "learning_rate": 0.0001, "loss": 5.4972, "loss/crossentropy": 2.2655035257339478, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.16731233149766922, "step": 13690 }, { "epoch": 0.427875, "grad_norm": 3.3125, "grad_norm_var": 0.22379150390625, "learning_rate": 0.0001, "loss": 6.0416, "loss/crossentropy": 2.7287049293518066, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17933955043554306, "step": 13692 }, { "epoch": 0.4279375, "grad_norm": 3.375, "grad_norm_var": 0.21106363932291666, "learning_rate": 0.0001, "loss": 5.8759, "loss/crossentropy": 2.5507595539093018, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18017400801181793, "step": 13694 }, { "epoch": 0.428, "grad_norm": 3.09375, "grad_norm_var": 0.21228841145833333, "learning_rate": 0.0001, "loss": 5.5248, "loss/crossentropy": 2.3603193759918213, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16996591538190842, "step": 13696 }, { "epoch": 0.4280625, "grad_norm": 3.359375, "grad_norm_var": 0.21627197265625, "learning_rate": 0.0001, "loss": 5.6145, "loss/crossentropy": 2.377763271331787, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.172113299369812, "step": 13698 }, { "epoch": 0.428125, "grad_norm": 3.03125, "grad_norm_var": 0.22115885416666667, "learning_rate": 0.0001, "loss": 5.6002, "loss/crossentropy": 2.4868147373199463, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16211627423763275, "step": 13700 }, { "epoch": 0.4281875, "grad_norm": 2.8125, "grad_norm_var": 0.2318267822265625, "learning_rate": 0.0001, "loss": 5.4648, "loss/crossentropy": 2.353792190551758, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16656972467899323, "step": 13702 }, { "epoch": 0.42825, "grad_norm": 3.203125, "grad_norm_var": 0.23946024576822916, "learning_rate": 0.0001, "loss": 5.869, "loss/crossentropy": 2.623986840248108, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1741149052977562, "step": 13704 }, { "epoch": 0.4283125, "grad_norm": 3.125, "grad_norm_var": 0.024869791666666665, "learning_rate": 0.0001, "loss": 6.2232, "loss/crossentropy": 2.8240978717803955, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.186787448823452, "step": 13706 }, { "epoch": 0.428375, "grad_norm": 3.203125, "grad_norm_var": 0.0230865478515625, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.646933913230896, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16642916202545166, "step": 13708 }, { "epoch": 0.4284375, "grad_norm": 3.21875, "grad_norm_var": 0.0195465087890625, "learning_rate": 0.0001, "loss": 5.8334, "loss/crossentropy": 2.564917206764221, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1764616221189499, "step": 13710 }, { "epoch": 0.4285, "grad_norm": 3.40625, "grad_norm_var": 0.024388631184895832, "learning_rate": 0.0001, "loss": 6.0885, "loss/crossentropy": 2.7025904655456543, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18312708288431168, "step": 13712 }, { "epoch": 0.4285625, "grad_norm": 3.78125, "grad_norm_var": 0.050455729166666664, "learning_rate": 0.0001, "loss": 6.1528, "loss/crossentropy": 2.6861952543258667, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.19236107915639877, "step": 13714 }, { "epoch": 0.428625, "grad_norm": 3.171875, "grad_norm_var": 0.04816792805989583, "learning_rate": 0.0001, "loss": 5.8196, "loss/crossentropy": 2.617398262023926, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16943687200546265, "step": 13716 }, { "epoch": 0.4286875, "grad_norm": 3.859375, "grad_norm_var": 0.06177978515625, "learning_rate": 0.0001, "loss": 5.4724, "loss/crossentropy": 2.3168063163757324, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16361089795827866, "step": 13718 }, { "epoch": 0.42875, "grad_norm": 3.4375, "grad_norm_var": 0.05182291666666667, "learning_rate": 0.0001, "loss": 6.1111, "loss/crossentropy": 2.6841464042663574, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18801257759332657, "step": 13720 }, { "epoch": 0.4288125, "grad_norm": 3.078125, "grad_norm_var": 0.05181884765625, "learning_rate": 0.0001, "loss": 5.8708, "loss/crossentropy": 2.6361746788024902, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17190248519182205, "step": 13722 }, { "epoch": 0.428875, "grad_norm": 3.515625, "grad_norm_var": 0.052632649739583336, "learning_rate": 0.0001, "loss": 5.9087, "loss/crossentropy": 2.6201218366622925, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1761198490858078, "step": 13724 }, { "epoch": 0.4289375, "grad_norm": 3.171875, "grad_norm_var": 0.05117085774739583, "learning_rate": 0.0001, "loss": 5.8039, "loss/crossentropy": 2.626819133758545, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1688845530152321, "step": 13726 }, { "epoch": 0.429, "grad_norm": 13.0, "grad_norm_var": 5.851806640625, "learning_rate": 0.0001, "loss": 6.1558, "loss/crossentropy": 2.5434751510620117, "loss/hidden": 1.625, "loss/jsd": 0.0, "loss/logits": 0.19873183220624924, "step": 13728 }, { "epoch": 0.4290625, "grad_norm": 3.15625, "grad_norm_var": 5.901416015625, "learning_rate": 0.0001, "loss": 5.6204, "loss/crossentropy": 2.4339829683303833, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1713765189051628, "step": 13730 }, { "epoch": 0.429125, "grad_norm": 3.734375, "grad_norm_var": 5.85650634765625, "learning_rate": 0.0001, "loss": 6.4488, "loss/crossentropy": 2.9593294858932495, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19269901514053345, "step": 13732 }, { "epoch": 0.4291875, "grad_norm": 3.375, "grad_norm_var": 5.8624013264973955, "learning_rate": 0.0001, "loss": 5.7938, "loss/crossentropy": 2.544976592063904, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17527136206626892, "step": 13734 }, { "epoch": 0.42925, "grad_norm": 3.203125, "grad_norm_var": 5.8963368733723955, "learning_rate": 0.0001, "loss": 5.7754, "loss/crossentropy": 2.520832061767578, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17546138912439346, "step": 13736 }, { "epoch": 0.4293125, "grad_norm": 3.1875, "grad_norm_var": 5.9157053629557295, "learning_rate": 0.0001, "loss": 5.6156, "loss/crossentropy": 2.471334457397461, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1659916192293167, "step": 13738 }, { "epoch": 0.429375, "grad_norm": 3.90625, "grad_norm_var": 5.92720947265625, "learning_rate": 0.0001, "loss": 5.8394, "loss/crossentropy": 2.611321210861206, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17359286546707153, "step": 13740 }, { "epoch": 0.4294375, "grad_norm": 3.375, "grad_norm_var": 5.902311197916666, "learning_rate": 0.0001, "loss": 5.7597, "loss/crossentropy": 2.5149848461151123, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17017000913619995, "step": 13742 }, { "epoch": 0.4295, "grad_norm": 5.28125, "grad_norm_var": 0.31319071451822916, "learning_rate": 0.0001, "loss": 5.6272, "loss/crossentropy": 2.2962993383407593, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17684400081634521, "step": 13744 }, { "epoch": 0.4295625, "grad_norm": 3.640625, "grad_norm_var": 0.3112701416015625, "learning_rate": 0.0001, "loss": 5.9082, "loss/crossentropy": 2.5763291120529175, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1777186393737793, "step": 13746 }, { "epoch": 0.429625, "grad_norm": 3.359375, "grad_norm_var": 0.330859375, "learning_rate": 0.0001, "loss": 5.8902, "loss/crossentropy": 2.600212574005127, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17899896204471588, "step": 13748 }, { "epoch": 0.4296875, "grad_norm": 3.328125, "grad_norm_var": 0.328955078125, "learning_rate": 0.0001, "loss": 5.7874, "loss/crossentropy": 2.4550464153289795, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17659784853458405, "step": 13750 }, { "epoch": 0.42975, "grad_norm": 3.5, "grad_norm_var": 0.325732421875, "learning_rate": 0.0001, "loss": 5.7718, "loss/crossentropy": 2.4952975511550903, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17569301277399063, "step": 13752 }, { "epoch": 0.4298125, "grad_norm": 3.21875, "grad_norm_var": 0.32275390625, "learning_rate": 0.0001, "loss": 5.5028, "loss/crossentropy": 2.33465039730072, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16642434149980545, "step": 13754 }, { "epoch": 0.429875, "grad_norm": 3.390625, "grad_norm_var": 0.29573160807291665, "learning_rate": 0.0001, "loss": 5.9297, "loss/crossentropy": 2.6052504777908325, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1777571439743042, "step": 13756 }, { "epoch": 0.4299375, "grad_norm": 3.40625, "grad_norm_var": 0.2967437744140625, "learning_rate": 0.0001, "loss": 5.8089, "loss/crossentropy": 2.5966118574142456, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17435576766729355, "step": 13758 }, { "epoch": 0.43, "grad_norm": 3.171875, "grad_norm_var": 0.056591796875, "learning_rate": 0.0001, "loss": 5.8061, "loss/crossentropy": 2.5896633863449097, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17164431512355804, "step": 13760 }, { "epoch": 0.4300625, "grad_norm": 3.25, "grad_norm_var": 0.053120930989583336, "learning_rate": 0.0001, "loss": 5.6903, "loss/crossentropy": 2.4845727682113647, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1697913035750389, "step": 13762 }, { "epoch": 0.430125, "grad_norm": 3.1875, "grad_norm_var": 0.024007161458333332, "learning_rate": 0.0001, "loss": 5.3803, "loss/crossentropy": 2.3315229415893555, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1548776477575302, "step": 13764 }, { "epoch": 0.4301875, "grad_norm": 4.21875, "grad_norm_var": 0.08062235514322917, "learning_rate": 0.0001, "loss": 5.8792, "loss/crossentropy": 2.590985655784607, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17296196520328522, "step": 13766 }, { "epoch": 0.43025, "grad_norm": 3.984375, "grad_norm_var": 0.11121317545572916, "learning_rate": 0.0001, "loss": 6.0749, "loss/crossentropy": 2.653734803199768, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18586856126785278, "step": 13768 }, { "epoch": 0.4303125, "grad_norm": 4.125, "grad_norm_var": 0.15481770833333333, "learning_rate": 0.0001, "loss": 5.9805, "loss/crossentropy": 2.76421320438385, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16967318952083588, "step": 13770 }, { "epoch": 0.430375, "grad_norm": 3.375, "grad_norm_var": 0.15419514973958334, "learning_rate": 0.0001, "loss": 5.8072, "loss/crossentropy": 2.493858814239502, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.17352117598056793, "step": 13772 }, { "epoch": 0.4304375, "grad_norm": 3.3125, "grad_norm_var": 0.20327046712239583, "learning_rate": 0.0001, "loss": 6.1747, "loss/crossentropy": 2.7048277854919434, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19307807832956314, "step": 13774 }, { "epoch": 0.4305, "grad_norm": 3.4375, "grad_norm_var": 0.1990386962890625, "learning_rate": 0.0001, "loss": 5.6902, "loss/crossentropy": 2.3923569917678833, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17431984096765518, "step": 13776 }, { "epoch": 0.4305625, "grad_norm": 3.234375, "grad_norm_var": 0.20152079264322917, "learning_rate": 0.0001, "loss": 5.7667, "loss/crossentropy": 2.490602493286133, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1768304854631424, "step": 13778 }, { "epoch": 0.430625, "grad_norm": 3.203125, "grad_norm_var": 0.18127848307291666, "learning_rate": 0.0001, "loss": 5.8033, "loss/crossentropy": 2.576148271560669, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17310161888599396, "step": 13780 }, { "epoch": 0.4306875, "grad_norm": 3.078125, "grad_norm_var": 0.1714019775390625, "learning_rate": 0.0001, "loss": 5.3822, "loss/crossentropy": 2.3225895166397095, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15713541209697723, "step": 13782 }, { "epoch": 0.43075, "grad_norm": 3.0, "grad_norm_var": 0.15918680826822917, "learning_rate": 0.0001, "loss": 5.7832, "loss/crossentropy": 2.639627456665039, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16670116037130356, "step": 13784 }, { "epoch": 0.4308125, "grad_norm": 3.0625, "grad_norm_var": 0.11142171223958333, "learning_rate": 0.0001, "loss": 5.7224, "loss/crossentropy": 2.518721342086792, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1719333603978157, "step": 13786 }, { "epoch": 0.430875, "grad_norm": 3.296875, "grad_norm_var": 0.10881754557291666, "learning_rate": 0.0001, "loss": 5.9192, "loss/crossentropy": 2.5601470470428467, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18082885444164276, "step": 13788 }, { "epoch": 0.4309375, "grad_norm": 3.359375, "grad_norm_var": 0.028336588541666666, "learning_rate": 0.0001, "loss": 5.8847, "loss/crossentropy": 2.6607024669647217, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1751309037208557, "step": 13790 }, { "epoch": 0.431, "grad_norm": 3.28125, "grad_norm_var": 0.024372355143229166, "learning_rate": 0.0001, "loss": 5.5683, "loss/crossentropy": 2.3886818885803223, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16678636521100998, "step": 13792 }, { "epoch": 0.4310625, "grad_norm": 3.34375, "grad_norm_var": 0.02666015625, "learning_rate": 0.0001, "loss": 5.7791, "loss/crossentropy": 2.5440547466278076, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17311306297779083, "step": 13794 }, { "epoch": 0.431125, "grad_norm": 3.703125, "grad_norm_var": 0.0519439697265625, "learning_rate": 0.0001, "loss": 6.1354, "loss/crossentropy": 2.6821444034576416, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19298520684242249, "step": 13796 }, { "epoch": 0.4311875, "grad_norm": 3.171875, "grad_norm_var": 0.043196614583333334, "learning_rate": 0.0001, "loss": 6.1425, "loss/crossentropy": 2.775768518447876, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1835494562983513, "step": 13798 }, { "epoch": 0.43125, "grad_norm": 3.625, "grad_norm_var": 0.04099833170572917, "learning_rate": 0.0001, "loss": 6.102, "loss/crossentropy": 2.7383395433425903, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18050390481948853, "step": 13800 }, { "epoch": 0.4313125, "grad_norm": 3.515625, "grad_norm_var": 0.0325836181640625, "learning_rate": 0.0001, "loss": 6.2694, "loss/crossentropy": 2.8019193410873413, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19206029176712036, "step": 13802 }, { "epoch": 0.431375, "grad_norm": 3.015625, "grad_norm_var": 0.03837890625, "learning_rate": 0.0001, "loss": 5.3785, "loss/crossentropy": 2.2824034690856934, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.15765908360481262, "step": 13804 }, { "epoch": 0.4314375, "grad_norm": 3.359375, "grad_norm_var": 0.03726806640625, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 2.5160328149795532, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1763525754213333, "step": 13806 }, { "epoch": 0.4315, "grad_norm": 3.03125, "grad_norm_var": 0.05715738932291667, "learning_rate": 0.0001, "loss": 6.078, "loss/crossentropy": 2.739522695541382, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18189027160406113, "step": 13808 }, { "epoch": 0.4315625, "grad_norm": 3.375, "grad_norm_var": 0.059798177083333334, "learning_rate": 0.0001, "loss": 5.948, "loss/crossentropy": 2.6395925283432007, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1800587698817253, "step": 13810 }, { "epoch": 0.431625, "grad_norm": 2.9375, "grad_norm_var": 0.06549072265625, "learning_rate": 0.0001, "loss": 5.8187, "loss/crossentropy": 2.598936915397644, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17197246104478836, "step": 13812 }, { "epoch": 0.4316875, "grad_norm": 3.125, "grad_norm_var": 0.0686187744140625, "learning_rate": 0.0001, "loss": 5.8044, "loss/crossentropy": 2.5577696561813354, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1734897345304489, "step": 13814 }, { "epoch": 0.43175, "grad_norm": 3.171875, "grad_norm_var": 0.06845703125, "learning_rate": 0.0001, "loss": 5.9794, "loss/crossentropy": 2.642100691795349, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1829443871974945, "step": 13816 }, { "epoch": 0.4318125, "grad_norm": 3.203125, "grad_norm_var": 0.07056376139322916, "learning_rate": 0.0001, "loss": 6.0588, "loss/crossentropy": 2.7387092113494873, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18083222210407257, "step": 13818 }, { "epoch": 0.431875, "grad_norm": 3.234375, "grad_norm_var": 0.08374735514322916, "learning_rate": 0.0001, "loss": 5.8616, "loss/crossentropy": 2.754805088043213, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16341445595026016, "step": 13820 }, { "epoch": 0.4319375, "grad_norm": 3.1875, "grad_norm_var": 0.08114827473958333, "learning_rate": 0.0001, "loss": 5.974, "loss/crossentropy": 2.6396480798721313, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18265436589717865, "step": 13822 }, { "epoch": 0.432, "grad_norm": 3.171875, "grad_norm_var": 0.05879618326822917, "learning_rate": 0.0001, "loss": 5.713, "loss/crossentropy": 2.43161678314209, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.17071346193552017, "step": 13824 }, { "epoch": 0.4320625, "grad_norm": 3.21875, "grad_norm_var": 0.039534505208333334, "learning_rate": 0.0001, "loss": 5.8704, "loss/crossentropy": 2.607662320137024, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17822878062725067, "step": 13826 }, { "epoch": 0.432125, "grad_norm": 2.984375, "grad_norm_var": 0.04830322265625, "learning_rate": 0.0001, "loss": 5.929, "loss/crossentropy": 2.6019667387008667, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18114198744297028, "step": 13828 }, { "epoch": 0.4321875, "grad_norm": 3.046875, "grad_norm_var": 0.0496002197265625, "learning_rate": 0.0001, "loss": 5.9722, "loss/crossentropy": 2.7419170141220093, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17263995110988617, "step": 13830 }, { "epoch": 0.43225, "grad_norm": 3.046875, "grad_norm_var": 0.04319559733072917, "learning_rate": 0.0001, "loss": 5.9201, "loss/crossentropy": 2.6870416402816772, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17173901200294495, "step": 13832 }, { "epoch": 0.4323125, "grad_norm": 3.140625, "grad_norm_var": 0.0340972900390625, "learning_rate": 0.0001, "loss": 5.7739, "loss/crossentropy": 2.5142263174057007, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17518573999404907, "step": 13834 }, { "epoch": 0.432375, "grad_norm": 3.1875, "grad_norm_var": 0.023274739583333332, "learning_rate": 0.0001, "loss": 6.1575, "loss/crossentropy": 2.7912551164627075, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18232276290655136, "step": 13836 }, { "epoch": 0.4324375, "grad_norm": 3.140625, "grad_norm_var": 0.02359619140625, "learning_rate": 0.0001, "loss": 5.8468, "loss/crossentropy": 2.5685415267944336, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17782345414161682, "step": 13838 }, { "epoch": 0.4325, "grad_norm": 2.984375, "grad_norm_var": 0.024169921875, "learning_rate": 0.0001, "loss": 5.8359, "loss/crossentropy": 2.544231414794922, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1772119104862213, "step": 13840 }, { "epoch": 0.4325625, "grad_norm": 3.296875, "grad_norm_var": 2.9762603759765627, "learning_rate": 0.0001, "loss": 6.0882, "loss/crossentropy": 2.6641210317611694, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.19084730744361877, "step": 13842 }, { "epoch": 0.432625, "grad_norm": 3.5625, "grad_norm_var": 2.9530924479166667, "learning_rate": 0.0001, "loss": 5.9587, "loss/crossentropy": 2.6440919637680054, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1814604476094246, "step": 13844 }, { "epoch": 0.4326875, "grad_norm": 2.953125, "grad_norm_var": 2.985309855143229, "learning_rate": 0.0001, "loss": 5.3481, "loss/crossentropy": 2.33569598197937, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15827567875385284, "step": 13846 }, { "epoch": 0.43275, "grad_norm": 3.546875, "grad_norm_var": 2.9599273681640623, "learning_rate": 0.0001, "loss": 6.0096, "loss/crossentropy": 2.7038800716400146, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1770564764738083, "step": 13848 }, { "epoch": 0.4328125, "grad_norm": 3.46875, "grad_norm_var": 2.9489898681640625, "learning_rate": 0.0001, "loss": 5.9135, "loss/crossentropy": 2.6132771968841553, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1776796653866768, "step": 13850 }, { "epoch": 0.432875, "grad_norm": 3.234375, "grad_norm_var": 2.94381103515625, "learning_rate": 0.0001, "loss": 5.9411, "loss/crossentropy": 2.643177628517151, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17510050535202026, "step": 13852 }, { "epoch": 0.4329375, "grad_norm": 3.15625, "grad_norm_var": 2.954215494791667, "learning_rate": 0.0001, "loss": 5.6373, "loss/crossentropy": 2.4669450521469116, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1643032804131508, "step": 13854 }, { "epoch": 0.433, "grad_norm": 3.328125, "grad_norm_var": 2.9218658447265624, "learning_rate": 0.0001, "loss": 6.0053, "loss/crossentropy": 2.64568555355072, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17971544712781906, "step": 13856 }, { "epoch": 0.4330625, "grad_norm": 2.984375, "grad_norm_var": 0.05982666015625, "learning_rate": 0.0001, "loss": 5.7653, "loss/crossentropy": 2.6104692220687866, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16391621530056, "step": 13858 }, { "epoch": 0.433125, "grad_norm": 3.609375, "grad_norm_var": 0.06252848307291667, "learning_rate": 0.0001, "loss": 5.8163, "loss/crossentropy": 2.574515700340271, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17417685687541962, "step": 13860 }, { "epoch": 0.4331875, "grad_norm": 3.171875, "grad_norm_var": 0.049169921875, "learning_rate": 0.0001, "loss": 5.5406, "loss/crossentropy": 2.363754153251648, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1712006852030754, "step": 13862 }, { "epoch": 0.43325, "grad_norm": 2.859375, "grad_norm_var": 0.05364176432291667, "learning_rate": 0.0001, "loss": 5.7121, "loss/crossentropy": 2.590463161468506, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16099364310503006, "step": 13864 }, { "epoch": 0.4333125, "grad_norm": 3.21875, "grad_norm_var": 0.0495025634765625, "learning_rate": 0.0001, "loss": 5.747, "loss/crossentropy": 2.486981987953186, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1732637882232666, "step": 13866 }, { "epoch": 0.433375, "grad_norm": 3.8125, "grad_norm_var": 0.07224833170572917, "learning_rate": 0.0001, "loss": 5.7717, "loss/crossentropy": 2.4394524097442627, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17658094316720963, "step": 13868 }, { "epoch": 0.4334375, "grad_norm": 3.53125, "grad_norm_var": 0.07399088541666667, "learning_rate": 0.0001, "loss": 5.9521, "loss/crossentropy": 2.632522225379944, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17844469100236893, "step": 13870 }, { "epoch": 0.4335, "grad_norm": 3.0, "grad_norm_var": 0.06363016764322917, "learning_rate": 0.0001, "loss": 5.8005, "loss/crossentropy": 2.62562096118927, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1682659387588501, "step": 13872 }, { "epoch": 0.4335625, "grad_norm": 3.0, "grad_norm_var": 0.060347493489583334, "learning_rate": 0.0001, "loss": 5.378, "loss/crossentropy": 2.2950897216796875, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16102531552314758, "step": 13874 }, { "epoch": 0.433625, "grad_norm": 3.328125, "grad_norm_var": 0.05243733723958333, "learning_rate": 0.0001, "loss": 5.6193, "loss/crossentropy": 2.465804696083069, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16768839210271835, "step": 13876 }, { "epoch": 0.4336875, "grad_norm": 3.296875, "grad_norm_var": 0.05286356608072917, "learning_rate": 0.0001, "loss": 5.928, "loss/crossentropy": 2.5896536111831665, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18305715918540955, "step": 13878 }, { "epoch": 0.43375, "grad_norm": 3.34375, "grad_norm_var": 0.0444244384765625, "learning_rate": 0.0001, "loss": 5.7915, "loss/crossentropy": 2.517315983772278, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17077286541461945, "step": 13880 }, { "epoch": 0.4338125, "grad_norm": 3.421875, "grad_norm_var": 0.047240193684895834, "learning_rate": 0.0001, "loss": 5.7765, "loss/crossentropy": 2.536103844642639, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17326051741838455, "step": 13882 }, { "epoch": 0.433875, "grad_norm": 3.296875, "grad_norm_var": 0.03373921712239583, "learning_rate": 0.0001, "loss": 5.7424, "loss/crossentropy": 2.558171033859253, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17349782586097717, "step": 13884 }, { "epoch": 0.4339375, "grad_norm": 3.390625, "grad_norm_var": 0.03803609212239583, "learning_rate": 0.0001, "loss": 5.9657, "loss/crossentropy": 2.720614433288574, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17333794385194778, "step": 13886 }, { "epoch": 0.434, "grad_norm": 3.09375, "grad_norm_var": 0.0366851806640625, "learning_rate": 0.0001, "loss": 5.6914, "loss/crossentropy": 2.441069483757019, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17424984276294708, "step": 13888 }, { "epoch": 0.4340625, "grad_norm": 2.984375, "grad_norm_var": 0.03701070149739583, "learning_rate": 0.0001, "loss": 5.7065, "loss/crossentropy": 2.589089870452881, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16603338718414307, "step": 13890 }, { "epoch": 0.434125, "grad_norm": 3.140625, "grad_norm_var": 0.03532613118489583, "learning_rate": 0.0001, "loss": 5.6098, "loss/crossentropy": 2.442548394203186, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16789712011814117, "step": 13892 }, { "epoch": 0.4341875, "grad_norm": 3.109375, "grad_norm_var": 0.03786519368489583, "learning_rate": 0.0001, "loss": 5.9175, "loss/crossentropy": 2.6187649965286255, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17558104544878006, "step": 13894 }, { "epoch": 0.43425, "grad_norm": 3.0625, "grad_norm_var": 0.03863525390625, "learning_rate": 0.0001, "loss": 5.9075, "loss/crossentropy": 2.6415693759918213, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1789368912577629, "step": 13896 }, { "epoch": 0.4343125, "grad_norm": 3.484375, "grad_norm_var": 0.03986002604166667, "learning_rate": 0.0001, "loss": 5.9529, "loss/crossentropy": 2.667291522026062, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17777517437934875, "step": 13898 }, { "epoch": 0.434375, "grad_norm": 3.21875, "grad_norm_var": 0.0394439697265625, "learning_rate": 0.0001, "loss": 5.9658, "loss/crossentropy": 2.699573516845703, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17467273771762848, "step": 13900 }, { "epoch": 0.4344375, "grad_norm": 3.1875, "grad_norm_var": 0.0342681884765625, "learning_rate": 0.0001, "loss": 5.3276, "loss/crossentropy": 2.216606616973877, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15836957097053528, "step": 13902 }, { "epoch": 0.4345, "grad_norm": 3.28125, "grad_norm_var": 0.03125712076822917, "learning_rate": 0.0001, "loss": 5.7911, "loss/crossentropy": 2.6023541688919067, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16887789964675903, "step": 13904 }, { "epoch": 0.4345625, "grad_norm": 3.03125, "grad_norm_var": 0.0301910400390625, "learning_rate": 0.0001, "loss": 5.6791, "loss/crossentropy": 2.4348760843276978, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1705169975757599, "step": 13906 }, { "epoch": 0.434625, "grad_norm": 3.15625, "grad_norm_var": 0.029832967122395835, "learning_rate": 0.0001, "loss": 5.8906, "loss/crossentropy": 2.6364128589630127, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17581136524677277, "step": 13908 }, { "epoch": 0.4346875, "grad_norm": 3.078125, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 5.6061, "loss/crossentropy": 2.473349452018738, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16522951424121857, "step": 13910 }, { "epoch": 0.43475, "grad_norm": 4.15625, "grad_norm_var": 0.084033203125, "learning_rate": 0.0001, "loss": 5.5508, "loss/crossentropy": 2.3882246017456055, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16586287319660187, "step": 13912 }, { "epoch": 0.4348125, "grad_norm": 3.0, "grad_norm_var": 0.08943684895833333, "learning_rate": 0.0001, "loss": 5.652, "loss/crossentropy": 2.467076301574707, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16810613870620728, "step": 13914 }, { "epoch": 0.434875, "grad_norm": 3.390625, "grad_norm_var": 0.0906890869140625, "learning_rate": 0.0001, "loss": 6.3171, "loss/crossentropy": 2.87043559551239, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19076044112443924, "step": 13916 }, { "epoch": 0.4349375, "grad_norm": 3.46875, "grad_norm_var": 0.09966532389322917, "learning_rate": 0.0001, "loss": 5.7529, "loss/crossentropy": 2.58914315700531, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1694972664117813, "step": 13918 }, { "epoch": 0.435, "grad_norm": 3.3125, "grad_norm_var": 0.10134175618489584, "learning_rate": 0.0001, "loss": 6.0511, "loss/crossentropy": 2.7517701387405396, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1783655434846878, "step": 13920 }, { "epoch": 0.4350625, "grad_norm": 3.265625, "grad_norm_var": 0.0958160400390625, "learning_rate": 0.0001, "loss": 5.9012, "loss/crossentropy": 2.6555097103118896, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17730756103992462, "step": 13922 }, { "epoch": 0.435125, "grad_norm": 3.328125, "grad_norm_var": 0.09462890625, "learning_rate": 0.0001, "loss": 5.8087, "loss/crossentropy": 2.492259979248047, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18047624081373215, "step": 13924 }, { "epoch": 0.4351875, "grad_norm": 3.171875, "grad_norm_var": 0.094482421875, "learning_rate": 0.0001, "loss": 6.0533, "loss/crossentropy": 2.752696990966797, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17654091119766235, "step": 13926 }, { "epoch": 0.43525, "grad_norm": 3.03125, "grad_norm_var": 0.04117431640625, "learning_rate": 0.0001, "loss": 5.7251, "loss/crossentropy": 2.494315981864929, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1715133786201477, "step": 13928 }, { "epoch": 0.4353125, "grad_norm": 3.203125, "grad_norm_var": 0.03540751139322917, "learning_rate": 0.0001, "loss": 5.6916, "loss/crossentropy": 2.5007740259170532, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16869062930345535, "step": 13930 }, { "epoch": 0.435375, "grad_norm": 3.109375, "grad_norm_var": 0.026025390625, "learning_rate": 0.0001, "loss": 5.4062, "loss/crossentropy": 2.295772910118103, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16299156844615936, "step": 13932 }, { "epoch": 0.4354375, "grad_norm": 3.28125, "grad_norm_var": 0.0167144775390625, "learning_rate": 0.0001, "loss": 5.891, "loss/crossentropy": 2.620903968811035, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17427542805671692, "step": 13934 }, { "epoch": 0.4355, "grad_norm": 3.1875, "grad_norm_var": 0.018236287434895835, "learning_rate": 0.0001, "loss": 5.9248, "loss/crossentropy": 2.7304844856262207, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17059976607561111, "step": 13936 }, { "epoch": 0.4355625, "grad_norm": 3.734375, "grad_norm_var": 0.03484700520833333, "learning_rate": 0.0001, "loss": 5.937, "loss/crossentropy": 2.4570083618164062, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1894073486328125, "step": 13938 }, { "epoch": 0.435625, "grad_norm": 3.4375, "grad_norm_var": 0.0402252197265625, "learning_rate": 0.0001, "loss": 6.0185, "loss/crossentropy": 2.646772623062134, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1848280280828476, "step": 13940 }, { "epoch": 0.4356875, "grad_norm": 3.28125, "grad_norm_var": 0.037886555989583334, "learning_rate": 0.0001, "loss": 5.8972, "loss/crossentropy": 2.5509214401245117, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18033506721258163, "step": 13942 }, { "epoch": 0.43575, "grad_norm": 3.5, "grad_norm_var": 0.03338216145833333, "learning_rate": 0.0001, "loss": 6.2292, "loss/crossentropy": 2.7814353704452515, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.18657396733760834, "step": 13944 }, { "epoch": 0.4358125, "grad_norm": 3.25, "grad_norm_var": 0.0400299072265625, "learning_rate": 0.0001, "loss": 6.0215, "loss/crossentropy": 2.6906360387802124, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17918318510055542, "step": 13946 }, { "epoch": 0.435875, "grad_norm": 3.125, "grad_norm_var": 0.04468994140625, "learning_rate": 0.0001, "loss": 5.8619, "loss/crossentropy": 2.4934970140457153, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1813701018691063, "step": 13948 }, { "epoch": 0.4359375, "grad_norm": 3.421875, "grad_norm_var": 0.0441314697265625, "learning_rate": 0.0001, "loss": 5.8171, "loss/crossentropy": 2.6028374433517456, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17259834706783295, "step": 13950 }, { "epoch": 0.436, "grad_norm": 3.015625, "grad_norm_var": 0.043211873372395834, "learning_rate": 0.0001, "loss": 5.6416, "loss/crossentropy": 2.4892324209213257, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1668020561337471, "step": 13952 }, { "epoch": 0.4360625, "grad_norm": 2.953125, "grad_norm_var": 0.045556640625, "learning_rate": 0.0001, "loss": 5.6439, "loss/crossentropy": 2.487300753593445, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16761772334575653, "step": 13954 }, { "epoch": 0.436125, "grad_norm": 2.96875, "grad_norm_var": 0.052179972330729164, "learning_rate": 0.0001, "loss": 5.7655, "loss/crossentropy": 2.6785272359848022, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16104034334421158, "step": 13956 }, { "epoch": 0.4361875, "grad_norm": 3.203125, "grad_norm_var": 0.05458984375, "learning_rate": 0.0001, "loss": 5.9288, "loss/crossentropy": 2.6830239295959473, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17340965569019318, "step": 13958 }, { "epoch": 0.43625, "grad_norm": 3.1875, "grad_norm_var": 0.05006103515625, "learning_rate": 0.0001, "loss": 5.8653, "loss/crossentropy": 2.5887218713760376, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1772654503583908, "step": 13960 }, { "epoch": 0.4363125, "grad_norm": 3.546875, "grad_norm_var": 0.045807902018229166, "learning_rate": 0.0001, "loss": 5.4811, "loss/crossentropy": 2.3509578704833984, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16262295097112656, "step": 13962 }, { "epoch": 0.436375, "grad_norm": 2.953125, "grad_norm_var": 0.033980305989583334, "learning_rate": 0.0001, "loss": 5.5103, "loss/crossentropy": 2.366196870803833, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16948788613080978, "step": 13964 }, { "epoch": 0.4364375, "grad_norm": 3.21875, "grad_norm_var": 0.03258463541666667, "learning_rate": 0.0001, "loss": 5.7932, "loss/crossentropy": 2.5165878534317017, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17727329581975937, "step": 13966 }, { "epoch": 0.4365, "grad_norm": 3.046875, "grad_norm_var": 0.03167317708333333, "learning_rate": 0.0001, "loss": 5.8628, "loss/crossentropy": 2.569228172302246, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17583833634853363, "step": 13968 }, { "epoch": 0.4365625, "grad_norm": 3.453125, "grad_norm_var": 0.03478190104166667, "learning_rate": 0.0001, "loss": 5.7704, "loss/crossentropy": 2.4731861352920532, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1773790717124939, "step": 13970 }, { "epoch": 0.436625, "grad_norm": 2.84375, "grad_norm_var": 0.04010009765625, "learning_rate": 0.0001, "loss": 5.7873, "loss/crossentropy": 2.5951253175735474, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1696045771241188, "step": 13972 }, { "epoch": 0.4366875, "grad_norm": 3.046875, "grad_norm_var": 0.040095011393229164, "learning_rate": 0.0001, "loss": 5.8076, "loss/crossentropy": 2.5616809129714966, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1741999313235283, "step": 13974 }, { "epoch": 0.43675, "grad_norm": 3.125, "grad_norm_var": 0.04112040201822917, "learning_rate": 0.0001, "loss": 5.4678, "loss/crossentropy": 2.3087103366851807, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16864745318889618, "step": 13976 }, { "epoch": 0.4368125, "grad_norm": 3.046875, "grad_norm_var": 0.027936808268229165, "learning_rate": 0.0001, "loss": 5.9896, "loss/crossentropy": 2.6676132678985596, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1802445352077484, "step": 13978 }, { "epoch": 0.436875, "grad_norm": 3.078125, "grad_norm_var": 0.027586873372395834, "learning_rate": 0.0001, "loss": 5.5338, "loss/crossentropy": 2.3940389156341553, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16241838037967682, "step": 13980 }, { "epoch": 0.4369375, "grad_norm": 3.453125, "grad_norm_var": 0.10838216145833333, "learning_rate": 0.0001, "loss": 6.1542, "loss/crossentropy": 2.6889700889587402, "loss/hidden": 1.61328125, "loss/jsd": 0.0, "loss/logits": 0.18519629538059235, "step": 13982 }, { "epoch": 0.437, "grad_norm": 3.34375, "grad_norm_var": 0.10601298014322917, "learning_rate": 0.0001, "loss": 5.766, "loss/crossentropy": 2.4458248615264893, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17967405915260315, "step": 13984 }, { "epoch": 0.4370625, "grad_norm": 2.953125, "grad_norm_var": 0.10950419108072916, "learning_rate": 0.0001, "loss": 5.7802, "loss/crossentropy": 2.6169904470443726, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1674969717860222, "step": 13986 }, { "epoch": 0.437125, "grad_norm": 3.1875, "grad_norm_var": 0.09851888020833334, "learning_rate": 0.0001, "loss": 5.6014, "loss/crossentropy": 2.446366548538208, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1658981293439865, "step": 13988 }, { "epoch": 0.4371875, "grad_norm": 3.234375, "grad_norm_var": 0.09902242024739584, "learning_rate": 0.0001, "loss": 5.7611, "loss/crossentropy": 2.5728660821914673, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16999761760234833, "step": 13990 }, { "epoch": 0.43725, "grad_norm": 2.984375, "grad_norm_var": 0.10242411295572916, "learning_rate": 0.0001, "loss": 5.7782, "loss/crossentropy": 2.63112211227417, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16548720002174377, "step": 13992 }, { "epoch": 0.4373125, "grad_norm": 3.3125, "grad_norm_var": 0.0985015869140625, "learning_rate": 0.0001, "loss": 5.9244, "loss/crossentropy": 2.6902605295181274, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17263738065958023, "step": 13994 }, { "epoch": 0.437375, "grad_norm": 4.09375, "grad_norm_var": 0.14097900390625, "learning_rate": 0.0001, "loss": 5.7846, "loss/crossentropy": 2.5803415775299072, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17238248139619827, "step": 13996 }, { "epoch": 0.4374375, "grad_norm": 3.15625, "grad_norm_var": 0.06813151041666667, "learning_rate": 0.0001, "loss": 5.7821, "loss/crossentropy": 2.5158464908599854, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17740444093942642, "step": 13998 }, { "epoch": 0.4375, "grad_norm": 3.53125, "grad_norm_var": 0.8107493082682292, "learning_rate": 0.0001, "loss": 6.6197, "loss/crossentropy": 3.0875765085220337, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19695878773927689, "step": 14000 }, { "epoch": 0.4375625, "grad_norm": 3.40625, "grad_norm_var": 0.7908854166666667, "learning_rate": 0.0001, "loss": 5.6925, "loss/crossentropy": 2.4908525943756104, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.171725332736969, "step": 14002 }, { "epoch": 0.437625, "grad_norm": 3.484375, "grad_norm_var": 0.7793853759765625, "learning_rate": 0.0001, "loss": 5.9493, "loss/crossentropy": 2.6585350036621094, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1779026985168457, "step": 14004 }, { "epoch": 0.4376875, "grad_norm": 3.890625, "grad_norm_var": 0.7587961832682292, "learning_rate": 0.0001, "loss": 6.1833, "loss/crossentropy": 2.781018376350403, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1839783489704132, "step": 14006 }, { "epoch": 0.43775, "grad_norm": 3.453125, "grad_norm_var": 0.73349609375, "learning_rate": 0.0001, "loss": 5.854, "loss/crossentropy": 2.599832057952881, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17541275918483734, "step": 14008 }, { "epoch": 0.4378125, "grad_norm": 3.015625, "grad_norm_var": 0.7720937093098958, "learning_rate": 0.0001, "loss": 5.5801, "loss/crossentropy": 2.4539239406585693, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16652580350637436, "step": 14010 }, { "epoch": 0.437875, "grad_norm": 4.90625, "grad_norm_var": 0.84169921875, "learning_rate": 0.0001, "loss": 5.8686, "loss/crossentropy": 2.5402169227600098, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1820555478334427, "step": 14012 }, { "epoch": 0.4379375, "grad_norm": 3.421875, "grad_norm_var": 0.83336181640625, "learning_rate": 0.0001, "loss": 5.8299, "loss/crossentropy": 2.538370966911316, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17992941290140152, "step": 14014 }, { "epoch": 0.438, "grad_norm": 3.125, "grad_norm_var": 0.2145172119140625, "learning_rate": 0.0001, "loss": 5.5102, "loss/crossentropy": 2.3528844118118286, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16729706525802612, "step": 14016 }, { "epoch": 0.4380625, "grad_norm": 3.171875, "grad_norm_var": 0.2233795166015625, "learning_rate": 0.0001, "loss": 6.0016, "loss/crossentropy": 2.729480028152466, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17760110646486282, "step": 14018 }, { "epoch": 0.438125, "grad_norm": 3.109375, "grad_norm_var": 0.2264312744140625, "learning_rate": 0.0001, "loss": 6.0135, "loss/crossentropy": 2.694047689437866, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18233928829431534, "step": 14020 }, { "epoch": 0.4381875, "grad_norm": 3.4375, "grad_norm_var": 0.19953511555989584, "learning_rate": 0.0001, "loss": 5.8586, "loss/crossentropy": 2.595989942550659, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17586620151996613, "step": 14022 }, { "epoch": 0.43825, "grad_norm": 3.15625, "grad_norm_var": 0.196533203125, "learning_rate": 0.0001, "loss": 5.7336, "loss/crossentropy": 2.4948946237564087, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17074289917945862, "step": 14024 }, { "epoch": 0.4383125, "grad_norm": 3.046875, "grad_norm_var": 0.19375, "learning_rate": 0.0001, "loss": 5.9275, "loss/crossentropy": 2.739194631576538, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1711711436510086, "step": 14026 }, { "epoch": 0.438375, "grad_norm": 3.03125, "grad_norm_var": 0.0367095947265625, "learning_rate": 0.0001, "loss": 5.82, "loss/crossentropy": 2.520078420639038, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17842947691679, "step": 14028 }, { "epoch": 0.4384375, "grad_norm": 3.34375, "grad_norm_var": 0.03531494140625, "learning_rate": 0.0001, "loss": 6.1359, "loss/crossentropy": 2.726751685142517, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1838875114917755, "step": 14030 }, { "epoch": 0.4385, "grad_norm": 3.15625, "grad_norm_var": 0.03815104166666667, "learning_rate": 0.0001, "loss": 5.6123, "loss/crossentropy": 2.5084946155548096, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16624469310045242, "step": 14032 }, { "epoch": 0.4385625, "grad_norm": 3.46875, "grad_norm_var": 0.04306233723958333, "learning_rate": 0.0001, "loss": 5.9164, "loss/crossentropy": 2.6273417472839355, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17733992636203766, "step": 14034 }, { "epoch": 0.438625, "grad_norm": 3.265625, "grad_norm_var": 0.04011128743489583, "learning_rate": 0.0001, "loss": 5.9573, "loss/crossentropy": 2.7191213369369507, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17616549879312515, "step": 14036 }, { "epoch": 0.4386875, "grad_norm": 3.75, "grad_norm_var": 0.05353190104166667, "learning_rate": 0.0001, "loss": 5.9259, "loss/crossentropy": 2.5204232931137085, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1866377666592598, "step": 14038 }, { "epoch": 0.43875, "grad_norm": 3.046875, "grad_norm_var": 0.0557525634765625, "learning_rate": 0.0001, "loss": 5.9545, "loss/crossentropy": 2.701447010040283, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17374404519796371, "step": 14040 }, { "epoch": 0.4388125, "grad_norm": 3.328125, "grad_norm_var": 0.04988505045572917, "learning_rate": 0.0001, "loss": 6.0738, "loss/crossentropy": 2.8018620014190674, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17641377449035645, "step": 14042 }, { "epoch": 0.438875, "grad_norm": 3.59375, "grad_norm_var": 0.04192301432291667, "learning_rate": 0.0001, "loss": 6.0156, "loss/crossentropy": 2.6663641929626465, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1814088374376297, "step": 14044 }, { "epoch": 0.4389375, "grad_norm": 3.0, "grad_norm_var": 0.04879557291666667, "learning_rate": 0.0001, "loss": 5.7216, "loss/crossentropy": 2.509984850883484, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17545349150896072, "step": 14046 }, { "epoch": 0.439, "grad_norm": 3.21875, "grad_norm_var": 0.05074462890625, "learning_rate": 0.0001, "loss": 6.3435, "loss/crossentropy": 2.942991256713867, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18770598620176315, "step": 14048 }, { "epoch": 0.4390625, "grad_norm": 3.203125, "grad_norm_var": 0.04761962890625, "learning_rate": 0.0001, "loss": 5.902, "loss/crossentropy": 2.616852045059204, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17616866528987885, "step": 14050 }, { "epoch": 0.439125, "grad_norm": 2.921875, "grad_norm_var": 0.05896809895833333, "learning_rate": 0.0001, "loss": 5.6839, "loss/crossentropy": 2.5626214742660522, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1625198796391487, "step": 14052 }, { "epoch": 0.4391875, "grad_norm": 3.078125, "grad_norm_var": 0.04194234212239583, "learning_rate": 0.0001, "loss": 5.4755, "loss/crossentropy": 2.345886468887329, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16491156816482544, "step": 14054 }, { "epoch": 0.43925, "grad_norm": 3.359375, "grad_norm_var": 0.04426167805989583, "learning_rate": 0.0001, "loss": 5.6413, "loss/crossentropy": 2.5013986825942993, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16555754095315933, "step": 14056 }, { "epoch": 0.4393125, "grad_norm": 3.515625, "grad_norm_var": 0.04892171223958333, "learning_rate": 0.0001, "loss": 5.8352, "loss/crossentropy": 2.5604896545410156, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17903056740760803, "step": 14058 }, { "epoch": 0.439375, "grad_norm": 3.03125, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 5.7855, "loss/crossentropy": 2.551244020462036, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17264288663864136, "step": 14060 }, { "epoch": 0.4394375, "grad_norm": 3.203125, "grad_norm_var": 0.038630167643229164, "learning_rate": 0.0001, "loss": 6.0331, "loss/crossentropy": 2.6743807792663574, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18313482403755188, "step": 14062 }, { "epoch": 0.4395, "grad_norm": 3.21875, "grad_norm_var": 1.5511881510416667, "learning_rate": 0.0001, "loss": 5.5925, "loss/crossentropy": 2.3944915533065796, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17097366601228714, "step": 14064 }, { "epoch": 0.4395625, "grad_norm": 3.484375, "grad_norm_var": 1.5429026285807292, "learning_rate": 0.0001, "loss": 5.6763, "loss/crossentropy": 2.4504220485687256, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1698511689901352, "step": 14066 }, { "epoch": 0.439625, "grad_norm": 3.59375, "grad_norm_var": 1.515087890625, "learning_rate": 0.0001, "loss": 5.8929, "loss/crossentropy": 2.578840494155884, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18023841083049774, "step": 14068 }, { "epoch": 0.4396875, "grad_norm": 3.265625, "grad_norm_var": 1.5004140218098958, "learning_rate": 0.0001, "loss": 6.1559, "loss/crossentropy": 2.7392576932907104, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1881519854068756, "step": 14070 }, { "epoch": 0.43975, "grad_norm": 3.625, "grad_norm_var": 1.4768218994140625, "learning_rate": 0.0001, "loss": 5.8832, "loss/crossentropy": 2.6005676984786987, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1755259484052658, "step": 14072 }, { "epoch": 0.4398125, "grad_norm": 3.015625, "grad_norm_var": 1.5114095052083334, "learning_rate": 0.0001, "loss": 5.5797, "loss/crossentropy": 2.4586753845214844, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.162494458258152, "step": 14074 }, { "epoch": 0.439875, "grad_norm": 3.125, "grad_norm_var": 1.5263417561848958, "learning_rate": 0.0001, "loss": 5.5643, "loss/crossentropy": 2.388434648513794, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16563333570957184, "step": 14076 }, { "epoch": 0.4399375, "grad_norm": 2.953125, "grad_norm_var": 1.5548828125, "learning_rate": 0.0001, "loss": 5.6168, "loss/crossentropy": 2.4416269063949585, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16986381262540817, "step": 14078 }, { "epoch": 0.44, "grad_norm": 3.296875, "grad_norm_var": 0.05517578125, "learning_rate": 0.0001, "loss": 5.8422, "loss/crossentropy": 2.558874249458313, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17950692027807236, "step": 14080 }, { "epoch": 0.4400625, "grad_norm": 3.03125, "grad_norm_var": 0.045832316080729164, "learning_rate": 0.0001, "loss": 5.7236, "loss/crossentropy": 2.5704649686813354, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16687601059675217, "step": 14082 }, { "epoch": 0.440125, "grad_norm": 3.25, "grad_norm_var": 0.033446248372395834, "learning_rate": 0.0001, "loss": 5.7626, "loss/crossentropy": 2.5022090673446655, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17448122054338455, "step": 14084 }, { "epoch": 0.4401875, "grad_norm": 3.03125, "grad_norm_var": 0.03964436848958333, "learning_rate": 0.0001, "loss": 6.0418, "loss/crossentropy": 2.786649227142334, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17238830029964447, "step": 14086 }, { "epoch": 0.44025, "grad_norm": 3.390625, "grad_norm_var": 0.028571573893229167, "learning_rate": 0.0001, "loss": 5.7734, "loss/crossentropy": 2.6143399477005005, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16786056756973267, "step": 14088 }, { "epoch": 0.4403125, "grad_norm": 2.96875, "grad_norm_var": 0.028962198893229166, "learning_rate": 0.0001, "loss": 5.5703, "loss/crossentropy": 2.4524561166763306, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16178082674741745, "step": 14090 }, { "epoch": 0.440375, "grad_norm": 3.0, "grad_norm_var": 0.030304972330729166, "learning_rate": 0.0001, "loss": 5.7567, "loss/crossentropy": 2.584386944770813, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17074551433324814, "step": 14092 }, { "epoch": 0.4404375, "grad_norm": 3.03125, "grad_norm_var": 0.028238932291666668, "learning_rate": 0.0001, "loss": 5.9168, "loss/crossentropy": 2.6656646728515625, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17472387850284576, "step": 14094 }, { "epoch": 0.4405, "grad_norm": 3.375, "grad_norm_var": 0.026318359375, "learning_rate": 0.0001, "loss": 5.6788, "loss/crossentropy": 2.457118511199951, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1737259030342102, "step": 14096 }, { "epoch": 0.4405625, "grad_norm": 3.09375, "grad_norm_var": 0.029222615559895835, "learning_rate": 0.0001, "loss": 5.9826, "loss/crossentropy": 2.6728047132492065, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17746330052614212, "step": 14098 }, { "epoch": 0.440625, "grad_norm": 3.484375, "grad_norm_var": 0.04126688639322917, "learning_rate": 0.0001, "loss": 5.787, "loss/crossentropy": 2.6073368787765503, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17069777846336365, "step": 14100 }, { "epoch": 0.4406875, "grad_norm": 3.390625, "grad_norm_var": 0.03775634765625, "learning_rate": 0.0001, "loss": 5.9069, "loss/crossentropy": 2.6937869787216187, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16896523535251617, "step": 14102 }, { "epoch": 0.44075, "grad_norm": 3.0625, "grad_norm_var": 0.0427734375, "learning_rate": 0.0001, "loss": 5.7835, "loss/crossentropy": 2.6282999515533447, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16864388436079025, "step": 14104 }, { "epoch": 0.4408125, "grad_norm": 3.09375, "grad_norm_var": 0.04303385416666667, "learning_rate": 0.0001, "loss": 5.7754, "loss/crossentropy": 2.611765742301941, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16558589041233063, "step": 14106 }, { "epoch": 0.440875, "grad_norm": 3.125, "grad_norm_var": 0.039697265625, "learning_rate": 0.0001, "loss": 5.9349, "loss/crossentropy": 2.622216582298279, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18244481831789017, "step": 14108 }, { "epoch": 0.4409375, "grad_norm": 3.15625, "grad_norm_var": 0.037984212239583336, "learning_rate": 0.0001, "loss": 5.7041, "loss/crossentropy": 2.4544249773025513, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17418567091226578, "step": 14110 }, { "epoch": 0.441, "grad_norm": 3.21875, "grad_norm_var": 0.03574930826822917, "learning_rate": 0.0001, "loss": 6.0332, "loss/crossentropy": 2.6911864280700684, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1795118898153305, "step": 14112 }, { "epoch": 0.4410625, "grad_norm": 3.328125, "grad_norm_var": 0.0308258056640625, "learning_rate": 0.0001, "loss": 5.8747, "loss/crossentropy": 2.593514323234558, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17772328853607178, "step": 14114 }, { "epoch": 0.441125, "grad_norm": 3.59375, "grad_norm_var": 0.04641011555989583, "learning_rate": 0.0001, "loss": 6.148, "loss/crossentropy": 2.7083226442337036, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.18459460884332657, "step": 14116 }, { "epoch": 0.4411875, "grad_norm": 3.328125, "grad_norm_var": 0.04452718098958333, "learning_rate": 0.0001, "loss": 5.6375, "loss/crossentropy": 2.4524585008621216, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16733208298683167, "step": 14118 }, { "epoch": 0.44125, "grad_norm": 3.375, "grad_norm_var": 0.0410064697265625, "learning_rate": 0.0001, "loss": 5.6476, "loss/crossentropy": 2.442119598388672, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17016087472438812, "step": 14120 }, { "epoch": 0.4413125, "grad_norm": 3.28125, "grad_norm_var": 0.033935546875, "learning_rate": 0.0001, "loss": 5.9103, "loss/crossentropy": 2.598555088043213, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1815623864531517, "step": 14122 }, { "epoch": 0.441375, "grad_norm": 3.8125, "grad_norm_var": 0.04830322265625, "learning_rate": 0.0001, "loss": 6.1144, "loss/crossentropy": 2.685201048851013, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18784663081169128, "step": 14124 }, { "epoch": 0.4414375, "grad_norm": 3.78125, "grad_norm_var": 0.0636871337890625, "learning_rate": 0.0001, "loss": 6.1894, "loss/crossentropy": 2.7855113744735718, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18647854775190353, "step": 14126 }, { "epoch": 0.4415, "grad_norm": 3.1875, "grad_norm_var": 0.06370035807291667, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.6137852668762207, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16817715764045715, "step": 14128 }, { "epoch": 0.4415625, "grad_norm": 3.484375, "grad_norm_var": 0.0630035400390625, "learning_rate": 0.0001, "loss": 5.7611, "loss/crossentropy": 2.582552433013916, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17059174180030823, "step": 14130 }, { "epoch": 0.441625, "grad_norm": 3.28125, "grad_norm_var": 0.05446675618489583, "learning_rate": 0.0001, "loss": 5.768, "loss/crossentropy": 2.574445605278015, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.169355146586895, "step": 14132 }, { "epoch": 0.4416875, "grad_norm": 3.765625, "grad_norm_var": 0.06409098307291666, "learning_rate": 0.0001, "loss": 6.4481, "loss/crossentropy": 2.8273682594299316, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.20699873566627502, "step": 14134 }, { "epoch": 0.44175, "grad_norm": 3.28125, "grad_norm_var": 0.06486002604166667, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.5572317838668823, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1786102056503296, "step": 14136 }, { "epoch": 0.4418125, "grad_norm": 3.390625, "grad_norm_var": 0.06030985514322917, "learning_rate": 0.0001, "loss": 5.9828, "loss/crossentropy": 2.6731334924697876, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17706193029880524, "step": 14138 }, { "epoch": 0.441875, "grad_norm": 2.921875, "grad_norm_var": 0.05513407389322917, "learning_rate": 0.0001, "loss": 5.558, "loss/crossentropy": 2.442023754119873, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1639430969953537, "step": 14140 }, { "epoch": 0.4419375, "grad_norm": 3.25, "grad_norm_var": 0.03472900390625, "learning_rate": 0.0001, "loss": 5.9016, "loss/crossentropy": 2.5875269174575806, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1806221604347229, "step": 14142 }, { "epoch": 0.442, "grad_norm": 3.171875, "grad_norm_var": 0.0385162353515625, "learning_rate": 0.0001, "loss": 5.7166, "loss/crossentropy": 2.5313161611557007, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17087139934301376, "step": 14144 }, { "epoch": 0.4420625, "grad_norm": 3.078125, "grad_norm_var": 0.0388336181640625, "learning_rate": 0.0001, "loss": 5.8014, "loss/crossentropy": 2.5723941326141357, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17251383513212204, "step": 14146 }, { "epoch": 0.442125, "grad_norm": 2.890625, "grad_norm_var": 0.04780171712239583, "learning_rate": 0.0001, "loss": 5.5317, "loss/crossentropy": 2.4141929149627686, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16487937420606613, "step": 14148 }, { "epoch": 0.4421875, "grad_norm": 3.15625, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 6.1061, "loss/crossentropy": 2.7535077333450317, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18135527521371841, "step": 14150 }, { "epoch": 0.44225, "grad_norm": 3.3125, "grad_norm_var": 0.025126139322916668, "learning_rate": 0.0001, "loss": 6.0045, "loss/crossentropy": 2.7140774726867676, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17904505133628845, "step": 14152 }, { "epoch": 0.4423125, "grad_norm": 3.390625, "grad_norm_var": 0.023053995768229165, "learning_rate": 0.0001, "loss": 5.8206, "loss/crossentropy": 2.5800344944000244, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1760055422782898, "step": 14154 }, { "epoch": 0.442375, "grad_norm": 3.28125, "grad_norm_var": 0.025130208333333334, "learning_rate": 0.0001, "loss": 5.9944, "loss/crossentropy": 2.6406712532043457, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1779462918639183, "step": 14156 }, { "epoch": 0.4424375, "grad_norm": 3.09375, "grad_norm_var": 0.029271443684895832, "learning_rate": 0.0001, "loss": 5.7035, "loss/crossentropy": 2.5728673934936523, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.166965052485466, "step": 14158 }, { "epoch": 0.4425, "grad_norm": 4.375, "grad_norm_var": 0.11597900390625, "learning_rate": 0.0001, "loss": 5.3905, "loss/crossentropy": 2.2556103467941284, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16036295890808105, "step": 14160 }, { "epoch": 0.4425625, "grad_norm": 2.984375, "grad_norm_var": 0.12603759765625, "learning_rate": 0.0001, "loss": 5.8177, "loss/crossentropy": 2.564350128173828, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17455732077360153, "step": 14162 }, { "epoch": 0.442625, "grad_norm": 3.109375, "grad_norm_var": 0.11891276041666667, "learning_rate": 0.0001, "loss": 5.6852, "loss/crossentropy": 2.5463815927505493, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16583866626024246, "step": 14164 }, { "epoch": 0.4426875, "grad_norm": 3.3125, "grad_norm_var": 0.11741434733072917, "learning_rate": 0.0001, "loss": 5.7215, "loss/crossentropy": 2.581761956214905, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1620202511548996, "step": 14166 }, { "epoch": 0.44275, "grad_norm": 3.3125, "grad_norm_var": 0.12092997233072916, "learning_rate": 0.0001, "loss": 5.8324, "loss/crossentropy": 2.656448006629944, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16837306320667267, "step": 14168 }, { "epoch": 0.4428125, "grad_norm": 4.03125, "grad_norm_var": 0.17285054524739582, "learning_rate": 0.0001, "loss": 6.0773, "loss/crossentropy": 2.6987791061401367, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18590284883975983, "step": 14170 }, { "epoch": 0.442875, "grad_norm": 3.375, "grad_norm_var": 0.17138264973958334, "learning_rate": 0.0001, "loss": 5.8663, "loss/crossentropy": 2.5615211725234985, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17579397559165955, "step": 14172 }, { "epoch": 0.4429375, "grad_norm": 2.984375, "grad_norm_var": 0.165185546875, "learning_rate": 0.0001, "loss": 5.4595, "loss/crossentropy": 2.3379613161087036, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1648847609758377, "step": 14174 }, { "epoch": 0.443, "grad_norm": 3.03125, "grad_norm_var": 0.10237223307291667, "learning_rate": 0.0001, "loss": 5.8391, "loss/crossentropy": 2.6235986948013306, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17272279411554337, "step": 14176 }, { "epoch": 0.4430625, "grad_norm": 3.40625, "grad_norm_var": 0.10217692057291666, "learning_rate": 0.0001, "loss": 5.7728, "loss/crossentropy": 2.5508469343185425, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17219547182321548, "step": 14178 }, { "epoch": 0.443125, "grad_norm": 3.125, "grad_norm_var": 0.10484110514322917, "learning_rate": 0.0001, "loss": 5.6887, "loss/crossentropy": 2.5842883586883545, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1651291698217392, "step": 14180 }, { "epoch": 0.4431875, "grad_norm": 3.28125, "grad_norm_var": 0.10510152180989583, "learning_rate": 0.0001, "loss": 5.6395, "loss/crossentropy": 2.3743157386779785, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17573385685682297, "step": 14182 }, { "epoch": 0.44325, "grad_norm": 3.078125, "grad_norm_var": 0.10654195149739583, "learning_rate": 0.0001, "loss": 5.9045, "loss/crossentropy": 2.655468225479126, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1764613389968872, "step": 14184 }, { "epoch": 0.4433125, "grad_norm": 3.390625, "grad_norm_var": 0.032938639322916664, "learning_rate": 0.0001, "loss": 5.9218, "loss/crossentropy": 2.6279489994049072, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17703726887702942, "step": 14186 }, { "epoch": 0.443375, "grad_norm": 3.234375, "grad_norm_var": 0.024560546875, "learning_rate": 0.0001, "loss": 5.5574, "loss/crossentropy": 2.424659490585327, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1656167060136795, "step": 14188 }, { "epoch": 0.4434375, "grad_norm": 3.109375, "grad_norm_var": 0.026334635416666665, "learning_rate": 0.0001, "loss": 5.778, "loss/crossentropy": 2.502086043357849, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17289915680885315, "step": 14190 }, { "epoch": 0.4435, "grad_norm": 3.484375, "grad_norm_var": 0.030475870768229166, "learning_rate": 0.0001, "loss": 5.7241, "loss/crossentropy": 2.526078701019287, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17097116261720657, "step": 14192 }, { "epoch": 0.4435625, "grad_norm": 3.53125, "grad_norm_var": 0.030777994791666666, "learning_rate": 0.0001, "loss": 5.8012, "loss/crossentropy": 2.555241823196411, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17498234659433365, "step": 14194 }, { "epoch": 0.443625, "grad_norm": 3.46875, "grad_norm_var": 0.0344146728515625, "learning_rate": 0.0001, "loss": 6.0615, "loss/crossentropy": 2.6597427129745483, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.1827550157904625, "step": 14196 }, { "epoch": 0.4436875, "grad_norm": 3.328125, "grad_norm_var": 0.03463134765625, "learning_rate": 0.0001, "loss": 5.9326, "loss/crossentropy": 2.5371713638305664, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18329720944166183, "step": 14198 }, { "epoch": 0.44375, "grad_norm": 3.328125, "grad_norm_var": 0.0299957275390625, "learning_rate": 0.0001, "loss": 5.8034, "loss/crossentropy": 2.5516685247421265, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17477956414222717, "step": 14200 }, { "epoch": 0.4438125, "grad_norm": 3.015625, "grad_norm_var": 0.035302734375, "learning_rate": 0.0001, "loss": 5.8454, "loss/crossentropy": 2.634779691696167, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1726226583123207, "step": 14202 }, { "epoch": 0.443875, "grad_norm": 3.09375, "grad_norm_var": 0.0335357666015625, "learning_rate": 0.0001, "loss": 6.1434, "loss/crossentropy": 2.7896809577941895, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18146494776010513, "step": 14204 }, { "epoch": 0.4439375, "grad_norm": 3.40625, "grad_norm_var": 0.03239644368489583, "learning_rate": 0.0001, "loss": 5.9345, "loss/crossentropy": 2.648725152015686, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17779573053121567, "step": 14206 }, { "epoch": 0.444, "grad_norm": 3.5, "grad_norm_var": 0.034886678059895836, "learning_rate": 0.0001, "loss": 6.4137, "loss/crossentropy": 3.0079147815704346, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18511473387479782, "step": 14208 }, { "epoch": 0.4440625, "grad_norm": 3.1875, "grad_norm_var": 0.0277984619140625, "learning_rate": 0.0001, "loss": 5.8442, "loss/crossentropy": 2.6448980569839478, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1699315309524536, "step": 14210 }, { "epoch": 0.444125, "grad_norm": 3.03125, "grad_norm_var": 0.031078084309895834, "learning_rate": 0.0001, "loss": 5.5926, "loss/crossentropy": 2.4751064777374268, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.165263831615448, "step": 14212 }, { "epoch": 0.4441875, "grad_norm": 3.015625, "grad_norm_var": 0.031037394205729166, "learning_rate": 0.0001, "loss": 5.5775, "loss/crossentropy": 2.4614042043685913, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16707490384578705, "step": 14214 }, { "epoch": 0.44425, "grad_norm": 3.421875, "grad_norm_var": 0.20391337076822916, "learning_rate": 0.0001, "loss": 6.1049, "loss/crossentropy": 2.6063095331192017, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.19205011427402496, "step": 14216 }, { "epoch": 0.4443125, "grad_norm": 3.234375, "grad_norm_var": 0.19436442057291667, "learning_rate": 0.0001, "loss": 5.9834, "loss/crossentropy": 2.730865478515625, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17330465465784073, "step": 14218 }, { "epoch": 0.444375, "grad_norm": 3.125, "grad_norm_var": 0.22955322265625, "learning_rate": 0.0001, "loss": 5.673, "loss/crossentropy": 2.423761010169983, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.173749141395092, "step": 14220 }, { "epoch": 0.4444375, "grad_norm": 3.234375, "grad_norm_var": 0.22981363932291668, "learning_rate": 0.0001, "loss": 5.8368, "loss/crossentropy": 2.6157952547073364, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1728796511888504, "step": 14222 }, { "epoch": 0.4445, "grad_norm": 3.1875, "grad_norm_var": 0.23242899576822917, "learning_rate": 0.0001, "loss": 5.4949, "loss/crossentropy": 2.4254437685012817, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.156550794839859, "step": 14224 }, { "epoch": 0.4445625, "grad_norm": 3.484375, "grad_norm_var": 0.22834879557291668, "learning_rate": 0.0001, "loss": 6.1521, "loss/crossentropy": 2.844159722328186, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17884384095668793, "step": 14226 }, { "epoch": 0.444625, "grad_norm": 3.40625, "grad_norm_var": 0.22198893229166666, "learning_rate": 0.0001, "loss": 5.5103, "loss/crossentropy": 2.2973939180374146, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16777722537517548, "step": 14228 }, { "epoch": 0.4446875, "grad_norm": 4.0625, "grad_norm_var": 0.2211334228515625, "learning_rate": 0.0001, "loss": 5.6075, "loss/crossentropy": 2.361604690551758, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17575780302286148, "step": 14230 }, { "epoch": 0.44475, "grad_norm": 3.203125, "grad_norm_var": 0.10288798014322917, "learning_rate": 0.0001, "loss": 5.7361, "loss/crossentropy": 2.5314559936523438, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1708575189113617, "step": 14232 }, { "epoch": 0.4448125, "grad_norm": 3.171875, "grad_norm_var": 0.11469624837239584, "learning_rate": 0.0001, "loss": 5.8871, "loss/crossentropy": 2.6339739561080933, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17453377693891525, "step": 14234 }, { "epoch": 0.444875, "grad_norm": 3.359375, "grad_norm_var": 0.07746480305989584, "learning_rate": 0.0001, "loss": 5.7914, "loss/crossentropy": 2.5673142671585083, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17202017456293106, "step": 14236 }, { "epoch": 0.4449375, "grad_norm": 3.25, "grad_norm_var": 0.08157145182291667, "learning_rate": 0.0001, "loss": 5.7828, "loss/crossentropy": 2.604298233985901, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1666785031557083, "step": 14238 }, { "epoch": 0.445, "grad_norm": 3.953125, "grad_norm_var": 0.29511617024739584, "learning_rate": 0.0001, "loss": 5.6492, "loss/crossentropy": 2.318596124649048, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.17250914126634598, "step": 14240 }, { "epoch": 0.4450625, "grad_norm": 3.40625, "grad_norm_var": 0.30865478515625, "learning_rate": 0.0001, "loss": 5.6779, "loss/crossentropy": 2.496907114982605, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16926919668912888, "step": 14242 }, { "epoch": 0.445125, "grad_norm": 3.546875, "grad_norm_var": 0.2950103759765625, "learning_rate": 0.0001, "loss": 5.8451, "loss/crossentropy": 2.5783416032791138, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17706424742937088, "step": 14244 }, { "epoch": 0.4451875, "grad_norm": 3.390625, "grad_norm_var": 0.27116597493489586, "learning_rate": 0.0001, "loss": 6.032, "loss/crossentropy": 2.7022446393966675, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18102285265922546, "step": 14246 }, { "epoch": 0.44525, "grad_norm": 3.296875, "grad_norm_var": 0.268603515625, "learning_rate": 0.0001, "loss": 5.6267, "loss/crossentropy": 2.4022055864334106, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16854028403759003, "step": 14248 }, { "epoch": 0.4453125, "grad_norm": 3.078125, "grad_norm_var": 0.2626261393229167, "learning_rate": 0.0001, "loss": 5.7496, "loss/crossentropy": 2.60064959526062, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16724015772342682, "step": 14250 }, { "epoch": 0.445375, "grad_norm": 3.40625, "grad_norm_var": 0.26174723307291664, "learning_rate": 0.0001, "loss": 5.8662, "loss/crossentropy": 2.637366533279419, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17093206942081451, "step": 14252 }, { "epoch": 0.4454375, "grad_norm": 3.328125, "grad_norm_var": 0.24931538899739583, "learning_rate": 0.0001, "loss": 6.0655, "loss/crossentropy": 2.6863538026809692, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18322216719388962, "step": 14254 }, { "epoch": 0.4455, "grad_norm": 3.140625, "grad_norm_var": 0.029230753580729168, "learning_rate": 0.0001, "loss": 5.8473, "loss/crossentropy": 2.6832687854766846, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16796649247407913, "step": 14256 }, { "epoch": 0.4455625, "grad_norm": 3.296875, "grad_norm_var": 0.0343414306640625, "learning_rate": 0.0001, "loss": 5.6471, "loss/crossentropy": 2.428324341773987, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16835793107748032, "step": 14258 }, { "epoch": 0.445625, "grad_norm": 3.09375, "grad_norm_var": 0.03557535807291667, "learning_rate": 0.0001, "loss": 5.6099, "loss/crossentropy": 2.470323920249939, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16512752324342728, "step": 14260 }, { "epoch": 0.4456875, "grad_norm": 3.296875, "grad_norm_var": 0.039460245768229166, "learning_rate": 0.0001, "loss": 5.7145, "loss/crossentropy": 2.581887722015381, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16717243194580078, "step": 14262 }, { "epoch": 0.44575, "grad_norm": 3.140625, "grad_norm_var": 0.039915974934895834, "learning_rate": 0.0001, "loss": 5.6021, "loss/crossentropy": 2.4398629665374756, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16818110644817352, "step": 14264 }, { "epoch": 0.4458125, "grad_norm": 3.140625, "grad_norm_var": 0.03868815104166667, "learning_rate": 0.0001, "loss": 5.7567, "loss/crossentropy": 2.552677869796753, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17118756473064423, "step": 14266 }, { "epoch": 0.445875, "grad_norm": 3.046875, "grad_norm_var": 0.04176025390625, "learning_rate": 0.0001, "loss": 5.9116, "loss/crossentropy": 2.7044728994369507, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17032155394554138, "step": 14268 }, { "epoch": 0.4459375, "grad_norm": 3.140625, "grad_norm_var": 0.03943684895833333, "learning_rate": 0.0001, "loss": 5.7551, "loss/crossentropy": 2.523730158805847, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17352360486984253, "step": 14270 }, { "epoch": 0.446, "grad_norm": 3.109375, "grad_norm_var": 0.03864644368489583, "learning_rate": 0.0001, "loss": 5.7731, "loss/crossentropy": 2.597599744796753, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16911684721708298, "step": 14272 }, { "epoch": 0.4460625, "grad_norm": 3.0625, "grad_norm_var": 0.019319661458333335, "learning_rate": 0.0001, "loss": 5.6792, "loss/crossentropy": 2.4801100492477417, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1699041873216629, "step": 14274 }, { "epoch": 0.446125, "grad_norm": 3.125, "grad_norm_var": 0.019090779622395835, "learning_rate": 0.0001, "loss": 5.739, "loss/crossentropy": 2.5322024822235107, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1695106029510498, "step": 14276 }, { "epoch": 0.4461875, "grad_norm": 3.46875, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 6.1333, "loss/crossentropy": 2.7846380472183228, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1817398965358734, "step": 14278 }, { "epoch": 0.44625, "grad_norm": 3.40625, "grad_norm_var": 0.0288238525390625, "learning_rate": 0.0001, "loss": 5.7624, "loss/crossentropy": 2.5464032888412476, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17277182638645172, "step": 14280 }, { "epoch": 0.4463125, "grad_norm": 3.3125, "grad_norm_var": 0.0303863525390625, "learning_rate": 0.0001, "loss": 6.0514, "loss/crossentropy": 2.7622212171554565, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17579232156276703, "step": 14282 }, { "epoch": 0.446375, "grad_norm": 3.015625, "grad_norm_var": 0.04712626139322917, "learning_rate": 0.0001, "loss": 5.8248, "loss/crossentropy": 2.565149188041687, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17713207006454468, "step": 14284 }, { "epoch": 0.4464375, "grad_norm": 3.0625, "grad_norm_var": 0.039599609375, "learning_rate": 0.0001, "loss": 5.6326, "loss/crossentropy": 2.4806195497512817, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16832613199949265, "step": 14286 }, { "epoch": 0.4465, "grad_norm": 3.21875, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 5.7442, "loss/crossentropy": 2.5185790061950684, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17217542231082916, "step": 14288 }, { "epoch": 0.4465625, "grad_norm": 3.0, "grad_norm_var": 0.03753255208333333, "learning_rate": 0.0001, "loss": 5.4591, "loss/crossentropy": 2.3245084285736084, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16423750668764114, "step": 14290 }, { "epoch": 0.446625, "grad_norm": 3.1875, "grad_norm_var": 0.03570963541666667, "learning_rate": 0.0001, "loss": 5.8494, "loss/crossentropy": 2.6422178745269775, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17149612307548523, "step": 14292 }, { "epoch": 0.4466875, "grad_norm": 3.5, "grad_norm_var": 0.03876953125, "learning_rate": 0.0001, "loss": 5.6575, "loss/crossentropy": 2.4422398805618286, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16957762837409973, "step": 14294 }, { "epoch": 0.44675, "grad_norm": 2.984375, "grad_norm_var": 0.038792928059895836, "learning_rate": 0.0001, "loss": 5.6236, "loss/crossentropy": 2.4454299211502075, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16898469626903534, "step": 14296 }, { "epoch": 0.4468125, "grad_norm": 3.671875, "grad_norm_var": 0.0551910400390625, "learning_rate": 0.0001, "loss": 6.2519, "loss/crossentropy": 2.8622608184814453, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1850590854883194, "step": 14298 }, { "epoch": 0.446875, "grad_norm": 3.21875, "grad_norm_var": 0.0386627197265625, "learning_rate": 0.0001, "loss": 5.6949, "loss/crossentropy": 2.469943046569824, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1736626774072647, "step": 14300 }, { "epoch": 0.4469375, "grad_norm": 3.578125, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 6.1595, "loss/crossentropy": 2.8455876111984253, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1767076700925827, "step": 14302 }, { "epoch": 0.447, "grad_norm": 3.0625, "grad_norm_var": 0.0621490478515625, "learning_rate": 0.0001, "loss": 5.379, "loss/crossentropy": 2.3277957439422607, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15551482141017914, "step": 14304 }, { "epoch": 0.4470625, "grad_norm": 3.484375, "grad_norm_var": 0.06354166666666666, "learning_rate": 0.0001, "loss": 6.1249, "loss/crossentropy": 2.7531386613845825, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18014255166053772, "step": 14306 }, { "epoch": 0.447125, "grad_norm": 3.375, "grad_norm_var": 0.06317952473958334, "learning_rate": 0.0001, "loss": 5.9383, "loss/crossentropy": 2.638308882713318, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1784343644976616, "step": 14308 }, { "epoch": 0.4471875, "grad_norm": 3.0625, "grad_norm_var": 0.0624420166015625, "learning_rate": 0.0001, "loss": 5.4067, "loss/crossentropy": 2.3553584814071655, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15981775522232056, "step": 14310 }, { "epoch": 0.44725, "grad_norm": 3.453125, "grad_norm_var": 0.059619140625, "learning_rate": 0.0001, "loss": 5.899, "loss/crossentropy": 2.617624044418335, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17931316792964935, "step": 14312 }, { "epoch": 0.4473125, "grad_norm": 3.015625, "grad_norm_var": 0.05097554524739583, "learning_rate": 0.0001, "loss": 5.591, "loss/crossentropy": 2.4188079833984375, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16916995495557785, "step": 14314 }, { "epoch": 0.447375, "grad_norm": 3.609375, "grad_norm_var": 0.05291341145833333, "learning_rate": 0.0001, "loss": 6.1025, "loss/crossentropy": 2.644850015640259, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.1852177530527115, "step": 14316 }, { "epoch": 0.4474375, "grad_norm": 3.1875, "grad_norm_var": 0.0364654541015625, "learning_rate": 0.0001, "loss": 5.6787, "loss/crossentropy": 2.505561351776123, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1673169583082199, "step": 14318 }, { "epoch": 0.4475, "grad_norm": 4.1875, "grad_norm_var": 0.08759358723958334, "learning_rate": 0.0001, "loss": 6.1003, "loss/crossentropy": 2.6329729557037354, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19009672850370407, "step": 14320 }, { "epoch": 0.4475625, "grad_norm": 3.34375, "grad_norm_var": 0.08820699055989584, "learning_rate": 0.0001, "loss": 5.4881, "loss/crossentropy": 2.3465631008148193, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1645406112074852, "step": 14322 }, { "epoch": 0.447625, "grad_norm": 3.046875, "grad_norm_var": 0.18772379557291666, "learning_rate": 0.0001, "loss": 5.7843, "loss/crossentropy": 2.445573091506958, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17684581130743027, "step": 14324 }, { "epoch": 0.4476875, "grad_norm": 3.109375, "grad_norm_var": 0.18263346354166668, "learning_rate": 0.0001, "loss": 5.9039, "loss/crossentropy": 2.651862382888794, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1755986288189888, "step": 14326 }, { "epoch": 0.44775, "grad_norm": 3.015625, "grad_norm_var": 0.1947662353515625, "learning_rate": 0.0001, "loss": 5.6506, "loss/crossentropy": 2.5213125944137573, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1641010344028473, "step": 14328 }, { "epoch": 0.4478125, "grad_norm": 3.3125, "grad_norm_var": 0.18586832682291668, "learning_rate": 0.0001, "loss": 5.4415, "loss/crossentropy": 2.296280264854431, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16491208970546722, "step": 14330 }, { "epoch": 0.447875, "grad_norm": 3.65625, "grad_norm_var": 0.18569234212239583, "learning_rate": 0.0001, "loss": 5.7038, "loss/crossentropy": 2.455007314682007, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17409540712833405, "step": 14332 }, { "epoch": 0.4479375, "grad_norm": 3.375, "grad_norm_var": 0.17942301432291666, "learning_rate": 0.0001, "loss": 5.9901, "loss/crossentropy": 2.6195307970046997, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18393173068761826, "step": 14334 }, { "epoch": 0.448, "grad_norm": 4.84375, "grad_norm_var": 0.2781646728515625, "learning_rate": 0.0001, "loss": 6.1805, "loss/crossentropy": 2.726317882537842, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.18682335317134857, "step": 14336 }, { "epoch": 0.4480625, "grad_norm": 3.359375, "grad_norm_var": 0.2736612955729167, "learning_rate": 0.0001, "loss": 5.9181, "loss/crossentropy": 2.6938018798828125, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17165353149175644, "step": 14338 }, { "epoch": 0.448125, "grad_norm": 3.609375, "grad_norm_var": 0.18782145182291668, "learning_rate": 0.0001, "loss": 5.8565, "loss/crossentropy": 2.4743123054504395, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18587610125541687, "step": 14340 }, { "epoch": 0.4481875, "grad_norm": 3.1875, "grad_norm_var": 0.1931060791015625, "learning_rate": 0.0001, "loss": 5.6905, "loss/crossentropy": 2.4889925718307495, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17405406385660172, "step": 14342 }, { "epoch": 0.44825, "grad_norm": 3.375, "grad_norm_var": 0.17135009765625, "learning_rate": 0.0001, "loss": 5.8152, "loss/crossentropy": 2.4915691614151, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1768980622291565, "step": 14344 }, { "epoch": 0.4483125, "grad_norm": 3.09375, "grad_norm_var": 0.17325846354166666, "learning_rate": 0.0001, "loss": 6.1619, "loss/crossentropy": 2.8394335508346558, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1802908331155777, "step": 14346 }, { "epoch": 0.448375, "grad_norm": 3.28125, "grad_norm_var": 0.18095703125, "learning_rate": 0.0001, "loss": 5.775, "loss/crossentropy": 2.5895841121673584, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17049019038677216, "step": 14348 }, { "epoch": 0.4484375, "grad_norm": 3.015625, "grad_norm_var": 0.2006011962890625, "learning_rate": 0.0001, "loss": 5.4213, "loss/crossentropy": 2.3961212635040283, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15330223739147186, "step": 14350 }, { "epoch": 0.4485, "grad_norm": 3.125, "grad_norm_var": 0.0506744384765625, "learning_rate": 0.0001, "loss": 5.909, "loss/crossentropy": 2.655321955680847, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17575698345899582, "step": 14352 }, { "epoch": 0.4485625, "grad_norm": 3.15625, "grad_norm_var": 0.05048828125, "learning_rate": 0.0001, "loss": 5.601, "loss/crossentropy": 2.3765926361083984, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17400510609149933, "step": 14354 }, { "epoch": 0.448625, "grad_norm": 3.25, "grad_norm_var": 0.04390360514322917, "learning_rate": 0.0001, "loss": 5.6403, "loss/crossentropy": 2.4661710262298584, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1674160286784172, "step": 14356 }, { "epoch": 0.4486875, "grad_norm": 3.90625, "grad_norm_var": 0.07631734212239584, "learning_rate": 0.0001, "loss": 6.0283, "loss/crossentropy": 2.7284250259399414, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.178815595805645, "step": 14358 }, { "epoch": 0.44875, "grad_norm": 3.484375, "grad_norm_var": 0.07952473958333334, "learning_rate": 0.0001, "loss": 5.9497, "loss/crossentropy": 2.676850438117981, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17611093819141388, "step": 14360 }, { "epoch": 0.4488125, "grad_norm": 3.640625, "grad_norm_var": 0.14666239420572916, "learning_rate": 0.0001, "loss": 6.2483, "loss/crossentropy": 2.6903653144836426, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.19564007222652435, "step": 14362 }, { "epoch": 0.448875, "grad_norm": 2.984375, "grad_norm_var": 0.1464263916015625, "learning_rate": 0.0001, "loss": 5.3807, "loss/crossentropy": 2.291722297668457, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15889935940504074, "step": 14364 }, { "epoch": 0.4489375, "grad_norm": 3.0, "grad_norm_var": 0.13606363932291668, "learning_rate": 0.0001, "loss": 5.8725, "loss/crossentropy": 2.586019277572632, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1763058304786682, "step": 14366 }, { "epoch": 0.449, "grad_norm": 3.328125, "grad_norm_var": 0.127197265625, "learning_rate": 0.0001, "loss": 6.1236, "loss/crossentropy": 2.698602795600891, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18781539052724838, "step": 14368 }, { "epoch": 0.4490625, "grad_norm": 3.15625, "grad_norm_var": 0.1311920166015625, "learning_rate": 0.0001, "loss": 5.7755, "loss/crossentropy": 2.5386215448379517, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17212378978729248, "step": 14370 }, { "epoch": 0.449125, "grad_norm": 3.21875, "grad_norm_var": 0.12361551920572916, "learning_rate": 0.0001, "loss": 5.7539, "loss/crossentropy": 2.5283440351486206, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17450423538684845, "step": 14372 }, { "epoch": 0.4491875, "grad_norm": 3.328125, "grad_norm_var": 0.10881754557291666, "learning_rate": 0.0001, "loss": 5.8751, "loss/crossentropy": 2.707939386367798, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16827435791492462, "step": 14374 }, { "epoch": 0.44925, "grad_norm": 3.078125, "grad_norm_var": 0.12314351399739583, "learning_rate": 0.0001, "loss": 5.5668, "loss/crossentropy": 2.465551018714905, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16169127821922302, "step": 14376 }, { "epoch": 0.4493125, "grad_norm": 3.328125, "grad_norm_var": 0.043017578125, "learning_rate": 0.0001, "loss": 5.874, "loss/crossentropy": 2.6843870878219604, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16817626357078552, "step": 14378 }, { "epoch": 0.449375, "grad_norm": 2.953125, "grad_norm_var": 0.044189453125, "learning_rate": 0.0001, "loss": 5.7486, "loss/crossentropy": 2.5721516609191895, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1688157320022583, "step": 14380 }, { "epoch": 0.4494375, "grad_norm": 2.984375, "grad_norm_var": 0.03626302083333333, "learning_rate": 0.0001, "loss": 5.5518, "loss/crossentropy": 2.4162875413894653, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16706334054470062, "step": 14382 }, { "epoch": 0.4495, "grad_norm": 3.421875, "grad_norm_var": 0.03835347493489583, "learning_rate": 0.0001, "loss": 6.1768, "loss/crossentropy": 2.718705654144287, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.19034168124198914, "step": 14384 }, { "epoch": 0.4495625, "grad_norm": 3.125, "grad_norm_var": 0.038374837239583334, "learning_rate": 0.0001, "loss": 5.7778, "loss/crossentropy": 2.5771600008010864, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1735822558403015, "step": 14386 }, { "epoch": 0.449625, "grad_norm": 3.3125, "grad_norm_var": 0.04042867024739583, "learning_rate": 0.0001, "loss": 5.9021, "loss/crossentropy": 2.5540201663970947, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18128766119480133, "step": 14388 }, { "epoch": 0.4496875, "grad_norm": 3.03125, "grad_norm_var": 0.0366607666015625, "learning_rate": 0.0001, "loss": 5.7921, "loss/crossentropy": 2.554555296897888, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.174534872174263, "step": 14390 }, { "epoch": 0.44975, "grad_norm": 3.21875, "grad_norm_var": 0.02828369140625, "learning_rate": 0.0001, "loss": 5.5898, "loss/crossentropy": 2.409899115562439, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1640855148434639, "step": 14392 }, { "epoch": 0.4498125, "grad_norm": 2.90625, "grad_norm_var": 0.027994791666666668, "learning_rate": 0.0001, "loss": 5.6268, "loss/crossentropy": 2.5423814058303833, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16039558500051498, "step": 14394 }, { "epoch": 0.449875, "grad_norm": 3.40625, "grad_norm_var": 0.026594034830729165, "learning_rate": 0.0001, "loss": 5.8761, "loss/crossentropy": 2.5717968940734863, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17613419890403748, "step": 14396 }, { "epoch": 0.4499375, "grad_norm": 3.4375, "grad_norm_var": 0.02593994140625, "learning_rate": 0.0001, "loss": 6.0095, "loss/crossentropy": 2.6781710386276245, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18118468672037125, "step": 14398 }, { "epoch": 0.45, "grad_norm": 3.03125, "grad_norm_var": 0.021940104166666665, "learning_rate": 0.0001, "loss": 5.9548, "loss/crossentropy": 2.715733528137207, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17351695150136948, "step": 14400 }, { "epoch": 0.4500625, "grad_norm": 2.875, "grad_norm_var": 0.029938761393229166, "learning_rate": 0.0001, "loss": 5.5577, "loss/crossentropy": 2.4219359159469604, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16474615037441254, "step": 14402 }, { "epoch": 0.450125, "grad_norm": 3.5, "grad_norm_var": 0.0500152587890625, "learning_rate": 0.0001, "loss": 5.6615, "loss/crossentropy": 2.498612403869629, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1705838069319725, "step": 14404 }, { "epoch": 0.4501875, "grad_norm": 3.171875, "grad_norm_var": 0.05266520182291667, "learning_rate": 0.0001, "loss": 5.7041, "loss/crossentropy": 2.556857466697693, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16901937872171402, "step": 14406 }, { "epoch": 0.45025, "grad_norm": 3.234375, "grad_norm_var": 0.10281473795572917, "learning_rate": 0.0001, "loss": 5.8924, "loss/crossentropy": 2.53371000289917, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18508557230234146, "step": 14408 }, { "epoch": 0.4503125, "grad_norm": 3.015625, "grad_norm_var": 0.0964752197265625, "learning_rate": 0.0001, "loss": 5.8211, "loss/crossentropy": 2.5924088954925537, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17208484560251236, "step": 14410 }, { "epoch": 0.450375, "grad_norm": 3.171875, "grad_norm_var": 0.09641825358072917, "learning_rate": 0.0001, "loss": 5.8779, "loss/crossentropy": 2.6541298627853394, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1723785549402237, "step": 14412 }, { "epoch": 0.4504375, "grad_norm": 3.3125, "grad_norm_var": 0.10357666015625, "learning_rate": 0.0001, "loss": 5.9689, "loss/crossentropy": 2.6106170415878296, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18426914513111115, "step": 14414 }, { "epoch": 0.4505, "grad_norm": 3.296875, "grad_norm_var": 0.09934794108072917, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.536181330680847, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17437804490327835, "step": 14416 }, { "epoch": 0.4505625, "grad_norm": 3.25, "grad_norm_var": 0.08518473307291667, "learning_rate": 0.0001, "loss": 5.5786, "loss/crossentropy": 2.436997413635254, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1645461916923523, "step": 14418 }, { "epoch": 0.450625, "grad_norm": 3.34375, "grad_norm_var": 0.074755859375, "learning_rate": 0.0001, "loss": 6.0638, "loss/crossentropy": 2.716366171836853, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18161842226982117, "step": 14420 }, { "epoch": 0.4506875, "grad_norm": 3.578125, "grad_norm_var": 0.06943359375, "learning_rate": 0.0001, "loss": 6.0411, "loss/crossentropy": 2.7519595623016357, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17852392047643661, "step": 14422 }, { "epoch": 0.45075, "grad_norm": 3.375, "grad_norm_var": 0.028271484375, "learning_rate": 0.0001, "loss": 6.0271, "loss/crossentropy": 2.6504757404327393, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18297728896141052, "step": 14424 }, { "epoch": 0.4508125, "grad_norm": 3.03125, "grad_norm_var": 0.0276031494140625, "learning_rate": 0.0001, "loss": 5.7918, "loss/crossentropy": 2.558031678199768, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1741553172469139, "step": 14426 }, { "epoch": 0.450875, "grad_norm": 3.015625, "grad_norm_var": 0.031998697916666666, "learning_rate": 0.0001, "loss": 5.8355, "loss/crossentropy": 2.606409192085266, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17485878616571426, "step": 14428 }, { "epoch": 0.4509375, "grad_norm": 3.078125, "grad_norm_var": 0.0318756103515625, "learning_rate": 0.0001, "loss": 5.6906, "loss/crossentropy": 2.5467242002487183, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16516976058483124, "step": 14430 }, { "epoch": 0.451, "grad_norm": 3.21875, "grad_norm_var": 0.028587849934895833, "learning_rate": 0.0001, "loss": 6.0052, "loss/crossentropy": 2.616430401802063, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1857563853263855, "step": 14432 }, { "epoch": 0.4510625, "grad_norm": 2.96875, "grad_norm_var": 0.03583882649739583, "learning_rate": 0.0001, "loss": 5.4648, "loss/crossentropy": 2.3175641298294067, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16315650194883347, "step": 14434 }, { "epoch": 0.451125, "grad_norm": 3.25, "grad_norm_var": 0.039484659830729164, "learning_rate": 0.0001, "loss": 5.7707, "loss/crossentropy": 2.580053687095642, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17062243074178696, "step": 14436 }, { "epoch": 0.4511875, "grad_norm": 3.015625, "grad_norm_var": 0.030464680989583333, "learning_rate": 0.0001, "loss": 5.638, "loss/crossentropy": 2.5637532472610474, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1617240235209465, "step": 14438 }, { "epoch": 0.45125, "grad_norm": 3.234375, "grad_norm_var": 0.02828369140625, "learning_rate": 0.0001, "loss": 5.7075, "loss/crossentropy": 2.5479713678359985, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16946709156036377, "step": 14440 }, { "epoch": 0.4513125, "grad_norm": 3.140625, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 5.6683, "loss/crossentropy": 2.513846278190613, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1701308637857437, "step": 14442 }, { "epoch": 0.451375, "grad_norm": 3.046875, "grad_norm_var": 0.03837890625, "learning_rate": 0.0001, "loss": 5.8887, "loss/crossentropy": 2.6817877292633057, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16756878793239594, "step": 14444 }, { "epoch": 0.4514375, "grad_norm": 3.046875, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 5.4545, "loss/crossentropy": 2.336892008781433, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.164496548473835, "step": 14446 }, { "epoch": 0.4515, "grad_norm": 3.625, "grad_norm_var": 0.05224202473958333, "learning_rate": 0.0001, "loss": 5.8803, "loss/crossentropy": 2.680983781814575, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17149503529071808, "step": 14448 }, { "epoch": 0.4515625, "grad_norm": 4.25, "grad_norm_var": 0.10862223307291667, "learning_rate": 0.0001, "loss": 5.9772, "loss/crossentropy": 2.5449445247650146, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1881500631570816, "step": 14450 }, { "epoch": 0.451625, "grad_norm": 3.3125, "grad_norm_var": 0.09728190104166666, "learning_rate": 0.0001, "loss": 5.9131, "loss/crossentropy": 2.637911319732666, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17634259909391403, "step": 14452 }, { "epoch": 0.4516875, "grad_norm": 3.328125, "grad_norm_var": 0.17082926432291667, "learning_rate": 0.0001, "loss": 6.0235, "loss/crossentropy": 2.6543599367141724, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18496477603912354, "step": 14454 }, { "epoch": 0.45175, "grad_norm": 3.1875, "grad_norm_var": 0.17464192708333334, "learning_rate": 0.0001, "loss": 5.9006, "loss/crossentropy": 2.6123616695404053, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17726413905620575, "step": 14456 }, { "epoch": 0.4518125, "grad_norm": 3.109375, "grad_norm_var": 0.17293294270833334, "learning_rate": 0.0001, "loss": 5.566, "loss/crossentropy": 2.388457775115967, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1669737845659256, "step": 14458 }, { "epoch": 0.451875, "grad_norm": 2.859375, "grad_norm_var": 0.18325093587239583, "learning_rate": 0.0001, "loss": 5.6086, "loss/crossentropy": 2.465468168258667, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16470561921596527, "step": 14460 }, { "epoch": 0.4519375, "grad_norm": 2.890625, "grad_norm_var": 0.203955078125, "learning_rate": 0.0001, "loss": 5.5544, "loss/crossentropy": 2.454023003578186, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16394591331481934, "step": 14462 }, { "epoch": 0.452, "grad_norm": 3.046875, "grad_norm_var": 0.2122222900390625, "learning_rate": 0.0001, "loss": 5.9464, "loss/crossentropy": 2.717406749725342, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1760205179452896, "step": 14464 }, { "epoch": 0.4520625, "grad_norm": 3.015625, "grad_norm_var": 0.15450846354166667, "learning_rate": 0.0001, "loss": 5.7106, "loss/crossentropy": 2.5401346683502197, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1674344539642334, "step": 14466 }, { "epoch": 0.452125, "grad_norm": 3.296875, "grad_norm_var": 0.15325419108072916, "learning_rate": 0.0001, "loss": 5.7594, "loss/crossentropy": 2.5550618171691895, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1720004305243492, "step": 14468 }, { "epoch": 0.4521875, "grad_norm": 3.03125, "grad_norm_var": 0.025145467122395834, "learning_rate": 0.0001, "loss": 5.714, "loss/crossentropy": 2.555497169494629, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16545791178941727, "step": 14470 }, { "epoch": 0.45225, "grad_norm": 3.5625, "grad_norm_var": 0.03733622233072917, "learning_rate": 0.0001, "loss": 5.6401, "loss/crossentropy": 2.42263126373291, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16979046165943146, "step": 14472 }, { "epoch": 0.4523125, "grad_norm": 3.3125, "grad_norm_var": 0.04156494140625, "learning_rate": 0.0001, "loss": 5.4641, "loss/crossentropy": 2.412516951560974, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1578877568244934, "step": 14474 }, { "epoch": 0.452375, "grad_norm": 3.234375, "grad_norm_var": 0.03616129557291667, "learning_rate": 0.0001, "loss": 5.9363, "loss/crossentropy": 2.5886703729629517, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1820259764790535, "step": 14476 }, { "epoch": 0.4524375, "grad_norm": 3.546875, "grad_norm_var": 0.04942118326822917, "learning_rate": 0.0001, "loss": 6.0269, "loss/crossentropy": 2.652520537376404, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1835315078496933, "step": 14478 }, { "epoch": 0.4525, "grad_norm": 3.4375, "grad_norm_var": 0.0476715087890625, "learning_rate": 0.0001, "loss": 6.018, "loss/crossentropy": 2.7243393659591675, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1797575280070305, "step": 14480 }, { "epoch": 0.4525625, "grad_norm": 2.90625, "grad_norm_var": 0.053173828125, "learning_rate": 0.0001, "loss": 5.7898, "loss/crossentropy": 2.6540274620056152, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16631050407886505, "step": 14482 }, { "epoch": 0.452625, "grad_norm": 3.34375, "grad_norm_var": 0.053873697916666664, "learning_rate": 0.0001, "loss": 5.7773, "loss/crossentropy": 2.558873176574707, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17457697540521622, "step": 14484 }, { "epoch": 0.4526875, "grad_norm": 2.984375, "grad_norm_var": 0.05115458170572917, "learning_rate": 0.0001, "loss": 5.7279, "loss/crossentropy": 2.600783586502075, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1646689623594284, "step": 14486 }, { "epoch": 0.45275, "grad_norm": 3.5625, "grad_norm_var": 0.11155192057291667, "learning_rate": 0.0001, "loss": 5.9461, "loss/crossentropy": 2.5118407011032104, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.18287695944309235, "step": 14488 }, { "epoch": 0.4528125, "grad_norm": 3.390625, "grad_norm_var": 0.10028889973958334, "learning_rate": 0.0001, "loss": 6.1353, "loss/crossentropy": 2.701886773109436, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1933390125632286, "step": 14490 }, { "epoch": 0.452875, "grad_norm": 3.0, "grad_norm_var": 0.11120503743489583, "learning_rate": 0.0001, "loss": 5.2824, "loss/crossentropy": 2.273401975631714, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15051215142011642, "step": 14492 }, { "epoch": 0.4529375, "grad_norm": 2.984375, "grad_norm_var": 0.10927327473958333, "learning_rate": 0.0001, "loss": 5.662, "loss/crossentropy": 2.533039927482605, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16446173936128616, "step": 14494 }, { "epoch": 0.453, "grad_norm": 3.15625, "grad_norm_var": 0.1080230712890625, "learning_rate": 0.0001, "loss": 5.9995, "loss/crossentropy": 2.7052998542785645, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17629235982894897, "step": 14496 }, { "epoch": 0.4530625, "grad_norm": 3.3125, "grad_norm_var": 0.12870992024739583, "learning_rate": 0.0001, "loss": 5.9878, "loss/crossentropy": 2.7074246406555176, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17999430745840073, "step": 14498 }, { "epoch": 0.453125, "grad_norm": 3.15625, "grad_norm_var": 0.135986328125, "learning_rate": 0.0001, "loss": 5.9737, "loss/crossentropy": 2.7000943422317505, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17579935491085052, "step": 14500 }, { "epoch": 0.4531875, "grad_norm": 3.15625, "grad_norm_var": 0.13108723958333332, "learning_rate": 0.0001, "loss": 5.727, "loss/crossentropy": 2.5456565618515015, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16696364432573318, "step": 14502 }, { "epoch": 0.45325, "grad_norm": 2.984375, "grad_norm_var": 0.0724029541015625, "learning_rate": 0.0001, "loss": 5.6147, "loss/crossentropy": 2.488164782524109, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1673370823264122, "step": 14504 }, { "epoch": 0.4533125, "grad_norm": 3.4375, "grad_norm_var": 0.07317606608072917, "learning_rate": 0.0001, "loss": 6.1609, "loss/crossentropy": 2.7739768028259277, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18713120371103287, "step": 14506 }, { "epoch": 0.453375, "grad_norm": 3.40625, "grad_norm_var": 0.06917317708333333, "learning_rate": 0.0001, "loss": 5.7904, "loss/crossentropy": 2.6002981662750244, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17252818495035172, "step": 14508 }, { "epoch": 0.4534375, "grad_norm": 3.421875, "grad_norm_var": 0.06503804524739583, "learning_rate": 0.0001, "loss": 5.9902, "loss/crossentropy": 2.698809862136841, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1775732785463333, "step": 14510 }, { "epoch": 0.4535, "grad_norm": 4.25, "grad_norm_var": 0.11190999348958333, "learning_rate": 0.0001, "loss": 5.6646, "loss/crossentropy": 2.4125418663024902, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17246823012828827, "step": 14512 }, { "epoch": 0.4535625, "grad_norm": 3.28125, "grad_norm_var": 0.09343159993489583, "learning_rate": 0.0001, "loss": 5.3442, "loss/crossentropy": 2.2412840127944946, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15990018844604492, "step": 14514 }, { "epoch": 0.453625, "grad_norm": 3.265625, "grad_norm_var": 0.08662821451822916, "learning_rate": 0.0001, "loss": 5.4618, "loss/crossentropy": 2.2628493309020996, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16755583137273788, "step": 14516 }, { "epoch": 0.4536875, "grad_norm": 3.234375, "grad_norm_var": 0.08622945149739583, "learning_rate": 0.0001, "loss": 5.6243, "loss/crossentropy": 2.4937922954559326, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16461055725812912, "step": 14518 }, { "epoch": 0.45375, "grad_norm": 3.0625, "grad_norm_var": 0.16317952473958333, "learning_rate": 0.0001, "loss": 5.894, "loss/crossentropy": 2.5887582302093506, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17857546359300613, "step": 14520 }, { "epoch": 0.4538125, "grad_norm": 5.03125, "grad_norm_var": 0.34032796223958334, "learning_rate": 0.0001, "loss": 5.8418, "loss/crossentropy": 2.476287841796875, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18108688294887543, "step": 14522 }, { "epoch": 0.453875, "grad_norm": 3.1875, "grad_norm_var": 0.3337961832682292, "learning_rate": 0.0001, "loss": 5.6447, "loss/crossentropy": 2.4071391820907593, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17180682718753815, "step": 14524 }, { "epoch": 0.4539375, "grad_norm": 3.078125, "grad_norm_var": 0.3443349202473958, "learning_rate": 0.0001, "loss": 5.7284, "loss/crossentropy": 2.589739680290222, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16582194715738297, "step": 14526 }, { "epoch": 0.454, "grad_norm": 3.59375, "grad_norm_var": 0.3094390869140625, "learning_rate": 0.0001, "loss": 5.3112, "loss/crossentropy": 2.1507176756858826, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16487611830234528, "step": 14528 }, { "epoch": 0.4540625, "grad_norm": 3.140625, "grad_norm_var": 0.30414937337239584, "learning_rate": 0.0001, "loss": 5.7803, "loss/crossentropy": 2.494012713432312, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1778479814529419, "step": 14530 }, { "epoch": 0.454125, "grad_norm": 3.0, "grad_norm_var": 0.31660868326822916, "learning_rate": 0.0001, "loss": 5.9288, "loss/crossentropy": 2.6846877336502075, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17401766777038574, "step": 14532 }, { "epoch": 0.4541875, "grad_norm": 3.203125, "grad_norm_var": 0.3118560791015625, "learning_rate": 0.0001, "loss": 5.9263, "loss/crossentropy": 2.62195360660553, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17965682595968246, "step": 14534 }, { "epoch": 0.45425, "grad_norm": 3.09375, "grad_norm_var": 0.24228108723958333, "learning_rate": 0.0001, "loss": 5.7171, "loss/crossentropy": 2.461188793182373, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17481359094381332, "step": 14536 }, { "epoch": 0.4543125, "grad_norm": 3.46875, "grad_norm_var": 0.0307281494140625, "learning_rate": 0.0001, "loss": 5.9262, "loss/crossentropy": 2.575995922088623, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18189138174057007, "step": 14538 }, { "epoch": 0.454375, "grad_norm": 3.484375, "grad_norm_var": 0.028857421875, "learning_rate": 0.0001, "loss": 5.6891, "loss/crossentropy": 2.3918073177337646, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17659986019134521, "step": 14540 }, { "epoch": 0.4544375, "grad_norm": 2.90625, "grad_norm_var": 0.033568318684895834, "learning_rate": 0.0001, "loss": 6.0145, "loss/crossentropy": 2.788806915283203, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17179163545370102, "step": 14542 }, { "epoch": 0.4545, "grad_norm": 3.1875, "grad_norm_var": 0.026839192708333334, "learning_rate": 0.0001, "loss": 5.9726, "loss/crossentropy": 2.686835527420044, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17584455758333206, "step": 14544 }, { "epoch": 0.4545625, "grad_norm": 3.578125, "grad_norm_var": 0.03339742024739583, "learning_rate": 0.0001, "loss": 6.1558, "loss/crossentropy": 2.770237684249878, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18816396594047546, "step": 14546 }, { "epoch": 0.454625, "grad_norm": 3.265625, "grad_norm_var": 0.03225911458333333, "learning_rate": 0.0001, "loss": 6.1124, "loss/crossentropy": 2.7354531288146973, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18222268670797348, "step": 14548 }, { "epoch": 0.4546875, "grad_norm": 3.21875, "grad_norm_var": 0.032648722330729164, "learning_rate": 0.0001, "loss": 5.8928, "loss/crossentropy": 2.6034258604049683, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17580794543027878, "step": 14550 }, { "epoch": 0.45475, "grad_norm": 3.046875, "grad_norm_var": 0.0453033447265625, "learning_rate": 0.0001, "loss": 5.6005, "loss/crossentropy": 2.529695749282837, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15942486375570297, "step": 14552 }, { "epoch": 0.4548125, "grad_norm": 3.25, "grad_norm_var": 0.05146077473958333, "learning_rate": 0.0001, "loss": 6.0578, "loss/crossentropy": 2.69464647769928, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18475749343633652, "step": 14554 }, { "epoch": 0.454875, "grad_norm": 3.0625, "grad_norm_var": 0.05123291015625, "learning_rate": 0.0001, "loss": 6.2026, "loss/crossentropy": 2.8169463872909546, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18778616189956665, "step": 14556 }, { "epoch": 0.4549375, "grad_norm": 3.515625, "grad_norm_var": 0.047900390625, "learning_rate": 0.0001, "loss": 6.048, "loss/crossentropy": 2.7339634895324707, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18062249571084976, "step": 14558 }, { "epoch": 0.455, "grad_norm": 3.71875, "grad_norm_var": 0.0690582275390625, "learning_rate": 0.0001, "loss": 5.85, "loss/crossentropy": 2.6157747507095337, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17224732041358948, "step": 14560 }, { "epoch": 0.4550625, "grad_norm": 2.90625, "grad_norm_var": 0.07171122233072917, "learning_rate": 0.0001, "loss": 6.0783, "loss/crossentropy": 2.7599072456359863, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18027833104133606, "step": 14562 }, { "epoch": 0.455125, "grad_norm": 3.484375, "grad_norm_var": 0.07132161458333333, "learning_rate": 0.0001, "loss": 5.8077, "loss/crossentropy": 2.5338187217712402, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17660734057426453, "step": 14564 }, { "epoch": 0.4551875, "grad_norm": 4.1875, "grad_norm_var": 0.12392578125, "learning_rate": 0.0001, "loss": 5.8379, "loss/crossentropy": 2.4399008750915527, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.17925576120615005, "step": 14566 }, { "epoch": 0.45525, "grad_norm": 3.234375, "grad_norm_var": 0.10426432291666667, "learning_rate": 0.0001, "loss": 6.3463, "loss/crossentropy": 2.948116660118103, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18590866029262543, "step": 14568 }, { "epoch": 0.4553125, "grad_norm": 3.15625, "grad_norm_var": 0.10764567057291667, "learning_rate": 0.0001, "loss": 5.6641, "loss/crossentropy": 2.558690071105957, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16014690697193146, "step": 14570 }, { "epoch": 0.455375, "grad_norm": 10.5, "grad_norm_var": 3.2926666259765627, "learning_rate": 0.0001, "loss": 6.0711, "loss/crossentropy": 2.5426149368286133, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.19894441962242126, "step": 14572 }, { "epoch": 0.4554375, "grad_norm": 3.25, "grad_norm_var": 3.319172159830729, "learning_rate": 0.0001, "loss": 5.8357, "loss/crossentropy": 2.600016236305237, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1762988045811653, "step": 14574 }, { "epoch": 0.4555, "grad_norm": 3.0625, "grad_norm_var": 3.32607421875, "learning_rate": 0.0001, "loss": 5.6709, "loss/crossentropy": 2.523502826690674, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16708385944366455, "step": 14576 }, { "epoch": 0.4555625, "grad_norm": 2.9375, "grad_norm_var": 3.338719685872396, "learning_rate": 0.0001, "loss": 5.8305, "loss/crossentropy": 2.6271212100982666, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16994474828243256, "step": 14578 }, { "epoch": 0.455625, "grad_norm": 3.1875, "grad_norm_var": 3.348705037434896, "learning_rate": 0.0001, "loss": 5.6505, "loss/crossentropy": 2.4964014291763306, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16423358023166656, "step": 14580 }, { "epoch": 0.4556875, "grad_norm": 3.375, "grad_norm_var": 3.341120402018229, "learning_rate": 0.0001, "loss": 5.7585, "loss/crossentropy": 2.471139073371887, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1787339448928833, "step": 14582 }, { "epoch": 0.45575, "grad_norm": 3.515625, "grad_norm_var": 3.3541575113932294, "learning_rate": 0.0001, "loss": 5.7238, "loss/crossentropy": 2.5060452222824097, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17333626747131348, "step": 14584 }, { "epoch": 0.4558125, "grad_norm": 3.234375, "grad_norm_var": 3.33756103515625, "learning_rate": 0.0001, "loss": 5.4606, "loss/crossentropy": 2.30147123336792, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16239578276872635, "step": 14586 }, { "epoch": 0.455875, "grad_norm": 3.140625, "grad_norm_var": 0.07457275390625, "learning_rate": 0.0001, "loss": 5.4625, "loss/crossentropy": 2.429797410964966, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15561362355947495, "step": 14588 }, { "epoch": 0.4559375, "grad_norm": 3.21875, "grad_norm_var": 0.0742828369140625, "learning_rate": 0.0001, "loss": 5.3762, "loss/crossentropy": 2.2654976844787598, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16223956644535065, "step": 14590 }, { "epoch": 0.456, "grad_norm": 3.390625, "grad_norm_var": 0.0739898681640625, "learning_rate": 0.0001, "loss": 5.8523, "loss/crossentropy": 2.6187981367111206, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17452352494001389, "step": 14592 }, { "epoch": 0.4560625, "grad_norm": 3.5, "grad_norm_var": 0.06933492024739583, "learning_rate": 0.0001, "loss": 5.955, "loss/crossentropy": 2.7187896966934204, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17322969436645508, "step": 14594 }, { "epoch": 0.456125, "grad_norm": 3.375, "grad_norm_var": 0.06941731770833333, "learning_rate": 0.0001, "loss": 5.8254, "loss/crossentropy": 2.5588265657424927, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17705295979976654, "step": 14596 }, { "epoch": 0.4561875, "grad_norm": 3.28125, "grad_norm_var": 0.07266337076822917, "learning_rate": 0.0001, "loss": 5.932, "loss/crossentropy": 2.7176828384399414, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1686951220035553, "step": 14598 }, { "epoch": 0.45625, "grad_norm": 3.078125, "grad_norm_var": 0.07768452962239583, "learning_rate": 0.0001, "loss": 5.8074, "loss/crossentropy": 2.5815467834472656, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17610573023557663, "step": 14600 }, { "epoch": 0.4563125, "grad_norm": 3.515625, "grad_norm_var": 0.036864217122395834, "learning_rate": 0.0001, "loss": 5.6907, "loss/crossentropy": 2.534337639808655, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16524504125118256, "step": 14602 }, { "epoch": 0.456375, "grad_norm": 3.109375, "grad_norm_var": 0.0374908447265625, "learning_rate": 0.0001, "loss": 5.6976, "loss/crossentropy": 2.4520140886306763, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17416337877511978, "step": 14604 }, { "epoch": 0.4564375, "grad_norm": 2.96875, "grad_norm_var": 0.041552734375, "learning_rate": 0.0001, "loss": 5.7777, "loss/crossentropy": 2.6158251762390137, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16775169968605042, "step": 14606 }, { "epoch": 0.4565, "grad_norm": 3.3125, "grad_norm_var": 0.04243876139322917, "learning_rate": 0.0001, "loss": 5.9935, "loss/crossentropy": 2.687050700187683, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1806405782699585, "step": 14608 }, { "epoch": 0.4565625, "grad_norm": 3.046875, "grad_norm_var": 0.03892822265625, "learning_rate": 0.0001, "loss": 5.5589, "loss/crossentropy": 2.4694937467575073, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1593277007341385, "step": 14610 }, { "epoch": 0.456625, "grad_norm": 3.484375, "grad_norm_var": 0.03748270670572917, "learning_rate": 0.0001, "loss": 5.725, "loss/crossentropy": 2.4947859048843384, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17536167800426483, "step": 14612 }, { "epoch": 0.4566875, "grad_norm": 3.375, "grad_norm_var": 0.040461222330729164, "learning_rate": 0.0001, "loss": 5.6171, "loss/crossentropy": 2.421966314315796, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17303290963172913, "step": 14614 }, { "epoch": 0.45675, "grad_norm": 3.328125, "grad_norm_var": 0.034403483072916664, "learning_rate": 0.0001, "loss": 5.5022, "loss/crossentropy": 2.274025797843933, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1712556630373001, "step": 14616 }, { "epoch": 0.4568125, "grad_norm": 3.1875, "grad_norm_var": 0.020731608072916668, "learning_rate": 0.0001, "loss": 5.809, "loss/crossentropy": 2.603486657142639, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16859738528728485, "step": 14618 }, { "epoch": 0.456875, "grad_norm": 3.0, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 5.6869, "loss/crossentropy": 2.578454375267029, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16280003637075424, "step": 14620 }, { "epoch": 0.4569375, "grad_norm": 3.140625, "grad_norm_var": 0.026659138997395835, "learning_rate": 0.0001, "loss": 5.7511, "loss/crossentropy": 2.5166012048721313, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17188695818185806, "step": 14622 }, { "epoch": 0.457, "grad_norm": 3.21875, "grad_norm_var": 0.024144490559895832, "learning_rate": 0.0001, "loss": 5.8898, "loss/crossentropy": 2.6979739665985107, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17035824060440063, "step": 14624 }, { "epoch": 0.4570625, "grad_norm": 2.96875, "grad_norm_var": 0.025419108072916665, "learning_rate": 0.0001, "loss": 5.438, "loss/crossentropy": 2.3709373474121094, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16139668226242065, "step": 14626 }, { "epoch": 0.457125, "grad_norm": 3.171875, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 5.6868, "loss/crossentropy": 2.4807026386260986, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1694331392645836, "step": 14628 }, { "epoch": 0.4571875, "grad_norm": 3.21875, "grad_norm_var": 0.0139801025390625, "learning_rate": 0.0001, "loss": 5.8062, "loss/crossentropy": 2.5614261627197266, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17330555617809296, "step": 14630 }, { "epoch": 0.45725, "grad_norm": 3.09375, "grad_norm_var": 0.012255859375, "learning_rate": 0.0001, "loss": 5.8374, "loss/crossentropy": 2.601043462753296, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1740272492170334, "step": 14632 }, { "epoch": 0.4573125, "grad_norm": 3.296875, "grad_norm_var": 0.0140289306640625, "learning_rate": 0.0001, "loss": 5.9327, "loss/crossentropy": 2.6570407152175903, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17912764102220535, "step": 14634 }, { "epoch": 0.457375, "grad_norm": 2.953125, "grad_norm_var": 0.014264933268229167, "learning_rate": 0.0001, "loss": 5.7822, "loss/crossentropy": 2.581598401069641, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.170454740524292, "step": 14636 }, { "epoch": 0.4574375, "grad_norm": 3.609375, "grad_norm_var": 0.0278228759765625, "learning_rate": 0.0001, "loss": 5.8602, "loss/crossentropy": 2.5793548822402954, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17457029223442078, "step": 14638 }, { "epoch": 0.4575, "grad_norm": 3.5625, "grad_norm_var": 0.04016520182291667, "learning_rate": 0.0001, "loss": 5.8721, "loss/crossentropy": 2.624797224998474, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1727820411324501, "step": 14640 }, { "epoch": 0.4575625, "grad_norm": 3.171875, "grad_norm_var": 0.044098917643229166, "learning_rate": 0.0001, "loss": 5.7531, "loss/crossentropy": 2.534020185470581, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17268630117177963, "step": 14642 }, { "epoch": 0.457625, "grad_norm": 3.125, "grad_norm_var": 0.05367431640625, "learning_rate": 0.0001, "loss": 5.7927, "loss/crossentropy": 2.6541357040405273, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16464103013277054, "step": 14644 }, { "epoch": 0.4576875, "grad_norm": 3.28125, "grad_norm_var": 0.06168212890625, "learning_rate": 0.0001, "loss": 5.6557, "loss/crossentropy": 2.514155864715576, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16806161403656006, "step": 14646 }, { "epoch": 0.45775, "grad_norm": 3.171875, "grad_norm_var": 0.0607086181640625, "learning_rate": 0.0001, "loss": 5.8374, "loss/crossentropy": 2.6226083040237427, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17382187396287918, "step": 14648 }, { "epoch": 0.4578125, "grad_norm": 3.96875, "grad_norm_var": 0.097412109375, "learning_rate": 0.0001, "loss": 5.2514, "loss/crossentropy": 2.1894924640655518, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1569766104221344, "step": 14650 }, { "epoch": 0.457875, "grad_norm": 2.984375, "grad_norm_var": 0.09517822265625, "learning_rate": 0.0001, "loss": 5.3859, "loss/crossentropy": 2.330040216445923, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15871459245681763, "step": 14652 }, { "epoch": 0.4579375, "grad_norm": 3.609375, "grad_norm_var": 0.1093414306640625, "learning_rate": 0.0001, "loss": 5.9374, "loss/crossentropy": 2.528441071510315, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1881653070449829, "step": 14654 }, { "epoch": 0.458, "grad_norm": 3.359375, "grad_norm_var": 0.09735921223958334, "learning_rate": 0.0001, "loss": 5.8961, "loss/crossentropy": 2.661376118659973, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1715143546462059, "step": 14656 }, { "epoch": 0.4580625, "grad_norm": 3.03125, "grad_norm_var": 0.09684244791666667, "learning_rate": 0.0001, "loss": 5.9745, "loss/crossentropy": 2.7906899452209473, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16994313150644302, "step": 14658 }, { "epoch": 0.458125, "grad_norm": 3.234375, "grad_norm_var": 0.0846343994140625, "learning_rate": 0.0001, "loss": 5.9035, "loss/crossentropy": 2.65447998046875, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17568230628967285, "step": 14660 }, { "epoch": 0.4581875, "grad_norm": 3.015625, "grad_norm_var": 0.08594462076822916, "learning_rate": 0.0001, "loss": 5.9461, "loss/crossentropy": 2.699332356452942, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17624330520629883, "step": 14662 }, { "epoch": 0.45825, "grad_norm": 3.203125, "grad_norm_var": 0.09119466145833334, "learning_rate": 0.0001, "loss": 5.7475, "loss/crossentropy": 2.514883279800415, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17247988283634186, "step": 14664 }, { "epoch": 0.4583125, "grad_norm": 3.28125, "grad_norm_var": 0.0596343994140625, "learning_rate": 0.0001, "loss": 5.7979, "loss/crossentropy": 2.582164764404297, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1704052984714508, "step": 14666 }, { "epoch": 0.458375, "grad_norm": 2.9375, "grad_norm_var": 0.06144917805989583, "learning_rate": 0.0001, "loss": 5.9131, "loss/crossentropy": 2.6844900846481323, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17286403477191925, "step": 14668 }, { "epoch": 0.4584375, "grad_norm": 3.40625, "grad_norm_var": 0.03411458333333333, "learning_rate": 0.0001, "loss": 5.4473, "loss/crossentropy": 2.341725468635559, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.15821301192045212, "step": 14670 }, { "epoch": 0.4585, "grad_norm": 3.15625, "grad_norm_var": 0.03733622233072917, "learning_rate": 0.0001, "loss": 5.5986, "loss/crossentropy": 2.470059871673584, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16675861179828644, "step": 14672 }, { "epoch": 0.4585625, "grad_norm": 3.015625, "grad_norm_var": 0.03853759765625, "learning_rate": 0.0001, "loss": 5.3315, "loss/crossentropy": 2.319731116294861, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15976974368095398, "step": 14674 }, { "epoch": 0.458625, "grad_norm": 3.03125, "grad_norm_var": 0.040266927083333334, "learning_rate": 0.0001, "loss": 5.8873, "loss/crossentropy": 2.588124394416809, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17679382860660553, "step": 14676 }, { "epoch": 0.4586875, "grad_norm": 3.109375, "grad_norm_var": 0.024312337239583332, "learning_rate": 0.0001, "loss": 5.9772, "loss/crossentropy": 2.825955390930176, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1678621917963028, "step": 14678 }, { "epoch": 0.45875, "grad_norm": 3.390625, "grad_norm_var": 0.028287760416666665, "learning_rate": 0.0001, "loss": 5.6411, "loss/crossentropy": 2.525596022605896, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16389373689889908, "step": 14680 }, { "epoch": 0.4588125, "grad_norm": 3.3125, "grad_norm_var": 0.028857421875, "learning_rate": 0.0001, "loss": 5.6639, "loss/crossentropy": 2.4533002376556396, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16910212486982346, "step": 14682 }, { "epoch": 0.458875, "grad_norm": 3.15625, "grad_norm_var": 0.02818603515625, "learning_rate": 0.0001, "loss": 5.8839, "loss/crossentropy": 2.547485113143921, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17894960939884186, "step": 14684 }, { "epoch": 0.4589375, "grad_norm": 3.0625, "grad_norm_var": 0.026008097330729167, "learning_rate": 0.0001, "loss": 5.5148, "loss/crossentropy": 2.419428825378418, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16500765085220337, "step": 14686 }, { "epoch": 0.459, "grad_norm": 3.296875, "grad_norm_var": 0.020563761393229168, "learning_rate": 0.0001, "loss": 5.8017, "loss/crossentropy": 2.47537899017334, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1826336607336998, "step": 14688 }, { "epoch": 0.4590625, "grad_norm": 3.21875, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 5.6329, "loss/crossentropy": 2.4895445108413696, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16785429418087006, "step": 14690 }, { "epoch": 0.459125, "grad_norm": 3.21875, "grad_norm_var": 0.025194295247395835, "learning_rate": 0.0001, "loss": 5.7113, "loss/crossentropy": 2.4924451112747192, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16992872953414917, "step": 14692 }, { "epoch": 0.4591875, "grad_norm": 3.25, "grad_norm_var": 0.026057942708333334, "learning_rate": 0.0001, "loss": 5.5704, "loss/crossentropy": 2.432380795478821, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16224291920661926, "step": 14694 }, { "epoch": 0.45925, "grad_norm": 3.03125, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 5.8647, "loss/crossentropy": 2.6404428482055664, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17320983111858368, "step": 14696 }, { "epoch": 0.4593125, "grad_norm": 3.703125, "grad_norm_var": 0.04390869140625, "learning_rate": 0.0001, "loss": 5.6257, "loss/crossentropy": 2.459041118621826, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17174628376960754, "step": 14698 }, { "epoch": 0.459375, "grad_norm": 3.0625, "grad_norm_var": 0.04267171223958333, "learning_rate": 0.0001, "loss": 5.7677, "loss/crossentropy": 2.531997799873352, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1751360446214676, "step": 14700 }, { "epoch": 0.4594375, "grad_norm": 3.171875, "grad_norm_var": 0.0443023681640625, "learning_rate": 0.0001, "loss": 5.8761, "loss/crossentropy": 2.7251086235046387, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16588325053453445, "step": 14702 }, { "epoch": 0.4595, "grad_norm": 3.078125, "grad_norm_var": 0.06440327962239584, "learning_rate": 0.0001, "loss": 5.958, "loss/crossentropy": 2.6849414110183716, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17652680724859238, "step": 14704 }, { "epoch": 0.4595625, "grad_norm": 3.140625, "grad_norm_var": 0.06448465983072917, "learning_rate": 0.0001, "loss": 5.7037, "loss/crossentropy": 2.5385085344314575, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16964833438396454, "step": 14706 }, { "epoch": 0.459625, "grad_norm": 3.109375, "grad_norm_var": 0.05878499348958333, "learning_rate": 0.0001, "loss": 5.9176, "loss/crossentropy": 2.7037012577056885, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17139053344726562, "step": 14708 }, { "epoch": 0.4596875, "grad_norm": 3.296875, "grad_norm_var": 0.058089192708333334, "learning_rate": 0.0001, "loss": 5.9178, "loss/crossentropy": 2.68429696559906, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17451756447553635, "step": 14710 }, { "epoch": 0.45975, "grad_norm": 3.5625, "grad_norm_var": 0.06498921712239583, "learning_rate": 0.0001, "loss": 6.1829, "loss/crossentropy": 2.8171669244766235, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18501022458076477, "step": 14712 }, { "epoch": 0.4598125, "grad_norm": 3.25, "grad_norm_var": 0.048460896809895834, "learning_rate": 0.0001, "loss": 5.7204, "loss/crossentropy": 2.5563507080078125, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16992372274398804, "step": 14714 }, { "epoch": 0.459875, "grad_norm": 3.578125, "grad_norm_var": 0.05468343098958333, "learning_rate": 0.0001, "loss": 5.8308, "loss/crossentropy": 2.4868799448013306, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18204709142446518, "step": 14716 }, { "epoch": 0.4599375, "grad_norm": 2.96875, "grad_norm_var": 0.05761311848958333, "learning_rate": 0.0001, "loss": 5.7521, "loss/crossentropy": 2.5840706825256348, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16758275777101517, "step": 14718 }, { "epoch": 0.46, "grad_norm": 3.125, "grad_norm_var": 0.04110921223958333, "learning_rate": 0.0001, "loss": 5.8617, "loss/crossentropy": 2.6252224445343018, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1752118468284607, "step": 14720 }, { "epoch": 0.4600625, "grad_norm": 3.140625, "grad_norm_var": 0.04031575520833333, "learning_rate": 0.0001, "loss": 5.2771, "loss/crossentropy": 2.2195135354995728, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1557537019252777, "step": 14722 }, { "epoch": 0.460125, "grad_norm": 2.953125, "grad_norm_var": 0.04318033854166667, "learning_rate": 0.0001, "loss": 5.4511, "loss/crossentropy": 2.3469650745391846, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1666633039712906, "step": 14724 }, { "epoch": 0.4601875, "grad_norm": 3.1875, "grad_norm_var": 0.0434722900390625, "learning_rate": 0.0001, "loss": 5.6526, "loss/crossentropy": 2.4295217990875244, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1742624267935753, "step": 14726 }, { "epoch": 0.46025, "grad_norm": 3.3125, "grad_norm_var": 0.03411051432291667, "learning_rate": 0.0001, "loss": 5.916, "loss/crossentropy": 2.593013882637024, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17760702222585678, "step": 14728 }, { "epoch": 0.4603125, "grad_norm": 3.265625, "grad_norm_var": 0.03209635416666667, "learning_rate": 0.0001, "loss": 5.7603, "loss/crossentropy": 2.5226577520370483, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1749354526400566, "step": 14730 }, { "epoch": 0.460375, "grad_norm": 3.109375, "grad_norm_var": 0.0208160400390625, "learning_rate": 0.0001, "loss": 5.7309, "loss/crossentropy": 2.4947710037231445, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17400462925434113, "step": 14732 }, { "epoch": 0.4604375, "grad_norm": 3.59375, "grad_norm_var": 0.027034505208333334, "learning_rate": 0.0001, "loss": 5.8304, "loss/crossentropy": 2.486048698425293, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18130739778280258, "step": 14734 }, { "epoch": 0.4605, "grad_norm": 3.375, "grad_norm_var": 0.04313863118489583, "learning_rate": 0.0001, "loss": 5.9464, "loss/crossentropy": 2.545111060142517, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18973546475172043, "step": 14736 }, { "epoch": 0.4605625, "grad_norm": 3.390625, "grad_norm_var": 0.0491119384765625, "learning_rate": 0.0001, "loss": 5.6791, "loss/crossentropy": 2.593599557876587, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16518951207399368, "step": 14738 }, { "epoch": 0.460625, "grad_norm": 3.15625, "grad_norm_var": 0.04248758951822917, "learning_rate": 0.0001, "loss": 5.8006, "loss/crossentropy": 2.6278897523880005, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16844641417264938, "step": 14740 }, { "epoch": 0.4606875, "grad_norm": 3.3125, "grad_norm_var": 0.040816243489583334, "learning_rate": 0.0001, "loss": 5.9979, "loss/crossentropy": 2.687938690185547, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17942996323108673, "step": 14742 }, { "epoch": 0.46075, "grad_norm": 3.015625, "grad_norm_var": 0.0473297119140625, "learning_rate": 0.0001, "loss": 5.3834, "loss/crossentropy": 2.2958441972732544, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16579102724790573, "step": 14744 }, { "epoch": 0.4608125, "grad_norm": 3.109375, "grad_norm_var": 0.055882771809895836, "learning_rate": 0.0001, "loss": 5.9585, "loss/crossentropy": 2.646650195121765, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18079080432653427, "step": 14746 }, { "epoch": 0.460875, "grad_norm": 3.421875, "grad_norm_var": 0.07305399576822917, "learning_rate": 0.0001, "loss": 6.0442, "loss/crossentropy": 2.5205646753311157, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19767975062131882, "step": 14748 }, { "epoch": 0.4609375, "grad_norm": 3.03125, "grad_norm_var": 0.06814676920572917, "learning_rate": 0.0001, "loss": 6.0111, "loss/crossentropy": 2.8023595809936523, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17126596719026566, "step": 14750 }, { "epoch": 0.461, "grad_norm": 3.359375, "grad_norm_var": 0.066357421875, "learning_rate": 0.0001, "loss": 5.493, "loss/crossentropy": 2.3870071172714233, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1625504121184349, "step": 14752 }, { "epoch": 0.4610625, "grad_norm": 3.28125, "grad_norm_var": 0.0614166259765625, "learning_rate": 0.0001, "loss": 5.4415, "loss/crossentropy": 2.3839242458343506, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15849259495735168, "step": 14754 }, { "epoch": 0.461125, "grad_norm": 3.34375, "grad_norm_var": 0.06119384765625, "learning_rate": 0.0001, "loss": 5.6472, "loss/crossentropy": 2.4545260667800903, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16652970761060715, "step": 14756 }, { "epoch": 0.4611875, "grad_norm": 3.15625, "grad_norm_var": 0.061474609375, "learning_rate": 0.0001, "loss": 5.7819, "loss/crossentropy": 2.6248152256011963, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16297141462564468, "step": 14758 }, { "epoch": 0.46125, "grad_norm": 3.609375, "grad_norm_var": 0.06609598795572917, "learning_rate": 0.0001, "loss": 5.7236, "loss/crossentropy": 2.5196938514709473, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17195553332567215, "step": 14760 }, { "epoch": 0.4613125, "grad_norm": 3.046875, "grad_norm_var": 0.062300618489583334, "learning_rate": 0.0001, "loss": 5.5641, "loss/crossentropy": 2.48823082447052, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15875942260026932, "step": 14762 }, { "epoch": 0.461375, "grad_norm": 3.34375, "grad_norm_var": 0.032835896809895834, "learning_rate": 0.0001, "loss": 5.5414, "loss/crossentropy": 2.413806200027466, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1615920066833496, "step": 14764 }, { "epoch": 0.4614375, "grad_norm": 3.328125, "grad_norm_var": 0.032160441080729164, "learning_rate": 0.0001, "loss": 5.926, "loss/crossentropy": 2.5423930883407593, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18289238214492798, "step": 14766 }, { "epoch": 0.4615, "grad_norm": 3.109375, "grad_norm_var": 0.0253326416015625, "learning_rate": 0.0001, "loss": 5.624, "loss/crossentropy": 2.4361913204193115, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16955722868442535, "step": 14768 }, { "epoch": 0.4615625, "grad_norm": 3.0625, "grad_norm_var": 0.0267486572265625, "learning_rate": 0.0001, "loss": 5.6569, "loss/crossentropy": 2.519798755645752, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1676197052001953, "step": 14770 }, { "epoch": 0.461625, "grad_norm": 3.390625, "grad_norm_var": 0.028304036458333334, "learning_rate": 0.0001, "loss": 5.7614, "loss/crossentropy": 2.5674301385879517, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16979040205478668, "step": 14772 }, { "epoch": 0.4616875, "grad_norm": 4.34375, "grad_norm_var": 0.1083984375, "learning_rate": 0.0001, "loss": 5.8391, "loss/crossentropy": 2.525059461593628, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18178994953632355, "step": 14774 }, { "epoch": 0.46175, "grad_norm": 2.953125, "grad_norm_var": 0.10592041015625, "learning_rate": 0.0001, "loss": 5.4408, "loss/crossentropy": 2.348313093185425, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16081024706363678, "step": 14776 }, { "epoch": 0.4618125, "grad_norm": 3.21875, "grad_norm_var": 0.09839579264322916, "learning_rate": 0.0001, "loss": 5.9184, "loss/crossentropy": 2.6475237607955933, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17552414536476135, "step": 14778 }, { "epoch": 0.461875, "grad_norm": 5.03125, "grad_norm_var": 0.29124348958333335, "learning_rate": 0.0001, "loss": 5.7123, "loss/crossentropy": 2.5059620141983032, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17219389975070953, "step": 14780 }, { "epoch": 0.4619375, "grad_norm": 3.78125, "grad_norm_var": 0.30024312337239584, "learning_rate": 0.0001, "loss": 5.5394, "loss/crossentropy": 2.3543527126312256, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16655007749795914, "step": 14782 }, { "epoch": 0.462, "grad_norm": 3.203125, "grad_norm_var": 0.2855143229166667, "learning_rate": 0.0001, "loss": 5.9099, "loss/crossentropy": 2.586949944496155, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18151184171438217, "step": 14784 }, { "epoch": 0.4620625, "grad_norm": 4.03125, "grad_norm_var": 0.29341532389322916, "learning_rate": 0.0001, "loss": 5.8707, "loss/crossentropy": 2.5481022596359253, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1826496571302414, "step": 14786 }, { "epoch": 0.462125, "grad_norm": 3.265625, "grad_norm_var": 0.2879058837890625, "learning_rate": 0.0001, "loss": 5.948, "loss/crossentropy": 2.6165642738342285, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17962933331727982, "step": 14788 }, { "epoch": 0.4621875, "grad_norm": 3.3125, "grad_norm_var": 0.23883056640625, "learning_rate": 0.0001, "loss": 5.9596, "loss/crossentropy": 2.6575080156326294, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1809903010725975, "step": 14790 }, { "epoch": 0.46225, "grad_norm": 2.9375, "grad_norm_var": 0.24378255208333333, "learning_rate": 0.0001, "loss": 5.0272, "loss/crossentropy": 2.03986132144928, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.14991004019975662, "step": 14792 }, { "epoch": 0.4623125, "grad_norm": 4.34375, "grad_norm_var": 0.3038238525390625, "learning_rate": 0.0001, "loss": 6.2764, "loss/crossentropy": 2.893642544746399, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18476461619138718, "step": 14794 }, { "epoch": 0.462375, "grad_norm": 2.953125, "grad_norm_var": 0.15435791015625, "learning_rate": 0.0001, "loss": 5.7914, "loss/crossentropy": 2.607421875, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16761763393878937, "step": 14796 }, { "epoch": 0.4624375, "grad_norm": 3.015625, "grad_norm_var": 0.14687093098958334, "learning_rate": 0.0001, "loss": 5.4684, "loss/crossentropy": 2.319623112678528, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.15980049967765808, "step": 14798 }, { "epoch": 0.4625, "grad_norm": 3.203125, "grad_norm_var": 0.15110270182291666, "learning_rate": 0.0001, "loss": 5.6973, "loss/crossentropy": 2.511590003967285, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16895774006843567, "step": 14800 }, { "epoch": 0.4625625, "grad_norm": 3.390625, "grad_norm_var": 0.11250712076822916, "learning_rate": 0.0001, "loss": 5.9353, "loss/crossentropy": 2.6513490676879883, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17722097784280777, "step": 14802 }, { "epoch": 0.462625, "grad_norm": 3.59375, "grad_norm_var": 0.12161051432291667, "learning_rate": 0.0001, "loss": 6.3433, "loss/crossentropy": 2.8445650339126587, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19244921952486038, "step": 14804 }, { "epoch": 0.4626875, "grad_norm": 3.15625, "grad_norm_var": 0.12209370930989584, "learning_rate": 0.0001, "loss": 5.7802, "loss/crossentropy": 2.4950926303863525, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17733414471149445, "step": 14806 }, { "epoch": 0.46275, "grad_norm": 3.265625, "grad_norm_var": 0.11503499348958333, "learning_rate": 0.0001, "loss": 6.1628, "loss/crossentropy": 2.861618399620056, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17738831788301468, "step": 14808 }, { "epoch": 0.4628125, "grad_norm": 3.171875, "grad_norm_var": 0.0292144775390625, "learning_rate": 0.0001, "loss": 5.9535, "loss/crossentropy": 2.7600247859954834, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1716940999031067, "step": 14810 }, { "epoch": 0.462875, "grad_norm": 3.484375, "grad_norm_var": 0.07835184733072917, "learning_rate": 0.0001, "loss": 6.0486, "loss/crossentropy": 2.697732090950012, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.17570797353982925, "step": 14812 }, { "epoch": 0.4629375, "grad_norm": 3.53125, "grad_norm_var": 0.5511301676432292, "learning_rate": 0.0001, "loss": 6.3706, "loss/crossentropy": 2.7113730907440186, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.21123894304037094, "step": 14814 }, { "epoch": 0.463, "grad_norm": 3.09375, "grad_norm_var": 0.5447092692057292, "learning_rate": 0.0001, "loss": 5.9407, "loss/crossentropy": 2.696148991584778, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17640455812215805, "step": 14816 }, { "epoch": 0.4630625, "grad_norm": 3.203125, "grad_norm_var": 0.5544881184895833, "learning_rate": 0.0001, "loss": 5.9524, "loss/crossentropy": 2.7013330459594727, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17628022283315659, "step": 14818 }, { "epoch": 0.463125, "grad_norm": 3.140625, "grad_norm_var": 0.5630767822265625, "learning_rate": 0.0001, "loss": 5.4809, "loss/crossentropy": 2.3524303436279297, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16128921508789062, "step": 14820 }, { "epoch": 0.4631875, "grad_norm": 3.140625, "grad_norm_var": 0.5612050374348958, "learning_rate": 0.0001, "loss": 5.9576, "loss/crossentropy": 2.6361982822418213, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17705771327018738, "step": 14822 }, { "epoch": 0.46325, "grad_norm": 3.03125, "grad_norm_var": 0.5830078125, "learning_rate": 0.0001, "loss": 5.3445, "loss/crossentropy": 2.308445453643799, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15634051710367203, "step": 14824 }, { "epoch": 0.4633125, "grad_norm": 2.953125, "grad_norm_var": 0.5953928629557291, "learning_rate": 0.0001, "loss": 5.4586, "loss/crossentropy": 2.328156590461731, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16304896026849747, "step": 14826 }, { "epoch": 0.463375, "grad_norm": 3.15625, "grad_norm_var": 0.56376953125, "learning_rate": 0.0001, "loss": 6.092, "loss/crossentropy": 2.772488594055176, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18077776581048965, "step": 14828 }, { "epoch": 0.4634375, "grad_norm": 3.078125, "grad_norm_var": 0.03418680826822917, "learning_rate": 0.0001, "loss": 5.293, "loss/crossentropy": 2.2395306825637817, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1553502231836319, "step": 14830 }, { "epoch": 0.4635, "grad_norm": 13.3125, "grad_norm_var": 6.447184244791667, "learning_rate": 0.0001, "loss": 6.6271, "loss/crossentropy": 2.7424668073654175, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.22869876772165298, "step": 14832 }, { "epoch": 0.4635625, "grad_norm": 3.703125, "grad_norm_var": 6.374535115559896, "learning_rate": 0.0001, "loss": 5.7731, "loss/crossentropy": 2.4544259309768677, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18147774040699005, "step": 14834 }, { "epoch": 0.463625, "grad_norm": 3.4375, "grad_norm_var": 6.3384765625, "learning_rate": 0.0001, "loss": 5.9261, "loss/crossentropy": 2.574429750442505, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18047606199979782, "step": 14836 }, { "epoch": 0.4636875, "grad_norm": 3.640625, "grad_norm_var": 6.321174112955729, "learning_rate": 0.0001, "loss": 5.7131, "loss/crossentropy": 2.3872469663619995, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18141291290521622, "step": 14838 }, { "epoch": 0.46375, "grad_norm": 3.171875, "grad_norm_var": 6.293912760416666, "learning_rate": 0.0001, "loss": 5.5187, "loss/crossentropy": 2.3648808002471924, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1665530502796173, "step": 14840 }, { "epoch": 0.4638125, "grad_norm": 3.328125, "grad_norm_var": 6.237385050455729, "learning_rate": 0.0001, "loss": 5.4878, "loss/crossentropy": 2.4001606702804565, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16071591526269913, "step": 14842 }, { "epoch": 0.463875, "grad_norm": 3.265625, "grad_norm_var": 6.230231730143229, "learning_rate": 0.0001, "loss": 6.2012, "loss/crossentropy": 2.899673342704773, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17897935211658478, "step": 14844 }, { "epoch": 0.4639375, "grad_norm": 3.234375, "grad_norm_var": 6.202144368489583, "learning_rate": 0.0001, "loss": 5.7554, "loss/crossentropy": 2.5932239294052124, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16504772007465363, "step": 14846 }, { "epoch": 0.464, "grad_norm": 3.140625, "grad_norm_var": 0.06018473307291667, "learning_rate": 0.0001, "loss": 5.7055, "loss/crossentropy": 2.6065553426742554, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16380469501018524, "step": 14848 }, { "epoch": 0.4640625, "grad_norm": 3.078125, "grad_norm_var": 0.03425191243489583, "learning_rate": 0.0001, "loss": 5.9366, "loss/crossentropy": 2.713751435279846, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17111469060182571, "step": 14850 }, { "epoch": 0.464125, "grad_norm": 3.171875, "grad_norm_var": 0.03703511555989583, "learning_rate": 0.0001, "loss": 5.7265, "loss/crossentropy": 2.6059426069259644, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16205142438411713, "step": 14852 }, { "epoch": 0.4641875, "grad_norm": 3.953125, "grad_norm_var": 0.06829427083333334, "learning_rate": 0.0001, "loss": 5.8085, "loss/crossentropy": 2.6018357276916504, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1706642284989357, "step": 14854 }, { "epoch": 0.46425, "grad_norm": 3.625, "grad_norm_var": 0.08379618326822917, "learning_rate": 0.0001, "loss": 6.0674, "loss/crossentropy": 2.7422547340393066, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18212289363145828, "step": 14856 }, { "epoch": 0.4643125, "grad_norm": 3.375, "grad_norm_var": 0.08560791015625, "learning_rate": 0.0001, "loss": 5.7673, "loss/crossentropy": 2.5514076948165894, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17120198160409927, "step": 14858 }, { "epoch": 0.464375, "grad_norm": 3.125, "grad_norm_var": 0.0886627197265625, "learning_rate": 0.0001, "loss": 5.8941, "loss/crossentropy": 2.735658288002014, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1670130491256714, "step": 14860 }, { "epoch": 0.4644375, "grad_norm": 3.234375, "grad_norm_var": 0.0883941650390625, "learning_rate": 0.0001, "loss": 5.5859, "loss/crossentropy": 2.4559171199798584, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16690906137228012, "step": 14862 }, { "epoch": 0.4645, "grad_norm": 3.140625, "grad_norm_var": 0.08950093587239584, "learning_rate": 0.0001, "loss": 5.7883, "loss/crossentropy": 2.468541145324707, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17689593136310577, "step": 14864 }, { "epoch": 0.4645625, "grad_norm": 3.296875, "grad_norm_var": 0.0905670166015625, "learning_rate": 0.0001, "loss": 6.0488, "loss/crossentropy": 2.794520854949951, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17347898334264755, "step": 14866 }, { "epoch": 0.464625, "grad_norm": 3.796875, "grad_norm_var": 0.10972391764322917, "learning_rate": 0.0001, "loss": 5.8122, "loss/crossentropy": 2.4630837440490723, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18413476645946503, "step": 14868 }, { "epoch": 0.4646875, "grad_norm": 3.125, "grad_norm_var": 0.06687723795572917, "learning_rate": 0.0001, "loss": 5.662, "loss/crossentropy": 2.499110698699951, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16941003501415253, "step": 14870 }, { "epoch": 0.46475, "grad_norm": 3.015625, "grad_norm_var": 0.05271708170572917, "learning_rate": 0.0001, "loss": 5.6792, "loss/crossentropy": 2.492180109024048, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16908857226371765, "step": 14872 }, { "epoch": 0.4648125, "grad_norm": 3.421875, "grad_norm_var": 0.05548502604166667, "learning_rate": 0.0001, "loss": 5.9703, "loss/crossentropy": 2.632256269454956, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18107140809297562, "step": 14874 }, { "epoch": 0.464875, "grad_norm": 3.40625, "grad_norm_var": 0.052958170572916664, "learning_rate": 0.0001, "loss": 5.7635, "loss/crossentropy": 2.4615299701690674, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17785103619098663, "step": 14876 }, { "epoch": 0.4649375, "grad_norm": 3.1875, "grad_norm_var": 0.04986979166666667, "learning_rate": 0.0001, "loss": 5.6528, "loss/crossentropy": 2.455379843711853, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16818365454673767, "step": 14878 }, { "epoch": 0.465, "grad_norm": 3.21875, "grad_norm_var": 0.055174763997395834, "learning_rate": 0.0001, "loss": 5.8532, "loss/crossentropy": 2.6770154237747192, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.171913243830204, "step": 14880 }, { "epoch": 0.4650625, "grad_norm": 3.46875, "grad_norm_var": 0.05263570149739583, "learning_rate": 0.0001, "loss": 5.9324, "loss/crossentropy": 2.648526191711426, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1795620173215866, "step": 14882 }, { "epoch": 0.465125, "grad_norm": 3.125, "grad_norm_var": 0.028316243489583334, "learning_rate": 0.0001, "loss": 5.6334, "loss/crossentropy": 2.516292929649353, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16366364061832428, "step": 14884 }, { "epoch": 0.4651875, "grad_norm": 3.09375, "grad_norm_var": 0.028873697916666666, "learning_rate": 0.0001, "loss": 5.5178, "loss/crossentropy": 2.356394052505493, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16887901723384857, "step": 14886 }, { "epoch": 0.46525, "grad_norm": 2.953125, "grad_norm_var": 0.03427734375, "learning_rate": 0.0001, "loss": 5.7019, "loss/crossentropy": 2.5004215240478516, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16936583817005157, "step": 14888 }, { "epoch": 0.4653125, "grad_norm": 3.015625, "grad_norm_var": 0.034211222330729166, "learning_rate": 0.0001, "loss": 5.8795, "loss/crossentropy": 2.6740177869796753, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17093826830387115, "step": 14890 }, { "epoch": 0.465375, "grad_norm": 2.90625, "grad_norm_var": 0.03658854166666667, "learning_rate": 0.0001, "loss": 5.6259, "loss/crossentropy": 2.4603389501571655, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16694767773151398, "step": 14892 }, { "epoch": 0.4654375, "grad_norm": 3.859375, "grad_norm_var": 0.06480204264322917, "learning_rate": 0.0001, "loss": 5.9331, "loss/crossentropy": 2.5701653957366943, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17652342468500137, "step": 14894 }, { "epoch": 0.4655, "grad_norm": 3.3125, "grad_norm_var": 0.05777587890625, "learning_rate": 0.0001, "loss": 5.598, "loss/crossentropy": 2.4629119634628296, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1635095402598381, "step": 14896 }, { "epoch": 0.4655625, "grad_norm": 2.9375, "grad_norm_var": 0.05530192057291667, "learning_rate": 0.0001, "loss": 5.7099, "loss/crossentropy": 2.5170661211013794, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16928022354841232, "step": 14898 }, { "epoch": 0.465625, "grad_norm": 2.90625, "grad_norm_var": 0.059370930989583334, "learning_rate": 0.0001, "loss": 5.7232, "loss/crossentropy": 2.577171802520752, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16733480244874954, "step": 14900 }, { "epoch": 0.4656875, "grad_norm": 3.484375, "grad_norm_var": 0.06769917805989584, "learning_rate": 0.0001, "loss": 5.8252, "loss/crossentropy": 2.646600842475891, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16473273932933807, "step": 14902 }, { "epoch": 0.46575, "grad_norm": 3.1875, "grad_norm_var": 0.058592732747395834, "learning_rate": 0.0001, "loss": 6.0117, "loss/crossentropy": 2.7387936115264893, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17650548368692398, "step": 14904 }, { "epoch": 0.4658125, "grad_norm": 3.359375, "grad_norm_var": 0.05969950358072917, "learning_rate": 0.0001, "loss": 5.9957, "loss/crossentropy": 2.75613534450531, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1743444800376892, "step": 14906 }, { "epoch": 0.465875, "grad_norm": 3.234375, "grad_norm_var": 0.05598856608072917, "learning_rate": 0.0001, "loss": 5.8286, "loss/crossentropy": 2.555168867111206, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17577717453241348, "step": 14908 }, { "epoch": 0.4659375, "grad_norm": 3.203125, "grad_norm_var": 0.026188151041666666, "learning_rate": 0.0001, "loss": 5.7309, "loss/crossentropy": 2.5268471240997314, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1711895540356636, "step": 14910 }, { "epoch": 0.466, "grad_norm": 3.109375, "grad_norm_var": 0.0243560791015625, "learning_rate": 0.0001, "loss": 5.4166, "loss/crossentropy": 2.2632104754447937, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1649443358182907, "step": 14912 }, { "epoch": 0.4660625, "grad_norm": 3.40625, "grad_norm_var": 0.0246002197265625, "learning_rate": 0.0001, "loss": 6.0711, "loss/crossentropy": 2.8264983892440796, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17251016944646835, "step": 14914 }, { "epoch": 0.466125, "grad_norm": 3.5, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 5.7918, "loss/crossentropy": 2.5308409929275513, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17218975722789764, "step": 14916 }, { "epoch": 0.4661875, "grad_norm": 3.234375, "grad_norm_var": 0.015314737955729166, "learning_rate": 0.0001, "loss": 5.788, "loss/crossentropy": 2.5696581602096558, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1702689602971077, "step": 14918 }, { "epoch": 0.46625, "grad_norm": 3.484375, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 5.6231, "loss/crossentropy": 2.4615012407302856, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16576935350894928, "step": 14920 }, { "epoch": 0.4663125, "grad_norm": 3.171875, "grad_norm_var": 0.028465779622395833, "learning_rate": 0.0001, "loss": 5.8206, "loss/crossentropy": 2.502258539199829, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18104856461286545, "step": 14922 }, { "epoch": 0.466375, "grad_norm": 3.09375, "grad_norm_var": 0.041869099934895834, "learning_rate": 0.0001, "loss": 5.7827, "loss/crossentropy": 2.507362484931946, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17401570081710815, "step": 14924 }, { "epoch": 0.4664375, "grad_norm": 3.265625, "grad_norm_var": 0.04195556640625, "learning_rate": 0.0001, "loss": 5.8868, "loss/crossentropy": 2.6286935806274414, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17620167881250381, "step": 14926 }, { "epoch": 0.4665, "grad_norm": 3.328125, "grad_norm_var": 0.044759114583333336, "learning_rate": 0.0001, "loss": 5.6895, "loss/crossentropy": 2.4355950355529785, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17031612992286682, "step": 14928 }, { "epoch": 0.4665625, "grad_norm": 3.34375, "grad_norm_var": 0.04107666015625, "learning_rate": 0.0001, "loss": 6.2603, "loss/crossentropy": 2.83007276058197, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18872267752885818, "step": 14930 }, { "epoch": 0.466625, "grad_norm": 3.078125, "grad_norm_var": 0.05322265625, "learning_rate": 0.0001, "loss": 5.9574, "loss/crossentropy": 2.6480225324630737, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17976664006710052, "step": 14932 }, { "epoch": 0.4666875, "grad_norm": 3.25, "grad_norm_var": 0.05519917805989583, "learning_rate": 0.0001, "loss": 5.4475, "loss/crossentropy": 2.3893600702285767, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15971647202968597, "step": 14934 }, { "epoch": 0.46675, "grad_norm": 3.0625, "grad_norm_var": 0.04731343587239583, "learning_rate": 0.0001, "loss": 5.7363, "loss/crossentropy": 2.5933868885040283, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16585280001163483, "step": 14936 }, { "epoch": 0.4668125, "grad_norm": 3.140625, "grad_norm_var": 0.045735677083333336, "learning_rate": 0.0001, "loss": 5.8121, "loss/crossentropy": 2.619690418243408, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17197076976299286, "step": 14938 }, { "epoch": 0.466875, "grad_norm": 3.015625, "grad_norm_var": 0.03780924479166667, "learning_rate": 0.0001, "loss": 5.7452, "loss/crossentropy": 2.5262022018432617, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17268161475658417, "step": 14940 }, { "epoch": 0.4669375, "grad_norm": 2.875, "grad_norm_var": 0.051789347330729166, "learning_rate": 0.0001, "loss": 5.4068, "loss/crossentropy": 2.4165834188461304, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1529240757226944, "step": 14942 }, { "epoch": 0.467, "grad_norm": 3.1875, "grad_norm_var": 0.0452301025390625, "learning_rate": 0.0001, "loss": 5.609, "loss/crossentropy": 2.5023202896118164, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16535700857639313, "step": 14944 }, { "epoch": 0.4670625, "grad_norm": 3.15625, "grad_norm_var": 0.03922119140625, "learning_rate": 0.0001, "loss": 5.5997, "loss/crossentropy": 2.428265929222107, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1722167730331421, "step": 14946 }, { "epoch": 0.467125, "grad_norm": 3.125, "grad_norm_var": 0.01822509765625, "learning_rate": 0.0001, "loss": 5.8591, "loss/crossentropy": 2.618662476539612, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17599669098854065, "step": 14948 }, { "epoch": 0.4671875, "grad_norm": 3.171875, "grad_norm_var": 0.05751953125, "learning_rate": 0.0001, "loss": 5.8571, "loss/crossentropy": 2.6167280673980713, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17168885469436646, "step": 14950 }, { "epoch": 0.46725, "grad_norm": 3.03125, "grad_norm_var": 0.05782877604166667, "learning_rate": 0.0001, "loss": 5.8178, "loss/crossentropy": 2.5427498817443848, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17125794291496277, "step": 14952 }, { "epoch": 0.4673125, "grad_norm": 3.34375, "grad_norm_var": 0.05614827473958333, "learning_rate": 0.0001, "loss": 6.0896, "loss/crossentropy": 2.736640214920044, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18373557925224304, "step": 14954 }, { "epoch": 0.467375, "grad_norm": 3.03125, "grad_norm_var": 0.0591796875, "learning_rate": 0.0001, "loss": 5.4814, "loss/crossentropy": 2.4086307287216187, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16353166848421097, "step": 14956 }, { "epoch": 0.4674375, "grad_norm": 3.0, "grad_norm_var": 0.05366109212239583, "learning_rate": 0.0001, "loss": 5.6276, "loss/crossentropy": 2.5093424320220947, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16611898690462112, "step": 14958 }, { "epoch": 0.4675, "grad_norm": 3.515625, "grad_norm_var": 0.06289774576822917, "learning_rate": 0.0001, "loss": 6.0632, "loss/crossentropy": 2.6605218648910522, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1847984567284584, "step": 14960 }, { "epoch": 0.4675625, "grad_norm": 3.421875, "grad_norm_var": 0.06642964680989584, "learning_rate": 0.0001, "loss": 5.999, "loss/crossentropy": 2.737818717956543, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1718187928199768, "step": 14962 }, { "epoch": 0.467625, "grad_norm": 3.328125, "grad_norm_var": 0.0684722900390625, "learning_rate": 0.0001, "loss": 5.6779, "loss/crossentropy": 2.5076217651367188, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17054229974746704, "step": 14964 }, { "epoch": 0.4676875, "grad_norm": 3.265625, "grad_norm_var": 0.03534749348958333, "learning_rate": 0.0001, "loss": 5.6204, "loss/crossentropy": 2.4288675785064697, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1675926074385643, "step": 14966 }, { "epoch": 0.46775, "grad_norm": 3.296875, "grad_norm_var": 0.032868448893229166, "learning_rate": 0.0001, "loss": 5.8905, "loss/crossentropy": 2.590886354446411, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.18308492004871368, "step": 14968 }, { "epoch": 0.4678125, "grad_norm": 2.921875, "grad_norm_var": 0.04488932291666667, "learning_rate": 0.0001, "loss": 5.7074, "loss/crossentropy": 2.486487627029419, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17443692684173584, "step": 14970 }, { "epoch": 0.467875, "grad_norm": 3.125, "grad_norm_var": 0.03582356770833333, "learning_rate": 0.0001, "loss": 5.7548, "loss/crossentropy": 2.5663185119628906, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16962523013353348, "step": 14972 }, { "epoch": 0.4679375, "grad_norm": 3.34375, "grad_norm_var": 0.031201171875, "learning_rate": 0.0001, "loss": 5.7835, "loss/crossentropy": 2.5746841430664062, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17088166624307632, "step": 14974 }, { "epoch": 0.468, "grad_norm": 3.015625, "grad_norm_var": 0.029264322916666665, "learning_rate": 0.0001, "loss": 5.6593, "loss/crossentropy": 2.5325201749801636, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16619150340557098, "step": 14976 }, { "epoch": 0.4680625, "grad_norm": 3.109375, "grad_norm_var": 0.025992838541666667, "learning_rate": 0.0001, "loss": 5.7783, "loss/crossentropy": 2.621680974960327, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16956928372383118, "step": 14978 }, { "epoch": 0.468125, "grad_norm": 2.875, "grad_norm_var": 0.028685506184895834, "learning_rate": 0.0001, "loss": 5.1982, "loss/crossentropy": 2.177245855331421, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1599075049161911, "step": 14980 }, { "epoch": 0.4681875, "grad_norm": 3.3125, "grad_norm_var": 0.028804524739583334, "learning_rate": 0.0001, "loss": 5.7523, "loss/crossentropy": 2.594241499900818, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1669737994670868, "step": 14982 }, { "epoch": 0.46825, "grad_norm": 3.234375, "grad_norm_var": 0.028841145833333335, "learning_rate": 0.0001, "loss": 5.9412, "loss/crossentropy": 2.71023952960968, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1734899953007698, "step": 14984 }, { "epoch": 0.4683125, "grad_norm": 3.296875, "grad_norm_var": 0.0262847900390625, "learning_rate": 0.0001, "loss": 5.8847, "loss/crossentropy": 2.6082526445388794, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17686737328767776, "step": 14986 }, { "epoch": 0.468375, "grad_norm": 3.171875, "grad_norm_var": 0.026178995768229168, "learning_rate": 0.0001, "loss": 5.9113, "loss/crossentropy": 2.6365636587142944, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17747656255960464, "step": 14988 }, { "epoch": 0.4684375, "grad_norm": 3.171875, "grad_norm_var": 0.0247711181640625, "learning_rate": 0.0001, "loss": 5.586, "loss/crossentropy": 2.374174952507019, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1719595566391945, "step": 14990 }, { "epoch": 0.4685, "grad_norm": 3.296875, "grad_norm_var": 0.18365478515625, "learning_rate": 0.0001, "loss": 5.7412, "loss/crossentropy": 2.4613447189331055, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1748618260025978, "step": 14992 }, { "epoch": 0.4685625, "grad_norm": 4.875, "grad_norm_var": 0.3292307535807292, "learning_rate": 0.0001, "loss": 6.2036, "loss/crossentropy": 2.71132493019104, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19454240053892136, "step": 14994 }, { "epoch": 0.468625, "grad_norm": 7.25, "grad_norm_var": 1.204295857747396, "learning_rate": 0.0001, "loss": 5.7628, "loss/crossentropy": 2.4733160734176636, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17621204257011414, "step": 14996 }, { "epoch": 0.4686875, "grad_norm": 3.65625, "grad_norm_var": 1.1716217041015624, "learning_rate": 0.0001, "loss": 5.9932, "loss/crossentropy": 2.6372264623641968, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18247324228286743, "step": 14998 }, { "epoch": 0.46875, "grad_norm": 3.546875, "grad_norm_var": 1.1736317952473958, "learning_rate": 0.0001, "loss": 6.1257, "loss/crossentropy": 2.808167815208435, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1801864132285118, "step": 15000 }, { "epoch": 0.4688125, "grad_norm": 3.65625, "grad_norm_var": 1.1867472330729167, "learning_rate": 0.0001, "loss": 6.0516, "loss/crossentropy": 2.6355226039886475, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18809112906455994, "step": 15002 }, { "epoch": 0.468875, "grad_norm": 3.109375, "grad_norm_var": 1.1732747395833334, "learning_rate": 0.0001, "loss": 5.7178, "loss/crossentropy": 2.509262204170227, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17241171002388, "step": 15004 }, { "epoch": 0.4689375, "grad_norm": 3.078125, "grad_norm_var": 1.15576171875, "learning_rate": 0.0001, "loss": 5.5251, "loss/crossentropy": 2.393805503845215, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.15844451636075974, "step": 15006 }, { "epoch": 0.469, "grad_norm": 3.203125, "grad_norm_var": 1.1028879801432292, "learning_rate": 0.0001, "loss": 5.8695, "loss/crossentropy": 2.580573797225952, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17967334389686584, "step": 15008 }, { "epoch": 0.4690625, "grad_norm": 3.21875, "grad_norm_var": 1.020637003580729, "learning_rate": 0.0001, "loss": 5.9801, "loss/crossentropy": 2.7248635292053223, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17786447703838348, "step": 15010 }, { "epoch": 0.469125, "grad_norm": 3.296875, "grad_norm_var": 0.04663798014322917, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.589953064918518, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1762884259223938, "step": 15012 }, { "epoch": 0.4691875, "grad_norm": 3.015625, "grad_norm_var": 0.03866780598958333, "learning_rate": 0.0001, "loss": 5.6274, "loss/crossentropy": 2.503264307975769, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16553644835948944, "step": 15014 }, { "epoch": 0.46925, "grad_norm": 3.4375, "grad_norm_var": 6.729605102539063, "learning_rate": 0.0001, "loss": 6.1971, "loss/crossentropy": 2.5574769973754883, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.21240271627902985, "step": 15016 }, { "epoch": 0.4693125, "grad_norm": 3.265625, "grad_norm_var": 6.725877888997396, "learning_rate": 0.0001, "loss": 5.8184, "loss/crossentropy": 2.5202722549438477, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17747237533330917, "step": 15018 }, { "epoch": 0.469375, "grad_norm": 3.1875, "grad_norm_var": 6.75064697265625, "learning_rate": 0.0001, "loss": 5.5763, "loss/crossentropy": 2.4177215099334717, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16859114915132523, "step": 15020 }, { "epoch": 0.4694375, "grad_norm": 3.3125, "grad_norm_var": 6.739330037434896, "learning_rate": 0.0001, "loss": 6.071, "loss/crossentropy": 2.7344648838043213, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1801370605826378, "step": 15022 }, { "epoch": 0.4695, "grad_norm": 3.515625, "grad_norm_var": 6.714892578125, "learning_rate": 0.0001, "loss": 5.757, "loss/crossentropy": 2.531996011734009, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16976594924926758, "step": 15024 }, { "epoch": 0.4695625, "grad_norm": 2.96875, "grad_norm_var": 6.714094034830729, "learning_rate": 0.0001, "loss": 5.5923, "loss/crossentropy": 2.4961259365081787, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15961505472660065, "step": 15026 }, { "epoch": 0.469625, "grad_norm": 2.9375, "grad_norm_var": 6.713212076822916, "learning_rate": 0.0001, "loss": 5.7165, "loss/crossentropy": 2.5338306427001953, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16748927533626556, "step": 15028 }, { "epoch": 0.4696875, "grad_norm": 3.125, "grad_norm_var": 6.718001302083334, "learning_rate": 0.0001, "loss": 5.3329, "loss/crossentropy": 2.2626683712005615, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16093085706233978, "step": 15030 }, { "epoch": 0.46975, "grad_norm": 3.328125, "grad_norm_var": 0.06539306640625, "learning_rate": 0.0001, "loss": 5.6058, "loss/crossentropy": 2.3245774507522583, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1765628531575203, "step": 15032 }, { "epoch": 0.4698125, "grad_norm": 3.40625, "grad_norm_var": 0.0667388916015625, "learning_rate": 0.0001, "loss": 5.8241, "loss/crossentropy": 2.5087594985961914, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1784081682562828, "step": 15034 }, { "epoch": 0.469875, "grad_norm": 3.078125, "grad_norm_var": 0.06494852701822916, "learning_rate": 0.0001, "loss": 5.7196, "loss/crossentropy": 2.504445433616638, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17112022638320923, "step": 15036 }, { "epoch": 0.4699375, "grad_norm": 3.296875, "grad_norm_var": 0.06573893229166666, "learning_rate": 0.0001, "loss": 5.812, "loss/crossentropy": 2.5174232721328735, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17828761786222458, "step": 15038 }, { "epoch": 0.47, "grad_norm": 3.078125, "grad_norm_var": 0.06652018229166666, "learning_rate": 0.0001, "loss": 5.6692, "loss/crossentropy": 2.443945288658142, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16940295696258545, "step": 15040 }, { "epoch": 0.4700625, "grad_norm": 3.125, "grad_norm_var": 0.06179097493489583, "learning_rate": 0.0001, "loss": 5.6077, "loss/crossentropy": 2.446028470993042, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16499093174934387, "step": 15042 }, { "epoch": 0.470125, "grad_norm": 3.25, "grad_norm_var": 0.05515848795572917, "learning_rate": 0.0001, "loss": 5.7399, "loss/crossentropy": 2.582988977432251, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1680392622947693, "step": 15044 }, { "epoch": 0.4701875, "grad_norm": 3.234375, "grad_norm_var": 0.04488932291666667, "learning_rate": 0.0001, "loss": 5.629, "loss/crossentropy": 2.4428768157958984, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1697836071252823, "step": 15046 }, { "epoch": 0.47025, "grad_norm": 3.078125, "grad_norm_var": 0.028905232747395832, "learning_rate": 0.0001, "loss": 6.1088, "loss/crossentropy": 2.7833809852600098, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18136659264564514, "step": 15048 }, { "epoch": 0.4703125, "grad_norm": 3.375, "grad_norm_var": 0.0274322509765625, "learning_rate": 0.0001, "loss": 5.8243, "loss/crossentropy": 2.5886348485946655, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17473383247852325, "step": 15050 }, { "epoch": 0.470375, "grad_norm": 3.21875, "grad_norm_var": 0.0244537353515625, "learning_rate": 0.0001, "loss": 5.6175, "loss/crossentropy": 2.433435082435608, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1703553944826126, "step": 15052 }, { "epoch": 0.4704375, "grad_norm": 3.1875, "grad_norm_var": 0.023095703125, "learning_rate": 0.0001, "loss": 5.849, "loss/crossentropy": 2.6403539180755615, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1700805500149727, "step": 15054 }, { "epoch": 0.4705, "grad_norm": 3.0, "grad_norm_var": 0.026200358072916666, "learning_rate": 0.0001, "loss": 5.5189, "loss/crossentropy": 2.4523452520370483, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1605592966079712, "step": 15056 }, { "epoch": 0.4705625, "grad_norm": 3.171875, "grad_norm_var": 0.025830078125, "learning_rate": 0.0001, "loss": 5.6711, "loss/crossentropy": 2.48645281791687, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16689835488796234, "step": 15058 }, { "epoch": 0.470625, "grad_norm": 3.203125, "grad_norm_var": 0.0253814697265625, "learning_rate": 0.0001, "loss": 5.9489, "loss/crossentropy": 2.6477524042129517, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17816638201475143, "step": 15060 }, { "epoch": 0.4706875, "grad_norm": 3.390625, "grad_norm_var": 0.03052978515625, "learning_rate": 0.0001, "loss": 5.6189, "loss/crossentropy": 2.463808298110962, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16511918604373932, "step": 15062 }, { "epoch": 0.47075, "grad_norm": 3.140625, "grad_norm_var": 0.023582967122395833, "learning_rate": 0.0001, "loss": 5.7961, "loss/crossentropy": 2.570745825767517, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17331210523843765, "step": 15064 }, { "epoch": 0.4708125, "grad_norm": 3.171875, "grad_norm_var": 0.024833170572916667, "learning_rate": 0.0001, "loss": 6.0957, "loss/crossentropy": 2.7117077112197876, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18527565896511078, "step": 15066 }, { "epoch": 0.470875, "grad_norm": 3.5625, "grad_norm_var": 0.03326416015625, "learning_rate": 0.0001, "loss": 6.104, "loss/crossentropy": 2.7489144802093506, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18277819454669952, "step": 15068 }, { "epoch": 0.4709375, "grad_norm": 3.34375, "grad_norm_var": 0.0377349853515625, "learning_rate": 0.0001, "loss": 6.0843, "loss/crossentropy": 2.7256009578704834, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18079321086406708, "step": 15070 }, { "epoch": 0.471, "grad_norm": 3.3125, "grad_norm_var": 0.027567545572916668, "learning_rate": 0.0001, "loss": 5.9187, "loss/crossentropy": 2.6594446897506714, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1763150617480278, "step": 15072 }, { "epoch": 0.4710625, "grad_norm": 3.3125, "grad_norm_var": 0.028498331705729168, "learning_rate": 0.0001, "loss": 5.8383, "loss/crossentropy": 2.5661813020706177, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17838828265666962, "step": 15074 }, { "epoch": 0.471125, "grad_norm": 3.234375, "grad_norm_var": 0.02818603515625, "learning_rate": 0.0001, "loss": 6.0568, "loss/crossentropy": 2.731650710105896, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18056457489728928, "step": 15076 }, { "epoch": 0.4711875, "grad_norm": 3.4375, "grad_norm_var": 0.0237701416015625, "learning_rate": 0.0001, "loss": 5.9464, "loss/crossentropy": 2.658777952194214, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17642329633235931, "step": 15078 }, { "epoch": 0.47125, "grad_norm": 3.03125, "grad_norm_var": 0.0229156494140625, "learning_rate": 0.0001, "loss": 6.0087, "loss/crossentropy": 2.726006269454956, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17943864315748215, "step": 15080 }, { "epoch": 0.4713125, "grad_norm": 3.140625, "grad_norm_var": 0.02261962890625, "learning_rate": 0.0001, "loss": 5.8073, "loss/crossentropy": 2.5403780937194824, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17278371006250381, "step": 15082 }, { "epoch": 0.471375, "grad_norm": 3.390625, "grad_norm_var": 0.019017537434895832, "learning_rate": 0.0001, "loss": 5.9824, "loss/crossentropy": 2.713652014732361, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17648645490407944, "step": 15084 }, { "epoch": 0.4714375, "grad_norm": 3.328125, "grad_norm_var": 0.016624959309895833, "learning_rate": 0.0001, "loss": 5.7174, "loss/crossentropy": 2.4842076301574707, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17175430804491043, "step": 15086 }, { "epoch": 0.4715, "grad_norm": 3.8125, "grad_norm_var": 0.03528544108072917, "learning_rate": 0.0001, "loss": 5.7058, "loss/crossentropy": 2.510892391204834, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1655847132205963, "step": 15088 }, { "epoch": 0.4715625, "grad_norm": 3.125, "grad_norm_var": 0.0415435791015625, "learning_rate": 0.0001, "loss": 5.5088, "loss/crossentropy": 2.3916221857070923, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16327939927577972, "step": 15090 }, { "epoch": 0.471625, "grad_norm": 3.390625, "grad_norm_var": 0.046891276041666666, "learning_rate": 0.0001, "loss": 5.5516, "loss/crossentropy": 2.492325186729431, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15436577051877975, "step": 15092 }, { "epoch": 0.4716875, "grad_norm": 3.15625, "grad_norm_var": 0.04752197265625, "learning_rate": 0.0001, "loss": 5.7036, "loss/crossentropy": 2.5581146478652954, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16377197206020355, "step": 15094 }, { "epoch": 0.47175, "grad_norm": 3.390625, "grad_norm_var": 0.0471343994140625, "learning_rate": 0.0001, "loss": 5.6875, "loss/crossentropy": 2.494322180747986, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1704912930727005, "step": 15096 }, { "epoch": 0.4718125, "grad_norm": 3.015625, "grad_norm_var": 0.0501861572265625, "learning_rate": 0.0001, "loss": 5.633, "loss/crossentropy": 2.4340046644210815, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1679450124502182, "step": 15098 }, { "epoch": 0.471875, "grad_norm": 3.40625, "grad_norm_var": 0.048884073893229164, "learning_rate": 0.0001, "loss": 5.916, "loss/crossentropy": 2.6575233936309814, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17545659095048904, "step": 15100 }, { "epoch": 0.4719375, "grad_norm": 3.078125, "grad_norm_var": 0.049540201822916664, "learning_rate": 0.0001, "loss": 5.5774, "loss/crossentropy": 2.433680295944214, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16437102854251862, "step": 15102 }, { "epoch": 0.472, "grad_norm": 4.09375, "grad_norm_var": 0.07652994791666666, "learning_rate": 0.0001, "loss": 5.8867, "loss/crossentropy": 2.4997910261154175, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18361079692840576, "step": 15104 }, { "epoch": 0.4720625, "grad_norm": 3.15625, "grad_norm_var": 0.07822977701822917, "learning_rate": 0.0001, "loss": 5.6591, "loss/crossentropy": 2.4471428394317627, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16885577887296677, "step": 15106 }, { "epoch": 0.472125, "grad_norm": 3.0625, "grad_norm_var": 0.07629801432291666, "learning_rate": 0.0001, "loss": 5.5201, "loss/crossentropy": 2.4350301027297974, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15929025411605835, "step": 15108 }, { "epoch": 0.4721875, "grad_norm": 2.90625, "grad_norm_var": 0.0806793212890625, "learning_rate": 0.0001, "loss": 5.6057, "loss/crossentropy": 2.4947516918182373, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1634358912706375, "step": 15110 }, { "epoch": 0.47225, "grad_norm": 3.03125, "grad_norm_var": 0.08280843098958333, "learning_rate": 0.0001, "loss": 5.9371, "loss/crossentropy": 2.7836095094680786, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16691502183675766, "step": 15112 }, { "epoch": 0.4723125, "grad_norm": 3.25, "grad_norm_var": 0.07932942708333333, "learning_rate": 0.0001, "loss": 5.7497, "loss/crossentropy": 2.557365298271179, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17235562205314636, "step": 15114 }, { "epoch": 0.472375, "grad_norm": 3.078125, "grad_norm_var": 0.07916666666666666, "learning_rate": 0.0001, "loss": 5.6172, "loss/crossentropy": 2.493856430053711, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1642913818359375, "step": 15116 }, { "epoch": 0.4724375, "grad_norm": 3.046875, "grad_norm_var": 0.07940165201822917, "learning_rate": 0.0001, "loss": 5.4265, "loss/crossentropy": 2.3340771198272705, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16392983496189117, "step": 15118 }, { "epoch": 0.4725, "grad_norm": 3.0, "grad_norm_var": 0.024507649739583335, "learning_rate": 0.0001, "loss": 5.7192, "loss/crossentropy": 2.53964900970459, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16873925179243088, "step": 15120 }, { "epoch": 0.4725625, "grad_norm": 3.34375, "grad_norm_var": 0.013179524739583334, "learning_rate": 0.0001, "loss": 6.1749, "loss/crossentropy": 2.7930883169174194, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18466253578662872, "step": 15122 }, { "epoch": 0.472625, "grad_norm": 3.0625, "grad_norm_var": 0.013525390625, "learning_rate": 0.0001, "loss": 6.1091, "loss/crossentropy": 2.835660219192505, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17851269245147705, "step": 15124 }, { "epoch": 0.4726875, "grad_norm": 3.328125, "grad_norm_var": 0.019384765625, "learning_rate": 0.0001, "loss": 5.7592, "loss/crossentropy": 2.4408318996429443, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17480113357305527, "step": 15126 }, { "epoch": 0.47275, "grad_norm": 3.140625, "grad_norm_var": 0.017964680989583332, "learning_rate": 0.0001, "loss": 5.4661, "loss/crossentropy": 2.359795093536377, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16101963818073273, "step": 15128 }, { "epoch": 0.4728125, "grad_norm": 3.15625, "grad_norm_var": 0.0249908447265625, "learning_rate": 0.0001, "loss": 5.9525, "loss/crossentropy": 2.5907492637634277, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18226782232522964, "step": 15130 }, { "epoch": 0.472875, "grad_norm": 3.5, "grad_norm_var": 0.0279296875, "learning_rate": 0.0001, "loss": 5.68, "loss/crossentropy": 2.4544483423233032, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16825765371322632, "step": 15132 }, { "epoch": 0.4729375, "grad_norm": 3.296875, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 5.8343, "loss/crossentropy": 2.6822274923324585, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1663799062371254, "step": 15134 }, { "epoch": 0.473, "grad_norm": 3.0625, "grad_norm_var": 0.025324503580729168, "learning_rate": 0.0001, "loss": 5.4519, "loss/crossentropy": 2.3754748106002808, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16116300970315933, "step": 15136 }, { "epoch": 0.4730625, "grad_norm": 3.09375, "grad_norm_var": 0.026871744791666666, "learning_rate": 0.0001, "loss": 5.6349, "loss/crossentropy": 2.5355676412582397, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.161496102809906, "step": 15138 }, { "epoch": 0.473125, "grad_norm": 3.359375, "grad_norm_var": 0.027057902018229166, "learning_rate": 0.0001, "loss": 5.8073, "loss/crossentropy": 2.50444233417511, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17598942667245865, "step": 15140 }, { "epoch": 0.4731875, "grad_norm": 3.0625, "grad_norm_var": 0.024544270833333333, "learning_rate": 0.0001, "loss": 5.787, "loss/crossentropy": 2.6119871139526367, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1671072095632553, "step": 15142 }, { "epoch": 0.47325, "grad_norm": 3.390625, "grad_norm_var": 0.025877888997395834, "learning_rate": 0.0001, "loss": 5.7882, "loss/crossentropy": 2.571872115135193, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1751495897769928, "step": 15144 }, { "epoch": 0.4733125, "grad_norm": 3.09375, "grad_norm_var": 0.022907511393229166, "learning_rate": 0.0001, "loss": 5.8456, "loss/crossentropy": 2.6580671072006226, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1707022786140442, "step": 15146 }, { "epoch": 0.473375, "grad_norm": 3.28125, "grad_norm_var": 0.02828369140625, "learning_rate": 0.0001, "loss": 5.836, "loss/crossentropy": 2.5903728008270264, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17651164531707764, "step": 15148 }, { "epoch": 0.4734375, "grad_norm": 2.984375, "grad_norm_var": 0.03052978515625, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.5991785526275635, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17182902991771698, "step": 15150 }, { "epoch": 0.4735, "grad_norm": 3.140625, "grad_norm_var": 0.0308502197265625, "learning_rate": 0.0001, "loss": 5.8214, "loss/crossentropy": 2.528180241584778, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17620117962360382, "step": 15152 }, { "epoch": 0.4735625, "grad_norm": 3.5625, "grad_norm_var": 0.03492431640625, "learning_rate": 0.0001, "loss": 5.9033, "loss/crossentropy": 2.655681610107422, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17437532544136047, "step": 15154 }, { "epoch": 0.473625, "grad_norm": 3.296875, "grad_norm_var": 0.03551025390625, "learning_rate": 0.0001, "loss": 5.62, "loss/crossentropy": 2.4807881116867065, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16977840662002563, "step": 15156 }, { "epoch": 0.4736875, "grad_norm": 2.828125, "grad_norm_var": 0.044417317708333334, "learning_rate": 0.0001, "loss": 6.0188, "loss/crossentropy": 2.7204389572143555, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17827719449996948, "step": 15158 }, { "epoch": 0.47375, "grad_norm": 3.421875, "grad_norm_var": 0.0440826416015625, "learning_rate": 0.0001, "loss": 6.0667, "loss/crossentropy": 2.8044852018356323, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1738826036453247, "step": 15160 }, { "epoch": 0.4738125, "grad_norm": 2.90625, "grad_norm_var": 0.04897359212239583, "learning_rate": 0.0001, "loss": 5.7891, "loss/crossentropy": 2.6122967004776, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1735389232635498, "step": 15162 }, { "epoch": 0.473875, "grad_norm": 3.28125, "grad_norm_var": 0.036454264322916666, "learning_rate": 0.0001, "loss": 5.895, "loss/crossentropy": 2.642680287361145, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17523349821567535, "step": 15164 }, { "epoch": 0.4739375, "grad_norm": 3.1875, "grad_norm_var": 0.0365142822265625, "learning_rate": 0.0001, "loss": 5.6857, "loss/crossentropy": 2.474525213241577, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17150677740573883, "step": 15166 }, { "epoch": 0.474, "grad_norm": 3.0625, "grad_norm_var": 0.0347320556640625, "learning_rate": 0.0001, "loss": 5.3874, "loss/crossentropy": 2.324982166290283, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15663722157478333, "step": 15168 }, { "epoch": 0.4740625, "grad_norm": 3.15625, "grad_norm_var": 0.0246246337890625, "learning_rate": 0.0001, "loss": 6.0049, "loss/crossentropy": 2.7491393089294434, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1755783036351204, "step": 15170 }, { "epoch": 0.474125, "grad_norm": 3.0, "grad_norm_var": 0.0246978759765625, "learning_rate": 0.0001, "loss": 5.8553, "loss/crossentropy": 2.628808856010437, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17538384348154068, "step": 15172 }, { "epoch": 0.4741875, "grad_norm": 3.25, "grad_norm_var": 0.017577107747395834, "learning_rate": 0.0001, "loss": 5.636, "loss/crossentropy": 2.4430015087127686, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17203166335821152, "step": 15174 }, { "epoch": 0.47425, "grad_norm": 4.6875, "grad_norm_var": 0.16422119140625, "learning_rate": 0.0001, "loss": 5.3546, "loss/crossentropy": 2.3191999197006226, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1535409614443779, "step": 15176 }, { "epoch": 0.4743125, "grad_norm": 3.3125, "grad_norm_var": 0.15623372395833332, "learning_rate": 0.0001, "loss": 5.6493, "loss/crossentropy": 2.5304505825042725, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16266535967588425, "step": 15178 }, { "epoch": 0.474375, "grad_norm": 3.4375, "grad_norm_var": 0.16245829264322917, "learning_rate": 0.0001, "loss": 5.6971, "loss/crossentropy": 2.4235081672668457, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17658285051584244, "step": 15180 }, { "epoch": 0.4744375, "grad_norm": 3.3125, "grad_norm_var": 0.171240234375, "learning_rate": 0.0001, "loss": 5.6314, "loss/crossentropy": 2.447352647781372, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16645319014787674, "step": 15182 }, { "epoch": 0.4745, "grad_norm": 3.359375, "grad_norm_var": 0.16711832682291666, "learning_rate": 0.0001, "loss": 5.7577, "loss/crossentropy": 2.500947952270508, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1752820387482643, "step": 15184 }, { "epoch": 0.4745625, "grad_norm": 8.5, "grad_norm_var": 1.8448527018229166, "learning_rate": 0.0001, "loss": 6.1857, "loss/crossentropy": 2.762804865837097, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18838287889957428, "step": 15186 }, { "epoch": 0.474625, "grad_norm": 4.3125, "grad_norm_var": 1.8286692301432292, "learning_rate": 0.0001, "loss": 5.9227, "loss/crossentropy": 2.6097878217697144, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17973001301288605, "step": 15188 }, { "epoch": 0.4746875, "grad_norm": 3.765625, "grad_norm_var": 1.8067667643229166, "learning_rate": 0.0001, "loss": 5.8835, "loss/crossentropy": 2.567626476287842, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1788562387228012, "step": 15190 }, { "epoch": 0.47475, "grad_norm": 3.375, "grad_norm_var": 1.7167144775390626, "learning_rate": 0.0001, "loss": 6.1341, "loss/crossentropy": 2.7327338457107544, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18740259855985641, "step": 15192 }, { "epoch": 0.4748125, "grad_norm": 3.375, "grad_norm_var": 1.7120402018229166, "learning_rate": 0.0001, "loss": 5.6488, "loss/crossentropy": 2.49296772480011, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16597580909729004, "step": 15194 }, { "epoch": 0.474875, "grad_norm": 3.3125, "grad_norm_var": 1.7203450520833334, "learning_rate": 0.0001, "loss": 5.5357, "loss/crossentropy": 2.325214982032776, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17026569694280624, "step": 15196 }, { "epoch": 0.4749375, "grad_norm": 3.21875, "grad_norm_var": 1.6799763997395833, "learning_rate": 0.0001, "loss": 5.7957, "loss/crossentropy": 2.4964864253997803, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17796631157398224, "step": 15198 }, { "epoch": 0.475, "grad_norm": 3.390625, "grad_norm_var": 1.69111328125, "learning_rate": 0.0001, "loss": 5.965, "loss/crossentropy": 2.682921886444092, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1789863184094429, "step": 15200 }, { "epoch": 0.4750625, "grad_norm": 3.015625, "grad_norm_var": 0.123583984375, "learning_rate": 0.0001, "loss": 5.8367, "loss/crossentropy": 2.5980740785598755, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1758170649409294, "step": 15202 }, { "epoch": 0.475125, "grad_norm": 2.8125, "grad_norm_var": 0.10071207682291666, "learning_rate": 0.0001, "loss": 5.5116, "loss/crossentropy": 2.4391099214553833, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15724600106477737, "step": 15204 }, { "epoch": 0.4751875, "grad_norm": 2.96875, "grad_norm_var": 0.8183664957682292, "learning_rate": 0.0001, "loss": 5.5985, "loss/crossentropy": 2.442660093307495, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16323567926883698, "step": 15206 }, { "epoch": 0.47525, "grad_norm": 3.140625, "grad_norm_var": 0.8284830729166667, "learning_rate": 0.0001, "loss": 5.78, "loss/crossentropy": 2.541997790336609, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1749734953045845, "step": 15208 }, { "epoch": 0.4753125, "grad_norm": 3.15625, "grad_norm_var": 0.84752197265625, "learning_rate": 0.0001, "loss": 5.2352, "loss/crossentropy": 2.1886537075042725, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.14839966222643852, "step": 15210 }, { "epoch": 0.475375, "grad_norm": 3.53125, "grad_norm_var": 0.8497792561848958, "learning_rate": 0.0001, "loss": 6.0524, "loss/crossentropy": 2.6989437341690063, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18378519266843796, "step": 15212 }, { "epoch": 0.4754375, "grad_norm": 3.046875, "grad_norm_var": 0.8433502197265625, "learning_rate": 0.0001, "loss": 5.9131, "loss/crossentropy": 2.6584397554397583, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1746854931116104, "step": 15214 }, { "epoch": 0.4755, "grad_norm": 3.0625, "grad_norm_var": 0.848291015625, "learning_rate": 0.0001, "loss": 5.7146, "loss/crossentropy": 2.4808326959609985, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17181871086359024, "step": 15216 }, { "epoch": 0.4755625, "grad_norm": 3.296875, "grad_norm_var": 0.8222076416015625, "learning_rate": 0.0001, "loss": 5.8577, "loss/crossentropy": 2.61141300201416, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1734614074230194, "step": 15218 }, { "epoch": 0.475625, "grad_norm": 3.3125, "grad_norm_var": 0.7963175455729167, "learning_rate": 0.0001, "loss": 5.8515, "loss/crossentropy": 2.571282982826233, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17333849519491196, "step": 15220 }, { "epoch": 0.4756875, "grad_norm": 3.15625, "grad_norm_var": 0.029227701822916667, "learning_rate": 0.0001, "loss": 5.3423, "loss/crossentropy": 2.299667716026306, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1527012139558792, "step": 15222 }, { "epoch": 0.47575, "grad_norm": 3.59375, "grad_norm_var": 0.03821614583333333, "learning_rate": 0.0001, "loss": 5.9217, "loss/crossentropy": 2.603001117706299, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17952166497707367, "step": 15224 }, { "epoch": 0.4758125, "grad_norm": 3.25, "grad_norm_var": 0.029325358072916665, "learning_rate": 0.0001, "loss": 5.6838, "loss/crossentropy": 2.5082921981811523, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1687181293964386, "step": 15226 }, { "epoch": 0.475875, "grad_norm": 2.8125, "grad_norm_var": 0.03948160807291667, "learning_rate": 0.0001, "loss": 5.6393, "loss/crossentropy": 2.6041629314422607, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15547103434801102, "step": 15228 }, { "epoch": 0.4759375, "grad_norm": 3.140625, "grad_norm_var": 0.03443094889322917, "learning_rate": 0.0001, "loss": 6.0314, "loss/crossentropy": 2.716339945793152, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17759885638952255, "step": 15230 }, { "epoch": 0.476, "grad_norm": 3.25, "grad_norm_var": 0.03528238932291667, "learning_rate": 0.0001, "loss": 5.9443, "loss/crossentropy": 2.6709004640579224, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17617063224315643, "step": 15232 }, { "epoch": 0.4760625, "grad_norm": 3.21875, "grad_norm_var": 0.03494466145833333, "learning_rate": 0.0001, "loss": 5.6875, "loss/crossentropy": 2.4924468994140625, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16794048994779587, "step": 15234 }, { "epoch": 0.476125, "grad_norm": 3.15625, "grad_norm_var": 0.03433837890625, "learning_rate": 0.0001, "loss": 5.7679, "loss/crossentropy": 2.523940920829773, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17243968695402145, "step": 15236 }, { "epoch": 0.4761875, "grad_norm": 3.171875, "grad_norm_var": 0.0352935791015625, "learning_rate": 0.0001, "loss": 5.9944, "loss/crossentropy": 2.714240550994873, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1784021183848381, "step": 15238 }, { "epoch": 0.47625, "grad_norm": 3.25, "grad_norm_var": 0.030663045247395833, "learning_rate": 0.0001, "loss": 5.6462, "loss/crossentropy": 2.5538183450698853, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16510005295276642, "step": 15240 }, { "epoch": 0.4763125, "grad_norm": 3.609375, "grad_norm_var": 0.05555013020833333, "learning_rate": 0.0001, "loss": 6.2182, "loss/crossentropy": 2.7652846574783325, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1913825199007988, "step": 15242 }, { "epoch": 0.476375, "grad_norm": 2.890625, "grad_norm_var": 0.05009663899739583, "learning_rate": 0.0001, "loss": 5.8987, "loss/crossentropy": 2.6722806692123413, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17147146910429, "step": 15244 }, { "epoch": 0.4764375, "grad_norm": 3.09375, "grad_norm_var": 0.05126546223958333, "learning_rate": 0.0001, "loss": 5.6072, "loss/crossentropy": 2.4855659008026123, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16411474347114563, "step": 15246 }, { "epoch": 0.4765, "grad_norm": 3.3125, "grad_norm_var": 0.04439697265625, "learning_rate": 0.0001, "loss": 5.7495, "loss/crossentropy": 2.5743420124053955, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.167519249022007, "step": 15248 }, { "epoch": 0.4765625, "grad_norm": 3.25, "grad_norm_var": 0.04524637858072917, "learning_rate": 0.0001, "loss": 5.7153, "loss/crossentropy": 2.43125057220459, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17137471586465836, "step": 15250 }, { "epoch": 0.476625, "grad_norm": 3.421875, "grad_norm_var": 0.04830729166666667, "learning_rate": 0.0001, "loss": 6.0371, "loss/crossentropy": 2.599075436592102, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1852112114429474, "step": 15252 }, { "epoch": 0.4766875, "grad_norm": 3.21875, "grad_norm_var": 0.04539286295572917, "learning_rate": 0.0001, "loss": 5.8265, "loss/crossentropy": 2.5461827516555786, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1725672110915184, "step": 15254 }, { "epoch": 0.47675, "grad_norm": 3.265625, "grad_norm_var": 0.04045817057291667, "learning_rate": 0.0001, "loss": 5.7316, "loss/crossentropy": 2.562483310699463, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16886383295059204, "step": 15256 }, { "epoch": 0.4768125, "grad_norm": 3.25, "grad_norm_var": 0.02314453125, "learning_rate": 0.0001, "loss": 6.0071, "loss/crossentropy": 2.7447198629379272, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17545223236083984, "step": 15258 }, { "epoch": 0.476875, "grad_norm": 3.546875, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 5.7061, "loss/crossentropy": 2.5041333436965942, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16941587626934052, "step": 15260 }, { "epoch": 0.4769375, "grad_norm": 3.28125, "grad_norm_var": 0.022516886393229168, "learning_rate": 0.0001, "loss": 5.55, "loss/crossentropy": 2.3615355491638184, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16806236654520035, "step": 15262 }, { "epoch": 0.477, "grad_norm": 3.015625, "grad_norm_var": 0.02578125, "learning_rate": 0.0001, "loss": 5.5874, "loss/crossentropy": 2.4267687797546387, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17036070674657822, "step": 15264 }, { "epoch": 0.4770625, "grad_norm": 3.28125, "grad_norm_var": 0.029035441080729165, "learning_rate": 0.0001, "loss": 5.7918, "loss/crossentropy": 2.6040912866592407, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1707252711057663, "step": 15266 }, { "epoch": 0.477125, "grad_norm": 3.375, "grad_norm_var": 0.025983683268229165, "learning_rate": 0.0001, "loss": 5.7662, "loss/crossentropy": 2.5044026374816895, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1707111820578575, "step": 15268 }, { "epoch": 0.4771875, "grad_norm": 3.0625, "grad_norm_var": 0.0467193603515625, "learning_rate": 0.0001, "loss": 5.9211, "loss/crossentropy": 2.6799111366271973, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17450731992721558, "step": 15270 }, { "epoch": 0.47725, "grad_norm": 2.828125, "grad_norm_var": 0.05400390625, "learning_rate": 0.0001, "loss": 5.502, "loss/crossentropy": 2.43990296125412, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1562146171927452, "step": 15272 }, { "epoch": 0.4773125, "grad_norm": 3.34375, "grad_norm_var": 0.0623443603515625, "learning_rate": 0.0001, "loss": 6.0116, "loss/crossentropy": 2.6366543769836426, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.1812412366271019, "step": 15274 }, { "epoch": 0.477375, "grad_norm": 3.1875, "grad_norm_var": 0.056029256184895834, "learning_rate": 0.0001, "loss": 5.9316, "loss/crossentropy": 2.6097456216812134, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18179281800985336, "step": 15276 }, { "epoch": 0.4774375, "grad_norm": 3.09375, "grad_norm_var": 0.058394368489583334, "learning_rate": 0.0001, "loss": 5.4876, "loss/crossentropy": 2.3936734199523926, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16485852003097534, "step": 15278 }, { "epoch": 0.4775, "grad_norm": 2.9375, "grad_norm_var": 0.0631011962890625, "learning_rate": 0.0001, "loss": 5.9462, "loss/crossentropy": 2.745453357696533, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17007002234458923, "step": 15280 }, { "epoch": 0.4775625, "grad_norm": 5.1875, "grad_norm_var": 0.29146728515625, "learning_rate": 0.0001, "loss": 5.9661, "loss/crossentropy": 2.5787419080734253, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1875605657696724, "step": 15282 }, { "epoch": 0.477625, "grad_norm": 3.53125, "grad_norm_var": 0.3244374593098958, "learning_rate": 0.0001, "loss": 5.8584, "loss/crossentropy": 2.4698829650878906, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18143144994974136, "step": 15284 }, { "epoch": 0.4776875, "grad_norm": 2.78125, "grad_norm_var": 0.34965718587239586, "learning_rate": 0.0001, "loss": 5.3, "loss/crossentropy": 2.2809072732925415, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15385840088129044, "step": 15286 }, { "epoch": 0.47775, "grad_norm": 3.0, "grad_norm_var": 0.34578348795572916, "learning_rate": 0.0001, "loss": 5.6377, "loss/crossentropy": 2.440936803817749, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16928477585315704, "step": 15288 }, { "epoch": 0.4778125, "grad_norm": 3.078125, "grad_norm_var": 0.3564117431640625, "learning_rate": 0.0001, "loss": 5.6101, "loss/crossentropy": 2.4943089485168457, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1619659587740898, "step": 15290 }, { "epoch": 0.477875, "grad_norm": 3.203125, "grad_norm_var": 0.35675455729166666, "learning_rate": 0.0001, "loss": 5.8208, "loss/crossentropy": 2.6396723985671997, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17162998020648956, "step": 15292 }, { "epoch": 0.4779375, "grad_norm": 3.1875, "grad_norm_var": 0.3532786051432292, "learning_rate": 0.0001, "loss": 5.7517, "loss/crossentropy": 2.5214877128601074, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1730218604207039, "step": 15294 }, { "epoch": 0.478, "grad_norm": 3.234375, "grad_norm_var": 0.46142578125, "learning_rate": 0.0001, "loss": 6.0902, "loss/crossentropy": 2.7099214792251587, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18412108719348907, "step": 15296 }, { "epoch": 0.4780625, "grad_norm": 3.265625, "grad_norm_var": 0.2379547119140625, "learning_rate": 0.0001, "loss": 5.9388, "loss/crossentropy": 2.6545175313949585, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17725564539432526, "step": 15298 }, { "epoch": 0.478125, "grad_norm": 3.546875, "grad_norm_var": 0.19612528483072916, "learning_rate": 0.0001, "loss": 5.7419, "loss/crossentropy": 2.5023709535598755, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1708322986960411, "step": 15300 }, { "epoch": 0.4781875, "grad_norm": 3.375, "grad_norm_var": 0.17860921223958334, "learning_rate": 0.0001, "loss": 5.8065, "loss/crossentropy": 2.561031699180603, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1753268912434578, "step": 15302 }, { "epoch": 0.47825, "grad_norm": 3.421875, "grad_norm_var": 0.1693023681640625, "learning_rate": 0.0001, "loss": 6.1622, "loss/crossentropy": 2.7797582149505615, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18433526903390884, "step": 15304 }, { "epoch": 0.4783125, "grad_norm": 3.03125, "grad_norm_var": 0.1610260009765625, "learning_rate": 0.0001, "loss": 5.8765, "loss/crossentropy": 2.6523128747940063, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1712425947189331, "step": 15306 }, { "epoch": 0.478375, "grad_norm": 3.125, "grad_norm_var": 0.15771077473958334, "learning_rate": 0.0001, "loss": 6.0212, "loss/crossentropy": 2.723678708076477, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18053758889436722, "step": 15308 }, { "epoch": 0.4784375, "grad_norm": 3.0625, "grad_norm_var": 0.16161702473958334, "learning_rate": 0.0001, "loss": 5.5002, "loss/crossentropy": 2.4254177808761597, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15826012194156647, "step": 15310 }, { "epoch": 0.4785, "grad_norm": 2.984375, "grad_norm_var": 0.05628153483072917, "learning_rate": 0.0001, "loss": 5.9301, "loss/crossentropy": 2.665627956390381, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1776166409254074, "step": 15312 }, { "epoch": 0.4785625, "grad_norm": 3.25, "grad_norm_var": 0.06303609212239583, "learning_rate": 0.0001, "loss": 6.1482, "loss/crossentropy": 2.777933359146118, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1823379099369049, "step": 15314 }, { "epoch": 0.478625, "grad_norm": 3.15625, "grad_norm_var": 0.05572916666666667, "learning_rate": 0.0001, "loss": 5.8695, "loss/crossentropy": 2.57357656955719, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17842179536819458, "step": 15316 }, { "epoch": 0.4786875, "grad_norm": 2.875, "grad_norm_var": 0.0773101806640625, "learning_rate": 0.0001, "loss": 5.8675, "loss/crossentropy": 2.563399314880371, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.17377082258462906, "step": 15318 }, { "epoch": 0.47875, "grad_norm": 4.125, "grad_norm_var": 0.1182037353515625, "learning_rate": 0.0001, "loss": 6.0154, "loss/crossentropy": 2.6596556901931763, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1789350062608719, "step": 15320 }, { "epoch": 0.4788125, "grad_norm": 3.0, "grad_norm_var": 0.1208404541015625, "learning_rate": 0.0001, "loss": 5.5908, "loss/crossentropy": 2.445460319519043, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1680455207824707, "step": 15322 }, { "epoch": 0.478875, "grad_norm": 3.171875, "grad_norm_var": 0.12068583170572916, "learning_rate": 0.0001, "loss": 5.8791, "loss/crossentropy": 2.6736626625061035, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17171691358089447, "step": 15324 }, { "epoch": 0.4789375, "grad_norm": 3.53125, "grad_norm_var": 0.12881571451822918, "learning_rate": 0.0001, "loss": 5.6332, "loss/crossentropy": 2.4836422204971313, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16534702479839325, "step": 15326 }, { "epoch": 0.479, "grad_norm": 2.953125, "grad_norm_var": 0.1141754150390625, "learning_rate": 0.0001, "loss": 5.9135, "loss/crossentropy": 2.58274507522583, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.17409475147724152, "step": 15328 }, { "epoch": 0.4790625, "grad_norm": 3.03125, "grad_norm_var": 0.11633199055989583, "learning_rate": 0.0001, "loss": 5.881, "loss/crossentropy": 2.6876858472824097, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17049873620271683, "step": 15330 }, { "epoch": 0.479125, "grad_norm": 3.390625, "grad_norm_var": 0.1166015625, "learning_rate": 0.0001, "loss": 5.6899, "loss/crossentropy": 2.501584529876709, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17000600695610046, "step": 15332 }, { "epoch": 0.4791875, "grad_norm": 3.234375, "grad_norm_var": 0.09429931640625, "learning_rate": 0.0001, "loss": 5.7534, "loss/crossentropy": 2.568631649017334, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17120973765850067, "step": 15334 }, { "epoch": 0.47925, "grad_norm": 3.1875, "grad_norm_var": 0.035107421875, "learning_rate": 0.0001, "loss": 5.8323, "loss/crossentropy": 2.5619494915008545, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17625130712985992, "step": 15336 }, { "epoch": 0.4793125, "grad_norm": 3.453125, "grad_norm_var": 0.03955078125, "learning_rate": 0.0001, "loss": 5.796, "loss/crossentropy": 2.5548356771469116, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17489810287952423, "step": 15338 }, { "epoch": 0.479375, "grad_norm": 3.09375, "grad_norm_var": 0.04029541015625, "learning_rate": 0.0001, "loss": 5.6436, "loss/crossentropy": 2.48819100856781, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16671043634414673, "step": 15340 }, { "epoch": 0.4794375, "grad_norm": 3.15625, "grad_norm_var": 0.029059855143229167, "learning_rate": 0.0001, "loss": 5.5445, "loss/crossentropy": 2.449354887008667, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16107948124408722, "step": 15342 }, { "epoch": 0.4795, "grad_norm": 2.921875, "grad_norm_var": 0.02486572265625, "learning_rate": 0.0001, "loss": 6.0028, "loss/crossentropy": 2.7391287088394165, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17715124040842056, "step": 15344 }, { "epoch": 0.4795625, "grad_norm": 3.09375, "grad_norm_var": 0.029313151041666666, "learning_rate": 0.0001, "loss": 5.7572, "loss/crossentropy": 2.537302613258362, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17238043248653412, "step": 15346 }, { "epoch": 0.479625, "grad_norm": 2.90625, "grad_norm_var": 0.027079264322916668, "learning_rate": 0.0001, "loss": 5.3147, "loss/crossentropy": 2.316564679145813, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15332921594381332, "step": 15348 }, { "epoch": 0.4796875, "grad_norm": 2.984375, "grad_norm_var": 0.028544108072916668, "learning_rate": 0.0001, "loss": 5.5347, "loss/crossentropy": 2.40766978263855, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1607503741979599, "step": 15350 }, { "epoch": 0.47975, "grad_norm": 3.484375, "grad_norm_var": 0.036295572916666664, "learning_rate": 0.0001, "loss": 5.9035, "loss/crossentropy": 2.718092679977417, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17010437697172165, "step": 15352 }, { "epoch": 0.4798125, "grad_norm": 3.046875, "grad_norm_var": 0.02984619140625, "learning_rate": 0.0001, "loss": 5.7388, "loss/crossentropy": 2.571472644805908, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17024526000022888, "step": 15354 }, { "epoch": 0.479875, "grad_norm": 3.265625, "grad_norm_var": 0.03858133951822917, "learning_rate": 0.0001, "loss": 5.9969, "loss/crossentropy": 2.6315516233444214, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1783294677734375, "step": 15356 }, { "epoch": 0.4799375, "grad_norm": 3.0, "grad_norm_var": 0.04397379557291667, "learning_rate": 0.0001, "loss": 5.5347, "loss/crossentropy": 2.4775781631469727, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1537545621395111, "step": 15358 }, { "epoch": 0.48, "grad_norm": 3.203125, "grad_norm_var": 0.040934244791666664, "learning_rate": 0.0001, "loss": 5.9967, "loss/crossentropy": 2.784665107727051, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1719825714826584, "step": 15360 }, { "epoch": 0.4800625, "grad_norm": 3.53125, "grad_norm_var": 0.04403889973958333, "learning_rate": 0.0001, "loss": 6.1269, "loss/crossentropy": 2.7253435850143433, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18508252501487732, "step": 15362 }, { "epoch": 0.480125, "grad_norm": 3.21875, "grad_norm_var": 0.04531148274739583, "learning_rate": 0.0001, "loss": 5.9595, "loss/crossentropy": 2.6731619834899902, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17472746223211288, "step": 15364 }, { "epoch": 0.4801875, "grad_norm": 2.78125, "grad_norm_var": 0.0546539306640625, "learning_rate": 0.0001, "loss": 5.3532, "loss/crossentropy": 2.348812699317932, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15473972260951996, "step": 15366 }, { "epoch": 0.48025, "grad_norm": 3.078125, "grad_norm_var": 0.054911295572916664, "learning_rate": 0.0001, "loss": 5.9021, "loss/crossentropy": 2.6510828733444214, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17431634664535522, "step": 15368 }, { "epoch": 0.4803125, "grad_norm": 3.296875, "grad_norm_var": 0.05706380208333333, "learning_rate": 0.0001, "loss": 5.7006, "loss/crossentropy": 2.522355556488037, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1693916618824005, "step": 15370 }, { "epoch": 0.480375, "grad_norm": 3.21875, "grad_norm_var": 0.0513580322265625, "learning_rate": 0.0001, "loss": 5.6771, "loss/crossentropy": 2.5551689863204956, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16414868086576462, "step": 15372 }, { "epoch": 0.4804375, "grad_norm": 3.5, "grad_norm_var": 0.046284993489583336, "learning_rate": 0.0001, "loss": 5.7002, "loss/crossentropy": 2.41007137298584, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1782316491007805, "step": 15374 }, { "epoch": 0.4805, "grad_norm": 3.578125, "grad_norm_var": 0.0570220947265625, "learning_rate": 0.0001, "loss": 5.9039, "loss/crossentropy": 2.639471411705017, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17839555442333221, "step": 15376 }, { "epoch": 0.4805625, "grad_norm": 3.5, "grad_norm_var": 0.060578409830729166, "learning_rate": 0.0001, "loss": 5.7466, "loss/crossentropy": 2.51907479763031, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17313816398382187, "step": 15378 }, { "epoch": 0.480625, "grad_norm": 3.171875, "grad_norm_var": 0.05147196451822917, "learning_rate": 0.0001, "loss": 5.8164, "loss/crossentropy": 2.5738741159439087, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17464550584554672, "step": 15380 }, { "epoch": 0.4806875, "grad_norm": 3.015625, "grad_norm_var": 0.042985026041666666, "learning_rate": 0.0001, "loss": 5.4853, "loss/crossentropy": 2.392674684524536, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16121415793895721, "step": 15382 }, { "epoch": 0.48075, "grad_norm": 3.03125, "grad_norm_var": 0.04097900390625, "learning_rate": 0.0001, "loss": 5.9724, "loss/crossentropy": 2.710559129714966, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17696435749530792, "step": 15384 }, { "epoch": 0.4808125, "grad_norm": 3.25, "grad_norm_var": 0.07803446451822917, "learning_rate": 0.0001, "loss": 5.907, "loss/crossentropy": 2.601228952407837, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17706552147865295, "step": 15386 }, { "epoch": 0.480875, "grad_norm": 3.15625, "grad_norm_var": 0.077294921875, "learning_rate": 0.0001, "loss": 5.4973, "loss/crossentropy": 2.385049819946289, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1588803306221962, "step": 15388 }, { "epoch": 0.4809375, "grad_norm": 3.296875, "grad_norm_var": 0.07291259765625, "learning_rate": 0.0001, "loss": 5.738, "loss/crossentropy": 2.5566126108169556, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1677480787038803, "step": 15390 }, { "epoch": 0.481, "grad_norm": 3.328125, "grad_norm_var": 0.06599019368489584, "learning_rate": 0.0001, "loss": 5.8847, "loss/crossentropy": 2.61326801776886, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17285117506980896, "step": 15392 }, { "epoch": 0.4810625, "grad_norm": 3.15625, "grad_norm_var": 0.05821024576822917, "learning_rate": 0.0001, "loss": 5.9517, "loss/crossentropy": 2.667181134223938, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1776692196726799, "step": 15394 }, { "epoch": 0.481125, "grad_norm": 3.0, "grad_norm_var": 0.07560221354166667, "learning_rate": 0.0001, "loss": 5.7601, "loss/crossentropy": 2.661770462989807, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16452480852603912, "step": 15396 }, { "epoch": 0.4811875, "grad_norm": 3.25, "grad_norm_var": 0.07385660807291666, "learning_rate": 0.0001, "loss": 5.8791, "loss/crossentropy": 2.5926543474197388, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1778632253408432, "step": 15398 }, { "epoch": 0.48125, "grad_norm": 3.109375, "grad_norm_var": 0.075537109375, "learning_rate": 0.0001, "loss": 5.461, "loss/crossentropy": 2.3669419288635254, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1633107140660286, "step": 15400 }, { "epoch": 0.4813125, "grad_norm": 3.546875, "grad_norm_var": 0.04103902180989583, "learning_rate": 0.0001, "loss": 6.2042, "loss/crossentropy": 2.851514220237732, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18058335036039352, "step": 15402 }, { "epoch": 0.481375, "grad_norm": 3.15625, "grad_norm_var": 0.04088134765625, "learning_rate": 0.0001, "loss": 5.8117, "loss/crossentropy": 2.5952255725860596, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1700821816921234, "step": 15404 }, { "epoch": 0.4814375, "grad_norm": 3.25, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 5.6987, "loss/crossentropy": 2.4063332080841064, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1741556078195572, "step": 15406 }, { "epoch": 0.4815, "grad_norm": 3.171875, "grad_norm_var": 0.0398101806640625, "learning_rate": 0.0001, "loss": 5.7181, "loss/crossentropy": 2.535059690475464, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.168694369494915, "step": 15408 }, { "epoch": 0.4815625, "grad_norm": 3.4375, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 5.4814, "loss/crossentropy": 2.3332561254501343, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16598624736070633, "step": 15410 }, { "epoch": 0.481625, "grad_norm": 3.125, "grad_norm_var": 0.028352864583333335, "learning_rate": 0.0001, "loss": 5.8816, "loss/crossentropy": 2.6249091625213623, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17410919815301895, "step": 15412 }, { "epoch": 0.4816875, "grad_norm": 3.4375, "grad_norm_var": 0.035380045572916664, "learning_rate": 0.0001, "loss": 5.6343, "loss/crossentropy": 2.45553982257843, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16709962487220764, "step": 15414 }, { "epoch": 0.48175, "grad_norm": 3.125, "grad_norm_var": 0.03248291015625, "learning_rate": 0.0001, "loss": 6.0957, "loss/crossentropy": 2.7755789756774902, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18201275169849396, "step": 15416 }, { "epoch": 0.4818125, "grad_norm": 3.375, "grad_norm_var": 0.0263336181640625, "learning_rate": 0.0001, "loss": 6.1827, "loss/crossentropy": 2.8892905712127686, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17700207233428955, "step": 15418 }, { "epoch": 0.481875, "grad_norm": 3.421875, "grad_norm_var": 0.028473917643229166, "learning_rate": 0.0001, "loss": 6.0133, "loss/crossentropy": 2.846737265586853, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16470354795455933, "step": 15420 }, { "epoch": 0.4819375, "grad_norm": 3.40625, "grad_norm_var": 0.025837198893229166, "learning_rate": 0.0001, "loss": 5.8385, "loss/crossentropy": 2.6165854930877686, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17297424376010895, "step": 15422 }, { "epoch": 0.482, "grad_norm": 2.953125, "grad_norm_var": 0.030855305989583335, "learning_rate": 0.0001, "loss": 5.78, "loss/crossentropy": 2.587552547454834, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1704176440834999, "step": 15424 }, { "epoch": 0.4820625, "grad_norm": 3.15625, "grad_norm_var": 0.025275675455729167, "learning_rate": 0.0001, "loss": 5.9748, "loss/crossentropy": 2.706058979034424, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17570267617702484, "step": 15426 }, { "epoch": 0.482125, "grad_norm": 3.171875, "grad_norm_var": 0.03673502604166667, "learning_rate": 0.0001, "loss": 5.7973, "loss/crossentropy": 2.5178266763687134, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17834125459194183, "step": 15428 }, { "epoch": 0.4821875, "grad_norm": 3.25, "grad_norm_var": 0.026200358072916666, "learning_rate": 0.0001, "loss": 5.9385, "loss/crossentropy": 2.6587525606155396, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17719413340091705, "step": 15430 }, { "epoch": 0.48225, "grad_norm": 3.453125, "grad_norm_var": 0.026203409830729166, "learning_rate": 0.0001, "loss": 5.7926, "loss/crossentropy": 2.520912289619446, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17678218334913254, "step": 15432 }, { "epoch": 0.4823125, "grad_norm": 3.109375, "grad_norm_var": 0.0277008056640625, "learning_rate": 0.0001, "loss": 5.4106, "loss/crossentropy": 2.2114129066467285, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16952737420797348, "step": 15434 }, { "epoch": 0.482375, "grad_norm": 2.984375, "grad_norm_var": 0.030887858072916666, "learning_rate": 0.0001, "loss": 5.9596, "loss/crossentropy": 2.7331149578094482, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1718701273202896, "step": 15436 }, { "epoch": 0.4824375, "grad_norm": 3.28125, "grad_norm_var": 0.0292144775390625, "learning_rate": 0.0001, "loss": 6.0251, "loss/crossentropy": 2.7384718656539917, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17865867912769318, "step": 15438 }, { "epoch": 0.4825, "grad_norm": 3.21875, "grad_norm_var": 0.02349853515625, "learning_rate": 0.0001, "loss": 5.7592, "loss/crossentropy": 2.54393470287323, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1734769567847252, "step": 15440 }, { "epoch": 0.4825625, "grad_norm": 3.078125, "grad_norm_var": 0.02476806640625, "learning_rate": 0.0001, "loss": 5.7093, "loss/crossentropy": 2.558498978614807, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16976485401391983, "step": 15442 }, { "epoch": 0.482625, "grad_norm": 3.125, "grad_norm_var": 0.0173004150390625, "learning_rate": 0.0001, "loss": 5.95, "loss/crossentropy": 2.627052903175354, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17682897299528122, "step": 15444 }, { "epoch": 0.4826875, "grad_norm": 3.328125, "grad_norm_var": 0.01865234375, "learning_rate": 0.0001, "loss": 6.0299, "loss/crossentropy": 2.693580389022827, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18441692739725113, "step": 15446 }, { "epoch": 0.48275, "grad_norm": 2.84375, "grad_norm_var": 0.021402994791666668, "learning_rate": 0.0001, "loss": 5.655, "loss/crossentropy": 2.5437878370285034, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16424334794282913, "step": 15448 }, { "epoch": 0.4828125, "grad_norm": 3.25, "grad_norm_var": 0.020536295572916665, "learning_rate": 0.0001, "loss": 5.804, "loss/crossentropy": 2.4936158657073975, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17596448212862015, "step": 15450 }, { "epoch": 0.482875, "grad_norm": 3.4375, "grad_norm_var": 0.021256510416666666, "learning_rate": 0.0001, "loss": 5.6857, "loss/crossentropy": 2.4962233304977417, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1677766889333725, "step": 15452 }, { "epoch": 0.4829375, "grad_norm": 3.1875, "grad_norm_var": 0.020926920572916667, "learning_rate": 0.0001, "loss": 5.6113, "loss/crossentropy": 2.497285485267639, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16062213480472565, "step": 15454 }, { "epoch": 0.483, "grad_norm": 3.5, "grad_norm_var": 0.029694620768229166, "learning_rate": 0.0001, "loss": 5.8732, "loss/crossentropy": 2.5101137161254883, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1835731416940689, "step": 15456 }, { "epoch": 0.4830625, "grad_norm": 3.5625, "grad_norm_var": 0.034077962239583336, "learning_rate": 0.0001, "loss": 5.853, "loss/crossentropy": 2.649649739265442, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1668233871459961, "step": 15458 }, { "epoch": 0.483125, "grad_norm": 3.796875, "grad_norm_var": 0.061572265625, "learning_rate": 0.0001, "loss": 5.4413, "loss/crossentropy": 2.3667826652526855, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1578378528356552, "step": 15460 }, { "epoch": 0.4831875, "grad_norm": 3.125, "grad_norm_var": 0.07208658854166666, "learning_rate": 0.0001, "loss": 5.8166, "loss/crossentropy": 2.555495262145996, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1729813516139984, "step": 15462 }, { "epoch": 0.48325, "grad_norm": 3.171875, "grad_norm_var": 0.06331278483072916, "learning_rate": 0.0001, "loss": 5.6925, "loss/crossentropy": 2.530726909637451, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16891532391309738, "step": 15464 }, { "epoch": 0.4833125, "grad_norm": 3.671875, "grad_norm_var": 0.0723052978515625, "learning_rate": 0.0001, "loss": 5.7791, "loss/crossentropy": 2.5951521396636963, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1695646271109581, "step": 15466 }, { "epoch": 0.483375, "grad_norm": 3.25, "grad_norm_var": 0.0817047119140625, "learning_rate": 0.0001, "loss": 5.9382, "loss/crossentropy": 2.709382176399231, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16819705814123154, "step": 15468 }, { "epoch": 0.4834375, "grad_norm": 2.9375, "grad_norm_var": 0.09140625, "learning_rate": 0.0001, "loss": 5.6721, "loss/crossentropy": 2.4807130098342896, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1718704253435135, "step": 15470 }, { "epoch": 0.4835, "grad_norm": 3.421875, "grad_norm_var": 0.0961822509765625, "learning_rate": 0.0001, "loss": 5.6021, "loss/crossentropy": 2.491200089454651, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16499503701925278, "step": 15472 }, { "epoch": 0.4835625, "grad_norm": 3.375, "grad_norm_var": 0.09434305826822917, "learning_rate": 0.0001, "loss": 5.7448, "loss/crossentropy": 2.526549220085144, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17026139795780182, "step": 15474 }, { "epoch": 0.483625, "grad_norm": 3.046875, "grad_norm_var": 0.0687652587890625, "learning_rate": 0.0001, "loss": 5.3924, "loss/crossentropy": 2.311138868331909, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16281364113092422, "step": 15476 }, { "epoch": 0.4836875, "grad_norm": 3.09375, "grad_norm_var": 0.06520894368489584, "learning_rate": 0.0001, "loss": 5.7145, "loss/crossentropy": 2.522248387336731, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16883626580238342, "step": 15478 }, { "epoch": 0.48375, "grad_norm": 3.359375, "grad_norm_var": 0.06409098307291666, "learning_rate": 0.0001, "loss": 5.7127, "loss/crossentropy": 2.506693124771118, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17059792578220367, "step": 15480 }, { "epoch": 0.4838125, "grad_norm": 2.875, "grad_norm_var": 0.06145833333333333, "learning_rate": 0.0001, "loss": 5.7323, "loss/crossentropy": 2.513804316520691, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17106656730175018, "step": 15482 }, { "epoch": 0.483875, "grad_norm": 3.3125, "grad_norm_var": 0.045182291666666666, "learning_rate": 0.0001, "loss": 5.866, "loss/crossentropy": 2.5672656297683716, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.18338841944932938, "step": 15484 }, { "epoch": 0.4839375, "grad_norm": 3.34375, "grad_norm_var": 0.0447662353515625, "learning_rate": 0.0001, "loss": 5.8856, "loss/crossentropy": 2.5848830938339233, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17616067826747894, "step": 15486 }, { "epoch": 0.484, "grad_norm": 3.453125, "grad_norm_var": 0.0412994384765625, "learning_rate": 0.0001, "loss": 6.0439, "loss/crossentropy": 2.7313040494918823, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17852747440338135, "step": 15488 }, { "epoch": 0.4840625, "grad_norm": 3.28125, "grad_norm_var": 0.03629150390625, "learning_rate": 0.0001, "loss": 5.8479, "loss/crossentropy": 2.5692058801651, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.176309272646904, "step": 15490 }, { "epoch": 0.484125, "grad_norm": 3.90625, "grad_norm_var": 0.06212565104166667, "learning_rate": 0.0001, "loss": 6.0231, "loss/crossentropy": 2.691379427909851, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18238692730665207, "step": 15492 }, { "epoch": 0.4841875, "grad_norm": 3.546875, "grad_norm_var": 0.04924214680989583, "learning_rate": 0.0001, "loss": 5.8521, "loss/crossentropy": 2.636283278465271, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1711912304162979, "step": 15494 }, { "epoch": 0.48425, "grad_norm": 3.453125, "grad_norm_var": 0.045393880208333334, "learning_rate": 0.0001, "loss": 5.7119, "loss/crossentropy": 2.4659664630889893, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1765500009059906, "step": 15496 }, { "epoch": 0.4843125, "grad_norm": 2.84375, "grad_norm_var": 0.0512359619140625, "learning_rate": 0.0001, "loss": 5.7075, "loss/crossentropy": 2.5716251134872437, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.165537491440773, "step": 15498 }, { "epoch": 0.484375, "grad_norm": 3.390625, "grad_norm_var": 0.0505859375, "learning_rate": 0.0001, "loss": 5.9251, "loss/crossentropy": 2.612492561340332, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17579318583011627, "step": 15500 }, { "epoch": 0.4844375, "grad_norm": 3.09375, "grad_norm_var": 0.05455729166666667, "learning_rate": 0.0001, "loss": 5.7712, "loss/crossentropy": 2.5376791954040527, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17256756126880646, "step": 15502 }, { "epoch": 0.4845, "grad_norm": 3.234375, "grad_norm_var": 0.06252339680989584, "learning_rate": 0.0001, "loss": 5.7173, "loss/crossentropy": 2.570461869239807, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16546591371297836, "step": 15504 }, { "epoch": 0.4845625, "grad_norm": 3.0625, "grad_norm_var": 0.06642964680989584, "learning_rate": 0.0001, "loss": 5.9016, "loss/crossentropy": 2.675860047340393, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17452280223369598, "step": 15506 }, { "epoch": 0.484625, "grad_norm": 3.203125, "grad_norm_var": 0.03267313639322917, "learning_rate": 0.0001, "loss": 5.925, "loss/crossentropy": 2.6444398164749146, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17609969526529312, "step": 15508 }, { "epoch": 0.4846875, "grad_norm": 3.171875, "grad_norm_var": 0.027469889322916666, "learning_rate": 0.0001, "loss": 5.6911, "loss/crossentropy": 2.57095468044281, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1635763719677925, "step": 15510 }, { "epoch": 0.48475, "grad_norm": 3.078125, "grad_norm_var": 0.021629842122395833, "learning_rate": 0.0001, "loss": 5.8716, "loss/crossentropy": 2.681228756904602, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17255127429962158, "step": 15512 }, { "epoch": 0.4848125, "grad_norm": 3.359375, "grad_norm_var": 0.0184967041015625, "learning_rate": 0.0001, "loss": 5.5519, "loss/crossentropy": 2.4044448137283325, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16162271797657013, "step": 15514 }, { "epoch": 0.484875, "grad_norm": 3.390625, "grad_norm_var": 0.0159332275390625, "learning_rate": 0.0001, "loss": 5.7978, "loss/crossentropy": 2.5779892206192017, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17471708357334137, "step": 15516 }, { "epoch": 0.4849375, "grad_norm": 3.4375, "grad_norm_var": 0.024214680989583334, "learning_rate": 0.0001, "loss": 5.7523, "loss/crossentropy": 2.6253888607025146, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16699223220348358, "step": 15518 }, { "epoch": 0.485, "grad_norm": 3.140625, "grad_norm_var": 0.031053670247395835, "learning_rate": 0.0001, "loss": 6.0675, "loss/crossentropy": 2.764855742454529, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17675137519836426, "step": 15520 }, { "epoch": 0.4850625, "grad_norm": 4.65625, "grad_norm_var": 0.16469624837239583, "learning_rate": 0.0001, "loss": 6.1342, "loss/crossentropy": 2.719378113746643, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.19070102274417877, "step": 15522 }, { "epoch": 0.485125, "grad_norm": 3.328125, "grad_norm_var": 0.16519266764322918, "learning_rate": 0.0001, "loss": 5.7085, "loss/crossentropy": 2.5239862203598022, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1696275919675827, "step": 15524 }, { "epoch": 0.4851875, "grad_norm": 3.53125, "grad_norm_var": 0.16188151041666668, "learning_rate": 0.0001, "loss": 5.9615, "loss/crossentropy": 2.628399610519409, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1821391060948372, "step": 15526 }, { "epoch": 0.48525, "grad_norm": 3.3125, "grad_norm_var": 0.15627848307291667, "learning_rate": 0.0001, "loss": 5.2108, "loss/crossentropy": 2.182291626930237, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15363489091396332, "step": 15528 }, { "epoch": 0.4853125, "grad_norm": 3.1875, "grad_norm_var": 0.15288798014322916, "learning_rate": 0.0001, "loss": 5.855, "loss/crossentropy": 2.5341769456863403, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18130408972501755, "step": 15530 }, { "epoch": 0.485375, "grad_norm": 3.25, "grad_norm_var": 0.15269775390625, "learning_rate": 0.0001, "loss": 5.9362, "loss/crossentropy": 2.680835485458374, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17866097390651703, "step": 15532 }, { "epoch": 0.4854375, "grad_norm": 3.203125, "grad_norm_var": 0.14075113932291666, "learning_rate": 0.0001, "loss": 5.983, "loss/crossentropy": 2.7550711631774902, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1743575856089592, "step": 15534 }, { "epoch": 0.4855, "grad_norm": 3.1875, "grad_norm_var": 0.1528961181640625, "learning_rate": 0.0001, "loss": 5.4632, "loss/crossentropy": 2.4080125093460083, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16176702082157135, "step": 15536 }, { "epoch": 0.4855625, "grad_norm": 3.046875, "grad_norm_var": 0.025487263997395832, "learning_rate": 0.0001, "loss": 5.5596, "loss/crossentropy": 2.4649598598480225, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1590685397386551, "step": 15538 }, { "epoch": 0.485625, "grad_norm": 4.25, "grad_norm_var": 0.0929107666015625, "learning_rate": 0.0001, "loss": 5.7771, "loss/crossentropy": 2.542220115661621, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17427068948745728, "step": 15540 }, { "epoch": 0.4856875, "grad_norm": 3.5625, "grad_norm_var": 0.09619140625, "learning_rate": 0.0001, "loss": 5.8391, "loss/crossentropy": 2.641697645187378, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1705191656947136, "step": 15542 }, { "epoch": 0.48575, "grad_norm": 3.25, "grad_norm_var": 0.09487202962239584, "learning_rate": 0.0001, "loss": 5.3877, "loss/crossentropy": 2.2709479331970215, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1640223264694214, "step": 15544 }, { "epoch": 0.4858125, "grad_norm": 2.828125, "grad_norm_var": 0.1052398681640625, "learning_rate": 0.0001, "loss": 5.6644, "loss/crossentropy": 2.567691445350647, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16474910080432892, "step": 15546 }, { "epoch": 0.485875, "grad_norm": 3.078125, "grad_norm_var": 0.11311848958333333, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.503561854362488, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16706478595733643, "step": 15548 }, { "epoch": 0.4859375, "grad_norm": 2.953125, "grad_norm_var": 0.11679585774739583, "learning_rate": 0.0001, "loss": 5.585, "loss/crossentropy": 2.456661343574524, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16557209193706512, "step": 15550 }, { "epoch": 0.486, "grad_norm": 2.984375, "grad_norm_var": 0.11770426432291667, "learning_rate": 0.0001, "loss": 5.8205, "loss/crossentropy": 2.609832286834717, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17068011313676834, "step": 15552 }, { "epoch": 0.4860625, "grad_norm": 3.109375, "grad_norm_var": 0.11530659993489584, "learning_rate": 0.0001, "loss": 5.7301, "loss/crossentropy": 2.4726040363311768, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17301952093839645, "step": 15554 }, { "epoch": 0.486125, "grad_norm": 2.765625, "grad_norm_var": 0.04895426432291667, "learning_rate": 0.0001, "loss": 5.5239, "loss/crossentropy": 2.456711530685425, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16297296434640884, "step": 15556 }, { "epoch": 0.4861875, "grad_norm": 3.203125, "grad_norm_var": 0.03626302083333333, "learning_rate": 0.0001, "loss": 5.5375, "loss/crossentropy": 2.369423985481262, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17188247293233871, "step": 15558 }, { "epoch": 0.48625, "grad_norm": 3.046875, "grad_norm_var": 0.03292643229166667, "learning_rate": 0.0001, "loss": 6.0091, "loss/crossentropy": 2.796201467514038, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17090045660734177, "step": 15560 }, { "epoch": 0.4863125, "grad_norm": 3.109375, "grad_norm_var": 0.02896728515625, "learning_rate": 0.0001, "loss": 5.3326, "loss/crossentropy": 2.25577449798584, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15924570709466934, "step": 15562 }, { "epoch": 0.486375, "grad_norm": 3.015625, "grad_norm_var": 0.027977498372395833, "learning_rate": 0.0001, "loss": 5.806, "loss/crossentropy": 2.6025267839431763, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16995244473218918, "step": 15564 }, { "epoch": 0.4864375, "grad_norm": 3.015625, "grad_norm_var": 0.027197265625, "learning_rate": 0.0001, "loss": 5.7417, "loss/crossentropy": 2.589355945587158, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16601653397083282, "step": 15566 }, { "epoch": 0.4865, "grad_norm": 2.984375, "grad_norm_var": 0.0175689697265625, "learning_rate": 0.0001, "loss": 5.7621, "loss/crossentropy": 2.5684362649917603, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16975650191307068, "step": 15568 }, { "epoch": 0.4865625, "grad_norm": 3.140625, "grad_norm_var": 0.037507120768229166, "learning_rate": 0.0001, "loss": 5.9495, "loss/crossentropy": 2.5972514152526855, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1813228577375412, "step": 15570 }, { "epoch": 0.486625, "grad_norm": 3.21875, "grad_norm_var": 0.03493550618489583, "learning_rate": 0.0001, "loss": 5.4156, "loss/crossentropy": 2.384309411048889, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15625690668821335, "step": 15572 }, { "epoch": 0.4866875, "grad_norm": 3.375, "grad_norm_var": 0.03961588541666667, "learning_rate": 0.0001, "loss": 5.5536, "loss/crossentropy": 2.380679130554199, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1669015809893608, "step": 15574 }, { "epoch": 0.48675, "grad_norm": 3.4375, "grad_norm_var": 0.04234619140625, "learning_rate": 0.0001, "loss": 5.7799, "loss/crossentropy": 2.593273162841797, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1710069254040718, "step": 15576 }, { "epoch": 0.4868125, "grad_norm": 2.953125, "grad_norm_var": 0.04585673014322917, "learning_rate": 0.0001, "loss": 5.5643, "loss/crossentropy": 2.4406670331954956, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16510066390037537, "step": 15578 }, { "epoch": 0.486875, "grad_norm": 3.515625, "grad_norm_var": 0.05426432291666667, "learning_rate": 0.0001, "loss": 5.8131, "loss/crossentropy": 2.6028060913085938, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17063555121421814, "step": 15580 }, { "epoch": 0.4869375, "grad_norm": 2.90625, "grad_norm_var": 0.06106770833333333, "learning_rate": 0.0001, "loss": 5.4776, "loss/crossentropy": 2.4742895364761353, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1573660522699356, "step": 15582 }, { "epoch": 0.487, "grad_norm": 3.15625, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 5.3708, "loss/crossentropy": 2.273473858833313, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16090303659439087, "step": 15584 }, { "epoch": 0.4870625, "grad_norm": 3.515625, "grad_norm_var": 0.050048828125, "learning_rate": 0.0001, "loss": 6.0934, "loss/crossentropy": 2.688918948173523, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18576036393642426, "step": 15586 }, { "epoch": 0.487125, "grad_norm": 3.015625, "grad_norm_var": 0.04457906087239583, "learning_rate": 0.0001, "loss": 5.6411, "loss/crossentropy": 2.4709302186965942, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16819019615650177, "step": 15588 }, { "epoch": 0.4871875, "grad_norm": 3.125, "grad_norm_var": 0.039351399739583334, "learning_rate": 0.0001, "loss": 5.7386, "loss/crossentropy": 2.539092183113098, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17190296202898026, "step": 15590 }, { "epoch": 0.48725, "grad_norm": 3.203125, "grad_norm_var": 0.047200520833333336, "learning_rate": 0.0001, "loss": 5.5852, "loss/crossentropy": 2.337055206298828, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.171297125518322, "step": 15592 }, { "epoch": 0.4873125, "grad_norm": 3.390625, "grad_norm_var": 0.05415751139322917, "learning_rate": 0.0001, "loss": 5.8436, "loss/crossentropy": 2.5291141271591187, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17597944289445877, "step": 15594 }, { "epoch": 0.487375, "grad_norm": 3.171875, "grad_norm_var": 0.046708170572916666, "learning_rate": 0.0001, "loss": 5.7844, "loss/crossentropy": 2.5487102270126343, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17357081174850464, "step": 15596 }, { "epoch": 0.4874375, "grad_norm": 3.296875, "grad_norm_var": 0.04097900390625, "learning_rate": 0.0001, "loss": 5.4457, "loss/crossentropy": 2.3209153413772583, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16287311166524887, "step": 15598 }, { "epoch": 0.4875, "grad_norm": 3.078125, "grad_norm_var": 0.040262858072916664, "learning_rate": 0.0001, "loss": 5.5744, "loss/crossentropy": 2.4439727067947388, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1642143875360489, "step": 15600 }, { "epoch": 0.4875625, "grad_norm": 3.28125, "grad_norm_var": 0.03516337076822917, "learning_rate": 0.0001, "loss": 5.4231, "loss/crossentropy": 2.3545764684677124, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15567777305841446, "step": 15602 }, { "epoch": 0.487625, "grad_norm": 3.28125, "grad_norm_var": 0.9217112223307292, "learning_rate": 0.0001, "loss": 6.1146, "loss/crossentropy": 2.6810855865478516, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1855430081486702, "step": 15604 }, { "epoch": 0.4876875, "grad_norm": 3.921875, "grad_norm_var": 0.9090779622395834, "learning_rate": 0.0001, "loss": 5.4459, "loss/crossentropy": 2.262246608734131, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16523557156324387, "step": 15606 }, { "epoch": 0.48775, "grad_norm": 3.046875, "grad_norm_var": 1.2462565104166667, "learning_rate": 0.0001, "loss": 5.3636, "loss/crossentropy": 2.2482502460479736, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16387996822595596, "step": 15608 }, { "epoch": 0.4878125, "grad_norm": 3.984375, "grad_norm_var": 1.25670166015625, "learning_rate": 0.0001, "loss": 5.7457, "loss/crossentropy": 2.5216753482818604, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17200765758752823, "step": 15610 }, { "epoch": 0.487875, "grad_norm": 3.09375, "grad_norm_var": 1.2897206624348958, "learning_rate": 0.0001, "loss": 5.3269, "loss/crossentropy": 2.31750226020813, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.14976944029331207, "step": 15612 }, { "epoch": 0.4879375, "grad_norm": 3.09375, "grad_norm_var": 1.2784464518229166, "learning_rate": 0.0001, "loss": 5.7985, "loss/crossentropy": 2.6566789150238037, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.161838561296463, "step": 15614 }, { "epoch": 0.488, "grad_norm": 3.09375, "grad_norm_var": 1.2850423177083334, "learning_rate": 0.0001, "loss": 5.3373, "loss/crossentropy": 2.2665454149246216, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15667995065450668, "step": 15616 }, { "epoch": 0.4880625, "grad_norm": 3.125, "grad_norm_var": 1.3047190348307292, "learning_rate": 0.0001, "loss": 5.7071, "loss/crossentropy": 2.507651925086975, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1695544570684433, "step": 15618 }, { "epoch": 0.488125, "grad_norm": 3.046875, "grad_norm_var": 0.5093495686848958, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.583544969558716, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17369394749403, "step": 15620 }, { "epoch": 0.4881875, "grad_norm": 2.8125, "grad_norm_var": 0.51334228515625, "learning_rate": 0.0001, "loss": 5.623, "loss/crossentropy": 2.515665292739868, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16581125557422638, "step": 15622 }, { "epoch": 0.48825, "grad_norm": 3.65625, "grad_norm_var": 0.1669097900390625, "learning_rate": 0.0001, "loss": 6.2384, "loss/crossentropy": 2.799090266227722, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19041062146425247, "step": 15624 }, { "epoch": 0.4883125, "grad_norm": 3.34375, "grad_norm_var": 0.1380859375, "learning_rate": 0.0001, "loss": 5.7315, "loss/crossentropy": 2.5976243019104004, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16612115502357483, "step": 15626 }, { "epoch": 0.488375, "grad_norm": 3.125, "grad_norm_var": 0.13386942545572916, "learning_rate": 0.0001, "loss": 5.6682, "loss/crossentropy": 2.4756577014923096, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16847358644008636, "step": 15628 }, { "epoch": 0.4884375, "grad_norm": 3.625, "grad_norm_var": 0.14802144368489584, "learning_rate": 0.0001, "loss": 5.7986, "loss/crossentropy": 2.55096173286438, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17359618842601776, "step": 15630 }, { "epoch": 0.4885, "grad_norm": 3.109375, "grad_norm_var": 0.15494384765625, "learning_rate": 0.0001, "loss": 5.8165, "loss/crossentropy": 2.722593665122986, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16212552040815353, "step": 15632 }, { "epoch": 0.4885625, "grad_norm": 3.078125, "grad_norm_var": 0.15543212890625, "learning_rate": 0.0001, "loss": 5.7039, "loss/crossentropy": 2.5893197059631348, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16497574001550674, "step": 15634 }, { "epoch": 0.488625, "grad_norm": 3.0, "grad_norm_var": 0.1569976806640625, "learning_rate": 0.0001, "loss": 5.6041, "loss/crossentropy": 2.4363603591918945, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16716646403074265, "step": 15636 }, { "epoch": 0.4886875, "grad_norm": 3.09375, "grad_norm_var": 0.14580790201822916, "learning_rate": 0.0001, "loss": 5.6451, "loss/crossentropy": 2.409726619720459, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1719701960682869, "step": 15638 }, { "epoch": 0.48875, "grad_norm": 3.421875, "grad_norm_var": 0.03974507649739583, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.5570164918899536, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1762908697128296, "step": 15640 }, { "epoch": 0.4888125, "grad_norm": 3.8125, "grad_norm_var": 0.05964253743489583, "learning_rate": 0.0001, "loss": 6.009, "loss/crossentropy": 2.591892957687378, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18936841934919357, "step": 15642 }, { "epoch": 0.488875, "grad_norm": 3.3125, "grad_norm_var": 0.06513570149739584, "learning_rate": 0.0001, "loss": 5.835, "loss/crossentropy": 2.527828335762024, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1736869290471077, "step": 15644 }, { "epoch": 0.4889375, "grad_norm": 3.34375, "grad_norm_var": 0.051102701822916666, "learning_rate": 0.0001, "loss": 5.7208, "loss/crossentropy": 2.525538444519043, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1706969365477562, "step": 15646 }, { "epoch": 0.489, "grad_norm": 2.84375, "grad_norm_var": 0.051953125, "learning_rate": 0.0001, "loss": 5.7361, "loss/crossentropy": 2.5371170043945312, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17380855977535248, "step": 15648 }, { "epoch": 0.4890625, "grad_norm": 3.234375, "grad_norm_var": 0.05137430826822917, "learning_rate": 0.0001, "loss": 5.8679, "loss/crossentropy": 2.589292883872986, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17747006565332413, "step": 15650 }, { "epoch": 0.489125, "grad_norm": 3.234375, "grad_norm_var": 0.05194905598958333, "learning_rate": 0.0001, "loss": 5.6713, "loss/crossentropy": 2.5229251384735107, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16327373683452606, "step": 15652 }, { "epoch": 0.4891875, "grad_norm": 2.96875, "grad_norm_var": 0.06328023274739583, "learning_rate": 0.0001, "loss": 5.6995, "loss/crossentropy": 2.5932918787002563, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16296438872814178, "step": 15654 }, { "epoch": 0.48925, "grad_norm": 3.3125, "grad_norm_var": 0.06113993326822917, "learning_rate": 0.0001, "loss": 5.7168, "loss/crossentropy": 2.4919549226760864, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1709190458059311, "step": 15656 }, { "epoch": 0.4893125, "grad_norm": 3.03125, "grad_norm_var": 0.07034098307291667, "learning_rate": 0.0001, "loss": 5.9377, "loss/crossentropy": 2.6713995933532715, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17077427357435226, "step": 15658 }, { "epoch": 0.489375, "grad_norm": 3.140625, "grad_norm_var": 0.0634918212890625, "learning_rate": 0.0001, "loss": 5.8308, "loss/crossentropy": 2.574105978012085, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1756741628050804, "step": 15660 }, { "epoch": 0.4894375, "grad_norm": 3.140625, "grad_norm_var": 0.0641265869140625, "learning_rate": 0.0001, "loss": 5.6464, "loss/crossentropy": 2.481296420097351, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1680774688720703, "step": 15662 }, { "epoch": 0.4895, "grad_norm": 3.4375, "grad_norm_var": 0.06000874837239583, "learning_rate": 0.0001, "loss": 5.8343, "loss/crossentropy": 2.5626578330993652, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1756017655134201, "step": 15664 }, { "epoch": 0.4895625, "grad_norm": 2.984375, "grad_norm_var": 0.06164957682291667, "learning_rate": 0.0001, "loss": 6.038, "loss/crossentropy": 2.7693047523498535, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1753101423382759, "step": 15666 }, { "epoch": 0.489625, "grad_norm": 3.34375, "grad_norm_var": 0.06169331868489583, "learning_rate": 0.0001, "loss": 5.7044, "loss/crossentropy": 2.4952261447906494, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16896343231201172, "step": 15668 }, { "epoch": 0.4896875, "grad_norm": 3.171875, "grad_norm_var": 0.05386962890625, "learning_rate": 0.0001, "loss": 5.799, "loss/crossentropy": 2.552024006843567, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17508617788553238, "step": 15670 }, { "epoch": 0.48975, "grad_norm": 3.375, "grad_norm_var": 0.06438700358072917, "learning_rate": 0.0001, "loss": 5.6874, "loss/crossentropy": 2.5851725339889526, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16491178423166275, "step": 15672 }, { "epoch": 0.4898125, "grad_norm": 3.453125, "grad_norm_var": 0.03469645182291667, "learning_rate": 0.0001, "loss": 5.9859, "loss/crossentropy": 2.67172908782959, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18102586269378662, "step": 15674 }, { "epoch": 0.489875, "grad_norm": 3.046875, "grad_norm_var": 0.036164347330729166, "learning_rate": 0.0001, "loss": 5.4261, "loss/crossentropy": 2.4043819904327393, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15763608366250992, "step": 15676 }, { "epoch": 0.4899375, "grad_norm": 3.15625, "grad_norm_var": 0.03381754557291667, "learning_rate": 0.0001, "loss": 5.4145, "loss/crossentropy": 2.3068490028381348, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16310925781726837, "step": 15678 }, { "epoch": 0.49, "grad_norm": 2.953125, "grad_norm_var": 0.04502665201822917, "learning_rate": 0.0001, "loss": 5.9619, "loss/crossentropy": 2.7321821451187134, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17258545756340027, "step": 15680 }, { "epoch": 0.4900625, "grad_norm": 3.28125, "grad_norm_var": 0.04262593587239583, "learning_rate": 0.0001, "loss": 5.8138, "loss/crossentropy": 2.5558485984802246, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17384207993745804, "step": 15682 }, { "epoch": 0.490125, "grad_norm": 3.046875, "grad_norm_var": 0.043115234375, "learning_rate": 0.0001, "loss": 5.7928, "loss/crossentropy": 2.6558672189712524, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1687743440270424, "step": 15684 }, { "epoch": 0.4901875, "grad_norm": 3.03125, "grad_norm_var": 0.0478912353515625, "learning_rate": 0.0001, "loss": 5.9135, "loss/crossentropy": 2.7171976566314697, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17041613161563873, "step": 15686 }, { "epoch": 0.49025, "grad_norm": 3.3125, "grad_norm_var": 0.03931376139322917, "learning_rate": 0.0001, "loss": 6.1663, "loss/crossentropy": 2.863266110420227, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17756742984056473, "step": 15688 }, { "epoch": 0.4903125, "grad_norm": 3.109375, "grad_norm_var": 0.032177734375, "learning_rate": 0.0001, "loss": 6.1456, "loss/crossentropy": 2.817554473876953, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.18437139689922333, "step": 15690 }, { "epoch": 0.490375, "grad_norm": 3.234375, "grad_norm_var": 0.03543294270833333, "learning_rate": 0.0001, "loss": 5.9821, "loss/crossentropy": 2.5803717374801636, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1878328025341034, "step": 15692 }, { "epoch": 0.4904375, "grad_norm": 3.09375, "grad_norm_var": 0.040511067708333334, "learning_rate": 0.0001, "loss": 5.8106, "loss/crossentropy": 2.566185235977173, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1724904626607895, "step": 15694 }, { "epoch": 0.4905, "grad_norm": 3.28125, "grad_norm_var": 0.025679524739583334, "learning_rate": 0.0001, "loss": 6.3179, "loss/crossentropy": 2.9657928943634033, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18443161249160767, "step": 15696 }, { "epoch": 0.4905625, "grad_norm": 3.0625, "grad_norm_var": 0.02681884765625, "learning_rate": 0.0001, "loss": 5.7591, "loss/crossentropy": 2.622571587562561, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16716735810041428, "step": 15698 }, { "epoch": 0.490625, "grad_norm": 2.953125, "grad_norm_var": 0.0323394775390625, "learning_rate": 0.0001, "loss": 5.1729, "loss/crossentropy": 2.186972200870514, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15054922550916672, "step": 15700 }, { "epoch": 0.4906875, "grad_norm": 3.703125, "grad_norm_var": 0.04387613932291667, "learning_rate": 0.0001, "loss": 6.3602, "loss/crossentropy": 2.898533582687378, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1922570914030075, "step": 15702 }, { "epoch": 0.49075, "grad_norm": 3.25, "grad_norm_var": 0.048095703125, "learning_rate": 0.0001, "loss": 5.5322, "loss/crossentropy": 2.4021382331848145, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16612635552883148, "step": 15704 }, { "epoch": 0.4908125, "grad_norm": 3.234375, "grad_norm_var": 0.0469879150390625, "learning_rate": 0.0001, "loss": 5.8965, "loss/crossentropy": 2.679166316986084, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17056386172771454, "step": 15706 }, { "epoch": 0.490875, "grad_norm": 2.765625, "grad_norm_var": 0.05694986979166667, "learning_rate": 0.0001, "loss": 5.495, "loss/crossentropy": 2.380611300468445, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16339337080717087, "step": 15708 }, { "epoch": 0.4909375, "grad_norm": 3.109375, "grad_norm_var": 0.05315348307291667, "learning_rate": 0.0001, "loss": 5.9962, "loss/crossentropy": 2.701938509941101, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.18255415558815002, "step": 15710 }, { "epoch": 0.491, "grad_norm": 3.03125, "grad_norm_var": 0.05447591145833333, "learning_rate": 0.0001, "loss": 5.9492, "loss/crossentropy": 2.657499313354492, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17917373776435852, "step": 15712 }, { "epoch": 0.4910625, "grad_norm": 2.9375, "grad_norm_var": 0.057428995768229164, "learning_rate": 0.0001, "loss": 5.8335, "loss/crossentropy": 2.63913357257843, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17060479521751404, "step": 15714 }, { "epoch": 0.491125, "grad_norm": 3.171875, "grad_norm_var": 0.0566070556640625, "learning_rate": 0.0001, "loss": 6.0182, "loss/crossentropy": 2.71595299243927, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18139967322349548, "step": 15716 }, { "epoch": 0.4911875, "grad_norm": 2.984375, "grad_norm_var": 0.03808186848958333, "learning_rate": 0.0001, "loss": 5.0515, "loss/crossentropy": 2.112143397331238, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1470591500401497, "step": 15718 }, { "epoch": 0.49125, "grad_norm": 3.03125, "grad_norm_var": 0.0357421875, "learning_rate": 0.0001, "loss": 5.4537, "loss/crossentropy": 2.3805789947509766, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1631671041250229, "step": 15720 }, { "epoch": 0.4913125, "grad_norm": 3.265625, "grad_norm_var": 0.0476226806640625, "learning_rate": 0.0001, "loss": 5.9306, "loss/crossentropy": 2.6870051622390747, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16967561841011047, "step": 15722 }, { "epoch": 0.491375, "grad_norm": 3.765625, "grad_norm_var": 0.057356770833333334, "learning_rate": 0.0001, "loss": 5.511, "loss/crossentropy": 2.3568965196609497, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16892804205417633, "step": 15724 }, { "epoch": 0.4914375, "grad_norm": 3.6875, "grad_norm_var": 0.06994527180989583, "learning_rate": 0.0001, "loss": 6.0452, "loss/crossentropy": 2.7377495765686035, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17956935614347458, "step": 15726 }, { "epoch": 0.4915, "grad_norm": 3.359375, "grad_norm_var": 0.07031962076822916, "learning_rate": 0.0001, "loss": 5.8133, "loss/crossentropy": 2.5982768535614014, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16681692004203796, "step": 15728 }, { "epoch": 0.4915625, "grad_norm": 3.71875, "grad_norm_var": 0.06838785807291667, "learning_rate": 0.0001, "loss": 6.1405, "loss/crossentropy": 2.72856867313385, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1865089237689972, "step": 15730 }, { "epoch": 0.491625, "grad_norm": 3.34375, "grad_norm_var": 0.0654449462890625, "learning_rate": 0.0001, "loss": 5.7988, "loss/crossentropy": 2.4939388036727905, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17814233899116516, "step": 15732 }, { "epoch": 0.4916875, "grad_norm": 3.515625, "grad_norm_var": 0.051285807291666666, "learning_rate": 0.0001, "loss": 6.1611, "loss/crossentropy": 2.8457610607147217, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18036296963691711, "step": 15734 }, { "epoch": 0.49175, "grad_norm": 3.3125, "grad_norm_var": 0.041304524739583334, "learning_rate": 0.0001, "loss": 5.8231, "loss/crossentropy": 2.62145733833313, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17055156826972961, "step": 15736 }, { "epoch": 0.4918125, "grad_norm": 2.90625, "grad_norm_var": 0.05705464680989583, "learning_rate": 0.0001, "loss": 5.7634, "loss/crossentropy": 2.5695502758026123, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17407134175300598, "step": 15738 }, { "epoch": 0.491875, "grad_norm": 3.0625, "grad_norm_var": 0.05419514973958333, "learning_rate": 0.0001, "loss": 5.8107, "loss/crossentropy": 2.575039505958557, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17434308677911758, "step": 15740 }, { "epoch": 0.4919375, "grad_norm": 3.109375, "grad_norm_var": 0.05912984212239583, "learning_rate": 0.0001, "loss": 5.9301, "loss/crossentropy": 2.6661219596862793, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1767871379852295, "step": 15742 }, { "epoch": 0.492, "grad_norm": 3.25, "grad_norm_var": 0.058527628580729164, "learning_rate": 0.0001, "loss": 5.585, "loss/crossentropy": 2.4229283332824707, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16894365847110748, "step": 15744 }, { "epoch": 0.4920625, "grad_norm": 3.1875, "grad_norm_var": 0.0479644775390625, "learning_rate": 0.0001, "loss": 5.8786, "loss/crossentropy": 2.6585851907730103, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17473089694976807, "step": 15746 }, { "epoch": 0.492125, "grad_norm": 3.328125, "grad_norm_var": 0.0489898681640625, "learning_rate": 0.0001, "loss": 5.791, "loss/crossentropy": 2.6436983346939087, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16511915624141693, "step": 15748 }, { "epoch": 0.4921875, "grad_norm": 3.0, "grad_norm_var": 0.04908447265625, "learning_rate": 0.0001, "loss": 5.7434, "loss/crossentropy": 2.5909314155578613, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16680838912725449, "step": 15750 }, { "epoch": 0.49225, "grad_norm": 3.25, "grad_norm_var": 0.0529449462890625, "learning_rate": 0.0001, "loss": 5.4595, "loss/crossentropy": 2.351040482521057, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16436628252267838, "step": 15752 }, { "epoch": 0.4923125, "grad_norm": 3.125, "grad_norm_var": 0.048151652018229164, "learning_rate": 0.0001, "loss": 5.8182, "loss/crossentropy": 2.605093002319336, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1732684075832367, "step": 15754 }, { "epoch": 0.492375, "grad_norm": 2.984375, "grad_norm_var": 0.03340555826822917, "learning_rate": 0.0001, "loss": 5.8776, "loss/crossentropy": 2.666801691055298, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1730295717716217, "step": 15756 }, { "epoch": 0.4924375, "grad_norm": 3.296875, "grad_norm_var": 0.014436848958333333, "learning_rate": 0.0001, "loss": 5.8782, "loss/crossentropy": 2.508259654045105, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1838727369904518, "step": 15758 }, { "epoch": 0.4925, "grad_norm": 3.203125, "grad_norm_var": 0.021654256184895835, "learning_rate": 0.0001, "loss": 6.0234, "loss/crossentropy": 2.7083660364151, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17837554216384888, "step": 15760 }, { "epoch": 0.4925625, "grad_norm": 3.03125, "grad_norm_var": 0.02467041015625, "learning_rate": 0.0001, "loss": 5.6442, "loss/crossentropy": 2.5146384239196777, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16412509232759476, "step": 15762 }, { "epoch": 0.492625, "grad_norm": 3.0625, "grad_norm_var": 0.023563639322916666, "learning_rate": 0.0001, "loss": 5.8087, "loss/crossentropy": 2.599011540412903, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17136258631944656, "step": 15764 }, { "epoch": 0.4926875, "grad_norm": 3.140625, "grad_norm_var": 0.02135009765625, "learning_rate": 0.0001, "loss": 6.1348, "loss/crossentropy": 2.864608883857727, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17545898258686066, "step": 15766 }, { "epoch": 0.49275, "grad_norm": 3.21875, "grad_norm_var": 0.017682902018229165, "learning_rate": 0.0001, "loss": 5.5526, "loss/crossentropy": 2.442115902900696, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16808071732521057, "step": 15768 }, { "epoch": 0.4928125, "grad_norm": 3.0, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 5.5731, "loss/crossentropy": 2.4782867431640625, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16299203038215637, "step": 15770 }, { "epoch": 0.492875, "grad_norm": 3.125, "grad_norm_var": 0.016852823893229167, "learning_rate": 0.0001, "loss": 5.6081, "loss/crossentropy": 2.4362006187438965, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16719000041484833, "step": 15772 }, { "epoch": 0.4929375, "grad_norm": 3.390625, "grad_norm_var": 0.01842041015625, "learning_rate": 0.0001, "loss": 5.9624, "loss/crossentropy": 2.607534646987915, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18509627133607864, "step": 15774 }, { "epoch": 0.493, "grad_norm": 2.875, "grad_norm_var": 0.04576416015625, "learning_rate": 0.0001, "loss": 5.6579, "loss/crossentropy": 2.476379632949829, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16541379690170288, "step": 15776 }, { "epoch": 0.4930625, "grad_norm": 3.375, "grad_norm_var": 0.049055989583333334, "learning_rate": 0.0001, "loss": 6.1386, "loss/crossentropy": 2.7492098808288574, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1865941658616066, "step": 15778 }, { "epoch": 0.493125, "grad_norm": 2.9375, "grad_norm_var": 0.05415751139322917, "learning_rate": 0.0001, "loss": 5.5255, "loss/crossentropy": 2.4384251832962036, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16143856197595596, "step": 15780 }, { "epoch": 0.4931875, "grad_norm": 3.125, "grad_norm_var": 0.05900777180989583, "learning_rate": 0.0001, "loss": 5.5614, "loss/crossentropy": 2.5072191953659058, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16088951379060745, "step": 15782 }, { "epoch": 0.49325, "grad_norm": 3.296875, "grad_norm_var": 0.0617340087890625, "learning_rate": 0.0001, "loss": 5.8466, "loss/crossentropy": 2.5994954109191895, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17510396987199783, "step": 15784 }, { "epoch": 0.4933125, "grad_norm": 3.796875, "grad_norm_var": 0.07994384765625, "learning_rate": 0.0001, "loss": 5.7199, "loss/crossentropy": 2.497785806655884, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1722072958946228, "step": 15786 }, { "epoch": 0.493375, "grad_norm": 3.21875, "grad_norm_var": 0.07884012858072917, "learning_rate": 0.0001, "loss": 5.8925, "loss/crossentropy": 2.6982651948928833, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17059308290481567, "step": 15788 }, { "epoch": 0.4934375, "grad_norm": 3.34375, "grad_norm_var": 0.07744852701822917, "learning_rate": 0.0001, "loss": 6.1106, "loss/crossentropy": 2.75845205783844, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18365000188350677, "step": 15790 }, { "epoch": 0.4935, "grad_norm": 3.203125, "grad_norm_var": 0.04768778483072917, "learning_rate": 0.0001, "loss": 5.853, "loss/crossentropy": 2.569619655609131, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17911946028470993, "step": 15792 }, { "epoch": 0.4935625, "grad_norm": 3.265625, "grad_norm_var": 0.043553670247395836, "learning_rate": 0.0001, "loss": 5.9874, "loss/crossentropy": 2.724018096923828, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1771218106150627, "step": 15794 }, { "epoch": 0.493625, "grad_norm": 3.171875, "grad_norm_var": 0.036539713541666664, "learning_rate": 0.0001, "loss": 5.7951, "loss/crossentropy": 2.6281638145446777, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1686425358057022, "step": 15796 }, { "epoch": 0.4936875, "grad_norm": 3.25, "grad_norm_var": 0.027827962239583334, "learning_rate": 0.0001, "loss": 5.8517, "loss/crossentropy": 2.591265320777893, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17408807575702667, "step": 15798 }, { "epoch": 0.49375, "grad_norm": 3.15625, "grad_norm_var": 0.03145243326822917, "learning_rate": 0.0001, "loss": 5.8795, "loss/crossentropy": 2.5996209383010864, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17720826715230942, "step": 15800 }, { "epoch": 0.4938125, "grad_norm": 2.859375, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 5.4859, "loss/crossentropy": 2.459252953529358, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1589120328426361, "step": 15802 }, { "epoch": 0.493875, "grad_norm": 3.09375, "grad_norm_var": 0.02164306640625, "learning_rate": 0.0001, "loss": 5.4081, "loss/crossentropy": 2.3998864889144897, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1601930558681488, "step": 15804 }, { "epoch": 0.4939375, "grad_norm": 3.171875, "grad_norm_var": 0.14748942057291667, "learning_rate": 0.0001, "loss": 6.1105, "loss/crossentropy": 2.7312912940979004, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18128246814012527, "step": 15806 }, { "epoch": 0.494, "grad_norm": 3.375, "grad_norm_var": 0.14765218098958333, "learning_rate": 0.0001, "loss": 5.9199, "loss/crossentropy": 2.6582727432250977, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17694677412509918, "step": 15808 }, { "epoch": 0.4940625, "grad_norm": 12.0, "grad_norm_var": 4.86480712890625, "learning_rate": 0.0001, "loss": 6.2311, "loss/crossentropy": 2.7658541202545166, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19027357548475266, "step": 15810 }, { "epoch": 0.494125, "grad_norm": 3.234375, "grad_norm_var": 4.840543619791666, "learning_rate": 0.0001, "loss": 5.6745, "loss/crossentropy": 2.480656147003174, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17172466963529587, "step": 15812 }, { "epoch": 0.4941875, "grad_norm": 4.0625, "grad_norm_var": 4.833177693684896, "learning_rate": 0.0001, "loss": 5.394, "loss/crossentropy": 2.290852904319763, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16305189579725266, "step": 15814 }, { "epoch": 0.49425, "grad_norm": 3.140625, "grad_norm_var": 4.824479166666666, "learning_rate": 0.0001, "loss": 6.2618, "loss/crossentropy": 2.8281394243240356, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1875031590461731, "step": 15816 }, { "epoch": 0.4943125, "grad_norm": 3.3125, "grad_norm_var": 4.766731770833333, "learning_rate": 0.0001, "loss": 5.9952, "loss/crossentropy": 2.7113758325576782, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1775989830493927, "step": 15818 }, { "epoch": 0.494375, "grad_norm": 3.171875, "grad_norm_var": 4.751414998372396, "learning_rate": 0.0001, "loss": 5.7191, "loss/crossentropy": 2.561946988105774, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16610819846391678, "step": 15820 }, { "epoch": 0.4944375, "grad_norm": 3.21875, "grad_norm_var": 4.773551432291667, "learning_rate": 0.0001, "loss": 5.7712, "loss/crossentropy": 2.569547414779663, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17250573635101318, "step": 15822 }, { "epoch": 0.4945, "grad_norm": 3.578125, "grad_norm_var": 4.76539306640625, "learning_rate": 0.0001, "loss": 5.8216, "loss/crossentropy": 2.549016833305359, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17452429980039597, "step": 15824 }, { "epoch": 0.4945625, "grad_norm": 3.015625, "grad_norm_var": 0.10800679524739583, "learning_rate": 0.0001, "loss": 5.2381, "loss/crossentropy": 2.2993987798690796, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15246036648750305, "step": 15826 }, { "epoch": 0.494625, "grad_norm": 3.359375, "grad_norm_var": 0.1213287353515625, "learning_rate": 0.0001, "loss": 5.6697, "loss/crossentropy": 2.5171492099761963, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16290698945522308, "step": 15828 }, { "epoch": 0.4946875, "grad_norm": 2.984375, "grad_norm_var": 0.0838043212890625, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.442649006843567, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1656683087348938, "step": 15830 }, { "epoch": 0.49475, "grad_norm": 3.03125, "grad_norm_var": 0.0387603759765625, "learning_rate": 0.0001, "loss": 5.7172, "loss/crossentropy": 2.546152353286743, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16945255547761917, "step": 15832 }, { "epoch": 0.4948125, "grad_norm": 3.328125, "grad_norm_var": 0.03957926432291667, "learning_rate": 0.0001, "loss": 5.9672, "loss/crossentropy": 2.719525694847107, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1778969019651413, "step": 15834 }, { "epoch": 0.494875, "grad_norm": 3.296875, "grad_norm_var": 0.04096577962239583, "learning_rate": 0.0001, "loss": 5.966, "loss/crossentropy": 2.706810235977173, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1759188175201416, "step": 15836 }, { "epoch": 0.4949375, "grad_norm": 3.125, "grad_norm_var": 0.03937886555989583, "learning_rate": 0.0001, "loss": 5.4779, "loss/crossentropy": 2.4548873901367188, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15620262920856476, "step": 15838 }, { "epoch": 0.495, "grad_norm": 3.328125, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 5.7715, "loss/crossentropy": 2.557394862174988, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17102333158254623, "step": 15840 }, { "epoch": 0.4950625, "grad_norm": 3.28125, "grad_norm_var": 0.03892313639322917, "learning_rate": 0.0001, "loss": 6.1132, "loss/crossentropy": 2.7207332849502563, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18377604335546494, "step": 15842 }, { "epoch": 0.495125, "grad_norm": 3.421875, "grad_norm_var": 0.03394775390625, "learning_rate": 0.0001, "loss": 5.9958, "loss/crossentropy": 2.731584906578064, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17642249166965485, "step": 15844 }, { "epoch": 0.4951875, "grad_norm": 3.0625, "grad_norm_var": 0.04205322265625, "learning_rate": 0.0001, "loss": 5.6099, "loss/crossentropy": 2.4367045164108276, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17083469778299332, "step": 15846 }, { "epoch": 0.49525, "grad_norm": 3.109375, "grad_norm_var": 0.030985514322916668, "learning_rate": 0.0001, "loss": 5.7844, "loss/crossentropy": 2.558732748031616, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17257042229175568, "step": 15848 }, { "epoch": 0.4953125, "grad_norm": 3.109375, "grad_norm_var": 0.03092041015625, "learning_rate": 0.0001, "loss": 6.1288, "loss/crossentropy": 2.851402997970581, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17617689073085785, "step": 15850 }, { "epoch": 0.495375, "grad_norm": 3.15625, "grad_norm_var": 0.029889933268229165, "learning_rate": 0.0001, "loss": 5.5517, "loss/crossentropy": 2.4140695333480835, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16610509157180786, "step": 15852 }, { "epoch": 0.4954375, "grad_norm": 2.90625, "grad_norm_var": 0.0359375, "learning_rate": 0.0001, "loss": 5.8073, "loss/crossentropy": 2.63225257396698, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1671120896935463, "step": 15854 }, { "epoch": 0.4955, "grad_norm": 3.5, "grad_norm_var": 0.0517486572265625, "learning_rate": 0.0001, "loss": 5.9302, "loss/crossentropy": 2.7441340684890747, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17134114354848862, "step": 15856 }, { "epoch": 0.4955625, "grad_norm": 3.390625, "grad_norm_var": 0.05275777180989583, "learning_rate": 0.0001, "loss": 5.7635, "loss/crossentropy": 2.5725748538970947, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16752752661705017, "step": 15858 }, { "epoch": 0.495625, "grad_norm": 3.21875, "grad_norm_var": 0.049702962239583336, "learning_rate": 0.0001, "loss": 5.8945, "loss/crossentropy": 2.5877639055252075, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1802806630730629, "step": 15860 }, { "epoch": 0.4956875, "grad_norm": 3.25, "grad_norm_var": 0.04422200520833333, "learning_rate": 0.0001, "loss": 6.1524, "loss/crossentropy": 2.797191023826599, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1808350756764412, "step": 15862 }, { "epoch": 0.49575, "grad_norm": 3.390625, "grad_norm_var": 0.046849568684895836, "learning_rate": 0.0001, "loss": 5.7258, "loss/crossentropy": 2.5211658477783203, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17007212340831757, "step": 15864 }, { "epoch": 0.4958125, "grad_norm": 3.671875, "grad_norm_var": 0.06403706868489584, "learning_rate": 0.0001, "loss": 5.9797, "loss/crossentropy": 2.576448082923889, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.18055523186922073, "step": 15866 }, { "epoch": 0.495875, "grad_norm": 3.171875, "grad_norm_var": 0.06292215983072917, "learning_rate": 0.0001, "loss": 5.8199, "loss/crossentropy": 2.5816270112991333, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1746114268898964, "step": 15868 }, { "epoch": 0.4959375, "grad_norm": 2.984375, "grad_norm_var": 0.0600250244140625, "learning_rate": 0.0001, "loss": 5.9041, "loss/crossentropy": 2.7266438007354736, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16891635954380035, "step": 15870 }, { "epoch": 0.496, "grad_norm": 3.90625, "grad_norm_var": 0.06288655598958333, "learning_rate": 0.0001, "loss": 5.9347, "loss/crossentropy": 2.6614303588867188, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1714666336774826, "step": 15872 }, { "epoch": 0.4960625, "grad_norm": 2.890625, "grad_norm_var": 0.07214253743489583, "learning_rate": 0.0001, "loss": 5.5861, "loss/crossentropy": 2.4560959339141846, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16495682299137115, "step": 15874 }, { "epoch": 0.496125, "grad_norm": 3.078125, "grad_norm_var": 0.07622782389322917, "learning_rate": 0.0001, "loss": 5.4371, "loss/crossentropy": 2.36905038356781, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15953627973794937, "step": 15876 }, { "epoch": 0.4961875, "grad_norm": 3.125, "grad_norm_var": 0.07359110514322917, "learning_rate": 0.0001, "loss": 5.869, "loss/crossentropy": 2.6835721731185913, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1720564141869545, "step": 15878 }, { "epoch": 0.49625, "grad_norm": 3.3125, "grad_norm_var": 0.07060546875, "learning_rate": 0.0001, "loss": 5.8529, "loss/crossentropy": 2.643310546875, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1725200042128563, "step": 15880 }, { "epoch": 0.4963125, "grad_norm": 3.03125, "grad_norm_var": 0.0490386962890625, "learning_rate": 0.0001, "loss": 5.9573, "loss/crossentropy": 2.751884341239929, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17327306419610977, "step": 15882 }, { "epoch": 0.496375, "grad_norm": 3.3125, "grad_norm_var": 0.051611328125, "learning_rate": 0.0001, "loss": 5.7362, "loss/crossentropy": 2.4810686111450195, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17434068024158478, "step": 15884 }, { "epoch": 0.4964375, "grad_norm": 3.15625, "grad_norm_var": 0.0598785400390625, "learning_rate": 0.0001, "loss": 5.8693, "loss/crossentropy": 2.649235248565674, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.170439213514328, "step": 15886 }, { "epoch": 0.4965, "grad_norm": 3.34375, "grad_norm_var": 0.03590087890625, "learning_rate": 0.0001, "loss": 5.965, "loss/crossentropy": 2.570220947265625, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18401052057743073, "step": 15888 }, { "epoch": 0.4965625, "grad_norm": 2.953125, "grad_norm_var": 0.03332926432291667, "learning_rate": 0.0001, "loss": 5.635, "loss/crossentropy": 2.4804067611694336, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16819289326667786, "step": 15890 }, { "epoch": 0.496625, "grad_norm": 3.265625, "grad_norm_var": 0.0297515869140625, "learning_rate": 0.0001, "loss": 5.7789, "loss/crossentropy": 2.5864343643188477, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1684667244553566, "step": 15892 }, { "epoch": 0.4966875, "grad_norm": 2.84375, "grad_norm_var": 0.0404449462890625, "learning_rate": 0.0001, "loss": 5.695, "loss/crossentropy": 2.6350271701812744, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15951280295848846, "step": 15894 }, { "epoch": 0.49675, "grad_norm": 3.15625, "grad_norm_var": 0.041357421875, "learning_rate": 0.0001, "loss": 5.5124, "loss/crossentropy": 2.411333203315735, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16049957275390625, "step": 15896 }, { "epoch": 0.4968125, "grad_norm": 3.53125, "grad_norm_var": 0.04854227701822917, "learning_rate": 0.0001, "loss": 5.5888, "loss/crossentropy": 2.4651507139205933, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16431937366724014, "step": 15898 }, { "epoch": 0.496875, "grad_norm": 5.125, "grad_norm_var": 0.2789459228515625, "learning_rate": 0.0001, "loss": 5.4275, "loss/crossentropy": 2.292418360710144, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16116555780172348, "step": 15900 }, { "epoch": 0.4969375, "grad_norm": 3.3125, "grad_norm_var": 0.26962890625, "learning_rate": 0.0001, "loss": 5.5919, "loss/crossentropy": 2.468536615371704, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16780667752027512, "step": 15902 }, { "epoch": 0.497, "grad_norm": 3.328125, "grad_norm_var": 0.27049051920572914, "learning_rate": 0.0001, "loss": 6.0847, "loss/crossentropy": 2.765723466873169, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17994843423366547, "step": 15904 }, { "epoch": 0.4970625, "grad_norm": 3.328125, "grad_norm_var": 0.26126200358072915, "learning_rate": 0.0001, "loss": 5.9942, "loss/crossentropy": 2.714383363723755, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17758838087320328, "step": 15906 }, { "epoch": 0.497125, "grad_norm": 3.125, "grad_norm_var": 0.2636789957682292, "learning_rate": 0.0001, "loss": 5.866, "loss/crossentropy": 2.56651508808136, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1779971495270729, "step": 15908 }, { "epoch": 0.4971875, "grad_norm": 3.125, "grad_norm_var": 0.2510650634765625, "learning_rate": 0.0001, "loss": 5.8023, "loss/crossentropy": 2.5108169317245483, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1744590848684311, "step": 15910 }, { "epoch": 0.49725, "grad_norm": 3.53125, "grad_norm_var": 0.24410400390625, "learning_rate": 0.0001, "loss": 5.6003, "loss/crossentropy": 2.420087218284607, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17114916443824768, "step": 15912 }, { "epoch": 0.4973125, "grad_norm": 3.296875, "grad_norm_var": 0.24005533854166666, "learning_rate": 0.0001, "loss": 5.8849, "loss/crossentropy": 2.6445670127868652, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1732504740357399, "step": 15914 }, { "epoch": 0.497375, "grad_norm": 3.015625, "grad_norm_var": 0.03037109375, "learning_rate": 0.0001, "loss": 5.7636, "loss/crossentropy": 2.649904727935791, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1625397726893425, "step": 15916 }, { "epoch": 0.4974375, "grad_norm": 3.125, "grad_norm_var": 0.03775634765625, "learning_rate": 0.0001, "loss": 5.9128, "loss/crossentropy": 2.6058311462402344, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17757585644721985, "step": 15918 }, { "epoch": 0.4975, "grad_norm": 3.59375, "grad_norm_var": 0.0398590087890625, "learning_rate": 0.0001, "loss": 5.9503, "loss/crossentropy": 2.6391966342926025, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18228397518396378, "step": 15920 }, { "epoch": 0.4975625, "grad_norm": 3.234375, "grad_norm_var": 0.04195556640625, "learning_rate": 0.0001, "loss": 5.6969, "loss/crossentropy": 2.5219361782073975, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16827717423439026, "step": 15922 }, { "epoch": 0.497625, "grad_norm": 3.375, "grad_norm_var": 0.04351806640625, "learning_rate": 0.0001, "loss": 5.883, "loss/crossentropy": 2.6181185245513916, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17570406198501587, "step": 15924 }, { "epoch": 0.4976875, "grad_norm": 3.40625, "grad_norm_var": 0.03619791666666667, "learning_rate": 0.0001, "loss": 5.5938, "loss/crossentropy": 2.4653561115264893, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.164012610912323, "step": 15926 }, { "epoch": 0.49775, "grad_norm": 3.53125, "grad_norm_var": 0.03616129557291667, "learning_rate": 0.0001, "loss": 5.8606, "loss/crossentropy": 2.5704281330108643, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1794053092598915, "step": 15928 }, { "epoch": 0.4978125, "grad_norm": 3.09375, "grad_norm_var": 0.036356608072916664, "learning_rate": 0.0001, "loss": 5.6433, "loss/crossentropy": 2.4557920694351196, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17031186819076538, "step": 15930 }, { "epoch": 0.497875, "grad_norm": 3.359375, "grad_norm_var": 0.036009724934895834, "learning_rate": 0.0001, "loss": 5.849, "loss/crossentropy": 2.6339017152786255, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17190106213092804, "step": 15932 }, { "epoch": 0.4979375, "grad_norm": 3.234375, "grad_norm_var": 0.0318511962890625, "learning_rate": 0.0001, "loss": 5.8076, "loss/crossentropy": 2.5940616130828857, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17330337315797806, "step": 15934 }, { "epoch": 0.498, "grad_norm": 3.09375, "grad_norm_var": 0.0240142822265625, "learning_rate": 0.0001, "loss": 5.7265, "loss/crossentropy": 2.5966960191726685, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16571569442749023, "step": 15936 }, { "epoch": 0.4980625, "grad_norm": 3.0, "grad_norm_var": 0.03152669270833333, "learning_rate": 0.0001, "loss": 5.542, "loss/crossentropy": 2.4944313764572144, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1602211520075798, "step": 15938 }, { "epoch": 0.498125, "grad_norm": 3.03125, "grad_norm_var": 0.029645792643229165, "learning_rate": 0.0001, "loss": 5.7389, "loss/crossentropy": 2.618900418281555, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16434358060359955, "step": 15940 }, { "epoch": 0.4981875, "grad_norm": 3.234375, "grad_norm_var": 0.0260894775390625, "learning_rate": 0.0001, "loss": 5.4484, "loss/crossentropy": 2.3067928552627563, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.15985964238643646, "step": 15942 }, { "epoch": 0.49825, "grad_norm": 3.53125, "grad_norm_var": 0.031005859375, "learning_rate": 0.0001, "loss": 5.6273, "loss/crossentropy": 2.4584217071533203, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1680569276213646, "step": 15944 }, { "epoch": 0.4983125, "grad_norm": 3.09375, "grad_norm_var": 0.031672159830729164, "learning_rate": 0.0001, "loss": 5.8246, "loss/crossentropy": 2.6650729179382324, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17220134288072586, "step": 15946 }, { "epoch": 0.498375, "grad_norm": 3.09375, "grad_norm_var": 0.028669230143229165, "learning_rate": 0.0001, "loss": 5.8149, "loss/crossentropy": 2.5920504331588745, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17346028238534927, "step": 15948 }, { "epoch": 0.4984375, "grad_norm": 3.609375, "grad_norm_var": 0.04453837076822917, "learning_rate": 0.0001, "loss": 6.1388, "loss/crossentropy": 2.79826557636261, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17975997924804688, "step": 15950 }, { "epoch": 0.4985, "grad_norm": 3.265625, "grad_norm_var": 0.046873982747395834, "learning_rate": 0.0001, "loss": 5.8, "loss/crossentropy": 2.5743913650512695, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17138755321502686, "step": 15952 }, { "epoch": 0.4985625, "grad_norm": 2.953125, "grad_norm_var": 0.05056050618489583, "learning_rate": 0.0001, "loss": 5.7426, "loss/crossentropy": 2.596100926399231, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1665981337428093, "step": 15954 }, { "epoch": 0.498625, "grad_norm": 3.15625, "grad_norm_var": 0.049193318684895834, "learning_rate": 0.0001, "loss": 5.7905, "loss/crossentropy": 2.6612913608551025, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16370096802711487, "step": 15956 }, { "epoch": 0.4986875, "grad_norm": 3.578125, "grad_norm_var": 0.060042317708333334, "learning_rate": 0.0001, "loss": 6.1546, "loss/crossentropy": 2.6566189527511597, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.19237946718931198, "step": 15958 }, { "epoch": 0.49875, "grad_norm": 3.09375, "grad_norm_var": 0.04508056640625, "learning_rate": 0.0001, "loss": 5.8987, "loss/crossentropy": 2.6189379692077637, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17797543853521347, "step": 15960 }, { "epoch": 0.4988125, "grad_norm": 3.28125, "grad_norm_var": 0.049250284830729164, "learning_rate": 0.0001, "loss": 5.6761, "loss/crossentropy": 2.514289379119873, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1689145490527153, "step": 15962 }, { "epoch": 0.498875, "grad_norm": 3.421875, "grad_norm_var": 0.048974609375, "learning_rate": 0.0001, "loss": 6.1333, "loss/crossentropy": 2.835245370864868, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1794164776802063, "step": 15964 }, { "epoch": 0.4989375, "grad_norm": 3.359375, "grad_norm_var": 0.0409820556640625, "learning_rate": 0.0001, "loss": 5.9531, "loss/crossentropy": 2.6926647424697876, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1756543666124344, "step": 15966 }, { "epoch": 0.499, "grad_norm": 2.953125, "grad_norm_var": 0.040558878580729166, "learning_rate": 0.0001, "loss": 6.0007, "loss/crossentropy": 2.7710163593292236, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1725805401802063, "step": 15968 }, { "epoch": 0.4990625, "grad_norm": 3.578125, "grad_norm_var": 0.041829427083333336, "learning_rate": 0.0001, "loss": 5.8248, "loss/crossentropy": 2.563231348991394, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17420003563165665, "step": 15970 }, { "epoch": 0.499125, "grad_norm": 3.359375, "grad_norm_var": 0.042952473958333334, "learning_rate": 0.0001, "loss": 5.7934, "loss/crossentropy": 2.5948245525360107, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17337030172348022, "step": 15972 }, { "epoch": 0.4991875, "grad_norm": 3.0625, "grad_norm_var": 0.03693745930989583, "learning_rate": 0.0001, "loss": 6.0875, "loss/crossentropy": 2.794007182121277, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17739754915237427, "step": 15974 }, { "epoch": 0.49925, "grad_norm": 2.9375, "grad_norm_var": 0.04419657389322917, "learning_rate": 0.0001, "loss": 5.8663, "loss/crossentropy": 2.6902471780776978, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16839084029197693, "step": 15976 }, { "epoch": 0.4993125, "grad_norm": 3.171875, "grad_norm_var": 0.0392730712890625, "learning_rate": 0.0001, "loss": 5.6043, "loss/crossentropy": 2.4186110496520996, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16818278282880783, "step": 15978 }, { "epoch": 0.499375, "grad_norm": 3.171875, "grad_norm_var": 0.0382965087890625, "learning_rate": 0.0001, "loss": 5.893, "loss/crossentropy": 2.639996647834778, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1721796840429306, "step": 15980 }, { "epoch": 0.4994375, "grad_norm": 3.03125, "grad_norm_var": 0.03852437337239583, "learning_rate": 0.0001, "loss": 5.8705, "loss/crossentropy": 2.6478593349456787, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1726580560207367, "step": 15982 }, { "epoch": 0.4995, "grad_norm": 3.125, "grad_norm_var": 0.03477274576822917, "learning_rate": 0.0001, "loss": 5.9556, "loss/crossentropy": 2.6546578407287598, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1757965162396431, "step": 15984 }, { "epoch": 0.4995625, "grad_norm": 3.328125, "grad_norm_var": 0.0257476806640625, "learning_rate": 0.0001, "loss": 5.8515, "loss/crossentropy": 2.7225167751312256, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1652454286813736, "step": 15986 }, { "epoch": 0.499625, "grad_norm": 3.140625, "grad_norm_var": 0.024860636393229166, "learning_rate": 0.0001, "loss": 5.7254, "loss/crossentropy": 2.6186875104904175, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16575387120246887, "step": 15988 }, { "epoch": 0.4996875, "grad_norm": 3.25, "grad_norm_var": 0.025309244791666668, "learning_rate": 0.0001, "loss": 5.7961, "loss/crossentropy": 2.5866594314575195, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17172956466674805, "step": 15990 }, { "epoch": 0.49975, "grad_norm": 2.984375, "grad_norm_var": 0.02496337890625, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.483269691467285, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16713418066501617, "step": 15992 }, { "epoch": 0.4998125, "grad_norm": 3.234375, "grad_norm_var": 0.029491170247395834, "learning_rate": 0.0001, "loss": 5.9615, "loss/crossentropy": 2.7531800270080566, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17239541560411453, "step": 15994 }, { "epoch": 0.499875, "grad_norm": 2.953125, "grad_norm_var": 0.03139546712239583, "learning_rate": 0.0001, "loss": 5.5012, "loss/crossentropy": 2.471090793609619, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15848200023174286, "step": 15996 }, { "epoch": 0.4999375, "grad_norm": 3.03125, "grad_norm_var": 0.03134765625, "learning_rate": 0.0001, "loss": 5.8642, "loss/crossentropy": 2.6671937704086304, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17360534518957138, "step": 15998 }, { "epoch": 0.5, "grad_norm": 3.109375, "grad_norm_var": 0.03413798014322917, "learning_rate": 0.0001, "loss": 5.8482, "loss/crossentropy": 2.6166086196899414, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1719907894730568, "step": 16000 }, { "epoch": 0.5000625, "grad_norm": 3.421875, "grad_norm_var": 0.03278706868489583, "learning_rate": 0.0001, "loss": 5.891, "loss/crossentropy": 2.64286208152771, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17442382872104645, "step": 16002 }, { "epoch": 0.500125, "grad_norm": 3.859375, "grad_norm_var": 0.06376546223958333, "learning_rate": 0.0001, "loss": 5.7983, "loss/crossentropy": 2.474073648452759, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17655911296606064, "step": 16004 }, { "epoch": 0.5001875, "grad_norm": 3.421875, "grad_norm_var": 0.06347249348958334, "learning_rate": 0.0001, "loss": 5.4582, "loss/crossentropy": 2.3265300989151, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16473408788442612, "step": 16006 }, { "epoch": 0.50025, "grad_norm": 3.328125, "grad_norm_var": 0.0589508056640625, "learning_rate": 0.0001, "loss": 5.7411, "loss/crossentropy": 2.614622473716736, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16382355988025665, "step": 16008 }, { "epoch": 0.5003125, "grad_norm": 3.09375, "grad_norm_var": 0.05357666015625, "learning_rate": 0.0001, "loss": 5.5548, "loss/crossentropy": 2.4665409326553345, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16273151338100433, "step": 16010 }, { "epoch": 0.500375, "grad_norm": 3.078125, "grad_norm_var": 0.048726399739583336, "learning_rate": 0.0001, "loss": 5.9993, "loss/crossentropy": 2.73794686794281, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17730379849672318, "step": 16012 }, { "epoch": 0.5004375, "grad_norm": 3.375, "grad_norm_var": 0.04503580729166667, "learning_rate": 0.0001, "loss": 5.8427, "loss/crossentropy": 2.6063257455825806, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17285971343517303, "step": 16014 }, { "epoch": 0.5005, "grad_norm": 3.15625, "grad_norm_var": 0.08115234375, "learning_rate": 0.0001, "loss": 5.8164, "loss/crossentropy": 2.5140546560287476, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17984529584646225, "step": 16016 }, { "epoch": 0.5005625, "grad_norm": 7.5625, "grad_norm_var": 1.1968912760416666, "learning_rate": 0.0001, "loss": 6.0672, "loss/crossentropy": 2.604338765144348, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.19550367444753647, "step": 16018 }, { "epoch": 0.500625, "grad_norm": 3.671875, "grad_norm_var": 1.1863444010416666, "learning_rate": 0.0001, "loss": 5.759, "loss/crossentropy": 2.476668357849121, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1766715720295906, "step": 16020 }, { "epoch": 0.5006875, "grad_norm": 3.296875, "grad_norm_var": 1.1625162760416667, "learning_rate": 0.0001, "loss": 6.1761, "loss/crossentropy": 2.6990877389907837, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.19144807755947113, "step": 16022 }, { "epoch": 0.50075, "grad_norm": 3.140625, "grad_norm_var": 1.1527506510416667, "learning_rate": 0.0001, "loss": 5.9818, "loss/crossentropy": 2.6940040588378906, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1791740357875824, "step": 16024 }, { "epoch": 0.5008125, "grad_norm": 3.3125, "grad_norm_var": 1.1218251546223958, "learning_rate": 0.0001, "loss": 6.1611, "loss/crossentropy": 2.806161403656006, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18314571678638458, "step": 16026 }, { "epoch": 0.500875, "grad_norm": 3.046875, "grad_norm_var": 1.1304921468098958, "learning_rate": 0.0001, "loss": 5.9907, "loss/crossentropy": 2.7044265270233154, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17706524580717087, "step": 16028 }, { "epoch": 0.5009375, "grad_norm": 3.0625, "grad_norm_var": 1.1634999593098958, "learning_rate": 0.0001, "loss": 5.6703, "loss/crossentropy": 2.5244998931884766, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16379740089178085, "step": 16030 }, { "epoch": 0.501, "grad_norm": 4.90625, "grad_norm_var": 1.232494099934896, "learning_rate": 0.0001, "loss": 6.0419, "loss/crossentropy": 2.602308511734009, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18927372992038727, "step": 16032 }, { "epoch": 0.5010625, "grad_norm": 3.375, "grad_norm_var": 0.19861551920572917, "learning_rate": 0.0001, "loss": 5.9137, "loss/crossentropy": 2.627810001373291, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1770292967557907, "step": 16034 }, { "epoch": 0.501125, "grad_norm": 3.078125, "grad_norm_var": 0.2118072509765625, "learning_rate": 0.0001, "loss": 5.4875, "loss/crossentropy": 2.417228937149048, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1597566306591034, "step": 16036 }, { "epoch": 0.5011875, "grad_norm": 2.984375, "grad_norm_var": 0.2126129150390625, "learning_rate": 0.0001, "loss": 5.6537, "loss/crossentropy": 2.5580382347106934, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16151735931634903, "step": 16038 }, { "epoch": 0.50125, "grad_norm": 3.03125, "grad_norm_var": 0.20921223958333332, "learning_rate": 0.0001, "loss": 5.3003, "loss/crossentropy": 2.281244397163391, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1538594588637352, "step": 16040 }, { "epoch": 0.5013125, "grad_norm": 3.625, "grad_norm_var": 1.855304972330729, "learning_rate": 0.0001, "loss": 6.4356, "loss/crossentropy": 2.6569602489471436, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.2173164114356041, "step": 16042 }, { "epoch": 0.501375, "grad_norm": 3.640625, "grad_norm_var": 1.8310292561848958, "learning_rate": 0.0001, "loss": 6.0169, "loss/crossentropy": 2.6723943948745728, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18327917158603668, "step": 16044 }, { "epoch": 0.5014375, "grad_norm": 3.359375, "grad_norm_var": 1.8175944010416667, "learning_rate": 0.0001, "loss": 6.2204, "loss/crossentropy": 2.9426517486572266, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1765984445810318, "step": 16046 }, { "epoch": 0.5015, "grad_norm": 3.40625, "grad_norm_var": 1.7128733317057292, "learning_rate": 0.0001, "loss": 5.8617, "loss/crossentropy": 2.5231932401657104, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17994019389152527, "step": 16048 }, { "epoch": 0.5015625, "grad_norm": 3.359375, "grad_norm_var": 1.7375396728515624, "learning_rate": 0.0001, "loss": 5.6578, "loss/crossentropy": 2.5435420274734497, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16767632216215134, "step": 16050 }, { "epoch": 0.501625, "grad_norm": 3.078125, "grad_norm_var": 1.7257232666015625, "learning_rate": 0.0001, "loss": 5.7299, "loss/crossentropy": 2.5690709352493286, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16999147087335587, "step": 16052 }, { "epoch": 0.5016875, "grad_norm": 3.734375, "grad_norm_var": 1.7012115478515626, "learning_rate": 0.0001, "loss": 5.9279, "loss/crossentropy": 2.4675475358963013, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18900148570537567, "step": 16054 }, { "epoch": 0.50175, "grad_norm": 3.390625, "grad_norm_var": 1.6786946614583333, "learning_rate": 0.0001, "loss": 5.8318, "loss/crossentropy": 2.571017861366272, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17919930070638657, "step": 16056 }, { "epoch": 0.5018125, "grad_norm": 3.234375, "grad_norm_var": 0.047998046875, "learning_rate": 0.0001, "loss": 5.297, "loss/crossentropy": 2.2698299884796143, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16052843630313873, "step": 16058 }, { "epoch": 0.501875, "grad_norm": 3.171875, "grad_norm_var": 0.051512654622395834, "learning_rate": 0.0001, "loss": 5.459, "loss/crossentropy": 2.4018373489379883, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16001027822494507, "step": 16060 }, { "epoch": 0.5019375, "grad_norm": 3.21875, "grad_norm_var": 0.05315755208333333, "learning_rate": 0.0001, "loss": 5.6102, "loss/crossentropy": 2.4685033559799194, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1688537895679474, "step": 16062 }, { "epoch": 0.502, "grad_norm": 3.171875, "grad_norm_var": 0.04537353515625, "learning_rate": 0.0001, "loss": 5.9411, "loss/crossentropy": 2.6571015119552612, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17761804163455963, "step": 16064 }, { "epoch": 0.5020625, "grad_norm": 2.953125, "grad_norm_var": 0.04302978515625, "learning_rate": 0.0001, "loss": 5.4804, "loss/crossentropy": 2.413459300994873, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16138406842947006, "step": 16066 }, { "epoch": 0.502125, "grad_norm": 2.953125, "grad_norm_var": 0.043309529622395836, "learning_rate": 0.0001, "loss": 5.562, "loss/crossentropy": 2.447237491607666, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16498787701129913, "step": 16068 }, { "epoch": 0.5021875, "grad_norm": 3.265625, "grad_norm_var": 0.019977823893229166, "learning_rate": 0.0001, "loss": 5.8691, "loss/crossentropy": 2.575567126274109, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1789635792374611, "step": 16070 }, { "epoch": 0.50225, "grad_norm": 3.65625, "grad_norm_var": 0.03778889973958333, "learning_rate": 0.0001, "loss": 5.7838, "loss/crossentropy": 2.5296674966812134, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17462927103042603, "step": 16072 }, { "epoch": 0.5023125, "grad_norm": 3.375, "grad_norm_var": 0.042561848958333336, "learning_rate": 0.0001, "loss": 5.8879, "loss/crossentropy": 2.6592808961868286, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17403249442577362, "step": 16074 }, { "epoch": 0.502375, "grad_norm": 3.03125, "grad_norm_var": 0.03680013020833333, "learning_rate": 0.0001, "loss": 5.9504, "loss/crossentropy": 2.6783300638198853, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17564692348241806, "step": 16076 }, { "epoch": 0.5024375, "grad_norm": 3.171875, "grad_norm_var": 0.03975321451822917, "learning_rate": 0.0001, "loss": 5.6857, "loss/crossentropy": 2.4860860109329224, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1676216870546341, "step": 16078 }, { "epoch": 0.5025, "grad_norm": 3.21875, "grad_norm_var": 0.039876302083333336, "learning_rate": 0.0001, "loss": 5.7396, "loss/crossentropy": 2.6158945560455322, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1647173911333084, "step": 16080 }, { "epoch": 0.5025625, "grad_norm": 2.90625, "grad_norm_var": 0.045042928059895834, "learning_rate": 0.0001, "loss": 5.8268, "loss/crossentropy": 2.5639734268188477, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1774589642882347, "step": 16082 }, { "epoch": 0.502625, "grad_norm": 3.234375, "grad_norm_var": 0.064697265625, "learning_rate": 0.0001, "loss": 5.4842, "loss/crossentropy": 2.3451225757598877, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16429540514945984, "step": 16084 }, { "epoch": 0.5026875, "grad_norm": 3.21875, "grad_norm_var": 0.06734110514322916, "learning_rate": 0.0001, "loss": 5.8722, "loss/crossentropy": 2.6091232299804688, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17552156001329422, "step": 16086 }, { "epoch": 0.50275, "grad_norm": 3.078125, "grad_norm_var": 0.07672119140625, "learning_rate": 0.0001, "loss": 5.5526, "loss/crossentropy": 2.481239438056946, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15948085486888885, "step": 16088 }, { "epoch": 0.5028125, "grad_norm": 3.15625, "grad_norm_var": 0.07282613118489584, "learning_rate": 0.0001, "loss": 5.8227, "loss/crossentropy": 2.5843098163604736, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1730574518442154, "step": 16090 }, { "epoch": 0.502875, "grad_norm": 3.125, "grad_norm_var": 0.0718170166015625, "learning_rate": 0.0001, "loss": 5.8152, "loss/crossentropy": 2.566161870956421, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17646662145853043, "step": 16092 }, { "epoch": 0.5029375, "grad_norm": 2.921875, "grad_norm_var": 0.07415364583333334, "learning_rate": 0.0001, "loss": 5.346, "loss/crossentropy": 2.282988429069519, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15864654630422592, "step": 16094 }, { "epoch": 0.503, "grad_norm": 3.671875, "grad_norm_var": 0.09406636555989584, "learning_rate": 0.0001, "loss": 5.6663, "loss/crossentropy": 2.429767608642578, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17170406877994537, "step": 16096 }, { "epoch": 0.5030625, "grad_norm": 3.4375, "grad_norm_var": 0.09127197265625, "learning_rate": 0.0001, "loss": 5.8715, "loss/crossentropy": 2.6788880825042725, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17121805995702744, "step": 16098 }, { "epoch": 0.503125, "grad_norm": 3.15625, "grad_norm_var": 0.06845703125, "learning_rate": 0.0001, "loss": 5.3718, "loss/crossentropy": 2.3410524129867554, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16010761260986328, "step": 16100 }, { "epoch": 0.5031875, "grad_norm": 2.734375, "grad_norm_var": 0.07044169108072916, "learning_rate": 0.0001, "loss": 5.3351, "loss/crossentropy": 2.381006360054016, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15399903804063797, "step": 16102 }, { "epoch": 0.50325, "grad_norm": 2.890625, "grad_norm_var": 0.06923421223958333, "learning_rate": 0.0001, "loss": 5.5096, "loss/crossentropy": 2.404729962348938, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.15736467391252518, "step": 16104 }, { "epoch": 0.5033125, "grad_norm": 3.3125, "grad_norm_var": 0.07305399576822917, "learning_rate": 0.0001, "loss": 6.1712, "loss/crossentropy": 2.8202677965164185, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17923591285943985, "step": 16106 }, { "epoch": 0.503375, "grad_norm": 3.375, "grad_norm_var": 0.0749664306640625, "learning_rate": 0.0001, "loss": 6.0594, "loss/crossentropy": 2.7203365564346313, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18273458629846573, "step": 16108 }, { "epoch": 0.5034375, "grad_norm": 3.15625, "grad_norm_var": 0.0721099853515625, "learning_rate": 0.0001, "loss": 5.546, "loss/crossentropy": 2.3975898027420044, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1636715978384018, "step": 16110 }, { "epoch": 0.5035, "grad_norm": 3.75, "grad_norm_var": 0.07496337890625, "learning_rate": 0.0001, "loss": 5.8336, "loss/crossentropy": 2.59959077835083, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16871649026870728, "step": 16112 }, { "epoch": 0.5035625, "grad_norm": 3.078125, "grad_norm_var": 0.06832682291666667, "learning_rate": 0.0001, "loss": 5.6162, "loss/crossentropy": 2.442840099334717, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16733119636774063, "step": 16114 }, { "epoch": 0.503625, "grad_norm": 3.140625, "grad_norm_var": 0.05947265625, "learning_rate": 0.0001, "loss": 5.7089, "loss/crossentropy": 2.5029762983322144, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1733296662569046, "step": 16116 }, { "epoch": 0.5036875, "grad_norm": 2.890625, "grad_norm_var": 0.0436187744140625, "learning_rate": 0.0001, "loss": 5.5841, "loss/crossentropy": 2.5070003271102905, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16161343455314636, "step": 16118 }, { "epoch": 0.50375, "grad_norm": 3.234375, "grad_norm_var": 0.03753255208333333, "learning_rate": 0.0001, "loss": 5.943, "loss/crossentropy": 2.713122248649597, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17103470861911774, "step": 16120 }, { "epoch": 0.5038125, "grad_norm": 3.140625, "grad_norm_var": 0.03677978515625, "learning_rate": 0.0001, "loss": 5.6773, "loss/crossentropy": 2.4745407104492188, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1722284033894539, "step": 16122 }, { "epoch": 0.503875, "grad_norm": 2.890625, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 5.5866, "loss/crossentropy": 2.469696044921875, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1632496565580368, "step": 16124 }, { "epoch": 0.5039375, "grad_norm": 2.9375, "grad_norm_var": 0.0474273681640625, "learning_rate": 0.0001, "loss": 5.536, "loss/crossentropy": 2.4524052143096924, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1622677817940712, "step": 16126 }, { "epoch": 0.504, "grad_norm": 3.21875, "grad_norm_var": 0.022347005208333333, "learning_rate": 0.0001, "loss": 5.8276, "loss/crossentropy": 2.58161997795105, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17655570805072784, "step": 16128 }, { "epoch": 0.5040625, "grad_norm": 3.203125, "grad_norm_var": 0.0209625244140625, "learning_rate": 0.0001, "loss": 5.6437, "loss/crossentropy": 2.453909993171692, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16937338560819626, "step": 16130 }, { "epoch": 0.504125, "grad_norm": 3.0, "grad_norm_var": 0.021675618489583333, "learning_rate": 0.0001, "loss": 5.7802, "loss/crossentropy": 2.6307448148727417, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1672848016023636, "step": 16132 }, { "epoch": 0.5041875, "grad_norm": 3.171875, "grad_norm_var": 0.018944295247395833, "learning_rate": 0.0001, "loss": 5.4908, "loss/crossentropy": 2.3591588735580444, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1639464944601059, "step": 16134 }, { "epoch": 0.50425, "grad_norm": 3.109375, "grad_norm_var": 0.021198527018229166, "learning_rate": 0.0001, "loss": 5.7267, "loss/crossentropy": 2.5237497091293335, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17303133010864258, "step": 16136 }, { "epoch": 0.5043125, "grad_norm": 3.375, "grad_norm_var": 0.024332682291666668, "learning_rate": 0.0001, "loss": 5.936, "loss/crossentropy": 2.6650710105895996, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17983216047286987, "step": 16138 }, { "epoch": 0.504375, "grad_norm": 3.171875, "grad_norm_var": 0.014094034830729166, "learning_rate": 0.0001, "loss": 5.7133, "loss/crossentropy": 2.5225549936294556, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16829188913106918, "step": 16140 }, { "epoch": 0.5044375, "grad_norm": 3.3125, "grad_norm_var": 0.011970011393229167, "learning_rate": 0.0001, "loss": 5.9035, "loss/crossentropy": 2.6384806632995605, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1757199615240097, "step": 16142 }, { "epoch": 0.5045, "grad_norm": 3.328125, "grad_norm_var": 0.013590494791666666, "learning_rate": 0.0001, "loss": 6.0729, "loss/crossentropy": 2.7575117349624634, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1749001070857048, "step": 16144 }, { "epoch": 0.5045625, "grad_norm": 3.3125, "grad_norm_var": 0.016120402018229167, "learning_rate": 0.0001, "loss": 6.0004, "loss/crossentropy": 2.742537498474121, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17773668467998505, "step": 16146 }, { "epoch": 0.504625, "grad_norm": 3.046875, "grad_norm_var": 0.0156158447265625, "learning_rate": 0.0001, "loss": 5.4671, "loss/crossentropy": 2.3550872802734375, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16588659584522247, "step": 16148 }, { "epoch": 0.5046875, "grad_norm": 3.140625, "grad_norm_var": 0.014069620768229167, "learning_rate": 0.0001, "loss": 5.8368, "loss/crossentropy": 2.667310953140259, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1716412603855133, "step": 16150 }, { "epoch": 0.50475, "grad_norm": 3.046875, "grad_norm_var": 0.0194732666015625, "learning_rate": 0.0001, "loss": 5.8639, "loss/crossentropy": 2.605831503868103, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17034223675727844, "step": 16152 }, { "epoch": 0.5048125, "grad_norm": 3.15625, "grad_norm_var": 0.018745930989583333, "learning_rate": 0.0001, "loss": 5.7254, "loss/crossentropy": 2.5661193132400513, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16670657694339752, "step": 16154 }, { "epoch": 0.504875, "grad_norm": 3.28125, "grad_norm_var": 0.0747955322265625, "learning_rate": 0.0001, "loss": 5.4511, "loss/crossentropy": 2.3625839948654175, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15963409841060638, "step": 16156 }, { "epoch": 0.5049375, "grad_norm": 3.453125, "grad_norm_var": 0.07726236979166666, "learning_rate": 0.0001, "loss": 5.7982, "loss/crossentropy": 2.584004282951355, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17376431822776794, "step": 16158 }, { "epoch": 0.505, "grad_norm": 3.03125, "grad_norm_var": 0.08076070149739584, "learning_rate": 0.0001, "loss": 5.7408, "loss/crossentropy": 2.628161072731018, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1651678830385208, "step": 16160 }, { "epoch": 0.5050625, "grad_norm": 2.984375, "grad_norm_var": 0.08547261555989584, "learning_rate": 0.0001, "loss": 6.0497, "loss/crossentropy": 2.759259819984436, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1786520555615425, "step": 16162 }, { "epoch": 0.505125, "grad_norm": 2.890625, "grad_norm_var": 0.0950347900390625, "learning_rate": 0.0001, "loss": 5.7571, "loss/crossentropy": 2.6068798303604126, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16697701811790466, "step": 16164 }, { "epoch": 0.5051875, "grad_norm": 3.375, "grad_norm_var": 0.09643452962239583, "learning_rate": 0.0001, "loss": 5.7515, "loss/crossentropy": 2.4896483421325684, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17736003547906876, "step": 16166 }, { "epoch": 0.50525, "grad_norm": 2.859375, "grad_norm_var": 0.09853413899739584, "learning_rate": 0.0001, "loss": 5.7608, "loss/crossentropy": 2.5751978158950806, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1693381741642952, "step": 16168 }, { "epoch": 0.5053125, "grad_norm": 2.953125, "grad_norm_var": 0.10435282389322917, "learning_rate": 0.0001, "loss": 5.7404, "loss/crossentropy": 2.627490282058716, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16598249971866608, "step": 16170 }, { "epoch": 0.505375, "grad_norm": 2.96875, "grad_norm_var": 0.03892822265625, "learning_rate": 0.0001, "loss": 5.3349, "loss/crossentropy": 2.360503673553467, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1556451991200447, "step": 16172 }, { "epoch": 0.5054375, "grad_norm": 2.921875, "grad_norm_var": 0.032225545247395834, "learning_rate": 0.0001, "loss": 5.1259, "loss/crossentropy": 2.2279410362243652, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.14838526397943497, "step": 16174 }, { "epoch": 0.5055, "grad_norm": 3.15625, "grad_norm_var": 0.0465240478515625, "learning_rate": 0.0001, "loss": 5.7103, "loss/crossentropy": 2.4705700874328613, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17162708193063736, "step": 16176 }, { "epoch": 0.5055625, "grad_norm": 3.046875, "grad_norm_var": 0.039713541666666664, "learning_rate": 0.0001, "loss": 5.7458, "loss/crossentropy": 2.562490940093994, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.170286163687706, "step": 16178 }, { "epoch": 0.505625, "grad_norm": 4.28125, "grad_norm_var": 0.12094624837239583, "learning_rate": 0.0001, "loss": 6.1504, "loss/crossentropy": 2.6295450925827026, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.19856493920087814, "step": 16180 }, { "epoch": 0.5056875, "grad_norm": 3.5625, "grad_norm_var": 0.12541402180989583, "learning_rate": 0.0001, "loss": 6.0624, "loss/crossentropy": 2.729319453239441, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18253156542778015, "step": 16182 }, { "epoch": 0.50575, "grad_norm": 3.328125, "grad_norm_var": 0.11858622233072917, "learning_rate": 0.0001, "loss": 5.6087, "loss/crossentropy": 2.4301421642303467, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17059285938739777, "step": 16184 }, { "epoch": 0.5058125, "grad_norm": 3.203125, "grad_norm_var": 0.1106597900390625, "learning_rate": 0.0001, "loss": 5.8512, "loss/crossentropy": 2.5526453256607056, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1767268180847168, "step": 16186 }, { "epoch": 0.505875, "grad_norm": 3.0625, "grad_norm_var": 0.11020406087239583, "learning_rate": 0.0001, "loss": 5.6925, "loss/crossentropy": 2.5582053661346436, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1665545403957367, "step": 16188 }, { "epoch": 0.5059375, "grad_norm": 3.078125, "grad_norm_var": 0.10537109375, "learning_rate": 0.0001, "loss": 5.6352, "loss/crossentropy": 2.477321147918701, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16696136444807053, "step": 16190 }, { "epoch": 0.506, "grad_norm": 3.265625, "grad_norm_var": 0.11269429524739584, "learning_rate": 0.0001, "loss": 6.1459, "loss/crossentropy": 2.7620056867599487, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18604668229818344, "step": 16192 }, { "epoch": 0.5060625, "grad_norm": 3.234375, "grad_norm_var": 0.11366780598958333, "learning_rate": 0.0001, "loss": 5.8388, "loss/crossentropy": 2.644774913787842, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17174355685710907, "step": 16194 }, { "epoch": 0.506125, "grad_norm": 3.0, "grad_norm_var": 0.044733683268229164, "learning_rate": 0.0001, "loss": 5.802, "loss/crossentropy": 2.6588616371154785, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16900236904621124, "step": 16196 }, { "epoch": 0.5061875, "grad_norm": 3.0625, "grad_norm_var": 0.03515218098958333, "learning_rate": 0.0001, "loss": 5.7907, "loss/crossentropy": 2.557256817817688, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17295359075069427, "step": 16198 }, { "epoch": 0.50625, "grad_norm": 3.0625, "grad_norm_var": 0.06510416666666667, "learning_rate": 0.0001, "loss": 6.0178, "loss/crossentropy": 2.6430050134658813, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.18162298202514648, "step": 16200 }, { "epoch": 0.5063125, "grad_norm": 3.046875, "grad_norm_var": 0.06559956868489583, "learning_rate": 0.0001, "loss": 5.839, "loss/crossentropy": 2.69626522064209, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16622426360845566, "step": 16202 }, { "epoch": 0.506375, "grad_norm": 3.328125, "grad_norm_var": 0.06236063639322917, "learning_rate": 0.0001, "loss": 5.6734, "loss/crossentropy": 2.517215371131897, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1671859174966812, "step": 16204 }, { "epoch": 0.5064375, "grad_norm": 3.34375, "grad_norm_var": 0.06251627604166667, "learning_rate": 0.0001, "loss": 5.733, "loss/crossentropy": 2.5226190090179443, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1726020723581314, "step": 16206 }, { "epoch": 0.5065, "grad_norm": 3.0, "grad_norm_var": 0.04901936848958333, "learning_rate": 0.0001, "loss": 5.6748, "loss/crossentropy": 2.5151621103286743, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16869834065437317, "step": 16208 }, { "epoch": 0.5065625, "grad_norm": 3.09375, "grad_norm_var": 0.0471832275390625, "learning_rate": 0.0001, "loss": 5.7949, "loss/crossentropy": 2.5869085788726807, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1696242317557335, "step": 16210 }, { "epoch": 0.506625, "grad_norm": 2.875, "grad_norm_var": 0.052392578125, "learning_rate": 0.0001, "loss": 6.018, "loss/crossentropy": 2.7579660415649414, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17404796928167343, "step": 16212 }, { "epoch": 0.5066875, "grad_norm": 3.21875, "grad_norm_var": 0.0533111572265625, "learning_rate": 0.0001, "loss": 5.3567, "loss/crossentropy": 2.330523729324341, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15456832945346832, "step": 16214 }, { "epoch": 0.50675, "grad_norm": 3.203125, "grad_norm_var": 0.0234771728515625, "learning_rate": 0.0001, "loss": 5.8171, "loss/crossentropy": 2.6442378759384155, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16806329041719437, "step": 16216 }, { "epoch": 0.5068125, "grad_norm": 3.21875, "grad_norm_var": 0.023908487955729165, "learning_rate": 0.0001, "loss": 5.834, "loss/crossentropy": 2.5568329095840454, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.18006138503551483, "step": 16218 }, { "epoch": 0.506875, "grad_norm": 3.234375, "grad_norm_var": 0.0215728759765625, "learning_rate": 0.0001, "loss": 5.9564, "loss/crossentropy": 2.6573452949523926, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18029828369617462, "step": 16220 }, { "epoch": 0.5069375, "grad_norm": 3.171875, "grad_norm_var": 0.023844401041666668, "learning_rate": 0.0001, "loss": 5.593, "loss/crossentropy": 2.477585196495056, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1638849675655365, "step": 16222 }, { "epoch": 0.507, "grad_norm": 3.046875, "grad_norm_var": 0.0238922119140625, "learning_rate": 0.0001, "loss": 5.7869, "loss/crossentropy": 2.584786057472229, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1690370962023735, "step": 16224 }, { "epoch": 0.5070625, "grad_norm": 3.375, "grad_norm_var": 0.025462849934895834, "learning_rate": 0.0001, "loss": 6.0407, "loss/crossentropy": 2.7082419395446777, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1797330230474472, "step": 16226 }, { "epoch": 0.507125, "grad_norm": 3.390625, "grad_norm_var": 0.019527180989583334, "learning_rate": 0.0001, "loss": 5.6953, "loss/crossentropy": 2.4971712827682495, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17020606994628906, "step": 16228 }, { "epoch": 0.5071875, "grad_norm": 3.3125, "grad_norm_var": 0.017964680989583332, "learning_rate": 0.0001, "loss": 5.7703, "loss/crossentropy": 2.5960510969161987, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16898278892040253, "step": 16230 }, { "epoch": 0.50725, "grad_norm": 3.84375, "grad_norm_var": 0.048046875, "learning_rate": 0.0001, "loss": 5.6682, "loss/crossentropy": 2.416340708732605, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.16854875534772873, "step": 16232 }, { "epoch": 0.5073125, "grad_norm": 3.203125, "grad_norm_var": 0.04859619140625, "learning_rate": 0.0001, "loss": 5.8382, "loss/crossentropy": 2.6066319942474365, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17589052021503448, "step": 16234 }, { "epoch": 0.507375, "grad_norm": 2.859375, "grad_norm_var": 0.07297770182291667, "learning_rate": 0.0001, "loss": 5.7389, "loss/crossentropy": 2.500886917114258, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17418906837701797, "step": 16236 }, { "epoch": 0.5074375, "grad_norm": 3.015625, "grad_norm_var": 0.06868489583333333, "learning_rate": 0.0001, "loss": 5.6729, "loss/crossentropy": 2.4839404821395874, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16733496636152267, "step": 16238 }, { "epoch": 0.5075, "grad_norm": 3.125, "grad_norm_var": 0.06521809895833333, "learning_rate": 0.0001, "loss": 5.6014, "loss/crossentropy": 2.413141131401062, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1699993684887886, "step": 16240 }, { "epoch": 0.5075625, "grad_norm": 3.140625, "grad_norm_var": 0.06571858723958333, "learning_rate": 0.0001, "loss": 5.5904, "loss/crossentropy": 2.5172938108444214, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16277482360601425, "step": 16242 }, { "epoch": 0.507625, "grad_norm": 3.375, "grad_norm_var": 0.06553446451822917, "learning_rate": 0.0001, "loss": 5.5444, "loss/crossentropy": 2.3586833477020264, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16466347128152847, "step": 16244 }, { "epoch": 0.5076875, "grad_norm": 3.21875, "grad_norm_var": 0.06585184733072917, "learning_rate": 0.0001, "loss": 5.8779, "loss/crossentropy": 2.6398749351501465, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17497644573450089, "step": 16246 }, { "epoch": 0.50775, "grad_norm": 2.890625, "grad_norm_var": 0.04908447265625, "learning_rate": 0.0001, "loss": 5.7304, "loss/crossentropy": 2.6413955688476562, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1628054976463318, "step": 16248 }, { "epoch": 0.5078125, "grad_norm": 3.84375, "grad_norm_var": 0.0750885009765625, "learning_rate": 0.0001, "loss": 5.7429, "loss/crossentropy": 2.519925117492676, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1715191975235939, "step": 16250 }, { "epoch": 0.507875, "grad_norm": 3.0625, "grad_norm_var": 0.0544921875, "learning_rate": 0.0001, "loss": 5.3402, "loss/crossentropy": 2.3426631689071655, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15404711663722992, "step": 16252 }, { "epoch": 0.5079375, "grad_norm": 3.3125, "grad_norm_var": 0.0565826416015625, "learning_rate": 0.0001, "loss": 5.7011, "loss/crossentropy": 2.489269971847534, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1746997907757759, "step": 16254 }, { "epoch": 0.508, "grad_norm": 3.578125, "grad_norm_var": 0.06616923014322916, "learning_rate": 0.0001, "loss": 5.5958, "loss/crossentropy": 2.401709198951721, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16979826986789703, "step": 16256 }, { "epoch": 0.5080625, "grad_norm": 3.546875, "grad_norm_var": 0.07195536295572917, "learning_rate": 0.0001, "loss": 5.8205, "loss/crossentropy": 2.554845094680786, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17617210000753403, "step": 16258 }, { "epoch": 0.508125, "grad_norm": 3.046875, "grad_norm_var": 0.0723541259765625, "learning_rate": 0.0001, "loss": 5.5145, "loss/crossentropy": 2.38235604763031, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16712379455566406, "step": 16260 }, { "epoch": 0.5081875, "grad_norm": 3.375, "grad_norm_var": 0.07903544108072917, "learning_rate": 0.0001, "loss": 5.5439, "loss/crossentropy": 2.417195439338684, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1654004082083702, "step": 16262 }, { "epoch": 0.50825, "grad_norm": 2.984375, "grad_norm_var": 0.06752827962239584, "learning_rate": 0.0001, "loss": 5.3385, "loss/crossentropy": 2.28506600856781, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1596416011452675, "step": 16264 }, { "epoch": 0.5083125, "grad_norm": 3.234375, "grad_norm_var": 0.04104410807291667, "learning_rate": 0.0001, "loss": 5.9388, "loss/crossentropy": 2.6258240938186646, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1805119439959526, "step": 16266 }, { "epoch": 0.508375, "grad_norm": 3.984375, "grad_norm_var": 0.07288004557291666, "learning_rate": 0.0001, "loss": 5.7033, "loss/crossentropy": 2.517643094062805, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16778413206338882, "step": 16268 }, { "epoch": 0.5084375, "grad_norm": 3.125, "grad_norm_var": 0.08831380208333334, "learning_rate": 0.0001, "loss": 5.906, "loss/crossentropy": 2.567445397377014, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18346509337425232, "step": 16270 }, { "epoch": 0.5085, "grad_norm": 3.546875, "grad_norm_var": 0.08516337076822916, "learning_rate": 0.0001, "loss": 6.2019, "loss/crossentropy": 2.753313660621643, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1901737079024315, "step": 16272 }, { "epoch": 0.5085625, "grad_norm": 3.21875, "grad_norm_var": 0.08279520670572917, "learning_rate": 0.0001, "loss": 5.8652, "loss/crossentropy": 2.6811606884002686, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17191822826862335, "step": 16274 }, { "epoch": 0.508625, "grad_norm": 3.0625, "grad_norm_var": 0.08159891764322917, "learning_rate": 0.0001, "loss": 5.9666, "loss/crossentropy": 2.694961190223694, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17715920507907867, "step": 16276 }, { "epoch": 0.5086875, "grad_norm": 3.546875, "grad_norm_var": 0.0837310791015625, "learning_rate": 0.0001, "loss": 5.9456, "loss/crossentropy": 2.748932719230652, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17005664110183716, "step": 16278 }, { "epoch": 0.50875, "grad_norm": 3.09375, "grad_norm_var": 0.07838541666666667, "learning_rate": 0.0001, "loss": 5.7193, "loss/crossentropy": 2.5305447578430176, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17122021317481995, "step": 16280 }, { "epoch": 0.5088125, "grad_norm": 3.1875, "grad_norm_var": 0.084375, "learning_rate": 0.0001, "loss": 5.5414, "loss/crossentropy": 2.4110606908798218, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16459402441978455, "step": 16282 }, { "epoch": 0.508875, "grad_norm": 3.328125, "grad_norm_var": 0.048859659830729166, "learning_rate": 0.0001, "loss": 5.8926, "loss/crossentropy": 2.5525397062301636, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1793188825249672, "step": 16284 }, { "epoch": 0.5089375, "grad_norm": 3.359375, "grad_norm_var": 0.032145182291666664, "learning_rate": 0.0001, "loss": 5.8081, "loss/crossentropy": 2.6179057359695435, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16901445388793945, "step": 16286 }, { "epoch": 0.509, "grad_norm": 3.078125, "grad_norm_var": 0.028058878580729165, "learning_rate": 0.0001, "loss": 5.7786, "loss/crossentropy": 2.521502375602722, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17531976848840714, "step": 16288 }, { "epoch": 0.5090625, "grad_norm": 3.171875, "grad_norm_var": 0.034684244791666666, "learning_rate": 0.0001, "loss": 5.9084, "loss/crossentropy": 2.615760326385498, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17809134721755981, "step": 16290 }, { "epoch": 0.509125, "grad_norm": 2.90625, "grad_norm_var": 0.04004618326822917, "learning_rate": 0.0001, "loss": 5.8047, "loss/crossentropy": 2.584061026573181, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1716773360967636, "step": 16292 }, { "epoch": 0.5091875, "grad_norm": 3.28125, "grad_norm_var": 0.03495992024739583, "learning_rate": 0.0001, "loss": 5.5849, "loss/crossentropy": 2.4501447677612305, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1603526473045349, "step": 16294 }, { "epoch": 0.50925, "grad_norm": 3.234375, "grad_norm_var": 0.03400777180989583, "learning_rate": 0.0001, "loss": 5.839, "loss/crossentropy": 2.6217808723449707, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17289509624242783, "step": 16296 }, { "epoch": 0.5093125, "grad_norm": 3.125, "grad_norm_var": 0.030745442708333334, "learning_rate": 0.0001, "loss": 5.7943, "loss/crossentropy": 2.6022061109542847, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16647806018590927, "step": 16298 }, { "epoch": 0.509375, "grad_norm": 3.453125, "grad_norm_var": 0.033951822916666666, "learning_rate": 0.0001, "loss": 5.6414, "loss/crossentropy": 2.5401326417922974, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16363850235939026, "step": 16300 }, { "epoch": 0.5094375, "grad_norm": 3.34375, "grad_norm_var": 0.033690388997395834, "learning_rate": 0.0001, "loss": 5.5292, "loss/crossentropy": 2.426029086112976, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16265857219696045, "step": 16302 }, { "epoch": 0.5095, "grad_norm": 3.140625, "grad_norm_var": 0.028662109375, "learning_rate": 0.0001, "loss": 5.5658, "loss/crossentropy": 2.4209630489349365, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1695614457130432, "step": 16304 }, { "epoch": 0.5095625, "grad_norm": 3.0, "grad_norm_var": 0.022542317708333332, "learning_rate": 0.0001, "loss": 5.6216, "loss/crossentropy": 2.4654128551483154, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16601119190454483, "step": 16306 }, { "epoch": 0.509625, "grad_norm": 3.28125, "grad_norm_var": 0.017215983072916666, "learning_rate": 0.0001, "loss": 5.6048, "loss/crossentropy": 2.4579662084579468, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16468004882335663, "step": 16308 }, { "epoch": 0.5096875, "grad_norm": 3.015625, "grad_norm_var": 0.014142862955729167, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 2.6069202423095703, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1703835353255272, "step": 16310 }, { "epoch": 0.50975, "grad_norm": 3.265625, "grad_norm_var": 0.014241536458333334, "learning_rate": 0.0001, "loss": 5.9887, "loss/crossentropy": 2.771964192390442, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1712872013449669, "step": 16312 }, { "epoch": 0.5098125, "grad_norm": 3.25, "grad_norm_var": 0.0157623291015625, "learning_rate": 0.0001, "loss": 5.5799, "loss/crossentropy": 2.3990046977996826, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17160802334547043, "step": 16314 }, { "epoch": 0.509875, "grad_norm": 3.46875, "grad_norm_var": 0.015165201822916667, "learning_rate": 0.0001, "loss": 5.7934, "loss/crossentropy": 2.5308748483657837, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17312680184841156, "step": 16316 }, { "epoch": 0.5099375, "grad_norm": 3.34375, "grad_norm_var": 0.016112263997395834, "learning_rate": 0.0001, "loss": 5.8915, "loss/crossentropy": 2.6198123693466187, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17560896277427673, "step": 16318 }, { "epoch": 0.51, "grad_norm": 2.984375, "grad_norm_var": 0.028425089518229165, "learning_rate": 0.0001, "loss": 5.6471, "loss/crossentropy": 2.49073326587677, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1640724316239357, "step": 16320 }, { "epoch": 0.5100625, "grad_norm": 3.265625, "grad_norm_var": 0.02587890625, "learning_rate": 0.0001, "loss": 5.9094, "loss/crossentropy": 2.6102280616760254, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1783544421195984, "step": 16322 }, { "epoch": 0.510125, "grad_norm": 2.96875, "grad_norm_var": 0.030826822916666666, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.6985751390457153, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17245478183031082, "step": 16324 }, { "epoch": 0.5101875, "grad_norm": 3.34375, "grad_norm_var": 0.0293121337890625, "learning_rate": 0.0001, "loss": 5.574, "loss/crossentropy": 2.410964608192444, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16630808264017105, "step": 16326 }, { "epoch": 0.51025, "grad_norm": 2.921875, "grad_norm_var": 0.034764607747395836, "learning_rate": 0.0001, "loss": 5.5465, "loss/crossentropy": 2.4809629917144775, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1628047674894333, "step": 16328 }, { "epoch": 0.5103125, "grad_norm": 3.515625, "grad_norm_var": 0.043000284830729166, "learning_rate": 0.0001, "loss": 5.8497, "loss/crossentropy": 2.472634196281433, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.18106115609407425, "step": 16330 }, { "epoch": 0.510375, "grad_norm": 3.078125, "grad_norm_var": 0.050324503580729166, "learning_rate": 0.0001, "loss": 5.3682, "loss/crossentropy": 2.3815958499908447, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1572549194097519, "step": 16332 }, { "epoch": 0.5104375, "grad_norm": 3.296875, "grad_norm_var": 0.052652994791666664, "learning_rate": 0.0001, "loss": 5.8791, "loss/crossentropy": 2.515404224395752, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1816837266087532, "step": 16334 }, { "epoch": 0.5105, "grad_norm": 3.1875, "grad_norm_var": 0.0408111572265625, "learning_rate": 0.0001, "loss": 5.6377, "loss/crossentropy": 2.485397219657898, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16796302050352097, "step": 16336 }, { "epoch": 0.5105625, "grad_norm": 3.109375, "grad_norm_var": 0.040576171875, "learning_rate": 0.0001, "loss": 5.6999, "loss/crossentropy": 2.5654290914535522, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16579565405845642, "step": 16338 }, { "epoch": 0.510625, "grad_norm": 3.125, "grad_norm_var": 0.03681538899739583, "learning_rate": 0.0001, "loss": 5.9165, "loss/crossentropy": 2.7041677236557007, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17357587069272995, "step": 16340 }, { "epoch": 0.5106875, "grad_norm": 3.203125, "grad_norm_var": 0.03600260416666667, "learning_rate": 0.0001, "loss": 5.8067, "loss/crossentropy": 2.5463136434555054, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1740807741880417, "step": 16342 }, { "epoch": 0.51075, "grad_norm": 2.96875, "grad_norm_var": 0.03497721354166667, "learning_rate": 0.0001, "loss": 5.6408, "loss/crossentropy": 2.499354839324951, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16765828430652618, "step": 16344 }, { "epoch": 0.5108125, "grad_norm": 3.078125, "grad_norm_var": 0.08043619791666666, "learning_rate": 0.0001, "loss": 6.1104, "loss/crossentropy": 2.729995846748352, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1845257505774498, "step": 16346 }, { "epoch": 0.510875, "grad_norm": 3.171875, "grad_norm_var": 0.07063395182291667, "learning_rate": 0.0001, "loss": 5.9884, "loss/crossentropy": 2.6871135234832764, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17856331914663315, "step": 16348 }, { "epoch": 0.5109375, "grad_norm": 3.53125, "grad_norm_var": 0.07433980305989583, "learning_rate": 0.0001, "loss": 6.2738, "loss/crossentropy": 2.9303488731384277, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18083380907773972, "step": 16350 }, { "epoch": 0.511, "grad_norm": 3.203125, "grad_norm_var": 0.08007405598958334, "learning_rate": 0.0001, "loss": 5.7575, "loss/crossentropy": 2.6469838619232178, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16339480131864548, "step": 16352 }, { "epoch": 0.5110625, "grad_norm": 3.1875, "grad_norm_var": 0.07778218587239584, "learning_rate": 0.0001, "loss": 5.962, "loss/crossentropy": 2.700568914413452, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1769229918718338, "step": 16354 }, { "epoch": 0.511125, "grad_norm": 2.859375, "grad_norm_var": 0.08749593098958333, "learning_rate": 0.0001, "loss": 5.657, "loss/crossentropy": 2.526244282722473, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16658874601125717, "step": 16356 }, { "epoch": 0.5111875, "grad_norm": 3.0, "grad_norm_var": 0.08982747395833333, "learning_rate": 0.0001, "loss": 5.4606, "loss/crossentropy": 2.337547183036804, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16191042214632034, "step": 16358 }, { "epoch": 0.51125, "grad_norm": 3.203125, "grad_norm_var": 0.09379781087239583, "learning_rate": 0.0001, "loss": 5.814, "loss/crossentropy": 2.6115304231643677, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16829267889261246, "step": 16360 }, { "epoch": 0.5113125, "grad_norm": 3.28125, "grad_norm_var": 0.04244791666666667, "learning_rate": 0.0001, "loss": 5.8401, "loss/crossentropy": 2.6677355766296387, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16997096687555313, "step": 16362 }, { "epoch": 0.511375, "grad_norm": 3.4375, "grad_norm_var": 0.04417215983072917, "learning_rate": 0.0001, "loss": 5.9794, "loss/crossentropy": 2.653276562690735, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18105435371398926, "step": 16364 }, { "epoch": 0.5114375, "grad_norm": 3.203125, "grad_norm_var": 0.03186442057291667, "learning_rate": 0.0001, "loss": 5.9588, "loss/crossentropy": 2.6859607696533203, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.177672877907753, "step": 16366 }, { "epoch": 0.5115, "grad_norm": 2.96875, "grad_norm_var": 0.03181864420572917, "learning_rate": 0.0001, "loss": 5.5616, "loss/crossentropy": 2.3682419061660767, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1720716878771782, "step": 16368 }, { "epoch": 0.5115625, "grad_norm": 3.015625, "grad_norm_var": 0.030549112955729166, "learning_rate": 0.0001, "loss": 5.9132, "loss/crossentropy": 2.689664125442505, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1719593182206154, "step": 16370 }, { "epoch": 0.511625, "grad_norm": 3.21875, "grad_norm_var": 0.027457682291666667, "learning_rate": 0.0001, "loss": 5.1835, "loss/crossentropy": 2.225640296936035, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15359832346439362, "step": 16372 }, { "epoch": 0.5116875, "grad_norm": 3.03125, "grad_norm_var": 0.034235636393229164, "learning_rate": 0.0001, "loss": 5.9472, "loss/crossentropy": 2.6530286073684692, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1759035363793373, "step": 16374 }, { "epoch": 0.51175, "grad_norm": 3.25, "grad_norm_var": 0.0293853759765625, "learning_rate": 0.0001, "loss": 5.7623, "loss/crossentropy": 2.4916436672210693, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17745761573314667, "step": 16376 }, { "epoch": 0.5118125, "grad_norm": 2.984375, "grad_norm_var": 0.026463826497395832, "learning_rate": 0.0001, "loss": 5.8484, "loss/crossentropy": 2.694764733314514, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1645837500691414, "step": 16378 }, { "epoch": 0.511875, "grad_norm": 3.234375, "grad_norm_var": 0.022847493489583332, "learning_rate": 0.0001, "loss": 5.9255, "loss/crossentropy": 2.6192431449890137, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17945070564746857, "step": 16380 }, { "epoch": 0.5119375, "grad_norm": 3.109375, "grad_norm_var": 0.0229156494140625, "learning_rate": 0.0001, "loss": 5.9855, "loss/crossentropy": 2.7059956789016724, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1771649792790413, "step": 16382 }, { "epoch": 0.512, "grad_norm": 2.9375, "grad_norm_var": 0.022688802083333334, "learning_rate": 0.0001, "loss": 5.6971, "loss/crossentropy": 2.569319248199463, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1659058779478073, "step": 16384 }, { "epoch": 0.5120625, "grad_norm": 2.890625, "grad_norm_var": 0.025341796875, "learning_rate": 0.0001, "loss": 5.7879, "loss/crossentropy": 2.6588436365127563, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16564378887414932, "step": 16386 }, { "epoch": 0.512125, "grad_norm": 3.09375, "grad_norm_var": 0.024507649739583335, "learning_rate": 0.0001, "loss": 5.4127, "loss/crossentropy": 2.313828706741333, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16105583310127258, "step": 16388 }, { "epoch": 0.5121875, "grad_norm": 3.25, "grad_norm_var": 0.0187896728515625, "learning_rate": 0.0001, "loss": 6.0885, "loss/crossentropy": 2.743655562400818, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1844824254512787, "step": 16390 }, { "epoch": 0.51225, "grad_norm": 3.046875, "grad_norm_var": 0.018115234375, "learning_rate": 0.0001, "loss": 5.7395, "loss/crossentropy": 2.5687350034713745, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16511885076761246, "step": 16392 }, { "epoch": 0.5123125, "grad_norm": 3.375, "grad_norm_var": 0.020308430989583334, "learning_rate": 0.0001, "loss": 5.714, "loss/crossentropy": 2.506616234779358, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16917835175991058, "step": 16394 }, { "epoch": 0.512375, "grad_norm": 3.0, "grad_norm_var": 0.0195953369140625, "learning_rate": 0.0001, "loss": 5.8435, "loss/crossentropy": 2.6784114837646484, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.169635571539402, "step": 16396 }, { "epoch": 0.5124375, "grad_norm": 2.875, "grad_norm_var": 0.0249664306640625, "learning_rate": 0.0001, "loss": 5.5176, "loss/crossentropy": 2.4275470972061157, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16134928166866302, "step": 16398 }, { "epoch": 0.5125, "grad_norm": 3.40625, "grad_norm_var": 0.033447265625, "learning_rate": 0.0001, "loss": 5.7455, "loss/crossentropy": 2.469146490097046, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17255249619483948, "step": 16400 }, { "epoch": 0.5125625, "grad_norm": 3.09375, "grad_norm_var": 0.029645792643229165, "learning_rate": 0.0001, "loss": 5.749, "loss/crossentropy": 2.5529110431671143, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16999481618404388, "step": 16402 }, { "epoch": 0.512625, "grad_norm": 3.390625, "grad_norm_var": 0.030924479166666668, "learning_rate": 0.0001, "loss": 5.6468, "loss/crossentropy": 2.4244545698165894, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16988955438137054, "step": 16404 }, { "epoch": 0.5126875, "grad_norm": 3.671875, "grad_norm_var": 0.04410400390625, "learning_rate": 0.0001, "loss": 5.8922, "loss/crossentropy": 2.6311975717544556, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17922505736351013, "step": 16406 }, { "epoch": 0.51275, "grad_norm": 3.0625, "grad_norm_var": 0.0494140625, "learning_rate": 0.0001, "loss": 5.3777, "loss/crossentropy": 2.3718478679656982, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15527284890413284, "step": 16408 }, { "epoch": 0.5128125, "grad_norm": 3.078125, "grad_norm_var": 0.047684733072916666, "learning_rate": 0.0001, "loss": 5.9091, "loss/crossentropy": 2.7013646364212036, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17506567388772964, "step": 16410 }, { "epoch": 0.512875, "grad_norm": 3.0625, "grad_norm_var": 0.046662394205729166, "learning_rate": 0.0001, "loss": 5.9451, "loss/crossentropy": 2.7101930379867554, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1734907478094101, "step": 16412 }, { "epoch": 0.5129375, "grad_norm": 3.28125, "grad_norm_var": 0.04045817057291667, "learning_rate": 0.0001, "loss": 5.7463, "loss/crossentropy": 2.600379467010498, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16654404252767563, "step": 16414 }, { "epoch": 0.513, "grad_norm": 3.0, "grad_norm_var": 0.035008748372395836, "learning_rate": 0.0001, "loss": 5.331, "loss/crossentropy": 2.3208197355270386, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15649104118347168, "step": 16416 }, { "epoch": 0.5130625, "grad_norm": 4.3125, "grad_norm_var": 0.11590169270833334, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.4566595554351807, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16372348368167877, "step": 16418 }, { "epoch": 0.513125, "grad_norm": 3.0, "grad_norm_var": 0.11842447916666667, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.640213131904602, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1663249060511589, "step": 16420 }, { "epoch": 0.5131875, "grad_norm": 3.21875, "grad_norm_var": 0.11492513020833334, "learning_rate": 0.0001, "loss": 6.2495, "loss/crossentropy": 2.9241329431533813, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.18409772962331772, "step": 16422 }, { "epoch": 0.51325, "grad_norm": 3.25, "grad_norm_var": 0.10549214680989584, "learning_rate": 0.0001, "loss": 5.7499, "loss/crossentropy": 2.5274248123168945, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17497973889112473, "step": 16424 }, { "epoch": 0.5133125, "grad_norm": 3.265625, "grad_norm_var": 0.10448811848958334, "learning_rate": 0.0001, "loss": 5.734, "loss/crossentropy": 2.4651782512664795, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17453866451978683, "step": 16426 }, { "epoch": 0.513375, "grad_norm": 3.046875, "grad_norm_var": 0.2105133056640625, "learning_rate": 0.0001, "loss": 5.7713, "loss/crossentropy": 2.5959800481796265, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.167924702167511, "step": 16428 }, { "epoch": 0.5134375, "grad_norm": 3.015625, "grad_norm_var": 0.209814453125, "learning_rate": 0.0001, "loss": 5.6805, "loss/crossentropy": 2.50624418258667, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16859985888004303, "step": 16430 }, { "epoch": 0.5135, "grad_norm": 3.1875, "grad_norm_var": 0.20103251139322917, "learning_rate": 0.0001, "loss": 5.8464, "loss/crossentropy": 2.643206238746643, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17188185453414917, "step": 16432 }, { "epoch": 0.5135625, "grad_norm": 2.953125, "grad_norm_var": 0.1538238525390625, "learning_rate": 0.0001, "loss": 5.4847, "loss/crossentropy": 2.4751126766204834, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15759844332933426, "step": 16434 }, { "epoch": 0.513625, "grad_norm": 2.796875, "grad_norm_var": 0.16464742024739584, "learning_rate": 0.0001, "loss": 5.5015, "loss/crossentropy": 2.44723117351532, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1585516333580017, "step": 16436 }, { "epoch": 0.5136875, "grad_norm": 3.390625, "grad_norm_var": 0.16023763020833334, "learning_rate": 0.0001, "loss": 5.8765, "loss/crossentropy": 2.7064484357833862, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16895991563796997, "step": 16438 }, { "epoch": 0.51375, "grad_norm": 3.109375, "grad_norm_var": 0.16331380208333332, "learning_rate": 0.0001, "loss": 5.7598, "loss/crossentropy": 2.588004469871521, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16834890842437744, "step": 16440 }, { "epoch": 0.5138125, "grad_norm": 3.296875, "grad_norm_var": 0.16309305826822917, "learning_rate": 0.0001, "loss": 5.699, "loss/crossentropy": 2.5069788694381714, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1656866893172264, "step": 16442 }, { "epoch": 0.513875, "grad_norm": 3.125, "grad_norm_var": 0.0258697509765625, "learning_rate": 0.0001, "loss": 5.9672, "loss/crossentropy": 2.789791703224182, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16930486261844635, "step": 16444 }, { "epoch": 0.5139375, "grad_norm": 3.140625, "grad_norm_var": 0.0309967041015625, "learning_rate": 0.0001, "loss": 5.55, "loss/crossentropy": 2.3457083702087402, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16652101278305054, "step": 16446 }, { "epoch": 0.514, "grad_norm": 2.953125, "grad_norm_var": 0.0341796875, "learning_rate": 0.0001, "loss": 5.8116, "loss/crossentropy": 2.6447486877441406, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17020239681005478, "step": 16448 }, { "epoch": 0.5140625, "grad_norm": 3.421875, "grad_norm_var": 0.03357747395833333, "learning_rate": 0.0001, "loss": 5.6166, "loss/crossentropy": 2.412296772003174, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1692551001906395, "step": 16450 }, { "epoch": 0.514125, "grad_norm": 3.015625, "grad_norm_var": 0.027437337239583335, "learning_rate": 0.0001, "loss": 5.6995, "loss/crossentropy": 2.474939227104187, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17284496128559113, "step": 16452 }, { "epoch": 0.5141875, "grad_norm": 2.953125, "grad_norm_var": 0.0287261962890625, "learning_rate": 0.0001, "loss": 5.7438, "loss/crossentropy": 2.4855151176452637, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17583084106445312, "step": 16454 }, { "epoch": 0.51425, "grad_norm": 3.171875, "grad_norm_var": 0.028327433268229167, "learning_rate": 0.0001, "loss": 5.8503, "loss/crossentropy": 2.6569454669952393, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.174027219414711, "step": 16456 }, { "epoch": 0.5143125, "grad_norm": 3.734375, "grad_norm_var": 0.04450581868489583, "learning_rate": 0.0001, "loss": 5.8813, "loss/crossentropy": 2.5460811853408813, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17805617302656174, "step": 16458 }, { "epoch": 0.514375, "grad_norm": 3.140625, "grad_norm_var": 0.0438629150390625, "learning_rate": 0.0001, "loss": 5.8218, "loss/crossentropy": 2.6440343856811523, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17051490396261215, "step": 16460 }, { "epoch": 0.5144375, "grad_norm": 3.125, "grad_norm_var": 0.04116109212239583, "learning_rate": 0.0001, "loss": 5.5685, "loss/crossentropy": 2.430554509162903, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16965046525001526, "step": 16462 }, { "epoch": 0.5145, "grad_norm": 3.078125, "grad_norm_var": 0.0409088134765625, "learning_rate": 0.0001, "loss": 5.8054, "loss/crossentropy": 2.6475865840911865, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.167339488863945, "step": 16464 }, { "epoch": 0.5145625, "grad_norm": 3.171875, "grad_norm_var": 0.0390045166015625, "learning_rate": 0.0001, "loss": 5.7273, "loss/crossentropy": 2.575814366340637, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16905974596738815, "step": 16466 }, { "epoch": 0.514625, "grad_norm": 3.203125, "grad_norm_var": 0.035054524739583336, "learning_rate": 0.0001, "loss": 5.819, "loss/crossentropy": 2.6294543743133545, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17012812942266464, "step": 16468 }, { "epoch": 0.5146875, "grad_norm": 2.921875, "grad_norm_var": 0.03357645670572917, "learning_rate": 0.0001, "loss": 5.7404, "loss/crossentropy": 2.5590656995773315, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17203693836927414, "step": 16470 }, { "epoch": 0.51475, "grad_norm": 3.3125, "grad_norm_var": 0.03512369791666667, "learning_rate": 0.0001, "loss": 5.9409, "loss/crossentropy": 2.639412760734558, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17820008099079132, "step": 16472 }, { "epoch": 0.5148125, "grad_norm": 2.984375, "grad_norm_var": 0.016552734375, "learning_rate": 0.0001, "loss": 5.6727, "loss/crossentropy": 2.60652494430542, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16208262741565704, "step": 16474 }, { "epoch": 0.514875, "grad_norm": 3.296875, "grad_norm_var": 0.020113118489583335, "learning_rate": 0.0001, "loss": 5.7368, "loss/crossentropy": 2.5230515003204346, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1713699847459793, "step": 16476 }, { "epoch": 0.5149375, "grad_norm": 3.15625, "grad_norm_var": 0.0196685791015625, "learning_rate": 0.0001, "loss": 6.1404, "loss/crossentropy": 2.8043954372406006, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18165170401334763, "step": 16478 }, { "epoch": 0.515, "grad_norm": 3.0, "grad_norm_var": 0.019627888997395832, "learning_rate": 0.0001, "loss": 5.7294, "loss/crossentropy": 2.607026219367981, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1661418303847313, "step": 16480 }, { "epoch": 0.5150625, "grad_norm": 3.203125, "grad_norm_var": 0.025288899739583332, "learning_rate": 0.0001, "loss": 5.8784, "loss/crossentropy": 2.613901972770691, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17645075917243958, "step": 16482 }, { "epoch": 0.515125, "grad_norm": 2.84375, "grad_norm_var": 0.030696614583333334, "learning_rate": 0.0001, "loss": 5.8019, "loss/crossentropy": 2.6880619525909424, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16450658440589905, "step": 16484 }, { "epoch": 0.5151875, "grad_norm": 3.15625, "grad_norm_var": 0.027782185872395834, "learning_rate": 0.0001, "loss": 5.6577, "loss/crossentropy": 2.528021812438965, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1657046154141426, "step": 16486 }, { "epoch": 0.51525, "grad_norm": 2.96875, "grad_norm_var": 0.027099609375, "learning_rate": 0.0001, "loss": 5.7887, "loss/crossentropy": 2.5553163290023804, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17450721561908722, "step": 16488 }, { "epoch": 0.5153125, "grad_norm": 3.0625, "grad_norm_var": 0.0278472900390625, "learning_rate": 0.0001, "loss": 5.9336, "loss/crossentropy": 2.7285076379776, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1708962619304657, "step": 16490 }, { "epoch": 0.515375, "grad_norm": 3.40625, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 5.5904, "loss/crossentropy": 2.3793158531188965, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17071735113859177, "step": 16492 }, { "epoch": 0.5154375, "grad_norm": 2.953125, "grad_norm_var": 0.039404296875, "learning_rate": 0.0001, "loss": 5.7508, "loss/crossentropy": 2.6006277799606323, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16697286069393158, "step": 16494 }, { "epoch": 0.5155, "grad_norm": 3.109375, "grad_norm_var": 0.039118448893229164, "learning_rate": 0.0001, "loss": 5.6516, "loss/crossentropy": 2.445363163948059, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16827547550201416, "step": 16496 }, { "epoch": 0.5155625, "grad_norm": 3.15625, "grad_norm_var": 0.037775675455729164, "learning_rate": 0.0001, "loss": 5.6302, "loss/crossentropy": 2.4687918424606323, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16887445747852325, "step": 16498 }, { "epoch": 0.515625, "grad_norm": 3.53125, "grad_norm_var": 0.2933878580729167, "learning_rate": 0.0001, "loss": 6.2691, "loss/crossentropy": 2.774771213531494, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.1924007385969162, "step": 16500 }, { "epoch": 0.5156875, "grad_norm": 3.421875, "grad_norm_var": 0.44541727701822914, "learning_rate": 0.0001, "loss": 6.2772, "loss/crossentropy": 2.7692209482192993, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19377101957798004, "step": 16502 }, { "epoch": 0.51575, "grad_norm": 3.40625, "grad_norm_var": 0.4248860677083333, "learning_rate": 0.0001, "loss": 5.8903, "loss/crossentropy": 2.6010122299194336, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17775394767522812, "step": 16504 }, { "epoch": 0.5158125, "grad_norm": 2.859375, "grad_norm_var": 0.44664306640625, "learning_rate": 0.0001, "loss": 5.7192, "loss/crossentropy": 2.5332034826278687, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16977231949567795, "step": 16506 }, { "epoch": 0.515875, "grad_norm": 3.09375, "grad_norm_var": 0.45901285807291664, "learning_rate": 0.0001, "loss": 5.5482, "loss/crossentropy": 2.365523934364319, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1690482795238495, "step": 16508 }, { "epoch": 0.5159375, "grad_norm": 3.328125, "grad_norm_var": 111.96536356608073, "learning_rate": 0.0001, "loss": 6.7191, "loss/crossentropy": 2.786882162094116, "loss/hidden": 1.6640625, "loss/jsd": 0.0, "loss/logits": 0.22681649774312973, "step": 16510 }, { "epoch": 0.516, "grad_norm": 3.09375, "grad_norm_var": 112.07008056640625, "learning_rate": 0.0001, "loss": 6.0112, "loss/crossentropy": 2.7462023496627808, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17689534276723862, "step": 16512 }, { "epoch": 0.5160625, "grad_norm": 3.234375, "grad_norm_var": 112.233447265625, "learning_rate": 0.0001, "loss": 5.6887, "loss/crossentropy": 2.5591888427734375, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16490060836076736, "step": 16514 }, { "epoch": 0.516125, "grad_norm": 3.109375, "grad_norm_var": 112.88609110514322, "learning_rate": 0.0001, "loss": 6.0241, "loss/crossentropy": 2.7814793586730957, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17504484206438065, "step": 16516 }, { "epoch": 0.5161875, "grad_norm": 3.046875, "grad_norm_var": 113.45001525878907, "learning_rate": 0.0001, "loss": 5.9697, "loss/crossentropy": 2.6906793117523193, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17555953562259674, "step": 16518 }, { "epoch": 0.51625, "grad_norm": 3.0, "grad_norm_var": 113.58023986816406, "learning_rate": 0.0001, "loss": 5.9051, "loss/crossentropy": 2.697845458984375, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17345809936523438, "step": 16520 }, { "epoch": 0.5163125, "grad_norm": 3.0625, "grad_norm_var": 113.61848958333333, "learning_rate": 0.0001, "loss": 5.4842, "loss/crossentropy": 2.4003665447235107, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1622883379459381, "step": 16522 }, { "epoch": 0.516375, "grad_norm": 3.03125, "grad_norm_var": 113.70017801920572, "learning_rate": 0.0001, "loss": 6.0258, "loss/crossentropy": 2.810461163520813, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17153704166412354, "step": 16524 }, { "epoch": 0.5164375, "grad_norm": 3.21875, "grad_norm_var": 0.0197662353515625, "learning_rate": 0.0001, "loss": 5.4931, "loss/crossentropy": 2.37578809261322, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16485276818275452, "step": 16526 }, { "epoch": 0.5165, "grad_norm": 3.171875, "grad_norm_var": 0.020905558268229166, "learning_rate": 0.0001, "loss": 5.5587, "loss/crossentropy": 2.4451950788497925, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1636950746178627, "step": 16528 }, { "epoch": 0.5165625, "grad_norm": 2.953125, "grad_norm_var": 0.018082682291666666, "learning_rate": 0.0001, "loss": 5.6793, "loss/crossentropy": 2.548762559890747, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16149181127548218, "step": 16530 }, { "epoch": 0.516625, "grad_norm": 3.078125, "grad_norm_var": 0.018049112955729165, "learning_rate": 0.0001, "loss": 5.7365, "loss/crossentropy": 2.585257411003113, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16629429906606674, "step": 16532 }, { "epoch": 0.5166875, "grad_norm": 2.96875, "grad_norm_var": 0.0184722900390625, "learning_rate": 0.0001, "loss": 5.5841, "loss/crossentropy": 2.5218945741653442, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16012202203273773, "step": 16534 }, { "epoch": 0.51675, "grad_norm": 3.34375, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 5.6886, "loss/crossentropy": 2.5698297023773193, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16695934534072876, "step": 16536 }, { "epoch": 0.5168125, "grad_norm": 3.0625, "grad_norm_var": 0.020783487955729166, "learning_rate": 0.0001, "loss": 5.8641, "loss/crossentropy": 2.625547170639038, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.173465296626091, "step": 16538 }, { "epoch": 0.516875, "grad_norm": 3.390625, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 5.9813, "loss/crossentropy": 2.717432975769043, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17677276581525803, "step": 16540 }, { "epoch": 0.5169375, "grad_norm": 3.203125, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 5.7959, "loss/crossentropy": 2.5902841091156006, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16938591748476028, "step": 16542 }, { "epoch": 0.517, "grad_norm": 3.109375, "grad_norm_var": 0.05287984212239583, "learning_rate": 0.0001, "loss": 6.015, "loss/crossentropy": 2.6680736541748047, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18039973080158234, "step": 16544 }, { "epoch": 0.5170625, "grad_norm": 2.984375, "grad_norm_var": 0.05230712890625, "learning_rate": 0.0001, "loss": 5.6888, "loss/crossentropy": 2.6125682592391968, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16308893263339996, "step": 16546 }, { "epoch": 0.517125, "grad_norm": 3.125, "grad_norm_var": 0.0503814697265625, "learning_rate": 0.0001, "loss": 5.532, "loss/crossentropy": 2.4145188331604004, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16877657175064087, "step": 16548 }, { "epoch": 0.5171875, "grad_norm": 3.125, "grad_norm_var": 0.045361328125, "learning_rate": 0.0001, "loss": 5.7908, "loss/crossentropy": 2.5912692546844482, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17034027725458145, "step": 16550 }, { "epoch": 0.51725, "grad_norm": 3.09375, "grad_norm_var": 0.04692281087239583, "learning_rate": 0.0001, "loss": 5.7964, "loss/crossentropy": 2.607316017150879, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17281020432710648, "step": 16552 }, { "epoch": 0.5173125, "grad_norm": 3.015625, "grad_norm_var": 0.045638020833333334, "learning_rate": 0.0001, "loss": 5.7138, "loss/crossentropy": 2.5501874685287476, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16597293317317963, "step": 16554 }, { "epoch": 0.517375, "grad_norm": 2.796875, "grad_norm_var": 0.052083333333333336, "learning_rate": 0.0001, "loss": 5.7712, "loss/crossentropy": 2.677625060081482, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.162484772503376, "step": 16556 }, { "epoch": 0.5174375, "grad_norm": 3.6875, "grad_norm_var": 0.07293192545572917, "learning_rate": 0.0001, "loss": 5.9425, "loss/crossentropy": 2.6705286502838135, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1748487800359726, "step": 16558 }, { "epoch": 0.5175, "grad_norm": 2.984375, "grad_norm_var": 0.03701070149739583, "learning_rate": 0.0001, "loss": 5.6548, "loss/crossentropy": 2.520450472831726, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1681206300854683, "step": 16560 }, { "epoch": 0.5175625, "grad_norm": 3.1875, "grad_norm_var": 0.040201822916666664, "learning_rate": 0.0001, "loss": 5.8293, "loss/crossentropy": 2.5363374948501587, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17616698145866394, "step": 16562 }, { "epoch": 0.517625, "grad_norm": 3.015625, "grad_norm_var": 0.04423828125, "learning_rate": 0.0001, "loss": 5.6035, "loss/crossentropy": 2.4566866159439087, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16233892738819122, "step": 16564 }, { "epoch": 0.5176875, "grad_norm": 3.171875, "grad_norm_var": 0.04421284993489583, "learning_rate": 0.0001, "loss": 5.8993, "loss/crossentropy": 2.6745909452438354, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17051292955875397, "step": 16566 }, { "epoch": 0.51775, "grad_norm": 5.4375, "grad_norm_var": 0.37661844889322915, "learning_rate": 0.0001, "loss": 6.1697, "loss/crossentropy": 2.627561092376709, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1995277926325798, "step": 16568 }, { "epoch": 0.5178125, "grad_norm": 3.203125, "grad_norm_var": 0.38728739420572916, "learning_rate": 0.0001, "loss": 5.5743, "loss/crossentropy": 2.4943583011627197, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16424017399549484, "step": 16570 }, { "epoch": 0.517875, "grad_norm": 3.3125, "grad_norm_var": 0.3738352457682292, "learning_rate": 0.0001, "loss": 6.0507, "loss/crossentropy": 2.6343544721603394, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1869424656033516, "step": 16572 }, { "epoch": 0.5179375, "grad_norm": 3.484375, "grad_norm_var": 0.3589680989583333, "learning_rate": 0.0001, "loss": 5.8843, "loss/crossentropy": 2.6096925735473633, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1758989542722702, "step": 16574 }, { "epoch": 0.518, "grad_norm": 3.9375, "grad_norm_var": 24.849951171875, "learning_rate": 0.0001, "loss": 6.2974, "loss/crossentropy": 2.713488817214966, "loss/hidden": 1.59375, "loss/jsd": 0.0, "loss/logits": 0.19901321083307266, "step": 16576 }, { "epoch": 0.5180625, "grad_norm": 3.375, "grad_norm_var": 24.806192016601564, "learning_rate": 0.0001, "loss": 5.9657, "loss/crossentropy": 2.6843067407608032, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17735348641872406, "step": 16578 }, { "epoch": 0.518125, "grad_norm": 3.234375, "grad_norm_var": 24.819852701822917, "learning_rate": 0.0001, "loss": 5.4799, "loss/crossentropy": 2.402770161628723, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15810411423444748, "step": 16580 }, { "epoch": 0.5181875, "grad_norm": 3.71875, "grad_norm_var": 24.695731608072915, "learning_rate": 0.0001, "loss": 5.8796, "loss/crossentropy": 2.625036835670471, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17506562173366547, "step": 16582 }, { "epoch": 0.51825, "grad_norm": 2.953125, "grad_norm_var": 24.863863118489583, "learning_rate": 0.0001, "loss": 5.6303, "loss/crossentropy": 2.524322748184204, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16176536679267883, "step": 16584 }, { "epoch": 0.5183125, "grad_norm": 3.125, "grad_norm_var": 24.795340983072915, "learning_rate": 0.0001, "loss": 5.6144, "loss/crossentropy": 2.438259243965149, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17034796625375748, "step": 16586 }, { "epoch": 0.518375, "grad_norm": 2.921875, "grad_norm_var": 25.035846964518228, "learning_rate": 0.0001, "loss": 5.2784, "loss/crossentropy": 2.328306198120117, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15125726163387299, "step": 16588 }, { "epoch": 0.5184375, "grad_norm": 2.96875, "grad_norm_var": 25.15708719889323, "learning_rate": 0.0001, "loss": 5.8799, "loss/crossentropy": 2.670527696609497, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17250019311904907, "step": 16590 }, { "epoch": 0.5185, "grad_norm": 3.125, "grad_norm_var": 0.05158589680989583, "learning_rate": 0.0001, "loss": 5.8831, "loss/crossentropy": 2.6243882179260254, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17665047943592072, "step": 16592 }, { "epoch": 0.5185625, "grad_norm": 3.265625, "grad_norm_var": 0.06926676432291666, "learning_rate": 0.0001, "loss": 5.6799, "loss/crossentropy": 2.4813830852508545, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16868449747562408, "step": 16594 }, { "epoch": 0.518625, "grad_norm": 3.296875, "grad_norm_var": 0.06985270182291667, "learning_rate": 0.0001, "loss": 5.6264, "loss/crossentropy": 2.480758786201477, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16495513916015625, "step": 16596 }, { "epoch": 0.5186875, "grad_norm": 3.125, "grad_norm_var": 0.050658162434895834, "learning_rate": 0.0001, "loss": 5.8854, "loss/crossentropy": 2.6669927835464478, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17223212867975235, "step": 16598 }, { "epoch": 0.51875, "grad_norm": 2.875, "grad_norm_var": 0.049267578125, "learning_rate": 0.0001, "loss": 5.5787, "loss/crossentropy": 2.5182470083236694, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16229919344186783, "step": 16600 }, { "epoch": 0.5188125, "grad_norm": 3.171875, "grad_norm_var": 0.050614420572916666, "learning_rate": 0.0001, "loss": 5.6494, "loss/crossentropy": 2.543129086494446, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1645297035574913, "step": 16602 }, { "epoch": 0.518875, "grad_norm": 3.34375, "grad_norm_var": 0.04119466145833333, "learning_rate": 0.0001, "loss": 5.7711, "loss/crossentropy": 2.5774060487747192, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17054499685764313, "step": 16604 }, { "epoch": 0.5189375, "grad_norm": 3.40625, "grad_norm_var": 0.0501373291015625, "learning_rate": 0.0001, "loss": 6.0801, "loss/crossentropy": 2.738303065299988, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18144899606704712, "step": 16606 }, { "epoch": 0.519, "grad_norm": 3.15625, "grad_norm_var": 0.05136311848958333, "learning_rate": 0.0001, "loss": 5.7269, "loss/crossentropy": 2.525917887687683, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1700976938009262, "step": 16608 }, { "epoch": 0.5190625, "grad_norm": 3.046875, "grad_norm_var": 0.0331695556640625, "learning_rate": 0.0001, "loss": 5.8912, "loss/crossentropy": 2.6941726207733154, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1736096292734146, "step": 16610 }, { "epoch": 0.519125, "grad_norm": 3.484375, "grad_norm_var": 0.0389801025390625, "learning_rate": 0.0001, "loss": 5.8374, "loss/crossentropy": 2.6021909713745117, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17352460324764252, "step": 16612 }, { "epoch": 0.5191875, "grad_norm": 3.1875, "grad_norm_var": 0.03847554524739583, "learning_rate": 0.0001, "loss": 5.8745, "loss/crossentropy": 2.602517247200012, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1783716008067131, "step": 16614 }, { "epoch": 0.51925, "grad_norm": 3.09375, "grad_norm_var": 0.03681538899739583, "learning_rate": 0.0001, "loss": 5.9364, "loss/crossentropy": 2.758184313774109, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16938883066177368, "step": 16616 }, { "epoch": 0.5193125, "grad_norm": 3.203125, "grad_norm_var": 0.03352762858072917, "learning_rate": 0.0001, "loss": 6.0031, "loss/crossentropy": 2.6977959871292114, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1813122108578682, "step": 16618 }, { "epoch": 0.519375, "grad_norm": 2.828125, "grad_norm_var": 0.04378153483072917, "learning_rate": 0.0001, "loss": 5.5473, "loss/crossentropy": 2.456685423851013, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16179471462965012, "step": 16620 }, { "epoch": 0.5194375, "grad_norm": 3.265625, "grad_norm_var": 0.032242838541666666, "learning_rate": 0.0001, "loss": 5.5945, "loss/crossentropy": 2.4643516540527344, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16379916667938232, "step": 16622 }, { "epoch": 0.5195, "grad_norm": 3.609375, "grad_norm_var": 0.05373942057291667, "learning_rate": 0.0001, "loss": 5.9985, "loss/crossentropy": 2.665659546852112, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18016141653060913, "step": 16624 }, { "epoch": 0.5195625, "grad_norm": 3.328125, "grad_norm_var": 0.054352823893229166, "learning_rate": 0.0001, "loss": 5.7673, "loss/crossentropy": 2.585843801498413, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1697111800312996, "step": 16626 }, { "epoch": 0.519625, "grad_norm": 3.09375, "grad_norm_var": 0.05849609375, "learning_rate": 0.0001, "loss": 5.9058, "loss/crossentropy": 2.567280411720276, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17682481557130814, "step": 16628 }, { "epoch": 0.5196875, "grad_norm": 3.421875, "grad_norm_var": 0.0635406494140625, "learning_rate": 0.0001, "loss": 6.013, "loss/crossentropy": 2.672019124031067, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18214056640863419, "step": 16630 }, { "epoch": 0.51975, "grad_norm": 2.90625, "grad_norm_var": 0.06884358723958334, "learning_rate": 0.0001, "loss": 5.392, "loss/crossentropy": 2.3738889694213867, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15923285484313965, "step": 16632 }, { "epoch": 0.5198125, "grad_norm": 3.21875, "grad_norm_var": 0.0807525634765625, "learning_rate": 0.0001, "loss": 5.65, "loss/crossentropy": 2.488644242286682, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16886603087186813, "step": 16634 }, { "epoch": 0.519875, "grad_norm": 3.203125, "grad_norm_var": 0.06968994140625, "learning_rate": 0.0001, "loss": 5.7624, "loss/crossentropy": 2.5380845069885254, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1736041009426117, "step": 16636 }, { "epoch": 0.5199375, "grad_norm": 3.359375, "grad_norm_var": 0.07164306640625, "learning_rate": 0.0001, "loss": 5.7852, "loss/crossentropy": 2.5422178506851196, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1727387011051178, "step": 16638 }, { "epoch": 0.52, "grad_norm": 3.078125, "grad_norm_var": 0.075830078125, "learning_rate": 0.0001, "loss": 5.6866, "loss/crossentropy": 2.5098516941070557, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16923755407333374, "step": 16640 }, { "epoch": 0.5200625, "grad_norm": 3.359375, "grad_norm_var": 0.07096354166666667, "learning_rate": 0.0001, "loss": 5.7984, "loss/crossentropy": 2.5298749208450317, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17372490465641022, "step": 16642 }, { "epoch": 0.520125, "grad_norm": 3.671875, "grad_norm_var": 0.07226155598958334, "learning_rate": 0.0001, "loss": 5.6965, "loss/crossentropy": 2.485423445701599, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17150159925222397, "step": 16644 }, { "epoch": 0.5201875, "grad_norm": 3.34375, "grad_norm_var": 0.06940816243489584, "learning_rate": 0.0001, "loss": 5.8126, "loss/crossentropy": 2.626734972000122, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17132389545440674, "step": 16646 }, { "epoch": 0.52025, "grad_norm": 3.0, "grad_norm_var": 0.057738240559895834, "learning_rate": 0.0001, "loss": 5.7702, "loss/crossentropy": 2.619166374206543, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1650991588830948, "step": 16648 }, { "epoch": 0.5203125, "grad_norm": 3.59375, "grad_norm_var": 0.046483357747395836, "learning_rate": 0.0001, "loss": 6.0174, "loss/crossentropy": 2.705695629119873, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17765065282583237, "step": 16650 }, { "epoch": 0.520375, "grad_norm": 3.109375, "grad_norm_var": 0.0499664306640625, "learning_rate": 0.0001, "loss": 5.7881, "loss/crossentropy": 2.581183910369873, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17107804864645004, "step": 16652 }, { "epoch": 0.5204375, "grad_norm": 3.015625, "grad_norm_var": 0.0509674072265625, "learning_rate": 0.0001, "loss": 5.7454, "loss/crossentropy": 2.608821749687195, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.165218323469162, "step": 16654 }, { "epoch": 0.5205, "grad_norm": 3.4375, "grad_norm_var": 0.03501688639322917, "learning_rate": 0.0001, "loss": 5.79, "loss/crossentropy": 2.5277384519577026, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1770046204328537, "step": 16656 }, { "epoch": 0.5205625, "grad_norm": 2.890625, "grad_norm_var": 0.04708658854166667, "learning_rate": 0.0001, "loss": 5.7761, "loss/crossentropy": 2.6142302751541138, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16345254331827164, "step": 16658 }, { "epoch": 0.520625, "grad_norm": 2.921875, "grad_norm_var": 0.04256083170572917, "learning_rate": 0.0001, "loss": 5.3146, "loss/crossentropy": 2.3108972311019897, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15740341693162918, "step": 16660 }, { "epoch": 0.5206875, "grad_norm": 2.984375, "grad_norm_var": 0.05091145833333333, "learning_rate": 0.0001, "loss": 5.6101, "loss/crossentropy": 2.5152639150619507, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16065338253974915, "step": 16662 }, { "epoch": 0.52075, "grad_norm": 3.046875, "grad_norm_var": 0.05803629557291667, "learning_rate": 0.0001, "loss": 5.2244, "loss/crossentropy": 2.2537713050842285, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14979984611272812, "step": 16664 }, { "epoch": 0.5208125, "grad_norm": 3.28125, "grad_norm_var": 0.05217692057291667, "learning_rate": 0.0001, "loss": 5.8836, "loss/crossentropy": 2.58497154712677, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18025048077106476, "step": 16666 }, { "epoch": 0.520875, "grad_norm": 3.359375, "grad_norm_var": 0.05827534993489583, "learning_rate": 0.0001, "loss": 5.5951, "loss/crossentropy": 2.4491108655929565, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16889246553182602, "step": 16668 }, { "epoch": 0.5209375, "grad_norm": 3.390625, "grad_norm_var": 0.06366780598958334, "learning_rate": 0.0001, "loss": 6.0348, "loss/crossentropy": 2.7149627208709717, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18158919364213943, "step": 16670 }, { "epoch": 0.521, "grad_norm": 3.359375, "grad_norm_var": 0.060445149739583336, "learning_rate": 0.0001, "loss": 5.8633, "loss/crossentropy": 2.554919481277466, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18005279451608658, "step": 16672 }, { "epoch": 0.5210625, "grad_norm": 3.21875, "grad_norm_var": 0.048094685872395834, "learning_rate": 0.0001, "loss": 5.8456, "loss/crossentropy": 2.6079788208007812, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17493051290512085, "step": 16674 }, { "epoch": 0.521125, "grad_norm": 3.125, "grad_norm_var": 0.0436431884765625, "learning_rate": 0.0001, "loss": 5.5138, "loss/crossentropy": 2.4231492280960083, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16335897147655487, "step": 16676 }, { "epoch": 0.5211875, "grad_norm": 3.203125, "grad_norm_var": 0.03212890625, "learning_rate": 0.0001, "loss": 5.5832, "loss/crossentropy": 2.4611895084381104, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1633726805448532, "step": 16678 }, { "epoch": 0.52125, "grad_norm": 3.640625, "grad_norm_var": 0.030464680989583333, "learning_rate": 0.0001, "loss": 5.9662, "loss/crossentropy": 2.601937174797058, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18173735588788986, "step": 16680 }, { "epoch": 0.5213125, "grad_norm": 3.1875, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 5.6479, "loss/crossentropy": 2.5193867683410645, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16792523860931396, "step": 16682 }, { "epoch": 0.521375, "grad_norm": 2.96875, "grad_norm_var": 0.030451456705729168, "learning_rate": 0.0001, "loss": 5.9514, "loss/crossentropy": 2.780367136001587, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16827665269374847, "step": 16684 }, { "epoch": 0.5214375, "grad_norm": 2.9375, "grad_norm_var": 0.033589680989583336, "learning_rate": 0.0001, "loss": 5.5207, "loss/crossentropy": 2.4640052318573, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15723436325788498, "step": 16686 }, { "epoch": 0.5215, "grad_norm": 2.90625, "grad_norm_var": 0.037495930989583336, "learning_rate": 0.0001, "loss": 5.5658, "loss/crossentropy": 2.5082110166549683, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15770812332630157, "step": 16688 }, { "epoch": 0.5215625, "grad_norm": 3.203125, "grad_norm_var": 0.03766988118489583, "learning_rate": 0.0001, "loss": 5.6946, "loss/crossentropy": 2.541201949119568, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16573123633861542, "step": 16690 }, { "epoch": 0.521625, "grad_norm": 3.546875, "grad_norm_var": 0.0573394775390625, "learning_rate": 0.0001, "loss": 5.836, "loss/crossentropy": 2.496829390525818, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18118096888065338, "step": 16692 }, { "epoch": 0.5216875, "grad_norm": 2.96875, "grad_norm_var": 0.0611968994140625, "learning_rate": 0.0001, "loss": 5.6866, "loss/crossentropy": 2.536230444908142, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16659681499004364, "step": 16694 }, { "epoch": 0.52175, "grad_norm": 3.078125, "grad_norm_var": 0.039449055989583336, "learning_rate": 0.0001, "loss": 5.572, "loss/crossentropy": 2.4573066234588623, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16068410873413086, "step": 16696 }, { "epoch": 0.5218125, "grad_norm": 2.78125, "grad_norm_var": 0.0487213134765625, "learning_rate": 0.0001, "loss": 5.5547, "loss/crossentropy": 2.421579599380493, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1660444363951683, "step": 16698 }, { "epoch": 0.521875, "grad_norm": 3.203125, "grad_norm_var": 0.04829813639322917, "learning_rate": 0.0001, "loss": 5.7851, "loss/crossentropy": 2.5321544408798218, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1745123714208603, "step": 16700 }, { "epoch": 0.5219375, "grad_norm": 2.921875, "grad_norm_var": 0.04954325358072917, "learning_rate": 0.0001, "loss": 5.6871, "loss/crossentropy": 2.6101166009902954, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16160931438207626, "step": 16702 }, { "epoch": 0.522, "grad_norm": 3.125, "grad_norm_var": 0.04578348795572917, "learning_rate": 0.0001, "loss": 5.9245, "loss/crossentropy": 2.632740616798401, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1776122897863388, "step": 16704 }, { "epoch": 0.5220625, "grad_norm": 3.375, "grad_norm_var": 0.04827473958333333, "learning_rate": 0.0001, "loss": 5.657, "loss/crossentropy": 2.476766586303711, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17075669765472412, "step": 16706 }, { "epoch": 0.522125, "grad_norm": 3.0625, "grad_norm_var": 0.028889973958333332, "learning_rate": 0.0001, "loss": 5.6833, "loss/crossentropy": 2.533569812774658, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16810289025306702, "step": 16708 }, { "epoch": 0.5221875, "grad_norm": 3.328125, "grad_norm_var": 0.030785115559895833, "learning_rate": 0.0001, "loss": 5.6766, "loss/crossentropy": 2.5180299282073975, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16781431436538696, "step": 16710 }, { "epoch": 0.52225, "grad_norm": 3.0625, "grad_norm_var": 0.11048177083333334, "learning_rate": 0.0001, "loss": 5.8755, "loss/crossentropy": 2.620678663253784, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17352984845638275, "step": 16712 }, { "epoch": 0.5223125, "grad_norm": 3.140625, "grad_norm_var": 0.0986480712890625, "learning_rate": 0.0001, "loss": 5.7624, "loss/crossentropy": 2.5677719116210938, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17024896293878555, "step": 16714 }, { "epoch": 0.522375, "grad_norm": 3.265625, "grad_norm_var": 0.09802958170572916, "learning_rate": 0.0001, "loss": 5.9484, "loss/crossentropy": 2.703026533126831, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17531873285770416, "step": 16716 }, { "epoch": 0.5224375, "grad_norm": 3.25, "grad_norm_var": 0.088720703125, "learning_rate": 0.0001, "loss": 5.9424, "loss/crossentropy": 2.7067244052886963, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17318104952573776, "step": 16718 }, { "epoch": 0.5225, "grad_norm": 3.15625, "grad_norm_var": 0.08799540201822917, "learning_rate": 0.0001, "loss": 5.5793, "loss/crossentropy": 2.460513114929199, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16656684130430222, "step": 16720 }, { "epoch": 0.5225625, "grad_norm": 3.171875, "grad_norm_var": 0.08570556640625, "learning_rate": 0.0001, "loss": 5.6503, "loss/crossentropy": 2.5076273679733276, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1670023426413536, "step": 16722 }, { "epoch": 0.522625, "grad_norm": 2.875, "grad_norm_var": 0.0902740478515625, "learning_rate": 0.0001, "loss": 5.8403, "loss/crossentropy": 2.643712043762207, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17044231295585632, "step": 16724 }, { "epoch": 0.5226875, "grad_norm": 3.4375, "grad_norm_var": 0.09388020833333334, "learning_rate": 0.0001, "loss": 5.9001, "loss/crossentropy": 2.6782734394073486, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17296595126390457, "step": 16726 }, { "epoch": 0.52275, "grad_norm": 3.09375, "grad_norm_var": 0.01949462890625, "learning_rate": 0.0001, "loss": 5.2923, "loss/crossentropy": 2.2812613248825073, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15813816338777542, "step": 16728 }, { "epoch": 0.5228125, "grad_norm": 3.40625, "grad_norm_var": 0.024153645833333334, "learning_rate": 0.0001, "loss": 5.9144, "loss/crossentropy": 2.666238784790039, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17325755208730698, "step": 16730 }, { "epoch": 0.522875, "grad_norm": 3.296875, "grad_norm_var": 0.023998006184895834, "learning_rate": 0.0001, "loss": 5.896, "loss/crossentropy": 2.6160322427749634, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17643069475889206, "step": 16732 }, { "epoch": 0.5229375, "grad_norm": 3.328125, "grad_norm_var": 0.025016276041666667, "learning_rate": 0.0001, "loss": 5.8807, "loss/crossentropy": 2.6244956254959106, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17562486976385117, "step": 16734 }, { "epoch": 0.523, "grad_norm": 3.0625, "grad_norm_var": 0.025999959309895834, "learning_rate": 0.0001, "loss": 5.6844, "loss/crossentropy": 2.478963017463684, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1728852093219757, "step": 16736 }, { "epoch": 0.5230625, "grad_norm": 3.296875, "grad_norm_var": 0.030659993489583332, "learning_rate": 0.0001, "loss": 5.5548, "loss/crossentropy": 2.334673762321472, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17005780339241028, "step": 16738 }, { "epoch": 0.523125, "grad_norm": 3.390625, "grad_norm_var": 0.026155598958333335, "learning_rate": 0.0001, "loss": 6.0049, "loss/crossentropy": 2.6568034887313843, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17973309755325317, "step": 16740 }, { "epoch": 0.5231875, "grad_norm": 2.890625, "grad_norm_var": 0.030757649739583334, "learning_rate": 0.0001, "loss": 5.7537, "loss/crossentropy": 2.5928655862808228, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16529954969882965, "step": 16742 }, { "epoch": 0.52325, "grad_norm": 3.015625, "grad_norm_var": 0.027489217122395833, "learning_rate": 0.0001, "loss": 5.6691, "loss/crossentropy": 2.510942578315735, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16816120594739914, "step": 16744 }, { "epoch": 0.5233125, "grad_norm": 3.015625, "grad_norm_var": 0.02994384765625, "learning_rate": 0.0001, "loss": 5.6242, "loss/crossentropy": 2.524855375289917, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16384124755859375, "step": 16746 }, { "epoch": 0.523375, "grad_norm": 3.015625, "grad_norm_var": 0.033203125, "learning_rate": 0.0001, "loss": 5.5681, "loss/crossentropy": 2.483372688293457, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16238310188055038, "step": 16748 }, { "epoch": 0.5234375, "grad_norm": 3.359375, "grad_norm_var": 0.0370269775390625, "learning_rate": 0.0001, "loss": 5.6439, "loss/crossentropy": 2.466872215270996, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1688736230134964, "step": 16750 }, { "epoch": 0.5235, "grad_norm": 3.625, "grad_norm_var": 0.048628743489583334, "learning_rate": 0.0001, "loss": 5.9759, "loss/crossentropy": 2.658264398574829, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17942135781049728, "step": 16752 }, { "epoch": 0.5235625, "grad_norm": 2.921875, "grad_norm_var": 0.0514801025390625, "learning_rate": 0.0001, "loss": 5.3525, "loss/crossentropy": 2.3306466341018677, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1584375649690628, "step": 16754 }, { "epoch": 0.523625, "grad_norm": 3.25, "grad_norm_var": 0.04947509765625, "learning_rate": 0.0001, "loss": 5.7338, "loss/crossentropy": 2.5754432678222656, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16778850555419922, "step": 16756 }, { "epoch": 0.5236875, "grad_norm": 2.859375, "grad_norm_var": 0.05592041015625, "learning_rate": 0.0001, "loss": 5.8624, "loss/crossentropy": 2.63477885723114, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17237010598182678, "step": 16758 }, { "epoch": 0.52375, "grad_norm": 3.203125, "grad_norm_var": 0.053999837239583334, "learning_rate": 0.0001, "loss": 5.9421, "loss/crossentropy": 2.7633711099624634, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17021772265434265, "step": 16760 }, { "epoch": 0.5238125, "grad_norm": 3.203125, "grad_norm_var": 0.0580474853515625, "learning_rate": 0.0001, "loss": 5.4875, "loss/crossentropy": 2.3752094507217407, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1600557640194893, "step": 16762 }, { "epoch": 0.523875, "grad_norm": 3.265625, "grad_norm_var": 0.056929524739583334, "learning_rate": 0.0001, "loss": 5.988, "loss/crossentropy": 2.7755390405654907, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17202361673116684, "step": 16764 }, { "epoch": 0.5239375, "grad_norm": 3.328125, "grad_norm_var": 0.05338134765625, "learning_rate": 0.0001, "loss": 5.7225, "loss/crossentropy": 2.537709593772888, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16691192239522934, "step": 16766 }, { "epoch": 0.524, "grad_norm": 2.75, "grad_norm_var": 0.0473785400390625, "learning_rate": 0.0001, "loss": 5.4379, "loss/crossentropy": 2.393734335899353, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15558606386184692, "step": 16768 }, { "epoch": 0.5240625, "grad_norm": 3.171875, "grad_norm_var": 0.04442952473958333, "learning_rate": 0.0001, "loss": 5.7923, "loss/crossentropy": 2.632172107696533, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16796287894248962, "step": 16770 }, { "epoch": 0.524125, "grad_norm": 3.84375, "grad_norm_var": 0.07353413899739583, "learning_rate": 0.0001, "loss": 6.0387, "loss/crossentropy": 2.7371329069137573, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17469213902950287, "step": 16772 }, { "epoch": 0.5241875, "grad_norm": 3.265625, "grad_norm_var": 0.05758463541666667, "learning_rate": 0.0001, "loss": 5.7492, "loss/crossentropy": 2.611035943031311, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16967833042144775, "step": 16774 }, { "epoch": 0.52425, "grad_norm": 3.109375, "grad_norm_var": 0.05725911458333333, "learning_rate": 0.0001, "loss": 5.6682, "loss/crossentropy": 2.5155742168426514, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16682402789592743, "step": 16776 }, { "epoch": 0.5243125, "grad_norm": 3.40625, "grad_norm_var": 0.05660400390625, "learning_rate": 0.0001, "loss": 5.4848, "loss/crossentropy": 2.426730751991272, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1538565456867218, "step": 16778 }, { "epoch": 0.524375, "grad_norm": 3.4375, "grad_norm_var": 0.06402587890625, "learning_rate": 0.0001, "loss": 5.8914, "loss/crossentropy": 2.554630398750305, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17782258987426758, "step": 16780 }, { "epoch": 0.5244375, "grad_norm": 3.046875, "grad_norm_var": 0.06392822265625, "learning_rate": 0.0001, "loss": 5.8384, "loss/crossentropy": 2.6723839044570923, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17012126743793488, "step": 16782 }, { "epoch": 0.5245, "grad_norm": 3.0, "grad_norm_var": 0.05237223307291667, "learning_rate": 0.0001, "loss": 5.9261, "loss/crossentropy": 2.7359548807144165, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17253194004297256, "step": 16784 }, { "epoch": 0.5245625, "grad_norm": 3.03125, "grad_norm_var": 0.05284830729166667, "learning_rate": 0.0001, "loss": 5.629, "loss/crossentropy": 2.4921926259994507, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16328564286231995, "step": 16786 }, { "epoch": 0.524625, "grad_norm": 2.921875, "grad_norm_var": 0.03338216145833333, "learning_rate": 0.0001, "loss": 5.4287, "loss/crossentropy": 2.439249873161316, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15754255652427673, "step": 16788 }, { "epoch": 0.5246875, "grad_norm": 3.25, "grad_norm_var": 0.031151326497395833, "learning_rate": 0.0001, "loss": 5.4842, "loss/crossentropy": 2.3997395038604736, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15805823355913162, "step": 16790 }, { "epoch": 0.52475, "grad_norm": 4.0, "grad_norm_var": 0.07854715983072917, "learning_rate": 0.0001, "loss": 5.832, "loss/crossentropy": 2.4776930809020996, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18035707622766495, "step": 16792 }, { "epoch": 0.5248125, "grad_norm": 3.171875, "grad_norm_var": 0.1131744384765625, "learning_rate": 0.0001, "loss": 6.0571, "loss/crossentropy": 2.649075508117676, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.18298634141683578, "step": 16794 }, { "epoch": 0.524875, "grad_norm": 3.0625, "grad_norm_var": 0.10982666015625, "learning_rate": 0.0001, "loss": 6.1231, "loss/crossentropy": 2.89070200920105, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17402183264493942, "step": 16796 }, { "epoch": 0.5249375, "grad_norm": 2.984375, "grad_norm_var": 0.11135965983072917, "learning_rate": 0.0001, "loss": 5.7474, "loss/crossentropy": 2.6006139516830444, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16663673520088196, "step": 16798 }, { "epoch": 0.525, "grad_norm": 3.125, "grad_norm_var": 0.1128326416015625, "learning_rate": 0.0001, "loss": 5.4162, "loss/crossentropy": 2.363920211791992, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15991061180830002, "step": 16800 }, { "epoch": 0.5250625, "grad_norm": 3.265625, "grad_norm_var": 0.1113433837890625, "learning_rate": 0.0001, "loss": 6.0503, "loss/crossentropy": 2.7622867822647095, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.18036288022994995, "step": 16802 }, { "epoch": 0.525125, "grad_norm": 3.015625, "grad_norm_var": 0.10338134765625, "learning_rate": 0.0001, "loss": 5.6703, "loss/crossentropy": 2.585463047027588, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16160530596971512, "step": 16804 }, { "epoch": 0.5251875, "grad_norm": 3.203125, "grad_norm_var": 0.10568033854166667, "learning_rate": 0.0001, "loss": 5.2132, "loss/crossentropy": 2.212006449699402, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.14972937107086182, "step": 16806 }, { "epoch": 0.52525, "grad_norm": 3.0625, "grad_norm_var": 0.062272135416666666, "learning_rate": 0.0001, "loss": 5.5782, "loss/crossentropy": 2.5070362091064453, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16102229803800583, "step": 16808 }, { "epoch": 0.5253125, "grad_norm": 3.0, "grad_norm_var": 0.01578369140625, "learning_rate": 0.0001, "loss": 5.6242, "loss/crossentropy": 2.506625771522522, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1676207035779953, "step": 16810 }, { "epoch": 0.525375, "grad_norm": 2.96875, "grad_norm_var": 0.017220052083333333, "learning_rate": 0.0001, "loss": 5.5629, "loss/crossentropy": 2.4584254026412964, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16162265837192535, "step": 16812 }, { "epoch": 0.5254375, "grad_norm": 3.0625, "grad_norm_var": 0.016722615559895834, "learning_rate": 0.0001, "loss": 5.7189, "loss/crossentropy": 2.5626300573349, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16523746401071548, "step": 16814 }, { "epoch": 0.5255, "grad_norm": 2.828125, "grad_norm_var": 0.020531209309895833, "learning_rate": 0.0001, "loss": 5.4088, "loss/crossentropy": 2.378592610359192, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1584884375333786, "step": 16816 }, { "epoch": 0.5255625, "grad_norm": 3.09375, "grad_norm_var": 0.018480428059895835, "learning_rate": 0.0001, "loss": 5.7678, "loss/crossentropy": 2.639050841331482, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16444016993045807, "step": 16818 }, { "epoch": 0.525625, "grad_norm": 3.0, "grad_norm_var": 0.0238189697265625, "learning_rate": 0.0001, "loss": 5.579, "loss/crossentropy": 2.5458027124404907, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15801066905260086, "step": 16820 }, { "epoch": 0.5256875, "grad_norm": 2.96875, "grad_norm_var": 0.012548828125, "learning_rate": 0.0001, "loss": 5.7504, "loss/crossentropy": 2.607507109642029, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.17131800949573517, "step": 16822 }, { "epoch": 0.52575, "grad_norm": 3.453125, "grad_norm_var": 0.029801432291666666, "learning_rate": 0.0001, "loss": 5.8954, "loss/crossentropy": 2.6303540468215942, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17572374641895294, "step": 16824 }, { "epoch": 0.5258125, "grad_norm": 3.4375, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 5.826, "loss/crossentropy": 2.6341618299484253, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16879215836524963, "step": 16826 }, { "epoch": 0.525875, "grad_norm": 3.1875, "grad_norm_var": 0.04209696451822917, "learning_rate": 0.0001, "loss": 5.7755, "loss/crossentropy": 2.6447147130966187, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1673756092786789, "step": 16828 }, { "epoch": 0.5259375, "grad_norm": 3.21875, "grad_norm_var": 0.042769368489583334, "learning_rate": 0.0001, "loss": 5.7975, "loss/crossentropy": 2.59163498878479, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16941649466753006, "step": 16830 }, { "epoch": 0.526, "grad_norm": 3.15625, "grad_norm_var": 0.038374837239583334, "learning_rate": 0.0001, "loss": 5.4617, "loss/crossentropy": 2.412532925605774, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16195210069417953, "step": 16832 }, { "epoch": 0.5260625, "grad_norm": 3.078125, "grad_norm_var": 0.03772684733072917, "learning_rate": 0.0001, "loss": 5.5503, "loss/crossentropy": 2.4097806215286255, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16287972778081894, "step": 16834 }, { "epoch": 0.526125, "grad_norm": 3.234375, "grad_norm_var": 0.03179931640625, "learning_rate": 0.0001, "loss": 5.5965, "loss/crossentropy": 2.3854483366012573, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16680347174406052, "step": 16836 }, { "epoch": 0.5261875, "grad_norm": 3.359375, "grad_norm_var": 0.0328125, "learning_rate": 0.0001, "loss": 5.6514, "loss/crossentropy": 2.467614769935608, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16837888211011887, "step": 16838 }, { "epoch": 0.52625, "grad_norm": 3.40625, "grad_norm_var": 0.0302734375, "learning_rate": 0.0001, "loss": 5.9642, "loss/crossentropy": 2.715559482574463, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1744699776172638, "step": 16840 }, { "epoch": 0.5263125, "grad_norm": 2.953125, "grad_norm_var": 0.025715128580729166, "learning_rate": 0.0001, "loss": 5.478, "loss/crossentropy": 2.337415099143982, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16523519158363342, "step": 16842 }, { "epoch": 0.526375, "grad_norm": 2.984375, "grad_norm_var": 0.022581990559895834, "learning_rate": 0.0001, "loss": 5.3061, "loss/crossentropy": 2.248517870903015, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.14989487826824188, "step": 16844 }, { "epoch": 0.5264375, "grad_norm": 3.171875, "grad_norm_var": 0.022630818684895835, "learning_rate": 0.0001, "loss": 5.8628, "loss/crossentropy": 2.62682843208313, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17477399110794067, "step": 16846 }, { "epoch": 0.5265, "grad_norm": 3.84375, "grad_norm_var": 0.05113525390625, "learning_rate": 0.0001, "loss": 5.6703, "loss/crossentropy": 2.4922419786453247, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16898052394390106, "step": 16848 }, { "epoch": 0.5265625, "grad_norm": 2.84375, "grad_norm_var": 0.05813802083333333, "learning_rate": 0.0001, "loss": 5.5643, "loss/crossentropy": 2.472880244255066, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16109106689691544, "step": 16850 }, { "epoch": 0.526625, "grad_norm": 3.234375, "grad_norm_var": 0.06179097493489583, "learning_rate": 0.0001, "loss": 5.5819, "loss/crossentropy": 2.484124779701233, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16446161270141602, "step": 16852 }, { "epoch": 0.5266875, "grad_norm": 3.28125, "grad_norm_var": 0.06026102701822917, "learning_rate": 0.0001, "loss": 5.7263, "loss/crossentropy": 2.5401129722595215, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1697906032204628, "step": 16854 }, { "epoch": 0.52675, "grad_norm": 2.953125, "grad_norm_var": 0.0576812744140625, "learning_rate": 0.0001, "loss": 5.68, "loss/crossentropy": 2.5771374702453613, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16302035003900528, "step": 16856 }, { "epoch": 0.5268125, "grad_norm": 3.34375, "grad_norm_var": 0.05762430826822917, "learning_rate": 0.0001, "loss": 5.5129, "loss/crossentropy": 2.3532347679138184, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16440674662590027, "step": 16858 }, { "epoch": 0.526875, "grad_norm": 3.15625, "grad_norm_var": 0.060700480143229166, "learning_rate": 0.0001, "loss": 5.8512, "loss/crossentropy": 2.6577740907669067, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17246952652931213, "step": 16860 }, { "epoch": 0.5269375, "grad_norm": 2.921875, "grad_norm_var": 0.06601155598958333, "learning_rate": 0.0001, "loss": 5.8364, "loss/crossentropy": 2.6040685176849365, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17401274293661118, "step": 16862 }, { "epoch": 0.527, "grad_norm": 2.9375, "grad_norm_var": 0.04531148274739583, "learning_rate": 0.0001, "loss": 5.5698, "loss/crossentropy": 2.5193625688552856, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15972767025232315, "step": 16864 }, { "epoch": 0.5270625, "grad_norm": 2.984375, "grad_norm_var": 0.04179585774739583, "learning_rate": 0.0001, "loss": 5.6884, "loss/crossentropy": 2.616241216659546, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16385561227798462, "step": 16866 }, { "epoch": 0.527125, "grad_norm": 3.15625, "grad_norm_var": 0.03840230305989583, "learning_rate": 0.0001, "loss": 5.867, "loss/crossentropy": 2.661938190460205, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17012009769678116, "step": 16868 }, { "epoch": 0.5271875, "grad_norm": 3.21875, "grad_norm_var": 0.03469645182291667, "learning_rate": 0.0001, "loss": 5.7668, "loss/crossentropy": 2.5564876794815063, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1702459156513214, "step": 16870 }, { "epoch": 0.52725, "grad_norm": 3.015625, "grad_norm_var": 0.033610026041666664, "learning_rate": 0.0001, "loss": 5.7368, "loss/crossentropy": 2.615280032157898, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16293253749608994, "step": 16872 }, { "epoch": 0.5273125, "grad_norm": 3.171875, "grad_norm_var": 0.028766886393229166, "learning_rate": 0.0001, "loss": 5.7585, "loss/crossentropy": 2.5623830556869507, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1684436872601509, "step": 16874 }, { "epoch": 0.527375, "grad_norm": 2.953125, "grad_norm_var": 0.01353759765625, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.6023114919662476, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16375557333230972, "step": 16876 }, { "epoch": 0.5274375, "grad_norm": 3.546875, "grad_norm_var": 0.0321929931640625, "learning_rate": 0.0001, "loss": 6.1932, "loss/crossentropy": 2.7963041067123413, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18616915494203568, "step": 16878 }, { "epoch": 0.5275, "grad_norm": 3.359375, "grad_norm_var": 0.0278472900390625, "learning_rate": 0.0001, "loss": 6.2065, "loss/crossentropy": 2.8618983030319214, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1821155846118927, "step": 16880 }, { "epoch": 0.5275625, "grad_norm": 3.203125, "grad_norm_var": 0.024723307291666666, "learning_rate": 0.0001, "loss": 5.7974, "loss/crossentropy": 2.694758176803589, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16143349558115005, "step": 16882 }, { "epoch": 0.527625, "grad_norm": 3.421875, "grad_norm_var": 0.026334635416666665, "learning_rate": 0.0001, "loss": 5.7665, "loss/crossentropy": 2.538394570350647, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17007964849472046, "step": 16884 }, { "epoch": 0.5276875, "grad_norm": 3.515625, "grad_norm_var": 0.0408111572265625, "learning_rate": 0.0001, "loss": 6.3388, "loss/crossentropy": 2.9788248538970947, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18209479749202728, "step": 16886 }, { "epoch": 0.52775, "grad_norm": 3.984375, "grad_norm_var": 0.06877848307291666, "learning_rate": 0.0001, "loss": 5.7101, "loss/crossentropy": 2.516141653060913, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1682264357805252, "step": 16888 }, { "epoch": 0.5278125, "grad_norm": 2.921875, "grad_norm_var": 0.08279520670572917, "learning_rate": 0.0001, "loss": 5.5814, "loss/crossentropy": 2.5133888721466064, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1610962450504303, "step": 16890 }, { "epoch": 0.527875, "grad_norm": 3.265625, "grad_norm_var": 0.079541015625, "learning_rate": 0.0001, "loss": 5.6858, "loss/crossentropy": 2.463269591331482, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1714681014418602, "step": 16892 }, { "epoch": 0.5279375, "grad_norm": 3.21875, "grad_norm_var": 0.07891337076822917, "learning_rate": 0.0001, "loss": 5.8659, "loss/crossentropy": 2.5936213731765747, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17683736234903336, "step": 16894 }, { "epoch": 0.528, "grad_norm": 3.53125, "grad_norm_var": 0.07958984375, "learning_rate": 0.0001, "loss": 5.6168, "loss/crossentropy": 2.5288150310516357, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15879766643047333, "step": 16896 }, { "epoch": 0.5280625, "grad_norm": 3.453125, "grad_norm_var": 0.08035481770833333, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.5781877040863037, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17627078294754028, "step": 16898 }, { "epoch": 0.528125, "grad_norm": 2.96875, "grad_norm_var": 0.09530843098958333, "learning_rate": 0.0001, "loss": 5.5579, "loss/crossentropy": 2.4117249250411987, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1614910438656807, "step": 16900 }, { "epoch": 0.5281875, "grad_norm": 4.84375, "grad_norm_var": 0.23356831868489583, "learning_rate": 0.0001, "loss": 5.8338, "loss/crossentropy": 2.5839866399765015, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17420267313718796, "step": 16902 }, { "epoch": 0.52825, "grad_norm": 3.234375, "grad_norm_var": 0.21035868326822918, "learning_rate": 0.0001, "loss": 5.8998, "loss/crossentropy": 2.666812777519226, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17329959571361542, "step": 16904 }, { "epoch": 0.5283125, "grad_norm": 3.375, "grad_norm_var": 0.19847005208333332, "learning_rate": 0.0001, "loss": 5.8801, "loss/crossentropy": 2.608286142349243, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17523258924484253, "step": 16906 }, { "epoch": 0.528375, "grad_norm": 3.09375, "grad_norm_var": 0.19133199055989583, "learning_rate": 0.0001, "loss": 5.5642, "loss/crossentropy": 2.443881034851074, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16515463590621948, "step": 16908 }, { "epoch": 0.5284375, "grad_norm": 5.78125, "grad_norm_var": 0.5673004150390625, "learning_rate": 0.0001, "loss": 6.2954, "loss/crossentropy": 2.78745698928833, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.1941521167755127, "step": 16910 }, { "epoch": 0.5285, "grad_norm": 3.671875, "grad_norm_var": 0.5599894205729167, "learning_rate": 0.0001, "loss": 5.8464, "loss/crossentropy": 2.616065263748169, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17342819273471832, "step": 16912 }, { "epoch": 0.5285625, "grad_norm": 2.96875, "grad_norm_var": 0.5651692708333333, "learning_rate": 0.0001, "loss": 5.5398, "loss/crossentropy": 2.3426021337509155, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1685510277748108, "step": 16914 }, { "epoch": 0.528625, "grad_norm": 3.125, "grad_norm_var": 0.5444244384765625, "learning_rate": 0.0001, "loss": 5.67, "loss/crossentropy": 2.447251796722412, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17149558663368225, "step": 16916 }, { "epoch": 0.5286875, "grad_norm": 3.125, "grad_norm_var": 0.43374735514322915, "learning_rate": 0.0001, "loss": 5.9925, "loss/crossentropy": 2.736096978187561, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17564402520656586, "step": 16918 }, { "epoch": 0.52875, "grad_norm": 3.328125, "grad_norm_var": 0.4401519775390625, "learning_rate": 0.0001, "loss": 5.7838, "loss/crossentropy": 2.542062520980835, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16987399011850357, "step": 16920 }, { "epoch": 0.5288125, "grad_norm": 3.1875, "grad_norm_var": 0.43704427083333336, "learning_rate": 0.0001, "loss": 5.9564, "loss/crossentropy": 2.6412583589553833, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18073027580976486, "step": 16922 }, { "epoch": 0.528875, "grad_norm": 3.21875, "grad_norm_var": 0.43429361979166664, "learning_rate": 0.0001, "loss": 5.828, "loss/crossentropy": 2.6030869483947754, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17366065829992294, "step": 16924 }, { "epoch": 0.5289375, "grad_norm": 3.296875, "grad_norm_var": 0.04169820149739583, "learning_rate": 0.0001, "loss": 5.8671, "loss/crossentropy": 2.557239294052124, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1782548651099205, "step": 16926 }, { "epoch": 0.529, "grad_norm": 3.5, "grad_norm_var": 0.0314605712890625, "learning_rate": 0.0001, "loss": 5.5674, "loss/crossentropy": 2.3005136251449585, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17278698831796646, "step": 16928 }, { "epoch": 0.5290625, "grad_norm": 3.0625, "grad_norm_var": 0.02236328125, "learning_rate": 0.0001, "loss": 5.8136, "loss/crossentropy": 2.6265273094177246, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17183005809783936, "step": 16930 }, { "epoch": 0.529125, "grad_norm": 2.8125, "grad_norm_var": 0.0405670166015625, "learning_rate": 0.0001, "loss": 5.3265, "loss/crossentropy": 2.387045741081238, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15019110590219498, "step": 16932 }, { "epoch": 0.5291875, "grad_norm": 4.09375, "grad_norm_var": 0.09065348307291667, "learning_rate": 0.0001, "loss": 6.0541, "loss/crossentropy": 2.6851927042007446, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18689000606536865, "step": 16934 }, { "epoch": 0.52925, "grad_norm": 5.4375, "grad_norm_var": 0.3789621988932292, "learning_rate": 0.0001, "loss": 5.7548, "loss/crossentropy": 2.4790419340133667, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1728922575712204, "step": 16936 }, { "epoch": 0.5293125, "grad_norm": 3.265625, "grad_norm_var": 0.37795817057291664, "learning_rate": 0.0001, "loss": 5.8875, "loss/crossentropy": 2.6745702028274536, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17128994315862656, "step": 16938 }, { "epoch": 0.529375, "grad_norm": 4.15625, "grad_norm_var": 0.42385965983072915, "learning_rate": 0.0001, "loss": 5.409, "loss/crossentropy": 2.327757477760315, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15695005655288696, "step": 16940 }, { "epoch": 0.5294375, "grad_norm": 3.546875, "grad_norm_var": 0.42753499348958335, "learning_rate": 0.0001, "loss": 5.8836, "loss/crossentropy": 2.6485486030578613, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17623953521251678, "step": 16942 }, { "epoch": 0.5295, "grad_norm": 3.125, "grad_norm_var": 0.4352447509765625, "learning_rate": 0.0001, "loss": 5.9013, "loss/crossentropy": 2.6693369150161743, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.173587866127491, "step": 16944 }, { "epoch": 0.5295625, "grad_norm": 2.921875, "grad_norm_var": 0.45735270182291665, "learning_rate": 0.0001, "loss": 5.7236, "loss/crossentropy": 2.646336793899536, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1616320163011551, "step": 16946 }, { "epoch": 0.529625, "grad_norm": 3.234375, "grad_norm_var": 0.4293131510416667, "learning_rate": 0.0001, "loss": 5.6899, "loss/crossentropy": 2.526552438735962, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1682843267917633, "step": 16948 }, { "epoch": 0.5296875, "grad_norm": 3.390625, "grad_norm_var": 0.40546773274739584, "learning_rate": 0.0001, "loss": 5.7597, "loss/crossentropy": 2.555445075035095, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17003649473190308, "step": 16950 }, { "epoch": 0.52975, "grad_norm": 3.171875, "grad_norm_var": 0.0985015869140625, "learning_rate": 0.0001, "loss": 5.9148, "loss/crossentropy": 2.7017569541931152, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16896067559719086, "step": 16952 }, { "epoch": 0.5298125, "grad_norm": 3.34375, "grad_norm_var": 0.09973551432291666, "learning_rate": 0.0001, "loss": 5.9579, "loss/crossentropy": 2.7303154468536377, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17432095855474472, "step": 16954 }, { "epoch": 0.529875, "grad_norm": 3.71875, "grad_norm_var": 0.05175374348958333, "learning_rate": 0.0001, "loss": 5.6725, "loss/crossentropy": 2.458512544631958, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16944526880979538, "step": 16956 }, { "epoch": 0.5299375, "grad_norm": 3.4375, "grad_norm_var": 0.04744364420572917, "learning_rate": 0.0001, "loss": 5.8231, "loss/crossentropy": 2.6536834239959717, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17085148394107819, "step": 16958 }, { "epoch": 0.53, "grad_norm": 3.390625, "grad_norm_var": 0.05022786458333333, "learning_rate": 0.0001, "loss": 5.8461, "loss/crossentropy": 2.6176013946533203, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17207121849060059, "step": 16960 }, { "epoch": 0.5300625, "grad_norm": 3.265625, "grad_norm_var": 0.04282938639322917, "learning_rate": 0.0001, "loss": 5.5728, "loss/crossentropy": 2.491735577583313, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16006263345479965, "step": 16962 }, { "epoch": 0.530125, "grad_norm": 3.359375, "grad_norm_var": 0.044920857747395834, "learning_rate": 0.0001, "loss": 5.9665, "loss/crossentropy": 2.6180880069732666, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18133027851581573, "step": 16964 }, { "epoch": 0.5301875, "grad_norm": 2.953125, "grad_norm_var": 0.047591145833333334, "learning_rate": 0.0001, "loss": 5.6772, "loss/crossentropy": 2.62038254737854, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15881171077489853, "step": 16966 }, { "epoch": 0.53025, "grad_norm": 3.15625, "grad_norm_var": 0.04568684895833333, "learning_rate": 0.0001, "loss": 5.8876, "loss/crossentropy": 2.7255882024765015, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16971950232982635, "step": 16968 }, { "epoch": 0.5303125, "grad_norm": 3.53125, "grad_norm_var": 0.05624593098958333, "learning_rate": 0.0001, "loss": 5.7136, "loss/crossentropy": 2.551610231399536, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16736791282892227, "step": 16970 }, { "epoch": 0.530375, "grad_norm": 3.296875, "grad_norm_var": 0.03951416015625, "learning_rate": 0.0001, "loss": 5.7962, "loss/crossentropy": 2.5625109672546387, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17180463671684265, "step": 16972 }, { "epoch": 0.5304375, "grad_norm": 3.15625, "grad_norm_var": 0.035868326822916664, "learning_rate": 0.0001, "loss": 5.8789, "loss/crossentropy": 2.62811541557312, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17625083029270172, "step": 16974 }, { "epoch": 0.5305, "grad_norm": 3.0625, "grad_norm_var": 0.03430887858072917, "learning_rate": 0.0001, "loss": 5.856, "loss/crossentropy": 2.7236850261688232, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.164009727537632, "step": 16976 }, { "epoch": 0.5305625, "grad_norm": 3.421875, "grad_norm_var": 0.03540751139322917, "learning_rate": 0.0001, "loss": 5.8104, "loss/crossentropy": 2.5462101697921753, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17173375189304352, "step": 16978 }, { "epoch": 0.530625, "grad_norm": 3.421875, "grad_norm_var": 0.0314605712890625, "learning_rate": 0.0001, "loss": 5.8743, "loss/crossentropy": 2.651816487312317, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1687367781996727, "step": 16980 }, { "epoch": 0.5306875, "grad_norm": 3.1875, "grad_norm_var": 0.02447509765625, "learning_rate": 0.0001, "loss": 5.624, "loss/crossentropy": 2.5051662921905518, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.161881722509861, "step": 16982 }, { "epoch": 0.53075, "grad_norm": 2.9375, "grad_norm_var": 0.0282623291015625, "learning_rate": 0.0001, "loss": 5.5397, "loss/crossentropy": 2.4485573768615723, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16380631178617477, "step": 16984 }, { "epoch": 0.5308125, "grad_norm": 3.078125, "grad_norm_var": 0.018050130208333334, "learning_rate": 0.0001, "loss": 5.605, "loss/crossentropy": 2.442605137825012, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16818874329328537, "step": 16986 }, { "epoch": 0.530875, "grad_norm": 3.0625, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 5.8612, "loss/crossentropy": 2.669662117958069, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17150143533945084, "step": 16988 }, { "epoch": 0.5309375, "grad_norm": 2.875, "grad_norm_var": 0.025830078125, "learning_rate": 0.0001, "loss": 5.7293, "loss/crossentropy": 2.5571272373199463, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1683904081583023, "step": 16990 }, { "epoch": 0.531, "grad_norm": 3.15625, "grad_norm_var": 0.0263092041015625, "learning_rate": 0.0001, "loss": 5.5736, "loss/crossentropy": 2.401633381843567, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16875667124986649, "step": 16992 }, { "epoch": 0.5310625, "grad_norm": 3.46875, "grad_norm_var": 0.03228759765625, "learning_rate": 0.0001, "loss": 5.6485, "loss/crossentropy": 2.5339184999465942, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16262532770633698, "step": 16994 }, { "epoch": 0.531125, "grad_norm": 3.265625, "grad_norm_var": 0.029878743489583335, "learning_rate": 0.0001, "loss": 5.3957, "loss/crossentropy": 2.365266442298889, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15421508252620697, "step": 16996 }, { "epoch": 0.5311875, "grad_norm": 3.21875, "grad_norm_var": 0.05676676432291667, "learning_rate": 0.0001, "loss": 5.9016, "loss/crossentropy": 2.7077423334121704, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17251364141702652, "step": 16998 }, { "epoch": 0.53125, "grad_norm": 3.109375, "grad_norm_var": 0.052953084309895836, "learning_rate": 0.0001, "loss": 5.5384, "loss/crossentropy": 2.4351983070373535, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16070988774299622, "step": 17000 }, { "epoch": 0.5313125, "grad_norm": 3.28125, "grad_norm_var": 0.05916239420572917, "learning_rate": 0.0001, "loss": 6.0018, "loss/crossentropy": 2.6615259647369385, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18441465497016907, "step": 17002 }, { "epoch": 0.531375, "grad_norm": 3.09375, "grad_norm_var": 0.0683013916015625, "learning_rate": 0.0001, "loss": 5.7757, "loss/crossentropy": 2.6376312971115112, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16653899103403091, "step": 17004 }, { "epoch": 0.5314375, "grad_norm": 3.140625, "grad_norm_var": 0.06551106770833333, "learning_rate": 0.0001, "loss": 5.7551, "loss/crossentropy": 2.6392234563827515, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1654955893754959, "step": 17006 }, { "epoch": 0.5315, "grad_norm": 3.671875, "grad_norm_var": 0.0814605712890625, "learning_rate": 0.0001, "loss": 5.8047, "loss/crossentropy": 2.5538251399993896, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17469869554042816, "step": 17008 }, { "epoch": 0.5315625, "grad_norm": 3.21875, "grad_norm_var": 0.07419331868489583, "learning_rate": 0.0001, "loss": 5.7475, "loss/crossentropy": 2.5746891498565674, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1680661141872406, "step": 17010 }, { "epoch": 0.531625, "grad_norm": 2.984375, "grad_norm_var": 0.06934305826822916, "learning_rate": 0.0001, "loss": 5.3914, "loss/crossentropy": 2.348442792892456, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15858762711286545, "step": 17012 }, { "epoch": 0.5316875, "grad_norm": 3.0625, "grad_norm_var": 0.0548492431640625, "learning_rate": 0.0001, "loss": 5.6595, "loss/crossentropy": 2.5538713932037354, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16563858091831207, "step": 17014 }, { "epoch": 0.53175, "grad_norm": 3.328125, "grad_norm_var": 0.05650634765625, "learning_rate": 0.0001, "loss": 6.0582, "loss/crossentropy": 2.853422522544861, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17047683149576187, "step": 17016 }, { "epoch": 0.5318125, "grad_norm": 3.078125, "grad_norm_var": 0.049153645833333336, "learning_rate": 0.0001, "loss": 5.9466, "loss/crossentropy": 2.690248489379883, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1748548224568367, "step": 17018 }, { "epoch": 0.531875, "grad_norm": 3.125, "grad_norm_var": 0.040999348958333334, "learning_rate": 0.0001, "loss": 5.9512, "loss/crossentropy": 2.7441134452819824, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16797682642936707, "step": 17020 }, { "epoch": 0.5319375, "grad_norm": 3.15625, "grad_norm_var": 0.03736063639322917, "learning_rate": 0.0001, "loss": 5.7569, "loss/crossentropy": 2.612870931625366, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16557930409908295, "step": 17022 }, { "epoch": 0.532, "grad_norm": 2.78125, "grad_norm_var": 0.029654947916666667, "learning_rate": 0.0001, "loss": 5.234, "loss/crossentropy": 2.329947352409363, "loss/hidden": 1.390625, "loss/jsd": 0.0, "loss/logits": 0.15133944153785706, "step": 17024 }, { "epoch": 0.5320625, "grad_norm": 3.46875, "grad_norm_var": 0.030671183268229166, "learning_rate": 0.0001, "loss": 6.0237, "loss/crossentropy": 2.7970874309539795, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17343772947788239, "step": 17026 }, { "epoch": 0.532125, "grad_norm": 3.03125, "grad_norm_var": 0.03131103515625, "learning_rate": 0.0001, "loss": 5.8274, "loss/crossentropy": 2.6069366931915283, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1732211410999298, "step": 17028 }, { "epoch": 0.5321875, "grad_norm": 3.0, "grad_norm_var": 0.027082316080729165, "learning_rate": 0.0001, "loss": 5.7569, "loss/crossentropy": 2.6359177827835083, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16639314591884613, "step": 17030 }, { "epoch": 0.53225, "grad_norm": 3.1875, "grad_norm_var": 0.025813802083333334, "learning_rate": 0.0001, "loss": 5.764, "loss/crossentropy": 2.572619676589966, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1710943728685379, "step": 17032 }, { "epoch": 0.5323125, "grad_norm": 3.546875, "grad_norm_var": 0.033219401041666666, "learning_rate": 0.0001, "loss": 5.9794, "loss/crossentropy": 2.7105066776275635, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1749315857887268, "step": 17034 }, { "epoch": 0.532375, "grad_norm": 3.046875, "grad_norm_var": 0.03469645182291667, "learning_rate": 0.0001, "loss": 5.5228, "loss/crossentropy": 2.423842668533325, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16106973588466644, "step": 17036 }, { "epoch": 0.5324375, "grad_norm": 3.046875, "grad_norm_var": 0.03502197265625, "learning_rate": 0.0001, "loss": 5.6784, "loss/crossentropy": 2.511347532272339, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16904564201831818, "step": 17038 }, { "epoch": 0.5325, "grad_norm": 2.859375, "grad_norm_var": 0.0315582275390625, "learning_rate": 0.0001, "loss": 5.4597, "loss/crossentropy": 2.4301384687423706, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15881867706775665, "step": 17040 }, { "epoch": 0.5325625, "grad_norm": 3.0, "grad_norm_var": 0.027925618489583335, "learning_rate": 0.0001, "loss": 5.5839, "loss/crossentropy": 2.4871020317077637, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16085658222436905, "step": 17042 }, { "epoch": 0.532625, "grad_norm": 2.984375, "grad_norm_var": 0.024300130208333333, "learning_rate": 0.0001, "loss": 5.6928, "loss/crossentropy": 2.5313451290130615, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16770461946725845, "step": 17044 }, { "epoch": 0.5326875, "grad_norm": 3.0, "grad_norm_var": 0.0280426025390625, "learning_rate": 0.0001, "loss": 5.8872, "loss/crossentropy": 2.642845630645752, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17209375649690628, "step": 17046 }, { "epoch": 0.53275, "grad_norm": 3.15625, "grad_norm_var": 0.0261871337890625, "learning_rate": 0.0001, "loss": 5.7036, "loss/crossentropy": 2.4725565910339355, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17310353368520737, "step": 17048 }, { "epoch": 0.5328125, "grad_norm": 3.0, "grad_norm_var": 0.012645467122395834, "learning_rate": 0.0001, "loss": 5.7822, "loss/crossentropy": 2.6032555103302, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17219020426273346, "step": 17050 }, { "epoch": 0.532875, "grad_norm": 3.359375, "grad_norm_var": 0.018097941080729166, "learning_rate": 0.0001, "loss": 5.8159, "loss/crossentropy": 2.618359684944153, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17092972993850708, "step": 17052 }, { "epoch": 0.5329375, "grad_norm": 3.140625, "grad_norm_var": 0.021532185872395835, "learning_rate": 0.0001, "loss": 5.6609, "loss/crossentropy": 2.5374127626419067, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.163516603410244, "step": 17054 }, { "epoch": 0.533, "grad_norm": 3.46875, "grad_norm_var": 0.030296834309895833, "learning_rate": 0.0001, "loss": 5.6632, "loss/crossentropy": 2.448095440864563, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16838687658309937, "step": 17056 }, { "epoch": 0.5330625, "grad_norm": 3.125, "grad_norm_var": 0.028971354166666668, "learning_rate": 0.0001, "loss": 5.7122, "loss/crossentropy": 2.5715551376342773, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1671854928135872, "step": 17058 }, { "epoch": 0.533125, "grad_norm": 3.265625, "grad_norm_var": 0.02720947265625, "learning_rate": 0.0001, "loss": 5.9343, "loss/crossentropy": 2.7259186506271362, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17240135371685028, "step": 17060 }, { "epoch": 0.5331875, "grad_norm": 3.21875, "grad_norm_var": 0.023273722330729166, "learning_rate": 0.0001, "loss": 5.949, "loss/crossentropy": 2.677462577819824, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17598193883895874, "step": 17062 }, { "epoch": 0.53325, "grad_norm": 3.265625, "grad_norm_var": 0.021805826822916666, "learning_rate": 0.0001, "loss": 5.4291, "loss/crossentropy": 2.3025020360946655, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16343628615140915, "step": 17064 }, { "epoch": 0.5333125, "grad_norm": 3.234375, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 5.9242, "loss/crossentropy": 2.6051729917526245, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18229009956121445, "step": 17066 }, { "epoch": 0.533375, "grad_norm": 3.3125, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 5.6842, "loss/crossentropy": 2.5065290927886963, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16815276443958282, "step": 17068 }, { "epoch": 0.5334375, "grad_norm": 3.28125, "grad_norm_var": 0.013313802083333333, "learning_rate": 0.0001, "loss": 5.8038, "loss/crossentropy": 2.5707377195358276, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1733020544052124, "step": 17070 }, { "epoch": 0.5335, "grad_norm": 3.34375, "grad_norm_var": 0.014867146809895834, "learning_rate": 0.0001, "loss": 5.9117, "loss/crossentropy": 2.6048630475997925, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17911715060472488, "step": 17072 }, { "epoch": 0.5335625, "grad_norm": 3.34375, "grad_norm_var": 0.15386962890625, "learning_rate": 0.0001, "loss": 6.1994, "loss/crossentropy": 2.790488123893738, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18893983960151672, "step": 17074 }, { "epoch": 0.533625, "grad_norm": 3.15625, "grad_norm_var": 0.15868733723958334, "learning_rate": 0.0001, "loss": 5.6514, "loss/crossentropy": 2.46434485912323, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16792075335979462, "step": 17076 }, { "epoch": 0.5336875, "grad_norm": 3.0, "grad_norm_var": 0.16470438639322918, "learning_rate": 0.0001, "loss": 5.5249, "loss/crossentropy": 2.329636335372925, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1656169816851616, "step": 17078 }, { "epoch": 0.53375, "grad_norm": 2.84375, "grad_norm_var": 0.18338216145833333, "learning_rate": 0.0001, "loss": 5.5114, "loss/crossentropy": 2.4661262035369873, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16039124131202698, "step": 17080 }, { "epoch": 0.5338125, "grad_norm": 3.140625, "grad_norm_var": 0.18459370930989583, "learning_rate": 0.0001, "loss": 5.7988, "loss/crossentropy": 2.580346941947937, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17614497244358063, "step": 17082 }, { "epoch": 0.533875, "grad_norm": 3.03125, "grad_norm_var": 0.1836822509765625, "learning_rate": 0.0001, "loss": 5.4874, "loss/crossentropy": 2.4446455240249634, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15662309527397156, "step": 17084 }, { "epoch": 0.5339375, "grad_norm": 3.328125, "grad_norm_var": 0.19029032389322917, "learning_rate": 0.0001, "loss": 5.8172, "loss/crossentropy": 2.540899634361267, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17762987315654755, "step": 17086 }, { "epoch": 0.534, "grad_norm": 2.921875, "grad_norm_var": 0.19588216145833334, "learning_rate": 0.0001, "loss": 5.2604, "loss/crossentropy": 2.3293145895004272, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15092363953590393, "step": 17088 }, { "epoch": 0.5340625, "grad_norm": 3.140625, "grad_norm_var": 0.052464803059895836, "learning_rate": 0.0001, "loss": 5.7763, "loss/crossentropy": 2.595832109451294, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16999783366918564, "step": 17090 }, { "epoch": 0.534125, "grad_norm": 2.9375, "grad_norm_var": 0.05552469889322917, "learning_rate": 0.0001, "loss": 5.5164, "loss/crossentropy": 2.4288792610168457, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16070610284805298, "step": 17092 }, { "epoch": 0.5341875, "grad_norm": 3.125, "grad_norm_var": 0.05373942057291667, "learning_rate": 0.0001, "loss": 5.6126, "loss/crossentropy": 2.517060399055481, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1618964746594429, "step": 17094 }, { "epoch": 0.53425, "grad_norm": 3.421875, "grad_norm_var": 0.0662261962890625, "learning_rate": 0.0001, "loss": 6.0459, "loss/crossentropy": 2.7155656814575195, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18224790692329407, "step": 17096 }, { "epoch": 0.5343125, "grad_norm": 3.171875, "grad_norm_var": 0.06583658854166667, "learning_rate": 0.0001, "loss": 6.0407, "loss/crossentropy": 2.8346915245056152, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17255517095327377, "step": 17098 }, { "epoch": 0.534375, "grad_norm": 3.125, "grad_norm_var": 0.06780192057291666, "learning_rate": 0.0001, "loss": 5.7816, "loss/crossentropy": 2.6422271728515625, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1670575961470604, "step": 17100 }, { "epoch": 0.5344375, "grad_norm": 3.171875, "grad_norm_var": 0.05739644368489583, "learning_rate": 0.0001, "loss": 5.8924, "loss/crossentropy": 2.6804498434066772, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16884919255971909, "step": 17102 }, { "epoch": 0.5345, "grad_norm": 3.453125, "grad_norm_var": 0.055257161458333336, "learning_rate": 0.0001, "loss": 5.9318, "loss/crossentropy": 2.646846652030945, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1784997135400772, "step": 17104 }, { "epoch": 0.5345625, "grad_norm": 3.515625, "grad_norm_var": 0.04442952473958333, "learning_rate": 0.0001, "loss": 5.5162, "loss/crossentropy": 2.4204429388046265, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.162314772605896, "step": 17106 }, { "epoch": 0.534625, "grad_norm": 3.609375, "grad_norm_var": 0.05807291666666667, "learning_rate": 0.0001, "loss": 5.9085, "loss/crossentropy": 2.5476585626602173, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18139329552650452, "step": 17108 }, { "epoch": 0.5346875, "grad_norm": 2.9375, "grad_norm_var": 0.06311442057291666, "learning_rate": 0.0001, "loss": 5.525, "loss/crossentropy": 2.4291800260543823, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16036456823349, "step": 17110 }, { "epoch": 0.53475, "grad_norm": 3.265625, "grad_norm_var": 0.05123697916666667, "learning_rate": 0.0001, "loss": 5.8608, "loss/crossentropy": 2.63621985912323, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1716729998588562, "step": 17112 }, { "epoch": 0.5348125, "grad_norm": 2.84375, "grad_norm_var": 0.062418619791666664, "learning_rate": 0.0001, "loss": 5.8591, "loss/crossentropy": 2.6378746032714844, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17485301196575165, "step": 17114 }, { "epoch": 0.534875, "grad_norm": 3.46875, "grad_norm_var": 0.057470703125, "learning_rate": 0.0001, "loss": 5.8118, "loss/crossentropy": 2.553487181663513, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17583421617746353, "step": 17116 }, { "epoch": 0.5349375, "grad_norm": 2.859375, "grad_norm_var": 0.0693359375, "learning_rate": 0.0001, "loss": 5.8919, "loss/crossentropy": 2.6541894674301147, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17611850798130035, "step": 17118 }, { "epoch": 0.535, "grad_norm": 2.890625, "grad_norm_var": 0.07334696451822917, "learning_rate": 0.0001, "loss": 5.3697, "loss/crossentropy": 2.3797430992126465, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15446283668279648, "step": 17120 }, { "epoch": 0.5350625, "grad_norm": 3.140625, "grad_norm_var": 0.07300516764322916, "learning_rate": 0.0001, "loss": 5.486, "loss/crossentropy": 2.3793132305145264, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16222813725471497, "step": 17122 }, { "epoch": 0.535125, "grad_norm": 3.0, "grad_norm_var": 0.05357157389322917, "learning_rate": 0.0001, "loss": 5.9188, "loss/crossentropy": 2.6369359493255615, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17427990585565567, "step": 17124 }, { "epoch": 0.5351875, "grad_norm": 3.359375, "grad_norm_var": 0.055150349934895836, "learning_rate": 0.0001, "loss": 6.0767, "loss/crossentropy": 2.722614288330078, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18111039698123932, "step": 17126 }, { "epoch": 0.53525, "grad_norm": 3.203125, "grad_norm_var": 0.05728759765625, "learning_rate": 0.0001, "loss": 5.8847, "loss/crossentropy": 2.6280394792556763, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1780138835310936, "step": 17128 }, { "epoch": 0.5353125, "grad_norm": 3.140625, "grad_norm_var": 0.04834696451822917, "learning_rate": 0.0001, "loss": 5.7332, "loss/crossentropy": 2.53939950466156, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17132992297410965, "step": 17130 }, { "epoch": 0.535375, "grad_norm": 2.921875, "grad_norm_var": 0.049193318684895834, "learning_rate": 0.0001, "loss": 5.5148, "loss/crossentropy": 2.465831995010376, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1627052202820778, "step": 17132 }, { "epoch": 0.5354375, "grad_norm": 3.21875, "grad_norm_var": 0.04259440104166667, "learning_rate": 0.0001, "loss": 5.5225, "loss/crossentropy": 2.4369946718215942, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1644097939133644, "step": 17134 }, { "epoch": 0.5355, "grad_norm": 3.109375, "grad_norm_var": 0.03935546875, "learning_rate": 0.0001, "loss": 5.6981, "loss/crossentropy": 2.542572021484375, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16672320663928986, "step": 17136 }, { "epoch": 0.5355625, "grad_norm": 3.03125, "grad_norm_var": 0.037328084309895836, "learning_rate": 0.0001, "loss": 5.6913, "loss/crossentropy": 2.5692667961120605, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16571664810180664, "step": 17138 }, { "epoch": 0.535625, "grad_norm": 2.96875, "grad_norm_var": 0.030257161458333334, "learning_rate": 0.0001, "loss": 5.5473, "loss/crossentropy": 2.5101238489151, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15840435028076172, "step": 17140 }, { "epoch": 0.5356875, "grad_norm": 2.75, "grad_norm_var": 0.02789306640625, "learning_rate": 0.0001, "loss": 5.5962, "loss/crossentropy": 2.5291870832443237, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1598297655582428, "step": 17142 }, { "epoch": 0.53575, "grad_norm": 3.140625, "grad_norm_var": 0.025275675455729167, "learning_rate": 0.0001, "loss": 5.574, "loss/crossentropy": 2.5178639888763428, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16029997169971466, "step": 17144 }, { "epoch": 0.5358125, "grad_norm": 3.140625, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 5.8875, "loss/crossentropy": 2.584782123565674, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17909906804561615, "step": 17146 }, { "epoch": 0.535875, "grad_norm": 3.171875, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 5.8854, "loss/crossentropy": 2.757336974143982, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16671134531497955, "step": 17148 }, { "epoch": 0.5359375, "grad_norm": 3.203125, "grad_norm_var": 0.04638264973958333, "learning_rate": 0.0001, "loss": 5.4471, "loss/crossentropy": 2.356896758079529, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16331280767917633, "step": 17150 }, { "epoch": 0.536, "grad_norm": 2.984375, "grad_norm_var": 0.047749837239583336, "learning_rate": 0.0001, "loss": 5.4821, "loss/crossentropy": 2.4455692768096924, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15990079194307327, "step": 17152 }, { "epoch": 0.5360625, "grad_norm": 3.375, "grad_norm_var": 0.05371805826822917, "learning_rate": 0.0001, "loss": 5.7659, "loss/crossentropy": 2.558240294456482, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16920633614063263, "step": 17154 }, { "epoch": 0.536125, "grad_norm": 3.21875, "grad_norm_var": 0.0527252197265625, "learning_rate": 0.0001, "loss": 6.0016, "loss/crossentropy": 2.7091336250305176, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1788558065891266, "step": 17156 }, { "epoch": 0.5361875, "grad_norm": 3.28125, "grad_norm_var": 0.04584859212239583, "learning_rate": 0.0001, "loss": 5.8222, "loss/crossentropy": 2.5406280755996704, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17580938339233398, "step": 17158 }, { "epoch": 0.53625, "grad_norm": 3.09375, "grad_norm_var": 0.07486063639322917, "learning_rate": 0.0001, "loss": 5.6154, "loss/crossentropy": 2.43525767326355, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16996947675943375, "step": 17160 }, { "epoch": 0.5363125, "grad_norm": 3.15625, "grad_norm_var": 0.07149149576822916, "learning_rate": 0.0001, "loss": 5.6441, "loss/crossentropy": 2.510724902153015, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16607657074928284, "step": 17162 }, { "epoch": 0.536375, "grad_norm": 3.015625, "grad_norm_var": 0.0732818603515625, "learning_rate": 0.0001, "loss": 5.4909, "loss/crossentropy": 2.316679298877716, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1662546843290329, "step": 17164 }, { "epoch": 0.5364375, "grad_norm": 3.421875, "grad_norm_var": 0.0728668212890625, "learning_rate": 0.0001, "loss": 5.8561, "loss/crossentropy": 2.56137752532959, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17595582455396652, "step": 17166 }, { "epoch": 0.5365, "grad_norm": 3.09375, "grad_norm_var": 0.0694976806640625, "learning_rate": 0.0001, "loss": 5.638, "loss/crossentropy": 2.509562849998474, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1624525785446167, "step": 17168 }, { "epoch": 0.5365625, "grad_norm": 3.234375, "grad_norm_var": 0.06886393229166667, "learning_rate": 0.0001, "loss": 5.6331, "loss/crossentropy": 2.5476927757263184, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1604909971356392, "step": 17170 }, { "epoch": 0.536625, "grad_norm": 2.9375, "grad_norm_var": 0.07893473307291667, "learning_rate": 0.0001, "loss": 5.5707, "loss/crossentropy": 2.5319403409957886, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15700259804725647, "step": 17172 }, { "epoch": 0.5366875, "grad_norm": 2.890625, "grad_norm_var": 0.08466389973958334, "learning_rate": 0.0001, "loss": 5.5913, "loss/crossentropy": 2.496548056602478, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1622052937746048, "step": 17174 }, { "epoch": 0.53675, "grad_norm": 3.765625, "grad_norm_var": 0.06318257649739584, "learning_rate": 0.0001, "loss": 5.779, "loss/crossentropy": 2.561243414878845, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1721685826778412, "step": 17176 }, { "epoch": 0.5368125, "grad_norm": 3.1875, "grad_norm_var": 0.061644490559895834, "learning_rate": 0.0001, "loss": 5.7119, "loss/crossentropy": 2.5556975603103638, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17030397802591324, "step": 17178 }, { "epoch": 0.536875, "grad_norm": 3.203125, "grad_norm_var": 0.06154683430989583, "learning_rate": 0.0001, "loss": 5.8396, "loss/crossentropy": 2.5738922357559204, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1777389496564865, "step": 17180 }, { "epoch": 0.5369375, "grad_norm": 3.265625, "grad_norm_var": 0.052668253580729164, "learning_rate": 0.0001, "loss": 5.898, "loss/crossentropy": 2.605117678642273, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17850348353385925, "step": 17182 }, { "epoch": 0.537, "grad_norm": 3.5, "grad_norm_var": 0.0634918212890625, "learning_rate": 0.0001, "loss": 5.6326, "loss/crossentropy": 2.5269027948379517, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16408953815698624, "step": 17184 }, { "epoch": 0.5370625, "grad_norm": 3.0, "grad_norm_var": 0.07398681640625, "learning_rate": 0.0001, "loss": 5.385, "loss/crossentropy": 2.3865991830825806, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15608975291252136, "step": 17186 }, { "epoch": 0.537125, "grad_norm": 2.90625, "grad_norm_var": 0.07485249837239584, "learning_rate": 0.0001, "loss": 5.3913, "loss/crossentropy": 2.3155194520950317, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16187888383865356, "step": 17188 }, { "epoch": 0.5371875, "grad_norm": 3.140625, "grad_norm_var": 0.07071024576822917, "learning_rate": 0.0001, "loss": 5.6339, "loss/crossentropy": 2.4698076248168945, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1675824075937271, "step": 17190 }, { "epoch": 0.53725, "grad_norm": 3.609375, "grad_norm_var": 0.0535064697265625, "learning_rate": 0.0001, "loss": 5.6603, "loss/crossentropy": 2.5393755435943604, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16755837947130203, "step": 17192 }, { "epoch": 0.5373125, "grad_norm": 3.265625, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 5.8479, "loss/crossentropy": 2.6371690034866333, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1745872050523758, "step": 17194 }, { "epoch": 0.537375, "grad_norm": 3.375, "grad_norm_var": 0.06249593098958333, "learning_rate": 0.0001, "loss": 5.7458, "loss/crossentropy": 2.528953194618225, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16817187517881393, "step": 17196 }, { "epoch": 0.5374375, "grad_norm": 3.171875, "grad_norm_var": 0.06855367024739584, "learning_rate": 0.0001, "loss": 5.7345, "loss/crossentropy": 2.479103684425354, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17436964064836502, "step": 17198 }, { "epoch": 0.5375, "grad_norm": 3.0625, "grad_norm_var": 0.05607808430989583, "learning_rate": 0.0001, "loss": 5.8815, "loss/crossentropy": 2.6622745990753174, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17231357842683792, "step": 17200 }, { "epoch": 0.5375625, "grad_norm": 2.9375, "grad_norm_var": 0.0782135009765625, "learning_rate": 0.0001, "loss": 5.8238, "loss/crossentropy": 2.6184778213500977, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16974829882383347, "step": 17202 }, { "epoch": 0.537625, "grad_norm": 3.0, "grad_norm_var": 0.0746490478515625, "learning_rate": 0.0001, "loss": 5.5295, "loss/crossentropy": 2.4015225172042847, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16591988503932953, "step": 17204 }, { "epoch": 0.5376875, "grad_norm": 3.6875, "grad_norm_var": 0.08297526041666667, "learning_rate": 0.0001, "loss": 6.0323, "loss/crossentropy": 2.739021420478821, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18011116981506348, "step": 17206 }, { "epoch": 0.53775, "grad_norm": 3.1875, "grad_norm_var": 0.07071940104166667, "learning_rate": 0.0001, "loss": 5.8506, "loss/crossentropy": 2.689476251602173, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16885115206241608, "step": 17208 }, { "epoch": 0.5378125, "grad_norm": 3.421875, "grad_norm_var": 0.06568094889322916, "learning_rate": 0.0001, "loss": 6.0803, "loss/crossentropy": 2.750085473060608, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18145929276943207, "step": 17210 }, { "epoch": 0.537875, "grad_norm": 2.75, "grad_norm_var": 0.08286031087239583, "learning_rate": 0.0001, "loss": 5.185, "loss/crossentropy": 2.2621421813964844, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14932042360305786, "step": 17212 }, { "epoch": 0.5379375, "grad_norm": 3.09375, "grad_norm_var": 0.0863433837890625, "learning_rate": 0.0001, "loss": 5.6305, "loss/crossentropy": 2.48822021484375, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17047961056232452, "step": 17214 }, { "epoch": 0.538, "grad_norm": 2.9375, "grad_norm_var": 0.0932037353515625, "learning_rate": 0.0001, "loss": 5.6364, "loss/crossentropy": 2.4415475130081177, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16909129917621613, "step": 17216 }, { "epoch": 0.5380625, "grad_norm": 3.53125, "grad_norm_var": 0.061278279622395834, "learning_rate": 0.0001, "loss": 6.0746, "loss/crossentropy": 2.7563360929489136, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17830930650234222, "step": 17218 }, { "epoch": 0.538125, "grad_norm": 3.171875, "grad_norm_var": 0.06055399576822917, "learning_rate": 0.0001, "loss": 5.4381, "loss/crossentropy": 2.384289860725403, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16085246950387955, "step": 17220 }, { "epoch": 0.5381875, "grad_norm": 3.078125, "grad_norm_var": 0.04368387858072917, "learning_rate": 0.0001, "loss": 5.5677, "loss/crossentropy": 2.4485658407211304, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1638677716255188, "step": 17222 }, { "epoch": 0.53825, "grad_norm": 3.28125, "grad_norm_var": 0.045670572916666666, "learning_rate": 0.0001, "loss": 5.7756, "loss/crossentropy": 2.669786214828491, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16409556567668915, "step": 17224 }, { "epoch": 0.5383125, "grad_norm": 3.140625, "grad_norm_var": 0.041890462239583336, "learning_rate": 0.0001, "loss": 5.7154, "loss/crossentropy": 2.5550438165664673, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16603317111730576, "step": 17226 }, { "epoch": 0.538375, "grad_norm": 3.3125, "grad_norm_var": 0.033299763997395836, "learning_rate": 0.0001, "loss": 5.7385, "loss/crossentropy": 2.474865198135376, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1767580509185791, "step": 17228 }, { "epoch": 0.5384375, "grad_norm": 2.9375, "grad_norm_var": 0.036942545572916666, "learning_rate": 0.0001, "loss": 5.6299, "loss/crossentropy": 2.5964972972869873, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1603715643286705, "step": 17230 }, { "epoch": 0.5385, "grad_norm": 3.265625, "grad_norm_var": 0.031103515625, "learning_rate": 0.0001, "loss": 5.5366, "loss/crossentropy": 2.4126774072647095, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1659061163663864, "step": 17232 }, { "epoch": 0.5385625, "grad_norm": 3.0625, "grad_norm_var": 0.02265625, "learning_rate": 0.0001, "loss": 5.6391, "loss/crossentropy": 2.5527502298355103, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1609746515750885, "step": 17234 }, { "epoch": 0.538625, "grad_norm": 3.328125, "grad_norm_var": 0.024559529622395833, "learning_rate": 0.0001, "loss": 5.9366, "loss/crossentropy": 2.702728271484375, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17690350860357285, "step": 17236 }, { "epoch": 0.5386875, "grad_norm": 2.953125, "grad_norm_var": 0.027171834309895834, "learning_rate": 0.0001, "loss": 5.7457, "loss/crossentropy": 2.6052632331848145, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1671736240386963, "step": 17238 }, { "epoch": 0.53875, "grad_norm": 3.046875, "grad_norm_var": 0.025706990559895834, "learning_rate": 0.0001, "loss": 5.8378, "loss/crossentropy": 2.632414698600769, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17600620537996292, "step": 17240 }, { "epoch": 0.5388125, "grad_norm": 3.40625, "grad_norm_var": 0.027372233072916665, "learning_rate": 0.0001, "loss": 5.9147, "loss/crossentropy": 2.611965537071228, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18144212663173676, "step": 17242 }, { "epoch": 0.538875, "grad_norm": 3.0625, "grad_norm_var": 0.027372233072916665, "learning_rate": 0.0001, "loss": 5.3358, "loss/crossentropy": 2.3390179872512817, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15436232835054398, "step": 17244 }, { "epoch": 0.5389375, "grad_norm": 3.25, "grad_norm_var": 0.024193318684895833, "learning_rate": 0.0001, "loss": 6.0388, "loss/crossentropy": 2.7092678546905518, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17904328554868698, "step": 17246 }, { "epoch": 0.539, "grad_norm": 2.875, "grad_norm_var": 0.027815755208333334, "learning_rate": 0.0001, "loss": 5.5042, "loss/crossentropy": 2.4605921506881714, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1621706485748291, "step": 17248 }, { "epoch": 0.5390625, "grad_norm": 3.109375, "grad_norm_var": 0.03173828125, "learning_rate": 0.0001, "loss": 6.1756, "loss/crossentropy": 2.8003345727920532, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18284225463867188, "step": 17250 }, { "epoch": 0.539125, "grad_norm": 3.453125, "grad_norm_var": 0.03570556640625, "learning_rate": 0.0001, "loss": 6.1073, "loss/crossentropy": 2.734965682029724, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1833227127790451, "step": 17252 }, { "epoch": 0.5391875, "grad_norm": 3.140625, "grad_norm_var": 0.031201171875, "learning_rate": 0.0001, "loss": 5.496, "loss/crossentropy": 2.426445722579956, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15929921716451645, "step": 17254 }, { "epoch": 0.53925, "grad_norm": 3.125, "grad_norm_var": 0.03459370930989583, "learning_rate": 0.0001, "loss": 5.4814, "loss/crossentropy": 2.4185190200805664, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15902447700500488, "step": 17256 }, { "epoch": 0.5393125, "grad_norm": 3.046875, "grad_norm_var": 0.031208292643229166, "learning_rate": 0.0001, "loss": 5.7525, "loss/crossentropy": 2.623996615409851, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16402489691972733, "step": 17258 }, { "epoch": 0.539375, "grad_norm": 3.265625, "grad_norm_var": 0.030394490559895834, "learning_rate": 0.0001, "loss": 5.3717, "loss/crossentropy": 2.283425211906433, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16546471416950226, "step": 17260 }, { "epoch": 0.5394375, "grad_norm": 3.375, "grad_norm_var": 0.12984619140625, "learning_rate": 0.0001, "loss": 5.9672, "loss/crossentropy": 2.675995349884033, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17794667184352875, "step": 17262 }, { "epoch": 0.5395, "grad_norm": 3.046875, "grad_norm_var": 0.1250640869140625, "learning_rate": 0.0001, "loss": 5.7718, "loss/crossentropy": 2.583772897720337, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16880057007074356, "step": 17264 }, { "epoch": 0.5395625, "grad_norm": 3.21875, "grad_norm_var": 0.12254231770833333, "learning_rate": 0.0001, "loss": 6.3176, "loss/crossentropy": 2.9289597272872925, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18808703124523163, "step": 17266 }, { "epoch": 0.539625, "grad_norm": 3.734375, "grad_norm_var": 0.13996480305989584, "learning_rate": 0.0001, "loss": 5.7947, "loss/crossentropy": 2.5368372201919556, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17618079483509064, "step": 17268 }, { "epoch": 0.5396875, "grad_norm": 3.40625, "grad_norm_var": 0.14003804524739583, "learning_rate": 0.0001, "loss": 5.8048, "loss/crossentropy": 2.4929646253585815, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18001490086317062, "step": 17270 }, { "epoch": 0.53975, "grad_norm": 3.171875, "grad_norm_var": 0.1302642822265625, "learning_rate": 0.0001, "loss": 5.8942, "loss/crossentropy": 2.701943278312683, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16961456090211868, "step": 17272 }, { "epoch": 0.5398125, "grad_norm": 3.015625, "grad_norm_var": 0.1318511962890625, "learning_rate": 0.0001, "loss": 5.328, "loss/crossentropy": 2.232863187789917, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.15755825489759445, "step": 17274 }, { "epoch": 0.539875, "grad_norm": 3.5, "grad_norm_var": 0.132177734375, "learning_rate": 0.0001, "loss": 5.6895, "loss/crossentropy": 2.518621563911438, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16747979819774628, "step": 17276 }, { "epoch": 0.5399375, "grad_norm": 3.3125, "grad_norm_var": 0.0429595947265625, "learning_rate": 0.0001, "loss": 5.5796, "loss/crossentropy": 2.4978870153427124, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1624642163515091, "step": 17278 }, { "epoch": 0.54, "grad_norm": 2.828125, "grad_norm_var": 0.04932352701822917, "learning_rate": 0.0001, "loss": 5.3894, "loss/crossentropy": 2.368393898010254, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1567913517355919, "step": 17280 }, { "epoch": 0.5400625, "grad_norm": 3.328125, "grad_norm_var": 0.049738566080729164, "learning_rate": 0.0001, "loss": 5.7971, "loss/crossentropy": 2.4818142652511597, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17879413068294525, "step": 17282 }, { "epoch": 0.540125, "grad_norm": 3.046875, "grad_norm_var": 0.03406473795572917, "learning_rate": 0.0001, "loss": 5.7518, "loss/crossentropy": 2.5515873432159424, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16650700569152832, "step": 17284 }, { "epoch": 0.5401875, "grad_norm": 3.828125, "grad_norm_var": 0.07256571451822917, "learning_rate": 0.0001, "loss": 6.216, "loss/crossentropy": 2.8061646223068237, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.18395615369081497, "step": 17286 }, { "epoch": 0.54025, "grad_norm": 3.5625, "grad_norm_var": 0.09384663899739583, "learning_rate": 0.0001, "loss": 5.8815, "loss/crossentropy": 2.5948829650878906, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17319124191999435, "step": 17288 }, { "epoch": 0.5403125, "grad_norm": 3.015625, "grad_norm_var": 0.092529296875, "learning_rate": 0.0001, "loss": 5.5779, "loss/crossentropy": 2.507999897003174, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.160506471991539, "step": 17290 }, { "epoch": 0.540375, "grad_norm": 3.265625, "grad_norm_var": 0.09810791015625, "learning_rate": 0.0001, "loss": 5.6686, "loss/crossentropy": 2.5252314805984497, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16785302013158798, "step": 17292 }, { "epoch": 0.5404375, "grad_norm": 3.203125, "grad_norm_var": 0.09640299479166667, "learning_rate": 0.0001, "loss": 5.9182, "loss/crossentropy": 2.676273226737976, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1741974800825119, "step": 17294 }, { "epoch": 0.5405, "grad_norm": 2.859375, "grad_norm_var": 0.09888916015625, "learning_rate": 0.0001, "loss": 5.1875, "loss/crossentropy": 2.3228524923324585, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1442803516983986, "step": 17296 }, { "epoch": 0.5405625, "grad_norm": 3.21875, "grad_norm_var": 0.0994293212890625, "learning_rate": 0.0001, "loss": 5.5537, "loss/crossentropy": 2.42146372795105, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16400451213121414, "step": 17298 }, { "epoch": 0.540625, "grad_norm": 3.0625, "grad_norm_var": 0.09726460774739583, "learning_rate": 0.0001, "loss": 5.7194, "loss/crossentropy": 2.5954188108444214, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16473744064569473, "step": 17300 }, { "epoch": 0.5406875, "grad_norm": 3.15625, "grad_norm_var": 0.05632222493489583, "learning_rate": 0.0001, "loss": 5.8397, "loss/crossentropy": 2.7246214151382446, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16580431908369064, "step": 17302 }, { "epoch": 0.54075, "grad_norm": 3.078125, "grad_norm_var": 0.015672810872395835, "learning_rate": 0.0001, "loss": 5.3303, "loss/crossentropy": 2.2647517919540405, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15889877825975418, "step": 17304 }, { "epoch": 0.5408125, "grad_norm": 3.125, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 5.6368, "loss/crossentropy": 2.5183417797088623, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16379909217357635, "step": 17306 }, { "epoch": 0.540875, "grad_norm": 3.21875, "grad_norm_var": 0.017113240559895833, "learning_rate": 0.0001, "loss": 6.0625, "loss/crossentropy": 2.7601245641708374, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17906256020069122, "step": 17308 }, { "epoch": 0.5409375, "grad_norm": 3.0625, "grad_norm_var": 0.016487630208333333, "learning_rate": 0.0001, "loss": 5.7098, "loss/crossentropy": 2.5399783849716187, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1704939901828766, "step": 17310 }, { "epoch": 0.541, "grad_norm": 4.0, "grad_norm_var": 0.05878499348958333, "learning_rate": 0.0001, "loss": 5.8959, "loss/crossentropy": 2.6711167097091675, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1713021695613861, "step": 17312 }, { "epoch": 0.5410625, "grad_norm": 3.0, "grad_norm_var": 0.11057027180989583, "learning_rate": 0.0001, "loss": 6.0762, "loss/crossentropy": 2.7620147466659546, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18024317920207977, "step": 17314 }, { "epoch": 0.541125, "grad_norm": 3.3125, "grad_norm_var": 0.10852762858072916, "learning_rate": 0.0001, "loss": 5.5332, "loss/crossentropy": 2.3659234046936035, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16594360023736954, "step": 17316 }, { "epoch": 0.5411875, "grad_norm": 3.265625, "grad_norm_var": 0.10676167805989584, "learning_rate": 0.0001, "loss": 6.0635, "loss/crossentropy": 2.782996892929077, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17726892232894897, "step": 17318 }, { "epoch": 0.54125, "grad_norm": 3.28125, "grad_norm_var": 0.10407613118489584, "learning_rate": 0.0001, "loss": 5.6753, "loss/crossentropy": 2.587460994720459, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15722176432609558, "step": 17320 }, { "epoch": 0.5413125, "grad_norm": 3.03125, "grad_norm_var": 0.10534566243489583, "learning_rate": 0.0001, "loss": 5.5398, "loss/crossentropy": 2.4110758304595947, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16013644635677338, "step": 17322 }, { "epoch": 0.541375, "grad_norm": 3.421875, "grad_norm_var": 0.1089508056640625, "learning_rate": 0.0001, "loss": 5.8649, "loss/crossentropy": 2.5966124534606934, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17526501417160034, "step": 17324 }, { "epoch": 0.5414375, "grad_norm": 3.3125, "grad_norm_var": 0.11116434733072916, "learning_rate": 0.0001, "loss": 5.8719, "loss/crossentropy": 2.535356879234314, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17935575544834137, "step": 17326 }, { "epoch": 0.5415, "grad_norm": 3.3125, "grad_norm_var": 0.0928863525390625, "learning_rate": 0.0001, "loss": 5.3556, "loss/crossentropy": 2.3893096446990967, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15521960705518723, "step": 17328 }, { "epoch": 0.5415625, "grad_norm": 2.921875, "grad_norm_var": 0.06511942545572917, "learning_rate": 0.0001, "loss": 5.9359, "loss/crossentropy": 2.7159135341644287, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17239267379045486, "step": 17330 }, { "epoch": 0.541625, "grad_norm": 3.078125, "grad_norm_var": 0.07737223307291667, "learning_rate": 0.0001, "loss": 5.7177, "loss/crossentropy": 2.5383517742156982, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1675461083650589, "step": 17332 }, { "epoch": 0.5416875, "grad_norm": 3.34375, "grad_norm_var": 0.07959696451822916, "learning_rate": 0.0001, "loss": 5.5848, "loss/crossentropy": 2.3904402256011963, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17217034846544266, "step": 17334 }, { "epoch": 0.54175, "grad_norm": 3.03125, "grad_norm_var": 0.080126953125, "learning_rate": 0.0001, "loss": 5.7477, "loss/crossentropy": 2.5591678619384766, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17041853815317154, "step": 17336 }, { "epoch": 0.5418125, "grad_norm": 2.9375, "grad_norm_var": 0.090869140625, "learning_rate": 0.0001, "loss": 6.0124, "loss/crossentropy": 2.7603390216827393, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17364423722028732, "step": 17338 }, { "epoch": 0.541875, "grad_norm": 3.25, "grad_norm_var": 0.08847249348958333, "learning_rate": 0.0001, "loss": 5.8721, "loss/crossentropy": 2.6525124311447144, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17196206003427505, "step": 17340 }, { "epoch": 0.5419375, "grad_norm": 3.140625, "grad_norm_var": 0.071923828125, "learning_rate": 0.0001, "loss": 5.7777, "loss/crossentropy": 2.570147156715393, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16997401416301727, "step": 17342 }, { "epoch": 0.542, "grad_norm": 2.9375, "grad_norm_var": 0.07389322916666667, "learning_rate": 0.0001, "loss": 5.7201, "loss/crossentropy": 2.595253825187683, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16365204751491547, "step": 17344 }, { "epoch": 0.5420625, "grad_norm": 2.890625, "grad_norm_var": 0.06341044108072917, "learning_rate": 0.0001, "loss": 5.6749, "loss/crossentropy": 2.5351758003234863, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1635787934064865, "step": 17346 }, { "epoch": 0.542125, "grad_norm": 3.078125, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 5.8137, "loss/crossentropy": 2.6436641216278076, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17012985795736313, "step": 17348 }, { "epoch": 0.5421875, "grad_norm": 3.140625, "grad_norm_var": 0.044066365559895834, "learning_rate": 0.0001, "loss": 5.544, "loss/crossentropy": 2.4790529012680054, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1600080132484436, "step": 17350 }, { "epoch": 0.54225, "grad_norm": 2.890625, "grad_norm_var": 0.046533203125, "learning_rate": 0.0001, "loss": 5.5663, "loss/crossentropy": 2.4919902086257935, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1636843979358673, "step": 17352 }, { "epoch": 0.5423125, "grad_norm": 3.15625, "grad_norm_var": 0.0270660400390625, "learning_rate": 0.0001, "loss": 5.7964, "loss/crossentropy": 2.619598388671875, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16807474195957184, "step": 17354 }, { "epoch": 0.542375, "grad_norm": 3.1875, "grad_norm_var": 0.014045206705729167, "learning_rate": 0.0001, "loss": 5.5497, "loss/crossentropy": 2.488608479499817, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1557212769985199, "step": 17356 }, { "epoch": 0.5424375, "grad_norm": 2.921875, "grad_norm_var": 0.014058430989583334, "learning_rate": 0.0001, "loss": 5.4728, "loss/crossentropy": 2.4885571002960205, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15467827767133713, "step": 17358 }, { "epoch": 0.5425, "grad_norm": 3.328125, "grad_norm_var": 0.015941365559895834, "learning_rate": 0.0001, "loss": 5.8309, "loss/crossentropy": 2.5929763317108154, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17457586526870728, "step": 17360 }, { "epoch": 0.5425625, "grad_norm": 2.859375, "grad_norm_var": 0.01705322265625, "learning_rate": 0.0001, "loss": 5.6502, "loss/crossentropy": 2.5163358449935913, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1653357744216919, "step": 17362 }, { "epoch": 0.542625, "grad_norm": 3.046875, "grad_norm_var": 0.014969889322916667, "learning_rate": 0.0001, "loss": 5.9868, "loss/crossentropy": 2.7236123085021973, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17475664615631104, "step": 17364 }, { "epoch": 0.5426875, "grad_norm": 3.703125, "grad_norm_var": 0.04247639973958333, "learning_rate": 0.0001, "loss": 5.7169, "loss/crossentropy": 2.5115528106689453, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17404936999082565, "step": 17366 }, { "epoch": 0.54275, "grad_norm": 3.328125, "grad_norm_var": 0.04132486979166667, "learning_rate": 0.0001, "loss": 5.4933, "loss/crossentropy": 2.438134789466858, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1574733927845955, "step": 17368 }, { "epoch": 0.5428125, "grad_norm": 2.96875, "grad_norm_var": 0.04158426920572917, "learning_rate": 0.0001, "loss": 5.662, "loss/crossentropy": 2.606270432472229, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16260183602571487, "step": 17370 }, { "epoch": 0.542875, "grad_norm": 3.5, "grad_norm_var": 0.0514068603515625, "learning_rate": 0.0001, "loss": 5.7725, "loss/crossentropy": 2.52018940448761, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1717132329940796, "step": 17372 }, { "epoch": 0.5429375, "grad_norm": 3.078125, "grad_norm_var": 0.0446929931640625, "learning_rate": 0.0001, "loss": 5.8875, "loss/crossentropy": 2.6899309158325195, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1717073693871498, "step": 17374 }, { "epoch": 0.543, "grad_norm": 3.078125, "grad_norm_var": 0.043431599934895836, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.4854485988616943, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1604522168636322, "step": 17376 }, { "epoch": 0.5430625, "grad_norm": 3.15625, "grad_norm_var": 0.03583577473958333, "learning_rate": 0.0001, "loss": 5.6836, "loss/crossentropy": 2.4923148155212402, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17029929906129837, "step": 17378 }, { "epoch": 0.543125, "grad_norm": 3.0625, "grad_norm_var": 0.03453369140625, "learning_rate": 0.0001, "loss": 5.6612, "loss/crossentropy": 2.5236387252807617, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16571299731731415, "step": 17380 }, { "epoch": 0.5431875, "grad_norm": 3.0, "grad_norm_var": 0.0330963134765625, "learning_rate": 0.0001, "loss": 5.8322, "loss/crossentropy": 2.6035473346710205, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.171689473092556, "step": 17382 }, { "epoch": 0.54325, "grad_norm": 3.21875, "grad_norm_var": 0.0328033447265625, "learning_rate": 0.0001, "loss": 5.7477, "loss/crossentropy": 2.5189948081970215, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1748279556632042, "step": 17384 }, { "epoch": 0.5433125, "grad_norm": 3.03125, "grad_norm_var": 0.032389322916666664, "learning_rate": 0.0001, "loss": 5.5113, "loss/crossentropy": 2.313473343849182, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17251309007406235, "step": 17386 }, { "epoch": 0.543375, "grad_norm": 2.921875, "grad_norm_var": 0.029150390625, "learning_rate": 0.0001, "loss": 5.7159, "loss/crossentropy": 2.5559000968933105, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16756704449653625, "step": 17388 }, { "epoch": 0.5434375, "grad_norm": 2.9375, "grad_norm_var": 0.0356842041015625, "learning_rate": 0.0001, "loss": 5.5085, "loss/crossentropy": 2.4819912910461426, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15811563283205032, "step": 17390 }, { "epoch": 0.5435, "grad_norm": 3.484375, "grad_norm_var": 0.0461090087890625, "learning_rate": 0.0001, "loss": 5.5871, "loss/crossentropy": 2.3881219625473022, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16755874454975128, "step": 17392 }, { "epoch": 0.5435625, "grad_norm": 2.84375, "grad_norm_var": 0.05329488118489583, "learning_rate": 0.0001, "loss": 5.2546, "loss/crossentropy": 2.2453823685646057, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15053172409534454, "step": 17394 }, { "epoch": 0.543625, "grad_norm": 3.21875, "grad_norm_var": 0.052489217122395834, "learning_rate": 0.0001, "loss": 5.8638, "loss/crossentropy": 2.674225926399231, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17091196030378342, "step": 17396 }, { "epoch": 0.5436875, "grad_norm": 3.78125, "grad_norm_var": 0.06104227701822917, "learning_rate": 0.0001, "loss": 5.9071, "loss/crossentropy": 2.685681104660034, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1733183041214943, "step": 17398 }, { "epoch": 0.54375, "grad_norm": 3.734375, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 5.8124, "loss/crossentropy": 2.4964096546173096, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17925040423870087, "step": 17400 }, { "epoch": 0.5438125, "grad_norm": 2.984375, "grad_norm_var": 0.1107574462890625, "learning_rate": 0.0001, "loss": 5.6789, "loss/crossentropy": 2.5261625051498413, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16878822445869446, "step": 17402 }, { "epoch": 0.543875, "grad_norm": 3.25, "grad_norm_var": 0.10347900390625, "learning_rate": 0.0001, "loss": 5.7427, "loss/crossentropy": 2.527507185935974, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1726917028427124, "step": 17404 }, { "epoch": 0.5439375, "grad_norm": 3.359375, "grad_norm_var": 0.089404296875, "learning_rate": 0.0001, "loss": 5.6494, "loss/crossentropy": 2.5336248874664307, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16626831889152527, "step": 17406 }, { "epoch": 0.544, "grad_norm": 3.1875, "grad_norm_var": 0.08929036458333334, "learning_rate": 0.0001, "loss": 5.783, "loss/crossentropy": 2.623347759246826, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16908900439739227, "step": 17408 }, { "epoch": 0.5440625, "grad_norm": 2.90625, "grad_norm_var": 0.08642476399739583, "learning_rate": 0.0001, "loss": 5.5582, "loss/crossentropy": 2.471962809562683, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16526072472333908, "step": 17410 }, { "epoch": 0.544125, "grad_norm": 3.0, "grad_norm_var": 0.0893707275390625, "learning_rate": 0.0001, "loss": 5.6715, "loss/crossentropy": 2.559972047805786, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1650635376572609, "step": 17412 }, { "epoch": 0.5441875, "grad_norm": 2.859375, "grad_norm_var": 0.075146484375, "learning_rate": 0.0001, "loss": 5.6099, "loss/crossentropy": 2.588362455368042, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15762421488761902, "step": 17414 }, { "epoch": 0.54425, "grad_norm": 2.953125, "grad_norm_var": 0.023193359375, "learning_rate": 0.0001, "loss": 5.4951, "loss/crossentropy": 2.460350275039673, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15972907841205597, "step": 17416 }, { "epoch": 0.5443125, "grad_norm": 3.234375, "grad_norm_var": 0.022412109375, "learning_rate": 0.0001, "loss": 5.9098, "loss/crossentropy": 2.674430251121521, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17353634536266327, "step": 17418 }, { "epoch": 0.544375, "grad_norm": 3.265625, "grad_norm_var": 0.026447550455729166, "learning_rate": 0.0001, "loss": 5.7901, "loss/crossentropy": 2.562321186065674, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17160680145025253, "step": 17420 }, { "epoch": 0.5444375, "grad_norm": 3.203125, "grad_norm_var": 0.022980753580729166, "learning_rate": 0.0001, "loss": 5.9997, "loss/crossentropy": 2.7496341466903687, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17305376380681992, "step": 17422 }, { "epoch": 0.5445, "grad_norm": 3.125, "grad_norm_var": 0.025715128580729166, "learning_rate": 0.0001, "loss": 5.5739, "loss/crossentropy": 2.4427828788757324, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1674105003476143, "step": 17424 }, { "epoch": 0.5445625, "grad_norm": 3.203125, "grad_norm_var": 0.025, "learning_rate": 0.0001, "loss": 5.6343, "loss/crossentropy": 2.4660485982894897, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16799865663051605, "step": 17426 }, { "epoch": 0.544625, "grad_norm": 3.25, "grad_norm_var": 0.026883951822916665, "learning_rate": 0.0001, "loss": 5.8551, "loss/crossentropy": 2.63783597946167, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1736757457256317, "step": 17428 }, { "epoch": 0.5446875, "grad_norm": 3.578125, "grad_norm_var": 0.0374420166015625, "learning_rate": 0.0001, "loss": 5.9258, "loss/crossentropy": 2.6419161558151245, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1744867041707039, "step": 17430 }, { "epoch": 0.54475, "grad_norm": 3.0625, "grad_norm_var": 0.028548177083333334, "learning_rate": 0.0001, "loss": 5.9458, "loss/crossentropy": 2.711817502975464, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17339904606342316, "step": 17432 }, { "epoch": 0.5448125, "grad_norm": 3.1875, "grad_norm_var": 0.0311431884765625, "learning_rate": 0.0001, "loss": 5.771, "loss/crossentropy": 2.637289881706238, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16493095457553864, "step": 17434 }, { "epoch": 0.544875, "grad_norm": 3.265625, "grad_norm_var": 0.0318756103515625, "learning_rate": 0.0001, "loss": 6.0217, "loss/crossentropy": 2.7659993171691895, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17478535324335098, "step": 17436 }, { "epoch": 0.5449375, "grad_norm": 3.0625, "grad_norm_var": 0.034765625, "learning_rate": 0.0001, "loss": 5.7704, "loss/crossentropy": 2.5579363107681274, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17124298959970474, "step": 17438 }, { "epoch": 0.545, "grad_norm": 3.3125, "grad_norm_var": 0.029130045572916666, "learning_rate": 0.0001, "loss": 5.8219, "loss/crossentropy": 2.633441209793091, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16884766519069672, "step": 17440 }, { "epoch": 0.5450625, "grad_norm": 2.890625, "grad_norm_var": 0.036116536458333334, "learning_rate": 0.0001, "loss": 5.746, "loss/crossentropy": 2.5905871391296387, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16749078035354614, "step": 17442 }, { "epoch": 0.545125, "grad_norm": 3.0625, "grad_norm_var": 0.03466389973958333, "learning_rate": 0.0001, "loss": 5.4613, "loss/crossentropy": 2.308797836303711, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1679813116788864, "step": 17444 }, { "epoch": 0.5451875, "grad_norm": 3.328125, "grad_norm_var": 0.024344889322916667, "learning_rate": 0.0001, "loss": 6.0682, "loss/crossentropy": 2.7605719566345215, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17880941927433014, "step": 17446 }, { "epoch": 0.54525, "grad_norm": 3.0625, "grad_norm_var": 0.02330322265625, "learning_rate": 0.0001, "loss": 5.7359, "loss/crossentropy": 2.619320511817932, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1667317971587181, "step": 17448 }, { "epoch": 0.5453125, "grad_norm": 3.09375, "grad_norm_var": 0.022900390625, "learning_rate": 0.0001, "loss": 5.8526, "loss/crossentropy": 2.616609811782837, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17672811448574066, "step": 17450 }, { "epoch": 0.545375, "grad_norm": 3.09375, "grad_norm_var": 0.019563802083333335, "learning_rate": 0.0001, "loss": 5.7392, "loss/crossentropy": 2.5301653146743774, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17090477049350739, "step": 17452 }, { "epoch": 0.5454375, "grad_norm": 3.125, "grad_norm_var": 0.016402180989583334, "learning_rate": 0.0001, "loss": 5.8981, "loss/crossentropy": 2.665869116783142, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17166230827569962, "step": 17454 }, { "epoch": 0.5455, "grad_norm": 3.21875, "grad_norm_var": 0.013899739583333333, "learning_rate": 0.0001, "loss": 5.4047, "loss/crossentropy": 2.3639109134674072, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15798667073249817, "step": 17456 }, { "epoch": 0.5455625, "grad_norm": 2.90625, "grad_norm_var": 0.015583292643229166, "learning_rate": 0.0001, "loss": 5.3614, "loss/crossentropy": 2.4397586584091187, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1519285961985588, "step": 17458 }, { "epoch": 0.545625, "grad_norm": 3.125, "grad_norm_var": 0.014937337239583333, "learning_rate": 0.0001, "loss": 5.5904, "loss/crossentropy": 2.44307541847229, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1662917137145996, "step": 17460 }, { "epoch": 0.5456875, "grad_norm": 3.265625, "grad_norm_var": 0.012418619791666667, "learning_rate": 0.0001, "loss": 5.7021, "loss/crossentropy": 2.5171843767166138, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17122726887464523, "step": 17462 }, { "epoch": 0.54575, "grad_norm": 3.1875, "grad_norm_var": 0.011128743489583334, "learning_rate": 0.0001, "loss": 5.7291, "loss/crossentropy": 2.5654937028884888, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1679239571094513, "step": 17464 }, { "epoch": 0.5458125, "grad_norm": 3.125, "grad_norm_var": 0.0106842041015625, "learning_rate": 0.0001, "loss": 5.6877, "loss/crossentropy": 2.58706271648407, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1651432141661644, "step": 17466 }, { "epoch": 0.545875, "grad_norm": 3.140625, "grad_norm_var": 0.0110504150390625, "learning_rate": 0.0001, "loss": 6.1347, "loss/crossentropy": 2.8856621980667114, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17646629363298416, "step": 17468 }, { "epoch": 0.5459375, "grad_norm": 2.90625, "grad_norm_var": 0.016141764322916665, "learning_rate": 0.0001, "loss": 5.7553, "loss/crossentropy": 2.582043766975403, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16732431203126907, "step": 17470 }, { "epoch": 0.546, "grad_norm": 3.171875, "grad_norm_var": 0.01513671875, "learning_rate": 0.0001, "loss": 5.7062, "loss/crossentropy": 2.5217323303222656, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1696169525384903, "step": 17472 }, { "epoch": 0.5460625, "grad_norm": 4.0625, "grad_norm_var": 0.0642486572265625, "learning_rate": 0.0001, "loss": 5.6209, "loss/crossentropy": 2.4642462730407715, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1656637042760849, "step": 17474 }, { "epoch": 0.546125, "grad_norm": 3.15625, "grad_norm_var": 0.1303375244140625, "learning_rate": 0.0001, "loss": 5.8618, "loss/crossentropy": 2.577812671661377, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17878559976816177, "step": 17476 }, { "epoch": 0.5461875, "grad_norm": 3.0, "grad_norm_var": 0.13186442057291667, "learning_rate": 0.0001, "loss": 5.5949, "loss/crossentropy": 2.417052149772644, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16817185282707214, "step": 17478 }, { "epoch": 0.54625, "grad_norm": 2.890625, "grad_norm_var": 0.14026692708333333, "learning_rate": 0.0001, "loss": 5.5622, "loss/crossentropy": 2.492171883583069, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16169288754463196, "step": 17480 }, { "epoch": 0.5463125, "grad_norm": 3.078125, "grad_norm_var": 0.14573567708333332, "learning_rate": 0.0001, "loss": 5.331, "loss/crossentropy": 2.395987033843994, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15288100391626358, "step": 17482 }, { "epoch": 0.546375, "grad_norm": 3.3125, "grad_norm_var": 0.14401041666666667, "learning_rate": 0.0001, "loss": 5.749, "loss/crossentropy": 2.5485790967941284, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17004591971635818, "step": 17484 }, { "epoch": 0.5464375, "grad_norm": 3.28125, "grad_norm_var": 0.14600321451822917, "learning_rate": 0.0001, "loss": 5.4656, "loss/crossentropy": 2.3657922744750977, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16193493455648422, "step": 17486 }, { "epoch": 0.5465, "grad_norm": 3.359375, "grad_norm_var": 0.14504801432291667, "learning_rate": 0.0001, "loss": 5.671, "loss/crossentropy": 2.5214210748672485, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16691534966230392, "step": 17488 }, { "epoch": 0.5465625, "grad_norm": 8.6875, "grad_norm_var": 1.9765370686848958, "learning_rate": 0.0001, "loss": 6.0524, "loss/crossentropy": 2.622617244720459, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1921955570578575, "step": 17490 }, { "epoch": 0.546625, "grad_norm": 2.921875, "grad_norm_var": 1.9584309895833334, "learning_rate": 0.0001, "loss": 5.6667, "loss/crossentropy": 2.5299049615859985, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16797561943531036, "step": 17492 }, { "epoch": 0.5466875, "grad_norm": 3.46875, "grad_norm_var": 1.9653635660807292, "learning_rate": 0.0001, "loss": 5.5908, "loss/crossentropy": 2.408536672592163, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16705283522605896, "step": 17494 }, { "epoch": 0.54675, "grad_norm": 3.03125, "grad_norm_var": 1.96470947265625, "learning_rate": 0.0001, "loss": 5.8037, "loss/crossentropy": 2.660837769508362, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16663309931755066, "step": 17496 }, { "epoch": 0.5468125, "grad_norm": 3.046875, "grad_norm_var": 1.948583984375, "learning_rate": 0.0001, "loss": 5.9658, "loss/crossentropy": 2.700629472732544, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17573725432157516, "step": 17498 }, { "epoch": 0.546875, "grad_norm": 3.15625, "grad_norm_var": 1.9444173177083333, "learning_rate": 0.0001, "loss": 5.8598, "loss/crossentropy": 2.583473563194275, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17802652716636658, "step": 17500 }, { "epoch": 0.5469375, "grad_norm": 2.875, "grad_norm_var": 1.9409820556640625, "learning_rate": 0.0001, "loss": 5.2422, "loss/crossentropy": 2.2244415283203125, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15724655985832214, "step": 17502 }, { "epoch": 0.547, "grad_norm": 3.171875, "grad_norm_var": 1.9528961181640625, "learning_rate": 0.0001, "loss": 5.6346, "loss/crossentropy": 2.4827910661697388, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1694793701171875, "step": 17504 }, { "epoch": 0.5470625, "grad_norm": 5.96875, "grad_norm_var": 0.5597808837890625, "learning_rate": 0.0001, "loss": 5.4604, "loss/crossentropy": 2.2561005353927612, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16730080544948578, "step": 17506 }, { "epoch": 0.547125, "grad_norm": 3.046875, "grad_norm_var": 0.5482574462890625, "learning_rate": 0.0001, "loss": 5.7717, "loss/crossentropy": 2.5395383834838867, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16931431740522385, "step": 17508 }, { "epoch": 0.5471875, "grad_norm": 3.140625, "grad_norm_var": 0.5327107747395833, "learning_rate": 0.0001, "loss": 5.752, "loss/crossentropy": 2.5654181241989136, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16944094002246857, "step": 17510 }, { "epoch": 0.54725, "grad_norm": 3.515625, "grad_norm_var": 0.5152333577473959, "learning_rate": 0.0001, "loss": 5.5605, "loss/crossentropy": 2.41715931892395, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16355209052562714, "step": 17512 }, { "epoch": 0.5473125, "grad_norm": 3.25, "grad_norm_var": 0.5193684895833334, "learning_rate": 0.0001, "loss": 5.6707, "loss/crossentropy": 2.544505476951599, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1634012535214424, "step": 17514 }, { "epoch": 0.547375, "grad_norm": 3.21875, "grad_norm_var": 0.5138997395833333, "learning_rate": 0.0001, "loss": 5.59, "loss/crossentropy": 2.4491783380508423, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16720939427614212, "step": 17516 }, { "epoch": 0.5474375, "grad_norm": 3.59375, "grad_norm_var": 0.4985260009765625, "learning_rate": 0.0001, "loss": 5.9674, "loss/crossentropy": 2.7069283723831177, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17330986261367798, "step": 17518 }, { "epoch": 0.5475, "grad_norm": 3.34375, "grad_norm_var": 0.4915924072265625, "learning_rate": 0.0001, "loss": 5.9962, "loss/crossentropy": 2.5872387886047363, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18660012632608414, "step": 17520 }, { "epoch": 0.5475625, "grad_norm": 2.984375, "grad_norm_var": 0.061766560872395834, "learning_rate": 0.0001, "loss": 5.476, "loss/crossentropy": 2.3968299627304077, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1598661169409752, "step": 17522 }, { "epoch": 0.547625, "grad_norm": 3.0, "grad_norm_var": 0.06337483723958333, "learning_rate": 0.0001, "loss": 5.5755, "loss/crossentropy": 2.5098626613616943, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15812503546476364, "step": 17524 }, { "epoch": 0.5476875, "grad_norm": 3.140625, "grad_norm_var": 0.0638580322265625, "learning_rate": 0.0001, "loss": 5.9852, "loss/crossentropy": 2.7504621744155884, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17621267586946487, "step": 17526 }, { "epoch": 0.54775, "grad_norm": 3.09375, "grad_norm_var": 0.05088602701822917, "learning_rate": 0.0001, "loss": 5.5887, "loss/crossentropy": 2.4907697439193726, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1613566130399704, "step": 17528 }, { "epoch": 0.5478125, "grad_norm": 2.859375, "grad_norm_var": 0.055985514322916666, "learning_rate": 0.0001, "loss": 5.7495, "loss/crossentropy": 2.6151968240737915, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1669449657201767, "step": 17530 }, { "epoch": 0.547875, "grad_norm": 3.296875, "grad_norm_var": 0.058014933268229166, "learning_rate": 0.0001, "loss": 5.8629, "loss/crossentropy": 2.6741613149642944, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17083033919334412, "step": 17532 }, { "epoch": 0.5479375, "grad_norm": 3.328125, "grad_norm_var": 0.049051920572916664, "learning_rate": 0.0001, "loss": 5.7614, "loss/crossentropy": 2.511942148208618, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1769026666879654, "step": 17534 }, { "epoch": 0.548, "grad_norm": 3.375, "grad_norm_var": 0.026302083333333334, "learning_rate": 0.0001, "loss": 5.8136, "loss/crossentropy": 2.5612308979034424, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17211096733808517, "step": 17536 }, { "epoch": 0.5480625, "grad_norm": 3.0625, "grad_norm_var": 0.022477213541666666, "learning_rate": 0.0001, "loss": 5.6572, "loss/crossentropy": 2.5014984607696533, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17182088643312454, "step": 17538 }, { "epoch": 0.548125, "grad_norm": 3.15625, "grad_norm_var": 0.0195220947265625, "learning_rate": 0.0001, "loss": 5.8609, "loss/crossentropy": 2.6298829317092896, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.171540766954422, "step": 17540 }, { "epoch": 0.5481875, "grad_norm": 3.15625, "grad_norm_var": 0.020068359375, "learning_rate": 0.0001, "loss": 5.7179, "loss/crossentropy": 2.565890073776245, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16480782628059387, "step": 17542 }, { "epoch": 0.54825, "grad_norm": 3.15625, "grad_norm_var": 0.026985677083333333, "learning_rate": 0.0001, "loss": 5.6059, "loss/crossentropy": 2.543424367904663, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16249486058950424, "step": 17544 }, { "epoch": 0.5483125, "grad_norm": 3.265625, "grad_norm_var": 0.029325358072916665, "learning_rate": 0.0001, "loss": 5.135, "loss/crossentropy": 2.2710988521575928, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.13678062707185745, "step": 17546 }, { "epoch": 0.548375, "grad_norm": 3.28125, "grad_norm_var": 0.0256256103515625, "learning_rate": 0.0001, "loss": 5.9581, "loss/crossentropy": 2.6833196878433228, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17669300734996796, "step": 17548 }, { "epoch": 0.5484375, "grad_norm": 3.328125, "grad_norm_var": 0.027783203125, "learning_rate": 0.0001, "loss": 5.3984, "loss/crossentropy": 2.224676489830017, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1650291532278061, "step": 17550 }, { "epoch": 0.5485, "grad_norm": 3.0625, "grad_norm_var": 0.025325520833333334, "learning_rate": 0.0001, "loss": 5.5222, "loss/crossentropy": 2.4809380769729614, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1607636660337448, "step": 17552 }, { "epoch": 0.5485625, "grad_norm": 3.328125, "grad_norm_var": 0.03638916015625, "learning_rate": 0.0001, "loss": 5.8775, "loss/crossentropy": 2.6138750314712524, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17480115592479706, "step": 17554 }, { "epoch": 0.548625, "grad_norm": 3.203125, "grad_norm_var": 0.037255859375, "learning_rate": 0.0001, "loss": 5.6924, "loss/crossentropy": 2.468183398246765, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1708623617887497, "step": 17556 }, { "epoch": 0.5486875, "grad_norm": 3.28125, "grad_norm_var": 0.03777669270833333, "learning_rate": 0.0001, "loss": 5.633, "loss/crossentropy": 2.5118530988693237, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1617267206311226, "step": 17558 }, { "epoch": 0.54875, "grad_norm": 3.109375, "grad_norm_var": 0.03179931640625, "learning_rate": 0.0001, "loss": 5.7145, "loss/crossentropy": 2.5721532106399536, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16774551570415497, "step": 17560 }, { "epoch": 0.5488125, "grad_norm": 2.9375, "grad_norm_var": 0.026471964518229165, "learning_rate": 0.0001, "loss": 5.6571, "loss/crossentropy": 2.5563547611236572, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1651500016450882, "step": 17562 }, { "epoch": 0.548875, "grad_norm": 3.3125, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 5.8953, "loss/crossentropy": 2.671082019805908, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1712455302476883, "step": 17564 }, { "epoch": 0.5489375, "grad_norm": 3.1875, "grad_norm_var": 0.02349853515625, "learning_rate": 0.0001, "loss": 5.7482, "loss/crossentropy": 2.5788270235061646, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1688936948776245, "step": 17566 }, { "epoch": 0.549, "grad_norm": 3.390625, "grad_norm_var": 0.024364217122395834, "learning_rate": 0.0001, "loss": 5.7864, "loss/crossentropy": 2.586372137069702, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17000386118888855, "step": 17568 }, { "epoch": 0.5490625, "grad_norm": 3.171875, "grad_norm_var": 0.0194488525390625, "learning_rate": 0.0001, "loss": 5.5739, "loss/crossentropy": 2.394267201423645, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16718415170907974, "step": 17570 }, { "epoch": 0.549125, "grad_norm": 3.328125, "grad_norm_var": 0.021728515625, "learning_rate": 0.0001, "loss": 5.8179, "loss/crossentropy": 2.5978699922561646, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17239080369472504, "step": 17572 }, { "epoch": 0.5491875, "grad_norm": 2.96875, "grad_norm_var": 0.026219685872395832, "learning_rate": 0.0001, "loss": 6.0583, "loss/crossentropy": 2.7638529539108276, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17866214364767075, "step": 17574 }, { "epoch": 0.54925, "grad_norm": 3.109375, "grad_norm_var": 0.032633463541666664, "learning_rate": 0.0001, "loss": 5.7033, "loss/crossentropy": 2.573278069496155, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16456648707389832, "step": 17576 }, { "epoch": 0.5493125, "grad_norm": 3.265625, "grad_norm_var": 0.03645731608072917, "learning_rate": 0.0001, "loss": 6.0102, "loss/crossentropy": 2.7705767154693604, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1759153977036476, "step": 17578 }, { "epoch": 0.549375, "grad_norm": 2.96875, "grad_norm_var": 0.041764322916666666, "learning_rate": 0.0001, "loss": 5.6084, "loss/crossentropy": 2.516671657562256, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16229360550642014, "step": 17580 }, { "epoch": 0.5494375, "grad_norm": 3.359375, "grad_norm_var": 0.04462890625, "learning_rate": 0.0001, "loss": 5.8639, "loss/crossentropy": 2.664319157600403, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16878651082515717, "step": 17582 }, { "epoch": 0.5495, "grad_norm": 3.09375, "grad_norm_var": 0.040848795572916666, "learning_rate": 0.0001, "loss": 5.6724, "loss/crossentropy": 2.5199772119522095, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16758862137794495, "step": 17584 }, { "epoch": 0.5495625, "grad_norm": 3.046875, "grad_norm_var": 0.03671468098958333, "learning_rate": 0.0001, "loss": 5.9524, "loss/crossentropy": 2.77493953704834, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16930725425481796, "step": 17586 }, { "epoch": 0.549625, "grad_norm": 3.1875, "grad_norm_var": 0.03520406087239583, "learning_rate": 0.0001, "loss": 5.6959, "loss/crossentropy": 2.5064406394958496, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1693325936794281, "step": 17588 }, { "epoch": 0.5496875, "grad_norm": 3.234375, "grad_norm_var": 0.028999837239583333, "learning_rate": 0.0001, "loss": 5.7136, "loss/crossentropy": 2.5963058471679688, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16602276265621185, "step": 17590 }, { "epoch": 0.54975, "grad_norm": 3.171875, "grad_norm_var": 0.025028483072916666, "learning_rate": 0.0001, "loss": 5.5308, "loss/crossentropy": 2.449121594429016, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15855589509010315, "step": 17592 }, { "epoch": 0.5498125, "grad_norm": 3.015625, "grad_norm_var": 0.021321614583333332, "learning_rate": 0.0001, "loss": 5.6914, "loss/crossentropy": 2.521979570388794, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16733740270137787, "step": 17594 }, { "epoch": 0.549875, "grad_norm": 3.15625, "grad_norm_var": 0.018260701497395834, "learning_rate": 0.0001, "loss": 5.7296, "loss/crossentropy": 2.521511197090149, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1700238361954689, "step": 17596 }, { "epoch": 0.5499375, "grad_norm": 3.078125, "grad_norm_var": 0.021849568684895834, "learning_rate": 0.0001, "loss": 5.9507, "loss/crossentropy": 2.6598910093307495, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18025202304124832, "step": 17598 }, { "epoch": 0.55, "grad_norm": 2.921875, "grad_norm_var": 0.025960286458333332, "learning_rate": 0.0001, "loss": 5.5521, "loss/crossentropy": 2.464334011077881, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1638558954000473, "step": 17600 }, { "epoch": 0.5500625, "grad_norm": 3.359375, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 5.364, "loss/crossentropy": 2.2182083129882812, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16418644040822983, "step": 17602 }, { "epoch": 0.550125, "grad_norm": 3.109375, "grad_norm_var": 0.022184244791666665, "learning_rate": 0.0001, "loss": 5.8673, "loss/crossentropy": 2.7186864614486694, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16447259485721588, "step": 17604 }, { "epoch": 0.5501875, "grad_norm": 3.140625, "grad_norm_var": 0.0223297119140625, "learning_rate": 0.0001, "loss": 5.8666, "loss/crossentropy": 2.61619770526886, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1762106567621231, "step": 17606 }, { "epoch": 0.55025, "grad_norm": 3.140625, "grad_norm_var": 0.021712239583333334, "learning_rate": 0.0001, "loss": 5.8196, "loss/crossentropy": 2.6020315885543823, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17448948323726654, "step": 17608 }, { "epoch": 0.5503125, "grad_norm": 2.984375, "grad_norm_var": 0.02174072265625, "learning_rate": 0.0001, "loss": 5.6747, "loss/crossentropy": 2.512550115585327, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16895052045583725, "step": 17610 }, { "epoch": 0.550375, "grad_norm": 3.390625, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 5.6881, "loss/crossentropy": 2.540887713432312, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1686241254210472, "step": 17612 }, { "epoch": 0.5504375, "grad_norm": 3.0625, "grad_norm_var": 0.018626912434895834, "learning_rate": 0.0001, "loss": 5.7581, "loss/crossentropy": 2.664351224899292, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16328338533639908, "step": 17614 }, { "epoch": 0.5505, "grad_norm": 3.140625, "grad_norm_var": 0.014867146809895834, "learning_rate": 0.0001, "loss": 5.7321, "loss/crossentropy": 2.6070908308029175, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16718365252017975, "step": 17616 }, { "epoch": 0.5505625, "grad_norm": 3.203125, "grad_norm_var": 0.013374837239583333, "learning_rate": 0.0001, "loss": 5.6804, "loss/crossentropy": 2.495343804359436, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16967647522687912, "step": 17618 }, { "epoch": 0.550625, "grad_norm": 3.15625, "grad_norm_var": 0.009761555989583334, "learning_rate": 0.0001, "loss": 5.6772, "loss/crossentropy": 2.551109790802002, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16338741779327393, "step": 17620 }, { "epoch": 0.5506875, "grad_norm": 3.421875, "grad_norm_var": 0.01929931640625, "learning_rate": 0.0001, "loss": 5.6067, "loss/crossentropy": 2.561010241508484, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16042368113994598, "step": 17622 }, { "epoch": 0.55075, "grad_norm": 3.0, "grad_norm_var": 0.020361328125, "learning_rate": 0.0001, "loss": 5.658, "loss/crossentropy": 2.5929129123687744, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1604117974638939, "step": 17624 }, { "epoch": 0.5508125, "grad_norm": 3.125, "grad_norm_var": 0.023746744791666666, "learning_rate": 0.0001, "loss": 5.3172, "loss/crossentropy": 2.3189679384231567, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15099874138832092, "step": 17626 }, { "epoch": 0.550875, "grad_norm": 3.109375, "grad_norm_var": 0.018192545572916666, "learning_rate": 0.0001, "loss": 5.8461, "loss/crossentropy": 2.6823806762695312, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.165588840842247, "step": 17628 }, { "epoch": 0.5509375, "grad_norm": 3.328125, "grad_norm_var": 0.028303019205729165, "learning_rate": 0.0001, "loss": 5.2909, "loss/crossentropy": 2.2500622868537903, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.15213175117969513, "step": 17630 }, { "epoch": 0.551, "grad_norm": 2.8125, "grad_norm_var": 0.03333231608072917, "learning_rate": 0.0001, "loss": 5.613, "loss/crossentropy": 2.534808039665222, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.155868798494339, "step": 17632 }, { "epoch": 0.5510625, "grad_norm": 3.0, "grad_norm_var": 0.0326324462890625, "learning_rate": 0.0001, "loss": 5.9002, "loss/crossentropy": 2.683477759361267, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1744086742401123, "step": 17634 }, { "epoch": 0.551125, "grad_norm": 3.015625, "grad_norm_var": 0.034566243489583336, "learning_rate": 0.0001, "loss": 6.0321, "loss/crossentropy": 2.7725576162338257, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17478548735380173, "step": 17636 }, { "epoch": 0.5511875, "grad_norm": 3.15625, "grad_norm_var": 0.024507649739583335, "learning_rate": 0.0001, "loss": 5.6146, "loss/crossentropy": 2.4677406549453735, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16976484656333923, "step": 17638 }, { "epoch": 0.55125, "grad_norm": 3.6875, "grad_norm_var": 0.04956766764322917, "learning_rate": 0.0001, "loss": 6.2732, "loss/crossentropy": 2.845934271812439, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18882165104150772, "step": 17640 }, { "epoch": 0.5513125, "grad_norm": 3.46875, "grad_norm_var": 0.054198201497395834, "learning_rate": 0.0001, "loss": 5.4354, "loss/crossentropy": 2.3915497064590454, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15594977885484695, "step": 17642 }, { "epoch": 0.551375, "grad_norm": 3.328125, "grad_norm_var": 0.056050618489583336, "learning_rate": 0.0001, "loss": 5.8491, "loss/crossentropy": 2.6383293867111206, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17186033725738525, "step": 17644 }, { "epoch": 0.5514375, "grad_norm": 2.9375, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 5.7848, "loss/crossentropy": 2.616939425468445, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16795941442251205, "step": 17646 }, { "epoch": 0.5515, "grad_norm": 3.171875, "grad_norm_var": 0.04019775390625, "learning_rate": 0.0001, "loss": 6.0911, "loss/crossentropy": 2.8558987379074097, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1750853955745697, "step": 17648 }, { "epoch": 0.5515625, "grad_norm": 2.890625, "grad_norm_var": 0.04482320149739583, "learning_rate": 0.0001, "loss": 5.4487, "loss/crossentropy": 2.3902167081832886, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1609266996383667, "step": 17650 }, { "epoch": 0.551625, "grad_norm": 2.984375, "grad_norm_var": 0.048323567708333334, "learning_rate": 0.0001, "loss": 5.7564, "loss/crossentropy": 2.5943437814712524, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16933320462703705, "step": 17652 }, { "epoch": 0.5516875, "grad_norm": 3.078125, "grad_norm_var": 0.049616495768229164, "learning_rate": 0.0001, "loss": 6.0053, "loss/crossentropy": 2.7250771522521973, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17723697423934937, "step": 17654 }, { "epoch": 0.55175, "grad_norm": 3.109375, "grad_norm_var": 0.031901041666666664, "learning_rate": 0.0001, "loss": 5.3705, "loss/crossentropy": 2.387295126914978, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1545664295554161, "step": 17656 }, { "epoch": 0.5518125, "grad_norm": 3.125, "grad_norm_var": 0.028125, "learning_rate": 0.0001, "loss": 5.6115, "loss/crossentropy": 2.4386359453201294, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17080183327198029, "step": 17658 }, { "epoch": 0.551875, "grad_norm": 3.109375, "grad_norm_var": 0.024898274739583334, "learning_rate": 0.0001, "loss": 5.6052, "loss/crossentropy": 2.5384548902511597, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16371016204357147, "step": 17660 }, { "epoch": 0.5519375, "grad_norm": 3.234375, "grad_norm_var": 0.0279449462890625, "learning_rate": 0.0001, "loss": 5.7308, "loss/crossentropy": 2.529237985610962, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17249634116888046, "step": 17662 }, { "epoch": 0.552, "grad_norm": 3.28125, "grad_norm_var": 0.04268290201822917, "learning_rate": 0.0001, "loss": 5.6703, "loss/crossentropy": 2.4546802043914795, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17351430654525757, "step": 17664 }, { "epoch": 0.5520625, "grad_norm": 3.15625, "grad_norm_var": 0.03536783854166667, "learning_rate": 0.0001, "loss": 5.7763, "loss/crossentropy": 2.566525936126709, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1737072914838791, "step": 17666 }, { "epoch": 0.552125, "grad_norm": 3.359375, "grad_norm_var": 0.0426666259765625, "learning_rate": 0.0001, "loss": 5.8261, "loss/crossentropy": 2.5591336488723755, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17552430927753448, "step": 17668 }, { "epoch": 0.5521875, "grad_norm": 3.28125, "grad_norm_var": 0.03679097493489583, "learning_rate": 0.0001, "loss": 5.5321, "loss/crossentropy": 2.3835878372192383, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16563591361045837, "step": 17670 }, { "epoch": 0.55225, "grad_norm": 3.484375, "grad_norm_var": 0.028685506184895834, "learning_rate": 0.0001, "loss": 5.8604, "loss/crossentropy": 2.6192610263824463, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1709924191236496, "step": 17672 }, { "epoch": 0.5523125, "grad_norm": 3.453125, "grad_norm_var": 0.028857421875, "learning_rate": 0.0001, "loss": 5.9271, "loss/crossentropy": 2.6670114994049072, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17795803397893906, "step": 17674 }, { "epoch": 0.552375, "grad_norm": 3.046875, "grad_norm_var": 0.027586873372395834, "learning_rate": 0.0001, "loss": 5.7105, "loss/crossentropy": 2.5654743909835815, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16606971621513367, "step": 17676 }, { "epoch": 0.5524375, "grad_norm": 3.0625, "grad_norm_var": 0.03631184895833333, "learning_rate": 0.0001, "loss": 5.7238, "loss/crossentropy": 2.6403234004974365, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16225726157426834, "step": 17678 }, { "epoch": 0.5525, "grad_norm": 3.078125, "grad_norm_var": 0.037962849934895834, "learning_rate": 0.0001, "loss": 5.8579, "loss/crossentropy": 2.6207019090652466, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1760634407401085, "step": 17680 }, { "epoch": 0.5525625, "grad_norm": 3.40625, "grad_norm_var": 0.03909098307291667, "learning_rate": 0.0001, "loss": 5.9807, "loss/crossentropy": 2.8148363828659058, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1681457757949829, "step": 17682 }, { "epoch": 0.552625, "grad_norm": 3.21875, "grad_norm_var": 0.03428446451822917, "learning_rate": 0.0001, "loss": 5.4105, "loss/crossentropy": 2.3150410652160645, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16032294929027557, "step": 17684 }, { "epoch": 0.5526875, "grad_norm": 3.375, "grad_norm_var": 0.039774576822916664, "learning_rate": 0.0001, "loss": 5.6244, "loss/crossentropy": 2.53757905960083, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1625930666923523, "step": 17686 }, { "epoch": 0.55275, "grad_norm": 3.28125, "grad_norm_var": 0.03984375, "learning_rate": 0.0001, "loss": 6.06, "loss/crossentropy": 2.7653326988220215, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18063924461603165, "step": 17688 }, { "epoch": 0.5528125, "grad_norm": 22.0, "grad_norm_var": 22.19509989420573, "learning_rate": 0.0001, "loss": 5.8548, "loss/crossentropy": 2.3556020259857178, "loss/hidden": 1.73828125, "loss/jsd": 0.0, "loss/logits": 0.17609627544879913, "step": 17690 }, { "epoch": 0.552875, "grad_norm": 3.078125, "grad_norm_var": 22.14544169108073, "learning_rate": 0.0001, "loss": 5.9341, "loss/crossentropy": 2.6883466243743896, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1734030321240425, "step": 17692 }, { "epoch": 0.5529375, "grad_norm": 3.28125, "grad_norm_var": 22.014655558268228, "learning_rate": 0.0001, "loss": 6.1103, "loss/crossentropy": 2.818718194961548, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1776002198457718, "step": 17694 }, { "epoch": 0.553, "grad_norm": 3.046875, "grad_norm_var": 21.980288696289062, "learning_rate": 0.0001, "loss": 5.9822, "loss/crossentropy": 2.687151551246643, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17794059216976166, "step": 17696 }, { "epoch": 0.5530625, "grad_norm": 3.125, "grad_norm_var": 21.997394816080728, "learning_rate": 0.0001, "loss": 5.9328, "loss/crossentropy": 2.6805999279022217, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17600084841251373, "step": 17698 }, { "epoch": 0.553125, "grad_norm": 2.890625, "grad_norm_var": 22.028034464518228, "learning_rate": 0.0001, "loss": 5.6446, "loss/crossentropy": 2.5476465225219727, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16203974187374115, "step": 17700 }, { "epoch": 0.5531875, "grad_norm": 3.328125, "grad_norm_var": 21.904325358072917, "learning_rate": 0.0001, "loss": 5.8891, "loss/crossentropy": 2.6642894744873047, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17248596251010895, "step": 17702 }, { "epoch": 0.55325, "grad_norm": 3.15625, "grad_norm_var": 21.82415262858073, "learning_rate": 0.0001, "loss": 5.5586, "loss/crossentropy": 2.3502798080444336, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16770578175783157, "step": 17704 }, { "epoch": 0.5533125, "grad_norm": 2.9375, "grad_norm_var": 0.11916910807291667, "learning_rate": 0.0001, "loss": 5.7012, "loss/crossentropy": 2.5814989805221558, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16236452758312225, "step": 17706 }, { "epoch": 0.553375, "grad_norm": 3.03125, "grad_norm_var": 0.11990458170572917, "learning_rate": 0.0001, "loss": 5.5415, "loss/crossentropy": 2.53207266330719, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15797212719917297, "step": 17708 }, { "epoch": 0.5534375, "grad_norm": 3.484375, "grad_norm_var": 0.1141510009765625, "learning_rate": 0.0001, "loss": 5.7966, "loss/crossentropy": 2.583231806755066, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16977283358573914, "step": 17710 }, { "epoch": 0.5535, "grad_norm": 3.078125, "grad_norm_var": 0.07235921223958333, "learning_rate": 0.0001, "loss": 5.8542, "loss/crossentropy": 2.654178738594055, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1688312143087387, "step": 17712 }, { "epoch": 0.5535625, "grad_norm": 3.25, "grad_norm_var": 0.07957356770833333, "learning_rate": 0.0001, "loss": 5.3742, "loss/crossentropy": 2.416073203086853, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15205883234739304, "step": 17714 }, { "epoch": 0.553625, "grad_norm": 3.3125, "grad_norm_var": 0.08026936848958334, "learning_rate": 0.0001, "loss": 5.6411, "loss/crossentropy": 2.4960005283355713, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16724785417318344, "step": 17716 }, { "epoch": 0.5536875, "grad_norm": 3.046875, "grad_norm_var": 0.0369781494140625, "learning_rate": 0.0001, "loss": 5.8712, "loss/crossentropy": 2.614146113395691, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17336326092481613, "step": 17718 }, { "epoch": 0.55375, "grad_norm": 2.984375, "grad_norm_var": 0.026691691080729166, "learning_rate": 0.0001, "loss": 5.5363, "loss/crossentropy": 2.46210777759552, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16328229010105133, "step": 17720 }, { "epoch": 0.5538125, "grad_norm": 4.6875, "grad_norm_var": 0.18032938639322918, "learning_rate": 0.0001, "loss": 5.5073, "loss/crossentropy": 2.404494285583496, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16145338118076324, "step": 17722 }, { "epoch": 0.553875, "grad_norm": 3.765625, "grad_norm_var": 0.196728515625, "learning_rate": 0.0001, "loss": 5.4868, "loss/crossentropy": 2.4248130321502686, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15658820420503616, "step": 17724 }, { "epoch": 0.5539375, "grad_norm": 3.6875, "grad_norm_var": 0.20271809895833334, "learning_rate": 0.0001, "loss": 6.0828, "loss/crossentropy": 2.714733123779297, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1852409541606903, "step": 17726 }, { "epoch": 0.554, "grad_norm": 3.28125, "grad_norm_var": 0.20032450358072917, "learning_rate": 0.0001, "loss": 6.1021, "loss/crossentropy": 2.7352226972579956, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1839490830898285, "step": 17728 }, { "epoch": 0.5540625, "grad_norm": 3.921875, "grad_norm_var": 0.22157796223958334, "learning_rate": 0.0001, "loss": 5.6849, "loss/crossentropy": 2.5687484741210938, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1623956859111786, "step": 17730 }, { "epoch": 0.554125, "grad_norm": 3.328125, "grad_norm_var": 0.2098297119140625, "learning_rate": 0.0001, "loss": 5.8104, "loss/crossentropy": 2.620088815689087, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16824733465909958, "step": 17732 }, { "epoch": 0.5541875, "grad_norm": 3.15625, "grad_norm_var": 0.20611979166666666, "learning_rate": 0.0001, "loss": 5.8186, "loss/crossentropy": 2.5815773010253906, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1740909218788147, "step": 17734 }, { "epoch": 0.55425, "grad_norm": 3.078125, "grad_norm_var": 0.19973856608072918, "learning_rate": 0.0001, "loss": 5.7036, "loss/crossentropy": 2.53547203540802, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1687656044960022, "step": 17736 }, { "epoch": 0.5543125, "grad_norm": 3.25, "grad_norm_var": 0.07545572916666667, "learning_rate": 0.0001, "loss": 5.7654, "loss/crossentropy": 2.5381171703338623, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17155338823795319, "step": 17738 }, { "epoch": 0.554375, "grad_norm": 3.046875, "grad_norm_var": 0.06968485514322917, "learning_rate": 0.0001, "loss": 5.4443, "loss/crossentropy": 2.498365879058838, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.1563076302409172, "step": 17740 }, { "epoch": 0.5544375, "grad_norm": 3.296875, "grad_norm_var": 0.0581695556640625, "learning_rate": 0.0001, "loss": 6.1023, "loss/crossentropy": 2.746156334877014, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18405093252658844, "step": 17742 }, { "epoch": 0.5545, "grad_norm": 3.3125, "grad_norm_var": 0.06363525390625, "learning_rate": 0.0001, "loss": 5.8252, "loss/crossentropy": 2.6532708406448364, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16641514748334885, "step": 17744 }, { "epoch": 0.5545625, "grad_norm": 3.03125, "grad_norm_var": 0.025830078125, "learning_rate": 0.0001, "loss": 5.5518, "loss/crossentropy": 2.4523158073425293, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16229548305273056, "step": 17746 }, { "epoch": 0.554625, "grad_norm": 3.0625, "grad_norm_var": 0.027701822916666667, "learning_rate": 0.0001, "loss": 5.5929, "loss/crossentropy": 2.389325499534607, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17113812267780304, "step": 17748 }, { "epoch": 0.5546875, "grad_norm": 3.1875, "grad_norm_var": 0.027620442708333335, "learning_rate": 0.0001, "loss": 5.7688, "loss/crossentropy": 2.618277668952942, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16935007274150848, "step": 17750 }, { "epoch": 0.55475, "grad_norm": 3.3125, "grad_norm_var": 0.027953084309895834, "learning_rate": 0.0001, "loss": 5.9271, "loss/crossentropy": 2.619473695755005, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18154089152812958, "step": 17752 }, { "epoch": 0.5548125, "grad_norm": 3.25, "grad_norm_var": 0.03378499348958333, "learning_rate": 0.0001, "loss": 5.9803, "loss/crossentropy": 2.542687177658081, "loss/hidden": 1.58984375, "loss/jsd": 0.0, "loss/logits": 0.18477710336446762, "step": 17754 }, { "epoch": 0.554875, "grad_norm": 3.109375, "grad_norm_var": 0.03287353515625, "learning_rate": 0.0001, "loss": 5.561, "loss/crossentropy": 2.477379322052002, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16226952522993088, "step": 17756 }, { "epoch": 0.5549375, "grad_norm": 3.171875, "grad_norm_var": 0.028473917643229166, "learning_rate": 0.0001, "loss": 5.9849, "loss/crossentropy": 2.737338662147522, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1731894165277481, "step": 17758 }, { "epoch": 0.555, "grad_norm": 3.34375, "grad_norm_var": 0.0525054931640625, "learning_rate": 0.0001, "loss": 6.1035, "loss/crossentropy": 2.6326135396957397, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.19240082055330276, "step": 17760 }, { "epoch": 0.5550625, "grad_norm": 3.125, "grad_norm_var": 0.0496490478515625, "learning_rate": 0.0001, "loss": 5.6892, "loss/crossentropy": 2.5344449281692505, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16430585086345673, "step": 17762 }, { "epoch": 0.555125, "grad_norm": 3.171875, "grad_norm_var": 0.050568644205729166, "learning_rate": 0.0001, "loss": 5.4346, "loss/crossentropy": 2.487262725830078, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14786113798618317, "step": 17764 }, { "epoch": 0.5551875, "grad_norm": 3.0625, "grad_norm_var": 0.0554840087890625, "learning_rate": 0.0001, "loss": 5.7453, "loss/crossentropy": 2.5581740140914917, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16871501505374908, "step": 17766 }, { "epoch": 0.55525, "grad_norm": 3.171875, "grad_norm_var": 0.054585774739583336, "learning_rate": 0.0001, "loss": 5.9073, "loss/crossentropy": 2.76697313785553, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1675439178943634, "step": 17768 }, { "epoch": 0.5553125, "grad_norm": 3.078125, "grad_norm_var": 0.048628743489583334, "learning_rate": 0.0001, "loss": 5.9461, "loss/crossentropy": 2.757845640182495, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1703917160630226, "step": 17770 }, { "epoch": 0.555375, "grad_norm": 3.28125, "grad_norm_var": 0.0402252197265625, "learning_rate": 0.0001, "loss": 5.7144, "loss/crossentropy": 2.5307306051254272, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16680067032575607, "step": 17772 }, { "epoch": 0.5554375, "grad_norm": 3.0, "grad_norm_var": 0.0438140869140625, "learning_rate": 0.0001, "loss": 5.7311, "loss/crossentropy": 2.5602293014526367, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1690380722284317, "step": 17774 }, { "epoch": 0.5555, "grad_norm": 3.390625, "grad_norm_var": 0.01959228515625, "learning_rate": 0.0001, "loss": 5.6756, "loss/crossentropy": 2.5320883989334106, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16786563396453857, "step": 17776 }, { "epoch": 0.5555625, "grad_norm": 3.25, "grad_norm_var": 0.02301025390625, "learning_rate": 0.0001, "loss": 5.8345, "loss/crossentropy": 2.6048797369003296, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17062260955572128, "step": 17778 }, { "epoch": 0.555625, "grad_norm": 3.171875, "grad_norm_var": 0.02261962890625, "learning_rate": 0.0001, "loss": 5.8017, "loss/crossentropy": 2.54030978679657, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17496530711650848, "step": 17780 }, { "epoch": 0.5556875, "grad_norm": 3.03125, "grad_norm_var": 0.020210774739583333, "learning_rate": 0.0001, "loss": 5.9023, "loss/crossentropy": 2.7037198543548584, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16985435783863068, "step": 17782 }, { "epoch": 0.55575, "grad_norm": 3.015625, "grad_norm_var": 0.05103759765625, "learning_rate": 0.0001, "loss": 6.0906, "loss/crossentropy": 2.771214723587036, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18115680664777756, "step": 17784 }, { "epoch": 0.5558125, "grad_norm": 3.328125, "grad_norm_var": 0.05745340983072917, "learning_rate": 0.0001, "loss": 5.5596, "loss/crossentropy": 2.4504921436309814, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16052187979221344, "step": 17786 }, { "epoch": 0.555875, "grad_norm": 3.234375, "grad_norm_var": 0.0606109619140625, "learning_rate": 0.0001, "loss": 5.8293, "loss/crossentropy": 2.6419315338134766, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1730317920446396, "step": 17788 }, { "epoch": 0.5559375, "grad_norm": 2.96875, "grad_norm_var": 0.061324055989583334, "learning_rate": 0.0001, "loss": 5.8703, "loss/crossentropy": 2.636107921600342, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1769353449344635, "step": 17790 }, { "epoch": 0.556, "grad_norm": 3.296875, "grad_norm_var": 0.060933430989583336, "learning_rate": 0.0001, "loss": 6.014, "loss/crossentropy": 2.7061779499053955, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17805054038763046, "step": 17792 }, { "epoch": 0.5560625, "grad_norm": 4.9375, "grad_norm_var": 0.23993733723958333, "learning_rate": 0.0001, "loss": 5.6304, "loss/crossentropy": 2.3990488052368164, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.16688557714223862, "step": 17794 }, { "epoch": 0.556125, "grad_norm": 3.34375, "grad_norm_var": 0.24044596354166667, "learning_rate": 0.0001, "loss": 5.5713, "loss/crossentropy": 2.4327298402786255, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16463829576969147, "step": 17796 }, { "epoch": 0.5561875, "grad_norm": 2.875, "grad_norm_var": 0.24844462076822918, "learning_rate": 0.0001, "loss": 5.8767, "loss/crossentropy": 2.7174357175827026, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16748591512441635, "step": 17798 }, { "epoch": 0.55625, "grad_norm": 2.921875, "grad_norm_var": 0.23054097493489584, "learning_rate": 0.0001, "loss": 5.7476, "loss/crossentropy": 2.667687773704529, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16463583707809448, "step": 17800 }, { "epoch": 0.5563125, "grad_norm": 3.484375, "grad_norm_var": 0.22349344889322917, "learning_rate": 0.0001, "loss": 5.9538, "loss/crossentropy": 2.688518762588501, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17574457824230194, "step": 17802 }, { "epoch": 0.556375, "grad_norm": 3.5, "grad_norm_var": 0.22744852701822918, "learning_rate": 0.0001, "loss": 5.4751, "loss/crossentropy": 2.3377124071121216, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16374120116233826, "step": 17804 }, { "epoch": 0.5564375, "grad_norm": 3.234375, "grad_norm_var": 0.28604227701822915, "learning_rate": 0.0001, "loss": 5.9729, "loss/crossentropy": 2.7133841514587402, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17477981001138687, "step": 17806 }, { "epoch": 0.5565, "grad_norm": 3.890625, "grad_norm_var": 0.3018870035807292, "learning_rate": 0.0001, "loss": 5.9125, "loss/crossentropy": 2.6736056804656982, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17154546082019806, "step": 17808 }, { "epoch": 0.5565625, "grad_norm": 2.90625, "grad_norm_var": 0.15346577962239583, "learning_rate": 0.0001, "loss": 5.8151, "loss/crossentropy": 2.6831527948379517, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1702243611216545, "step": 17810 }, { "epoch": 0.556625, "grad_norm": 3.484375, "grad_norm_var": 0.15243733723958333, "learning_rate": 0.0001, "loss": 6.1999, "loss/crossentropy": 2.8121283054351807, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18603944778442383, "step": 17812 }, { "epoch": 0.5566875, "grad_norm": 3.09375, "grad_norm_var": 0.15840555826822916, "learning_rate": 0.0001, "loss": 5.6435, "loss/crossentropy": 2.5500491857528687, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16441978514194489, "step": 17814 }, { "epoch": 0.55675, "grad_norm": 3.421875, "grad_norm_var": 0.15227457682291667, "learning_rate": 0.0001, "loss": 6.2562, "loss/crossentropy": 2.7869977951049805, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.19105692207813263, "step": 17816 }, { "epoch": 0.5568125, "grad_norm": 3.21875, "grad_norm_var": 0.15562744140625, "learning_rate": 0.0001, "loss": 5.6101, "loss/crossentropy": 2.5214409828186035, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16277530789375305, "step": 17818 }, { "epoch": 0.556875, "grad_norm": 3.140625, "grad_norm_var": 0.1571685791015625, "learning_rate": 0.0001, "loss": 6.0065, "loss/crossentropy": 2.8121590614318848, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17099380493164062, "step": 17820 }, { "epoch": 0.5569375, "grad_norm": 3.078125, "grad_norm_var": 0.08584696451822917, "learning_rate": 0.0001, "loss": 5.6346, "loss/crossentropy": 2.5494213104248047, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16554423421621323, "step": 17822 }, { "epoch": 0.557, "grad_norm": 3.515625, "grad_norm_var": 0.07345377604166667, "learning_rate": 0.0001, "loss": 6.0422, "loss/crossentropy": 2.7481939792633057, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17900924384593964, "step": 17824 }, { "epoch": 0.5570625, "grad_norm": 3.578125, "grad_norm_var": 0.07337137858072916, "learning_rate": 0.0001, "loss": 6.1145, "loss/crossentropy": 2.726052403450012, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18728207796812057, "step": 17826 }, { "epoch": 0.557125, "grad_norm": 3.09375, "grad_norm_var": 0.07099609375, "learning_rate": 0.0001, "loss": 5.7471, "loss/crossentropy": 2.601917862892151, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16490618139505386, "step": 17828 }, { "epoch": 0.5571875, "grad_norm": 3.3125, "grad_norm_var": 0.05976155598958333, "learning_rate": 0.0001, "loss": 5.961, "loss/crossentropy": 2.741401195526123, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17234960198402405, "step": 17830 }, { "epoch": 0.55725, "grad_norm": 3.109375, "grad_norm_var": 0.045458984375, "learning_rate": 0.0001, "loss": 5.6013, "loss/crossentropy": 2.4385952949523926, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16705621778964996, "step": 17832 }, { "epoch": 0.5573125, "grad_norm": 3.109375, "grad_norm_var": 0.0469146728515625, "learning_rate": 0.0001, "loss": 5.979, "loss/crossentropy": 2.74137020111084, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17181239277124405, "step": 17834 }, { "epoch": 0.557375, "grad_norm": 3.046875, "grad_norm_var": 0.0437164306640625, "learning_rate": 0.0001, "loss": 5.7128, "loss/crossentropy": 2.541237950325012, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1698867455124855, "step": 17836 }, { "epoch": 0.5574375, "grad_norm": 2.953125, "grad_norm_var": 0.047135416666666666, "learning_rate": 0.0001, "loss": 5.4935, "loss/crossentropy": 2.3597596883773804, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16532307863235474, "step": 17838 }, { "epoch": 0.5575, "grad_norm": 2.953125, "grad_norm_var": 0.03694559733072917, "learning_rate": 0.0001, "loss": 5.5403, "loss/crossentropy": 2.467650294303894, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16390115767717361, "step": 17840 }, { "epoch": 0.5575625, "grad_norm": 3.078125, "grad_norm_var": 0.029097493489583334, "learning_rate": 0.0001, "loss": 5.6512, "loss/crossentropy": 2.5703890323638916, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16121027618646622, "step": 17842 }, { "epoch": 0.557625, "grad_norm": 3.015625, "grad_norm_var": 0.030768839518229167, "learning_rate": 0.0001, "loss": 5.4493, "loss/crossentropy": 2.3899370431900024, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16218648850917816, "step": 17844 }, { "epoch": 0.5576875, "grad_norm": 3.1875, "grad_norm_var": 0.030338541666666666, "learning_rate": 0.0001, "loss": 5.6036, "loss/crossentropy": 2.4563435316085815, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16550210118293762, "step": 17846 }, { "epoch": 0.55775, "grad_norm": 3.015625, "grad_norm_var": 0.030973307291666665, "learning_rate": 0.0001, "loss": 5.7082, "loss/crossentropy": 2.557474374771118, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17014601081609726, "step": 17848 }, { "epoch": 0.5578125, "grad_norm": 2.953125, "grad_norm_var": 0.029427083333333333, "learning_rate": 0.0001, "loss": 5.5945, "loss/crossentropy": 2.4838316440582275, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16653349995613098, "step": 17850 }, { "epoch": 0.557875, "grad_norm": 3.109375, "grad_norm_var": 0.028609212239583334, "learning_rate": 0.0001, "loss": 5.3799, "loss/crossentropy": 2.3153923749923706, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16152680665254593, "step": 17852 }, { "epoch": 0.5579375, "grad_norm": 3.0625, "grad_norm_var": 0.010563151041666666, "learning_rate": 0.0001, "loss": 5.7112, "loss/crossentropy": 2.5469366312026978, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16642998158931732, "step": 17854 }, { "epoch": 0.558, "grad_norm": 3.265625, "grad_norm_var": 0.009479777018229166, "learning_rate": 0.0001, "loss": 5.6037, "loss/crossentropy": 2.4705650806427, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16369910538196564, "step": 17856 }, { "epoch": 0.5580625, "grad_norm": 3.140625, "grad_norm_var": 0.010270182291666667, "learning_rate": 0.0001, "loss": 5.7583, "loss/crossentropy": 2.622636079788208, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.166298508644104, "step": 17858 }, { "epoch": 0.558125, "grad_norm": 3.296875, "grad_norm_var": 0.011506144205729167, "learning_rate": 0.0001, "loss": 5.7669, "loss/crossentropy": 2.583187699317932, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17032814025878906, "step": 17860 }, { "epoch": 0.5581875, "grad_norm": 4.0, "grad_norm_var": 0.07049051920572917, "learning_rate": 0.0001, "loss": 6.0502, "loss/crossentropy": 2.632668972015381, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18433555960655212, "step": 17862 }, { "epoch": 0.55825, "grad_norm": 3.203125, "grad_norm_var": 0.06819254557291667, "learning_rate": 0.0001, "loss": 5.5807, "loss/crossentropy": 2.4633195400238037, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16525230556726456, "step": 17864 }, { "epoch": 0.5583125, "grad_norm": 3.1875, "grad_norm_var": 0.06282450358072916, "learning_rate": 0.0001, "loss": 5.2431, "loss/crossentropy": 2.2175941467285156, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15606901794672012, "step": 17866 }, { "epoch": 0.558375, "grad_norm": 3.25, "grad_norm_var": 0.05722249348958333, "learning_rate": 0.0001, "loss": 5.9187, "loss/crossentropy": 2.6782848834991455, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17248325049877167, "step": 17868 }, { "epoch": 0.5584375, "grad_norm": 2.953125, "grad_norm_var": 0.06363525390625, "learning_rate": 0.0001, "loss": 5.1332, "loss/crossentropy": 2.3006467819213867, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.14184421300888062, "step": 17870 }, { "epoch": 0.5585, "grad_norm": 3.375, "grad_norm_var": 0.0641754150390625, "learning_rate": 0.0001, "loss": 5.8978, "loss/crossentropy": 2.6230685710906982, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1778615415096283, "step": 17872 }, { "epoch": 0.5585625, "grad_norm": 3.28125, "grad_norm_var": 0.0650543212890625, "learning_rate": 0.0001, "loss": 5.8439, "loss/crossentropy": 2.6285284757614136, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1715351864695549, "step": 17874 }, { "epoch": 0.558625, "grad_norm": 3.1875, "grad_norm_var": 0.07255757649739583, "learning_rate": 0.0001, "loss": 5.6397, "loss/crossentropy": 2.5209524631500244, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.163827583193779, "step": 17876 }, { "epoch": 0.5586875, "grad_norm": 3.28125, "grad_norm_var": 0.018196614583333333, "learning_rate": 0.0001, "loss": 5.662, "loss/crossentropy": 2.5259251594543457, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1639961451292038, "step": 17878 }, { "epoch": 0.55875, "grad_norm": 3.09375, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 6.034, "loss/crossentropy": 2.708481788635254, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17786332964897156, "step": 17880 }, { "epoch": 0.5588125, "grad_norm": 2.859375, "grad_norm_var": 0.029423014322916666, "learning_rate": 0.0001, "loss": 5.523, "loss/crossentropy": 2.5217885971069336, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15715662389993668, "step": 17882 }, { "epoch": 0.558875, "grad_norm": 3.5625, "grad_norm_var": 0.037581380208333334, "learning_rate": 0.0001, "loss": 6.0094, "loss/crossentropy": 2.653617262840271, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1824529990553856, "step": 17884 }, { "epoch": 0.5589375, "grad_norm": 3.078125, "grad_norm_var": 0.03511962890625, "learning_rate": 0.0001, "loss": 5.7433, "loss/crossentropy": 2.633055090904236, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16414683312177658, "step": 17886 }, { "epoch": 0.559, "grad_norm": 3.1875, "grad_norm_var": 0.03013916015625, "learning_rate": 0.0001, "loss": 5.8918, "loss/crossentropy": 2.649180054664612, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1738693118095398, "step": 17888 }, { "epoch": 0.5590625, "grad_norm": 3.140625, "grad_norm_var": 0.0300445556640625, "learning_rate": 0.0001, "loss": 5.9866, "loss/crossentropy": 2.6964234113693237, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17784736305475235, "step": 17890 }, { "epoch": 0.559125, "grad_norm": 2.984375, "grad_norm_var": 0.029227701822916667, "learning_rate": 0.0001, "loss": 5.8944, "loss/crossentropy": 2.665500283241272, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17210885137319565, "step": 17892 }, { "epoch": 0.5591875, "grad_norm": 3.03125, "grad_norm_var": 0.030887858072916666, "learning_rate": 0.0001, "loss": 5.6217, "loss/crossentropy": 2.572103977203369, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16042818129062653, "step": 17894 }, { "epoch": 0.55925, "grad_norm": 3.078125, "grad_norm_var": 0.0301910400390625, "learning_rate": 0.0001, "loss": 5.9123, "loss/crossentropy": 2.677404046058655, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17582855373620987, "step": 17896 }, { "epoch": 0.5593125, "grad_norm": 3.109375, "grad_norm_var": 0.020881144205729167, "learning_rate": 0.0001, "loss": 5.8686, "loss/crossentropy": 2.6037358045578003, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17687822878360748, "step": 17898 }, { "epoch": 0.559375, "grad_norm": 3.28125, "grad_norm_var": 0.0111724853515625, "learning_rate": 0.0001, "loss": 5.7929, "loss/crossentropy": 2.577305555343628, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1723366603255272, "step": 17900 }, { "epoch": 0.5594375, "grad_norm": 2.984375, "grad_norm_var": 0.0213043212890625, "learning_rate": 0.0001, "loss": 5.7368, "loss/crossentropy": 2.5786736011505127, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1685493439435959, "step": 17902 }, { "epoch": 0.5595, "grad_norm": 3.21875, "grad_norm_var": 0.02183837890625, "learning_rate": 0.0001, "loss": 5.9743, "loss/crossentropy": 2.698343873023987, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17915450781583786, "step": 17904 }, { "epoch": 0.5595625, "grad_norm": 3.21875, "grad_norm_var": 0.023421223958333334, "learning_rate": 0.0001, "loss": 5.6588, "loss/crossentropy": 2.5257489681243896, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16681639850139618, "step": 17906 }, { "epoch": 0.559625, "grad_norm": 3.09375, "grad_norm_var": 0.0233551025390625, "learning_rate": 0.0001, "loss": 5.7083, "loss/crossentropy": 2.522016763687134, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16980086266994476, "step": 17908 }, { "epoch": 0.5596875, "grad_norm": 2.84375, "grad_norm_var": 0.025414021809895833, "learning_rate": 0.0001, "loss": 5.5409, "loss/crossentropy": 2.454635977745056, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16252947598695755, "step": 17910 }, { "epoch": 0.55975, "grad_norm": 2.8125, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 5.3777, "loss/crossentropy": 2.4028183221817017, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.152959406375885, "step": 17912 }, { "epoch": 0.5598125, "grad_norm": 3.21875, "grad_norm_var": 0.034468587239583334, "learning_rate": 0.0001, "loss": 5.9517, "loss/crossentropy": 2.686766028404236, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17844261229038239, "step": 17914 }, { "epoch": 0.559875, "grad_norm": 3.28125, "grad_norm_var": 0.03355204264322917, "learning_rate": 0.0001, "loss": 5.9998, "loss/crossentropy": 2.801062822341919, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17182545363903046, "step": 17916 }, { "epoch": 0.5599375, "grad_norm": 3.078125, "grad_norm_var": 0.03743387858072917, "learning_rate": 0.0001, "loss": 5.9728, "loss/crossentropy": 2.7022135257720947, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17236721515655518, "step": 17918 }, { "epoch": 0.56, "grad_norm": 3.1875, "grad_norm_var": 0.0369537353515625, "learning_rate": 0.0001, "loss": 5.8791, "loss/crossentropy": 2.6077463626861572, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17674753814935684, "step": 17920 }, { "epoch": 0.5600625, "grad_norm": 2.96875, "grad_norm_var": 0.0353912353515625, "learning_rate": 0.0001, "loss": 5.7588, "loss/crossentropy": 2.6787171363830566, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16191715747117996, "step": 17922 }, { "epoch": 0.560125, "grad_norm": 3.09375, "grad_norm_var": 0.033812459309895834, "learning_rate": 0.0001, "loss": 5.5304, "loss/crossentropy": 2.386778473854065, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16592152416706085, "step": 17924 }, { "epoch": 0.5601875, "grad_norm": 3.109375, "grad_norm_var": 0.030985514322916668, "learning_rate": 0.0001, "loss": 6.1054, "loss/crossentropy": 2.827863335609436, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1765836700797081, "step": 17926 }, { "epoch": 0.56025, "grad_norm": 4.0625, "grad_norm_var": 0.08624674479166666, "learning_rate": 0.0001, "loss": 6.5417, "loss/crossentropy": 2.92037570476532, "loss/hidden": 1.60546875, "loss/jsd": 0.0, "loss/logits": 0.20158959925174713, "step": 17928 }, { "epoch": 0.5603125, "grad_norm": 3.484375, "grad_norm_var": 0.1042633056640625, "learning_rate": 0.0001, "loss": 5.62, "loss/crossentropy": 2.3516818284988403, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17214687913656235, "step": 17930 }, { "epoch": 0.560375, "grad_norm": 3.125, "grad_norm_var": 0.10723368326822917, "learning_rate": 0.0001, "loss": 5.599, "loss/crossentropy": 2.457643985748291, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16530482470989227, "step": 17932 }, { "epoch": 0.5604375, "grad_norm": 3.421875, "grad_norm_var": 0.09968973795572916, "learning_rate": 0.0001, "loss": 5.6505, "loss/crossentropy": 2.549486994743347, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1655661016702652, "step": 17934 }, { "epoch": 0.5605, "grad_norm": 3.125, "grad_norm_var": 0.10122782389322917, "learning_rate": 0.0001, "loss": 5.8692, "loss/crossentropy": 2.6171056032180786, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17521066218614578, "step": 17936 }, { "epoch": 0.5605625, "grad_norm": 3.0625, "grad_norm_var": 0.09503580729166666, "learning_rate": 0.0001, "loss": 5.907, "loss/crossentropy": 2.6747989654541016, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17439532279968262, "step": 17938 }, { "epoch": 0.560625, "grad_norm": 3.296875, "grad_norm_var": 0.08818359375, "learning_rate": 0.0001, "loss": 5.9037, "loss/crossentropy": 2.6498863697052, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17733769863843918, "step": 17940 }, { "epoch": 0.5606875, "grad_norm": 3.015625, "grad_norm_var": 0.1028472900390625, "learning_rate": 0.0001, "loss": 5.7041, "loss/crossentropy": 2.5769314765930176, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16389025747776031, "step": 17942 }, { "epoch": 0.56075, "grad_norm": 2.890625, "grad_norm_var": 0.0528472900390625, "learning_rate": 0.0001, "loss": 5.5264, "loss/crossentropy": 2.438861846923828, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1603192836046219, "step": 17944 }, { "epoch": 0.5608125, "grad_norm": 4.0625, "grad_norm_var": 0.07902018229166667, "learning_rate": 0.0001, "loss": 5.6449, "loss/crossentropy": 2.5336352586746216, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16385617852210999, "step": 17946 }, { "epoch": 0.560875, "grad_norm": 3.484375, "grad_norm_var": 0.3541493733723958, "learning_rate": 0.0001, "loss": 5.8157, "loss/crossentropy": 2.4811586141586304, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1822827011346817, "step": 17948 }, { "epoch": 0.5609375, "grad_norm": 3.265625, "grad_norm_var": 0.3592732747395833, "learning_rate": 0.0001, "loss": 5.6334, "loss/crossentropy": 2.538175344467163, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16108077764511108, "step": 17950 }, { "epoch": 0.561, "grad_norm": 3.4375, "grad_norm_var": 0.36804097493489585, "learning_rate": 0.0001, "loss": 5.8501, "loss/crossentropy": 2.6697648763656616, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16998320072889328, "step": 17952 }, { "epoch": 0.5610625, "grad_norm": 2.921875, "grad_norm_var": 0.3748982747395833, "learning_rate": 0.0001, "loss": 5.6979, "loss/crossentropy": 2.6105103492736816, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16029831767082214, "step": 17954 }, { "epoch": 0.561125, "grad_norm": 4.75, "grad_norm_var": 0.5028483072916666, "learning_rate": 0.0001, "loss": 5.7485, "loss/crossentropy": 2.5200387239456177, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17089248448610306, "step": 17956 }, { "epoch": 0.5611875, "grad_norm": 3.25, "grad_norm_var": 0.47515869140625, "learning_rate": 0.0001, "loss": 6.0636, "loss/crossentropy": 2.766772508621216, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18007465451955795, "step": 17958 }, { "epoch": 0.56125, "grad_norm": 3.578125, "grad_norm_var": 0.4588826497395833, "learning_rate": 0.0001, "loss": 5.9404, "loss/crossentropy": 2.672636032104492, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17794343829154968, "step": 17960 }, { "epoch": 0.5613125, "grad_norm": 3.125, "grad_norm_var": 0.4217732747395833, "learning_rate": 0.0001, "loss": 5.8003, "loss/crossentropy": 2.610080599784851, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16511627286672592, "step": 17962 }, { "epoch": 0.561375, "grad_norm": 3.28125, "grad_norm_var": 0.19097900390625, "learning_rate": 0.0001, "loss": 5.6038, "loss/crossentropy": 2.4610944986343384, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16622376441955566, "step": 17964 }, { "epoch": 0.5614375, "grad_norm": 3.140625, "grad_norm_var": 0.19611714680989584, "learning_rate": 0.0001, "loss": 5.5325, "loss/crossentropy": 2.4628101587295532, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16165287792682648, "step": 17966 }, { "epoch": 0.5615, "grad_norm": 3.453125, "grad_norm_var": 0.18842671712239584, "learning_rate": 0.0001, "loss": 5.8332, "loss/crossentropy": 2.6182034015655518, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17267628014087677, "step": 17968 }, { "epoch": 0.5615625, "grad_norm": 3.203125, "grad_norm_var": 0.17791341145833334, "learning_rate": 0.0001, "loss": 5.9445, "loss/crossentropy": 2.6934006214141846, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1743318811058998, "step": 17970 }, { "epoch": 0.561625, "grad_norm": 3.625, "grad_norm_var": 0.043929036458333334, "learning_rate": 0.0001, "loss": 5.7551, "loss/crossentropy": 2.579498291015625, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16755972057580948, "step": 17972 }, { "epoch": 0.5616875, "grad_norm": 3.5625, "grad_norm_var": 0.04993387858072917, "learning_rate": 0.0001, "loss": 5.9624, "loss/crossentropy": 2.709150195121765, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17571967840194702, "step": 17974 }, { "epoch": 0.56175, "grad_norm": 3.5, "grad_norm_var": 0.05461324055989583, "learning_rate": 0.0001, "loss": 5.5391, "loss/crossentropy": 2.4214595556259155, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16449732333421707, "step": 17976 }, { "epoch": 0.5618125, "grad_norm": 3.59375, "grad_norm_var": 0.07040608723958333, "learning_rate": 0.0001, "loss": 6.1091, "loss/crossentropy": 2.7031766176223755, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18746396899223328, "step": 17978 }, { "epoch": 0.561875, "grad_norm": 3.25, "grad_norm_var": 0.0666015625, "learning_rate": 0.0001, "loss": 5.5015, "loss/crossentropy": 2.404158592224121, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16207338869571686, "step": 17980 }, { "epoch": 0.5619375, "grad_norm": 3.078125, "grad_norm_var": 0.06629130045572916, "learning_rate": 0.0001, "loss": 5.6953, "loss/crossentropy": 2.560208320617676, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16585327684879303, "step": 17982 }, { "epoch": 0.562, "grad_norm": 3.09375, "grad_norm_var": 0.06707356770833334, "learning_rate": 0.0001, "loss": 5.4656, "loss/crossentropy": 2.439324140548706, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15653540194034576, "step": 17984 }, { "epoch": 0.5620625, "grad_norm": 3.078125, "grad_norm_var": 0.07502339680989584, "learning_rate": 0.0001, "loss": 5.6782, "loss/crossentropy": 2.511128067970276, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16944289952516556, "step": 17986 }, { "epoch": 0.562125, "grad_norm": 3.046875, "grad_norm_var": 0.06309305826822917, "learning_rate": 0.0001, "loss": 5.772, "loss/crossentropy": 2.579297661781311, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1692730113863945, "step": 17988 }, { "epoch": 0.5621875, "grad_norm": 3.046875, "grad_norm_var": 0.0550201416015625, "learning_rate": 0.0001, "loss": 5.5482, "loss/crossentropy": 2.4625638723373413, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16129494458436966, "step": 17990 }, { "epoch": 0.56225, "grad_norm": 3.140625, "grad_norm_var": 0.0456451416015625, "learning_rate": 0.0001, "loss": 5.5685, "loss/crossentropy": 2.4083163738250732, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16602043062448502, "step": 17992 }, { "epoch": 0.5623125, "grad_norm": 3.15625, "grad_norm_var": 0.0220367431640625, "learning_rate": 0.0001, "loss": 5.7701, "loss/crossentropy": 2.5844286680221558, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16973651200532913, "step": 17994 }, { "epoch": 0.562375, "grad_norm": 3.09375, "grad_norm_var": 0.0264556884765625, "learning_rate": 0.0001, "loss": 6.0729, "loss/crossentropy": 2.7710838317871094, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17784149944782257, "step": 17996 }, { "epoch": 0.5624375, "grad_norm": 3.046875, "grad_norm_var": 0.026009114583333333, "learning_rate": 0.0001, "loss": 5.6528, "loss/crossentropy": 2.5517423152923584, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16479262709617615, "step": 17998 }, { "epoch": 0.5625, "grad_norm": 3.109375, "grad_norm_var": 0.024543253580729167, "learning_rate": 0.0001, "loss": 5.6744, "loss/crossentropy": 2.527396082878113, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1670411452651024, "step": 18000 }, { "epoch": 0.5625625, "grad_norm": 2.953125, "grad_norm_var": 0.026707967122395832, "learning_rate": 0.0001, "loss": 5.4326, "loss/crossentropy": 2.3805371522903442, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1598893702030182, "step": 18002 }, { "epoch": 0.562625, "grad_norm": 3.203125, "grad_norm_var": 0.01998291015625, "learning_rate": 0.0001, "loss": 5.5445, "loss/crossentropy": 2.423843502998352, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1671416386961937, "step": 18004 }, { "epoch": 0.5626875, "grad_norm": 3.34375, "grad_norm_var": 0.0218170166015625, "learning_rate": 0.0001, "loss": 6.0422, "loss/crossentropy": 2.7827892303466797, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17320791631937027, "step": 18006 }, { "epoch": 0.56275, "grad_norm": 3.328125, "grad_norm_var": 0.017894490559895834, "learning_rate": 0.0001, "loss": 5.5495, "loss/crossentropy": 2.460228204727173, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16322024911642075, "step": 18008 }, { "epoch": 0.5628125, "grad_norm": 3.203125, "grad_norm_var": 0.020308430989583334, "learning_rate": 0.0001, "loss": 5.5649, "loss/crossentropy": 2.450485348701477, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16378730535507202, "step": 18010 }, { "epoch": 0.562875, "grad_norm": 3.78125, "grad_norm_var": 0.04619140625, "learning_rate": 0.0001, "loss": 5.4887, "loss/crossentropy": 2.4171236753463745, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1591072902083397, "step": 18012 }, { "epoch": 0.5629375, "grad_norm": 3.0, "grad_norm_var": 0.04705403645833333, "learning_rate": 0.0001, "loss": 5.3726, "loss/crossentropy": 2.2885377407073975, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15801254659891129, "step": 18014 }, { "epoch": 0.563, "grad_norm": 3.015625, "grad_norm_var": 0.04920145670572917, "learning_rate": 0.0001, "loss": 5.6008, "loss/crossentropy": 2.5071990489959717, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16052991151809692, "step": 18016 }, { "epoch": 0.5630625, "grad_norm": 3.09375, "grad_norm_var": 0.042801920572916666, "learning_rate": 0.0001, "loss": 5.8623, "loss/crossentropy": 2.7066714763641357, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1702457219362259, "step": 18018 }, { "epoch": 0.563125, "grad_norm": 3.09375, "grad_norm_var": 0.04732666015625, "learning_rate": 0.0001, "loss": 6.058, "loss/crossentropy": 2.8239424228668213, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17106150835752487, "step": 18020 }, { "epoch": 0.5631875, "grad_norm": 3.0625, "grad_norm_var": 0.0458984375, "learning_rate": 0.0001, "loss": 5.6258, "loss/crossentropy": 2.5434820652008057, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1637033000588417, "step": 18022 }, { "epoch": 0.56325, "grad_norm": 3.25, "grad_norm_var": 0.0510406494140625, "learning_rate": 0.0001, "loss": 5.5696, "loss/crossentropy": 2.4420065879821777, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16666939854621887, "step": 18024 }, { "epoch": 0.5633125, "grad_norm": 3.953125, "grad_norm_var": 0.0898101806640625, "learning_rate": 0.0001, "loss": 5.8895, "loss/crossentropy": 2.6028175354003906, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17671451717615128, "step": 18026 }, { "epoch": 0.563375, "grad_norm": 3.296875, "grad_norm_var": 0.08720296223958333, "learning_rate": 0.0001, "loss": 5.7647, "loss/crossentropy": 2.6395243406295776, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16290399432182312, "step": 18028 }, { "epoch": 0.5634375, "grad_norm": 3.375, "grad_norm_var": 0.08398030598958334, "learning_rate": 0.0001, "loss": 5.6807, "loss/crossentropy": 2.485153317451477, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16877098381519318, "step": 18030 }, { "epoch": 0.5635, "grad_norm": 3.71875, "grad_norm_var": 0.0890289306640625, "learning_rate": 0.0001, "loss": 5.8858, "loss/crossentropy": 2.686255097389221, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17073087394237518, "step": 18032 }, { "epoch": 0.5635625, "grad_norm": 3.0625, "grad_norm_var": 0.09696858723958333, "learning_rate": 0.0001, "loss": 5.639, "loss/crossentropy": 2.5935487747192383, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16040372848510742, "step": 18034 }, { "epoch": 0.563625, "grad_norm": 3.234375, "grad_norm_var": 0.09919331868489584, "learning_rate": 0.0001, "loss": 5.5149, "loss/crossentropy": 2.442253351211548, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16273579001426697, "step": 18036 }, { "epoch": 0.5636875, "grad_norm": 3.15625, "grad_norm_var": 0.0981353759765625, "learning_rate": 0.0001, "loss": 5.7802, "loss/crossentropy": 2.650180459022522, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1661262959241867, "step": 18038 }, { "epoch": 0.56375, "grad_norm": 3.59375, "grad_norm_var": 0.09782613118489583, "learning_rate": 0.0001, "loss": 5.7985, "loss/crossentropy": 2.612444758415222, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17055834829807281, "step": 18040 }, { "epoch": 0.5638125, "grad_norm": 3.375, "grad_norm_var": 0.06830952962239584, "learning_rate": 0.0001, "loss": 5.4155, "loss/crossentropy": 2.27899968624115, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16404490172863007, "step": 18042 }, { "epoch": 0.563875, "grad_norm": 3.078125, "grad_norm_var": 0.04676106770833333, "learning_rate": 0.0001, "loss": 5.6563, "loss/crossentropy": 2.4841731786727905, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1707303375005722, "step": 18044 }, { "epoch": 0.5639375, "grad_norm": 2.9375, "grad_norm_var": 0.04976298014322917, "learning_rate": 0.0001, "loss": 5.3333, "loss/crossentropy": 2.3177614212036133, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15624137222766876, "step": 18046 }, { "epoch": 0.564, "grad_norm": 2.984375, "grad_norm_var": 0.05961812337239583, "learning_rate": 0.0001, "loss": 5.7262, "loss/crossentropy": 2.5988024473190308, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16312655061483383, "step": 18048 }, { "epoch": 0.5640625, "grad_norm": 3.203125, "grad_norm_var": 0.0648345947265625, "learning_rate": 0.0001, "loss": 5.5478, "loss/crossentropy": 2.4362375736236572, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16584526747465134, "step": 18050 }, { "epoch": 0.564125, "grad_norm": 3.125, "grad_norm_var": 0.06805013020833334, "learning_rate": 0.0001, "loss": 5.7547, "loss/crossentropy": 2.570965528488159, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16837147623300552, "step": 18052 }, { "epoch": 0.5641875, "grad_norm": 3.90625, "grad_norm_var": 0.09541727701822916, "learning_rate": 0.0001, "loss": 5.905, "loss/crossentropy": 2.674785614013672, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17458803951740265, "step": 18054 }, { "epoch": 0.56425, "grad_norm": 3.0625, "grad_norm_var": 0.08625895182291667, "learning_rate": 0.0001, "loss": 5.9082, "loss/crossentropy": 2.652057647705078, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17444629222154617, "step": 18056 }, { "epoch": 0.5643125, "grad_norm": 2.8125, "grad_norm_var": 0.09990946451822917, "learning_rate": 0.0001, "loss": 5.784, "loss/crossentropy": 2.6740591526031494, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1664658635854721, "step": 18058 }, { "epoch": 0.564375, "grad_norm": 3.28125, "grad_norm_var": 0.10211181640625, "learning_rate": 0.0001, "loss": 5.8103, "loss/crossentropy": 2.5839916467666626, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17185447365045547, "step": 18060 }, { "epoch": 0.5644375, "grad_norm": 3.125, "grad_norm_var": 0.0981842041015625, "learning_rate": 0.0001, "loss": 5.5052, "loss/crossentropy": 2.3681094646453857, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1641031578183174, "step": 18062 }, { "epoch": 0.5645, "grad_norm": 3.21875, "grad_norm_var": 0.07350260416666667, "learning_rate": 0.0001, "loss": 5.9768, "loss/crossentropy": 2.7085739374160767, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17760029435157776, "step": 18064 }, { "epoch": 0.5645625, "grad_norm": 3.03125, "grad_norm_var": 0.06754150390625, "learning_rate": 0.0001, "loss": 5.6523, "loss/crossentropy": 2.599989175796509, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16304685175418854, "step": 18066 }, { "epoch": 0.564625, "grad_norm": 3.34375, "grad_norm_var": 0.0619140625, "learning_rate": 0.0001, "loss": 5.6176, "loss/crossentropy": 2.507688045501709, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16021274030208588, "step": 18068 }, { "epoch": 0.5646875, "grad_norm": 3.171875, "grad_norm_var": 0.025516764322916666, "learning_rate": 0.0001, "loss": 5.6256, "loss/crossentropy": 2.464373230934143, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1711968332529068, "step": 18070 }, { "epoch": 0.56475, "grad_norm": 2.90625, "grad_norm_var": 0.029280598958333334, "learning_rate": 0.0001, "loss": 5.476, "loss/crossentropy": 2.4423896074295044, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16078495979309082, "step": 18072 }, { "epoch": 0.5648125, "grad_norm": 3.09375, "grad_norm_var": 0.021833292643229165, "learning_rate": 0.0001, "loss": 6.0034, "loss/crossentropy": 2.7145031690597534, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17889142036437988, "step": 18074 }, { "epoch": 0.564875, "grad_norm": 3.15625, "grad_norm_var": 0.01295166015625, "learning_rate": 0.0001, "loss": 5.9951, "loss/crossentropy": 2.786222457885742, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17284026741981506, "step": 18076 }, { "epoch": 0.5649375, "grad_norm": 3.15625, "grad_norm_var": 0.013411458333333333, "learning_rate": 0.0001, "loss": 5.8743, "loss/crossentropy": 2.638497829437256, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17514370381832123, "step": 18078 }, { "epoch": 0.565, "grad_norm": 3.546875, "grad_norm_var": 0.022166951497395834, "learning_rate": 0.0001, "loss": 5.9054, "loss/crossentropy": 2.6657952070236206, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.172395721077919, "step": 18080 }, { "epoch": 0.5650625, "grad_norm": 2.875, "grad_norm_var": 0.026883951822916665, "learning_rate": 0.0001, "loss": 5.417, "loss/crossentropy": 2.3664300441741943, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15935391187667847, "step": 18082 }, { "epoch": 0.565125, "grad_norm": 3.296875, "grad_norm_var": 0.024332682291666668, "learning_rate": 0.0001, "loss": 5.7181, "loss/crossentropy": 2.5044695138931274, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17058125138282776, "step": 18084 }, { "epoch": 0.5651875, "grad_norm": 2.953125, "grad_norm_var": 0.027928670247395832, "learning_rate": 0.0001, "loss": 5.6824, "loss/crossentropy": 2.5775527954101562, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16126485168933868, "step": 18086 }, { "epoch": 0.56525, "grad_norm": 3.1875, "grad_norm_var": 0.0307769775390625, "learning_rate": 0.0001, "loss": 5.8573, "loss/crossentropy": 2.610924482345581, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1703444868326187, "step": 18088 }, { "epoch": 0.5653125, "grad_norm": 3.375, "grad_norm_var": 0.031722005208333334, "learning_rate": 0.0001, "loss": 6.1673, "loss/crossentropy": 2.812220335006714, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18551187962293625, "step": 18090 }, { "epoch": 0.565375, "grad_norm": 3.15625, "grad_norm_var": 0.0330963134765625, "learning_rate": 0.0001, "loss": 5.709, "loss/crossentropy": 2.5566617250442505, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16836372017860413, "step": 18092 }, { "epoch": 0.5654375, "grad_norm": 3.0625, "grad_norm_var": 0.03693033854166667, "learning_rate": 0.0001, "loss": 5.6252, "loss/crossentropy": 2.5410051345825195, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1635015681385994, "step": 18094 }, { "epoch": 0.5655, "grad_norm": 3.21875, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 5.7779, "loss/crossentropy": 2.598138451576233, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1687622368335724, "step": 18096 }, { "epoch": 0.5655625, "grad_norm": 3.296875, "grad_norm_var": 0.024739583333333332, "learning_rate": 0.0001, "loss": 5.8979, "loss/crossentropy": 2.6277064085006714, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17662711441516876, "step": 18098 }, { "epoch": 0.565625, "grad_norm": 3.609375, "grad_norm_var": 0.033984375, "learning_rate": 0.0001, "loss": 5.5418, "loss/crossentropy": 2.3807681798934937, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16727932542562485, "step": 18100 }, { "epoch": 0.5656875, "grad_norm": 3.3125, "grad_norm_var": 0.03699442545572917, "learning_rate": 0.0001, "loss": 5.6414, "loss/crossentropy": 2.5439178943634033, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16287732124328613, "step": 18102 }, { "epoch": 0.56575, "grad_norm": 3.234375, "grad_norm_var": 0.04049479166666667, "learning_rate": 0.0001, "loss": 6.0229, "loss/crossentropy": 2.7287250757217407, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17551034688949585, "step": 18104 }, { "epoch": 0.5658125, "grad_norm": 3.140625, "grad_norm_var": 0.04299723307291667, "learning_rate": 0.0001, "loss": 5.6857, "loss/crossentropy": 2.589537262916565, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1643010824918747, "step": 18106 }, { "epoch": 0.565875, "grad_norm": 3.375, "grad_norm_var": 0.04282938639322917, "learning_rate": 0.0001, "loss": 5.4694, "loss/crossentropy": 2.3726253509521484, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15928775072097778, "step": 18108 }, { "epoch": 0.5659375, "grad_norm": 3.5, "grad_norm_var": 0.040299479166666666, "learning_rate": 0.0001, "loss": 6.0452, "loss/crossentropy": 2.6985702514648438, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1842763051390648, "step": 18110 }, { "epoch": 0.566, "grad_norm": 3.03125, "grad_norm_var": 0.04254150390625, "learning_rate": 0.0001, "loss": 5.8419, "loss/crossentropy": 2.572100520133972, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17620253562927246, "step": 18112 }, { "epoch": 0.5660625, "grad_norm": 2.671875, "grad_norm_var": 0.0853424072265625, "learning_rate": 0.0001, "loss": 5.4406, "loss/crossentropy": 2.40678334236145, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15767619758844376, "step": 18114 }, { "epoch": 0.566125, "grad_norm": 3.359375, "grad_norm_var": 0.07928059895833334, "learning_rate": 0.0001, "loss": 5.9884, "loss/crossentropy": 2.7045150995254517, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17838776856660843, "step": 18116 }, { "epoch": 0.5661875, "grad_norm": 3.21875, "grad_norm_var": 0.07405192057291667, "learning_rate": 0.0001, "loss": 5.853, "loss/crossentropy": 2.690333843231201, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16548164188861847, "step": 18118 }, { "epoch": 0.56625, "grad_norm": 2.765625, "grad_norm_var": 0.0814849853515625, "learning_rate": 0.0001, "loss": 5.5055, "loss/crossentropy": 2.431414484977722, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.162482850253582, "step": 18120 }, { "epoch": 0.5663125, "grad_norm": 3.328125, "grad_norm_var": 0.08037109375, "learning_rate": 0.0001, "loss": 5.6249, "loss/crossentropy": 2.4767180681228638, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16638048738241196, "step": 18122 }, { "epoch": 0.566375, "grad_norm": 3.0, "grad_norm_var": 0.08516337076822916, "learning_rate": 0.0001, "loss": 5.6001, "loss/crossentropy": 2.540703773498535, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16140630096197128, "step": 18124 }, { "epoch": 0.5664375, "grad_norm": 2.890625, "grad_norm_var": 0.07871805826822917, "learning_rate": 0.0001, "loss": 5.5986, "loss/crossentropy": 2.5523539781570435, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1585291028022766, "step": 18126 }, { "epoch": 0.5665, "grad_norm": 3.265625, "grad_norm_var": 0.07947489420572916, "learning_rate": 0.0001, "loss": 5.6393, "loss/crossentropy": 2.4613728523254395, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16778801381587982, "step": 18128 }, { "epoch": 0.5665625, "grad_norm": 3.09375, "grad_norm_var": 0.047998046875, "learning_rate": 0.0001, "loss": 5.7893, "loss/crossentropy": 2.535299062728882, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17422835528850555, "step": 18130 }, { "epoch": 0.566625, "grad_norm": 3.453125, "grad_norm_var": 0.0640777587890625, "learning_rate": 0.0001, "loss": 5.9657, "loss/crossentropy": 2.6000982522964478, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.17679427564144135, "step": 18132 }, { "epoch": 0.5666875, "grad_norm": 3.28125, "grad_norm_var": 0.06733296712239584, "learning_rate": 0.0001, "loss": 5.5046, "loss/crossentropy": 2.4269354343414307, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16050058603286743, "step": 18134 }, { "epoch": 0.56675, "grad_norm": 3.359375, "grad_norm_var": 0.05455729166666667, "learning_rate": 0.0001, "loss": 6.1821, "loss/crossentropy": 2.747152090072632, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18959320336580276, "step": 18136 }, { "epoch": 0.5668125, "grad_norm": 3.234375, "grad_norm_var": 0.05486653645833333, "learning_rate": 0.0001, "loss": 5.8644, "loss/crossentropy": 2.642526865005493, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17297182232141495, "step": 18138 }, { "epoch": 0.566875, "grad_norm": 3.046875, "grad_norm_var": 0.048005167643229166, "learning_rate": 0.0001, "loss": 5.8307, "loss/crossentropy": 2.650195360183716, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1684427112340927, "step": 18140 }, { "epoch": 0.5669375, "grad_norm": 3.25, "grad_norm_var": 0.0490234375, "learning_rate": 0.0001, "loss": 5.5932, "loss/crossentropy": 2.5209991931915283, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15604948997497559, "step": 18142 }, { "epoch": 0.567, "grad_norm": 2.96875, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 5.6901, "loss/crossentropy": 2.4706814289093018, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16960027068853378, "step": 18144 }, { "epoch": 0.5670625, "grad_norm": 3.453125, "grad_norm_var": 0.05049540201822917, "learning_rate": 0.0001, "loss": 6.0374, "loss/crossentropy": 2.76907217502594, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17527476698160172, "step": 18146 }, { "epoch": 0.567125, "grad_norm": 3.15625, "grad_norm_var": 0.03345438639322917, "learning_rate": 0.0001, "loss": 5.6292, "loss/crossentropy": 2.4950783252716064, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16536633670330048, "step": 18148 }, { "epoch": 0.5671875, "grad_norm": 3.125, "grad_norm_var": 0.029670206705729167, "learning_rate": 0.0001, "loss": 5.6338, "loss/crossentropy": 2.4978891611099243, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16632362455129623, "step": 18150 }, { "epoch": 0.56725, "grad_norm": 3.03125, "grad_norm_var": 0.0312164306640625, "learning_rate": 0.0001, "loss": 5.4433, "loss/crossentropy": 2.4270702600479126, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1586506888270378, "step": 18152 }, { "epoch": 0.5673125, "grad_norm": 2.9375, "grad_norm_var": 0.03443603515625, "learning_rate": 0.0001, "loss": 5.6759, "loss/crossentropy": 2.593072295188904, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16570191085338593, "step": 18154 }, { "epoch": 0.567375, "grad_norm": 3.59375, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 5.6579, "loss/crossentropy": 2.4805086851119995, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1693010777235031, "step": 18156 }, { "epoch": 0.5674375, "grad_norm": 3.46875, "grad_norm_var": 0.05154622395833333, "learning_rate": 0.0001, "loss": 5.7272, "loss/crossentropy": 2.6106998920440674, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1635986492037773, "step": 18158 }, { "epoch": 0.5675, "grad_norm": 3.484375, "grad_norm_var": 0.04954325358072917, "learning_rate": 0.0001, "loss": 5.8808, "loss/crossentropy": 2.5786153078079224, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17865897715091705, "step": 18160 }, { "epoch": 0.5675625, "grad_norm": 3.375, "grad_norm_var": 0.04856669108072917, "learning_rate": 0.0001, "loss": 5.8635, "loss/crossentropy": 2.6471441984176636, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16890473663806915, "step": 18162 }, { "epoch": 0.567625, "grad_norm": 2.8125, "grad_norm_var": 0.06799214680989583, "learning_rate": 0.0001, "loss": 5.5292, "loss/crossentropy": 2.555854558944702, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1570984572172165, "step": 18164 }, { "epoch": 0.5676875, "grad_norm": 3.203125, "grad_norm_var": 0.06944071451822917, "learning_rate": 0.0001, "loss": 5.7648, "loss/crossentropy": 2.5097345113754272, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17394182085990906, "step": 18166 }, { "epoch": 0.56775, "grad_norm": 3.296875, "grad_norm_var": 0.06754150390625, "learning_rate": 0.0001, "loss": 5.915, "loss/crossentropy": 2.5866525173187256, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18127139657735825, "step": 18168 }, { "epoch": 0.5678125, "grad_norm": 3.078125, "grad_norm_var": 0.0651031494140625, "learning_rate": 0.0001, "loss": 5.6082, "loss/crossentropy": 2.573627233505249, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15931487083435059, "step": 18170 }, { "epoch": 0.567875, "grad_norm": 2.8125, "grad_norm_var": 0.06481831868489583, "learning_rate": 0.0001, "loss": 5.3138, "loss/crossentropy": 2.2177451848983765, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.15608614683151245, "step": 18172 }, { "epoch": 0.5679375, "grad_norm": 2.890625, "grad_norm_var": 0.06100260416666667, "learning_rate": 0.0001, "loss": 5.4315, "loss/crossentropy": 2.41492235660553, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16024888306856155, "step": 18174 }, { "epoch": 0.568, "grad_norm": 3.046875, "grad_norm_var": 0.05423177083333333, "learning_rate": 0.0001, "loss": 5.7092, "loss/crossentropy": 2.5794578790664673, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16609789431095123, "step": 18176 }, { "epoch": 0.5680625, "grad_norm": 3.328125, "grad_norm_var": 0.049153645833333336, "learning_rate": 0.0001, "loss": 5.723, "loss/crossentropy": 2.566588878631592, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16798074543476105, "step": 18178 }, { "epoch": 0.568125, "grad_norm": 3.671875, "grad_norm_var": 0.053511555989583334, "learning_rate": 0.0001, "loss": 5.5526, "loss/crossentropy": 2.4305126667022705, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16377465426921844, "step": 18180 }, { "epoch": 0.5681875, "grad_norm": 3.421875, "grad_norm_var": 0.053055826822916666, "learning_rate": 0.0001, "loss": 5.7102, "loss/crossentropy": 2.4845412969589233, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17451970279216766, "step": 18182 }, { "epoch": 0.56825, "grad_norm": 3.46875, "grad_norm_var": 0.05862223307291667, "learning_rate": 0.0001, "loss": 5.7808, "loss/crossentropy": 2.5853875875473022, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16837100684642792, "step": 18184 }, { "epoch": 0.5683125, "grad_norm": 3.0625, "grad_norm_var": 0.057249959309895834, "learning_rate": 0.0001, "loss": 5.764, "loss/crossentropy": 2.5821412801742554, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17013757675886154, "step": 18186 }, { "epoch": 0.568375, "grad_norm": 2.984375, "grad_norm_var": 0.04781901041666667, "learning_rate": 0.0001, "loss": 5.4794, "loss/crossentropy": 2.471649646759033, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1570231318473816, "step": 18188 }, { "epoch": 0.5684375, "grad_norm": 2.96875, "grad_norm_var": 0.04551493326822917, "learning_rate": 0.0001, "loss": 5.6079, "loss/crossentropy": 2.527212381362915, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1604156643152237, "step": 18190 }, { "epoch": 0.5685, "grad_norm": 3.671875, "grad_norm_var": 0.06295572916666667, "learning_rate": 0.0001, "loss": 5.9374, "loss/crossentropy": 2.703904390335083, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1725686490535736, "step": 18192 }, { "epoch": 0.5685625, "grad_norm": 3.125, "grad_norm_var": 0.060282389322916664, "learning_rate": 0.0001, "loss": 5.9282, "loss/crossentropy": 2.6780800819396973, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17578843981027603, "step": 18194 }, { "epoch": 0.568625, "grad_norm": 3.328125, "grad_norm_var": 0.04529520670572917, "learning_rate": 0.0001, "loss": 5.8886, "loss/crossentropy": 2.6965173482894897, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16764166951179504, "step": 18196 }, { "epoch": 0.5686875, "grad_norm": 3.046875, "grad_norm_var": 0.045328776041666664, "learning_rate": 0.0001, "loss": 5.5994, "loss/crossentropy": 2.541398763656616, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16126679629087448, "step": 18198 }, { "epoch": 0.56875, "grad_norm": 3.3125, "grad_norm_var": 0.040461222330729164, "learning_rate": 0.0001, "loss": 5.799, "loss/crossentropy": 2.585677981376648, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1709369719028473, "step": 18200 }, { "epoch": 0.5688125, "grad_norm": 2.90625, "grad_norm_var": 0.05640360514322917, "learning_rate": 0.0001, "loss": 5.595, "loss/crossentropy": 2.4870870113372803, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16391517221927643, "step": 18202 }, { "epoch": 0.568875, "grad_norm": 3.109375, "grad_norm_var": 0.049470011393229166, "learning_rate": 0.0001, "loss": 5.6932, "loss/crossentropy": 2.5428810119628906, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16933035850524902, "step": 18204 }, { "epoch": 0.5689375, "grad_norm": 3.203125, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 5.7163, "loss/crossentropy": 2.5672882795333862, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17036515474319458, "step": 18206 }, { "epoch": 0.569, "grad_norm": 2.984375, "grad_norm_var": 0.029857381184895834, "learning_rate": 0.0001, "loss": 5.6548, "loss/crossentropy": 2.4710731506347656, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1695476919412613, "step": 18208 }, { "epoch": 0.5690625, "grad_norm": 3.046875, "grad_norm_var": 0.030793253580729166, "learning_rate": 0.0001, "loss": 5.9761, "loss/crossentropy": 2.7513391971588135, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17638200521469116, "step": 18210 }, { "epoch": 0.569125, "grad_norm": 3.671875, "grad_norm_var": 0.0537506103515625, "learning_rate": 0.0001, "loss": 5.7616, "loss/crossentropy": 2.5401495695114136, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1705845668911934, "step": 18212 }, { "epoch": 0.5691875, "grad_norm": 3.09375, "grad_norm_var": 0.0755859375, "learning_rate": 0.0001, "loss": 5.4031, "loss/crossentropy": 2.347140312194824, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15754767507314682, "step": 18214 }, { "epoch": 0.56925, "grad_norm": 3.015625, "grad_norm_var": 0.08241780598958333, "learning_rate": 0.0001, "loss": 5.6426, "loss/crossentropy": 2.508566737174988, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16965033113956451, "step": 18216 }, { "epoch": 0.5693125, "grad_norm": 2.984375, "grad_norm_var": 0.07366434733072917, "learning_rate": 0.0001, "loss": 5.7891, "loss/crossentropy": 2.6187134981155396, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16899175941944122, "step": 18218 }, { "epoch": 0.569375, "grad_norm": 3.015625, "grad_norm_var": 0.07834879557291667, "learning_rate": 0.0001, "loss": 5.4317, "loss/crossentropy": 2.37195086479187, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15557928383350372, "step": 18220 }, { "epoch": 0.5694375, "grad_norm": 3.09375, "grad_norm_var": 0.08283589680989584, "learning_rate": 0.0001, "loss": 5.7955, "loss/crossentropy": 2.6413161754608154, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16776670515537262, "step": 18222 }, { "epoch": 0.5695, "grad_norm": 3.078125, "grad_norm_var": 0.0821685791015625, "learning_rate": 0.0001, "loss": 5.5111, "loss/crossentropy": 2.4040359258651733, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1626596450805664, "step": 18224 }, { "epoch": 0.5695625, "grad_norm": 3.015625, "grad_norm_var": 0.08507486979166666, "learning_rate": 0.0001, "loss": 6.0612, "loss/crossentropy": 2.7181628942489624, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18117854744195938, "step": 18226 }, { "epoch": 0.569625, "grad_norm": 2.703125, "grad_norm_var": 0.07452799479166666, "learning_rate": 0.0001, "loss": 5.2521, "loss/crossentropy": 2.3347413539886475, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15111136436462402, "step": 18228 }, { "epoch": 0.5696875, "grad_norm": 3.5625, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 5.963, "loss/crossentropy": 2.7114120721817017, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17554765939712524, "step": 18230 }, { "epoch": 0.56975, "grad_norm": 3.203125, "grad_norm_var": 0.038407389322916666, "learning_rate": 0.0001, "loss": 5.815, "loss/crossentropy": 2.5795810222625732, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17354222387075424, "step": 18232 }, { "epoch": 0.5698125, "grad_norm": 3.0, "grad_norm_var": 0.04634501139322917, "learning_rate": 0.0001, "loss": 5.6635, "loss/crossentropy": 2.581420063972473, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16093791276216507, "step": 18234 }, { "epoch": 0.569875, "grad_norm": 2.828125, "grad_norm_var": 0.053694661458333334, "learning_rate": 0.0001, "loss": 5.7541, "loss/crossentropy": 2.6179747581481934, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16947119683027267, "step": 18236 }, { "epoch": 0.5699375, "grad_norm": 3.28125, "grad_norm_var": 0.057184855143229164, "learning_rate": 0.0001, "loss": 6.0011, "loss/crossentropy": 2.693076252937317, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17767733335494995, "step": 18238 }, { "epoch": 0.57, "grad_norm": 3.1875, "grad_norm_var": 0.0557037353515625, "learning_rate": 0.0001, "loss": 5.4824, "loss/crossentropy": 2.4300020933151245, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15836870670318604, "step": 18240 }, { "epoch": 0.5700625, "grad_norm": 2.984375, "grad_norm_var": 0.054255167643229164, "learning_rate": 0.0001, "loss": 5.9279, "loss/crossentropy": 2.6520771980285645, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17524328082799911, "step": 18242 }, { "epoch": 0.570125, "grad_norm": 3.296875, "grad_norm_var": 0.03718973795572917, "learning_rate": 0.0001, "loss": 5.6965, "loss/crossentropy": 2.4949214458465576, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1732865646481514, "step": 18244 }, { "epoch": 0.5701875, "grad_norm": 3.1875, "grad_norm_var": 0.029296875, "learning_rate": 0.0001, "loss": 5.475, "loss/crossentropy": 2.4005489349365234, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1621282771229744, "step": 18246 }, { "epoch": 0.57025, "grad_norm": 2.984375, "grad_norm_var": 0.03469645182291667, "learning_rate": 0.0001, "loss": 5.9914, "loss/crossentropy": 2.6719961166381836, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1819431409239769, "step": 18248 }, { "epoch": 0.5703125, "grad_norm": 2.953125, "grad_norm_var": 0.03255106608072917, "learning_rate": 0.0001, "loss": 5.7835, "loss/crossentropy": 2.5861481428146362, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17246808856725693, "step": 18250 }, { "epoch": 0.570375, "grad_norm": 3.203125, "grad_norm_var": 0.022977701822916665, "learning_rate": 0.0001, "loss": 5.6208, "loss/crossentropy": 2.5238444805145264, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16165026277303696, "step": 18252 }, { "epoch": 0.5704375, "grad_norm": 3.625, "grad_norm_var": 0.03359375, "learning_rate": 0.0001, "loss": 5.9077, "loss/crossentropy": 2.633137583732605, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1747267246246338, "step": 18254 }, { "epoch": 0.5705, "grad_norm": 3.21875, "grad_norm_var": 0.03357747395833333, "learning_rate": 0.0001, "loss": 5.8805, "loss/crossentropy": 2.618165612220764, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17584676295518875, "step": 18256 }, { "epoch": 0.5705625, "grad_norm": 3.3125, "grad_norm_var": 0.030745442708333334, "learning_rate": 0.0001, "loss": 5.8216, "loss/crossentropy": 2.597626805305481, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17278896272182465, "step": 18258 }, { "epoch": 0.570625, "grad_norm": 3.109375, "grad_norm_var": 0.030858357747395832, "learning_rate": 0.0001, "loss": 5.8035, "loss/crossentropy": 2.5978556871414185, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16822044551372528, "step": 18260 }, { "epoch": 0.5706875, "grad_norm": 3.265625, "grad_norm_var": 0.029832967122395835, "learning_rate": 0.0001, "loss": 5.8593, "loss/crossentropy": 2.6246622800827026, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17268268764019012, "step": 18262 }, { "epoch": 0.57075, "grad_norm": 3.21875, "grad_norm_var": 0.026558430989583333, "learning_rate": 0.0001, "loss": 5.7571, "loss/crossentropy": 2.5931451320648193, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16717618703842163, "step": 18264 }, { "epoch": 0.5708125, "grad_norm": 3.28125, "grad_norm_var": 0.034505208333333336, "learning_rate": 0.0001, "loss": 6.2368, "loss/crossentropy": 2.871632218360901, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18261412531137466, "step": 18266 }, { "epoch": 0.570875, "grad_norm": 3.109375, "grad_norm_var": 0.035074869791666664, "learning_rate": 0.0001, "loss": 5.8242, "loss/crossentropy": 2.6054731607437134, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16796521097421646, "step": 18268 }, { "epoch": 0.5709375, "grad_norm": 3.125, "grad_norm_var": 0.026041666666666668, "learning_rate": 0.0001, "loss": 5.5439, "loss/crossentropy": 2.4600071907043457, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16151829808950424, "step": 18270 }, { "epoch": 0.571, "grad_norm": 3.234375, "grad_norm_var": 0.026102701822916668, "learning_rate": 0.0001, "loss": 5.733, "loss/crossentropy": 2.5170669555664062, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1704181507229805, "step": 18272 }, { "epoch": 0.5710625, "grad_norm": 3.15625, "grad_norm_var": 0.02740478515625, "learning_rate": 0.0001, "loss": 5.5909, "loss/crossentropy": 2.5140548944473267, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15885886549949646, "step": 18274 }, { "epoch": 0.571125, "grad_norm": 3.1875, "grad_norm_var": 0.026904296875, "learning_rate": 0.0001, "loss": 5.587, "loss/crossentropy": 2.5062899589538574, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1596328616142273, "step": 18276 }, { "epoch": 0.5711875, "grad_norm": 3.375, "grad_norm_var": 0.03625895182291667, "learning_rate": 0.0001, "loss": 5.6553, "loss/crossentropy": 2.5024070739746094, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16763748228549957, "step": 18278 }, { "epoch": 0.57125, "grad_norm": 2.96875, "grad_norm_var": 0.03748372395833333, "learning_rate": 0.0001, "loss": 5.7911, "loss/crossentropy": 2.642797827720642, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16873770952224731, "step": 18280 }, { "epoch": 0.5713125, "grad_norm": 3.28125, "grad_norm_var": 0.029710896809895835, "learning_rate": 0.0001, "loss": 5.9137, "loss/crossentropy": 2.618430018424988, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17640304565429688, "step": 18282 }, { "epoch": 0.571375, "grad_norm": 3.546875, "grad_norm_var": 0.03865458170572917, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.452351212501526, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.16691983491182327, "step": 18284 }, { "epoch": 0.5714375, "grad_norm": 3.140625, "grad_norm_var": 0.03557942708333333, "learning_rate": 0.0001, "loss": 5.7971, "loss/crossentropy": 2.5840392112731934, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17091979831457138, "step": 18286 }, { "epoch": 0.5715, "grad_norm": 3.09375, "grad_norm_var": 0.0437164306640625, "learning_rate": 0.0001, "loss": 5.5782, "loss/crossentropy": 2.480460524559021, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1621190384030342, "step": 18288 }, { "epoch": 0.5715625, "grad_norm": 3.125, "grad_norm_var": 0.04248758951822917, "learning_rate": 0.0001, "loss": 5.649, "loss/crossentropy": 2.4713305234909058, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1665937900543213, "step": 18290 }, { "epoch": 0.571625, "grad_norm": 2.875, "grad_norm_var": 0.0480621337890625, "learning_rate": 0.0001, "loss": 5.4988, "loss/crossentropy": 2.4572752714157104, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1596246212720871, "step": 18292 }, { "epoch": 0.5716875, "grad_norm": 3.3125, "grad_norm_var": 0.03931884765625, "learning_rate": 0.0001, "loss": 5.6017, "loss/crossentropy": 2.4361897706985474, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16694504022598267, "step": 18294 }, { "epoch": 0.57175, "grad_norm": 3.140625, "grad_norm_var": 0.04073893229166667, "learning_rate": 0.0001, "loss": 5.6611, "loss/crossentropy": 2.462522864341736, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17141558229923248, "step": 18296 }, { "epoch": 0.5718125, "grad_norm": 2.953125, "grad_norm_var": 0.035868326822916664, "learning_rate": 0.0001, "loss": 5.5558, "loss/crossentropy": 2.4553279876708984, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16317609697580338, "step": 18298 }, { "epoch": 0.571875, "grad_norm": 3.09375, "grad_norm_var": 0.024079386393229166, "learning_rate": 0.0001, "loss": 5.6954, "loss/crossentropy": 2.598434805870056, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16399026662111282, "step": 18300 }, { "epoch": 0.5719375, "grad_norm": 3.0625, "grad_norm_var": 0.024300130208333333, "learning_rate": 0.0001, "loss": 6.0432, "loss/crossentropy": 2.7587474584579468, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.18001211434602737, "step": 18302 }, { "epoch": 0.572, "grad_norm": 3.03125, "grad_norm_var": 0.021024576822916665, "learning_rate": 0.0001, "loss": 5.8149, "loss/crossentropy": 2.6901882886886597, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1655968725681305, "step": 18304 }, { "epoch": 0.5720625, "grad_norm": 3.140625, "grad_norm_var": 0.018994140625, "learning_rate": 0.0001, "loss": 6.0247, "loss/crossentropy": 2.765766978263855, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17511239647865295, "step": 18306 }, { "epoch": 0.572125, "grad_norm": 3.296875, "grad_norm_var": 0.016792805989583333, "learning_rate": 0.0001, "loss": 5.7581, "loss/crossentropy": 2.6202938556671143, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1645662784576416, "step": 18308 }, { "epoch": 0.5721875, "grad_norm": 3.375, "grad_norm_var": 0.032942708333333334, "learning_rate": 0.0001, "loss": 5.7976, "loss/crossentropy": 2.501075863838196, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17848410457372665, "step": 18310 }, { "epoch": 0.57225, "grad_norm": 3.015625, "grad_norm_var": 0.02906494140625, "learning_rate": 0.0001, "loss": 5.4869, "loss/crossentropy": 2.4833608865737915, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1534803882241249, "step": 18312 }, { "epoch": 0.5723125, "grad_norm": 2.90625, "grad_norm_var": 0.037886555989583334, "learning_rate": 0.0001, "loss": 5.358, "loss/crossentropy": 2.4097559452056885, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1522429808974266, "step": 18314 }, { "epoch": 0.572375, "grad_norm": 3.265625, "grad_norm_var": 0.040160115559895834, "learning_rate": 0.0001, "loss": 5.926, "loss/crossentropy": 2.712649703025818, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17094045877456665, "step": 18316 }, { "epoch": 0.5724375, "grad_norm": 3.1875, "grad_norm_var": 0.06760660807291667, "learning_rate": 0.0001, "loss": 5.6137, "loss/crossentropy": 2.382579803466797, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1727178767323494, "step": 18318 }, { "epoch": 0.5725, "grad_norm": 3.078125, "grad_norm_var": 0.064208984375, "learning_rate": 0.0001, "loss": 5.9288, "loss/crossentropy": 2.7321243286132812, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16928109526634216, "step": 18320 }, { "epoch": 0.5725625, "grad_norm": 3.375, "grad_norm_var": 0.0653228759765625, "learning_rate": 0.0001, "loss": 5.541, "loss/crossentropy": 2.371983051300049, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16612089425325394, "step": 18322 }, { "epoch": 0.572625, "grad_norm": 3.296875, "grad_norm_var": 0.06648661295572916, "learning_rate": 0.0001, "loss": 6.204, "loss/crossentropy": 2.8039320707321167, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.19000670313835144, "step": 18324 }, { "epoch": 0.5726875, "grad_norm": 3.046875, "grad_norm_var": 0.055887858072916664, "learning_rate": 0.0001, "loss": 5.7852, "loss/crossentropy": 2.6040135622024536, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1704602614045143, "step": 18326 }, { "epoch": 0.57275, "grad_norm": 3.109375, "grad_norm_var": 0.053446451822916664, "learning_rate": 0.0001, "loss": 5.8453, "loss/crossentropy": 2.6023638248443604, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17624496668577194, "step": 18328 }, { "epoch": 0.5728125, "grad_norm": 3.0, "grad_norm_var": 0.03758036295572917, "learning_rate": 0.0001, "loss": 5.4056, "loss/crossentropy": 2.3769582509994507, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15520767122507095, "step": 18330 }, { "epoch": 0.572875, "grad_norm": 3.34375, "grad_norm_var": 0.038134765625, "learning_rate": 0.0001, "loss": 5.8596, "loss/crossentropy": 2.6480783224105835, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16959280520677567, "step": 18332 }, { "epoch": 0.5729375, "grad_norm": 2.875, "grad_norm_var": 0.022801717122395832, "learning_rate": 0.0001, "loss": 5.8033, "loss/crossentropy": 2.5881412029266357, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1719089150428772, "step": 18334 }, { "epoch": 0.573, "grad_norm": 3.28125, "grad_norm_var": 0.02490234375, "learning_rate": 0.0001, "loss": 5.7223, "loss/crossentropy": 2.5636401176452637, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16976793110370636, "step": 18336 }, { "epoch": 0.5730625, "grad_norm": 2.953125, "grad_norm_var": 0.03193359375, "learning_rate": 0.0001, "loss": 5.3448, "loss/crossentropy": 2.340338110923767, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1566973328590393, "step": 18338 }, { "epoch": 0.573125, "grad_norm": 3.265625, "grad_norm_var": 0.02744140625, "learning_rate": 0.0001, "loss": 5.7583, "loss/crossentropy": 2.6048583984375, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16924548894166946, "step": 18340 }, { "epoch": 0.5731875, "grad_norm": 3.03125, "grad_norm_var": 0.028620402018229168, "learning_rate": 0.0001, "loss": 5.3011, "loss/crossentropy": 2.3738588094711304, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1505402997136116, "step": 18342 }, { "epoch": 0.57325, "grad_norm": 3.015625, "grad_norm_var": 0.028629557291666666, "learning_rate": 0.0001, "loss": 5.6385, "loss/crossentropy": 2.5262138843536377, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16240377724170685, "step": 18344 }, { "epoch": 0.5733125, "grad_norm": 2.9375, "grad_norm_var": 0.0296875, "learning_rate": 0.0001, "loss": 5.7259, "loss/crossentropy": 2.621517777442932, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16629406809806824, "step": 18346 }, { "epoch": 0.573375, "grad_norm": 4.40625, "grad_norm_var": 0.13762919108072916, "learning_rate": 0.0001, "loss": 6.3429, "loss/crossentropy": 2.801121711730957, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.19714495539665222, "step": 18348 }, { "epoch": 0.5734375, "grad_norm": 3.265625, "grad_norm_var": 0.16383463541666668, "learning_rate": 0.0001, "loss": 6.0113, "loss/crossentropy": 2.6030811071395874, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18340133130550385, "step": 18350 }, { "epoch": 0.5735, "grad_norm": 3.421875, "grad_norm_var": 0.16230061848958333, "learning_rate": 0.0001, "loss": 5.8881, "loss/crossentropy": 2.6851214170455933, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17225381731987, "step": 18352 }, { "epoch": 0.5735625, "grad_norm": 3.921875, "grad_norm_var": 0.1701568603515625, "learning_rate": 0.0001, "loss": 5.8568, "loss/crossentropy": 2.567447066307068, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17659613490104675, "step": 18354 }, { "epoch": 0.573625, "grad_norm": 2.953125, "grad_norm_var": 0.18293863932291668, "learning_rate": 0.0001, "loss": 5.1646, "loss/crossentropy": 2.254288613796234, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1457211896777153, "step": 18356 }, { "epoch": 0.5736875, "grad_norm": 3.21875, "grad_norm_var": 0.17039286295572917, "learning_rate": 0.0001, "loss": 5.6014, "loss/crossentropy": 2.5073583126068115, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16213755309581757, "step": 18358 }, { "epoch": 0.57375, "grad_norm": 3.078125, "grad_norm_var": 0.16236572265625, "learning_rate": 0.0001, "loss": 5.8935, "loss/crossentropy": 2.6835970878601074, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17294839024543762, "step": 18360 }, { "epoch": 0.5738125, "grad_norm": 3.375, "grad_norm_var": 0.15689697265625, "learning_rate": 0.0001, "loss": 5.2427, "loss/crossentropy": 2.2930054664611816, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15044043213129044, "step": 18362 }, { "epoch": 0.573875, "grad_norm": 3.328125, "grad_norm_var": 0.08193359375, "learning_rate": 0.0001, "loss": 5.9014, "loss/crossentropy": 2.6530340909957886, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17171156406402588, "step": 18364 }, { "epoch": 0.5739375, "grad_norm": 3.03125, "grad_norm_var": 0.06304931640625, "learning_rate": 0.0001, "loss": 5.107, "loss/crossentropy": 2.146529793739319, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1523013487458229, "step": 18366 }, { "epoch": 0.574, "grad_norm": 3.40625, "grad_norm_var": 0.06550191243489584, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.613413095474243, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16914102435112, "step": 18368 }, { "epoch": 0.5740625, "grad_norm": 3.15625, "grad_norm_var": 0.0372222900390625, "learning_rate": 0.0001, "loss": 5.4298, "loss/crossentropy": 2.3642795085906982, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15850616991519928, "step": 18370 }, { "epoch": 0.574125, "grad_norm": 3.046875, "grad_norm_var": 0.029182942708333333, "learning_rate": 0.0001, "loss": 5.69, "loss/crossentropy": 2.529383659362793, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16645421832799911, "step": 18372 }, { "epoch": 0.5741875, "grad_norm": 3.140625, "grad_norm_var": 0.027391560872395835, "learning_rate": 0.0001, "loss": 5.8107, "loss/crossentropy": 2.658630132675171, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16794230043888092, "step": 18374 }, { "epoch": 0.57425, "grad_norm": 3.03125, "grad_norm_var": 0.024266560872395832, "learning_rate": 0.0001, "loss": 5.745, "loss/crossentropy": 2.672680377960205, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16192089766263962, "step": 18376 }, { "epoch": 0.5743125, "grad_norm": 3.09375, "grad_norm_var": 0.0211090087890625, "learning_rate": 0.0001, "loss": 5.4341, "loss/crossentropy": 2.433584213256836, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15864913165569305, "step": 18378 }, { "epoch": 0.574375, "grad_norm": 3.140625, "grad_norm_var": 0.018342081705729166, "learning_rate": 0.0001, "loss": 5.5019, "loss/crossentropy": 2.3593112230300903, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16269293427467346, "step": 18380 }, { "epoch": 0.5744375, "grad_norm": 2.96875, "grad_norm_var": 0.018097941080729166, "learning_rate": 0.0001, "loss": 5.9276, "loss/crossentropy": 2.6684865951538086, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17708152532577515, "step": 18382 }, { "epoch": 0.5745, "grad_norm": 3.25, "grad_norm_var": 0.01402587890625, "learning_rate": 0.0001, "loss": 5.6939, "loss/crossentropy": 2.51468563079834, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16986984014511108, "step": 18384 }, { "epoch": 0.5745625, "grad_norm": 3.296875, "grad_norm_var": 0.013863118489583333, "learning_rate": 0.0001, "loss": 5.8839, "loss/crossentropy": 2.6714022159576416, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17280860990285873, "step": 18386 }, { "epoch": 0.574625, "grad_norm": 2.96875, "grad_norm_var": 0.015230305989583333, "learning_rate": 0.0001, "loss": 5.5828, "loss/crossentropy": 2.4910420179367065, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1638609543442726, "step": 18388 }, { "epoch": 0.5746875, "grad_norm": 2.984375, "grad_norm_var": 0.029100545247395835, "learning_rate": 0.0001, "loss": 5.7003, "loss/crossentropy": 2.552237868309021, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1675410270690918, "step": 18390 }, { "epoch": 0.57475, "grad_norm": 2.9375, "grad_norm_var": 0.03125, "learning_rate": 0.0001, "loss": 5.8232, "loss/crossentropy": 2.698289394378662, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16600815951824188, "step": 18392 }, { "epoch": 0.5748125, "grad_norm": 3.140625, "grad_norm_var": 0.03033447265625, "learning_rate": 0.0001, "loss": 5.7958, "loss/crossentropy": 2.6374276876449585, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17091414332389832, "step": 18394 }, { "epoch": 0.574875, "grad_norm": 3.359375, "grad_norm_var": 0.033036295572916666, "learning_rate": 0.0001, "loss": 5.9827, "loss/crossentropy": 2.7636018991470337, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17386632412672043, "step": 18396 }, { "epoch": 0.5749375, "grad_norm": 3.9375, "grad_norm_var": 0.061930338541666664, "learning_rate": 0.0001, "loss": 5.4679, "loss/crossentropy": 2.3220553398132324, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16341203451156616, "step": 18398 }, { "epoch": 0.575, "grad_norm": 3.078125, "grad_norm_var": 0.06453348795572916, "learning_rate": 0.0001, "loss": 5.5277, "loss/crossentropy": 2.4370415210723877, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1629718393087387, "step": 18400 }, { "epoch": 0.5750625, "grad_norm": 3.25, "grad_norm_var": 0.06345926920572917, "learning_rate": 0.0001, "loss": 5.7709, "loss/crossentropy": 2.5731245279312134, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16899394243955612, "step": 18402 }, { "epoch": 0.575125, "grad_norm": 3.53125, "grad_norm_var": 0.0649078369140625, "learning_rate": 0.0001, "loss": 5.5094, "loss/crossentropy": 2.4272419214248657, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15977945178747177, "step": 18404 }, { "epoch": 0.5751875, "grad_norm": 3.09375, "grad_norm_var": 0.05481363932291667, "learning_rate": 0.0001, "loss": 5.8179, "loss/crossentropy": 2.6184180974960327, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17189937829971313, "step": 18406 }, { "epoch": 0.57525, "grad_norm": 3.296875, "grad_norm_var": 0.05034077962239583, "learning_rate": 0.0001, "loss": 5.7, "loss/crossentropy": 2.5761481523513794, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16317031532526016, "step": 18408 }, { "epoch": 0.5753125, "grad_norm": 3.234375, "grad_norm_var": 0.0506744384765625, "learning_rate": 0.0001, "loss": 5.7413, "loss/crossentropy": 2.5246126651763916, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17089063674211502, "step": 18410 }, { "epoch": 0.575375, "grad_norm": 3.140625, "grad_norm_var": 0.04970703125, "learning_rate": 0.0001, "loss": 5.8667, "loss/crossentropy": 2.5869481563568115, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1764107272028923, "step": 18412 }, { "epoch": 0.5754375, "grad_norm": 3.109375, "grad_norm_var": 0.19708658854166666, "learning_rate": 0.0001, "loss": 5.6101, "loss/crossentropy": 2.4800440073013306, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16651704907417297, "step": 18414 }, { "epoch": 0.5755, "grad_norm": 3.0625, "grad_norm_var": 0.19455973307291666, "learning_rate": 0.0001, "loss": 5.7655, "loss/crossentropy": 2.6151453256607056, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1654304414987564, "step": 18416 }, { "epoch": 0.5755625, "grad_norm": 2.890625, "grad_norm_var": 0.20732014973958332, "learning_rate": 0.0001, "loss": 5.6632, "loss/crossentropy": 2.492995023727417, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1666259467601776, "step": 18418 }, { "epoch": 0.575625, "grad_norm": 3.609375, "grad_norm_var": 0.2154296875, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.5649964809417725, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16826294362545013, "step": 18420 }, { "epoch": 0.5756875, "grad_norm": 3.109375, "grad_norm_var": 0.21497395833333333, "learning_rate": 0.0001, "loss": 5.5362, "loss/crossentropy": 2.4699827432632446, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16209031641483307, "step": 18422 }, { "epoch": 0.57575, "grad_norm": 3.5, "grad_norm_var": 0.21179911295572917, "learning_rate": 0.0001, "loss": 5.9305, "loss/crossentropy": 2.666316509246826, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.176804319024086, "step": 18424 }, { "epoch": 0.5758125, "grad_norm": 3.203125, "grad_norm_var": 0.2167633056640625, "learning_rate": 0.0001, "loss": 5.6947, "loss/crossentropy": 2.5294259786605835, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16535243391990662, "step": 18426 }, { "epoch": 0.575875, "grad_norm": 3.359375, "grad_norm_var": 0.21601460774739584, "learning_rate": 0.0001, "loss": 5.8707, "loss/crossentropy": 2.6436872482299805, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1730949506163597, "step": 18428 }, { "epoch": 0.5759375, "grad_norm": 2.96875, "grad_norm_var": 0.04572652180989583, "learning_rate": 0.0001, "loss": 5.5814, "loss/crossentropy": 2.4956637620925903, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16482633352279663, "step": 18430 }, { "epoch": 0.576, "grad_norm": 3.1875, "grad_norm_var": 0.0509918212890625, "learning_rate": 0.0001, "loss": 5.5559, "loss/crossentropy": 2.48967969417572, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.159743033349514, "step": 18432 }, { "epoch": 0.5760625, "grad_norm": 3.09375, "grad_norm_var": 0.0450347900390625, "learning_rate": 0.0001, "loss": 5.5264, "loss/crossentropy": 2.4949833154678345, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15587330609560013, "step": 18434 }, { "epoch": 0.576125, "grad_norm": 3.3125, "grad_norm_var": 0.0343658447265625, "learning_rate": 0.0001, "loss": 5.5187, "loss/crossentropy": 2.3616496324539185, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16804824024438858, "step": 18436 }, { "epoch": 0.5761875, "grad_norm": 3.171875, "grad_norm_var": 0.03341471354166667, "learning_rate": 0.0001, "loss": 5.8019, "loss/crossentropy": 2.589166522026062, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17400363087654114, "step": 18438 }, { "epoch": 0.57625, "grad_norm": 2.984375, "grad_norm_var": 0.0271881103515625, "learning_rate": 0.0001, "loss": 5.901, "loss/crossentropy": 2.666108012199402, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17310288548469543, "step": 18440 }, { "epoch": 0.5763125, "grad_norm": 3.28125, "grad_norm_var": 0.028327433268229167, "learning_rate": 0.0001, "loss": 5.9678, "loss/crossentropy": 2.8197555541992188, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16714906692504883, "step": 18442 }, { "epoch": 0.576375, "grad_norm": 2.984375, "grad_norm_var": 0.028473917643229166, "learning_rate": 0.0001, "loss": 5.3827, "loss/crossentropy": 2.303182363510132, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16264119744300842, "step": 18444 }, { "epoch": 0.5764375, "grad_norm": 3.15625, "grad_norm_var": 0.10107014973958334, "learning_rate": 0.0001, "loss": 6.1568, "loss/crossentropy": 2.79031503200531, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18430818617343903, "step": 18446 }, { "epoch": 0.5765, "grad_norm": 3.234375, "grad_norm_var": 0.132275390625, "learning_rate": 0.0001, "loss": 5.8918, "loss/crossentropy": 2.7167197465896606, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1706361472606659, "step": 18448 }, { "epoch": 0.5765625, "grad_norm": 3.46875, "grad_norm_var": 0.12077534993489583, "learning_rate": 0.0001, "loss": 5.7644, "loss/crossentropy": 2.5743154287338257, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16939499974250793, "step": 18450 }, { "epoch": 0.576625, "grad_norm": 3.1875, "grad_norm_var": 0.11385091145833333, "learning_rate": 0.0001, "loss": 6.1072, "loss/crossentropy": 2.811020851135254, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17922312766313553, "step": 18452 }, { "epoch": 0.5766875, "grad_norm": 3.984375, "grad_norm_var": 0.14557291666666666, "learning_rate": 0.0001, "loss": 5.8725, "loss/crossentropy": 2.609155774116516, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17164289951324463, "step": 18454 }, { "epoch": 0.57675, "grad_norm": 3.09375, "grad_norm_var": 0.14801432291666666, "learning_rate": 0.0001, "loss": 5.6958, "loss/crossentropy": 2.572449564933777, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16623719781637192, "step": 18456 }, { "epoch": 0.5768125, "grad_norm": 3.03125, "grad_norm_var": 0.15652669270833333, "learning_rate": 0.0001, "loss": 5.8234, "loss/crossentropy": 2.6459431648254395, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17087049037218094, "step": 18458 }, { "epoch": 0.576875, "grad_norm": 3.09375, "grad_norm_var": 0.15507405598958332, "learning_rate": 0.0001, "loss": 5.7833, "loss/crossentropy": 2.5941377878189087, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16930782049894333, "step": 18460 }, { "epoch": 0.5769375, "grad_norm": 3.703125, "grad_norm_var": 0.11103108723958334, "learning_rate": 0.0001, "loss": 5.9248, "loss/crossentropy": 2.6534959077835083, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17478252947330475, "step": 18462 }, { "epoch": 0.577, "grad_norm": 3.359375, "grad_norm_var": 0.07961832682291667, "learning_rate": 0.0001, "loss": 5.7831, "loss/crossentropy": 2.5179115533828735, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17339365184307098, "step": 18464 }, { "epoch": 0.5770625, "grad_norm": 3.296875, "grad_norm_var": 0.08166910807291666, "learning_rate": 0.0001, "loss": 5.5137, "loss/crossentropy": 2.4228307008743286, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15752895176410675, "step": 18466 }, { "epoch": 0.577125, "grad_norm": 2.9375, "grad_norm_var": 0.08742574055989584, "learning_rate": 0.0001, "loss": 5.3917, "loss/crossentropy": 2.4382351636886597, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14769209921360016, "step": 18468 }, { "epoch": 0.5771875, "grad_norm": 2.9375, "grad_norm_var": 0.04976298014322917, "learning_rate": 0.0001, "loss": 5.2428, "loss/crossentropy": 2.2560296058654785, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15570665895938873, "step": 18470 }, { "epoch": 0.57725, "grad_norm": 3.375, "grad_norm_var": 0.05047200520833333, "learning_rate": 0.0001, "loss": 5.824, "loss/crossentropy": 2.6376614570617676, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17058780789375305, "step": 18472 }, { "epoch": 0.5773125, "grad_norm": 2.96875, "grad_norm_var": 0.05113525390625, "learning_rate": 0.0001, "loss": 5.8204, "loss/crossentropy": 2.6507813930511475, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16969439387321472, "step": 18474 }, { "epoch": 0.577375, "grad_norm": 3.0, "grad_norm_var": 0.0600250244140625, "learning_rate": 0.0001, "loss": 5.7476, "loss/crossentropy": 2.5414364337921143, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17530686408281326, "step": 18476 }, { "epoch": 0.5774375, "grad_norm": 3.375, "grad_norm_var": 0.045221964518229164, "learning_rate": 0.0001, "loss": 5.8855, "loss/crossentropy": 2.636993646621704, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17758085578680038, "step": 18478 }, { "epoch": 0.5775, "grad_norm": 3.9375, "grad_norm_var": 0.07720947265625, "learning_rate": 0.0001, "loss": 6.3393, "loss/crossentropy": 2.9106377363204956, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18544892221689224, "step": 18480 }, { "epoch": 0.5775625, "grad_norm": 2.84375, "grad_norm_var": 0.08156636555989584, "learning_rate": 0.0001, "loss": 5.5022, "loss/crossentropy": 2.3790981769561768, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16348543763160706, "step": 18482 }, { "epoch": 0.577625, "grad_norm": 2.84375, "grad_norm_var": 0.08564046223958334, "learning_rate": 0.0001, "loss": 5.8974, "loss/crossentropy": 2.7403557300567627, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1680453196167946, "step": 18484 }, { "epoch": 0.5776875, "grad_norm": 3.734375, "grad_norm_var": 0.09696858723958333, "learning_rate": 0.0001, "loss": 5.8688, "loss/crossentropy": 2.5689350366592407, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1749121993780136, "step": 18486 }, { "epoch": 0.57775, "grad_norm": 3.453125, "grad_norm_var": 0.09922587076822917, "learning_rate": 0.0001, "loss": 5.7181, "loss/crossentropy": 2.4852821826934814, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1721075251698494, "step": 18488 }, { "epoch": 0.5778125, "grad_norm": 2.828125, "grad_norm_var": 0.11477457682291667, "learning_rate": 0.0001, "loss": 5.6, "loss/crossentropy": 2.5532870292663574, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15935859829187393, "step": 18490 }, { "epoch": 0.577875, "grad_norm": 2.9375, "grad_norm_var": 0.11077372233072917, "learning_rate": 0.0001, "loss": 5.7528, "loss/crossentropy": 2.609677314758301, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1662694588303566, "step": 18492 }, { "epoch": 0.5779375, "grad_norm": 3.421875, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 5.5932, "loss/crossentropy": 2.4124221801757812, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1676826924085617, "step": 18494 }, { "epoch": 0.578, "grad_norm": 3.125, "grad_norm_var": 0.07392578125, "learning_rate": 0.0001, "loss": 5.4063, "loss/crossentropy": 2.3695526123046875, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1575787216424942, "step": 18496 }, { "epoch": 0.5780625, "grad_norm": 2.984375, "grad_norm_var": 0.068359375, "learning_rate": 0.0001, "loss": 5.419, "loss/crossentropy": 2.3370147943496704, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16210711747407913, "step": 18498 }, { "epoch": 0.578125, "grad_norm": 3.046875, "grad_norm_var": 0.0612213134765625, "learning_rate": 0.0001, "loss": 5.8758, "loss/crossentropy": 2.6599076986312866, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1700289249420166, "step": 18500 }, { "epoch": 0.5781875, "grad_norm": 3.1875, "grad_norm_var": 0.0375396728515625, "learning_rate": 0.0001, "loss": 5.64, "loss/crossentropy": 2.544037103652954, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16194117069244385, "step": 18502 }, { "epoch": 0.57825, "grad_norm": 3.203125, "grad_norm_var": 0.029572550455729166, "learning_rate": 0.0001, "loss": 5.6184, "loss/crossentropy": 2.5240252017974854, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16099613159894943, "step": 18504 }, { "epoch": 0.5783125, "grad_norm": 3.265625, "grad_norm_var": 0.022330729166666667, "learning_rate": 0.0001, "loss": 5.52, "loss/crossentropy": 2.436471462249756, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16264820843935013, "step": 18506 }, { "epoch": 0.578375, "grad_norm": 3.140625, "grad_norm_var": 0.015946451822916666, "learning_rate": 0.0001, "loss": 5.7601, "loss/crossentropy": 2.595950961112976, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1718810498714447, "step": 18508 }, { "epoch": 0.5784375, "grad_norm": 3.03125, "grad_norm_var": 0.010758463541666667, "learning_rate": 0.0001, "loss": 5.4356, "loss/crossentropy": 2.3870849609375, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16032517701387405, "step": 18510 }, { "epoch": 0.5785, "grad_norm": 3.453125, "grad_norm_var": 0.017887369791666666, "learning_rate": 0.0001, "loss": 5.7266, "loss/crossentropy": 2.575134754180908, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16944736242294312, "step": 18512 }, { "epoch": 0.5785625, "grad_norm": 2.75, "grad_norm_var": 0.026423136393229168, "learning_rate": 0.0001, "loss": 5.4494, "loss/crossentropy": 2.3964990377426147, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16036802530288696, "step": 18514 }, { "epoch": 0.578625, "grad_norm": 3.171875, "grad_norm_var": 0.026611328125, "learning_rate": 0.0001, "loss": 5.8836, "loss/crossentropy": 2.674792528152466, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17361968755722046, "step": 18516 }, { "epoch": 0.5786875, "grad_norm": 3.0, "grad_norm_var": 0.025755818684895834, "learning_rate": 0.0001, "loss": 5.7156, "loss/crossentropy": 2.582158923149109, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16646727174520493, "step": 18518 }, { "epoch": 0.57875, "grad_norm": 2.953125, "grad_norm_var": 0.026903279622395835, "learning_rate": 0.0001, "loss": 5.6374, "loss/crossentropy": 2.5500084161758423, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16420669108629227, "step": 18520 }, { "epoch": 0.5788125, "grad_norm": 3.234375, "grad_norm_var": 0.025846354166666665, "learning_rate": 0.0001, "loss": 5.4583, "loss/crossentropy": 2.3921940326690674, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1538735032081604, "step": 18522 }, { "epoch": 0.578875, "grad_norm": 3.078125, "grad_norm_var": 0.025712076822916666, "learning_rate": 0.0001, "loss": 5.6155, "loss/crossentropy": 2.5623749494552612, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15687866508960724, "step": 18524 }, { "epoch": 0.5789375, "grad_norm": 3.015625, "grad_norm_var": 0.026106770833333334, "learning_rate": 0.0001, "loss": 5.2689, "loss/crossentropy": 2.3232126235961914, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15316598117351532, "step": 18526 }, { "epoch": 0.579, "grad_norm": 3.03125, "grad_norm_var": 0.0162506103515625, "learning_rate": 0.0001, "loss": 5.6008, "loss/crossentropy": 2.5733802318573, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16055817902088165, "step": 18528 }, { "epoch": 0.5790625, "grad_norm": 3.171875, "grad_norm_var": 0.00826416015625, "learning_rate": 0.0001, "loss": 5.7381, "loss/crossentropy": 2.586323618888855, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16830172389745712, "step": 18530 }, { "epoch": 0.579125, "grad_norm": 3.390625, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 5.6249, "loss/crossentropy": 2.4533214569091797, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16754812002182007, "step": 18532 }, { "epoch": 0.5791875, "grad_norm": 2.96875, "grad_norm_var": 0.0162261962890625, "learning_rate": 0.0001, "loss": 5.7429, "loss/crossentropy": 2.59567129611969, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16550730913877487, "step": 18534 }, { "epoch": 0.57925, "grad_norm": 3.09375, "grad_norm_var": 0.014860026041666667, "learning_rate": 0.0001, "loss": 5.5887, "loss/crossentropy": 2.4598299264907837, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16561666131019592, "step": 18536 }, { "epoch": 0.5793125, "grad_norm": 3.21875, "grad_norm_var": 0.018050130208333334, "learning_rate": 0.0001, "loss": 6.103, "loss/crossentropy": 2.82500159740448, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17818892747163773, "step": 18538 }, { "epoch": 0.579375, "grad_norm": 3.34375, "grad_norm_var": 0.6931477864583333, "learning_rate": 0.0001, "loss": 5.702, "loss/crossentropy": 2.4253724813461304, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.16750602424144745, "step": 18540 }, { "epoch": 0.5794375, "grad_norm": 2.90625, "grad_norm_var": 0.6881256103515625, "learning_rate": 0.0001, "loss": 5.4788, "loss/crossentropy": 2.4489500522613525, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15728431940078735, "step": 18542 }, { "epoch": 0.5795, "grad_norm": 3.0, "grad_norm_var": 0.6911855061848958, "learning_rate": 0.0001, "loss": 5.536, "loss/crossentropy": 2.4353054761886597, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1647573560476303, "step": 18544 }, { "epoch": 0.5795625, "grad_norm": 3.171875, "grad_norm_var": 0.6865468343098958, "learning_rate": 0.0001, "loss": 5.6885, "loss/crossentropy": 2.5016510486602783, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16556347906589508, "step": 18546 }, { "epoch": 0.579625, "grad_norm": 2.921875, "grad_norm_var": 0.6917958577473958, "learning_rate": 0.0001, "loss": 5.8193, "loss/crossentropy": 2.652044177055359, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1706322431564331, "step": 18548 }, { "epoch": 0.5796875, "grad_norm": 3.1875, "grad_norm_var": 0.6940500895182292, "learning_rate": 0.0001, "loss": 5.5408, "loss/crossentropy": 2.447631359100342, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1585312932729721, "step": 18550 }, { "epoch": 0.57975, "grad_norm": 3.4375, "grad_norm_var": 0.6898427327473958, "learning_rate": 0.0001, "loss": 5.7557, "loss/crossentropy": 2.596085786819458, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.169084370136261, "step": 18552 }, { "epoch": 0.5798125, "grad_norm": 2.90625, "grad_norm_var": 0.7029774983723959, "learning_rate": 0.0001, "loss": 5.7252, "loss/crossentropy": 2.5617101192474365, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16557105630636215, "step": 18554 }, { "epoch": 0.579875, "grad_norm": 3.0625, "grad_norm_var": 0.03741861979166667, "learning_rate": 0.0001, "loss": 5.8683, "loss/crossentropy": 2.6295604705810547, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17348121106624603, "step": 18556 }, { "epoch": 0.5799375, "grad_norm": 3.140625, "grad_norm_var": 0.03425191243489583, "learning_rate": 0.0001, "loss": 5.9799, "loss/crossentropy": 2.7225042581558228, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1761271357536316, "step": 18558 }, { "epoch": 0.58, "grad_norm": 3.375, "grad_norm_var": 0.038304646809895836, "learning_rate": 0.0001, "loss": 5.7409, "loss/crossentropy": 2.5768563747406006, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17070002853870392, "step": 18560 }, { "epoch": 0.5800625, "grad_norm": 3.203125, "grad_norm_var": 0.035741170247395836, "learning_rate": 0.0001, "loss": 5.5646, "loss/crossentropy": 2.429896831512451, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16698598861694336, "step": 18562 }, { "epoch": 0.580125, "grad_norm": 3.515625, "grad_norm_var": 0.07639567057291667, "learning_rate": 0.0001, "loss": 5.6645, "loss/crossentropy": 2.482678174972534, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1701330468058586, "step": 18564 }, { "epoch": 0.5801875, "grad_norm": 3.265625, "grad_norm_var": 0.07169596354166667, "learning_rate": 0.0001, "loss": 5.4908, "loss/crossentropy": 2.3414900302886963, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.15985772758722305, "step": 18566 }, { "epoch": 0.58025, "grad_norm": 3.625, "grad_norm_var": 0.07626546223958333, "learning_rate": 0.0001, "loss": 5.7235, "loss/crossentropy": 2.464827537536621, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17274148762226105, "step": 18568 }, { "epoch": 0.5803125, "grad_norm": 3.359375, "grad_norm_var": 0.06392313639322916, "learning_rate": 0.0001, "loss": 5.628, "loss/crossentropy": 2.4404972791671753, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16640209406614304, "step": 18570 }, { "epoch": 0.580375, "grad_norm": 3.515625, "grad_norm_var": 0.0599609375, "learning_rate": 0.0001, "loss": 6.1257, "loss/crossentropy": 2.8498064279556274, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1760292649269104, "step": 18572 }, { "epoch": 0.5804375, "grad_norm": 3.046875, "grad_norm_var": 0.0705078125, "learning_rate": 0.0001, "loss": 5.6626, "loss/crossentropy": 2.5267767906188965, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16709468513727188, "step": 18574 }, { "epoch": 0.5805, "grad_norm": 3.4375, "grad_norm_var": 0.0618072509765625, "learning_rate": 0.0001, "loss": 5.7438, "loss/crossentropy": 2.5630754232406616, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16533689945936203, "step": 18576 }, { "epoch": 0.5805625, "grad_norm": 3.21875, "grad_norm_var": 0.06129150390625, "learning_rate": 0.0001, "loss": 5.6433, "loss/crossentropy": 2.4586691856384277, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16573145985603333, "step": 18578 }, { "epoch": 0.580625, "grad_norm": 2.96875, "grad_norm_var": 0.0387115478515625, "learning_rate": 0.0001, "loss": 5.5069, "loss/crossentropy": 2.389492630958557, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16291730105876923, "step": 18580 }, { "epoch": 0.5806875, "grad_norm": 3.15625, "grad_norm_var": 0.03619791666666667, "learning_rate": 0.0001, "loss": 5.7239, "loss/crossentropy": 2.5101183652877808, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1705961897969246, "step": 18582 }, { "epoch": 0.58075, "grad_norm": 3.296875, "grad_norm_var": 0.02613525390625, "learning_rate": 0.0001, "loss": 5.8409, "loss/crossentropy": 2.562360644340515, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1778540313243866, "step": 18584 }, { "epoch": 0.5808125, "grad_norm": 3.078125, "grad_norm_var": 0.028173828125, "learning_rate": 0.0001, "loss": 5.6442, "loss/crossentropy": 2.4837807416915894, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17033448815345764, "step": 18586 }, { "epoch": 0.580875, "grad_norm": 3.328125, "grad_norm_var": 0.0241119384765625, "learning_rate": 0.0001, "loss": 5.9073, "loss/crossentropy": 2.6571329832077026, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17852745950222015, "step": 18588 }, { "epoch": 0.5809375, "grad_norm": 3.359375, "grad_norm_var": 0.020295206705729166, "learning_rate": 0.0001, "loss": 6.0707, "loss/crossentropy": 2.778269648551941, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1796359047293663, "step": 18590 }, { "epoch": 0.581, "grad_norm": 3.0, "grad_norm_var": 0.023656209309895832, "learning_rate": 0.0001, "loss": 5.5964, "loss/crossentropy": 2.5302563905715942, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15973998606204987, "step": 18592 }, { "epoch": 0.5810625, "grad_norm": 3.15625, "grad_norm_var": 0.023465983072916665, "learning_rate": 0.0001, "loss": 5.8044, "loss/crossentropy": 2.656304359436035, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1667615845799446, "step": 18594 }, { "epoch": 0.581125, "grad_norm": 3.78125, "grad_norm_var": 0.036253865559895834, "learning_rate": 0.0001, "loss": 5.9787, "loss/crossentropy": 2.6889857053756714, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.177021324634552, "step": 18596 }, { "epoch": 0.5811875, "grad_norm": 3.046875, "grad_norm_var": 0.039567057291666666, "learning_rate": 0.0001, "loss": 5.8973, "loss/crossentropy": 2.671121835708618, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17379028350114822, "step": 18598 }, { "epoch": 0.58125, "grad_norm": 3.03125, "grad_norm_var": 0.0404205322265625, "learning_rate": 0.0001, "loss": 5.5099, "loss/crossentropy": 2.3974192142486572, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16241814941167831, "step": 18600 }, { "epoch": 0.5813125, "grad_norm": 3.1875, "grad_norm_var": 0.03863525390625, "learning_rate": 0.0001, "loss": 5.924, "loss/crossentropy": 2.6862300634384155, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1757260039448738, "step": 18602 }, { "epoch": 0.581375, "grad_norm": 3.046875, "grad_norm_var": 0.04350484212239583, "learning_rate": 0.0001, "loss": 5.5963, "loss/crossentropy": 2.5258930921554565, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1582084447145462, "step": 18604 }, { "epoch": 0.5814375, "grad_norm": 4.28125, "grad_norm_var": 0.12067057291666666, "learning_rate": 0.0001, "loss": 5.7236, "loss/crossentropy": 2.469976305961609, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17614026367664337, "step": 18606 }, { "epoch": 0.5815, "grad_norm": 3.515625, "grad_norm_var": 0.11647135416666667, "learning_rate": 0.0001, "loss": 5.7335, "loss/crossentropy": 2.500939130783081, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1724713146686554, "step": 18608 }, { "epoch": 0.5815625, "grad_norm": 3.015625, "grad_norm_var": 0.12549540201822917, "learning_rate": 0.0001, "loss": 5.4021, "loss/crossentropy": 2.464106321334839, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15122338384389877, "step": 18610 }, { "epoch": 0.581625, "grad_norm": 4.09375, "grad_norm_var": 0.1540191650390625, "learning_rate": 0.0001, "loss": 5.4409, "loss/crossentropy": 2.3574678897857666, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1634194403886795, "step": 18612 }, { "epoch": 0.5816875, "grad_norm": 3.125, "grad_norm_var": 0.15129801432291667, "learning_rate": 0.0001, "loss": 5.9564, "loss/crossentropy": 2.771583914756775, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17239146679639816, "step": 18614 }, { "epoch": 0.58175, "grad_norm": 3.09375, "grad_norm_var": 0.14811197916666666, "learning_rate": 0.0001, "loss": 5.5109, "loss/crossentropy": 2.3945586681365967, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1647638976573944, "step": 18616 }, { "epoch": 0.5818125, "grad_norm": 3.0625, "grad_norm_var": 0.15637919108072917, "learning_rate": 0.0001, "loss": 5.7852, "loss/crossentropy": 2.6670639514923096, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1668880581855774, "step": 18618 }, { "epoch": 0.581875, "grad_norm": 2.984375, "grad_norm_var": 0.14918619791666668, "learning_rate": 0.0001, "loss": 5.7533, "loss/crossentropy": 2.5921837091445923, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16728005558252335, "step": 18620 }, { "epoch": 0.5819375, "grad_norm": 3.203125, "grad_norm_var": 0.0853515625, "learning_rate": 0.0001, "loss": 5.5894, "loss/crossentropy": 2.445537567138672, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16594630479812622, "step": 18622 }, { "epoch": 0.582, "grad_norm": 3.09375, "grad_norm_var": 0.08701883951822917, "learning_rate": 0.0001, "loss": 5.8582, "loss/crossentropy": 2.6308059692382812, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17000506818294525, "step": 18624 }, { "epoch": 0.5820625, "grad_norm": 3.296875, "grad_norm_var": 0.07776285807291666, "learning_rate": 0.0001, "loss": 6.0982, "loss/crossentropy": 2.7778422832489014, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17969654500484467, "step": 18626 }, { "epoch": 0.582125, "grad_norm": 3.109375, "grad_norm_var": 0.03763020833333333, "learning_rate": 0.0001, "loss": 5.5696, "loss/crossentropy": 2.4944108724594116, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16376770287752151, "step": 18628 }, { "epoch": 0.5821875, "grad_norm": 3.09375, "grad_norm_var": 0.0425933837890625, "learning_rate": 0.0001, "loss": 5.4778, "loss/crossentropy": 2.459985852241516, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15568386018276215, "step": 18630 }, { "epoch": 0.58225, "grad_norm": 3.390625, "grad_norm_var": 0.0460601806640625, "learning_rate": 0.0001, "loss": 5.9489, "loss/crossentropy": 2.643662929534912, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1785680204629898, "step": 18632 }, { "epoch": 0.5823125, "grad_norm": 3.171875, "grad_norm_var": 0.04049072265625, "learning_rate": 0.0001, "loss": 5.8371, "loss/crossentropy": 2.6857060194015503, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1670950949192047, "step": 18634 }, { "epoch": 0.582375, "grad_norm": 3.140625, "grad_norm_var": 0.03424072265625, "learning_rate": 0.0001, "loss": 5.7002, "loss/crossentropy": 2.5760804414749146, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16593117266893387, "step": 18636 }, { "epoch": 0.5824375, "grad_norm": 3.375, "grad_norm_var": 0.03526102701822917, "learning_rate": 0.0001, "loss": 5.7943, "loss/crossentropy": 2.575034499168396, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1715373918414116, "step": 18638 }, { "epoch": 0.5825, "grad_norm": 3.09375, "grad_norm_var": 0.03228759765625, "learning_rate": 0.0001, "loss": 5.4874, "loss/crossentropy": 2.4387048482894897, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16033842414617538, "step": 18640 }, { "epoch": 0.5825625, "grad_norm": 3.078125, "grad_norm_var": 0.027684529622395832, "learning_rate": 0.0001, "loss": 5.6826, "loss/crossentropy": 2.5084009170532227, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17210392653942108, "step": 18642 }, { "epoch": 0.582625, "grad_norm": 3.0, "grad_norm_var": 0.025861612955729165, "learning_rate": 0.0001, "loss": 5.8363, "loss/crossentropy": 2.6931592226028442, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1670495942234993, "step": 18644 }, { "epoch": 0.5826875, "grad_norm": 3.1875, "grad_norm_var": 0.027469889322916666, "learning_rate": 0.0001, "loss": 5.3942, "loss/crossentropy": 2.3630157709121704, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15467745065689087, "step": 18646 }, { "epoch": 0.58275, "grad_norm": 3.5625, "grad_norm_var": 0.033219401041666666, "learning_rate": 0.0001, "loss": 5.701, "loss/crossentropy": 2.471379041671753, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17257267236709595, "step": 18648 }, { "epoch": 0.5828125, "grad_norm": 2.890625, "grad_norm_var": 0.03746744791666667, "learning_rate": 0.0001, "loss": 5.7769, "loss/crossentropy": 2.5884225368499756, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16767475754022598, "step": 18650 }, { "epoch": 0.582875, "grad_norm": 3.078125, "grad_norm_var": 0.037581380208333334, "learning_rate": 0.0001, "loss": 5.5594, "loss/crossentropy": 2.437865972518921, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16254276037216187, "step": 18652 }, { "epoch": 0.5829375, "grad_norm": 2.96875, "grad_norm_var": 0.026656087239583334, "learning_rate": 0.0001, "loss": 5.096, "loss/crossentropy": 2.1688880920410156, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14896288514137268, "step": 18654 }, { "epoch": 0.583, "grad_norm": 3.21875, "grad_norm_var": 0.028597005208333335, "learning_rate": 0.0001, "loss": 5.6605, "loss/crossentropy": 2.4919804334640503, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17075835168361664, "step": 18656 }, { "epoch": 0.5830625, "grad_norm": 3.03125, "grad_norm_var": 0.028609212239583334, "learning_rate": 0.0001, "loss": 5.4257, "loss/crossentropy": 2.4058111906051636, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15471865236759186, "step": 18658 }, { "epoch": 0.583125, "grad_norm": 3.375, "grad_norm_var": 0.28709208170572914, "learning_rate": 0.0001, "loss": 6.2958, "loss/crossentropy": 2.9767050743103027, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1780048906803131, "step": 18660 }, { "epoch": 0.5831875, "grad_norm": 3.0625, "grad_norm_var": 0.2811431884765625, "learning_rate": 0.0001, "loss": 5.6223, "loss/crossentropy": 2.5128010511398315, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1644660010933876, "step": 18662 }, { "epoch": 0.58325, "grad_norm": 3.21875, "grad_norm_var": 0.2728800455729167, "learning_rate": 0.0001, "loss": 5.7958, "loss/crossentropy": 2.575974225997925, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1731526404619217, "step": 18664 }, { "epoch": 0.5833125, "grad_norm": 3.296875, "grad_norm_var": 0.2719065348307292, "learning_rate": 0.0001, "loss": 5.8324, "loss/crossentropy": 2.730650544166565, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16603553295135498, "step": 18666 }, { "epoch": 0.583375, "grad_norm": 3.359375, "grad_norm_var": 0.2708821614583333, "learning_rate": 0.0001, "loss": 5.8089, "loss/crossentropy": 2.5872223377227783, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17373275756835938, "step": 18668 }, { "epoch": 0.5834375, "grad_norm": 3.515625, "grad_norm_var": 0.2674794514973958, "learning_rate": 0.0001, "loss": 5.9221, "loss/crossentropy": 2.6320900917053223, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17900218069553375, "step": 18670 }, { "epoch": 0.5835, "grad_norm": 2.84375, "grad_norm_var": 0.2742513020833333, "learning_rate": 0.0001, "loss": 5.5412, "loss/crossentropy": 2.4675287008285522, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16361329704523087, "step": 18672 }, { "epoch": 0.5835625, "grad_norm": 3.375, "grad_norm_var": 0.32249247233072914, "learning_rate": 0.0001, "loss": 5.8521, "loss/crossentropy": 2.5169920921325684, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18155356496572495, "step": 18674 }, { "epoch": 0.583625, "grad_norm": 3.375, "grad_norm_var": 0.10301005045572917, "learning_rate": 0.0001, "loss": 6.0169, "loss/crossentropy": 2.6857441663742065, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1823333203792572, "step": 18676 }, { "epoch": 0.5836875, "grad_norm": 2.96875, "grad_norm_var": 0.11301676432291667, "learning_rate": 0.0001, "loss": 5.7099, "loss/crossentropy": 2.6076717376708984, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16061626374721527, "step": 18678 }, { "epoch": 0.58375, "grad_norm": 3.484375, "grad_norm_var": 0.11795247395833333, "learning_rate": 0.0001, "loss": 5.7529, "loss/crossentropy": 2.589252471923828, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.168318510055542, "step": 18680 }, { "epoch": 0.5838125, "grad_norm": 3.5625, "grad_norm_var": 0.11670633951822916, "learning_rate": 0.0001, "loss": 5.7674, "loss/crossentropy": 2.5550804138183594, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17318449914455414, "step": 18682 }, { "epoch": 0.583875, "grad_norm": 2.9375, "grad_norm_var": 0.12859700520833334, "learning_rate": 0.0001, "loss": 5.8332, "loss/crossentropy": 2.606408715248108, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17150401324033737, "step": 18684 }, { "epoch": 0.5839375, "grad_norm": 3.078125, "grad_norm_var": 0.12725321451822916, "learning_rate": 0.0001, "loss": 5.8097, "loss/crossentropy": 2.6292680501937866, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1703866571187973, "step": 18686 }, { "epoch": 0.584, "grad_norm": 3.09375, "grad_norm_var": 0.12434488932291667, "learning_rate": 0.0001, "loss": 5.6515, "loss/crossentropy": 2.5895968675613403, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1632196456193924, "step": 18688 }, { "epoch": 0.5840625, "grad_norm": 3.53125, "grad_norm_var": 0.06166890462239583, "learning_rate": 0.0001, "loss": 5.8398, "loss/crossentropy": 2.5626882314682007, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17810668796300888, "step": 18690 }, { "epoch": 0.584125, "grad_norm": 3.21875, "grad_norm_var": 0.06065165201822917, "learning_rate": 0.0001, "loss": 5.407, "loss/crossentropy": 2.3457969427108765, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15729539096355438, "step": 18692 }, { "epoch": 0.5841875, "grad_norm": 3.140625, "grad_norm_var": 0.049560546875, "learning_rate": 0.0001, "loss": 5.9334, "loss/crossentropy": 2.7616426944732666, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17069139331579208, "step": 18694 }, { "epoch": 0.58425, "grad_norm": 3.5625, "grad_norm_var": 0.051813761393229164, "learning_rate": 0.0001, "loss": 5.8901, "loss/crossentropy": 2.542791485786438, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18121101707220078, "step": 18696 }, { "epoch": 0.5843125, "grad_norm": 3.125, "grad_norm_var": 0.045182291666666666, "learning_rate": 0.0001, "loss": 5.6316, "loss/crossentropy": 2.476907730102539, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16781611740589142, "step": 18698 }, { "epoch": 0.584375, "grad_norm": 2.9375, "grad_norm_var": 0.0344146728515625, "learning_rate": 0.0001, "loss": 5.6306, "loss/crossentropy": 2.5584638118743896, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16268333792686462, "step": 18700 }, { "epoch": 0.5844375, "grad_norm": 2.90625, "grad_norm_var": 0.03758036295572917, "learning_rate": 0.0001, "loss": 5.6783, "loss/crossentropy": 2.6262636184692383, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15989147126674652, "step": 18702 }, { "epoch": 0.5845, "grad_norm": 4.40625, "grad_norm_var": 0.12493082682291666, "learning_rate": 0.0001, "loss": 5.6156, "loss/crossentropy": 2.4813411235809326, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16694270819425583, "step": 18704 }, { "epoch": 0.5845625, "grad_norm": 3.3125, "grad_norm_var": 0.1207427978515625, "learning_rate": 0.0001, "loss": 5.694, "loss/crossentropy": 2.4842689037323, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17019431293010712, "step": 18706 }, { "epoch": 0.584625, "grad_norm": 3.90625, "grad_norm_var": 0.14440104166666667, "learning_rate": 0.0001, "loss": 5.9157, "loss/crossentropy": 2.5731600522994995, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18308069556951523, "step": 18708 }, { "epoch": 0.5846875, "grad_norm": 3.140625, "grad_norm_var": 0.15906473795572917, "learning_rate": 0.0001, "loss": 5.6158, "loss/crossentropy": 2.5464928150177, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16278600692749023, "step": 18710 }, { "epoch": 0.58475, "grad_norm": 3.046875, "grad_norm_var": 0.15641276041666666, "learning_rate": 0.0001, "loss": 5.8485, "loss/crossentropy": 2.682703733444214, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16658055782318115, "step": 18712 }, { "epoch": 0.5848125, "grad_norm": 2.890625, "grad_norm_var": 0.16401265462239584, "learning_rate": 0.0001, "loss": 5.7586, "loss/crossentropy": 2.5853028297424316, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1696712076663971, "step": 18714 }, { "epoch": 0.584875, "grad_norm": 3.390625, "grad_norm_var": 0.16638997395833333, "learning_rate": 0.0001, "loss": 5.644, "loss/crossentropy": 2.5225616693496704, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16488178819417953, "step": 18716 }, { "epoch": 0.5849375, "grad_norm": 3.4375, "grad_norm_var": 0.16483968098958332, "learning_rate": 0.0001, "loss": 5.3215, "loss/crossentropy": 2.2986279726028442, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15385031700134277, "step": 18718 }, { "epoch": 0.585, "grad_norm": 3.296875, "grad_norm_var": 0.0817535400390625, "learning_rate": 0.0001, "loss": 5.8051, "loss/crossentropy": 2.5350691080093384, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17699886113405228, "step": 18720 }, { "epoch": 0.5850625, "grad_norm": 3.671875, "grad_norm_var": 0.09387919108072916, "learning_rate": 0.0001, "loss": 5.9967, "loss/crossentropy": 2.71618390083313, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1780504211783409, "step": 18722 }, { "epoch": 0.585125, "grad_norm": 2.9375, "grad_norm_var": 0.06747945149739583, "learning_rate": 0.0001, "loss": 5.6457, "loss/crossentropy": 2.52972674369812, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16354888677597046, "step": 18724 }, { "epoch": 0.5851875, "grad_norm": 2.8125, "grad_norm_var": 0.06922098795572916, "learning_rate": 0.0001, "loss": 5.4993, "loss/crossentropy": 2.4842694997787476, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1600925326347351, "step": 18726 }, { "epoch": 0.58525, "grad_norm": 3.140625, "grad_norm_var": 0.067333984375, "learning_rate": 0.0001, "loss": 5.4522, "loss/crossentropy": 2.377071976661682, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16024264693260193, "step": 18728 }, { "epoch": 0.5853125, "grad_norm": 3.109375, "grad_norm_var": 0.06289774576822917, "learning_rate": 0.0001, "loss": 5.5121, "loss/crossentropy": 2.4099632501602173, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1574772521853447, "step": 18730 }, { "epoch": 0.585375, "grad_norm": 3.546875, "grad_norm_var": 0.064501953125, "learning_rate": 0.0001, "loss": 5.7326, "loss/crossentropy": 2.510116219520569, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16756585240364075, "step": 18732 }, { "epoch": 0.5854375, "grad_norm": 3.125, "grad_norm_var": 0.060074869791666666, "learning_rate": 0.0001, "loss": 5.6425, "loss/crossentropy": 2.541126847267151, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16482357680797577, "step": 18734 }, { "epoch": 0.5855, "grad_norm": 3.03125, "grad_norm_var": 0.05244038899739583, "learning_rate": 0.0001, "loss": 5.8296, "loss/crossentropy": 2.7355847358703613, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16252835094928741, "step": 18736 }, { "epoch": 0.5855625, "grad_norm": 2.828125, "grad_norm_var": 0.03176167805989583, "learning_rate": 0.0001, "loss": 5.3245, "loss/crossentropy": 2.2850844860076904, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15589946508407593, "step": 18738 }, { "epoch": 0.585625, "grad_norm": 3.25, "grad_norm_var": 0.03371988932291667, "learning_rate": 0.0001, "loss": 5.8982, "loss/crossentropy": 2.6071465015411377, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1783270686864853, "step": 18740 }, { "epoch": 0.5856875, "grad_norm": 3.109375, "grad_norm_var": 0.027327473958333334, "learning_rate": 0.0001, "loss": 5.7234, "loss/crossentropy": 2.653437852859497, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.161686509847641, "step": 18742 }, { "epoch": 0.58575, "grad_norm": 3.28125, "grad_norm_var": 0.0290679931640625, "learning_rate": 0.0001, "loss": 5.8541, "loss/crossentropy": 2.637198805809021, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17168515175580978, "step": 18744 }, { "epoch": 0.5858125, "grad_norm": 3.28125, "grad_norm_var": 0.032124837239583336, "learning_rate": 0.0001, "loss": 6.0422, "loss/crossentropy": 2.6824125051498413, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1836380511522293, "step": 18746 }, { "epoch": 0.585875, "grad_norm": 3.109375, "grad_norm_var": 0.022196451822916668, "learning_rate": 0.0001, "loss": 5.7846, "loss/crossentropy": 2.6274431943893433, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1653219535946846, "step": 18748 }, { "epoch": 0.5859375, "grad_norm": 3.21875, "grad_norm_var": 0.02320556640625, "learning_rate": 0.0001, "loss": 5.4851, "loss/crossentropy": 2.4267383813858032, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15896280109882355, "step": 18750 }, { "epoch": 0.586, "grad_norm": 3.09375, "grad_norm_var": 0.020243326822916668, "learning_rate": 0.0001, "loss": 5.5005, "loss/crossentropy": 2.467852473258972, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15912079811096191, "step": 18752 }, { "epoch": 0.5860625, "grad_norm": 3.03125, "grad_norm_var": 0.014574178059895833, "learning_rate": 0.0001, "loss": 5.7226, "loss/crossentropy": 2.5836005210876465, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16702188551425934, "step": 18754 }, { "epoch": 0.586125, "grad_norm": 2.8125, "grad_norm_var": 0.020536295572916665, "learning_rate": 0.0001, "loss": 5.4636, "loss/crossentropy": 2.427434802055359, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15986891835927963, "step": 18756 }, { "epoch": 0.5861875, "grad_norm": 3.0, "grad_norm_var": 0.026005045572916666, "learning_rate": 0.0001, "loss": 6.0944, "loss/crossentropy": 2.8903247117996216, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17235716432332993, "step": 18758 }, { "epoch": 0.58625, "grad_norm": 3.25, "grad_norm_var": 0.030540974934895833, "learning_rate": 0.0001, "loss": 5.6706, "loss/crossentropy": 2.4873905181884766, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16714636236429214, "step": 18760 }, { "epoch": 0.5863125, "grad_norm": 3.484375, "grad_norm_var": 0.0416015625, "learning_rate": 0.0001, "loss": 5.8718, "loss/crossentropy": 2.551354169845581, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.174618661403656, "step": 18762 }, { "epoch": 0.586375, "grad_norm": 3.0, "grad_norm_var": 0.04360249837239583, "learning_rate": 0.0001, "loss": 5.7095, "loss/crossentropy": 2.521346688270569, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17506151646375656, "step": 18764 }, { "epoch": 0.5864375, "grad_norm": 3.5625, "grad_norm_var": 0.06917317708333333, "learning_rate": 0.0001, "loss": 5.7361, "loss/crossentropy": 2.512497305870056, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17274749279022217, "step": 18766 }, { "epoch": 0.5865, "grad_norm": 3.59375, "grad_norm_var": 0.06972249348958333, "learning_rate": 0.0001, "loss": 5.7384, "loss/crossentropy": 2.605650782585144, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16640214622020721, "step": 18768 }, { "epoch": 0.5865625, "grad_norm": 3.125, "grad_norm_var": 0.07118733723958333, "learning_rate": 0.0001, "loss": 5.721, "loss/crossentropy": 2.5801135301589966, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1679932251572609, "step": 18770 }, { "epoch": 0.586625, "grad_norm": 3.09375, "grad_norm_var": 0.06012369791666667, "learning_rate": 0.0001, "loss": 5.7245, "loss/crossentropy": 2.556220769882202, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17034516483545303, "step": 18772 }, { "epoch": 0.5866875, "grad_norm": 3.40625, "grad_norm_var": 0.0532867431640625, "learning_rate": 0.0001, "loss": 5.8482, "loss/crossentropy": 2.669142723083496, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17142222821712494, "step": 18774 }, { "epoch": 0.58675, "grad_norm": 3.296875, "grad_norm_var": 0.0547515869140625, "learning_rate": 0.0001, "loss": 6.1194, "loss/crossentropy": 2.789446711540222, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1806519329547882, "step": 18776 }, { "epoch": 0.5868125, "grad_norm": 2.90625, "grad_norm_var": 0.06288655598958333, "learning_rate": 0.0001, "loss": 5.6243, "loss/crossentropy": 2.5601223707199097, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16032865643501282, "step": 18778 }, { "epoch": 0.586875, "grad_norm": 3.125, "grad_norm_var": 0.0595855712890625, "learning_rate": 0.0001, "loss": 5.6704, "loss/crossentropy": 2.4958789348602295, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17057915776968002, "step": 18780 }, { "epoch": 0.5869375, "grad_norm": 3.46875, "grad_norm_var": 0.034398396809895836, "learning_rate": 0.0001, "loss": 5.7679, "loss/crossentropy": 2.5094715356826782, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17584221065044403, "step": 18782 }, { "epoch": 0.587, "grad_norm": 2.765625, "grad_norm_var": 0.03583577473958333, "learning_rate": 0.0001, "loss": 5.5669, "loss/crossentropy": 2.5259240865707397, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15839381515979767, "step": 18784 }, { "epoch": 0.5870625, "grad_norm": 3.171875, "grad_norm_var": 0.03515218098958333, "learning_rate": 0.0001, "loss": 5.6445, "loss/crossentropy": 2.4920928478240967, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17149509489536285, "step": 18786 }, { "epoch": 0.587125, "grad_norm": 3.34375, "grad_norm_var": 0.0355377197265625, "learning_rate": 0.0001, "loss": 5.4765, "loss/crossentropy": 2.3797152042388916, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16007137298583984, "step": 18788 }, { "epoch": 0.5871875, "grad_norm": 3.078125, "grad_norm_var": 0.032421875, "learning_rate": 0.0001, "loss": 5.7126, "loss/crossentropy": 2.66006600856781, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16189232468605042, "step": 18790 }, { "epoch": 0.58725, "grad_norm": 3.4375, "grad_norm_var": 0.03732096354166667, "learning_rate": 0.0001, "loss": 5.463, "loss/crossentropy": 2.3955132961273193, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1551876962184906, "step": 18792 }, { "epoch": 0.5873125, "grad_norm": 3.40625, "grad_norm_var": 0.034012858072916666, "learning_rate": 0.0001, "loss": 5.5399, "loss/crossentropy": 2.406830072402954, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1652606800198555, "step": 18794 }, { "epoch": 0.587375, "grad_norm": 3.125, "grad_norm_var": 0.03414306640625, "learning_rate": 0.0001, "loss": 5.7499, "loss/crossentropy": 2.6117184162139893, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16928624361753464, "step": 18796 }, { "epoch": 0.5874375, "grad_norm": 2.984375, "grad_norm_var": 0.031916300455729164, "learning_rate": 0.0001, "loss": 5.6044, "loss/crossentropy": 2.5122451782226562, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16272682696580887, "step": 18798 }, { "epoch": 0.5875, "grad_norm": 3.46875, "grad_norm_var": 0.0217926025390625, "learning_rate": 0.0001, "loss": 5.8524, "loss/crossentropy": 2.638648271560669, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16942624002695084, "step": 18800 }, { "epoch": 0.5875625, "grad_norm": 3.15625, "grad_norm_var": 0.03274739583333333, "learning_rate": 0.0001, "loss": 5.7158, "loss/crossentropy": 2.6581921577453613, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15498338639736176, "step": 18802 }, { "epoch": 0.587625, "grad_norm": 3.125, "grad_norm_var": 0.031346638997395836, "learning_rate": 0.0001, "loss": 5.8972, "loss/crossentropy": 2.6991318464279175, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17254384607076645, "step": 18804 }, { "epoch": 0.5876875, "grad_norm": 3.0, "grad_norm_var": 0.03234049479166667, "learning_rate": 0.0001, "loss": 5.5506, "loss/crossentropy": 2.4636647701263428, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1618233397603035, "step": 18806 }, { "epoch": 0.58775, "grad_norm": 3.09375, "grad_norm_var": 0.026805623372395834, "learning_rate": 0.0001, "loss": 5.8945, "loss/crossentropy": 2.715206265449524, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1690976768732071, "step": 18808 }, { "epoch": 0.5878125, "grad_norm": 3.109375, "grad_norm_var": 0.020849609375, "learning_rate": 0.0001, "loss": 5.7979, "loss/crossentropy": 2.602890729904175, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17145641148090363, "step": 18810 }, { "epoch": 0.587875, "grad_norm": 2.984375, "grad_norm_var": 0.020685831705729168, "learning_rate": 0.0001, "loss": 5.3204, "loss/crossentropy": 2.3325252532958984, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15621185302734375, "step": 18812 }, { "epoch": 0.5879375, "grad_norm": 3.078125, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 5.7891, "loss/crossentropy": 2.62211012840271, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17021489888429642, "step": 18814 }, { "epoch": 0.588, "grad_norm": 3.265625, "grad_norm_var": 0.011311848958333334, "learning_rate": 0.0001, "loss": 6.0024, "loss/crossentropy": 2.6743029356002808, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18320034444332123, "step": 18816 }, { "epoch": 0.5880625, "grad_norm": 3.21875, "grad_norm_var": 0.006884765625, "learning_rate": 0.0001, "loss": 5.7251, "loss/crossentropy": 2.502592086791992, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1734180748462677, "step": 18818 }, { "epoch": 0.588125, "grad_norm": 3.078125, "grad_norm_var": 0.006966145833333334, "learning_rate": 0.0001, "loss": 5.7168, "loss/crossentropy": 2.5643508434295654, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16953930258750916, "step": 18820 }, { "epoch": 0.5881875, "grad_norm": 3.03125, "grad_norm_var": 0.0085845947265625, "learning_rate": 0.0001, "loss": 5.652, "loss/crossentropy": 2.526243805885315, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.15906336903572083, "step": 18822 }, { "epoch": 0.58825, "grad_norm": 3.03125, "grad_norm_var": 0.010138956705729167, "learning_rate": 0.0001, "loss": 5.8207, "loss/crossentropy": 2.62376070022583, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1716514155268669, "step": 18824 }, { "epoch": 0.5883125, "grad_norm": 3.09375, "grad_norm_var": 0.010184733072916667, "learning_rate": 0.0001, "loss": 5.4301, "loss/crossentropy": 2.3604893684387207, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16438043117523193, "step": 18826 }, { "epoch": 0.588375, "grad_norm": 3.046875, "grad_norm_var": 0.009956868489583333, "learning_rate": 0.0001, "loss": 5.6351, "loss/crossentropy": 2.5153839588165283, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16353678703308105, "step": 18828 }, { "epoch": 0.5884375, "grad_norm": 3.34375, "grad_norm_var": 0.01256103515625, "learning_rate": 0.0001, "loss": 5.6013, "loss/crossentropy": 2.491378426551819, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16138406842947006, "step": 18830 }, { "epoch": 0.5885, "grad_norm": 3.265625, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 5.7877, "loss/crossentropy": 2.5900150537490845, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17054571956396103, "step": 18832 }, { "epoch": 0.5885625, "grad_norm": 2.875, "grad_norm_var": 0.8783487955729167, "learning_rate": 0.0001, "loss": 5.6009, "loss/crossentropy": 2.3893556594848633, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1676376461982727, "step": 18834 }, { "epoch": 0.588625, "grad_norm": 3.453125, "grad_norm_var": 0.8694620768229167, "learning_rate": 0.0001, "loss": 5.7512, "loss/crossentropy": 2.4890904426574707, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17308519035577774, "step": 18836 }, { "epoch": 0.5886875, "grad_norm": 3.125, "grad_norm_var": 0.8741495768229167, "learning_rate": 0.0001, "loss": 5.4162, "loss/crossentropy": 2.4170466661453247, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15929004549980164, "step": 18838 }, { "epoch": 0.58875, "grad_norm": 3.484375, "grad_norm_var": 0.8793253580729167, "learning_rate": 0.0001, "loss": 5.7454, "loss/crossentropy": 2.6104096174240112, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16505949199199677, "step": 18840 }, { "epoch": 0.5888125, "grad_norm": 2.984375, "grad_norm_var": 0.8839752197265625, "learning_rate": 0.0001, "loss": 5.6596, "loss/crossentropy": 2.524080991744995, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16433259844779968, "step": 18842 }, { "epoch": 0.588875, "grad_norm": 3.21875, "grad_norm_var": 0.8858357747395833, "learning_rate": 0.0001, "loss": 5.4786, "loss/crossentropy": 2.4198418855667114, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15704571455717087, "step": 18844 }, { "epoch": 0.5889375, "grad_norm": 3.125, "grad_norm_var": 0.887255859375, "learning_rate": 0.0001, "loss": 5.6893, "loss/crossentropy": 2.46872341632843, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17049581557512283, "step": 18846 }, { "epoch": 0.589, "grad_norm": 3.09375, "grad_norm_var": 0.8957427978515625, "learning_rate": 0.0001, "loss": 5.5962, "loss/crossentropy": 2.497233748435974, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16224434226751328, "step": 18848 }, { "epoch": 0.5890625, "grad_norm": 2.890625, "grad_norm_var": 0.03479817708333333, "learning_rate": 0.0001, "loss": 5.5171, "loss/crossentropy": 2.4555243253707886, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15889254212379456, "step": 18850 }, { "epoch": 0.589125, "grad_norm": 2.859375, "grad_norm_var": 0.032389322916666664, "learning_rate": 0.0001, "loss": 5.3986, "loss/crossentropy": 2.3385857343673706, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15951615571975708, "step": 18852 }, { "epoch": 0.5891875, "grad_norm": 3.234375, "grad_norm_var": 0.03331705729166667, "learning_rate": 0.0001, "loss": 5.5947, "loss/crossentropy": 2.479830026626587, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16656863689422607, "step": 18854 }, { "epoch": 0.58925, "grad_norm": 3.1875, "grad_norm_var": 0.02193603515625, "learning_rate": 0.0001, "loss": 5.4323, "loss/crossentropy": 2.368161678314209, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1626622974872589, "step": 18856 }, { "epoch": 0.5893125, "grad_norm": 3.015625, "grad_norm_var": 0.0214752197265625, "learning_rate": 0.0001, "loss": 6.0137, "loss/crossentropy": 2.783053994178772, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17267384380102158, "step": 18858 }, { "epoch": 0.589375, "grad_norm": 3.09375, "grad_norm_var": 0.0397613525390625, "learning_rate": 0.0001, "loss": 5.5611, "loss/crossentropy": 2.2689337730407715, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1745278239250183, "step": 18860 }, { "epoch": 0.5894375, "grad_norm": 3.234375, "grad_norm_var": 0.0404937744140625, "learning_rate": 0.0001, "loss": 5.7192, "loss/crossentropy": 2.5575358867645264, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16812192648649216, "step": 18862 }, { "epoch": 0.5895, "grad_norm": 3.015625, "grad_norm_var": 0.03795166015625, "learning_rate": 0.0001, "loss": 5.5239, "loss/crossentropy": 2.498926877975464, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15718135237693787, "step": 18864 }, { "epoch": 0.5895625, "grad_norm": 3.09375, "grad_norm_var": 0.0373931884765625, "learning_rate": 0.0001, "loss": 6.078, "loss/crossentropy": 2.8394166231155396, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1722913682460785, "step": 18866 }, { "epoch": 0.589625, "grad_norm": 3.015625, "grad_norm_var": 0.031689453125, "learning_rate": 0.0001, "loss": 5.9251, "loss/crossentropy": 2.6486889123916626, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17803260684013367, "step": 18868 }, { "epoch": 0.5896875, "grad_norm": 3.296875, "grad_norm_var": 0.03420817057291667, "learning_rate": 0.0001, "loss": 5.2808, "loss/crossentropy": 2.2919921875, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15786871314048767, "step": 18870 }, { "epoch": 0.58975, "grad_norm": 3.171875, "grad_norm_var": 0.03389383951822917, "learning_rate": 0.0001, "loss": 5.8613, "loss/crossentropy": 2.656334161758423, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1720557063817978, "step": 18872 }, { "epoch": 0.5898125, "grad_norm": 3.296875, "grad_norm_var": 0.041337076822916666, "learning_rate": 0.0001, "loss": 5.8043, "loss/crossentropy": 2.6470354795455933, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17080049961805344, "step": 18874 }, { "epoch": 0.589875, "grad_norm": 2.890625, "grad_norm_var": 0.0343902587890625, "learning_rate": 0.0001, "loss": 5.4058, "loss/crossentropy": 2.4188259840011597, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15299608558416367, "step": 18876 }, { "epoch": 0.5899375, "grad_norm": 3.046875, "grad_norm_var": 0.033381144205729164, "learning_rate": 0.0001, "loss": 5.9038, "loss/crossentropy": 2.736717104911804, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17100075632333755, "step": 18878 }, { "epoch": 0.59, "grad_norm": 3.078125, "grad_norm_var": 0.03313395182291667, "learning_rate": 0.0001, "loss": 5.8452, "loss/crossentropy": 2.595299482345581, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.18007101863622665, "step": 18880 }, { "epoch": 0.5900625, "grad_norm": 3.203125, "grad_norm_var": 0.029051717122395834, "learning_rate": 0.0001, "loss": 6.0323, "loss/crossentropy": 2.7330769300460815, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.183043472468853, "step": 18882 }, { "epoch": 0.590125, "grad_norm": 3.390625, "grad_norm_var": 0.03362223307291667, "learning_rate": 0.0001, "loss": 5.6244, "loss/crossentropy": 2.4341530799865723, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1674579456448555, "step": 18884 }, { "epoch": 0.5901875, "grad_norm": 3.171875, "grad_norm_var": 0.031571451822916666, "learning_rate": 0.0001, "loss": 5.6893, "loss/crossentropy": 2.525902032852173, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16634322702884674, "step": 18886 }, { "epoch": 0.59025, "grad_norm": 3.21875, "grad_norm_var": 0.03188374837239583, "learning_rate": 0.0001, "loss": 5.5344, "loss/crossentropy": 2.4152660369873047, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16426139324903488, "step": 18888 }, { "epoch": 0.5903125, "grad_norm": 3.1875, "grad_norm_var": 0.023786417643229165, "learning_rate": 0.0001, "loss": 5.8936, "loss/crossentropy": 2.721444606781006, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17073000222444534, "step": 18890 }, { "epoch": 0.590375, "grad_norm": 3.1875, "grad_norm_var": 0.014598592122395834, "learning_rate": 0.0001, "loss": 5.7217, "loss/crossentropy": 2.588342785835266, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1621607542037964, "step": 18892 }, { "epoch": 0.5904375, "grad_norm": 3.28125, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 5.6263, "loss/crossentropy": 2.4897148609161377, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16639653593301773, "step": 18894 }, { "epoch": 0.5905, "grad_norm": 3.390625, "grad_norm_var": 0.019254557291666665, "learning_rate": 0.0001, "loss": 5.8017, "loss/crossentropy": 2.6354910135269165, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17013906687498093, "step": 18896 }, { "epoch": 0.5905625, "grad_norm": 2.890625, "grad_norm_var": 0.026167805989583334, "learning_rate": 0.0001, "loss": 5.6968, "loss/crossentropy": 2.6278244256973267, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16275885701179504, "step": 18898 }, { "epoch": 0.590625, "grad_norm": 3.34375, "grad_norm_var": 0.028304036458333334, "learning_rate": 0.0001, "loss": 5.5764, "loss/crossentropy": 2.411203622817993, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16690727323293686, "step": 18900 }, { "epoch": 0.5906875, "grad_norm": 3.203125, "grad_norm_var": 0.026395670572916665, "learning_rate": 0.0001, "loss": 5.5561, "loss/crossentropy": 2.3805559873580933, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1671639084815979, "step": 18902 }, { "epoch": 0.59075, "grad_norm": 2.984375, "grad_norm_var": 0.034765625, "learning_rate": 0.0001, "loss": 5.9054, "loss/crossentropy": 2.646710991859436, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17391187697649002, "step": 18904 }, { "epoch": 0.5908125, "grad_norm": 3.515625, "grad_norm_var": 0.0384674072265625, "learning_rate": 0.0001, "loss": 5.7693, "loss/crossentropy": 2.5989474058151245, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16742993146181107, "step": 18906 }, { "epoch": 0.590875, "grad_norm": 3.453125, "grad_norm_var": 0.042215983072916664, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.455661177635193, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17167077958583832, "step": 18908 }, { "epoch": 0.5909375, "grad_norm": 3.171875, "grad_norm_var": 0.0461822509765625, "learning_rate": 0.0001, "loss": 5.801, "loss/crossentropy": 2.648666024208069, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1683606579899788, "step": 18910 }, { "epoch": 0.591, "grad_norm": 3.578125, "grad_norm_var": 0.05367431640625, "learning_rate": 0.0001, "loss": 5.9001, "loss/crossentropy": 2.6702526807785034, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17415423691272736, "step": 18912 }, { "epoch": 0.5910625, "grad_norm": 2.921875, "grad_norm_var": 0.0565093994140625, "learning_rate": 0.0001, "loss": 5.4425, "loss/crossentropy": 2.3983428478240967, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1606631726026535, "step": 18914 }, { "epoch": 0.591125, "grad_norm": 3.140625, "grad_norm_var": 0.0539947509765625, "learning_rate": 0.0001, "loss": 5.8541, "loss/crossentropy": 2.685070753097534, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17119833827018738, "step": 18916 }, { "epoch": 0.5911875, "grad_norm": 2.96875, "grad_norm_var": 0.06687825520833333, "learning_rate": 0.0001, "loss": 5.3403, "loss/crossentropy": 2.359796643257141, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15000443160533905, "step": 18918 }, { "epoch": 0.59125, "grad_norm": 4.15625, "grad_norm_var": 0.11724344889322917, "learning_rate": 0.0001, "loss": 5.5051, "loss/crossentropy": 2.360744833946228, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16404935717582703, "step": 18920 }, { "epoch": 0.5913125, "grad_norm": 2.96875, "grad_norm_var": 0.11398111979166667, "learning_rate": 0.0001, "loss": 5.4941, "loss/crossentropy": 2.394318461418152, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16114606708288193, "step": 18922 }, { "epoch": 0.591375, "grad_norm": 3.390625, "grad_norm_var": 0.10827534993489583, "learning_rate": 0.0001, "loss": 5.4535, "loss/crossentropy": 2.3116272687911987, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16340911388397217, "step": 18924 }, { "epoch": 0.5914375, "grad_norm": 3.1875, "grad_norm_var": 0.10829671223958333, "learning_rate": 0.0001, "loss": 5.7031, "loss/crossentropy": 2.5502008199691772, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16645871102809906, "step": 18926 }, { "epoch": 0.5915, "grad_norm": 3.390625, "grad_norm_var": 0.09758707682291666, "learning_rate": 0.0001, "loss": 5.8017, "loss/crossentropy": 2.6481685638427734, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16378618776798248, "step": 18928 }, { "epoch": 0.5915625, "grad_norm": 3.109375, "grad_norm_var": 0.09195048014322917, "learning_rate": 0.0001, "loss": 5.7804, "loss/crossentropy": 2.582486391067505, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1725265383720398, "step": 18930 }, { "epoch": 0.591625, "grad_norm": 3.640625, "grad_norm_var": 0.10593159993489583, "learning_rate": 0.0001, "loss": 6.0065, "loss/crossentropy": 2.7718145847320557, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17737869918346405, "step": 18932 }, { "epoch": 0.5916875, "grad_norm": 3.0, "grad_norm_var": 0.0904449462890625, "learning_rate": 0.0001, "loss": 5.7479, "loss/crossentropy": 2.580349564552307, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16714490205049515, "step": 18934 }, { "epoch": 0.59175, "grad_norm": 3.21875, "grad_norm_var": 0.03564453125, "learning_rate": 0.0001, "loss": 5.8126, "loss/crossentropy": 2.6517690420150757, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1707712784409523, "step": 18936 }, { "epoch": 0.5918125, "grad_norm": 2.96875, "grad_norm_var": 0.037007649739583336, "learning_rate": 0.0001, "loss": 5.6382, "loss/crossentropy": 2.518024206161499, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1686599999666214, "step": 18938 }, { "epoch": 0.591875, "grad_norm": 2.796875, "grad_norm_var": 0.04729410807291667, "learning_rate": 0.0001, "loss": 5.2716, "loss/crossentropy": 2.290798306465149, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15549713373184204, "step": 18940 }, { "epoch": 0.5919375, "grad_norm": 2.859375, "grad_norm_var": 0.05088602701822917, "learning_rate": 0.0001, "loss": 5.4848, "loss/crossentropy": 2.317240595817566, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17105378210544586, "step": 18942 }, { "epoch": 0.592, "grad_norm": 2.9375, "grad_norm_var": 0.0511871337890625, "learning_rate": 0.0001, "loss": 5.3886, "loss/crossentropy": 2.420640707015991, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15460366010665894, "step": 18944 }, { "epoch": 0.5920625, "grad_norm": 2.984375, "grad_norm_var": 0.05243733723958333, "learning_rate": 0.0001, "loss": 5.4536, "loss/crossentropy": 2.412165403366089, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1541440710425377, "step": 18946 }, { "epoch": 0.592125, "grad_norm": 3.25, "grad_norm_var": 0.03235270182291667, "learning_rate": 0.0001, "loss": 5.3565, "loss/crossentropy": 2.2599799633026123, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1612170711159706, "step": 18948 }, { "epoch": 0.5921875, "grad_norm": 3.28125, "grad_norm_var": 0.031819661458333336, "learning_rate": 0.0001, "loss": 6.1042, "loss/crossentropy": 2.824833631515503, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17715388536453247, "step": 18950 }, { "epoch": 0.59225, "grad_norm": 3.015625, "grad_norm_var": 0.03179931640625, "learning_rate": 0.0001, "loss": 5.7712, "loss/crossentropy": 2.6190898418426514, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1683368757367134, "step": 18952 }, { "epoch": 0.5923125, "grad_norm": 3.296875, "grad_norm_var": 0.03291015625, "learning_rate": 0.0001, "loss": 5.6267, "loss/crossentropy": 2.531549572944641, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1606869176030159, "step": 18954 }, { "epoch": 0.592375, "grad_norm": 3.0625, "grad_norm_var": 0.027106730143229167, "learning_rate": 0.0001, "loss": 5.6599, "loss/crossentropy": 2.516823410987854, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16587327420711517, "step": 18956 }, { "epoch": 0.5924375, "grad_norm": 3.1875, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 5.5947, "loss/crossentropy": 2.440278649330139, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16856301575899124, "step": 18958 }, { "epoch": 0.5925, "grad_norm": 3.671875, "grad_norm_var": 0.042378743489583336, "learning_rate": 0.0001, "loss": 5.9483, "loss/crossentropy": 2.732330799102783, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17277376353740692, "step": 18960 }, { "epoch": 0.5925625, "grad_norm": 3.0, "grad_norm_var": 0.04168294270833333, "learning_rate": 0.0001, "loss": 5.5481, "loss/crossentropy": 2.4707993268966675, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16241498291492462, "step": 18962 }, { "epoch": 0.592625, "grad_norm": 2.9375, "grad_norm_var": 0.0461334228515625, "learning_rate": 0.0001, "loss": 6.0532, "loss/crossentropy": 2.7712173461914062, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17585937678813934, "step": 18964 }, { "epoch": 0.5926875, "grad_norm": 3.125, "grad_norm_var": 0.04332275390625, "learning_rate": 0.0001, "loss": 5.7558, "loss/crossentropy": 2.6582868099212646, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16482648998498917, "step": 18966 }, { "epoch": 0.59275, "grad_norm": 3.703125, "grad_norm_var": 0.0842681884765625, "learning_rate": 0.0001, "loss": 5.9919, "loss/crossentropy": 2.6734771728515625, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17637837678194046, "step": 18968 }, { "epoch": 0.5928125, "grad_norm": 3.53125, "grad_norm_var": 0.08127848307291667, "learning_rate": 0.0001, "loss": 5.8368, "loss/crossentropy": 2.626417398452759, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1698618158698082, "step": 18970 }, { "epoch": 0.592875, "grad_norm": 3.28125, "grad_norm_var": 0.0812896728515625, "learning_rate": 0.0001, "loss": 6.1239, "loss/crossentropy": 2.816824197769165, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17797277867794037, "step": 18972 }, { "epoch": 0.5929375, "grad_norm": 3.078125, "grad_norm_var": 0.09773763020833333, "learning_rate": 0.0001, "loss": 5.3799, "loss/crossentropy": 2.425009846687317, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15252325683832169, "step": 18974 }, { "epoch": 0.593, "grad_norm": 3.125, "grad_norm_var": 0.08258463541666666, "learning_rate": 0.0001, "loss": 5.6301, "loss/crossentropy": 2.5467501878738403, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16106735169887543, "step": 18976 }, { "epoch": 0.5930625, "grad_norm": 3.0625, "grad_norm_var": 0.07834879557291667, "learning_rate": 0.0001, "loss": 5.7481, "loss/crossentropy": 2.5487849712371826, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1714925691485405, "step": 18978 }, { "epoch": 0.593125, "grad_norm": 3.390625, "grad_norm_var": 0.07740478515625, "learning_rate": 0.0001, "loss": 5.8858, "loss/crossentropy": 2.6531916856765747, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17287417501211166, "step": 18980 }, { "epoch": 0.5931875, "grad_norm": 3.15625, "grad_norm_var": 0.07591145833333333, "learning_rate": 0.0001, "loss": 6.0652, "loss/crossentropy": 2.7632105350494385, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17941667139530182, "step": 18982 }, { "epoch": 0.59325, "grad_norm": 3.0625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 5.4704, "loss/crossentropy": 2.4141980409622192, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16108528524637222, "step": 18984 }, { "epoch": 0.5933125, "grad_norm": 2.875, "grad_norm_var": 0.03755594889322917, "learning_rate": 0.0001, "loss": 5.5901, "loss/crossentropy": 2.530953526496887, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16333574801683426, "step": 18986 }, { "epoch": 0.593375, "grad_norm": 3.234375, "grad_norm_var": 0.03426005045572917, "learning_rate": 0.0001, "loss": 6.0152, "loss/crossentropy": 2.814469814300537, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17084968835115433, "step": 18988 }, { "epoch": 0.5934375, "grad_norm": 3.109375, "grad_norm_var": 0.027620442708333335, "learning_rate": 0.0001, "loss": 5.6947, "loss/crossentropy": 2.5568608045578003, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16729601472616196, "step": 18990 }, { "epoch": 0.5935, "grad_norm": 3.203125, "grad_norm_var": 0.027057902018229166, "learning_rate": 0.0001, "loss": 5.7036, "loss/crossentropy": 2.5921266078948975, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1654420644044876, "step": 18992 }, { "epoch": 0.5935625, "grad_norm": 2.890625, "grad_norm_var": 0.028563435872395834, "learning_rate": 0.0001, "loss": 5.4692, "loss/crossentropy": 2.388649821281433, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1635209619998932, "step": 18994 }, { "epoch": 0.593625, "grad_norm": 3.265625, "grad_norm_var": 0.04140625, "learning_rate": 0.0001, "loss": 5.9893, "loss/crossentropy": 2.656255602836609, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18291820585727692, "step": 18996 }, { "epoch": 0.5936875, "grad_norm": 3.3125, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 5.6909, "loss/crossentropy": 2.530544877052307, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16447626799345016, "step": 18998 }, { "epoch": 0.59375, "grad_norm": 3.0625, "grad_norm_var": 0.045572916666666664, "learning_rate": 0.0001, "loss": 5.7836, "loss/crossentropy": 2.607883930206299, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16717688739299774, "step": 19000 }, { "epoch": 0.5938125, "grad_norm": 3.171875, "grad_norm_var": 0.04083658854166667, "learning_rate": 0.0001, "loss": 5.7664, "loss/crossentropy": 2.6362565755844116, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1641901507973671, "step": 19002 }, { "epoch": 0.593875, "grad_norm": 2.953125, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 5.5183, "loss/crossentropy": 2.4436482191085815, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16293271631002426, "step": 19004 }, { "epoch": 0.5939375, "grad_norm": 3.34375, "grad_norm_var": 0.04023030598958333, "learning_rate": 0.0001, "loss": 5.9618, "loss/crossentropy": 2.699216842651367, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17626157402992249, "step": 19006 }, { "epoch": 0.594, "grad_norm": 3.078125, "grad_norm_var": 3.042560831705729, "learning_rate": 0.0001, "loss": 5.9855, "loss/crossentropy": 2.5044608116149902, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.19692841917276382, "step": 19008 }, { "epoch": 0.5940625, "grad_norm": 3.15625, "grad_norm_var": 3.050096638997396, "learning_rate": 0.0001, "loss": 5.7888, "loss/crossentropy": 2.6429343223571777, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16810011863708496, "step": 19010 }, { "epoch": 0.594125, "grad_norm": 3.125, "grad_norm_var": 3.074803670247396, "learning_rate": 0.0001, "loss": 5.7232, "loss/crossentropy": 2.5194497108459473, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1688118353486061, "step": 19012 }, { "epoch": 0.5941875, "grad_norm": 3.25, "grad_norm_var": 3.0759073893229165, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.507745862007141, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16696728765964508, "step": 19014 }, { "epoch": 0.59425, "grad_norm": 3.46875, "grad_norm_var": 3.0719563802083334, "learning_rate": 0.0001, "loss": 5.793, "loss/crossentropy": 2.6356258392333984, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1712021380662918, "step": 19016 }, { "epoch": 0.5943125, "grad_norm": 3.15625, "grad_norm_var": 3.075755818684896, "learning_rate": 0.0001, "loss": 5.8095, "loss/crossentropy": 2.591075897216797, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17183955758810043, "step": 19018 }, { "epoch": 0.594375, "grad_norm": 3.40625, "grad_norm_var": 3.0625, "learning_rate": 0.0001, "loss": 5.5936, "loss/crossentropy": 2.451937198638916, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.165337435901165, "step": 19020 }, { "epoch": 0.5944375, "grad_norm": 3.28125, "grad_norm_var": 3.083788045247396, "learning_rate": 0.0001, "loss": 5.4149, "loss/crossentropy": 2.3058871030807495, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1628570780158043, "step": 19022 }, { "epoch": 0.5945, "grad_norm": 3.375, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 5.5628, "loss/crossentropy": 2.423704147338867, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1674252226948738, "step": 19024 }, { "epoch": 0.5945625, "grad_norm": 3.203125, "grad_norm_var": 0.0283843994140625, "learning_rate": 0.0001, "loss": 6.0417, "loss/crossentropy": 2.7136545181274414, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17733952403068542, "step": 19026 }, { "epoch": 0.594625, "grad_norm": 3.328125, "grad_norm_var": 0.0339508056640625, "learning_rate": 0.0001, "loss": 5.5453, "loss/crossentropy": 2.377169370651245, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16642732918262482, "step": 19028 }, { "epoch": 0.5946875, "grad_norm": 3.0, "grad_norm_var": 0.0364898681640625, "learning_rate": 0.0001, "loss": 5.4178, "loss/crossentropy": 2.413586974143982, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15159141272306442, "step": 19030 }, { "epoch": 0.59475, "grad_norm": 3.109375, "grad_norm_var": 0.045731608072916666, "learning_rate": 0.0001, "loss": 5.86, "loss/crossentropy": 2.6076817512512207, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17796234786510468, "step": 19032 }, { "epoch": 0.5948125, "grad_norm": 3.1875, "grad_norm_var": 0.0462066650390625, "learning_rate": 0.0001, "loss": 5.8138, "loss/crossentropy": 2.6422449350357056, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1691109985113144, "step": 19034 }, { "epoch": 0.594875, "grad_norm": 3.0, "grad_norm_var": 0.04415690104166667, "learning_rate": 0.0001, "loss": 5.6882, "loss/crossentropy": 2.6311895847320557, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1603873297572136, "step": 19036 }, { "epoch": 0.5949375, "grad_norm": 3.390625, "grad_norm_var": 0.052506510416666666, "learning_rate": 0.0001, "loss": 5.4202, "loss/crossentropy": 2.306910276412964, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16054462641477585, "step": 19038 }, { "epoch": 0.595, "grad_norm": 3.359375, "grad_norm_var": 0.0525787353515625, "learning_rate": 0.0001, "loss": 5.8284, "loss/crossentropy": 2.6306427717208862, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.168601892888546, "step": 19040 }, { "epoch": 0.5950625, "grad_norm": 2.96875, "grad_norm_var": 0.0587554931640625, "learning_rate": 0.0001, "loss": 5.8099, "loss/crossentropy": 2.637050747871399, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17002219706773758, "step": 19042 }, { "epoch": 0.595125, "grad_norm": 3.265625, "grad_norm_var": 0.057942708333333336, "learning_rate": 0.0001, "loss": 5.9011, "loss/crossentropy": 2.6271122694015503, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17739781737327576, "step": 19044 }, { "epoch": 0.5951875, "grad_norm": 3.03125, "grad_norm_var": 0.051878865559895834, "learning_rate": 0.0001, "loss": 5.8256, "loss/crossentropy": 2.6426377296447754, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1667339876294136, "step": 19046 }, { "epoch": 0.59525, "grad_norm": 3.140625, "grad_norm_var": 0.053319295247395836, "learning_rate": 0.0001, "loss": 5.4951, "loss/crossentropy": 2.320650339126587, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1654891073703766, "step": 19048 }, { "epoch": 0.5953125, "grad_norm": 2.859375, "grad_norm_var": 0.07265218098958333, "learning_rate": 0.0001, "loss": 5.5159, "loss/crossentropy": 2.4054372310638428, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1602659523487091, "step": 19050 }, { "epoch": 0.595375, "grad_norm": 3.3125, "grad_norm_var": 0.0620513916015625, "learning_rate": 0.0001, "loss": 5.4535, "loss/crossentropy": 2.3170766830444336, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16403310000896454, "step": 19052 }, { "epoch": 0.5954375, "grad_norm": 3.0, "grad_norm_var": 0.06348368326822916, "learning_rate": 0.0001, "loss": 5.584, "loss/crossentropy": 2.4700220823287964, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16374646127223969, "step": 19054 }, { "epoch": 0.5955, "grad_norm": 3.15625, "grad_norm_var": 0.06272379557291667, "learning_rate": 0.0001, "loss": 5.7364, "loss/crossentropy": 2.5879608392715454, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16914492845535278, "step": 19056 }, { "epoch": 0.5955625, "grad_norm": 3.1875, "grad_norm_var": 0.0620269775390625, "learning_rate": 0.0001, "loss": 5.7574, "loss/crossentropy": 2.60457980632782, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1672329530119896, "step": 19058 }, { "epoch": 0.595625, "grad_norm": 2.984375, "grad_norm_var": 0.06207275390625, "learning_rate": 0.0001, "loss": 5.5583, "loss/crossentropy": 2.3792275190353394, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16595008969306946, "step": 19060 }, { "epoch": 0.5956875, "grad_norm": 3.296875, "grad_norm_var": 0.0641021728515625, "learning_rate": 0.0001, "loss": 5.5144, "loss/crossentropy": 2.4050134420394897, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1613277867436409, "step": 19062 }, { "epoch": 0.59575, "grad_norm": 3.046875, "grad_norm_var": 0.0496490478515625, "learning_rate": 0.0001, "loss": 5.7238, "loss/crossentropy": 2.553224802017212, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17057671397924423, "step": 19064 }, { "epoch": 0.5958125, "grad_norm": 3.171875, "grad_norm_var": 0.025484212239583335, "learning_rate": 0.0001, "loss": 5.889, "loss/crossentropy": 2.669556975364685, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1735069379210472, "step": 19066 }, { "epoch": 0.595875, "grad_norm": 3.203125, "grad_norm_var": 0.024088541666666668, "learning_rate": 0.0001, "loss": 6.0692, "loss/crossentropy": 2.798303008079529, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1770864650607109, "step": 19068 }, { "epoch": 0.5959375, "grad_norm": 3.234375, "grad_norm_var": 0.0181793212890625, "learning_rate": 0.0001, "loss": 6.0967, "loss/crossentropy": 2.8803699016571045, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1747601479291916, "step": 19070 }, { "epoch": 0.596, "grad_norm": 3.046875, "grad_norm_var": 0.0188629150390625, "learning_rate": 0.0001, "loss": 5.7132, "loss/crossentropy": 2.6060245037078857, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1646246314048767, "step": 19072 }, { "epoch": 0.5960625, "grad_norm": 2.921875, "grad_norm_var": 0.023030598958333332, "learning_rate": 0.0001, "loss": 5.4366, "loss/crossentropy": 2.3716466426849365, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16274771094322205, "step": 19074 }, { "epoch": 0.596125, "grad_norm": 3.0, "grad_norm_var": 0.024689737955729166, "learning_rate": 0.0001, "loss": 5.7495, "loss/crossentropy": 2.6083240509033203, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1645103245973587, "step": 19076 }, { "epoch": 0.5961875, "grad_norm": 2.96875, "grad_norm_var": 0.022264607747395835, "learning_rate": 0.0001, "loss": 5.6416, "loss/crossentropy": 2.4824129343032837, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.164356030523777, "step": 19078 }, { "epoch": 0.59625, "grad_norm": 3.046875, "grad_norm_var": 0.0171875, "learning_rate": 0.0001, "loss": 5.6659, "loss/crossentropy": 2.5303527116775513, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16706840693950653, "step": 19080 }, { "epoch": 0.5963125, "grad_norm": 3.0625, "grad_norm_var": 0.022261555989583334, "learning_rate": 0.0001, "loss": 5.5689, "loss/crossentropy": 2.5071014165878296, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1608661264181137, "step": 19082 }, { "epoch": 0.596375, "grad_norm": 3.125, "grad_norm_var": 0.027339680989583334, "learning_rate": 0.0001, "loss": 5.4138, "loss/crossentropy": 2.3051012754440308, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16399822384119034, "step": 19084 }, { "epoch": 0.5964375, "grad_norm": 2.875, "grad_norm_var": 0.03173828125, "learning_rate": 0.0001, "loss": 5.2159, "loss/crossentropy": 2.3455780744552612, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.14601540565490723, "step": 19086 }, { "epoch": 0.5965, "grad_norm": 3.359375, "grad_norm_var": 0.037409464518229164, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.667971611022949, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16799738258123398, "step": 19088 }, { "epoch": 0.5965625, "grad_norm": 3.34375, "grad_norm_var": 0.058405558268229164, "learning_rate": 0.0001, "loss": 5.4464, "loss/crossentropy": 2.3377362489700317, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16047269850969315, "step": 19090 }, { "epoch": 0.596625, "grad_norm": 2.984375, "grad_norm_var": 0.05292867024739583, "learning_rate": 0.0001, "loss": 5.4976, "loss/crossentropy": 2.4580774307250977, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15707862377166748, "step": 19092 }, { "epoch": 0.5966875, "grad_norm": 3.125, "grad_norm_var": 0.05196024576822917, "learning_rate": 0.0001, "loss": 5.6936, "loss/crossentropy": 2.589480757713318, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16197218000888824, "step": 19094 }, { "epoch": 0.59675, "grad_norm": 3.375, "grad_norm_var": 0.06636962890625, "learning_rate": 0.0001, "loss": 5.8982, "loss/crossentropy": 2.621792435646057, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17842186987400055, "step": 19096 }, { "epoch": 0.5968125, "grad_norm": 3.078125, "grad_norm_var": 0.05995992024739583, "learning_rate": 0.0001, "loss": 5.9474, "loss/crossentropy": 2.7037535905838013, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17397662997245789, "step": 19098 }, { "epoch": 0.596875, "grad_norm": 3.125, "grad_norm_var": 0.05679931640625, "learning_rate": 0.0001, "loss": 5.7846, "loss/crossentropy": 2.606392741203308, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16860055178403854, "step": 19100 }, { "epoch": 0.5969375, "grad_norm": 3.078125, "grad_norm_var": 0.04195963541666667, "learning_rate": 0.0001, "loss": 5.8501, "loss/crossentropy": 2.660889506340027, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17009187489748, "step": 19102 }, { "epoch": 0.597, "grad_norm": 3.03125, "grad_norm_var": 0.04391276041666667, "learning_rate": 0.0001, "loss": 5.7535, "loss/crossentropy": 2.6644606590270996, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1612473577260971, "step": 19104 }, { "epoch": 0.5970625, "grad_norm": 3.140625, "grad_norm_var": 0.03188374837239583, "learning_rate": 0.0001, "loss": 5.5338, "loss/crossentropy": 2.452639102935791, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16475605219602585, "step": 19106 }, { "epoch": 0.597125, "grad_norm": 3.15625, "grad_norm_var": 0.03176676432291667, "learning_rate": 0.0001, "loss": 5.6205, "loss/crossentropy": 2.5299713611602783, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16296324133872986, "step": 19108 }, { "epoch": 0.5971875, "grad_norm": 3.15625, "grad_norm_var": 0.03809305826822917, "learning_rate": 0.0001, "loss": 5.9802, "loss/crossentropy": 2.6791226863861084, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17698239535093307, "step": 19110 }, { "epoch": 0.59725, "grad_norm": 3.5, "grad_norm_var": 0.03294169108072917, "learning_rate": 0.0001, "loss": 5.7402, "loss/crossentropy": 2.533842444419861, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17024310678243637, "step": 19112 }, { "epoch": 0.5973125, "grad_norm": 2.90625, "grad_norm_var": 0.0326812744140625, "learning_rate": 0.0001, "loss": 5.8323, "loss/crossentropy": 2.661787986755371, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16861362755298615, "step": 19114 }, { "epoch": 0.597375, "grad_norm": 3.03125, "grad_norm_var": 0.0326171875, "learning_rate": 0.0001, "loss": 5.4835, "loss/crossentropy": 2.3731298446655273, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1645573154091835, "step": 19116 }, { "epoch": 0.5974375, "grad_norm": 3.75, "grad_norm_var": 0.060456339518229166, "learning_rate": 0.0001, "loss": 5.7119, "loss/crossentropy": 2.494303584098816, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1702006831765175, "step": 19118 }, { "epoch": 0.5975, "grad_norm": 3.21875, "grad_norm_var": 0.054230753580729166, "learning_rate": 0.0001, "loss": 6.0539, "loss/crossentropy": 2.7265243530273438, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18274109065532684, "step": 19120 }, { "epoch": 0.5975625, "grad_norm": 3.0, "grad_norm_var": 0.1051910400390625, "learning_rate": 0.0001, "loss": 5.2535, "loss/crossentropy": 2.245564341545105, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15040294080972672, "step": 19122 }, { "epoch": 0.597625, "grad_norm": 3.0625, "grad_norm_var": 0.10263264973958333, "learning_rate": 0.0001, "loss": 5.4162, "loss/crossentropy": 2.3298370838165283, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1621474325656891, "step": 19124 }, { "epoch": 0.5976875, "grad_norm": 3.21875, "grad_norm_var": 0.1067779541015625, "learning_rate": 0.0001, "loss": 5.7115, "loss/crossentropy": 2.5920151472091675, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16429658979177475, "step": 19126 }, { "epoch": 0.59775, "grad_norm": 3.0625, "grad_norm_var": 0.10891011555989584, "learning_rate": 0.0001, "loss": 5.9329, "loss/crossentropy": 2.7269396781921387, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17177174985408783, "step": 19128 }, { "epoch": 0.5978125, "grad_norm": 3.203125, "grad_norm_var": 0.10945536295572916, "learning_rate": 0.0001, "loss": 5.626, "loss/crossentropy": 2.475701689720154, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17049477994441986, "step": 19130 }, { "epoch": 0.597875, "grad_norm": 2.8125, "grad_norm_var": 0.11669514973958334, "learning_rate": 0.0001, "loss": 5.428, "loss/crossentropy": 2.3634350299835205, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1611437350511551, "step": 19132 }, { "epoch": 0.5979375, "grad_norm": 3.25, "grad_norm_var": 0.10496317545572917, "learning_rate": 0.0001, "loss": 5.3409, "loss/crossentropy": 2.35762619972229, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15262891352176666, "step": 19134 }, { "epoch": 0.598, "grad_norm": 3.28125, "grad_norm_var": 0.1359771728515625, "learning_rate": 0.0001, "loss": 5.9168, "loss/crossentropy": 2.6832181215286255, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1714015007019043, "step": 19136 }, { "epoch": 0.5980625, "grad_norm": 3.171875, "grad_norm_var": 0.0734283447265625, "learning_rate": 0.0001, "loss": 5.2945, "loss/crossentropy": 2.269458770751953, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15523458272218704, "step": 19138 }, { "epoch": 0.598125, "grad_norm": 3.234375, "grad_norm_var": 0.07655843098958333, "learning_rate": 0.0001, "loss": 5.4908, "loss/crossentropy": 2.4571874141693115, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15805184841156006, "step": 19140 }, { "epoch": 0.5981875, "grad_norm": 3.296875, "grad_norm_var": 0.08671875, "learning_rate": 0.0001, "loss": 5.9359, "loss/crossentropy": 2.7241885662078857, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16648481786251068, "step": 19142 }, { "epoch": 0.59825, "grad_norm": 3.0625, "grad_norm_var": 0.0853179931640625, "learning_rate": 0.0001, "loss": 5.9012, "loss/crossentropy": 2.720415949821472, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17081619054079056, "step": 19144 }, { "epoch": 0.5983125, "grad_norm": 3.03125, "grad_norm_var": 0.07872721354166666, "learning_rate": 0.0001, "loss": 5.2981, "loss/crossentropy": 2.2692290544509888, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16031364351511002, "step": 19146 }, { "epoch": 0.598375, "grad_norm": 3.125, "grad_norm_var": 0.07541910807291667, "learning_rate": 0.0001, "loss": 5.3256, "loss/crossentropy": 2.3439310789108276, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15168164670467377, "step": 19148 }, { "epoch": 0.5984375, "grad_norm": 3.234375, "grad_norm_var": 0.06773681640625, "learning_rate": 0.0001, "loss": 5.7722, "loss/crossentropy": 2.554852604866028, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17173956334590912, "step": 19150 }, { "epoch": 0.5985, "grad_norm": 3.453125, "grad_norm_var": 0.04127197265625, "learning_rate": 0.0001, "loss": 5.977, "loss/crossentropy": 2.7384544610977173, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17346123605966568, "step": 19152 }, { "epoch": 0.5985625, "grad_norm": 3.390625, "grad_norm_var": 0.045947265625, "learning_rate": 0.0001, "loss": 5.8553, "loss/crossentropy": 2.676303505897522, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16555601358413696, "step": 19154 }, { "epoch": 0.598625, "grad_norm": 3.421875, "grad_norm_var": 0.04696858723958333, "learning_rate": 0.0001, "loss": 6.0491, "loss/crossentropy": 2.6892950534820557, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18089944124221802, "step": 19156 }, { "epoch": 0.5986875, "grad_norm": 3.046875, "grad_norm_var": 0.03271077473958333, "learning_rate": 0.0001, "loss": 5.5119, "loss/crossentropy": 2.427427649497986, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16352641582489014, "step": 19158 }, { "epoch": 0.59875, "grad_norm": 3.109375, "grad_norm_var": 0.036519368489583336, "learning_rate": 0.0001, "loss": 5.3986, "loss/crossentropy": 2.3395333290100098, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1586436927318573, "step": 19160 }, { "epoch": 0.5988125, "grad_norm": 3.234375, "grad_norm_var": 0.03778889973958333, "learning_rate": 0.0001, "loss": 5.7716, "loss/crossentropy": 2.6539366245269775, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16489075124263763, "step": 19162 }, { "epoch": 0.598875, "grad_norm": 3.5625, "grad_norm_var": 0.03687744140625, "learning_rate": 0.0001, "loss": 5.8816, "loss/crossentropy": 2.6859357357025146, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1719125434756279, "step": 19164 }, { "epoch": 0.5989375, "grad_norm": 3.09375, "grad_norm_var": 0.03909098307291667, "learning_rate": 0.0001, "loss": 5.7933, "loss/crossentropy": 2.581337332725525, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1735384166240692, "step": 19166 }, { "epoch": 0.599, "grad_norm": 3.015625, "grad_norm_var": 0.04478759765625, "learning_rate": 0.0001, "loss": 5.6245, "loss/crossentropy": 2.542153000831604, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16330838948488235, "step": 19168 }, { "epoch": 0.5990625, "grad_norm": 3.265625, "grad_norm_var": 0.04004618326822917, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.6168237924575806, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17193201184272766, "step": 19170 }, { "epoch": 0.599125, "grad_norm": 3.03125, "grad_norm_var": 0.06161702473958333, "learning_rate": 0.0001, "loss": 5.4472, "loss/crossentropy": 2.2256386280059814, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1639539822936058, "step": 19172 }, { "epoch": 0.5991875, "grad_norm": 4.03125, "grad_norm_var": 0.10725504557291667, "learning_rate": 0.0001, "loss": 5.8376, "loss/crossentropy": 2.637717127799988, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16725322604179382, "step": 19174 }, { "epoch": 0.59925, "grad_norm": 3.078125, "grad_norm_var": 0.10352274576822916, "learning_rate": 0.0001, "loss": 5.6908, "loss/crossentropy": 2.5423028469085693, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1664077639579773, "step": 19176 }, { "epoch": 0.5993125, "grad_norm": 3.1875, "grad_norm_var": 0.10003153483072917, "learning_rate": 0.0001, "loss": 5.725, "loss/crossentropy": 2.5043609142303467, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17401903122663498, "step": 19178 }, { "epoch": 0.599375, "grad_norm": 3.328125, "grad_norm_var": 0.10322265625, "learning_rate": 0.0001, "loss": 5.2699, "loss/crossentropy": 2.274636387825012, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1569497212767601, "step": 19180 }, { "epoch": 0.5994375, "grad_norm": 3.046875, "grad_norm_var": 0.10481669108072916, "learning_rate": 0.0001, "loss": 5.7336, "loss/crossentropy": 2.5585625171661377, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17024116218090057, "step": 19182 }, { "epoch": 0.5995, "grad_norm": 2.96875, "grad_norm_var": 0.0996246337890625, "learning_rate": 0.0001, "loss": 5.6628, "loss/crossentropy": 2.603795886039734, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16097746789455414, "step": 19184 }, { "epoch": 0.5995625, "grad_norm": 3.328125, "grad_norm_var": 0.09787495930989583, "learning_rate": 0.0001, "loss": 5.6364, "loss/crossentropy": 2.4129215478897095, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1711808741092682, "step": 19186 }, { "epoch": 0.599625, "grad_norm": 3.203125, "grad_norm_var": 0.07096354166666667, "learning_rate": 0.0001, "loss": 5.5324, "loss/crossentropy": 2.4042720794677734, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16750407218933105, "step": 19188 }, { "epoch": 0.5996875, "grad_norm": 2.953125, "grad_norm_var": 0.04303385416666667, "learning_rate": 0.0001, "loss": 5.7264, "loss/crossentropy": 2.5236847400665283, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17417382448911667, "step": 19190 }, { "epoch": 0.59975, "grad_norm": 3.390625, "grad_norm_var": 0.04317118326822917, "learning_rate": 0.0001, "loss": 5.8526, "loss/crossentropy": 2.5612668991088867, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1783522292971611, "step": 19192 }, { "epoch": 0.5998125, "grad_norm": 3.078125, "grad_norm_var": 0.04211832682291667, "learning_rate": 0.0001, "loss": 5.5537, "loss/crossentropy": 2.4094767570495605, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16716086119413376, "step": 19194 }, { "epoch": 0.599875, "grad_norm": 3.1875, "grad_norm_var": 0.03567301432291667, "learning_rate": 0.0001, "loss": 5.527, "loss/crossentropy": 2.483111023902893, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15908117592334747, "step": 19196 }, { "epoch": 0.5999375, "grad_norm": 3.03125, "grad_norm_var": 0.04003499348958333, "learning_rate": 0.0001, "loss": 5.4426, "loss/crossentropy": 2.4285643100738525, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15414240956306458, "step": 19198 }, { "epoch": 0.6, "grad_norm": 3.15625, "grad_norm_var": 0.03810933430989583, "learning_rate": 0.0001, "loss": 5.7239, "loss/crossentropy": 2.4994139671325684, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17167074233293533, "step": 19200 }, { "epoch": 0.6000625, "grad_norm": 3.234375, "grad_norm_var": 0.04234619140625, "learning_rate": 0.0001, "loss": 5.628, "loss/crossentropy": 2.506988286972046, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16678385436534882, "step": 19202 }, { "epoch": 0.600125, "grad_norm": 3.34375, "grad_norm_var": 0.04420166015625, "learning_rate": 0.0001, "loss": 5.7679, "loss/crossentropy": 2.5038286447525024, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17601903527975082, "step": 19204 }, { "epoch": 0.6001875, "grad_norm": 3.203125, "grad_norm_var": 0.027925618489583335, "learning_rate": 0.0001, "loss": 5.9747, "loss/crossentropy": 2.714989423751831, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1775367259979248, "step": 19206 }, { "epoch": 0.60025, "grad_norm": 2.90625, "grad_norm_var": 0.03817952473958333, "learning_rate": 0.0001, "loss": 5.483, "loss/crossentropy": 2.492974638938904, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15798742324113846, "step": 19208 }, { "epoch": 0.6003125, "grad_norm": 3.125, "grad_norm_var": 0.03810933430989583, "learning_rate": 0.0001, "loss": 5.7218, "loss/crossentropy": 2.5865061283111572, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1674380898475647, "step": 19210 }, { "epoch": 0.600375, "grad_norm": 2.9375, "grad_norm_var": 0.060399373372395836, "learning_rate": 0.0001, "loss": 5.2924, "loss/crossentropy": 2.317363142967224, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15180116146802902, "step": 19212 }, { "epoch": 0.6004375, "grad_norm": 2.96875, "grad_norm_var": 0.06330973307291667, "learning_rate": 0.0001, "loss": 5.6514, "loss/crossentropy": 2.6089274883270264, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1620563492178917, "step": 19214 }, { "epoch": 0.6005, "grad_norm": 2.9375, "grad_norm_var": 0.06081441243489583, "learning_rate": 0.0001, "loss": 5.1693, "loss/crossentropy": 2.2365992069244385, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15225821733474731, "step": 19216 }, { "epoch": 0.6005625, "grad_norm": 3.125, "grad_norm_var": 0.059309895833333334, "learning_rate": 0.0001, "loss": 5.582, "loss/crossentropy": 2.494749903678894, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16341552883386612, "step": 19218 }, { "epoch": 0.600625, "grad_norm": 3.171875, "grad_norm_var": 0.05257059733072917, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.6580334901809692, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16906992346048355, "step": 19220 }, { "epoch": 0.6006875, "grad_norm": 3.171875, "grad_norm_var": 0.04804585774739583, "learning_rate": 0.0001, "loss": 5.8858, "loss/crossentropy": 2.6356533765792847, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1750139519572258, "step": 19222 }, { "epoch": 0.60075, "grad_norm": 3.109375, "grad_norm_var": 0.04234619140625, "learning_rate": 0.0001, "loss": 5.6525, "loss/crossentropy": 2.5122116804122925, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1687157079577446, "step": 19224 }, { "epoch": 0.6008125, "grad_norm": 3.125, "grad_norm_var": 0.0424224853515625, "learning_rate": 0.0001, "loss": 5.8, "loss/crossentropy": 2.5830377340316772, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1681763455271721, "step": 19226 }, { "epoch": 0.600875, "grad_norm": 3.75, "grad_norm_var": 0.0402252197265625, "learning_rate": 0.0001, "loss": 5.625, "loss/crossentropy": 2.4458080530166626, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16908913850784302, "step": 19228 }, { "epoch": 0.6009375, "grad_norm": 3.703125, "grad_norm_var": 0.05497945149739583, "learning_rate": 0.0001, "loss": 6.1473, "loss/crossentropy": 2.7635291814804077, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18720870465040207, "step": 19230 }, { "epoch": 0.601, "grad_norm": 3.046875, "grad_norm_var": 0.058329264322916664, "learning_rate": 0.0001, "loss": 5.9621, "loss/crossentropy": 2.662600040435791, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.1760430634021759, "step": 19232 }, { "epoch": 0.6010625, "grad_norm": 3.1875, "grad_norm_var": 0.05292867024739583, "learning_rate": 0.0001, "loss": 5.597, "loss/crossentropy": 2.387401819229126, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1678304523229599, "step": 19234 }, { "epoch": 0.601125, "grad_norm": 3.484375, "grad_norm_var": 0.05894775390625, "learning_rate": 0.0001, "loss": 5.6045, "loss/crossentropy": 2.44003164768219, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1640988290309906, "step": 19236 }, { "epoch": 0.6011875, "grad_norm": 3.109375, "grad_norm_var": 0.062109375, "learning_rate": 0.0001, "loss": 5.542, "loss/crossentropy": 2.467086672782898, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1641310602426529, "step": 19238 }, { "epoch": 0.60125, "grad_norm": 3.1875, "grad_norm_var": 0.06796875, "learning_rate": 0.0001, "loss": 5.7396, "loss/crossentropy": 2.6073544025421143, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16752376407384872, "step": 19240 }, { "epoch": 0.6013125, "grad_norm": 3.15625, "grad_norm_var": 0.06892903645833333, "learning_rate": 0.0001, "loss": 5.461, "loss/crossentropy": 2.3763049840927124, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16198724508285522, "step": 19242 }, { "epoch": 0.601375, "grad_norm": 3.453125, "grad_norm_var": 0.053873697916666664, "learning_rate": 0.0001, "loss": 5.9794, "loss/crossentropy": 2.6908187866210938, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1780814528465271, "step": 19244 }, { "epoch": 0.6014375, "grad_norm": 3.109375, "grad_norm_var": 0.03385009765625, "learning_rate": 0.0001, "loss": 5.7021, "loss/crossentropy": 2.5598464012145996, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16812731325626373, "step": 19246 }, { "epoch": 0.6015, "grad_norm": 2.859375, "grad_norm_var": 0.02711181640625, "learning_rate": 0.0001, "loss": 5.589, "loss/crossentropy": 2.5304884910583496, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16054195910692215, "step": 19248 }, { "epoch": 0.6015625, "grad_norm": 2.984375, "grad_norm_var": 0.0286529541015625, "learning_rate": 0.0001, "loss": 5.917, "loss/crossentropy": 2.7158541679382324, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17129085212945938, "step": 19250 }, { "epoch": 0.601625, "grad_norm": 2.875, "grad_norm_var": 0.024738566080729166, "learning_rate": 0.0001, "loss": 5.7501, "loss/crossentropy": 2.6116241216659546, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1618928164243698, "step": 19252 }, { "epoch": 0.6016875, "grad_norm": 3.0, "grad_norm_var": 0.02626953125, "learning_rate": 0.0001, "loss": 5.5211, "loss/crossentropy": 2.395193934440613, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16688557714223862, "step": 19254 }, { "epoch": 0.60175, "grad_norm": 2.578125, "grad_norm_var": 0.04390869140625, "learning_rate": 0.0001, "loss": 5.1934, "loss/crossentropy": 2.285116672515869, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14785685390233994, "step": 19256 }, { "epoch": 0.6018125, "grad_norm": 3.078125, "grad_norm_var": 0.06172587076822917, "learning_rate": 0.0001, "loss": 6.0258, "loss/crossentropy": 2.773720860481262, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17637999355793, "step": 19258 }, { "epoch": 0.601875, "grad_norm": 2.734375, "grad_norm_var": 0.0645904541015625, "learning_rate": 0.0001, "loss": 5.4832, "loss/crossentropy": 2.465247869491577, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15648344159126282, "step": 19260 }, { "epoch": 0.6019375, "grad_norm": 2.984375, "grad_norm_var": 0.06220601399739583, "learning_rate": 0.0001, "loss": 5.537, "loss/crossentropy": 2.412294864654541, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1624700427055359, "step": 19262 }, { "epoch": 0.602, "grad_norm": 3.015625, "grad_norm_var": 0.0580963134765625, "learning_rate": 0.0001, "loss": 5.5669, "loss/crossentropy": 2.4570724964141846, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16293827444314957, "step": 19264 }, { "epoch": 0.6020625, "grad_norm": 3.234375, "grad_norm_var": 0.06041259765625, "learning_rate": 0.0001, "loss": 5.6432, "loss/crossentropy": 2.4601714611053467, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17103426903486252, "step": 19266 }, { "epoch": 0.602125, "grad_norm": 3.46875, "grad_norm_var": 0.06607666015625, "learning_rate": 0.0001, "loss": 5.8618, "loss/crossentropy": 2.581000566482544, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17652061581611633, "step": 19268 }, { "epoch": 0.6021875, "grad_norm": 3.015625, "grad_norm_var": 0.06591796875, "learning_rate": 0.0001, "loss": 5.2412, "loss/crossentropy": 2.2920453548431396, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14804230630397797, "step": 19270 }, { "epoch": 0.60225, "grad_norm": 2.953125, "grad_norm_var": 0.058039347330729164, "learning_rate": 0.0001, "loss": 6.0194, "loss/crossentropy": 2.7348662614822388, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1796286329627037, "step": 19272 }, { "epoch": 0.6023125, "grad_norm": 3.109375, "grad_norm_var": 0.04020894368489583, "learning_rate": 0.0001, "loss": 5.5369, "loss/crossentropy": 2.4168903827667236, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16395603120326996, "step": 19274 }, { "epoch": 0.602375, "grad_norm": 3.015625, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 5.7797, "loss/crossentropy": 2.668266177177429, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16231407225131989, "step": 19276 }, { "epoch": 0.6024375, "grad_norm": 3.125, "grad_norm_var": 0.033543904622395836, "learning_rate": 0.0001, "loss": 5.6619, "loss/crossentropy": 2.525829553604126, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16556084156036377, "step": 19278 }, { "epoch": 0.6025, "grad_norm": 3.03125, "grad_norm_var": 0.03453369140625, "learning_rate": 0.0001, "loss": 5.74, "loss/crossentropy": 2.6013892889022827, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1658184453845024, "step": 19280 }, { "epoch": 0.6025625, "grad_norm": 3.09375, "grad_norm_var": 0.035542805989583336, "learning_rate": 0.0001, "loss": 5.8211, "loss/crossentropy": 2.6150509119033813, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16982468217611313, "step": 19282 }, { "epoch": 0.602625, "grad_norm": 3.125, "grad_norm_var": 0.027765909830729168, "learning_rate": 0.0001, "loss": 5.8615, "loss/crossentropy": 2.733675479888916, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16668817400932312, "step": 19284 }, { "epoch": 0.6026875, "grad_norm": 3.140625, "grad_norm_var": 0.025837198893229166, "learning_rate": 0.0001, "loss": 5.754, "loss/crossentropy": 2.6141045093536377, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16711291670799255, "step": 19286 }, { "epoch": 0.60275, "grad_norm": 2.921875, "grad_norm_var": 0.24107666015625, "learning_rate": 0.0001, "loss": 5.8841, "loss/crossentropy": 2.614741325378418, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.18123618513345718, "step": 19288 }, { "epoch": 0.6028125, "grad_norm": 3.375, "grad_norm_var": 0.23961588541666667, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.566126227378845, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17270290851593018, "step": 19290 }, { "epoch": 0.602875, "grad_norm": 3.1875, "grad_norm_var": 0.2448150634765625, "learning_rate": 0.0001, "loss": 5.7644, "loss/crossentropy": 2.664409875869751, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16351237148046494, "step": 19292 }, { "epoch": 0.6029375, "grad_norm": 3.078125, "grad_norm_var": 0.2454010009765625, "learning_rate": 0.0001, "loss": 5.8293, "loss/crossentropy": 2.7215731143951416, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1646827906370163, "step": 19294 }, { "epoch": 0.603, "grad_norm": 2.9375, "grad_norm_var": 0.2463775634765625, "learning_rate": 0.0001, "loss": 5.7439, "loss/crossentropy": 2.5724823474884033, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1690913736820221, "step": 19296 }, { "epoch": 0.6030625, "grad_norm": 3.03125, "grad_norm_var": 0.24879150390625, "learning_rate": 0.0001, "loss": 6.1137, "loss/crossentropy": 2.854590892791748, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17747671157121658, "step": 19298 }, { "epoch": 0.603125, "grad_norm": 3.359375, "grad_norm_var": 0.25308329264322915, "learning_rate": 0.0001, "loss": 5.8245, "loss/crossentropy": 2.6510233879089355, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1689062938094139, "step": 19300 }, { "epoch": 0.6031875, "grad_norm": 3.359375, "grad_norm_var": 0.25236714680989586, "learning_rate": 0.0001, "loss": 5.8015, "loss/crossentropy": 2.5342386960983276, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17790137231349945, "step": 19302 }, { "epoch": 0.60325, "grad_norm": 3.640625, "grad_norm_var": 0.04108784993489583, "learning_rate": 0.0001, "loss": 5.9817, "loss/crossentropy": 2.6479718685150146, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18142174929380417, "step": 19304 }, { "epoch": 0.6033125, "grad_norm": 3.71875, "grad_norm_var": 0.06306050618489584, "learning_rate": 0.0001, "loss": 5.5137, "loss/crossentropy": 2.349372386932373, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16955320537090302, "step": 19306 }, { "epoch": 0.603375, "grad_norm": 3.859375, "grad_norm_var": 0.087744140625, "learning_rate": 0.0001, "loss": 6.2897, "loss/crossentropy": 2.863835573196411, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.19102825969457626, "step": 19308 }, { "epoch": 0.6034375, "grad_norm": 3.078125, "grad_norm_var": 0.08430887858072916, "learning_rate": 0.0001, "loss": 5.752, "loss/crossentropy": 2.6125062704086304, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16590477526187897, "step": 19310 }, { "epoch": 0.6035, "grad_norm": 3.328125, "grad_norm_var": 0.07696940104166666, "learning_rate": 0.0001, "loss": 5.7415, "loss/crossentropy": 2.5720916986465454, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16889013350009918, "step": 19312 }, { "epoch": 0.6035625, "grad_norm": 3.21875, "grad_norm_var": 0.071240234375, "learning_rate": 0.0001, "loss": 5.4345, "loss/crossentropy": 2.3364455699920654, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.162933811545372, "step": 19314 }, { "epoch": 0.603625, "grad_norm": 3.109375, "grad_norm_var": 0.06687723795572917, "learning_rate": 0.0001, "loss": 5.7682, "loss/crossentropy": 2.6161619424819946, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16520489007234573, "step": 19316 }, { "epoch": 0.6036875, "grad_norm": 3.3125, "grad_norm_var": 0.07131754557291667, "learning_rate": 0.0001, "loss": 5.8675, "loss/crossentropy": 2.642058491706848, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17489047348499298, "step": 19318 }, { "epoch": 0.60375, "grad_norm": 2.921875, "grad_norm_var": 0.06887919108072917, "learning_rate": 0.0001, "loss": 5.7439, "loss/crossentropy": 2.6179521083831787, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.163767471909523, "step": 19320 }, { "epoch": 0.6038125, "grad_norm": 3.125, "grad_norm_var": 0.052652994791666664, "learning_rate": 0.0001, "loss": 5.4641, "loss/crossentropy": 2.4575281143188477, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15573221445083618, "step": 19322 }, { "epoch": 0.603875, "grad_norm": 3.46875, "grad_norm_var": 0.025569661458333334, "learning_rate": 0.0001, "loss": 5.7645, "loss/crossentropy": 2.5369192361831665, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17002779245376587, "step": 19324 }, { "epoch": 0.6039375, "grad_norm": 3.9375, "grad_norm_var": 0.062272135416666666, "learning_rate": 0.0001, "loss": 5.7283, "loss/crossentropy": 2.4499664306640625, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1762692779302597, "step": 19326 }, { "epoch": 0.604, "grad_norm": 3.140625, "grad_norm_var": 0.062272135416666666, "learning_rate": 0.0001, "loss": 5.7353, "loss/crossentropy": 2.638036370277405, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16480784863233566, "step": 19328 }, { "epoch": 0.6040625, "grad_norm": 3.5, "grad_norm_var": 0.07062886555989584, "learning_rate": 0.0001, "loss": 5.8974, "loss/crossentropy": 2.7524850368499756, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1640986055135727, "step": 19330 }, { "epoch": 0.604125, "grad_norm": 3.28125, "grad_norm_var": 0.07642822265625, "learning_rate": 0.0001, "loss": 5.5819, "loss/crossentropy": 2.381034731864929, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17047860473394394, "step": 19332 }, { "epoch": 0.6041875, "grad_norm": 3.21875, "grad_norm_var": 0.07183837890625, "learning_rate": 0.0001, "loss": 5.6297, "loss/crossentropy": 2.5100589990615845, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16157501190900803, "step": 19334 }, { "epoch": 0.60425, "grad_norm": 3.015625, "grad_norm_var": 0.07021077473958333, "learning_rate": 0.0001, "loss": 5.4607, "loss/crossentropy": 2.4219084978103638, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15778452903032303, "step": 19336 }, { "epoch": 0.6043125, "grad_norm": 3.421875, "grad_norm_var": 0.0674957275390625, "learning_rate": 0.0001, "loss": 5.3473, "loss/crossentropy": 2.3017560243606567, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15337897092103958, "step": 19338 }, { "epoch": 0.604375, "grad_norm": 2.984375, "grad_norm_var": 0.06689046223958334, "learning_rate": 0.0001, "loss": 5.4759, "loss/crossentropy": 2.462526559829712, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1564170941710472, "step": 19340 }, { "epoch": 0.6044375, "grad_norm": 3.203125, "grad_norm_var": 0.03827718098958333, "learning_rate": 0.0001, "loss": 5.5193, "loss/crossentropy": 2.4268585443496704, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.161198690533638, "step": 19342 }, { "epoch": 0.6045, "grad_norm": 2.875, "grad_norm_var": 0.04409077962239583, "learning_rate": 0.0001, "loss": 5.841, "loss/crossentropy": 2.7531384229660034, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1634698063135147, "step": 19344 }, { "epoch": 0.6045625, "grad_norm": 3.015625, "grad_norm_var": 0.037007649739583336, "learning_rate": 0.0001, "loss": 5.8738, "loss/crossentropy": 2.654186487197876, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17235031723976135, "step": 19346 }, { "epoch": 0.604625, "grad_norm": 5.15625, "grad_norm_var": 0.28866780598958336, "learning_rate": 0.0001, "loss": 5.8584, "loss/crossentropy": 2.6439250707626343, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16988525539636612, "step": 19348 }, { "epoch": 0.6046875, "grad_norm": 3.203125, "grad_norm_var": 0.28967183430989585, "learning_rate": 0.0001, "loss": 5.4388, "loss/crossentropy": 2.3536821603775024, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16319891065359116, "step": 19350 }, { "epoch": 0.60475, "grad_norm": 3.09375, "grad_norm_var": 0.28609619140625, "learning_rate": 0.0001, "loss": 6.0791, "loss/crossentropy": 2.845702886581421, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1748986840248108, "step": 19352 }, { "epoch": 0.6048125, "grad_norm": 3.546875, "grad_norm_var": 0.2860636393229167, "learning_rate": 0.0001, "loss": 5.7825, "loss/crossentropy": 2.5356805324554443, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17468440532684326, "step": 19354 }, { "epoch": 0.604875, "grad_norm": 3.21875, "grad_norm_var": 0.281982421875, "learning_rate": 0.0001, "loss": 5.6824, "loss/crossentropy": 2.5392826795578003, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16899523884058, "step": 19356 }, { "epoch": 0.6049375, "grad_norm": 3.25, "grad_norm_var": 0.27551676432291666, "learning_rate": 0.0001, "loss": 5.9311, "loss/crossentropy": 2.71337354183197, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17450948804616928, "step": 19358 }, { "epoch": 0.605, "grad_norm": 3.28125, "grad_norm_var": 0.2628082275390625, "learning_rate": 0.0001, "loss": 5.7855, "loss/crossentropy": 2.6472429037094116, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16929297894239426, "step": 19360 }, { "epoch": 0.6050625, "grad_norm": 3.109375, "grad_norm_var": 0.2653605143229167, "learning_rate": 0.0001, "loss": 5.462, "loss/crossentropy": 2.394970655441284, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16256201267242432, "step": 19362 }, { "epoch": 0.605125, "grad_norm": 3.546875, "grad_norm_var": 0.023786417643229165, "learning_rate": 0.0001, "loss": 6.01, "loss/crossentropy": 2.829805612564087, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16685090214014053, "step": 19364 }, { "epoch": 0.6051875, "grad_norm": 3.328125, "grad_norm_var": 0.2587890625, "learning_rate": 0.0001, "loss": 6.0703, "loss/crossentropy": 2.699275016784668, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18398147821426392, "step": 19366 }, { "epoch": 0.60525, "grad_norm": 3.21875, "grad_norm_var": 0.2541249593098958, "learning_rate": 0.0001, "loss": 5.8238, "loss/crossentropy": 2.6740514039993286, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16692347824573517, "step": 19368 }, { "epoch": 0.6053125, "grad_norm": 3.4375, "grad_norm_var": 0.2579254150390625, "learning_rate": 0.0001, "loss": 6.0299, "loss/crossentropy": 2.7693264484405518, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17722737044095993, "step": 19370 }, { "epoch": 0.605375, "grad_norm": 3.15625, "grad_norm_var": 0.25732014973958334, "learning_rate": 0.0001, "loss": 5.7676, "loss/crossentropy": 2.6156188249588013, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16676147282123566, "step": 19372 }, { "epoch": 0.6054375, "grad_norm": 2.953125, "grad_norm_var": 0.2650227864583333, "learning_rate": 0.0001, "loss": 5.7391, "loss/crossentropy": 2.6294628381729126, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16486700624227524, "step": 19374 }, { "epoch": 0.6055, "grad_norm": 3.46875, "grad_norm_var": 0.26396077473958335, "learning_rate": 0.0001, "loss": 5.8428, "loss/crossentropy": 2.6216262578964233, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17250894010066986, "step": 19376 }, { "epoch": 0.6055625, "grad_norm": 3.078125, "grad_norm_var": 0.2618479410807292, "learning_rate": 0.0001, "loss": 5.4434, "loss/crossentropy": 2.385737895965576, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16123860329389572, "step": 19378 }, { "epoch": 0.605625, "grad_norm": 3.21875, "grad_norm_var": 0.27021484375, "learning_rate": 0.0001, "loss": 5.8877, "loss/crossentropy": 2.6616382598876953, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17573069781064987, "step": 19380 }, { "epoch": 0.6056875, "grad_norm": 3.0625, "grad_norm_var": 0.022443644205729165, "learning_rate": 0.0001, "loss": 5.3565, "loss/crossentropy": 2.291105270385742, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15732458233833313, "step": 19382 }, { "epoch": 0.60575, "grad_norm": 3.140625, "grad_norm_var": 0.020881144205729167, "learning_rate": 0.0001, "loss": 5.7053, "loss/crossentropy": 2.6108888387680054, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16100048273801804, "step": 19384 }, { "epoch": 0.6058125, "grad_norm": 3.0625, "grad_norm_var": 0.0164947509765625, "learning_rate": 0.0001, "loss": 5.6552, "loss/crossentropy": 2.6393449306488037, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15744362771511078, "step": 19386 }, { "epoch": 0.605875, "grad_norm": 2.96875, "grad_norm_var": 0.01754150390625, "learning_rate": 0.0001, "loss": 5.3549, "loss/crossentropy": 2.325733184814453, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15799043327569962, "step": 19388 }, { "epoch": 0.6059375, "grad_norm": 3.171875, "grad_norm_var": 0.0178863525390625, "learning_rate": 0.0001, "loss": 5.6828, "loss/crossentropy": 2.5999585390090942, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16531386226415634, "step": 19390 }, { "epoch": 0.606, "grad_norm": 3.40625, "grad_norm_var": 0.013036092122395834, "learning_rate": 0.0001, "loss": 5.783, "loss/crossentropy": 2.5392236709594727, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1755482405424118, "step": 19392 }, { "epoch": 0.6060625, "grad_norm": 3.34375, "grad_norm_var": 0.019514973958333334, "learning_rate": 0.0001, "loss": 5.7082, "loss/crossentropy": 2.5588942766189575, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1696172133088112, "step": 19394 }, { "epoch": 0.606125, "grad_norm": 3.515625, "grad_norm_var": 0.028050740559895832, "learning_rate": 0.0001, "loss": 5.8606, "loss/crossentropy": 2.6143925189971924, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17813197523355484, "step": 19396 }, { "epoch": 0.6061875, "grad_norm": 3.078125, "grad_norm_var": 0.028294881184895832, "learning_rate": 0.0001, "loss": 5.9204, "loss/crossentropy": 2.693058729171753, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17234336584806442, "step": 19398 }, { "epoch": 0.60625, "grad_norm": 3.28125, "grad_norm_var": 0.033544921875, "learning_rate": 0.0001, "loss": 5.7255, "loss/crossentropy": 2.603538751602173, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16453826427459717, "step": 19400 }, { "epoch": 0.6063125, "grad_norm": 3.46875, "grad_norm_var": 0.0446685791015625, "learning_rate": 0.0001, "loss": 5.9881, "loss/crossentropy": 2.7714216709136963, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1743982583284378, "step": 19402 }, { "epoch": 0.606375, "grad_norm": 3.046875, "grad_norm_var": 0.04474995930989583, "learning_rate": 0.0001, "loss": 5.8255, "loss/crossentropy": 2.6851247549057007, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16872383654117584, "step": 19404 }, { "epoch": 0.6064375, "grad_norm": 3.09375, "grad_norm_var": 0.05657145182291667, "learning_rate": 0.0001, "loss": 6.0151, "loss/crossentropy": 2.7455601692199707, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1812482625246048, "step": 19406 }, { "epoch": 0.6065, "grad_norm": 3.46875, "grad_norm_var": 0.05781962076822917, "learning_rate": 0.0001, "loss": 5.8734, "loss/crossentropy": 2.6485707759857178, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17404945939779282, "step": 19408 }, { "epoch": 0.6065625, "grad_norm": 3.828125, "grad_norm_var": 0.08401692708333333, "learning_rate": 0.0001, "loss": 5.8872, "loss/crossentropy": 2.6375534534454346, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17340198904275894, "step": 19410 }, { "epoch": 0.606625, "grad_norm": 2.9375, "grad_norm_var": 0.08517964680989583, "learning_rate": 0.0001, "loss": 5.3536, "loss/crossentropy": 2.3068161010742188, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1593640148639679, "step": 19412 }, { "epoch": 0.6066875, "grad_norm": 3.609375, "grad_norm_var": 0.08928629557291666, "learning_rate": 0.0001, "loss": 5.8966, "loss/crossentropy": 2.6097759008407593, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1810266673564911, "step": 19414 }, { "epoch": 0.60675, "grad_norm": 3.359375, "grad_norm_var": 0.07962239583333333, "learning_rate": 0.0001, "loss": 5.9006, "loss/crossentropy": 2.6326334476470947, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1760202795267105, "step": 19416 }, { "epoch": 0.6068125, "grad_norm": 3.234375, "grad_norm_var": 0.0674957275390625, "learning_rate": 0.0001, "loss": 5.6606, "loss/crossentropy": 2.4949105978012085, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16539839655160904, "step": 19418 }, { "epoch": 0.606875, "grad_norm": 2.84375, "grad_norm_var": 0.07154947916666667, "learning_rate": 0.0001, "loss": 5.4326, "loss/crossentropy": 2.4538732767105103, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15646998584270477, "step": 19420 }, { "epoch": 0.6069375, "grad_norm": 3.015625, "grad_norm_var": 0.06676025390625, "learning_rate": 0.0001, "loss": 6.0018, "loss/crossentropy": 2.836829662322998, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16845224797725677, "step": 19422 }, { "epoch": 0.607, "grad_norm": 3.03125, "grad_norm_var": 0.0700836181640625, "learning_rate": 0.0001, "loss": 5.7035, "loss/crossentropy": 2.5772018432617188, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16341350972652435, "step": 19424 }, { "epoch": 0.6070625, "grad_norm": 3.046875, "grad_norm_var": 0.03340555826822917, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.574127674102783, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16305193305015564, "step": 19426 }, { "epoch": 0.607125, "grad_norm": 3.015625, "grad_norm_var": 0.03184305826822917, "learning_rate": 0.0001, "loss": 5.4449, "loss/crossentropy": 2.4027289152145386, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1569523811340332, "step": 19428 }, { "epoch": 0.6071875, "grad_norm": 3.796875, "grad_norm_var": 0.04780171712239583, "learning_rate": 0.0001, "loss": 5.4921, "loss/crossentropy": 2.4096295833587646, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16410991549491882, "step": 19430 }, { "epoch": 0.60725, "grad_norm": 2.84375, "grad_norm_var": 0.05351460774739583, "learning_rate": 0.0001, "loss": 5.413, "loss/crossentropy": 2.3793755769729614, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1596129685640335, "step": 19432 }, { "epoch": 0.6073125, "grad_norm": 2.90625, "grad_norm_var": 0.05611979166666667, "learning_rate": 0.0001, "loss": 5.4802, "loss/crossentropy": 2.4252909421920776, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.158619225025177, "step": 19434 }, { "epoch": 0.607375, "grad_norm": 2.921875, "grad_norm_var": 0.053563435872395836, "learning_rate": 0.0001, "loss": 5.6961, "loss/crossentropy": 2.5794930458068848, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16634531319141388, "step": 19436 }, { "epoch": 0.6074375, "grad_norm": 3.25, "grad_norm_var": 0.063720703125, "learning_rate": 0.0001, "loss": 5.7954, "loss/crossentropy": 2.6061872243881226, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16931024938821793, "step": 19438 }, { "epoch": 0.6075, "grad_norm": 3.109375, "grad_norm_var": 0.06134440104166667, "learning_rate": 0.0001, "loss": 5.8521, "loss/crossentropy": 2.6193517446517944, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1736624613404274, "step": 19440 }, { "epoch": 0.6075625, "grad_norm": 2.90625, "grad_norm_var": 0.06603190104166666, "learning_rate": 0.0001, "loss": 5.6158, "loss/crossentropy": 2.553231120109558, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15898773074150085, "step": 19442 }, { "epoch": 0.607625, "grad_norm": 2.90625, "grad_norm_var": 0.06636454264322916, "learning_rate": 0.0001, "loss": 5.4121, "loss/crossentropy": 2.3391834497451782, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16275667399168015, "step": 19444 }, { "epoch": 0.6076875, "grad_norm": 2.796875, "grad_norm_var": 0.0384185791015625, "learning_rate": 0.0001, "loss": 5.2162, "loss/crossentropy": 2.29744815826416, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.15320461988449097, "step": 19446 }, { "epoch": 0.60775, "grad_norm": 3.078125, "grad_norm_var": 0.029442342122395833, "learning_rate": 0.0001, "loss": 5.8668, "loss/crossentropy": 2.694766879081726, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16837353259325027, "step": 19448 }, { "epoch": 0.6078125, "grad_norm": 3.359375, "grad_norm_var": 0.032868448893229166, "learning_rate": 0.0001, "loss": 5.5795, "loss/crossentropy": 2.4977762699127197, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1624685749411583, "step": 19450 }, { "epoch": 0.607875, "grad_norm": 3.171875, "grad_norm_var": 0.030887858072916666, "learning_rate": 0.0001, "loss": 5.5431, "loss/crossentropy": 2.395267367362976, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1651710569858551, "step": 19452 }, { "epoch": 0.6079375, "grad_norm": 3.203125, "grad_norm_var": 0.0245269775390625, "learning_rate": 0.0001, "loss": 5.8255, "loss/crossentropy": 2.6514869928359985, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17013683915138245, "step": 19454 }, { "epoch": 0.608, "grad_norm": 2.984375, "grad_norm_var": 0.0296051025390625, "learning_rate": 0.0001, "loss": 5.6925, "loss/crossentropy": 2.543419122695923, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16607911884784698, "step": 19456 }, { "epoch": 0.6080625, "grad_norm": 3.3125, "grad_norm_var": 0.0296875, "learning_rate": 0.0001, "loss": 5.9382, "loss/crossentropy": 2.7204233407974243, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1733415350317955, "step": 19458 }, { "epoch": 0.608125, "grad_norm": 2.984375, "grad_norm_var": 0.027860514322916665, "learning_rate": 0.0001, "loss": 5.8821, "loss/crossentropy": 2.807587146759033, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1613534614443779, "step": 19460 }, { "epoch": 0.6081875, "grad_norm": 3.015625, "grad_norm_var": 0.021870930989583332, "learning_rate": 0.0001, "loss": 5.7142, "loss/crossentropy": 2.572507381439209, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16690758615732193, "step": 19462 }, { "epoch": 0.60825, "grad_norm": 3.375, "grad_norm_var": 0.030028279622395834, "learning_rate": 0.0001, "loss": 5.808, "loss/crossentropy": 2.6362345218658447, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17264685779809952, "step": 19464 }, { "epoch": 0.6083125, "grad_norm": 3.21875, "grad_norm_var": 0.027000935872395833, "learning_rate": 0.0001, "loss": 5.8405, "loss/crossentropy": 2.657623291015625, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1678960844874382, "step": 19466 }, { "epoch": 0.608375, "grad_norm": 2.828125, "grad_norm_var": 0.0315582275390625, "learning_rate": 0.0001, "loss": 5.5147, "loss/crossentropy": 2.504341959953308, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15767860412597656, "step": 19468 }, { "epoch": 0.6084375, "grad_norm": 3.515625, "grad_norm_var": 0.036909993489583334, "learning_rate": 0.0001, "loss": 5.6847, "loss/crossentropy": 2.452295660972595, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17011572420597076, "step": 19470 }, { "epoch": 0.6085, "grad_norm": 3.34375, "grad_norm_var": 0.03620503743489583, "learning_rate": 0.0001, "loss": 5.598, "loss/crossentropy": 2.4428714513778687, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1705893874168396, "step": 19472 }, { "epoch": 0.6085625, "grad_norm": 3.125, "grad_norm_var": 0.036116536458333334, "learning_rate": 0.0001, "loss": 5.6301, "loss/crossentropy": 2.5319563150405884, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16254442185163498, "step": 19474 }, { "epoch": 0.608625, "grad_norm": 3.015625, "grad_norm_var": 0.04063212076822917, "learning_rate": 0.0001, "loss": 5.5087, "loss/crossentropy": 2.482611656188965, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16003308445215225, "step": 19476 }, { "epoch": 0.6086875, "grad_norm": 3.359375, "grad_norm_var": 0.047379557291666666, "learning_rate": 0.0001, "loss": 5.677, "loss/crossentropy": 2.4080413579940796, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1710318624973297, "step": 19478 }, { "epoch": 0.60875, "grad_norm": 3.171875, "grad_norm_var": 0.03925679524739583, "learning_rate": 0.0001, "loss": 5.915, "loss/crossentropy": 2.721635103225708, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17129214107990265, "step": 19480 }, { "epoch": 0.6088125, "grad_norm": 3.484375, "grad_norm_var": 0.046773274739583336, "learning_rate": 0.0001, "loss": 5.8674, "loss/crossentropy": 2.635382056236267, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1735890507698059, "step": 19482 }, { "epoch": 0.608875, "grad_norm": 3.265625, "grad_norm_var": 0.04179585774739583, "learning_rate": 0.0001, "loss": 5.649, "loss/crossentropy": 2.55346143245697, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16268158704042435, "step": 19484 }, { "epoch": 0.6089375, "grad_norm": 3.015625, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 5.8329, "loss/crossentropy": 2.656839370727539, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16994751244783401, "step": 19486 }, { "epoch": 0.609, "grad_norm": 2.90625, "grad_norm_var": 0.036839803059895836, "learning_rate": 0.0001, "loss": 5.7469, "loss/crossentropy": 2.5906903743743896, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16874559968709946, "step": 19488 }, { "epoch": 0.6090625, "grad_norm": 3.296875, "grad_norm_var": 0.03396809895833333, "learning_rate": 0.0001, "loss": 5.9224, "loss/crossentropy": 2.668254256248474, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17463570088148117, "step": 19490 }, { "epoch": 0.609125, "grad_norm": 3.171875, "grad_norm_var": 0.0238433837890625, "learning_rate": 0.0001, "loss": 5.596, "loss/crossentropy": 2.4882742166519165, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16350454092025757, "step": 19492 }, { "epoch": 0.6091875, "grad_norm": 3.328125, "grad_norm_var": 0.0229644775390625, "learning_rate": 0.0001, "loss": 5.6077, "loss/crossentropy": 2.4969935417175293, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16107328236103058, "step": 19494 }, { "epoch": 0.60925, "grad_norm": 3.15625, "grad_norm_var": 0.022981770833333335, "learning_rate": 0.0001, "loss": 5.8845, "loss/crossentropy": 2.6997865438461304, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17042437195777893, "step": 19496 }, { "epoch": 0.6093125, "grad_norm": 3.78125, "grad_norm_var": 0.041162109375, "learning_rate": 0.0001, "loss": 5.8276, "loss/crossentropy": 2.568711996078491, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17510397732257843, "step": 19498 }, { "epoch": 0.609375, "grad_norm": 3.265625, "grad_norm_var": 0.03935546875, "learning_rate": 0.0001, "loss": 5.4971, "loss/crossentropy": 2.4446879625320435, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1568054035305977, "step": 19500 }, { "epoch": 0.6094375, "grad_norm": 3.15625, "grad_norm_var": 0.03697509765625, "learning_rate": 0.0001, "loss": 5.8709, "loss/crossentropy": 2.6936148405075073, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16655559837818146, "step": 19502 }, { "epoch": 0.6095, "grad_norm": 3.21875, "grad_norm_var": 0.030192057291666668, "learning_rate": 0.0001, "loss": 5.5124, "loss/crossentropy": 2.441460132598877, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16099561750888824, "step": 19504 }, { "epoch": 0.6095625, "grad_norm": 3.171875, "grad_norm_var": 0.029206339518229166, "learning_rate": 0.0001, "loss": 5.9482, "loss/crossentropy": 2.732633948326111, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17429260909557343, "step": 19506 }, { "epoch": 0.609625, "grad_norm": 3.203125, "grad_norm_var": 0.028446451822916666, "learning_rate": 0.0001, "loss": 5.6926, "loss/crossentropy": 2.580842614173889, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16430071741342545, "step": 19508 }, { "epoch": 0.6096875, "grad_norm": 3.4375, "grad_norm_var": 0.03033447265625, "learning_rate": 0.0001, "loss": 5.7759, "loss/crossentropy": 2.5772647857666016, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17025792598724365, "step": 19510 }, { "epoch": 0.60975, "grad_norm": 3.46875, "grad_norm_var": 0.033707682291666666, "learning_rate": 0.0001, "loss": 5.4905, "loss/crossentropy": 2.394649028778076, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15880553424358368, "step": 19512 }, { "epoch": 0.6098125, "grad_norm": 3.203125, "grad_norm_var": 0.015620930989583334, "learning_rate": 0.0001, "loss": 5.4534, "loss/crossentropy": 2.343334913253784, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1606125831604004, "step": 19514 }, { "epoch": 0.609875, "grad_norm": 3.1875, "grad_norm_var": 0.014481608072916667, "learning_rate": 0.0001, "loss": 5.4632, "loss/crossentropy": 2.352441668510437, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16302981227636337, "step": 19516 }, { "epoch": 0.6099375, "grad_norm": 3.015625, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 5.4485, "loss/crossentropy": 2.38815176486969, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15720786154270172, "step": 19518 }, { "epoch": 0.61, "grad_norm": 3.359375, "grad_norm_var": 0.01773681640625, "learning_rate": 0.0001, "loss": 5.6109, "loss/crossentropy": 2.381349205970764, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17256611585617065, "step": 19520 }, { "epoch": 0.6100625, "grad_norm": 2.953125, "grad_norm_var": 0.023810831705729167, "learning_rate": 0.0001, "loss": 5.4556, "loss/crossentropy": 2.4127821922302246, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1589699685573578, "step": 19522 }, { "epoch": 0.610125, "grad_norm": 3.203125, "grad_norm_var": 0.024605305989583333, "learning_rate": 0.0001, "loss": 6.0556, "loss/crossentropy": 2.782081723213196, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1761820837855339, "step": 19524 }, { "epoch": 0.6101875, "grad_norm": 3.125, "grad_norm_var": 0.0226715087890625, "learning_rate": 0.0001, "loss": 5.6567, "loss/crossentropy": 2.574386239051819, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16018719971179962, "step": 19526 }, { "epoch": 0.61025, "grad_norm": 3.625, "grad_norm_var": 0.0297515869140625, "learning_rate": 0.0001, "loss": 5.7258, "loss/crossentropy": 2.5339808464050293, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17152460664510727, "step": 19528 }, { "epoch": 0.6103125, "grad_norm": 3.125, "grad_norm_var": 0.0310211181640625, "learning_rate": 0.0001, "loss": 5.6892, "loss/crossentropy": 2.562675952911377, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1689067929983139, "step": 19530 }, { "epoch": 0.610375, "grad_norm": 3.0, "grad_norm_var": 0.029833984375, "learning_rate": 0.0001, "loss": 5.506, "loss/crossentropy": 2.4123008251190186, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1628851518034935, "step": 19532 }, { "epoch": 0.6104375, "grad_norm": 3.078125, "grad_norm_var": 0.0288482666015625, "learning_rate": 0.0001, "loss": 5.6306, "loss/crossentropy": 2.4892683029174805, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16803808510303497, "step": 19534 }, { "epoch": 0.6105, "grad_norm": 3.15625, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 5.7239, "loss/crossentropy": 2.642979383468628, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1635577529668808, "step": 19536 }, { "epoch": 0.6105625, "grad_norm": 3.1875, "grad_norm_var": 0.02705078125, "learning_rate": 0.0001, "loss": 5.7529, "loss/crossentropy": 2.5309290885925293, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17571166157722473, "step": 19538 }, { "epoch": 0.610625, "grad_norm": 3.03125, "grad_norm_var": 0.03396809895833333, "learning_rate": 0.0001, "loss": 5.7253, "loss/crossentropy": 2.5540027618408203, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16908280551433563, "step": 19540 }, { "epoch": 0.6106875, "grad_norm": 4.4375, "grad_norm_var": 0.13192952473958333, "learning_rate": 0.0001, "loss": 6.0836, "loss/crossentropy": 2.78000009059906, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1799660548567772, "step": 19542 }, { "epoch": 0.61075, "grad_norm": 3.234375, "grad_norm_var": 0.12216389973958333, "learning_rate": 0.0001, "loss": 5.4596, "loss/crossentropy": 2.374886989593506, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1608191877603531, "step": 19544 }, { "epoch": 0.6108125, "grad_norm": 3.265625, "grad_norm_var": 0.11640625, "learning_rate": 0.0001, "loss": 5.5675, "loss/crossentropy": 2.514106512069702, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15924306213855743, "step": 19546 }, { "epoch": 0.610875, "grad_norm": 3.546875, "grad_norm_var": 0.1162994384765625, "learning_rate": 0.0001, "loss": 5.8744, "loss/crossentropy": 2.620888352394104, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17418219149112701, "step": 19548 }, { "epoch": 0.6109375, "grad_norm": 3.125, "grad_norm_var": 0.1143951416015625, "learning_rate": 0.0001, "loss": 5.7243, "loss/crossentropy": 2.5614830255508423, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17096445709466934, "step": 19550 }, { "epoch": 0.611, "grad_norm": 3.015625, "grad_norm_var": 0.12652587890625, "learning_rate": 0.0001, "loss": 5.2421, "loss/crossentropy": 2.29048490524292, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14828714728355408, "step": 19552 }, { "epoch": 0.6110625, "grad_norm": 3.296875, "grad_norm_var": 0.12499593098958334, "learning_rate": 0.0001, "loss": 6.0252, "loss/crossentropy": 2.767704486846924, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1753620281815529, "step": 19554 }, { "epoch": 0.611125, "grad_norm": 2.96875, "grad_norm_var": 0.13213602701822916, "learning_rate": 0.0001, "loss": 5.7292, "loss/crossentropy": 2.615246534347534, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16608589887619019, "step": 19556 }, { "epoch": 0.6111875, "grad_norm": 3.125, "grad_norm_var": 0.033600870768229166, "learning_rate": 0.0001, "loss": 5.8258, "loss/crossentropy": 2.6491048336029053, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17039896547794342, "step": 19558 }, { "epoch": 0.61125, "grad_norm": 3.03125, "grad_norm_var": 0.03459879557291667, "learning_rate": 0.0001, "loss": 5.638, "loss/crossentropy": 2.5442399978637695, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16132787615060806, "step": 19560 }, { "epoch": 0.6113125, "grad_norm": 3.078125, "grad_norm_var": 0.03526102701822917, "learning_rate": 0.0001, "loss": 5.4512, "loss/crossentropy": 2.4072697162628174, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1598571613430977, "step": 19562 }, { "epoch": 0.611375, "grad_norm": 3.171875, "grad_norm_var": 0.025614420572916668, "learning_rate": 0.0001, "loss": 5.6302, "loss/crossentropy": 2.457480549812317, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1664915680885315, "step": 19564 }, { "epoch": 0.6114375, "grad_norm": 2.921875, "grad_norm_var": 0.0220123291015625, "learning_rate": 0.0001, "loss": 5.7991, "loss/crossentropy": 2.6218069791793823, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16812142729759216, "step": 19566 }, { "epoch": 0.6115, "grad_norm": 3.34375, "grad_norm_var": 0.020524088541666666, "learning_rate": 0.0001, "loss": 5.9472, "loss/crossentropy": 2.6854790449142456, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17656263709068298, "step": 19568 }, { "epoch": 0.6115625, "grad_norm": 3.171875, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 5.8737, "loss/crossentropy": 2.614681124687195, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17511707544326782, "step": 19570 }, { "epoch": 0.611625, "grad_norm": 3.515625, "grad_norm_var": 0.02037353515625, "learning_rate": 0.0001, "loss": 5.8561, "loss/crossentropy": 2.5757195949554443, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1760895773768425, "step": 19572 }, { "epoch": 0.6116875, "grad_norm": 3.0625, "grad_norm_var": 0.024300130208333333, "learning_rate": 0.0001, "loss": 5.4319, "loss/crossentropy": 2.2783864736557007, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16730409860610962, "step": 19574 }, { "epoch": 0.61175, "grad_norm": 3.171875, "grad_norm_var": 0.022948201497395834, "learning_rate": 0.0001, "loss": 5.9062, "loss/crossentropy": 2.7144960165023804, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16956491768360138, "step": 19576 }, { "epoch": 0.6118125, "grad_norm": 3.203125, "grad_norm_var": 0.020210774739583333, "learning_rate": 0.0001, "loss": 5.4665, "loss/crossentropy": 2.357220768928528, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16209646314382553, "step": 19578 }, { "epoch": 0.611875, "grad_norm": 3.390625, "grad_norm_var": 0.022005208333333335, "learning_rate": 0.0001, "loss": 6.0825, "loss/crossentropy": 2.85296094417572, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16904301196336746, "step": 19580 }, { "epoch": 0.6119375, "grad_norm": 3.609375, "grad_norm_var": 0.023631795247395834, "learning_rate": 0.0001, "loss": 5.735, "loss/crossentropy": 2.5890785455703735, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1638134941458702, "step": 19582 }, { "epoch": 0.612, "grad_norm": 3.15625, "grad_norm_var": 0.024576822916666668, "learning_rate": 0.0001, "loss": 5.8232, "loss/crossentropy": 2.627472996711731, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17074380815029144, "step": 19584 }, { "epoch": 0.6120625, "grad_norm": 6.09375, "grad_norm_var": 0.5181477864583334, "learning_rate": 0.0001, "loss": 5.7791, "loss/crossentropy": 2.4958267211914062, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17597881704568863, "step": 19586 }, { "epoch": 0.612125, "grad_norm": 3.28125, "grad_norm_var": 0.5228352864583333, "learning_rate": 0.0001, "loss": 5.9044, "loss/crossentropy": 2.673877239227295, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17265690118074417, "step": 19588 }, { "epoch": 0.6121875, "grad_norm": 3.375, "grad_norm_var": 0.5311848958333333, "learning_rate": 0.0001, "loss": 5.8418, "loss/crossentropy": 2.652164936065674, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1720835268497467, "step": 19590 }, { "epoch": 0.61225, "grad_norm": 2.96875, "grad_norm_var": 0.5369099934895833, "learning_rate": 0.0001, "loss": 5.5793, "loss/crossentropy": 2.507964611053467, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16065095365047455, "step": 19592 }, { "epoch": 0.6123125, "grad_norm": 3.078125, "grad_norm_var": 0.5409576416015625, "learning_rate": 0.0001, "loss": 5.7414, "loss/crossentropy": 2.6736977100372314, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15911806374788284, "step": 19594 }, { "epoch": 0.612375, "grad_norm": 3.609375, "grad_norm_var": 0.5402333577473958, "learning_rate": 0.0001, "loss": 6.1378, "loss/crossentropy": 2.7499048709869385, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.18254327028989792, "step": 19596 }, { "epoch": 0.6124375, "grad_norm": 2.8125, "grad_norm_var": 0.57115478515625, "learning_rate": 0.0001, "loss": 5.4608, "loss/crossentropy": 2.487305521965027, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15203223377466202, "step": 19598 }, { "epoch": 0.6125, "grad_norm": 3.078125, "grad_norm_var": 0.5739095052083333, "learning_rate": 0.0001, "loss": 5.5002, "loss/crossentropy": 2.383982539176941, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16123488545417786, "step": 19600 }, { "epoch": 0.6125625, "grad_norm": 2.96875, "grad_norm_var": 0.047215779622395836, "learning_rate": 0.0001, "loss": 5.7314, "loss/crossentropy": 2.5770827531814575, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16816139966249466, "step": 19602 }, { "epoch": 0.612625, "grad_norm": 3.171875, "grad_norm_var": 0.05194905598958333, "learning_rate": 0.0001, "loss": 5.8007, "loss/crossentropy": 2.6743478775024414, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16458549350500107, "step": 19604 }, { "epoch": 0.6126875, "grad_norm": 2.671875, "grad_norm_var": 0.06070556640625, "learning_rate": 0.0001, "loss": 5.4554, "loss/crossentropy": 2.4700160026550293, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1551780253648758, "step": 19606 }, { "epoch": 0.61275, "grad_norm": 3.25, "grad_norm_var": 0.060770670572916664, "learning_rate": 0.0001, "loss": 5.5231, "loss/crossentropy": 2.4005579948425293, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16577185690402985, "step": 19608 }, { "epoch": 0.6128125, "grad_norm": 3.25, "grad_norm_var": 0.06189778645833333, "learning_rate": 0.0001, "loss": 5.4559, "loss/crossentropy": 2.332529306411743, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1662391945719719, "step": 19610 }, { "epoch": 0.612875, "grad_norm": 2.953125, "grad_norm_var": 0.04412434895833333, "learning_rate": 0.0001, "loss": 5.6385, "loss/crossentropy": 2.499752402305603, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16113810241222382, "step": 19612 }, { "epoch": 0.6129375, "grad_norm": 3.34375, "grad_norm_var": 0.0521484375, "learning_rate": 0.0001, "loss": 5.2879, "loss/crossentropy": 2.2084743976593018, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.161456897854805, "step": 19614 }, { "epoch": 0.613, "grad_norm": 2.984375, "grad_norm_var": 0.054011027018229164, "learning_rate": 0.0001, "loss": 5.3066, "loss/crossentropy": 2.333135962486267, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.14734511077404022, "step": 19616 }, { "epoch": 0.6130625, "grad_norm": 3.09375, "grad_norm_var": 0.046826171875, "learning_rate": 0.0001, "loss": 5.5485, "loss/crossentropy": 2.4865167140960693, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16478903591632843, "step": 19618 }, { "epoch": 0.613125, "grad_norm": 2.953125, "grad_norm_var": 0.04421284993489583, "learning_rate": 0.0001, "loss": 5.7492, "loss/crossentropy": 2.589370846748352, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16598235815763474, "step": 19620 }, { "epoch": 0.6131875, "grad_norm": 3.21875, "grad_norm_var": 0.0346099853515625, "learning_rate": 0.0001, "loss": 5.7248, "loss/crossentropy": 2.5286474227905273, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16688184440135956, "step": 19622 }, { "epoch": 0.61325, "grad_norm": 3.15625, "grad_norm_var": 0.03664957682291667, "learning_rate": 0.0001, "loss": 6.0778, "loss/crossentropy": 2.711007833480835, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18511471152305603, "step": 19624 }, { "epoch": 0.6133125, "grad_norm": 3.5, "grad_norm_var": 0.041071573893229164, "learning_rate": 0.0001, "loss": 5.8477, "loss/crossentropy": 2.588347554206848, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17281106859445572, "step": 19626 }, { "epoch": 0.613375, "grad_norm": 3.1875, "grad_norm_var": 0.034089152018229166, "learning_rate": 0.0001, "loss": 6.1345, "loss/crossentropy": 2.8718771934509277, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1770443245768547, "step": 19628 }, { "epoch": 0.6134375, "grad_norm": 3.0625, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 5.5464, "loss/crossentropy": 2.483364701271057, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1625508889555931, "step": 19630 }, { "epoch": 0.6135, "grad_norm": 2.890625, "grad_norm_var": 0.037919108072916666, "learning_rate": 0.0001, "loss": 5.6717, "loss/crossentropy": 2.6128073930740356, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16135795414447784, "step": 19632 }, { "epoch": 0.6135625, "grad_norm": 3.4375, "grad_norm_var": 0.0441314697265625, "learning_rate": 0.0001, "loss": 5.9193, "loss/crossentropy": 2.7125253677368164, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17106316983699799, "step": 19634 }, { "epoch": 0.613625, "grad_norm": 3.265625, "grad_norm_var": 0.0421783447265625, "learning_rate": 0.0001, "loss": 5.4595, "loss/crossentropy": 2.4429343938827515, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15790626406669617, "step": 19636 }, { "epoch": 0.6136875, "grad_norm": 3.296875, "grad_norm_var": 0.03485921223958333, "learning_rate": 0.0001, "loss": 5.5245, "loss/crossentropy": 2.5152522325515747, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1579592227935791, "step": 19638 }, { "epoch": 0.61375, "grad_norm": 3.1875, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 5.4698, "loss/crossentropy": 2.4052783250808716, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15957404673099518, "step": 19640 }, { "epoch": 0.6138125, "grad_norm": 3.3125, "grad_norm_var": 0.023631795247395834, "learning_rate": 0.0001, "loss": 5.5826, "loss/crossentropy": 2.470545768737793, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1627647429704666, "step": 19642 }, { "epoch": 0.613875, "grad_norm": 2.953125, "grad_norm_var": 0.024559529622395833, "learning_rate": 0.0001, "loss": 5.8441, "loss/crossentropy": 2.7033053636550903, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16798724234104156, "step": 19644 }, { "epoch": 0.6139375, "grad_norm": 3.15625, "grad_norm_var": 0.021284993489583334, "learning_rate": 0.0001, "loss": 5.8487, "loss/crossentropy": 2.7112566232681274, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16882429271936417, "step": 19646 }, { "epoch": 0.614, "grad_norm": 2.96875, "grad_norm_var": 0.019872029622395832, "learning_rate": 0.0001, "loss": 5.363, "loss/crossentropy": 2.3288105726242065, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16122935712337494, "step": 19648 }, { "epoch": 0.6140625, "grad_norm": 3.265625, "grad_norm_var": 0.014937337239583333, "learning_rate": 0.0001, "loss": 5.9147, "loss/crossentropy": 2.6747225522994995, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17477816343307495, "step": 19650 }, { "epoch": 0.614125, "grad_norm": 3.0625, "grad_norm_var": 0.013752237955729166, "learning_rate": 0.0001, "loss": 5.7318, "loss/crossentropy": 2.6186282634735107, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1675650253891945, "step": 19652 }, { "epoch": 0.6141875, "grad_norm": 3.046875, "grad_norm_var": 0.011970011393229167, "learning_rate": 0.0001, "loss": 5.6658, "loss/crossentropy": 2.480633854866028, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16852006316184998, "step": 19654 }, { "epoch": 0.61425, "grad_norm": 2.96875, "grad_norm_var": 0.01298828125, "learning_rate": 0.0001, "loss": 5.4228, "loss/crossentropy": 2.372126817703247, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15780331194400787, "step": 19656 }, { "epoch": 0.6143125, "grad_norm": 3.125, "grad_norm_var": 0.009406534830729167, "learning_rate": 0.0001, "loss": 5.9133, "loss/crossentropy": 2.73931086063385, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1689605638384819, "step": 19658 }, { "epoch": 0.614375, "grad_norm": 3.078125, "grad_norm_var": 0.008821614583333333, "learning_rate": 0.0001, "loss": 5.4754, "loss/crossentropy": 2.466127395629883, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15873520076274872, "step": 19660 }, { "epoch": 0.6144375, "grad_norm": 2.875, "grad_norm_var": 0.0175445556640625, "learning_rate": 0.0001, "loss": 5.8, "loss/crossentropy": 2.5970553159713745, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16834472119808197, "step": 19662 }, { "epoch": 0.6145, "grad_norm": 2.90625, "grad_norm_var": 0.018583170572916665, "learning_rate": 0.0001, "loss": 5.4983, "loss/crossentropy": 2.4853075742721558, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1555929183959961, "step": 19664 }, { "epoch": 0.6145625, "grad_norm": 2.890625, "grad_norm_var": 0.024739583333333332, "learning_rate": 0.0001, "loss": 5.8179, "loss/crossentropy": 2.6911911964416504, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16422955691814423, "step": 19666 }, { "epoch": 0.614625, "grad_norm": 3.25, "grad_norm_var": 0.026851399739583334, "learning_rate": 0.0001, "loss": 5.5265, "loss/crossentropy": 2.46109676361084, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16083980351686478, "step": 19668 }, { "epoch": 0.6146875, "grad_norm": 3.421875, "grad_norm_var": 0.036896769205729166, "learning_rate": 0.0001, "loss": 5.9072, "loss/crossentropy": 2.6542869806289673, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17412163317203522, "step": 19670 }, { "epoch": 0.61475, "grad_norm": 3.25, "grad_norm_var": 0.0373687744140625, "learning_rate": 0.0001, "loss": 5.823, "loss/crossentropy": 2.600062370300293, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1711195632815361, "step": 19672 }, { "epoch": 0.6148125, "grad_norm": 3.125, "grad_norm_var": 0.03710530598958333, "learning_rate": 0.0001, "loss": 5.551, "loss/crossentropy": 2.4106554985046387, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16754940152168274, "step": 19674 }, { "epoch": 0.614875, "grad_norm": 3.125, "grad_norm_var": 0.0342681884765625, "learning_rate": 0.0001, "loss": 5.75, "loss/crossentropy": 2.5887335538864136, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1688610017299652, "step": 19676 }, { "epoch": 0.6149375, "grad_norm": 3.140625, "grad_norm_var": 0.025715128580729166, "learning_rate": 0.0001, "loss": 5.8252, "loss/crossentropy": 2.6288719177246094, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17510154843330383, "step": 19678 }, { "epoch": 0.615, "grad_norm": 3.0625, "grad_norm_var": 0.027144368489583334, "learning_rate": 0.0001, "loss": 5.4602, "loss/crossentropy": 2.4112237691879272, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1615402102470398, "step": 19680 }, { "epoch": 0.6150625, "grad_norm": 3.0625, "grad_norm_var": 0.021805826822916666, "learning_rate": 0.0001, "loss": 5.7387, "loss/crossentropy": 2.6867090463638306, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1618417203426361, "step": 19682 }, { "epoch": 0.615125, "grad_norm": 3.296875, "grad_norm_var": 0.023053995768229165, "learning_rate": 0.0001, "loss": 5.6359, "loss/crossentropy": 2.4958173036575317, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16439910233020782, "step": 19684 }, { "epoch": 0.6151875, "grad_norm": 3.0, "grad_norm_var": 0.025886027018229167, "learning_rate": 0.0001, "loss": 5.5855, "loss/crossentropy": 2.500712513923645, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1639462411403656, "step": 19686 }, { "epoch": 0.61525, "grad_norm": 3.1875, "grad_norm_var": 0.022086588541666667, "learning_rate": 0.0001, "loss": 5.6299, "loss/crossentropy": 2.445050597190857, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.168094664812088, "step": 19688 }, { "epoch": 0.6153125, "grad_norm": 2.96875, "grad_norm_var": 0.028873697916666666, "learning_rate": 0.0001, "loss": 5.48, "loss/crossentropy": 2.4688230752944946, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1597161442041397, "step": 19690 }, { "epoch": 0.615375, "grad_norm": 3.5625, "grad_norm_var": 0.054076131184895834, "learning_rate": 0.0001, "loss": 5.9437, "loss/crossentropy": 2.6283657550811768, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18075451254844666, "step": 19692 }, { "epoch": 0.6154375, "grad_norm": 3.234375, "grad_norm_var": 0.05478108723958333, "learning_rate": 0.0001, "loss": 5.7547, "loss/crossentropy": 2.62972092628479, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1636713668704033, "step": 19694 }, { "epoch": 0.6155, "grad_norm": 3.21875, "grad_norm_var": 0.05471598307291667, "learning_rate": 0.0001, "loss": 6.1012, "loss/crossentropy": 2.7816812992095947, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18156083673238754, "step": 19696 }, { "epoch": 0.6155625, "grad_norm": 2.9375, "grad_norm_var": 0.052571614583333336, "learning_rate": 0.0001, "loss": 5.6343, "loss/crossentropy": 2.5516425371170044, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1590462327003479, "step": 19698 }, { "epoch": 0.615625, "grad_norm": 2.921875, "grad_norm_var": 0.05571187337239583, "learning_rate": 0.0001, "loss": 5.8999, "loss/crossentropy": 2.6493369340896606, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17662305384874344, "step": 19700 }, { "epoch": 0.6156875, "grad_norm": 3.203125, "grad_norm_var": 0.04784749348958333, "learning_rate": 0.0001, "loss": 5.7, "loss/crossentropy": 2.556334614753723, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16944894194602966, "step": 19702 }, { "epoch": 0.61575, "grad_norm": 3.140625, "grad_norm_var": 0.05321858723958333, "learning_rate": 0.0001, "loss": 5.7652, "loss/crossentropy": 2.6158326864242554, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1704033985733986, "step": 19704 }, { "epoch": 0.6158125, "grad_norm": 3.3125, "grad_norm_var": 0.05077718098958333, "learning_rate": 0.0001, "loss": 5.587, "loss/crossentropy": 2.5270044803619385, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1626434177160263, "step": 19706 }, { "epoch": 0.615875, "grad_norm": 2.828125, "grad_norm_var": 0.036530558268229166, "learning_rate": 0.0001, "loss": 5.2316, "loss/crossentropy": 2.3332518339157104, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14608360826969147, "step": 19708 }, { "epoch": 0.6159375, "grad_norm": 3.015625, "grad_norm_var": 0.03720296223958333, "learning_rate": 0.0001, "loss": 5.8753, "loss/crossentropy": 2.7151904106140137, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17147938162088394, "step": 19710 }, { "epoch": 0.616, "grad_norm": 3.265625, "grad_norm_var": 0.032307942708333336, "learning_rate": 0.0001, "loss": 5.6265, "loss/crossentropy": 2.464982032775879, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16693414747714996, "step": 19712 }, { "epoch": 0.6160625, "grad_norm": 2.921875, "grad_norm_var": 0.031232706705729165, "learning_rate": 0.0001, "loss": 5.4135, "loss/crossentropy": 2.3861416578292847, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1574278026819229, "step": 19714 }, { "epoch": 0.616125, "grad_norm": 3.09375, "grad_norm_var": 0.0286529541015625, "learning_rate": 0.0001, "loss": 5.4291, "loss/crossentropy": 2.429054617881775, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15625176578760147, "step": 19716 }, { "epoch": 0.6161875, "grad_norm": 3.109375, "grad_norm_var": 0.04107666015625, "learning_rate": 0.0001, "loss": 5.8131, "loss/crossentropy": 2.645476818084717, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1687166914343834, "step": 19718 }, { "epoch": 0.61625, "grad_norm": 3.28125, "grad_norm_var": 0.0397857666015625, "learning_rate": 0.0001, "loss": 5.9102, "loss/crossentropy": 2.7409597635269165, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1696610003709793, "step": 19720 }, { "epoch": 0.6163125, "grad_norm": 3.3125, "grad_norm_var": 0.03638916015625, "learning_rate": 0.0001, "loss": 5.7133, "loss/crossentropy": 2.5925413370132446, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16637563705444336, "step": 19722 }, { "epoch": 0.616375, "grad_norm": 3.203125, "grad_norm_var": 0.0343658447265625, "learning_rate": 0.0001, "loss": 5.7412, "loss/crossentropy": 2.5285887718200684, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17126226425170898, "step": 19724 }, { "epoch": 0.6164375, "grad_norm": 3.046875, "grad_norm_var": 0.033447265625, "learning_rate": 0.0001, "loss": 5.5655, "loss/crossentropy": 2.4696013927459717, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16349675506353378, "step": 19726 }, { "epoch": 0.6165, "grad_norm": 3.046875, "grad_norm_var": 0.03194071451822917, "learning_rate": 0.0001, "loss": 5.5142, "loss/crossentropy": 2.450305223464966, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16224515438079834, "step": 19728 }, { "epoch": 0.6165625, "grad_norm": 3.0625, "grad_norm_var": 0.028092447916666666, "learning_rate": 0.0001, "loss": 5.632, "loss/crossentropy": 2.5135504007339478, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1649661585688591, "step": 19730 }, { "epoch": 0.616625, "grad_norm": 3.515625, "grad_norm_var": 0.03697509765625, "learning_rate": 0.0001, "loss": 5.2826, "loss/crossentropy": 2.243741989135742, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1531006470322609, "step": 19732 }, { "epoch": 0.6166875, "grad_norm": 3.0625, "grad_norm_var": 0.0291656494140625, "learning_rate": 0.0001, "loss": 6.0837, "loss/crossentropy": 2.8326183557510376, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.175108902156353, "step": 19734 }, { "epoch": 0.61675, "grad_norm": 2.953125, "grad_norm_var": 0.0334625244140625, "learning_rate": 0.0001, "loss": 5.5365, "loss/crossentropy": 2.565299153327942, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15649481862783432, "step": 19736 }, { "epoch": 0.6168125, "grad_norm": 3.171875, "grad_norm_var": 0.033154296875, "learning_rate": 0.0001, "loss": 5.5647, "loss/crossentropy": 2.5372307300567627, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15743693709373474, "step": 19738 }, { "epoch": 0.616875, "grad_norm": 3.046875, "grad_norm_var": 0.024186197916666666, "learning_rate": 0.0001, "loss": 5.5928, "loss/crossentropy": 2.488158345222473, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16085833311080933, "step": 19740 }, { "epoch": 0.6169375, "grad_norm": 3.171875, "grad_norm_var": 0.024901326497395834, "learning_rate": 0.0001, "loss": 5.7667, "loss/crossentropy": 2.528436779975891, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17421747744083405, "step": 19742 }, { "epoch": 0.617, "grad_norm": 3.15625, "grad_norm_var": 0.025321451822916667, "learning_rate": 0.0001, "loss": 5.5162, "loss/crossentropy": 2.4038294553756714, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16163229942321777, "step": 19744 }, { "epoch": 0.6170625, "grad_norm": 3.15625, "grad_norm_var": 0.026692708333333332, "learning_rate": 0.0001, "loss": 5.6065, "loss/crossentropy": 2.4743977785110474, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16477020829916, "step": 19746 }, { "epoch": 0.617125, "grad_norm": 2.953125, "grad_norm_var": 0.015559895833333334, "learning_rate": 0.0001, "loss": 5.325, "loss/crossentropy": 2.3060061931610107, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.150724358856678, "step": 19748 }, { "epoch": 0.6171875, "grad_norm": 3.21875, "grad_norm_var": 0.019986979166666665, "learning_rate": 0.0001, "loss": 5.96, "loss/crossentropy": 2.808081030845642, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16831664741039276, "step": 19750 }, { "epoch": 0.61725, "grad_norm": 3.203125, "grad_norm_var": 0.020140584309895834, "learning_rate": 0.0001, "loss": 5.5748, "loss/crossentropy": 2.39535653591156, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16364478319883347, "step": 19752 }, { "epoch": 0.6173125, "grad_norm": 3.109375, "grad_norm_var": 0.032957967122395834, "learning_rate": 0.0001, "loss": 5.7134, "loss/crossentropy": 2.51510488986969, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16748429089784622, "step": 19754 }, { "epoch": 0.617375, "grad_norm": 3.234375, "grad_norm_var": 0.03564046223958333, "learning_rate": 0.0001, "loss": 5.7733, "loss/crossentropy": 2.5583337545394897, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16798176616430283, "step": 19756 }, { "epoch": 0.6174375, "grad_norm": 3.203125, "grad_norm_var": 0.03906962076822917, "learning_rate": 0.0001, "loss": 5.4938, "loss/crossentropy": 2.3841590881347656, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16447947174310684, "step": 19758 }, { "epoch": 0.6175, "grad_norm": 3.15625, "grad_norm_var": 0.040095011393229164, "learning_rate": 0.0001, "loss": 5.9566, "loss/crossentropy": 2.7379831075668335, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1753723919391632, "step": 19760 }, { "epoch": 0.6175625, "grad_norm": 2.890625, "grad_norm_var": 0.04627176920572917, "learning_rate": 0.0001, "loss": 5.475, "loss/crossentropy": 2.390742301940918, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15959543734788895, "step": 19762 }, { "epoch": 0.617625, "grad_norm": 3.109375, "grad_norm_var": 0.044661458333333334, "learning_rate": 0.0001, "loss": 5.532, "loss/crossentropy": 2.553749918937683, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1540743187069893, "step": 19764 }, { "epoch": 0.6176875, "grad_norm": 3.078125, "grad_norm_var": 0.03704020182291667, "learning_rate": 0.0001, "loss": 5.752, "loss/crossentropy": 2.596306085586548, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1663486734032631, "step": 19766 }, { "epoch": 0.61775, "grad_norm": 3.296875, "grad_norm_var": 0.037531534830729164, "learning_rate": 0.0001, "loss": 5.9308, "loss/crossentropy": 2.742679238319397, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1707676202058792, "step": 19768 }, { "epoch": 0.6178125, "grad_norm": 3.40625, "grad_norm_var": 0.0276519775390625, "learning_rate": 0.0001, "loss": 5.6956, "loss/crossentropy": 2.5400915145874023, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1655483990907669, "step": 19770 }, { "epoch": 0.617875, "grad_norm": 3.484375, "grad_norm_var": 0.028251139322916667, "learning_rate": 0.0001, "loss": 6.0722, "loss/crossentropy": 2.741970181465149, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18067777156829834, "step": 19772 }, { "epoch": 0.6179375, "grad_norm": 3.3125, "grad_norm_var": 0.028743489583333334, "learning_rate": 0.0001, "loss": 5.9508, "loss/crossentropy": 2.698033928871155, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17800699174404144, "step": 19774 }, { "epoch": 0.618, "grad_norm": 3.296875, "grad_norm_var": 0.025121053059895832, "learning_rate": 0.0001, "loss": 6.1399, "loss/crossentropy": 2.8511316776275635, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.178090900182724, "step": 19776 }, { "epoch": 0.6180625, "grad_norm": 3.09375, "grad_norm_var": 0.024128214518229166, "learning_rate": 0.0001, "loss": 5.1792, "loss/crossentropy": 2.223225235939026, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15301509201526642, "step": 19778 }, { "epoch": 0.618125, "grad_norm": 3.0625, "grad_norm_var": 0.02291259765625, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.5856436491012573, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16958042234182358, "step": 19780 }, { "epoch": 0.6181875, "grad_norm": 3.578125, "grad_norm_var": 0.03052978515625, "learning_rate": 0.0001, "loss": 5.6399, "loss/crossentropy": 2.5239371061325073, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16628500074148178, "step": 19782 }, { "epoch": 0.61825, "grad_norm": 2.96875, "grad_norm_var": 0.037450154622395836, "learning_rate": 0.0001, "loss": 5.5843, "loss/crossentropy": 2.4954280853271484, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16435730457305908, "step": 19784 }, { "epoch": 0.6183125, "grad_norm": 3.15625, "grad_norm_var": 0.03564046223958333, "learning_rate": 0.0001, "loss": 5.9371, "loss/crossentropy": 2.6663039922714233, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17590922117233276, "step": 19786 }, { "epoch": 0.618375, "grad_norm": 2.84375, "grad_norm_var": 0.0433258056640625, "learning_rate": 0.0001, "loss": 5.8278, "loss/crossentropy": 2.6265984773635864, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17128776758909225, "step": 19788 }, { "epoch": 0.6184375, "grad_norm": 3.5625, "grad_norm_var": 0.07093098958333334, "learning_rate": 0.0001, "loss": 5.8409, "loss/crossentropy": 2.5903788805007935, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17270878702402115, "step": 19790 }, { "epoch": 0.6185, "grad_norm": 2.78125, "grad_norm_var": 0.08190816243489583, "learning_rate": 0.0001, "loss": 5.3253, "loss/crossentropy": 2.3577980995178223, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14909353852272034, "step": 19792 }, { "epoch": 0.6185625, "grad_norm": 3.078125, "grad_norm_var": 0.0795562744140625, "learning_rate": 0.0001, "loss": 5.9199, "loss/crossentropy": 2.7423676252365112, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16814511269330978, "step": 19794 }, { "epoch": 0.618625, "grad_norm": 2.96875, "grad_norm_var": 0.080517578125, "learning_rate": 0.0001, "loss": 5.51, "loss/crossentropy": 2.4029040336608887, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16579217463731766, "step": 19796 }, { "epoch": 0.6186875, "grad_norm": 3.25, "grad_norm_var": 0.07124735514322916, "learning_rate": 0.0001, "loss": 5.8219, "loss/crossentropy": 2.6036036014556885, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17144262790679932, "step": 19798 }, { "epoch": 0.61875, "grad_norm": 3.203125, "grad_norm_var": 0.06711324055989583, "learning_rate": 0.0001, "loss": 5.8348, "loss/crossentropy": 2.702506184577942, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16400794684886932, "step": 19800 }, { "epoch": 0.6188125, "grad_norm": 2.921875, "grad_norm_var": 0.07254130045572917, "learning_rate": 0.0001, "loss": 5.8127, "loss/crossentropy": 2.6946845054626465, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16531235724687576, "step": 19802 }, { "epoch": 0.618875, "grad_norm": 3.28125, "grad_norm_var": 0.061335245768229164, "learning_rate": 0.0001, "loss": 5.5924, "loss/crossentropy": 2.483719825744629, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1651698350906372, "step": 19804 }, { "epoch": 0.6189375, "grad_norm": 3.203125, "grad_norm_var": 0.0256744384765625, "learning_rate": 0.0001, "loss": 5.5088, "loss/crossentropy": 2.47348690032959, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15900491923093796, "step": 19806 }, { "epoch": 0.619, "grad_norm": 3.46875, "grad_norm_var": 0.03137613932291667, "learning_rate": 0.0001, "loss": 6.1446, "loss/crossentropy": 2.730509877204895, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.18399201333522797, "step": 19808 }, { "epoch": 0.6190625, "grad_norm": 3.140625, "grad_norm_var": 0.030394490559895834, "learning_rate": 0.0001, "loss": 5.4351, "loss/crossentropy": 2.3951356410980225, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16024568676948547, "step": 19810 }, { "epoch": 0.619125, "grad_norm": 2.90625, "grad_norm_var": 0.0328765869140625, "learning_rate": 0.0001, "loss": 5.8967, "loss/crossentropy": 2.689175009727478, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1727081760764122, "step": 19812 }, { "epoch": 0.6191875, "grad_norm": 3.0625, "grad_norm_var": 0.03483784993489583, "learning_rate": 0.0001, "loss": 5.7387, "loss/crossentropy": 2.6214596033096313, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16562539339065552, "step": 19814 }, { "epoch": 0.61925, "grad_norm": 3.015625, "grad_norm_var": 0.03827718098958333, "learning_rate": 0.0001, "loss": 5.4334, "loss/crossentropy": 2.4262090921401978, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15345712006092072, "step": 19816 }, { "epoch": 0.6193125, "grad_norm": 3.28125, "grad_norm_var": 0.03860270182291667, "learning_rate": 0.0001, "loss": 5.8901, "loss/crossentropy": 2.733052372932434, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16805174201726913, "step": 19818 }, { "epoch": 0.619375, "grad_norm": 3.125, "grad_norm_var": 0.0427886962890625, "learning_rate": 0.0001, "loss": 5.5381, "loss/crossentropy": 2.5061721801757812, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1559237688779831, "step": 19820 }, { "epoch": 0.6194375, "grad_norm": 3.4375, "grad_norm_var": 0.058154296875, "learning_rate": 0.0001, "loss": 5.5695, "loss/crossentropy": 2.4504482746124268, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16463561356067657, "step": 19822 }, { "epoch": 0.6195, "grad_norm": 3.328125, "grad_norm_var": 0.045068359375, "learning_rate": 0.0001, "loss": 5.9245, "loss/crossentropy": 2.700040817260742, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1732223555445671, "step": 19824 }, { "epoch": 0.6195625, "grad_norm": 3.171875, "grad_norm_var": 0.0473052978515625, "learning_rate": 0.0001, "loss": 5.6389, "loss/crossentropy": 2.5035756826400757, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16705185174942017, "step": 19826 }, { "epoch": 0.619625, "grad_norm": 3.21875, "grad_norm_var": 0.0452545166015625, "learning_rate": 0.0001, "loss": 5.7924, "loss/crossentropy": 2.6258389949798584, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1697773039340973, "step": 19828 }, { "epoch": 0.6196875, "grad_norm": 3.0625, "grad_norm_var": 0.0395172119140625, "learning_rate": 0.0001, "loss": 5.7562, "loss/crossentropy": 2.574925184249878, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.170865960419178, "step": 19830 }, { "epoch": 0.61975, "grad_norm": 3.484375, "grad_norm_var": 0.0346099853515625, "learning_rate": 0.0001, "loss": 5.5627, "loss/crossentropy": 2.390936851501465, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16834847629070282, "step": 19832 }, { "epoch": 0.6198125, "grad_norm": 3.28125, "grad_norm_var": 0.0341796875, "learning_rate": 0.0001, "loss": 5.7697, "loss/crossentropy": 2.6261430978775024, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16592053323984146, "step": 19834 }, { "epoch": 0.619875, "grad_norm": 3.34375, "grad_norm_var": 0.02789306640625, "learning_rate": 0.0001, "loss": 5.6812, "loss/crossentropy": 2.529661774635315, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17023325711488724, "step": 19836 }, { "epoch": 0.6199375, "grad_norm": 2.671875, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 5.6216, "loss/crossentropy": 2.5542296171188354, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16259898990392685, "step": 19838 }, { "epoch": 0.62, "grad_norm": 3.328125, "grad_norm_var": 0.053446451822916664, "learning_rate": 0.0001, "loss": 5.7378, "loss/crossentropy": 2.6376454830169678, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1635262370109558, "step": 19840 }, { "epoch": 0.6200625, "grad_norm": 3.40625, "grad_norm_var": 0.060480753580729164, "learning_rate": 0.0001, "loss": 5.6424, "loss/crossentropy": 2.4937418699264526, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16760383546352386, "step": 19842 }, { "epoch": 0.620125, "grad_norm": 3.25, "grad_norm_var": 0.06047261555989583, "learning_rate": 0.0001, "loss": 5.5447, "loss/crossentropy": 2.4872806072235107, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15809065103530884, "step": 19844 }, { "epoch": 0.6201875, "grad_norm": 3.09375, "grad_norm_var": 0.0618316650390625, "learning_rate": 0.0001, "loss": 5.5921, "loss/crossentropy": 2.5492920875549316, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15701929479837418, "step": 19846 }, { "epoch": 0.62025, "grad_norm": 2.890625, "grad_norm_var": 0.05341389973958333, "learning_rate": 0.0001, "loss": 5.5839, "loss/crossentropy": 2.480614185333252, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16267366707324982, "step": 19848 }, { "epoch": 0.6203125, "grad_norm": 2.921875, "grad_norm_var": 0.05015869140625, "learning_rate": 0.0001, "loss": 5.5384, "loss/crossentropy": 2.5023187398910522, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15985383093357086, "step": 19850 }, { "epoch": 0.620375, "grad_norm": 2.984375, "grad_norm_var": 0.05698954264322917, "learning_rate": 0.0001, "loss": 5.5993, "loss/crossentropy": 2.4145087003707886, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16769680380821228, "step": 19852 }, { "epoch": 0.6204375, "grad_norm": 2.90625, "grad_norm_var": 0.049925740559895834, "learning_rate": 0.0001, "loss": 5.6951, "loss/crossentropy": 2.605020761489868, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16486431658267975, "step": 19854 }, { "epoch": 0.6205, "grad_norm": 3.265625, "grad_norm_var": 0.03661702473958333, "learning_rate": 0.0001, "loss": 5.9839, "loss/crossentropy": 2.7632559537887573, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1740158125758171, "step": 19856 }, { "epoch": 0.6205625, "grad_norm": 3.046875, "grad_norm_var": 0.026741536458333333, "learning_rate": 0.0001, "loss": 5.6185, "loss/crossentropy": 2.530821681022644, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1599380522966385, "step": 19858 }, { "epoch": 0.620625, "grad_norm": 3.109375, "grad_norm_var": 0.024983723958333332, "learning_rate": 0.0001, "loss": 5.6764, "loss/crossentropy": 2.5637917518615723, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1624293476343155, "step": 19860 }, { "epoch": 0.6206875, "grad_norm": 3.015625, "grad_norm_var": 0.025130208333333334, "learning_rate": 0.0001, "loss": 5.6027, "loss/crossentropy": 2.503827214241028, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16106384992599487, "step": 19862 }, { "epoch": 0.62075, "grad_norm": 3.109375, "grad_norm_var": 0.0257720947265625, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.6267412900924683, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16248583793640137, "step": 19864 }, { "epoch": 0.6208125, "grad_norm": 3.109375, "grad_norm_var": 0.0237945556640625, "learning_rate": 0.0001, "loss": 5.7132, "loss/crossentropy": 2.620882511138916, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16196326166391373, "step": 19866 }, { "epoch": 0.620875, "grad_norm": 2.984375, "grad_norm_var": 0.014207967122395833, "learning_rate": 0.0001, "loss": 5.5756, "loss/crossentropy": 2.514933943748474, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15919151902198792, "step": 19868 }, { "epoch": 0.6209375, "grad_norm": 3.1875, "grad_norm_var": 0.011937459309895834, "learning_rate": 0.0001, "loss": 5.509, "loss/crossentropy": 2.417167901992798, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16543804854154587, "step": 19870 }, { "epoch": 0.621, "grad_norm": 2.84375, "grad_norm_var": 0.0136383056640625, "learning_rate": 0.0001, "loss": 5.5005, "loss/crossentropy": 2.4885376691818237, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15823006629943848, "step": 19872 }, { "epoch": 0.6210625, "grad_norm": 3.125, "grad_norm_var": 0.013801066080729167, "learning_rate": 0.0001, "loss": 5.8575, "loss/crossentropy": 2.6915894746780396, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.17362503707408905, "step": 19874 }, { "epoch": 0.621125, "grad_norm": 3.15625, "grad_norm_var": 0.041239420572916664, "learning_rate": 0.0001, "loss": 5.6771, "loss/crossentropy": 2.500933289527893, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17113670706748962, "step": 19876 }, { "epoch": 0.6211875, "grad_norm": 3.3125, "grad_norm_var": 0.044417317708333334, "learning_rate": 0.0001, "loss": 5.5976, "loss/crossentropy": 2.4560399055480957, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16571440547704697, "step": 19878 }, { "epoch": 0.62125, "grad_norm": 2.984375, "grad_norm_var": 0.04436848958333333, "learning_rate": 0.0001, "loss": 5.732, "loss/crossentropy": 2.667479157447815, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16348617523908615, "step": 19880 }, { "epoch": 0.6213125, "grad_norm": 3.328125, "grad_norm_var": 0.04750874837239583, "learning_rate": 0.0001, "loss": 5.7578, "loss/crossentropy": 2.521763563156128, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17164885997772217, "step": 19882 }, { "epoch": 0.621375, "grad_norm": 3.15625, "grad_norm_var": 0.046605428059895836, "learning_rate": 0.0001, "loss": 5.4672, "loss/crossentropy": 2.3879551887512207, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16104530543088913, "step": 19884 }, { "epoch": 0.6214375, "grad_norm": 3.015625, "grad_norm_var": 0.0566314697265625, "learning_rate": 0.0001, "loss": 5.4134, "loss/crossentropy": 2.279152035713196, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16108424216508865, "step": 19886 }, { "epoch": 0.6215, "grad_norm": 2.953125, "grad_norm_var": 0.04547119140625, "learning_rate": 0.0001, "loss": 5.7432, "loss/crossentropy": 2.5680145025253296, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16946741938591003, "step": 19888 }, { "epoch": 0.6215625, "grad_norm": 3.3125, "grad_norm_var": 0.0491851806640625, "learning_rate": 0.0001, "loss": 5.8043, "loss/crossentropy": 2.530237555503845, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17584840953350067, "step": 19890 }, { "epoch": 0.621625, "grad_norm": 2.953125, "grad_norm_var": 0.03766276041666667, "learning_rate": 0.0001, "loss": 5.6202, "loss/crossentropy": 2.5808955430984497, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1578410044312477, "step": 19892 }, { "epoch": 0.6216875, "grad_norm": 2.96875, "grad_norm_var": 0.038960774739583336, "learning_rate": 0.0001, "loss": 5.8825, "loss/crossentropy": 2.767555594444275, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16266612708568573, "step": 19894 }, { "epoch": 0.62175, "grad_norm": 3.34375, "grad_norm_var": 0.037886555989583334, "learning_rate": 0.0001, "loss": 5.9589, "loss/crossentropy": 2.7096651792526245, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17609944194555283, "step": 19896 }, { "epoch": 0.6218125, "grad_norm": 2.875, "grad_norm_var": 0.04159749348958333, "learning_rate": 0.0001, "loss": 5.1233, "loss/crossentropy": 2.2056552171707153, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14371608197689056, "step": 19898 }, { "epoch": 0.621875, "grad_norm": 3.046875, "grad_norm_var": 0.0403228759765625, "learning_rate": 0.0001, "loss": 5.8057, "loss/crossentropy": 2.621985077857971, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16837255656719208, "step": 19900 }, { "epoch": 0.6219375, "grad_norm": 2.859375, "grad_norm_var": 0.03504231770833333, "learning_rate": 0.0001, "loss": 5.5809, "loss/crossentropy": 2.5222402811050415, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16133303195238113, "step": 19902 }, { "epoch": 0.622, "grad_norm": 3.109375, "grad_norm_var": 0.030810546875, "learning_rate": 0.0001, "loss": 5.6809, "loss/crossentropy": 2.5464282035827637, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16734935343265533, "step": 19904 }, { "epoch": 0.6220625, "grad_norm": 3.1875, "grad_norm_var": 0.01968994140625, "learning_rate": 0.0001, "loss": 5.9077, "loss/crossentropy": 2.7067201137542725, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17049284279346466, "step": 19906 }, { "epoch": 0.622125, "grad_norm": 2.9375, "grad_norm_var": 0.021076456705729166, "learning_rate": 0.0001, "loss": 5.5868, "loss/crossentropy": 2.536835789680481, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16202454268932343, "step": 19908 }, { "epoch": 0.6221875, "grad_norm": 2.9375, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 5.8591, "loss/crossentropy": 2.721961736679077, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16723129153251648, "step": 19910 }, { "epoch": 0.62225, "grad_norm": 3.015625, "grad_norm_var": 0.016877237955729166, "learning_rate": 0.0001, "loss": 5.5046, "loss/crossentropy": 2.4232596158981323, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15891293436288834, "step": 19912 }, { "epoch": 0.6223125, "grad_norm": 2.859375, "grad_norm_var": 0.024605305989583333, "learning_rate": 0.0001, "loss": 5.6248, "loss/crossentropy": 2.5254926681518555, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16344477981328964, "step": 19914 }, { "epoch": 0.622375, "grad_norm": 3.703125, "grad_norm_var": 0.04690755208333333, "learning_rate": 0.0001, "loss": 5.914, "loss/crossentropy": 2.6902559995651245, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17237311601638794, "step": 19916 }, { "epoch": 0.6224375, "grad_norm": 3.765625, "grad_norm_var": 0.07075907389322916, "learning_rate": 0.0001, "loss": 6.0173, "loss/crossentropy": 2.7229477167129517, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.18138906359672546, "step": 19918 }, { "epoch": 0.6225, "grad_norm": 3.140625, "grad_norm_var": 0.07154541015625, "learning_rate": 0.0001, "loss": 5.712, "loss/crossentropy": 2.527737617492676, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17193929851055145, "step": 19920 }, { "epoch": 0.6225625, "grad_norm": 3.203125, "grad_norm_var": 0.07968343098958333, "learning_rate": 0.0001, "loss": 5.5652, "loss/crossentropy": 2.5005797147750854, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.161932535469532, "step": 19922 }, { "epoch": 0.622625, "grad_norm": 2.90625, "grad_norm_var": 0.07846577962239583, "learning_rate": 0.0001, "loss": 5.5449, "loss/crossentropy": 2.5181914567947388, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15969933569431305, "step": 19924 }, { "epoch": 0.6226875, "grad_norm": 3.296875, "grad_norm_var": 0.07704671223958333, "learning_rate": 0.0001, "loss": 5.7643, "loss/crossentropy": 2.6106055974960327, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16771600395441055, "step": 19926 }, { "epoch": 0.62275, "grad_norm": 2.9375, "grad_norm_var": 0.0793365478515625, "learning_rate": 0.0001, "loss": 5.6237, "loss/crossentropy": 2.566754460334778, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15998917818069458, "step": 19928 }, { "epoch": 0.6228125, "grad_norm": 3.171875, "grad_norm_var": 0.06936442057291667, "learning_rate": 0.0001, "loss": 5.6092, "loss/crossentropy": 2.487126350402832, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16259347647428513, "step": 19930 }, { "epoch": 0.622875, "grad_norm": 3.046875, "grad_norm_var": 0.04719136555989583, "learning_rate": 0.0001, "loss": 5.7003, "loss/crossentropy": 2.6059796810150146, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16333814710378647, "step": 19932 }, { "epoch": 0.6229375, "grad_norm": 3.09375, "grad_norm_var": 0.018159993489583335, "learning_rate": 0.0001, "loss": 5.8046, "loss/crossentropy": 2.65829861164093, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16931987553834915, "step": 19934 }, { "epoch": 0.623, "grad_norm": 3.0625, "grad_norm_var": 0.016145833333333335, "learning_rate": 0.0001, "loss": 5.5101, "loss/crossentropy": 2.448823094367981, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1596415713429451, "step": 19936 }, { "epoch": 0.6230625, "grad_norm": 2.90625, "grad_norm_var": 0.012450154622395833, "learning_rate": 0.0001, "loss": 5.3862, "loss/crossentropy": 2.366100311279297, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16021539270877838, "step": 19938 }, { "epoch": 0.623125, "grad_norm": 3.109375, "grad_norm_var": 0.0109375, "learning_rate": 0.0001, "loss": 5.7748, "loss/crossentropy": 2.6478099822998047, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16738468408584595, "step": 19940 }, { "epoch": 0.6231875, "grad_norm": 3.03125, "grad_norm_var": 0.0072265625, "learning_rate": 0.0001, "loss": 5.3404, "loss/crossentropy": 2.3103270530700684, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1545696258544922, "step": 19942 }, { "epoch": 0.62325, "grad_norm": 3.125, "grad_norm_var": 0.006004842122395834, "learning_rate": 0.0001, "loss": 5.681, "loss/crossentropy": 2.5023328065872192, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16825802624225616, "step": 19944 }, { "epoch": 0.6233125, "grad_norm": 3.125, "grad_norm_var": 0.006184895833333333, "learning_rate": 0.0001, "loss": 5.6783, "loss/crossentropy": 2.498602867126465, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16952957212924957, "step": 19946 }, { "epoch": 0.623375, "grad_norm": 3.40625, "grad_norm_var": 0.012108357747395833, "learning_rate": 0.0001, "loss": 5.7909, "loss/crossentropy": 2.64833664894104, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16659605503082275, "step": 19948 }, { "epoch": 0.6234375, "grad_norm": 3.015625, "grad_norm_var": 0.0127593994140625, "learning_rate": 0.0001, "loss": 5.2735, "loss/crossentropy": 2.2905895709991455, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15024419128894806, "step": 19950 }, { "epoch": 0.6235, "grad_norm": 3.140625, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 5.6007, "loss/crossentropy": 2.5294933319091797, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16024452447891235, "step": 19952 }, { "epoch": 0.6235625, "grad_norm": 3.296875, "grad_norm_var": 0.030659993489583332, "learning_rate": 0.0001, "loss": 5.9605, "loss/crossentropy": 2.6646621227264404, "loss/hidden": 1.5703125, "loss/jsd": 0.0, "loss/logits": 0.17255229502916336, "step": 19954 }, { "epoch": 0.623625, "grad_norm": 3.03125, "grad_norm_var": 0.03189697265625, "learning_rate": 0.0001, "loss": 6.022, "loss/crossentropy": 2.806672692298889, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17466050386428833, "step": 19956 }, { "epoch": 0.6236875, "grad_norm": 3.15625, "grad_norm_var": 0.03511454264322917, "learning_rate": 0.0001, "loss": 5.4811, "loss/crossentropy": 2.452077865600586, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15836618095636368, "step": 19958 }, { "epoch": 0.62375, "grad_norm": 3.4375, "grad_norm_var": 0.038914998372395836, "learning_rate": 0.0001, "loss": 5.9342, "loss/crossentropy": 2.6591310501098633, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.18024230748414993, "step": 19960 }, { "epoch": 0.6238125, "grad_norm": 2.71875, "grad_norm_var": 0.05577799479166667, "learning_rate": 0.0001, "loss": 5.6501, "loss/crossentropy": 2.5616250038146973, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16196832805871964, "step": 19962 }, { "epoch": 0.623875, "grad_norm": 3.0625, "grad_norm_var": 0.052887980143229166, "learning_rate": 0.0001, "loss": 5.7081, "loss/crossentropy": 2.468526005744934, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17317667603492737, "step": 19964 }, { "epoch": 0.6239375, "grad_norm": 2.9375, "grad_norm_var": 0.10784098307291666, "learning_rate": 0.0001, "loss": 5.8608, "loss/crossentropy": 2.526008367538452, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18347448855638504, "step": 19966 }, { "epoch": 0.624, "grad_norm": 3.40625, "grad_norm_var": 0.125048828125, "learning_rate": 0.0001, "loss": 6.0458, "loss/crossentropy": 2.776844620704651, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1768995299935341, "step": 19968 }, { "epoch": 0.6240625, "grad_norm": 3.203125, "grad_norm_var": 0.11021728515625, "learning_rate": 0.0001, "loss": 5.6742, "loss/crossentropy": 2.5258538722991943, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16835208982229233, "step": 19970 }, { "epoch": 0.624125, "grad_norm": 3.125, "grad_norm_var": 0.1097808837890625, "learning_rate": 0.0001, "loss": 5.7995, "loss/crossentropy": 2.632531762123108, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16786815226078033, "step": 19972 }, { "epoch": 0.6241875, "grad_norm": 3.03125, "grad_norm_var": 0.10949605305989583, "learning_rate": 0.0001, "loss": 5.8556, "loss/crossentropy": 2.727881669998169, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1678517833352089, "step": 19974 }, { "epoch": 0.62425, "grad_norm": 3.125, "grad_norm_var": 0.1069732666015625, "learning_rate": 0.0001, "loss": 5.5128, "loss/crossentropy": 2.3756444454193115, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16332362592220306, "step": 19976 }, { "epoch": 0.6243125, "grad_norm": 3.140625, "grad_norm_var": 0.08775634765625, "learning_rate": 0.0001, "loss": 5.5475, "loss/crossentropy": 2.406875252723694, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16875281929969788, "step": 19978 }, { "epoch": 0.624375, "grad_norm": 3.078125, "grad_norm_var": 0.0881988525390625, "learning_rate": 0.0001, "loss": 5.6718, "loss/crossentropy": 2.52127742767334, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16700471192598343, "step": 19980 }, { "epoch": 0.6244375, "grad_norm": 2.890625, "grad_norm_var": 0.039453125, "learning_rate": 0.0001, "loss": 5.741, "loss/crossentropy": 2.6321603059768677, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16439685225486755, "step": 19982 }, { "epoch": 0.6245, "grad_norm": 3.25, "grad_norm_var": 0.017243448893229166, "learning_rate": 0.0001, "loss": 5.8766, "loss/crossentropy": 2.636662721633911, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17282648384571075, "step": 19984 }, { "epoch": 0.6245625, "grad_norm": 3.0625, "grad_norm_var": 0.017215983072916666, "learning_rate": 0.0001, "loss": 6.0626, "loss/crossentropy": 2.7927119731903076, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1797192469239235, "step": 19986 }, { "epoch": 0.624625, "grad_norm": 3.078125, "grad_norm_var": 0.018701171875, "learning_rate": 0.0001, "loss": 5.7114, "loss/crossentropy": 2.577378988265991, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16769851744174957, "step": 19988 }, { "epoch": 0.6246875, "grad_norm": 2.984375, "grad_norm_var": 0.019661458333333333, "learning_rate": 0.0001, "loss": 5.28, "loss/crossentropy": 2.3434277772903442, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15108251571655273, "step": 19990 }, { "epoch": 0.62475, "grad_norm": 3.046875, "grad_norm_var": 0.019880167643229165, "learning_rate": 0.0001, "loss": 5.5554, "loss/crossentropy": 2.4401341676712036, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16308779269456863, "step": 19992 }, { "epoch": 0.6248125, "grad_norm": 3.09375, "grad_norm_var": 0.021158854166666668, "learning_rate": 0.0001, "loss": 5.6507, "loss/crossentropy": 2.5970619916915894, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15927314013242722, "step": 19994 }, { "epoch": 0.624875, "grad_norm": 3.078125, "grad_norm_var": 0.021317545572916666, "learning_rate": 0.0001, "loss": 5.7013, "loss/crossentropy": 2.5487600564956665, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1730649545788765, "step": 19996 }, { "epoch": 0.6249375, "grad_norm": 2.703125, "grad_norm_var": 0.030085245768229168, "learning_rate": 0.0001, "loss": 5.2125, "loss/crossentropy": 2.2816121578216553, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1481662392616272, "step": 19998 }, { "epoch": 0.625, "grad_norm": 3.03125, "grad_norm_var": 0.017333984375, "learning_rate": 0.0001, "loss": 5.8688, "loss/crossentropy": 2.6770824193954468, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16878335177898407, "step": 20000 }, { "epoch": 0.6250625, "grad_norm": 3.203125, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 5.6288, "loss/crossentropy": 2.4984673261642456, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16225223988294601, "step": 20002 }, { "epoch": 0.625125, "grad_norm": 2.890625, "grad_norm_var": 0.0303375244140625, "learning_rate": 0.0001, "loss": 6.0646, "loss/crossentropy": 2.8133655786514282, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1762980967760086, "step": 20004 }, { "epoch": 0.6251875, "grad_norm": 2.9375, "grad_norm_var": 0.03092041015625, "learning_rate": 0.0001, "loss": 5.4517, "loss/crossentropy": 2.4188199043273926, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16109905391931534, "step": 20006 }, { "epoch": 0.62525, "grad_norm": 3.140625, "grad_norm_var": 0.036498006184895834, "learning_rate": 0.0001, "loss": 5.729, "loss/crossentropy": 2.5297917127609253, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16523268818855286, "step": 20008 }, { "epoch": 0.6253125, "grad_norm": 2.875, "grad_norm_var": 0.037516276041666664, "learning_rate": 0.0001, "loss": 5.8529, "loss/crossentropy": 2.678296208381653, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16668106615543365, "step": 20010 }, { "epoch": 0.625375, "grad_norm": 3.09375, "grad_norm_var": 0.04182027180989583, "learning_rate": 0.0001, "loss": 5.8357, "loss/crossentropy": 2.7086238861083984, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1666112244129181, "step": 20012 }, { "epoch": 0.6254375, "grad_norm": 2.859375, "grad_norm_var": 0.054032389322916666, "learning_rate": 0.0001, "loss": 5.7439, "loss/crossentropy": 2.468640089035034, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17674177885055542, "step": 20014 }, { "epoch": 0.6255, "grad_norm": 2.984375, "grad_norm_var": 0.054621378580729164, "learning_rate": 0.0001, "loss": 5.5136, "loss/crossentropy": 2.4705368280410767, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15977267920970917, "step": 20016 }, { "epoch": 0.6255625, "grad_norm": 3.875, "grad_norm_var": 0.08739827473958334, "learning_rate": 0.0001, "loss": 5.725, "loss/crossentropy": 2.565088629722595, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16872488707304, "step": 20018 }, { "epoch": 0.625625, "grad_norm": 3.0625, "grad_norm_var": 0.07919514973958333, "learning_rate": 0.0001, "loss": 5.8745, "loss/crossentropy": 2.7002429962158203, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1697673499584198, "step": 20020 }, { "epoch": 0.6256875, "grad_norm": 3.09375, "grad_norm_var": 0.07160542805989584, "learning_rate": 0.0001, "loss": 5.8948, "loss/crossentropy": 2.6865261793136597, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17434507608413696, "step": 20022 }, { "epoch": 0.62575, "grad_norm": 3.265625, "grad_norm_var": 0.07259114583333333, "learning_rate": 0.0001, "loss": 5.5557, "loss/crossentropy": 2.4680683612823486, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16149750351905823, "step": 20024 }, { "epoch": 0.6258125, "grad_norm": 2.8125, "grad_norm_var": 0.07628580729166666, "learning_rate": 0.0001, "loss": 5.5424, "loss/crossentropy": 2.5413738489151, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15869881212711334, "step": 20026 }, { "epoch": 0.625875, "grad_norm": 3.140625, "grad_norm_var": 0.07774149576822917, "learning_rate": 0.0001, "loss": 5.6263, "loss/crossentropy": 2.5399714708328247, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16448822617530823, "step": 20028 }, { "epoch": 0.6259375, "grad_norm": 3.015625, "grad_norm_var": 0.053938802083333334, "learning_rate": 0.0001, "loss": 5.8731, "loss/crossentropy": 2.6865917444229126, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17138946801424026, "step": 20030 }, { "epoch": 0.626, "grad_norm": 2.78125, "grad_norm_var": 0.06081441243489583, "learning_rate": 0.0001, "loss": 5.5415, "loss/crossentropy": 2.536372423171997, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15989230573177338, "step": 20032 }, { "epoch": 0.6260625, "grad_norm": 3.296875, "grad_norm_var": 0.021630859375, "learning_rate": 0.0001, "loss": 5.6821, "loss/crossentropy": 2.553732395172119, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.169091135263443, "step": 20034 }, { "epoch": 0.626125, "grad_norm": 2.953125, "grad_norm_var": 0.0237213134765625, "learning_rate": 0.0001, "loss": 5.7535, "loss/crossentropy": 2.5982415676116943, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1686534658074379, "step": 20036 }, { "epoch": 0.6261875, "grad_norm": 2.859375, "grad_norm_var": 0.0248046875, "learning_rate": 0.0001, "loss": 5.4695, "loss/crossentropy": 2.371529698371887, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1625317856669426, "step": 20038 }, { "epoch": 0.62625, "grad_norm": 3.125, "grad_norm_var": 0.0400299072265625, "learning_rate": 0.0001, "loss": 5.8379, "loss/crossentropy": 2.5744885206222534, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1767367273569107, "step": 20040 }, { "epoch": 0.6263125, "grad_norm": 3.234375, "grad_norm_var": 0.03654683430989583, "learning_rate": 0.0001, "loss": 5.5845, "loss/crossentropy": 2.5264803171157837, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16009458154439926, "step": 20042 }, { "epoch": 0.626375, "grad_norm": 3.140625, "grad_norm_var": 0.034716796875, "learning_rate": 0.0001, "loss": 5.651, "loss/crossentropy": 2.6154359579086304, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1543370485305786, "step": 20044 }, { "epoch": 0.6264375, "grad_norm": 3.078125, "grad_norm_var": 0.03862202962239583, "learning_rate": 0.0001, "loss": 5.3501, "loss/crossentropy": 2.4048666954040527, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14765171706676483, "step": 20046 }, { "epoch": 0.6265, "grad_norm": 3.0625, "grad_norm_var": 0.03323160807291667, "learning_rate": 0.0001, "loss": 6.1096, "loss/crossentropy": 2.896267533302307, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17211604118347168, "step": 20048 }, { "epoch": 0.6265625, "grad_norm": 3.09375, "grad_norm_var": 0.03186442057291667, "learning_rate": 0.0001, "loss": 5.5006, "loss/crossentropy": 2.4230915307998657, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16243375092744827, "step": 20050 }, { "epoch": 0.626625, "grad_norm": 3.234375, "grad_norm_var": 0.032275390625, "learning_rate": 0.0001, "loss": 6.0838, "loss/crossentropy": 2.8078731298446655, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17915435135364532, "step": 20052 }, { "epoch": 0.6266875, "grad_norm": 3.265625, "grad_norm_var": 0.028416951497395832, "learning_rate": 0.0001, "loss": 5.7891, "loss/crossentropy": 2.6495046615600586, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16747775673866272, "step": 20054 }, { "epoch": 0.62675, "grad_norm": 3.15625, "grad_norm_var": 0.015436808268229166, "learning_rate": 0.0001, "loss": 5.6559, "loss/crossentropy": 2.511838912963867, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16636407375335693, "step": 20056 }, { "epoch": 0.6268125, "grad_norm": 2.953125, "grad_norm_var": 0.01988525390625, "learning_rate": 0.0001, "loss": 5.6049, "loss/crossentropy": 2.5633102655410767, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15454931557178497, "step": 20058 }, { "epoch": 0.626875, "grad_norm": 3.1875, "grad_norm_var": 0.020048014322916665, "learning_rate": 0.0001, "loss": 5.7108, "loss/crossentropy": 2.6183091402053833, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16276396811008453, "step": 20060 }, { "epoch": 0.6269375, "grad_norm": 3.140625, "grad_norm_var": 0.018192545572916666, "learning_rate": 0.0001, "loss": 5.6007, "loss/crossentropy": 2.537225842475891, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1637696623802185, "step": 20062 }, { "epoch": 0.627, "grad_norm": 3.28125, "grad_norm_var": 0.019041951497395834, "learning_rate": 0.0001, "loss": 5.8561, "loss/crossentropy": 2.6517701148986816, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1719948723912239, "step": 20064 }, { "epoch": 0.6270625, "grad_norm": 3.109375, "grad_norm_var": 0.017137654622395835, "learning_rate": 0.0001, "loss": 5.4176, "loss/crossentropy": 2.3762134313583374, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1603914126753807, "step": 20066 }, { "epoch": 0.627125, "grad_norm": 3.328125, "grad_norm_var": 0.018001302083333334, "learning_rate": 0.0001, "loss": 5.8595, "loss/crossentropy": 2.690279245376587, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16887035220861435, "step": 20068 }, { "epoch": 0.6271875, "grad_norm": 2.90625, "grad_norm_var": 0.0216705322265625, "learning_rate": 0.0001, "loss": 5.4766, "loss/crossentropy": 2.4541242122650146, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1565433144569397, "step": 20070 }, { "epoch": 0.62725, "grad_norm": 3.171875, "grad_norm_var": 0.02451171875, "learning_rate": 0.0001, "loss": 5.7288, "loss/crossentropy": 2.5191383361816406, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17057938873767853, "step": 20072 }, { "epoch": 0.6273125, "grad_norm": 2.765625, "grad_norm_var": 0.026756795247395833, "learning_rate": 0.0001, "loss": 5.3662, "loss/crossentropy": 2.4036494493484497, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14859680831432343, "step": 20074 }, { "epoch": 0.627375, "grad_norm": 3.046875, "grad_norm_var": 0.030256144205729165, "learning_rate": 0.0001, "loss": 5.4422, "loss/crossentropy": 2.383970260620117, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16051417589187622, "step": 20076 }, { "epoch": 0.6274375, "grad_norm": 3.015625, "grad_norm_var": 0.029059855143229167, "learning_rate": 0.0001, "loss": 6.0372, "loss/crossentropy": 2.847605347633362, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1728636845946312, "step": 20078 }, { "epoch": 0.6275, "grad_norm": 2.8125, "grad_norm_var": 0.0399078369140625, "learning_rate": 0.0001, "loss": 5.3906, "loss/crossentropy": 2.459246516227722, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.14899127185344696, "step": 20080 }, { "epoch": 0.6275625, "grad_norm": 3.296875, "grad_norm_var": 0.0457427978515625, "learning_rate": 0.0001, "loss": 5.4739, "loss/crossentropy": 2.353138566017151, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1648089736700058, "step": 20082 }, { "epoch": 0.627625, "grad_norm": 3.015625, "grad_norm_var": 0.04138895670572917, "learning_rate": 0.0001, "loss": 5.3657, "loss/crossentropy": 2.3784090280532837, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15536832809448242, "step": 20084 }, { "epoch": 0.6276875, "grad_norm": 2.8125, "grad_norm_var": 0.03606363932291667, "learning_rate": 0.0001, "loss": 5.4472, "loss/crossentropy": 2.4753423929214478, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15421781688928604, "step": 20086 }, { "epoch": 0.62775, "grad_norm": 3.46875, "grad_norm_var": 0.043390909830729164, "learning_rate": 0.0001, "loss": 6.049, "loss/crossentropy": 2.8337109088897705, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17153087258338928, "step": 20088 }, { "epoch": 0.6278125, "grad_norm": 3.125, "grad_norm_var": 0.04280497233072917, "learning_rate": 0.0001, "loss": 5.7366, "loss/crossentropy": 2.600230097770691, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1687176153063774, "step": 20090 }, { "epoch": 0.627875, "grad_norm": 3.28125, "grad_norm_var": 0.045807902018229166, "learning_rate": 0.0001, "loss": 5.7429, "loss/crossentropy": 2.6203036308288574, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16265041381120682, "step": 20092 }, { "epoch": 0.6279375, "grad_norm": 3.09375, "grad_norm_var": 0.0502838134765625, "learning_rate": 0.0001, "loss": 5.4523, "loss/crossentropy": 2.459430456161499, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1570972502231598, "step": 20094 }, { "epoch": 0.628, "grad_norm": 2.9375, "grad_norm_var": 0.0337554931640625, "learning_rate": 0.0001, "loss": 5.7673, "loss/crossentropy": 2.6559654474258423, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16464993357658386, "step": 20096 }, { "epoch": 0.6280625, "grad_norm": 3.484375, "grad_norm_var": 0.04077860514322917, "learning_rate": 0.0001, "loss": 5.7831, "loss/crossentropy": 2.527703881263733, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17319798469543457, "step": 20098 }, { "epoch": 0.628125, "grad_norm": 3.1875, "grad_norm_var": 0.037923177083333336, "learning_rate": 0.0001, "loss": 5.8382, "loss/crossentropy": 2.5846521854400635, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17535626888275146, "step": 20100 }, { "epoch": 0.6281875, "grad_norm": 3.03125, "grad_norm_var": 0.027099609375, "learning_rate": 0.0001, "loss": 5.629, "loss/crossentropy": 2.563002824783325, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16090133786201477, "step": 20102 }, { "epoch": 0.62825, "grad_norm": 3.75, "grad_norm_var": 0.04318033854166667, "learning_rate": 0.0001, "loss": 6.3233, "loss/crossentropy": 2.8549082279205322, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.19176553934812546, "step": 20104 }, { "epoch": 0.6283125, "grad_norm": 3.328125, "grad_norm_var": 0.0426422119140625, "learning_rate": 0.0001, "loss": 5.7861, "loss/crossentropy": 2.5958460569381714, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17019325494766235, "step": 20106 }, { "epoch": 0.628375, "grad_norm": 3.1875, "grad_norm_var": 0.04341532389322917, "learning_rate": 0.0001, "loss": 5.7901, "loss/crossentropy": 2.606391191482544, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1703239008784294, "step": 20108 }, { "epoch": 0.6284375, "grad_norm": 2.890625, "grad_norm_var": 0.06373697916666667, "learning_rate": 0.0001, "loss": 5.871, "loss/crossentropy": 2.832258701324463, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15621717274188995, "step": 20110 }, { "epoch": 0.6285, "grad_norm": 2.84375, "grad_norm_var": 0.07620035807291667, "learning_rate": 0.0001, "loss": 5.3623, "loss/crossentropy": 2.385128617286682, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15201535820960999, "step": 20112 }, { "epoch": 0.6285625, "grad_norm": 3.609375, "grad_norm_var": 0.08222554524739584, "learning_rate": 0.0001, "loss": 5.402, "loss/crossentropy": 2.350269317626953, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15869221836328506, "step": 20114 }, { "epoch": 0.628625, "grad_norm": 2.96875, "grad_norm_var": 0.08289388020833334, "learning_rate": 0.0001, "loss": 5.6134, "loss/crossentropy": 2.5118885040283203, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1621083915233612, "step": 20116 }, { "epoch": 0.6286875, "grad_norm": 3.09375, "grad_norm_var": 0.08203837076822916, "learning_rate": 0.0001, "loss": 5.6185, "loss/crossentropy": 2.5212897062301636, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1644095480442047, "step": 20118 }, { "epoch": 0.62875, "grad_norm": 2.984375, "grad_norm_var": 0.05614827473958333, "learning_rate": 0.0001, "loss": 5.6847, "loss/crossentropy": 2.5570132732391357, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1651124209165573, "step": 20120 }, { "epoch": 0.6288125, "grad_norm": 3.171875, "grad_norm_var": 0.05250244140625, "learning_rate": 0.0001, "loss": 5.2034, "loss/crossentropy": 2.152095317840576, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1582512930035591, "step": 20122 }, { "epoch": 0.628875, "grad_norm": 2.890625, "grad_norm_var": 0.04940999348958333, "learning_rate": 0.0001, "loss": 5.5825, "loss/crossentropy": 2.54826283454895, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1581067144870758, "step": 20124 }, { "epoch": 0.6289375, "grad_norm": 3.078125, "grad_norm_var": 0.043929036458333334, "learning_rate": 0.0001, "loss": 5.6659, "loss/crossentropy": 2.5693126916885376, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16122236102819443, "step": 20126 }, { "epoch": 0.629, "grad_norm": 3.390625, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 5.8873, "loss/crossentropy": 2.71217143535614, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17219892889261246, "step": 20128 }, { "epoch": 0.6290625, "grad_norm": 3.3125, "grad_norm_var": 0.025516764322916666, "learning_rate": 0.0001, "loss": 5.7538, "loss/crossentropy": 2.5143548250198364, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17784936726093292, "step": 20130 }, { "epoch": 0.629125, "grad_norm": 2.921875, "grad_norm_var": 0.026496378580729167, "learning_rate": 0.0001, "loss": 5.6382, "loss/crossentropy": 2.556102156639099, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16602304577827454, "step": 20132 }, { "epoch": 0.6291875, "grad_norm": 3.265625, "grad_norm_var": 0.0285064697265625, "learning_rate": 0.0001, "loss": 5.6289, "loss/crossentropy": 2.4836350679397583, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1688230112195015, "step": 20134 }, { "epoch": 0.62925, "grad_norm": 3.15625, "grad_norm_var": 0.03432515462239583, "learning_rate": 0.0001, "loss": 5.8909, "loss/crossentropy": 2.650188684463501, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17485381662845612, "step": 20136 }, { "epoch": 0.6293125, "grad_norm": 3.234375, "grad_norm_var": 0.033642578125, "learning_rate": 0.0001, "loss": 5.8458, "loss/crossentropy": 2.7010639905929565, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16838444769382477, "step": 20138 }, { "epoch": 0.629375, "grad_norm": 3.265625, "grad_norm_var": 0.030594889322916666, "learning_rate": 0.0001, "loss": 6.0277, "loss/crossentropy": 2.6885000467300415, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18314369022846222, "step": 20140 }, { "epoch": 0.6294375, "grad_norm": 2.90625, "grad_norm_var": 0.02880859375, "learning_rate": 0.0001, "loss": 5.4761, "loss/crossentropy": 2.3749966621398926, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15854676812887192, "step": 20142 }, { "epoch": 0.6295, "grad_norm": 3.1875, "grad_norm_var": 0.025340779622395834, "learning_rate": 0.0001, "loss": 5.7075, "loss/crossentropy": 2.4974864721298218, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17334265261888504, "step": 20144 }, { "epoch": 0.6295625, "grad_norm": 3.328125, "grad_norm_var": 0.027318318684895832, "learning_rate": 0.0001, "loss": 5.7967, "loss/crossentropy": 2.662586212158203, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.17200879007577896, "step": 20146 }, { "epoch": 0.629625, "grad_norm": 3.1875, "grad_norm_var": 0.022216796875, "learning_rate": 0.0001, "loss": 6.091, "loss/crossentropy": 2.8375940322875977, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17338711023330688, "step": 20148 }, { "epoch": 0.6296875, "grad_norm": 2.96875, "grad_norm_var": 0.025748697916666667, "learning_rate": 0.0001, "loss": 5.5337, "loss/crossentropy": 2.515775442123413, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15882223844528198, "step": 20150 }, { "epoch": 0.62975, "grad_norm": 3.046875, "grad_norm_var": 0.025926717122395835, "learning_rate": 0.0001, "loss": 5.4365, "loss/crossentropy": 2.438557267189026, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15956391394138336, "step": 20152 }, { "epoch": 0.6298125, "grad_norm": 2.859375, "grad_norm_var": 0.029979451497395834, "learning_rate": 0.0001, "loss": 5.4571, "loss/crossentropy": 2.4664463996887207, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15726518630981445, "step": 20154 }, { "epoch": 0.629875, "grad_norm": 2.96875, "grad_norm_var": 0.029271443684895832, "learning_rate": 0.0001, "loss": 5.7039, "loss/crossentropy": 2.501569628715515, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17140649259090424, "step": 20156 }, { "epoch": 0.6299375, "grad_norm": 3.125, "grad_norm_var": 0.031712849934895836, "learning_rate": 0.0001, "loss": 5.7994, "loss/crossentropy": 2.6657474040985107, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1649295687675476, "step": 20158 }, { "epoch": 0.63, "grad_norm": 3.765625, "grad_norm_var": 0.05890299479166667, "learning_rate": 0.0001, "loss": 5.5409, "loss/crossentropy": 2.3929996490478516, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16830556094646454, "step": 20160 }, { "epoch": 0.6300625, "grad_norm": 3.140625, "grad_norm_var": 0.061066691080729166, "learning_rate": 0.0001, "loss": 5.5072, "loss/crossentropy": 2.5050963163375854, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15489353984594345, "step": 20162 }, { "epoch": 0.630125, "grad_norm": 3.125, "grad_norm_var": 0.057938639322916666, "learning_rate": 0.0001, "loss": 5.6582, "loss/crossentropy": 2.513243556022644, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1660541445016861, "step": 20164 }, { "epoch": 0.6301875, "grad_norm": 3.1875, "grad_norm_var": 0.0605621337890625, "learning_rate": 0.0001, "loss": 5.9565, "loss/crossentropy": 2.725982189178467, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17110057175159454, "step": 20166 }, { "epoch": 0.63025, "grad_norm": 3.015625, "grad_norm_var": 0.05732014973958333, "learning_rate": 0.0001, "loss": 5.8055, "loss/crossentropy": 2.648452043533325, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1672716811299324, "step": 20168 }, { "epoch": 0.6303125, "grad_norm": 2.875, "grad_norm_var": 0.05679931640625, "learning_rate": 0.0001, "loss": 5.4629, "loss/crossentropy": 2.4996532201766968, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15413638204336166, "step": 20170 }, { "epoch": 0.630375, "grad_norm": 2.609375, "grad_norm_var": 0.07001851399739584, "learning_rate": 0.0001, "loss": 5.4531, "loss/crossentropy": 2.355065703392029, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.15511402487754822, "step": 20172 }, { "epoch": 0.6304375, "grad_norm": 3.40625, "grad_norm_var": 0.07616780598958334, "learning_rate": 0.0001, "loss": 5.6556, "loss/crossentropy": 2.4644635915756226, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17184355854988098, "step": 20174 }, { "epoch": 0.6305, "grad_norm": 3.09375, "grad_norm_var": 0.04218343098958333, "learning_rate": 0.0001, "loss": 5.7584, "loss/crossentropy": 2.641898274421692, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1655595824122429, "step": 20176 }, { "epoch": 0.6305625, "grad_norm": 3.15625, "grad_norm_var": 0.0384674072265625, "learning_rate": 0.0001, "loss": 5.7657, "loss/crossentropy": 2.5578166246414185, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16765928268432617, "step": 20178 }, { "epoch": 0.630625, "grad_norm": 2.984375, "grad_norm_var": 0.03941650390625, "learning_rate": 0.0001, "loss": 5.695, "loss/crossentropy": 2.5939255952835083, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16440007835626602, "step": 20180 }, { "epoch": 0.6306875, "grad_norm": 3.25, "grad_norm_var": 0.05777587890625, "learning_rate": 0.0001, "loss": 6.0959, "loss/crossentropy": 2.768033504486084, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18239657580852509, "step": 20182 }, { "epoch": 0.63075, "grad_norm": 3.0, "grad_norm_var": 0.05871988932291667, "learning_rate": 0.0001, "loss": 5.8539, "loss/crossentropy": 2.6732946634292603, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1696242392063141, "step": 20184 }, { "epoch": 0.6308125, "grad_norm": 3.25, "grad_norm_var": 0.057795206705729164, "learning_rate": 0.0001, "loss": 5.3569, "loss/crossentropy": 2.3667612075805664, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15447886288166046, "step": 20186 }, { "epoch": 0.630875, "grad_norm": 3.203125, "grad_norm_var": 0.0368072509765625, "learning_rate": 0.0001, "loss": 5.6897, "loss/crossentropy": 2.572500467300415, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.161333829164505, "step": 20188 }, { "epoch": 0.6309375, "grad_norm": 2.796875, "grad_norm_var": 0.04058837890625, "learning_rate": 0.0001, "loss": 5.6884, "loss/crossentropy": 2.636229991912842, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16224461793899536, "step": 20190 }, { "epoch": 0.631, "grad_norm": 3.078125, "grad_norm_var": 0.043290201822916666, "learning_rate": 0.0001, "loss": 5.7374, "loss/crossentropy": 2.5980584621429443, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16627852618694305, "step": 20192 }, { "epoch": 0.6310625, "grad_norm": 2.90625, "grad_norm_var": 0.04617513020833333, "learning_rate": 0.0001, "loss": 5.7323, "loss/crossentropy": 2.6956746578216553, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15873641520738602, "step": 20194 }, { "epoch": 0.631125, "grad_norm": 3.03125, "grad_norm_var": 0.04560139973958333, "learning_rate": 0.0001, "loss": 5.5217, "loss/crossentropy": 2.4796465635299683, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1581074669957161, "step": 20196 }, { "epoch": 0.6311875, "grad_norm": 3.578125, "grad_norm_var": 0.038960774739583336, "learning_rate": 0.0001, "loss": 5.6897, "loss/crossentropy": 2.58573317527771, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1643029898405075, "step": 20198 }, { "epoch": 0.63125, "grad_norm": 3.21875, "grad_norm_var": 0.03801676432291667, "learning_rate": 0.0001, "loss": 5.8526, "loss/crossentropy": 2.6438595056533813, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1704840511083603, "step": 20200 }, { "epoch": 0.6313125, "grad_norm": 3.171875, "grad_norm_var": 0.038427734375, "learning_rate": 0.0001, "loss": 6.0056, "loss/crossentropy": 2.7956109046936035, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17373424023389816, "step": 20202 }, { "epoch": 0.631375, "grad_norm": 3.015625, "grad_norm_var": 0.041624959309895834, "learning_rate": 0.0001, "loss": 5.8579, "loss/crossentropy": 2.740612268447876, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1672016829252243, "step": 20204 }, { "epoch": 0.6314375, "grad_norm": 3.359375, "grad_norm_var": 0.0406646728515625, "learning_rate": 0.0001, "loss": 5.7933, "loss/crossentropy": 2.612033247947693, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17008215934038162, "step": 20206 }, { "epoch": 0.6315, "grad_norm": 2.90625, "grad_norm_var": 0.0406646728515625, "learning_rate": 0.0001, "loss": 5.4172, "loss/crossentropy": 2.4040307998657227, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1563967764377594, "step": 20208 }, { "epoch": 0.6315625, "grad_norm": 2.90625, "grad_norm_var": 0.03931884765625, "learning_rate": 0.0001, "loss": 5.8952, "loss/crossentropy": 2.786734104156494, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16553699225187302, "step": 20210 }, { "epoch": 0.631625, "grad_norm": 3.484375, "grad_norm_var": 0.04474283854166667, "learning_rate": 0.0001, "loss": 5.8679, "loss/crossentropy": 2.6364437341690063, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17392823845148087, "step": 20212 }, { "epoch": 0.6316875, "grad_norm": 3.515625, "grad_norm_var": 0.0413970947265625, "learning_rate": 0.0001, "loss": 5.9131, "loss/crossentropy": 2.738698720932007, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16665589809417725, "step": 20214 }, { "epoch": 0.63175, "grad_norm": 2.90625, "grad_norm_var": 0.0447418212890625, "learning_rate": 0.0001, "loss": 5.6964, "loss/crossentropy": 2.573652744293213, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1610986292362213, "step": 20216 }, { "epoch": 0.6318125, "grad_norm": 3.015625, "grad_norm_var": 0.04342447916666667, "learning_rate": 0.0001, "loss": 5.6864, "loss/crossentropy": 2.592434287071228, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1609569787979126, "step": 20218 }, { "epoch": 0.631875, "grad_norm": 3.546875, "grad_norm_var": 0.04836832682291667, "learning_rate": 0.0001, "loss": 5.7919, "loss/crossentropy": 2.6078829765319824, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16957232356071472, "step": 20220 }, { "epoch": 0.6319375, "grad_norm": 4.125, "grad_norm_var": 0.11231180826822916, "learning_rate": 0.0001, "loss": 5.279, "loss/crossentropy": 2.2583394050598145, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15323758870363235, "step": 20222 }, { "epoch": 0.632, "grad_norm": 2.8125, "grad_norm_var": 0.11551106770833333, "learning_rate": 0.0001, "loss": 5.7031, "loss/crossentropy": 2.6227800846099854, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.158027783036232, "step": 20224 }, { "epoch": 0.6320625, "grad_norm": 3.390625, "grad_norm_var": 0.10192057291666666, "learning_rate": 0.0001, "loss": 5.8896, "loss/crossentropy": 2.6651804447174072, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17517473548650742, "step": 20226 }, { "epoch": 0.632125, "grad_norm": 3.109375, "grad_norm_var": 0.1005279541015625, "learning_rate": 0.0001, "loss": 5.6459, "loss/crossentropy": 2.5703917741775513, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1618465781211853, "step": 20228 }, { "epoch": 0.6321875, "grad_norm": 3.109375, "grad_norm_var": 0.096923828125, "learning_rate": 0.0001, "loss": 5.7883, "loss/crossentropy": 2.613739013671875, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17057643830776215, "step": 20230 }, { "epoch": 0.63225, "grad_norm": 2.828125, "grad_norm_var": 0.10400390625, "learning_rate": 0.0001, "loss": 5.3312, "loss/crossentropy": 2.3937498331069946, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1515582799911499, "step": 20232 }, { "epoch": 0.6323125, "grad_norm": 3.453125, "grad_norm_var": 0.10446675618489583, "learning_rate": 0.0001, "loss": 6.1037, "loss/crossentropy": 2.8300225734710693, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17698094248771667, "step": 20234 }, { "epoch": 0.632375, "grad_norm": 3.1875, "grad_norm_var": 0.0999176025390625, "learning_rate": 0.0001, "loss": 5.8462, "loss/crossentropy": 2.651261806488037, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.173788882791996, "step": 20236 }, { "epoch": 0.6324375, "grad_norm": 3.15625, "grad_norm_var": 0.03095703125, "learning_rate": 0.0001, "loss": 5.8181, "loss/crossentropy": 2.6541812419891357, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16600612550973892, "step": 20238 }, { "epoch": 0.6325, "grad_norm": 2.734375, "grad_norm_var": 0.0330963134765625, "learning_rate": 0.0001, "loss": 5.4874, "loss/crossentropy": 2.4278383255004883, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15986019372940063, "step": 20240 }, { "epoch": 0.6325625, "grad_norm": 3.34375, "grad_norm_var": 0.04148661295572917, "learning_rate": 0.0001, "loss": 5.4431, "loss/crossentropy": 2.306453824043274, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16054243594408035, "step": 20242 }, { "epoch": 0.632625, "grad_norm": 3.140625, "grad_norm_var": 0.039839680989583334, "learning_rate": 0.0001, "loss": 5.6074, "loss/crossentropy": 2.5486620664596558, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15938962996006012, "step": 20244 }, { "epoch": 0.6326875, "grad_norm": 3.28125, "grad_norm_var": 0.0414459228515625, "learning_rate": 0.0001, "loss": 5.7226, "loss/crossentropy": 2.6033793687820435, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16621871292591095, "step": 20246 }, { "epoch": 0.63275, "grad_norm": 3.25, "grad_norm_var": 0.0333984375, "learning_rate": 0.0001, "loss": 5.286, "loss/crossentropy": 2.2284141778945923, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15848813951015472, "step": 20248 }, { "epoch": 0.6328125, "grad_norm": 2.90625, "grad_norm_var": 0.0353424072265625, "learning_rate": 0.0001, "loss": 5.4818, "loss/crossentropy": 2.321297287940979, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1652720496058464, "step": 20250 }, { "epoch": 0.632875, "grad_norm": 3.3125, "grad_norm_var": 0.03580322265625, "learning_rate": 0.0001, "loss": 5.76, "loss/crossentropy": 2.5480066537857056, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17198427021503448, "step": 20252 }, { "epoch": 0.6329375, "grad_norm": 2.984375, "grad_norm_var": 0.03824462890625, "learning_rate": 0.0001, "loss": 5.7146, "loss/crossentropy": 2.5619568824768066, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16760719567537308, "step": 20254 }, { "epoch": 0.633, "grad_norm": 3.046875, "grad_norm_var": 0.03181050618489583, "learning_rate": 0.0001, "loss": 5.9623, "loss/crossentropy": 2.7362542152404785, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17299064993858337, "step": 20256 }, { "epoch": 0.6330625, "grad_norm": 3.234375, "grad_norm_var": 0.023436482747395834, "learning_rate": 0.0001, "loss": 6.0276, "loss/crossentropy": 2.796047568321228, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17432530224323273, "step": 20258 }, { "epoch": 0.633125, "grad_norm": 2.65625, "grad_norm_var": 0.0421051025390625, "learning_rate": 0.0001, "loss": 5.2651, "loss/crossentropy": 2.3361037969589233, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14837107062339783, "step": 20260 }, { "epoch": 0.6331875, "grad_norm": 2.984375, "grad_norm_var": 0.042073567708333336, "learning_rate": 0.0001, "loss": 5.3996, "loss/crossentropy": 2.365097999572754, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1542360633611679, "step": 20262 }, { "epoch": 0.63325, "grad_norm": 3.0, "grad_norm_var": 0.03868815104166667, "learning_rate": 0.0001, "loss": 5.5728, "loss/crossentropy": 2.448459506034851, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16672926396131516, "step": 20264 }, { "epoch": 0.6333125, "grad_norm": 3.328125, "grad_norm_var": 0.03535868326822917, "learning_rate": 0.0001, "loss": 5.9353, "loss/crossentropy": 2.6578985452651978, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17735158652067184, "step": 20266 }, { "epoch": 0.633375, "grad_norm": 3.203125, "grad_norm_var": 0.033447265625, "learning_rate": 0.0001, "loss": 5.9451, "loss/crossentropy": 2.7756892442703247, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16811110079288483, "step": 20268 }, { "epoch": 0.6334375, "grad_norm": 3.09375, "grad_norm_var": 0.03857421875, "learning_rate": 0.0001, "loss": 5.2494, "loss/crossentropy": 2.253188133239746, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15196974575519562, "step": 20270 }, { "epoch": 0.6335, "grad_norm": 3.234375, "grad_norm_var": 0.0308746337890625, "learning_rate": 0.0001, "loss": 5.9143, "loss/crossentropy": 2.7492603063583374, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16846046596765518, "step": 20272 }, { "epoch": 0.6335625, "grad_norm": 3.375, "grad_norm_var": 0.0353515625, "learning_rate": 0.0001, "loss": 5.8884, "loss/crossentropy": 2.677353024482727, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1703251749277115, "step": 20274 }, { "epoch": 0.633625, "grad_norm": 2.984375, "grad_norm_var": 0.021629842122395833, "learning_rate": 0.0001, "loss": 5.7466, "loss/crossentropy": 2.561260223388672, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16579557955265045, "step": 20276 }, { "epoch": 0.6336875, "grad_norm": 3.0, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 5.4467, "loss/crossentropy": 2.4198994636535645, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15893103182315826, "step": 20278 }, { "epoch": 0.63375, "grad_norm": 3.046875, "grad_norm_var": 0.03815104166666667, "learning_rate": 0.0001, "loss": 5.34, "loss/crossentropy": 2.381423592567444, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15171539783477783, "step": 20280 }, { "epoch": 0.6338125, "grad_norm": 3.75, "grad_norm_var": 0.05452372233072917, "learning_rate": 0.0001, "loss": 6.1022, "loss/crossentropy": 2.7580807209014893, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18440742045640945, "step": 20282 }, { "epoch": 0.633875, "grad_norm": 2.96875, "grad_norm_var": 0.06870829264322917, "learning_rate": 0.0001, "loss": 5.993, "loss/crossentropy": 2.768987536430359, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1700623482465744, "step": 20284 }, { "epoch": 0.6339375, "grad_norm": 3.359375, "grad_norm_var": 0.06580403645833334, "learning_rate": 0.0001, "loss": 5.8209, "loss/crossentropy": 2.6002098321914673, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17050490528345108, "step": 20286 }, { "epoch": 0.634, "grad_norm": 3.171875, "grad_norm_var": 0.07008056640625, "learning_rate": 0.0001, "loss": 5.63, "loss/crossentropy": 2.541434168815613, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15768882632255554, "step": 20288 }, { "epoch": 0.6340625, "grad_norm": 3.25, "grad_norm_var": 0.07315165201822917, "learning_rate": 0.0001, "loss": 5.7851, "loss/crossentropy": 2.6267735958099365, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16895414888858795, "step": 20290 }, { "epoch": 0.634125, "grad_norm": 3.015625, "grad_norm_var": 0.07481180826822917, "learning_rate": 0.0001, "loss": 5.2985, "loss/crossentropy": 2.2350289821624756, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1622042953968048, "step": 20292 }, { "epoch": 0.6341875, "grad_norm": 3.140625, "grad_norm_var": 0.07447001139322916, "learning_rate": 0.0001, "loss": 5.6256, "loss/crossentropy": 2.5115227699279785, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16101675480604172, "step": 20294 }, { "epoch": 0.63425, "grad_norm": 3.1875, "grad_norm_var": 0.05950520833333333, "learning_rate": 0.0001, "loss": 5.8101, "loss/crossentropy": 2.6483267545700073, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17008645087480545, "step": 20296 }, { "epoch": 0.6343125, "grad_norm": 3.140625, "grad_norm_var": 0.03593648274739583, "learning_rate": 0.0001, "loss": 5.406, "loss/crossentropy": 2.3410415649414062, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15844881534576416, "step": 20298 }, { "epoch": 0.634375, "grad_norm": 3.421875, "grad_norm_var": 0.023029581705729166, "learning_rate": 0.0001, "loss": 5.4763, "loss/crossentropy": 2.400785207748413, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16223635524511337, "step": 20300 }, { "epoch": 0.6344375, "grad_norm": 3.0, "grad_norm_var": 0.0150543212890625, "learning_rate": 0.0001, "loss": 5.7139, "loss/crossentropy": 2.5901176929473877, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16277092695236206, "step": 20302 }, { "epoch": 0.6345, "grad_norm": 2.96875, "grad_norm_var": 0.0147125244140625, "learning_rate": 0.0001, "loss": 5.6907, "loss/crossentropy": 2.5511187314987183, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16825895011425018, "step": 20304 }, { "epoch": 0.6345625, "grad_norm": 3.0625, "grad_norm_var": 0.0141998291015625, "learning_rate": 0.0001, "loss": 5.4864, "loss/crossentropy": 2.466139554977417, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15906154364347458, "step": 20306 }, { "epoch": 0.634625, "grad_norm": 3.375, "grad_norm_var": 0.01988525390625, "learning_rate": 0.0001, "loss": 5.4633, "loss/crossentropy": 2.3712934255599976, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1588064432144165, "step": 20308 }, { "epoch": 0.6346875, "grad_norm": 3.171875, "grad_norm_var": 0.0225006103515625, "learning_rate": 0.0001, "loss": 5.9008, "loss/crossentropy": 2.6653735637664795, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17314809560775757, "step": 20310 }, { "epoch": 0.63475, "grad_norm": 3.15625, "grad_norm_var": 0.020438639322916667, "learning_rate": 0.0001, "loss": 5.6956, "loss/crossentropy": 2.5791908502578735, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16125231981277466, "step": 20312 }, { "epoch": 0.6348125, "grad_norm": 3.015625, "grad_norm_var": 0.023079427083333333, "learning_rate": 0.0001, "loss": 5.5867, "loss/crossentropy": 2.458158254623413, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1651960387825966, "step": 20314 }, { "epoch": 0.634875, "grad_norm": 3.15625, "grad_norm_var": 0.019456990559895835, "learning_rate": 0.0001, "loss": 5.7762, "loss/crossentropy": 2.637978434562683, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16929318010807037, "step": 20316 }, { "epoch": 0.6349375, "grad_norm": 3.0625, "grad_norm_var": 0.019559733072916665, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.644035220146179, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16460420936346054, "step": 20318 }, { "epoch": 0.635, "grad_norm": 3.078125, "grad_norm_var": 0.018708292643229166, "learning_rate": 0.0001, "loss": 5.5519, "loss/crossentropy": 2.3760015964508057, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16759207099676132, "step": 20320 }, { "epoch": 0.6350625, "grad_norm": 3.203125, "grad_norm_var": 0.016551717122395834, "learning_rate": 0.0001, "loss": 5.7103, "loss/crossentropy": 2.6155636310577393, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1641618013381958, "step": 20322 }, { "epoch": 0.635125, "grad_norm": 3.078125, "grad_norm_var": 0.018504842122395834, "learning_rate": 0.0001, "loss": 5.5773, "loss/crossentropy": 2.521718978881836, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15751251578330994, "step": 20324 }, { "epoch": 0.6351875, "grad_norm": 3.046875, "grad_norm_var": 0.014972941080729166, "learning_rate": 0.0001, "loss": 5.4521, "loss/crossentropy": 2.381601095199585, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15978585928678513, "step": 20326 }, { "epoch": 0.63525, "grad_norm": 3.09375, "grad_norm_var": 0.013802083333333333, "learning_rate": 0.0001, "loss": 5.4768, "loss/crossentropy": 2.364861845970154, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1658855304121971, "step": 20328 }, { "epoch": 0.6353125, "grad_norm": 2.859375, "grad_norm_var": 0.01451416015625, "learning_rate": 0.0001, "loss": 5.6741, "loss/crossentropy": 2.612994074821472, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16157987713813782, "step": 20330 }, { "epoch": 0.635375, "grad_norm": 3.125, "grad_norm_var": 0.012760416666666666, "learning_rate": 0.0001, "loss": 5.718, "loss/crossentropy": 2.6030982732772827, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16540135443210602, "step": 20332 }, { "epoch": 0.6354375, "grad_norm": 3.078125, "grad_norm_var": 0.012507120768229166, "learning_rate": 0.0001, "loss": 5.539, "loss/crossentropy": 2.4375611543655396, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16287478804588318, "step": 20334 }, { "epoch": 0.6355, "grad_norm": 3.34375, "grad_norm_var": 0.01715087890625, "learning_rate": 0.0001, "loss": 5.7189, "loss/crossentropy": 2.618451237678528, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16239001601934433, "step": 20336 }, { "epoch": 0.6355625, "grad_norm": 3.6875, "grad_norm_var": 0.046708170572916666, "learning_rate": 0.0001, "loss": 6.1406, "loss/crossentropy": 2.780633568763733, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1848253831267357, "step": 20338 }, { "epoch": 0.635625, "grad_norm": 3.453125, "grad_norm_var": 0.056538899739583336, "learning_rate": 0.0001, "loss": 6.3322, "loss/crossentropy": 2.917917490005493, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18908123672008514, "step": 20340 }, { "epoch": 0.6356875, "grad_norm": 3.328125, "grad_norm_var": 0.05495503743489583, "learning_rate": 0.0001, "loss": 5.5889, "loss/crossentropy": 2.501395583152771, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16070107370615005, "step": 20342 }, { "epoch": 0.63575, "grad_norm": 3.171875, "grad_norm_var": 0.05390218098958333, "learning_rate": 0.0001, "loss": 5.5669, "loss/crossentropy": 2.465433955192566, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1660088747739792, "step": 20344 }, { "epoch": 0.6358125, "grad_norm": 3.125, "grad_norm_var": 0.045628865559895836, "learning_rate": 0.0001, "loss": 5.4948, "loss/crossentropy": 2.3818790912628174, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16480587422847748, "step": 20346 }, { "epoch": 0.635875, "grad_norm": 3.140625, "grad_norm_var": 0.04334208170572917, "learning_rate": 0.0001, "loss": 5.5542, "loss/crossentropy": 2.4394543170928955, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16265016049146652, "step": 20348 }, { "epoch": 0.6359375, "grad_norm": 3.1875, "grad_norm_var": 0.0401763916015625, "learning_rate": 0.0001, "loss": 5.477, "loss/crossentropy": 2.4162708520889282, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15880703181028366, "step": 20350 }, { "epoch": 0.636, "grad_norm": 3.109375, "grad_norm_var": 0.0600250244140625, "learning_rate": 0.0001, "loss": 5.037, "loss/crossentropy": 2.1076958179473877, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1476190909743309, "step": 20352 }, { "epoch": 0.6360625, "grad_norm": 3.28125, "grad_norm_var": 0.042235310872395834, "learning_rate": 0.0001, "loss": 5.5061, "loss/crossentropy": 2.429261088371277, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16549523919820786, "step": 20354 }, { "epoch": 0.636125, "grad_norm": 3.75, "grad_norm_var": 0.04729410807291667, "learning_rate": 0.0001, "loss": 5.7567, "loss/crossentropy": 2.602792263031006, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16539494693279266, "step": 20356 }, { "epoch": 0.6361875, "grad_norm": 2.8125, "grad_norm_var": 0.0538482666015625, "learning_rate": 0.0001, "loss": 5.4911, "loss/crossentropy": 2.491291046142578, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15427955985069275, "step": 20358 }, { "epoch": 0.63625, "grad_norm": 3.21875, "grad_norm_var": 0.061083984375, "learning_rate": 0.0001, "loss": 5.5776, "loss/crossentropy": 2.5022329092025757, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16026996076107025, "step": 20360 }, { "epoch": 0.6363125, "grad_norm": 3.515625, "grad_norm_var": 0.06746317545572916, "learning_rate": 0.0001, "loss": 5.7371, "loss/crossentropy": 2.6219738721847534, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16190487891435623, "step": 20362 }, { "epoch": 0.636375, "grad_norm": 3.21875, "grad_norm_var": 0.06974283854166667, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.569133162498474, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.178208127617836, "step": 20364 }, { "epoch": 0.6364375, "grad_norm": 3.015625, "grad_norm_var": 0.07108968098958333, "learning_rate": 0.0001, "loss": 5.7103, "loss/crossentropy": 2.5686323642730713, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16728801280260086, "step": 20366 }, { "epoch": 0.6365, "grad_norm": 2.65625, "grad_norm_var": 0.07828776041666667, "learning_rate": 0.0001, "loss": 5.2009, "loss/crossentropy": 2.3160207271575928, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.146303191781044, "step": 20368 }, { "epoch": 0.6365625, "grad_norm": 2.859375, "grad_norm_var": 0.0799224853515625, "learning_rate": 0.0001, "loss": 5.7433, "loss/crossentropy": 2.633384346961975, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16294124722480774, "step": 20370 }, { "epoch": 0.636625, "grad_norm": 2.890625, "grad_norm_var": 0.051390584309895834, "learning_rate": 0.0001, "loss": 5.4866, "loss/crossentropy": 2.462988018989563, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15704964846372604, "step": 20372 }, { "epoch": 0.6366875, "grad_norm": 3.25, "grad_norm_var": 0.051634724934895834, "learning_rate": 0.0001, "loss": 5.4468, "loss/crossentropy": 2.3518134355545044, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16301118582487106, "step": 20374 }, { "epoch": 0.63675, "grad_norm": 3.1875, "grad_norm_var": 0.05366109212239583, "learning_rate": 0.0001, "loss": 5.9309, "loss/crossentropy": 2.7629919052124023, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16952699422836304, "step": 20376 }, { "epoch": 0.6368125, "grad_norm": 3.125, "grad_norm_var": 0.0410797119140625, "learning_rate": 0.0001, "loss": 6.0113, "loss/crossentropy": 2.8072842359542847, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17079093307256699, "step": 20378 }, { "epoch": 0.636875, "grad_norm": 2.90625, "grad_norm_var": 0.0366851806640625, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.6800915002822876, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16443432867527008, "step": 20380 }, { "epoch": 0.6369375, "grad_norm": 2.859375, "grad_norm_var": 0.03655192057291667, "learning_rate": 0.0001, "loss": 5.6768, "loss/crossentropy": 2.5980231761932373, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16256462782621384, "step": 20382 }, { "epoch": 0.637, "grad_norm": 3.046875, "grad_norm_var": 0.028107706705729166, "learning_rate": 0.0001, "loss": 5.5433, "loss/crossentropy": 2.491496205329895, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16025366634130478, "step": 20384 }, { "epoch": 0.6370625, "grad_norm": 3.40625, "grad_norm_var": 0.03585611979166667, "learning_rate": 0.0001, "loss": 5.4569, "loss/crossentropy": 2.4254097938537598, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.154323972761631, "step": 20386 }, { "epoch": 0.637125, "grad_norm": 3.015625, "grad_norm_var": 0.03494466145833333, "learning_rate": 0.0001, "loss": 5.5164, "loss/crossentropy": 2.4353718757629395, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1616169586777687, "step": 20388 }, { "epoch": 0.6371875, "grad_norm": 3.046875, "grad_norm_var": 0.031061808268229168, "learning_rate": 0.0001, "loss": 5.886, "loss/crossentropy": 2.7061909437179565, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16836927086114883, "step": 20390 }, { "epoch": 0.63725, "grad_norm": 3.328125, "grad_norm_var": 0.027180989583333332, "learning_rate": 0.0001, "loss": 5.7933, "loss/crossentropy": 2.5726131200790405, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17128415405750275, "step": 20392 }, { "epoch": 0.6373125, "grad_norm": 3.609375, "grad_norm_var": 0.0496978759765625, "learning_rate": 0.0001, "loss": 5.3625, "loss/crossentropy": 2.32717502117157, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1593942791223526, "step": 20394 }, { "epoch": 0.637375, "grad_norm": 2.84375, "grad_norm_var": 0.0496734619140625, "learning_rate": 0.0001, "loss": 5.6418, "loss/crossentropy": 2.4990261793136597, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16700708121061325, "step": 20396 }, { "epoch": 0.6374375, "grad_norm": 3.015625, "grad_norm_var": 0.04729817708333333, "learning_rate": 0.0001, "loss": 5.7106, "loss/crossentropy": 2.536941170692444, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16697710752487183, "step": 20398 }, { "epoch": 0.6375, "grad_norm": 3.15625, "grad_norm_var": 0.043603515625, "learning_rate": 0.0001, "loss": 5.7261, "loss/crossentropy": 2.541267156600952, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17043745517730713, "step": 20400 }, { "epoch": 0.6375625, "grad_norm": 2.84375, "grad_norm_var": 0.04146728515625, "learning_rate": 0.0001, "loss": 5.7255, "loss/crossentropy": 2.630297064781189, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.159913070499897, "step": 20402 }, { "epoch": 0.637625, "grad_norm": 3.078125, "grad_norm_var": 0.04138997395833333, "learning_rate": 0.0001, "loss": 5.7414, "loss/crossentropy": 2.5599911212921143, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16892090439796448, "step": 20404 }, { "epoch": 0.6376875, "grad_norm": 3.203125, "grad_norm_var": 0.04202067057291667, "learning_rate": 0.0001, "loss": 5.7447, "loss/crossentropy": 2.6057400703430176, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16858305037021637, "step": 20406 }, { "epoch": 0.63775, "grad_norm": 2.9375, "grad_norm_var": 0.039549763997395834, "learning_rate": 0.0001, "loss": 5.7431, "loss/crossentropy": 2.6216236352920532, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16488446295261383, "step": 20408 }, { "epoch": 0.6378125, "grad_norm": 3.0625, "grad_norm_var": 0.018131510416666666, "learning_rate": 0.0001, "loss": 5.5229, "loss/crossentropy": 2.42576265335083, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16557565331459045, "step": 20410 }, { "epoch": 0.637875, "grad_norm": 3.5, "grad_norm_var": 0.02515869140625, "learning_rate": 0.0001, "loss": 5.6878, "loss/crossentropy": 2.5755895376205444, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1639513000845909, "step": 20412 }, { "epoch": 0.6379375, "grad_norm": 3.671875, "grad_norm_var": 0.04109598795572917, "learning_rate": 0.0001, "loss": 5.824, "loss/crossentropy": 2.5592713356018066, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.172563798725605, "step": 20414 }, { "epoch": 0.638, "grad_norm": 3.40625, "grad_norm_var": 0.04631245930989583, "learning_rate": 0.0001, "loss": 5.8793, "loss/crossentropy": 2.6633025407791138, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16964267194271088, "step": 20416 }, { "epoch": 0.6380625, "grad_norm": 3.25, "grad_norm_var": 0.039990234375, "learning_rate": 0.0001, "loss": 5.9089, "loss/crossentropy": 2.712688684463501, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1688438281416893, "step": 20418 }, { "epoch": 0.638125, "grad_norm": 2.90625, "grad_norm_var": 0.05426025390625, "learning_rate": 0.0001, "loss": 5.3497, "loss/crossentropy": 2.433741807937622, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1505783349275589, "step": 20420 }, { "epoch": 0.6381875, "grad_norm": 3.21875, "grad_norm_var": 0.05386962890625, "learning_rate": 0.0001, "loss": 5.897, "loss/crossentropy": 2.6967657804489136, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16767611354589462, "step": 20422 }, { "epoch": 0.63825, "grad_norm": 3.140625, "grad_norm_var": 0.0518707275390625, "learning_rate": 0.0001, "loss": 5.5642, "loss/crossentropy": 2.3478355407714844, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17241621762514114, "step": 20424 }, { "epoch": 0.6383125, "grad_norm": 3.359375, "grad_norm_var": 0.058430989583333336, "learning_rate": 0.0001, "loss": 5.7552, "loss/crossentropy": 2.456655979156494, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17516592890024185, "step": 20426 }, { "epoch": 0.638375, "grad_norm": 3.3125, "grad_norm_var": 0.06259358723958333, "learning_rate": 0.0001, "loss": 5.6379, "loss/crossentropy": 2.5944056510925293, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15590814501047134, "step": 20428 }, { "epoch": 0.6384375, "grad_norm": 3.171875, "grad_norm_var": 0.05068359375, "learning_rate": 0.0001, "loss": 5.4455, "loss/crossentropy": 2.4222077131271362, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15350348502397537, "step": 20430 }, { "epoch": 0.6385, "grad_norm": 3.5625, "grad_norm_var": 0.05671284993489583, "learning_rate": 0.0001, "loss": 5.7682, "loss/crossentropy": 2.5905617475509644, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16971492767333984, "step": 20432 }, { "epoch": 0.6385625, "grad_norm": 2.921875, "grad_norm_var": 0.0601959228515625, "learning_rate": 0.0001, "loss": 5.7718, "loss/crossentropy": 2.6473952531814575, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16751841455698013, "step": 20434 }, { "epoch": 0.638625, "grad_norm": 3.578125, "grad_norm_var": 0.05845947265625, "learning_rate": 0.0001, "loss": 5.902, "loss/crossentropy": 2.6589845418930054, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1727406159043312, "step": 20436 }, { "epoch": 0.6386875, "grad_norm": 3.09375, "grad_norm_var": 0.05920817057291667, "learning_rate": 0.0001, "loss": 5.5358, "loss/crossentropy": 2.4186350107192993, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16054274141788483, "step": 20438 }, { "epoch": 0.63875, "grad_norm": 3.34375, "grad_norm_var": 0.05964253743489583, "learning_rate": 0.0001, "loss": 5.7618, "loss/crossentropy": 2.552546262741089, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17131836712360382, "step": 20440 }, { "epoch": 0.6388125, "grad_norm": 3.203125, "grad_norm_var": 0.0472808837890625, "learning_rate": 0.0001, "loss": 5.7838, "loss/crossentropy": 2.559280514717102, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.1689380407333374, "step": 20442 }, { "epoch": 0.638875, "grad_norm": 3.359375, "grad_norm_var": 0.042399088541666664, "learning_rate": 0.0001, "loss": 5.5916, "loss/crossentropy": 2.479541301727295, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.162373349070549, "step": 20444 }, { "epoch": 0.6389375, "grad_norm": 2.921875, "grad_norm_var": 0.045710245768229164, "learning_rate": 0.0001, "loss": 5.257, "loss/crossentropy": 2.314408540725708, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1497253105044365, "step": 20446 }, { "epoch": 0.639, "grad_norm": 3.328125, "grad_norm_var": 0.039143880208333336, "learning_rate": 0.0001, "loss": 5.7524, "loss/crossentropy": 2.600519895553589, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16792187094688416, "step": 20448 }, { "epoch": 0.6390625, "grad_norm": 3.359375, "grad_norm_var": 0.03867899576822917, "learning_rate": 0.0001, "loss": 5.8297, "loss/crossentropy": 2.672330379486084, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.167303629219532, "step": 20450 }, { "epoch": 0.639125, "grad_norm": 3.515625, "grad_norm_var": 0.0659576416015625, "learning_rate": 0.0001, "loss": 5.675, "loss/crossentropy": 2.4223849773406982, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1760462373495102, "step": 20452 }, { "epoch": 0.6391875, "grad_norm": 3.703125, "grad_norm_var": 0.08121337890625, "learning_rate": 0.0001, "loss": 5.605, "loss/crossentropy": 2.5124597549438477, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1608152985572815, "step": 20454 }, { "epoch": 0.63925, "grad_norm": 3.0625, "grad_norm_var": 0.08369140625, "learning_rate": 0.0001, "loss": 5.82, "loss/crossentropy": 2.6119704246520996, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17392588406801224, "step": 20456 }, { "epoch": 0.6393125, "grad_norm": 3.265625, "grad_norm_var": 0.0875152587890625, "learning_rate": 0.0001, "loss": 5.9441, "loss/crossentropy": 2.8331947326660156, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16304653882980347, "step": 20458 }, { "epoch": 0.639375, "grad_norm": 3.296875, "grad_norm_var": 0.08245340983072917, "learning_rate": 0.0001, "loss": 5.7966, "loss/crossentropy": 2.632740616798401, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16872943937778473, "step": 20460 }, { "epoch": 0.6394375, "grad_norm": 3.1875, "grad_norm_var": 0.068310546875, "learning_rate": 0.0001, "loss": 5.764, "loss/crossentropy": 2.6361597776412964, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.17020221799612045, "step": 20462 }, { "epoch": 0.6395, "grad_norm": 3.03125, "grad_norm_var": 0.06526285807291667, "learning_rate": 0.0001, "loss": 5.9375, "loss/crossentropy": 2.679958701133728, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17458638548851013, "step": 20464 }, { "epoch": 0.6395625, "grad_norm": 2.875, "grad_norm_var": 0.07730712890625, "learning_rate": 0.0001, "loss": 5.7443, "loss/crossentropy": 2.6192407608032227, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1671948879957199, "step": 20466 }, { "epoch": 0.639625, "grad_norm": 2.953125, "grad_norm_var": 0.0546539306640625, "learning_rate": 0.0001, "loss": 5.5122, "loss/crossentropy": 2.5090534687042236, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15812255442142487, "step": 20468 }, { "epoch": 0.6396875, "grad_norm": 3.296875, "grad_norm_var": 0.03459879557291667, "learning_rate": 0.0001, "loss": 5.4934, "loss/crossentropy": 2.371386408805847, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1606387421488762, "step": 20470 }, { "epoch": 0.63975, "grad_norm": 3.203125, "grad_norm_var": 0.030985514322916668, "learning_rate": 0.0001, "loss": 5.862, "loss/crossentropy": 2.667320728302002, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17064325511455536, "step": 20472 }, { "epoch": 0.6398125, "grad_norm": 3.03125, "grad_norm_var": 0.028815714518229167, "learning_rate": 0.0001, "loss": 5.8138, "loss/crossentropy": 2.7414956092834473, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15996047109365463, "step": 20474 }, { "epoch": 0.639875, "grad_norm": 3.59375, "grad_norm_var": 0.1031890869140625, "learning_rate": 0.0001, "loss": 6.0073, "loss/crossentropy": 2.7020500898361206, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17544400691986084, "step": 20476 }, { "epoch": 0.6399375, "grad_norm": 2.953125, "grad_norm_var": 0.1067047119140625, "learning_rate": 0.0001, "loss": 5.6185, "loss/crossentropy": 2.5471861362457275, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15986835211515427, "step": 20478 }, { "epoch": 0.64, "grad_norm": 3.15625, "grad_norm_var": 0.10097249348958333, "learning_rate": 0.0001, "loss": 5.844, "loss/crossentropy": 2.666524887084961, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.170479454100132, "step": 20480 }, { "epoch": 0.6400625, "grad_norm": 3.65625, "grad_norm_var": 0.11204020182291667, "learning_rate": 0.0001, "loss": 5.9971, "loss/crossentropy": 2.670848250389099, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1771557331085205, "step": 20482 }, { "epoch": 0.640125, "grad_norm": 3.125, "grad_norm_var": 0.100830078125, "learning_rate": 0.0001, "loss": 5.4289, "loss/crossentropy": 2.423728346824646, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15793531388044357, "step": 20484 }, { "epoch": 0.6401875, "grad_norm": 3.09375, "grad_norm_var": 0.10591532389322916, "learning_rate": 0.0001, "loss": 5.6005, "loss/crossentropy": 2.5539534091949463, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15778449177742004, "step": 20486 }, { "epoch": 0.64025, "grad_norm": 3.21875, "grad_norm_var": 0.10319722493489583, "learning_rate": 0.0001, "loss": 5.6379, "loss/crossentropy": 2.4964656829833984, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16765498369932175, "step": 20488 }, { "epoch": 0.6403125, "grad_norm": 3.15625, "grad_norm_var": 0.09967041015625, "learning_rate": 0.0001, "loss": 5.8429, "loss/crossentropy": 2.6325289011001587, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17259898036718369, "step": 20490 }, { "epoch": 0.640375, "grad_norm": 3.09375, "grad_norm_var": 0.03677978515625, "learning_rate": 0.0001, "loss": 6.0761, "loss/crossentropy": 2.8093377351760864, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17667745053768158, "step": 20492 }, { "epoch": 0.6404375, "grad_norm": 3.0625, "grad_norm_var": 0.03414306640625, "learning_rate": 0.0001, "loss": 5.988, "loss/crossentropy": 2.711090922355652, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17847144603729248, "step": 20494 }, { "epoch": 0.6405, "grad_norm": 3.71875, "grad_norm_var": 0.05367431640625, "learning_rate": 0.0001, "loss": 5.4906, "loss/crossentropy": 2.3758429288864136, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.161868117749691, "step": 20496 }, { "epoch": 0.6405625, "grad_norm": 3.125, "grad_norm_var": 0.03181864420572917, "learning_rate": 0.0001, "loss": 5.3742, "loss/crossentropy": 2.3135257959365845, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15997402369976044, "step": 20498 }, { "epoch": 0.640625, "grad_norm": 3.59375, "grad_norm_var": 0.039078776041666666, "learning_rate": 0.0001, "loss": 5.4782, "loss/crossentropy": 2.4464190006256104, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15825734287500381, "step": 20500 }, { "epoch": 0.6406875, "grad_norm": 3.203125, "grad_norm_var": 0.04088134765625, "learning_rate": 0.0001, "loss": 5.2256, "loss/crossentropy": 2.296721577644348, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.14952635765075684, "step": 20502 }, { "epoch": 0.64075, "grad_norm": 2.921875, "grad_norm_var": 0.04540608723958333, "learning_rate": 0.0001, "loss": 5.4201, "loss/crossentropy": 2.388209342956543, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15748750418424606, "step": 20504 }, { "epoch": 0.6408125, "grad_norm": 2.984375, "grad_norm_var": 0.050568644205729166, "learning_rate": 0.0001, "loss": 5.5847, "loss/crossentropy": 2.496085286140442, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16120856255292892, "step": 20506 }, { "epoch": 0.640875, "grad_norm": 3.296875, "grad_norm_var": 0.052277628580729166, "learning_rate": 0.0001, "loss": 5.664, "loss/crossentropy": 2.503101348876953, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16531158983707428, "step": 20508 }, { "epoch": 0.6409375, "grad_norm": 2.828125, "grad_norm_var": 0.06347249348958334, "learning_rate": 0.0001, "loss": 5.6787, "loss/crossentropy": 2.5898317098617554, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16005895286798477, "step": 20510 }, { "epoch": 0.641, "grad_norm": 2.875, "grad_norm_var": 0.04778238932291667, "learning_rate": 0.0001, "loss": 5.8977, "loss/crossentropy": 2.720097303390503, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.17361589521169662, "step": 20512 }, { "epoch": 0.6410625, "grad_norm": 3.21875, "grad_norm_var": 0.0487945556640625, "learning_rate": 0.0001, "loss": 5.8166, "loss/crossentropy": 2.6614229679107666, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16708486527204514, "step": 20514 }, { "epoch": 0.641125, "grad_norm": 3.1875, "grad_norm_var": 0.03380533854166667, "learning_rate": 0.0001, "loss": 5.6232, "loss/crossentropy": 2.4592409133911133, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16991379112005234, "step": 20516 }, { "epoch": 0.6411875, "grad_norm": 3.15625, "grad_norm_var": 0.031168619791666668, "learning_rate": 0.0001, "loss": 5.7806, "loss/crossentropy": 2.6537065505981445, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1669885218143463, "step": 20518 }, { "epoch": 0.64125, "grad_norm": 3.125, "grad_norm_var": 0.028563435872395834, "learning_rate": 0.0001, "loss": 6.0185, "loss/crossentropy": 2.7539583444595337, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17606395483016968, "step": 20520 }, { "epoch": 0.6413125, "grad_norm": 2.921875, "grad_norm_var": 0.025484212239583335, "learning_rate": 0.0001, "loss": 5.5576, "loss/crossentropy": 2.5074567794799805, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16125962138175964, "step": 20522 }, { "epoch": 0.641375, "grad_norm": 3.171875, "grad_norm_var": 0.025389607747395834, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.631861090660095, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16504361480474472, "step": 20524 }, { "epoch": 0.6414375, "grad_norm": 4.21875, "grad_norm_var": 0.09474283854166667, "learning_rate": 0.0001, "loss": 5.8544, "loss/crossentropy": 2.5563753843307495, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17746364325284958, "step": 20526 }, { "epoch": 0.6415, "grad_norm": 3.46875, "grad_norm_var": 0.08902587890625, "learning_rate": 0.0001, "loss": 5.7431, "loss/crossentropy": 2.5309959650039673, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16886798292398453, "step": 20528 }, { "epoch": 0.6415625, "grad_norm": 2.75, "grad_norm_var": 0.10445556640625, "learning_rate": 0.0001, "loss": 5.6452, "loss/crossentropy": 2.6100722551345825, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15975908190011978, "step": 20530 }, { "epoch": 0.641625, "grad_norm": 3.078125, "grad_norm_var": 0.10625, "learning_rate": 0.0001, "loss": 5.6942, "loss/crossentropy": 2.5716564655303955, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16342197358608246, "step": 20532 }, { "epoch": 0.6416875, "grad_norm": 3.375, "grad_norm_var": 0.10946858723958333, "learning_rate": 0.0001, "loss": 5.7065, "loss/crossentropy": 2.552822470664978, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1688838005065918, "step": 20534 }, { "epoch": 0.64175, "grad_norm": 3.125, "grad_norm_var": 0.11346028645833334, "learning_rate": 0.0001, "loss": 5.8805, "loss/crossentropy": 2.661011815071106, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17312116920948029, "step": 20536 }, { "epoch": 0.6418125, "grad_norm": 3.25, "grad_norm_var": 0.10871988932291667, "learning_rate": 0.0001, "loss": 5.8692, "loss/crossentropy": 2.5903857946395874, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1778780147433281, "step": 20538 }, { "epoch": 0.641875, "grad_norm": 3.40625, "grad_norm_var": 0.10724283854166666, "learning_rate": 0.0001, "loss": 5.8436, "loss/crossentropy": 2.5993826389312744, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1732523962855339, "step": 20540 }, { "epoch": 0.6419375, "grad_norm": 3.203125, "grad_norm_var": 0.0626373291015625, "learning_rate": 0.0001, "loss": 5.9778, "loss/crossentropy": 2.727904796600342, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17498829215765, "step": 20542 }, { "epoch": 0.642, "grad_norm": 3.078125, "grad_norm_var": 0.0645416259765625, "learning_rate": 0.0001, "loss": 5.6195, "loss/crossentropy": 2.527278184890747, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16507700830698013, "step": 20544 }, { "epoch": 0.6420625, "grad_norm": 3.109375, "grad_norm_var": 0.0537017822265625, "learning_rate": 0.0001, "loss": 5.6138, "loss/crossentropy": 2.535588502883911, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16251179575920105, "step": 20546 }, { "epoch": 0.642125, "grad_norm": 3.140625, "grad_norm_var": 0.05260009765625, "learning_rate": 0.0001, "loss": 5.7288, "loss/crossentropy": 2.600610852241516, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16515814512968063, "step": 20548 }, { "epoch": 0.6421875, "grad_norm": 3.109375, "grad_norm_var": 0.05413004557291667, "learning_rate": 0.0001, "loss": 5.6988, "loss/crossentropy": 2.5278310775756836, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16435878723859787, "step": 20550 }, { "epoch": 0.64225, "grad_norm": 2.953125, "grad_norm_var": 0.05419514973958333, "learning_rate": 0.0001, "loss": 5.8973, "loss/crossentropy": 2.7441052198410034, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16727203875780106, "step": 20552 }, { "epoch": 0.6423125, "grad_norm": 2.890625, "grad_norm_var": 0.05361328125, "learning_rate": 0.0001, "loss": 5.7992, "loss/crossentropy": 2.6445915699005127, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16741015017032623, "step": 20554 }, { "epoch": 0.642375, "grad_norm": 2.96875, "grad_norm_var": 0.06968485514322917, "learning_rate": 0.0001, "loss": 5.2992, "loss/crossentropy": 2.3941731452941895, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.14831183850765228, "step": 20556 }, { "epoch": 0.6424375, "grad_norm": 2.90625, "grad_norm_var": 0.03802083333333333, "learning_rate": 0.0001, "loss": 5.497, "loss/crossentropy": 2.4603993892669678, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1599094420671463, "step": 20558 }, { "epoch": 0.6425, "grad_norm": 3.03125, "grad_norm_var": 0.03727925618489583, "learning_rate": 0.0001, "loss": 5.4223, "loss/crossentropy": 2.4426238536834717, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1495300829410553, "step": 20560 }, { "epoch": 0.6425625, "grad_norm": 3.015625, "grad_norm_var": 0.041276041666666666, "learning_rate": 0.0001, "loss": 5.5665, "loss/crossentropy": 2.448686718940735, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16412575542926788, "step": 20562 }, { "epoch": 0.642625, "grad_norm": 3.484375, "grad_norm_var": 0.05435791015625, "learning_rate": 0.0001, "loss": 5.9063, "loss/crossentropy": 2.6836531162261963, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17069945484399796, "step": 20564 }, { "epoch": 0.6426875, "grad_norm": 3.15625, "grad_norm_var": 0.0501861572265625, "learning_rate": 0.0001, "loss": 5.6015, "loss/crossentropy": 2.438377857208252, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16982437670230865, "step": 20566 }, { "epoch": 0.64275, "grad_norm": 3.125, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 5.6914, "loss/crossentropy": 2.5879935026168823, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16424890607595444, "step": 20568 }, { "epoch": 0.6428125, "grad_norm": 3.28125, "grad_norm_var": 0.05414937337239583, "learning_rate": 0.0001, "loss": 6.0771, "loss/crossentropy": 2.7591071128845215, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.18413875252008438, "step": 20570 }, { "epoch": 0.642875, "grad_norm": 3.171875, "grad_norm_var": 0.033234659830729166, "learning_rate": 0.0001, "loss": 5.6281, "loss/crossentropy": 2.5197192430496216, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16435106843709946, "step": 20572 }, { "epoch": 0.6429375, "grad_norm": 3.0, "grad_norm_var": 0.02867431640625, "learning_rate": 0.0001, "loss": 5.4993, "loss/crossentropy": 2.4176703691482544, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16245892643928528, "step": 20574 }, { "epoch": 0.643, "grad_norm": 4.03125, "grad_norm_var": 0.06992899576822917, "learning_rate": 0.0001, "loss": 5.7244, "loss/crossentropy": 2.5312092304229736, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.16501778364181519, "step": 20576 }, { "epoch": 0.6430625, "grad_norm": 4.125, "grad_norm_var": 0.11340738932291666, "learning_rate": 0.0001, "loss": 5.7225, "loss/crossentropy": 2.5450315475463867, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16696663200855255, "step": 20578 }, { "epoch": 0.643125, "grad_norm": 3.078125, "grad_norm_var": 0.11417643229166667, "learning_rate": 0.0001, "loss": 5.721, "loss/crossentropy": 2.5756813287734985, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1664893478155136, "step": 20580 }, { "epoch": 0.6431875, "grad_norm": 3.1875, "grad_norm_var": 0.11578369140625, "learning_rate": 0.0001, "loss": 5.5566, "loss/crossentropy": 2.5012201070785522, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1606113612651825, "step": 20582 }, { "epoch": 0.64325, "grad_norm": 3.265625, "grad_norm_var": 0.1085601806640625, "learning_rate": 0.0001, "loss": 5.8301, "loss/crossentropy": 2.604956269264221, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17173685878515244, "step": 20584 }, { "epoch": 0.6433125, "grad_norm": 3.359375, "grad_norm_var": 0.11462300618489583, "learning_rate": 0.0001, "loss": 6.0687, "loss/crossentropy": 2.6878198385238647, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18378955870866776, "step": 20586 }, { "epoch": 0.643375, "grad_norm": 3.296875, "grad_norm_var": 0.10331929524739583, "learning_rate": 0.0001, "loss": 5.8921, "loss/crossentropy": 2.6400766372680664, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17637501657009125, "step": 20588 }, { "epoch": 0.6434375, "grad_norm": 3.34375, "grad_norm_var": 0.1000396728515625, "learning_rate": 0.0001, "loss": 5.7808, "loss/crossentropy": 2.6643242835998535, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1628221496939659, "step": 20590 }, { "epoch": 0.6435, "grad_norm": 3.5, "grad_norm_var": 0.07281494140625, "learning_rate": 0.0001, "loss": 5.6668, "loss/crossentropy": 2.597736358642578, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15651683509349823, "step": 20592 }, { "epoch": 0.6435625, "grad_norm": 2.890625, "grad_norm_var": 0.03601786295572917, "learning_rate": 0.0001, "loss": 5.4977, "loss/crossentropy": 2.4329657554626465, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15764636546373367, "step": 20594 }, { "epoch": 0.643625, "grad_norm": 3.109375, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 5.8102, "loss/crossentropy": 2.579249858856201, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17192527651786804, "step": 20596 }, { "epoch": 0.6436875, "grad_norm": 3.1875, "grad_norm_var": 0.04241129557291667, "learning_rate": 0.0001, "loss": 5.8364, "loss/crossentropy": 2.60555100440979, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1719166487455368, "step": 20598 }, { "epoch": 0.64375, "grad_norm": 3.359375, "grad_norm_var": 0.04322509765625, "learning_rate": 0.0001, "loss": 5.4199, "loss/crossentropy": 2.4294326305389404, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15607880055904388, "step": 20600 }, { "epoch": 0.6438125, "grad_norm": 3.234375, "grad_norm_var": 0.0437896728515625, "learning_rate": 0.0001, "loss": 5.6764, "loss/crossentropy": 2.5371029376983643, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16979029774665833, "step": 20602 }, { "epoch": 0.643875, "grad_norm": 3.15625, "grad_norm_var": 0.04836832682291667, "learning_rate": 0.0001, "loss": 5.6754, "loss/crossentropy": 2.4644941091537476, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17147650569677353, "step": 20604 }, { "epoch": 0.6439375, "grad_norm": 3.0625, "grad_norm_var": 0.051904296875, "learning_rate": 0.0001, "loss": 5.7433, "loss/crossentropy": 2.5431348085403442, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17352993786334991, "step": 20606 }, { "epoch": 0.644, "grad_norm": 2.796875, "grad_norm_var": 0.059626261393229164, "learning_rate": 0.0001, "loss": 5.6671, "loss/crossentropy": 2.5326255559921265, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16500743478536606, "step": 20608 }, { "epoch": 0.6440625, "grad_norm": 3.171875, "grad_norm_var": 0.06204020182291667, "learning_rate": 0.0001, "loss": 5.5522, "loss/crossentropy": 2.481705665588379, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15900429338216782, "step": 20610 }, { "epoch": 0.644125, "grad_norm": 3.171875, "grad_norm_var": 0.062189737955729164, "learning_rate": 0.0001, "loss": 5.4313, "loss/crossentropy": 2.461795449256897, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.154764823615551, "step": 20612 }, { "epoch": 0.6441875, "grad_norm": 3.078125, "grad_norm_var": 0.06052958170572917, "learning_rate": 0.0001, "loss": 5.7733, "loss/crossentropy": 2.6286559104919434, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16837041825056076, "step": 20614 }, { "epoch": 0.64425, "grad_norm": 3.015625, "grad_norm_var": 0.06249593098958333, "learning_rate": 0.0001, "loss": 5.6556, "loss/crossentropy": 2.5903440713882446, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16434288769960403, "step": 20616 }, { "epoch": 0.6443125, "grad_norm": 3.0625, "grad_norm_var": 0.056151326497395834, "learning_rate": 0.0001, "loss": 5.8679, "loss/crossentropy": 2.6790761947631836, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16771499812602997, "step": 20618 }, { "epoch": 0.644375, "grad_norm": 3.09375, "grad_norm_var": 0.047362263997395834, "learning_rate": 0.0001, "loss": 5.7163, "loss/crossentropy": 2.5803322792053223, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16398627310991287, "step": 20620 }, { "epoch": 0.6444375, "grad_norm": 5.75, "grad_norm_var": 0.48156636555989585, "learning_rate": 0.0001, "loss": 6.2044, "loss/crossentropy": 2.9062217473983765, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1802084892988205, "step": 20622 }, { "epoch": 0.6445, "grad_norm": 2.875, "grad_norm_var": 0.4784820556640625, "learning_rate": 0.0001, "loss": 5.751, "loss/crossentropy": 2.6165133714675903, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16774094849824905, "step": 20624 }, { "epoch": 0.6445625, "grad_norm": 3.046875, "grad_norm_var": 0.48323160807291665, "learning_rate": 0.0001, "loss": 5.5445, "loss/crossentropy": 2.511154890060425, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16153790056705475, "step": 20626 }, { "epoch": 0.644625, "grad_norm": 2.96875, "grad_norm_var": 0.4850819905598958, "learning_rate": 0.0001, "loss": 5.9264, "loss/crossentropy": 2.789646625518799, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16758083552122116, "step": 20628 }, { "epoch": 0.6446875, "grad_norm": 3.171875, "grad_norm_var": 0.4852203369140625, "learning_rate": 0.0001, "loss": 5.5333, "loss/crossentropy": 2.466044545173645, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1614159345626831, "step": 20630 }, { "epoch": 0.64475, "grad_norm": 3.765625, "grad_norm_var": 0.5007151285807292, "learning_rate": 0.0001, "loss": 5.4144, "loss/crossentropy": 2.3553906679153442, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15824922919273376, "step": 20632 }, { "epoch": 0.6448125, "grad_norm": 3.203125, "grad_norm_var": 0.49524332682291666, "learning_rate": 0.0001, "loss": 5.7369, "loss/crossentropy": 2.565440535545349, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17065993696451187, "step": 20634 }, { "epoch": 0.644875, "grad_norm": 3.453125, "grad_norm_var": 0.49905598958333336, "learning_rate": 0.0001, "loss": 6.0343, "loss/crossentropy": 2.700536012649536, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18142636120319366, "step": 20636 }, { "epoch": 0.6449375, "grad_norm": 3.3125, "grad_norm_var": 0.06806233723958334, "learning_rate": 0.0001, "loss": 5.7073, "loss/crossentropy": 2.539323091506958, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17226801067590714, "step": 20638 }, { "epoch": 0.645, "grad_norm": 3.296875, "grad_norm_var": 0.06585286458333334, "learning_rate": 0.0001, "loss": 5.7115, "loss/crossentropy": 2.510597586631775, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1681392937898636, "step": 20640 }, { "epoch": 0.6450625, "grad_norm": 3.171875, "grad_norm_var": 0.055985514322916666, "learning_rate": 0.0001, "loss": 5.7717, "loss/crossentropy": 2.5721393823623657, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1703510284423828, "step": 20642 }, { "epoch": 0.645125, "grad_norm": 3.09375, "grad_norm_var": 0.05738016764322917, "learning_rate": 0.0001, "loss": 5.4939, "loss/crossentropy": 2.4927202463150024, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15676119178533554, "step": 20644 }, { "epoch": 0.6451875, "grad_norm": 3.265625, "grad_norm_var": 0.05315348307291667, "learning_rate": 0.0001, "loss": 5.6851, "loss/crossentropy": 2.594095230102539, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16261142492294312, "step": 20646 }, { "epoch": 0.64525, "grad_norm": 2.96875, "grad_norm_var": 0.029130045572916666, "learning_rate": 0.0001, "loss": 5.634, "loss/crossentropy": 2.4565051794052124, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16891717910766602, "step": 20648 }, { "epoch": 0.6453125, "grad_norm": 3.046875, "grad_norm_var": 0.029195149739583332, "learning_rate": 0.0001, "loss": 5.451, "loss/crossentropy": 2.455316662788391, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1589415967464447, "step": 20650 }, { "epoch": 0.645375, "grad_norm": 3.03125, "grad_norm_var": 0.021654256184895835, "learning_rate": 0.0001, "loss": 6.0149, "loss/crossentropy": 2.8132808208465576, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17406649887561798, "step": 20652 }, { "epoch": 0.6454375, "grad_norm": 3.21875, "grad_norm_var": 0.04731343587239583, "learning_rate": 0.0001, "loss": 5.8153, "loss/crossentropy": 2.5763078927993774, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17506926506757736, "step": 20654 }, { "epoch": 0.6455, "grad_norm": 3.140625, "grad_norm_var": 0.04488525390625, "learning_rate": 0.0001, "loss": 5.5542, "loss/crossentropy": 2.451761245727539, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16570903360843658, "step": 20656 }, { "epoch": 0.6455625, "grad_norm": 2.953125, "grad_norm_var": 0.04789937337239583, "learning_rate": 0.0001, "loss": 5.5099, "loss/crossentropy": 2.5485183000564575, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15199314057826996, "step": 20658 }, { "epoch": 0.645625, "grad_norm": 3.0625, "grad_norm_var": 0.047053019205729164, "learning_rate": 0.0001, "loss": 5.8736, "loss/crossentropy": 2.657061815261841, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17282658070325851, "step": 20660 }, { "epoch": 0.6456875, "grad_norm": 3.296875, "grad_norm_var": 0.046708170572916666, "learning_rate": 0.0001, "loss": 5.5583, "loss/crossentropy": 2.428104043006897, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1657543182373047, "step": 20662 }, { "epoch": 0.64575, "grad_norm": 3.171875, "grad_norm_var": 0.08806864420572917, "learning_rate": 0.0001, "loss": 5.6182, "loss/crossentropy": 2.4839119911193848, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16264984756708145, "step": 20664 }, { "epoch": 0.6458125, "grad_norm": 3.078125, "grad_norm_var": 0.2552571614583333, "learning_rate": 0.0001, "loss": 5.525, "loss/crossentropy": 2.4292737245559692, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16465438157320023, "step": 20666 }, { "epoch": 0.645875, "grad_norm": 2.9375, "grad_norm_var": 0.2591623942057292, "learning_rate": 0.0001, "loss": 5.5494, "loss/crossentropy": 2.532724976539612, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1575249433517456, "step": 20668 }, { "epoch": 0.6459375, "grad_norm": 3.8125, "grad_norm_var": 0.27838541666666666, "learning_rate": 0.0001, "loss": 5.5343, "loss/crossentropy": 2.400562286376953, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16376598179340363, "step": 20670 }, { "epoch": 0.646, "grad_norm": 2.859375, "grad_norm_var": 0.2901529947916667, "learning_rate": 0.0001, "loss": 5.2208, "loss/crossentropy": 2.2743862867355347, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15167373418807983, "step": 20672 }, { "epoch": 0.6460625, "grad_norm": 2.984375, "grad_norm_var": 0.2821248372395833, "learning_rate": 0.0001, "loss": 5.4874, "loss/crossentropy": 2.4957116842269897, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1565897911787033, "step": 20674 }, { "epoch": 0.646125, "grad_norm": 3.171875, "grad_norm_var": 0.28769124348958336, "learning_rate": 0.0001, "loss": 5.7027, "loss/crossentropy": 2.582889437675476, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16900938749313354, "step": 20676 }, { "epoch": 0.6461875, "grad_norm": 2.9375, "grad_norm_var": 0.29375, "learning_rate": 0.0001, "loss": 5.629, "loss/crossentropy": 2.523389458656311, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16173437237739563, "step": 20678 }, { "epoch": 0.64625, "grad_norm": 3.28125, "grad_norm_var": 0.25565999348958335, "learning_rate": 0.0001, "loss": 5.7128, "loss/crossentropy": 2.5503554344177246, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1674199253320694, "step": 20680 }, { "epoch": 0.6463125, "grad_norm": 3.03125, "grad_norm_var": 0.053955078125, "learning_rate": 0.0001, "loss": 5.697, "loss/crossentropy": 2.5437591075897217, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16844742000102997, "step": 20682 }, { "epoch": 0.646375, "grad_norm": 3.390625, "grad_norm_var": 0.05681966145833333, "learning_rate": 0.0001, "loss": 5.8445, "loss/crossentropy": 2.6223974227905273, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17065011709928513, "step": 20684 }, { "epoch": 0.6464375, "grad_norm": 2.984375, "grad_norm_var": 0.03388671875, "learning_rate": 0.0001, "loss": 5.9883, "loss/crossentropy": 2.740772008895874, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17592400312423706, "step": 20686 }, { "epoch": 0.6465, "grad_norm": 3.1875, "grad_norm_var": 0.0319488525390625, "learning_rate": 0.0001, "loss": 5.8538, "loss/crossentropy": 2.614986300468445, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1750514805316925, "step": 20688 }, { "epoch": 0.6465625, "grad_norm": 3.15625, "grad_norm_var": 0.04364827473958333, "learning_rate": 0.0001, "loss": 6.1904, "loss/crossentropy": 2.8218883275985718, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.18294403702020645, "step": 20690 }, { "epoch": 0.646625, "grad_norm": 3.28125, "grad_norm_var": 0.041779581705729166, "learning_rate": 0.0001, "loss": 5.6118, "loss/crossentropy": 2.48758864402771, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16672097146511078, "step": 20692 }, { "epoch": 0.6466875, "grad_norm": 3.28125, "grad_norm_var": 0.03966471354166667, "learning_rate": 0.0001, "loss": 5.8225, "loss/crossentropy": 2.6528961658477783, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1700805202126503, "step": 20694 }, { "epoch": 0.64675, "grad_norm": 3.15625, "grad_norm_var": 0.05286051432291667, "learning_rate": 0.0001, "loss": 5.5818, "loss/crossentropy": 2.5731911659240723, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15671736001968384, "step": 20696 }, { "epoch": 0.6468125, "grad_norm": 3.0625, "grad_norm_var": 0.052057902018229164, "learning_rate": 0.0001, "loss": 5.8673, "loss/crossentropy": 2.653502941131592, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17294374108314514, "step": 20698 }, { "epoch": 0.646875, "grad_norm": 2.984375, "grad_norm_var": 0.05219624837239583, "learning_rate": 0.0001, "loss": 5.3281, "loss/crossentropy": 2.2781084775924683, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16203365474939346, "step": 20700 }, { "epoch": 0.6469375, "grad_norm": 3.265625, "grad_norm_var": 0.039525349934895836, "learning_rate": 0.0001, "loss": 5.8599, "loss/crossentropy": 2.6676172018051147, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1715681254863739, "step": 20702 }, { "epoch": 0.647, "grad_norm": 3.015625, "grad_norm_var": 0.036742146809895834, "learning_rate": 0.0001, "loss": 5.6922, "loss/crossentropy": 2.5569517612457275, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16743438690900803, "step": 20704 }, { "epoch": 0.6470625, "grad_norm": 2.953125, "grad_norm_var": 0.018550618489583334, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.641975998878479, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.17145609855651855, "step": 20706 }, { "epoch": 0.647125, "grad_norm": 2.90625, "grad_norm_var": 0.018001302083333334, "learning_rate": 0.0001, "loss": 5.5241, "loss/crossentropy": 2.5126917362213135, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15387942641973495, "step": 20708 }, { "epoch": 0.6471875, "grad_norm": 2.734375, "grad_norm_var": 0.0216461181640625, "learning_rate": 0.0001, "loss": 5.4066, "loss/crossentropy": 2.474587321281433, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14944685995578766, "step": 20710 }, { "epoch": 0.64725, "grad_norm": 3.0, "grad_norm_var": 0.01705322265625, "learning_rate": 0.0001, "loss": 5.8376, "loss/crossentropy": 2.6972298622131348, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1648198664188385, "step": 20712 }, { "epoch": 0.6473125, "grad_norm": 2.921875, "grad_norm_var": 0.0204010009765625, "learning_rate": 0.0001, "loss": 5.6284, "loss/crossentropy": 2.530660629272461, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16329185664653778, "step": 20714 }, { "epoch": 0.647375, "grad_norm": 3.359375, "grad_norm_var": 0.03780924479166667, "learning_rate": 0.0001, "loss": 5.8959, "loss/crossentropy": 2.6485893726348877, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17668356746435165, "step": 20716 }, { "epoch": 0.6474375, "grad_norm": 2.890625, "grad_norm_var": 0.03723958333333333, "learning_rate": 0.0001, "loss": 5.5549, "loss/crossentropy": 2.5403462648391724, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15771017968654633, "step": 20718 }, { "epoch": 0.6475, "grad_norm": 3.59375, "grad_norm_var": 0.0558013916015625, "learning_rate": 0.0001, "loss": 5.9226, "loss/crossentropy": 2.650268077850342, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17723120003938675, "step": 20720 }, { "epoch": 0.6475625, "grad_norm": 7.3125, "grad_norm_var": 1.1860097249348958, "learning_rate": 0.0001, "loss": 5.3982, "loss/crossentropy": 2.3582738637924194, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15984924882650375, "step": 20722 }, { "epoch": 0.647625, "grad_norm": 2.90625, "grad_norm_var": 1.1778391520182292, "learning_rate": 0.0001, "loss": 5.6724, "loss/crossentropy": 2.561447024345398, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16265869140625, "step": 20724 }, { "epoch": 0.6476875, "grad_norm": 2.984375, "grad_norm_var": 1.1653798421223958, "learning_rate": 0.0001, "loss": 5.4925, "loss/crossentropy": 2.5229510068893433, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15633327513933182, "step": 20726 }, { "epoch": 0.64775, "grad_norm": 3.359375, "grad_norm_var": 1.1530232747395834, "learning_rate": 0.0001, "loss": 5.5759, "loss/crossentropy": 2.5082759857177734, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1579337641596794, "step": 20728 }, { "epoch": 0.6478125, "grad_norm": 3.203125, "grad_norm_var": 1.1388671875, "learning_rate": 0.0001, "loss": 5.7815, "loss/crossentropy": 2.631260395050049, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16736982762813568, "step": 20730 }, { "epoch": 0.647875, "grad_norm": 3.109375, "grad_norm_var": 1.1561024983723958, "learning_rate": 0.0001, "loss": 5.3087, "loss/crossentropy": 2.279842734336853, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.158743254840374, "step": 20732 }, { "epoch": 0.6479375, "grad_norm": 3.296875, "grad_norm_var": 1.132941691080729, "learning_rate": 0.0001, "loss": 5.8052, "loss/crossentropy": 2.63310444355011, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17072393000125885, "step": 20734 }, { "epoch": 0.648, "grad_norm": 3.015625, "grad_norm_var": 1.13355712890625, "learning_rate": 0.0001, "loss": 5.8553, "loss/crossentropy": 2.7163825035095215, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1662379875779152, "step": 20736 }, { "epoch": 0.6480625, "grad_norm": 3.21875, "grad_norm_var": 0.030663045247395833, "learning_rate": 0.0001, "loss": 5.5855, "loss/crossentropy": 2.453284978866577, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1632222756743431, "step": 20738 }, { "epoch": 0.648125, "grad_norm": 3.4375, "grad_norm_var": 0.03299153645833333, "learning_rate": 0.0001, "loss": 5.5864, "loss/crossentropy": 2.5098971128463745, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16116367280483246, "step": 20740 }, { "epoch": 0.6481875, "grad_norm": 3.25, "grad_norm_var": 0.024006144205729166, "learning_rate": 0.0001, "loss": 5.5355, "loss/crossentropy": 2.530406355857849, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15441171824932098, "step": 20742 }, { "epoch": 0.64825, "grad_norm": 3.09375, "grad_norm_var": 0.0184234619140625, "learning_rate": 0.0001, "loss": 5.6487, "loss/crossentropy": 2.559556484222412, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16555877029895782, "step": 20744 }, { "epoch": 0.6483125, "grad_norm": 3.140625, "grad_norm_var": 0.018928019205729167, "learning_rate": 0.0001, "loss": 5.6332, "loss/crossentropy": 2.507178783416748, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1669030785560608, "step": 20746 }, { "epoch": 0.648375, "grad_norm": 3.46875, "grad_norm_var": 0.02216796875, "learning_rate": 0.0001, "loss": 5.7268, "loss/crossentropy": 2.5367095470428467, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16822326183319092, "step": 20748 }, { "epoch": 0.6484375, "grad_norm": 3.390625, "grad_norm_var": 0.03330078125, "learning_rate": 0.0001, "loss": 5.7544, "loss/crossentropy": 2.697374939918518, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1588297113776207, "step": 20750 }, { "epoch": 0.6485, "grad_norm": 3.25, "grad_norm_var": 0.031371053059895834, "learning_rate": 0.0001, "loss": 5.7108, "loss/crossentropy": 2.590539574623108, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16476020216941833, "step": 20752 }, { "epoch": 0.6485625, "grad_norm": 3.390625, "grad_norm_var": 0.036742146809895834, "learning_rate": 0.0001, "loss": 5.6735, "loss/crossentropy": 2.5535773038864136, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1639494150876999, "step": 20754 }, { "epoch": 0.648625, "grad_norm": 2.96875, "grad_norm_var": 0.0344879150390625, "learning_rate": 0.0001, "loss": 5.6174, "loss/crossentropy": 2.572333812713623, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1568552851676941, "step": 20756 }, { "epoch": 0.6486875, "grad_norm": 2.953125, "grad_norm_var": 0.034375, "learning_rate": 0.0001, "loss": 5.7969, "loss/crossentropy": 2.663484573364258, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16490188986063004, "step": 20758 }, { "epoch": 0.64875, "grad_norm": 3.328125, "grad_norm_var": 0.0371734619140625, "learning_rate": 0.0001, "loss": 5.9749, "loss/crossentropy": 2.6919151544570923, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17673945426940918, "step": 20760 }, { "epoch": 0.6488125, "grad_norm": 2.90625, "grad_norm_var": 0.04039306640625, "learning_rate": 0.0001, "loss": 5.5585, "loss/crossentropy": 2.5666253566741943, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1554359495639801, "step": 20762 }, { "epoch": 0.648875, "grad_norm": 2.9375, "grad_norm_var": 0.03527018229166667, "learning_rate": 0.0001, "loss": 5.6896, "loss/crossentropy": 2.591607093811035, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1644899696111679, "step": 20764 }, { "epoch": 0.6489375, "grad_norm": 3.109375, "grad_norm_var": 0.024593098958333334, "learning_rate": 0.0001, "loss": 5.5727, "loss/crossentropy": 2.4673659801483154, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16053176671266556, "step": 20766 }, { "epoch": 0.649, "grad_norm": 3.234375, "grad_norm_var": 0.02509765625, "learning_rate": 0.0001, "loss": 5.4418, "loss/crossentropy": 2.364788770675659, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16004639863967896, "step": 20768 }, { "epoch": 0.6490625, "grad_norm": 3.1875, "grad_norm_var": 0.019596354166666666, "learning_rate": 0.0001, "loss": 5.9013, "loss/crossentropy": 2.7390103340148926, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17091257125139236, "step": 20770 }, { "epoch": 0.649125, "grad_norm": 3.03125, "grad_norm_var": 0.019872029622395832, "learning_rate": 0.0001, "loss": 5.6071, "loss/crossentropy": 2.577046036720276, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16081403940916061, "step": 20772 }, { "epoch": 0.6491875, "grad_norm": 3.140625, "grad_norm_var": 0.018805948893229167, "learning_rate": 0.0001, "loss": 5.4083, "loss/crossentropy": 2.356221079826355, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1602884978055954, "step": 20774 }, { "epoch": 0.64925, "grad_norm": 3.21875, "grad_norm_var": 0.016999308268229166, "learning_rate": 0.0001, "loss": 5.6074, "loss/crossentropy": 2.5159261226654053, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1591450348496437, "step": 20776 }, { "epoch": 0.6493125, "grad_norm": 3.265625, "grad_norm_var": 0.0179107666015625, "learning_rate": 0.0001, "loss": 5.6751, "loss/crossentropy": 2.467453718185425, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17193461954593658, "step": 20778 }, { "epoch": 0.649375, "grad_norm": 3.375, "grad_norm_var": 0.018603515625, "learning_rate": 0.0001, "loss": 5.6841, "loss/crossentropy": 2.5426772832870483, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1692201793193817, "step": 20780 }, { "epoch": 0.6494375, "grad_norm": 2.953125, "grad_norm_var": 0.019820149739583334, "learning_rate": 0.0001, "loss": 5.5734, "loss/crossentropy": 2.5002176761627197, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16356637328863144, "step": 20782 }, { "epoch": 0.6495, "grad_norm": 3.234375, "grad_norm_var": 0.019254557291666665, "learning_rate": 0.0001, "loss": 5.7456, "loss/crossentropy": 2.587036371231079, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.168982595205307, "step": 20784 }, { "epoch": 0.6495625, "grad_norm": 2.875, "grad_norm_var": 0.028351847330729166, "learning_rate": 0.0001, "loss": 5.7281, "loss/crossentropy": 2.5481547117233276, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16994479298591614, "step": 20786 }, { "epoch": 0.649625, "grad_norm": 2.890625, "grad_norm_var": 0.029816691080729166, "learning_rate": 0.0001, "loss": 5.5235, "loss/crossentropy": 2.586545944213867, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15072697401046753, "step": 20788 }, { "epoch": 0.6496875, "grad_norm": 3.0625, "grad_norm_var": 0.036473592122395836, "learning_rate": 0.0001, "loss": 5.3723, "loss/crossentropy": 2.3707480430603027, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15483906120061874, "step": 20790 }, { "epoch": 0.64975, "grad_norm": 3.046875, "grad_norm_var": 0.03330790201822917, "learning_rate": 0.0001, "loss": 5.3547, "loss/crossentropy": 2.3479338884353638, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1541936844587326, "step": 20792 }, { "epoch": 0.6498125, "grad_norm": 3.0625, "grad_norm_var": 0.03268229166666667, "learning_rate": 0.0001, "loss": 5.9684, "loss/crossentropy": 2.7685261964797974, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.174676351249218, "step": 20794 }, { "epoch": 0.649875, "grad_norm": 3.328125, "grad_norm_var": 0.031037394205729166, "learning_rate": 0.0001, "loss": 5.4786, "loss/crossentropy": 2.3692315816879272, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15937399864196777, "step": 20796 }, { "epoch": 0.6499375, "grad_norm": 3.4375, "grad_norm_var": 0.0392974853515625, "learning_rate": 0.0001, "loss": 5.9443, "loss/crossentropy": 2.654773235321045, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17894787341356277, "step": 20798 }, { "epoch": 0.65, "grad_norm": 3.015625, "grad_norm_var": 0.0437408447265625, "learning_rate": 0.0001, "loss": 5.5112, "loss/crossentropy": 2.430831551551819, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16155192255973816, "step": 20800 }, { "epoch": 0.6500625, "grad_norm": 2.9375, "grad_norm_var": 0.0348297119140625, "learning_rate": 0.0001, "loss": 5.5582, "loss/crossentropy": 2.5225579738616943, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1598166674375534, "step": 20802 }, { "epoch": 0.650125, "grad_norm": 3.203125, "grad_norm_var": 0.0316314697265625, "learning_rate": 0.0001, "loss": 5.5031, "loss/crossentropy": 2.4108450412750244, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15883759409189224, "step": 20804 }, { "epoch": 0.6501875, "grad_norm": 3.125, "grad_norm_var": 0.03200581868489583, "learning_rate": 0.0001, "loss": 5.887, "loss/crossentropy": 2.6739065647125244, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17287658154964447, "step": 20806 }, { "epoch": 0.65025, "grad_norm": 3.15625, "grad_norm_var": 0.031087239583333332, "learning_rate": 0.0001, "loss": 5.7727, "loss/crossentropy": 2.559706211090088, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17364367842674255, "step": 20808 }, { "epoch": 0.6503125, "grad_norm": 3.0, "grad_norm_var": 0.03287353515625, "learning_rate": 0.0001, "loss": 5.6713, "loss/crossentropy": 2.5572668313980103, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16492042690515518, "step": 20810 }, { "epoch": 0.650375, "grad_norm": 3.546875, "grad_norm_var": 0.037923177083333336, "learning_rate": 0.0001, "loss": 5.7524, "loss/crossentropy": 2.5259430408477783, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17264336347579956, "step": 20812 }, { "epoch": 0.6504375, "grad_norm": 2.984375, "grad_norm_var": 0.0379791259765625, "learning_rate": 0.0001, "loss": 5.7859, "loss/crossentropy": 2.605214238166809, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17001979798078537, "step": 20814 }, { "epoch": 0.6505, "grad_norm": 3.8125, "grad_norm_var": 0.07247721354166667, "learning_rate": 0.0001, "loss": 5.7527, "loss/crossentropy": 2.5045515298843384, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17012901604175568, "step": 20816 }, { "epoch": 0.6505625, "grad_norm": 3.0, "grad_norm_var": 0.07073160807291666, "learning_rate": 0.0001, "loss": 6.0264, "loss/crossentropy": 2.8550941944122314, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16986792534589767, "step": 20818 }, { "epoch": 0.650625, "grad_norm": 3.125, "grad_norm_var": 0.07301025390625, "learning_rate": 0.0001, "loss": 6.1044, "loss/crossentropy": 2.8505197763442993, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17734120786190033, "step": 20820 }, { "epoch": 0.6506875, "grad_norm": 3.25, "grad_norm_var": 0.06678059895833334, "learning_rate": 0.0001, "loss": 5.798, "loss/crossentropy": 2.6078113317489624, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16863227635622025, "step": 20822 }, { "epoch": 0.65075, "grad_norm": 3.15625, "grad_norm_var": 0.06692606608072917, "learning_rate": 0.0001, "loss": 5.6749, "loss/crossentropy": 2.5907870531082153, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1615317314863205, "step": 20824 }, { "epoch": 0.6508125, "grad_norm": 3.03125, "grad_norm_var": 0.06776936848958333, "learning_rate": 0.0001, "loss": 5.6762, "loss/crossentropy": 2.572179913520813, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1654839813709259, "step": 20826 }, { "epoch": 0.650875, "grad_norm": 2.890625, "grad_norm_var": 0.06816304524739583, "learning_rate": 0.0001, "loss": 5.6502, "loss/crossentropy": 2.5975730419158936, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1599457710981369, "step": 20828 }, { "epoch": 0.6509375, "grad_norm": 3.4375, "grad_norm_var": 0.08326822916666667, "learning_rate": 0.0001, "loss": 5.7692, "loss/crossentropy": 2.684068441390991, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16202397644519806, "step": 20830 }, { "epoch": 0.651, "grad_norm": 3.234375, "grad_norm_var": 0.03372395833333333, "learning_rate": 0.0001, "loss": 5.7825, "loss/crossentropy": 2.6395801305770874, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16819562017917633, "step": 20832 }, { "epoch": 0.6510625, "grad_norm": 2.9375, "grad_norm_var": 0.03487040201822917, "learning_rate": 0.0001, "loss": 5.9598, "loss/crossentropy": 2.7669568061828613, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17397551983594894, "step": 20834 }, { "epoch": 0.651125, "grad_norm": 3.09375, "grad_norm_var": 0.03615620930989583, "learning_rate": 0.0001, "loss": 5.758, "loss/crossentropy": 2.595520853996277, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17015812546014786, "step": 20836 }, { "epoch": 0.6511875, "grad_norm": 3.109375, "grad_norm_var": 0.02965087890625, "learning_rate": 0.0001, "loss": 5.8458, "loss/crossentropy": 2.6007357835769653, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17606915533542633, "step": 20838 }, { "epoch": 0.65125, "grad_norm": 3.078125, "grad_norm_var": 0.028831990559895833, "learning_rate": 0.0001, "loss": 5.5189, "loss/crossentropy": 2.512834906578064, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1560702845454216, "step": 20840 }, { "epoch": 0.6513125, "grad_norm": 2.953125, "grad_norm_var": 0.030305989583333335, "learning_rate": 0.0001, "loss": 5.7302, "loss/crossentropy": 2.6041375398635864, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1649482548236847, "step": 20842 }, { "epoch": 0.651375, "grad_norm": 3.3125, "grad_norm_var": 0.03330078125, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.5407623052597046, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16717741638422012, "step": 20844 }, { "epoch": 0.6514375, "grad_norm": 4.28125, "grad_norm_var": 0.11002197265625, "learning_rate": 0.0001, "loss": 6.0975, "loss/crossentropy": 2.60309374332428, "loss/hidden": 1.578125, "loss/jsd": 0.0, "loss/logits": 0.1916261613368988, "step": 20846 }, { "epoch": 0.6515, "grad_norm": 2.96875, "grad_norm_var": 0.10901285807291666, "learning_rate": 0.0001, "loss": 5.6536, "loss/crossentropy": 2.5152251720428467, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1653994396328926, "step": 20848 }, { "epoch": 0.6515625, "grad_norm": 3.109375, "grad_norm_var": 0.10564676920572917, "learning_rate": 0.0001, "loss": 5.5958, "loss/crossentropy": 2.4631600379943848, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16756527870893478, "step": 20850 }, { "epoch": 0.651625, "grad_norm": 3.15625, "grad_norm_var": 0.10240478515625, "learning_rate": 0.0001, "loss": 5.6409, "loss/crossentropy": 2.5698968172073364, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16140027344226837, "step": 20852 }, { "epoch": 0.6516875, "grad_norm": 3.34375, "grad_norm_var": 0.10437825520833334, "learning_rate": 0.0001, "loss": 5.8536, "loss/crossentropy": 2.6735728979110718, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1726933866739273, "step": 20854 }, { "epoch": 0.65175, "grad_norm": 2.8125, "grad_norm_var": 0.11347554524739584, "learning_rate": 0.0001, "loss": 5.3547, "loss/crossentropy": 2.397574305534363, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15274271368980408, "step": 20856 }, { "epoch": 0.6518125, "grad_norm": 3.3125, "grad_norm_var": 0.10851949055989583, "learning_rate": 0.0001, "loss": 5.766, "loss/crossentropy": 2.6301355361938477, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16710658371448517, "step": 20858 }, { "epoch": 0.651875, "grad_norm": 3.78125, "grad_norm_var": 0.12888997395833332, "learning_rate": 0.0001, "loss": 5.9323, "loss/crossentropy": 2.753417730331421, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16750143468379974, "step": 20860 }, { "epoch": 0.6519375, "grad_norm": 3.1875, "grad_norm_var": 0.04876302083333333, "learning_rate": 0.0001, "loss": 5.6276, "loss/crossentropy": 2.5113483667373657, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16357584297657013, "step": 20862 }, { "epoch": 0.652, "grad_norm": 2.953125, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 5.4786, "loss/crossentropy": 2.478987693786621, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15465336292982101, "step": 20864 }, { "epoch": 0.6520625, "grad_norm": 3.21875, "grad_norm_var": 0.05211181640625, "learning_rate": 0.0001, "loss": 5.8281, "loss/crossentropy": 2.576219320297241, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1744033619761467, "step": 20866 }, { "epoch": 0.652125, "grad_norm": 3.453125, "grad_norm_var": 0.06768290201822917, "learning_rate": 0.0001, "loss": 5.7824, "loss/crossentropy": 2.516385316848755, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.173087440431118, "step": 20868 }, { "epoch": 0.6521875, "grad_norm": 3.34375, "grad_norm_var": 0.06589253743489583, "learning_rate": 0.0001, "loss": 6.0909, "loss/crossentropy": 2.820400357246399, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17743618041276932, "step": 20870 }, { "epoch": 0.65225, "grad_norm": 3.21875, "grad_norm_var": 0.04413655598958333, "learning_rate": 0.0001, "loss": 5.9355, "loss/crossentropy": 2.644858717918396, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1798480600118637, "step": 20872 }, { "epoch": 0.6523125, "grad_norm": 3.125, "grad_norm_var": 0.04957275390625, "learning_rate": 0.0001, "loss": 5.4576, "loss/crossentropy": 2.409852147102356, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1563372015953064, "step": 20874 }, { "epoch": 0.652375, "grad_norm": 3.25, "grad_norm_var": 0.028449503580729167, "learning_rate": 0.0001, "loss": 6.0919, "loss/crossentropy": 2.7479069232940674, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1816682666540146, "step": 20876 }, { "epoch": 0.6524375, "grad_norm": 2.984375, "grad_norm_var": 0.03329671223958333, "learning_rate": 0.0001, "loss": 5.4214, "loss/crossentropy": 2.3958070278167725, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15528929978609085, "step": 20878 }, { "epoch": 0.6525, "grad_norm": 2.875, "grad_norm_var": 0.046337890625, "learning_rate": 0.0001, "loss": 5.4317, "loss/crossentropy": 2.4208797216415405, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15030142664909363, "step": 20880 }, { "epoch": 0.6525625, "grad_norm": 3.203125, "grad_norm_var": 0.04453125, "learning_rate": 0.0001, "loss": 5.71, "loss/crossentropy": 2.512806534767151, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17049754410982132, "step": 20882 }, { "epoch": 0.652625, "grad_norm": 3.984375, "grad_norm_var": 0.0686676025390625, "learning_rate": 0.0001, "loss": 5.8605, "loss/crossentropy": 2.578509211540222, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.17351631820201874, "step": 20884 }, { "epoch": 0.6526875, "grad_norm": 3.015625, "grad_norm_var": 0.0745025634765625, "learning_rate": 0.0001, "loss": 5.3133, "loss/crossentropy": 2.328048348426819, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15360206365585327, "step": 20886 }, { "epoch": 0.65275, "grad_norm": 2.9375, "grad_norm_var": 0.0755035400390625, "learning_rate": 0.0001, "loss": 5.5179, "loss/crossentropy": 2.4790738821029663, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16169846802949905, "step": 20888 }, { "epoch": 0.6528125, "grad_norm": 4.15625, "grad_norm_var": 0.1389556884765625, "learning_rate": 0.0001, "loss": 5.443, "loss/crossentropy": 2.366265058517456, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16352909058332443, "step": 20890 }, { "epoch": 0.652875, "grad_norm": 3.15625, "grad_norm_var": 0.13908589680989583, "learning_rate": 0.0001, "loss": 5.8661, "loss/crossentropy": 2.628006100654602, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1730271428823471, "step": 20892 }, { "epoch": 0.6529375, "grad_norm": 3.28125, "grad_norm_var": 0.14055887858072916, "learning_rate": 0.0001, "loss": 5.6117, "loss/crossentropy": 2.505952835083008, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1636989787220955, "step": 20894 }, { "epoch": 0.653, "grad_norm": 3.171875, "grad_norm_var": 0.12853190104166667, "learning_rate": 0.0001, "loss": 5.8703, "loss/crossentropy": 2.658256769180298, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1751089245080948, "step": 20896 }, { "epoch": 0.6530625, "grad_norm": 3.265625, "grad_norm_var": 0.13076883951822918, "learning_rate": 0.0001, "loss": 5.5952, "loss/crossentropy": 2.4552528858184814, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16360285133123398, "step": 20898 }, { "epoch": 0.653125, "grad_norm": 3.21875, "grad_norm_var": 0.08946024576822917, "learning_rate": 0.0001, "loss": 5.7229, "loss/crossentropy": 2.575530529022217, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17099007219076157, "step": 20900 }, { "epoch": 0.6531875, "grad_norm": 3.03125, "grad_norm_var": 0.08788655598958334, "learning_rate": 0.0001, "loss": 5.6576, "loss/crossentropy": 2.6053110361099243, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.158348947763443, "step": 20902 }, { "epoch": 0.65325, "grad_norm": 2.96875, "grad_norm_var": 0.09016011555989584, "learning_rate": 0.0001, "loss": 5.535, "loss/crossentropy": 2.453185796737671, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16247710585594177, "step": 20904 }, { "epoch": 0.6533125, "grad_norm": 2.921875, "grad_norm_var": 0.027099609375, "learning_rate": 0.0001, "loss": 5.5489, "loss/crossentropy": 2.5232917070388794, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15919943153858185, "step": 20906 }, { "epoch": 0.653375, "grad_norm": 3.21875, "grad_norm_var": 0.029466756184895835, "learning_rate": 0.0001, "loss": 5.4208, "loss/crossentropy": 2.498198390007019, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.1539750099182129, "step": 20908 }, { "epoch": 0.6534375, "grad_norm": 3.015625, "grad_norm_var": 0.03271484375, "learning_rate": 0.0001, "loss": 5.8123, "loss/crossentropy": 2.6328794956207275, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17067986726760864, "step": 20910 }, { "epoch": 0.6535, "grad_norm": 2.96875, "grad_norm_var": 0.03424072265625, "learning_rate": 0.0001, "loss": 5.6506, "loss/crossentropy": 2.5400296449661255, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16223189234733582, "step": 20912 }, { "epoch": 0.6535625, "grad_norm": 3.46875, "grad_norm_var": 0.03673502604166667, "learning_rate": 0.0001, "loss": 5.9651, "loss/crossentropy": 2.7477434873580933, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17603643983602524, "step": 20914 }, { "epoch": 0.653625, "grad_norm": 3.453125, "grad_norm_var": 0.04446207682291667, "learning_rate": 0.0001, "loss": 5.4801, "loss/crossentropy": 2.4584161043167114, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1568535938858986, "step": 20916 }, { "epoch": 0.6536875, "grad_norm": 2.953125, "grad_norm_var": 0.06629231770833334, "learning_rate": 0.0001, "loss": 5.6703, "loss/crossentropy": 2.584798574447632, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16363202035427094, "step": 20918 }, { "epoch": 0.65375, "grad_norm": 3.3125, "grad_norm_var": 0.061995442708333334, "learning_rate": 0.0001, "loss": 5.775, "loss/crossentropy": 2.5854105949401855, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17013494670391083, "step": 20920 }, { "epoch": 0.6538125, "grad_norm": 3.078125, "grad_norm_var": 0.06536356608072917, "learning_rate": 0.0001, "loss": 5.5909, "loss/crossentropy": 2.5749701261520386, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16057872772216797, "step": 20922 }, { "epoch": 0.653875, "grad_norm": 3.21875, "grad_norm_var": 0.060887654622395836, "learning_rate": 0.0001, "loss": 5.8097, "loss/crossentropy": 2.593639850616455, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1743416041135788, "step": 20924 }, { "epoch": 0.6539375, "grad_norm": 3.0, "grad_norm_var": 0.057515462239583336, "learning_rate": 0.0001, "loss": 5.6364, "loss/crossentropy": 2.555126428604126, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.164768286049366, "step": 20926 }, { "epoch": 0.654, "grad_norm": 3.40625, "grad_norm_var": 0.0628326416015625, "learning_rate": 0.0001, "loss": 5.8082, "loss/crossentropy": 2.6723456382751465, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16553768515586853, "step": 20928 }, { "epoch": 0.6540625, "grad_norm": 3.125, "grad_norm_var": 0.060498046875, "learning_rate": 0.0001, "loss": 5.6475, "loss/crossentropy": 2.4885549545288086, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16706141084432602, "step": 20930 }, { "epoch": 0.654125, "grad_norm": 3.03125, "grad_norm_var": 0.054488118489583334, "learning_rate": 0.0001, "loss": 5.5655, "loss/crossentropy": 2.4763563871383667, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16476880759000778, "step": 20932 }, { "epoch": 0.6541875, "grad_norm": 3.171875, "grad_norm_var": 0.026590983072916668, "learning_rate": 0.0001, "loss": 5.3277, "loss/crossentropy": 2.2567296028137207, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1609998345375061, "step": 20934 }, { "epoch": 0.65425, "grad_norm": 3.109375, "grad_norm_var": 0.029450480143229166, "learning_rate": 0.0001, "loss": 5.6583, "loss/crossentropy": 2.573989987373352, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16507582366466522, "step": 20936 }, { "epoch": 0.6543125, "grad_norm": 3.125, "grad_norm_var": 0.027521769205729168, "learning_rate": 0.0001, "loss": 5.5077, "loss/crossentropy": 2.466416597366333, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15881579369306564, "step": 20938 }, { "epoch": 0.654375, "grad_norm": 3.40625, "grad_norm_var": 0.03472900390625, "learning_rate": 0.0001, "loss": 5.7926, "loss/crossentropy": 2.641322612762451, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16864470392465591, "step": 20940 }, { "epoch": 0.6544375, "grad_norm": 3.296875, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 5.8734, "loss/crossentropy": 2.6494545936584473, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17474105954170227, "step": 20942 }, { "epoch": 0.6545, "grad_norm": 3.265625, "grad_norm_var": 0.034912109375, "learning_rate": 0.0001, "loss": 5.6772, "loss/crossentropy": 2.4440113306045532, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17370863258838654, "step": 20944 }, { "epoch": 0.6545625, "grad_norm": 2.984375, "grad_norm_var": 0.03351949055989583, "learning_rate": 0.0001, "loss": 5.6806, "loss/crossentropy": 2.58296000957489, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16445454955101013, "step": 20946 }, { "epoch": 0.654625, "grad_norm": 3.171875, "grad_norm_var": 0.03394266764322917, "learning_rate": 0.0001, "loss": 5.5769, "loss/crossentropy": 2.4426203966140747, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1673378050327301, "step": 20948 }, { "epoch": 0.6546875, "grad_norm": 3.46875, "grad_norm_var": 0.03850504557291667, "learning_rate": 0.0001, "loss": 6.0441, "loss/crossentropy": 2.807307004928589, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17562832683324814, "step": 20950 }, { "epoch": 0.65475, "grad_norm": 3.28125, "grad_norm_var": 0.037206013997395836, "learning_rate": 0.0001, "loss": 5.6364, "loss/crossentropy": 2.48821759223938, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16676755249500275, "step": 20952 }, { "epoch": 0.6548125, "grad_norm": 3.015625, "grad_norm_var": 0.041803995768229164, "learning_rate": 0.0001, "loss": 5.6989, "loss/crossentropy": 2.59624445438385, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16495025157928467, "step": 20954 }, { "epoch": 0.654875, "grad_norm": 3.171875, "grad_norm_var": 0.03389383951822917, "learning_rate": 0.0001, "loss": 5.925, "loss/crossentropy": 2.659188747406006, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17463178932666779, "step": 20956 }, { "epoch": 0.6549375, "grad_norm": 3.578125, "grad_norm_var": 0.04120686848958333, "learning_rate": 0.0001, "loss": 5.8806, "loss/crossentropy": 2.624800443649292, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17401887476444244, "step": 20958 }, { "epoch": 0.655, "grad_norm": 2.953125, "grad_norm_var": 0.04192708333333333, "learning_rate": 0.0001, "loss": 5.5481, "loss/crossentropy": 2.4357486963272095, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1616295427083969, "step": 20960 }, { "epoch": 0.6550625, "grad_norm": 3.328125, "grad_norm_var": 0.045685831705729166, "learning_rate": 0.0001, "loss": 5.7787, "loss/crossentropy": 2.6577001810073853, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16483888775110245, "step": 20962 }, { "epoch": 0.655125, "grad_norm": 3.09375, "grad_norm_var": 0.047761027018229166, "learning_rate": 0.0001, "loss": 5.8686, "loss/crossentropy": 2.726725220680237, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16770515590906143, "step": 20964 }, { "epoch": 0.6551875, "grad_norm": 3.0625, "grad_norm_var": 0.045563761393229166, "learning_rate": 0.0001, "loss": 5.5425, "loss/crossentropy": 2.4814471006393433, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16001326590776443, "step": 20966 }, { "epoch": 0.65525, "grad_norm": 3.109375, "grad_norm_var": 0.0405181884765625, "learning_rate": 0.0001, "loss": 5.2998, "loss/crossentropy": 2.2771515250205994, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15265830606222153, "step": 20968 }, { "epoch": 0.6553125, "grad_norm": 3.0625, "grad_norm_var": 0.0336578369140625, "learning_rate": 0.0001, "loss": 5.7622, "loss/crossentropy": 2.6117970943450928, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16660621017217636, "step": 20970 }, { "epoch": 0.655375, "grad_norm": 3.296875, "grad_norm_var": 0.03400065104166667, "learning_rate": 0.0001, "loss": 5.4569, "loss/crossentropy": 2.395979404449463, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15999948978424072, "step": 20972 }, { "epoch": 0.6554375, "grad_norm": 2.734375, "grad_norm_var": 0.023729451497395835, "learning_rate": 0.0001, "loss": 5.1661, "loss/crossentropy": 2.211157202720642, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14588858932256699, "step": 20974 }, { "epoch": 0.6555, "grad_norm": 2.84375, "grad_norm_var": 0.024235026041666666, "learning_rate": 0.0001, "loss": 5.6514, "loss/crossentropy": 2.6521668434143066, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15656233578920364, "step": 20976 }, { "epoch": 0.6555625, "grad_norm": 3.234375, "grad_norm_var": 0.023566691080729167, "learning_rate": 0.0001, "loss": 5.859, "loss/crossentropy": 2.66953444480896, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17011483013629913, "step": 20978 }, { "epoch": 0.655625, "grad_norm": 2.921875, "grad_norm_var": 0.024507649739583335, "learning_rate": 0.0001, "loss": 5.3542, "loss/crossentropy": 2.415980339050293, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.14890418201684952, "step": 20980 }, { "epoch": 0.6556875, "grad_norm": 2.828125, "grad_norm_var": 0.030757649739583334, "learning_rate": 0.0001, "loss": 5.6073, "loss/crossentropy": 2.585160493850708, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1561153382062912, "step": 20982 }, { "epoch": 0.65575, "grad_norm": 2.90625, "grad_norm_var": 0.033707682291666666, "learning_rate": 0.0001, "loss": 5.8202, "loss/crossentropy": 2.6771342754364014, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16859956085681915, "step": 20984 }, { "epoch": 0.6558125, "grad_norm": 3.015625, "grad_norm_var": 0.0550689697265625, "learning_rate": 0.0001, "loss": 5.3667, "loss/crossentropy": 2.3741567134857178, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15276531130075455, "step": 20986 }, { "epoch": 0.655875, "grad_norm": 3.234375, "grad_norm_var": 0.054280598958333336, "learning_rate": 0.0001, "loss": 5.4782, "loss/crossentropy": 2.4002526998519897, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1620914787054062, "step": 20988 }, { "epoch": 0.6559375, "grad_norm": 2.90625, "grad_norm_var": 0.06416015625, "learning_rate": 0.0001, "loss": 5.6154, "loss/crossentropy": 2.4854971170425415, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16689575463533401, "step": 20990 }, { "epoch": 0.656, "grad_norm": 3.4375, "grad_norm_var": 0.06702067057291666, "learning_rate": 0.0001, "loss": 5.6022, "loss/crossentropy": 2.4607324600219727, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16337022185325623, "step": 20992 }, { "epoch": 0.6560625, "grad_norm": 3.296875, "grad_norm_var": 0.06819254557291667, "learning_rate": 0.0001, "loss": 5.9668, "loss/crossentropy": 2.7102184295654297, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1764424666762352, "step": 20994 }, { "epoch": 0.656125, "grad_norm": 3.09375, "grad_norm_var": 0.06874593098958333, "learning_rate": 0.0001, "loss": 5.852, "loss/crossentropy": 2.7109466791152954, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16371320933103561, "step": 20996 }, { "epoch": 0.6561875, "grad_norm": 3.140625, "grad_norm_var": 0.05446675618489583, "learning_rate": 0.0001, "loss": 5.6789, "loss/crossentropy": 2.5835018157958984, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1657932549715042, "step": 20998 }, { "epoch": 0.65625, "grad_norm": 3.1875, "grad_norm_var": 0.0583892822265625, "learning_rate": 0.0001, "loss": 5.8651, "loss/crossentropy": 2.704118251800537, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1680501401424408, "step": 21000 }, { "epoch": 0.6563125, "grad_norm": 3.28125, "grad_norm_var": 0.0467681884765625, "learning_rate": 0.0001, "loss": 5.7122, "loss/crossentropy": 2.5057101249694824, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17064709216356277, "step": 21002 }, { "epoch": 0.656375, "grad_norm": 3.421875, "grad_norm_var": 0.0434234619140625, "learning_rate": 0.0001, "loss": 6.0921, "loss/crossentropy": 2.817593574523926, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17784057557582855, "step": 21004 }, { "epoch": 0.6564375, "grad_norm": 3.34375, "grad_norm_var": 0.030269368489583334, "learning_rate": 0.0001, "loss": 6.1687, "loss/crossentropy": 2.834757924079895, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18300583958625793, "step": 21006 }, { "epoch": 0.6565, "grad_norm": 2.953125, "grad_norm_var": 0.028888956705729166, "learning_rate": 0.0001, "loss": 5.7256, "loss/crossentropy": 2.686676263809204, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16209668666124344, "step": 21008 }, { "epoch": 0.6565625, "grad_norm": 2.8125, "grad_norm_var": 0.042215983072916664, "learning_rate": 0.0001, "loss": 5.6487, "loss/crossentropy": 2.5283303260803223, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16321031749248505, "step": 21010 }, { "epoch": 0.656625, "grad_norm": 2.859375, "grad_norm_var": 0.045572916666666664, "learning_rate": 0.0001, "loss": 5.4632, "loss/crossentropy": 2.4296209812164307, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15960296988487244, "step": 21012 }, { "epoch": 0.6566875, "grad_norm": 3.125, "grad_norm_var": 0.04383036295572917, "learning_rate": 0.0001, "loss": 5.7991, "loss/crossentropy": 2.6751062870025635, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16708219796419144, "step": 21014 }, { "epoch": 0.65675, "grad_norm": 2.96875, "grad_norm_var": 0.0384674072265625, "learning_rate": 0.0001, "loss": 5.6379, "loss/crossentropy": 2.577240228652954, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16153688728809357, "step": 21016 }, { "epoch": 0.6568125, "grad_norm": 2.890625, "grad_norm_var": 0.038996378580729164, "learning_rate": 0.0001, "loss": 5.771, "loss/crossentropy": 2.6858417987823486, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16086158901453018, "step": 21018 }, { "epoch": 0.656875, "grad_norm": 2.765625, "grad_norm_var": 0.053132120768229166, "learning_rate": 0.0001, "loss": 5.3949, "loss/crossentropy": 2.346004366874695, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15918176621198654, "step": 21020 }, { "epoch": 0.6569375, "grad_norm": 3.046875, "grad_norm_var": 0.049641927083333336, "learning_rate": 0.0001, "loss": 5.5529, "loss/crossentropy": 2.5203301906585693, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15950361639261246, "step": 21022 }, { "epoch": 0.657, "grad_norm": 2.9375, "grad_norm_var": 0.0576080322265625, "learning_rate": 0.0001, "loss": 5.0891, "loss/crossentropy": 2.2516844272613525, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.14155269414186478, "step": 21024 }, { "epoch": 0.6570625, "grad_norm": 3.46875, "grad_norm_var": 0.0552886962890625, "learning_rate": 0.0001, "loss": 5.8603, "loss/crossentropy": 2.5943716764450073, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1742505356669426, "step": 21026 }, { "epoch": 0.657125, "grad_norm": 3.0, "grad_norm_var": 0.05332743326822917, "learning_rate": 0.0001, "loss": 5.5413, "loss/crossentropy": 2.4588228464126587, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16527670621871948, "step": 21028 }, { "epoch": 0.6571875, "grad_norm": 3.203125, "grad_norm_var": 0.05583394368489583, "learning_rate": 0.0001, "loss": 5.5807, "loss/crossentropy": 2.4765560626983643, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16275492310523987, "step": 21030 }, { "epoch": 0.65725, "grad_norm": 3.03125, "grad_norm_var": 0.05660807291666667, "learning_rate": 0.0001, "loss": 5.5726, "loss/crossentropy": 2.4483338594436646, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16594135016202927, "step": 21032 }, { "epoch": 0.6573125, "grad_norm": 3.015625, "grad_norm_var": 0.0526519775390625, "learning_rate": 0.0001, "loss": 5.4477, "loss/crossentropy": 2.403680682182312, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15713957697153091, "step": 21034 }, { "epoch": 0.657375, "grad_norm": 2.921875, "grad_norm_var": 0.03738606770833333, "learning_rate": 0.0001, "loss": 5.7809, "loss/crossentropy": 2.5883078575134277, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17082417011260986, "step": 21036 }, { "epoch": 0.6574375, "grad_norm": 2.90625, "grad_norm_var": 0.04119466145833333, "learning_rate": 0.0001, "loss": 5.3847, "loss/crossentropy": 2.3743726015090942, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15650182962417603, "step": 21038 }, { "epoch": 0.6575, "grad_norm": 3.265625, "grad_norm_var": 0.031966145833333334, "learning_rate": 0.0001, "loss": 5.4838, "loss/crossentropy": 2.399978518486023, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16150595992803574, "step": 21040 }, { "epoch": 0.6575625, "grad_norm": 3.046875, "grad_norm_var": 0.0249176025390625, "learning_rate": 0.0001, "loss": 5.5571, "loss/crossentropy": 2.4963085651397705, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16154929995536804, "step": 21042 }, { "epoch": 0.657625, "grad_norm": 3.125, "grad_norm_var": 0.030589803059895834, "learning_rate": 0.0001, "loss": 5.7662, "loss/crossentropy": 2.568527936935425, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17094258964061737, "step": 21044 }, { "epoch": 0.6576875, "grad_norm": 3.265625, "grad_norm_var": 0.03866780598958333, "learning_rate": 0.0001, "loss": 5.9367, "loss/crossentropy": 2.7443350553512573, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17352941632270813, "step": 21046 }, { "epoch": 0.65775, "grad_norm": 3.0625, "grad_norm_var": 0.0400787353515625, "learning_rate": 0.0001, "loss": 5.4498, "loss/crossentropy": 2.3897674083709717, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1599130928516388, "step": 21048 }, { "epoch": 0.6578125, "grad_norm": 3.21875, "grad_norm_var": 0.040070597330729166, "learning_rate": 0.0001, "loss": 5.6568, "loss/crossentropy": 2.5966427326202393, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16070196777582169, "step": 21050 }, { "epoch": 0.657875, "grad_norm": 3.03125, "grad_norm_var": 0.036149088541666666, "learning_rate": 0.0001, "loss": 6.0151, "loss/crossentropy": 2.810759663581848, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17200005799531937, "step": 21052 }, { "epoch": 0.6579375, "grad_norm": 3.15625, "grad_norm_var": 0.04091695149739583, "learning_rate": 0.0001, "loss": 5.6564, "loss/crossentropy": 2.6177507638931274, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15933813899755478, "step": 21054 }, { "epoch": 0.658, "grad_norm": 3.09375, "grad_norm_var": 0.038426717122395836, "learning_rate": 0.0001, "loss": 5.5803, "loss/crossentropy": 2.5462610721588135, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1600475162267685, "step": 21056 }, { "epoch": 0.6580625, "grad_norm": 2.890625, "grad_norm_var": 0.04216206868489583, "learning_rate": 0.0001, "loss": 5.3908, "loss/crossentropy": 2.464584231376648, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1508210301399231, "step": 21058 }, { "epoch": 0.658125, "grad_norm": 2.984375, "grad_norm_var": 0.04221903483072917, "learning_rate": 0.0001, "loss": 5.8657, "loss/crossentropy": 2.6992465257644653, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17016176879405975, "step": 21060 }, { "epoch": 0.6581875, "grad_norm": 3.65625, "grad_norm_var": 0.051171875, "learning_rate": 0.0001, "loss": 5.7709, "loss/crossentropy": 2.619776964187622, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16550444811582565, "step": 21062 }, { "epoch": 0.65825, "grad_norm": 3.203125, "grad_norm_var": 0.050537109375, "learning_rate": 0.0001, "loss": 5.9504, "loss/crossentropy": 2.735660672187805, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17342735826969147, "step": 21064 }, { "epoch": 0.6583125, "grad_norm": 2.984375, "grad_norm_var": 0.054585774739583336, "learning_rate": 0.0001, "loss": 5.9137, "loss/crossentropy": 2.6759467124938965, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1741655245423317, "step": 21066 }, { "epoch": 0.658375, "grad_norm": 3.234375, "grad_norm_var": 0.0538726806640625, "learning_rate": 0.0001, "loss": 5.8245, "loss/crossentropy": 2.6125426292419434, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1739320084452629, "step": 21068 }, { "epoch": 0.6584375, "grad_norm": 3.09375, "grad_norm_var": 0.04413960774739583, "learning_rate": 0.0001, "loss": 5.6678, "loss/crossentropy": 2.539334297180176, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16518954187631607, "step": 21070 }, { "epoch": 0.6585, "grad_norm": 3.09375, "grad_norm_var": 0.05071207682291667, "learning_rate": 0.0001, "loss": 5.5312, "loss/crossentropy": 2.46273410320282, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15685029327869415, "step": 21072 }, { "epoch": 0.6585625, "grad_norm": 3.1875, "grad_norm_var": 0.045099894205729164, "learning_rate": 0.0001, "loss": 5.8338, "loss/crossentropy": 2.6176934242248535, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.173955999314785, "step": 21074 }, { "epoch": 0.658625, "grad_norm": 3.15625, "grad_norm_var": 0.04244384765625, "learning_rate": 0.0001, "loss": 5.5685, "loss/crossentropy": 2.509552836418152, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16254013031721115, "step": 21076 }, { "epoch": 0.6586875, "grad_norm": 3.046875, "grad_norm_var": 0.03134765625, "learning_rate": 0.0001, "loss": 5.777, "loss/crossentropy": 2.595310091972351, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1709018349647522, "step": 21078 }, { "epoch": 0.65875, "grad_norm": 3.46875, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 5.8275, "loss/crossentropy": 2.5864028930664062, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17333343625068665, "step": 21080 }, { "epoch": 0.6588125, "grad_norm": 3.09375, "grad_norm_var": 0.03824462890625, "learning_rate": 0.0001, "loss": 5.4689, "loss/crossentropy": 2.441880226135254, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1554327756166458, "step": 21082 }, { "epoch": 0.658875, "grad_norm": 2.765625, "grad_norm_var": 0.05142822265625, "learning_rate": 0.0001, "loss": 5.6495, "loss/crossentropy": 2.653690457344055, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15973851829767227, "step": 21084 }, { "epoch": 0.6589375, "grad_norm": 3.109375, "grad_norm_var": 0.05142822265625, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.6315271854400635, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16811668872833252, "step": 21086 }, { "epoch": 0.659, "grad_norm": 3.171875, "grad_norm_var": 0.04655659993489583, "learning_rate": 0.0001, "loss": 5.7797, "loss/crossentropy": 2.6315035820007324, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16833917051553726, "step": 21088 }, { "epoch": 0.6590625, "grad_norm": 3.1875, "grad_norm_var": 0.040949503580729164, "learning_rate": 0.0001, "loss": 5.5435, "loss/crossentropy": 2.438621759414673, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16400685161352158, "step": 21090 }, { "epoch": 0.659125, "grad_norm": 3.40625, "grad_norm_var": 0.046468098958333336, "learning_rate": 0.0001, "loss": 5.5214, "loss/crossentropy": 2.4408299922943115, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1607937514781952, "step": 21092 }, { "epoch": 0.6591875, "grad_norm": 3.09375, "grad_norm_var": 0.04550374348958333, "learning_rate": 0.0001, "loss": 5.7973, "loss/crossentropy": 2.6019203662872314, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.17539478838443756, "step": 21094 }, { "epoch": 0.65925, "grad_norm": 2.984375, "grad_norm_var": 0.03632710774739583, "learning_rate": 0.0001, "loss": 5.3967, "loss/crossentropy": 2.3610873222351074, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15668457746505737, "step": 21096 }, { "epoch": 0.6593125, "grad_norm": 3.15625, "grad_norm_var": 0.03873291015625, "learning_rate": 0.0001, "loss": 5.3789, "loss/crossentropy": 2.3296267986297607, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.158445805311203, "step": 21098 }, { "epoch": 0.659375, "grad_norm": 2.953125, "grad_norm_var": 0.026423136393229168, "learning_rate": 0.0001, "loss": 5.7804, "loss/crossentropy": 2.701469898223877, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16413865983486176, "step": 21100 }, { "epoch": 0.6594375, "grad_norm": 2.9375, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 5.4956, "loss/crossentropy": 2.532857656478882, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15408175438642502, "step": 21102 }, { "epoch": 0.6595, "grad_norm": 3.796875, "grad_norm_var": 0.059382120768229164, "learning_rate": 0.0001, "loss": 5.8234, "loss/crossentropy": 2.519111752510071, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17964977771043777, "step": 21104 }, { "epoch": 0.6595625, "grad_norm": 3.015625, "grad_norm_var": 0.06043294270833333, "learning_rate": 0.0001, "loss": 5.6892, "loss/crossentropy": 2.55735445022583, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16669873893260956, "step": 21106 }, { "epoch": 0.659625, "grad_norm": 2.890625, "grad_norm_var": 0.06165364583333333, "learning_rate": 0.0001, "loss": 5.5278, "loss/crossentropy": 2.5014939308166504, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.161612868309021, "step": 21108 }, { "epoch": 0.6596875, "grad_norm": 3.296875, "grad_norm_var": 0.056005859375, "learning_rate": 0.0001, "loss": 5.9416, "loss/crossentropy": 2.7261011600494385, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17193977534770966, "step": 21110 }, { "epoch": 0.65975, "grad_norm": 3.0625, "grad_norm_var": 0.0574615478515625, "learning_rate": 0.0001, "loss": 5.6383, "loss/crossentropy": 2.5475538969039917, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16415174305438995, "step": 21112 }, { "epoch": 0.6598125, "grad_norm": 3.1875, "grad_norm_var": 0.052958170572916664, "learning_rate": 0.0001, "loss": 5.7181, "loss/crossentropy": 2.5600234270095825, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17010456323623657, "step": 21114 }, { "epoch": 0.659875, "grad_norm": 3.0625, "grad_norm_var": 0.05175374348958333, "learning_rate": 0.0001, "loss": 5.7516, "loss/crossentropy": 2.628481864929199, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16621547937393188, "step": 21116 }, { "epoch": 0.6599375, "grad_norm": 3.03125, "grad_norm_var": 0.04990946451822917, "learning_rate": 0.0001, "loss": 6.2269, "loss/crossentropy": 2.9299408197402954, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17774052917957306, "step": 21118 }, { "epoch": 0.66, "grad_norm": 2.796875, "grad_norm_var": 0.027632649739583334, "learning_rate": 0.0001, "loss": 5.7446, "loss/crossentropy": 2.6586681604385376, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1636732593178749, "step": 21120 }, { "epoch": 0.6600625, "grad_norm": 3.0625, "grad_norm_var": 0.027046712239583333, "learning_rate": 0.0001, "loss": 5.4781, "loss/crossentropy": 2.4577144384384155, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15946026891469955, "step": 21122 }, { "epoch": 0.660125, "grad_norm": 2.953125, "grad_norm_var": 0.026395670572916665, "learning_rate": 0.0001, "loss": 5.675, "loss/crossentropy": 2.625125527381897, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16241003572940826, "step": 21124 }, { "epoch": 0.6601875, "grad_norm": 4.03125, "grad_norm_var": 0.09013671875, "learning_rate": 0.0001, "loss": 5.6091, "loss/crossentropy": 2.228906750679016, "loss/hidden": 1.6015625, "loss/jsd": 0.0, "loss/logits": 0.17785951495170593, "step": 21126 }, { "epoch": 0.66025, "grad_norm": 3.3125, "grad_norm_var": 0.08947652180989583, "learning_rate": 0.0001, "loss": 5.5693, "loss/crossentropy": 2.491586923599243, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15894466638565063, "step": 21128 }, { "epoch": 0.6603125, "grad_norm": 3.421875, "grad_norm_var": 0.09631245930989583, "learning_rate": 0.0001, "loss": 5.9102, "loss/crossentropy": 2.6011446714401245, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1777852401137352, "step": 21130 }, { "epoch": 0.660375, "grad_norm": 3.171875, "grad_norm_var": 0.09434305826822917, "learning_rate": 0.0001, "loss": 5.5044, "loss/crossentropy": 2.491545796394348, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15597651898860931, "step": 21132 }, { "epoch": 0.6604375, "grad_norm": 2.875, "grad_norm_var": 0.10035807291666667, "learning_rate": 0.0001, "loss": 5.3615, "loss/crossentropy": 2.4181984663009644, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15214426070451736, "step": 21134 }, { "epoch": 0.6605, "grad_norm": 3.234375, "grad_norm_var": 0.09345703125, "learning_rate": 0.0001, "loss": 5.8034, "loss/crossentropy": 2.5535932779312134, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17302973568439484, "step": 21136 }, { "epoch": 0.6605625, "grad_norm": 3.078125, "grad_norm_var": 0.5128163655598958, "learning_rate": 0.0001, "loss": 5.6741, "loss/crossentropy": 2.4165477752685547, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.17068202793598175, "step": 21138 }, { "epoch": 0.660625, "grad_norm": 3.046875, "grad_norm_var": 0.48266499837239585, "learning_rate": 0.0001, "loss": 5.8364, "loss/crossentropy": 2.6450281143188477, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16953130811452866, "step": 21140 }, { "epoch": 0.6606875, "grad_norm": 3.078125, "grad_norm_var": 0.4637369791666667, "learning_rate": 0.0001, "loss": 5.6399, "loss/crossentropy": 2.5641591548919678, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1607028692960739, "step": 21142 }, { "epoch": 0.66075, "grad_norm": 4.5625, "grad_norm_var": 0.5629140218098958, "learning_rate": 0.0001, "loss": 5.4791, "loss/crossentropy": 2.4822245836257935, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1590593308210373, "step": 21144 }, { "epoch": 0.6608125, "grad_norm": 3.125, "grad_norm_var": 0.5676717122395833, "learning_rate": 0.0001, "loss": 5.7843, "loss/crossentropy": 2.622680902481079, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16772551089525223, "step": 21146 }, { "epoch": 0.660875, "grad_norm": 3.046875, "grad_norm_var": 0.5866495768229166, "learning_rate": 0.0001, "loss": 5.1549, "loss/crossentropy": 2.2301281690597534, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14716807007789612, "step": 21148 }, { "epoch": 0.6609375, "grad_norm": 4.0, "grad_norm_var": 0.5773111979166666, "learning_rate": 0.0001, "loss": 5.8639, "loss/crossentropy": 2.5864195823669434, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17775243520736694, "step": 21150 }, { "epoch": 0.661, "grad_norm": 3.09375, "grad_norm_var": 0.5822906494140625, "learning_rate": 0.0001, "loss": 5.6977, "loss/crossentropy": 2.5363423824310303, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1661316081881523, "step": 21152 }, { "epoch": 0.6610625, "grad_norm": 2.84375, "grad_norm_var": 0.20327046712239583, "learning_rate": 0.0001, "loss": 5.7573, "loss/crossentropy": 2.723081111907959, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.159281924366951, "step": 21154 }, { "epoch": 0.661125, "grad_norm": 3.375, "grad_norm_var": 0.20532124837239582, "learning_rate": 0.0001, "loss": 5.9345, "loss/crossentropy": 2.7293641567230225, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17246946692466736, "step": 21156 }, { "epoch": 0.6611875, "grad_norm": 3.0625, "grad_norm_var": 0.20722249348958333, "learning_rate": 0.0001, "loss": 5.5277, "loss/crossentropy": 2.4667723178863525, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15999778360128403, "step": 21158 }, { "epoch": 0.66125, "grad_norm": 3.203125, "grad_norm_var": 0.08376363118489584, "learning_rate": 0.0001, "loss": 5.6088, "loss/crossentropy": 2.5209559202194214, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16269070655107498, "step": 21160 }, { "epoch": 0.6613125, "grad_norm": 3.109375, "grad_norm_var": 0.0775787353515625, "learning_rate": 0.0001, "loss": 6.0978, "loss/crossentropy": 2.811536431312561, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1755012571811676, "step": 21162 }, { "epoch": 0.661375, "grad_norm": 3.40625, "grad_norm_var": 0.0805816650390625, "learning_rate": 0.0001, "loss": 5.8546, "loss/crossentropy": 2.5932637453079224, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17535630613565445, "step": 21164 }, { "epoch": 0.6614375, "grad_norm": 3.53125, "grad_norm_var": 0.044820149739583336, "learning_rate": 0.0001, "loss": 5.983, "loss/crossentropy": 2.724538803100586, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17779680341482162, "step": 21166 }, { "epoch": 0.6615, "grad_norm": 3.0, "grad_norm_var": 0.04363606770833333, "learning_rate": 0.0001, "loss": 5.5289, "loss/crossentropy": 2.4889990091323853, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15790055692195892, "step": 21168 }, { "epoch": 0.6615625, "grad_norm": 2.796875, "grad_norm_var": 0.04157613118489583, "learning_rate": 0.0001, "loss": 5.6187, "loss/crossentropy": 2.5857560634613037, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16189289093017578, "step": 21170 }, { "epoch": 0.661625, "grad_norm": 3.15625, "grad_norm_var": 0.0437896728515625, "learning_rate": 0.0001, "loss": 5.53, "loss/crossentropy": 2.5200599431991577, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1588016375899315, "step": 21172 }, { "epoch": 0.6616875, "grad_norm": 2.921875, "grad_norm_var": 0.0469390869140625, "learning_rate": 0.0001, "loss": 5.8791, "loss/crossentropy": 2.679272770881653, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1703711748123169, "step": 21174 }, { "epoch": 0.66175, "grad_norm": 2.921875, "grad_norm_var": 0.04895426432291667, "learning_rate": 0.0001, "loss": 5.7639, "loss/crossentropy": 2.62574303150177, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16732646524906158, "step": 21176 }, { "epoch": 0.6618125, "grad_norm": 3.296875, "grad_norm_var": 0.052164713541666664, "learning_rate": 0.0001, "loss": 5.7762, "loss/crossentropy": 2.6213289499282837, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16743764281272888, "step": 21178 }, { "epoch": 0.661875, "grad_norm": 3.078125, "grad_norm_var": 0.03671468098958333, "learning_rate": 0.0001, "loss": 5.7035, "loss/crossentropy": 2.6381725072860718, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16083265841007233, "step": 21180 }, { "epoch": 0.6619375, "grad_norm": 2.890625, "grad_norm_var": 0.0255279541015625, "learning_rate": 0.0001, "loss": 5.9097, "loss/crossentropy": 2.7510194778442383, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16899508982896805, "step": 21182 }, { "epoch": 0.662, "grad_norm": 2.859375, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 5.7053, "loss/crossentropy": 2.687032103538513, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15768758207559586, "step": 21184 }, { "epoch": 0.6620625, "grad_norm": 2.921875, "grad_norm_var": 0.027534993489583333, "learning_rate": 0.0001, "loss": 5.5797, "loss/crossentropy": 2.58084499835968, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15926294028759003, "step": 21186 }, { "epoch": 0.662125, "grad_norm": 3.046875, "grad_norm_var": 0.023860677083333334, "learning_rate": 0.0001, "loss": 5.7182, "loss/crossentropy": 2.575597405433655, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1642555668950081, "step": 21188 }, { "epoch": 0.6621875, "grad_norm": 3.21875, "grad_norm_var": 0.021613566080729167, "learning_rate": 0.0001, "loss": 5.7413, "loss/crossentropy": 2.6081173419952393, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1683983951807022, "step": 21190 }, { "epoch": 0.66225, "grad_norm": 3.203125, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 5.7651, "loss/crossentropy": 2.646545171737671, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1665422022342682, "step": 21192 }, { "epoch": 0.6623125, "grad_norm": 3.46875, "grad_norm_var": 0.025516764322916666, "learning_rate": 0.0001, "loss": 5.9382, "loss/crossentropy": 2.613059163093567, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18134091049432755, "step": 21194 }, { "epoch": 0.662375, "grad_norm": 3.328125, "grad_norm_var": 0.029703776041666668, "learning_rate": 0.0001, "loss": 5.9396, "loss/crossentropy": 2.7155404090881348, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17318322509527206, "step": 21196 }, { "epoch": 0.6624375, "grad_norm": 2.9375, "grad_norm_var": 0.0309234619140625, "learning_rate": 0.0001, "loss": 5.6522, "loss/crossentropy": 2.5789085626602173, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16318871080875397, "step": 21198 }, { "epoch": 0.6625, "grad_norm": 2.5, "grad_norm_var": 0.051005045572916664, "learning_rate": 0.0001, "loss": 5.012, "loss/crossentropy": 2.206539809703827, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.13991685956716537, "step": 21200 }, { "epoch": 0.6625625, "grad_norm": 3.28125, "grad_norm_var": 0.0497467041015625, "learning_rate": 0.0001, "loss": 5.5861, "loss/crossentropy": 2.5295369625091553, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15800325572490692, "step": 21202 }, { "epoch": 0.662625, "grad_norm": 2.96875, "grad_norm_var": 0.052571614583333336, "learning_rate": 0.0001, "loss": 5.6726, "loss/crossentropy": 2.5857043266296387, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16259709000587463, "step": 21204 }, { "epoch": 0.6626875, "grad_norm": 3.453125, "grad_norm_var": 0.061620076497395836, "learning_rate": 0.0001, "loss": 5.6861, "loss/crossentropy": 2.527758479118347, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1677919551730156, "step": 21206 }, { "epoch": 0.66275, "grad_norm": 2.828125, "grad_norm_var": 0.06516927083333333, "learning_rate": 0.0001, "loss": 5.8011, "loss/crossentropy": 2.6378647089004517, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16945039480924606, "step": 21208 }, { "epoch": 0.6628125, "grad_norm": 3.53125, "grad_norm_var": 0.06885477701822916, "learning_rate": 0.0001, "loss": 5.801, "loss/crossentropy": 2.5858579874038696, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17542026937007904, "step": 21210 }, { "epoch": 0.662875, "grad_norm": 3.078125, "grad_norm_var": 0.06466471354166667, "learning_rate": 0.0001, "loss": 5.694, "loss/crossentropy": 2.597716808319092, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1611892729997635, "step": 21212 }, { "epoch": 0.6629375, "grad_norm": 3.25, "grad_norm_var": 0.06177978515625, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.6291065216064453, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16508817672729492, "step": 21214 }, { "epoch": 0.663, "grad_norm": 3.515625, "grad_norm_var": 0.04289449055989583, "learning_rate": 0.0001, "loss": 5.3654, "loss/crossentropy": 2.2221158742904663, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16432448476552963, "step": 21216 }, { "epoch": 0.6630625, "grad_norm": 3.15625, "grad_norm_var": 0.0451812744140625, "learning_rate": 0.0001, "loss": 5.4499, "loss/crossentropy": 2.4725834131240845, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1567157581448555, "step": 21218 }, { "epoch": 0.663125, "grad_norm": 3.5, "grad_norm_var": 0.052668253580729164, "learning_rate": 0.0001, "loss": 5.8421, "loss/crossentropy": 2.664140462875366, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16975214332342148, "step": 21220 }, { "epoch": 0.6631875, "grad_norm": 3.0, "grad_norm_var": 0.04563700358072917, "learning_rate": 0.0001, "loss": 5.558, "loss/crossentropy": 2.504623770713806, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16236679255962372, "step": 21222 }, { "epoch": 0.66325, "grad_norm": 3.015625, "grad_norm_var": 0.04132486979166667, "learning_rate": 0.0001, "loss": 5.6483, "loss/crossentropy": 2.5294255018234253, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16071248054504395, "step": 21224 }, { "epoch": 0.6633125, "grad_norm": 3.359375, "grad_norm_var": 0.034845987955729164, "learning_rate": 0.0001, "loss": 6.0104, "loss/crossentropy": 2.7339723110198975, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1780320480465889, "step": 21226 }, { "epoch": 0.663375, "grad_norm": 3.03125, "grad_norm_var": 0.04215087890625, "learning_rate": 0.0001, "loss": 5.3252, "loss/crossentropy": 2.380972385406494, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1534043401479721, "step": 21228 }, { "epoch": 0.6634375, "grad_norm": 3.828125, "grad_norm_var": 0.07174479166666667, "learning_rate": 0.0001, "loss": 5.6034, "loss/crossentropy": 2.4696223735809326, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16220373660326004, "step": 21230 }, { "epoch": 0.6635, "grad_norm": 3.078125, "grad_norm_var": 0.06929931640625, "learning_rate": 0.0001, "loss": 5.5928, "loss/crossentropy": 2.5176429748535156, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15946735441684723, "step": 21232 }, { "epoch": 0.6635625, "grad_norm": 3.0625, "grad_norm_var": 0.06607157389322917, "learning_rate": 0.0001, "loss": 5.8498, "loss/crossentropy": 2.645890712738037, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17273297160863876, "step": 21234 }, { "epoch": 0.663625, "grad_norm": 3.484375, "grad_norm_var": 0.06752827962239584, "learning_rate": 0.0001, "loss": 5.4285, "loss/crossentropy": 2.3075203895568848, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16131679713726044, "step": 21236 }, { "epoch": 0.6636875, "grad_norm": 3.25, "grad_norm_var": 0.0632476806640625, "learning_rate": 0.0001, "loss": 5.6491, "loss/crossentropy": 2.4615968465805054, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1722680702805519, "step": 21238 }, { "epoch": 0.66375, "grad_norm": 2.9375, "grad_norm_var": 0.08355712890625, "learning_rate": 0.0001, "loss": 5.6558, "loss/crossentropy": 2.5240232944488525, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16435149312019348, "step": 21240 }, { "epoch": 0.6638125, "grad_norm": 3.03125, "grad_norm_var": 0.090478515625, "learning_rate": 0.0001, "loss": 5.7438, "loss/crossentropy": 2.590863347053528, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16803047060966492, "step": 21242 }, { "epoch": 0.663875, "grad_norm": 3.125, "grad_norm_var": 0.08412984212239584, "learning_rate": 0.0001, "loss": 5.6828, "loss/crossentropy": 2.592938184738159, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16602183133363724, "step": 21244 }, { "epoch": 0.6639375, "grad_norm": 3.078125, "grad_norm_var": 0.06193745930989583, "learning_rate": 0.0001, "loss": 5.8362, "loss/crossentropy": 2.6598994731903076, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16880616545677185, "step": 21246 }, { "epoch": 0.664, "grad_norm": 3.03125, "grad_norm_var": 0.056477864583333336, "learning_rate": 0.0001, "loss": 5.5684, "loss/crossentropy": 2.458032250404358, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16455282270908356, "step": 21248 }, { "epoch": 0.6640625, "grad_norm": 2.75, "grad_norm_var": 0.0675933837890625, "learning_rate": 0.0001, "loss": 5.2155, "loss/crossentropy": 2.270583391189575, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14840058982372284, "step": 21250 }, { "epoch": 0.664125, "grad_norm": 2.859375, "grad_norm_var": 0.055074055989583336, "learning_rate": 0.0001, "loss": 5.6114, "loss/crossentropy": 2.515115261077881, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16470438987016678, "step": 21252 }, { "epoch": 0.6641875, "grad_norm": 3.515625, "grad_norm_var": 0.06201171875, "learning_rate": 0.0001, "loss": 5.9282, "loss/crossentropy": 2.6761960983276367, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1740298494696617, "step": 21254 }, { "epoch": 0.66425, "grad_norm": 3.234375, "grad_norm_var": 0.031769816080729166, "learning_rate": 0.0001, "loss": 5.8655, "loss/crossentropy": 2.6693637371063232, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1707821488380432, "step": 21256 }, { "epoch": 0.6643125, "grad_norm": 3.03125, "grad_norm_var": 0.031050618489583334, "learning_rate": 0.0001, "loss": 5.7682, "loss/crossentropy": 2.620382785797119, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1694653034210205, "step": 21258 }, { "epoch": 0.664375, "grad_norm": 3.3125, "grad_norm_var": 0.032470703125, "learning_rate": 0.0001, "loss": 6.0041, "loss/crossentropy": 2.6569840908050537, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18119489401578903, "step": 21260 }, { "epoch": 0.6644375, "grad_norm": 3.171875, "grad_norm_var": 0.034098307291666664, "learning_rate": 0.0001, "loss": 5.4112, "loss/crossentropy": 2.4031245708465576, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15666569769382477, "step": 21262 }, { "epoch": 0.6645, "grad_norm": 3.265625, "grad_norm_var": 0.03453776041666667, "learning_rate": 0.0001, "loss": 5.9051, "loss/crossentropy": 2.836301565170288, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16234861314296722, "step": 21264 }, { "epoch": 0.6645625, "grad_norm": 2.890625, "grad_norm_var": 0.0279937744140625, "learning_rate": 0.0001, "loss": 5.5214, "loss/crossentropy": 2.517667055130005, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1577962040901184, "step": 21266 }, { "epoch": 0.664625, "grad_norm": 3.015625, "grad_norm_var": 0.026960245768229165, "learning_rate": 0.0001, "loss": 5.3337, "loss/crossentropy": 2.3482940196990967, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15830513089895248, "step": 21268 }, { "epoch": 0.6646875, "grad_norm": 3.609375, "grad_norm_var": 0.035319010416666664, "learning_rate": 0.0001, "loss": 5.9632, "loss/crossentropy": 2.7076185941696167, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1747806891798973, "step": 21270 }, { "epoch": 0.66475, "grad_norm": 3.125, "grad_norm_var": 0.03606363932291667, "learning_rate": 0.0001, "loss": 5.7607, "loss/crossentropy": 2.6146886348724365, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16811973601579666, "step": 21272 }, { "epoch": 0.6648125, "grad_norm": 3.078125, "grad_norm_var": 0.03766276041666667, "learning_rate": 0.0001, "loss": 5.7106, "loss/crossentropy": 2.6125776767730713, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16448620706796646, "step": 21274 }, { "epoch": 0.664875, "grad_norm": 2.859375, "grad_norm_var": 0.04273681640625, "learning_rate": 0.0001, "loss": 5.4658, "loss/crossentropy": 2.48010790348053, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15794584155082703, "step": 21276 }, { "epoch": 0.6649375, "grad_norm": 2.9375, "grad_norm_var": 0.04445699055989583, "learning_rate": 0.0001, "loss": 5.5905, "loss/crossentropy": 2.5024571418762207, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16309750080108643, "step": 21278 }, { "epoch": 0.665, "grad_norm": 3.078125, "grad_norm_var": 0.04411519368489583, "learning_rate": 0.0001, "loss": 5.7933, "loss/crossentropy": 2.650007486343384, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16745546460151672, "step": 21280 }, { "epoch": 0.6650625, "grad_norm": 3.40625, "grad_norm_var": 0.05068257649739583, "learning_rate": 0.0001, "loss": 5.9504, "loss/crossentropy": 2.692716598510742, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1765463650226593, "step": 21282 }, { "epoch": 0.665125, "grad_norm": 3.0625, "grad_norm_var": 0.04860738118489583, "learning_rate": 0.0001, "loss": 5.4174, "loss/crossentropy": 2.4194761514663696, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15995173901319504, "step": 21284 }, { "epoch": 0.6651875, "grad_norm": 2.765625, "grad_norm_var": 0.03352762858072917, "learning_rate": 0.0001, "loss": 5.4512, "loss/crossentropy": 2.5085253715515137, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15090972930192947, "step": 21286 }, { "epoch": 0.66525, "grad_norm": 3.3125, "grad_norm_var": 0.03857421875, "learning_rate": 0.0001, "loss": 5.4336, "loss/crossentropy": 2.438277840614319, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1565670222043991, "step": 21288 }, { "epoch": 0.6653125, "grad_norm": 3.296875, "grad_norm_var": 0.043488566080729166, "learning_rate": 0.0001, "loss": 5.6838, "loss/crossentropy": 2.546603798866272, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16840271651744843, "step": 21290 }, { "epoch": 0.665375, "grad_norm": 3.140625, "grad_norm_var": 0.09138895670572916, "learning_rate": 0.0001, "loss": 5.5774, "loss/crossentropy": 2.3896507024765015, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16682545095682144, "step": 21292 }, { "epoch": 0.6654375, "grad_norm": 2.96875, "grad_norm_var": 0.0893218994140625, "learning_rate": 0.0001, "loss": 5.6017, "loss/crossentropy": 2.497402787208557, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16277802735567093, "step": 21294 }, { "epoch": 0.6655, "grad_norm": 3.015625, "grad_norm_var": 0.09031575520833333, "learning_rate": 0.0001, "loss": 5.6609, "loss/crossentropy": 2.578903317451477, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16600920259952545, "step": 21296 }, { "epoch": 0.6655625, "grad_norm": 3.03125, "grad_norm_var": 0.0870513916015625, "learning_rate": 0.0001, "loss": 5.4887, "loss/crossentropy": 2.3647782802581787, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1612161248922348, "step": 21298 }, { "epoch": 0.665625, "grad_norm": 3.0625, "grad_norm_var": 0.08688863118489583, "learning_rate": 0.0001, "loss": 5.5759, "loss/crossentropy": 2.519879937171936, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15951135009527206, "step": 21300 }, { "epoch": 0.6656875, "grad_norm": 3.015625, "grad_norm_var": 0.0798980712890625, "learning_rate": 0.0001, "loss": 5.3471, "loss/crossentropy": 2.348939299583435, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1541113629937172, "step": 21302 }, { "epoch": 0.66575, "grad_norm": 3.34375, "grad_norm_var": 0.06988525390625, "learning_rate": 0.0001, "loss": 5.6263, "loss/crossentropy": 2.4221882820129395, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1723650023341179, "step": 21304 }, { "epoch": 0.6658125, "grad_norm": 3.046875, "grad_norm_var": 0.06957906087239583, "learning_rate": 0.0001, "loss": 5.7512, "loss/crossentropy": 2.606347441673279, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16487867385149002, "step": 21306 }, { "epoch": 0.665875, "grad_norm": 3.3125, "grad_norm_var": 0.024811808268229166, "learning_rate": 0.0001, "loss": 5.9485, "loss/crossentropy": 2.7326987981796265, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17314036935567856, "step": 21308 }, { "epoch": 0.6659375, "grad_norm": 2.9375, "grad_norm_var": 0.025276692708333333, "learning_rate": 0.0001, "loss": 5.442, "loss/crossentropy": 2.4333741664886475, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1547662913799286, "step": 21310 }, { "epoch": 0.666, "grad_norm": 3.265625, "grad_norm_var": 0.022151692708333334, "learning_rate": 0.0001, "loss": 5.7553, "loss/crossentropy": 2.5214457511901855, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17260191589593887, "step": 21312 }, { "epoch": 0.6660625, "grad_norm": 2.6875, "grad_norm_var": 0.032389322916666664, "learning_rate": 0.0001, "loss": 5.4029, "loss/crossentropy": 2.4361422061920166, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15135882049798965, "step": 21314 }, { "epoch": 0.666125, "grad_norm": 2.859375, "grad_norm_var": 0.04114481608072917, "learning_rate": 0.0001, "loss": 5.8165, "loss/crossentropy": 2.668753743171692, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1690707951784134, "step": 21316 }, { "epoch": 0.6661875, "grad_norm": 3.171875, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 5.7533, "loss/crossentropy": 2.59453284740448, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16743647307157516, "step": 21318 }, { "epoch": 0.66625, "grad_norm": 3.03125, "grad_norm_var": 0.04049479166666667, "learning_rate": 0.0001, "loss": 5.5846, "loss/crossentropy": 2.6005107164382935, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.155830517411232, "step": 21320 }, { "epoch": 0.6663125, "grad_norm": 3.046875, "grad_norm_var": 0.04830729166666667, "learning_rate": 0.0001, "loss": 5.7108, "loss/crossentropy": 2.6046417951583862, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1660839170217514, "step": 21322 }, { "epoch": 0.666375, "grad_norm": 3.15625, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 5.4813, "loss/crossentropy": 2.4988549947738647, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15840186923742294, "step": 21324 }, { "epoch": 0.6664375, "grad_norm": 3.1875, "grad_norm_var": 0.049779256184895836, "learning_rate": 0.0001, "loss": 5.369, "loss/crossentropy": 2.350496530532837, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15888623148202896, "step": 21326 }, { "epoch": 0.6665, "grad_norm": 3.078125, "grad_norm_var": 0.053343709309895834, "learning_rate": 0.0001, "loss": 5.8258, "loss/crossentropy": 2.6401236057281494, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1716877520084381, "step": 21328 }, { "epoch": 0.6665625, "grad_norm": 3.078125, "grad_norm_var": 0.04805399576822917, "learning_rate": 0.0001, "loss": 5.7729, "loss/crossentropy": 2.562077760696411, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16835245490074158, "step": 21330 }, { "epoch": 0.666625, "grad_norm": 3.15625, "grad_norm_var": 0.03990478515625, "learning_rate": 0.0001, "loss": 5.4159, "loss/crossentropy": 2.346079468727112, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15893695503473282, "step": 21332 }, { "epoch": 0.6666875, "grad_norm": 2.84375, "grad_norm_var": 0.04348958333333333, "learning_rate": 0.0001, "loss": 5.637, "loss/crossentropy": 2.576659083366394, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16228371113538742, "step": 21334 }, { "epoch": 0.66675, "grad_norm": 3.03125, "grad_norm_var": 0.03827718098958333, "learning_rate": 0.0001, "loss": 5.9511, "loss/crossentropy": 2.740101456642151, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1710963174700737, "step": 21336 }, { "epoch": 0.6668125, "grad_norm": 2.953125, "grad_norm_var": 0.032515462239583334, "learning_rate": 0.0001, "loss": 5.6354, "loss/crossentropy": 2.5601917505264282, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16298599541187286, "step": 21338 }, { "epoch": 0.666875, "grad_norm": 3.09375, "grad_norm_var": 0.03551025390625, "learning_rate": 0.0001, "loss": 5.262, "loss/crossentropy": 2.3531311750411987, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.14830512553453445, "step": 21340 }, { "epoch": 0.6669375, "grad_norm": 3.015625, "grad_norm_var": 0.03329671223958333, "learning_rate": 0.0001, "loss": 5.7388, "loss/crossentropy": 2.5914554595947266, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16668959707021713, "step": 21342 }, { "epoch": 0.667, "grad_norm": 2.96875, "grad_norm_var": 0.02880859375, "learning_rate": 0.0001, "loss": 5.1932, "loss/crossentropy": 2.2438119649887085, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15275021642446518, "step": 21344 }, { "epoch": 0.6670625, "grad_norm": 3.453125, "grad_norm_var": 0.029964192708333334, "learning_rate": 0.0001, "loss": 5.9101, "loss/crossentropy": 2.704999804496765, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1736399382352829, "step": 21346 }, { "epoch": 0.667125, "grad_norm": 3.1875, "grad_norm_var": 0.030464680989583333, "learning_rate": 0.0001, "loss": 5.6993, "loss/crossentropy": 2.557963013648987, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1641298159956932, "step": 21348 }, { "epoch": 0.6671875, "grad_norm": 2.953125, "grad_norm_var": 0.031233723958333334, "learning_rate": 0.0001, "loss": 5.6561, "loss/crossentropy": 2.53562068939209, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16830183565616608, "step": 21350 }, { "epoch": 0.66725, "grad_norm": 3.234375, "grad_norm_var": 0.03280843098958333, "learning_rate": 0.0001, "loss": 5.3138, "loss/crossentropy": 2.3396353721618652, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1513269543647766, "step": 21352 }, { "epoch": 0.6673125, "grad_norm": 3.078125, "grad_norm_var": 0.030973307291666665, "learning_rate": 0.0001, "loss": 6.12, "loss/crossentropy": 2.8311607837677, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17771321535110474, "step": 21354 }, { "epoch": 0.667375, "grad_norm": 3.84375, "grad_norm_var": 0.05821024576822917, "learning_rate": 0.0001, "loss": 6.0085, "loss/crossentropy": 2.7028005123138428, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.178620845079422, "step": 21356 }, { "epoch": 0.6674375, "grad_norm": 3.28125, "grad_norm_var": 0.05707906087239583, "learning_rate": 0.0001, "loss": 5.7083, "loss/crossentropy": 2.558958649635315, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1700163409113884, "step": 21358 }, { "epoch": 0.6675, "grad_norm": 3.21875, "grad_norm_var": 0.05222880045572917, "learning_rate": 0.0001, "loss": 5.4728, "loss/crossentropy": 2.4816389083862305, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1569308191537857, "step": 21360 }, { "epoch": 0.6675625, "grad_norm": 3.0625, "grad_norm_var": 0.04908447265625, "learning_rate": 0.0001, "loss": 5.5566, "loss/crossentropy": 2.457713007926941, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16457998007535934, "step": 21362 }, { "epoch": 0.667625, "grad_norm": 3.265625, "grad_norm_var": 0.049267578125, "learning_rate": 0.0001, "loss": 5.8844, "loss/crossentropy": 2.669553279876709, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17617502063512802, "step": 21364 }, { "epoch": 0.6676875, "grad_norm": 2.890625, "grad_norm_var": 0.05087483723958333, "learning_rate": 0.0001, "loss": 5.5797, "loss/crossentropy": 2.515458106994629, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16111420094966888, "step": 21366 }, { "epoch": 0.66775, "grad_norm": 3.21875, "grad_norm_var": 0.0529205322265625, "learning_rate": 0.0001, "loss": 6.0597, "loss/crossentropy": 2.8077352046966553, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17558830231428146, "step": 21368 }, { "epoch": 0.6678125, "grad_norm": 3.40625, "grad_norm_var": 0.05445556640625, "learning_rate": 0.0001, "loss": 5.8566, "loss/crossentropy": 2.696689248085022, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1687242016196251, "step": 21370 }, { "epoch": 0.667875, "grad_norm": 3.375, "grad_norm_var": 0.03394775390625, "learning_rate": 0.0001, "loss": 5.5609, "loss/crossentropy": 2.50481379032135, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15755990892648697, "step": 21372 }, { "epoch": 0.6679375, "grad_norm": 3.28125, "grad_norm_var": 0.033299763997395836, "learning_rate": 0.0001, "loss": 5.705, "loss/crossentropy": 2.565472960472107, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16473621875047684, "step": 21374 }, { "epoch": 0.668, "grad_norm": 2.90625, "grad_norm_var": 0.034566243489583336, "learning_rate": 0.0001, "loss": 5.2835, "loss/crossentropy": 2.309640049934387, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15206913650035858, "step": 21376 }, { "epoch": 0.6680625, "grad_norm": 3.171875, "grad_norm_var": 0.035374959309895836, "learning_rate": 0.0001, "loss": 5.4765, "loss/crossentropy": 2.4518284797668457, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15441888570785522, "step": 21378 }, { "epoch": 0.668125, "grad_norm": 3.21875, "grad_norm_var": 0.03623758951822917, "learning_rate": 0.0001, "loss": 5.6201, "loss/crossentropy": 2.522407650947571, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16328418999910355, "step": 21380 }, { "epoch": 0.6681875, "grad_norm": 3.125, "grad_norm_var": 0.03209228515625, "learning_rate": 0.0001, "loss": 5.5553, "loss/crossentropy": 2.4795387983322144, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16031239926815033, "step": 21382 }, { "epoch": 0.66825, "grad_norm": 2.96875, "grad_norm_var": 0.028059895833333334, "learning_rate": 0.0001, "loss": 5.7353, "loss/crossentropy": 2.7149728536605835, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1582816243171692, "step": 21384 }, { "epoch": 0.6683125, "grad_norm": 2.953125, "grad_norm_var": 0.024836222330729168, "learning_rate": 0.0001, "loss": 5.6475, "loss/crossentropy": 2.560445189476013, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.165740005671978, "step": 21386 }, { "epoch": 0.668375, "grad_norm": 2.921875, "grad_norm_var": 0.0182769775390625, "learning_rate": 0.0001, "loss": 5.4794, "loss/crossentropy": 2.439791202545166, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16099288314580917, "step": 21388 }, { "epoch": 0.6684375, "grad_norm": 2.875, "grad_norm_var": 0.016706339518229165, "learning_rate": 0.0001, "loss": 5.2074, "loss/crossentropy": 2.2957775592803955, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14662948995828629, "step": 21390 }, { "epoch": 0.6685, "grad_norm": 3.296875, "grad_norm_var": 0.017985026041666668, "learning_rate": 0.0001, "loss": 5.5945, "loss/crossentropy": 2.4474011659622192, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16509786248207092, "step": 21392 }, { "epoch": 0.6685625, "grad_norm": 2.875, "grad_norm_var": 0.020335896809895834, "learning_rate": 0.0001, "loss": 5.5399, "loss/crossentropy": 2.48570454120636, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1608891859650612, "step": 21394 }, { "epoch": 0.668625, "grad_norm": 3.140625, "grad_norm_var": 0.019222005208333334, "learning_rate": 0.0001, "loss": 5.9927, "loss/crossentropy": 2.7924585342407227, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.170022115111351, "step": 21396 }, { "epoch": 0.6686875, "grad_norm": 3.703125, "grad_norm_var": 0.04445699055989583, "learning_rate": 0.0001, "loss": 6.0819, "loss/crossentropy": 2.764808773994446, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1817099004983902, "step": 21398 }, { "epoch": 0.66875, "grad_norm": 3.0, "grad_norm_var": 0.0437896728515625, "learning_rate": 0.0001, "loss": 5.5681, "loss/crossentropy": 2.5108526945114136, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16080047190189362, "step": 21400 }, { "epoch": 0.6688125, "grad_norm": 3.03125, "grad_norm_var": 0.04038798014322917, "learning_rate": 0.0001, "loss": 5.7719, "loss/crossentropy": 2.595277428627014, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17040060460567474, "step": 21402 }, { "epoch": 0.668875, "grad_norm": 3.1875, "grad_norm_var": 0.04690348307291667, "learning_rate": 0.0001, "loss": 5.7047, "loss/crossentropy": 2.4800525903701782, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.172075092792511, "step": 21404 }, { "epoch": 0.6689375, "grad_norm": 3.203125, "grad_norm_var": 0.046708170572916666, "learning_rate": 0.0001, "loss": 6.2405, "loss/crossentropy": 2.8728411197662354, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18325071781873703, "step": 21406 }, { "epoch": 0.669, "grad_norm": 3.15625, "grad_norm_var": 0.04472554524739583, "learning_rate": 0.0001, "loss": 5.6739, "loss/crossentropy": 2.6061822175979614, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1630219966173172, "step": 21408 }, { "epoch": 0.6690625, "grad_norm": 3.234375, "grad_norm_var": 0.034520467122395836, "learning_rate": 0.0001, "loss": 5.4019, "loss/crossentropy": 2.3656262159347534, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15831893682479858, "step": 21410 }, { "epoch": 0.669125, "grad_norm": 3.171875, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 5.614, "loss/crossentropy": 2.4813379049301147, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16795629262924194, "step": 21412 }, { "epoch": 0.6691875, "grad_norm": 3.21875, "grad_norm_var": 0.019115193684895834, "learning_rate": 0.0001, "loss": 5.8957, "loss/crossentropy": 2.6477352380752563, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17518383264541626, "step": 21414 }, { "epoch": 0.66925, "grad_norm": 3.140625, "grad_norm_var": 0.015152994791666667, "learning_rate": 0.0001, "loss": 5.8608, "loss/crossentropy": 2.6706048250198364, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1705779880285263, "step": 21416 }, { "epoch": 0.6693125, "grad_norm": 3.15625, "grad_norm_var": 0.012767537434895834, "learning_rate": 0.0001, "loss": 5.9529, "loss/crossentropy": 2.699122905731201, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17694467306137085, "step": 21418 }, { "epoch": 0.669375, "grad_norm": 3.234375, "grad_norm_var": 0.009105428059895834, "learning_rate": 0.0001, "loss": 5.7373, "loss/crossentropy": 2.5856951475143433, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1675005704164505, "step": 21420 }, { "epoch": 0.6694375, "grad_norm": 3.796875, "grad_norm_var": 0.02802734375, "learning_rate": 0.0001, "loss": 5.5522, "loss/crossentropy": 2.4567084312438965, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1614982932806015, "step": 21422 }, { "epoch": 0.6695, "grad_norm": 2.59375, "grad_norm_var": 0.061930338541666664, "learning_rate": 0.0001, "loss": 4.9384, "loss/crossentropy": 2.1080212593078613, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.13928364217281342, "step": 21424 }, { "epoch": 0.6695625, "grad_norm": 2.890625, "grad_norm_var": 0.07755533854166667, "learning_rate": 0.0001, "loss": 5.5534, "loss/crossentropy": 2.539226770401001, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1545458659529686, "step": 21426 }, { "epoch": 0.669625, "grad_norm": 2.96875, "grad_norm_var": 0.081494140625, "learning_rate": 0.0001, "loss": 5.442, "loss/crossentropy": 2.441792845726013, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15432009100914001, "step": 21428 }, { "epoch": 0.6696875, "grad_norm": 2.921875, "grad_norm_var": 0.08239644368489583, "learning_rate": 0.0001, "loss": 5.705, "loss/crossentropy": 2.6042795181274414, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16163183003664017, "step": 21430 }, { "epoch": 0.66975, "grad_norm": 3.265625, "grad_norm_var": 0.08306884765625, "learning_rate": 0.0001, "loss": 5.8226, "loss/crossentropy": 2.6924946308135986, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16535554826259613, "step": 21432 }, { "epoch": 0.6698125, "grad_norm": 3.015625, "grad_norm_var": 0.07955729166666667, "learning_rate": 0.0001, "loss": 5.6431, "loss/crossentropy": 2.638508439064026, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15709708631038666, "step": 21434 }, { "epoch": 0.669875, "grad_norm": 3.15625, "grad_norm_var": 0.07449442545572917, "learning_rate": 0.0001, "loss": 5.4824, "loss/crossentropy": 2.4303663969039917, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.156769298017025, "step": 21436 }, { "epoch": 0.6699375, "grad_norm": 2.765625, "grad_norm_var": 0.031201171875, "learning_rate": 0.0001, "loss": 4.9212, "loss/crossentropy": 2.09871244430542, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14161968231201172, "step": 21438 }, { "epoch": 0.67, "grad_norm": 2.859375, "grad_norm_var": 0.03365885416666667, "learning_rate": 0.0001, "loss": 5.7513, "loss/crossentropy": 2.6521228551864624, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16499435156583786, "step": 21440 }, { "epoch": 0.6700625, "grad_norm": 3.21875, "grad_norm_var": 0.03212890625, "learning_rate": 0.0001, "loss": 5.7794, "loss/crossentropy": 2.672476291656494, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16499171406030655, "step": 21442 }, { "epoch": 0.670125, "grad_norm": 3.140625, "grad_norm_var": 0.03137613932291667, "learning_rate": 0.0001, "loss": 5.6225, "loss/crossentropy": 2.4903299808502197, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16633928567171097, "step": 21444 }, { "epoch": 0.6701875, "grad_norm": 3.0625, "grad_norm_var": 0.0308502197265625, "learning_rate": 0.0001, "loss": 5.7248, "loss/crossentropy": 2.611571788787842, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16522642225027084, "step": 21446 }, { "epoch": 0.67025, "grad_norm": 3.203125, "grad_norm_var": 0.029124959309895834, "learning_rate": 0.0001, "loss": 5.5744, "loss/crossentropy": 2.4765546321868896, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16408215463161469, "step": 21448 }, { "epoch": 0.6703125, "grad_norm": 3.125, "grad_norm_var": 0.030159505208333333, "learning_rate": 0.0001, "loss": 5.5132, "loss/crossentropy": 2.464064121246338, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1568623036146164, "step": 21450 }, { "epoch": 0.670375, "grad_norm": 3.328125, "grad_norm_var": 0.03284098307291667, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.536476969718933, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1743551641702652, "step": 21452 }, { "epoch": 0.6704375, "grad_norm": 3.265625, "grad_norm_var": 0.021882120768229166, "learning_rate": 0.0001, "loss": 5.632, "loss/crossentropy": 2.472257375717163, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1675385981798172, "step": 21454 }, { "epoch": 0.6705, "grad_norm": 3.078125, "grad_norm_var": 0.012776692708333334, "learning_rate": 0.0001, "loss": 5.5936, "loss/crossentropy": 2.5181620121002197, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15754467248916626, "step": 21456 }, { "epoch": 0.6705625, "grad_norm": 3.375, "grad_norm_var": 0.012970987955729167, "learning_rate": 0.0001, "loss": 5.408, "loss/crossentropy": 2.344880700111389, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15865999460220337, "step": 21458 }, { "epoch": 0.670625, "grad_norm": 3.078125, "grad_norm_var": 0.046418253580729166, "learning_rate": 0.0001, "loss": 6.0235, "loss/crossentropy": 2.6762707233428955, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1847182661294937, "step": 21460 }, { "epoch": 0.6706875, "grad_norm": 2.921875, "grad_norm_var": 0.0536041259765625, "learning_rate": 0.0001, "loss": 5.6682, "loss/crossentropy": 2.5822503566741943, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16250474750995636, "step": 21462 }, { "epoch": 0.67075, "grad_norm": 3.125, "grad_norm_var": 0.05126953125, "learning_rate": 0.0001, "loss": 5.4845, "loss/crossentropy": 2.3889511823654175, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16228647530078888, "step": 21464 }, { "epoch": 0.6708125, "grad_norm": 3.3125, "grad_norm_var": 0.1002593994140625, "learning_rate": 0.0001, "loss": 6.1805, "loss/crossentropy": 2.7867921590805054, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18429580330848694, "step": 21466 }, { "epoch": 0.670875, "grad_norm": 3.0625, "grad_norm_var": 0.11489156087239584, "learning_rate": 0.0001, "loss": 6.0297, "loss/crossentropy": 2.723998546600342, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17901095747947693, "step": 21468 }, { "epoch": 0.6709375, "grad_norm": 2.9375, "grad_norm_var": 0.1339508056640625, "learning_rate": 0.0001, "loss": 5.5777, "loss/crossentropy": 2.58323073387146, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15725874155759811, "step": 21470 }, { "epoch": 0.671, "grad_norm": 3.140625, "grad_norm_var": 0.13498942057291666, "learning_rate": 0.0001, "loss": 5.7627, "loss/crossentropy": 2.617396593093872, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1684407889842987, "step": 21472 }, { "epoch": 0.6710625, "grad_norm": 3.15625, "grad_norm_var": 0.13696187337239582, "learning_rate": 0.0001, "loss": 6.154, "loss/crossentropy": 2.908242344856262, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17261794209480286, "step": 21474 }, { "epoch": 0.671125, "grad_norm": 3.0625, "grad_norm_var": 0.10506083170572916, "learning_rate": 0.0001, "loss": 5.4484, "loss/crossentropy": 2.439389705657959, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15597765892744064, "step": 21476 }, { "epoch": 0.6711875, "grad_norm": 3.21875, "grad_norm_var": 0.09988505045572917, "learning_rate": 0.0001, "loss": 5.6118, "loss/crossentropy": 2.499711036682129, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16199331730604172, "step": 21478 }, { "epoch": 0.67125, "grad_norm": 3.109375, "grad_norm_var": 0.10393778483072917, "learning_rate": 0.0001, "loss": 5.4711, "loss/crossentropy": 2.4389195442199707, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15672899782657623, "step": 21480 }, { "epoch": 0.6713125, "grad_norm": 3.28125, "grad_norm_var": 0.04990234375, "learning_rate": 0.0001, "loss": 5.6451, "loss/crossentropy": 2.5340847969055176, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16657179594039917, "step": 21482 }, { "epoch": 0.671375, "grad_norm": 3.125, "grad_norm_var": 0.030436197916666668, "learning_rate": 0.0001, "loss": 5.9978, "loss/crossentropy": 2.721407890319824, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17646276205778122, "step": 21484 }, { "epoch": 0.6714375, "grad_norm": 3.25, "grad_norm_var": 0.024117024739583333, "learning_rate": 0.0001, "loss": 5.5812, "loss/crossentropy": 2.4138031005859375, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16829850524663925, "step": 21486 }, { "epoch": 0.6715, "grad_norm": 3.671875, "grad_norm_var": 0.20581766764322917, "learning_rate": 0.0001, "loss": 5.9694, "loss/crossentropy": 2.6807148456573486, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17769964784383774, "step": 21488 }, { "epoch": 0.6715625, "grad_norm": 3.453125, "grad_norm_var": 0.20328369140625, "learning_rate": 0.0001, "loss": 5.5746, "loss/crossentropy": 2.3858500719070435, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1657535880804062, "step": 21490 }, { "epoch": 0.671625, "grad_norm": 2.9375, "grad_norm_var": 0.2178619384765625, "learning_rate": 0.0001, "loss": 5.9514, "loss/crossentropy": 2.840806007385254, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16301733255386353, "step": 21492 }, { "epoch": 0.6716875, "grad_norm": 2.984375, "grad_norm_var": 0.22433980305989584, "learning_rate": 0.0001, "loss": 5.4522, "loss/crossentropy": 2.5097970962524414, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15283824503421783, "step": 21494 }, { "epoch": 0.67175, "grad_norm": 2.84375, "grad_norm_var": 0.23854878743489583, "learning_rate": 0.0001, "loss": 5.1412, "loss/crossentropy": 2.231008291244507, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.149219810962677, "step": 21496 }, { "epoch": 0.6718125, "grad_norm": 2.890625, "grad_norm_var": 0.24462788899739582, "learning_rate": 0.0001, "loss": 5.9064, "loss/crossentropy": 2.7614312171936035, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1652786284685135, "step": 21498 }, { "epoch": 0.671875, "grad_norm": 3.265625, "grad_norm_var": 0.24353841145833333, "learning_rate": 0.0001, "loss": 5.7548, "loss/crossentropy": 2.602095127105713, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16917718201875687, "step": 21500 }, { "epoch": 0.6719375, "grad_norm": 3.375, "grad_norm_var": 5.868277994791667, "learning_rate": 0.0001, "loss": 6.7616, "loss/crossentropy": 2.907825231552124, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.23147086799144745, "step": 21502 }, { "epoch": 0.672, "grad_norm": 4.03125, "grad_norm_var": 5.832027180989583, "learning_rate": 0.0001, "loss": 5.7947, "loss/crossentropy": 2.5979628562927246, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1708494797348976, "step": 21504 }, { "epoch": 0.6720625, "grad_norm": 3.140625, "grad_norm_var": 5.864997355143229, "learning_rate": 0.0001, "loss": 5.5542, "loss/crossentropy": 2.5263938903808594, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1574731022119522, "step": 21506 }, { "epoch": 0.672125, "grad_norm": 3.265625, "grad_norm_var": 5.815843709309896, "learning_rate": 0.0001, "loss": 5.6225, "loss/crossentropy": 2.5533034801483154, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16004548966884613, "step": 21508 }, { "epoch": 0.6721875, "grad_norm": 2.875, "grad_norm_var": 5.85592041015625, "learning_rate": 0.0001, "loss": 5.5983, "loss/crossentropy": 2.50632381439209, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16270896047353745, "step": 21510 }, { "epoch": 0.67225, "grad_norm": 3.15625, "grad_norm_var": 5.825260416666667, "learning_rate": 0.0001, "loss": 5.7735, "loss/crossentropy": 2.6349629163742065, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16659299284219742, "step": 21512 }, { "epoch": 0.6723125, "grad_norm": 3.078125, "grad_norm_var": 5.812165323893229, "learning_rate": 0.0001, "loss": 5.5054, "loss/crossentropy": 2.469195604324341, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16065293550491333, "step": 21514 }, { "epoch": 0.672375, "grad_norm": 2.765625, "grad_norm_var": 5.850516764322917, "learning_rate": 0.0001, "loss": 5.7099, "loss/crossentropy": 2.6085174083709717, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16287671774625778, "step": 21516 }, { "epoch": 0.6724375, "grad_norm": 2.984375, "grad_norm_var": 0.07402242024739583, "learning_rate": 0.0001, "loss": 5.4277, "loss/crossentropy": 2.424909234046936, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15574946999549866, "step": 21518 }, { "epoch": 0.6725, "grad_norm": 3.171875, "grad_norm_var": 0.024299112955729167, "learning_rate": 0.0001, "loss": 5.9345, "loss/crossentropy": 2.6294257640838623, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17972489446401596, "step": 21520 }, { "epoch": 0.6725625, "grad_norm": 3.46875, "grad_norm_var": 0.03630269368489583, "learning_rate": 0.0001, "loss": 5.7392, "loss/crossentropy": 2.563357949256897, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1695333793759346, "step": 21522 }, { "epoch": 0.672625, "grad_norm": 3.34375, "grad_norm_var": 0.040511067708333334, "learning_rate": 0.0001, "loss": 5.7447, "loss/crossentropy": 2.613749384880066, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16426987200975418, "step": 21524 }, { "epoch": 0.6726875, "grad_norm": 3.015625, "grad_norm_var": 0.03713277180989583, "learning_rate": 0.0001, "loss": 6.1799, "loss/crossentropy": 2.9965966939926147, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1706780195236206, "step": 21526 }, { "epoch": 0.67275, "grad_norm": 3.21875, "grad_norm_var": 0.037840779622395834, "learning_rate": 0.0001, "loss": 5.8798, "loss/crossentropy": 2.6346405744552612, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17412938922643661, "step": 21528 }, { "epoch": 0.6728125, "grad_norm": 3.3125, "grad_norm_var": 0.03998921712239583, "learning_rate": 0.0001, "loss": 5.8051, "loss/crossentropy": 2.5978105068206787, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17073268443346024, "step": 21530 }, { "epoch": 0.672875, "grad_norm": 3.28125, "grad_norm_var": 0.027669270833333332, "learning_rate": 0.0001, "loss": 5.7746, "loss/crossentropy": 2.630752921104431, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16594670712947845, "step": 21532 }, { "epoch": 0.6729375, "grad_norm": 3.375, "grad_norm_var": 0.03235270182291667, "learning_rate": 0.0001, "loss": 5.8934, "loss/crossentropy": 2.652822256088257, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17288265377283096, "step": 21534 }, { "epoch": 0.673, "grad_norm": 3.109375, "grad_norm_var": 0.0338043212890625, "learning_rate": 0.0001, "loss": 5.6394, "loss/crossentropy": 2.5622141361236572, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16240905970335007, "step": 21536 }, { "epoch": 0.6730625, "grad_norm": 2.984375, "grad_norm_var": 0.025126139322916668, "learning_rate": 0.0001, "loss": 5.2742, "loss/crossentropy": 2.327731966972351, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15324432402849197, "step": 21538 }, { "epoch": 0.673125, "grad_norm": 2.890625, "grad_norm_var": 0.024168904622395834, "learning_rate": 0.0001, "loss": 5.4945, "loss/crossentropy": 2.427959680557251, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15977810323238373, "step": 21540 }, { "epoch": 0.6731875, "grad_norm": 3.28125, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 5.9594, "loss/crossentropy": 2.7185678482055664, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1764236018061638, "step": 21542 }, { "epoch": 0.67325, "grad_norm": 3.328125, "grad_norm_var": 0.027632649739583334, "learning_rate": 0.0001, "loss": 5.5374, "loss/crossentropy": 2.5168328285217285, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1567484438419342, "step": 21544 }, { "epoch": 0.6733125, "grad_norm": 3.578125, "grad_norm_var": 0.0830718994140625, "learning_rate": 0.0001, "loss": 5.3888, "loss/crossentropy": 2.248305916786194, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.161318838596344, "step": 21546 }, { "epoch": 0.673375, "grad_norm": 3.140625, "grad_norm_var": 0.0821929931640625, "learning_rate": 0.0001, "loss": 5.8188, "loss/crossentropy": 2.6391515731811523, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16991789638996124, "step": 21548 }, { "epoch": 0.6734375, "grad_norm": 3.03125, "grad_norm_var": 0.07672119140625, "learning_rate": 0.0001, "loss": 5.1313, "loss/crossentropy": 2.2192198038101196, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14667852967977524, "step": 21550 }, { "epoch": 0.6735, "grad_norm": 2.734375, "grad_norm_var": 0.08629150390625, "learning_rate": 0.0001, "loss": 5.2923, "loss/crossentropy": 2.354954719543457, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15154416859149933, "step": 21552 }, { "epoch": 0.6735625, "grad_norm": 3.140625, "grad_norm_var": 0.08384501139322917, "learning_rate": 0.0001, "loss": 5.6326, "loss/crossentropy": 2.539131999015808, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16286291182041168, "step": 21554 }, { "epoch": 0.673625, "grad_norm": 3.046875, "grad_norm_var": 0.07922770182291666, "learning_rate": 0.0001, "loss": 5.6455, "loss/crossentropy": 2.562136173248291, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16263478249311447, "step": 21556 }, { "epoch": 0.6736875, "grad_norm": 3.203125, "grad_norm_var": 0.07893778483072916, "learning_rate": 0.0001, "loss": 5.7154, "loss/crossentropy": 2.568955898284912, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16777236759662628, "step": 21558 }, { "epoch": 0.67375, "grad_norm": 2.890625, "grad_norm_var": 0.0842681884765625, "learning_rate": 0.0001, "loss": 5.5765, "loss/crossentropy": 2.5343549251556396, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1585097759962082, "step": 21560 }, { "epoch": 0.6738125, "grad_norm": 3.5625, "grad_norm_var": 0.0351470947265625, "learning_rate": 0.0001, "loss": 5.6905, "loss/crossentropy": 2.507950782775879, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16825632750988007, "step": 21562 }, { "epoch": 0.673875, "grad_norm": 2.984375, "grad_norm_var": 0.03794657389322917, "learning_rate": 0.0001, "loss": 5.4478, "loss/crossentropy": 2.4567400217056274, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15614214539527893, "step": 21564 }, { "epoch": 0.6739375, "grad_norm": 3.0625, "grad_norm_var": 0.0377105712890625, "learning_rate": 0.0001, "loss": 5.8502, "loss/crossentropy": 2.651641011238098, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17610616981983185, "step": 21566 }, { "epoch": 0.674, "grad_norm": 2.953125, "grad_norm_var": 0.03033447265625, "learning_rate": 0.0001, "loss": 5.7605, "loss/crossentropy": 2.601743459701538, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1701701581478119, "step": 21568 }, { "epoch": 0.6740625, "grad_norm": 3.390625, "grad_norm_var": 0.031298828125, "learning_rate": 0.0001, "loss": 5.5561, "loss/crossentropy": 2.5366307497024536, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1589776575565338, "step": 21570 }, { "epoch": 0.674125, "grad_norm": 3.078125, "grad_norm_var": 0.030402628580729167, "learning_rate": 0.0001, "loss": 5.903, "loss/crossentropy": 2.720766544342041, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17017146944999695, "step": 21572 }, { "epoch": 0.6741875, "grad_norm": 2.71875, "grad_norm_var": 0.07251688639322916, "learning_rate": 0.0001, "loss": 5.59, "loss/crossentropy": 2.456699252128601, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16723531484603882, "step": 21574 }, { "epoch": 0.67425, "grad_norm": 3.15625, "grad_norm_var": 0.06866861979166666, "learning_rate": 0.0001, "loss": 5.5863, "loss/crossentropy": 2.5174355506896973, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16547858715057373, "step": 21576 }, { "epoch": 0.6743125, "grad_norm": 3.34375, "grad_norm_var": 0.060286458333333334, "learning_rate": 0.0001, "loss": 5.5883, "loss/crossentropy": 2.590228796005249, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15722859650850296, "step": 21578 }, { "epoch": 0.674375, "grad_norm": 3.5, "grad_norm_var": 0.06468098958333333, "learning_rate": 0.0001, "loss": 5.7053, "loss/crossentropy": 2.455095648765564, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17501623928546906, "step": 21580 }, { "epoch": 0.6744375, "grad_norm": 3.59375, "grad_norm_var": 0.08015950520833333, "learning_rate": 0.0001, "loss": 5.771, "loss/crossentropy": 2.598675847053528, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16958113014698029, "step": 21582 }, { "epoch": 0.6745, "grad_norm": 3.109375, "grad_norm_var": 0.07649637858072916, "learning_rate": 0.0001, "loss": 5.4936, "loss/crossentropy": 2.440723180770874, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1564580649137497, "step": 21584 }, { "epoch": 0.6745625, "grad_norm": 3.15625, "grad_norm_var": 0.07935282389322916, "learning_rate": 0.0001, "loss": 5.3376, "loss/crossentropy": 2.354195713996887, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1530267298221588, "step": 21586 }, { "epoch": 0.674625, "grad_norm": 2.9375, "grad_norm_var": 0.08111063639322917, "learning_rate": 0.0001, "loss": 5.5999, "loss/crossentropy": 2.4619847536087036, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1649627760052681, "step": 21588 }, { "epoch": 0.6746875, "grad_norm": 2.984375, "grad_norm_var": 0.04377339680989583, "learning_rate": 0.0001, "loss": 5.6582, "loss/crossentropy": 2.5627092123031616, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16071902960538864, "step": 21590 }, { "epoch": 0.67475, "grad_norm": 3.1875, "grad_norm_var": 0.04342041015625, "learning_rate": 0.0001, "loss": 5.598, "loss/crossentropy": 2.445231556892395, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1683986634016037, "step": 21592 }, { "epoch": 0.6748125, "grad_norm": 3.0, "grad_norm_var": 0.04360249837239583, "learning_rate": 0.0001, "loss": 5.7165, "loss/crossentropy": 2.5691791772842407, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16941748559474945, "step": 21594 }, { "epoch": 0.674875, "grad_norm": 3.0, "grad_norm_var": 0.03290608723958333, "learning_rate": 0.0001, "loss": 5.2587, "loss/crossentropy": 2.328010857105255, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14658305048942566, "step": 21596 }, { "epoch": 0.6749375, "grad_norm": 3.859375, "grad_norm_var": 0.052733357747395834, "learning_rate": 0.0001, "loss": 5.5409, "loss/crossentropy": 2.464159846305847, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1619701012969017, "step": 21598 }, { "epoch": 0.675, "grad_norm": 3.796875, "grad_norm_var": 0.08989969889322917, "learning_rate": 0.0001, "loss": 5.9731, "loss/crossentropy": 2.5941025018692017, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.18829017877578735, "step": 21600 }, { "epoch": 0.6750625, "grad_norm": 3.40625, "grad_norm_var": 0.08860575358072917, "learning_rate": 0.0001, "loss": 5.9492, "loss/crossentropy": 2.6869665384292603, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1805206462740898, "step": 21602 }, { "epoch": 0.675125, "grad_norm": 3.0625, "grad_norm_var": 0.0867828369140625, "learning_rate": 0.0001, "loss": 5.392, "loss/crossentropy": 2.291312098503113, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16436392813920975, "step": 21604 }, { "epoch": 0.6751875, "grad_norm": 3.484375, "grad_norm_var": 0.08621317545572917, "learning_rate": 0.0001, "loss": 6.0364, "loss/crossentropy": 2.701277017593384, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.17765580117702484, "step": 21606 }, { "epoch": 0.67525, "grad_norm": 3.046875, "grad_norm_var": 0.0879302978515625, "learning_rate": 0.0001, "loss": 5.9724, "loss/crossentropy": 2.739699602127075, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1724887266755104, "step": 21608 }, { "epoch": 0.6753125, "grad_norm": 2.984375, "grad_norm_var": 0.08313700358072916, "learning_rate": 0.0001, "loss": 5.7393, "loss/crossentropy": 2.6435153484344482, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1673872172832489, "step": 21610 }, { "epoch": 0.675375, "grad_norm": 3.03125, "grad_norm_var": 0.0766021728515625, "learning_rate": 0.0001, "loss": 5.6872, "loss/crossentropy": 2.5578598976135254, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16566597670316696, "step": 21612 }, { "epoch": 0.6754375, "grad_norm": 2.828125, "grad_norm_var": 0.0662017822265625, "learning_rate": 0.0001, "loss": 5.5189, "loss/crossentropy": 2.4635757207870483, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16178404539823532, "step": 21614 }, { "epoch": 0.6755, "grad_norm": 3.296875, "grad_norm_var": 0.033991495768229164, "learning_rate": 0.0001, "loss": 5.9125, "loss/crossentropy": 2.6843149662017822, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1743776500225067, "step": 21616 }, { "epoch": 0.6755625, "grad_norm": 2.9375, "grad_norm_var": 0.038981119791666664, "learning_rate": 0.0001, "loss": 5.7046, "loss/crossentropy": 2.483850121498108, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1701253280043602, "step": 21618 }, { "epoch": 0.675625, "grad_norm": 3.203125, "grad_norm_var": 0.04343973795572917, "learning_rate": 0.0001, "loss": 5.5076, "loss/crossentropy": 2.4350651502609253, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1631159856915474, "step": 21620 }, { "epoch": 0.6756875, "grad_norm": 3.046875, "grad_norm_var": 0.028571573893229167, "learning_rate": 0.0001, "loss": 5.5037, "loss/crossentropy": 2.5485366582870483, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15411410480737686, "step": 21622 }, { "epoch": 0.67575, "grad_norm": 2.71875, "grad_norm_var": 0.03479410807291667, "learning_rate": 0.0001, "loss": 5.4561, "loss/crossentropy": 2.519161343574524, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15228858590126038, "step": 21624 }, { "epoch": 0.6758125, "grad_norm": 3.171875, "grad_norm_var": 0.034968058268229164, "learning_rate": 0.0001, "loss": 5.7888, "loss/crossentropy": 2.658233880996704, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1669674590229988, "step": 21626 }, { "epoch": 0.675875, "grad_norm": 3.203125, "grad_norm_var": 0.0362457275390625, "learning_rate": 0.0001, "loss": 5.5435, "loss/crossentropy": 2.451690912246704, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1638697236776352, "step": 21628 }, { "epoch": 0.6759375, "grad_norm": 3.109375, "grad_norm_var": 0.03424072265625, "learning_rate": 0.0001, "loss": 5.6949, "loss/crossentropy": 2.565598487854004, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1641046106815338, "step": 21630 }, { "epoch": 0.676, "grad_norm": 3.21875, "grad_norm_var": 0.0347564697265625, "learning_rate": 0.0001, "loss": 5.9732, "loss/crossentropy": 2.781027913093567, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17194978892803192, "step": 21632 }, { "epoch": 0.6760625, "grad_norm": 3.25, "grad_norm_var": 0.0332672119140625, "learning_rate": 0.0001, "loss": 5.6233, "loss/crossentropy": 2.475935459136963, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16629965603351593, "step": 21634 }, { "epoch": 0.676125, "grad_norm": 2.921875, "grad_norm_var": 0.03291015625, "learning_rate": 0.0001, "loss": 5.4409, "loss/crossentropy": 2.413808822631836, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15739503502845764, "step": 21636 }, { "epoch": 0.6761875, "grad_norm": 3.453125, "grad_norm_var": 0.03726806640625, "learning_rate": 0.0001, "loss": 5.7385, "loss/crossentropy": 2.514286160469055, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17359095811843872, "step": 21638 }, { "epoch": 0.67625, "grad_norm": 3.109375, "grad_norm_var": 0.026667277018229168, "learning_rate": 0.0001, "loss": 5.6374, "loss/crossentropy": 2.5960875749588013, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1635073944926262, "step": 21640 }, { "epoch": 0.6763125, "grad_norm": 3.421875, "grad_norm_var": 0.0535552978515625, "learning_rate": 0.0001, "loss": 5.8778, "loss/crossentropy": 2.601802349090576, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1776026487350464, "step": 21642 }, { "epoch": 0.676375, "grad_norm": 3.171875, "grad_norm_var": 0.05562744140625, "learning_rate": 0.0001, "loss": 5.4776, "loss/crossentropy": 2.3748942613601685, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1614421010017395, "step": 21644 }, { "epoch": 0.6764375, "grad_norm": 3.1875, "grad_norm_var": 0.05523681640625, "learning_rate": 0.0001, "loss": 5.512, "loss/crossentropy": 2.480241894721985, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15630466490983963, "step": 21646 }, { "epoch": 0.6765, "grad_norm": 2.84375, "grad_norm_var": 0.06116129557291667, "learning_rate": 0.0001, "loss": 5.5291, "loss/crossentropy": 2.4708411693573, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1593453586101532, "step": 21648 }, { "epoch": 0.6765625, "grad_norm": 3.328125, "grad_norm_var": 0.061751302083333334, "learning_rate": 0.0001, "loss": 5.9215, "loss/crossentropy": 2.7264283895492554, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1738065481185913, "step": 21650 }, { "epoch": 0.676625, "grad_norm": 3.296875, "grad_norm_var": 0.059342447916666666, "learning_rate": 0.0001, "loss": 6.0063, "loss/crossentropy": 2.7244467735290527, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17779753357172012, "step": 21652 }, { "epoch": 0.6766875, "grad_norm": 3.703125, "grad_norm_var": 0.07317606608072917, "learning_rate": 0.0001, "loss": 5.5412, "loss/crossentropy": 2.442210555076599, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1618504673242569, "step": 21654 }, { "epoch": 0.67675, "grad_norm": 3.09375, "grad_norm_var": 0.07888997395833333, "learning_rate": 0.0001, "loss": 5.6021, "loss/crossentropy": 2.5033955574035645, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1637721210718155, "step": 21656 }, { "epoch": 0.6768125, "grad_norm": 3.359375, "grad_norm_var": 0.051416015625, "learning_rate": 0.0001, "loss": 6.0564, "loss/crossentropy": 2.7717570066452026, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17611663043498993, "step": 21658 }, { "epoch": 0.676875, "grad_norm": 3.640625, "grad_norm_var": 0.0673828125, "learning_rate": 0.0001, "loss": 5.4322, "loss/crossentropy": 2.362541079521179, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15852497518062592, "step": 21660 }, { "epoch": 0.6769375, "grad_norm": 3.1875, "grad_norm_var": 0.06842041015625, "learning_rate": 0.0001, "loss": 5.7047, "loss/crossentropy": 2.586043357849121, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16889940202236176, "step": 21662 }, { "epoch": 0.677, "grad_norm": 3.09375, "grad_norm_var": 0.06282552083333333, "learning_rate": 0.0001, "loss": 5.4755, "loss/crossentropy": 2.4535220861434937, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15375986695289612, "step": 21664 }, { "epoch": 0.6770625, "grad_norm": 3.03125, "grad_norm_var": 0.06164957682291667, "learning_rate": 0.0001, "loss": 5.6035, "loss/crossentropy": 2.4998347759246826, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15997806936502457, "step": 21666 }, { "epoch": 0.677125, "grad_norm": 3.359375, "grad_norm_var": 0.06210530598958333, "learning_rate": 0.0001, "loss": 6.0946, "loss/crossentropy": 2.818961977958679, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17678148299455643, "step": 21668 }, { "epoch": 0.6771875, "grad_norm": 3.171875, "grad_norm_var": 0.06658528645833334, "learning_rate": 0.0001, "loss": 5.779, "loss/crossentropy": 2.6024211645126343, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16805046051740646, "step": 21670 }, { "epoch": 0.67725, "grad_norm": 3.0, "grad_norm_var": 0.06057535807291667, "learning_rate": 0.0001, "loss": 5.9059, "loss/crossentropy": 2.696392774581909, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17016559094190598, "step": 21672 }, { "epoch": 0.6773125, "grad_norm": 3.078125, "grad_norm_var": 0.06682840983072917, "learning_rate": 0.0001, "loss": 5.466, "loss/crossentropy": 2.409417152404785, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15839356929063797, "step": 21674 }, { "epoch": 0.677375, "grad_norm": 2.875, "grad_norm_var": 0.060074869791666666, "learning_rate": 0.0001, "loss": 5.2729, "loss/crossentropy": 2.3077032566070557, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15316389501094818, "step": 21676 }, { "epoch": 0.6774375, "grad_norm": 2.71875, "grad_norm_var": 0.06892903645833333, "learning_rate": 0.0001, "loss": 5.0794, "loss/crossentropy": 2.1735494136810303, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14761539548635483, "step": 21678 }, { "epoch": 0.6775, "grad_norm": 3.015625, "grad_norm_var": 0.0689453125, "learning_rate": 0.0001, "loss": 5.4218, "loss/crossentropy": 2.421651840209961, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15860862284898758, "step": 21680 }, { "epoch": 0.6775625, "grad_norm": 3.125, "grad_norm_var": 0.066796875, "learning_rate": 0.0001, "loss": 5.5028, "loss/crossentropy": 2.4240047931671143, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16295696049928665, "step": 21682 }, { "epoch": 0.677625, "grad_norm": 2.6875, "grad_norm_var": 0.07112223307291667, "learning_rate": 0.0001, "loss": 5.4404, "loss/crossentropy": 2.4916775226593018, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15268395096063614, "step": 21684 }, { "epoch": 0.6776875, "grad_norm": 3.296875, "grad_norm_var": 0.0428619384765625, "learning_rate": 0.0001, "loss": 5.8948, "loss/crossentropy": 2.6532318592071533, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17571566998958588, "step": 21686 }, { "epoch": 0.67775, "grad_norm": 3.1875, "grad_norm_var": 0.038330078125, "learning_rate": 0.0001, "loss": 5.5301, "loss/crossentropy": 2.513607978820801, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15555134415626526, "step": 21688 }, { "epoch": 0.6778125, "grad_norm": 3.234375, "grad_norm_var": 0.039281209309895836, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.5421735048294067, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1594759076833725, "step": 21690 }, { "epoch": 0.677875, "grad_norm": 3.3125, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 5.4246, "loss/crossentropy": 2.3655821084976196, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15942063927650452, "step": 21692 }, { "epoch": 0.6779375, "grad_norm": 3.125, "grad_norm_var": 0.037629191080729166, "learning_rate": 0.0001, "loss": 5.7691, "loss/crossentropy": 2.653545618057251, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16506946086883545, "step": 21694 }, { "epoch": 0.678, "grad_norm": 3.1875, "grad_norm_var": 0.03615620930989583, "learning_rate": 0.0001, "loss": 5.9941, "loss/crossentropy": 2.679787755012512, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18064585328102112, "step": 21696 }, { "epoch": 0.6780625, "grad_norm": 3.03125, "grad_norm_var": 0.038508097330729164, "learning_rate": 0.0001, "loss": 5.5408, "loss/crossentropy": 2.5309895277023315, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1568371206521988, "step": 21698 }, { "epoch": 0.678125, "grad_norm": 3.1875, "grad_norm_var": 0.0251861572265625, "learning_rate": 0.0001, "loss": 5.9506, "loss/crossentropy": 2.7247650623321533, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1741444393992424, "step": 21700 }, { "epoch": 0.6781875, "grad_norm": 3.078125, "grad_norm_var": 0.029295857747395834, "learning_rate": 0.0001, "loss": 5.77, "loss/crossentropy": 2.7215741872787476, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16031458973884583, "step": 21702 }, { "epoch": 0.67825, "grad_norm": 3.4375, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 5.6592, "loss/crossentropy": 2.507062554359436, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16951168328523636, "step": 21704 }, { "epoch": 0.6783125, "grad_norm": 2.875, "grad_norm_var": 0.04080403645833333, "learning_rate": 0.0001, "loss": 5.3356, "loss/crossentropy": 2.342239499092102, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15246369689702988, "step": 21706 }, { "epoch": 0.678375, "grad_norm": 3.3125, "grad_norm_var": 0.04680074055989583, "learning_rate": 0.0001, "loss": 5.6685, "loss/crossentropy": 2.6061320304870605, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16209647804498672, "step": 21708 }, { "epoch": 0.6784375, "grad_norm": 2.921875, "grad_norm_var": 0.0479644775390625, "learning_rate": 0.0001, "loss": 5.4719, "loss/crossentropy": 2.491135835647583, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15706124156713486, "step": 21710 }, { "epoch": 0.6785, "grad_norm": 3.03125, "grad_norm_var": 0.04545796712239583, "learning_rate": 0.0001, "loss": 5.507, "loss/crossentropy": 2.4627164602279663, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1614636331796646, "step": 21712 }, { "epoch": 0.6785625, "grad_norm": 3.09375, "grad_norm_var": 0.0504547119140625, "learning_rate": 0.0001, "loss": 5.5285, "loss/crossentropy": 2.550132632255554, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1560390442609787, "step": 21714 }, { "epoch": 0.678625, "grad_norm": 2.890625, "grad_norm_var": 0.04195963541666667, "learning_rate": 0.0001, "loss": 5.7092, "loss/crossentropy": 2.5589277744293213, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16580668836832047, "step": 21716 }, { "epoch": 0.6786875, "grad_norm": 3.15625, "grad_norm_var": 0.043294270833333336, "learning_rate": 0.0001, "loss": 5.8755, "loss/crossentropy": 2.6546236276626587, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17404113709926605, "step": 21718 }, { "epoch": 0.67875, "grad_norm": 3.1875, "grad_norm_var": 0.0341796875, "learning_rate": 0.0001, "loss": 5.6575, "loss/crossentropy": 2.581426978111267, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16307304054498672, "step": 21720 }, { "epoch": 0.6788125, "grad_norm": 3.0, "grad_norm_var": 0.0287109375, "learning_rate": 0.0001, "loss": 5.7229, "loss/crossentropy": 2.6428853273391724, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16424734890460968, "step": 21722 }, { "epoch": 0.678875, "grad_norm": 3.265625, "grad_norm_var": 0.024267578125, "learning_rate": 0.0001, "loss": 5.7864, "loss/crossentropy": 2.600390076637268, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17133549600839615, "step": 21724 }, { "epoch": 0.6789375, "grad_norm": 3.015625, "grad_norm_var": 0.024348958333333334, "learning_rate": 0.0001, "loss": 5.5554, "loss/crossentropy": 2.4865139722824097, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16157332062721252, "step": 21726 }, { "epoch": 0.679, "grad_norm": 2.78125, "grad_norm_var": 0.028450520833333333, "learning_rate": 0.0001, "loss": 5.5107, "loss/crossentropy": 2.514100670814514, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1508304327726364, "step": 21728 }, { "epoch": 0.6790625, "grad_norm": 3.5, "grad_norm_var": 0.0369537353515625, "learning_rate": 0.0001, "loss": 5.5489, "loss/crossentropy": 2.3980984687805176, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16507767140865326, "step": 21730 }, { "epoch": 0.679125, "grad_norm": 3.140625, "grad_norm_var": 0.035384114583333334, "learning_rate": 0.0001, "loss": 5.5816, "loss/crossentropy": 2.4350985288619995, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16582182049751282, "step": 21732 }, { "epoch": 0.6791875, "grad_norm": 3.046875, "grad_norm_var": 0.03336181640625, "learning_rate": 0.0001, "loss": 5.4235, "loss/crossentropy": 2.3969926834106445, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1592930406332016, "step": 21734 }, { "epoch": 0.67925, "grad_norm": 3.078125, "grad_norm_var": 0.029645792643229165, "learning_rate": 0.0001, "loss": 5.8289, "loss/crossentropy": 2.715415120124817, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1636960431933403, "step": 21736 }, { "epoch": 0.6793125, "grad_norm": 3.15625, "grad_norm_var": 0.028889973958333332, "learning_rate": 0.0001, "loss": 5.8493, "loss/crossentropy": 2.6418182849884033, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1734854057431221, "step": 21738 }, { "epoch": 0.679375, "grad_norm": 3.03125, "grad_norm_var": 0.02734375, "learning_rate": 0.0001, "loss": 5.4197, "loss/crossentropy": 2.398454427719116, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15525156259536743, "step": 21740 }, { "epoch": 0.6794375, "grad_norm": 3.3125, "grad_norm_var": 0.027864583333333335, "learning_rate": 0.0001, "loss": 6.0178, "loss/crossentropy": 2.7353005409240723, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1782461702823639, "step": 21742 }, { "epoch": 0.6795, "grad_norm": 3.203125, "grad_norm_var": 0.018504842122395834, "learning_rate": 0.0001, "loss": 5.674, "loss/crossentropy": 2.527986168861389, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16459853947162628, "step": 21744 }, { "epoch": 0.6795625, "grad_norm": 3.03125, "grad_norm_var": 0.011421712239583333, "learning_rate": 0.0001, "loss": 5.5751, "loss/crossentropy": 2.5177600383758545, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1615949645638466, "step": 21746 }, { "epoch": 0.679625, "grad_norm": 3.125, "grad_norm_var": 0.012646484375, "learning_rate": 0.0001, "loss": 5.8048, "loss/crossentropy": 2.615201950073242, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17130304872989655, "step": 21748 }, { "epoch": 0.6796875, "grad_norm": 2.953125, "grad_norm_var": 0.0217193603515625, "learning_rate": 0.0001, "loss": 5.4275, "loss/crossentropy": 2.363314151763916, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15875961631536484, "step": 21750 }, { "epoch": 0.67975, "grad_norm": 3.265625, "grad_norm_var": 0.0248443603515625, "learning_rate": 0.0001, "loss": 5.4695, "loss/crossentropy": 2.4244298934936523, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1588074341416359, "step": 21752 }, { "epoch": 0.6798125, "grad_norm": 3.28125, "grad_norm_var": 0.03337300618489583, "learning_rate": 0.0001, "loss": 6.0363, "loss/crossentropy": 2.735056757926941, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17895662039518356, "step": 21754 }, { "epoch": 0.679875, "grad_norm": 2.8125, "grad_norm_var": 0.03889872233072917, "learning_rate": 0.0001, "loss": 5.4128, "loss/crossentropy": 2.4331005811691284, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1526607722043991, "step": 21756 }, { "epoch": 0.6799375, "grad_norm": 3.328125, "grad_norm_var": 0.0373199462890625, "learning_rate": 0.0001, "loss": 5.8498, "loss/crossentropy": 2.6393890380859375, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.167527437210083, "step": 21758 }, { "epoch": 0.68, "grad_norm": 2.984375, "grad_norm_var": 0.039826456705729166, "learning_rate": 0.0001, "loss": 5.4507, "loss/crossentropy": 2.4479658603668213, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15887079387903214, "step": 21760 }, { "epoch": 0.6800625, "grad_norm": 3.296875, "grad_norm_var": 0.03677978515625, "learning_rate": 0.0001, "loss": 5.9617, "loss/crossentropy": 2.7392600774765015, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.173413947224617, "step": 21762 }, { "epoch": 0.680125, "grad_norm": 3.109375, "grad_norm_var": 0.03687744140625, "learning_rate": 0.0001, "loss": 5.6082, "loss/crossentropy": 2.4955997467041016, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16555281728506088, "step": 21764 }, { "epoch": 0.6801875, "grad_norm": 3.125, "grad_norm_var": 0.028804524739583334, "learning_rate": 0.0001, "loss": 5.702, "loss/crossentropy": 2.5557764768600464, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16657356917858124, "step": 21766 }, { "epoch": 0.68025, "grad_norm": 3.140625, "grad_norm_var": 0.026276652018229166, "learning_rate": 0.0001, "loss": 5.6545, "loss/crossentropy": 2.5388453006744385, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16742068529129028, "step": 21768 }, { "epoch": 0.6803125, "grad_norm": 3.25, "grad_norm_var": 0.016434733072916666, "learning_rate": 0.0001, "loss": 5.7271, "loss/crossentropy": 2.5621873140335083, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17079270631074905, "step": 21770 }, { "epoch": 0.680375, "grad_norm": 3.453125, "grad_norm_var": 0.016087849934895832, "learning_rate": 0.0001, "loss": 5.7628, "loss/crossentropy": 2.5121124982833862, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17350760102272034, "step": 21772 }, { "epoch": 0.6804375, "grad_norm": 2.890625, "grad_norm_var": 0.021637980143229166, "learning_rate": 0.0001, "loss": 5.3457, "loss/crossentropy": 2.324031352996826, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15684963762760162, "step": 21774 }, { "epoch": 0.6805, "grad_norm": 3.109375, "grad_norm_var": 0.023193359375, "learning_rate": 0.0001, "loss": 5.6466, "loss/crossentropy": 2.584893584251404, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16241639107465744, "step": 21776 }, { "epoch": 0.6805625, "grad_norm": 3.0625, "grad_norm_var": 0.02105712890625, "learning_rate": 0.0001, "loss": 5.4492, "loss/crossentropy": 2.3818461894989014, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15869245678186417, "step": 21778 }, { "epoch": 0.680625, "grad_norm": 2.9375, "grad_norm_var": 0.024396769205729165, "learning_rate": 0.0001, "loss": 5.6076, "loss/crossentropy": 2.5006372928619385, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16616414487361908, "step": 21780 }, { "epoch": 0.6806875, "grad_norm": 3.25, "grad_norm_var": 0.04126688639322917, "learning_rate": 0.0001, "loss": 5.7623, "loss/crossentropy": 2.6387823820114136, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16430340707302094, "step": 21782 }, { "epoch": 0.68075, "grad_norm": 3.5, "grad_norm_var": 0.0513092041015625, "learning_rate": 0.0001, "loss": 5.7836, "loss/crossentropy": 2.6003860235214233, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17027339339256287, "step": 21784 }, { "epoch": 0.6808125, "grad_norm": 2.84375, "grad_norm_var": 0.0592926025390625, "learning_rate": 0.0001, "loss": 5.65, "loss/crossentropy": 2.577357530593872, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16195328533649445, "step": 21786 }, { "epoch": 0.680875, "grad_norm": 2.921875, "grad_norm_var": 0.05533447265625, "learning_rate": 0.0001, "loss": 5.2702, "loss/crossentropy": 2.31095814704895, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15295067429542542, "step": 21788 }, { "epoch": 0.6809375, "grad_norm": 3.234375, "grad_norm_var": 0.0498687744140625, "learning_rate": 0.0001, "loss": 5.8766, "loss/crossentropy": 2.666735887527466, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17137304693460464, "step": 21790 }, { "epoch": 0.681, "grad_norm": 3.390625, "grad_norm_var": 0.0470367431640625, "learning_rate": 0.0001, "loss": 6.1725, "loss/crossentropy": 2.964896559715271, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17075765132904053, "step": 21792 }, { "epoch": 0.6810625, "grad_norm": 3.265625, "grad_norm_var": 0.0511383056640625, "learning_rate": 0.0001, "loss": 5.6333, "loss/crossentropy": 2.51242196559906, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16130948811769485, "step": 21794 }, { "epoch": 0.681125, "grad_norm": 3.15625, "grad_norm_var": 0.04722900390625, "learning_rate": 0.0001, "loss": 5.761, "loss/crossentropy": 2.619884967803955, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16762836277484894, "step": 21796 }, { "epoch": 0.6811875, "grad_norm": 3.1875, "grad_norm_var": 0.03577473958333333, "learning_rate": 0.0001, "loss": 5.5565, "loss/crossentropy": 2.514464259147644, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15928348153829575, "step": 21798 }, { "epoch": 0.68125, "grad_norm": 2.9375, "grad_norm_var": 0.028547159830729165, "learning_rate": 0.0001, "loss": 5.5508, "loss/crossentropy": 2.459440588951111, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16343362629413605, "step": 21800 }, { "epoch": 0.6813125, "grad_norm": 3.15625, "grad_norm_var": 0.0224609375, "learning_rate": 0.0001, "loss": 6.0475, "loss/crossentropy": 2.7599663734436035, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17719028145074844, "step": 21802 }, { "epoch": 0.681375, "grad_norm": 3.234375, "grad_norm_var": 0.018098958333333335, "learning_rate": 0.0001, "loss": 5.8111, "loss/crossentropy": 2.6380518674850464, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16847773641347885, "step": 21804 }, { "epoch": 0.6814375, "grad_norm": 3.265625, "grad_norm_var": 0.018229166666666668, "learning_rate": 0.0001, "loss": 5.8106, "loss/crossentropy": 2.6051825284957886, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.168195903301239, "step": 21806 }, { "epoch": 0.6815, "grad_norm": 3.140625, "grad_norm_var": 0.043229166666666666, "learning_rate": 0.0001, "loss": 5.7213, "loss/crossentropy": 2.54864764213562, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16531335562467575, "step": 21808 }, { "epoch": 0.6815625, "grad_norm": 3.078125, "grad_norm_var": 0.07844136555989584, "learning_rate": 0.0001, "loss": 5.5756, "loss/crossentropy": 2.446473240852356, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.1582270786166191, "step": 21810 }, { "epoch": 0.681625, "grad_norm": 3.09375, "grad_norm_var": 0.08017171223958333, "learning_rate": 0.0001, "loss": 5.7248, "loss/crossentropy": 2.618484854698181, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16610309481620789, "step": 21812 }, { "epoch": 0.6816875, "grad_norm": 3.34375, "grad_norm_var": 0.08569234212239583, "learning_rate": 0.0001, "loss": 5.8648, "loss/crossentropy": 2.748309850692749, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16360018402338028, "step": 21814 }, { "epoch": 0.68175, "grad_norm": 2.953125, "grad_norm_var": 0.08810221354166667, "learning_rate": 0.0001, "loss": 5.6683, "loss/crossentropy": 2.60297167301178, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1604352742433548, "step": 21816 }, { "epoch": 0.6818125, "grad_norm": 3.375, "grad_norm_var": 0.09358317057291667, "learning_rate": 0.0001, "loss": 5.6063, "loss/crossentropy": 2.4555102586746216, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1662549301981926, "step": 21818 }, { "epoch": 0.681875, "grad_norm": 3.265625, "grad_norm_var": 0.1030426025390625, "learning_rate": 0.0001, "loss": 5.8703, "loss/crossentropy": 2.67289662361145, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17208616435527802, "step": 21820 }, { "epoch": 0.6819375, "grad_norm": 3.0625, "grad_norm_var": 0.10695699055989584, "learning_rate": 0.0001, "loss": 5.7378, "loss/crossentropy": 2.6012319326400757, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16639358550310135, "step": 21822 }, { "epoch": 0.682, "grad_norm": 3.09375, "grad_norm_var": 0.07779541015625, "learning_rate": 0.0001, "loss": 5.6664, "loss/crossentropy": 2.509590744972229, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16841749101877213, "step": 21824 }, { "epoch": 0.6820625, "grad_norm": 2.953125, "grad_norm_var": 0.02222900390625, "learning_rate": 0.0001, "loss": 5.6028, "loss/crossentropy": 2.49535071849823, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1634838804602623, "step": 21826 }, { "epoch": 0.682125, "grad_norm": 3.0625, "grad_norm_var": 0.026883951822916665, "learning_rate": 0.0001, "loss": 5.5229, "loss/crossentropy": 2.367142081260681, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16244809329509735, "step": 21828 }, { "epoch": 0.6821875, "grad_norm": 3.1875, "grad_norm_var": 0.021898396809895835, "learning_rate": 0.0001, "loss": 5.4361, "loss/crossentropy": 2.474857807159424, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15472085028886795, "step": 21830 }, { "epoch": 0.68225, "grad_norm": 3.21875, "grad_norm_var": 0.0219879150390625, "learning_rate": 0.0001, "loss": 5.8704, "loss/crossentropy": 2.7120243310928345, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16857147961854935, "step": 21832 }, { "epoch": 0.6823125, "grad_norm": 3.0625, "grad_norm_var": 0.01715087890625, "learning_rate": 0.0001, "loss": 5.8478, "loss/crossentropy": 2.694658398628235, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.171563059091568, "step": 21834 }, { "epoch": 0.682375, "grad_norm": 3.578125, "grad_norm_var": 0.0235992431640625, "learning_rate": 0.0001, "loss": 5.7727, "loss/crossentropy": 2.5386829376220703, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17378805577754974, "step": 21836 }, { "epoch": 0.6824375, "grad_norm": 2.796875, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 5.491, "loss/crossentropy": 2.5196973085403442, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1541638895869255, "step": 21838 }, { "epoch": 0.6825, "grad_norm": 2.984375, "grad_norm_var": 0.041169230143229166, "learning_rate": 0.0001, "loss": 5.4463, "loss/crossentropy": 2.4267587661743164, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15703026950359344, "step": 21840 }, { "epoch": 0.6825625, "grad_norm": 3.40625, "grad_norm_var": 0.04413655598958333, "learning_rate": 0.0001, "loss": 5.8878, "loss/crossentropy": 2.619802474975586, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17953240871429443, "step": 21842 }, { "epoch": 0.682625, "grad_norm": 3.15625, "grad_norm_var": 0.04462483723958333, "learning_rate": 0.0001, "loss": 5.741, "loss/crossentropy": 2.5955699682235718, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1692328080534935, "step": 21844 }, { "epoch": 0.6826875, "grad_norm": 3.421875, "grad_norm_var": 0.05447591145833333, "learning_rate": 0.0001, "loss": 5.4384, "loss/crossentropy": 2.3693102598190308, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16120661795139313, "step": 21846 }, { "epoch": 0.68275, "grad_norm": 2.9375, "grad_norm_var": 0.05701497395833333, "learning_rate": 0.0001, "loss": 5.4267, "loss/crossentropy": 2.371813416481018, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16095516085624695, "step": 21848 }, { "epoch": 0.6828125, "grad_norm": 3.015625, "grad_norm_var": 0.05681966145833333, "learning_rate": 0.0001, "loss": 5.8023, "loss/crossentropy": 2.6406824588775635, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1661587730050087, "step": 21850 }, { "epoch": 0.682875, "grad_norm": 3.390625, "grad_norm_var": 0.04744364420572917, "learning_rate": 0.0001, "loss": 5.7604, "loss/crossentropy": 2.5873711109161377, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16691134870052338, "step": 21852 }, { "epoch": 0.6829375, "grad_norm": 3.125, "grad_norm_var": 0.033600870768229166, "learning_rate": 0.0001, "loss": 5.4775, "loss/crossentropy": 2.450773000717163, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15227775275707245, "step": 21854 }, { "epoch": 0.683, "grad_norm": 2.984375, "grad_norm_var": 0.029052734375, "learning_rate": 0.0001, "loss": 5.589, "loss/crossentropy": 2.4866209030151367, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16453347355127335, "step": 21856 }, { "epoch": 0.6830625, "grad_norm": 2.828125, "grad_norm_var": 0.033524576822916666, "learning_rate": 0.0001, "loss": 5.1388, "loss/crossentropy": 2.2659223079681396, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.14900429546833038, "step": 21858 }, { "epoch": 0.683125, "grad_norm": 3.09375, "grad_norm_var": 0.031859334309895834, "learning_rate": 0.0001, "loss": 5.4301, "loss/crossentropy": 2.3820217847824097, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15754248201847076, "step": 21860 }, { "epoch": 0.6831875, "grad_norm": 3.140625, "grad_norm_var": 0.021711222330729165, "learning_rate": 0.0001, "loss": 5.8392, "loss/crossentropy": 2.633925437927246, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1709151640534401, "step": 21862 }, { "epoch": 0.68325, "grad_norm": 3.03125, "grad_norm_var": 0.019059244791666666, "learning_rate": 0.0001, "loss": 5.9715, "loss/crossentropy": 2.7862902879714966, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17164774239063263, "step": 21864 }, { "epoch": 0.6833125, "grad_norm": 3.125, "grad_norm_var": 0.01978759765625, "learning_rate": 0.0001, "loss": 5.4585, "loss/crossentropy": 2.4424999952316284, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1590244174003601, "step": 21866 }, { "epoch": 0.683375, "grad_norm": 2.96875, "grad_norm_var": 0.013895670572916666, "learning_rate": 0.0001, "loss": 5.4504, "loss/crossentropy": 2.384376049041748, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15972968190908432, "step": 21868 }, { "epoch": 0.6834375, "grad_norm": 3.3125, "grad_norm_var": 0.0204254150390625, "learning_rate": 0.0001, "loss": 5.9383, "loss/crossentropy": 2.7200969457626343, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17260020226240158, "step": 21870 }, { "epoch": 0.6835, "grad_norm": 3.328125, "grad_norm_var": 0.023291015625, "learning_rate": 0.0001, "loss": 5.5581, "loss/crossentropy": 2.4979900121688843, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1583554670214653, "step": 21872 }, { "epoch": 0.6835625, "grad_norm": 3.0, "grad_norm_var": 0.017406209309895834, "learning_rate": 0.0001, "loss": 5.5118, "loss/crossentropy": 2.4855817556381226, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15692057460546494, "step": 21874 }, { "epoch": 0.683625, "grad_norm": 3.640625, "grad_norm_var": 0.03599853515625, "learning_rate": 0.0001, "loss": 5.3984, "loss/crossentropy": 2.402698278427124, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1562107652425766, "step": 21876 }, { "epoch": 0.6836875, "grad_norm": 3.046875, "grad_norm_var": 0.0371002197265625, "learning_rate": 0.0001, "loss": 5.8649, "loss/crossentropy": 2.6718828678131104, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17046964913606644, "step": 21878 }, { "epoch": 0.68375, "grad_norm": 3.265625, "grad_norm_var": 0.03863525390625, "learning_rate": 0.0001, "loss": 5.8438, "loss/crossentropy": 2.690075397491455, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16693761944770813, "step": 21880 }, { "epoch": 0.6838125, "grad_norm": 3.15625, "grad_norm_var": 0.036408487955729166, "learning_rate": 0.0001, "loss": 5.9486, "loss/crossentropy": 2.7212469577789307, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17078034579753876, "step": 21882 }, { "epoch": 0.683875, "grad_norm": 3.546875, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 5.8821, "loss/crossentropy": 2.655408501625061, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17461851239204407, "step": 21884 }, { "epoch": 0.6839375, "grad_norm": 3.359375, "grad_norm_var": 0.047281901041666664, "learning_rate": 0.0001, "loss": 5.7517, "loss/crossentropy": 2.567640542984009, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17191973328590393, "step": 21886 }, { "epoch": 0.684, "grad_norm": 2.9375, "grad_norm_var": 0.06629130045572916, "learning_rate": 0.0001, "loss": 5.7444, "loss/crossentropy": 2.6137380599975586, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16619385033845901, "step": 21888 }, { "epoch": 0.6840625, "grad_norm": 3.15625, "grad_norm_var": 0.06210530598958333, "learning_rate": 0.0001, "loss": 5.7919, "loss/crossentropy": 2.644736409187317, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16901639103889465, "step": 21890 }, { "epoch": 0.684125, "grad_norm": 3.796875, "grad_norm_var": 0.07151285807291667, "learning_rate": 0.0001, "loss": 5.6171, "loss/crossentropy": 2.507060408592224, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16373561322689056, "step": 21892 }, { "epoch": 0.6841875, "grad_norm": 3.046875, "grad_norm_var": 0.07157796223958333, "learning_rate": 0.0001, "loss": 5.8277, "loss/crossentropy": 2.6672521829605103, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1687830165028572, "step": 21894 }, { "epoch": 0.68425, "grad_norm": 2.90625, "grad_norm_var": 0.07733968098958334, "learning_rate": 0.0001, "loss": 5.7131, "loss/crossentropy": 2.591132402420044, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16258373111486435, "step": 21896 }, { "epoch": 0.6843125, "grad_norm": 3.296875, "grad_norm_var": 0.07913004557291667, "learning_rate": 0.0001, "loss": 5.8509, "loss/crossentropy": 2.5987366437911987, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17482291162014008, "step": 21898 }, { "epoch": 0.684375, "grad_norm": 2.921875, "grad_norm_var": 0.07366434733072917, "learning_rate": 0.0001, "loss": 5.3414, "loss/crossentropy": 2.376563310623169, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1519482657313347, "step": 21900 }, { "epoch": 0.6844375, "grad_norm": 2.90625, "grad_norm_var": 0.074755859375, "learning_rate": 0.0001, "loss": 5.6567, "loss/crossentropy": 2.512323498725891, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1671675145626068, "step": 21902 }, { "epoch": 0.6845, "grad_norm": 3.296875, "grad_norm_var": 0.05561421712239583, "learning_rate": 0.0001, "loss": 5.6793, "loss/crossentropy": 2.6000667810440063, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16573521494865417, "step": 21904 }, { "epoch": 0.6845625, "grad_norm": 3.0, "grad_norm_var": 0.0597320556640625, "learning_rate": 0.0001, "loss": 5.2702, "loss/crossentropy": 2.292046546936035, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15640486031770706, "step": 21906 }, { "epoch": 0.684625, "grad_norm": 2.859375, "grad_norm_var": 0.03037109375, "learning_rate": 0.0001, "loss": 5.7362, "loss/crossentropy": 2.6720213890075684, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15836820751428604, "step": 21908 }, { "epoch": 0.6846875, "grad_norm": 3.0625, "grad_norm_var": 0.03447163899739583, "learning_rate": 0.0001, "loss": 5.5336, "loss/crossentropy": 2.515222191810608, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15925846248865128, "step": 21910 }, { "epoch": 0.68475, "grad_norm": 3.46875, "grad_norm_var": 0.04980061848958333, "learning_rate": 0.0001, "loss": 5.3531, "loss/crossentropy": 2.3858895301818848, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1525760367512703, "step": 21912 }, { "epoch": 0.6848125, "grad_norm": 3.0, "grad_norm_var": 0.04840087890625, "learning_rate": 0.0001, "loss": 5.4175, "loss/crossentropy": 2.37613046169281, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16117016226053238, "step": 21914 }, { "epoch": 0.684875, "grad_norm": 3.0625, "grad_norm_var": 0.0574127197265625, "learning_rate": 0.0001, "loss": 5.7245, "loss/crossentropy": 2.587957739830017, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16600116342306137, "step": 21916 }, { "epoch": 0.6849375, "grad_norm": 2.96875, "grad_norm_var": 0.0575836181640625, "learning_rate": 0.0001, "loss": 5.719, "loss/crossentropy": 2.6305911540985107, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16352810710668564, "step": 21918 }, { "epoch": 0.685, "grad_norm": 3.109375, "grad_norm_var": 0.05487874348958333, "learning_rate": 0.0001, "loss": 5.7561, "loss/crossentropy": 2.6287907361984253, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16468214243650436, "step": 21920 }, { "epoch": 0.6850625, "grad_norm": 2.859375, "grad_norm_var": 0.055464680989583334, "learning_rate": 0.0001, "loss": 5.233, "loss/crossentropy": 2.2569140195846558, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15424616634845734, "step": 21922 }, { "epoch": 0.685125, "grad_norm": 2.953125, "grad_norm_var": 0.0605865478515625, "learning_rate": 0.0001, "loss": 5.7256, "loss/crossentropy": 2.5466452836990356, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16907097399234772, "step": 21924 }, { "epoch": 0.6851875, "grad_norm": 2.84375, "grad_norm_var": 0.059178670247395836, "learning_rate": 0.0001, "loss": 5.4947, "loss/crossentropy": 2.4349265098571777, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1571447104215622, "step": 21926 }, { "epoch": 0.68525, "grad_norm": 3.140625, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 5.649, "loss/crossentropy": 2.5687440633773804, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16153889894485474, "step": 21928 }, { "epoch": 0.6853125, "grad_norm": 2.9375, "grad_norm_var": 0.0420318603515625, "learning_rate": 0.0001, "loss": 5.8262, "loss/crossentropy": 2.7170010805130005, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16522109508514404, "step": 21930 }, { "epoch": 0.685375, "grad_norm": 3.21875, "grad_norm_var": 0.035986328125, "learning_rate": 0.0001, "loss": 5.5689, "loss/crossentropy": 2.470358729362488, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16336709260940552, "step": 21932 }, { "epoch": 0.6854375, "grad_norm": 3.078125, "grad_norm_var": 0.03255106608072917, "learning_rate": 0.0001, "loss": 5.3914, "loss/crossentropy": 2.3599261045455933, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15861255675554276, "step": 21934 }, { "epoch": 0.6855, "grad_norm": 3.203125, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 5.6691, "loss/crossentropy": 2.5072624683380127, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16579187661409378, "step": 21936 }, { "epoch": 0.6855625, "grad_norm": 3.265625, "grad_norm_var": 0.027197265625, "learning_rate": 0.0001, "loss": 5.6996, "loss/crossentropy": 2.5683263540267944, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16429609060287476, "step": 21938 }, { "epoch": 0.685625, "grad_norm": 2.96875, "grad_norm_var": 0.022948201497395834, "learning_rate": 0.0001, "loss": 5.5415, "loss/crossentropy": 2.510878562927246, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15774516761302948, "step": 21940 }, { "epoch": 0.6856875, "grad_norm": 2.796875, "grad_norm_var": 0.036351521809895836, "learning_rate": 0.0001, "loss": 5.285, "loss/crossentropy": 2.3741862773895264, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1473342627286911, "step": 21942 }, { "epoch": 0.68575, "grad_norm": 2.984375, "grad_norm_var": 0.03428446451822917, "learning_rate": 0.0001, "loss": 5.5747, "loss/crossentropy": 2.489713668823242, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16475097090005875, "step": 21944 }, { "epoch": 0.6858125, "grad_norm": 2.875, "grad_norm_var": 0.03524983723958333, "learning_rate": 0.0001, "loss": 5.269, "loss/crossentropy": 2.2745388746261597, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15295787900686264, "step": 21946 }, { "epoch": 0.685875, "grad_norm": 3.015625, "grad_norm_var": 0.031168619791666668, "learning_rate": 0.0001, "loss": 6.0097, "loss/crossentropy": 2.8562405109405518, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16691070795059204, "step": 21948 }, { "epoch": 0.6859375, "grad_norm": 3.453125, "grad_norm_var": 0.04331766764322917, "learning_rate": 0.0001, "loss": 5.9235, "loss/crossentropy": 2.685052752494812, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17227753251791, "step": 21950 }, { "epoch": 0.686, "grad_norm": 3.578125, "grad_norm_var": 0.061812337239583334, "learning_rate": 0.0001, "loss": 5.5667, "loss/crossentropy": 2.4415215253829956, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16290438175201416, "step": 21952 }, { "epoch": 0.6860625, "grad_norm": 3.109375, "grad_norm_var": 0.0598052978515625, "learning_rate": 0.0001, "loss": 5.7266, "loss/crossentropy": 2.640066146850586, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16295120865106583, "step": 21954 }, { "epoch": 0.686125, "grad_norm": 3.09375, "grad_norm_var": 0.055436197916666666, "learning_rate": 0.0001, "loss": 5.4187, "loss/crossentropy": 2.4268020391464233, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1589563563466072, "step": 21956 }, { "epoch": 0.6861875, "grad_norm": 3.296875, "grad_norm_var": 0.032568359375, "learning_rate": 0.0001, "loss": 6.0263, "loss/crossentropy": 2.7968064546585083, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17802338302135468, "step": 21958 }, { "epoch": 0.68625, "grad_norm": 2.96875, "grad_norm_var": 0.030720011393229166, "learning_rate": 0.0001, "loss": 5.5455, "loss/crossentropy": 2.4713134765625, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16211090236902237, "step": 21960 }, { "epoch": 0.6863125, "grad_norm": 3.0, "grad_norm_var": 0.028678385416666667, "learning_rate": 0.0001, "loss": 5.3926, "loss/crossentropy": 2.370116949081421, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15810777992010117, "step": 21962 }, { "epoch": 0.686375, "grad_norm": 3.21875, "grad_norm_var": 0.027269490559895835, "learning_rate": 0.0001, "loss": 5.7012, "loss/crossentropy": 2.512209415435791, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17045696079730988, "step": 21964 }, { "epoch": 0.6864375, "grad_norm": 3.359375, "grad_norm_var": 0.024788411458333333, "learning_rate": 0.0001, "loss": 5.8152, "loss/crossentropy": 2.6030231714248657, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17472904175519943, "step": 21966 }, { "epoch": 0.6865, "grad_norm": 2.984375, "grad_norm_var": 0.0165924072265625, "learning_rate": 0.0001, "loss": 5.8785, "loss/crossentropy": 2.6595598459243774, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1715015023946762, "step": 21968 }, { "epoch": 0.6865625, "grad_norm": 3.09375, "grad_norm_var": 0.016813151041666665, "learning_rate": 0.0001, "loss": 6.0522, "loss/crossentropy": 2.818776845932007, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17529769986867905, "step": 21970 }, { "epoch": 0.686625, "grad_norm": 2.875, "grad_norm_var": 0.0226715087890625, "learning_rate": 0.0001, "loss": 6.0145, "loss/crossentropy": 2.9263845682144165, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16193270683288574, "step": 21972 }, { "epoch": 0.6866875, "grad_norm": 3.125, "grad_norm_var": 0.022607421875, "learning_rate": 0.0001, "loss": 5.4597, "loss/crossentropy": 2.4031002521514893, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1603483036160469, "step": 21974 }, { "epoch": 0.68675, "grad_norm": 3.1875, "grad_norm_var": 0.030858357747395832, "learning_rate": 0.0001, "loss": 5.9979, "loss/crossentropy": 2.7784128189086914, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1746804192662239, "step": 21976 }, { "epoch": 0.6868125, "grad_norm": 3.796875, "grad_norm_var": 0.052079264322916666, "learning_rate": 0.0001, "loss": 5.9265, "loss/crossentropy": 2.6443564891815186, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1786029413342476, "step": 21978 }, { "epoch": 0.686875, "grad_norm": 3.296875, "grad_norm_var": 0.05781148274739583, "learning_rate": 0.0001, "loss": 5.6427, "loss/crossentropy": 2.613027572631836, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1599966436624527, "step": 21980 }, { "epoch": 0.6869375, "grad_norm": 3.0, "grad_norm_var": 0.057062784830729164, "learning_rate": 0.0001, "loss": 5.5465, "loss/crossentropy": 2.536167621612549, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15767823159694672, "step": 21982 }, { "epoch": 0.687, "grad_norm": 3.046875, "grad_norm_var": 0.056868489583333334, "learning_rate": 0.0001, "loss": 5.7003, "loss/crossentropy": 2.591855049133301, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16318461298942566, "step": 21984 }, { "epoch": 0.6870625, "grad_norm": 2.96875, "grad_norm_var": 0.06162821451822917, "learning_rate": 0.0001, "loss": 5.8229, "loss/crossentropy": 2.7432347536087036, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16421236097812653, "step": 21986 }, { "epoch": 0.687125, "grad_norm": 3.109375, "grad_norm_var": 0.05715230305989583, "learning_rate": 0.0001, "loss": 5.7634, "loss/crossentropy": 2.633246064186096, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1673073247075081, "step": 21988 }, { "epoch": 0.6871875, "grad_norm": 3.015625, "grad_norm_var": 0.0553863525390625, "learning_rate": 0.0001, "loss": 5.6576, "loss/crossentropy": 2.5245636701583862, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1636907309293747, "step": 21990 }, { "epoch": 0.68725, "grad_norm": 3.21875, "grad_norm_var": 0.0464508056640625, "learning_rate": 0.0001, "loss": 5.8027, "loss/crossentropy": 2.5923744440078735, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17025303095579147, "step": 21992 }, { "epoch": 0.6873125, "grad_norm": 2.9375, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 5.6666, "loss/crossentropy": 2.5845742225646973, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1648474708199501, "step": 21994 }, { "epoch": 0.687375, "grad_norm": 3.09375, "grad_norm_var": 0.0107421875, "learning_rate": 0.0001, "loss": 5.5381, "loss/crossentropy": 2.4800325632095337, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16127480566501617, "step": 21996 }, { "epoch": 0.6874375, "grad_norm": 3.09375, "grad_norm_var": 0.013581339518229167, "learning_rate": 0.0001, "loss": 6.0703, "loss/crossentropy": 2.813502311706543, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17646300047636032, "step": 21998 }, { "epoch": 0.6875, "grad_norm": 3.1875, "grad_norm_var": 0.013263956705729166, "learning_rate": 0.0001, "loss": 5.8116, "loss/crossentropy": 2.689139246940613, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1669357866048813, "step": 22000 }, { "epoch": 0.6875625, "grad_norm": 3.625, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 5.6819, "loss/crossentropy": 2.4935706853866577, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16961680352687836, "step": 22002 }, { "epoch": 0.687625, "grad_norm": 3.0, "grad_norm_var": 0.0355621337890625, "learning_rate": 0.0001, "loss": 5.6342, "loss/crossentropy": 2.5379968881607056, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1600138247013092, "step": 22004 }, { "epoch": 0.6876875, "grad_norm": 3.015625, "grad_norm_var": 0.03567708333333333, "learning_rate": 0.0001, "loss": 5.5031, "loss/crossentropy": 2.484773635864258, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1549551784992218, "step": 22006 }, { "epoch": 0.68775, "grad_norm": 3.453125, "grad_norm_var": 0.0484283447265625, "learning_rate": 0.0001, "loss": 5.7447, "loss/crossentropy": 2.6551584005355835, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1624705046415329, "step": 22008 }, { "epoch": 0.6878125, "grad_norm": 2.953125, "grad_norm_var": 0.053929646809895836, "learning_rate": 0.0001, "loss": 5.6424, "loss/crossentropy": 2.5648895502090454, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15970075130462646, "step": 22010 }, { "epoch": 0.687875, "grad_norm": 3.34375, "grad_norm_var": 0.05709228515625, "learning_rate": 0.0001, "loss": 5.6356, "loss/crossentropy": 2.454726457595825, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16730573028326035, "step": 22012 }, { "epoch": 0.6879375, "grad_norm": 3.21875, "grad_norm_var": 0.06226806640625, "learning_rate": 0.0001, "loss": 5.5746, "loss/crossentropy": 2.499474883079529, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16219748556613922, "step": 22014 }, { "epoch": 0.688, "grad_norm": 3.671875, "grad_norm_var": 0.087548828125, "learning_rate": 0.0001, "loss": 5.9268, "loss/crossentropy": 2.5879805088043213, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1831054538488388, "step": 22016 }, { "epoch": 0.6880625, "grad_norm": 3.0625, "grad_norm_var": 0.07528889973958333, "learning_rate": 0.0001, "loss": 5.6894, "loss/crossentropy": 2.5440523624420166, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16492797434329987, "step": 22018 }, { "epoch": 0.688125, "grad_norm": 3.21875, "grad_norm_var": 0.07002665201822916, "learning_rate": 0.0001, "loss": 5.6702, "loss/crossentropy": 2.543825387954712, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16459020972251892, "step": 22020 }, { "epoch": 0.6881875, "grad_norm": 3.328125, "grad_norm_var": 0.07581278483072916, "learning_rate": 0.0001, "loss": 5.8816, "loss/crossentropy": 2.706055998802185, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17107157409191132, "step": 22022 }, { "epoch": 0.68825, "grad_norm": 3.0625, "grad_norm_var": 0.0632720947265625, "learning_rate": 0.0001, "loss": 5.5121, "loss/crossentropy": 2.443795084953308, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1579991579055786, "step": 22024 }, { "epoch": 0.6883125, "grad_norm": 2.875, "grad_norm_var": 0.05939839680989583, "learning_rate": 0.0001, "loss": 5.6521, "loss/crossentropy": 2.6103020906448364, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15964560210704803, "step": 22026 }, { "epoch": 0.688375, "grad_norm": 3.203125, "grad_norm_var": 0.058333333333333334, "learning_rate": 0.0001, "loss": 5.8219, "loss/crossentropy": 2.6712719202041626, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16858209669589996, "step": 22028 }, { "epoch": 0.6884375, "grad_norm": 3.28125, "grad_norm_var": 0.06360677083333334, "learning_rate": 0.0001, "loss": 5.4321, "loss/crossentropy": 2.389652729034424, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16010120511054993, "step": 22030 }, { "epoch": 0.6885, "grad_norm": 3.34375, "grad_norm_var": 0.03693033854166667, "learning_rate": 0.0001, "loss": 5.829, "loss/crossentropy": 2.5845173597335815, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17991462349891663, "step": 22032 }, { "epoch": 0.6885625, "grad_norm": 3.46875, "grad_norm_var": 0.043257649739583334, "learning_rate": 0.0001, "loss": 5.6335, "loss/crossentropy": 2.524790644645691, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16321934014558792, "step": 22034 }, { "epoch": 0.688625, "grad_norm": 3.078125, "grad_norm_var": 0.040816243489583334, "learning_rate": 0.0001, "loss": 5.4435, "loss/crossentropy": 2.342835545539856, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1628004088997841, "step": 22036 }, { "epoch": 0.6886875, "grad_norm": 3.09375, "grad_norm_var": 0.033812459309895834, "learning_rate": 0.0001, "loss": 5.8164, "loss/crossentropy": 2.640567421913147, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16758081316947937, "step": 22038 }, { "epoch": 0.68875, "grad_norm": 3.4375, "grad_norm_var": 0.0361480712890625, "learning_rate": 0.0001, "loss": 5.5189, "loss/crossentropy": 2.3515186309814453, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16752216964960098, "step": 22040 }, { "epoch": 0.6888125, "grad_norm": 3.28125, "grad_norm_var": 0.027546183268229166, "learning_rate": 0.0001, "loss": 6.0336, "loss/crossentropy": 2.7967907190322876, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1756298840045929, "step": 22042 }, { "epoch": 0.688875, "grad_norm": 4.0, "grad_norm_var": 0.059691365559895834, "learning_rate": 0.0001, "loss": 5.8296, "loss/crossentropy": 2.5987207889556885, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.173872709274292, "step": 22044 }, { "epoch": 0.6889375, "grad_norm": 2.90625, "grad_norm_var": 0.06129150390625, "learning_rate": 0.0001, "loss": 5.7161, "loss/crossentropy": 2.584370255470276, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16551848500967026, "step": 22046 }, { "epoch": 0.689, "grad_norm": 2.703125, "grad_norm_var": 0.08826395670572916, "learning_rate": 0.0001, "loss": 5.3159, "loss/crossentropy": 2.424273729324341, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.15049034357070923, "step": 22048 }, { "epoch": 0.6890625, "grad_norm": 3.078125, "grad_norm_var": 0.08479715983072916, "learning_rate": 0.0001, "loss": 5.8019, "loss/crossentropy": 2.6658477783203125, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16555969417095184, "step": 22050 }, { "epoch": 0.689125, "grad_norm": 3.4375, "grad_norm_var": 0.08819986979166666, "learning_rate": 0.0001, "loss": 6.0253, "loss/crossentropy": 2.7549906969070435, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17742366343736649, "step": 22052 }, { "epoch": 0.6891875, "grad_norm": 3.1875, "grad_norm_var": 0.08970947265625, "learning_rate": 0.0001, "loss": 5.7482, "loss/crossentropy": 2.583523154258728, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16842039674520493, "step": 22054 }, { "epoch": 0.68925, "grad_norm": 3.0625, "grad_norm_var": 0.087890625, "learning_rate": 0.0001, "loss": 5.0966, "loss/crossentropy": 2.2068421840667725, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.14483093470335007, "step": 22056 }, { "epoch": 0.6893125, "grad_norm": 2.96875, "grad_norm_var": 0.09039306640625, "learning_rate": 0.0001, "loss": 5.3215, "loss/crossentropy": 2.3375245332717896, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15464308857917786, "step": 22058 }, { "epoch": 0.689375, "grad_norm": 2.921875, "grad_norm_var": 0.046044921875, "learning_rate": 0.0001, "loss": 5.5924, "loss/crossentropy": 2.555259585380554, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16191434860229492, "step": 22060 }, { "epoch": 0.6894375, "grad_norm": 2.96875, "grad_norm_var": 0.04265034993489583, "learning_rate": 0.0001, "loss": 5.5317, "loss/crossentropy": 2.476559638977051, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1602018103003502, "step": 22062 }, { "epoch": 0.6895, "grad_norm": 4.21875, "grad_norm_var": 0.10701395670572916, "learning_rate": 0.0001, "loss": 5.6194, "loss/crossentropy": 2.500504970550537, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16618820279836655, "step": 22064 }, { "epoch": 0.6895625, "grad_norm": 3.015625, "grad_norm_var": 0.10738016764322916, "learning_rate": 0.0001, "loss": 5.649, "loss/crossentropy": 2.625067114830017, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15903447568416595, "step": 22066 }, { "epoch": 0.689625, "grad_norm": 2.890625, "grad_norm_var": 0.10705973307291666, "learning_rate": 0.0001, "loss": 5.5305, "loss/crossentropy": 2.4101606607437134, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16672495007514954, "step": 22068 }, { "epoch": 0.6896875, "grad_norm": 3.265625, "grad_norm_var": 0.11030171712239584, "learning_rate": 0.0001, "loss": 5.6851, "loss/crossentropy": 2.5571197271347046, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1647544801235199, "step": 22070 }, { "epoch": 0.68975, "grad_norm": 3.046875, "grad_norm_var": 0.11188863118489584, "learning_rate": 0.0001, "loss": 5.5543, "loss/crossentropy": 2.49932861328125, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1586238443851471, "step": 22072 }, { "epoch": 0.6898125, "grad_norm": 3.203125, "grad_norm_var": 0.11961263020833333, "learning_rate": 0.0001, "loss": 5.9969, "loss/crossentropy": 2.7396615743637085, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1776818111538887, "step": 22074 }, { "epoch": 0.689875, "grad_norm": 3.34375, "grad_norm_var": 0.10357666015625, "learning_rate": 0.0001, "loss": 5.6957, "loss/crossentropy": 2.5299636125564575, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16696564108133316, "step": 22076 }, { "epoch": 0.6899375, "grad_norm": 3.0625, "grad_norm_var": 0.10089518229166666, "learning_rate": 0.0001, "loss": 5.5318, "loss/crossentropy": 2.5148743391036987, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15677031874656677, "step": 22078 }, { "epoch": 0.69, "grad_norm": 3.078125, "grad_norm_var": 0.05152587890625, "learning_rate": 0.0001, "loss": 5.2976, "loss/crossentropy": 2.294735014438629, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15458299219608307, "step": 22080 }, { "epoch": 0.6900625, "grad_norm": 3.46875, "grad_norm_var": 0.0514801025390625, "learning_rate": 0.0001, "loss": 5.8681, "loss/crossentropy": 2.6674221754074097, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1724105179309845, "step": 22082 }, { "epoch": 0.690125, "grad_norm": 3.734375, "grad_norm_var": 0.056315104166666664, "learning_rate": 0.0001, "loss": 5.8382, "loss/crossentropy": 2.5484025478363037, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1766359657049179, "step": 22084 }, { "epoch": 0.6901875, "grad_norm": 3.5, "grad_norm_var": 0.049462890625, "learning_rate": 0.0001, "loss": 5.5799, "loss/crossentropy": 2.4521220922470093, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16356316953897476, "step": 22086 }, { "epoch": 0.69025, "grad_norm": 3.1875, "grad_norm_var": 0.045654296875, "learning_rate": 0.0001, "loss": 5.8716, "loss/crossentropy": 2.670419692993164, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1724613457918167, "step": 22088 }, { "epoch": 0.6903125, "grad_norm": 3.140625, "grad_norm_var": 0.0429595947265625, "learning_rate": 0.0001, "loss": 5.7759, "loss/crossentropy": 2.6571903228759766, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1630413457751274, "step": 22090 }, { "epoch": 0.690375, "grad_norm": 3.140625, "grad_norm_var": 0.045775349934895834, "learning_rate": 0.0001, "loss": 5.6678, "loss/crossentropy": 2.6170836687088013, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1589749976992607, "step": 22092 }, { "epoch": 0.6904375, "grad_norm": 2.859375, "grad_norm_var": 0.05419514973958333, "learning_rate": 0.0001, "loss": 5.3578, "loss/crossentropy": 2.3640111684799194, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1532842516899109, "step": 22094 }, { "epoch": 0.6905, "grad_norm": 2.890625, "grad_norm_var": 0.04895833333333333, "learning_rate": 0.0001, "loss": 5.3275, "loss/crossentropy": 2.372657537460327, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14900419861078262, "step": 22096 }, { "epoch": 0.6905625, "grad_norm": 2.765625, "grad_norm_var": 0.0707916259765625, "learning_rate": 0.0001, "loss": 5.4045, "loss/crossentropy": 2.5098663568496704, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.1507904753088951, "step": 22098 }, { "epoch": 0.690625, "grad_norm": 3.21875, "grad_norm_var": 0.0517730712890625, "learning_rate": 0.0001, "loss": 6.0669, "loss/crossentropy": 2.717814803123474, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1817857176065445, "step": 22100 }, { "epoch": 0.6906875, "grad_norm": 3.15625, "grad_norm_var": 0.0433013916015625, "learning_rate": 0.0001, "loss": 5.55, "loss/crossentropy": 2.4737772941589355, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16387274861335754, "step": 22102 }, { "epoch": 0.69075, "grad_norm": 3.203125, "grad_norm_var": 0.0423492431640625, "learning_rate": 0.0001, "loss": 5.7436, "loss/crossentropy": 2.639219284057617, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16239119321107864, "step": 22104 }, { "epoch": 0.6908125, "grad_norm": 3.390625, "grad_norm_var": 0.051595052083333336, "learning_rate": 0.0001, "loss": 5.8804, "loss/crossentropy": 2.7018351554870605, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16903229802846909, "step": 22106 }, { "epoch": 0.690875, "grad_norm": 3.046875, "grad_norm_var": 0.05776265462239583, "learning_rate": 0.0001, "loss": 5.1394, "loss/crossentropy": 2.168722927570343, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14941561222076416, "step": 22108 }, { "epoch": 0.6909375, "grad_norm": 2.9375, "grad_norm_var": 0.052783203125, "learning_rate": 0.0001, "loss": 5.4124, "loss/crossentropy": 2.3674614429473877, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15879163146018982, "step": 22110 }, { "epoch": 0.691, "grad_norm": 3.0, "grad_norm_var": 0.05085347493489583, "learning_rate": 0.0001, "loss": 5.9384, "loss/crossentropy": 2.75888991355896, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.169517882168293, "step": 22112 }, { "epoch": 0.6910625, "grad_norm": 3.078125, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 5.5238, "loss/crossentropy": 2.486830711364746, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15916302800178528, "step": 22114 }, { "epoch": 0.691125, "grad_norm": 3.171875, "grad_norm_var": 0.029938761393229166, "learning_rate": 0.0001, "loss": 5.5068, "loss/crossentropy": 2.4414623975753784, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16044388711452484, "step": 22116 }, { "epoch": 0.6911875, "grad_norm": 3.296875, "grad_norm_var": 0.03482666015625, "learning_rate": 0.0001, "loss": 5.4823, "loss/crossentropy": 2.430028796195984, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15874332934617996, "step": 22118 }, { "epoch": 0.69125, "grad_norm": 3.09375, "grad_norm_var": 0.0339752197265625, "learning_rate": 0.0001, "loss": 5.8687, "loss/crossentropy": 2.71201491355896, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16957851499319077, "step": 22120 }, { "epoch": 0.6913125, "grad_norm": 3.171875, "grad_norm_var": 0.022850545247395833, "learning_rate": 0.0001, "loss": 5.9176, "loss/crossentropy": 2.667091488838196, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.177008718252182, "step": 22122 }, { "epoch": 0.691375, "grad_norm": 3.640625, "grad_norm_var": 0.03797200520833333, "learning_rate": 0.0001, "loss": 5.9542, "loss/crossentropy": 2.7011945247650146, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1756882593035698, "step": 22124 }, { "epoch": 0.6914375, "grad_norm": 3.203125, "grad_norm_var": 0.041259765625, "learning_rate": 0.0001, "loss": 5.3941, "loss/crossentropy": 2.4094094038009644, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1578417792916298, "step": 22126 }, { "epoch": 0.6915, "grad_norm": 3.1875, "grad_norm_var": 0.04081929524739583, "learning_rate": 0.0001, "loss": 5.5588, "loss/crossentropy": 2.509129285812378, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15613751113414764, "step": 22128 }, { "epoch": 0.6915625, "grad_norm": 3.265625, "grad_norm_var": 0.04101155598958333, "learning_rate": 0.0001, "loss": 5.7464, "loss/crossentropy": 2.60485577583313, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16767163574695587, "step": 22130 }, { "epoch": 0.691625, "grad_norm": 3.203125, "grad_norm_var": 0.0484771728515625, "learning_rate": 0.0001, "loss": 5.4953, "loss/crossentropy": 2.4983582496643066, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15828609466552734, "step": 22132 }, { "epoch": 0.6916875, "grad_norm": 3.328125, "grad_norm_var": 0.044596354166666664, "learning_rate": 0.0001, "loss": 6.045, "loss/crossentropy": 2.789808750152588, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17512879520654678, "step": 22134 }, { "epoch": 0.69175, "grad_norm": 3.109375, "grad_norm_var": 0.04453125, "learning_rate": 0.0001, "loss": 5.9074, "loss/crossentropy": 2.6833715438842773, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16850131005048752, "step": 22136 }, { "epoch": 0.6918125, "grad_norm": 3.4375, "grad_norm_var": 0.04853108723958333, "learning_rate": 0.0001, "loss": 5.8736, "loss/crossentropy": 2.608944535255432, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17607013881206512, "step": 22138 }, { "epoch": 0.691875, "grad_norm": 3.078125, "grad_norm_var": 0.0348785400390625, "learning_rate": 0.0001, "loss": 5.8523, "loss/crossentropy": 2.6538909673690796, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.170235276222229, "step": 22140 }, { "epoch": 0.6919375, "grad_norm": 3.125, "grad_norm_var": 0.0294830322265625, "learning_rate": 0.0001, "loss": 5.8493, "loss/crossentropy": 2.710782289505005, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1681513860821724, "step": 22142 }, { "epoch": 0.692, "grad_norm": 3.515625, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 5.8845, "loss/crossentropy": 2.6459481716156006, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17424651235342026, "step": 22144 }, { "epoch": 0.6920625, "grad_norm": 3.328125, "grad_norm_var": 0.03941650390625, "learning_rate": 0.0001, "loss": 5.6237, "loss/crossentropy": 2.524052619934082, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1646539568901062, "step": 22146 }, { "epoch": 0.692125, "grad_norm": 3.296875, "grad_norm_var": 0.025, "learning_rate": 0.0001, "loss": 5.6005, "loss/crossentropy": 2.4987930059432983, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16329386830329895, "step": 22148 }, { "epoch": 0.6921875, "grad_norm": 3.75, "grad_norm_var": 0.0445709228515625, "learning_rate": 0.0001, "loss": 6.3365, "loss/crossentropy": 2.899762511253357, "loss/hidden": 1.58203125, "loss/jsd": 0.0, "loss/logits": 0.1854744702577591, "step": 22150 }, { "epoch": 0.69225, "grad_norm": 3.53125, "grad_norm_var": 0.0510406494140625, "learning_rate": 0.0001, "loss": 5.7907, "loss/crossentropy": 2.5561044216156006, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1765880137681961, "step": 22152 }, { "epoch": 0.6923125, "grad_norm": 3.03125, "grad_norm_var": 0.051366170247395836, "learning_rate": 0.0001, "loss": 5.5862, "loss/crossentropy": 2.501978039741516, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15959802269935608, "step": 22154 }, { "epoch": 0.692375, "grad_norm": 2.921875, "grad_norm_var": 0.058919270833333336, "learning_rate": 0.0001, "loss": 5.5544, "loss/crossentropy": 2.509579300880432, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15916655957698822, "step": 22156 }, { "epoch": 0.6924375, "grad_norm": 3.671875, "grad_norm_var": 0.066552734375, "learning_rate": 0.0001, "loss": 5.6933, "loss/crossentropy": 2.5143656730651855, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1721944585442543, "step": 22158 }, { "epoch": 0.6925, "grad_norm": 2.875, "grad_norm_var": 0.07075093587239584, "learning_rate": 0.0001, "loss": 5.6441, "loss/crossentropy": 2.6157257556915283, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16064903140068054, "step": 22160 }, { "epoch": 0.6925625, "grad_norm": 3.265625, "grad_norm_var": 0.07592671712239583, "learning_rate": 0.0001, "loss": 5.712, "loss/crossentropy": 2.6100316047668457, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16722562909126282, "step": 22162 }, { "epoch": 0.692625, "grad_norm": 2.96875, "grad_norm_var": 0.07788798014322916, "learning_rate": 0.0001, "loss": 5.708, "loss/crossentropy": 2.6058512926101685, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16607257723808289, "step": 22164 }, { "epoch": 0.6926875, "grad_norm": 3.1875, "grad_norm_var": 0.05210673014322917, "learning_rate": 0.0001, "loss": 5.8378, "loss/crossentropy": 2.6714993715286255, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16858133673667908, "step": 22166 }, { "epoch": 0.69275, "grad_norm": 3.1875, "grad_norm_var": 0.040283203125, "learning_rate": 0.0001, "loss": 5.7096, "loss/crossentropy": 2.5298666954040527, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16875465214252472, "step": 22168 }, { "epoch": 0.6928125, "grad_norm": 3.0625, "grad_norm_var": 0.04036356608072917, "learning_rate": 0.0001, "loss": 5.5792, "loss/crossentropy": 2.5069416761398315, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16112709790468216, "step": 22170 }, { "epoch": 0.692875, "grad_norm": 3.15625, "grad_norm_var": 0.03577067057291667, "learning_rate": 0.0001, "loss": 5.5173, "loss/crossentropy": 2.427741289138794, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16403573751449585, "step": 22172 }, { "epoch": 0.6929375, "grad_norm": 2.9375, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 5.5594, "loss/crossentropy": 2.544328212738037, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15697290748357773, "step": 22174 }, { "epoch": 0.693, "grad_norm": 3.15625, "grad_norm_var": 0.025472005208333332, "learning_rate": 0.0001, "loss": 5.3211, "loss/crossentropy": 2.3427733778953552, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15017352998256683, "step": 22176 }, { "epoch": 0.6930625, "grad_norm": 3.25, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 5.8383, "loss/crossentropy": 2.668258547782898, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16817117482423782, "step": 22178 }, { "epoch": 0.693125, "grad_norm": 3.21875, "grad_norm_var": 0.026416015625, "learning_rate": 0.0001, "loss": 6.0641, "loss/crossentropy": 2.8303595781326294, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17259375751018524, "step": 22180 }, { "epoch": 0.6931875, "grad_norm": 3.234375, "grad_norm_var": 0.027372233072916665, "learning_rate": 0.0001, "loss": 5.6434, "loss/crossentropy": 2.4964150190353394, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1670391485095024, "step": 22182 }, { "epoch": 0.69325, "grad_norm": 2.96875, "grad_norm_var": 0.03242899576822917, "learning_rate": 0.0001, "loss": 6.0384, "loss/crossentropy": 2.809204339981079, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17331138253211975, "step": 22184 }, { "epoch": 0.6933125, "grad_norm": 2.9375, "grad_norm_var": 0.03474833170572917, "learning_rate": 0.0001, "loss": 5.2075, "loss/crossentropy": 2.2899743914604187, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1503501981496811, "step": 22186 }, { "epoch": 0.693375, "grad_norm": 3.109375, "grad_norm_var": 0.0342681884765625, "learning_rate": 0.0001, "loss": 5.4312, "loss/crossentropy": 2.44270658493042, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15627416968345642, "step": 22188 }, { "epoch": 0.6934375, "grad_norm": 2.84375, "grad_norm_var": 0.0326171875, "learning_rate": 0.0001, "loss": 5.389, "loss/crossentropy": 2.4759762287139893, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.15184426307678223, "step": 22190 }, { "epoch": 0.6935, "grad_norm": 3.140625, "grad_norm_var": 0.022777303059895834, "learning_rate": 0.0001, "loss": 5.5177, "loss/crossentropy": 2.4368534088134766, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16121289879083633, "step": 22192 }, { "epoch": 0.6935625, "grad_norm": 2.953125, "grad_norm_var": 0.0743072509765625, "learning_rate": 0.0001, "loss": 5.3237, "loss/crossentropy": 2.30250883102417, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15368013083934784, "step": 22194 }, { "epoch": 0.693625, "grad_norm": 3.234375, "grad_norm_var": 0.07288411458333334, "learning_rate": 0.0001, "loss": 5.8712, "loss/crossentropy": 2.6539297103881836, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1736777201294899, "step": 22196 }, { "epoch": 0.6936875, "grad_norm": 3.0625, "grad_norm_var": 0.07444559733072917, "learning_rate": 0.0001, "loss": 5.6884, "loss/crossentropy": 2.590583086013794, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16173090040683746, "step": 22198 }, { "epoch": 0.69375, "grad_norm": 3.21875, "grad_norm_var": 0.06975809733072917, "learning_rate": 0.0001, "loss": 5.7269, "loss/crossentropy": 2.5890592336654663, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1669059693813324, "step": 22200 }, { "epoch": 0.6938125, "grad_norm": 3.390625, "grad_norm_var": 0.0677154541015625, "learning_rate": 0.0001, "loss": 5.5632, "loss/crossentropy": 2.403108596801758, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17108941823244095, "step": 22202 }, { "epoch": 0.693875, "grad_norm": 3.28125, "grad_norm_var": 0.0819976806640625, "learning_rate": 0.0001, "loss": 5.7487, "loss/crossentropy": 2.54872989654541, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16961067914962769, "step": 22204 }, { "epoch": 0.6939375, "grad_norm": 2.859375, "grad_norm_var": 0.08206380208333333, "learning_rate": 0.0001, "loss": 5.6329, "loss/crossentropy": 2.557260513305664, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1626439094543457, "step": 22206 }, { "epoch": 0.694, "grad_norm": 3.1875, "grad_norm_var": 0.09238179524739583, "learning_rate": 0.0001, "loss": 5.9052, "loss/crossentropy": 2.705229640007019, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1707761213183403, "step": 22208 }, { "epoch": 0.6940625, "grad_norm": 3.046875, "grad_norm_var": 0.0510650634765625, "learning_rate": 0.0001, "loss": 5.6836, "loss/crossentropy": 2.6229788064956665, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16465359181165695, "step": 22210 }, { "epoch": 0.694125, "grad_norm": 3.28125, "grad_norm_var": 0.052079264322916666, "learning_rate": 0.0001, "loss": 5.9075, "loss/crossentropy": 2.6422927379608154, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1757410168647766, "step": 22212 }, { "epoch": 0.6941875, "grad_norm": 3.78125, "grad_norm_var": 0.070703125, "learning_rate": 0.0001, "loss": 5.4121, "loss/crossentropy": 2.340365767478943, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1583406627178192, "step": 22214 }, { "epoch": 0.69425, "grad_norm": 2.953125, "grad_norm_var": 0.07559305826822917, "learning_rate": 0.0001, "loss": 5.659, "loss/crossentropy": 2.5273908376693726, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1635541394352913, "step": 22216 }, { "epoch": 0.6943125, "grad_norm": 3.34375, "grad_norm_var": 0.07333882649739583, "learning_rate": 0.0001, "loss": 5.8834, "loss/crossentropy": 2.6678963899612427, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17428787797689438, "step": 22218 }, { "epoch": 0.694375, "grad_norm": 3.015625, "grad_norm_var": 0.06564127604166667, "learning_rate": 0.0001, "loss": 5.7687, "loss/crossentropy": 2.62005078792572, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1683787852525711, "step": 22220 }, { "epoch": 0.6944375, "grad_norm": 3.265625, "grad_norm_var": 0.05554911295572917, "learning_rate": 0.0001, "loss": 5.733, "loss/crossentropy": 2.633288860321045, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1658271849155426, "step": 22222 }, { "epoch": 0.6945, "grad_norm": 3.171875, "grad_norm_var": 0.06530659993489583, "learning_rate": 0.0001, "loss": 4.906, "loss/crossentropy": 2.101522386074066, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1406061053276062, "step": 22224 }, { "epoch": 0.6945625, "grad_norm": 2.890625, "grad_norm_var": 0.06741536458333333, "learning_rate": 0.0001, "loss": 5.7833, "loss/crossentropy": 2.6757954359054565, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16700387001037598, "step": 22226 }, { "epoch": 0.694625, "grad_norm": 3.171875, "grad_norm_var": 0.07576395670572916, "learning_rate": 0.0001, "loss": 5.7715, "loss/crossentropy": 2.6445634365081787, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16426067054271698, "step": 22228 }, { "epoch": 0.6946875, "grad_norm": 3.34375, "grad_norm_var": 0.05032145182291667, "learning_rate": 0.0001, "loss": 5.9006, "loss/crossentropy": 2.7169244289398193, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1683690845966339, "step": 22230 }, { "epoch": 0.69475, "grad_norm": 2.984375, "grad_norm_var": 0.047272745768229166, "learning_rate": 0.0001, "loss": 5.7788, "loss/crossentropy": 2.602481245994568, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16997309029102325, "step": 22232 }, { "epoch": 0.6948125, "grad_norm": 3.296875, "grad_norm_var": 0.045003255208333336, "learning_rate": 0.0001, "loss": 5.9348, "loss/crossentropy": 2.7773529291152954, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16418687999248505, "step": 22234 }, { "epoch": 0.694875, "grad_norm": 3.03125, "grad_norm_var": 0.04049479166666667, "learning_rate": 0.0001, "loss": 5.7328, "loss/crossentropy": 2.6272052526474, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16798030585050583, "step": 22236 }, { "epoch": 0.6949375, "grad_norm": 3.078125, "grad_norm_var": 0.03769124348958333, "learning_rate": 0.0001, "loss": 5.8072, "loss/crossentropy": 2.632036566734314, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16985852271318436, "step": 22238 }, { "epoch": 0.695, "grad_norm": 3.1875, "grad_norm_var": 0.02760009765625, "learning_rate": 0.0001, "loss": 5.7408, "loss/crossentropy": 2.665483832359314, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16300511360168457, "step": 22240 }, { "epoch": 0.6950625, "grad_norm": 3.265625, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 5.6665, "loss/crossentropy": 2.5961122512817383, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15977414697408676, "step": 22242 }, { "epoch": 0.695125, "grad_norm": 3.203125, "grad_norm_var": 0.020238240559895832, "learning_rate": 0.0001, "loss": 5.5223, "loss/crossentropy": 2.467007637023926, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16255847364664078, "step": 22244 }, { "epoch": 0.6951875, "grad_norm": 3.15625, "grad_norm_var": 0.018257649739583333, "learning_rate": 0.0001, "loss": 5.5286, "loss/crossentropy": 2.427953839302063, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16397497057914734, "step": 22246 }, { "epoch": 0.69525, "grad_norm": 3.578125, "grad_norm_var": 0.03287353515625, "learning_rate": 0.0001, "loss": 5.6832, "loss/crossentropy": 2.5346686840057373, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.17071057856082916, "step": 22248 }, { "epoch": 0.6953125, "grad_norm": 2.8125, "grad_norm_var": 0.03791402180989583, "learning_rate": 0.0001, "loss": 5.6754, "loss/crossentropy": 2.630379319190979, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1607513353228569, "step": 22250 }, { "epoch": 0.695375, "grad_norm": 2.96875, "grad_norm_var": 0.0409088134765625, "learning_rate": 0.0001, "loss": 5.7436, "loss/crossentropy": 2.6902812719345093, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16002224385738373, "step": 22252 }, { "epoch": 0.6954375, "grad_norm": 3.0625, "grad_norm_var": 0.04243062337239583, "learning_rate": 0.0001, "loss": 5.5621, "loss/crossentropy": 2.513045907020569, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1595899686217308, "step": 22254 }, { "epoch": 0.6955, "grad_norm": 2.875, "grad_norm_var": 0.04206441243489583, "learning_rate": 0.0001, "loss": 5.7661, "loss/crossentropy": 2.732425332069397, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15765976905822754, "step": 22256 }, { "epoch": 0.6955625, "grad_norm": 3.109375, "grad_norm_var": 0.039713541666666664, "learning_rate": 0.0001, "loss": 5.5049, "loss/crossentropy": 2.5390454530715942, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1536208540201187, "step": 22258 }, { "epoch": 0.695625, "grad_norm": 2.828125, "grad_norm_var": 0.05579020182291667, "learning_rate": 0.0001, "loss": 5.8636, "loss/crossentropy": 2.620804190635681, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1742827668786049, "step": 22260 }, { "epoch": 0.6956875, "grad_norm": 2.96875, "grad_norm_var": 0.053278605143229164, "learning_rate": 0.0001, "loss": 5.7285, "loss/crossentropy": 2.6134908199310303, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16775041818618774, "step": 22262 }, { "epoch": 0.69575, "grad_norm": 3.265625, "grad_norm_var": 0.036767578125, "learning_rate": 0.0001, "loss": 5.5856, "loss/crossentropy": 2.5176788568496704, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1622568964958191, "step": 22264 }, { "epoch": 0.6958125, "grad_norm": 3.046875, "grad_norm_var": 0.03386128743489583, "learning_rate": 0.0001, "loss": 5.5092, "loss/crossentropy": 2.406510591506958, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1665196791291237, "step": 22266 }, { "epoch": 0.695875, "grad_norm": 3.078125, "grad_norm_var": 0.03135477701822917, "learning_rate": 0.0001, "loss": 5.7848, "loss/crossentropy": 2.6489561796188354, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16631732136011124, "step": 22268 }, { "epoch": 0.6959375, "grad_norm": 3.25, "grad_norm_var": 0.0601226806640625, "learning_rate": 0.0001, "loss": 5.8294, "loss/crossentropy": 2.568448781967163, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1729675531387329, "step": 22270 }, { "epoch": 0.696, "grad_norm": 2.953125, "grad_norm_var": 0.0565338134765625, "learning_rate": 0.0001, "loss": 5.6264, "loss/crossentropy": 2.542552947998047, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16502072662115097, "step": 22272 }, { "epoch": 0.6960625, "grad_norm": 3.046875, "grad_norm_var": 0.05479227701822917, "learning_rate": 0.0001, "loss": 5.6426, "loss/crossentropy": 2.531023144721985, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16467513889074326, "step": 22274 }, { "epoch": 0.696125, "grad_norm": 3.234375, "grad_norm_var": 0.0438873291015625, "learning_rate": 0.0001, "loss": 5.811, "loss/crossentropy": 2.658156991004944, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16763115674257278, "step": 22276 }, { "epoch": 0.6961875, "grad_norm": 2.953125, "grad_norm_var": 0.0424957275390625, "learning_rate": 0.0001, "loss": 5.6239, "loss/crossentropy": 2.5519994497299194, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16266239434480667, "step": 22278 }, { "epoch": 0.69625, "grad_norm": 2.984375, "grad_norm_var": 0.03970947265625, "learning_rate": 0.0001, "loss": 5.5622, "loss/crossentropy": 2.445394515991211, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16558968275785446, "step": 22280 }, { "epoch": 0.6963125, "grad_norm": 3.109375, "grad_norm_var": 0.039388020833333336, "learning_rate": 0.0001, "loss": 5.5865, "loss/crossentropy": 2.5204278230667114, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16129783540964127, "step": 22282 }, { "epoch": 0.696375, "grad_norm": 3.421875, "grad_norm_var": 0.04601236979166667, "learning_rate": 0.0001, "loss": 5.9431, "loss/crossentropy": 2.6833596229553223, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17479926347732544, "step": 22284 }, { "epoch": 0.6964375, "grad_norm": 3.046875, "grad_norm_var": 0.023270670572916666, "learning_rate": 0.0001, "loss": 5.8238, "loss/crossentropy": 2.56610107421875, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17459864169359207, "step": 22286 }, { "epoch": 0.6965, "grad_norm": 3.046875, "grad_norm_var": 0.022899373372395834, "learning_rate": 0.0001, "loss": 5.546, "loss/crossentropy": 2.441725015640259, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15964962542057037, "step": 22288 }, { "epoch": 0.6965625, "grad_norm": 2.984375, "grad_norm_var": 0.025972493489583335, "learning_rate": 0.0001, "loss": 5.5882, "loss/crossentropy": 2.5388636589050293, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15884168446063995, "step": 22290 }, { "epoch": 0.696625, "grad_norm": 3.125, "grad_norm_var": 0.021418253580729168, "learning_rate": 0.0001, "loss": 5.4641, "loss/crossentropy": 2.4065194129943848, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15927237272262573, "step": 22292 }, { "epoch": 0.6966875, "grad_norm": 3.140625, "grad_norm_var": 0.019254557291666665, "learning_rate": 0.0001, "loss": 5.7093, "loss/crossentropy": 2.559041976928711, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1673680618405342, "step": 22294 }, { "epoch": 0.69675, "grad_norm": 3.09375, "grad_norm_var": 0.018700154622395833, "learning_rate": 0.0001, "loss": 5.5445, "loss/crossentropy": 2.5248336791992188, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15626318752765656, "step": 22296 }, { "epoch": 0.6968125, "grad_norm": 3.046875, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 5.7617, "loss/crossentropy": 2.5865273475646973, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17025582492351532, "step": 22298 }, { "epoch": 0.696875, "grad_norm": 2.953125, "grad_norm_var": 0.0148834228515625, "learning_rate": 0.0001, "loss": 5.693, "loss/crossentropy": 2.6101256608963013, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1633646935224533, "step": 22300 }, { "epoch": 0.6969375, "grad_norm": 3.15625, "grad_norm_var": 0.01259765625, "learning_rate": 0.0001, "loss": 5.7081, "loss/crossentropy": 2.6090636253356934, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16380954533815384, "step": 22302 }, { "epoch": 0.697, "grad_norm": 4.28125, "grad_norm_var": 0.10497945149739583, "learning_rate": 0.0001, "loss": 5.4828, "loss/crossentropy": 2.45250141620636, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15810998529195786, "step": 22304 }, { "epoch": 0.6970625, "grad_norm": 3.359375, "grad_norm_var": 0.10263264973958333, "learning_rate": 0.0001, "loss": 6.1271, "loss/crossentropy": 2.794241786003113, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18016577512025833, "step": 22306 }, { "epoch": 0.697125, "grad_norm": 3.0625, "grad_norm_var": 0.10592447916666667, "learning_rate": 0.0001, "loss": 5.6457, "loss/crossentropy": 2.5611395835876465, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.165873683989048, "step": 22308 }, { "epoch": 0.6971875, "grad_norm": 3.375, "grad_norm_var": 0.11043192545572916, "learning_rate": 0.0001, "loss": 5.52, "loss/crossentropy": 2.4050599336624146, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1650119423866272, "step": 22310 }, { "epoch": 0.69725, "grad_norm": 3.109375, "grad_norm_var": 0.11057535807291667, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.6355658769607544, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16874302178621292, "step": 22312 }, { "epoch": 0.6973125, "grad_norm": 3.21875, "grad_norm_var": 0.1091461181640625, "learning_rate": 0.0001, "loss": 5.6542, "loss/crossentropy": 2.6047236919403076, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16197961568832397, "step": 22314 }, { "epoch": 0.697375, "grad_norm": 2.984375, "grad_norm_var": 0.10572509765625, "learning_rate": 0.0001, "loss": 5.908, "loss/crossentropy": 2.7273367643356323, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17080094665288925, "step": 22316 }, { "epoch": 0.6974375, "grad_norm": 3.296875, "grad_norm_var": 0.10445556640625, "learning_rate": 0.0001, "loss": 5.5467, "loss/crossentropy": 2.4545209407806396, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16430046409368515, "step": 22318 }, { "epoch": 0.6975, "grad_norm": 2.984375, "grad_norm_var": 0.0266998291015625, "learning_rate": 0.0001, "loss": 5.8954, "loss/crossentropy": 2.7398087978363037, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16790051758289337, "step": 22320 }, { "epoch": 0.6975625, "grad_norm": 2.984375, "grad_norm_var": 0.024507649739583335, "learning_rate": 0.0001, "loss": 5.4263, "loss/crossentropy": 2.391477584838867, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15817295014858246, "step": 22322 }, { "epoch": 0.697625, "grad_norm": 3.421875, "grad_norm_var": 0.028815714518229167, "learning_rate": 0.0001, "loss": 6.1122, "loss/crossentropy": 2.827970504760742, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17920669168233871, "step": 22324 }, { "epoch": 0.6976875, "grad_norm": 3.125, "grad_norm_var": 0.0246490478515625, "learning_rate": 0.0001, "loss": 5.5779, "loss/crossentropy": 2.504065990447998, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1644120216369629, "step": 22326 }, { "epoch": 0.69775, "grad_norm": 2.9375, "grad_norm_var": 0.027684529622395832, "learning_rate": 0.0001, "loss": 5.3827, "loss/crossentropy": 2.36915385723114, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15369880199432373, "step": 22328 }, { "epoch": 0.6978125, "grad_norm": 3.296875, "grad_norm_var": 0.029866536458333332, "learning_rate": 0.0001, "loss": 5.7214, "loss/crossentropy": 2.5773812532424927, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16830333322286606, "step": 22330 }, { "epoch": 0.697875, "grad_norm": 3.25, "grad_norm_var": 0.028413899739583335, "learning_rate": 0.0001, "loss": 5.3243, "loss/crossentropy": 2.2541534900665283, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1581818237900734, "step": 22332 }, { "epoch": 0.6979375, "grad_norm": 2.9375, "grad_norm_var": 0.0273590087890625, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.73256778717041, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17023073136806488, "step": 22334 }, { "epoch": 0.698, "grad_norm": 2.8125, "grad_norm_var": 0.0284332275390625, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.5840107202529907, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15582574903964996, "step": 22336 }, { "epoch": 0.6980625, "grad_norm": 2.984375, "grad_norm_var": 0.0275543212890625, "learning_rate": 0.0001, "loss": 5.7119, "loss/crossentropy": 2.5801466703414917, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16668801009655, "step": 22338 }, { "epoch": 0.698125, "grad_norm": 2.984375, "grad_norm_var": 0.025853474934895832, "learning_rate": 0.0001, "loss": 5.2978, "loss/crossentropy": 2.357309341430664, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1526389792561531, "step": 22340 }, { "epoch": 0.6981875, "grad_norm": 3.1875, "grad_norm_var": 0.025169881184895833, "learning_rate": 0.0001, "loss": 5.788, "loss/crossentropy": 2.6856919527053833, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16569886356592178, "step": 22342 }, { "epoch": 0.69825, "grad_norm": 3.015625, "grad_norm_var": 0.030159505208333333, "learning_rate": 0.0001, "loss": 5.8538, "loss/crossentropy": 2.76581346988678, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16075499355793, "step": 22344 }, { "epoch": 0.6983125, "grad_norm": 3.34375, "grad_norm_var": 0.031538899739583334, "learning_rate": 0.0001, "loss": 5.5739, "loss/crossentropy": 2.4678162336349487, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16412169486284256, "step": 22346 }, { "epoch": 0.698375, "grad_norm": 3.171875, "grad_norm_var": 0.029963175455729168, "learning_rate": 0.0001, "loss": 5.3019, "loss/crossentropy": 2.34357488155365, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1497362107038498, "step": 22348 }, { "epoch": 0.6984375, "grad_norm": 7.46875, "grad_norm_var": 1.2452056884765625, "learning_rate": 0.0001, "loss": 5.8482, "loss/crossentropy": 2.4086620807647705, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.19160857051610947, "step": 22350 }, { "epoch": 0.6985, "grad_norm": 3.46875, "grad_norm_var": 1.22056884765625, "learning_rate": 0.0001, "loss": 6.0269, "loss/crossentropy": 2.7019847631454468, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.18015021085739136, "step": 22352 }, { "epoch": 0.6985625, "grad_norm": 3.234375, "grad_norm_var": 1.2114898681640625, "learning_rate": 0.0001, "loss": 5.9025, "loss/crossentropy": 2.6564154624938965, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17577841877937317, "step": 22354 }, { "epoch": 0.698625, "grad_norm": 3.15625, "grad_norm_var": 1.1758097330729167, "learning_rate": 0.0001, "loss": 5.9447, "loss/crossentropy": 2.748743772506714, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16881877928972244, "step": 22356 }, { "epoch": 0.6986875, "grad_norm": 3.09375, "grad_norm_var": 1.162555948893229, "learning_rate": 0.0001, "loss": 5.8294, "loss/crossentropy": 2.6740111112594604, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16866113990545273, "step": 22358 }, { "epoch": 0.69875, "grad_norm": 2.859375, "grad_norm_var": 1.172874959309896, "learning_rate": 0.0001, "loss": 5.6562, "loss/crossentropy": 2.5262296199798584, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.164167582988739, "step": 22360 }, { "epoch": 0.6988125, "grad_norm": 2.859375, "grad_norm_var": 1.195563761393229, "learning_rate": 0.0001, "loss": 5.4895, "loss/crossentropy": 2.496787905693054, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15590853989124298, "step": 22362 }, { "epoch": 0.698875, "grad_norm": 3.125, "grad_norm_var": 1.1972941080729167, "learning_rate": 0.0001, "loss": 5.6964, "loss/crossentropy": 2.6194320917129517, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16394924372434616, "step": 22364 }, { "epoch": 0.6989375, "grad_norm": 3.125, "grad_norm_var": 0.032124837239583336, "learning_rate": 0.0001, "loss": 5.9581, "loss/crossentropy": 2.741639733314514, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17593914270401, "step": 22366 }, { "epoch": 0.699, "grad_norm": 3.109375, "grad_norm_var": 0.22603759765625, "learning_rate": 0.0001, "loss": 5.4965, "loss/crossentropy": 2.397615075111389, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15715742111206055, "step": 22368 }, { "epoch": 0.6990625, "grad_norm": 3.25, "grad_norm_var": 0.2276763916015625, "learning_rate": 0.0001, "loss": 5.6063, "loss/crossentropy": 2.5021533966064453, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16432107985019684, "step": 22370 }, { "epoch": 0.699125, "grad_norm": 2.890625, "grad_norm_var": 0.2720855712890625, "learning_rate": 0.0001, "loss": 5.752, "loss/crossentropy": 2.6542662382125854, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16407115757465363, "step": 22372 }, { "epoch": 0.6991875, "grad_norm": 2.984375, "grad_norm_var": 0.27852274576822916, "learning_rate": 0.0001, "loss": 5.629, "loss/crossentropy": 2.462101697921753, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1702076569199562, "step": 22374 }, { "epoch": 0.69925, "grad_norm": 3.0625, "grad_norm_var": 0.269140625, "learning_rate": 0.0001, "loss": 5.3237, "loss/crossentropy": 2.40639591217041, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15188443660736084, "step": 22376 }, { "epoch": 0.6993125, "grad_norm": 3.234375, "grad_norm_var": 0.2586090087890625, "learning_rate": 0.0001, "loss": 5.6065, "loss/crossentropy": 2.5098297595977783, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1616167202591896, "step": 22378 }, { "epoch": 0.699375, "grad_norm": 3.15625, "grad_norm_var": 0.2508941650390625, "learning_rate": 0.0001, "loss": 5.6762, "loss/crossentropy": 2.5527048110961914, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1674271747469902, "step": 22380 }, { "epoch": 0.6994375, "grad_norm": 3.265625, "grad_norm_var": 0.2538726806640625, "learning_rate": 0.0001, "loss": 5.3082, "loss/crossentropy": 2.306580424308777, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1509408950805664, "step": 22382 }, { "epoch": 0.6995, "grad_norm": 3.421875, "grad_norm_var": 0.06692708333333333, "learning_rate": 0.0001, "loss": 5.734, "loss/crossentropy": 2.5101386308670044, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17277875542640686, "step": 22384 }, { "epoch": 0.6995625, "grad_norm": 3.015625, "grad_norm_var": 0.07023824055989583, "learning_rate": 0.0001, "loss": 5.6629, "loss/crossentropy": 2.5359352827072144, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.162301167845726, "step": 22386 }, { "epoch": 0.699625, "grad_norm": 3.015625, "grad_norm_var": 0.020873006184895834, "learning_rate": 0.0001, "loss": 5.9309, "loss/crossentropy": 2.7789628505706787, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1679278016090393, "step": 22388 }, { "epoch": 0.6996875, "grad_norm": 2.984375, "grad_norm_var": 0.024214680989583334, "learning_rate": 0.0001, "loss": 5.6155, "loss/crossentropy": 2.5941267013549805, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15995129197835922, "step": 22390 }, { "epoch": 0.69975, "grad_norm": 3.265625, "grad_norm_var": 0.0268707275390625, "learning_rate": 0.0001, "loss": 5.3302, "loss/crossentropy": 2.297610640525818, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15325400233268738, "step": 22392 }, { "epoch": 0.6998125, "grad_norm": 3.0625, "grad_norm_var": 0.026949055989583335, "learning_rate": 0.0001, "loss": 5.9984, "loss/crossentropy": 2.773570418357849, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17170123010873795, "step": 22394 }, { "epoch": 0.699875, "grad_norm": 3.125, "grad_norm_var": 0.029548136393229167, "learning_rate": 0.0001, "loss": 5.5626, "loss/crossentropy": 2.4833754301071167, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1633896306157112, "step": 22396 }, { "epoch": 0.6999375, "grad_norm": 9.625, "grad_norm_var": 2.6657623291015624, "learning_rate": 0.0001, "loss": 5.7952, "loss/crossentropy": 2.604393482208252, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1683036834001541, "step": 22398 }, { "epoch": 0.7, "grad_norm": 3.015625, "grad_norm_var": 2.6793365478515625, "learning_rate": 0.0001, "loss": 5.9193, "loss/crossentropy": 2.8233916759490967, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16271594911813736, "step": 22400 }, { "epoch": 0.7000625, "grad_norm": 3.109375, "grad_norm_var": 2.6681304931640626, "learning_rate": 0.0001, "loss": 5.5348, "loss/crossentropy": 2.4189138412475586, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1678370013833046, "step": 22402 }, { "epoch": 0.700125, "grad_norm": 3.09375, "grad_norm_var": 2.6681304931640626, "learning_rate": 0.0001, "loss": 5.5848, "loss/crossentropy": 2.53902804851532, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16239406913518906, "step": 22404 }, { "epoch": 0.7001875, "grad_norm": 3.109375, "grad_norm_var": 2.642625935872396, "learning_rate": 0.0001, "loss": 5.6684, "loss/crossentropy": 2.503175377845764, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16886447370052338, "step": 22406 }, { "epoch": 0.70025, "grad_norm": 2.875, "grad_norm_var": 2.680101521809896, "learning_rate": 0.0001, "loss": 5.3362, "loss/crossentropy": 2.391018033027649, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.14802981913089752, "step": 22408 }, { "epoch": 0.7003125, "grad_norm": 3.296875, "grad_norm_var": 2.6794230143229165, "learning_rate": 0.0001, "loss": 5.629, "loss/crossentropy": 2.5314453840255737, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1624908298254013, "step": 22410 }, { "epoch": 0.700375, "grad_norm": 2.9375, "grad_norm_var": 2.6909464518229167, "learning_rate": 0.0001, "loss": 5.474, "loss/crossentropy": 2.4601889848709106, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1595887541770935, "step": 22412 }, { "epoch": 0.7004375, "grad_norm": 3.09375, "grad_norm_var": 0.010993448893229167, "learning_rate": 0.0001, "loss": 5.5297, "loss/crossentropy": 2.3893154859542847, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1683400347828865, "step": 22414 }, { "epoch": 0.7005, "grad_norm": 3.0625, "grad_norm_var": 0.014387003580729167, "learning_rate": 0.0001, "loss": 5.9771, "loss/crossentropy": 2.7257237434387207, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17396235466003418, "step": 22416 }, { "epoch": 0.7005625, "grad_norm": 3.0, "grad_norm_var": 0.015034993489583334, "learning_rate": 0.0001, "loss": 5.627, "loss/crossentropy": 2.5427207946777344, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16233046352863312, "step": 22418 }, { "epoch": 0.700625, "grad_norm": 3.1875, "grad_norm_var": 0.015623982747395833, "learning_rate": 0.0001, "loss": 5.7423, "loss/crossentropy": 2.5589810609817505, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16637687385082245, "step": 22420 }, { "epoch": 0.7006875, "grad_norm": 2.96875, "grad_norm_var": 0.017073567708333334, "learning_rate": 0.0001, "loss": 5.4075, "loss/crossentropy": 2.446865677833557, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15505224466323853, "step": 22422 }, { "epoch": 0.70075, "grad_norm": 2.9375, "grad_norm_var": 0.017039998372395834, "learning_rate": 0.0001, "loss": 5.6526, "loss/crossentropy": 2.5972577333450317, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15748517215251923, "step": 22424 }, { "epoch": 0.7008125, "grad_norm": 3.0625, "grad_norm_var": 0.013179524739583334, "learning_rate": 0.0001, "loss": 5.575, "loss/crossentropy": 2.4511340856552124, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16356197744607925, "step": 22426 }, { "epoch": 0.700875, "grad_norm": 2.984375, "grad_norm_var": 0.015501912434895833, "learning_rate": 0.0001, "loss": 5.5413, "loss/crossentropy": 2.511778950691223, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15959448367357254, "step": 22428 }, { "epoch": 0.7009375, "grad_norm": 3.03125, "grad_norm_var": 0.015608723958333333, "learning_rate": 0.0001, "loss": 5.8105, "loss/crossentropy": 2.707659602165222, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1669275015592575, "step": 22430 }, { "epoch": 0.701, "grad_norm": 3.65625, "grad_norm_var": 0.03448893229166667, "learning_rate": 0.0001, "loss": 5.8825, "loss/crossentropy": 2.6640822887420654, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17496730387210846, "step": 22432 }, { "epoch": 0.7010625, "grad_norm": 3.234375, "grad_norm_var": 0.06253255208333333, "learning_rate": 0.0001, "loss": 5.8073, "loss/crossentropy": 2.696197271347046, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1650131791830063, "step": 22434 }, { "epoch": 0.701125, "grad_norm": 3.234375, "grad_norm_var": 0.06267903645833334, "learning_rate": 0.0001, "loss": 5.815, "loss/crossentropy": 2.612404942512512, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17299189418554306, "step": 22436 }, { "epoch": 0.7011875, "grad_norm": 2.984375, "grad_norm_var": 0.06051025390625, "learning_rate": 0.0001, "loss": 5.5862, "loss/crossentropy": 2.5015735626220703, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16470873355865479, "step": 22438 }, { "epoch": 0.70125, "grad_norm": 3.0, "grad_norm_var": 0.058121744791666666, "learning_rate": 0.0001, "loss": 5.6935, "loss/crossentropy": 2.5801044702529907, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16407185792922974, "step": 22440 }, { "epoch": 0.7013125, "grad_norm": 3.078125, "grad_norm_var": 0.07774149576822917, "learning_rate": 0.0001, "loss": 5.9704, "loss/crossentropy": 2.6817033290863037, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1777021884918213, "step": 22442 }, { "epoch": 0.701375, "grad_norm": 3.5, "grad_norm_var": 0.06877848307291666, "learning_rate": 0.0001, "loss": 5.5508, "loss/crossentropy": 2.5040571689605713, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16014103591442108, "step": 22444 }, { "epoch": 0.7014375, "grad_norm": 3.234375, "grad_norm_var": 0.06417643229166667, "learning_rate": 0.0001, "loss": 5.7439, "loss/crossentropy": 2.5734212398529053, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16899675130844116, "step": 22446 }, { "epoch": 0.7015, "grad_norm": 2.859375, "grad_norm_var": 0.06323140462239583, "learning_rate": 0.0001, "loss": 5.7392, "loss/crossentropy": 2.6356717348098755, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16348008811473846, "step": 22448 }, { "epoch": 0.7015625, "grad_norm": 3.21875, "grad_norm_var": 0.044189453125, "learning_rate": 0.0001, "loss": 5.6081, "loss/crossentropy": 2.4915573596954346, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16478148102760315, "step": 22450 }, { "epoch": 0.701625, "grad_norm": 3.046875, "grad_norm_var": 0.0458648681640625, "learning_rate": 0.0001, "loss": 5.5288, "loss/crossentropy": 2.4801501035690308, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15916209667921066, "step": 22452 }, { "epoch": 0.7016875, "grad_norm": 2.890625, "grad_norm_var": 0.0484375, "learning_rate": 0.0001, "loss": 5.6895, "loss/crossentropy": 2.573846220970154, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16312579065561295, "step": 22454 }, { "epoch": 0.70175, "grad_norm": 2.84375, "grad_norm_var": 0.05082906087239583, "learning_rate": 0.0001, "loss": 5.5499, "loss/crossentropy": 2.513450026512146, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16263306885957718, "step": 22456 }, { "epoch": 0.7018125, "grad_norm": 3.046875, "grad_norm_var": 0.0286529541015625, "learning_rate": 0.0001, "loss": 5.8345, "loss/crossentropy": 2.6493040323257446, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17281652241945267, "step": 22458 }, { "epoch": 0.701875, "grad_norm": 3.09375, "grad_norm_var": 0.019383748372395832, "learning_rate": 0.0001, "loss": 5.9764, "loss/crossentropy": 2.7259128093719482, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17544399201869965, "step": 22460 }, { "epoch": 0.7019375, "grad_norm": 3.03125, "grad_norm_var": 0.019156901041666667, "learning_rate": 0.0001, "loss": 5.91, "loss/crossentropy": 2.703311800956726, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17496934533119202, "step": 22462 }, { "epoch": 0.702, "grad_norm": 3.09375, "grad_norm_var": 0.023844401041666668, "learning_rate": 0.0001, "loss": 5.6215, "loss/crossentropy": 2.57110595703125, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16128679364919662, "step": 22464 }, { "epoch": 0.7020625, "grad_norm": 3.40625, "grad_norm_var": 0.03418680826822917, "learning_rate": 0.0001, "loss": 5.9372, "loss/crossentropy": 2.742884635925293, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17216527462005615, "step": 22466 }, { "epoch": 0.702125, "grad_norm": 3.109375, "grad_norm_var": 0.035542805989583336, "learning_rate": 0.0001, "loss": 5.5319, "loss/crossentropy": 2.433596611022949, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16177868843078613, "step": 22468 }, { "epoch": 0.7021875, "grad_norm": 3.1875, "grad_norm_var": 0.0365875244140625, "learning_rate": 0.0001, "loss": 5.9063, "loss/crossentropy": 2.8023040294647217, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16352402418851852, "step": 22470 }, { "epoch": 0.70225, "grad_norm": 3.5, "grad_norm_var": 0.043782552083333336, "learning_rate": 0.0001, "loss": 5.9647, "loss/crossentropy": 2.6569935083389282, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17764723300933838, "step": 22472 }, { "epoch": 0.7023125, "grad_norm": 2.859375, "grad_norm_var": 0.0514801025390625, "learning_rate": 0.0001, "loss": 5.5043, "loss/crossentropy": 2.5186818838119507, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1555919125676155, "step": 22474 }, { "epoch": 0.702375, "grad_norm": 3.046875, "grad_norm_var": 0.04978739420572917, "learning_rate": 0.0001, "loss": 5.7322, "loss/crossentropy": 2.6452149152755737, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1633850336074829, "step": 22476 }, { "epoch": 0.7024375, "grad_norm": 3.25, "grad_norm_var": 0.048388671875, "learning_rate": 0.0001, "loss": 5.3693, "loss/crossentropy": 2.3515241146087646, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1549069508910179, "step": 22478 }, { "epoch": 0.7025, "grad_norm": 3.109375, "grad_norm_var": 0.05031636555989583, "learning_rate": 0.0001, "loss": 5.9579, "loss/crossentropy": 2.68467915058136, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17615192383527756, "step": 22480 }, { "epoch": 0.7025625, "grad_norm": 2.8125, "grad_norm_var": 0.0432769775390625, "learning_rate": 0.0001, "loss": 5.5088, "loss/crossentropy": 2.465117931365967, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15866827964782715, "step": 22482 }, { "epoch": 0.702625, "grad_norm": 3.296875, "grad_norm_var": 0.04519856770833333, "learning_rate": 0.0001, "loss": 5.8039, "loss/crossentropy": 2.592761993408203, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1722838431596756, "step": 22484 }, { "epoch": 0.7026875, "grad_norm": 2.875, "grad_norm_var": 0.058882649739583334, "learning_rate": 0.0001, "loss": 5.5698, "loss/crossentropy": 2.515580892562866, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1577618569135666, "step": 22486 }, { "epoch": 0.70275, "grad_norm": 3.34375, "grad_norm_var": 0.05354715983072917, "learning_rate": 0.0001, "loss": 5.9011, "loss/crossentropy": 2.8239874839782715, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1627890020608902, "step": 22488 }, { "epoch": 0.7028125, "grad_norm": 3.28125, "grad_norm_var": 0.046174112955729166, "learning_rate": 0.0001, "loss": 5.8753, "loss/crossentropy": 2.7165865898132324, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.166265070438385, "step": 22490 }, { "epoch": 0.702875, "grad_norm": 3.0, "grad_norm_var": 0.04716796875, "learning_rate": 0.0001, "loss": 5.6852, "loss/crossentropy": 2.6169623136520386, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16072610020637512, "step": 22492 }, { "epoch": 0.7029375, "grad_norm": 3.078125, "grad_norm_var": 0.06935933430989584, "learning_rate": 0.0001, "loss": 5.4155, "loss/crossentropy": 2.2609105706214905, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1642833724617958, "step": 22494 }, { "epoch": 0.703, "grad_norm": 3.25, "grad_norm_var": 0.07418619791666667, "learning_rate": 0.0001, "loss": 5.6373, "loss/crossentropy": 2.5342200994491577, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16460448503494263, "step": 22496 }, { "epoch": 0.7030625, "grad_norm": 2.84375, "grad_norm_var": 0.326904296875, "learning_rate": 0.0001, "loss": 5.7002, "loss/crossentropy": 2.575178384780884, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16445918381214142, "step": 22498 }, { "epoch": 0.703125, "grad_norm": 3.015625, "grad_norm_var": 0.3385894775390625, "learning_rate": 0.0001, "loss": 5.6958, "loss/crossentropy": 2.621804714202881, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16091208904981613, "step": 22500 }, { "epoch": 0.7031875, "grad_norm": 2.9375, "grad_norm_var": 0.34890950520833336, "learning_rate": 0.0001, "loss": 5.7022, "loss/crossentropy": 2.5340776443481445, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1691538318991661, "step": 22502 }, { "epoch": 0.70325, "grad_norm": 3.0, "grad_norm_var": 0.34834696451822916, "learning_rate": 0.0001, "loss": 5.7394, "loss/crossentropy": 2.6451767683029175, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16450387239456177, "step": 22504 }, { "epoch": 0.7033125, "grad_norm": 3.34375, "grad_norm_var": 0.3486328125, "learning_rate": 0.0001, "loss": 5.6377, "loss/crossentropy": 2.47772753238678, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16834013909101486, "step": 22506 }, { "epoch": 0.703375, "grad_norm": 2.859375, "grad_norm_var": 0.36357320149739586, "learning_rate": 0.0001, "loss": 5.5959, "loss/crossentropy": 2.5505971908569336, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15648002177476883, "step": 22508 }, { "epoch": 0.7034375, "grad_norm": 3.796875, "grad_norm_var": 0.38450520833333335, "learning_rate": 0.0001, "loss": 5.9153, "loss/crossentropy": 2.672336220741272, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17273079603910446, "step": 22510 }, { "epoch": 0.7035, "grad_norm": 3.0, "grad_norm_var": 0.367822265625, "learning_rate": 0.0001, "loss": 5.4554, "loss/crossentropy": 2.4188915491104126, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15833529829978943, "step": 22512 }, { "epoch": 0.7035625, "grad_norm": 9.0625, "grad_norm_var": 2.200902303059896, "learning_rate": 0.0001, "loss": 6.1889, "loss/crossentropy": 2.603859305381775, "loss/hidden": 1.57421875, "loss/jsd": 0.0, "loss/logits": 0.20108668506145477, "step": 22514 }, { "epoch": 0.703625, "grad_norm": 3.515625, "grad_norm_var": 2.1885080973307294, "learning_rate": 0.0001, "loss": 5.8481, "loss/crossentropy": 2.6655837297439575, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16942200809717178, "step": 22516 }, { "epoch": 0.7036875, "grad_norm": 3.078125, "grad_norm_var": 2.178076171875, "learning_rate": 0.0001, "loss": 5.9334, "loss/crossentropy": 2.6331652402877808, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1808079555630684, "step": 22518 }, { "epoch": 0.70375, "grad_norm": 3.078125, "grad_norm_var": 2.179638671875, "learning_rate": 0.0001, "loss": 5.8864, "loss/crossentropy": 2.683835744857788, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1722119152545929, "step": 22520 }, { "epoch": 0.7038125, "grad_norm": 3.296875, "grad_norm_var": 2.2008046468098956, "learning_rate": 0.0001, "loss": 5.5452, "loss/crossentropy": 2.4897459745407104, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16140858829021454, "step": 22522 }, { "epoch": 0.703875, "grad_norm": 3.203125, "grad_norm_var": 2.1633453369140625, "learning_rate": 0.0001, "loss": 5.5807, "loss/crossentropy": 2.4016828536987305, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17141630500555038, "step": 22524 }, { "epoch": 0.7039375, "grad_norm": 2.890625, "grad_norm_var": 2.191389973958333, "learning_rate": 0.0001, "loss": 5.4236, "loss/crossentropy": 2.412192225456238, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15543314814567566, "step": 22526 }, { "epoch": 0.704, "grad_norm": 3.234375, "grad_norm_var": 2.1904256184895834, "learning_rate": 0.0001, "loss": 5.7931, "loss/crossentropy": 2.5884499549865723, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1716323122382164, "step": 22528 }, { "epoch": 0.7040625, "grad_norm": 2.984375, "grad_norm_var": 0.0510162353515625, "learning_rate": 0.0001, "loss": 5.7378, "loss/crossentropy": 2.5632896423339844, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17174633592367172, "step": 22530 }, { "epoch": 0.704125, "grad_norm": 3.34375, "grad_norm_var": 0.048173014322916666, "learning_rate": 0.0001, "loss": 5.9122, "loss/crossentropy": 2.7450913190841675, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16865945607423782, "step": 22532 }, { "epoch": 0.7041875, "grad_norm": 3.5625, "grad_norm_var": 0.0542633056640625, "learning_rate": 0.0001, "loss": 5.4997, "loss/crossentropy": 2.4344513416290283, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1615985557436943, "step": 22534 }, { "epoch": 0.70425, "grad_norm": 3.125, "grad_norm_var": 0.05813395182291667, "learning_rate": 0.0001, "loss": 5.1764, "loss/crossentropy": 2.20544570684433, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.14748192578554153, "step": 22536 }, { "epoch": 0.7043125, "grad_norm": 3.5, "grad_norm_var": 0.062132771809895834, "learning_rate": 0.0001, "loss": 5.5872, "loss/crossentropy": 2.50904643535614, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15976479649543762, "step": 22538 }, { "epoch": 0.704375, "grad_norm": 3.125, "grad_norm_var": 0.06211649576822917, "learning_rate": 0.0001, "loss": 5.6799, "loss/crossentropy": 2.5684973001480103, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16817133128643036, "step": 22540 }, { "epoch": 0.7044375, "grad_norm": 3.5625, "grad_norm_var": 0.062108357747395836, "learning_rate": 0.0001, "loss": 5.94, "loss/crossentropy": 2.6243035793304443, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17883047461509705, "step": 22542 }, { "epoch": 0.7045, "grad_norm": 2.6875, "grad_norm_var": 0.07993876139322917, "learning_rate": 0.0001, "loss": 5.8652, "loss/crossentropy": 2.7938671112060547, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16025998443365097, "step": 22544 }, { "epoch": 0.7045625, "grad_norm": 3.078125, "grad_norm_var": 0.0614166259765625, "learning_rate": 0.0001, "loss": 5.5639, "loss/crossentropy": 2.563488721847534, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15668240934610367, "step": 22546 }, { "epoch": 0.704625, "grad_norm": 3.078125, "grad_norm_var": 0.05966796875, "learning_rate": 0.0001, "loss": 5.712, "loss/crossentropy": 2.61086368560791, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16246207803487778, "step": 22548 }, { "epoch": 0.7046875, "grad_norm": 3.203125, "grad_norm_var": 0.05096028645833333, "learning_rate": 0.0001, "loss": 5.9104, "loss/crossentropy": 2.6754400730133057, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17544656991958618, "step": 22550 }, { "epoch": 0.70475, "grad_norm": 3.15625, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 5.9104, "loss/crossentropy": 2.794792652130127, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1631246656179428, "step": 22552 }, { "epoch": 0.7048125, "grad_norm": 2.921875, "grad_norm_var": 0.040234375, "learning_rate": 0.0001, "loss": 5.755, "loss/crossentropy": 2.629794120788574, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16799188405275345, "step": 22554 }, { "epoch": 0.704875, "grad_norm": 2.953125, "grad_norm_var": 0.04254557291666667, "learning_rate": 0.0001, "loss": 5.7324, "loss/crossentropy": 2.6186139583587646, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16411759704351425, "step": 22556 }, { "epoch": 0.7049375, "grad_norm": 3.171875, "grad_norm_var": 0.030582682291666666, "learning_rate": 0.0001, "loss": 5.9094, "loss/crossentropy": 2.634056806564331, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.176364503800869, "step": 22558 }, { "epoch": 0.705, "grad_norm": 3.0, "grad_norm_var": 0.019136555989583335, "learning_rate": 0.0001, "loss": 5.8174, "loss/crossentropy": 2.7001943588256836, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.164451003074646, "step": 22560 }, { "epoch": 0.7050625, "grad_norm": 3.03125, "grad_norm_var": 0.03247782389322917, "learning_rate": 0.0001, "loss": 5.9079, "loss/crossentropy": 2.684187173843384, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17237317562103271, "step": 22562 }, { "epoch": 0.705125, "grad_norm": 4.03125, "grad_norm_var": 0.07877197265625, "learning_rate": 0.0001, "loss": 5.9664, "loss/crossentropy": 2.6163182258605957, "loss/hidden": 1.5859375, "loss/jsd": 0.0, "loss/logits": 0.1764097362756729, "step": 22564 }, { "epoch": 0.7051875, "grad_norm": 2.921875, "grad_norm_var": 0.08662007649739584, "learning_rate": 0.0001, "loss": 5.6746, "loss/crossentropy": 2.6956058740615845, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15337137877941132, "step": 22566 }, { "epoch": 0.70525, "grad_norm": 2.9375, "grad_norm_var": 0.09000244140625, "learning_rate": 0.0001, "loss": 5.9217, "loss/crossentropy": 2.71065890789032, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17501109093427658, "step": 22568 }, { "epoch": 0.7053125, "grad_norm": 3.046875, "grad_norm_var": 0.14767252604166667, "learning_rate": 0.0001, "loss": 5.6179, "loss/crossentropy": 2.4611226320266724, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16763439029455185, "step": 22570 }, { "epoch": 0.705375, "grad_norm": 2.96875, "grad_norm_var": 0.14949442545572916, "learning_rate": 0.0001, "loss": 5.7302, "loss/crossentropy": 2.598994493484497, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16390138864517212, "step": 22572 }, { "epoch": 0.7054375, "grad_norm": 3.046875, "grad_norm_var": 0.154541015625, "learning_rate": 0.0001, "loss": 5.1796, "loss/crossentropy": 2.1796997785568237, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.14882108569145203, "step": 22574 }, { "epoch": 0.7055, "grad_norm": 3.40625, "grad_norm_var": 0.15673421223958334, "learning_rate": 0.0001, "loss": 5.9949, "loss/crossentropy": 2.7754679918289185, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1719452291727066, "step": 22576 }, { "epoch": 0.7055625, "grad_norm": 2.84375, "grad_norm_var": 0.15115458170572918, "learning_rate": 0.0001, "loss": 5.7318, "loss/crossentropy": 2.6822497844696045, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1623767912387848, "step": 22578 }, { "epoch": 0.705625, "grad_norm": 3.046875, "grad_norm_var": 0.10041910807291667, "learning_rate": 0.0001, "loss": 5.6734, "loss/crossentropy": 2.65487802028656, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1580996885895729, "step": 22580 }, { "epoch": 0.7056875, "grad_norm": 3.046875, "grad_norm_var": 0.09967447916666666, "learning_rate": 0.0001, "loss": 5.6133, "loss/crossentropy": 2.5546118021011353, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1605612114071846, "step": 22582 }, { "epoch": 0.70575, "grad_norm": 3.75, "grad_norm_var": 0.1258209228515625, "learning_rate": 0.0001, "loss": 5.7549, "loss/crossentropy": 2.6321762800216675, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16734836995601654, "step": 22584 }, { "epoch": 0.7058125, "grad_norm": 3.015625, "grad_norm_var": 0.0508209228515625, "learning_rate": 0.0001, "loss": 5.6999, "loss/crossentropy": 2.607453227043152, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16119518876075745, "step": 22586 }, { "epoch": 0.705875, "grad_norm": 2.921875, "grad_norm_var": 0.05927327473958333, "learning_rate": 0.0001, "loss": 5.8221, "loss/crossentropy": 2.6366634368896484, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17049414664506912, "step": 22588 }, { "epoch": 0.7059375, "grad_norm": 2.859375, "grad_norm_var": 0.0621978759765625, "learning_rate": 0.0001, "loss": 5.625, "loss/crossentropy": 2.556036114692688, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16197659075260162, "step": 22590 }, { "epoch": 0.706, "grad_norm": 3.09375, "grad_norm_var": 0.05572916666666667, "learning_rate": 0.0001, "loss": 5.6676, "loss/crossentropy": 2.602640151977539, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16079703718423843, "step": 22592 }, { "epoch": 0.7060625, "grad_norm": 2.703125, "grad_norm_var": 0.06265360514322917, "learning_rate": 0.0001, "loss": 5.4918, "loss/crossentropy": 2.4886010885238647, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15618036687374115, "step": 22594 }, { "epoch": 0.706125, "grad_norm": 3.125, "grad_norm_var": 0.05956624348958333, "learning_rate": 0.0001, "loss": 6.0395, "loss/crossentropy": 2.781382203102112, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1773706078529358, "step": 22596 }, { "epoch": 0.7061875, "grad_norm": 2.921875, "grad_norm_var": 0.06730855305989583, "learning_rate": 0.0001, "loss": 5.4773, "loss/crossentropy": 2.5254284143447876, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15222245454788208, "step": 22598 }, { "epoch": 0.70625, "grad_norm": 2.96875, "grad_norm_var": 0.03313395182291667, "learning_rate": 0.0001, "loss": 5.5662, "loss/crossentropy": 2.493793249130249, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15880389511585236, "step": 22600 }, { "epoch": 0.7063125, "grad_norm": 3.0, "grad_norm_var": 0.034789021809895834, "learning_rate": 0.0001, "loss": 5.682, "loss/crossentropy": 2.577637553215027, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16433850675821304, "step": 22602 }, { "epoch": 0.706375, "grad_norm": 2.96875, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 5.4109, "loss/crossentropy": 2.39993155002594, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.155780628323555, "step": 22604 }, { "epoch": 0.7064375, "grad_norm": 9.0, "grad_norm_var": 2.27955322265625, "learning_rate": 0.0001, "loss": 5.4753, "loss/crossentropy": 2.225648522377014, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.18043851852416992, "step": 22606 }, { "epoch": 0.7065, "grad_norm": 2.96875, "grad_norm_var": 2.2759724934895833, "learning_rate": 0.0001, "loss": 5.4955, "loss/crossentropy": 2.482828974723816, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15438725054264069, "step": 22608 }, { "epoch": 0.7065625, "grad_norm": 3.25, "grad_norm_var": 2.2589152018229166, "learning_rate": 0.0001, "loss": 5.6719, "loss/crossentropy": 2.524471640586853, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16630669683218002, "step": 22610 }, { "epoch": 0.706625, "grad_norm": 3.578125, "grad_norm_var": 2.265062459309896, "learning_rate": 0.0001, "loss": 5.8865, "loss/crossentropy": 2.6477824449539185, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17582054436206818, "step": 22612 }, { "epoch": 0.7066875, "grad_norm": 2.953125, "grad_norm_var": 2.2369049072265623, "learning_rate": 0.0001, "loss": 5.677, "loss/crossentropy": 2.5583536624908447, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16382235288619995, "step": 22614 }, { "epoch": 0.70675, "grad_norm": 2.84375, "grad_norm_var": 2.2416951497395834, "learning_rate": 0.0001, "loss": 5.3948, "loss/crossentropy": 2.3632187843322754, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15549806505441666, "step": 22616 }, { "epoch": 0.7068125, "grad_norm": 2.9375, "grad_norm_var": 2.2510650634765623, "learning_rate": 0.0001, "loss": 5.3167, "loss/crossentropy": 2.351553440093994, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15432695299386978, "step": 22618 }, { "epoch": 0.706875, "grad_norm": 3.0, "grad_norm_var": 2.257664998372396, "learning_rate": 0.0001, "loss": 5.5772, "loss/crossentropy": 2.5519754886627197, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15994001924991608, "step": 22620 }, { "epoch": 0.7069375, "grad_norm": 2.703125, "grad_norm_var": 0.03928934733072917, "learning_rate": 0.0001, "loss": 5.5584, "loss/crossentropy": 2.647280216217041, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14501510560512543, "step": 22622 }, { "epoch": 0.707, "grad_norm": 2.8125, "grad_norm_var": 0.04212239583333333, "learning_rate": 0.0001, "loss": 5.5797, "loss/crossentropy": 2.595065474510193, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15432358533143997, "step": 22624 }, { "epoch": 0.7070625, "grad_norm": 2.84375, "grad_norm_var": 0.0416015625, "learning_rate": 0.0001, "loss": 5.7241, "loss/crossentropy": 2.5783623456954956, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16809231787919998, "step": 22626 }, { "epoch": 0.707125, "grad_norm": 3.21875, "grad_norm_var": 0.13616536458333334, "learning_rate": 0.0001, "loss": 6.1164, "loss/crossentropy": 2.7184011936187744, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.18667079508304596, "step": 22628 }, { "epoch": 0.7071875, "grad_norm": 3.109375, "grad_norm_var": 0.1376861572265625, "learning_rate": 0.0001, "loss": 5.3682, "loss/crossentropy": 2.3315646648406982, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15912887454032898, "step": 22630 }, { "epoch": 0.70725, "grad_norm": 3.171875, "grad_norm_var": 0.13580322265625, "learning_rate": 0.0001, "loss": 5.6181, "loss/crossentropy": 2.535012722015381, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16260423511266708, "step": 22632 }, { "epoch": 0.7073125, "grad_norm": 5.1875, "grad_norm_var": 0.40627848307291664, "learning_rate": 0.0001, "loss": 5.9121, "loss/crossentropy": 2.619154453277588, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.18202994018793106, "step": 22634 }, { "epoch": 0.707375, "grad_norm": 3.234375, "grad_norm_var": 0.3915191650390625, "learning_rate": 0.0001, "loss": 5.699, "loss/crossentropy": 2.5490050315856934, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16577870398759842, "step": 22636 }, { "epoch": 0.7074375, "grad_norm": 2.9375, "grad_norm_var": 0.37292378743489585, "learning_rate": 0.0001, "loss": 5.6291, "loss/crossentropy": 2.544523000717163, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16509998589754105, "step": 22638 }, { "epoch": 0.7075, "grad_norm": 2.828125, "grad_norm_var": 0.358642578125, "learning_rate": 0.0001, "loss": 5.4892, "loss/crossentropy": 2.431329131126404, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15422901511192322, "step": 22640 }, { "epoch": 0.7075625, "grad_norm": 3.03125, "grad_norm_var": 0.3481353759765625, "learning_rate": 0.0001, "loss": 5.5501, "loss/crossentropy": 2.4429216384887695, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16305910795927048, "step": 22642 }, { "epoch": 0.707625, "grad_norm": 3.15625, "grad_norm_var": 0.28142903645833334, "learning_rate": 0.0001, "loss": 5.5926, "loss/crossentropy": 2.505157470703125, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16421548277139664, "step": 22644 }, { "epoch": 0.7076875, "grad_norm": 2.96875, "grad_norm_var": 0.29156494140625, "learning_rate": 0.0001, "loss": 5.4453, "loss/crossentropy": 2.4603381156921387, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15552835166454315, "step": 22646 }, { "epoch": 0.70775, "grad_norm": 3.046875, "grad_norm_var": 0.2938629150390625, "learning_rate": 0.0001, "loss": 5.461, "loss/crossentropy": 2.3692561388015747, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16113196313381195, "step": 22648 }, { "epoch": 0.7078125, "grad_norm": 3.1875, "grad_norm_var": 0.031233723958333334, "learning_rate": 0.0001, "loss": 5.5372, "loss/crossentropy": 2.4094094038009644, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1682494729757309, "step": 22650 }, { "epoch": 0.707875, "grad_norm": 3.296875, "grad_norm_var": 0.03400777180989583, "learning_rate": 0.0001, "loss": 5.8546, "loss/crossentropy": 2.647633194923401, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1706990748643875, "step": 22652 }, { "epoch": 0.7079375, "grad_norm": 3.109375, "grad_norm_var": 0.0291656494140625, "learning_rate": 0.0001, "loss": 5.6755, "loss/crossentropy": 2.5117732286453247, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16715115308761597, "step": 22654 }, { "epoch": 0.708, "grad_norm": 3.375, "grad_norm_var": 0.0236480712890625, "learning_rate": 0.0001, "loss": 5.5235, "loss/crossentropy": 2.3812849521636963, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16695304214954376, "step": 22656 }, { "epoch": 0.7080625, "grad_norm": 3.359375, "grad_norm_var": 0.023249308268229168, "learning_rate": 0.0001, "loss": 5.3083, "loss/crossentropy": 2.313563823699951, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15494468063116074, "step": 22658 }, { "epoch": 0.708125, "grad_norm": 2.953125, "grad_norm_var": 0.030720011393229166, "learning_rate": 0.0001, "loss": 5.5461, "loss/crossentropy": 2.483505964279175, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1617322638630867, "step": 22660 }, { "epoch": 0.7081875, "grad_norm": 3.265625, "grad_norm_var": 0.025423177083333335, "learning_rate": 0.0001, "loss": 5.7102, "loss/crossentropy": 2.581833243370056, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16479483246803284, "step": 22662 }, { "epoch": 0.70825, "grad_norm": 3.109375, "grad_norm_var": 0.029613240559895834, "learning_rate": 0.0001, "loss": 5.5946, "loss/crossentropy": 2.4872822761535645, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16737564653158188, "step": 22664 }, { "epoch": 0.7083125, "grad_norm": 3.671875, "grad_norm_var": 0.05006103515625, "learning_rate": 0.0001, "loss": 5.5572, "loss/crossentropy": 2.428568482398987, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16716498136520386, "step": 22666 }, { "epoch": 0.708375, "grad_norm": 3.203125, "grad_norm_var": 0.049128214518229164, "learning_rate": 0.0001, "loss": 6.1102, "loss/crossentropy": 2.7997725009918213, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1802627444267273, "step": 22668 }, { "epoch": 0.7084375, "grad_norm": 2.953125, "grad_norm_var": 0.08024088541666667, "learning_rate": 0.0001, "loss": 5.6215, "loss/crossentropy": 2.4986475706100464, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16658470779657364, "step": 22670 }, { "epoch": 0.7085, "grad_norm": 3.1875, "grad_norm_var": 0.08087565104166666, "learning_rate": 0.0001, "loss": 5.6899, "loss/crossentropy": 2.5867003202438354, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16501083225011826, "step": 22672 }, { "epoch": 0.7085625, "grad_norm": 3.5, "grad_norm_var": 0.092041015625, "learning_rate": 0.0001, "loss": 6.0346, "loss/crossentropy": 2.792657971382141, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17731842398643494, "step": 22674 }, { "epoch": 0.708625, "grad_norm": 3.34375, "grad_norm_var": 0.08580322265625, "learning_rate": 0.0001, "loss": 5.501, "loss/crossentropy": 2.4093992710113525, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16267625987529755, "step": 22676 }, { "epoch": 0.7086875, "grad_norm": 2.921875, "grad_norm_var": 0.09345296223958334, "learning_rate": 0.0001, "loss": 5.6387, "loss/crossentropy": 2.5944132804870605, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1579430103302002, "step": 22678 }, { "epoch": 0.70875, "grad_norm": 3.109375, "grad_norm_var": 0.09018452962239583, "learning_rate": 0.0001, "loss": 5.9749, "loss/crossentropy": 2.775382161140442, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1730796843767166, "step": 22680 }, { "epoch": 0.7088125, "grad_norm": 2.96875, "grad_norm_var": 0.06813863118489584, "learning_rate": 0.0001, "loss": 5.4292, "loss/crossentropy": 2.382672429084778, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16089950501918793, "step": 22682 }, { "epoch": 0.708875, "grad_norm": 2.8125, "grad_norm_var": 0.07604166666666666, "learning_rate": 0.0001, "loss": 5.5873, "loss/crossentropy": 2.5621368885040283, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15798881649971008, "step": 22684 }, { "epoch": 0.7089375, "grad_norm": 3.203125, "grad_norm_var": 0.037398274739583334, "learning_rate": 0.0001, "loss": 6.1326, "loss/crossentropy": 2.880889654159546, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17556104063987732, "step": 22686 }, { "epoch": 0.709, "grad_norm": 2.875, "grad_norm_var": 0.03953348795572917, "learning_rate": 0.0001, "loss": 5.6047, "loss/crossentropy": 2.578131675720215, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1596865952014923, "step": 22688 }, { "epoch": 0.7090625, "grad_norm": 3.171875, "grad_norm_var": 0.025202433268229168, "learning_rate": 0.0001, "loss": 5.7346, "loss/crossentropy": 2.608475089073181, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1661328747868538, "step": 22690 }, { "epoch": 0.709125, "grad_norm": 3.203125, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 5.7047, "loss/crossentropy": 2.5614583492279053, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16705966740846634, "step": 22692 }, { "epoch": 0.7091875, "grad_norm": 3.265625, "grad_norm_var": 0.021565755208333332, "learning_rate": 0.0001, "loss": 5.7064, "loss/crossentropy": 2.5638712644577026, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1705031916499138, "step": 22694 }, { "epoch": 0.70925, "grad_norm": 3.0, "grad_norm_var": 0.02838134765625, "learning_rate": 0.0001, "loss": 5.2853, "loss/crossentropy": 2.359517216682434, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1519572213292122, "step": 22696 }, { "epoch": 0.7093125, "grad_norm": 3.0, "grad_norm_var": 0.025484212239583335, "learning_rate": 0.0001, "loss": 5.6032, "loss/crossentropy": 2.5778008699417114, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.159569650888443, "step": 22698 }, { "epoch": 0.709375, "grad_norm": 3.171875, "grad_norm_var": 0.022118123372395833, "learning_rate": 0.0001, "loss": 5.465, "loss/crossentropy": 2.4808367490768433, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15466291457414627, "step": 22700 }, { "epoch": 0.7094375, "grad_norm": 3.25, "grad_norm_var": 0.024739583333333332, "learning_rate": 0.0001, "loss": 5.7297, "loss/crossentropy": 2.6298859119415283, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16310980916023254, "step": 22702 }, { "epoch": 0.7095, "grad_norm": 3.03125, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 5.6075, "loss/crossentropy": 2.5181902647018433, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1624443531036377, "step": 22704 }, { "epoch": 0.7095625, "grad_norm": 2.859375, "grad_norm_var": 0.025731404622395832, "learning_rate": 0.0001, "loss": 5.5071, "loss/crossentropy": 2.48038911819458, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15618938207626343, "step": 22706 }, { "epoch": 0.709625, "grad_norm": 2.828125, "grad_norm_var": 0.026236979166666667, "learning_rate": 0.0001, "loss": 5.639, "loss/crossentropy": 2.5747874975204468, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1611132174730301, "step": 22708 }, { "epoch": 0.7096875, "grad_norm": 3.375, "grad_norm_var": 0.03178609212239583, "learning_rate": 0.0001, "loss": 5.6398, "loss/crossentropy": 2.533713221549988, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1668551117181778, "step": 22710 }, { "epoch": 0.70975, "grad_norm": 2.921875, "grad_norm_var": 0.025202433268229168, "learning_rate": 0.0001, "loss": 5.7806, "loss/crossentropy": 2.641892910003662, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1666097566485405, "step": 22712 }, { "epoch": 0.7098125, "grad_norm": 3.484375, "grad_norm_var": 0.03564046223958333, "learning_rate": 0.0001, "loss": 5.7634, "loss/crossentropy": 2.483535051345825, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17407961189746857, "step": 22714 }, { "epoch": 0.709875, "grad_norm": 2.8125, "grad_norm_var": 0.03870442708333333, "learning_rate": 0.0001, "loss": 5.4512, "loss/crossentropy": 2.4775872230529785, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15322404354810715, "step": 22716 }, { "epoch": 0.7099375, "grad_norm": 2.890625, "grad_norm_var": 0.0653961181640625, "learning_rate": 0.0001, "loss": 5.4074, "loss/crossentropy": 2.2718788385391235, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16316396743059158, "step": 22718 }, { "epoch": 0.71, "grad_norm": 3.265625, "grad_norm_var": 0.06968994140625, "learning_rate": 0.0001, "loss": 5.4643, "loss/crossentropy": 2.4302117824554443, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15927303582429886, "step": 22720 }, { "epoch": 0.7100625, "grad_norm": 3.109375, "grad_norm_var": 0.06345113118489583, "learning_rate": 0.0001, "loss": 5.4779, "loss/crossentropy": 2.4662187099456787, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1581977978348732, "step": 22722 }, { "epoch": 0.710125, "grad_norm": 3.453125, "grad_norm_var": 0.12180989583333333, "learning_rate": 0.0001, "loss": 5.8428, "loss/crossentropy": 2.6261080503463745, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17323270440101624, "step": 22724 }, { "epoch": 0.7101875, "grad_norm": 3.125, "grad_norm_var": 0.12086181640625, "learning_rate": 0.0001, "loss": 5.5741, "loss/crossentropy": 2.5218567848205566, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15678295493125916, "step": 22726 }, { "epoch": 0.71025, "grad_norm": 3.734375, "grad_norm_var": 0.1319244384765625, "learning_rate": 0.0001, "loss": 6.0828, "loss/crossentropy": 2.759239077568054, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18157602846622467, "step": 22728 }, { "epoch": 0.7103125, "grad_norm": 2.90625, "grad_norm_var": 0.14137369791666668, "learning_rate": 0.0001, "loss": 5.3898, "loss/crossentropy": 2.4318522214889526, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14813748002052307, "step": 22730 }, { "epoch": 0.710375, "grad_norm": 2.953125, "grad_norm_var": 0.13028971354166666, "learning_rate": 0.0001, "loss": 5.9064, "loss/crossentropy": 2.769029974937439, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16646908223628998, "step": 22732 }, { "epoch": 0.7104375, "grad_norm": 3.171875, "grad_norm_var": 0.10664774576822916, "learning_rate": 0.0001, "loss": 5.6145, "loss/crossentropy": 2.5253701210021973, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1600886434316635, "step": 22734 }, { "epoch": 0.7105, "grad_norm": 2.90625, "grad_norm_var": 0.10767822265625, "learning_rate": 0.0001, "loss": 5.7827, "loss/crossentropy": 2.6507482528686523, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16319461911916733, "step": 22736 }, { "epoch": 0.7105625, "grad_norm": 3.203125, "grad_norm_var": 0.106640625, "learning_rate": 0.0001, "loss": 5.7554, "loss/crossentropy": 2.5523555278778076, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17187045514583588, "step": 22738 }, { "epoch": 0.710625, "grad_norm": 2.90625, "grad_norm_var": 0.050634765625, "learning_rate": 0.0001, "loss": 5.6187, "loss/crossentropy": 2.5167863368988037, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1617513820528984, "step": 22740 }, { "epoch": 0.7106875, "grad_norm": 3.03125, "grad_norm_var": 0.0516998291015625, "learning_rate": 0.0001, "loss": 5.4693, "loss/crossentropy": 2.4850971698760986, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15467499941587448, "step": 22742 }, { "epoch": 0.71075, "grad_norm": 2.890625, "grad_norm_var": 0.01519775390625, "learning_rate": 0.0001, "loss": 5.7114, "loss/crossentropy": 2.630742907524109, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1627499684691429, "step": 22744 }, { "epoch": 0.7108125, "grad_norm": 3.03125, "grad_norm_var": 0.016499837239583332, "learning_rate": 0.0001, "loss": 5.6352, "loss/crossentropy": 2.5919694900512695, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16252951323986053, "step": 22746 }, { "epoch": 0.710875, "grad_norm": 2.9375, "grad_norm_var": 0.015816243489583333, "learning_rate": 0.0001, "loss": 5.4342, "loss/crossentropy": 2.4220200777053833, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.16059560328722, "step": 22748 }, { "epoch": 0.7109375, "grad_norm": 2.9375, "grad_norm_var": 0.013948567708333333, "learning_rate": 0.0001, "loss": 5.5684, "loss/crossentropy": 2.493459463119507, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1653081774711609, "step": 22750 }, { "epoch": 0.711, "grad_norm": 3.03125, "grad_norm_var": 0.020555623372395835, "learning_rate": 0.0001, "loss": 5.845, "loss/crossentropy": 2.6640223264694214, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17083007842302322, "step": 22752 }, { "epoch": 0.7110625, "grad_norm": 4.375, "grad_norm_var": 0.13097330729166667, "learning_rate": 0.0001, "loss": 5.9282, "loss/crossentropy": 2.667199730873108, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.171409510076046, "step": 22754 }, { "epoch": 0.711125, "grad_norm": 3.0, "grad_norm_var": 0.12916259765625, "learning_rate": 0.0001, "loss": 5.7468, "loss/crossentropy": 2.60368275642395, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16821421682834625, "step": 22756 }, { "epoch": 0.7111875, "grad_norm": 3.265625, "grad_norm_var": 0.12870992024739583, "learning_rate": 0.0001, "loss": 5.8475, "loss/crossentropy": 2.6243757009506226, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.173094242811203, "step": 22758 }, { "epoch": 0.71125, "grad_norm": 2.859375, "grad_norm_var": 0.12905985514322918, "learning_rate": 0.0001, "loss": 5.4948, "loss/crossentropy": 2.5094510316848755, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1559608057141304, "step": 22760 }, { "epoch": 0.7113125, "grad_norm": 3.03125, "grad_norm_var": 0.12356669108072917, "learning_rate": 0.0001, "loss": 5.4227, "loss/crossentropy": 2.4355177879333496, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15731318295001984, "step": 22762 }, { "epoch": 0.711375, "grad_norm": 3.15625, "grad_norm_var": 0.16108296712239584, "learning_rate": 0.0001, "loss": 5.772, "loss/crossentropy": 2.555185317993164, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1712891086935997, "step": 22764 }, { "epoch": 0.7114375, "grad_norm": 3.359375, "grad_norm_var": 0.15607096354166666, "learning_rate": 0.0001, "loss": 5.8258, "loss/crossentropy": 2.6384923458099365, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17381110787391663, "step": 22766 }, { "epoch": 0.7115, "grad_norm": 3.03125, "grad_norm_var": 0.16676432291666668, "learning_rate": 0.0001, "loss": 5.3878, "loss/crossentropy": 2.346682071685791, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1568494439125061, "step": 22768 }, { "epoch": 0.7115625, "grad_norm": 3.125, "grad_norm_var": 0.08124898274739584, "learning_rate": 0.0001, "loss": 5.2037, "loss/crossentropy": 2.2740384340286255, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14999421685934067, "step": 22770 }, { "epoch": 0.711625, "grad_norm": 3.375, "grad_norm_var": 0.08825581868489583, "learning_rate": 0.0001, "loss": 6.1145, "loss/crossentropy": 2.7673765420913696, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1823655590415001, "step": 22772 }, { "epoch": 0.7116875, "grad_norm": 2.96875, "grad_norm_var": 0.09159749348958333, "learning_rate": 0.0001, "loss": 5.3939, "loss/crossentropy": 2.4127246141433716, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15437036752700806, "step": 22774 }, { "epoch": 0.71175, "grad_norm": 2.828125, "grad_norm_var": 0.09659830729166667, "learning_rate": 0.0001, "loss": 5.6882, "loss/crossentropy": 2.651946783065796, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16104654967784882, "step": 22776 }, { "epoch": 0.7118125, "grad_norm": 3.1875, "grad_norm_var": 0.09666341145833333, "learning_rate": 0.0001, "loss": 5.9907, "loss/crossentropy": 2.804213285446167, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17099642753601074, "step": 22778 }, { "epoch": 0.711875, "grad_norm": 3.03125, "grad_norm_var": 0.046923828125, "learning_rate": 0.0001, "loss": 5.7334, "loss/crossentropy": 2.5786982774734497, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17054951190948486, "step": 22780 }, { "epoch": 0.7119375, "grad_norm": 3.578125, "grad_norm_var": 0.057062784830729164, "learning_rate": 0.0001, "loss": 5.1693, "loss/crossentropy": 2.170661151409149, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15143033862113953, "step": 22782 }, { "epoch": 0.712, "grad_norm": 3.078125, "grad_norm_var": 0.0527984619140625, "learning_rate": 0.0001, "loss": 5.762, "loss/crossentropy": 2.667365312576294, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16376512497663498, "step": 22784 }, { "epoch": 0.7120625, "grad_norm": 2.90625, "grad_norm_var": 0.04910481770833333, "learning_rate": 0.0001, "loss": 5.5265, "loss/crossentropy": 2.572754383087158, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1531859189271927, "step": 22786 }, { "epoch": 0.712125, "grad_norm": 2.890625, "grad_norm_var": 0.033430989583333334, "learning_rate": 0.0001, "loss": 5.5468, "loss/crossentropy": 2.4839890003204346, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1625334694981575, "step": 22788 }, { "epoch": 0.7121875, "grad_norm": 2.875, "grad_norm_var": 0.034586588541666664, "learning_rate": 0.0001, "loss": 5.2954, "loss/crossentropy": 2.3394761085510254, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15106205642223358, "step": 22790 }, { "epoch": 0.71225, "grad_norm": 3.140625, "grad_norm_var": 0.031722005208333334, "learning_rate": 0.0001, "loss": 5.7823, "loss/crossentropy": 2.633091449737549, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16413924098014832, "step": 22792 }, { "epoch": 0.7123125, "grad_norm": 3.015625, "grad_norm_var": 0.0318511962890625, "learning_rate": 0.0001, "loss": 5.7476, "loss/crossentropy": 2.5560721158981323, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16954263299703598, "step": 22794 }, { "epoch": 0.712375, "grad_norm": 3.921875, "grad_norm_var": 0.07866923014322917, "learning_rate": 0.0001, "loss": 5.5303, "loss/crossentropy": 2.4877405166625977, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15621329098939896, "step": 22796 }, { "epoch": 0.7124375, "grad_norm": 3.171875, "grad_norm_var": 0.06638895670572917, "learning_rate": 0.0001, "loss": 5.9457, "loss/crossentropy": 2.737141489982605, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17203232645988464, "step": 22798 }, { "epoch": 0.7125, "grad_norm": 2.96875, "grad_norm_var": 0.0666168212890625, "learning_rate": 0.0001, "loss": 5.7573, "loss/crossentropy": 2.634291172027588, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1650400161743164, "step": 22800 }, { "epoch": 0.7125625, "grad_norm": 3.3125, "grad_norm_var": 0.08924051920572916, "learning_rate": 0.0001, "loss": 6.0942, "loss/crossentropy": 2.8320658206939697, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17621354013681412, "step": 22802 }, { "epoch": 0.712625, "grad_norm": 3.078125, "grad_norm_var": 0.08406575520833333, "learning_rate": 0.0001, "loss": 5.4304, "loss/crossentropy": 2.411654829978943, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15851899981498718, "step": 22804 }, { "epoch": 0.7126875, "grad_norm": 2.9375, "grad_norm_var": 0.08391011555989583, "learning_rate": 0.0001, "loss": 5.3661, "loss/crossentropy": 2.352829337120056, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15562819689512253, "step": 22806 }, { "epoch": 0.71275, "grad_norm": 2.953125, "grad_norm_var": 0.0881256103515625, "learning_rate": 0.0001, "loss": 5.4315, "loss/crossentropy": 2.429062843322754, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1592329815030098, "step": 22808 }, { "epoch": 0.7128125, "grad_norm": 2.859375, "grad_norm_var": 0.09368489583333334, "learning_rate": 0.0001, "loss": 5.3305, "loss/crossentropy": 2.345287322998047, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15555164217948914, "step": 22810 }, { "epoch": 0.712875, "grad_norm": 3.125, "grad_norm_var": 0.05252278645833333, "learning_rate": 0.0001, "loss": 5.8159, "loss/crossentropy": 2.6428064107894897, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16926005482673645, "step": 22812 }, { "epoch": 0.7129375, "grad_norm": 3.375, "grad_norm_var": 0.05526936848958333, "learning_rate": 0.0001, "loss": 5.7927, "loss/crossentropy": 2.6445947885513306, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16754399240016937, "step": 22814 }, { "epoch": 0.713, "grad_norm": 3.390625, "grad_norm_var": 0.11797587076822917, "learning_rate": 0.0001, "loss": 6.1274, "loss/crossentropy": 2.7830978631973267, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.18716295063495636, "step": 22816 }, { "epoch": 0.7130625, "grad_norm": 2.984375, "grad_norm_var": 0.09731343587239584, "learning_rate": 0.0001, "loss": 5.7858, "loss/crossentropy": 2.581252932548523, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1724056899547577, "step": 22818 }, { "epoch": 0.713125, "grad_norm": 3.0, "grad_norm_var": 0.1576324462890625, "learning_rate": 0.0001, "loss": 5.6434, "loss/crossentropy": 2.4850512742996216, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16231805831193924, "step": 22820 }, { "epoch": 0.7131875, "grad_norm": 3.203125, "grad_norm_var": 0.15260009765625, "learning_rate": 0.0001, "loss": 5.5542, "loss/crossentropy": 2.5062475204467773, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16105009615421295, "step": 22822 }, { "epoch": 0.71325, "grad_norm": 2.984375, "grad_norm_var": 0.15286051432291667, "learning_rate": 0.0001, "loss": 5.4837, "loss/crossentropy": 2.4561463594436646, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15549034625291824, "step": 22824 }, { "epoch": 0.7133125, "grad_norm": 2.9375, "grad_norm_var": 0.17031962076822918, "learning_rate": 0.0001, "loss": 5.3374, "loss/crossentropy": 2.40144145488739, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.14945203065872192, "step": 22826 }, { "epoch": 0.713375, "grad_norm": 3.703125, "grad_norm_var": 0.19029541015625, "learning_rate": 0.0001, "loss": 5.7645, "loss/crossentropy": 2.567002296447754, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17092538625001907, "step": 22828 }, { "epoch": 0.7134375, "grad_norm": 3.140625, "grad_norm_var": 0.18230692545572916, "learning_rate": 0.0001, "loss": 5.6136, "loss/crossentropy": 2.457472801208496, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16835028678178787, "step": 22830 }, { "epoch": 0.7135, "grad_norm": 3.296875, "grad_norm_var": 0.12547098795572917, "learning_rate": 0.0001, "loss": 6.0906, "loss/crossentropy": 2.879745602607727, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1738201305270195, "step": 22832 }, { "epoch": 0.7135625, "grad_norm": 3.03125, "grad_norm_var": 0.13761393229166666, "learning_rate": 0.0001, "loss": 5.8168, "loss/crossentropy": 2.593244194984436, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17547693848609924, "step": 22834 }, { "epoch": 0.713625, "grad_norm": 3.21875, "grad_norm_var": 0.4930338541666667, "learning_rate": 0.0001, "loss": 6.4787, "loss/crossentropy": 2.883357286453247, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.2052336409687996, "step": 22836 }, { "epoch": 0.7136875, "grad_norm": 3.015625, "grad_norm_var": 0.49075419108072915, "learning_rate": 0.0001, "loss": 5.6144, "loss/crossentropy": 2.549429774284363, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1572815477848053, "step": 22838 }, { "epoch": 0.71375, "grad_norm": 3.078125, "grad_norm_var": 0.48004150390625, "learning_rate": 0.0001, "loss": 5.672, "loss/crossentropy": 2.589589238166809, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16292711347341537, "step": 22840 }, { "epoch": 0.7138125, "grad_norm": 2.953125, "grad_norm_var": 0.45465087890625, "learning_rate": 0.0001, "loss": 5.4062, "loss/crossentropy": 2.3374747037887573, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16000014543533325, "step": 22842 }, { "epoch": 0.713875, "grad_norm": 2.796875, "grad_norm_var": 0.45360921223958334, "learning_rate": 0.0001, "loss": 5.7364, "loss/crossentropy": 2.690854072570801, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16237051039934158, "step": 22844 }, { "epoch": 0.7139375, "grad_norm": 2.953125, "grad_norm_var": 0.4587198893229167, "learning_rate": 0.0001, "loss": 5.636, "loss/crossentropy": 2.5606768131256104, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16455943137407303, "step": 22846 }, { "epoch": 0.714, "grad_norm": 3.015625, "grad_norm_var": 0.4665323893229167, "learning_rate": 0.0001, "loss": 5.7475, "loss/crossentropy": 2.6324750185012817, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1657993420958519, "step": 22848 }, { "epoch": 0.7140625, "grad_norm": 3.265625, "grad_norm_var": 0.4652303059895833, "learning_rate": 0.0001, "loss": 5.621, "loss/crossentropy": 2.5494478940963745, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1606711894273758, "step": 22850 }, { "epoch": 0.714125, "grad_norm": 3.265625, "grad_norm_var": 0.0226470947265625, "learning_rate": 0.0001, "loss": 5.8994, "loss/crossentropy": 2.7274543046951294, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.169148251414299, "step": 22852 }, { "epoch": 0.7141875, "grad_norm": 2.875, "grad_norm_var": 0.024095662434895835, "learning_rate": 0.0001, "loss": 5.6623, "loss/crossentropy": 2.647777795791626, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15730953961610794, "step": 22854 }, { "epoch": 0.71425, "grad_norm": 2.875, "grad_norm_var": 0.026253255208333333, "learning_rate": 0.0001, "loss": 5.7908, "loss/crossentropy": 2.6484906673431396, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1654057577252388, "step": 22856 }, { "epoch": 0.7143125, "grad_norm": 3.15625, "grad_norm_var": 0.034163411458333334, "learning_rate": 0.0001, "loss": 5.9103, "loss/crossentropy": 2.688157796859741, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17572540789842606, "step": 22858 }, { "epoch": 0.714375, "grad_norm": 3.125, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 6.0367, "loss/crossentropy": 2.717068076133728, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.18313800543546677, "step": 22860 }, { "epoch": 0.7144375, "grad_norm": 3.015625, "grad_norm_var": 0.0355133056640625, "learning_rate": 0.0001, "loss": 5.9411, "loss/crossentropy": 2.7882550954818726, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.169186569750309, "step": 22862 }, { "epoch": 0.7145, "grad_norm": 2.8125, "grad_norm_var": 0.041966756184895836, "learning_rate": 0.0001, "loss": 5.2297, "loss/crossentropy": 2.3523730039596558, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1478886529803276, "step": 22864 }, { "epoch": 0.7145625, "grad_norm": 2.984375, "grad_norm_var": 0.03498433430989583, "learning_rate": 0.0001, "loss": 6.05, "loss/crossentropy": 2.833707571029663, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17592568695545197, "step": 22866 }, { "epoch": 0.714625, "grad_norm": 3.15625, "grad_norm_var": 0.038525390625, "learning_rate": 0.0001, "loss": 5.6687, "loss/crossentropy": 2.6187517642974854, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15929661691188812, "step": 22868 }, { "epoch": 0.7146875, "grad_norm": 3.25, "grad_norm_var": 0.03892822265625, "learning_rate": 0.0001, "loss": 5.7494, "loss/crossentropy": 2.5784850120544434, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17217203229665756, "step": 22870 }, { "epoch": 0.71475, "grad_norm": 2.984375, "grad_norm_var": 0.0366363525390625, "learning_rate": 0.0001, "loss": 5.631, "loss/crossentropy": 2.5323336124420166, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16182377189397812, "step": 22872 }, { "epoch": 0.7148125, "grad_norm": 3.0, "grad_norm_var": 0.031473795572916664, "learning_rate": 0.0001, "loss": 5.802, "loss/crossentropy": 2.5953755378723145, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17105583101511002, "step": 22874 }, { "epoch": 0.714875, "grad_norm": 3.1875, "grad_norm_var": 0.021239217122395834, "learning_rate": 0.0001, "loss": 5.8375, "loss/crossentropy": 2.65300977230072, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17118091881275177, "step": 22876 }, { "epoch": 0.7149375, "grad_norm": 3.1875, "grad_norm_var": 0.0223052978515625, "learning_rate": 0.0001, "loss": 5.865, "loss/crossentropy": 2.6619762182235718, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17108601331710815, "step": 22878 }, { "epoch": 0.715, "grad_norm": 2.96875, "grad_norm_var": 0.018355305989583334, "learning_rate": 0.0001, "loss": 5.8736, "loss/crossentropy": 2.695634603500366, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17130812257528305, "step": 22880 }, { "epoch": 0.7150625, "grad_norm": 3.046875, "grad_norm_var": 0.0179351806640625, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.54266095161438, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16034962981939316, "step": 22882 }, { "epoch": 0.715125, "grad_norm": 3.375, "grad_norm_var": 0.019701131184895835, "learning_rate": 0.0001, "loss": 5.4497, "loss/crossentropy": 2.444697618484497, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15636342763900757, "step": 22884 }, { "epoch": 0.7151875, "grad_norm": 3.25, "grad_norm_var": 0.017769368489583333, "learning_rate": 0.0001, "loss": 5.6857, "loss/crossentropy": 2.6012312173843384, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16509142518043518, "step": 22886 }, { "epoch": 0.71525, "grad_norm": 2.921875, "grad_norm_var": 0.019091796875, "learning_rate": 0.0001, "loss": 5.6813, "loss/crossentropy": 2.6111589670181274, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16599716246128082, "step": 22888 }, { "epoch": 0.7153125, "grad_norm": 3.234375, "grad_norm_var": 0.017210896809895834, "learning_rate": 0.0001, "loss": 5.7435, "loss/crossentropy": 2.613040566444397, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1657794564962387, "step": 22890 }, { "epoch": 0.715375, "grad_norm": 3.421875, "grad_norm_var": 0.022261555989583334, "learning_rate": 0.0001, "loss": 5.8041, "loss/crossentropy": 2.6595606803894043, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16718725115060806, "step": 22892 }, { "epoch": 0.7154375, "grad_norm": 3.25, "grad_norm_var": 0.022077433268229165, "learning_rate": 0.0001, "loss": 6.1265, "loss/crossentropy": 2.8601614236831665, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17545753717422485, "step": 22894 }, { "epoch": 0.7155, "grad_norm": 3.28125, "grad_norm_var": 0.024681599934895833, "learning_rate": 0.0001, "loss": 5.5148, "loss/crossentropy": 2.471987009048462, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16092655807733536, "step": 22896 }, { "epoch": 0.7155625, "grad_norm": 3.0625, "grad_norm_var": 0.024168904622395834, "learning_rate": 0.0001, "loss": 5.6747, "loss/crossentropy": 2.5920382738113403, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16412097215652466, "step": 22898 }, { "epoch": 0.715625, "grad_norm": 3.234375, "grad_norm_var": 0.018993123372395834, "learning_rate": 0.0001, "loss": 5.2929, "loss/crossentropy": 2.325380504131317, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1479194536805153, "step": 22900 }, { "epoch": 0.7156875, "grad_norm": 3.09375, "grad_norm_var": 0.019075520833333335, "learning_rate": 0.0001, "loss": 5.4117, "loss/crossentropy": 2.3158397674560547, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16075515002012253, "step": 22902 }, { "epoch": 0.71575, "grad_norm": 2.984375, "grad_norm_var": 0.0173492431640625, "learning_rate": 0.0001, "loss": 5.7636, "loss/crossentropy": 2.6401582956314087, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16586346924304962, "step": 22904 }, { "epoch": 0.7158125, "grad_norm": 2.921875, "grad_norm_var": 0.020018513997395834, "learning_rate": 0.0001, "loss": 5.3527, "loss/crossentropy": 2.3282387256622314, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1579195261001587, "step": 22906 }, { "epoch": 0.715875, "grad_norm": 2.96875, "grad_norm_var": 0.01734619140625, "learning_rate": 0.0001, "loss": 5.646, "loss/crossentropy": 2.5214792490005493, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1651904061436653, "step": 22908 }, { "epoch": 0.7159375, "grad_norm": 3.15625, "grad_norm_var": 0.01617431640625, "learning_rate": 0.0001, "loss": 5.3251, "loss/crossentropy": 2.353050470352173, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1550159454345703, "step": 22910 }, { "epoch": 0.716, "grad_norm": 2.84375, "grad_norm_var": 0.016402180989583334, "learning_rate": 0.0001, "loss": 5.5424, "loss/crossentropy": 2.4919172525405884, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1605149209499359, "step": 22912 }, { "epoch": 0.7160625, "grad_norm": 3.015625, "grad_norm_var": 0.018040974934895832, "learning_rate": 0.0001, "loss": 5.8854, "loss/crossentropy": 2.7260115146636963, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16827943176031113, "step": 22914 }, { "epoch": 0.716125, "grad_norm": 3.0625, "grad_norm_var": 0.016014607747395833, "learning_rate": 0.0001, "loss": 5.7663, "loss/crossentropy": 2.635956048965454, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16811296343803406, "step": 22916 }, { "epoch": 0.7161875, "grad_norm": 2.96875, "grad_norm_var": 0.016792805989583333, "learning_rate": 0.0001, "loss": 5.7874, "loss/crossentropy": 2.626030683517456, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1669207662343979, "step": 22918 }, { "epoch": 0.71625, "grad_norm": 2.984375, "grad_norm_var": 0.017064412434895832, "learning_rate": 0.0001, "loss": 5.6213, "loss/crossentropy": 2.505921483039856, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16231827437877655, "step": 22920 }, { "epoch": 0.7163125, "grad_norm": 2.859375, "grad_norm_var": 0.0242340087890625, "learning_rate": 0.0001, "loss": 5.1162, "loss/crossentropy": 2.253924250602722, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.14442609250545502, "step": 22922 }, { "epoch": 0.716375, "grad_norm": 3.0625, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 5.6083, "loss/crossentropy": 2.535650134086609, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16117332875728607, "step": 22924 }, { "epoch": 0.7164375, "grad_norm": 3.09375, "grad_norm_var": 0.0178863525390625, "learning_rate": 0.0001, "loss": 5.5294, "loss/crossentropy": 2.52425217628479, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.16066953539848328, "step": 22926 }, { "epoch": 0.7165, "grad_norm": 3.390625, "grad_norm_var": 0.0256256103515625, "learning_rate": 0.0001, "loss": 5.922, "loss/crossentropy": 2.7153269052505493, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17340506613254547, "step": 22928 }, { "epoch": 0.7165625, "grad_norm": 3.140625, "grad_norm_var": 0.024950154622395835, "learning_rate": 0.0001, "loss": 5.3528, "loss/crossentropy": 2.3147772550582886, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16043871641159058, "step": 22930 }, { "epoch": 0.716625, "grad_norm": 2.9375, "grad_norm_var": 0.026200358072916666, "learning_rate": 0.0001, "loss": 5.3001, "loss/crossentropy": 2.3875588178634644, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.15180308371782303, "step": 22932 }, { "epoch": 0.7166875, "grad_norm": 2.71875, "grad_norm_var": 0.03156636555989583, "learning_rate": 0.0001, "loss": 5.4529, "loss/crossentropy": 2.507371425628662, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15080364048480988, "step": 22934 }, { "epoch": 0.71675, "grad_norm": 3.0625, "grad_norm_var": 0.0298492431640625, "learning_rate": 0.0001, "loss": 5.6018, "loss/crossentropy": 2.554092526435852, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1586805135011673, "step": 22936 }, { "epoch": 0.7168125, "grad_norm": 3.046875, "grad_norm_var": 0.027392578125, "learning_rate": 0.0001, "loss": 5.5521, "loss/crossentropy": 2.3645232915878296, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16602084040641785, "step": 22938 }, { "epoch": 0.716875, "grad_norm": 3.0, "grad_norm_var": 0.027757771809895835, "learning_rate": 0.0001, "loss": 5.5889, "loss/crossentropy": 2.5736584663391113, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15855270624160767, "step": 22940 }, { "epoch": 0.7169375, "grad_norm": 2.921875, "grad_norm_var": 0.028180948893229165, "learning_rate": 0.0001, "loss": 5.7128, "loss/crossentropy": 2.663956642150879, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15801136195659637, "step": 22942 }, { "epoch": 0.717, "grad_norm": 2.65625, "grad_norm_var": 0.025389607747395834, "learning_rate": 0.0001, "loss": 5.381, "loss/crossentropy": 2.4947913885116577, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14252623170614243, "step": 22944 }, { "epoch": 0.7170625, "grad_norm": 2.75, "grad_norm_var": 0.024409993489583334, "learning_rate": 0.0001, "loss": 5.5959, "loss/crossentropy": 2.5649075508117676, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15934739261865616, "step": 22946 }, { "epoch": 0.717125, "grad_norm": 3.5, "grad_norm_var": 0.04482320149739583, "learning_rate": 0.0001, "loss": 5.7552, "loss/crossentropy": 2.581146717071533, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1689656898379326, "step": 22948 }, { "epoch": 0.7171875, "grad_norm": 2.90625, "grad_norm_var": 0.04300130208333333, "learning_rate": 0.0001, "loss": 5.6355, "loss/crossentropy": 2.5125614404678345, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16658750921487808, "step": 22950 }, { "epoch": 0.71725, "grad_norm": 2.953125, "grad_norm_var": 0.04597066243489583, "learning_rate": 0.0001, "loss": 5.5647, "loss/crossentropy": 2.460248827934265, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16161371022462845, "step": 22952 }, { "epoch": 0.7173125, "grad_norm": 3.0, "grad_norm_var": 0.03937886555989583, "learning_rate": 0.0001, "loss": 5.6261, "loss/crossentropy": 2.554978609085083, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16024113446474075, "step": 22954 }, { "epoch": 0.717375, "grad_norm": 2.75, "grad_norm_var": 0.043257649739583334, "learning_rate": 0.0001, "loss": 5.5287, "loss/crossentropy": 2.538967490196228, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1560032069683075, "step": 22956 }, { "epoch": 0.7174375, "grad_norm": 3.53125, "grad_norm_var": 0.06214192708333333, "learning_rate": 0.0001, "loss": 5.5909, "loss/crossentropy": 2.4516427516937256, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16470374912023544, "step": 22958 }, { "epoch": 0.7175, "grad_norm": 3.25, "grad_norm_var": 0.07893778483072916, "learning_rate": 0.0001, "loss": 5.7601, "loss/crossentropy": 2.5798234939575195, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16998391598463058, "step": 22960 }, { "epoch": 0.7175625, "grad_norm": 3.390625, "grad_norm_var": 0.07330729166666666, "learning_rate": 0.0001, "loss": 5.9709, "loss/crossentropy": 2.748919129371643, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1749340146780014, "step": 22962 }, { "epoch": 0.717625, "grad_norm": 3.265625, "grad_norm_var": 0.06301981608072917, "learning_rate": 0.0001, "loss": 5.6071, "loss/crossentropy": 2.4429022073745728, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16290469467639923, "step": 22964 }, { "epoch": 0.7176875, "grad_norm": 2.890625, "grad_norm_var": 0.0654937744140625, "learning_rate": 0.0001, "loss": 5.578, "loss/crossentropy": 2.533684253692627, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16028668731451035, "step": 22966 }, { "epoch": 0.71775, "grad_norm": 3.15625, "grad_norm_var": 0.0646392822265625, "learning_rate": 0.0001, "loss": 5.2242, "loss/crossentropy": 2.1830204725265503, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15451082587242126, "step": 22968 }, { "epoch": 0.7178125, "grad_norm": 2.890625, "grad_norm_var": 0.0666015625, "learning_rate": 0.0001, "loss": 5.8123, "loss/crossentropy": 2.733540177345276, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16178182512521744, "step": 22970 }, { "epoch": 0.717875, "grad_norm": 2.875, "grad_norm_var": 0.059845987955729166, "learning_rate": 0.0001, "loss": 5.5909, "loss/crossentropy": 2.5439876317977905, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15820924937725067, "step": 22972 }, { "epoch": 0.7179375, "grad_norm": 2.84375, "grad_norm_var": 0.059178670247395836, "learning_rate": 0.0001, "loss": 5.5805, "loss/crossentropy": 2.587586283683777, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15671729296445847, "step": 22974 }, { "epoch": 0.718, "grad_norm": 3.0625, "grad_norm_var": 0.04228413899739583, "learning_rate": 0.0001, "loss": 5.5181, "loss/crossentropy": 2.4439518451690674, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16131775826215744, "step": 22976 }, { "epoch": 0.7180625, "grad_norm": 3.078125, "grad_norm_var": 0.0376861572265625, "learning_rate": 0.0001, "loss": 5.7114, "loss/crossentropy": 2.6640597581863403, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15785618126392365, "step": 22978 }, { "epoch": 0.718125, "grad_norm": 3.34375, "grad_norm_var": 0.13805338541666667, "learning_rate": 0.0001, "loss": 6.0807, "loss/crossentropy": 2.655805468559265, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18897271901369095, "step": 22980 }, { "epoch": 0.7181875, "grad_norm": 3.21875, "grad_norm_var": 0.1283355712890625, "learning_rate": 0.0001, "loss": 5.8266, "loss/crossentropy": 2.6418944597244263, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16768986731767654, "step": 22982 }, { "epoch": 0.71825, "grad_norm": 3.453125, "grad_norm_var": 0.13191731770833334, "learning_rate": 0.0001, "loss": 5.8889, "loss/crossentropy": 2.703049659729004, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16819187998771667, "step": 22984 }, { "epoch": 0.7183125, "grad_norm": 3.515625, "grad_norm_var": 0.12906901041666666, "learning_rate": 0.0001, "loss": 5.7941, "loss/crossentropy": 2.600367307662964, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1717212051153183, "step": 22986 }, { "epoch": 0.718375, "grad_norm": 3.34375, "grad_norm_var": 0.12375895182291667, "learning_rate": 0.0001, "loss": 5.7472, "loss/crossentropy": 2.6864311695098877, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15920286625623703, "step": 22988 }, { "epoch": 0.7184375, "grad_norm": 3.09375, "grad_norm_var": 0.11237691243489584, "learning_rate": 0.0001, "loss": 5.5005, "loss/crossentropy": 2.42072856426239, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15992847084999084, "step": 22990 }, { "epoch": 0.7185, "grad_norm": 3.234375, "grad_norm_var": 0.11028645833333334, "learning_rate": 0.0001, "loss": 5.8857, "loss/crossentropy": 2.6808758974075317, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17243708670139313, "step": 22992 }, { "epoch": 0.7185625, "grad_norm": 3.234375, "grad_norm_var": 0.14885965983072916, "learning_rate": 0.0001, "loss": 5.6984, "loss/crossentropy": 2.490253210067749, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17159316688776016, "step": 22994 }, { "epoch": 0.718625, "grad_norm": 2.921875, "grad_norm_var": 0.09265848795572916, "learning_rate": 0.0001, "loss": 5.4208, "loss/crossentropy": 2.4393712282180786, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15009599179029465, "step": 22996 }, { "epoch": 0.7186875, "grad_norm": 3.0, "grad_norm_var": 0.0980377197265625, "learning_rate": 0.0001, "loss": 5.4572, "loss/crossentropy": 2.387829542160034, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16162166744470596, "step": 22998 }, { "epoch": 0.71875, "grad_norm": 3.59375, "grad_norm_var": 0.102587890625, "learning_rate": 0.0001, "loss": 6.041, "loss/crossentropy": 2.757057785987854, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17761504650115967, "step": 23000 }, { "epoch": 0.7188125, "grad_norm": 3.03125, "grad_norm_var": 0.09999898274739584, "learning_rate": 0.0001, "loss": 5.6528, "loss/crossentropy": 2.550535798072815, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16374579071998596, "step": 23002 }, { "epoch": 0.718875, "grad_norm": 2.90625, "grad_norm_var": 0.10453999837239583, "learning_rate": 0.0001, "loss": 5.5285, "loss/crossentropy": 2.5141395330429077, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.158076211810112, "step": 23004 }, { "epoch": 0.7189375, "grad_norm": 3.15625, "grad_norm_var": 0.11061909993489584, "learning_rate": 0.0001, "loss": 5.5961, "loss/crossentropy": 2.559632420539856, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16106736660003662, "step": 23006 }, { "epoch": 0.719, "grad_norm": 3.3125, "grad_norm_var": 0.11005859375, "learning_rate": 0.0001, "loss": 5.6422, "loss/crossentropy": 2.5025556087493896, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16591913998126984, "step": 23008 }, { "epoch": 0.7190625, "grad_norm": 3.078125, "grad_norm_var": 0.045441691080729166, "learning_rate": 0.0001, "loss": 5.7458, "loss/crossentropy": 2.616849422454834, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16602325439453125, "step": 23010 }, { "epoch": 0.719125, "grad_norm": 3.046875, "grad_norm_var": 0.039774576822916664, "learning_rate": 0.0001, "loss": 5.6429, "loss/crossentropy": 2.5550752878189087, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16229917109012604, "step": 23012 }, { "epoch": 0.7191875, "grad_norm": 3.234375, "grad_norm_var": 0.033854166666666664, "learning_rate": 0.0001, "loss": 5.9789, "loss/crossentropy": 2.7079319953918457, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17319077253341675, "step": 23014 }, { "epoch": 0.71925, "grad_norm": 3.140625, "grad_norm_var": 0.015501912434895833, "learning_rate": 0.0001, "loss": 5.7121, "loss/crossentropy": 2.5766741037368774, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1705784946680069, "step": 23016 }, { "epoch": 0.7193125, "grad_norm": 3.15625, "grad_norm_var": 0.014557902018229167, "learning_rate": 0.0001, "loss": 5.5817, "loss/crossentropy": 2.5128093957901, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1623556911945343, "step": 23018 }, { "epoch": 0.719375, "grad_norm": 3.546875, "grad_norm_var": 0.025047810872395833, "learning_rate": 0.0001, "loss": 5.7667, "loss/crossentropy": 2.5243011713027954, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1718970388174057, "step": 23020 }, { "epoch": 0.7194375, "grad_norm": 3.03125, "grad_norm_var": 0.024290974934895834, "learning_rate": 0.0001, "loss": 5.6906, "loss/crossentropy": 2.5249842405319214, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16655787080526352, "step": 23022 }, { "epoch": 0.7195, "grad_norm": 3.171875, "grad_norm_var": 0.023656209309895832, "learning_rate": 0.0001, "loss": 5.4646, "loss/crossentropy": 2.458473801612854, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15803193300962448, "step": 23024 }, { "epoch": 0.7195625, "grad_norm": 3.25, "grad_norm_var": 0.03250325520833333, "learning_rate": 0.0001, "loss": 5.8432, "loss/crossentropy": 2.659113883972168, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16996963322162628, "step": 23026 }, { "epoch": 0.719625, "grad_norm": 2.96875, "grad_norm_var": 0.0338287353515625, "learning_rate": 0.0001, "loss": 5.2914, "loss/crossentropy": 2.321761965751648, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15204261988401413, "step": 23028 }, { "epoch": 0.7196875, "grad_norm": 3.140625, "grad_norm_var": 0.03336181640625, "learning_rate": 0.0001, "loss": 6.0239, "loss/crossentropy": 2.8034214973449707, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17204715311527252, "step": 23030 }, { "epoch": 0.71975, "grad_norm": 2.921875, "grad_norm_var": 0.04973958333333333, "learning_rate": 0.0001, "loss": 5.131, "loss/crossentropy": 2.263086676597595, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14381912350654602, "step": 23032 }, { "epoch": 0.7198125, "grad_norm": 3.296875, "grad_norm_var": 0.0476226806640625, "learning_rate": 0.0001, "loss": 5.7328, "loss/crossentropy": 2.5711982250213623, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16967156529426575, "step": 23034 }, { "epoch": 0.719875, "grad_norm": 3.125, "grad_norm_var": 0.038671875, "learning_rate": 0.0001, "loss": 6.0364, "loss/crossentropy": 2.778636336326599, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17655359208583832, "step": 23036 }, { "epoch": 0.7199375, "grad_norm": 3.25, "grad_norm_var": 0.036962890625, "learning_rate": 0.0001, "loss": 5.7919, "loss/crossentropy": 2.6048576831817627, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16909153014421463, "step": 23038 }, { "epoch": 0.72, "grad_norm": 3.25, "grad_norm_var": 0.039892578125, "learning_rate": 0.0001, "loss": 5.7048, "loss/crossentropy": 2.5766351222991943, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1667202264070511, "step": 23040 }, { "epoch": 0.7200625, "grad_norm": 3.453125, "grad_norm_var": 0.03515625, "learning_rate": 0.0001, "loss": 5.8284, "loss/crossentropy": 2.6341872215270996, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1705925315618515, "step": 23042 }, { "epoch": 0.720125, "grad_norm": 3.15625, "grad_norm_var": 0.033356730143229166, "learning_rate": 0.0001, "loss": 5.7009, "loss/crossentropy": 2.5701510906219482, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16073225438594818, "step": 23044 }, { "epoch": 0.7201875, "grad_norm": 3.21875, "grad_norm_var": 0.0351226806640625, "learning_rate": 0.0001, "loss": 5.5986, "loss/crossentropy": 2.5418559312820435, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1607527956366539, "step": 23046 }, { "epoch": 0.72025, "grad_norm": 3.5, "grad_norm_var": 0.035986328125, "learning_rate": 0.0001, "loss": 5.6231, "loss/crossentropy": 2.5080316066741943, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16580697894096375, "step": 23048 }, { "epoch": 0.7203125, "grad_norm": 2.984375, "grad_norm_var": 0.036454264322916666, "learning_rate": 0.0001, "loss": 5.7042, "loss/crossentropy": 2.5495556592941284, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16663891077041626, "step": 23050 }, { "epoch": 0.720375, "grad_norm": 3.125, "grad_norm_var": 0.03889872233072917, "learning_rate": 0.0001, "loss": 5.3949, "loss/crossentropy": 2.3359018564224243, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16019276529550552, "step": 23052 }, { "epoch": 0.7204375, "grad_norm": 3.0625, "grad_norm_var": 0.04053446451822917, "learning_rate": 0.0001, "loss": 5.4801, "loss/crossentropy": 2.403424859046936, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16079019010066986, "step": 23054 }, { "epoch": 0.7205, "grad_norm": 3.1875, "grad_norm_var": 0.0397125244140625, "learning_rate": 0.0001, "loss": 5.8981, "loss/crossentropy": 2.72353732585907, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17019527405500412, "step": 23056 }, { "epoch": 0.7205625, "grad_norm": 3.0, "grad_norm_var": 0.033040364583333336, "learning_rate": 0.0001, "loss": 5.618, "loss/crossentropy": 2.494266986846924, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1670648530125618, "step": 23058 }, { "epoch": 0.720625, "grad_norm": 2.921875, "grad_norm_var": 0.03543294270833333, "learning_rate": 0.0001, "loss": 5.8462, "loss/crossentropy": 2.6992437839508057, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16703598201274872, "step": 23060 }, { "epoch": 0.7206875, "grad_norm": 2.890625, "grad_norm_var": 0.035033162434895834, "learning_rate": 0.0001, "loss": 5.5348, "loss/crossentropy": 2.49991774559021, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15505250543355942, "step": 23062 }, { "epoch": 0.72075, "grad_norm": 3.453125, "grad_norm_var": 0.027684529622395832, "learning_rate": 0.0001, "loss": 5.797, "loss/crossentropy": 2.56563663482666, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1754772886633873, "step": 23064 }, { "epoch": 0.7208125, "grad_norm": 3.015625, "grad_norm_var": 0.027033487955729168, "learning_rate": 0.0001, "loss": 5.7839, "loss/crossentropy": 2.6267071962356567, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1696234866976738, "step": 23066 }, { "epoch": 0.720875, "grad_norm": 3.1875, "grad_norm_var": 0.024625651041666665, "learning_rate": 0.0001, "loss": 5.5554, "loss/crossentropy": 2.4868232011795044, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15998029708862305, "step": 23068 }, { "epoch": 0.7209375, "grad_norm": 3.109375, "grad_norm_var": 0.023639933268229166, "learning_rate": 0.0001, "loss": 5.5957, "loss/crossentropy": 2.4670718908309937, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16755016148090363, "step": 23070 }, { "epoch": 0.721, "grad_norm": 3.296875, "grad_norm_var": 0.07302144368489584, "learning_rate": 0.0001, "loss": 5.7273, "loss/crossentropy": 2.5400205850601196, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1687304750084877, "step": 23072 }, { "epoch": 0.7210625, "grad_norm": 3.203125, "grad_norm_var": 0.08131103515625, "learning_rate": 0.0001, "loss": 5.4401, "loss/crossentropy": 2.464422821998596, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15577441453933716, "step": 23074 }, { "epoch": 0.721125, "grad_norm": 3.4375, "grad_norm_var": 0.08234049479166666, "learning_rate": 0.0001, "loss": 5.5466, "loss/crossentropy": 2.4490153789520264, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1640603244304657, "step": 23076 }, { "epoch": 0.7211875, "grad_norm": 3.5, "grad_norm_var": 0.08513895670572917, "learning_rate": 0.0001, "loss": 5.6498, "loss/crossentropy": 2.5793803930282593, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16094836592674255, "step": 23078 }, { "epoch": 0.72125, "grad_norm": 3.203125, "grad_norm_var": 0.08606669108072916, "learning_rate": 0.0001, "loss": 5.512, "loss/crossentropy": 2.5071429014205933, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15517634898424149, "step": 23080 }, { "epoch": 0.7213125, "grad_norm": 3.328125, "grad_norm_var": 0.08509012858072916, "learning_rate": 0.0001, "loss": 5.8955, "loss/crossentropy": 2.6934951543807983, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1713682785630226, "step": 23082 }, { "epoch": 0.721375, "grad_norm": 3.40625, "grad_norm_var": 0.0873687744140625, "learning_rate": 0.0001, "loss": 5.7627, "loss/crossentropy": 2.5480767488479614, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16755439341068268, "step": 23084 }, { "epoch": 0.7214375, "grad_norm": 3.125, "grad_norm_var": 0.09079488118489583, "learning_rate": 0.0001, "loss": 5.6522, "loss/crossentropy": 2.550952911376953, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1648128256201744, "step": 23086 }, { "epoch": 0.7215, "grad_norm": 3.0625, "grad_norm_var": 0.041259765625, "learning_rate": 0.0001, "loss": 5.6769, "loss/crossentropy": 2.6425377130508423, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16124935448169708, "step": 23088 }, { "epoch": 0.7215625, "grad_norm": 2.890625, "grad_norm_var": 0.038117472330729166, "learning_rate": 0.0001, "loss": 5.2548, "loss/crossentropy": 2.326769709587097, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14905112981796265, "step": 23090 }, { "epoch": 0.721625, "grad_norm": 3.296875, "grad_norm_var": 0.03504130045572917, "learning_rate": 0.0001, "loss": 5.8017, "loss/crossentropy": 2.5980507135391235, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17114394903182983, "step": 23092 }, { "epoch": 0.7216875, "grad_norm": 2.96875, "grad_norm_var": 0.029173787434895834, "learning_rate": 0.0001, "loss": 5.8539, "loss/crossentropy": 2.746086359024048, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16351471841335297, "step": 23094 }, { "epoch": 0.72175, "grad_norm": 2.8125, "grad_norm_var": 0.0394195556640625, "learning_rate": 0.0001, "loss": 5.6244, "loss/crossentropy": 2.5009769201278687, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16703161597251892, "step": 23096 }, { "epoch": 0.7218125, "grad_norm": 3.203125, "grad_norm_var": 0.03814697265625, "learning_rate": 0.0001, "loss": 5.6159, "loss/crossentropy": 2.5001531839370728, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16547824442386627, "step": 23098 }, { "epoch": 0.721875, "grad_norm": 3.640625, "grad_norm_var": 0.05012105305989583, "learning_rate": 0.0001, "loss": 6.0574, "loss/crossentropy": 2.8432135581970215, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17649948596954346, "step": 23100 }, { "epoch": 0.7219375, "grad_norm": 3.09375, "grad_norm_var": 0.048502604166666664, "learning_rate": 0.0001, "loss": 5.727, "loss/crossentropy": 2.6021103858947754, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16678505390882492, "step": 23102 }, { "epoch": 0.722, "grad_norm": 3.359375, "grad_norm_var": 0.051102701822916666, "learning_rate": 0.0001, "loss": 5.8847, "loss/crossentropy": 2.7236799001693726, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16610267758369446, "step": 23104 }, { "epoch": 0.7220625, "grad_norm": 3.390625, "grad_norm_var": 0.0484283447265625, "learning_rate": 0.0001, "loss": 5.8858, "loss/crossentropy": 2.6859112977981567, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17311672121286392, "step": 23106 }, { "epoch": 0.722125, "grad_norm": 3.234375, "grad_norm_var": 0.05194905598958333, "learning_rate": 0.0001, "loss": 5.4255, "loss/crossentropy": 2.386235475540161, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16017398983240128, "step": 23108 }, { "epoch": 0.7221875, "grad_norm": 2.921875, "grad_norm_var": 0.05156148274739583, "learning_rate": 0.0001, "loss": 5.7212, "loss/crossentropy": 2.6586928367614746, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15937219560146332, "step": 23110 }, { "epoch": 0.72225, "grad_norm": 3.3125, "grad_norm_var": 0.042496744791666666, "learning_rate": 0.0001, "loss": 5.9033, "loss/crossentropy": 2.7014816999435425, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17252995818853378, "step": 23112 }, { "epoch": 0.7223125, "grad_norm": 3.296875, "grad_norm_var": 0.04475809733072917, "learning_rate": 0.0001, "loss": 5.7803, "loss/crossentropy": 2.649962306022644, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16420765966176987, "step": 23114 }, { "epoch": 0.722375, "grad_norm": 3.046875, "grad_norm_var": 0.030256144205729165, "learning_rate": 0.0001, "loss": 5.8306, "loss/crossentropy": 2.587769389152527, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1754559651017189, "step": 23116 }, { "epoch": 0.7224375, "grad_norm": 2.84375, "grad_norm_var": 0.035546875, "learning_rate": 0.0001, "loss": 5.527, "loss/crossentropy": 2.4696303606033325, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16003839671611786, "step": 23118 }, { "epoch": 0.7225, "grad_norm": 3.03125, "grad_norm_var": 0.03193359375, "learning_rate": 0.0001, "loss": 5.7403, "loss/crossentropy": 2.585333228111267, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16783996671438217, "step": 23120 }, { "epoch": 0.7225625, "grad_norm": 2.828125, "grad_norm_var": 0.030224609375, "learning_rate": 0.0001, "loss": 5.2633, "loss/crossentropy": 2.360242009162903, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.14812248200178146, "step": 23122 }, { "epoch": 0.722625, "grad_norm": 3.09375, "grad_norm_var": 0.02691650390625, "learning_rate": 0.0001, "loss": 5.5555, "loss/crossentropy": 2.5515612363815308, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1554725095629692, "step": 23124 }, { "epoch": 0.7226875, "grad_norm": 2.859375, "grad_norm_var": 0.0286041259765625, "learning_rate": 0.0001, "loss": 5.5207, "loss/crossentropy": 2.4835323095321655, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16270030289888382, "step": 23126 }, { "epoch": 0.72275, "grad_norm": 3.109375, "grad_norm_var": 0.024934895833333335, "learning_rate": 0.0001, "loss": 5.8374, "loss/crossentropy": 2.613251566886902, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17202463746070862, "step": 23128 }, { "epoch": 0.7228125, "grad_norm": 3.265625, "grad_norm_var": 0.0234771728515625, "learning_rate": 0.0001, "loss": 5.8513, "loss/crossentropy": 2.652708888053894, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17063865065574646, "step": 23130 }, { "epoch": 0.722875, "grad_norm": 3.78125, "grad_norm_var": 0.07051493326822916, "learning_rate": 0.0001, "loss": 6.1522, "loss/crossentropy": 2.759737491607666, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18807834386825562, "step": 23132 }, { "epoch": 0.7229375, "grad_norm": 3.03125, "grad_norm_var": 0.06780598958333334, "learning_rate": 0.0001, "loss": 5.2282, "loss/crossentropy": 2.3027509450912476, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1519230529665947, "step": 23134 }, { "epoch": 0.723, "grad_norm": 2.96875, "grad_norm_var": 0.07200419108072917, "learning_rate": 0.0001, "loss": 5.4546, "loss/crossentropy": 2.4205862283706665, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1569124236702919, "step": 23136 }, { "epoch": 0.7230625, "grad_norm": 2.96875, "grad_norm_var": 0.06336263020833334, "learning_rate": 0.0001, "loss": 5.4985, "loss/crossentropy": 2.4439892768859863, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1570119559764862, "step": 23138 }, { "epoch": 0.723125, "grad_norm": 2.96875, "grad_norm_var": 0.06421610514322916, "learning_rate": 0.0001, "loss": 5.3426, "loss/crossentropy": 2.3728432655334473, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15283336490392685, "step": 23140 }, { "epoch": 0.7231875, "grad_norm": 2.96875, "grad_norm_var": 0.06144917805989583, "learning_rate": 0.0001, "loss": 5.6772, "loss/crossentropy": 2.6229491233825684, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.164021298289299, "step": 23142 }, { "epoch": 0.72325, "grad_norm": 3.0, "grad_norm_var": 0.06145426432291667, "learning_rate": 0.0001, "loss": 5.6777, "loss/crossentropy": 2.547545075416565, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16887624561786652, "step": 23144 }, { "epoch": 0.7233125, "grad_norm": 3.25, "grad_norm_var": 0.0609527587890625, "learning_rate": 0.0001, "loss": 5.4967, "loss/crossentropy": 2.3959068059921265, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1608622595667839, "step": 23146 }, { "epoch": 0.723375, "grad_norm": 3.140625, "grad_norm_var": 0.0107818603515625, "learning_rate": 0.0001, "loss": 5.9147, "loss/crossentropy": 2.743956446647644, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1705934852361679, "step": 23148 }, { "epoch": 0.7234375, "grad_norm": 2.90625, "grad_norm_var": 0.01343994140625, "learning_rate": 0.0001, "loss": 5.49, "loss/crossentropy": 2.5214651823043823, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1531044840812683, "step": 23150 }, { "epoch": 0.7235, "grad_norm": 3.015625, "grad_norm_var": 0.012581380208333333, "learning_rate": 0.0001, "loss": 5.5244, "loss/crossentropy": 2.451177477836609, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1584925651550293, "step": 23152 }, { "epoch": 0.7235625, "grad_norm": 3.140625, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 5.9048, "loss/crossentropy": 2.7149475812911987, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17132603377103806, "step": 23154 }, { "epoch": 0.723625, "grad_norm": 3.234375, "grad_norm_var": 0.023827107747395833, "learning_rate": 0.0001, "loss": 5.7513, "loss/crossentropy": 2.6011768579483032, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1665768250823021, "step": 23156 }, { "epoch": 0.7236875, "grad_norm": 2.828125, "grad_norm_var": 0.027864583333333335, "learning_rate": 0.0001, "loss": 5.705, "loss/crossentropy": 2.6476725339889526, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1592516303062439, "step": 23158 }, { "epoch": 0.72375, "grad_norm": 3.125, "grad_norm_var": 0.028791300455729165, "learning_rate": 0.0001, "loss": 5.6737, "loss/crossentropy": 2.561895489692688, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16157186031341553, "step": 23160 }, { "epoch": 0.7238125, "grad_norm": 3.203125, "grad_norm_var": 0.028547159830729165, "learning_rate": 0.0001, "loss": 5.5481, "loss/crossentropy": 2.5043612718582153, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1610122099518776, "step": 23162 }, { "epoch": 0.723875, "grad_norm": 3.1875, "grad_norm_var": 0.030028279622395834, "learning_rate": 0.0001, "loss": 5.5372, "loss/crossentropy": 2.4577651023864746, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1618458554148674, "step": 23164 }, { "epoch": 0.7239375, "grad_norm": 3.25, "grad_norm_var": 0.026463826497395832, "learning_rate": 0.0001, "loss": 5.8778, "loss/crossentropy": 2.692238926887512, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17090285569429398, "step": 23166 }, { "epoch": 0.724, "grad_norm": 3.484375, "grad_norm_var": 0.03661702473958333, "learning_rate": 0.0001, "loss": 5.5408, "loss/crossentropy": 2.447907328605652, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1631990671157837, "step": 23168 }, { "epoch": 0.7240625, "grad_norm": 3.125, "grad_norm_var": 0.03411051432291667, "learning_rate": 0.0001, "loss": 5.4933, "loss/crossentropy": 2.3692781925201416, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1655312478542328, "step": 23170 }, { "epoch": 0.724125, "grad_norm": 3.4375, "grad_norm_var": 0.03964436848958333, "learning_rate": 0.0001, "loss": 5.6876, "loss/crossentropy": 2.4974652528762817, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1705765798687935, "step": 23172 }, { "epoch": 0.7241875, "grad_norm": 2.921875, "grad_norm_var": 0.03624674479166667, "learning_rate": 0.0001, "loss": 5.4289, "loss/crossentropy": 2.3903775215148926, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15892648696899414, "step": 23174 }, { "epoch": 0.72425, "grad_norm": 3.109375, "grad_norm_var": 0.03912353515625, "learning_rate": 0.0001, "loss": 5.6177, "loss/crossentropy": 2.5819114446640015, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15670160949230194, "step": 23176 }, { "epoch": 0.7243125, "grad_norm": 3.65625, "grad_norm_var": 0.053766886393229164, "learning_rate": 0.0001, "loss": 6.3302, "loss/crossentropy": 2.925212264060974, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.18541866540908813, "step": 23178 }, { "epoch": 0.724375, "grad_norm": 2.890625, "grad_norm_var": 0.05998433430989583, "learning_rate": 0.0001, "loss": 5.3544, "loss/crossentropy": 2.3588887453079224, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15540842711925507, "step": 23180 }, { "epoch": 0.7244375, "grad_norm": 2.9375, "grad_norm_var": 0.05816650390625, "learning_rate": 0.0001, "loss": 5.5821, "loss/crossentropy": 2.573221445083618, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15987595170736313, "step": 23182 }, { "epoch": 0.7245, "grad_norm": 3.15625, "grad_norm_var": 0.04605712890625, "learning_rate": 0.0001, "loss": 5.5574, "loss/crossentropy": 2.514816403388977, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15933258831501007, "step": 23184 }, { "epoch": 0.7245625, "grad_norm": 3.1875, "grad_norm_var": 0.04547119140625, "learning_rate": 0.0001, "loss": 5.7732, "loss/crossentropy": 2.653873920440674, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1650610715150833, "step": 23186 }, { "epoch": 0.724625, "grad_norm": 3.203125, "grad_norm_var": 0.036595662434895836, "learning_rate": 0.0001, "loss": 5.5357, "loss/crossentropy": 2.4859979152679443, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16043585538864136, "step": 23188 }, { "epoch": 0.7246875, "grad_norm": 2.890625, "grad_norm_var": 0.038752237955729164, "learning_rate": 0.0001, "loss": 5.7962, "loss/crossentropy": 2.7121152877807617, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16543519496917725, "step": 23190 }, { "epoch": 0.72475, "grad_norm": 3.0625, "grad_norm_var": 0.03465067545572917, "learning_rate": 0.0001, "loss": 5.5116, "loss/crossentropy": 2.459586977958679, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15754922479391098, "step": 23192 }, { "epoch": 0.7248125, "grad_norm": 3.28125, "grad_norm_var": 0.0182525634765625, "learning_rate": 0.0001, "loss": 5.8184, "loss/crossentropy": 2.695042848587036, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16624674201011658, "step": 23194 }, { "epoch": 0.724875, "grad_norm": 3.28125, "grad_norm_var": 0.0158843994140625, "learning_rate": 0.0001, "loss": 5.6659, "loss/crossentropy": 2.5744264125823975, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1610967516899109, "step": 23196 }, { "epoch": 0.7249375, "grad_norm": 3.171875, "grad_norm_var": 0.01304931640625, "learning_rate": 0.0001, "loss": 5.9551, "loss/crossentropy": 2.687638759613037, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1787019520998001, "step": 23198 }, { "epoch": 0.725, "grad_norm": 2.890625, "grad_norm_var": 0.0178375244140625, "learning_rate": 0.0001, "loss": 5.509, "loss/crossentropy": 2.4411755800247192, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1587366759777069, "step": 23200 }, { "epoch": 0.7250625, "grad_norm": 3.015625, "grad_norm_var": 0.018244425455729168, "learning_rate": 0.0001, "loss": 5.7031, "loss/crossentropy": 2.5218595266342163, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16733865439891815, "step": 23202 }, { "epoch": 0.725125, "grad_norm": 3.0625, "grad_norm_var": 0.03420308430989583, "learning_rate": 0.0001, "loss": 5.2764, "loss/crossentropy": 2.259980320930481, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15359201282262802, "step": 23204 }, { "epoch": 0.7251875, "grad_norm": 3.203125, "grad_norm_var": 0.027534993489583333, "learning_rate": 0.0001, "loss": 5.5431, "loss/crossentropy": 2.405370593070984, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16455841064453125, "step": 23206 }, { "epoch": 0.72525, "grad_norm": 2.890625, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 5.5312, "loss/crossentropy": 2.502629280090332, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15676090121269226, "step": 23208 }, { "epoch": 0.7253125, "grad_norm": 2.90625, "grad_norm_var": 0.04399312337239583, "learning_rate": 0.0001, "loss": 5.3974, "loss/crossentropy": 2.4476083517074585, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.149665467441082, "step": 23210 }, { "epoch": 0.725375, "grad_norm": 3.0, "grad_norm_var": 0.042561848958333336, "learning_rate": 0.0001, "loss": 5.7794, "loss/crossentropy": 2.648160219192505, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1685919389128685, "step": 23212 }, { "epoch": 0.7254375, "grad_norm": 3.203125, "grad_norm_var": 0.041910807291666664, "learning_rate": 0.0001, "loss": 5.6714, "loss/crossentropy": 2.575753092765808, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16190358996391296, "step": 23214 }, { "epoch": 0.7255, "grad_norm": 3.0625, "grad_norm_var": 0.04676005045572917, "learning_rate": 0.0001, "loss": 5.3755, "loss/crossentropy": 2.368937611579895, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15690668672323227, "step": 23216 }, { "epoch": 0.7255625, "grad_norm": 2.953125, "grad_norm_var": 0.05066630045572917, "learning_rate": 0.0001, "loss": 5.6621, "loss/crossentropy": 2.5506527423858643, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16817519813776016, "step": 23218 }, { "epoch": 0.725625, "grad_norm": 3.296875, "grad_norm_var": 0.03635965983072917, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.510103940963745, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16484995186328888, "step": 23220 }, { "epoch": 0.7256875, "grad_norm": 2.9375, "grad_norm_var": 0.0321929931640625, "learning_rate": 0.0001, "loss": 5.5839, "loss/crossentropy": 2.5797786712646484, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1527549773454666, "step": 23222 }, { "epoch": 0.72575, "grad_norm": 2.921875, "grad_norm_var": 0.0326171875, "learning_rate": 0.0001, "loss": 5.7047, "loss/crossentropy": 2.5932576656341553, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16504817456007004, "step": 23224 }, { "epoch": 0.7258125, "grad_norm": 2.953125, "grad_norm_var": 0.0533843994140625, "learning_rate": 0.0001, "loss": 6.1421, "loss/crossentropy": 2.8035610914230347, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18385039269924164, "step": 23226 }, { "epoch": 0.725875, "grad_norm": 5.3125, "grad_norm_var": 0.36940104166666665, "learning_rate": 0.0001, "loss": 5.7915, "loss/crossentropy": 2.585119843482971, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17689155787229538, "step": 23228 }, { "epoch": 0.7259375, "grad_norm": 3.15625, "grad_norm_var": 0.3672841389973958, "learning_rate": 0.0001, "loss": 5.5665, "loss/crossentropy": 2.4747084379196167, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1634795442223549, "step": 23230 }, { "epoch": 0.726, "grad_norm": 3.09375, "grad_norm_var": 0.34823811848958336, "learning_rate": 0.0001, "loss": 5.7145, "loss/crossentropy": 2.5751774311065674, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16861557960510254, "step": 23232 }, { "epoch": 0.7260625, "grad_norm": 3.421875, "grad_norm_var": 0.3407216389973958, "learning_rate": 0.0001, "loss": 5.8606, "loss/crossentropy": 2.624040961265564, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1763881966471672, "step": 23234 }, { "epoch": 0.726125, "grad_norm": 3.515625, "grad_norm_var": 0.3234771728515625, "learning_rate": 0.0001, "loss": 5.5971, "loss/crossentropy": 2.530275583267212, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1652732416987419, "step": 23236 }, { "epoch": 0.7261875, "grad_norm": 2.796875, "grad_norm_var": 0.3291656494140625, "learning_rate": 0.0001, "loss": 5.5069, "loss/crossentropy": 2.53396999835968, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1558818519115448, "step": 23238 }, { "epoch": 0.72625, "grad_norm": 3.21875, "grad_norm_var": 0.32779541015625, "learning_rate": 0.0001, "loss": 5.6379, "loss/crossentropy": 2.5906230211257935, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1594114750623703, "step": 23240 }, { "epoch": 0.7263125, "grad_norm": 3.0, "grad_norm_var": 0.3327301025390625, "learning_rate": 0.0001, "loss": 5.4127, "loss/crossentropy": 2.444182872772217, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15310557186603546, "step": 23242 }, { "epoch": 0.726375, "grad_norm": 3.28125, "grad_norm_var": 0.045751953125, "learning_rate": 0.0001, "loss": 5.7193, "loss/crossentropy": 2.512703776359558, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16948899626731873, "step": 23244 }, { "epoch": 0.7264375, "grad_norm": 3.3125, "grad_norm_var": 0.043473307291666666, "learning_rate": 0.0001, "loss": 5.3024, "loss/crossentropy": 2.282405376434326, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.15082381665706635, "step": 23246 }, { "epoch": 0.7265, "grad_norm": 3.109375, "grad_norm_var": 0.0408843994140625, "learning_rate": 0.0001, "loss": 5.4735, "loss/crossentropy": 2.376451849937439, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16009971499443054, "step": 23248 }, { "epoch": 0.7265625, "grad_norm": 2.90625, "grad_norm_var": 0.03740132649739583, "learning_rate": 0.0001, "loss": 5.4683, "loss/crossentropy": 2.4280524253845215, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15870977938175201, "step": 23250 }, { "epoch": 0.726625, "grad_norm": 3.375, "grad_norm_var": 0.030269368489583334, "learning_rate": 0.0001, "loss": 5.5973, "loss/crossentropy": 2.4845110177993774, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16245494782924652, "step": 23252 }, { "epoch": 0.7266875, "grad_norm": 3.34375, "grad_norm_var": 0.05828450520833333, "learning_rate": 0.0001, "loss": 5.7597, "loss/crossentropy": 2.5968947410583496, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16823583096265793, "step": 23254 }, { "epoch": 0.72675, "grad_norm": 3.265625, "grad_norm_var": 0.055403645833333334, "learning_rate": 0.0001, "loss": 5.709, "loss/crossentropy": 2.5472995042800903, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16929778456687927, "step": 23256 }, { "epoch": 0.7268125, "grad_norm": 3.25, "grad_norm_var": 0.04604390462239583, "learning_rate": 0.0001, "loss": 5.5923, "loss/crossentropy": 2.472580909729004, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1639285385608673, "step": 23258 }, { "epoch": 0.726875, "grad_norm": 3.234375, "grad_norm_var": 0.044286092122395836, "learning_rate": 0.0001, "loss": 5.7344, "loss/crossentropy": 2.5646402835845947, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17088643461465836, "step": 23260 }, { "epoch": 0.7269375, "grad_norm": 3.09375, "grad_norm_var": 0.04517313639322917, "learning_rate": 0.0001, "loss": 5.3972, "loss/crossentropy": 2.3815526962280273, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15859290957450867, "step": 23262 }, { "epoch": 0.727, "grad_norm": 3.5625, "grad_norm_var": 0.05178629557291667, "learning_rate": 0.0001, "loss": 5.6886, "loss/crossentropy": 2.453500509262085, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17272623628377914, "step": 23264 }, { "epoch": 0.7270625, "grad_norm": 2.96875, "grad_norm_var": 0.04517822265625, "learning_rate": 0.0001, "loss": 5.7463, "loss/crossentropy": 2.564083695411682, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1674373894929886, "step": 23266 }, { "epoch": 0.727125, "grad_norm": 3.171875, "grad_norm_var": 0.0429595947265625, "learning_rate": 0.0001, "loss": 5.895, "loss/crossentropy": 2.712197422981262, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17179730534553528, "step": 23268 }, { "epoch": 0.7271875, "grad_norm": 3.203125, "grad_norm_var": 0.024039713541666667, "learning_rate": 0.0001, "loss": 5.8484, "loss/crossentropy": 2.683329939842224, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1672929972410202, "step": 23270 }, { "epoch": 0.72725, "grad_norm": 3.140625, "grad_norm_var": 0.03157450358072917, "learning_rate": 0.0001, "loss": 5.6134, "loss/crossentropy": 2.6072771549224854, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1560782864689827, "step": 23272 }, { "epoch": 0.7273125, "grad_norm": 3.0, "grad_norm_var": 0.032892862955729164, "learning_rate": 0.0001, "loss": 5.6551, "loss/crossentropy": 2.530093789100647, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16874957084655762, "step": 23274 }, { "epoch": 0.727375, "grad_norm": 3.265625, "grad_norm_var": 0.03328348795572917, "learning_rate": 0.0001, "loss": 5.6012, "loss/crossentropy": 2.4686496257781982, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16286510229110718, "step": 23276 }, { "epoch": 0.7274375, "grad_norm": 3.109375, "grad_norm_var": 0.0338043212890625, "learning_rate": 0.0001, "loss": 5.5035, "loss/crossentropy": 2.4162285327911377, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15989916026592255, "step": 23278 }, { "epoch": 0.7275, "grad_norm": 2.875, "grad_norm_var": 0.027197265625, "learning_rate": 0.0001, "loss": 5.7379, "loss/crossentropy": 2.6118892431259155, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1657288670539856, "step": 23280 }, { "epoch": 0.7275625, "grad_norm": 3.046875, "grad_norm_var": 0.023193359375, "learning_rate": 0.0001, "loss": 5.563, "loss/crossentropy": 2.5222445726394653, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.161496102809906, "step": 23282 }, { "epoch": 0.727625, "grad_norm": 3.0625, "grad_norm_var": 0.030663045247395833, "learning_rate": 0.0001, "loss": 5.8126, "loss/crossentropy": 2.6276007890701294, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16889140009880066, "step": 23284 }, { "epoch": 0.7276875, "grad_norm": 3.046875, "grad_norm_var": 0.022728474934895833, "learning_rate": 0.0001, "loss": 5.7388, "loss/crossentropy": 2.597113609313965, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16729465126991272, "step": 23286 }, { "epoch": 0.72775, "grad_norm": 2.890625, "grad_norm_var": 0.022752888997395835, "learning_rate": 0.0001, "loss": 5.9324, "loss/crossentropy": 2.7669402360916138, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1669377014040947, "step": 23288 }, { "epoch": 0.7278125, "grad_norm": 3.109375, "grad_norm_var": 0.023924763997395834, "learning_rate": 0.0001, "loss": 5.6937, "loss/crossentropy": 2.5009844303131104, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1708378866314888, "step": 23290 }, { "epoch": 0.727875, "grad_norm": 3.1875, "grad_norm_var": 7.334375, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.488111734390259, "loss/hidden": 1.59765625, "loss/jsd": 0.0, "loss/logits": 0.1614990085363388, "step": 23292 }, { "epoch": 0.7279375, "grad_norm": 3.015625, "grad_norm_var": 7.3345703125, "learning_rate": 0.0001, "loss": 5.703, "loss/crossentropy": 2.5763041973114014, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16813740134239197, "step": 23294 }, { "epoch": 0.728, "grad_norm": 3.140625, "grad_norm_var": 7.324169921875, "learning_rate": 0.0001, "loss": 5.7199, "loss/crossentropy": 2.588278889656067, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16589736938476562, "step": 23296 }, { "epoch": 0.7280625, "grad_norm": 2.890625, "grad_norm_var": 7.303023274739584, "learning_rate": 0.0001, "loss": 5.5376, "loss/crossentropy": 2.4617420434951782, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15797995775938034, "step": 23298 }, { "epoch": 0.728125, "grad_norm": 3.421875, "grad_norm_var": 7.318310546875, "learning_rate": 0.0001, "loss": 5.9341, "loss/crossentropy": 2.7568886280059814, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16889671981334686, "step": 23300 }, { "epoch": 0.7281875, "grad_norm": 3.0625, "grad_norm_var": 7.294266764322916, "learning_rate": 0.0001, "loss": 6.0928, "loss/crossentropy": 2.799323797225952, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17778201401233673, "step": 23302 }, { "epoch": 0.72825, "grad_norm": 3.453125, "grad_norm_var": 7.282840983072917, "learning_rate": 0.0001, "loss": 5.9203, "loss/crossentropy": 2.6988720893859863, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17448870837688446, "step": 23304 }, { "epoch": 0.7283125, "grad_norm": 3.203125, "grad_norm_var": 7.276558430989583, "learning_rate": 0.0001, "loss": 6.1345, "loss/crossentropy": 2.802777409553528, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1819995865225792, "step": 23306 }, { "epoch": 0.728375, "grad_norm": 3.5625, "grad_norm_var": 0.0599609375, "learning_rate": 0.0001, "loss": 5.6137, "loss/crossentropy": 2.3838824033737183, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17298342287540436, "step": 23308 }, { "epoch": 0.7284375, "grad_norm": 3.078125, "grad_norm_var": 0.06024983723958333, "learning_rate": 0.0001, "loss": 5.7482, "loss/crossentropy": 2.623828649520874, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16673285514116287, "step": 23310 }, { "epoch": 0.7285, "grad_norm": 3.4375, "grad_norm_var": 0.06101786295572917, "learning_rate": 0.0001, "loss": 5.9586, "loss/crossentropy": 2.7022147178649902, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1752513349056244, "step": 23312 }, { "epoch": 0.7285625, "grad_norm": 3.0, "grad_norm_var": 0.050537109375, "learning_rate": 0.0001, "loss": 5.7232, "loss/crossentropy": 2.609552264213562, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1660483181476593, "step": 23314 }, { "epoch": 0.728625, "grad_norm": 3.09375, "grad_norm_var": 0.0460357666015625, "learning_rate": 0.0001, "loss": 5.4811, "loss/crossentropy": 2.451301693916321, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15844624489545822, "step": 23316 }, { "epoch": 0.7286875, "grad_norm": 3.328125, "grad_norm_var": 0.03616536458333333, "learning_rate": 0.0001, "loss": 5.6874, "loss/crossentropy": 2.5450918674468994, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16501231491565704, "step": 23318 }, { "epoch": 0.72875, "grad_norm": 3.09375, "grad_norm_var": 0.048981730143229166, "learning_rate": 0.0001, "loss": 6.0266, "loss/crossentropy": 2.7788890600204468, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17399199306964874, "step": 23320 }, { "epoch": 0.7288125, "grad_norm": 3.125, "grad_norm_var": 0.049779256184895836, "learning_rate": 0.0001, "loss": 5.6936, "loss/crossentropy": 2.529011607170105, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.17231817543506622, "step": 23322 }, { "epoch": 0.728875, "grad_norm": 3.328125, "grad_norm_var": 0.04312744140625, "learning_rate": 0.0001, "loss": 5.8842, "loss/crossentropy": 2.744863271713257, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16705384105443954, "step": 23324 }, { "epoch": 0.7289375, "grad_norm": 3.140625, "grad_norm_var": 0.040990193684895836, "learning_rate": 0.0001, "loss": 5.823, "loss/crossentropy": 2.595468759536743, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1715800240635872, "step": 23326 }, { "epoch": 0.729, "grad_norm": 3.546875, "grad_norm_var": 0.045750935872395836, "learning_rate": 0.0001, "loss": 5.9501, "loss/crossentropy": 2.647188663482666, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.18029607087373734, "step": 23328 }, { "epoch": 0.7290625, "grad_norm": 2.921875, "grad_norm_var": 0.051106770833333336, "learning_rate": 0.0001, "loss": 5.6685, "loss/crossentropy": 2.650219440460205, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15885566920042038, "step": 23330 }, { "epoch": 0.729125, "grad_norm": 3.046875, "grad_norm_var": 0.15070699055989584, "learning_rate": 0.0001, "loss": 5.7041, "loss/crossentropy": 2.5037001371383667, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.1680903658270836, "step": 23332 }, { "epoch": 0.7291875, "grad_norm": 3.0, "grad_norm_var": 0.15506184895833333, "learning_rate": 0.0001, "loss": 5.6046, "loss/crossentropy": 2.51932156085968, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16399675607681274, "step": 23334 }, { "epoch": 0.72925, "grad_norm": 2.96875, "grad_norm_var": 0.19091389973958334, "learning_rate": 0.0001, "loss": 6.0395, "loss/crossentropy": 2.728891968727112, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17871622741222382, "step": 23336 }, { "epoch": 0.7293125, "grad_norm": 3.625, "grad_norm_var": 0.20361226399739582, "learning_rate": 0.0001, "loss": 5.6939, "loss/crossentropy": 2.4913874864578247, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17024771869182587, "step": 23338 }, { "epoch": 0.729375, "grad_norm": 2.828125, "grad_norm_var": 0.22463785807291667, "learning_rate": 0.0001, "loss": 5.5474, "loss/crossentropy": 2.4907031059265137, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15801078081130981, "step": 23340 }, { "epoch": 0.7294375, "grad_norm": 2.859375, "grad_norm_var": 0.2403228759765625, "learning_rate": 0.0001, "loss": 5.5717, "loss/crossentropy": 2.578675150871277, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1563306376338005, "step": 23342 }, { "epoch": 0.7295, "grad_norm": 3.03125, "grad_norm_var": 0.23493550618489584, "learning_rate": 0.0001, "loss": 5.4622, "loss/crossentropy": 2.441346287727356, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16028911620378494, "step": 23344 }, { "epoch": 0.7295625, "grad_norm": 2.8125, "grad_norm_var": 0.24150390625, "learning_rate": 0.0001, "loss": 5.6568, "loss/crossentropy": 2.57526171207428, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16167286783456802, "step": 23346 }, { "epoch": 0.729625, "grad_norm": 3.015625, "grad_norm_var": 0.1181304931640625, "learning_rate": 0.0001, "loss": 5.8356, "loss/crossentropy": 2.71990966796875, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16391023993492126, "step": 23348 }, { "epoch": 0.7296875, "grad_norm": 4.21875, "grad_norm_var": 0.20810139973958333, "learning_rate": 0.0001, "loss": 5.5542, "loss/crossentropy": 2.486966371536255, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16492543369531631, "step": 23350 }, { "epoch": 0.72975, "grad_norm": 3.171875, "grad_norm_var": 0.13358968098958332, "learning_rate": 0.0001, "loss": 5.5869, "loss/crossentropy": 2.454864978790283, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16398197412490845, "step": 23352 }, { "epoch": 0.7298125, "grad_norm": 2.9375, "grad_norm_var": 0.11433817545572916, "learning_rate": 0.0001, "loss": 5.6214, "loss/crossentropy": 2.5570112466812134, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1642509251832962, "step": 23354 }, { "epoch": 0.729875, "grad_norm": 3.140625, "grad_norm_var": 0.11079813639322916, "learning_rate": 0.0001, "loss": 5.3264, "loss/crossentropy": 2.3819743394851685, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14678432047367096, "step": 23356 }, { "epoch": 0.7299375, "grad_norm": 3.34375, "grad_norm_var": 0.11305338541666667, "learning_rate": 0.0001, "loss": 6.0956, "loss/crossentropy": 2.7229779958724976, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.18179063498973846, "step": 23358 }, { "epoch": 0.73, "grad_norm": 2.8125, "grad_norm_var": 0.1189361572265625, "learning_rate": 0.0001, "loss": 5.5458, "loss/crossentropy": 2.601516842842102, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1491125375032425, "step": 23360 }, { "epoch": 0.7300625, "grad_norm": 3.703125, "grad_norm_var": 0.13271484375, "learning_rate": 0.0001, "loss": 6.0007, "loss/crossentropy": 2.764381527900696, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17246322333812714, "step": 23362 }, { "epoch": 0.730125, "grad_norm": 3.265625, "grad_norm_var": 0.12694905598958334, "learning_rate": 0.0001, "loss": 5.541, "loss/crossentropy": 2.4775807857513428, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15946689993143082, "step": 23364 }, { "epoch": 0.7301875, "grad_norm": 2.921875, "grad_norm_var": 0.056761678059895834, "learning_rate": 0.0001, "loss": 5.8493, "loss/crossentropy": 2.7315409183502197, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16567786782979965, "step": 23366 }, { "epoch": 0.73025, "grad_norm": 3.203125, "grad_norm_var": 0.054011027018229164, "learning_rate": 0.0001, "loss": 5.5005, "loss/crossentropy": 2.474494218826294, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15455739945173264, "step": 23368 }, { "epoch": 0.7303125, "grad_norm": 3.109375, "grad_norm_var": 0.051146443684895834, "learning_rate": 0.0001, "loss": 5.5954, "loss/crossentropy": 2.527936339378357, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15908553451299667, "step": 23370 }, { "epoch": 0.730375, "grad_norm": 2.953125, "grad_norm_var": 0.06220601399739583, "learning_rate": 0.0001, "loss": 5.682, "loss/crossentropy": 2.630681872367859, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16059622168540955, "step": 23372 }, { "epoch": 0.7304375, "grad_norm": 3.140625, "grad_norm_var": 0.058203125, "learning_rate": 0.0001, "loss": 5.5371, "loss/crossentropy": 2.4286952018737793, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1651330292224884, "step": 23374 }, { "epoch": 0.7305, "grad_norm": 3.3125, "grad_norm_var": 0.049657185872395836, "learning_rate": 0.0001, "loss": 5.9044, "loss/crossentropy": 2.8086371421813965, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16504616290330887, "step": 23376 }, { "epoch": 0.7305625, "grad_norm": 2.96875, "grad_norm_var": 0.027098592122395834, "learning_rate": 0.0001, "loss": 5.4529, "loss/crossentropy": 2.361377239227295, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16032811254262924, "step": 23378 }, { "epoch": 0.730625, "grad_norm": 3.296875, "grad_norm_var": 0.028218587239583332, "learning_rate": 0.0001, "loss": 5.5693, "loss/crossentropy": 2.4290562868118286, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16558928787708282, "step": 23380 }, { "epoch": 0.7306875, "grad_norm": 3.0625, "grad_norm_var": 0.015404256184895833, "learning_rate": 0.0001, "loss": 5.4597, "loss/crossentropy": 2.3813695907592773, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16213400661945343, "step": 23382 }, { "epoch": 0.73075, "grad_norm": 2.8125, "grad_norm_var": 0.0199859619140625, "learning_rate": 0.0001, "loss": 5.7522, "loss/crossentropy": 2.689029335975647, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1610071212053299, "step": 23384 }, { "epoch": 0.7308125, "grad_norm": 3.15625, "grad_norm_var": 0.024689737955729166, "learning_rate": 0.0001, "loss": 5.808, "loss/crossentropy": 2.6453691720962524, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16899652779102325, "step": 23386 }, { "epoch": 0.730875, "grad_norm": 3.375, "grad_norm_var": 0.022477213541666666, "learning_rate": 0.0001, "loss": 5.8856, "loss/crossentropy": 2.7170687913894653, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16880813241004944, "step": 23388 }, { "epoch": 0.7309375, "grad_norm": 3.203125, "grad_norm_var": 0.13961181640625, "learning_rate": 0.0001, "loss": 5.5278, "loss/crossentropy": 2.4707634449005127, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15843693166971207, "step": 23390 }, { "epoch": 0.731, "grad_norm": 3.109375, "grad_norm_var": 0.1425689697265625, "learning_rate": 0.0001, "loss": 5.2349, "loss/crossentropy": 2.2763832807540894, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14780624210834503, "step": 23392 }, { "epoch": 0.7310625, "grad_norm": 3.125, "grad_norm_var": 0.13893941243489583, "learning_rate": 0.0001, "loss": 5.5035, "loss/crossentropy": 2.481665849685669, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15531041473150253, "step": 23394 }, { "epoch": 0.731125, "grad_norm": 3.109375, "grad_norm_var": 0.13613179524739583, "learning_rate": 0.0001, "loss": 5.7716, "loss/crossentropy": 2.6283375024795532, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1682298630475998, "step": 23396 }, { "epoch": 0.7311875, "grad_norm": 3.203125, "grad_norm_var": 0.13552144368489583, "learning_rate": 0.0001, "loss": 5.5228, "loss/crossentropy": 2.4387375116348267, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1619235798716545, "step": 23398 }, { "epoch": 0.73125, "grad_norm": 3.0, "grad_norm_var": 0.12942708333333333, "learning_rate": 0.0001, "loss": 5.7851, "loss/crossentropy": 2.7239224910736084, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16197381168603897, "step": 23400 }, { "epoch": 0.7313125, "grad_norm": 3.375, "grad_norm_var": 0.13816630045572917, "learning_rate": 0.0001, "loss": 5.3917, "loss/crossentropy": 2.3603626489639282, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15938717126846313, "step": 23402 }, { "epoch": 0.731375, "grad_norm": 3.3125, "grad_norm_var": 0.13778889973958333, "learning_rate": 0.0001, "loss": 5.7806, "loss/crossentropy": 2.5797680616378784, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1739886999130249, "step": 23404 }, { "epoch": 0.7314375, "grad_norm": 3.296875, "grad_norm_var": 0.020091756184895834, "learning_rate": 0.0001, "loss": 5.5676, "loss/crossentropy": 2.4198325872421265, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16438789665699005, "step": 23406 }, { "epoch": 0.7315, "grad_norm": 2.921875, "grad_norm_var": 0.025972493489583335, "learning_rate": 0.0001, "loss": 5.3688, "loss/crossentropy": 2.426826000213623, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15357258170843124, "step": 23408 }, { "epoch": 0.7315625, "grad_norm": 3.390625, "grad_norm_var": 0.030855305989583335, "learning_rate": 0.0001, "loss": 5.9291, "loss/crossentropy": 2.6948466300964355, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17616183310747147, "step": 23410 }, { "epoch": 0.731625, "grad_norm": 3.078125, "grad_norm_var": 0.035791015625, "learning_rate": 0.0001, "loss": 5.867, "loss/crossentropy": 2.759811282157898, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1642349362373352, "step": 23412 }, { "epoch": 0.7316875, "grad_norm": 3.375, "grad_norm_var": 0.038960774739583336, "learning_rate": 0.0001, "loss": 5.2287, "loss/crossentropy": 2.2166020274162292, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1519937738776207, "step": 23414 }, { "epoch": 0.73175, "grad_norm": 2.859375, "grad_norm_var": 0.04243062337239583, "learning_rate": 0.0001, "loss": 5.703, "loss/crossentropy": 2.632703423500061, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16405853629112244, "step": 23416 }, { "epoch": 0.7318125, "grad_norm": 2.96875, "grad_norm_var": 0.036149088541666666, "learning_rate": 0.0001, "loss": 5.0297, "loss/crossentropy": 2.118439018726349, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14307677745819092, "step": 23418 }, { "epoch": 0.731875, "grad_norm": 3.21875, "grad_norm_var": 0.03445638020833333, "learning_rate": 0.0001, "loss": 5.8998, "loss/crossentropy": 2.6661545038223267, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17297054082155228, "step": 23420 }, { "epoch": 0.7319375, "grad_norm": 3.109375, "grad_norm_var": 0.0317535400390625, "learning_rate": 0.0001, "loss": 5.8514, "loss/crossentropy": 2.6101226806640625, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17647041380405426, "step": 23422 }, { "epoch": 0.732, "grad_norm": 3.1875, "grad_norm_var": 0.028023274739583333, "learning_rate": 0.0001, "loss": 5.6823, "loss/crossentropy": 2.5592458248138428, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16425611078739166, "step": 23424 }, { "epoch": 0.7320625, "grad_norm": 3.1875, "grad_norm_var": 0.024095662434895835, "learning_rate": 0.0001, "loss": 5.7849, "loss/crossentropy": 2.5620510578155518, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17423319816589355, "step": 23426 }, { "epoch": 0.732125, "grad_norm": 2.90625, "grad_norm_var": 0.0216461181640625, "learning_rate": 0.0001, "loss": 5.4815, "loss/crossentropy": 2.4160321950912476, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16201941668987274, "step": 23428 }, { "epoch": 0.7321875, "grad_norm": 3.03125, "grad_norm_var": 0.0192291259765625, "learning_rate": 0.0001, "loss": 5.7321, "loss/crossentropy": 2.6281731128692627, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1635151132941246, "step": 23430 }, { "epoch": 0.73225, "grad_norm": 3.046875, "grad_norm_var": 0.015380859375, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.573588252067566, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16768812388181686, "step": 23432 }, { "epoch": 0.7323125, "grad_norm": 3.171875, "grad_norm_var": 0.0131744384765625, "learning_rate": 0.0001, "loss": 5.7698, "loss/crossentropy": 2.588934898376465, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1665220931172371, "step": 23434 }, { "epoch": 0.732375, "grad_norm": 3.234375, "grad_norm_var": 0.01363525390625, "learning_rate": 0.0001, "loss": 5.7479, "loss/crossentropy": 2.640743613243103, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16110744327306747, "step": 23436 }, { "epoch": 0.7324375, "grad_norm": 3.40625, "grad_norm_var": 0.020817057291666666, "learning_rate": 0.0001, "loss": 5.8297, "loss/crossentropy": 2.5773215293884277, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17484461516141891, "step": 23438 }, { "epoch": 0.7325, "grad_norm": 3.15625, "grad_norm_var": 0.019652303059895834, "learning_rate": 0.0001, "loss": 5.6132, "loss/crossentropy": 2.5538493394851685, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15906429290771484, "step": 23440 }, { "epoch": 0.7325625, "grad_norm": 3.09375, "grad_norm_var": 0.0196441650390625, "learning_rate": 0.0001, "loss": 5.5364, "loss/crossentropy": 2.5850027799606323, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14943711459636688, "step": 23442 }, { "epoch": 0.732625, "grad_norm": 3.125, "grad_norm_var": 0.016950480143229165, "learning_rate": 0.0001, "loss": 5.7022, "loss/crossentropy": 2.5973565578460693, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16556359827518463, "step": 23444 }, { "epoch": 0.7326875, "grad_norm": 2.96875, "grad_norm_var": 0.0246978759765625, "learning_rate": 0.0001, "loss": 5.381, "loss/crossentropy": 2.403241515159607, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15441212058067322, "step": 23446 }, { "epoch": 0.73275, "grad_norm": 3.0, "grad_norm_var": 0.025651041666666666, "learning_rate": 0.0001, "loss": 5.5208, "loss/crossentropy": 2.5156280994415283, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15911425650119781, "step": 23448 }, { "epoch": 0.7328125, "grad_norm": 3.421875, "grad_norm_var": 0.033492024739583334, "learning_rate": 0.0001, "loss": 5.7593, "loss/crossentropy": 2.604570508003235, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17055577784776688, "step": 23450 }, { "epoch": 0.732875, "grad_norm": 3.25, "grad_norm_var": 0.03437093098958333, "learning_rate": 0.0001, "loss": 5.5476, "loss/crossentropy": 2.461097002029419, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16099774837493896, "step": 23452 }, { "epoch": 0.7329375, "grad_norm": 2.890625, "grad_norm_var": 0.0237457275390625, "learning_rate": 0.0001, "loss": 5.507, "loss/crossentropy": 2.5025230646133423, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15826108306646347, "step": 23454 }, { "epoch": 0.733, "grad_norm": 3.46875, "grad_norm_var": 0.07079671223958334, "learning_rate": 0.0001, "loss": 5.5568, "loss/crossentropy": 2.3830480575561523, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16268550604581833, "step": 23456 }, { "epoch": 0.7330625, "grad_norm": 3.171875, "grad_norm_var": 0.0978912353515625, "learning_rate": 0.0001, "loss": 5.6719, "loss/crossentropy": 2.587242603302002, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1643211841583252, "step": 23458 }, { "epoch": 0.733125, "grad_norm": 3.0625, "grad_norm_var": 0.09876200358072916, "learning_rate": 0.0001, "loss": 5.4057, "loss/crossentropy": 2.3633275032043457, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15892165899276733, "step": 23460 }, { "epoch": 0.7331875, "grad_norm": 2.890625, "grad_norm_var": 0.09299723307291667, "learning_rate": 0.0001, "loss": 5.7002, "loss/crossentropy": 2.608008861541748, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16312935948371887, "step": 23462 }, { "epoch": 0.73325, "grad_norm": 3.140625, "grad_norm_var": 0.08854166666666667, "learning_rate": 0.0001, "loss": 5.7402, "loss/crossentropy": 2.5507601499557495, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1720656156539917, "step": 23464 }, { "epoch": 0.7333125, "grad_norm": 3.40625, "grad_norm_var": 0.09127197265625, "learning_rate": 0.0001, "loss": 5.5159, "loss/crossentropy": 2.4633994102478027, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15681590884923935, "step": 23466 }, { "epoch": 0.733375, "grad_norm": 3.125, "grad_norm_var": 0.09455973307291667, "learning_rate": 0.0001, "loss": 5.5678, "loss/crossentropy": 2.502510190010071, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16239215433597565, "step": 23468 }, { "epoch": 0.7334375, "grad_norm": 3.015625, "grad_norm_var": 0.10484110514322917, "learning_rate": 0.0001, "loss": 5.421, "loss/crossentropy": 2.4273658990859985, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15404889732599258, "step": 23470 }, { "epoch": 0.7335, "grad_norm": 3.078125, "grad_norm_var": 0.06075846354166667, "learning_rate": 0.0001, "loss": 5.4922, "loss/crossentropy": 2.429620862007141, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1570415273308754, "step": 23472 }, { "epoch": 0.7335625, "grad_norm": 2.9375, "grad_norm_var": 0.022965494791666666, "learning_rate": 0.0001, "loss": 5.7666, "loss/crossentropy": 2.5914634466171265, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16985638439655304, "step": 23474 }, { "epoch": 0.733625, "grad_norm": 3.0625, "grad_norm_var": 0.032990519205729166, "learning_rate": 0.0001, "loss": 5.7969, "loss/crossentropy": 2.6145005226135254, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1701931208372116, "step": 23476 }, { "epoch": 0.7336875, "grad_norm": 3.265625, "grad_norm_var": 0.03289388020833333, "learning_rate": 0.0001, "loss": 5.5761, "loss/crossentropy": 2.4450889825820923, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1709156483411789, "step": 23478 }, { "epoch": 0.73375, "grad_norm": 3.015625, "grad_norm_var": 0.033137003580729164, "learning_rate": 0.0001, "loss": 5.6349, "loss/crossentropy": 2.5668121576309204, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16149134933948517, "step": 23480 }, { "epoch": 0.7338125, "grad_norm": 3.203125, "grad_norm_var": 0.0259185791015625, "learning_rate": 0.0001, "loss": 5.7231, "loss/crossentropy": 2.5993869304656982, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16432444006204605, "step": 23482 }, { "epoch": 0.733875, "grad_norm": 3.0625, "grad_norm_var": 0.025223795572916666, "learning_rate": 0.0001, "loss": 5.8019, "loss/crossentropy": 2.6502548456192017, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1675073504447937, "step": 23484 }, { "epoch": 0.7339375, "grad_norm": 2.703125, "grad_norm_var": 0.028693644205729167, "learning_rate": 0.0001, "loss": 5.6372, "loss/crossentropy": 2.5472875833511353, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15937808901071548, "step": 23486 }, { "epoch": 0.734, "grad_norm": 3.0, "grad_norm_var": 0.029069010416666666, "learning_rate": 0.0001, "loss": 5.5728, "loss/crossentropy": 2.508591651916504, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16188612580299377, "step": 23488 }, { "epoch": 0.7340625, "grad_norm": 2.875, "grad_norm_var": 0.030549112955729166, "learning_rate": 0.0001, "loss": 5.4278, "loss/crossentropy": 2.442284345626831, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15519032627344131, "step": 23490 }, { "epoch": 0.734125, "grad_norm": 3.15625, "grad_norm_var": 0.023270670572916666, "learning_rate": 0.0001, "loss": 5.5983, "loss/crossentropy": 2.4656132459640503, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16366274654865265, "step": 23492 }, { "epoch": 0.7341875, "grad_norm": 3.078125, "grad_norm_var": 0.020531209309895833, "learning_rate": 0.0001, "loss": 5.8963, "loss/crossentropy": 2.685925006866455, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17298579961061478, "step": 23494 }, { "epoch": 0.73425, "grad_norm": 2.765625, "grad_norm_var": 0.02945556640625, "learning_rate": 0.0001, "loss": 5.2837, "loss/crossentropy": 2.4336854219436646, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.14163990318775177, "step": 23496 }, { "epoch": 0.7343125, "grad_norm": 3.40625, "grad_norm_var": 0.0365386962890625, "learning_rate": 0.0001, "loss": 5.9122, "loss/crossentropy": 2.64986252784729, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17701907455921173, "step": 23498 }, { "epoch": 0.734375, "grad_norm": 2.859375, "grad_norm_var": 0.04112040201822917, "learning_rate": 0.0001, "loss": 5.616, "loss/crossentropy": 2.5660492181777954, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16046810150146484, "step": 23500 }, { "epoch": 0.7344375, "grad_norm": 2.796875, "grad_norm_var": 0.03284098307291667, "learning_rate": 0.0001, "loss": 5.602, "loss/crossentropy": 2.5677409172058105, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15733551979064941, "step": 23502 }, { "epoch": 0.7345, "grad_norm": 3.09375, "grad_norm_var": 0.034699503580729166, "learning_rate": 0.0001, "loss": 5.7832, "loss/crossentropy": 2.6343828439712524, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16839541494846344, "step": 23504 }, { "epoch": 0.7345625, "grad_norm": 2.859375, "grad_norm_var": 0.04716695149739583, "learning_rate": 0.0001, "loss": 5.4971, "loss/crossentropy": 2.4877257347106934, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15210432559251785, "step": 23506 }, { "epoch": 0.734625, "grad_norm": 3.0625, "grad_norm_var": 0.058089192708333334, "learning_rate": 0.0001, "loss": 5.9026, "loss/crossentropy": 2.7096848487854004, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17124372720718384, "step": 23508 }, { "epoch": 0.7346875, "grad_norm": 2.875, "grad_norm_var": 0.07830403645833334, "learning_rate": 0.0001, "loss": 5.3596, "loss/crossentropy": 2.4028228521347046, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15114928781986237, "step": 23510 }, { "epoch": 0.73475, "grad_norm": 3.40625, "grad_norm_var": 0.06862691243489584, "learning_rate": 0.0001, "loss": 5.9145, "loss/crossentropy": 2.733505964279175, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16888293623924255, "step": 23512 }, { "epoch": 0.7348125, "grad_norm": 2.796875, "grad_norm_var": 0.07138570149739583, "learning_rate": 0.0001, "loss": 5.3005, "loss/crossentropy": 2.3271204233169556, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15319299697875977, "step": 23514 }, { "epoch": 0.734875, "grad_norm": 3.34375, "grad_norm_var": 0.0735504150390625, "learning_rate": 0.0001, "loss": 5.3765, "loss/crossentropy": 2.407612442970276, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1554863601922989, "step": 23516 }, { "epoch": 0.7349375, "grad_norm": 2.796875, "grad_norm_var": 0.08217671712239584, "learning_rate": 0.0001, "loss": 5.3266, "loss/crossentropy": 2.4507007598876953, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.14501401782035828, "step": 23518 }, { "epoch": 0.735, "grad_norm": 3.1875, "grad_norm_var": 0.08747456868489584, "learning_rate": 0.0001, "loss": 5.6318, "loss/crossentropy": 2.5163025856018066, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16271989792585373, "step": 23520 }, { "epoch": 0.7350625, "grad_norm": 3.0, "grad_norm_var": 0.07698160807291667, "learning_rate": 0.0001, "loss": 5.4528, "loss/crossentropy": 2.4520334005355835, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1551515907049179, "step": 23522 }, { "epoch": 0.735125, "grad_norm": 2.71875, "grad_norm_var": 0.07219136555989583, "learning_rate": 0.0001, "loss": 5.6717, "loss/crossentropy": 2.61397123336792, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16201938688755035, "step": 23524 }, { "epoch": 0.7351875, "grad_norm": 3.078125, "grad_norm_var": 0.050755818684895836, "learning_rate": 0.0001, "loss": 5.6804, "loss/crossentropy": 2.5597420930862427, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.165585957467556, "step": 23526 }, { "epoch": 0.73525, "grad_norm": 3.03125, "grad_norm_var": 0.042512003580729166, "learning_rate": 0.0001, "loss": 5.5602, "loss/crossentropy": 2.529832124710083, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15850237756967545, "step": 23528 }, { "epoch": 0.7353125, "grad_norm": 3.234375, "grad_norm_var": 0.044164021809895836, "learning_rate": 0.0001, "loss": 5.7882, "loss/crossentropy": 2.6322215795516968, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.167157880961895, "step": 23530 }, { "epoch": 0.735375, "grad_norm": 3.046875, "grad_norm_var": 0.0363922119140625, "learning_rate": 0.0001, "loss": 6.0217, "loss/crossentropy": 2.894606351852417, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16661609709262848, "step": 23532 }, { "epoch": 0.7354375, "grad_norm": 2.921875, "grad_norm_var": 0.0268707275390625, "learning_rate": 0.0001, "loss": 5.4766, "loss/crossentropy": 2.4781914949417114, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.158429816365242, "step": 23534 }, { "epoch": 0.7355, "grad_norm": 3.453125, "grad_norm_var": 0.029002888997395834, "learning_rate": 0.0001, "loss": 5.9379, "loss/crossentropy": 2.737681031227112, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17275556176900864, "step": 23536 }, { "epoch": 0.7355625, "grad_norm": 2.921875, "grad_norm_var": 0.02916259765625, "learning_rate": 0.0001, "loss": 5.6641, "loss/crossentropy": 2.589012384414673, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1649332493543625, "step": 23538 }, { "epoch": 0.735625, "grad_norm": 2.828125, "grad_norm_var": 0.0323883056640625, "learning_rate": 0.0001, "loss": 5.6689, "loss/crossentropy": 2.5991783142089844, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1612653061747551, "step": 23540 }, { "epoch": 0.7356875, "grad_norm": 3.015625, "grad_norm_var": 0.030289713541666666, "learning_rate": 0.0001, "loss": 5.3967, "loss/crossentropy": 2.4203792810440063, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15544036030769348, "step": 23542 }, { "epoch": 0.73575, "grad_norm": 2.859375, "grad_norm_var": 0.031737263997395834, "learning_rate": 0.0001, "loss": 5.6952, "loss/crossentropy": 2.635646939277649, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16142567247152328, "step": 23544 }, { "epoch": 0.7358125, "grad_norm": 3.234375, "grad_norm_var": 0.03383687337239583, "learning_rate": 0.0001, "loss": 5.4899, "loss/crossentropy": 2.4383760690689087, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1606234908103943, "step": 23546 }, { "epoch": 0.735875, "grad_norm": 2.859375, "grad_norm_var": 0.03642578125, "learning_rate": 0.0001, "loss": 5.4974, "loss/crossentropy": 2.5072513818740845, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15409240871667862, "step": 23548 }, { "epoch": 0.7359375, "grad_norm": 3.140625, "grad_norm_var": 0.05164286295572917, "learning_rate": 0.0001, "loss": 6.0929, "loss/crossentropy": 2.7838547229766846, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.18050900846719742, "step": 23550 }, { "epoch": 0.736, "grad_norm": 3.484375, "grad_norm_var": 0.0539947509765625, "learning_rate": 0.0001, "loss": 5.9286, "loss/crossentropy": 2.6969679594039917, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17433885484933853, "step": 23552 }, { "epoch": 0.7360625, "grad_norm": 3.421875, "grad_norm_var": 0.05956624348958333, "learning_rate": 0.0001, "loss": 5.9226, "loss/crossentropy": 2.698677182197571, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17434575408697128, "step": 23554 }, { "epoch": 0.736125, "grad_norm": 2.875, "grad_norm_var": 0.05289306640625, "learning_rate": 0.0001, "loss": 5.7134, "loss/crossentropy": 2.5809249877929688, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16597800701856613, "step": 23556 }, { "epoch": 0.7361875, "grad_norm": 2.890625, "grad_norm_var": 0.05805562337239583, "learning_rate": 0.0001, "loss": 5.5054, "loss/crossentropy": 2.546760082244873, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15406754612922668, "step": 23558 }, { "epoch": 0.73625, "grad_norm": 3.53125, "grad_norm_var": 0.06536051432291666, "learning_rate": 0.0001, "loss": 5.7136, "loss/crossentropy": 2.630208373069763, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1626323238015175, "step": 23560 }, { "epoch": 0.7363125, "grad_norm": 2.90625, "grad_norm_var": 0.06128641764322917, "learning_rate": 0.0001, "loss": 5.6842, "loss/crossentropy": 2.5686936378479004, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1638946235179901, "step": 23562 }, { "epoch": 0.736375, "grad_norm": 3.125, "grad_norm_var": 0.05694986979166667, "learning_rate": 0.0001, "loss": 5.7707, "loss/crossentropy": 2.6414307355880737, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1668316125869751, "step": 23564 }, { "epoch": 0.7364375, "grad_norm": 3.125, "grad_norm_var": 0.075146484375, "learning_rate": 0.0001, "loss": 5.8567, "loss/crossentropy": 2.614556074142456, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17343351244926453, "step": 23566 }, { "epoch": 0.7365, "grad_norm": 3.09375, "grad_norm_var": 0.06851806640625, "learning_rate": 0.0001, "loss": 5.6407, "loss/crossentropy": 2.5622506141662598, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.163309745490551, "step": 23568 }, { "epoch": 0.7365625, "grad_norm": 3.15625, "grad_norm_var": 0.0625396728515625, "learning_rate": 0.0001, "loss": 6.0154, "loss/crossentropy": 2.810123562812805, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1744309365749359, "step": 23570 }, { "epoch": 0.736625, "grad_norm": 3.078125, "grad_norm_var": 0.058610026041666666, "learning_rate": 0.0001, "loss": 5.5339, "loss/crossentropy": 2.4425787925720215, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16225220263004303, "step": 23572 }, { "epoch": 0.7366875, "grad_norm": 3.171875, "grad_norm_var": 0.04915262858072917, "learning_rate": 0.0001, "loss": 5.6152, "loss/crossentropy": 2.536233425140381, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16180337965488434, "step": 23574 }, { "epoch": 0.73675, "grad_norm": 3.140625, "grad_norm_var": 0.07273763020833333, "learning_rate": 0.0001, "loss": 5.4295, "loss/crossentropy": 2.3130571842193604, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1616436317563057, "step": 23576 }, { "epoch": 0.7368125, "grad_norm": 3.34375, "grad_norm_var": 0.06689046223958334, "learning_rate": 0.0001, "loss": 5.4234, "loss/crossentropy": 2.3387938737869263, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15767645090818405, "step": 23578 }, { "epoch": 0.736875, "grad_norm": 3.578125, "grad_norm_var": 0.07784830729166667, "learning_rate": 0.0001, "loss": 5.7788, "loss/crossentropy": 2.565428376197815, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16938666254281998, "step": 23580 }, { "epoch": 0.7369375, "grad_norm": 3.09375, "grad_norm_var": 0.059956868489583336, "learning_rate": 0.0001, "loss": 5.3839, "loss/crossentropy": 2.338326930999756, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1615864410996437, "step": 23582 }, { "epoch": 0.737, "grad_norm": 3.34375, "grad_norm_var": 0.06139322916666667, "learning_rate": 0.0001, "loss": 5.6858, "loss/crossentropy": 2.5333563089370728, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.17188771069049835, "step": 23584 }, { "epoch": 0.7370625, "grad_norm": 2.796875, "grad_norm_var": 0.07151285807291667, "learning_rate": 0.0001, "loss": 5.3732, "loss/crossentropy": 2.3642250299453735, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15324292331933975, "step": 23586 }, { "epoch": 0.737125, "grad_norm": 2.9375, "grad_norm_var": 0.07737223307291667, "learning_rate": 0.0001, "loss": 5.6459, "loss/crossentropy": 2.4873459339141846, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16780464351177216, "step": 23588 }, { "epoch": 0.7371875, "grad_norm": 3.203125, "grad_norm_var": 0.07736714680989583, "learning_rate": 0.0001, "loss": 5.8558, "loss/crossentropy": 2.613994836807251, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17496132850646973, "step": 23590 }, { "epoch": 0.73725, "grad_norm": 2.859375, "grad_norm_var": 0.04999593098958333, "learning_rate": 0.0001, "loss": 5.6823, "loss/crossentropy": 2.616868495941162, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16357173025608063, "step": 23592 }, { "epoch": 0.7373125, "grad_norm": 2.71875, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 5.3136, "loss/crossentropy": 2.4151304960250854, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.14961008727550507, "step": 23594 }, { "epoch": 0.737375, "grad_norm": 3.203125, "grad_norm_var": 0.0427642822265625, "learning_rate": 0.0001, "loss": 5.7147, "loss/crossentropy": 2.619183301925659, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16150399297475815, "step": 23596 }, { "epoch": 0.7374375, "grad_norm": 3.46875, "grad_norm_var": 0.05130208333333333, "learning_rate": 0.0001, "loss": 5.5533, "loss/crossentropy": 2.520646572113037, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15599774569272995, "step": 23598 }, { "epoch": 0.7375, "grad_norm": 3.734375, "grad_norm_var": 0.07461649576822917, "learning_rate": 0.0001, "loss": 5.5009, "loss/crossentropy": 2.389772653579712, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.164240300655365, "step": 23600 }, { "epoch": 0.7375625, "grad_norm": 2.84375, "grad_norm_var": 0.07599995930989584, "learning_rate": 0.0001, "loss": 5.3118, "loss/crossentropy": 2.35556161403656, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15421681106090546, "step": 23602 }, { "epoch": 0.737625, "grad_norm": 3.4375, "grad_norm_var": 0.08384501139322917, "learning_rate": 0.0001, "loss": 5.653, "loss/crossentropy": 2.506429433822632, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1689501628279686, "step": 23604 }, { "epoch": 0.7376875, "grad_norm": 3.046875, "grad_norm_var": 0.0824615478515625, "learning_rate": 0.0001, "loss": 5.4944, "loss/crossentropy": 2.504259943962097, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15604222565889359, "step": 23606 }, { "epoch": 0.73775, "grad_norm": 3.140625, "grad_norm_var": 0.07965494791666666, "learning_rate": 0.0001, "loss": 5.7588, "loss/crossentropy": 2.6146479845046997, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16949427127838135, "step": 23608 }, { "epoch": 0.7378125, "grad_norm": 2.890625, "grad_norm_var": 0.07296549479166667, "learning_rate": 0.0001, "loss": 5.3794, "loss/crossentropy": 2.4034372568130493, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1561950296163559, "step": 23610 }, { "epoch": 0.737875, "grad_norm": 2.984375, "grad_norm_var": 0.07371317545572917, "learning_rate": 0.0001, "loss": 5.9049, "loss/crossentropy": 2.7431684732437134, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16851452738046646, "step": 23612 }, { "epoch": 0.7379375, "grad_norm": 3.140625, "grad_norm_var": 0.10067952473958333, "learning_rate": 0.0001, "loss": 5.7946, "loss/crossentropy": 2.633496046066284, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1692311391234398, "step": 23614 }, { "epoch": 0.738, "grad_norm": 3.328125, "grad_norm_var": 0.08065999348958333, "learning_rate": 0.0001, "loss": 5.5748, "loss/crossentropy": 2.506125807762146, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15960664302110672, "step": 23616 }, { "epoch": 0.7380625, "grad_norm": 3.203125, "grad_norm_var": 0.07156575520833333, "learning_rate": 0.0001, "loss": 5.7517, "loss/crossentropy": 2.6554681062698364, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1611897200345993, "step": 23618 }, { "epoch": 0.738125, "grad_norm": 3.171875, "grad_norm_var": 0.05950113932291667, "learning_rate": 0.0001, "loss": 5.713, "loss/crossentropy": 2.6006507873535156, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16514024883508682, "step": 23620 }, { "epoch": 0.7381875, "grad_norm": 2.84375, "grad_norm_var": 0.0748931884765625, "learning_rate": 0.0001, "loss": 5.374, "loss/crossentropy": 2.279668092727661, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16217041015625, "step": 23622 }, { "epoch": 0.73825, "grad_norm": 2.90625, "grad_norm_var": 0.0783203125, "learning_rate": 0.0001, "loss": 5.6124, "loss/crossentropy": 2.5494585037231445, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15981171280145645, "step": 23624 }, { "epoch": 0.7383125, "grad_norm": 2.9375, "grad_norm_var": 0.0767974853515625, "learning_rate": 0.0001, "loss": 5.5042, "loss/crossentropy": 2.494245767593384, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15646066516637802, "step": 23626 }, { "epoch": 0.738375, "grad_norm": 3.015625, "grad_norm_var": 0.08000895182291666, "learning_rate": 0.0001, "loss": 5.7017, "loss/crossentropy": 2.626276969909668, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16456867009401321, "step": 23628 }, { "epoch": 0.7384375, "grad_norm": 2.9375, "grad_norm_var": 0.041337076822916666, "learning_rate": 0.0001, "loss": 5.5703, "loss/crossentropy": 2.528649926185608, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16041024029254913, "step": 23630 }, { "epoch": 0.7385, "grad_norm": 4.03125, "grad_norm_var": 0.09514567057291666, "learning_rate": 0.0001, "loss": 5.9694, "loss/crossentropy": 2.622844696044922, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18036100268363953, "step": 23632 }, { "epoch": 0.7385625, "grad_norm": 2.859375, "grad_norm_var": 0.09716695149739583, "learning_rate": 0.0001, "loss": 5.5787, "loss/crossentropy": 2.520618438720703, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16479463130235672, "step": 23634 }, { "epoch": 0.738625, "grad_norm": 3.265625, "grad_norm_var": 0.09853108723958333, "learning_rate": 0.0001, "loss": 5.481, "loss/crossentropy": 2.4266551733016968, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16246679425239563, "step": 23636 }, { "epoch": 0.7386875, "grad_norm": 3.046875, "grad_norm_var": 0.08171284993489583, "learning_rate": 0.0001, "loss": 5.8134, "loss/crossentropy": 2.6680673360824585, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16687805950641632, "step": 23638 }, { "epoch": 0.73875, "grad_norm": 2.953125, "grad_norm_var": 0.08269856770833334, "learning_rate": 0.0001, "loss": 5.2904, "loss/crossentropy": 2.321548342704773, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1523580253124237, "step": 23640 }, { "epoch": 0.7388125, "grad_norm": 2.9375, "grad_norm_var": 0.08245442708333334, "learning_rate": 0.0001, "loss": 5.6741, "loss/crossentropy": 2.5963666439056396, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16246125102043152, "step": 23642 }, { "epoch": 0.738875, "grad_norm": 3.125, "grad_norm_var": 0.07638346354166667, "learning_rate": 0.0001, "loss": 5.5873, "loss/crossentropy": 2.5040605068206787, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16144713759422302, "step": 23644 }, { "epoch": 0.7389375, "grad_norm": 3.5, "grad_norm_var": 0.08185221354166666, "learning_rate": 0.0001, "loss": 5.3826, "loss/crossentropy": 2.3296725749969482, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16037489473819733, "step": 23646 }, { "epoch": 0.739, "grad_norm": 2.921875, "grad_norm_var": 0.030760701497395834, "learning_rate": 0.0001, "loss": 5.6415, "loss/crossentropy": 2.5903111696243286, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15863718837499619, "step": 23648 }, { "epoch": 0.7390625, "grad_norm": 2.796875, "grad_norm_var": 0.03705952962239583, "learning_rate": 0.0001, "loss": 5.3715, "loss/crossentropy": 2.3111305236816406, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16033006459474564, "step": 23650 }, { "epoch": 0.739125, "grad_norm": 3.03125, "grad_norm_var": 0.03726806640625, "learning_rate": 0.0001, "loss": 5.36, "loss/crossentropy": 2.398424506187439, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15240325778722763, "step": 23652 }, { "epoch": 0.7391875, "grad_norm": 2.96875, "grad_norm_var": 0.0329498291015625, "learning_rate": 0.0001, "loss": 5.8221, "loss/crossentropy": 2.6814838647842407, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.164058618247509, "step": 23654 }, { "epoch": 0.73925, "grad_norm": 2.859375, "grad_norm_var": 0.0348297119140625, "learning_rate": 0.0001, "loss": 5.5606, "loss/crossentropy": 2.570521354675293, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15682382881641388, "step": 23656 }, { "epoch": 0.7393125, "grad_norm": 3.09375, "grad_norm_var": 0.033014933268229164, "learning_rate": 0.0001, "loss": 5.5078, "loss/crossentropy": 2.4352725744247437, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16155054420232773, "step": 23658 }, { "epoch": 0.739375, "grad_norm": 2.96875, "grad_norm_var": 0.034956868489583334, "learning_rate": 0.0001, "loss": 5.8403, "loss/crossentropy": 2.633499503135681, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17263556271791458, "step": 23660 }, { "epoch": 0.7394375, "grad_norm": 3.25, "grad_norm_var": 0.0244537353515625, "learning_rate": 0.0001, "loss": 5.8884, "loss/crossentropy": 2.6757001876831055, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17361054569482803, "step": 23662 }, { "epoch": 0.7395, "grad_norm": 3.125, "grad_norm_var": 0.026024373372395833, "learning_rate": 0.0001, "loss": 5.6931, "loss/crossentropy": 2.592305064201355, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16594009846448898, "step": 23664 }, { "epoch": 0.7395625, "grad_norm": 3.0, "grad_norm_var": 0.016532389322916667, "learning_rate": 0.0001, "loss": 5.5975, "loss/crossentropy": 2.524364471435547, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16161108016967773, "step": 23666 }, { "epoch": 0.739625, "grad_norm": 3.328125, "grad_norm_var": 0.027961222330729167, "learning_rate": 0.0001, "loss": 5.9427, "loss/crossentropy": 2.7580608129501343, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17158909887075424, "step": 23668 }, { "epoch": 0.7396875, "grad_norm": 3.203125, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 5.84, "loss/crossentropy": 2.7155197858810425, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16635117679834366, "step": 23670 }, { "epoch": 0.73975, "grad_norm": 3.375, "grad_norm_var": 0.029781087239583334, "learning_rate": 0.0001, "loss": 5.8557, "loss/crossentropy": 2.634499430656433, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17407264560461044, "step": 23672 }, { "epoch": 0.7398125, "grad_norm": 3.46875, "grad_norm_var": 0.039281209309895836, "learning_rate": 0.0001, "loss": 5.8832, "loss/crossentropy": 2.57876718044281, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17810030281543732, "step": 23674 }, { "epoch": 0.739875, "grad_norm": 3.125, "grad_norm_var": 0.038411458333333336, "learning_rate": 0.0001, "loss": 5.7498, "loss/crossentropy": 2.727282166481018, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15771931409835815, "step": 23676 }, { "epoch": 0.7399375, "grad_norm": 3.046875, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 5.9869, "loss/crossentropy": 2.7334845066070557, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17846722155809402, "step": 23678 }, { "epoch": 0.74, "grad_norm": 2.953125, "grad_norm_var": 0.035643513997395834, "learning_rate": 0.0001, "loss": 5.7289, "loss/crossentropy": 2.726085901260376, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15379394590854645, "step": 23680 }, { "epoch": 0.7400625, "grad_norm": 3.390625, "grad_norm_var": 0.0536285400390625, "learning_rate": 0.0001, "loss": 5.4931, "loss/crossentropy": 2.3859097957611084, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16697047650814056, "step": 23682 }, { "epoch": 0.740125, "grad_norm": 3.25, "grad_norm_var": 0.038263956705729164, "learning_rate": 0.0001, "loss": 5.6054, "loss/crossentropy": 2.557660222053528, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16024072468280792, "step": 23684 }, { "epoch": 0.7401875, "grad_norm": 3.140625, "grad_norm_var": 0.03748270670572917, "learning_rate": 0.0001, "loss": 6.0288, "loss/crossentropy": 2.8301045894622803, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1702597811818123, "step": 23686 }, { "epoch": 0.74025, "grad_norm": 3.25, "grad_norm_var": 0.0434478759765625, "learning_rate": 0.0001, "loss": 5.7256, "loss/crossentropy": 2.661123752593994, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1627010926604271, "step": 23688 }, { "epoch": 0.7403125, "grad_norm": 3.3125, "grad_norm_var": 0.03803609212239583, "learning_rate": 0.0001, "loss": 5.8583, "loss/crossentropy": 2.6228137016296387, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17159803211688995, "step": 23690 }, { "epoch": 0.740375, "grad_norm": 3.234375, "grad_norm_var": 0.03662821451822917, "learning_rate": 0.0001, "loss": 6.0075, "loss/crossentropy": 2.808443069458008, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17146998643875122, "step": 23692 }, { "epoch": 0.7404375, "grad_norm": 2.984375, "grad_norm_var": 0.0408111572265625, "learning_rate": 0.0001, "loss": 5.7996, "loss/crossentropy": 2.6491732597351074, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16621138155460358, "step": 23694 }, { "epoch": 0.7405, "grad_norm": 3.328125, "grad_norm_var": 0.0383697509765625, "learning_rate": 0.0001, "loss": 5.7836, "loss/crossentropy": 2.545189380645752, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17696908861398697, "step": 23696 }, { "epoch": 0.7405625, "grad_norm": 3.421875, "grad_norm_var": 0.029320271809895833, "learning_rate": 0.0001, "loss": 5.5782, "loss/crossentropy": 2.4729238748550415, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1652190387248993, "step": 23698 }, { "epoch": 0.740625, "grad_norm": 3.15625, "grad_norm_var": 0.028351847330729166, "learning_rate": 0.0001, "loss": 5.7948, "loss/crossentropy": 2.632536292076111, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17130043357610703, "step": 23700 }, { "epoch": 0.7406875, "grad_norm": 2.921875, "grad_norm_var": 0.030367024739583335, "learning_rate": 0.0001, "loss": 5.3192, "loss/crossentropy": 2.3588616847991943, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.148768350481987, "step": 23702 }, { "epoch": 0.74075, "grad_norm": 3.0, "grad_norm_var": 0.029686482747395833, "learning_rate": 0.0001, "loss": 5.9918, "loss/crossentropy": 2.700800657272339, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17910093069076538, "step": 23704 }, { "epoch": 0.7408125, "grad_norm": 3.015625, "grad_norm_var": 0.02974853515625, "learning_rate": 0.0001, "loss": 5.7756, "loss/crossentropy": 2.7007559537887573, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16373586654663086, "step": 23706 }, { "epoch": 0.740875, "grad_norm": 3.046875, "grad_norm_var": 0.03736063639322917, "learning_rate": 0.0001, "loss": 5.5338, "loss/crossentropy": 2.461254835128784, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16428812593221664, "step": 23708 }, { "epoch": 0.7409375, "grad_norm": 3.015625, "grad_norm_var": 0.039872233072916666, "learning_rate": 0.0001, "loss": 5.2952, "loss/crossentropy": 2.367771625518799, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15134061872959137, "step": 23710 }, { "epoch": 0.741, "grad_norm": 3.046875, "grad_norm_var": 0.03721415201822917, "learning_rate": 0.0001, "loss": 5.6265, "loss/crossentropy": 2.4913235902786255, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16663941740989685, "step": 23712 }, { "epoch": 0.7410625, "grad_norm": 3.375, "grad_norm_var": 0.03476155598958333, "learning_rate": 0.0001, "loss": 5.6103, "loss/crossentropy": 2.4854609966278076, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1636602133512497, "step": 23714 }, { "epoch": 0.741125, "grad_norm": 3.0, "grad_norm_var": 0.03599853515625, "learning_rate": 0.0001, "loss": 5.7972, "loss/crossentropy": 2.671536087989807, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16647516191005707, "step": 23716 }, { "epoch": 0.7411875, "grad_norm": 2.984375, "grad_norm_var": 0.04309895833333333, "learning_rate": 0.0001, "loss": 5.6616, "loss/crossentropy": 2.609768509864807, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16221675276756287, "step": 23718 }, { "epoch": 0.74125, "grad_norm": 3.265625, "grad_norm_var": 0.03778889973958333, "learning_rate": 0.0001, "loss": 5.5797, "loss/crossentropy": 2.4606499671936035, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1626831218600273, "step": 23720 }, { "epoch": 0.7413125, "grad_norm": 3.34375, "grad_norm_var": 0.039383951822916666, "learning_rate": 0.0001, "loss": 5.581, "loss/crossentropy": 2.534591317176819, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1601075753569603, "step": 23722 }, { "epoch": 0.741375, "grad_norm": 3.171875, "grad_norm_var": 0.03013916015625, "learning_rate": 0.0001, "loss": 5.8479, "loss/crossentropy": 2.680930733680725, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1698269248008728, "step": 23724 }, { "epoch": 0.7414375, "grad_norm": 3.53125, "grad_norm_var": 0.03504130045572917, "learning_rate": 0.0001, "loss": 5.7852, "loss/crossentropy": 2.5099505186080933, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.175571970641613, "step": 23726 }, { "epoch": 0.7415, "grad_norm": 3.171875, "grad_norm_var": 0.0338287353515625, "learning_rate": 0.0001, "loss": 5.7857, "loss/crossentropy": 2.6229697465896606, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16510528326034546, "step": 23728 }, { "epoch": 0.7415625, "grad_norm": 3.046875, "grad_norm_var": 0.0298004150390625, "learning_rate": 0.0001, "loss": 5.5974, "loss/crossentropy": 2.504623770713806, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16006328165531158, "step": 23730 }, { "epoch": 0.741625, "grad_norm": 3.46875, "grad_norm_var": 0.0399810791015625, "learning_rate": 0.0001, "loss": 5.4548, "loss/crossentropy": 2.3802608251571655, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16018792241811752, "step": 23732 }, { "epoch": 0.7416875, "grad_norm": 3.140625, "grad_norm_var": 0.0331451416015625, "learning_rate": 0.0001, "loss": 6.0567, "loss/crossentropy": 2.829725742340088, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1758190467953682, "step": 23734 }, { "epoch": 0.74175, "grad_norm": 3.0625, "grad_norm_var": 0.03906962076822917, "learning_rate": 0.0001, "loss": 5.4737, "loss/crossentropy": 2.4233882427215576, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15932518243789673, "step": 23736 }, { "epoch": 0.7418125, "grad_norm": 3.1875, "grad_norm_var": 0.03612874348958333, "learning_rate": 0.0001, "loss": 5.6637, "loss/crossentropy": 2.5260519981384277, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16532868146896362, "step": 23738 }, { "epoch": 0.741875, "grad_norm": 3.046875, "grad_norm_var": 0.039839680989583334, "learning_rate": 0.0001, "loss": 5.5615, "loss/crossentropy": 2.5807985067367554, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1551012247800827, "step": 23740 }, { "epoch": 0.7419375, "grad_norm": 3.109375, "grad_norm_var": 0.031966145833333334, "learning_rate": 0.0001, "loss": 5.6253, "loss/crossentropy": 2.571129322052002, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16245053708553314, "step": 23742 }, { "epoch": 0.742, "grad_norm": 2.984375, "grad_norm_var": 0.0354400634765625, "learning_rate": 0.0001, "loss": 5.6157, "loss/crossentropy": 2.533260226249695, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16175467520952225, "step": 23744 }, { "epoch": 0.7420625, "grad_norm": 3.34375, "grad_norm_var": 0.046875, "learning_rate": 0.0001, "loss": 6.0052, "loss/crossentropy": 2.831053376197815, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1697554662823677, "step": 23746 }, { "epoch": 0.742125, "grad_norm": 4.1875, "grad_norm_var": 0.1024078369140625, "learning_rate": 0.0001, "loss": 5.6104, "loss/crossentropy": 2.438306450843811, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16486083716154099, "step": 23748 }, { "epoch": 0.7421875, "grad_norm": 3.25, "grad_norm_var": 0.10673421223958333, "learning_rate": 0.0001, "loss": 5.8286, "loss/crossentropy": 2.676417589187622, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1679568737745285, "step": 23750 }, { "epoch": 0.74225, "grad_norm": 3.359375, "grad_norm_var": 0.10194905598958333, "learning_rate": 0.0001, "loss": 5.7243, "loss/crossentropy": 2.5188801288604736, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1717102751135826, "step": 23752 }, { "epoch": 0.7423125, "grad_norm": 3.140625, "grad_norm_var": 0.10461324055989583, "learning_rate": 0.0001, "loss": 5.5154, "loss/crossentropy": 2.4078171253204346, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16466298699378967, "step": 23754 }, { "epoch": 0.742375, "grad_norm": 2.90625, "grad_norm_var": 0.11620992024739583, "learning_rate": 0.0001, "loss": 5.5563, "loss/crossentropy": 2.47242271900177, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15955600142478943, "step": 23756 }, { "epoch": 0.7424375, "grad_norm": 3.390625, "grad_norm_var": 0.11027018229166667, "learning_rate": 0.0001, "loss": 5.5343, "loss/crossentropy": 2.456767201423645, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1600937768816948, "step": 23758 }, { "epoch": 0.7425, "grad_norm": 3.421875, "grad_norm_var": 0.10869038899739583, "learning_rate": 0.0001, "loss": 5.8712, "loss/crossentropy": 2.7310062646865845, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17026840150356293, "step": 23760 }, { "epoch": 0.7425625, "grad_norm": 3.0625, "grad_norm_var": 0.09637044270833334, "learning_rate": 0.0001, "loss": 5.8395, "loss/crossentropy": 2.7477962970733643, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16112085431814194, "step": 23762 }, { "epoch": 0.742625, "grad_norm": 2.921875, "grad_norm_var": 0.042455037434895836, "learning_rate": 0.0001, "loss": 5.8389, "loss/crossentropy": 2.7224472761154175, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16633137315511703, "step": 23764 }, { "epoch": 0.7426875, "grad_norm": 3.0, "grad_norm_var": 0.04687093098958333, "learning_rate": 0.0001, "loss": 5.4884, "loss/crossentropy": 2.4695459604263306, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15618199855089188, "step": 23766 }, { "epoch": 0.74275, "grad_norm": 3.171875, "grad_norm_var": 0.04413960774739583, "learning_rate": 0.0001, "loss": 5.8696, "loss/crossentropy": 2.6840864419937134, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17324209958314896, "step": 23768 }, { "epoch": 0.7428125, "grad_norm": 3.15625, "grad_norm_var": 0.04756571451822917, "learning_rate": 0.0001, "loss": 5.6721, "loss/crossentropy": 2.6337684392929077, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15852554142475128, "step": 23770 }, { "epoch": 0.742875, "grad_norm": 2.984375, "grad_norm_var": 0.030516560872395834, "learning_rate": 0.0001, "loss": 5.485, "loss/crossentropy": 2.437623381614685, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16060182452201843, "step": 23772 }, { "epoch": 0.7429375, "grad_norm": 3.234375, "grad_norm_var": 0.03082275390625, "learning_rate": 0.0001, "loss": 5.6025, "loss/crossentropy": 2.4522485733032227, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.1618979573249817, "step": 23774 }, { "epoch": 0.743, "grad_norm": 3.046875, "grad_norm_var": 0.02603759765625, "learning_rate": 0.0001, "loss": 5.5423, "loss/crossentropy": 2.5391552448272705, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.16047395020723343, "step": 23776 }, { "epoch": 0.7430625, "grad_norm": 3.171875, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 5.678, "loss/crossentropy": 2.538044571876526, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1667301505804062, "step": 23778 }, { "epoch": 0.743125, "grad_norm": 3.015625, "grad_norm_var": 0.045897420247395834, "learning_rate": 0.0001, "loss": 5.9731, "loss/crossentropy": 2.809144616127014, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17030035704374313, "step": 23780 }, { "epoch": 0.7431875, "grad_norm": 3.0625, "grad_norm_var": 0.039061482747395834, "learning_rate": 0.0001, "loss": 5.7187, "loss/crossentropy": 2.593548893928528, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1632988154888153, "step": 23782 }, { "epoch": 0.74325, "grad_norm": 2.96875, "grad_norm_var": 0.04285481770833333, "learning_rate": 0.0001, "loss": 5.5048, "loss/crossentropy": 2.466431975364685, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1577415019273758, "step": 23784 }, { "epoch": 0.7433125, "grad_norm": 2.859375, "grad_norm_var": 0.044123331705729164, "learning_rate": 0.0001, "loss": 5.5427, "loss/crossentropy": 2.5535603761672974, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15555761754512787, "step": 23786 }, { "epoch": 0.743375, "grad_norm": 3.234375, "grad_norm_var": 0.043257649739583334, "learning_rate": 0.0001, "loss": 5.8237, "loss/crossentropy": 2.560186743736267, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17322541028261185, "step": 23788 }, { "epoch": 0.7434375, "grad_norm": 3.125, "grad_norm_var": 0.03844401041666667, "learning_rate": 0.0001, "loss": 5.8743, "loss/crossentropy": 2.6807111501693726, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17131386697292328, "step": 23790 }, { "epoch": 0.7435, "grad_norm": 3.0, "grad_norm_var": 0.038752237955729164, "learning_rate": 0.0001, "loss": 5.7172, "loss/crossentropy": 2.6006758213043213, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16243094950914383, "step": 23792 }, { "epoch": 0.7435625, "grad_norm": 3.5625, "grad_norm_var": 0.03437093098958333, "learning_rate": 0.0001, "loss": 6.0407, "loss/crossentropy": 2.7483190298080444, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17728640139102936, "step": 23794 }, { "epoch": 0.743625, "grad_norm": 3.28125, "grad_norm_var": 0.03459879557291667, "learning_rate": 0.0001, "loss": 5.854, "loss/crossentropy": 2.657385230064392, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17122593522071838, "step": 23796 }, { "epoch": 0.7436875, "grad_norm": 3.203125, "grad_norm_var": 0.03426005045572917, "learning_rate": 0.0001, "loss": 5.4711, "loss/crossentropy": 2.4196746349334717, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15709207952022552, "step": 23798 }, { "epoch": 0.74375, "grad_norm": 2.796875, "grad_norm_var": 0.0406646728515625, "learning_rate": 0.0001, "loss": 5.457, "loss/crossentropy": 2.4605520963668823, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15667151659727097, "step": 23800 }, { "epoch": 0.7438125, "grad_norm": 3.453125, "grad_norm_var": 0.0395416259765625, "learning_rate": 0.0001, "loss": 5.8347, "loss/crossentropy": 2.6478604078292847, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16907228529453278, "step": 23802 }, { "epoch": 0.743875, "grad_norm": 3.03125, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 5.7251, "loss/crossentropy": 2.6099071502685547, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1662066951394081, "step": 23804 }, { "epoch": 0.7439375, "grad_norm": 3.109375, "grad_norm_var": 0.04078369140625, "learning_rate": 0.0001, "loss": 5.649, "loss/crossentropy": 2.5908056497573853, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.160505473613739, "step": 23806 }, { "epoch": 0.744, "grad_norm": 3.234375, "grad_norm_var": 0.0348541259765625, "learning_rate": 0.0001, "loss": 5.6204, "loss/crossentropy": 2.495275855064392, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1621190384030342, "step": 23808 }, { "epoch": 0.7440625, "grad_norm": 3.234375, "grad_norm_var": 0.0240631103515625, "learning_rate": 0.0001, "loss": 5.9735, "loss/crossentropy": 2.73429536819458, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17626766860485077, "step": 23810 }, { "epoch": 0.744125, "grad_norm": 2.984375, "grad_norm_var": 0.02476806640625, "learning_rate": 0.0001, "loss": 5.9545, "loss/crossentropy": 2.7888200283050537, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.170477956533432, "step": 23812 }, { "epoch": 0.7441875, "grad_norm": 3.0, "grad_norm_var": 0.026090494791666665, "learning_rate": 0.0001, "loss": 5.5622, "loss/crossentropy": 2.5557327270507812, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15768138319253922, "step": 23814 }, { "epoch": 0.74425, "grad_norm": 2.90625, "grad_norm_var": 0.028385416666666666, "learning_rate": 0.0001, "loss": 5.5638, "loss/crossentropy": 2.6190967559814453, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15462665259838104, "step": 23816 }, { "epoch": 0.7443125, "grad_norm": 3.09375, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 5.7951, "loss/crossentropy": 2.600494861602783, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17023959755897522, "step": 23818 }, { "epoch": 0.744375, "grad_norm": 3.125, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 5.365, "loss/crossentropy": 2.3739324808120728, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15614111721515656, "step": 23820 }, { "epoch": 0.7444375, "grad_norm": 3.234375, "grad_norm_var": 0.027269490559895835, "learning_rate": 0.0001, "loss": 5.6678, "loss/crossentropy": 2.529435157775879, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16384144872426987, "step": 23822 }, { "epoch": 0.7445, "grad_norm": 3.21875, "grad_norm_var": 0.03376363118489583, "learning_rate": 0.0001, "loss": 5.8025, "loss/crossentropy": 2.5526143312454224, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17772244662046432, "step": 23824 }, { "epoch": 0.7445625, "grad_norm": 3.203125, "grad_norm_var": 0.042496744791666666, "learning_rate": 0.0001, "loss": 5.5248, "loss/crossentropy": 2.4738471508026123, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15783115476369858, "step": 23826 }, { "epoch": 0.744625, "grad_norm": 2.75, "grad_norm_var": 0.05019429524739583, "learning_rate": 0.0001, "loss": 5.5796, "loss/crossentropy": 2.524854302406311, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1566486656665802, "step": 23828 }, { "epoch": 0.7446875, "grad_norm": 3.296875, "grad_norm_var": 0.06464436848958334, "learning_rate": 0.0001, "loss": 5.839, "loss/crossentropy": 2.54987108707428, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17930391430854797, "step": 23830 }, { "epoch": 0.74475, "grad_norm": 3.140625, "grad_norm_var": 0.05021870930989583, "learning_rate": 0.0001, "loss": 5.6872, "loss/crossentropy": 2.557926297187805, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16840001940727234, "step": 23832 }, { "epoch": 0.7448125, "grad_norm": 4.21875, "grad_norm_var": 0.11629231770833333, "learning_rate": 0.0001, "loss": 5.6444, "loss/crossentropy": 2.4489688873291016, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16680970042943954, "step": 23834 }, { "epoch": 0.744875, "grad_norm": 2.875, "grad_norm_var": 0.11988525390625, "learning_rate": 0.0001, "loss": 5.7966, "loss/crossentropy": 2.7099976539611816, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1641286015510559, "step": 23836 }, { "epoch": 0.7449375, "grad_norm": 3.078125, "grad_norm_var": 0.12444661458333334, "learning_rate": 0.0001, "loss": 5.633, "loss/crossentropy": 2.5509352684020996, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1609424576163292, "step": 23838 }, { "epoch": 0.745, "grad_norm": 2.703125, "grad_norm_var": 0.13852437337239584, "learning_rate": 0.0001, "loss": 5.4337, "loss/crossentropy": 2.5036728382110596, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.14964640885591507, "step": 23840 }, { "epoch": 0.7450625, "grad_norm": 3.25, "grad_norm_var": 0.13062744140625, "learning_rate": 0.0001, "loss": 5.6479, "loss/crossentropy": 2.545154333114624, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.15949159860610962, "step": 23842 }, { "epoch": 0.745125, "grad_norm": 3.375, "grad_norm_var": 0.12294514973958333, "learning_rate": 0.0001, "loss": 5.601, "loss/crossentropy": 2.5171507596969604, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15799562633037567, "step": 23844 }, { "epoch": 0.7451875, "grad_norm": 2.796875, "grad_norm_var": 0.1241363525390625, "learning_rate": 0.0001, "loss": 5.1171, "loss/crossentropy": 2.2651237845420837, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.14535468816757202, "step": 23846 }, { "epoch": 0.74525, "grad_norm": 2.796875, "grad_norm_var": 0.13225911458333334, "learning_rate": 0.0001, "loss": 5.1535, "loss/crossentropy": 2.32041072845459, "loss/hidden": 1.390625, "loss/jsd": 0.0, "loss/logits": 0.14425078779459, "step": 23848 }, { "epoch": 0.7453125, "grad_norm": 3.09375, "grad_norm_var": 0.03590087890625, "learning_rate": 0.0001, "loss": 5.417, "loss/crossentropy": 2.4762797355651855, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15071460604667664, "step": 23850 }, { "epoch": 0.745375, "grad_norm": 2.96875, "grad_norm_var": 0.03487040201822917, "learning_rate": 0.0001, "loss": 5.4015, "loss/crossentropy": 2.346813201904297, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15742014348506927, "step": 23852 }, { "epoch": 0.7454375, "grad_norm": 3.109375, "grad_norm_var": 0.042821248372395836, "learning_rate": 0.0001, "loss": 5.8207, "loss/crossentropy": 2.6550374031066895, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16969284415245056, "step": 23854 }, { "epoch": 0.7455, "grad_norm": 2.875, "grad_norm_var": 0.0358551025390625, "learning_rate": 0.0001, "loss": 5.4539, "loss/crossentropy": 2.4543232917785645, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15699072182178497, "step": 23856 }, { "epoch": 0.7455625, "grad_norm": 3.265625, "grad_norm_var": 0.03863525390625, "learning_rate": 0.0001, "loss": 5.5597, "loss/crossentropy": 2.4374698400497437, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16573935747146606, "step": 23858 }, { "epoch": 0.745625, "grad_norm": 3.1875, "grad_norm_var": 0.0354156494140625, "learning_rate": 0.0001, "loss": 5.8478, "loss/crossentropy": 2.682563543319702, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.169651597738266, "step": 23860 }, { "epoch": 0.7456875, "grad_norm": 3.234375, "grad_norm_var": 0.030321248372395835, "learning_rate": 0.0001, "loss": 5.2931, "loss/crossentropy": 2.266256332397461, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1569819077849388, "step": 23862 }, { "epoch": 0.74575, "grad_norm": 2.890625, "grad_norm_var": 0.024120076497395834, "learning_rate": 0.0001, "loss": 5.5981, "loss/crossentropy": 2.5668485164642334, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1589885726571083, "step": 23864 }, { "epoch": 0.7458125, "grad_norm": 3.34375, "grad_norm_var": 0.028734334309895835, "learning_rate": 0.0001, "loss": 5.6187, "loss/crossentropy": 2.527132987976074, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16462143510580063, "step": 23866 }, { "epoch": 0.745875, "grad_norm": 2.96875, "grad_norm_var": 0.0303619384765625, "learning_rate": 0.0001, "loss": 5.5882, "loss/crossentropy": 2.5037399530410767, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16040148586034775, "step": 23868 }, { "epoch": 0.7459375, "grad_norm": 3.25, "grad_norm_var": 0.028547159830729165, "learning_rate": 0.0001, "loss": 5.4586, "loss/crossentropy": 2.3871986865997314, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1590976044535637, "step": 23870 }, { "epoch": 0.746, "grad_norm": 2.953125, "grad_norm_var": 0.0265045166015625, "learning_rate": 0.0001, "loss": 5.6925, "loss/crossentropy": 2.6443164348602295, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1610671430826187, "step": 23872 }, { "epoch": 0.7460625, "grad_norm": 2.921875, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 5.7462, "loss/crossentropy": 2.6622105836868286, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15956847369670868, "step": 23874 }, { "epoch": 0.746125, "grad_norm": 3.375, "grad_norm_var": 0.027887980143229168, "learning_rate": 0.0001, "loss": 5.7199, "loss/crossentropy": 2.5624345541000366, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16652469336986542, "step": 23876 }, { "epoch": 0.7461875, "grad_norm": 2.90625, "grad_norm_var": 0.03357747395833333, "learning_rate": 0.0001, "loss": 5.5662, "loss/crossentropy": 2.488021492958069, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15859421342611313, "step": 23878 }, { "epoch": 0.74625, "grad_norm": 3.28125, "grad_norm_var": 0.03280843098958333, "learning_rate": 0.0001, "loss": 5.5614, "loss/crossentropy": 2.439563274383545, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16257558017969131, "step": 23880 }, { "epoch": 0.7463125, "grad_norm": 2.90625, "grad_norm_var": 0.030517578125, "learning_rate": 0.0001, "loss": 5.7069, "loss/crossentropy": 2.659354090690613, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16178329288959503, "step": 23882 }, { "epoch": 0.746375, "grad_norm": 3.09375, "grad_norm_var": 0.028450520833333333, "learning_rate": 0.0001, "loss": 5.8174, "loss/crossentropy": 2.6543742418289185, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16864780336618423, "step": 23884 }, { "epoch": 0.7464375, "grad_norm": 3.09375, "grad_norm_var": 0.0263671875, "learning_rate": 0.0001, "loss": 5.6579, "loss/crossentropy": 2.539881706237793, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16532030701637268, "step": 23886 }, { "epoch": 0.7465, "grad_norm": 3.1875, "grad_norm_var": 0.024925740559895833, "learning_rate": 0.0001, "loss": 5.7044, "loss/crossentropy": 2.5410473346710205, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17024239152669907, "step": 23888 }, { "epoch": 0.7465625, "grad_norm": 3.21875, "grad_norm_var": 0.5532948811848958, "learning_rate": 0.0001, "loss": 6.1183, "loss/crossentropy": 2.7540390491485596, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.18290842324495316, "step": 23890 }, { "epoch": 0.746625, "grad_norm": 3.15625, "grad_norm_var": 0.5614003499348958, "learning_rate": 0.0001, "loss": 5.4943, "loss/crossentropy": 2.4996707439422607, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1553225964307785, "step": 23892 }, { "epoch": 0.7466875, "grad_norm": 3.296875, "grad_norm_var": 0.5522450764973958, "learning_rate": 0.0001, "loss": 5.5942, "loss/crossentropy": 2.500562906265259, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1648305132985115, "step": 23894 }, { "epoch": 0.74675, "grad_norm": 2.71875, "grad_norm_var": 0.5752675374348958, "learning_rate": 0.0001, "loss": 5.5396, "loss/crossentropy": 2.5759114027023315, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15417730808258057, "step": 23896 }, { "epoch": 0.7468125, "grad_norm": 3.421875, "grad_norm_var": 0.5693522135416667, "learning_rate": 0.0001, "loss": 5.8975, "loss/crossentropy": 2.683432936668396, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17257541418075562, "step": 23898 }, { "epoch": 0.746875, "grad_norm": 3.421875, "grad_norm_var": 0.5674875895182292, "learning_rate": 0.0001, "loss": 5.7925, "loss/crossentropy": 2.609610438346863, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17024364322423935, "step": 23900 }, { "epoch": 0.7469375, "grad_norm": 3.515625, "grad_norm_var": 0.5640462239583334, "learning_rate": 0.0001, "loss": 6.1357, "loss/crossentropy": 2.832602620124817, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17952553182840347, "step": 23902 }, { "epoch": 0.747, "grad_norm": 3.03125, "grad_norm_var": 0.58209228515625, "learning_rate": 0.0001, "loss": 5.2262, "loss/crossentropy": 2.287443995475769, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14856766909360886, "step": 23904 }, { "epoch": 0.7470625, "grad_norm": 3.484375, "grad_norm_var": 0.06466471354166667, "learning_rate": 0.0001, "loss": 5.5386, "loss/crossentropy": 2.4046937227249146, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1649523451924324, "step": 23906 }, { "epoch": 0.747125, "grad_norm": 2.984375, "grad_norm_var": 0.06090087890625, "learning_rate": 0.0001, "loss": 5.4569, "loss/crossentropy": 2.4906638860702515, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15521354228258133, "step": 23908 }, { "epoch": 0.7471875, "grad_norm": 3.0, "grad_norm_var": 0.0646392822265625, "learning_rate": 0.0001, "loss": 5.4602, "loss/crossentropy": 2.4620014429092407, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15723932534456253, "step": 23910 }, { "epoch": 0.74725, "grad_norm": 2.734375, "grad_norm_var": 0.06411844889322917, "learning_rate": 0.0001, "loss": 5.5213, "loss/crossentropy": 2.5211902856826782, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15704332292079926, "step": 23912 }, { "epoch": 0.7473125, "grad_norm": 3.359375, "grad_norm_var": 0.062409464518229166, "learning_rate": 0.0001, "loss": 5.8967, "loss/crossentropy": 2.6899460554122925, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17223425954580307, "step": 23914 }, { "epoch": 0.747375, "grad_norm": 3.125, "grad_norm_var": 0.050902303059895834, "learning_rate": 0.0001, "loss": 5.7158, "loss/crossentropy": 2.600480079650879, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16504865884780884, "step": 23916 }, { "epoch": 0.7474375, "grad_norm": 3.3125, "grad_norm_var": 0.0417633056640625, "learning_rate": 0.0001, "loss": 5.8917, "loss/crossentropy": 2.760825276374817, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16464951634407043, "step": 23918 }, { "epoch": 0.7475, "grad_norm": 3.15625, "grad_norm_var": 0.0383209228515625, "learning_rate": 0.0001, "loss": 5.6164, "loss/crossentropy": 2.49069881439209, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.167646124958992, "step": 23920 }, { "epoch": 0.7475625, "grad_norm": 3.03125, "grad_norm_var": 0.0501953125, "learning_rate": 0.0001, "loss": 5.6284, "loss/crossentropy": 2.4274455308914185, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16697538644075394, "step": 23922 }, { "epoch": 0.747625, "grad_norm": 3.046875, "grad_norm_var": 0.0490142822265625, "learning_rate": 0.0001, "loss": 5.1744, "loss/crossentropy": 2.2075916528701782, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15410692989826202, "step": 23924 }, { "epoch": 0.7476875, "grad_norm": 3.234375, "grad_norm_var": 0.05569254557291667, "learning_rate": 0.0001, "loss": 5.4039, "loss/crossentropy": 2.415930151939392, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15426428616046906, "step": 23926 }, { "epoch": 0.74775, "grad_norm": 3.34375, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 5.744, "loss/crossentropy": 2.6106520891189575, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16646113991737366, "step": 23928 }, { "epoch": 0.7478125, "grad_norm": 3.015625, "grad_norm_var": 0.04761962890625, "learning_rate": 0.0001, "loss": 5.4452, "loss/crossentropy": 2.40711510181427, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15966404229402542, "step": 23930 }, { "epoch": 0.747875, "grad_norm": 4.21875, "grad_norm_var": 0.12275390625, "learning_rate": 0.0001, "loss": 5.8813, "loss/crossentropy": 2.5960358381271362, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.1742255985736847, "step": 23932 }, { "epoch": 0.7479375, "grad_norm": 3.46875, "grad_norm_var": 0.14806315104166667, "learning_rate": 0.0001, "loss": 5.5221, "loss/crossentropy": 2.374852418899536, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.16120747476816177, "step": 23934 }, { "epoch": 0.748, "grad_norm": 3.265625, "grad_norm_var": 0.14806315104166667, "learning_rate": 0.0001, "loss": 5.6962, "loss/crossentropy": 2.648604393005371, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15671633183956146, "step": 23936 }, { "epoch": 0.7480625, "grad_norm": 3.171875, "grad_norm_var": 0.1526519775390625, "learning_rate": 0.0001, "loss": 5.2132, "loss/crossentropy": 2.3242075443267822, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.14671359956264496, "step": 23938 }, { "epoch": 0.748125, "grad_norm": 2.921875, "grad_norm_var": 0.16282145182291666, "learning_rate": 0.0001, "loss": 5.6767, "loss/crossentropy": 2.611236333847046, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15967395156621933, "step": 23940 }, { "epoch": 0.7481875, "grad_norm": 3.046875, "grad_norm_var": 0.1510650634765625, "learning_rate": 0.0001, "loss": 5.8731, "loss/crossentropy": 2.672127604484558, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17399899661540985, "step": 23942 }, { "epoch": 0.74825, "grad_norm": 2.96875, "grad_norm_var": 0.15441792805989582, "learning_rate": 0.0001, "loss": 5.5507, "loss/crossentropy": 2.551435947418213, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15734337270259857, "step": 23944 }, { "epoch": 0.7483125, "grad_norm": 3.390625, "grad_norm_var": 0.15718485514322916, "learning_rate": 0.0001, "loss": 5.6394, "loss/crossentropy": 2.5235499143600464, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16393250226974487, "step": 23946 }, { "epoch": 0.748375, "grad_norm": 3.1875, "grad_norm_var": 0.08142801920572916, "learning_rate": 0.0001, "loss": 5.7568, "loss/crossentropy": 2.6940174102783203, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16408731043338776, "step": 23948 }, { "epoch": 0.7484375, "grad_norm": 2.9375, "grad_norm_var": 0.06448465983072917, "learning_rate": 0.0001, "loss": 5.6808, "loss/crossentropy": 2.5052374601364136, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17029106616973877, "step": 23950 }, { "epoch": 0.7485, "grad_norm": 3.40625, "grad_norm_var": 0.0692535400390625, "learning_rate": 0.0001, "loss": 5.6076, "loss/crossentropy": 2.44515597820282, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1674189642071724, "step": 23952 }, { "epoch": 0.7485625, "grad_norm": 3.59375, "grad_norm_var": 0.1969879150390625, "learning_rate": 0.0001, "loss": 5.9776, "loss/crossentropy": 2.746953010559082, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1730656549334526, "step": 23954 }, { "epoch": 0.748625, "grad_norm": 3.21875, "grad_norm_var": 0.1804840087890625, "learning_rate": 0.0001, "loss": 5.5424, "loss/crossentropy": 2.4625093936920166, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16267801821231842, "step": 23956 }, { "epoch": 0.7486875, "grad_norm": 3.046875, "grad_norm_var": 0.1767974853515625, "learning_rate": 0.0001, "loss": 5.5192, "loss/crossentropy": 2.421678066253662, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1659986898303032, "step": 23958 }, { "epoch": 0.74875, "grad_norm": 2.8125, "grad_norm_var": 0.17277730305989583, "learning_rate": 0.0001, "loss": 5.3055, "loss/crossentropy": 2.3576548099517822, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1502518653869629, "step": 23960 }, { "epoch": 0.7488125, "grad_norm": 3.140625, "grad_norm_var": 0.17234700520833332, "learning_rate": 0.0001, "loss": 5.7065, "loss/crossentropy": 2.5937485694885254, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16517798602581024, "step": 23962 }, { "epoch": 0.748875, "grad_norm": 3.390625, "grad_norm_var": 0.16630859375, "learning_rate": 0.0001, "loss": 5.3547, "loss/crossentropy": 2.3310216665267944, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15549761056900024, "step": 23964 }, { "epoch": 0.7489375, "grad_norm": 3.234375, "grad_norm_var": 0.1443359375, "learning_rate": 0.0001, "loss": 5.9194, "loss/crossentropy": 2.7323626279830933, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1690945252776146, "step": 23966 }, { "epoch": 0.749, "grad_norm": 2.96875, "grad_norm_var": 0.15989481608072917, "learning_rate": 0.0001, "loss": 5.2454, "loss/crossentropy": 2.316407322883606, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15227185189723969, "step": 23968 }, { "epoch": 0.7490625, "grad_norm": 2.9375, "grad_norm_var": 0.027457682291666667, "learning_rate": 0.0001, "loss": 5.5194, "loss/crossentropy": 2.451360821723938, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16305788606405258, "step": 23970 }, { "epoch": 0.749125, "grad_norm": 3.28125, "grad_norm_var": 0.031956990559895836, "learning_rate": 0.0001, "loss": 5.4111, "loss/crossentropy": 2.38328218460083, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15473628044128418, "step": 23972 }, { "epoch": 0.7491875, "grad_norm": 3.40625, "grad_norm_var": 0.036961873372395836, "learning_rate": 0.0001, "loss": 5.7902, "loss/crossentropy": 2.6268088817596436, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16868211328983307, "step": 23974 }, { "epoch": 0.74925, "grad_norm": 3.15625, "grad_norm_var": 0.0292388916015625, "learning_rate": 0.0001, "loss": 5.7644, "loss/crossentropy": 2.583236336708069, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17045753449201584, "step": 23976 }, { "epoch": 0.7493125, "grad_norm": 2.796875, "grad_norm_var": 0.03853251139322917, "learning_rate": 0.0001, "loss": 5.3857, "loss/crossentropy": 2.4263709783554077, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1506212204694748, "step": 23978 }, { "epoch": 0.749375, "grad_norm": 3.0625, "grad_norm_var": 0.0349517822265625, "learning_rate": 0.0001, "loss": 5.8157, "loss/crossentropy": 2.6879948377609253, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1662856861948967, "step": 23980 }, { "epoch": 0.7494375, "grad_norm": 3.015625, "grad_norm_var": 0.03521728515625, "learning_rate": 0.0001, "loss": 5.4518, "loss/crossentropy": 2.350409746170044, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16052792966365814, "step": 23982 }, { "epoch": 0.7495, "grad_norm": 3.046875, "grad_norm_var": 0.0280181884765625, "learning_rate": 0.0001, "loss": 5.2946, "loss/crossentropy": 2.3216720819473267, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15119792520999908, "step": 23984 }, { "epoch": 0.7495625, "grad_norm": 3.046875, "grad_norm_var": 0.026366170247395834, "learning_rate": 0.0001, "loss": 5.3652, "loss/crossentropy": 2.35917329788208, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15685203671455383, "step": 23986 }, { "epoch": 0.749625, "grad_norm": 3.03125, "grad_norm_var": 0.02017822265625, "learning_rate": 0.0001, "loss": 5.545, "loss/crossentropy": 2.5058690309524536, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15938085317611694, "step": 23988 }, { "epoch": 0.7496875, "grad_norm": 3.296875, "grad_norm_var": 0.013277180989583333, "learning_rate": 0.0001, "loss": 5.814, "loss/crossentropy": 2.5942448377609253, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17197741568088531, "step": 23990 }, { "epoch": 0.74975, "grad_norm": 3.09375, "grad_norm_var": 0.013765462239583333, "learning_rate": 0.0001, "loss": 5.4701, "loss/crossentropy": 2.4266786575317383, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1578565388917923, "step": 23992 }, { "epoch": 0.7498125, "grad_norm": 3.109375, "grad_norm_var": 0.011668904622395834, "learning_rate": 0.0001, "loss": 5.4139, "loss/crossentropy": 2.3503929376602173, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1567407101392746, "step": 23994 }, { "epoch": 0.749875, "grad_norm": 3.625, "grad_norm_var": 0.026725260416666667, "learning_rate": 0.0001, "loss": 6.0104, "loss/crossentropy": 2.7236788272857666, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17789245396852493, "step": 23996 }, { "epoch": 0.7499375, "grad_norm": 2.953125, "grad_norm_var": 0.027490234375, "learning_rate": 0.0001, "loss": 5.7012, "loss/crossentropy": 2.6194682121276855, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16364064812660217, "step": 23998 }, { "epoch": 0.75, "grad_norm": 3.4375, "grad_norm_var": 0.03530171712239583, "learning_rate": 0.0001, "loss": 5.6843, "loss/crossentropy": 2.554969072341919, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16254231333732605, "step": 24000 }, { "epoch": 0.7500625, "grad_norm": 2.703125, "grad_norm_var": 0.04576416015625, "learning_rate": 0.0001, "loss": 5.3964, "loss/crossentropy": 2.4495993852615356, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15171610563993454, "step": 24002 }, { "epoch": 0.750125, "grad_norm": 2.921875, "grad_norm_var": 0.051081339518229164, "learning_rate": 0.0001, "loss": 5.7108, "loss/crossentropy": 2.550310969352722, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1687837466597557, "step": 24004 }, { "epoch": 0.7501875, "grad_norm": 3.5, "grad_norm_var": 0.05676981608072917, "learning_rate": 0.0001, "loss": 5.9811, "loss/crossentropy": 2.6983426809310913, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.17554175853729248, "step": 24006 }, { "epoch": 0.75025, "grad_norm": 3.296875, "grad_norm_var": 0.05952046712239583, "learning_rate": 0.0001, "loss": 5.6996, "loss/crossentropy": 2.505595564842224, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17330221831798553, "step": 24008 }, { "epoch": 0.7503125, "grad_norm": 3.203125, "grad_norm_var": 0.05813802083333333, "learning_rate": 0.0001, "loss": 5.6695, "loss/crossentropy": 2.5647228956222534, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.159697987139225, "step": 24010 }, { "epoch": 0.750375, "grad_norm": 3.0625, "grad_norm_var": 0.048567708333333334, "learning_rate": 0.0001, "loss": 5.7166, "loss/crossentropy": 2.544753909111023, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17030484229326248, "step": 24012 }, { "epoch": 0.7504375, "grad_norm": 3.078125, "grad_norm_var": 0.052245076497395834, "learning_rate": 0.0001, "loss": 5.3793, "loss/crossentropy": 2.4289255142211914, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15284773707389832, "step": 24014 }, { "epoch": 0.7505, "grad_norm": 3.078125, "grad_norm_var": 0.04543355305989583, "learning_rate": 0.0001, "loss": 5.5457, "loss/crossentropy": 2.4863643646240234, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16062580794095993, "step": 24016 }, { "epoch": 0.7505625, "grad_norm": 2.78125, "grad_norm_var": 0.047607421875, "learning_rate": 0.0001, "loss": 5.5945, "loss/crossentropy": 2.5885279178619385, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15763090550899506, "step": 24018 }, { "epoch": 0.750625, "grad_norm": 2.9375, "grad_norm_var": 0.04299723307291667, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.659751534461975, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15985984355211258, "step": 24020 }, { "epoch": 0.7506875, "grad_norm": 3.015625, "grad_norm_var": 0.031966145833333334, "learning_rate": 0.0001, "loss": 5.7268, "loss/crossentropy": 2.62524950504303, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16210521757602692, "step": 24022 }, { "epoch": 0.75075, "grad_norm": 3.203125, "grad_norm_var": 0.0222320556640625, "learning_rate": 0.0001, "loss": 5.5927, "loss/crossentropy": 2.542557954788208, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1589244231581688, "step": 24024 }, { "epoch": 0.7508125, "grad_norm": 3.359375, "grad_norm_var": 0.028120930989583334, "learning_rate": 0.0001, "loss": 5.8997, "loss/crossentropy": 2.760553002357483, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16665276139974594, "step": 24026 }, { "epoch": 0.750875, "grad_norm": 3.0625, "grad_norm_var": 0.0209625244140625, "learning_rate": 0.0001, "loss": 5.6023, "loss/crossentropy": 2.5729691982269287, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1595735400915146, "step": 24028 }, { "epoch": 0.7509375, "grad_norm": 3.203125, "grad_norm_var": 0.021451822916666665, "learning_rate": 0.0001, "loss": 5.6939, "loss/crossentropy": 2.6090248823165894, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16356821358203888, "step": 24030 }, { "epoch": 0.751, "grad_norm": 2.828125, "grad_norm_var": 0.023566691080729167, "learning_rate": 0.0001, "loss": 5.3827, "loss/crossentropy": 2.3962953090667725, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15567557513713837, "step": 24032 }, { "epoch": 0.7510625, "grad_norm": 3.1875, "grad_norm_var": 0.018929036458333333, "learning_rate": 0.0001, "loss": 5.4211, "loss/crossentropy": 2.427218198776245, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1536843404173851, "step": 24034 }, { "epoch": 0.751125, "grad_norm": 3.109375, "grad_norm_var": 0.018648274739583335, "learning_rate": 0.0001, "loss": 5.85, "loss/crossentropy": 2.7219003438949585, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16593755781650543, "step": 24036 }, { "epoch": 0.7511875, "grad_norm": 3.1875, "grad_norm_var": 0.021735636393229167, "learning_rate": 0.0001, "loss": 5.9337, "loss/crossentropy": 2.666556239128113, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17593314498662949, "step": 24038 }, { "epoch": 0.75125, "grad_norm": 2.921875, "grad_norm_var": 0.021903483072916667, "learning_rate": 0.0001, "loss": 5.3222, "loss/crossentropy": 2.3676997423171997, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1548285260796547, "step": 24040 }, { "epoch": 0.7513125, "grad_norm": 2.9375, "grad_norm_var": 0.015461222330729166, "learning_rate": 0.0001, "loss": 5.5964, "loss/crossentropy": 2.4753365516662598, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16522684693336487, "step": 24042 }, { "epoch": 0.751375, "grad_norm": 2.984375, "grad_norm_var": 0.013981119791666666, "learning_rate": 0.0001, "loss": 5.5272, "loss/crossentropy": 2.5083736181259155, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15735136717557907, "step": 24044 }, { "epoch": 0.7514375, "grad_norm": 2.875, "grad_norm_var": 0.014762369791666667, "learning_rate": 0.0001, "loss": 5.1681, "loss/crossentropy": 2.299069046974182, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14627868682146072, "step": 24046 }, { "epoch": 0.7515, "grad_norm": 3.09375, "grad_norm_var": 0.012007649739583333, "learning_rate": 0.0001, "loss": 5.7343, "loss/crossentropy": 2.62216579914093, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16551008820533752, "step": 24048 }, { "epoch": 0.7515625, "grad_norm": 3.140625, "grad_norm_var": 0.013890584309895834, "learning_rate": 0.0001, "loss": 5.6754, "loss/crossentropy": 2.5800259113311768, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.161882646381855, "step": 24050 }, { "epoch": 0.751625, "grad_norm": 2.984375, "grad_norm_var": 0.0150054931640625, "learning_rate": 0.0001, "loss": 5.5522, "loss/crossentropy": 2.519499659538269, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16030453890562057, "step": 24052 }, { "epoch": 0.7516875, "grad_norm": 3.046875, "grad_norm_var": 0.0165435791015625, "learning_rate": 0.0001, "loss": 5.7906, "loss/crossentropy": 2.6092609167099, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16735679656267166, "step": 24054 }, { "epoch": 0.75175, "grad_norm": 3.140625, "grad_norm_var": 0.0208984375, "learning_rate": 0.0001, "loss": 5.5616, "loss/crossentropy": 2.5605705976486206, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15869764983654022, "step": 24056 }, { "epoch": 0.7518125, "grad_norm": 2.953125, "grad_norm_var": 0.0206451416015625, "learning_rate": 0.0001, "loss": 5.6485, "loss/crossentropy": 2.631874203681946, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15752369165420532, "step": 24058 }, { "epoch": 0.751875, "grad_norm": 2.75, "grad_norm_var": 0.026497395833333333, "learning_rate": 0.0001, "loss": 5.5585, "loss/crossentropy": 2.5394619703292847, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15971557796001434, "step": 24060 }, { "epoch": 0.7519375, "grad_norm": 3.15625, "grad_norm_var": 0.024409993489583334, "learning_rate": 0.0001, "loss": 5.7917, "loss/crossentropy": 2.7013940811157227, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16293393820524216, "step": 24062 }, { "epoch": 0.752, "grad_norm": 3.171875, "grad_norm_var": 0.025153605143229167, "learning_rate": 0.0001, "loss": 5.8292, "loss/crossentropy": 2.7043410539627075, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16717380285263062, "step": 24064 }, { "epoch": 0.7520625, "grad_norm": 2.953125, "grad_norm_var": 0.022777303059895834, "learning_rate": 0.0001, "loss": 5.4734, "loss/crossentropy": 2.470320701599121, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1569492593407631, "step": 24066 }, { "epoch": 0.752125, "grad_norm": 3.1875, "grad_norm_var": 0.023631795247395834, "learning_rate": 0.0001, "loss": 5.554, "loss/crossentropy": 2.4775390625, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16311536729335785, "step": 24068 }, { "epoch": 0.7521875, "grad_norm": 3.390625, "grad_norm_var": 0.027220662434895834, "learning_rate": 0.0001, "loss": 5.5499, "loss/crossentropy": 2.4541794061660767, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15956879407167435, "step": 24070 }, { "epoch": 0.75225, "grad_norm": 3.59375, "grad_norm_var": 0.0439605712890625, "learning_rate": 0.0001, "loss": 5.2566, "loss/crossentropy": 2.3025163412094116, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14736465364694595, "step": 24072 }, { "epoch": 0.7523125, "grad_norm": 3.4375, "grad_norm_var": 0.04894205729166667, "learning_rate": 0.0001, "loss": 5.9081, "loss/crossentropy": 2.5796762704849243, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17971716821193695, "step": 24074 }, { "epoch": 0.752375, "grad_norm": 3.234375, "grad_norm_var": 0.04103902180989583, "learning_rate": 0.0001, "loss": 5.484, "loss/crossentropy": 2.3495601415634155, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16579163074493408, "step": 24076 }, { "epoch": 0.7524375, "grad_norm": 3.4375, "grad_norm_var": 0.07107747395833333, "learning_rate": 0.0001, "loss": 5.9585, "loss/crossentropy": 2.749734044075012, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.16814526170492172, "step": 24078 }, { "epoch": 0.7525, "grad_norm": 3.1875, "grad_norm_var": 0.07060445149739583, "learning_rate": 0.0001, "loss": 5.6366, "loss/crossentropy": 2.507352113723755, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16760773211717606, "step": 24080 }, { "epoch": 0.7525625, "grad_norm": 2.75, "grad_norm_var": 0.09156494140625, "learning_rate": 0.0001, "loss": 5.5621, "loss/crossentropy": 2.5038325786590576, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15777898579835892, "step": 24082 }, { "epoch": 0.752625, "grad_norm": 2.96875, "grad_norm_var": 0.09780171712239584, "learning_rate": 0.0001, "loss": 5.8253, "loss/crossentropy": 2.680901050567627, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1683458387851715, "step": 24084 }, { "epoch": 0.7526875, "grad_norm": 2.984375, "grad_norm_var": 0.09470926920572917, "learning_rate": 0.0001, "loss": 5.4327, "loss/crossentropy": 2.367120862007141, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16280563920736313, "step": 24086 }, { "epoch": 0.75275, "grad_norm": 3.09375, "grad_norm_var": 0.0748687744140625, "learning_rate": 0.0001, "loss": 5.2026, "loss/crossentropy": 2.247442364692688, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15371931344270706, "step": 24088 }, { "epoch": 0.7528125, "grad_norm": 3.078125, "grad_norm_var": 0.08723551432291667, "learning_rate": 0.0001, "loss": 5.3154, "loss/crossentropy": 2.3488672971725464, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15368030965328217, "step": 24090 }, { "epoch": 0.752875, "grad_norm": 2.90625, "grad_norm_var": 0.08950093587239584, "learning_rate": 0.0001, "loss": 5.4821, "loss/crossentropy": 2.4977235794067383, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15391092747449875, "step": 24092 }, { "epoch": 0.7529375, "grad_norm": 3.21875, "grad_norm_var": 0.04742431640625, "learning_rate": 0.0001, "loss": 5.6068, "loss/crossentropy": 2.4802398681640625, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.166949562728405, "step": 24094 }, { "epoch": 0.753, "grad_norm": 3.609375, "grad_norm_var": 0.06437886555989583, "learning_rate": 0.0001, "loss": 5.3373, "loss/crossentropy": 2.3183417320251465, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15814294666051865, "step": 24096 }, { "epoch": 0.7530625, "grad_norm": 3.140625, "grad_norm_var": 0.039449055989583336, "learning_rate": 0.0001, "loss": 5.7298, "loss/crossentropy": 2.5677947998046875, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17049698531627655, "step": 24098 }, { "epoch": 0.753125, "grad_norm": 2.96875, "grad_norm_var": 0.0394927978515625, "learning_rate": 0.0001, "loss": 5.4705, "loss/crossentropy": 2.3900952339172363, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16390328109264374, "step": 24100 }, { "epoch": 0.7531875, "grad_norm": 2.890625, "grad_norm_var": 0.04500223795572917, "learning_rate": 0.0001, "loss": 5.4324, "loss/crossentropy": 2.4493796825408936, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15650839358568192, "step": 24102 }, { "epoch": 0.75325, "grad_norm": 3.578125, "grad_norm_var": 0.05972900390625, "learning_rate": 0.0001, "loss": 5.455, "loss/crossentropy": 2.44339120388031, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15428748726844788, "step": 24104 }, { "epoch": 0.7533125, "grad_norm": 2.921875, "grad_norm_var": 0.05113932291666667, "learning_rate": 0.0001, "loss": 5.6347, "loss/crossentropy": 2.5348976850509644, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16389034688472748, "step": 24106 }, { "epoch": 0.753375, "grad_norm": 3.515625, "grad_norm_var": 0.05664774576822917, "learning_rate": 0.0001, "loss": 5.453, "loss/crossentropy": 2.435604214668274, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1544729620218277, "step": 24108 }, { "epoch": 0.7534375, "grad_norm": 3.25, "grad_norm_var": 0.061864217122395836, "learning_rate": 0.0001, "loss": 5.7974, "loss/crossentropy": 2.584768056869507, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1712661236524582, "step": 24110 }, { "epoch": 0.7535, "grad_norm": 3.09375, "grad_norm_var": 0.05432942708333333, "learning_rate": 0.0001, "loss": 5.8904, "loss/crossentropy": 2.7961983680725098, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.163322813808918, "step": 24112 }, { "epoch": 0.7535625, "grad_norm": 3.03125, "grad_norm_var": 0.05172526041666667, "learning_rate": 0.0001, "loss": 5.516, "loss/crossentropy": 2.4154030084609985, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16122816503047943, "step": 24114 }, { "epoch": 0.753625, "grad_norm": 2.953125, "grad_norm_var": 0.07306315104166666, "learning_rate": 0.0001, "loss": 5.6623, "loss/crossentropy": 2.535654902458191, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16110121458768845, "step": 24116 }, { "epoch": 0.7536875, "grad_norm": 3.25, "grad_norm_var": 0.06221415201822917, "learning_rate": 0.0001, "loss": 6.0447, "loss/crossentropy": 2.713833808898926, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17995687574148178, "step": 24118 }, { "epoch": 0.75375, "grad_norm": 2.953125, "grad_norm_var": 0.05815327962239583, "learning_rate": 0.0001, "loss": 5.6069, "loss/crossentropy": 2.5649408102035522, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1569312885403633, "step": 24120 }, { "epoch": 0.7538125, "grad_norm": 2.90625, "grad_norm_var": 0.05870768229166667, "learning_rate": 0.0001, "loss": 5.3703, "loss/crossentropy": 2.4001930952072144, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15130487829446793, "step": 24122 }, { "epoch": 0.753875, "grad_norm": 3.296875, "grad_norm_var": 0.053587849934895834, "learning_rate": 0.0001, "loss": 5.7407, "loss/crossentropy": 2.560784935951233, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17228714376688004, "step": 24124 }, { "epoch": 0.7539375, "grad_norm": 3.203125, "grad_norm_var": 0.04698893229166667, "learning_rate": 0.0001, "loss": 5.7398, "loss/crossentropy": 2.5797245502471924, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16640272736549377, "step": 24126 }, { "epoch": 0.754, "grad_norm": 3.265625, "grad_norm_var": 0.045344034830729164, "learning_rate": 0.0001, "loss": 5.8133, "loss/crossentropy": 2.6464954614639282, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1682446300983429, "step": 24128 }, { "epoch": 0.7540625, "grad_norm": 3.0625, "grad_norm_var": 0.04700520833333333, "learning_rate": 0.0001, "loss": 5.791, "loss/crossentropy": 2.6895121335983276, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1621066927909851, "step": 24130 }, { "epoch": 0.754125, "grad_norm": 3.078125, "grad_norm_var": 0.02320556640625, "learning_rate": 0.0001, "loss": 5.944, "loss/crossentropy": 2.7760531902313232, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16796286404132843, "step": 24132 }, { "epoch": 0.7541875, "grad_norm": 3.671875, "grad_norm_var": 0.0392242431640625, "learning_rate": 0.0001, "loss": 5.3381, "loss/crossentropy": 2.2714842557907104, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15861865878105164, "step": 24134 }, { "epoch": 0.75425, "grad_norm": 3.0625, "grad_norm_var": 0.0407379150390625, "learning_rate": 0.0001, "loss": 5.4312, "loss/crossentropy": 2.505277156829834, "loss/hidden": 1.390625, "loss/jsd": 0.0, "loss/logits": 0.15353471040725708, "step": 24136 }, { "epoch": 0.7543125, "grad_norm": 3.171875, "grad_norm_var": 0.0370025634765625, "learning_rate": 0.0001, "loss": 5.5424, "loss/crossentropy": 2.5124454498291016, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15768658369779587, "step": 24138 }, { "epoch": 0.754375, "grad_norm": 3.1875, "grad_norm_var": 0.034012858072916666, "learning_rate": 0.0001, "loss": 5.7433, "loss/crossentropy": 2.6044552326202393, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.165834940969944, "step": 24140 }, { "epoch": 0.7544375, "grad_norm": 2.84375, "grad_norm_var": 0.039183553059895834, "learning_rate": 0.0001, "loss": 5.6496, "loss/crossentropy": 2.5876888036727905, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16127284616231918, "step": 24142 }, { "epoch": 0.7545, "grad_norm": 3.25, "grad_norm_var": 0.05074462890625, "learning_rate": 0.0001, "loss": 5.0513, "loss/crossentropy": 2.2023468017578125, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14427007734775543, "step": 24144 }, { "epoch": 0.7545625, "grad_norm": 3.1875, "grad_norm_var": 0.05077718098958333, "learning_rate": 0.0001, "loss": 5.8935, "loss/crossentropy": 2.7609182596206665, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1679467335343361, "step": 24146 }, { "epoch": 0.754625, "grad_norm": 3.203125, "grad_norm_var": 0.0512603759765625, "learning_rate": 0.0001, "loss": 5.6736, "loss/crossentropy": 2.554359197616577, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16622508317232132, "step": 24148 }, { "epoch": 0.7546875, "grad_norm": 2.609375, "grad_norm_var": 0.042708333333333334, "learning_rate": 0.0001, "loss": 5.3942, "loss/crossentropy": 2.432632327079773, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15279417484998703, "step": 24150 }, { "epoch": 0.75475, "grad_norm": 3.1875, "grad_norm_var": 0.04178059895833333, "learning_rate": 0.0001, "loss": 5.6383, "loss/crossentropy": 2.4687283039093018, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.1642264425754547, "step": 24152 }, { "epoch": 0.7548125, "grad_norm": 3.171875, "grad_norm_var": 0.041727701822916664, "learning_rate": 0.0001, "loss": 5.6806, "loss/crossentropy": 2.546117663383484, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16383732110261917, "step": 24154 }, { "epoch": 0.754875, "grad_norm": 3.515625, "grad_norm_var": 0.05084228515625, "learning_rate": 0.0001, "loss": 5.9499, "loss/crossentropy": 2.7119476795196533, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17652510851621628, "step": 24156 }, { "epoch": 0.7549375, "grad_norm": 3.203125, "grad_norm_var": 0.04735921223958333, "learning_rate": 0.0001, "loss": 5.6438, "loss/crossentropy": 2.5351009368896484, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16360601782798767, "step": 24158 }, { "epoch": 0.755, "grad_norm": 2.734375, "grad_norm_var": 0.04263916015625, "learning_rate": 0.0001, "loss": 5.3956, "loss/crossentropy": 2.4043564796447754, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1526353657245636, "step": 24160 }, { "epoch": 0.7550625, "grad_norm": 3.140625, "grad_norm_var": 0.0416168212890625, "learning_rate": 0.0001, "loss": 5.69, "loss/crossentropy": 2.6212981939315796, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16077616065740585, "step": 24162 }, { "epoch": 0.755125, "grad_norm": 2.9375, "grad_norm_var": 0.045263671875, "learning_rate": 0.0001, "loss": 5.4384, "loss/crossentropy": 2.402511715888977, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15710320323705673, "step": 24164 }, { "epoch": 0.7551875, "grad_norm": 3.046875, "grad_norm_var": 0.029520670572916668, "learning_rate": 0.0001, "loss": 5.6641, "loss/crossentropy": 2.587397575378418, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16352634876966476, "step": 24166 }, { "epoch": 0.75525, "grad_norm": 3.234375, "grad_norm_var": 0.04068603515625, "learning_rate": 0.0001, "loss": 6.0794, "loss/crossentropy": 2.8126444816589355, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17628030478954315, "step": 24168 }, { "epoch": 0.7553125, "grad_norm": 3.40625, "grad_norm_var": 0.0530426025390625, "learning_rate": 0.0001, "loss": 5.724, "loss/crossentropy": 2.6390836238861084, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16162113845348358, "step": 24170 }, { "epoch": 0.755375, "grad_norm": 3.1875, "grad_norm_var": 0.04641011555989583, "learning_rate": 0.0001, "loss": 5.6136, "loss/crossentropy": 2.4587961435317993, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16548041254281998, "step": 24172 }, { "epoch": 0.7554375, "grad_norm": 3.203125, "grad_norm_var": 0.046630859375, "learning_rate": 0.0001, "loss": 5.5038, "loss/crossentropy": 2.445357918739319, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16209685802459717, "step": 24174 }, { "epoch": 0.7555, "grad_norm": 3.328125, "grad_norm_var": 0.036279296875, "learning_rate": 0.0001, "loss": 5.8233, "loss/crossentropy": 2.671785831451416, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16632480919361115, "step": 24176 }, { "epoch": 0.7555625, "grad_norm": 2.96875, "grad_norm_var": 0.044774373372395836, "learning_rate": 0.0001, "loss": 5.543, "loss/crossentropy": 2.4994109869003296, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15983228385448456, "step": 24178 }, { "epoch": 0.755625, "grad_norm": 3.28125, "grad_norm_var": 0.04189351399739583, "learning_rate": 0.0001, "loss": 5.7399, "loss/crossentropy": 2.571989893913269, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17148178815841675, "step": 24180 }, { "epoch": 0.7556875, "grad_norm": 2.859375, "grad_norm_var": 0.05201416015625, "learning_rate": 0.0001, "loss": 5.3768, "loss/crossentropy": 2.479288935661316, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.15029726922512054, "step": 24182 }, { "epoch": 0.75575, "grad_norm": 3.390625, "grad_norm_var": 0.08507486979166666, "learning_rate": 0.0001, "loss": 5.5751, "loss/crossentropy": 2.4374505281448364, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16337814927101135, "step": 24184 }, { "epoch": 0.7558125, "grad_norm": 2.84375, "grad_norm_var": 0.08245035807291666, "learning_rate": 0.0001, "loss": 5.8305, "loss/crossentropy": 2.621421456336975, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1713024377822876, "step": 24186 }, { "epoch": 0.755875, "grad_norm": 3.078125, "grad_norm_var": 0.08056233723958334, "learning_rate": 0.0001, "loss": 5.6943, "loss/crossentropy": 2.602727174758911, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1630595624446869, "step": 24188 }, { "epoch": 0.7559375, "grad_norm": 3.3125, "grad_norm_var": 0.08196614583333334, "learning_rate": 0.0001, "loss": 5.978, "loss/crossentropy": 2.690164089202881, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.181515172123909, "step": 24190 }, { "epoch": 0.756, "grad_norm": 3.25, "grad_norm_var": 0.08019205729166666, "learning_rate": 0.0001, "loss": 5.5361, "loss/crossentropy": 2.4419296979904175, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16370952129364014, "step": 24192 }, { "epoch": 0.7560625, "grad_norm": 3.0625, "grad_norm_var": 0.07509663899739584, "learning_rate": 0.0001, "loss": 5.7707, "loss/crossentropy": 2.5682854652404785, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1702408790588379, "step": 24194 }, { "epoch": 0.756125, "grad_norm": 2.984375, "grad_norm_var": 0.0865386962890625, "learning_rate": 0.0001, "loss": 5.2196, "loss/crossentropy": 2.2683480978012085, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1498098075389862, "step": 24196 }, { "epoch": 0.7561875, "grad_norm": 6.875, "grad_norm_var": 0.9197092692057292, "learning_rate": 0.0001, "loss": 5.7087, "loss/crossentropy": 2.475526809692383, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1752721145749092, "step": 24198 }, { "epoch": 0.75625, "grad_norm": 2.984375, "grad_norm_var": 0.9122233072916667, "learning_rate": 0.0001, "loss": 5.7623, "loss/crossentropy": 2.6058956384658813, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16954650729894638, "step": 24200 }, { "epoch": 0.7563125, "grad_norm": 3.203125, "grad_norm_var": 0.9079386393229166, "learning_rate": 0.0001, "loss": 5.544, "loss/crossentropy": 2.4620473384857178, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16171350330114365, "step": 24202 }, { "epoch": 0.756375, "grad_norm": 3.1875, "grad_norm_var": 0.90953369140625, "learning_rate": 0.0001, "loss": 5.8217, "loss/crossentropy": 2.6526798009872437, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16534314304590225, "step": 24204 }, { "epoch": 0.7564375, "grad_norm": 3.078125, "grad_norm_var": 0.9124837239583333, "learning_rate": 0.0001, "loss": 5.3978, "loss/crossentropy": 2.3579283952713013, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1602371633052826, "step": 24206 }, { "epoch": 0.7565, "grad_norm": 3.03125, "grad_norm_var": 0.9220123291015625, "learning_rate": 0.0001, "loss": 5.357, "loss/crossentropy": 2.4251667261123657, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15295164287090302, "step": 24208 }, { "epoch": 0.7565625, "grad_norm": 2.953125, "grad_norm_var": 0.9258860270182292, "learning_rate": 0.0001, "loss": 5.611, "loss/crossentropy": 2.51955783367157, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1642208844423294, "step": 24210 }, { "epoch": 0.756625, "grad_norm": 3.125, "grad_norm_var": 0.9121897379557292, "learning_rate": 0.0001, "loss": 5.5745, "loss/crossentropy": 2.4795562028884888, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16067060083150864, "step": 24212 }, { "epoch": 0.7566875, "grad_norm": 3.234375, "grad_norm_var": 0.027424112955729166, "learning_rate": 0.0001, "loss": 5.6013, "loss/crossentropy": 2.609973669052124, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15577159821987152, "step": 24214 }, { "epoch": 0.75675, "grad_norm": 3.375, "grad_norm_var": 0.030467732747395834, "learning_rate": 0.0001, "loss": 6.1636, "loss/crossentropy": 2.8224769830703735, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.18567093461751938, "step": 24216 }, { "epoch": 0.7568125, "grad_norm": 3.046875, "grad_norm_var": 0.030192057291666668, "learning_rate": 0.0001, "loss": 5.4149, "loss/crossentropy": 2.4502590894699097, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1542799025774002, "step": 24218 }, { "epoch": 0.756875, "grad_norm": 3.203125, "grad_norm_var": 0.030980428059895832, "learning_rate": 0.0001, "loss": 5.6365, "loss/crossentropy": 2.582989811897278, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16082438826560974, "step": 24220 }, { "epoch": 0.7569375, "grad_norm": 3.21875, "grad_norm_var": 0.032689412434895836, "learning_rate": 0.0001, "loss": 5.5686, "loss/crossentropy": 2.437768340110779, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1681620180606842, "step": 24222 }, { "epoch": 0.757, "grad_norm": 2.984375, "grad_norm_var": 0.037495930989583336, "learning_rate": 0.0001, "loss": 5.5878, "loss/crossentropy": 2.5189844369888306, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16156598925590515, "step": 24224 }, { "epoch": 0.7570625, "grad_norm": 3.234375, "grad_norm_var": 0.03509114583333333, "learning_rate": 0.0001, "loss": 5.5288, "loss/crossentropy": 2.4203603267669678, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16396590322256088, "step": 24226 }, { "epoch": 0.757125, "grad_norm": 2.875, "grad_norm_var": 0.036351521809895836, "learning_rate": 0.0001, "loss": 5.7988, "loss/crossentropy": 2.723006010055542, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16305013000965118, "step": 24228 }, { "epoch": 0.7571875, "grad_norm": 3.046875, "grad_norm_var": 0.0280426025390625, "learning_rate": 0.0001, "loss": 5.783, "loss/crossentropy": 2.657711148262024, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16253386437892914, "step": 24230 }, { "epoch": 0.75725, "grad_norm": 3.359375, "grad_norm_var": 0.025780232747395833, "learning_rate": 0.0001, "loss": 5.4359, "loss/crossentropy": 2.420749545097351, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15542279183864594, "step": 24232 }, { "epoch": 0.7573125, "grad_norm": 2.8125, "grad_norm_var": 0.036214192708333336, "learning_rate": 0.0001, "loss": 5.6808, "loss/crossentropy": 2.5912322998046875, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16247299313545227, "step": 24234 }, { "epoch": 0.757375, "grad_norm": 2.984375, "grad_norm_var": 0.03571675618489583, "learning_rate": 0.0001, "loss": 5.4814, "loss/crossentropy": 2.444804072380066, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15600281953811646, "step": 24236 }, { "epoch": 0.7574375, "grad_norm": 3.15625, "grad_norm_var": 0.03280843098958333, "learning_rate": 0.0001, "loss": 5.7374, "loss/crossentropy": 2.647869110107422, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16402629017829895, "step": 24238 }, { "epoch": 0.7575, "grad_norm": 3.46875, "grad_norm_var": 0.03557027180989583, "learning_rate": 0.0001, "loss": 5.5875, "loss/crossentropy": 2.433461546897888, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1681414619088173, "step": 24240 }, { "epoch": 0.7575625, "grad_norm": 2.984375, "grad_norm_var": 0.04597880045572917, "learning_rate": 0.0001, "loss": 5.7412, "loss/crossentropy": 2.6030138731002808, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1665511652827263, "step": 24242 }, { "epoch": 0.757625, "grad_norm": 2.890625, "grad_norm_var": 0.04523111979166667, "learning_rate": 0.0001, "loss": 5.8167, "loss/crossentropy": 2.7053295373916626, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1685625985264778, "step": 24244 }, { "epoch": 0.7576875, "grad_norm": 3.078125, "grad_norm_var": 0.04425455729166667, "learning_rate": 0.0001, "loss": 5.9314, "loss/crossentropy": 2.7330563068389893, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1729612499475479, "step": 24246 }, { "epoch": 0.75775, "grad_norm": 2.796875, "grad_norm_var": 0.04986572265625, "learning_rate": 0.0001, "loss": 5.514, "loss/crossentropy": 2.4921834468841553, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15843446552753448, "step": 24248 }, { "epoch": 0.7578125, "grad_norm": 2.9375, "grad_norm_var": 0.0417388916015625, "learning_rate": 0.0001, "loss": 4.9194, "loss/crossentropy": 2.052195191383362, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14296723902225494, "step": 24250 }, { "epoch": 0.757875, "grad_norm": 2.96875, "grad_norm_var": 0.04306233723958333, "learning_rate": 0.0001, "loss": 5.4979, "loss/crossentropy": 2.5355305671691895, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15599776804447174, "step": 24252 }, { "epoch": 0.7579375, "grad_norm": 3.25, "grad_norm_var": 0.04788411458333333, "learning_rate": 0.0001, "loss": 5.8001, "loss/crossentropy": 2.610070824623108, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17290769517421722, "step": 24254 }, { "epoch": 0.758, "grad_norm": 3.171875, "grad_norm_var": 0.050093587239583334, "learning_rate": 0.0001, "loss": 5.374, "loss/crossentropy": 2.4149253368377686, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15137417614459991, "step": 24256 }, { "epoch": 0.7580625, "grad_norm": 3.125, "grad_norm_var": 0.03486226399739583, "learning_rate": 0.0001, "loss": 5.3875, "loss/crossentropy": 2.3582857847213745, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15565423667430878, "step": 24258 }, { "epoch": 0.758125, "grad_norm": 3.0625, "grad_norm_var": 0.032698567708333334, "learning_rate": 0.0001, "loss": 5.7439, "loss/crossentropy": 2.627885580062866, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16550765186548233, "step": 24260 }, { "epoch": 0.7581875, "grad_norm": 2.953125, "grad_norm_var": 0.02802734375, "learning_rate": 0.0001, "loss": 5.4714, "loss/crossentropy": 2.4771320819854736, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15489110350608826, "step": 24262 }, { "epoch": 0.75825, "grad_norm": 3.078125, "grad_norm_var": 0.02252197265625, "learning_rate": 0.0001, "loss": 5.7646, "loss/crossentropy": 2.6379897594451904, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1642233058810234, "step": 24264 }, { "epoch": 0.7583125, "grad_norm": 3.125, "grad_norm_var": 0.02232666015625, "learning_rate": 0.0001, "loss": 5.6799, "loss/crossentropy": 2.560634732246399, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16818025708198547, "step": 24266 }, { "epoch": 0.758375, "grad_norm": 2.828125, "grad_norm_var": 0.024836222330729168, "learning_rate": 0.0001, "loss": 5.6186, "loss/crossentropy": 2.5361427068710327, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16293242573738098, "step": 24268 }, { "epoch": 0.7584375, "grad_norm": 2.984375, "grad_norm_var": 0.024071248372395833, "learning_rate": 0.0001, "loss": 5.19, "loss/crossentropy": 2.3382837772369385, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.1468924880027771, "step": 24270 }, { "epoch": 0.7585, "grad_norm": 2.921875, "grad_norm_var": 0.016405232747395835, "learning_rate": 0.0001, "loss": 5.9336, "loss/crossentropy": 2.762654423713684, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16787448525428772, "step": 24272 }, { "epoch": 0.7585625, "grad_norm": 3.09375, "grad_norm_var": 0.014793904622395833, "learning_rate": 0.0001, "loss": 5.0658, "loss/crossentropy": 2.1516292095184326, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.14649128168821335, "step": 24274 }, { "epoch": 0.758625, "grad_norm": 3.265625, "grad_norm_var": 0.023746744791666666, "learning_rate": 0.0001, "loss": 5.4255, "loss/crossentropy": 2.3457034826278687, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16110370308160782, "step": 24276 }, { "epoch": 0.7586875, "grad_norm": 3.046875, "grad_norm_var": 0.03266499837239583, "learning_rate": 0.0001, "loss": 5.7083, "loss/crossentropy": 2.626542091369629, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16325343400239944, "step": 24278 }, { "epoch": 0.75875, "grad_norm": 2.796875, "grad_norm_var": 0.039449055989583336, "learning_rate": 0.0001, "loss": 5.1111, "loss/crossentropy": 2.238158345222473, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.1478414461016655, "step": 24280 }, { "epoch": 0.7588125, "grad_norm": 3.0, "grad_norm_var": 0.042333984375, "learning_rate": 0.0001, "loss": 5.4943, "loss/crossentropy": 2.5649493932724, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.15464885532855988, "step": 24282 }, { "epoch": 0.758875, "grad_norm": 3.3125, "grad_norm_var": 0.04350484212239583, "learning_rate": 0.0001, "loss": 5.7973, "loss/crossentropy": 2.6341344118118286, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16748441010713577, "step": 24284 }, { "epoch": 0.7589375, "grad_norm": 3.125, "grad_norm_var": 0.040339152018229164, "learning_rate": 0.0001, "loss": 5.641, "loss/crossentropy": 2.560652017593384, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16506946831941605, "step": 24286 }, { "epoch": 0.759, "grad_norm": 3.109375, "grad_norm_var": 0.04024149576822917, "learning_rate": 0.0001, "loss": 5.9467, "loss/crossentropy": 2.7992547750473022, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16669610887765884, "step": 24288 }, { "epoch": 0.7590625, "grad_norm": 3.328125, "grad_norm_var": 0.0446685791015625, "learning_rate": 0.0001, "loss": 5.6246, "loss/crossentropy": 2.567845940589905, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1615363210439682, "step": 24290 }, { "epoch": 0.759125, "grad_norm": 3.0625, "grad_norm_var": 0.03837890625, "learning_rate": 0.0001, "loss": 5.9742, "loss/crossentropy": 2.819923162460327, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16972891241312027, "step": 24292 }, { "epoch": 0.7591875, "grad_norm": 3.109375, "grad_norm_var": 0.029715983072916667, "learning_rate": 0.0001, "loss": 5.8447, "loss/crossentropy": 2.6782902479171753, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16976940631866455, "step": 24294 }, { "epoch": 0.75925, "grad_norm": 3.296875, "grad_norm_var": 0.02779541015625, "learning_rate": 0.0001, "loss": 5.5734, "loss/crossentropy": 2.5208466053009033, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15993858128786087, "step": 24296 }, { "epoch": 0.7593125, "grad_norm": 3.65625, "grad_norm_var": 0.04777730305989583, "learning_rate": 0.0001, "loss": 5.3045, "loss/crossentropy": 2.2961453199386597, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15591763705015182, "step": 24298 }, { "epoch": 0.759375, "grad_norm": 3.1875, "grad_norm_var": 0.0461578369140625, "learning_rate": 0.0001, "loss": 6.1393, "loss/crossentropy": 2.8317224979400635, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.18232179433107376, "step": 24300 }, { "epoch": 0.7594375, "grad_norm": 3.0, "grad_norm_var": 0.04163411458333333, "learning_rate": 0.0001, "loss": 5.648, "loss/crossentropy": 2.5475919246673584, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16512297093868256, "step": 24302 }, { "epoch": 0.7595, "grad_norm": 3.09375, "grad_norm_var": 0.037043253580729164, "learning_rate": 0.0001, "loss": 5.5121, "loss/crossentropy": 2.5225062370300293, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15716056525707245, "step": 24304 }, { "epoch": 0.7595625, "grad_norm": 3.0625, "grad_norm_var": 0.0380279541015625, "learning_rate": 0.0001, "loss": 5.3988, "loss/crossentropy": 2.4485208988189697, "loss/hidden": 1.375, "loss/jsd": 0.0, "loss/logits": 0.1575288325548172, "step": 24306 }, { "epoch": 0.759625, "grad_norm": 3.015625, "grad_norm_var": 0.03650716145833333, "learning_rate": 0.0001, "loss": 5.7898, "loss/crossentropy": 2.6098283529281616, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16877630352973938, "step": 24308 }, { "epoch": 0.7596875, "grad_norm": 2.765625, "grad_norm_var": 0.042704264322916664, "learning_rate": 0.0001, "loss": 5.5829, "loss/crossentropy": 2.555821418762207, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15700319409370422, "step": 24310 }, { "epoch": 0.75975, "grad_norm": 3.078125, "grad_norm_var": 0.04006754557291667, "learning_rate": 0.0001, "loss": 5.6753, "loss/crossentropy": 2.5307703018188477, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.17148318886756897, "step": 24312 }, { "epoch": 0.7598125, "grad_norm": 3.125, "grad_norm_var": 0.07297261555989583, "learning_rate": 0.0001, "loss": 5.6941, "loss/crossentropy": 2.550258755683899, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16712124645709991, "step": 24314 }, { "epoch": 0.759875, "grad_norm": 2.890625, "grad_norm_var": 0.0743072509765625, "learning_rate": 0.0001, "loss": 5.3145, "loss/crossentropy": 2.4267187118530273, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14307159185409546, "step": 24316 }, { "epoch": 0.7599375, "grad_norm": 3.25, "grad_norm_var": 0.076611328125, "learning_rate": 0.0001, "loss": 5.7224, "loss/crossentropy": 2.6389344930648804, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16107800602912903, "step": 24318 }, { "epoch": 0.76, "grad_norm": 3.0625, "grad_norm_var": 0.07613016764322916, "learning_rate": 0.0001, "loss": 5.6364, "loss/crossentropy": 2.609094738960266, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1609373688697815, "step": 24320 }, { "epoch": 0.7600625, "grad_norm": 2.953125, "grad_norm_var": 0.0740631103515625, "learning_rate": 0.0001, "loss": 5.7142, "loss/crossentropy": 2.6537078619003296, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15995817631483078, "step": 24322 }, { "epoch": 0.760125, "grad_norm": 2.921875, "grad_norm_var": 0.075927734375, "learning_rate": 0.0001, "loss": 5.5217, "loss/crossentropy": 2.4636749029159546, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15775848925113678, "step": 24324 }, { "epoch": 0.7601875, "grad_norm": 3.15625, "grad_norm_var": 0.06871337890625, "learning_rate": 0.0001, "loss": 5.5488, "loss/crossentropy": 2.476245641708374, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1592119336128235, "step": 24326 }, { "epoch": 0.76025, "grad_norm": 3.109375, "grad_norm_var": 0.09335530598958333, "learning_rate": 0.0001, "loss": 5.8164, "loss/crossentropy": 2.5851922035217285, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17585048079490662, "step": 24328 }, { "epoch": 0.7603125, "grad_norm": 3.1875, "grad_norm_var": 0.04155171712239583, "learning_rate": 0.0001, "loss": 5.8689, "loss/crossentropy": 2.6981635093688965, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16902291029691696, "step": 24330 }, { "epoch": 0.760375, "grad_norm": 3.0625, "grad_norm_var": 0.04455973307291667, "learning_rate": 0.0001, "loss": 5.3983, "loss/crossentropy": 2.4496692419052124, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1495552659034729, "step": 24332 }, { "epoch": 0.7604375, "grad_norm": 3.046875, "grad_norm_var": 0.04292704264322917, "learning_rate": 0.0001, "loss": 5.6176, "loss/crossentropy": 2.527258038520813, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16137798130512238, "step": 24334 }, { "epoch": 0.7605, "grad_norm": 2.890625, "grad_norm_var": 0.0487701416015625, "learning_rate": 0.0001, "loss": 5.8402, "loss/crossentropy": 2.702823281288147, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16725626587867737, "step": 24336 }, { "epoch": 0.7605625, "grad_norm": 3.40625, "grad_norm_var": 0.052490234375, "learning_rate": 0.0001, "loss": 5.5783, "loss/crossentropy": 2.5081727504730225, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1593562588095665, "step": 24338 }, { "epoch": 0.760625, "grad_norm": 2.75, "grad_norm_var": 0.05944010416666667, "learning_rate": 0.0001, "loss": 5.4505, "loss/crossentropy": 2.499588966369629, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15368393063545227, "step": 24340 }, { "epoch": 0.7606875, "grad_norm": 3.296875, "grad_norm_var": 0.06326395670572917, "learning_rate": 0.0001, "loss": 6.1718, "loss/crossentropy": 2.8815900087356567, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1786285936832428, "step": 24342 }, { "epoch": 0.76075, "grad_norm": 2.90625, "grad_norm_var": 0.0376861572265625, "learning_rate": 0.0001, "loss": 5.8847, "loss/crossentropy": 2.7646361589431763, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16473662108182907, "step": 24344 }, { "epoch": 0.7608125, "grad_norm": 3.203125, "grad_norm_var": 0.037385050455729166, "learning_rate": 0.0001, "loss": 5.756, "loss/crossentropy": 2.6093757152557373, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16583281755447388, "step": 24346 }, { "epoch": 0.760875, "grad_norm": 3.609375, "grad_norm_var": 0.050126139322916666, "learning_rate": 0.0001, "loss": 5.9246, "loss/crossentropy": 2.7277426719665527, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16890300810337067, "step": 24348 }, { "epoch": 0.7609375, "grad_norm": 3.46875, "grad_norm_var": 0.0551422119140625, "learning_rate": 0.0001, "loss": 5.805, "loss/crossentropy": 2.618245482444763, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17141090333461761, "step": 24350 }, { "epoch": 0.761, "grad_norm": 3.203125, "grad_norm_var": 0.0514068603515625, "learning_rate": 0.0001, "loss": 5.6128, "loss/crossentropy": 2.567072868347168, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1623883917927742, "step": 24352 }, { "epoch": 0.7610625, "grad_norm": 2.921875, "grad_norm_var": 0.049738566080729164, "learning_rate": 0.0001, "loss": 5.7429, "loss/crossentropy": 2.6708853244781494, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16266701370477676, "step": 24354 }, { "epoch": 0.761125, "grad_norm": 3.0625, "grad_norm_var": 0.0439605712890625, "learning_rate": 0.0001, "loss": 5.3716, "loss/crossentropy": 2.4212406873703003, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1528492048382759, "step": 24356 }, { "epoch": 0.7611875, "grad_norm": 3.546875, "grad_norm_var": 0.051070149739583334, "learning_rate": 0.0001, "loss": 6.0262, "loss/crossentropy": 2.813529849052429, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17282679677009583, "step": 24358 }, { "epoch": 0.76125, "grad_norm": 3.046875, "grad_norm_var": 0.047102864583333334, "learning_rate": 0.0001, "loss": 5.9006, "loss/crossentropy": 2.7431578636169434, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.168087400496006, "step": 24360 }, { "epoch": 0.7613125, "grad_norm": 3.390625, "grad_norm_var": 0.050093587239583334, "learning_rate": 0.0001, "loss": 5.7242, "loss/crossentropy": 2.5602455139160156, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16756587475538254, "step": 24362 }, { "epoch": 0.761375, "grad_norm": 2.953125, "grad_norm_var": 0.03797200520833333, "learning_rate": 0.0001, "loss": 5.7465, "loss/crossentropy": 2.664482831954956, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1613277718424797, "step": 24364 }, { "epoch": 0.7614375, "grad_norm": 3.0625, "grad_norm_var": 0.03612874348958333, "learning_rate": 0.0001, "loss": 5.672, "loss/crossentropy": 2.642616391181946, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15997271239757538, "step": 24366 }, { "epoch": 0.7615, "grad_norm": 2.984375, "grad_norm_var": 0.035497029622395836, "learning_rate": 0.0001, "loss": 5.7899, "loss/crossentropy": 2.6880706548690796, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16565346717834473, "step": 24368 }, { "epoch": 0.7615625, "grad_norm": 3.140625, "grad_norm_var": 0.034016927083333336, "learning_rate": 0.0001, "loss": 5.3715, "loss/crossentropy": 2.3503127098083496, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15914569050073624, "step": 24370 }, { "epoch": 0.761625, "grad_norm": 3.8125, "grad_norm_var": 0.060155232747395836, "learning_rate": 0.0001, "loss": 5.5425, "loss/crossentropy": 2.3365726470947266, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16747110337018967, "step": 24372 }, { "epoch": 0.7616875, "grad_norm": 2.96875, "grad_norm_var": 0.05335286458333333, "learning_rate": 0.0001, "loss": 5.7142, "loss/crossentropy": 2.5692873001098633, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16761834919452667, "step": 24374 }, { "epoch": 0.76175, "grad_norm": 3.15625, "grad_norm_var": 0.052912394205729164, "learning_rate": 0.0001, "loss": 5.6844, "loss/crossentropy": 2.541516900062561, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1666291505098343, "step": 24376 }, { "epoch": 0.7618125, "grad_norm": 3.09375, "grad_norm_var": 0.049046834309895836, "learning_rate": 0.0001, "loss": 5.6735, "loss/crossentropy": 2.5311325788497925, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16814269870519638, "step": 24378 }, { "epoch": 0.761875, "grad_norm": 12.6875, "grad_norm_var": 5.766681925455729, "learning_rate": 0.0001, "loss": 5.8868, "loss/crossentropy": 2.381693482398987, "loss/hidden": 1.56640625, "loss/jsd": 0.0, "loss/logits": 0.19386768341064453, "step": 24380 }, { "epoch": 0.7619375, "grad_norm": 3.015625, "grad_norm_var": 5.729117838541667, "learning_rate": 0.0001, "loss": 5.5047, "loss/crossentropy": 2.4462047815322876, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16366249322891235, "step": 24382 }, { "epoch": 0.762, "grad_norm": 2.875, "grad_norm_var": 5.736490885416667, "learning_rate": 0.0001, "loss": 5.4341, "loss/crossentropy": 2.42786180973053, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1576521024107933, "step": 24384 }, { "epoch": 0.7620625, "grad_norm": 3.171875, "grad_norm_var": 5.711669921875, "learning_rate": 0.0001, "loss": 6.0457, "loss/crossentropy": 2.748769998550415, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17813345044851303, "step": 24386 }, { "epoch": 0.762125, "grad_norm": 3.40625, "grad_norm_var": 5.703075154622396, "learning_rate": 0.0001, "loss": 5.6918, "loss/crossentropy": 2.4644813537597656, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.17155954241752625, "step": 24388 }, { "epoch": 0.7621875, "grad_norm": 3.125, "grad_norm_var": 5.700065104166667, "learning_rate": 0.0001, "loss": 5.9113, "loss/crossentropy": 2.708987832069397, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16906185448169708, "step": 24390 }, { "epoch": 0.76225, "grad_norm": 3.0625, "grad_norm_var": 5.743529256184896, "learning_rate": 0.0001, "loss": 5.4029, "loss/crossentropy": 2.4219563007354736, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15395769476890564, "step": 24392 }, { "epoch": 0.7623125, "grad_norm": 2.90625, "grad_norm_var": 5.737987263997396, "learning_rate": 0.0001, "loss": 5.7952, "loss/crossentropy": 2.6631529331207275, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1682787463068962, "step": 24394 }, { "epoch": 0.762375, "grad_norm": 2.84375, "grad_norm_var": 0.036742146809895834, "learning_rate": 0.0001, "loss": 5.8027, "loss/crossentropy": 2.6886179447174072, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1657099425792694, "step": 24396 }, { "epoch": 0.7624375, "grad_norm": 2.921875, "grad_norm_var": 0.03632405598958333, "learning_rate": 0.0001, "loss": 5.5923, "loss/crossentropy": 2.535095453262329, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1604101061820984, "step": 24398 }, { "epoch": 0.7625, "grad_norm": 2.90625, "grad_norm_var": 0.041178385416666664, "learning_rate": 0.0001, "loss": 5.3001, "loss/crossentropy": 2.3773635625839233, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15086649358272552, "step": 24400 }, { "epoch": 0.7625625, "grad_norm": 2.921875, "grad_norm_var": 0.04439188639322917, "learning_rate": 0.0001, "loss": 5.4394, "loss/crossentropy": 2.5263442993164062, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1510726362466812, "step": 24402 }, { "epoch": 0.762625, "grad_norm": 2.96875, "grad_norm_var": 0.027046712239583333, "learning_rate": 0.0001, "loss": 5.49, "loss/crossentropy": 2.4671220779418945, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15814447402954102, "step": 24404 }, { "epoch": 0.7626875, "grad_norm": 3.359375, "grad_norm_var": 0.0331451416015625, "learning_rate": 0.0001, "loss": 5.717, "loss/crossentropy": 2.609921932220459, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16656387597322464, "step": 24406 }, { "epoch": 0.76275, "grad_norm": 2.953125, "grad_norm_var": 0.033186848958333334, "learning_rate": 0.0001, "loss": 5.5609, "loss/crossentropy": 2.5512131452560425, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15917153656482697, "step": 24408 }, { "epoch": 0.7628125, "grad_norm": 3.265625, "grad_norm_var": 0.03452860514322917, "learning_rate": 0.0001, "loss": 5.6542, "loss/crossentropy": 2.482442259788513, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.17420323193073273, "step": 24410 }, { "epoch": 0.762875, "grad_norm": 2.890625, "grad_norm_var": 0.03711649576822917, "learning_rate": 0.0001, "loss": 5.6288, "loss/crossentropy": 2.5039873123168945, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16482964158058167, "step": 24412 }, { "epoch": 0.7629375, "grad_norm": 3.875, "grad_norm_var": 0.09339090983072916, "learning_rate": 0.0001, "loss": 5.5329, "loss/crossentropy": 2.4025429487228394, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16616224497556686, "step": 24414 }, { "epoch": 0.763, "grad_norm": 3.171875, "grad_norm_var": 0.0822418212890625, "learning_rate": 0.0001, "loss": 5.8005, "loss/crossentropy": 2.6739391088485718, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16344071179628372, "step": 24416 }, { "epoch": 0.7630625, "grad_norm": 3.078125, "grad_norm_var": 0.06708984375, "learning_rate": 0.0001, "loss": 5.7469, "loss/crossentropy": 2.596408247947693, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1677859202027321, "step": 24418 }, { "epoch": 0.763125, "grad_norm": 3.078125, "grad_norm_var": 0.06049702962239583, "learning_rate": 0.0001, "loss": 5.6792, "loss/crossentropy": 2.595887303352356, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1626303270459175, "step": 24420 }, { "epoch": 0.7631875, "grad_norm": 3.390625, "grad_norm_var": 0.05894775390625, "learning_rate": 0.0001, "loss": 5.4664, "loss/crossentropy": 2.448666214942932, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15880563855171204, "step": 24422 }, { "epoch": 0.76325, "grad_norm": 3.25, "grad_norm_var": 0.060139973958333336, "learning_rate": 0.0001, "loss": 5.6138, "loss/crossentropy": 2.532195806503296, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16401486098766327, "step": 24424 }, { "epoch": 0.7633125, "grad_norm": 3.140625, "grad_norm_var": 0.06027018229166667, "learning_rate": 0.0001, "loss": 5.8564, "loss/crossentropy": 2.7116355895996094, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16760292649269104, "step": 24426 }, { "epoch": 0.763375, "grad_norm": 3.15625, "grad_norm_var": 0.05575764973958333, "learning_rate": 0.0001, "loss": 5.8755, "loss/crossentropy": 2.709297776222229, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16935163736343384, "step": 24428 }, { "epoch": 0.7634375, "grad_norm": 3.015625, "grad_norm_var": 0.020340983072916666, "learning_rate": 0.0001, "loss": 5.58, "loss/crossentropy": 2.4961061477661133, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1591692715883255, "step": 24430 }, { "epoch": 0.7635, "grad_norm": 3.171875, "grad_norm_var": 0.025484212239583335, "learning_rate": 0.0001, "loss": 5.6587, "loss/crossentropy": 2.598578453063965, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16069485992193222, "step": 24432 }, { "epoch": 0.7635625, "grad_norm": 3.21875, "grad_norm_var": 0.0270904541015625, "learning_rate": 0.0001, "loss": 5.4905, "loss/crossentropy": 2.4428911209106445, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1567188948392868, "step": 24434 }, { "epoch": 0.763625, "grad_norm": 2.953125, "grad_norm_var": 0.029423014322916666, "learning_rate": 0.0001, "loss": 5.7569, "loss/crossentropy": 2.7318053245544434, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15563537180423737, "step": 24436 }, { "epoch": 0.7636875, "grad_norm": 3.34375, "grad_norm_var": 0.029084269205729166, "learning_rate": 0.0001, "loss": 5.7455, "loss/crossentropy": 2.58599317073822, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16868651658296585, "step": 24438 }, { "epoch": 0.76375, "grad_norm": 3.09375, "grad_norm_var": 0.025178019205729166, "learning_rate": 0.0001, "loss": 5.6417, "loss/crossentropy": 2.529034972190857, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16633976995944977, "step": 24440 }, { "epoch": 0.7638125, "grad_norm": 3.140625, "grad_norm_var": 0.027762858072916667, "learning_rate": 0.0001, "loss": 5.7549, "loss/crossentropy": 2.6712522506713867, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15992914140224457, "step": 24442 }, { "epoch": 0.763875, "grad_norm": 2.9375, "grad_norm_var": 0.0231353759765625, "learning_rate": 0.0001, "loss": 5.3658, "loss/crossentropy": 2.390221357345581, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1487329974770546, "step": 24444 }, { "epoch": 0.7639375, "grad_norm": 3.21875, "grad_norm_var": 0.019904581705729167, "learning_rate": 0.0001, "loss": 5.7898, "loss/crossentropy": 2.6352105140686035, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16975723206996918, "step": 24446 }, { "epoch": 0.764, "grad_norm": 3.0625, "grad_norm_var": 0.024051920572916666, "learning_rate": 0.0001, "loss": 5.8497, "loss/crossentropy": 2.6421698331832886, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16879693418741226, "step": 24448 }, { "epoch": 0.7640625, "grad_norm": 3.015625, "grad_norm_var": 0.0232574462890625, "learning_rate": 0.0001, "loss": 5.5762, "loss/crossentropy": 2.4971476793289185, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1657225862145424, "step": 24450 }, { "epoch": 0.764125, "grad_norm": 2.890625, "grad_norm_var": 0.026416015625, "learning_rate": 0.0001, "loss": 5.5338, "loss/crossentropy": 2.509614109992981, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15788687020540237, "step": 24452 }, { "epoch": 0.7641875, "grad_norm": 2.890625, "grad_norm_var": 0.03954671223958333, "learning_rate": 0.0001, "loss": 5.5469, "loss/crossentropy": 2.4922231435775757, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16171645373106003, "step": 24454 }, { "epoch": 0.76425, "grad_norm": 3.109375, "grad_norm_var": 6.216796875, "learning_rate": 0.0001, "loss": 5.7248, "loss/crossentropy": 2.4789448976516724, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1765417903661728, "step": 24456 }, { "epoch": 0.7643125, "grad_norm": 3.0, "grad_norm_var": 6.195254516601563, "learning_rate": 0.0001, "loss": 6.0502, "loss/crossentropy": 2.818184018135071, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17437320202589035, "step": 24458 }, { "epoch": 0.764375, "grad_norm": 2.96875, "grad_norm_var": 6.205427042643229, "learning_rate": 0.0001, "loss": 5.4949, "loss/crossentropy": 2.4427298307418823, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1614658683538437, "step": 24460 }, { "epoch": 0.7644375, "grad_norm": 3.171875, "grad_norm_var": 6.226203409830729, "learning_rate": 0.0001, "loss": 5.5257, "loss/crossentropy": 2.477227807044983, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15680351108312607, "step": 24462 }, { "epoch": 0.7645, "grad_norm": 3.109375, "grad_norm_var": 6.236197916666667, "learning_rate": 0.0001, "loss": 6.0102, "loss/crossentropy": 2.7584580183029175, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.175169438123703, "step": 24464 }, { "epoch": 0.7645625, "grad_norm": 3.125, "grad_norm_var": 6.250234985351563, "learning_rate": 0.0001, "loss": 5.7546, "loss/crossentropy": 2.706199288368225, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1583528071641922, "step": 24466 }, { "epoch": 0.764625, "grad_norm": 3.09375, "grad_norm_var": 6.216487630208333, "learning_rate": 0.0001, "loss": 5.6573, "loss/crossentropy": 2.5890300273895264, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16033885627985, "step": 24468 }, { "epoch": 0.7646875, "grad_norm": 3.203125, "grad_norm_var": 6.203205362955729, "learning_rate": 0.0001, "loss": 5.8204, "loss/crossentropy": 2.629197597503662, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1722457855939865, "step": 24470 }, { "epoch": 0.76475, "grad_norm": 2.890625, "grad_norm_var": 0.023531087239583335, "learning_rate": 0.0001, "loss": 5.1167, "loss/crossentropy": 2.2669665813446045, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.14512494206428528, "step": 24472 }, { "epoch": 0.7648125, "grad_norm": 3.203125, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 5.7604, "loss/crossentropy": 2.639541268348694, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16677073389291763, "step": 24474 }, { "epoch": 0.764875, "grad_norm": 2.921875, "grad_norm_var": 0.01968994140625, "learning_rate": 0.0001, "loss": 5.3079, "loss/crossentropy": 2.35752010345459, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15089724957942963, "step": 24476 }, { "epoch": 0.7649375, "grad_norm": 3.625, "grad_norm_var": 0.0392578125, "learning_rate": 0.0001, "loss": 5.6516, "loss/crossentropy": 2.4392318725585938, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1731855645775795, "step": 24478 }, { "epoch": 0.765, "grad_norm": 3.0625, "grad_norm_var": 0.04045817057291667, "learning_rate": 0.0001, "loss": 5.5255, "loss/crossentropy": 2.5260192155838013, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15853818506002426, "step": 24480 }, { "epoch": 0.7650625, "grad_norm": 3.015625, "grad_norm_var": 0.04094645182291667, "learning_rate": 0.0001, "loss": 5.4057, "loss/crossentropy": 2.385251045227051, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15829485654830933, "step": 24482 }, { "epoch": 0.765125, "grad_norm": 2.84375, "grad_norm_var": 0.0445709228515625, "learning_rate": 0.0001, "loss": 5.6976, "loss/crossentropy": 2.7166390419006348, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15473493933677673, "step": 24484 }, { "epoch": 0.7651875, "grad_norm": 3.328125, "grad_norm_var": 0.052099609375, "learning_rate": 0.0001, "loss": 5.4798, "loss/crossentropy": 2.475341320037842, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15865373611450195, "step": 24486 }, { "epoch": 0.76525, "grad_norm": 3.03125, "grad_norm_var": 0.20618489583333333, "learning_rate": 0.0001, "loss": 5.4679, "loss/crossentropy": 2.3743693828582764, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.15779267251491547, "step": 24488 }, { "epoch": 0.7653125, "grad_norm": 3.0625, "grad_norm_var": 0.20689188639322917, "learning_rate": 0.0001, "loss": 5.447, "loss/crossentropy": 2.4905470609664917, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.15736067295074463, "step": 24490 }, { "epoch": 0.765375, "grad_norm": 3.46875, "grad_norm_var": 0.20894775390625, "learning_rate": 0.0001, "loss": 5.5884, "loss/crossentropy": 2.488085150718689, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1658872589468956, "step": 24492 }, { "epoch": 0.7654375, "grad_norm": 3.03125, "grad_norm_var": 0.20964253743489583, "learning_rate": 0.0001, "loss": 5.6334, "loss/crossentropy": 2.4770509004592896, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16836615651845932, "step": 24494 }, { "epoch": 0.7655, "grad_norm": 3.125, "grad_norm_var": 0.21066080729166667, "learning_rate": 0.0001, "loss": 5.6307, "loss/crossentropy": 2.5260634422302246, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16397515684366226, "step": 24496 }, { "epoch": 0.7655625, "grad_norm": 2.921875, "grad_norm_var": 0.2105377197265625, "learning_rate": 0.0001, "loss": 5.7065, "loss/crossentropy": 2.653430461883545, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16311953961849213, "step": 24498 }, { "epoch": 0.765625, "grad_norm": 3.171875, "grad_norm_var": 0.20191141764322917, "learning_rate": 0.0001, "loss": 5.7151, "loss/crossentropy": 2.553924083709717, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1653314083814621, "step": 24500 }, { "epoch": 0.7656875, "grad_norm": 3.140625, "grad_norm_var": 0.18728841145833333, "learning_rate": 0.0001, "loss": 5.5741, "loss/crossentropy": 2.460555672645569, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1640918105840683, "step": 24502 }, { "epoch": 0.76575, "grad_norm": 3.15625, "grad_norm_var": 0.0445709228515625, "learning_rate": 0.0001, "loss": 5.7112, "loss/crossentropy": 2.573582649230957, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1668897494673729, "step": 24504 }, { "epoch": 0.7658125, "grad_norm": 3.15625, "grad_norm_var": 0.04297587076822917, "learning_rate": 0.0001, "loss": 5.573, "loss/crossentropy": 2.539095640182495, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15964529663324356, "step": 24506 }, { "epoch": 0.765875, "grad_norm": 2.9375, "grad_norm_var": 0.04067281087239583, "learning_rate": 0.0001, "loss": 5.6078, "loss/crossentropy": 2.590778946876526, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.1622440367937088, "step": 24508 }, { "epoch": 0.7659375, "grad_norm": 3.0, "grad_norm_var": 0.0162017822265625, "learning_rate": 0.0001, "loss": 5.7587, "loss/crossentropy": 2.689617156982422, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16237246990203857, "step": 24510 }, { "epoch": 0.766, "grad_norm": 3.234375, "grad_norm_var": 0.016634114583333335, "learning_rate": 0.0001, "loss": 5.6707, "loss/crossentropy": 2.514092445373535, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1687850058078766, "step": 24512 }, { "epoch": 0.7660625, "grad_norm": 2.9375, "grad_norm_var": 0.019505818684895832, "learning_rate": 0.0001, "loss": 5.6159, "loss/crossentropy": 2.5805495977401733, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16173956543207169, "step": 24514 }, { "epoch": 0.766125, "grad_norm": 3.09375, "grad_norm_var": 0.023616536458333334, "learning_rate": 0.0001, "loss": 5.5594, "loss/crossentropy": 2.513151168823242, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15696899592876434, "step": 24516 }, { "epoch": 0.7661875, "grad_norm": 3.109375, "grad_norm_var": 0.019123331705729166, "learning_rate": 0.0001, "loss": 6.0361, "loss/crossentropy": 2.814682960510254, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1737068146467209, "step": 24518 }, { "epoch": 0.76625, "grad_norm": 2.953125, "grad_norm_var": 0.0194000244140625, "learning_rate": 0.0001, "loss": 5.6034, "loss/crossentropy": 2.504035234451294, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16383955627679825, "step": 24520 }, { "epoch": 0.7663125, "grad_norm": 2.921875, "grad_norm_var": 0.0197174072265625, "learning_rate": 0.0001, "loss": 5.2152, "loss/crossentropy": 2.2720115184783936, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1486142799258232, "step": 24522 }, { "epoch": 0.766375, "grad_norm": 3.03125, "grad_norm_var": 0.0179351806640625, "learning_rate": 0.0001, "loss": 5.4113, "loss/crossentropy": 2.3895599842071533, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15647148340940475, "step": 24524 }, { "epoch": 0.7664375, "grad_norm": 3.5625, "grad_norm_var": 0.03140869140625, "learning_rate": 0.0001, "loss": 5.8155, "loss/crossentropy": 2.6010197401046753, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1714508831501007, "step": 24526 }, { "epoch": 0.7665, "grad_norm": 3.203125, "grad_norm_var": 0.031473795572916664, "learning_rate": 0.0001, "loss": 5.6301, "loss/crossentropy": 2.5225613117218018, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1673906147480011, "step": 24528 }, { "epoch": 0.7665625, "grad_norm": 3.296875, "grad_norm_var": 0.028661092122395832, "learning_rate": 0.0001, "loss": 5.5383, "loss/crossentropy": 2.413439989089966, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1636573150753975, "step": 24530 }, { "epoch": 0.766625, "grad_norm": 3.28125, "grad_norm_var": 0.027464803059895834, "learning_rate": 0.0001, "loss": 5.8101, "loss/crossentropy": 2.6230051517486572, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17066382616758347, "step": 24532 }, { "epoch": 0.7666875, "grad_norm": 3.078125, "grad_norm_var": 0.030134073893229165, "learning_rate": 0.0001, "loss": 5.9387, "loss/crossentropy": 2.724544644355774, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17336948961019516, "step": 24534 }, { "epoch": 0.76675, "grad_norm": 2.796875, "grad_norm_var": 0.03655598958333333, "learning_rate": 0.0001, "loss": 5.5393, "loss/crossentropy": 2.5030943155288696, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1551826074719429, "step": 24536 }, { "epoch": 0.7668125, "grad_norm": 2.84375, "grad_norm_var": 0.04495035807291667, "learning_rate": 0.0001, "loss": 5.5044, "loss/crossentropy": 2.5110961198806763, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15636055171489716, "step": 24538 }, { "epoch": 0.766875, "grad_norm": 3.5625, "grad_norm_var": 0.05181884765625, "learning_rate": 0.0001, "loss": 5.6657, "loss/crossentropy": 2.506834864616394, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1658850461244583, "step": 24540 }, { "epoch": 0.7669375, "grad_norm": 3.046875, "grad_norm_var": 0.046418253580729166, "learning_rate": 0.0001, "loss": 5.8044, "loss/crossentropy": 2.661869168281555, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16620570421218872, "step": 24542 }, { "epoch": 0.767, "grad_norm": 3.0625, "grad_norm_var": 0.054638671875, "learning_rate": 0.0001, "loss": 5.6193, "loss/crossentropy": 2.5758901834487915, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15746265649795532, "step": 24544 }, { "epoch": 0.7670625, "grad_norm": 2.78125, "grad_norm_var": 0.0587799072265625, "learning_rate": 0.0001, "loss": 5.1982, "loss/crossentropy": 2.2970075607299805, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.14675837755203247, "step": 24546 }, { "epoch": 0.767125, "grad_norm": 2.96875, "grad_norm_var": 0.054133097330729164, "learning_rate": 0.0001, "loss": 5.6817, "loss/crossentropy": 2.6156165599823, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15777605026960373, "step": 24548 }, { "epoch": 0.7671875, "grad_norm": 3.0625, "grad_norm_var": 0.074853515625, "learning_rate": 0.0001, "loss": 5.5262, "loss/crossentropy": 2.412881851196289, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16249888390302658, "step": 24550 }, { "epoch": 0.76725, "grad_norm": 3.09375, "grad_norm_var": 0.07151285807291667, "learning_rate": 0.0001, "loss": 5.2193, "loss/crossentropy": 2.2569509744644165, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15014218538999557, "step": 24552 }, { "epoch": 0.7673125, "grad_norm": 3.0, "grad_norm_var": 0.07652994791666666, "learning_rate": 0.0001, "loss": 5.469, "loss/crossentropy": 2.3602248430252075, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16517900675535202, "step": 24554 }, { "epoch": 0.767375, "grad_norm": 2.984375, "grad_norm_var": 0.06404520670572916, "learning_rate": 0.0001, "loss": 5.5428, "loss/crossentropy": 2.502559542655945, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16027280688285828, "step": 24556 }, { "epoch": 0.7674375, "grad_norm": 3.171875, "grad_norm_var": 0.059000651041666664, "learning_rate": 0.0001, "loss": 6.0277, "loss/crossentropy": 2.832331895828247, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1695387214422226, "step": 24558 }, { "epoch": 0.7675, "grad_norm": 3.25, "grad_norm_var": 0.05788472493489583, "learning_rate": 0.0001, "loss": 6.0129, "loss/crossentropy": 2.8243350982666016, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17237577587366104, "step": 24560 }, { "epoch": 0.7675625, "grad_norm": 2.921875, "grad_norm_var": 0.052408854166666664, "learning_rate": 0.0001, "loss": 5.5788, "loss/crossentropy": 2.550336241722107, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16026712208986282, "step": 24562 }, { "epoch": 0.767625, "grad_norm": 2.984375, "grad_norm_var": 0.05190327962239583, "learning_rate": 0.0001, "loss": 5.4595, "loss/crossentropy": 2.463630437850952, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1573958843946457, "step": 24564 }, { "epoch": 0.7676875, "grad_norm": 3.0625, "grad_norm_var": 0.03681233723958333, "learning_rate": 0.0001, "loss": 5.638, "loss/crossentropy": 2.459873914718628, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16741740703582764, "step": 24566 }, { "epoch": 0.76775, "grad_norm": 2.90625, "grad_norm_var": 0.03730367024739583, "learning_rate": 0.0001, "loss": 5.5341, "loss/crossentropy": 2.5051268339157104, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15836410969495773, "step": 24568 }, { "epoch": 0.7678125, "grad_norm": 3.21875, "grad_norm_var": 0.0269195556640625, "learning_rate": 0.0001, "loss": 6.196, "loss/crossentropy": 2.882757544517517, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.18015281111001968, "step": 24570 }, { "epoch": 0.767875, "grad_norm": 3.234375, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 5.7163, "loss/crossentropy": 2.5850621461868286, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16312387585639954, "step": 24572 }, { "epoch": 0.7679375, "grad_norm": 2.984375, "grad_norm_var": 0.029292805989583334, "learning_rate": 0.0001, "loss": 5.5689, "loss/crossentropy": 2.532110333442688, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1599290817975998, "step": 24574 }, { "epoch": 0.768, "grad_norm": 2.9375, "grad_norm_var": 0.029182942708333333, "learning_rate": 0.0001, "loss": 5.6969, "loss/crossentropy": 2.588170289993286, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1620495393872261, "step": 24576 }, { "epoch": 0.7680625, "grad_norm": 2.890625, "grad_norm_var": 0.029011027018229166, "learning_rate": 0.0001, "loss": 5.6838, "loss/crossentropy": 2.591399908065796, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16431699693202972, "step": 24578 }, { "epoch": 0.768125, "grad_norm": 2.90625, "grad_norm_var": 0.0312652587890625, "learning_rate": 0.0001, "loss": 5.5217, "loss/crossentropy": 2.458386540412903, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15984228253364563, "step": 24580 }, { "epoch": 0.7681875, "grad_norm": 3.359375, "grad_norm_var": 0.0259185791015625, "learning_rate": 0.0001, "loss": 5.6103, "loss/crossentropy": 2.492032527923584, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16611865162849426, "step": 24582 }, { "epoch": 0.76825, "grad_norm": 2.84375, "grad_norm_var": 0.027131144205729166, "learning_rate": 0.0001, "loss": 5.6992, "loss/crossentropy": 2.63890540599823, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16228335350751877, "step": 24584 }, { "epoch": 0.7683125, "grad_norm": 3.640625, "grad_norm_var": 0.04303385416666667, "learning_rate": 0.0001, "loss": 5.5841, "loss/crossentropy": 2.472393035888672, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16390815377235413, "step": 24586 }, { "epoch": 0.768375, "grad_norm": 2.796875, "grad_norm_var": 0.049637858072916666, "learning_rate": 0.0001, "loss": 5.5688, "loss/crossentropy": 2.5740370750427246, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15963692218065262, "step": 24588 }, { "epoch": 0.7684375, "grad_norm": 3.046875, "grad_norm_var": 0.049788411458333334, "learning_rate": 0.0001, "loss": 5.3792, "loss/crossentropy": 2.3682631254196167, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1569526493549347, "step": 24590 }, { "epoch": 0.7685, "grad_norm": 2.796875, "grad_norm_var": 0.04980061848958333, "learning_rate": 0.0001, "loss": 5.6177, "loss/crossentropy": 2.562471032142639, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15826137363910675, "step": 24592 }, { "epoch": 0.7685625, "grad_norm": 3.09375, "grad_norm_var": 0.05293680826822917, "learning_rate": 0.0001, "loss": 5.431, "loss/crossentropy": 2.345592975616455, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15971240401268005, "step": 24594 }, { "epoch": 0.768625, "grad_norm": 3.3125, "grad_norm_var": 0.053238932291666666, "learning_rate": 0.0001, "loss": 5.5878, "loss/crossentropy": 2.4835760593414307, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16394073516130447, "step": 24596 }, { "epoch": 0.7686875, "grad_norm": 3.234375, "grad_norm_var": 0.05015360514322917, "learning_rate": 0.0001, "loss": 5.4102, "loss/crossentropy": 2.3610429763793945, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15881942212581635, "step": 24598 }, { "epoch": 0.76875, "grad_norm": 3.3125, "grad_norm_var": 0.05064697265625, "learning_rate": 0.0001, "loss": 5.6839, "loss/crossentropy": 2.5284552574157715, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16827517747879028, "step": 24600 }, { "epoch": 0.7688125, "grad_norm": 2.921875, "grad_norm_var": 0.0415924072265625, "learning_rate": 0.0001, "loss": 5.9036, "loss/crossentropy": 2.785243272781372, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16417969018220901, "step": 24602 }, { "epoch": 0.768875, "grad_norm": 2.96875, "grad_norm_var": 0.03511962890625, "learning_rate": 0.0001, "loss": 5.427, "loss/crossentropy": 2.4293742179870605, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15093691647052765, "step": 24604 }, { "epoch": 0.7689375, "grad_norm": 3.015625, "grad_norm_var": 0.044482421875, "learning_rate": 0.0001, "loss": 5.9005, "loss/crossentropy": 2.66623592376709, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17576977610588074, "step": 24606 }, { "epoch": 0.769, "grad_norm": 3.15625, "grad_norm_var": 0.04067281087239583, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.585243344306946, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1619393303990364, "step": 24608 }, { "epoch": 0.7690625, "grad_norm": 3.390625, "grad_norm_var": 0.051024373372395834, "learning_rate": 0.0001, "loss": 5.6106, "loss/crossentropy": 2.5044806003570557, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16295979917049408, "step": 24610 }, { "epoch": 0.769125, "grad_norm": 3.109375, "grad_norm_var": 0.0509429931640625, "learning_rate": 0.0001, "loss": 5.8575, "loss/crossentropy": 2.700343132019043, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1684456691145897, "step": 24612 }, { "epoch": 0.7691875, "grad_norm": 3.234375, "grad_norm_var": 0.0538238525390625, "learning_rate": 0.0001, "loss": 5.5381, "loss/crossentropy": 2.435794472694397, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16413667798042297, "step": 24614 }, { "epoch": 0.76925, "grad_norm": 2.78125, "grad_norm_var": 0.057470703125, "learning_rate": 0.0001, "loss": 5.4452, "loss/crossentropy": 2.4861371517181396, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15293768048286438, "step": 24616 }, { "epoch": 0.7693125, "grad_norm": 3.140625, "grad_norm_var": 0.04978739420572917, "learning_rate": 0.0001, "loss": 5.7474, "loss/crossentropy": 2.54036545753479, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17070432007312775, "step": 24618 }, { "epoch": 0.769375, "grad_norm": 3.0625, "grad_norm_var": 0.05236002604166667, "learning_rate": 0.0001, "loss": 5.5705, "loss/crossentropy": 2.5813162326812744, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15633995831012726, "step": 24620 }, { "epoch": 0.7694375, "grad_norm": 3.09375, "grad_norm_var": 0.04547526041666667, "learning_rate": 0.0001, "loss": 5.6974, "loss/crossentropy": 2.709510087966919, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15660195797681808, "step": 24622 }, { "epoch": 0.7695, "grad_norm": 3.09375, "grad_norm_var": 0.043192545572916664, "learning_rate": 0.0001, "loss": 5.6082, "loss/crossentropy": 2.563498377799988, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15837693214416504, "step": 24624 }, { "epoch": 0.7695625, "grad_norm": 3.15625, "grad_norm_var": 0.030866495768229165, "learning_rate": 0.0001, "loss": 5.7813, "loss/crossentropy": 2.671633005142212, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16370201110839844, "step": 24626 }, { "epoch": 0.769625, "grad_norm": 3.109375, "grad_norm_var": 0.03408203125, "learning_rate": 0.0001, "loss": 5.5951, "loss/crossentropy": 2.5170499086380005, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1605348140001297, "step": 24628 }, { "epoch": 0.7696875, "grad_norm": 3.0, "grad_norm_var": 0.028108723958333335, "learning_rate": 0.0001, "loss": 5.7315, "loss/crossentropy": 2.6486425399780273, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16609492897987366, "step": 24630 }, { "epoch": 0.76975, "grad_norm": 2.875, "grad_norm_var": 0.033772786458333336, "learning_rate": 0.0001, "loss": 5.6552, "loss/crossentropy": 2.550073027610779, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1655862033367157, "step": 24632 }, { "epoch": 0.7698125, "grad_norm": 2.96875, "grad_norm_var": 0.030594889322916666, "learning_rate": 0.0001, "loss": 5.8422, "loss/crossentropy": 2.697143793106079, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16762875020503998, "step": 24634 }, { "epoch": 0.769875, "grad_norm": 3.140625, "grad_norm_var": 0.028718058268229166, "learning_rate": 0.0001, "loss": 5.4412, "loss/crossentropy": 2.3402713537216187, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16438943147659302, "step": 24636 }, { "epoch": 0.7699375, "grad_norm": 3.015625, "grad_norm_var": 0.0243804931640625, "learning_rate": 0.0001, "loss": 5.2724, "loss/crossentropy": 2.31243634223938, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15302617102861404, "step": 24638 }, { "epoch": 0.77, "grad_norm": 3.109375, "grad_norm_var": 0.0235748291015625, "learning_rate": 0.0001, "loss": 5.5695, "loss/crossentropy": 2.542052388191223, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15860824286937714, "step": 24640 }, { "epoch": 0.7700625, "grad_norm": 2.78125, "grad_norm_var": 0.032124837239583336, "learning_rate": 0.0001, "loss": 5.4541, "loss/crossentropy": 2.534891366958618, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15129749476909637, "step": 24642 }, { "epoch": 0.770125, "grad_norm": 3.03125, "grad_norm_var": 0.029637654622395832, "learning_rate": 0.0001, "loss": 5.4417, "loss/crossentropy": 2.3925892114639282, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16037844121456146, "step": 24644 }, { "epoch": 0.7701875, "grad_norm": 3.125, "grad_norm_var": 0.03162333170572917, "learning_rate": 0.0001, "loss": 5.9961, "loss/crossentropy": 2.7234139442443848, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1764841303229332, "step": 24646 }, { "epoch": 0.77025, "grad_norm": 3.8125, "grad_norm_var": 0.0557769775390625, "learning_rate": 0.0001, "loss": 5.5932, "loss/crossentropy": 2.582900047302246, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.160794235765934, "step": 24648 }, { "epoch": 0.7703125, "grad_norm": 2.96875, "grad_norm_var": 0.05937398274739583, "learning_rate": 0.0001, "loss": 5.7556, "loss/crossentropy": 2.619705557823181, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16437150537967682, "step": 24650 }, { "epoch": 0.770375, "grad_norm": 3.359375, "grad_norm_var": 0.414208984375, "learning_rate": 0.0001, "loss": 5.6751, "loss/crossentropy": 2.449666142463684, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1760595217347145, "step": 24652 }, { "epoch": 0.7704375, "grad_norm": 2.9375, "grad_norm_var": 0.40784098307291666, "learning_rate": 0.0001, "loss": 5.5639, "loss/crossentropy": 2.472114086151123, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.16855724155902863, "step": 24654 }, { "epoch": 0.7705, "grad_norm": 3.046875, "grad_norm_var": 0.41793212890625, "learning_rate": 0.0001, "loss": 5.8266, "loss/crossentropy": 2.689388871192932, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16997243463993073, "step": 24656 }, { "epoch": 0.7705625, "grad_norm": 3.015625, "grad_norm_var": 0.39348958333333334, "learning_rate": 0.0001, "loss": 5.5236, "loss/crossentropy": 2.5500930547714233, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15438473224639893, "step": 24658 }, { "epoch": 0.770625, "grad_norm": 3.265625, "grad_norm_var": 0.39765625, "learning_rate": 0.0001, "loss": 5.8364, "loss/crossentropy": 2.690742611885071, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16729624569416046, "step": 24660 }, { "epoch": 0.7706875, "grad_norm": 3.25, "grad_norm_var": 0.39846598307291664, "learning_rate": 0.0001, "loss": 5.5172, "loss/crossentropy": 2.4236879348754883, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16169168800115585, "step": 24662 }, { "epoch": 0.77075, "grad_norm": 3.265625, "grad_norm_var": 0.3762440999348958, "learning_rate": 0.0001, "loss": 6.1183, "loss/crossentropy": 2.8352246284484863, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17635374516248703, "step": 24664 }, { "epoch": 0.7708125, "grad_norm": 3.203125, "grad_norm_var": 0.37018941243489584, "learning_rate": 0.0001, "loss": 5.7557, "loss/crossentropy": 2.561169385910034, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16866753995418549, "step": 24666 }, { "epoch": 0.770875, "grad_norm": 3.234375, "grad_norm_var": 0.02154541015625, "learning_rate": 0.0001, "loss": 5.6417, "loss/crossentropy": 2.5521970987319946, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16480600833892822, "step": 24668 }, { "epoch": 0.7709375, "grad_norm": 3.1875, "grad_norm_var": 0.018745930989583333, "learning_rate": 0.0001, "loss": 5.7403, "loss/crossentropy": 2.6300442218780518, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16219773888587952, "step": 24670 }, { "epoch": 0.771, "grad_norm": 3.09375, "grad_norm_var": 0.012235514322916667, "learning_rate": 0.0001, "loss": 5.2951, "loss/crossentropy": 2.3326436281204224, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1540607511997223, "step": 24672 }, { "epoch": 0.7710625, "grad_norm": 3.34375, "grad_norm_var": 0.014090983072916667, "learning_rate": 0.0001, "loss": 5.6972, "loss/crossentropy": 2.546350121498108, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16743051260709763, "step": 24674 }, { "epoch": 0.771125, "grad_norm": 3.0, "grad_norm_var": 0.01383056640625, "learning_rate": 0.0001, "loss": 5.4417, "loss/crossentropy": 2.4570369720458984, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15198008716106415, "step": 24676 }, { "epoch": 0.7711875, "grad_norm": 3.53125, "grad_norm_var": 0.0245513916015625, "learning_rate": 0.0001, "loss": 5.6782, "loss/crossentropy": 2.674981474876404, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15735703706741333, "step": 24678 }, { "epoch": 0.77125, "grad_norm": 3.03125, "grad_norm_var": 0.025634765625, "learning_rate": 0.0001, "loss": 5.8005, "loss/crossentropy": 2.643213987350464, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16924799233675003, "step": 24680 }, { "epoch": 0.7713125, "grad_norm": 2.859375, "grad_norm_var": 0.028369140625, "learning_rate": 0.0001, "loss": 5.821, "loss/crossentropy": 2.7028998136520386, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16650206595659256, "step": 24682 }, { "epoch": 0.771375, "grad_norm": 3.25, "grad_norm_var": 0.03264058430989583, "learning_rate": 0.0001, "loss": 5.9664, "loss/crossentropy": 2.768761396408081, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17328058928251266, "step": 24684 }, { "epoch": 0.7714375, "grad_norm": 4.375, "grad_norm_var": 0.1312164306640625, "learning_rate": 0.0001, "loss": 5.8115, "loss/crossentropy": 2.6067157983779907, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1704786792397499, "step": 24686 }, { "epoch": 0.7715, "grad_norm": 3.109375, "grad_norm_var": 0.13782552083333333, "learning_rate": 0.0001, "loss": 5.8724, "loss/crossentropy": 2.8215242624282837, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1605537086725235, "step": 24688 }, { "epoch": 0.7715625, "grad_norm": 3.109375, "grad_norm_var": 0.13571675618489584, "learning_rate": 0.0001, "loss": 5.9054, "loss/crossentropy": 2.739312529563904, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17129508405923843, "step": 24690 }, { "epoch": 0.771625, "grad_norm": 3.078125, "grad_norm_var": 0.1363677978515625, "learning_rate": 0.0001, "loss": 5.6326, "loss/crossentropy": 2.5702801942825317, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16482558846473694, "step": 24692 }, { "epoch": 0.7716875, "grad_norm": 3.1875, "grad_norm_var": 0.12546284993489584, "learning_rate": 0.0001, "loss": 5.5806, "loss/crossentropy": 2.48318612575531, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16287126392126083, "step": 24694 }, { "epoch": 0.77175, "grad_norm": 3.515625, "grad_norm_var": 0.13491923014322918, "learning_rate": 0.0001, "loss": 5.9828, "loss/crossentropy": 2.823164939880371, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16635094583034515, "step": 24696 }, { "epoch": 0.7718125, "grad_norm": 2.90625, "grad_norm_var": 0.13479715983072918, "learning_rate": 0.0001, "loss": 5.2987, "loss/crossentropy": 2.3417288064956665, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.155068501830101, "step": 24698 }, { "epoch": 0.771875, "grad_norm": 3.21875, "grad_norm_var": 0.13093973795572916, "learning_rate": 0.0001, "loss": 5.5299, "loss/crossentropy": 2.449816107749939, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16113300621509552, "step": 24700 }, { "epoch": 0.7719375, "grad_norm": 3.296875, "grad_norm_var": 0.037353515625, "learning_rate": 0.0001, "loss": 6.0161, "loss/crossentropy": 2.7602035999298096, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17402522265911102, "step": 24702 }, { "epoch": 0.772, "grad_norm": 3.09375, "grad_norm_var": 0.03395894368489583, "learning_rate": 0.0001, "loss": 5.3871, "loss/crossentropy": 2.3438072204589844, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16136392951011658, "step": 24704 }, { "epoch": 0.7720625, "grad_norm": 3.125, "grad_norm_var": 0.04055582682291667, "learning_rate": 0.0001, "loss": 5.8478, "loss/crossentropy": 2.6474716663360596, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16964183002710342, "step": 24706 }, { "epoch": 0.772125, "grad_norm": 3.28125, "grad_norm_var": 0.037646484375, "learning_rate": 0.0001, "loss": 5.6259, "loss/crossentropy": 2.491701126098633, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16185308247804642, "step": 24708 }, { "epoch": 0.7721875, "grad_norm": 3.1875, "grad_norm_var": 0.0439849853515625, "learning_rate": 0.0001, "loss": 5.6412, "loss/crossentropy": 2.3866571187973022, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17232473194599152, "step": 24710 }, { "epoch": 0.77225, "grad_norm": 2.984375, "grad_norm_var": 0.0389068603515625, "learning_rate": 0.0001, "loss": 5.3108, "loss/crossentropy": 2.3295599222183228, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15086036920547485, "step": 24712 }, { "epoch": 0.7723125, "grad_norm": 2.921875, "grad_norm_var": 0.03654683430989583, "learning_rate": 0.0001, "loss": 5.5482, "loss/crossentropy": 2.5633562803268433, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15708180516958237, "step": 24714 }, { "epoch": 0.772375, "grad_norm": 2.828125, "grad_norm_var": 0.04842020670572917, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.6539541482925415, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16868995130062103, "step": 24716 }, { "epoch": 0.7724375, "grad_norm": 3.171875, "grad_norm_var": 0.03962300618489583, "learning_rate": 0.0001, "loss": 5.6143, "loss/crossentropy": 2.4918437004089355, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16146771609783173, "step": 24718 }, { "epoch": 0.7725, "grad_norm": 3.0625, "grad_norm_var": 0.04254150390625, "learning_rate": 0.0001, "loss": 5.3944, "loss/crossentropy": 2.48309588432312, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15128827840089798, "step": 24720 }, { "epoch": 0.7725625, "grad_norm": 2.953125, "grad_norm_var": 0.05237630208333333, "learning_rate": 0.0001, "loss": 5.4593, "loss/crossentropy": 2.4418129920959473, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1568250209093094, "step": 24722 }, { "epoch": 0.772625, "grad_norm": 3.046875, "grad_norm_var": 0.055826822916666664, "learning_rate": 0.0001, "loss": 5.3066, "loss/crossentropy": 2.3395392894744873, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15412385761737823, "step": 24724 }, { "epoch": 0.7726875, "grad_norm": 3.21875, "grad_norm_var": 0.044331868489583336, "learning_rate": 0.0001, "loss": 5.512, "loss/crossentropy": 2.472067952156067, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16141273826360703, "step": 24726 }, { "epoch": 0.77275, "grad_norm": 3.25, "grad_norm_var": 0.04455973307291667, "learning_rate": 0.0001, "loss": 5.6816, "loss/crossentropy": 2.6148258447647095, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16175460070371628, "step": 24728 }, { "epoch": 0.7728125, "grad_norm": 3.359375, "grad_norm_var": 0.0489654541015625, "learning_rate": 0.0001, "loss": 5.5001, "loss/crossentropy": 2.4213614463806152, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16295599192380905, "step": 24730 }, { "epoch": 0.772875, "grad_norm": 2.8125, "grad_norm_var": 0.0503326416015625, "learning_rate": 0.0001, "loss": 5.649, "loss/crossentropy": 2.5784149169921875, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16448526084423065, "step": 24732 }, { "epoch": 0.7729375, "grad_norm": 3.109375, "grad_norm_var": 0.0536529541015625, "learning_rate": 0.0001, "loss": 5.6552, "loss/crossentropy": 2.6306252479553223, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16026630997657776, "step": 24734 }, { "epoch": 0.773, "grad_norm": 2.90625, "grad_norm_var": 0.062939453125, "learning_rate": 0.0001, "loss": 5.3572, "loss/crossentropy": 2.4248207807540894, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.15378645062446594, "step": 24736 }, { "epoch": 0.7730625, "grad_norm": 2.734375, "grad_norm_var": 0.050048828125, "learning_rate": 0.0001, "loss": 5.5771, "loss/crossentropy": 2.5675575733184814, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15759897232055664, "step": 24738 }, { "epoch": 0.773125, "grad_norm": 3.0625, "grad_norm_var": 0.0483795166015625, "learning_rate": 0.0001, "loss": 5.2249, "loss/crossentropy": 2.2803955078125, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15304088592529297, "step": 24740 }, { "epoch": 0.7731875, "grad_norm": 3.03125, "grad_norm_var": 0.0474609375, "learning_rate": 0.0001, "loss": 5.3043, "loss/crossentropy": 2.3219728469848633, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15448330342769623, "step": 24742 }, { "epoch": 0.77325, "grad_norm": 3.15625, "grad_norm_var": 0.0469635009765625, "learning_rate": 0.0001, "loss": 5.5888, "loss/crossentropy": 2.528611898422241, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16499833017587662, "step": 24744 }, { "epoch": 0.7733125, "grad_norm": 2.90625, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 5.63, "loss/crossentropy": 2.6092342138290405, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15989167988300323, "step": 24746 }, { "epoch": 0.773375, "grad_norm": 3.421875, "grad_norm_var": 0.03453369140625, "learning_rate": 0.0001, "loss": 5.5182, "loss/crossentropy": 2.3983983993530273, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16314804553985596, "step": 24748 }, { "epoch": 0.7734375, "grad_norm": 3.171875, "grad_norm_var": 0.03707682291666667, "learning_rate": 0.0001, "loss": 5.7343, "loss/crossentropy": 2.582399845123291, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16597573459148407, "step": 24750 }, { "epoch": 0.7735, "grad_norm": 3.0, "grad_norm_var": 0.027936808268229165, "learning_rate": 0.0001, "loss": 6.0781, "loss/crossentropy": 2.916908025741577, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16611584275960922, "step": 24752 }, { "epoch": 0.7735625, "grad_norm": 3.015625, "grad_norm_var": 0.019269816080729165, "learning_rate": 0.0001, "loss": 5.567, "loss/crossentropy": 2.5042638778686523, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15861467272043228, "step": 24754 }, { "epoch": 0.773625, "grad_norm": 3.375, "grad_norm_var": 0.0253082275390625, "learning_rate": 0.0001, "loss": 5.654, "loss/crossentropy": 2.4719542264938354, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1666421741247177, "step": 24756 }, { "epoch": 0.7736875, "grad_norm": 3.015625, "grad_norm_var": 0.0220367431640625, "learning_rate": 0.0001, "loss": 5.8333, "loss/crossentropy": 2.6948615312576294, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16696688532829285, "step": 24758 }, { "epoch": 0.77375, "grad_norm": 2.890625, "grad_norm_var": 0.025300089518229166, "learning_rate": 0.0001, "loss": 5.6459, "loss/crossentropy": 2.5435097217559814, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1618061512708664, "step": 24760 }, { "epoch": 0.7738125, "grad_norm": 3.03125, "grad_norm_var": 0.024779256184895834, "learning_rate": 0.0001, "loss": 5.5149, "loss/crossentropy": 2.5401512384414673, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15567483007907867, "step": 24762 }, { "epoch": 0.773875, "grad_norm": 3.203125, "grad_norm_var": 0.01689453125, "learning_rate": 0.0001, "loss": 5.8176, "loss/crossentropy": 2.6180412769317627, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17542476952075958, "step": 24764 }, { "epoch": 0.7739375, "grad_norm": 3.796875, "grad_norm_var": 0.04742431640625, "learning_rate": 0.0001, "loss": 6.158, "loss/crossentropy": 2.888972043991089, "loss/hidden": 1.53515625, "loss/jsd": 0.0, "loss/logits": 0.17338383942842484, "step": 24766 }, { "epoch": 0.774, "grad_norm": 3.109375, "grad_norm_var": 0.04649149576822917, "learning_rate": 0.0001, "loss": 5.9144, "loss/crossentropy": 2.7113550901412964, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17303407192230225, "step": 24768 }, { "epoch": 0.7740625, "grad_norm": 3.546875, "grad_norm_var": 0.0568023681640625, "learning_rate": 0.0001, "loss": 5.7253, "loss/crossentropy": 2.5738178491592407, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1671004444360733, "step": 24770 }, { "epoch": 0.774125, "grad_norm": 3.484375, "grad_norm_var": 0.06061197916666667, "learning_rate": 0.0001, "loss": 5.9538, "loss/crossentropy": 2.709931254386902, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17360104620456696, "step": 24772 }, { "epoch": 0.7741875, "grad_norm": 2.921875, "grad_norm_var": 0.06789957682291667, "learning_rate": 0.0001, "loss": 5.4632, "loss/crossentropy": 2.4295272827148438, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15883446484804153, "step": 24774 }, { "epoch": 0.77425, "grad_norm": 3.03125, "grad_norm_var": 0.06398824055989584, "learning_rate": 0.0001, "loss": 5.6834, "loss/crossentropy": 2.581278085708618, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1641213744878769, "step": 24776 }, { "epoch": 0.7743125, "grad_norm": 2.8125, "grad_norm_var": 0.067626953125, "learning_rate": 0.0001, "loss": 5.5977, "loss/crossentropy": 2.5667072534561157, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1585664376616478, "step": 24778 }, { "epoch": 0.774375, "grad_norm": 3.21875, "grad_norm_var": 0.0672027587890625, "learning_rate": 0.0001, "loss": 6.0705, "loss/crossentropy": 2.816719889640808, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1753796860575676, "step": 24780 }, { "epoch": 0.7744375, "grad_norm": 3.015625, "grad_norm_var": 0.039143880208333336, "learning_rate": 0.0001, "loss": 5.2543, "loss/crossentropy": 2.2697088718414307, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15118995308876038, "step": 24782 }, { "epoch": 0.7745, "grad_norm": 3.296875, "grad_norm_var": 0.04049072265625, "learning_rate": 0.0001, "loss": 5.8765, "loss/crossentropy": 2.7213023900985718, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16435182094573975, "step": 24784 }, { "epoch": 0.7745625, "grad_norm": 3.171875, "grad_norm_var": 0.0284820556640625, "learning_rate": 0.0001, "loss": 5.5373, "loss/crossentropy": 2.464460611343384, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16001853346824646, "step": 24786 }, { "epoch": 0.774625, "grad_norm": 3.046875, "grad_norm_var": 0.018195597330729167, "learning_rate": 0.0001, "loss": 5.7178, "loss/crossentropy": 2.6299628019332886, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16112637519836426, "step": 24788 }, { "epoch": 0.7746875, "grad_norm": 3.0, "grad_norm_var": 0.014306640625, "learning_rate": 0.0001, "loss": 5.662, "loss/crossentropy": 2.570648670196533, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.158745676279068, "step": 24790 }, { "epoch": 0.77475, "grad_norm": 2.828125, "grad_norm_var": 0.018798828125, "learning_rate": 0.0001, "loss": 5.5327, "loss/crossentropy": 2.5075855255126953, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16149257868528366, "step": 24792 }, { "epoch": 0.7748125, "grad_norm": 2.90625, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 5.6924, "loss/crossentropy": 2.586639642715454, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1625300496816635, "step": 24794 }, { "epoch": 0.774875, "grad_norm": 3.09375, "grad_norm_var": 0.015119425455729167, "learning_rate": 0.0001, "loss": 5.5558, "loss/crossentropy": 2.5161207914352417, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16217201948165894, "step": 24796 }, { "epoch": 0.7749375, "grad_norm": 2.90625, "grad_norm_var": 0.016109212239583334, "learning_rate": 0.0001, "loss": 5.5311, "loss/crossentropy": 2.5070807933807373, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15865183621644974, "step": 24798 }, { "epoch": 0.775, "grad_norm": 2.96875, "grad_norm_var": 0.011507161458333333, "learning_rate": 0.0001, "loss": 5.5333, "loss/crossentropy": 2.538558006286621, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15416593849658966, "step": 24800 }, { "epoch": 0.7750625, "grad_norm": 3.09375, "grad_norm_var": 0.010677083333333334, "learning_rate": 0.0001, "loss": 5.7173, "loss/crossentropy": 2.6183621883392334, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16458562016487122, "step": 24802 }, { "epoch": 0.775125, "grad_norm": 3.125, "grad_norm_var": 0.013850911458333334, "learning_rate": 0.0001, "loss": 6.0465, "loss/crossentropy": 2.82187819480896, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1736362800002098, "step": 24804 }, { "epoch": 0.7751875, "grad_norm": 3.328125, "grad_norm_var": 0.017073567708333334, "learning_rate": 0.0001, "loss": 5.8288, "loss/crossentropy": 2.6323657035827637, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1727638989686966, "step": 24806 }, { "epoch": 0.77525, "grad_norm": 2.828125, "grad_norm_var": 0.017243448893229166, "learning_rate": 0.0001, "loss": 5.759, "loss/crossentropy": 2.6707929372787476, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16038425266742706, "step": 24808 }, { "epoch": 0.7753125, "grad_norm": 3.015625, "grad_norm_var": 0.020970662434895832, "learning_rate": 0.0001, "loss": 5.672, "loss/crossentropy": 2.5206936597824097, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16591551899909973, "step": 24810 }, { "epoch": 0.775375, "grad_norm": 3.0625, "grad_norm_var": 0.020751953125, "learning_rate": 0.0001, "loss": 5.6463, "loss/crossentropy": 2.5930248498916626, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16157421469688416, "step": 24812 }, { "epoch": 0.7754375, "grad_norm": 3.328125, "grad_norm_var": 0.022233072916666666, "learning_rate": 0.0001, "loss": 5.7211, "loss/crossentropy": 2.590226650238037, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16738758236169815, "step": 24814 }, { "epoch": 0.7755, "grad_norm": 3.59375, "grad_norm_var": 0.033503214518229164, "learning_rate": 0.0001, "loss": 5.7101, "loss/crossentropy": 2.5524097681045532, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16850201040506363, "step": 24816 }, { "epoch": 0.7755625, "grad_norm": 3.21875, "grad_norm_var": 0.03428446451822917, "learning_rate": 0.0001, "loss": 5.4872, "loss/crossentropy": 2.4411537647247314, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16007166355848312, "step": 24818 }, { "epoch": 0.775625, "grad_norm": 3.34375, "grad_norm_var": 0.037840779622395834, "learning_rate": 0.0001, "loss": 5.6216, "loss/crossentropy": 2.4603710174560547, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16846421360969543, "step": 24820 }, { "epoch": 0.7756875, "grad_norm": 2.984375, "grad_norm_var": 0.03760477701822917, "learning_rate": 0.0001, "loss": 5.5557, "loss/crossentropy": 2.583541512489319, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15385423600673676, "step": 24822 }, { "epoch": 0.77575, "grad_norm": 3.0, "grad_norm_var": 0.031962076822916664, "learning_rate": 0.0001, "loss": 5.96, "loss/crossentropy": 2.7172566652297974, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17466983199119568, "step": 24824 }, { "epoch": 0.7758125, "grad_norm": 3.078125, "grad_norm_var": 0.030695597330729168, "learning_rate": 0.0001, "loss": 5.7281, "loss/crossentropy": 2.6229422092437744, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16754963248968124, "step": 24826 }, { "epoch": 0.775875, "grad_norm": 2.953125, "grad_norm_var": 0.0330230712890625, "learning_rate": 0.0001, "loss": 5.57, "loss/crossentropy": 2.596638798713684, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15554015338420868, "step": 24828 }, { "epoch": 0.7759375, "grad_norm": 2.890625, "grad_norm_var": 7.518680826822917, "learning_rate": 0.0001, "loss": 5.2667, "loss/crossentropy": 2.226096510887146, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.14898383617401123, "step": 24830 }, { "epoch": 0.776, "grad_norm": 3.34375, "grad_norm_var": 7.539273071289062, "learning_rate": 0.0001, "loss": 5.4205, "loss/crossentropy": 2.4121206998825073, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15747754275798798, "step": 24832 }, { "epoch": 0.7760625, "grad_norm": 3.59375, "grad_norm_var": 7.495536295572917, "learning_rate": 0.0001, "loss": 6.0159, "loss/crossentropy": 2.6774450540542603, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.17837996780872345, "step": 24834 }, { "epoch": 0.776125, "grad_norm": 3.1875, "grad_norm_var": 7.490347290039063, "learning_rate": 0.0001, "loss": 5.8412, "loss/crossentropy": 2.6141446828842163, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17114035040140152, "step": 24836 }, { "epoch": 0.7761875, "grad_norm": 3.25, "grad_norm_var": 7.467740885416666, "learning_rate": 0.0001, "loss": 5.4981, "loss/crossentropy": 2.413454294204712, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1604211926460266, "step": 24838 }, { "epoch": 0.77625, "grad_norm": 3.21875, "grad_norm_var": 7.442805989583333, "learning_rate": 0.0001, "loss": 5.7351, "loss/crossentropy": 2.582782506942749, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1671806275844574, "step": 24840 }, { "epoch": 0.7763125, "grad_norm": 3.71875, "grad_norm_var": 7.39107666015625, "learning_rate": 0.0001, "loss": 5.5931, "loss/crossentropy": 2.4901710748672485, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1614660620689392, "step": 24842 }, { "epoch": 0.776375, "grad_norm": 2.9375, "grad_norm_var": 7.373177083333333, "learning_rate": 0.0001, "loss": 5.6264, "loss/crossentropy": 2.586179494857788, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15948884189128876, "step": 24844 }, { "epoch": 0.7764375, "grad_norm": 3.109375, "grad_norm_var": 0.0643707275390625, "learning_rate": 0.0001, "loss": 5.9235, "loss/crossentropy": 2.758601188659668, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1680537387728691, "step": 24846 }, { "epoch": 0.7765, "grad_norm": 3.453125, "grad_norm_var": 0.08557942708333334, "learning_rate": 0.0001, "loss": 5.5767, "loss/crossentropy": 2.444355845451355, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.16010946035385132, "step": 24848 }, { "epoch": 0.7765625, "grad_norm": 3.0625, "grad_norm_var": 0.07801005045572916, "learning_rate": 0.0001, "loss": 5.5247, "loss/crossentropy": 2.4560447931289673, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16155368089675903, "step": 24850 }, { "epoch": 0.776625, "grad_norm": 3.03125, "grad_norm_var": 0.06894124348958333, "learning_rate": 0.0001, "loss": 5.8591, "loss/crossentropy": 2.7664119005203247, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16317206621170044, "step": 24852 }, { "epoch": 0.7766875, "grad_norm": 5.0625, "grad_norm_var": 0.28684895833333335, "learning_rate": 0.0001, "loss": 5.5702, "loss/crossentropy": 2.3672115802764893, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17381131649017334, "step": 24854 }, { "epoch": 0.77675, "grad_norm": 3.15625, "grad_norm_var": 0.2888671875, "learning_rate": 0.0001, "loss": 5.8842, "loss/crossentropy": 2.6977726221084595, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17333439737558365, "step": 24856 }, { "epoch": 0.7768125, "grad_norm": 3.078125, "grad_norm_var": 0.2814605712890625, "learning_rate": 0.0001, "loss": 5.4317, "loss/crossentropy": 2.418553113937378, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1587332859635353, "step": 24858 }, { "epoch": 0.776875, "grad_norm": 3.09375, "grad_norm_var": 0.284814453125, "learning_rate": 0.0001, "loss": 5.505, "loss/crossentropy": 2.3784608840942383, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1669526994228363, "step": 24860 }, { "epoch": 0.7769375, "grad_norm": 3.078125, "grad_norm_var": 0.28543192545572915, "learning_rate": 0.0001, "loss": 5.8212, "loss/crossentropy": 2.6897518634796143, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1678316667675972, "step": 24862 }, { "epoch": 0.777, "grad_norm": 3.140625, "grad_norm_var": 0.2692616780598958, "learning_rate": 0.0001, "loss": 5.552, "loss/crossentropy": 2.5030031204223633, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1588040366768837, "step": 24864 }, { "epoch": 0.7770625, "grad_norm": 3.71875, "grad_norm_var": 0.2734771728515625, "learning_rate": 0.0001, "loss": 6.067, "loss/crossentropy": 2.8639168739318848, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1730470359325409, "step": 24866 }, { "epoch": 0.777125, "grad_norm": 3.1875, "grad_norm_var": 0.26049702962239585, "learning_rate": 0.0001, "loss": 5.8398, "loss/crossentropy": 2.6833678483963013, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16408467292785645, "step": 24868 }, { "epoch": 0.7771875, "grad_norm": 3.1875, "grad_norm_var": 0.04606831868489583, "learning_rate": 0.0001, "loss": 5.9563, "loss/crossentropy": 2.810359001159668, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16381268203258514, "step": 24870 }, { "epoch": 0.77725, "grad_norm": 3.078125, "grad_norm_var": 0.0539703369140625, "learning_rate": 0.0001, "loss": 5.4282, "loss/crossentropy": 2.3819870948791504, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.15422788262367249, "step": 24872 }, { "epoch": 0.7773125, "grad_norm": 2.90625, "grad_norm_var": 0.05467020670572917, "learning_rate": 0.0001, "loss": 5.6281, "loss/crossentropy": 2.5575016736984253, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16330711543560028, "step": 24874 }, { "epoch": 0.777375, "grad_norm": 2.9375, "grad_norm_var": 0.051806640625, "learning_rate": 0.0001, "loss": 5.7753, "loss/crossentropy": 2.6280269622802734, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16707344353199005, "step": 24876 }, { "epoch": 0.7774375, "grad_norm": 3.265625, "grad_norm_var": 0.0498443603515625, "learning_rate": 0.0001, "loss": 6.1472, "loss/crossentropy": 2.795683979988098, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.18241284787654877, "step": 24878 }, { "epoch": 0.7775, "grad_norm": 3.8125, "grad_norm_var": 0.0611724853515625, "learning_rate": 0.0001, "loss": 5.9611, "loss/crossentropy": 2.6614965200424194, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18074382841587067, "step": 24880 }, { "epoch": 0.7775625, "grad_norm": 3.234375, "grad_norm_var": 0.06274312337239583, "learning_rate": 0.0001, "loss": 5.7818, "loss/crossentropy": 2.665590763092041, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16552364826202393, "step": 24882 }, { "epoch": 0.777625, "grad_norm": 3.171875, "grad_norm_var": 0.06363016764322917, "learning_rate": 0.0001, "loss": 5.6286, "loss/crossentropy": 2.4945160150527954, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16653265058994293, "step": 24884 }, { "epoch": 0.7776875, "grad_norm": 2.890625, "grad_norm_var": 0.07089436848958333, "learning_rate": 0.0001, "loss": 5.6881, "loss/crossentropy": 2.6458394527435303, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1569637879729271, "step": 24886 }, { "epoch": 0.77775, "grad_norm": 2.953125, "grad_norm_var": 0.06551106770833333, "learning_rate": 0.0001, "loss": 5.7243, "loss/crossentropy": 2.640910267829895, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16068334877490997, "step": 24888 }, { "epoch": 0.7778125, "grad_norm": 3.015625, "grad_norm_var": 0.0659088134765625, "learning_rate": 0.0001, "loss": 5.549, "loss/crossentropy": 2.4916951656341553, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15651002526283264, "step": 24890 }, { "epoch": 0.777875, "grad_norm": 3.359375, "grad_norm_var": 0.061498006184895836, "learning_rate": 0.0001, "loss": 5.5354, "loss/crossentropy": 2.4490002393722534, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16293669492006302, "step": 24892 }, { "epoch": 0.7779375, "grad_norm": 3.453125, "grad_norm_var": 0.06623942057291667, "learning_rate": 0.0001, "loss": 5.9291, "loss/crossentropy": 2.730451464653015, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17181465774774551, "step": 24894 }, { "epoch": 0.778, "grad_norm": 3.109375, "grad_norm_var": 0.03559468587239583, "learning_rate": 0.0001, "loss": 5.9067, "loss/crossentropy": 2.7338948249816895, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17040778696537018, "step": 24896 }, { "epoch": 0.7780625, "grad_norm": 2.953125, "grad_norm_var": 0.029588826497395835, "learning_rate": 0.0001, "loss": 5.7572, "loss/crossentropy": 2.622227907180786, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.168186254799366, "step": 24898 }, { "epoch": 0.778125, "grad_norm": 3.0625, "grad_norm_var": 0.02750244140625, "learning_rate": 0.0001, "loss": 5.7004, "loss/crossentropy": 2.5909247398376465, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16290107369422913, "step": 24900 }, { "epoch": 0.7781875, "grad_norm": 3.171875, "grad_norm_var": 0.0531890869140625, "learning_rate": 0.0001, "loss": 5.733, "loss/crossentropy": 2.54416024684906, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1712314933538437, "step": 24902 }, { "epoch": 0.77825, "grad_norm": 3.15625, "grad_norm_var": 0.054361979166666664, "learning_rate": 0.0001, "loss": 5.6732, "loss/crossentropy": 2.612813949584961, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1630677431821823, "step": 24904 }, { "epoch": 0.7783125, "grad_norm": 3.5625, "grad_norm_var": 0.05924072265625, "learning_rate": 0.0001, "loss": 5.705, "loss/crossentropy": 2.6064003705978394, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1637658178806305, "step": 24906 }, { "epoch": 0.778375, "grad_norm": 3.03125, "grad_norm_var": 0.05500386555989583, "learning_rate": 0.0001, "loss": 5.5631, "loss/crossentropy": 2.5389318466186523, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1555420458316803, "step": 24908 }, { "epoch": 0.7784375, "grad_norm": 3.75, "grad_norm_var": 0.08580322265625, "learning_rate": 0.0001, "loss": 5.8155, "loss/crossentropy": 2.735503077507019, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16190160065889359, "step": 24910 }, { "epoch": 0.7785, "grad_norm": 2.96875, "grad_norm_var": 0.08651936848958333, "learning_rate": 0.0001, "loss": 5.6744, "loss/crossentropy": 2.610866665840149, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1610436588525772, "step": 24912 }, { "epoch": 0.7785625, "grad_norm": 3.234375, "grad_norm_var": 0.08622639973958333, "learning_rate": 0.0001, "loss": 5.6425, "loss/crossentropy": 2.6181299686431885, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15947112441062927, "step": 24914 }, { "epoch": 0.778625, "grad_norm": 3.03125, "grad_norm_var": 0.0912506103515625, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.5940080881118774, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1618470698595047, "step": 24916 }, { "epoch": 0.7786875, "grad_norm": 3.53125, "grad_norm_var": 0.07492574055989583, "learning_rate": 0.0001, "loss": 5.7441, "loss/crossentropy": 2.642926573753357, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16245951503515244, "step": 24918 }, { "epoch": 0.77875, "grad_norm": 3.046875, "grad_norm_var": 0.07492574055989583, "learning_rate": 0.0001, "loss": 5.7488, "loss/crossentropy": 2.70668888092041, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16123943030834198, "step": 24920 }, { "epoch": 0.7788125, "grad_norm": 3.15625, "grad_norm_var": 0.06214090983072917, "learning_rate": 0.0001, "loss": 5.5348, "loss/crossentropy": 2.4590747356414795, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16225910186767578, "step": 24922 }, { "epoch": 0.778875, "grad_norm": 2.875, "grad_norm_var": 0.0652008056640625, "learning_rate": 0.0001, "loss": 5.3856, "loss/crossentropy": 2.4269120693206787, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15524086356163025, "step": 24924 }, { "epoch": 0.7789375, "grad_norm": 3.21875, "grad_norm_var": 0.036498006184895834, "learning_rate": 0.0001, "loss": 5.965, "loss/crossentropy": 2.7144534587860107, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17505048215389252, "step": 24926 }, { "epoch": 0.779, "grad_norm": 3.921875, "grad_norm_var": 0.39690755208333334, "learning_rate": 0.0001, "loss": 6.2914, "loss/crossentropy": 2.870009183883667, "loss/hidden": 1.55859375, "loss/jsd": 0.0, "loss/logits": 0.1862778663635254, "step": 24928 }, { "epoch": 0.7790625, "grad_norm": 3.125, "grad_norm_var": 0.3919097900390625, "learning_rate": 0.0001, "loss": 5.9061, "loss/crossentropy": 2.698647975921631, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17113231867551804, "step": 24930 }, { "epoch": 0.779125, "grad_norm": 3.65625, "grad_norm_var": 0.3816151936848958, "learning_rate": 0.0001, "loss": 5.943, "loss/crossentropy": 2.770224094390869, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16689125448465347, "step": 24932 }, { "epoch": 0.7791875, "grad_norm": 3.328125, "grad_norm_var": 0.37463785807291666, "learning_rate": 0.0001, "loss": 5.8719, "loss/crossentropy": 2.6833068132400513, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16768933832645416, "step": 24934 }, { "epoch": 0.77925, "grad_norm": 3.390625, "grad_norm_var": 0.36253153483072914, "learning_rate": 0.0001, "loss": 5.7564, "loss/crossentropy": 2.602460741996765, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16930370777845383, "step": 24936 }, { "epoch": 0.7793125, "grad_norm": 3.171875, "grad_norm_var": 0.36128641764322916, "learning_rate": 0.0001, "loss": 5.6362, "loss/crossentropy": 2.526370644569397, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16449592262506485, "step": 24938 }, { "epoch": 0.779375, "grad_norm": 2.75, "grad_norm_var": 0.3646769205729167, "learning_rate": 0.0001, "loss": 5.496, "loss/crossentropy": 2.463905930519104, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15868177264928818, "step": 24940 }, { "epoch": 0.7794375, "grad_norm": 3.125, "grad_norm_var": 0.3747955322265625, "learning_rate": 0.0001, "loss": 5.9716, "loss/crossentropy": 2.7801040410995483, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16875829547643661, "step": 24942 }, { "epoch": 0.7795, "grad_norm": 3.140625, "grad_norm_var": 0.046370442708333334, "learning_rate": 0.0001, "loss": 5.9026, "loss/crossentropy": 2.709265947341919, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.16776718199253082, "step": 24944 }, { "epoch": 0.7795625, "grad_norm": 2.96875, "grad_norm_var": 0.0492095947265625, "learning_rate": 0.0001, "loss": 5.5134, "loss/crossentropy": 2.4592331647872925, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15736858546733856, "step": 24946 }, { "epoch": 0.779625, "grad_norm": 3.1875, "grad_norm_var": 0.03206278483072917, "learning_rate": 0.0001, "loss": 5.6377, "loss/crossentropy": 2.5303972959518433, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1650300770998001, "step": 24948 }, { "epoch": 0.7796875, "grad_norm": 3.296875, "grad_norm_var": 0.021630859375, "learning_rate": 0.0001, "loss": 5.865, "loss/crossentropy": 2.693328857421875, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1714666560292244, "step": 24950 }, { "epoch": 0.77975, "grad_norm": 3.1875, "grad_norm_var": 0.0161285400390625, "learning_rate": 0.0001, "loss": 5.9176, "loss/crossentropy": 2.705698251724243, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17080193012952805, "step": 24952 }, { "epoch": 0.7798125, "grad_norm": 3.234375, "grad_norm_var": 0.019384765625, "learning_rate": 0.0001, "loss": 5.6606, "loss/crossentropy": 2.5922571420669556, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16230715066194534, "step": 24954 }, { "epoch": 0.779875, "grad_norm": 3.078125, "grad_norm_var": 0.016087849934895832, "learning_rate": 0.0001, "loss": 5.4266, "loss/crossentropy": 2.3955196142196655, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15467171370983124, "step": 24956 }, { "epoch": 0.7799375, "grad_norm": 3.046875, "grad_norm_var": 0.0160308837890625, "learning_rate": 0.0001, "loss": 5.2, "loss/crossentropy": 2.201227366924286, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15026813745498657, "step": 24958 }, { "epoch": 0.78, "grad_norm": 3.1875, "grad_norm_var": 0.02261962890625, "learning_rate": 0.0001, "loss": 5.5798, "loss/crossentropy": 2.509858250617981, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1593395695090294, "step": 24960 }, { "epoch": 0.7800625, "grad_norm": 3.0, "grad_norm_var": 0.025777180989583332, "learning_rate": 0.0001, "loss": 5.3551, "loss/crossentropy": 2.4357187747955322, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15053577721118927, "step": 24962 }, { "epoch": 0.780125, "grad_norm": 3.203125, "grad_norm_var": 0.0277984619140625, "learning_rate": 0.0001, "loss": 5.9254, "loss/crossentropy": 2.688409209251404, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17214136570692062, "step": 24964 }, { "epoch": 0.7801875, "grad_norm": 3.109375, "grad_norm_var": 0.0298736572265625, "learning_rate": 0.0001, "loss": 5.7217, "loss/crossentropy": 2.5876524448394775, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16847819089889526, "step": 24966 }, { "epoch": 0.78025, "grad_norm": 3.03125, "grad_norm_var": 0.031571451822916666, "learning_rate": 0.0001, "loss": 5.6469, "loss/crossentropy": 2.6176631450653076, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15761353820562363, "step": 24968 }, { "epoch": 0.7803125, "grad_norm": 3.109375, "grad_norm_var": 0.0280670166015625, "learning_rate": 0.0001, "loss": 5.6812, "loss/crossentropy": 2.5582680702209473, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16111940145492554, "step": 24970 }, { "epoch": 0.780375, "grad_norm": 3.484375, "grad_norm_var": 0.03092041015625, "learning_rate": 0.0001, "loss": 5.5374, "loss/crossentropy": 2.415781617164612, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1633347123861313, "step": 24972 }, { "epoch": 0.7804375, "grad_norm": 3.265625, "grad_norm_var": 0.037337239583333334, "learning_rate": 0.0001, "loss": 5.8287, "loss/crossentropy": 2.635912775993347, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17083972692489624, "step": 24974 }, { "epoch": 0.7805, "grad_norm": 3.109375, "grad_norm_var": 0.03524983723958333, "learning_rate": 0.0001, "loss": 5.4134, "loss/crossentropy": 2.5047943592071533, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.14984039962291718, "step": 24976 }, { "epoch": 0.7805625, "grad_norm": 3.796875, "grad_norm_var": 0.06164957682291667, "learning_rate": 0.0001, "loss": 5.691, "loss/crossentropy": 2.5033782720565796, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1671987697482109, "step": 24978 }, { "epoch": 0.780625, "grad_norm": 3.25, "grad_norm_var": 0.06101786295572917, "learning_rate": 0.0001, "loss": 5.477, "loss/crossentropy": 2.4886432886123657, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1527436152100563, "step": 24980 }, { "epoch": 0.7806875, "grad_norm": 3.0625, "grad_norm_var": 0.05538736979166667, "learning_rate": 0.0001, "loss": 5.6513, "loss/crossentropy": 2.586162567138672, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16393253952264786, "step": 24982 }, { "epoch": 0.78075, "grad_norm": 3.1875, "grad_norm_var": 0.05191650390625, "learning_rate": 0.0001, "loss": 5.6875, "loss/crossentropy": 2.634481191635132, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15920409560203552, "step": 24984 }, { "epoch": 0.7808125, "grad_norm": 3.03125, "grad_norm_var": 0.05263671875, "learning_rate": 0.0001, "loss": 5.5731, "loss/crossentropy": 2.5229707956314087, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16282780468463898, "step": 24986 }, { "epoch": 0.780875, "grad_norm": 2.984375, "grad_norm_var": 0.04582926432291667, "learning_rate": 0.0001, "loss": 5.64, "loss/crossentropy": 2.545683741569519, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16138702630996704, "step": 24988 }, { "epoch": 0.7809375, "grad_norm": 2.703125, "grad_norm_var": 0.06370035807291667, "learning_rate": 0.0001, "loss": 5.0855, "loss/crossentropy": 2.239551544189453, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.14162637293338776, "step": 24990 }, { "epoch": 0.781, "grad_norm": 3.28125, "grad_norm_var": 0.0631011962890625, "learning_rate": 0.0001, "loss": 5.53, "loss/crossentropy": 2.4307796955108643, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16616912931203842, "step": 24992 }, { "epoch": 0.7810625, "grad_norm": 3.375, "grad_norm_var": 0.03509114583333333, "learning_rate": 0.0001, "loss": 5.839, "loss/crossentropy": 2.6514469385147095, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1710943505167961, "step": 24994 }, { "epoch": 0.781125, "grad_norm": 3.0, "grad_norm_var": 0.03408101399739583, "learning_rate": 0.0001, "loss": 5.3903, "loss/crossentropy": 2.3674404621124268, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.16165786236524582, "step": 24996 }, { "epoch": 0.7811875, "grad_norm": 3.140625, "grad_norm_var": 0.03472900390625, "learning_rate": 0.0001, "loss": 5.8161, "loss/crossentropy": 2.677467703819275, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16816051304340363, "step": 24998 }, { "epoch": 0.78125, "grad_norm": 3.25, "grad_norm_var": 0.07089742024739583, "learning_rate": 0.0001, "loss": 5.7517, "loss/crossentropy": 2.4565203189849854, "loss/hidden": 1.5625, "loss/jsd": 0.0, "loss/logits": 0.17326726019382477, "step": 25000 }, { "epoch": 0.7813125, "grad_norm": 3.296875, "grad_norm_var": 0.07288004557291666, "learning_rate": 0.0001, "loss": 5.7661, "loss/crossentropy": 2.6471210718154907, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16658222675323486, "step": 25002 }, { "epoch": 0.781375, "grad_norm": 3.625, "grad_norm_var": 0.08578999837239583, "learning_rate": 0.0001, "loss": 5.5766, "loss/crossentropy": 2.4884352684020996, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16194364428520203, "step": 25004 }, { "epoch": 0.7814375, "grad_norm": 2.859375, "grad_norm_var": 0.07358296712239583, "learning_rate": 0.0001, "loss": 5.3731, "loss/crossentropy": 2.4396510124206543, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14959152787923813, "step": 25006 }, { "epoch": 0.7815, "grad_norm": 3.375, "grad_norm_var": 0.07263895670572916, "learning_rate": 0.0001, "loss": 5.7882, "loss/crossentropy": 2.618377923965454, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16854405403137207, "step": 25008 }, { "epoch": 0.7815625, "grad_norm": 3.046875, "grad_norm_var": 0.0718902587890625, "learning_rate": 0.0001, "loss": 5.6774, "loss/crossentropy": 2.5781971216201782, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16109603643417358, "step": 25010 }, { "epoch": 0.781625, "grad_norm": 3.203125, "grad_norm_var": 0.06816304524739583, "learning_rate": 0.0001, "loss": 5.9065, "loss/crossentropy": 2.7340006828308105, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16685685515403748, "step": 25012 }, { "epoch": 0.7816875, "grad_norm": 2.828125, "grad_norm_var": 0.07422587076822916, "learning_rate": 0.0001, "loss": 5.8071, "loss/crossentropy": 2.6172449588775635, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16937804222106934, "step": 25014 }, { "epoch": 0.78175, "grad_norm": 2.90625, "grad_norm_var": 0.04372456868489583, "learning_rate": 0.0001, "loss": 5.0782, "loss/crossentropy": 2.183945894241333, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.14450307190418243, "step": 25016 }, { "epoch": 0.7818125, "grad_norm": 3.359375, "grad_norm_var": 0.04592692057291667, "learning_rate": 0.0001, "loss": 5.8498, "loss/crossentropy": 2.7048791646957397, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16918237507343292, "step": 25018 }, { "epoch": 0.781875, "grad_norm": 3.078125, "grad_norm_var": 0.030094401041666666, "learning_rate": 0.0001, "loss": 5.2292, "loss/crossentropy": 2.3088778257369995, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15101177990436554, "step": 25020 }, { "epoch": 0.7819375, "grad_norm": 3.296875, "grad_norm_var": 0.026178995768229168, "learning_rate": 0.0001, "loss": 5.8802, "loss/crossentropy": 2.7492371797561646, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1623164564371109, "step": 25022 }, { "epoch": 0.782, "grad_norm": 2.9375, "grad_norm_var": 0.025809733072916667, "learning_rate": 0.0001, "loss": 5.5296, "loss/crossentropy": 2.482593536376953, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1578279659152031, "step": 25024 }, { "epoch": 0.7820625, "grad_norm": 3.1875, "grad_norm_var": 0.026949055989583335, "learning_rate": 0.0001, "loss": 5.3924, "loss/crossentropy": 2.354622721672058, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15767978131771088, "step": 25026 }, { "epoch": 0.782125, "grad_norm": 3.0625, "grad_norm_var": 0.027632649739583334, "learning_rate": 0.0001, "loss": 5.397, "loss/crossentropy": 2.4253294467926025, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15380343049764633, "step": 25028 }, { "epoch": 0.7821875, "grad_norm": 3.4375, "grad_norm_var": 0.03220926920572917, "learning_rate": 0.0001, "loss": 5.9257, "loss/crossentropy": 2.655179262161255, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17783797532320023, "step": 25030 }, { "epoch": 0.78225, "grad_norm": 3.0, "grad_norm_var": 0.030134073893229165, "learning_rate": 0.0001, "loss": 5.4419, "loss/crossentropy": 2.4656208753585815, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15543990582227707, "step": 25032 }, { "epoch": 0.7823125, "grad_norm": 3.21875, "grad_norm_var": 0.025373331705729165, "learning_rate": 0.0001, "loss": 5.1499, "loss/crossentropy": 2.189010500907898, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15507280081510544, "step": 25034 }, { "epoch": 0.782375, "grad_norm": 2.953125, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 5.4865, "loss/crossentropy": 2.499233603477478, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15575594455003738, "step": 25036 }, { "epoch": 0.7824375, "grad_norm": 3.6875, "grad_norm_var": 0.047826131184895836, "learning_rate": 0.0001, "loss": 5.6478, "loss/crossentropy": 2.4892709255218506, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1654580757021904, "step": 25038 }, { "epoch": 0.7825, "grad_norm": 3.359375, "grad_norm_var": 0.04643452962239583, "learning_rate": 0.0001, "loss": 5.5592, "loss/crossentropy": 2.4499223232269287, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16248713433742523, "step": 25040 }, { "epoch": 0.7825625, "grad_norm": 3.171875, "grad_norm_var": 0.051806640625, "learning_rate": 0.0001, "loss": 5.5163, "loss/crossentropy": 2.44981050491333, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16133665293455124, "step": 25042 }, { "epoch": 0.782625, "grad_norm": 3.109375, "grad_norm_var": 0.0479156494140625, "learning_rate": 0.0001, "loss": 5.5172, "loss/crossentropy": 2.4054744243621826, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1642991378903389, "step": 25044 }, { "epoch": 0.7826875, "grad_norm": 3.109375, "grad_norm_var": 0.04194234212239583, "learning_rate": 0.0001, "loss": 5.663, "loss/crossentropy": 2.524221658706665, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.169351264834404, "step": 25046 }, { "epoch": 0.78275, "grad_norm": 3.046875, "grad_norm_var": 0.04138895670572917, "learning_rate": 0.0001, "loss": 5.5084, "loss/crossentropy": 2.432820677757263, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15755394101142883, "step": 25048 }, { "epoch": 0.7828125, "grad_norm": 2.90625, "grad_norm_var": 0.04365234375, "learning_rate": 0.0001, "loss": 5.8103, "loss/crossentropy": 2.659608244895935, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16780192404985428, "step": 25050 }, { "epoch": 0.782875, "grad_norm": 2.75, "grad_norm_var": 0.049836222330729166, "learning_rate": 0.0001, "loss": 5.4689, "loss/crossentropy": 2.4619545936584473, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1596793532371521, "step": 25052 }, { "epoch": 0.7829375, "grad_norm": 3.0625, "grad_norm_var": 0.0306304931640625, "learning_rate": 0.0001, "loss": 5.394, "loss/crossentropy": 2.407675623893738, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1560579463839531, "step": 25054 }, { "epoch": 0.783, "grad_norm": 2.890625, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.6041946411132812, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16110891103744507, "step": 25056 }, { "epoch": 0.7830625, "grad_norm": 3.140625, "grad_norm_var": 0.02271728515625, "learning_rate": 0.0001, "loss": 5.401, "loss/crossentropy": 2.3181673288345337, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1594579517841339, "step": 25058 }, { "epoch": 0.783125, "grad_norm": 2.890625, "grad_norm_var": 0.024755859375, "learning_rate": 0.0001, "loss": 5.7628, "loss/crossentropy": 2.681661009788513, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16436689347028732, "step": 25060 }, { "epoch": 0.7831875, "grad_norm": 3.0625, "grad_norm_var": 0.0270416259765625, "learning_rate": 0.0001, "loss": 5.4755, "loss/crossentropy": 2.4530845880508423, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1561483070254326, "step": 25062 }, { "epoch": 0.78325, "grad_norm": 3.265625, "grad_norm_var": 0.030549112955729166, "learning_rate": 0.0001, "loss": 5.6392, "loss/crossentropy": 2.560398578643799, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16374336183071136, "step": 25064 }, { "epoch": 0.7833125, "grad_norm": 3.09375, "grad_norm_var": 0.0261138916015625, "learning_rate": 0.0001, "loss": 5.3838, "loss/crossentropy": 2.349510431289673, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15928371995687485, "step": 25066 }, { "epoch": 0.783375, "grad_norm": 3.171875, "grad_norm_var": 0.0192535400390625, "learning_rate": 0.0001, "loss": 5.6535, "loss/crossentropy": 2.5598082542419434, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15976376086473465, "step": 25068 }, { "epoch": 0.7834375, "grad_norm": 3.03125, "grad_norm_var": 0.015397135416666667, "learning_rate": 0.0001, "loss": 5.8171, "loss/crossentropy": 2.687534809112549, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16413022577762604, "step": 25070 }, { "epoch": 0.7835, "grad_norm": 3.296875, "grad_norm_var": 0.024344889322916667, "learning_rate": 0.0001, "loss": 5.0117, "loss/crossentropy": 2.0827004313468933, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.14485756307840347, "step": 25072 }, { "epoch": 0.7835625, "grad_norm": 3.109375, "grad_norm_var": 0.029182942708333333, "learning_rate": 0.0001, "loss": 5.7212, "loss/crossentropy": 2.6073025465011597, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16763654351234436, "step": 25074 }, { "epoch": 0.783625, "grad_norm": 2.9375, "grad_norm_var": 0.0277740478515625, "learning_rate": 0.0001, "loss": 5.8606, "loss/crossentropy": 2.748585820198059, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16393591463565826, "step": 25076 }, { "epoch": 0.7836875, "grad_norm": 3.40625, "grad_norm_var": 0.031428019205729164, "learning_rate": 0.0001, "loss": 5.6233, "loss/crossentropy": 2.511659026145935, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16429278999567032, "step": 25078 }, { "epoch": 0.78375, "grad_norm": 3.203125, "grad_norm_var": 0.0341217041015625, "learning_rate": 0.0001, "loss": 5.4974, "loss/crossentropy": 2.5064194202423096, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15495362877845764, "step": 25080 }, { "epoch": 0.7838125, "grad_norm": 2.875, "grad_norm_var": 0.040283203125, "learning_rate": 0.0001, "loss": 5.511, "loss/crossentropy": 2.525681257247925, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15751774609088898, "step": 25082 }, { "epoch": 0.783875, "grad_norm": 2.953125, "grad_norm_var": 0.042455037434895836, "learning_rate": 0.0001, "loss": 5.5149, "loss/crossentropy": 2.45516037940979, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1602674052119255, "step": 25084 }, { "epoch": 0.7839375, "grad_norm": 3.40625, "grad_norm_var": 0.04993489583333333, "learning_rate": 0.0001, "loss": 5.5154, "loss/crossentropy": 2.417151927947998, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1613856852054596, "step": 25086 }, { "epoch": 0.784, "grad_norm": 2.96875, "grad_norm_var": 0.041413370768229166, "learning_rate": 0.0001, "loss": 5.7797, "loss/crossentropy": 2.7250806093215942, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15858832746744156, "step": 25088 }, { "epoch": 0.7840625, "grad_norm": 3.046875, "grad_norm_var": 0.036421712239583334, "learning_rate": 0.0001, "loss": 5.4991, "loss/crossentropy": 2.470256209373474, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1583540141582489, "step": 25090 }, { "epoch": 0.784125, "grad_norm": 3.328125, "grad_norm_var": 0.04127197265625, "learning_rate": 0.0001, "loss": 5.5174, "loss/crossentropy": 2.43650221824646, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16316696256399155, "step": 25092 }, { "epoch": 0.7841875, "grad_norm": 2.796875, "grad_norm_var": 0.03912353515625, "learning_rate": 0.0001, "loss": 5.5422, "loss/crossentropy": 2.5840858221054077, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15284348279237747, "step": 25094 }, { "epoch": 0.78425, "grad_norm": 3.15625, "grad_norm_var": 0.03687744140625, "learning_rate": 0.0001, "loss": 5.6881, "loss/crossentropy": 2.564616560935974, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16703927516937256, "step": 25096 }, { "epoch": 0.7843125, "grad_norm": 3.1875, "grad_norm_var": 0.3617472330729167, "learning_rate": 0.0001, "loss": 5.8486, "loss/crossentropy": 2.5736570358276367, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17358990013599396, "step": 25098 }, { "epoch": 0.784375, "grad_norm": 2.859375, "grad_norm_var": 0.36842041015625, "learning_rate": 0.0001, "loss": 5.7328, "loss/crossentropy": 2.5624277591705322, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16859986633062363, "step": 25100 }, { "epoch": 0.7844375, "grad_norm": 3.25, "grad_norm_var": 0.36705322265625, "learning_rate": 0.0001, "loss": 5.5661, "loss/crossentropy": 2.5077946186065674, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15778060257434845, "step": 25102 }, { "epoch": 0.7845, "grad_norm": 3.265625, "grad_norm_var": 0.3607706705729167, "learning_rate": 0.0001, "loss": 5.5609, "loss/crossentropy": 2.513390064239502, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16099922358989716, "step": 25104 }, { "epoch": 0.7845625, "grad_norm": 3.046875, "grad_norm_var": 0.3536773681640625, "learning_rate": 0.0001, "loss": 5.5773, "loss/crossentropy": 2.5185513496398926, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1617312878370285, "step": 25106 }, { "epoch": 0.784625, "grad_norm": 3.265625, "grad_norm_var": 0.3480631510416667, "learning_rate": 0.0001, "loss": 5.6833, "loss/crossentropy": 2.5997354984283447, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16343086957931519, "step": 25108 }, { "epoch": 0.7846875, "grad_norm": 3.390625, "grad_norm_var": 0.32638346354166664, "learning_rate": 0.0001, "loss": 5.625, "loss/crossentropy": 2.5467759370803833, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1617325320839882, "step": 25110 }, { "epoch": 0.78475, "grad_norm": 3.25, "grad_norm_var": 0.39422200520833334, "learning_rate": 0.0001, "loss": 6.0529, "loss/crossentropy": 2.8289847373962402, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17200353741645813, "step": 25112 }, { "epoch": 0.7848125, "grad_norm": 2.984375, "grad_norm_var": 0.12730204264322917, "learning_rate": 0.0001, "loss": 5.471, "loss/crossentropy": 2.35847806930542, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16515729576349258, "step": 25114 }, { "epoch": 0.784875, "grad_norm": 3.3125, "grad_norm_var": 0.11646219889322916, "learning_rate": 0.0001, "loss": 5.8522, "loss/crossentropy": 2.6589726209640503, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17049360275268555, "step": 25116 }, { "epoch": 0.7849375, "grad_norm": 3.40625, "grad_norm_var": 0.1243804931640625, "learning_rate": 0.0001, "loss": 5.9077, "loss/crossentropy": 2.679678797721863, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1724073886871338, "step": 25118 }, { "epoch": 0.785, "grad_norm": 3.1875, "grad_norm_var": 0.11791890462239583, "learning_rate": 0.0001, "loss": 5.9043, "loss/crossentropy": 2.716698169708252, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17266739159822464, "step": 25120 }, { "epoch": 0.7850625, "grad_norm": 3.046875, "grad_norm_var": 0.12151692708333334, "learning_rate": 0.0001, "loss": 5.1997, "loss/crossentropy": 2.222951889038086, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15314562618732452, "step": 25122 }, { "epoch": 0.785125, "grad_norm": 2.921875, "grad_norm_var": 0.14719950358072917, "learning_rate": 0.0001, "loss": 5.2749, "loss/crossentropy": 2.384072422981262, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1492406353354454, "step": 25124 }, { "epoch": 0.7851875, "grad_norm": 3.25, "grad_norm_var": 0.14053446451822918, "learning_rate": 0.0001, "loss": 5.7775, "loss/crossentropy": 2.6351888179779053, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16813918203115463, "step": 25126 }, { "epoch": 0.78525, "grad_norm": 3.015625, "grad_norm_var": 0.057027180989583336, "learning_rate": 0.0001, "loss": 5.7459, "loss/crossentropy": 2.6335253715515137, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16162623465061188, "step": 25128 }, { "epoch": 0.7853125, "grad_norm": 3.625, "grad_norm_var": 0.058714803059895834, "learning_rate": 0.0001, "loss": 5.5279, "loss/crossentropy": 2.392796754837036, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16233666986227036, "step": 25130 }, { "epoch": 0.785375, "grad_norm": 2.859375, "grad_norm_var": 0.065234375, "learning_rate": 0.0001, "loss": 5.2826, "loss/crossentropy": 2.327790141105652, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15250813215970993, "step": 25132 }, { "epoch": 0.7854375, "grad_norm": 2.921875, "grad_norm_var": 0.05224202473958333, "learning_rate": 0.0001, "loss": 5.3842, "loss/crossentropy": 2.4696340560913086, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.15278441458940506, "step": 25134 }, { "epoch": 0.7855, "grad_norm": 3.0, "grad_norm_var": 0.05273030598958333, "learning_rate": 0.0001, "loss": 5.482, "loss/crossentropy": 2.44424831867218, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1584630385041237, "step": 25136 }, { "epoch": 0.7855625, "grad_norm": 3.453125, "grad_norm_var": 0.061945597330729164, "learning_rate": 0.0001, "loss": 5.9022, "loss/crossentropy": 2.772692084312439, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1641230285167694, "step": 25138 }, { "epoch": 0.785625, "grad_norm": 3.15625, "grad_norm_var": 0.051122029622395836, "learning_rate": 0.0001, "loss": 5.7926, "loss/crossentropy": 2.6483733654022217, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16793441772460938, "step": 25140 }, { "epoch": 0.7856875, "grad_norm": 3.140625, "grad_norm_var": 0.0496734619140625, "learning_rate": 0.0001, "loss": 5.8279, "loss/crossentropy": 2.611177921295166, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17088709026575089, "step": 25142 }, { "epoch": 0.78575, "grad_norm": 2.984375, "grad_norm_var": 0.05120035807291667, "learning_rate": 0.0001, "loss": 5.8282, "loss/crossentropy": 2.71530544757843, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16597488522529602, "step": 25144 }, { "epoch": 0.7858125, "grad_norm": 2.984375, "grad_norm_var": 0.0348541259765625, "learning_rate": 0.0001, "loss": 5.4036, "loss/crossentropy": 2.425929307937622, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.157137930393219, "step": 25146 }, { "epoch": 0.785875, "grad_norm": 2.96875, "grad_norm_var": 0.035416666666666666, "learning_rate": 0.0001, "loss": 5.5936, "loss/crossentropy": 2.464033603668213, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1684267818927765, "step": 25148 }, { "epoch": 0.7859375, "grad_norm": 3.46875, "grad_norm_var": 0.0369293212890625, "learning_rate": 0.0001, "loss": 5.8477, "loss/crossentropy": 2.7339954376220703, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16645215451717377, "step": 25150 }, { "epoch": 0.786, "grad_norm": 3.09375, "grad_norm_var": 0.033935546875, "learning_rate": 0.0001, "loss": 5.7675, "loss/crossentropy": 2.707942247390747, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16064704209566116, "step": 25152 }, { "epoch": 0.7860625, "grad_norm": 3.171875, "grad_norm_var": 0.0323883056640625, "learning_rate": 0.0001, "loss": 5.6805, "loss/crossentropy": 2.600821614265442, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16187451779842377, "step": 25154 }, { "epoch": 0.786125, "grad_norm": 3.0, "grad_norm_var": 0.03359273274739583, "learning_rate": 0.0001, "loss": 5.7208, "loss/crossentropy": 2.628736734390259, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16507025063037872, "step": 25156 }, { "epoch": 0.7861875, "grad_norm": 2.8125, "grad_norm_var": 0.036031087239583336, "learning_rate": 0.0001, "loss": 5.5739, "loss/crossentropy": 2.5731139183044434, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15789590775966644, "step": 25158 }, { "epoch": 0.78625, "grad_norm": 3.171875, "grad_norm_var": 0.034891764322916664, "learning_rate": 0.0001, "loss": 5.7913, "loss/crossentropy": 2.678464412689209, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16636203974485397, "step": 25160 }, { "epoch": 0.7863125, "grad_norm": 3.03125, "grad_norm_var": 0.033869425455729164, "learning_rate": 0.0001, "loss": 5.7663, "loss/crossentropy": 2.6258822679519653, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16677294671535492, "step": 25162 }, { "epoch": 0.786375, "grad_norm": 2.90625, "grad_norm_var": 0.03193257649739583, "learning_rate": 0.0001, "loss": 5.6864, "loss/crossentropy": 2.6077847480773926, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1648903787136078, "step": 25164 }, { "epoch": 0.7864375, "grad_norm": 2.96875, "grad_norm_var": 0.019172159830729167, "learning_rate": 0.0001, "loss": 5.6069, "loss/crossentropy": 2.495919704437256, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16539262235164642, "step": 25166 }, { "epoch": 0.7865, "grad_norm": 2.828125, "grad_norm_var": 0.0217437744140625, "learning_rate": 0.0001, "loss": 5.5293, "loss/crossentropy": 2.525461792945862, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1542908474802971, "step": 25168 }, { "epoch": 0.7865625, "grad_norm": 3.265625, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 5.6908, "loss/crossentropy": 2.5442755222320557, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16543786972761154, "step": 25170 }, { "epoch": 0.786625, "grad_norm": 2.828125, "grad_norm_var": 0.02564697265625, "learning_rate": 0.0001, "loss": 5.2421, "loss/crossentropy": 2.3321306705474854, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1484227403998375, "step": 25172 }, { "epoch": 0.7866875, "grad_norm": 3.125, "grad_norm_var": 0.017756144205729168, "learning_rate": 0.0001, "loss": 5.4943, "loss/crossentropy": 2.4257287979125977, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1599854677915573, "step": 25174 }, { "epoch": 0.78675, "grad_norm": 3.21875, "grad_norm_var": 0.018062337239583334, "learning_rate": 0.0001, "loss": 5.5799, "loss/crossentropy": 2.5376850366592407, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15656720101833344, "step": 25176 }, { "epoch": 0.7868125, "grad_norm": 3.0, "grad_norm_var": 0.022705078125, "learning_rate": 0.0001, "loss": 5.348, "loss/crossentropy": 2.39486563205719, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15313006937503815, "step": 25178 }, { "epoch": 0.786875, "grad_norm": 3.140625, "grad_norm_var": 0.0227691650390625, "learning_rate": 0.0001, "loss": 5.5245, "loss/crossentropy": 2.5132004022598267, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1585562601685524, "step": 25180 }, { "epoch": 0.7869375, "grad_norm": 2.84375, "grad_norm_var": 0.025809733072916667, "learning_rate": 0.0001, "loss": 5.3467, "loss/crossentropy": 2.390488862991333, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15304618328809738, "step": 25182 }, { "epoch": 0.787, "grad_norm": 3.015625, "grad_norm_var": 0.01959228515625, "learning_rate": 0.0001, "loss": 5.5321, "loss/crossentropy": 2.4580490589141846, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16170482337474823, "step": 25184 }, { "epoch": 0.7870625, "grad_norm": 3.3125, "grad_norm_var": 0.020978800455729165, "learning_rate": 0.0001, "loss": 5.7807, "loss/crossentropy": 2.6099783182144165, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16902922838926315, "step": 25186 }, { "epoch": 0.787125, "grad_norm": 2.765625, "grad_norm_var": 0.024800618489583332, "learning_rate": 0.0001, "loss": 5.602, "loss/crossentropy": 2.617928385734558, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15739066153764725, "step": 25188 }, { "epoch": 0.7871875, "grad_norm": 3.140625, "grad_norm_var": 0.026952107747395832, "learning_rate": 0.0001, "loss": 5.5574, "loss/crossentropy": 2.4698808193206787, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16226434707641602, "step": 25190 }, { "epoch": 0.78725, "grad_norm": 3.03125, "grad_norm_var": 0.023737589518229168, "learning_rate": 0.0001, "loss": 5.5291, "loss/crossentropy": 2.515956163406372, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1544433981180191, "step": 25192 }, { "epoch": 0.7873125, "grad_norm": 3.375, "grad_norm_var": 0.033003743489583334, "learning_rate": 0.0001, "loss": 5.7061, "loss/crossentropy": 2.519581437110901, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17021550238132477, "step": 25194 }, { "epoch": 0.787375, "grad_norm": 2.921875, "grad_norm_var": 0.03264058430989583, "learning_rate": 0.0001, "loss": 5.4925, "loss/crossentropy": 2.4986947774887085, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15524086356163025, "step": 25196 }, { "epoch": 0.7874375, "grad_norm": 3.40625, "grad_norm_var": 0.03798726399739583, "learning_rate": 0.0001, "loss": 5.6295, "loss/crossentropy": 2.582840085029602, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15935365855693817, "step": 25198 }, { "epoch": 0.7875, "grad_norm": 3.015625, "grad_norm_var": 0.04254150390625, "learning_rate": 0.0001, "loss": 5.7632, "loss/crossentropy": 2.6446453332901, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16303178668022156, "step": 25200 }, { "epoch": 0.7875625, "grad_norm": 2.96875, "grad_norm_var": 0.03829752604166667, "learning_rate": 0.0001, "loss": 5.2524, "loss/crossentropy": 2.2934986352920532, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1552659571170807, "step": 25202 }, { "epoch": 0.787625, "grad_norm": 3.140625, "grad_norm_var": 0.0299957275390625, "learning_rate": 0.0001, "loss": 5.6602, "loss/crossentropy": 2.617307186126709, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15819956362247467, "step": 25204 }, { "epoch": 0.7876875, "grad_norm": 3.109375, "grad_norm_var": 0.03992411295572917, "learning_rate": 0.0001, "loss": 5.4376, "loss/crossentropy": 2.359330892562866, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16134297847747803, "step": 25206 }, { "epoch": 0.78775, "grad_norm": 3.578125, "grad_norm_var": 0.055810546875, "learning_rate": 0.0001, "loss": 6.1531, "loss/crossentropy": 2.840658664703369, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18202606588602066, "step": 25208 }, { "epoch": 0.7878125, "grad_norm": 3.03125, "grad_norm_var": 0.053831990559895834, "learning_rate": 0.0001, "loss": 5.6484, "loss/crossentropy": 2.5800265073776245, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1611306369304657, "step": 25210 }, { "epoch": 0.787875, "grad_norm": 2.953125, "grad_norm_var": 0.0541656494140625, "learning_rate": 0.0001, "loss": 5.6995, "loss/crossentropy": 2.568533182144165, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1662180870771408, "step": 25212 }, { "epoch": 0.7879375, "grad_norm": 3.15625, "grad_norm_var": 0.0494049072265625, "learning_rate": 0.0001, "loss": 5.5971, "loss/crossentropy": 2.4888023138046265, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16160811483860016, "step": 25214 }, { "epoch": 0.788, "grad_norm": 3.234375, "grad_norm_var": 0.042220052083333334, "learning_rate": 0.0001, "loss": 5.8117, "loss/crossentropy": 2.6967689990997314, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16540372371673584, "step": 25216 }, { "epoch": 0.7880625, "grad_norm": 2.84375, "grad_norm_var": 0.04708658854166667, "learning_rate": 0.0001, "loss": 5.5943, "loss/crossentropy": 2.5863274335861206, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.16016855835914612, "step": 25218 }, { "epoch": 0.788125, "grad_norm": 3.53125, "grad_norm_var": 0.061324055989583334, "learning_rate": 0.0001, "loss": 5.9415, "loss/crossentropy": 2.6803689002990723, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1761108636856079, "step": 25220 }, { "epoch": 0.7881875, "grad_norm": 2.953125, "grad_norm_var": 0.05654296875, "learning_rate": 0.0001, "loss": 5.4505, "loss/crossentropy": 2.423671245574951, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15736592561006546, "step": 25222 }, { "epoch": 0.78825, "grad_norm": 3.078125, "grad_norm_var": 0.037125651041666666, "learning_rate": 0.0001, "loss": 5.609, "loss/crossentropy": 2.61668598651886, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1566566824913025, "step": 25224 }, { "epoch": 0.7883125, "grad_norm": 3.203125, "grad_norm_var": 0.04055989583333333, "learning_rate": 0.0001, "loss": 5.7241, "loss/crossentropy": 2.6395206451416016, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16314589977264404, "step": 25226 }, { "epoch": 0.788375, "grad_norm": 3.234375, "grad_norm_var": 0.041064453125, "learning_rate": 0.0001, "loss": 5.7187, "loss/crossentropy": 2.600376844406128, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16534972190856934, "step": 25228 }, { "epoch": 0.7884375, "grad_norm": 3.234375, "grad_norm_var": 0.04329325358072917, "learning_rate": 0.0001, "loss": 5.3233, "loss/crossentropy": 2.3683305978775024, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.15682659298181534, "step": 25230 }, { "epoch": 0.7885, "grad_norm": 3.59375, "grad_norm_var": 0.06189676920572917, "learning_rate": 0.0001, "loss": 5.7239, "loss/crossentropy": 2.564695119857788, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1670876443386078, "step": 25232 }, { "epoch": 0.7885625, "grad_norm": 3.046875, "grad_norm_var": 0.05627848307291667, "learning_rate": 0.0001, "loss": 5.6439, "loss/crossentropy": 2.560565233230591, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.15989135205745697, "step": 25234 }, { "epoch": 0.788625, "grad_norm": 3.015625, "grad_norm_var": 0.04110921223958333, "learning_rate": 0.0001, "loss": 5.4061, "loss/crossentropy": 2.4623721837997437, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1541389375925064, "step": 25236 }, { "epoch": 0.7886875, "grad_norm": 3.390625, "grad_norm_var": 0.04722900390625, "learning_rate": 0.0001, "loss": 5.5537, "loss/crossentropy": 2.4472579956054688, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16494275629520416, "step": 25238 }, { "epoch": 0.78875, "grad_norm": 3.296875, "grad_norm_var": 0.046019490559895834, "learning_rate": 0.0001, "loss": 5.8679, "loss/crossentropy": 2.6663858890533447, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17366329580545425, "step": 25240 }, { "epoch": 0.7888125, "grad_norm": 2.953125, "grad_norm_var": 0.04617513020833333, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.6034947633743286, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1663685292005539, "step": 25242 }, { "epoch": 0.788875, "grad_norm": 3.15625, "grad_norm_var": 0.044798787434895834, "learning_rate": 0.0001, "loss": 5.7471, "loss/crossentropy": 2.64049232006073, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1633920818567276, "step": 25244 }, { "epoch": 0.7889375, "grad_norm": 3.125, "grad_norm_var": 0.04024149576822917, "learning_rate": 0.0001, "loss": 5.7013, "loss/crossentropy": 2.5922815799713135, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16441580653190613, "step": 25246 }, { "epoch": 0.789, "grad_norm": 2.890625, "grad_norm_var": 0.02486572265625, "learning_rate": 0.0001, "loss": 5.3338, "loss/crossentropy": 2.324529767036438, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15444006025791168, "step": 25248 }, { "epoch": 0.7890625, "grad_norm": 3.515625, "grad_norm_var": 0.03499247233072917, "learning_rate": 0.0001, "loss": 5.8315, "loss/crossentropy": 2.6474636793136597, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16996200382709503, "step": 25250 }, { "epoch": 0.789125, "grad_norm": 3.484375, "grad_norm_var": 0.0591217041015625, "learning_rate": 0.0001, "loss": 6.2729, "loss/crossentropy": 2.8888838291168213, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.1833200380206108, "step": 25252 }, { "epoch": 0.7891875, "grad_norm": 3.203125, "grad_norm_var": 0.0658111572265625, "learning_rate": 0.0001, "loss": 5.5424, "loss/crossentropy": 2.489980459213257, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1626640036702156, "step": 25254 }, { "epoch": 0.78925, "grad_norm": 2.796875, "grad_norm_var": 0.07440999348958334, "learning_rate": 0.0001, "loss": 5.6484, "loss/crossentropy": 2.60439932346344, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16103915870189667, "step": 25256 }, { "epoch": 0.7893125, "grad_norm": 3.15625, "grad_norm_var": 0.0716705322265625, "learning_rate": 0.0001, "loss": 5.2819, "loss/crossentropy": 2.246806025505066, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1558496356010437, "step": 25258 }, { "epoch": 0.789375, "grad_norm": 3.203125, "grad_norm_var": 0.07154541015625, "learning_rate": 0.0001, "loss": 5.7365, "loss/crossentropy": 2.556297779083252, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17231497913599014, "step": 25260 }, { "epoch": 0.7894375, "grad_norm": 3.421875, "grad_norm_var": 0.07289937337239584, "learning_rate": 0.0001, "loss": 6.093, "loss/crossentropy": 2.913823127746582, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16674679517745972, "step": 25262 }, { "epoch": 0.7895, "grad_norm": 3.265625, "grad_norm_var": 0.0753082275390625, "learning_rate": 0.0001, "loss": 5.7693, "loss/crossentropy": 2.6205087900161743, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16722586750984192, "step": 25264 }, { "epoch": 0.7895625, "grad_norm": 3.0, "grad_norm_var": 0.0881256103515625, "learning_rate": 0.0001, "loss": 5.5436, "loss/crossentropy": 2.519482970237732, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15905684977769852, "step": 25266 }, { "epoch": 0.789625, "grad_norm": 3.0625, "grad_norm_var": 0.0528228759765625, "learning_rate": 0.0001, "loss": 5.5977, "loss/crossentropy": 2.558209180831909, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15902253985404968, "step": 25268 }, { "epoch": 0.7896875, "grad_norm": 3.046875, "grad_norm_var": 0.0469635009765625, "learning_rate": 0.0001, "loss": 5.8884, "loss/crossentropy": 2.7374707460403442, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16665419191122055, "step": 25270 }, { "epoch": 0.78975, "grad_norm": 3.078125, "grad_norm_var": 0.045807902018229166, "learning_rate": 0.0001, "loss": 5.2376, "loss/crossentropy": 2.2393096685409546, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15373366326093674, "step": 25272 }, { "epoch": 0.7898125, "grad_norm": 3.015625, "grad_norm_var": 0.04550374348958333, "learning_rate": 0.0001, "loss": 5.5198, "loss/crossentropy": 2.484234571456909, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16019342839717865, "step": 25274 }, { "epoch": 0.789875, "grad_norm": 2.859375, "grad_norm_var": 0.048737589518229166, "learning_rate": 0.0001, "loss": 5.4838, "loss/crossentropy": 2.489290952682495, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1588217169046402, "step": 25276 }, { "epoch": 0.7899375, "grad_norm": 2.796875, "grad_norm_var": 0.04658203125, "learning_rate": 0.0001, "loss": 5.6814, "loss/crossentropy": 2.6445512771606445, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1560288816690445, "step": 25278 }, { "epoch": 0.79, "grad_norm": 3.125, "grad_norm_var": 0.04098307291666667, "learning_rate": 0.0001, "loss": 5.7944, "loss/crossentropy": 2.666891098022461, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16469985991716385, "step": 25280 }, { "epoch": 0.7900625, "grad_norm": 3.046875, "grad_norm_var": 0.017975870768229166, "learning_rate": 0.0001, "loss": 5.9708, "loss/crossentropy": 2.8003090620040894, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16939659416675568, "step": 25282 }, { "epoch": 0.790125, "grad_norm": 3.109375, "grad_norm_var": 0.02193603515625, "learning_rate": 0.0001, "loss": 5.5681, "loss/crossentropy": 2.465001106262207, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16226035356521606, "step": 25284 }, { "epoch": 0.7901875, "grad_norm": 3.15625, "grad_norm_var": 0.023363240559895835, "learning_rate": 0.0001, "loss": 5.7736, "loss/crossentropy": 2.6344821453094482, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16782230883836746, "step": 25286 }, { "epoch": 0.79025, "grad_norm": 2.96875, "grad_norm_var": 0.027350870768229167, "learning_rate": 0.0001, "loss": 5.5039, "loss/crossentropy": 2.5562633275985718, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15413692593574524, "step": 25288 }, { "epoch": 0.7903125, "grad_norm": 2.765625, "grad_norm_var": 0.03326416015625, "learning_rate": 0.0001, "loss": 5.391, "loss/crossentropy": 2.3242186307907104, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16136524826288223, "step": 25290 }, { "epoch": 0.790375, "grad_norm": 2.9375, "grad_norm_var": 0.034032185872395836, "learning_rate": 0.0001, "loss": 5.6922, "loss/crossentropy": 2.5918469429016113, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1639409139752388, "step": 25292 }, { "epoch": 0.7904375, "grad_norm": 3.15625, "grad_norm_var": 0.03150634765625, "learning_rate": 0.0001, "loss": 5.5406, "loss/crossentropy": 2.4601529836654663, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1650724932551384, "step": 25294 }, { "epoch": 0.7905, "grad_norm": 3.265625, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 5.56, "loss/crossentropy": 2.5213820934295654, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15854918956756592, "step": 25296 }, { "epoch": 0.7905625, "grad_norm": 3.15625, "grad_norm_var": 0.029637654622395832, "learning_rate": 0.0001, "loss": 5.4759, "loss/crossentropy": 2.4062318801879883, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1616571843624115, "step": 25298 }, { "epoch": 0.790625, "grad_norm": 3.21875, "grad_norm_var": 0.031029256184895833, "learning_rate": 0.0001, "loss": 5.9488, "loss/crossentropy": 2.6806533336639404, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.177203968167305, "step": 25300 }, { "epoch": 0.7906875, "grad_norm": 3.0625, "grad_norm_var": 0.029198201497395833, "learning_rate": 0.0001, "loss": 5.8204, "loss/crossentropy": 2.7386746406555176, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16285746544599533, "step": 25302 }, { "epoch": 0.79075, "grad_norm": 3.421875, "grad_norm_var": 0.029292805989583334, "learning_rate": 0.0001, "loss": 5.652, "loss/crossentropy": 2.5269299745559692, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16250652819871902, "step": 25304 }, { "epoch": 0.7908125, "grad_norm": 3.390625, "grad_norm_var": 0.027962239583333333, "learning_rate": 0.0001, "loss": 5.6296, "loss/crossentropy": 2.3718210458755493, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17265165597200394, "step": 25306 }, { "epoch": 0.790875, "grad_norm": 3.203125, "grad_norm_var": 0.018822224934895833, "learning_rate": 0.0001, "loss": 5.7265, "loss/crossentropy": 2.5841060876846313, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16697849333286285, "step": 25308 }, { "epoch": 0.7909375, "grad_norm": 2.828125, "grad_norm_var": 0.0208984375, "learning_rate": 0.0001, "loss": 5.4832, "loss/crossentropy": 2.473755359649658, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15954020619392395, "step": 25310 }, { "epoch": 0.791, "grad_norm": 3.203125, "grad_norm_var": 0.019188435872395833, "learning_rate": 0.0001, "loss": 5.6534, "loss/crossentropy": 2.588550329208374, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16273347288370132, "step": 25312 }, { "epoch": 0.7910625, "grad_norm": 3.109375, "grad_norm_var": 0.026558430989583333, "learning_rate": 0.0001, "loss": 5.3421, "loss/crossentropy": 2.397429347038269, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14758940786123276, "step": 25314 }, { "epoch": 0.791125, "grad_norm": 3.21875, "grad_norm_var": 0.026009114583333333, "learning_rate": 0.0001, "loss": 5.5765, "loss/crossentropy": 2.5067743062973022, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15970566868782043, "step": 25316 }, { "epoch": 0.7911875, "grad_norm": 3.34375, "grad_norm_var": 0.031217447916666665, "learning_rate": 0.0001, "loss": 5.7345, "loss/crossentropy": 2.6668519973754883, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1622350960969925, "step": 25318 }, { "epoch": 0.79125, "grad_norm": 3.234375, "grad_norm_var": 0.030060831705729166, "learning_rate": 0.0001, "loss": 5.929, "loss/crossentropy": 2.67513906955719, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17694664001464844, "step": 25320 }, { "epoch": 0.7913125, "grad_norm": 2.84375, "grad_norm_var": 0.040816243489583334, "learning_rate": 0.0001, "loss": 5.3198, "loss/crossentropy": 2.40627658367157, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1487717479467392, "step": 25322 }, { "epoch": 0.791375, "grad_norm": 3.28125, "grad_norm_var": 0.055540974934895834, "learning_rate": 0.0001, "loss": 5.2989, "loss/crossentropy": 2.361093282699585, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14651653915643692, "step": 25324 }, { "epoch": 0.7914375, "grad_norm": 4.75, "grad_norm_var": 0.22847900390625, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.4573179483413696, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16717679053544998, "step": 25326 }, { "epoch": 0.7915, "grad_norm": 3.0, "grad_norm_var": 0.23121337890625, "learning_rate": 0.0001, "loss": 5.9451, "loss/crossentropy": 2.749058485031128, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17311637103557587, "step": 25328 }, { "epoch": 0.7915625, "grad_norm": 2.984375, "grad_norm_var": 0.23987223307291666, "learning_rate": 0.0001, "loss": 5.4095, "loss/crossentropy": 2.459301233291626, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1532263681292534, "step": 25330 }, { "epoch": 0.791625, "grad_norm": 3.0625, "grad_norm_var": 0.24000244140625, "learning_rate": 0.0001, "loss": 5.8423, "loss/crossentropy": 2.5985885858535767, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17554303258657455, "step": 25332 }, { "epoch": 0.7916875, "grad_norm": 3.15625, "grad_norm_var": 0.24426167805989582, "learning_rate": 0.0001, "loss": 5.7257, "loss/crossentropy": 2.593381404876709, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16479890793561935, "step": 25334 }, { "epoch": 0.79175, "grad_norm": 3.234375, "grad_norm_var": 0.24197591145833333, "learning_rate": 0.0001, "loss": 5.9831, "loss/crossentropy": 2.7751917839050293, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17118044197559357, "step": 25336 }, { "epoch": 0.7918125, "grad_norm": 3.03125, "grad_norm_var": 0.22265218098958334, "learning_rate": 0.0001, "loss": 5.5792, "loss/crossentropy": 2.529574751853943, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15964554250240326, "step": 25338 }, { "epoch": 0.791875, "grad_norm": 2.953125, "grad_norm_var": 0.20861714680989582, "learning_rate": 0.0001, "loss": 5.6736, "loss/crossentropy": 2.5997570753097534, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16090020537376404, "step": 25340 }, { "epoch": 0.7919375, "grad_norm": 3.15625, "grad_norm_var": 0.043610636393229166, "learning_rate": 0.0001, "loss": 5.8498, "loss/crossentropy": 2.7276843786239624, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16495086252689362, "step": 25342 }, { "epoch": 0.792, "grad_norm": 3.203125, "grad_norm_var": 0.0501129150390625, "learning_rate": 0.0001, "loss": 5.618, "loss/crossentropy": 2.5446499586105347, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16201840341091156, "step": 25344 }, { "epoch": 0.7920625, "grad_norm": 3.0, "grad_norm_var": 0.037230428059895834, "learning_rate": 0.0001, "loss": 5.5496, "loss/crossentropy": 2.499722480773926, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1588894948363304, "step": 25346 }, { "epoch": 0.792125, "grad_norm": 3.1875, "grad_norm_var": 0.04060770670572917, "learning_rate": 0.0001, "loss": 5.8169, "loss/crossentropy": 2.7432724237442017, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16204586625099182, "step": 25348 }, { "epoch": 0.7921875, "grad_norm": 2.90625, "grad_norm_var": 0.032746378580729166, "learning_rate": 0.0001, "loss": 5.6696, "loss/crossentropy": 2.6122941970825195, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16276054084300995, "step": 25350 }, { "epoch": 0.79225, "grad_norm": 3.0625, "grad_norm_var": 0.06959228515625, "learning_rate": 0.0001, "loss": 5.8806, "loss/crossentropy": 2.6536483764648438, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17386756837368011, "step": 25352 }, { "epoch": 0.7923125, "grad_norm": 3.296875, "grad_norm_var": 0.07049153645833334, "learning_rate": 0.0001, "loss": 5.7874, "loss/crossentropy": 2.648374080657959, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16585220396518707, "step": 25354 }, { "epoch": 0.792375, "grad_norm": 3.21875, "grad_norm_var": 0.06725260416666666, "learning_rate": 0.0001, "loss": 5.6055, "loss/crossentropy": 2.5399500131607056, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16436853259801865, "step": 25356 }, { "epoch": 0.7924375, "grad_norm": 3.1875, "grad_norm_var": 0.07284749348958333, "learning_rate": 0.0001, "loss": 5.3793, "loss/crossentropy": 2.5220072269439697, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.13924633711576462, "step": 25358 }, { "epoch": 0.7925, "grad_norm": 3.109375, "grad_norm_var": 0.06595052083333333, "learning_rate": 0.0001, "loss": 5.7704, "loss/crossentropy": 2.6193450689315796, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17057766020298004, "step": 25360 }, { "epoch": 0.7925625, "grad_norm": 3.703125, "grad_norm_var": 0.09200846354166667, "learning_rate": 0.0001, "loss": 5.6755, "loss/crossentropy": 2.6169503927230835, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16054153442382812, "step": 25362 }, { "epoch": 0.792625, "grad_norm": 3.78125, "grad_norm_var": 0.11628316243489584, "learning_rate": 0.0001, "loss": 5.7137, "loss/crossentropy": 2.4692364931106567, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.17210125178098679, "step": 25364 }, { "epoch": 0.7926875, "grad_norm": 3.03125, "grad_norm_var": 0.11047770182291666, "learning_rate": 0.0001, "loss": 5.4257, "loss/crossentropy": 2.536818504333496, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.15060505270957947, "step": 25366 }, { "epoch": 0.79275, "grad_norm": 3.0625, "grad_norm_var": 0.08189188639322917, "learning_rate": 0.0001, "loss": 5.4759, "loss/crossentropy": 2.4374356269836426, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16165433824062347, "step": 25368 }, { "epoch": 0.7928125, "grad_norm": 3.15625, "grad_norm_var": 0.08318684895833334, "learning_rate": 0.0001, "loss": 5.5178, "loss/crossentropy": 2.4698901176452637, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1579151228070259, "step": 25370 }, { "epoch": 0.792875, "grad_norm": 3.015625, "grad_norm_var": 0.07545572916666667, "learning_rate": 0.0001, "loss": 5.4255, "loss/crossentropy": 2.3560034036636353, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16085121780633926, "step": 25372 }, { "epoch": 0.7929375, "grad_norm": 3.25, "grad_norm_var": 0.0664459228515625, "learning_rate": 0.0001, "loss": 5.8468, "loss/crossentropy": 2.631547451019287, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17113661766052246, "step": 25374 }, { "epoch": 0.793, "grad_norm": 3.015625, "grad_norm_var": 0.07095947265625, "learning_rate": 0.0001, "loss": 5.4377, "loss/crossentropy": 2.4203755855560303, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15758804976940155, "step": 25376 }, { "epoch": 0.7930625, "grad_norm": 3.125, "grad_norm_var": 0.04597066243489583, "learning_rate": 0.0001, "loss": 5.7576, "loss/crossentropy": 2.6211706399917603, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16402891278266907, "step": 25378 }, { "epoch": 0.793125, "grad_norm": 2.875, "grad_norm_var": 0.01806640625, "learning_rate": 0.0001, "loss": 5.7183, "loss/crossentropy": 2.667958617210388, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16089429706335068, "step": 25380 }, { "epoch": 0.7931875, "grad_norm": 3.0625, "grad_norm_var": 0.016227213541666667, "learning_rate": 0.0001, "loss": 5.9311, "loss/crossentropy": 2.689044237136841, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17732784152030945, "step": 25382 }, { "epoch": 0.79325, "grad_norm": 3.140625, "grad_norm_var": 0.016373697916666666, "learning_rate": 0.0001, "loss": 5.6614, "loss/crossentropy": 2.6168936491012573, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.160306878387928, "step": 25384 }, { "epoch": 0.7933125, "grad_norm": 3.21875, "grad_norm_var": 0.01279296875, "learning_rate": 0.0001, "loss": 5.6141, "loss/crossentropy": 2.4809409379959106, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16292868554592133, "step": 25386 }, { "epoch": 0.793375, "grad_norm": 3.0625, "grad_norm_var": 0.01578369140625, "learning_rate": 0.0001, "loss": 5.7743, "loss/crossentropy": 2.720241069793701, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16126443445682526, "step": 25388 }, { "epoch": 0.7934375, "grad_norm": 3.25, "grad_norm_var": 0.01510009765625, "learning_rate": 0.0001, "loss": 5.4794, "loss/crossentropy": 2.481152892112732, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15881385654211044, "step": 25390 }, { "epoch": 0.7935, "grad_norm": 3.0, "grad_norm_var": 0.0148101806640625, "learning_rate": 0.0001, "loss": 5.4523, "loss/crossentropy": 2.4096468687057495, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16051514446735382, "step": 25392 }, { "epoch": 0.7935625, "grad_norm": 2.953125, "grad_norm_var": 0.021361287434895834, "learning_rate": 0.0001, "loss": 5.4221, "loss/crossentropy": 2.420169234275818, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1572248712182045, "step": 25394 }, { "epoch": 0.793625, "grad_norm": 3.203125, "grad_norm_var": 0.018114217122395835, "learning_rate": 0.0001, "loss": 5.9426, "loss/crossentropy": 2.776470184326172, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17051556706428528, "step": 25396 }, { "epoch": 0.7936875, "grad_norm": 3.15625, "grad_norm_var": 0.0198394775390625, "learning_rate": 0.0001, "loss": 5.7185, "loss/crossentropy": 2.651002049446106, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16143406927585602, "step": 25398 }, { "epoch": 0.79375, "grad_norm": 3.625, "grad_norm_var": 0.03655192057291667, "learning_rate": 0.0001, "loss": 5.673, "loss/crossentropy": 2.528809070587158, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16793614625930786, "step": 25400 }, { "epoch": 0.7938125, "grad_norm": 3.015625, "grad_norm_var": 0.03650716145833333, "learning_rate": 0.0001, "loss": 5.8523, "loss/crossentropy": 2.7354499101638794, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16871225088834763, "step": 25402 }, { "epoch": 0.793875, "grad_norm": 3.046875, "grad_norm_var": 0.0333892822265625, "learning_rate": 0.0001, "loss": 5.4758, "loss/crossentropy": 2.4791252613067627, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15513592958450317, "step": 25404 }, { "epoch": 0.7939375, "grad_norm": 3.1875, "grad_norm_var": 0.033186848958333334, "learning_rate": 0.0001, "loss": 5.9236, "loss/crossentropy": 2.7395559549331665, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16958092153072357, "step": 25406 }, { "epoch": 0.794, "grad_norm": 3.3125, "grad_norm_var": 0.037430826822916666, "learning_rate": 0.0001, "loss": 5.6847, "loss/crossentropy": 2.5286134481430054, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17107945680618286, "step": 25408 }, { "epoch": 0.7940625, "grad_norm": 3.34375, "grad_norm_var": 0.028580729166666666, "learning_rate": 0.0001, "loss": 5.5004, "loss/crossentropy": 2.4793365001678467, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1567937731742859, "step": 25410 }, { "epoch": 0.794125, "grad_norm": 3.0, "grad_norm_var": 0.03483784993489583, "learning_rate": 0.0001, "loss": 5.4253, "loss/crossentropy": 2.441056251525879, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15428167581558228, "step": 25412 }, { "epoch": 0.7941875, "grad_norm": 3.15625, "grad_norm_var": 0.034520467122395836, "learning_rate": 0.0001, "loss": 5.3587, "loss/crossentropy": 2.388457775115967, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1513218656182289, "step": 25414 }, { "epoch": 0.79425, "grad_norm": 3.046875, "grad_norm_var": 0.0177734375, "learning_rate": 0.0001, "loss": 5.8201, "loss/crossentropy": 2.6421027183532715, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16975290328264236, "step": 25416 }, { "epoch": 0.7943125, "grad_norm": 3.21875, "grad_norm_var": 0.02080078125, "learning_rate": 0.0001, "loss": 5.5715, "loss/crossentropy": 2.463736057281494, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16077560186386108, "step": 25418 }, { "epoch": 0.794375, "grad_norm": 3.515625, "grad_norm_var": 0.027925618489583335, "learning_rate": 0.0001, "loss": 5.6985, "loss/crossentropy": 2.555183529853821, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16628053784370422, "step": 25420 }, { "epoch": 0.7944375, "grad_norm": 3.0625, "grad_norm_var": 0.030809529622395835, "learning_rate": 0.0001, "loss": 5.5911, "loss/crossentropy": 2.55877947807312, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15596868097782135, "step": 25422 }, { "epoch": 0.7945, "grad_norm": 3.015625, "grad_norm_var": 0.03922119140625, "learning_rate": 0.0001, "loss": 5.558, "loss/crossentropy": 2.5130953788757324, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16035164892673492, "step": 25424 }, { "epoch": 0.7945625, "grad_norm": 2.984375, "grad_norm_var": 0.04362691243489583, "learning_rate": 0.0001, "loss": 6.1642, "loss/crossentropy": 2.8820998668670654, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17898721247911453, "step": 25426 }, { "epoch": 0.794625, "grad_norm": 3.09375, "grad_norm_var": 0.0427734375, "learning_rate": 0.0001, "loss": 5.5461, "loss/crossentropy": 2.5282377004623413, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15451814234256744, "step": 25428 }, { "epoch": 0.7946875, "grad_norm": 3.203125, "grad_norm_var": 0.045563761393229166, "learning_rate": 0.0001, "loss": 5.8231, "loss/crossentropy": 2.67414653301239, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1668485924601555, "step": 25430 }, { "epoch": 0.79475, "grad_norm": 3.203125, "grad_norm_var": 0.0478424072265625, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.715545177459717, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1656324788928032, "step": 25432 }, { "epoch": 0.7948125, "grad_norm": 3.03125, "grad_norm_var": 0.046647135416666666, "learning_rate": 0.0001, "loss": 5.8858, "loss/crossentropy": 2.738201379776001, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16905217617750168, "step": 25434 }, { "epoch": 0.794875, "grad_norm": 3.125, "grad_norm_var": 0.03877665201822917, "learning_rate": 0.0001, "loss": 5.8226, "loss/crossentropy": 2.615545630455017, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17265405505895615, "step": 25436 }, { "epoch": 0.7949375, "grad_norm": 3.234375, "grad_norm_var": 0.042512003580729166, "learning_rate": 0.0001, "loss": 5.3998, "loss/crossentropy": 2.320631504058838, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16182444989681244, "step": 25438 }, { "epoch": 0.795, "grad_norm": 3.28125, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 5.5821, "loss/crossentropy": 2.484802722930908, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16403156518936157, "step": 25440 }, { "epoch": 0.7950625, "grad_norm": 2.703125, "grad_norm_var": 0.033014933268229164, "learning_rate": 0.0001, "loss": 5.7428, "loss/crossentropy": 2.6623945236206055, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1607770398259163, "step": 25442 }, { "epoch": 0.795125, "grad_norm": 2.796875, "grad_norm_var": 0.04680989583333333, "learning_rate": 0.0001, "loss": 5.5889, "loss/crossentropy": 2.497969150543213, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1579175516963005, "step": 25444 }, { "epoch": 0.7951875, "grad_norm": 3.140625, "grad_norm_var": 0.048094685872395834, "learning_rate": 0.0001, "loss": 5.5234, "loss/crossentropy": 2.492112636566162, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1609383374452591, "step": 25446 }, { "epoch": 0.79525, "grad_norm": 2.96875, "grad_norm_var": 0.04838765462239583, "learning_rate": 0.0001, "loss": 5.746, "loss/crossentropy": 2.713121175765991, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16109861433506012, "step": 25448 }, { "epoch": 0.7953125, "grad_norm": 3.46875, "grad_norm_var": 0.05676676432291667, "learning_rate": 0.0001, "loss": 6.0213, "loss/crossentropy": 2.716894030570984, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17887406796216965, "step": 25450 }, { "epoch": 0.795375, "grad_norm": 2.84375, "grad_norm_var": 0.05751953125, "learning_rate": 0.0001, "loss": 5.5568, "loss/crossentropy": 2.47613787651062, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16001805663108826, "step": 25452 }, { "epoch": 0.7954375, "grad_norm": 3.15625, "grad_norm_var": 0.0588531494140625, "learning_rate": 0.0001, "loss": 5.4774, "loss/crossentropy": 2.3346649408340454, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1654464304447174, "step": 25454 }, { "epoch": 0.7955, "grad_norm": 3.421875, "grad_norm_var": 0.0620758056640625, "learning_rate": 0.0001, "loss": 5.8435, "loss/crossentropy": 2.6950998306274414, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1675708219408989, "step": 25456 }, { "epoch": 0.7955625, "grad_norm": 2.671875, "grad_norm_var": 0.06360677083333334, "learning_rate": 0.0001, "loss": 5.4595, "loss/crossentropy": 2.5493483543395996, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.15155760198831558, "step": 25458 }, { "epoch": 0.795625, "grad_norm": 3.140625, "grad_norm_var": 0.04726460774739583, "learning_rate": 0.0001, "loss": 5.712, "loss/crossentropy": 2.5620031356811523, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.165386363863945, "step": 25460 }, { "epoch": 0.7956875, "grad_norm": 2.859375, "grad_norm_var": 0.047728474934895834, "learning_rate": 0.0001, "loss": 5.5077, "loss/crossentropy": 2.5065815448760986, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1547996699810028, "step": 25462 }, { "epoch": 0.79575, "grad_norm": 3.046875, "grad_norm_var": 0.044482421875, "learning_rate": 0.0001, "loss": 5.7657, "loss/crossentropy": 2.664130926132202, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1664118468761444, "step": 25464 }, { "epoch": 0.7958125, "grad_norm": 2.890625, "grad_norm_var": 0.03772379557291667, "learning_rate": 0.0001, "loss": 5.6806, "loss/crossentropy": 2.6315428018569946, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1580308899283409, "step": 25466 }, { "epoch": 0.795875, "grad_norm": 3.015625, "grad_norm_var": 0.042845662434895834, "learning_rate": 0.0001, "loss": 6.0401, "loss/crossentropy": 2.8397724628448486, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17237255722284317, "step": 25468 }, { "epoch": 0.7959375, "grad_norm": 3.140625, "grad_norm_var": 0.0360992431640625, "learning_rate": 0.0001, "loss": 5.9452, "loss/crossentropy": 2.73575496673584, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1725090742111206, "step": 25470 }, { "epoch": 0.796, "grad_norm": 3.609375, "grad_norm_var": 0.04736328125, "learning_rate": 0.0001, "loss": 5.6821, "loss/crossentropy": 2.530869245529175, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16902897506952286, "step": 25472 }, { "epoch": 0.7960625, "grad_norm": 2.875, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 5.2324, "loss/crossentropy": 2.2642139196395874, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15307102352380753, "step": 25474 }, { "epoch": 0.796125, "grad_norm": 3.09375, "grad_norm_var": 0.04102274576822917, "learning_rate": 0.0001, "loss": 5.7425, "loss/crossentropy": 2.693701982498169, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16230234503746033, "step": 25476 }, { "epoch": 0.7961875, "grad_norm": 3.109375, "grad_norm_var": 0.04023030598958333, "learning_rate": 0.0001, "loss": 5.8272, "loss/crossentropy": 2.7727891206741333, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.16481461375951767, "step": 25478 }, { "epoch": 0.79625, "grad_norm": 3.078125, "grad_norm_var": 0.0412994384765625, "learning_rate": 0.0001, "loss": 5.7098, "loss/crossentropy": 2.5666359663009644, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1643167957663536, "step": 25480 }, { "epoch": 0.7963125, "grad_norm": 3.0625, "grad_norm_var": 0.03814697265625, "learning_rate": 0.0001, "loss": 5.6605, "loss/crossentropy": 2.5249993801116943, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1639430746436119, "step": 25482 }, { "epoch": 0.796375, "grad_norm": 2.984375, "grad_norm_var": 0.03033447265625, "learning_rate": 0.0001, "loss": 5.702, "loss/crossentropy": 2.6121197938919067, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1660180762410164, "step": 25484 }, { "epoch": 0.7964375, "grad_norm": 3.359375, "grad_norm_var": 0.03516337076822917, "learning_rate": 0.0001, "loss": 5.6047, "loss/crossentropy": 2.5446722507476807, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16069257259368896, "step": 25486 }, { "epoch": 0.7965, "grad_norm": 3.203125, "grad_norm_var": 0.019465128580729168, "learning_rate": 0.0001, "loss": 5.7265, "loss/crossentropy": 2.615384578704834, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16657806932926178, "step": 25488 }, { "epoch": 0.7965625, "grad_norm": 3.078125, "grad_norm_var": 0.017936197916666667, "learning_rate": 0.0001, "loss": 5.4358, "loss/crossentropy": 2.3771530389785767, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16172577440738678, "step": 25490 }, { "epoch": 0.796625, "grad_norm": 3.171875, "grad_norm_var": 0.016405232747395835, "learning_rate": 0.0001, "loss": 5.66, "loss/crossentropy": 2.5950770378112793, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16352280974388123, "step": 25492 }, { "epoch": 0.7966875, "grad_norm": 3.0625, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 5.3398, "loss/crossentropy": 2.3305797576904297, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1552230790257454, "step": 25494 }, { "epoch": 0.79675, "grad_norm": 3.140625, "grad_norm_var": 0.015592447916666667, "learning_rate": 0.0001, "loss": 5.5147, "loss/crossentropy": 2.4388809204101562, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15953578054904938, "step": 25496 }, { "epoch": 0.7968125, "grad_norm": 3.125, "grad_norm_var": 0.0158355712890625, "learning_rate": 0.0001, "loss": 5.6602, "loss/crossentropy": 2.576777219772339, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16459254920482635, "step": 25498 }, { "epoch": 0.796875, "grad_norm": 3.0, "grad_norm_var": 0.019017537434895832, "learning_rate": 0.0001, "loss": 5.369, "loss/crossentropy": 2.4149060249328613, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15165583789348602, "step": 25500 }, { "epoch": 0.7969375, "grad_norm": 2.9375, "grad_norm_var": 0.0140289306640625, "learning_rate": 0.0001, "loss": 5.3528, "loss/crossentropy": 2.387664318084717, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15158936381340027, "step": 25502 }, { "epoch": 0.797, "grad_norm": 2.953125, "grad_norm_var": 0.0192291259765625, "learning_rate": 0.0001, "loss": 5.3215, "loss/crossentropy": 2.366239547729492, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15372508019208908, "step": 25504 }, { "epoch": 0.7970625, "grad_norm": 3.25, "grad_norm_var": 0.024820963541666668, "learning_rate": 0.0001, "loss": 5.6459, "loss/crossentropy": 2.630552649497986, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1589585244655609, "step": 25506 }, { "epoch": 0.797125, "grad_norm": 3.359375, "grad_norm_var": 0.16367085774739584, "learning_rate": 0.0001, "loss": 6.0366, "loss/crossentropy": 2.8165512084960938, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17278824746608734, "step": 25508 }, { "epoch": 0.7971875, "grad_norm": 3.171875, "grad_norm_var": 0.1638824462890625, "learning_rate": 0.0001, "loss": 5.8457, "loss/crossentropy": 2.689768671989441, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1710585355758667, "step": 25510 }, { "epoch": 0.79725, "grad_norm": 2.828125, "grad_norm_var": 0.16585286458333334, "learning_rate": 0.0001, "loss": 5.5155, "loss/crossentropy": 2.4981210231781006, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16033216565847397, "step": 25512 }, { "epoch": 0.7973125, "grad_norm": 3.046875, "grad_norm_var": 0.16577860514322917, "learning_rate": 0.0001, "loss": 5.4075, "loss/crossentropy": 2.446346402168274, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15314766019582748, "step": 25514 }, { "epoch": 0.797375, "grad_norm": 3.46875, "grad_norm_var": 0.16868082682291666, "learning_rate": 0.0001, "loss": 5.8491, "loss/crossentropy": 2.7132939100265503, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16748397052288055, "step": 25516 }, { "epoch": 0.7974375, "grad_norm": 2.765625, "grad_norm_var": 0.1755859375, "learning_rate": 0.0001, "loss": 5.3634, "loss/crossentropy": 2.3772170543670654, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15174540877342224, "step": 25518 }, { "epoch": 0.7975, "grad_norm": 3.125, "grad_norm_var": 0.15921122233072918, "learning_rate": 0.0001, "loss": 5.4941, "loss/crossentropy": 2.4577242136001587, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15598254650831223, "step": 25520 }, { "epoch": 0.7975625, "grad_norm": 2.8125, "grad_norm_var": 0.15374247233072916, "learning_rate": 0.0001, "loss": 5.605, "loss/crossentropy": 2.6322391033172607, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15430255234241486, "step": 25522 }, { "epoch": 0.797625, "grad_norm": 2.984375, "grad_norm_var": 0.031183878580729168, "learning_rate": 0.0001, "loss": 5.4833, "loss/crossentropy": 2.444926142692566, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15852493047714233, "step": 25524 }, { "epoch": 0.7976875, "grad_norm": 2.953125, "grad_norm_var": 0.030094401041666666, "learning_rate": 0.0001, "loss": 5.3054, "loss/crossentropy": 2.328010082244873, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1528138443827629, "step": 25526 }, { "epoch": 0.79775, "grad_norm": 3.4375, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 5.9898, "loss/crossentropy": 2.72395658493042, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17814859747886658, "step": 25528 }, { "epoch": 0.7978125, "grad_norm": 2.828125, "grad_norm_var": 0.04814453125, "learning_rate": 0.0001, "loss": 5.7455, "loss/crossentropy": 2.7303361892700195, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15698431432247162, "step": 25530 }, { "epoch": 0.797875, "grad_norm": 3.0625, "grad_norm_var": 0.0395172119140625, "learning_rate": 0.0001, "loss": 5.6479, "loss/crossentropy": 2.506553530693054, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16530776768922806, "step": 25532 }, { "epoch": 0.7979375, "grad_norm": 2.796875, "grad_norm_var": 0.039891560872395836, "learning_rate": 0.0001, "loss": 5.3655, "loss/crossentropy": 2.433992028236389, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14940283447504044, "step": 25534 }, { "epoch": 0.798, "grad_norm": 2.90625, "grad_norm_var": 0.0386138916015625, "learning_rate": 0.0001, "loss": 5.5089, "loss/crossentropy": 2.4843519926071167, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15870817005634308, "step": 25536 }, { "epoch": 0.7980625, "grad_norm": 3.109375, "grad_norm_var": 0.0354400634765625, "learning_rate": 0.0001, "loss": 5.8525, "loss/crossentropy": 2.7046704292297363, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16673485934734344, "step": 25538 }, { "epoch": 0.798125, "grad_norm": 3.265625, "grad_norm_var": 0.0436431884765625, "learning_rate": 0.0001, "loss": 5.1388, "loss/crossentropy": 2.208470582962036, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14537180960178375, "step": 25540 }, { "epoch": 0.7981875, "grad_norm": 3.15625, "grad_norm_var": 0.043309529622395836, "learning_rate": 0.0001, "loss": 5.5689, "loss/crossentropy": 2.476935029029846, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1646619364619255, "step": 25542 }, { "epoch": 0.79825, "grad_norm": 3.390625, "grad_norm_var": 0.08238525390625, "learning_rate": 0.0001, "loss": 5.8632, "loss/crossentropy": 2.6023300886154175, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17765138298273087, "step": 25544 }, { "epoch": 0.7983125, "grad_norm": 3.28125, "grad_norm_var": 0.06796875, "learning_rate": 0.0001, "loss": 5.5825, "loss/crossentropy": 2.4996442794799805, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1606249436736107, "step": 25546 }, { "epoch": 0.798375, "grad_norm": 3.515625, "grad_norm_var": 0.07424723307291667, "learning_rate": 0.0001, "loss": 5.8853, "loss/crossentropy": 2.6848857402801514, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1712096408009529, "step": 25548 }, { "epoch": 0.7984375, "grad_norm": 2.984375, "grad_norm_var": 0.06333719889322917, "learning_rate": 0.0001, "loss": 5.9677, "loss/crossentropy": 2.8552619218826294, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1643703281879425, "step": 25550 }, { "epoch": 0.7985, "grad_norm": 3.09375, "grad_norm_var": 0.059228515625, "learning_rate": 0.0001, "loss": 5.8195, "loss/crossentropy": 2.719208598136902, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16393586993217468, "step": 25552 }, { "epoch": 0.7985625, "grad_norm": 3.125, "grad_norm_var": 0.06682535807291666, "learning_rate": 0.0001, "loss": 5.5242, "loss/crossentropy": 2.478301763534546, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16162413358688354, "step": 25554 }, { "epoch": 0.798625, "grad_norm": 2.890625, "grad_norm_var": 0.07568359375, "learning_rate": 0.0001, "loss": 5.6991, "loss/crossentropy": 2.6302788257598877, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16235194355249405, "step": 25556 }, { "epoch": 0.7986875, "grad_norm": 3.125, "grad_norm_var": 0.07669169108072917, "learning_rate": 0.0001, "loss": 5.9282, "loss/crossentropy": 2.741113543510437, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16909489780664444, "step": 25558 }, { "epoch": 0.79875, "grad_norm": 2.984375, "grad_norm_var": 0.0422760009765625, "learning_rate": 0.0001, "loss": 6.0116, "loss/crossentropy": 2.803410530090332, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17354948073625565, "step": 25560 }, { "epoch": 0.7988125, "grad_norm": 2.890625, "grad_norm_var": 0.0452545166015625, "learning_rate": 0.0001, "loss": 5.1638, "loss/crossentropy": 2.1904959678649902, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1570926457643509, "step": 25562 }, { "epoch": 0.798875, "grad_norm": 2.75, "grad_norm_var": 0.04071858723958333, "learning_rate": 0.0001, "loss": 5.6028, "loss/crossentropy": 2.54690682888031, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15988218784332275, "step": 25564 }, { "epoch": 0.7989375, "grad_norm": 3.0625, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 5.8456, "loss/crossentropy": 2.7087689638137817, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.164850115776062, "step": 25566 }, { "epoch": 0.799, "grad_norm": 3.078125, "grad_norm_var": 0.035563151041666664, "learning_rate": 0.0001, "loss": 5.6834, "loss/crossentropy": 2.6643515825271606, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1565898060798645, "step": 25568 }, { "epoch": 0.7990625, "grad_norm": 2.71875, "grad_norm_var": 0.04633687337239583, "learning_rate": 0.0001, "loss": 5.2925, "loss/crossentropy": 2.2745431661605835, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15609276294708252, "step": 25570 }, { "epoch": 0.799125, "grad_norm": 2.953125, "grad_norm_var": 0.04537353515625, "learning_rate": 0.0001, "loss": 5.4212, "loss/crossentropy": 2.426013469696045, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15459664165973663, "step": 25572 }, { "epoch": 0.7991875, "grad_norm": 2.90625, "grad_norm_var": 0.045221964518229164, "learning_rate": 0.0001, "loss": 5.5825, "loss/crossentropy": 2.662925362586975, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15210948884487152, "step": 25574 }, { "epoch": 0.79925, "grad_norm": 3.046875, "grad_norm_var": 0.04396870930989583, "learning_rate": 0.0001, "loss": 6.1286, "loss/crossentropy": 2.873689293861389, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17666620761156082, "step": 25576 }, { "epoch": 0.7993125, "grad_norm": 3.09375, "grad_norm_var": 0.072265625, "learning_rate": 0.0001, "loss": 5.6605, "loss/crossentropy": 2.56427001953125, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16587091982364655, "step": 25578 }, { "epoch": 0.799375, "grad_norm": 3.46875, "grad_norm_var": 0.0746246337890625, "learning_rate": 0.0001, "loss": 5.6584, "loss/crossentropy": 2.5480706691741943, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16181902587413788, "step": 25580 }, { "epoch": 0.7994375, "grad_norm": 2.921875, "grad_norm_var": 0.07457275390625, "learning_rate": 0.0001, "loss": 5.2879, "loss/crossentropy": 2.3415573835372925, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15362046658992767, "step": 25582 }, { "epoch": 0.7995, "grad_norm": 3.1875, "grad_norm_var": 0.0777984619140625, "learning_rate": 0.0001, "loss": 5.7093, "loss/crossentropy": 2.651996612548828, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16275802999734879, "step": 25584 }, { "epoch": 0.7995625, "grad_norm": 3.125, "grad_norm_var": 0.06565348307291667, "learning_rate": 0.0001, "loss": 5.2063, "loss/crossentropy": 2.2687193155288696, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14844368398189545, "step": 25586 }, { "epoch": 0.799625, "grad_norm": 3.3125, "grad_norm_var": 0.0683258056640625, "learning_rate": 0.0001, "loss": 5.6074, "loss/crossentropy": 2.521701693534851, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16051886975765228, "step": 25588 }, { "epoch": 0.7996875, "grad_norm": 3.34375, "grad_norm_var": 0.06256510416666666, "learning_rate": 0.0001, "loss": 5.8776, "loss/crossentropy": 2.7814104557037354, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16156956553459167, "step": 25590 }, { "epoch": 0.79975, "grad_norm": 2.96875, "grad_norm_var": 0.06262613932291666, "learning_rate": 0.0001, "loss": 5.4677, "loss/crossentropy": 2.461822032928467, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1576220542192459, "step": 25592 }, { "epoch": 0.7998125, "grad_norm": 2.96875, "grad_norm_var": 0.0400787353515625, "learning_rate": 0.0001, "loss": 5.7341, "loss/crossentropy": 2.5990320444107056, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16663643717765808, "step": 25594 }, { "epoch": 0.799875, "grad_norm": 31.625, "grad_norm_var": 51.0037109375, "learning_rate": 0.0001, "loss": 6.0037, "loss/crossentropy": 2.4253209829330444, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.20627228915691376, "step": 25596 }, { "epoch": 0.7999375, "grad_norm": 3.046875, "grad_norm_var": 50.943033854166664, "learning_rate": 0.0001, "loss": 5.5132, "loss/crossentropy": 2.444235324859619, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16432026028633118, "step": 25598 }, { "epoch": 0.8, "grad_norm": 3.28125, "grad_norm_var": 50.81070556640625, "learning_rate": 0.0001, "loss": 5.6477, "loss/crossentropy": 2.529646158218384, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1696132943034172, "step": 25600 }, { "epoch": 0.8000625, "grad_norm": 3.28125, "grad_norm_var": 50.59339090983073, "learning_rate": 0.0001, "loss": 5.6978, "loss/crossentropy": 2.522778630256653, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17062946408987045, "step": 25602 }, { "epoch": 0.800125, "grad_norm": 3.234375, "grad_norm_var": 50.6152089436849, "learning_rate": 0.0001, "loss": 5.4611, "loss/crossentropy": 2.4644765853881836, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15474426746368408, "step": 25604 }, { "epoch": 0.8001875, "grad_norm": 2.921875, "grad_norm_var": 50.69345296223958, "learning_rate": 0.0001, "loss": 5.6861, "loss/crossentropy": 2.6215637922286987, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1619187593460083, "step": 25606 }, { "epoch": 0.80025, "grad_norm": 3.578125, "grad_norm_var": 50.59892171223958, "learning_rate": 0.0001, "loss": 5.742, "loss/crossentropy": 2.6077487468719482, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16342793405056, "step": 25608 }, { "epoch": 0.8003125, "grad_norm": 3.546875, "grad_norm_var": 50.42652079264323, "learning_rate": 0.0001, "loss": 5.9068, "loss/crossentropy": 2.6417452096939087, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17885131388902664, "step": 25610 }, { "epoch": 0.800375, "grad_norm": 3.40625, "grad_norm_var": 0.058991495768229166, "learning_rate": 0.0001, "loss": 5.9333, "loss/crossentropy": 2.686414957046509, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17625177651643753, "step": 25612 }, { "epoch": 0.8004375, "grad_norm": 3.390625, "grad_norm_var": 0.05634663899739583, "learning_rate": 0.0001, "loss": 5.8919, "loss/crossentropy": 2.6309632062911987, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1760942041873932, "step": 25614 }, { "epoch": 0.8005, "grad_norm": 2.796875, "grad_norm_var": 0.07636311848958334, "learning_rate": 0.0001, "loss": 5.5093, "loss/crossentropy": 2.454187512397766, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16098280996084213, "step": 25616 }, { "epoch": 0.8005625, "grad_norm": 3.0, "grad_norm_var": 0.06408589680989583, "learning_rate": 0.0001, "loss": 5.6101, "loss/crossentropy": 2.5555964708328247, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16169817745685577, "step": 25618 }, { "epoch": 0.800625, "grad_norm": 2.96875, "grad_norm_var": 0.06236063639322917, "learning_rate": 0.0001, "loss": 5.856, "loss/crossentropy": 2.673764228820801, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1701762080192566, "step": 25620 }, { "epoch": 0.8006875, "grad_norm": 3.0, "grad_norm_var": 0.05994364420572917, "learning_rate": 0.0001, "loss": 5.6185, "loss/crossentropy": 2.5303525924682617, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1642872840166092, "step": 25622 }, { "epoch": 0.80075, "grad_norm": 3.25, "grad_norm_var": 0.05245768229166667, "learning_rate": 0.0001, "loss": 5.9096, "loss/crossentropy": 2.699185848236084, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17416246980428696, "step": 25624 }, { "epoch": 0.8008125, "grad_norm": 3.296875, "grad_norm_var": 0.04036356608072917, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.6083273887634277, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1639333814382553, "step": 25626 }, { "epoch": 0.800875, "grad_norm": 3.640625, "grad_norm_var": 0.054520670572916666, "learning_rate": 0.0001, "loss": 6.2766, "loss/crossentropy": 2.897311806678772, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.18636631220579147, "step": 25628 }, { "epoch": 0.8009375, "grad_norm": 3.53125, "grad_norm_var": 0.06113993326822917, "learning_rate": 0.0001, "loss": 5.8779, "loss/crossentropy": 2.686615228652954, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17342127859592438, "step": 25630 }, { "epoch": 0.801, "grad_norm": 3.0, "grad_norm_var": 0.052750651041666666, "learning_rate": 0.0001, "loss": 5.7031, "loss/crossentropy": 2.6328877210617065, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16210304200649261, "step": 25632 }, { "epoch": 0.8010625, "grad_norm": 2.921875, "grad_norm_var": 0.0535308837890625, "learning_rate": 0.0001, "loss": 5.3814, "loss/crossentropy": 2.446844458580017, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1516549289226532, "step": 25634 }, { "epoch": 0.801125, "grad_norm": 3.015625, "grad_norm_var": 0.0518463134765625, "learning_rate": 0.0001, "loss": 5.6919, "loss/crossentropy": 2.631867289543152, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15913043916225433, "step": 25636 }, { "epoch": 0.8011875, "grad_norm": 3.109375, "grad_norm_var": 0.06028544108072917, "learning_rate": 0.0001, "loss": 5.039, "loss/crossentropy": 2.216692805290222, "loss/hidden": 1.3671875, "loss/jsd": 0.0, "loss/logits": 0.14551055431365967, "step": 25638 }, { "epoch": 0.80125, "grad_norm": 3.015625, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 5.7764, "loss/crossentropy": 2.65237557888031, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16592242568731308, "step": 25640 }, { "epoch": 0.8013125, "grad_norm": 3.203125, "grad_norm_var": 0.06978759765625, "learning_rate": 0.0001, "loss": 4.9999, "loss/crossentropy": 2.1383305191993713, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.13654698431491852, "step": 25642 }, { "epoch": 0.801375, "grad_norm": 3.375, "grad_norm_var": 0.052897135416666664, "learning_rate": 0.0001, "loss": 5.5595, "loss/crossentropy": 2.56177020072937, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15993209183216095, "step": 25644 }, { "epoch": 0.8014375, "grad_norm": 2.984375, "grad_norm_var": 0.040934244791666664, "learning_rate": 0.0001, "loss": 5.9067, "loss/crossentropy": 2.7646554708480835, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1681097373366356, "step": 25646 }, { "epoch": 0.8015, "grad_norm": 3.109375, "grad_norm_var": 0.0415679931640625, "learning_rate": 0.0001, "loss": 5.586, "loss/crossentropy": 2.5196781158447266, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15780264139175415, "step": 25648 }, { "epoch": 0.8015625, "grad_norm": 2.90625, "grad_norm_var": 0.04228413899739583, "learning_rate": 0.0001, "loss": 5.7445, "loss/crossentropy": 2.6249492168426514, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1627407670021057, "step": 25650 }, { "epoch": 0.801625, "grad_norm": 3.1875, "grad_norm_var": 0.04625244140625, "learning_rate": 0.0001, "loss": 5.8244, "loss/crossentropy": 2.7322680950164795, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16272947937250137, "step": 25652 }, { "epoch": 0.8016875, "grad_norm": 3.234375, "grad_norm_var": 0.04169514973958333, "learning_rate": 0.0001, "loss": 5.4287, "loss/crossentropy": 2.3911421298980713, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15375907719135284, "step": 25654 }, { "epoch": 0.80175, "grad_norm": 3.4375, "grad_norm_var": 0.04491780598958333, "learning_rate": 0.0001, "loss": 5.9986, "loss/crossentropy": 2.765831232070923, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17366764694452286, "step": 25656 }, { "epoch": 0.8018125, "grad_norm": 3.046875, "grad_norm_var": 0.03046875, "learning_rate": 0.0001, "loss": 5.9062, "loss/crossentropy": 2.736068844795227, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16897106915712357, "step": 25658 }, { "epoch": 0.801875, "grad_norm": 3.40625, "grad_norm_var": 0.04478251139322917, "learning_rate": 0.0001, "loss": 5.3685, "loss/crossentropy": 2.4307442903518677, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1515898033976555, "step": 25660 }, { "epoch": 0.8019375, "grad_norm": 3.171875, "grad_norm_var": 0.0458648681640625, "learning_rate": 0.0001, "loss": 5.3073, "loss/crossentropy": 2.3094619512557983, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1540796086192131, "step": 25662 }, { "epoch": 0.802, "grad_norm": 3.296875, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 5.4717, "loss/crossentropy": 2.4414368867874146, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15927442908287048, "step": 25664 }, { "epoch": 0.8020625, "grad_norm": 3.0, "grad_norm_var": 0.046126302083333334, "learning_rate": 0.0001, "loss": 5.4901, "loss/crossentropy": 2.4707207679748535, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.158184215426445, "step": 25666 }, { "epoch": 0.802125, "grad_norm": 3.234375, "grad_norm_var": 0.039338175455729166, "learning_rate": 0.0001, "loss": 5.5646, "loss/crossentropy": 2.598528742790222, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1548113077878952, "step": 25668 }, { "epoch": 0.8021875, "grad_norm": 2.953125, "grad_norm_var": 0.050715128580729164, "learning_rate": 0.0001, "loss": 5.4259, "loss/crossentropy": 2.482747793197632, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15330323576927185, "step": 25670 }, { "epoch": 0.80225, "grad_norm": 3.140625, "grad_norm_var": 0.04548238118489583, "learning_rate": 0.0001, "loss": 5.7484, "loss/crossentropy": 2.6026499271392822, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16613566130399704, "step": 25672 }, { "epoch": 0.8023125, "grad_norm": 3.28125, "grad_norm_var": 0.043309529622395836, "learning_rate": 0.0001, "loss": 5.8977, "loss/crossentropy": 2.7315609455108643, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16661161929368973, "step": 25674 }, { "epoch": 0.802375, "grad_norm": 3.25, "grad_norm_var": 0.0368316650390625, "learning_rate": 0.0001, "loss": 5.4293, "loss/crossentropy": 2.437578558921814, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15542569011449814, "step": 25676 }, { "epoch": 0.8024375, "grad_norm": 3.0625, "grad_norm_var": 0.03489481608072917, "learning_rate": 0.0001, "loss": 5.4747, "loss/crossentropy": 2.4499117136001587, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1622449979186058, "step": 25678 }, { "epoch": 0.8025, "grad_norm": 3.1875, "grad_norm_var": 0.06774088541666666, "learning_rate": 0.0001, "loss": 5.5693, "loss/crossentropy": 2.462652325630188, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.160662479698658, "step": 25680 }, { "epoch": 0.8025625, "grad_norm": 3.21875, "grad_norm_var": 0.07193603515625, "learning_rate": 0.0001, "loss": 5.3556, "loss/crossentropy": 2.336050271987915, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15195493400096893, "step": 25682 }, { "epoch": 0.802625, "grad_norm": 3.09375, "grad_norm_var": 0.07487691243489583, "learning_rate": 0.0001, "loss": 5.4765, "loss/crossentropy": 2.4275963306427, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1619264781475067, "step": 25684 }, { "epoch": 0.8026875, "grad_norm": 3.03125, "grad_norm_var": 0.0613677978515625, "learning_rate": 0.0001, "loss": 5.6656, "loss/crossentropy": 2.613976240158081, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15828579664230347, "step": 25686 }, { "epoch": 0.80275, "grad_norm": 2.890625, "grad_norm_var": 0.06897786458333334, "learning_rate": 0.0001, "loss": 5.4136, "loss/crossentropy": 2.4856724739074707, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15216396749019623, "step": 25688 }, { "epoch": 0.8028125, "grad_norm": 3.296875, "grad_norm_var": 0.07062886555989584, "learning_rate": 0.0001, "loss": 6.0441, "loss/crossentropy": 2.7790746688842773, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17806822806596756, "step": 25690 }, { "epoch": 0.802875, "grad_norm": 3.390625, "grad_norm_var": 0.08241780598958333, "learning_rate": 0.0001, "loss": 5.4503, "loss/crossentropy": 2.4067717790603638, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15669343620538712, "step": 25692 }, { "epoch": 0.8029375, "grad_norm": 2.921875, "grad_norm_var": 0.0868316650390625, "learning_rate": 0.0001, "loss": 5.4701, "loss/crossentropy": 2.4876248836517334, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15449516475200653, "step": 25694 }, { "epoch": 0.803, "grad_norm": 2.953125, "grad_norm_var": 0.04761962890625, "learning_rate": 0.0001, "loss": 5.7045, "loss/crossentropy": 2.689522862434387, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15657395124435425, "step": 25696 }, { "epoch": 0.8030625, "grad_norm": 2.671875, "grad_norm_var": 0.0401275634765625, "learning_rate": 0.0001, "loss": 5.4649, "loss/crossentropy": 2.47347354888916, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15539731830358505, "step": 25698 }, { "epoch": 0.803125, "grad_norm": 2.953125, "grad_norm_var": 0.045633951822916664, "learning_rate": 0.0001, "loss": 5.3044, "loss/crossentropy": 2.332689046859741, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15654505789279938, "step": 25700 }, { "epoch": 0.8031875, "grad_norm": 3.0625, "grad_norm_var": 0.045703125, "learning_rate": 0.0001, "loss": 5.4595, "loss/crossentropy": 2.3924553394317627, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15866002440452576, "step": 25702 }, { "epoch": 0.80325, "grad_norm": 3.40625, "grad_norm_var": 0.057062784830729164, "learning_rate": 0.0001, "loss": 5.6308, "loss/crossentropy": 2.5779011249542236, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16153736412525177, "step": 25704 }, { "epoch": 0.8033125, "grad_norm": 3.25, "grad_norm_var": 0.058690388997395836, "learning_rate": 0.0001, "loss": 5.7869, "loss/crossentropy": 2.5727206468582153, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17219997197389603, "step": 25706 }, { "epoch": 0.803375, "grad_norm": 3.09375, "grad_norm_var": 0.0398590087890625, "learning_rate": 0.0001, "loss": 5.6147, "loss/crossentropy": 2.514239192008972, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16355954110622406, "step": 25708 }, { "epoch": 0.8034375, "grad_norm": 2.96875, "grad_norm_var": 0.0429351806640625, "learning_rate": 0.0001, "loss": 5.7726, "loss/crossentropy": 2.6182950735092163, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16777712106704712, "step": 25710 }, { "epoch": 0.8035, "grad_norm": 3.296875, "grad_norm_var": 0.04412434895833333, "learning_rate": 0.0001, "loss": 5.4248, "loss/crossentropy": 2.3947904109954834, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1596369817852974, "step": 25712 }, { "epoch": 0.8035625, "grad_norm": 3.046875, "grad_norm_var": 0.03072509765625, "learning_rate": 0.0001, "loss": 5.796, "loss/crossentropy": 2.6150922775268555, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16887310147285461, "step": 25714 }, { "epoch": 0.803625, "grad_norm": 3.171875, "grad_norm_var": 0.016209920247395832, "learning_rate": 0.0001, "loss": 5.6265, "loss/crossentropy": 2.5220504999160767, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16513632237911224, "step": 25716 }, { "epoch": 0.8036875, "grad_norm": 3.046875, "grad_norm_var": 0.014256795247395834, "learning_rate": 0.0001, "loss": 6.0629, "loss/crossentropy": 2.8565841913223267, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1729728877544403, "step": 25718 }, { "epoch": 0.80375, "grad_norm": 3.09375, "grad_norm_var": 0.017121378580729166, "learning_rate": 0.0001, "loss": 5.6883, "loss/crossentropy": 2.6064772605895996, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16364658623933792, "step": 25720 }, { "epoch": 0.8038125, "grad_norm": 3.046875, "grad_norm_var": 0.015217081705729166, "learning_rate": 0.0001, "loss": 5.4772, "loss/crossentropy": 2.4333605766296387, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.159462071955204, "step": 25722 }, { "epoch": 0.803875, "grad_norm": 3.421875, "grad_norm_var": 0.021415201822916667, "learning_rate": 0.0001, "loss": 5.907, "loss/crossentropy": 2.6885892152786255, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17418118566274643, "step": 25724 }, { "epoch": 0.8039375, "grad_norm": 3.234375, "grad_norm_var": 0.017780558268229166, "learning_rate": 0.0001, "loss": 5.7036, "loss/crossentropy": 2.620099186897278, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16147109866142273, "step": 25726 }, { "epoch": 0.804, "grad_norm": 2.890625, "grad_norm_var": 0.021385701497395833, "learning_rate": 0.0001, "loss": 5.1817, "loss/crossentropy": 2.275924324989319, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1452612727880478, "step": 25728 }, { "epoch": 0.8040625, "grad_norm": 3.265625, "grad_norm_var": 0.15966389973958334, "learning_rate": 0.0001, "loss": 5.6325, "loss/crossentropy": 2.5340161323547363, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1610231250524521, "step": 25730 }, { "epoch": 0.804125, "grad_norm": 2.84375, "grad_norm_var": 0.18509114583333333, "learning_rate": 0.0001, "loss": 5.7786, "loss/crossentropy": 2.6004785299301147, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17093750834465027, "step": 25732 }, { "epoch": 0.8041875, "grad_norm": 2.8125, "grad_norm_var": 0.19388020833333333, "learning_rate": 0.0001, "loss": 5.781, "loss/crossentropy": 2.6742039918899536, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16614725440740585, "step": 25734 }, { "epoch": 0.80425, "grad_norm": 3.03125, "grad_norm_var": 0.18931376139322917, "learning_rate": 0.0001, "loss": 5.3246, "loss/crossentropy": 2.3369566202163696, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.156968355178833, "step": 25736 }, { "epoch": 0.8043125, "grad_norm": 3.15625, "grad_norm_var": 0.19371744791666667, "learning_rate": 0.0001, "loss": 5.9781, "loss/crossentropy": 2.8178043365478516, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16993281990289688, "step": 25738 }, { "epoch": 0.804375, "grad_norm": 3.078125, "grad_norm_var": 0.19742431640625, "learning_rate": 0.0001, "loss": 5.588, "loss/crossentropy": 2.5488440990448, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15860477834939957, "step": 25740 }, { "epoch": 0.8044375, "grad_norm": 3.109375, "grad_norm_var": 0.19734700520833334, "learning_rate": 0.0001, "loss": 5.6494, "loss/crossentropy": 2.6056177616119385, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1590660735964775, "step": 25742 }, { "epoch": 0.8045, "grad_norm": 3.390625, "grad_norm_var": 0.18928629557291668, "learning_rate": 0.0001, "loss": 5.4437, "loss/crossentropy": 2.3790863752365112, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15997406095266342, "step": 25744 }, { "epoch": 0.8045625, "grad_norm": 3.015625, "grad_norm_var": 0.06025390625, "learning_rate": 0.0001, "loss": 5.5304, "loss/crossentropy": 2.4592326879501343, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1606363207101822, "step": 25746 }, { "epoch": 0.804625, "grad_norm": 3.21875, "grad_norm_var": 0.03284505208333333, "learning_rate": 0.0001, "loss": 5.3759, "loss/crossentropy": 2.338041305541992, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16199104487895966, "step": 25748 }, { "epoch": 0.8046875, "grad_norm": 3.203125, "grad_norm_var": 0.030980428059895832, "learning_rate": 0.0001, "loss": 5.5584, "loss/crossentropy": 2.522215723991394, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1579131782054901, "step": 25750 }, { "epoch": 0.80475, "grad_norm": 2.921875, "grad_norm_var": 0.03166402180989583, "learning_rate": 0.0001, "loss": 5.2946, "loss/crossentropy": 2.321294665336609, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15710077434778214, "step": 25752 }, { "epoch": 0.8048125, "grad_norm": 3.15625, "grad_norm_var": 0.026839192708333334, "learning_rate": 0.0001, "loss": 5.8861, "loss/crossentropy": 2.8006707429885864, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16128186881542206, "step": 25754 }, { "epoch": 0.804875, "grad_norm": 3.34375, "grad_norm_var": 0.022175089518229166, "learning_rate": 0.0001, "loss": 5.2714, "loss/crossentropy": 2.295003652572632, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1542847454547882, "step": 25756 }, { "epoch": 0.8049375, "grad_norm": 3.078125, "grad_norm_var": 0.0223541259765625, "learning_rate": 0.0001, "loss": 5.5602, "loss/crossentropy": 2.499534487724304, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16114644706249237, "step": 25758 }, { "epoch": 0.805, "grad_norm": 3.046875, "grad_norm_var": 0.018993123372395834, "learning_rate": 0.0001, "loss": 5.5152, "loss/crossentropy": 2.5322351455688477, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1545419991016388, "step": 25760 }, { "epoch": 0.8050625, "grad_norm": 2.96875, "grad_norm_var": 0.017479451497395833, "learning_rate": 0.0001, "loss": 5.4664, "loss/crossentropy": 2.490007162094116, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1550569236278534, "step": 25762 }, { "epoch": 0.805125, "grad_norm": 2.828125, "grad_norm_var": 0.0184967041015625, "learning_rate": 0.0001, "loss": 5.0951, "loss/crossentropy": 2.164342999458313, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1493251919746399, "step": 25764 }, { "epoch": 0.8051875, "grad_norm": 3.203125, "grad_norm_var": 0.016162109375, "learning_rate": 0.0001, "loss": 5.5416, "loss/crossentropy": 2.4952151775360107, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1608925759792328, "step": 25766 }, { "epoch": 0.80525, "grad_norm": 3.03125, "grad_norm_var": 0.015623982747395833, "learning_rate": 0.0001, "loss": 5.4543, "loss/crossentropy": 2.396771192550659, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1580957993865013, "step": 25768 }, { "epoch": 0.8053125, "grad_norm": 2.640625, "grad_norm_var": 0.042215983072916664, "learning_rate": 0.0001, "loss": 5.3644, "loss/crossentropy": 2.3864080905914307, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15444162487983704, "step": 25770 }, { "epoch": 0.805375, "grad_norm": 3.109375, "grad_norm_var": 0.037873331705729166, "learning_rate": 0.0001, "loss": 5.611, "loss/crossentropy": 2.5382243394851685, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1619631052017212, "step": 25772 }, { "epoch": 0.8054375, "grad_norm": 3.3125, "grad_norm_var": 0.04263916015625, "learning_rate": 0.0001, "loss": 5.7934, "loss/crossentropy": 2.6957108974456787, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16680126637220383, "step": 25774 }, { "epoch": 0.8055, "grad_norm": 3.0, "grad_norm_var": 0.042333984375, "learning_rate": 0.0001, "loss": 5.3557, "loss/crossentropy": 2.397977590560913, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15397745370864868, "step": 25776 }, { "epoch": 0.8055625, "grad_norm": 3.1875, "grad_norm_var": 0.042170206705729164, "learning_rate": 0.0001, "loss": 5.725, "loss/crossentropy": 2.65224027633667, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16470235586166382, "step": 25778 }, { "epoch": 0.805625, "grad_norm": 3.171875, "grad_norm_var": 0.039281209309895836, "learning_rate": 0.0001, "loss": 5.4874, "loss/crossentropy": 2.4238076210021973, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16026780009269714, "step": 25780 }, { "epoch": 0.8056875, "grad_norm": 3.265625, "grad_norm_var": 0.04670308430989583, "learning_rate": 0.0001, "loss": 5.2495, "loss/crossentropy": 2.3126675486564636, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1472015157341957, "step": 25782 }, { "epoch": 0.80575, "grad_norm": 3.046875, "grad_norm_var": 0.046442667643229164, "learning_rate": 0.0001, "loss": 5.6502, "loss/crossentropy": 2.618051767349243, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.162589430809021, "step": 25784 }, { "epoch": 0.8058125, "grad_norm": 3.1875, "grad_norm_var": 0.0198638916015625, "learning_rate": 0.0001, "loss": 5.6136, "loss/crossentropy": 2.607123851776123, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15376880764961243, "step": 25786 }, { "epoch": 0.805875, "grad_norm": 3.703125, "grad_norm_var": 0.0438385009765625, "learning_rate": 0.0001, "loss": 5.8406, "loss/crossentropy": 2.6182804107666016, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1745753288269043, "step": 25788 }, { "epoch": 0.8059375, "grad_norm": 2.78125, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 5.623, "loss/crossentropy": 2.5778337717056274, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15803135931491852, "step": 25790 }, { "epoch": 0.806, "grad_norm": 2.96875, "grad_norm_var": 0.05774637858072917, "learning_rate": 0.0001, "loss": 5.9335, "loss/crossentropy": 2.7090498208999634, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1744006648659706, "step": 25792 }, { "epoch": 0.8060625, "grad_norm": 3.125, "grad_norm_var": 0.057917277018229164, "learning_rate": 0.0001, "loss": 5.5817, "loss/crossentropy": 2.451915979385376, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1645418405532837, "step": 25794 }, { "epoch": 0.806125, "grad_norm": 2.84375, "grad_norm_var": 0.06393229166666667, "learning_rate": 0.0001, "loss": 5.7454, "loss/crossentropy": 2.624258518218994, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16562572121620178, "step": 25796 }, { "epoch": 0.8061875, "grad_norm": 3.140625, "grad_norm_var": 0.0544097900390625, "learning_rate": 0.0001, "loss": 5.7204, "loss/crossentropy": 2.6387758255004883, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16207094490528107, "step": 25798 }, { "epoch": 0.80625, "grad_norm": 3.265625, "grad_norm_var": 0.05766499837239583, "learning_rate": 0.0001, "loss": 5.9534, "loss/crossentropy": 2.6982457637786865, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1743442565202713, "step": 25800 }, { "epoch": 0.8063125, "grad_norm": 2.828125, "grad_norm_var": 0.07045796712239584, "learning_rate": 0.0001, "loss": 5.5196, "loss/crossentropy": 2.570582151412964, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.14724723994731903, "step": 25802 }, { "epoch": 0.806375, "grad_norm": 3.15625, "grad_norm_var": 0.04912109375, "learning_rate": 0.0001, "loss": 5.8479, "loss/crossentropy": 2.696234703063965, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16556153446435928, "step": 25804 }, { "epoch": 0.8064375, "grad_norm": 2.953125, "grad_norm_var": 0.0493560791015625, "learning_rate": 0.0001, "loss": 5.2374, "loss/crossentropy": 2.359342098236084, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1479654163122177, "step": 25806 }, { "epoch": 0.8065, "grad_norm": 3.140625, "grad_norm_var": 0.03502197265625, "learning_rate": 0.0001, "loss": 5.8056, "loss/crossentropy": 2.625843048095703, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16953448951244354, "step": 25808 }, { "epoch": 0.8065625, "grad_norm": 2.890625, "grad_norm_var": 0.03536783854166667, "learning_rate": 0.0001, "loss": 5.5963, "loss/crossentropy": 2.5228800773620605, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16124501079320908, "step": 25810 }, { "epoch": 0.806625, "grad_norm": 3.109375, "grad_norm_var": 0.033177693684895836, "learning_rate": 0.0001, "loss": 5.6845, "loss/crossentropy": 2.55816388130188, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16458744555711746, "step": 25812 }, { "epoch": 0.8066875, "grad_norm": 3.125, "grad_norm_var": 0.03134358723958333, "learning_rate": 0.0001, "loss": 5.527, "loss/crossentropy": 2.5222166776657104, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1590726673603058, "step": 25814 }, { "epoch": 0.80675, "grad_norm": 3.078125, "grad_norm_var": 0.019222005208333334, "learning_rate": 0.0001, "loss": 5.728, "loss/crossentropy": 2.6434502601623535, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16392720490694046, "step": 25816 }, { "epoch": 0.8068125, "grad_norm": 3.359375, "grad_norm_var": 0.02164306640625, "learning_rate": 0.0001, "loss": 5.6377, "loss/crossentropy": 2.5822893381118774, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16061674058437347, "step": 25818 }, { "epoch": 0.806875, "grad_norm": 3.140625, "grad_norm_var": 0.02711181640625, "learning_rate": 0.0001, "loss": 6.0137, "loss/crossentropy": 2.72200345993042, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17604146897792816, "step": 25820 }, { "epoch": 0.8069375, "grad_norm": 2.984375, "grad_norm_var": 0.020833333333333332, "learning_rate": 0.0001, "loss": 5.779, "loss/crossentropy": 2.6549432277679443, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.17100276798009872, "step": 25822 }, { "epoch": 0.807, "grad_norm": 3.09375, "grad_norm_var": 0.024918619791666666, "learning_rate": 0.0001, "loss": 5.5089, "loss/crossentropy": 2.4635682106018066, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1588340327143669, "step": 25824 }, { "epoch": 0.8070625, "grad_norm": 3.1875, "grad_norm_var": 0.0245758056640625, "learning_rate": 0.0001, "loss": 5.8354, "loss/crossentropy": 2.7161245346069336, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16388526558876038, "step": 25826 }, { "epoch": 0.807125, "grad_norm": 3.03125, "grad_norm_var": 0.0297271728515625, "learning_rate": 0.0001, "loss": 5.5257, "loss/crossentropy": 2.570802092552185, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15486504882574081, "step": 25828 }, { "epoch": 0.8071875, "grad_norm": 3.21875, "grad_norm_var": 0.030475870768229166, "learning_rate": 0.0001, "loss": 5.7724, "loss/crossentropy": 2.628113031387329, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16912047564983368, "step": 25830 }, { "epoch": 0.80725, "grad_norm": 2.921875, "grad_norm_var": 0.03525390625, "learning_rate": 0.0001, "loss": 5.4256, "loss/crossentropy": 2.4442425966262817, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15633535385131836, "step": 25832 }, { "epoch": 0.8073125, "grad_norm": 3.234375, "grad_norm_var": 0.030003865559895832, "learning_rate": 0.0001, "loss": 5.7966, "loss/crossentropy": 2.6853344440460205, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16620568186044693, "step": 25834 }, { "epoch": 0.807375, "grad_norm": 3.3125, "grad_norm_var": 0.02691650390625, "learning_rate": 0.0001, "loss": 5.6149, "loss/crossentropy": 2.539078116416931, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16227243840694427, "step": 25836 }, { "epoch": 0.8074375, "grad_norm": 3.15625, "grad_norm_var": 0.03668212890625, "learning_rate": 0.0001, "loss": 5.6767, "loss/crossentropy": 2.4889843463897705, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17150284349918365, "step": 25838 }, { "epoch": 0.8075, "grad_norm": 2.90625, "grad_norm_var": 0.03472900390625, "learning_rate": 0.0001, "loss": 5.4635, "loss/crossentropy": 2.4544711112976074, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15636713802814484, "step": 25840 }, { "epoch": 0.8075625, "grad_norm": 3.1875, "grad_norm_var": 0.05281473795572917, "learning_rate": 0.0001, "loss": 5.8003, "loss/crossentropy": 2.6237099170684814, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16883177310228348, "step": 25842 }, { "epoch": 0.807625, "grad_norm": 2.796875, "grad_norm_var": 0.0544830322265625, "learning_rate": 0.0001, "loss": 5.7551, "loss/crossentropy": 2.6296327114105225, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16801517456769943, "step": 25844 }, { "epoch": 0.8076875, "grad_norm": 2.65625, "grad_norm_var": 0.07205301920572917, "learning_rate": 0.0001, "loss": 5.3143, "loss/crossentropy": 2.3800086975097656, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1496814787387848, "step": 25846 }, { "epoch": 0.80775, "grad_norm": 3.0625, "grad_norm_var": 0.06725260416666666, "learning_rate": 0.0001, "loss": 5.5656, "loss/crossentropy": 2.520174264907837, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15845265984535217, "step": 25848 }, { "epoch": 0.8078125, "grad_norm": 2.875, "grad_norm_var": 0.06857808430989583, "learning_rate": 0.0001, "loss": 5.4627, "loss/crossentropy": 2.472960591316223, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15288261324167252, "step": 25850 }, { "epoch": 0.807875, "grad_norm": 3.0, "grad_norm_var": 0.06544596354166667, "learning_rate": 0.0001, "loss": 5.6572, "loss/crossentropy": 2.5968352556228638, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1615081951022148, "step": 25852 }, { "epoch": 0.8079375, "grad_norm": 3.21875, "grad_norm_var": 0.06123046875, "learning_rate": 0.0001, "loss": 6.0538, "loss/crossentropy": 2.8122987747192383, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17414553463459015, "step": 25854 }, { "epoch": 0.808, "grad_norm": 3.0625, "grad_norm_var": 0.05886128743489583, "learning_rate": 0.0001, "loss": 5.675, "loss/crossentropy": 2.5798511505126953, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16576293855905533, "step": 25856 }, { "epoch": 0.8080625, "grad_norm": 2.953125, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 5.4631, "loss/crossentropy": 2.5038857460021973, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1498228833079338, "step": 25858 }, { "epoch": 0.808125, "grad_norm": 3.25, "grad_norm_var": 0.0347808837890625, "learning_rate": 0.0001, "loss": 5.5511, "loss/crossentropy": 2.5348498821258545, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.16099654883146286, "step": 25860 }, { "epoch": 0.8081875, "grad_norm": 3.046875, "grad_norm_var": 0.018651326497395832, "learning_rate": 0.0001, "loss": 5.4165, "loss/crossentropy": 2.404328227043152, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.158634215593338, "step": 25862 }, { "epoch": 0.80825, "grad_norm": 3.1875, "grad_norm_var": 0.019220987955729168, "learning_rate": 0.0001, "loss": 5.4641, "loss/crossentropy": 2.364362955093384, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16349120438098907, "step": 25864 }, { "epoch": 0.8083125, "grad_norm": 2.875, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 5.5132, "loss/crossentropy": 2.410773754119873, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16258376836776733, "step": 25866 }, { "epoch": 0.808375, "grad_norm": 3.296875, "grad_norm_var": 0.021968587239583334, "learning_rate": 0.0001, "loss": 5.6447, "loss/crossentropy": 2.5925387144088745, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16185922920703888, "step": 25868 }, { "epoch": 0.8084375, "grad_norm": 3.0625, "grad_norm_var": 0.0190338134765625, "learning_rate": 0.0001, "loss": 5.8386, "loss/crossentropy": 2.6724647283554077, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16700299084186554, "step": 25870 }, { "epoch": 0.8085, "grad_norm": 3.265625, "grad_norm_var": 0.0214752197265625, "learning_rate": 0.0001, "loss": 5.7479, "loss/crossentropy": 2.63448703289032, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16525235772132874, "step": 25872 }, { "epoch": 0.8085625, "grad_norm": 2.78125, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 5.2758, "loss/crossentropy": 2.299230217933655, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1535119041800499, "step": 25874 }, { "epoch": 0.808625, "grad_norm": 2.75, "grad_norm_var": 0.03804931640625, "learning_rate": 0.0001, "loss": 5.5636, "loss/crossentropy": 2.5843125581741333, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15730515867471695, "step": 25876 }, { "epoch": 0.8086875, "grad_norm": 2.921875, "grad_norm_var": 0.04053446451822917, "learning_rate": 0.0001, "loss": 5.6744, "loss/crossentropy": 2.5767170190811157, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1652393788099289, "step": 25878 }, { "epoch": 0.80875, "grad_norm": 3.875, "grad_norm_var": 0.08639322916666667, "learning_rate": 0.0001, "loss": 5.8538, "loss/crossentropy": 2.6336944103240967, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1739620342850685, "step": 25880 }, { "epoch": 0.8088125, "grad_norm": 3.34375, "grad_norm_var": 0.08307291666666666, "learning_rate": 0.0001, "loss": 5.8214, "loss/crossentropy": 2.643694758415222, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17089402675628662, "step": 25882 }, { "epoch": 0.808875, "grad_norm": 3.328125, "grad_norm_var": 0.08222249348958334, "learning_rate": 0.0001, "loss": 5.3962, "loss/crossentropy": 2.347903251647949, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15912645310163498, "step": 25884 }, { "epoch": 0.8089375, "grad_norm": 3.265625, "grad_norm_var": 0.08266499837239584, "learning_rate": 0.0001, "loss": 5.6709, "loss/crossentropy": 2.528378963470459, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1650313213467598, "step": 25886 }, { "epoch": 0.809, "grad_norm": 2.9375, "grad_norm_var": 0.08391011555989583, "learning_rate": 0.0001, "loss": 5.7254, "loss/crossentropy": 2.6138205528259277, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16584961116313934, "step": 25888 }, { "epoch": 0.8090625, "grad_norm": 2.625, "grad_norm_var": 0.09212239583333333, "learning_rate": 0.0001, "loss": 5.312, "loss/crossentropy": 2.38472843170166, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1521066278219223, "step": 25890 }, { "epoch": 0.809125, "grad_norm": 2.96875, "grad_norm_var": 0.07939046223958333, "learning_rate": 0.0001, "loss": 5.6212, "loss/crossentropy": 2.601081967353821, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15942999720573425, "step": 25892 }, { "epoch": 0.8091875, "grad_norm": 3.296875, "grad_norm_var": 0.08157145182291667, "learning_rate": 0.0001, "loss": 5.8314, "loss/crossentropy": 2.6287145614624023, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17183279991149902, "step": 25894 }, { "epoch": 0.80925, "grad_norm": 2.921875, "grad_norm_var": 0.05066731770833333, "learning_rate": 0.0001, "loss": 5.6516, "loss/crossentropy": 2.633685350418091, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1568736582994461, "step": 25896 }, { "epoch": 0.8093125, "grad_norm": 3.21875, "grad_norm_var": 0.04893290201822917, "learning_rate": 0.0001, "loss": 5.9869, "loss/crossentropy": 2.7616509199142456, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1721365600824356, "step": 25898 }, { "epoch": 0.809375, "grad_norm": 3.03125, "grad_norm_var": 0.0486236572265625, "learning_rate": 0.0001, "loss": 6.0606, "loss/crossentropy": 2.7832703590393066, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.18007761240005493, "step": 25900 }, { "epoch": 0.8094375, "grad_norm": 3.140625, "grad_norm_var": 0.042723592122395834, "learning_rate": 0.0001, "loss": 5.6753, "loss/crossentropy": 2.538694739341736, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16639665514230728, "step": 25902 }, { "epoch": 0.8095, "grad_norm": 3.015625, "grad_norm_var": 0.0408111572265625, "learning_rate": 0.0001, "loss": 5.3537, "loss/crossentropy": 2.4291588068008423, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15104401111602783, "step": 25904 }, { "epoch": 0.8095625, "grad_norm": 3.390625, "grad_norm_var": 0.03291015625, "learning_rate": 0.0001, "loss": 5.6602, "loss/crossentropy": 2.547026038169861, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1640501320362091, "step": 25906 }, { "epoch": 0.809625, "grad_norm": 3.015625, "grad_norm_var": 0.032450358072916664, "learning_rate": 0.0001, "loss": 5.4983, "loss/crossentropy": 2.5117374658584595, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15646952390670776, "step": 25908 }, { "epoch": 0.8096875, "grad_norm": 3.0625, "grad_norm_var": 0.019587198893229168, "learning_rate": 0.0001, "loss": 5.5231, "loss/crossentropy": 2.5682032108306885, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.154079370200634, "step": 25910 }, { "epoch": 0.80975, "grad_norm": 3.0, "grad_norm_var": 0.017801920572916668, "learning_rate": 0.0001, "loss": 5.83, "loss/crossentropy": 2.6992393732070923, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16541622579097748, "step": 25912 }, { "epoch": 0.8098125, "grad_norm": 3.0625, "grad_norm_var": 0.016727701822916666, "learning_rate": 0.0001, "loss": 5.5044, "loss/crossentropy": 2.5401517152786255, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.15696705877780914, "step": 25914 }, { "epoch": 0.809875, "grad_norm": 3.234375, "grad_norm_var": 0.014484659830729166, "learning_rate": 0.0001, "loss": 5.8543, "loss/crossentropy": 2.7261210680007935, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1643773913383484, "step": 25916 }, { "epoch": 0.8099375, "grad_norm": 3.1875, "grad_norm_var": 0.015673828125, "learning_rate": 0.0001, "loss": 5.6174, "loss/crossentropy": 2.6042429208755493, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15756148099899292, "step": 25918 }, { "epoch": 0.81, "grad_norm": 3.0625, "grad_norm_var": 0.0150299072265625, "learning_rate": 0.0001, "loss": 5.679, "loss/crossentropy": 2.636616587638855, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16009552031755447, "step": 25920 }, { "epoch": 0.8100625, "grad_norm": 3.296875, "grad_norm_var": 0.013114420572916667, "learning_rate": 0.0001, "loss": 5.3148, "loss/crossentropy": 2.367428779602051, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15176884829998016, "step": 25922 }, { "epoch": 0.810125, "grad_norm": 3.4375, "grad_norm_var": 0.020849609375, "learning_rate": 0.0001, "loss": 5.5218, "loss/crossentropy": 2.435949444770813, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.158583365380764, "step": 25924 }, { "epoch": 0.8101875, "grad_norm": 3.0625, "grad_norm_var": 0.017854817708333335, "learning_rate": 0.0001, "loss": 5.5599, "loss/crossentropy": 2.5772972106933594, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15607711672782898, "step": 25926 }, { "epoch": 0.81025, "grad_norm": 3.09375, "grad_norm_var": 0.017878214518229168, "learning_rate": 0.0001, "loss": 5.8299, "loss/crossentropy": 2.6923950910568237, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16687103360891342, "step": 25928 }, { "epoch": 0.8103125, "grad_norm": 2.875, "grad_norm_var": 0.019831339518229168, "learning_rate": 0.0001, "loss": 5.726, "loss/crossentropy": 2.693419575691223, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15911632776260376, "step": 25930 }, { "epoch": 0.810375, "grad_norm": 2.796875, "grad_norm_var": 0.024592081705729168, "learning_rate": 0.0001, "loss": 5.4774, "loss/crossentropy": 2.5127739906311035, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15427283942699432, "step": 25932 }, { "epoch": 0.8104375, "grad_norm": 3.140625, "grad_norm_var": 0.025614420572916668, "learning_rate": 0.0001, "loss": 5.566, "loss/crossentropy": 2.5519654750823975, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15921468287706375, "step": 25934 }, { "epoch": 0.8105, "grad_norm": 3.421875, "grad_norm_var": 0.03321024576822917, "learning_rate": 0.0001, "loss": 5.9783, "loss/crossentropy": 2.8077282905578613, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17135663330554962, "step": 25936 }, { "epoch": 0.8105625, "grad_norm": 2.921875, "grad_norm_var": 0.030855305989583335, "learning_rate": 0.0001, "loss": 5.7128, "loss/crossentropy": 2.6672173738479614, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1615942120552063, "step": 25938 }, { "epoch": 0.810625, "grad_norm": 3.046875, "grad_norm_var": 0.023856608072916667, "learning_rate": 0.0001, "loss": 5.7031, "loss/crossentropy": 2.5845017433166504, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1677147075533867, "step": 25940 }, { "epoch": 0.8106875, "grad_norm": 2.921875, "grad_norm_var": 0.024413045247395834, "learning_rate": 0.0001, "loss": 5.7142, "loss/crossentropy": 2.6113885641098022, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.164574533700943, "step": 25942 }, { "epoch": 0.81075, "grad_norm": 2.96875, "grad_norm_var": 0.03306376139322917, "learning_rate": 0.0001, "loss": 5.5484, "loss/crossentropy": 2.4556208848953247, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16239862143993378, "step": 25944 }, { "epoch": 0.8108125, "grad_norm": 2.96875, "grad_norm_var": 0.031266276041666666, "learning_rate": 0.0001, "loss": 5.5104, "loss/crossentropy": 2.480081081390381, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15771479159593582, "step": 25946 }, { "epoch": 0.810875, "grad_norm": 2.875, "grad_norm_var": 0.03152669270833333, "learning_rate": 0.0001, "loss": 5.7399, "loss/crossentropy": 2.600390076637268, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16902713477611542, "step": 25948 }, { "epoch": 0.8109375, "grad_norm": 2.9375, "grad_norm_var": 0.0316314697265625, "learning_rate": 0.0001, "loss": 5.6332, "loss/crossentropy": 2.538271427154541, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16418224573135376, "step": 25950 }, { "epoch": 0.811, "grad_norm": 3.203125, "grad_norm_var": 0.023469034830729166, "learning_rate": 0.0001, "loss": 5.6978, "loss/crossentropy": 2.557962417602539, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16554241627454758, "step": 25952 }, { "epoch": 0.8110625, "grad_norm": 3.296875, "grad_norm_var": 0.04248758951822917, "learning_rate": 0.0001, "loss": 5.8858, "loss/crossentropy": 2.6491034030914307, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1736716628074646, "step": 25954 }, { "epoch": 0.811125, "grad_norm": 3.125, "grad_norm_var": 0.04049479166666667, "learning_rate": 0.0001, "loss": 5.9979, "loss/crossentropy": 2.8587846755981445, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16743197292089462, "step": 25956 }, { "epoch": 0.8111875, "grad_norm": 3.40625, "grad_norm_var": 0.0432281494140625, "learning_rate": 0.0001, "loss": 5.7378, "loss/crossentropy": 2.593713879585266, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1636243313550949, "step": 25958 }, { "epoch": 0.81125, "grad_norm": 3.140625, "grad_norm_var": 0.041901652018229166, "learning_rate": 0.0001, "loss": 5.4866, "loss/crossentropy": 2.398123025894165, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16119029372930527, "step": 25960 }, { "epoch": 0.8113125, "grad_norm": 3.015625, "grad_norm_var": 0.0491607666015625, "learning_rate": 0.0001, "loss": 5.4391, "loss/crossentropy": 2.4695407152175903, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1539890244603157, "step": 25962 }, { "epoch": 0.811375, "grad_norm": 3.15625, "grad_norm_var": 0.04673563639322917, "learning_rate": 0.0001, "loss": 5.7556, "loss/crossentropy": 2.649569034576416, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16451101005077362, "step": 25964 }, { "epoch": 0.8114375, "grad_norm": 3.171875, "grad_norm_var": 0.043701171875, "learning_rate": 0.0001, "loss": 5.697, "loss/crossentropy": 2.580646514892578, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16866681724786758, "step": 25966 }, { "epoch": 0.8115, "grad_norm": 2.828125, "grad_norm_var": 0.04605712890625, "learning_rate": 0.0001, "loss": 5.5413, "loss/crossentropy": 2.5573774576187134, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15503410249948502, "step": 25968 }, { "epoch": 0.8115625, "grad_norm": 3.21875, "grad_norm_var": 0.03156636555989583, "learning_rate": 0.0001, "loss": 5.9362, "loss/crossentropy": 2.795621633529663, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16562466323375702, "step": 25970 }, { "epoch": 0.811625, "grad_norm": 3.015625, "grad_norm_var": 0.031794230143229164, "learning_rate": 0.0001, "loss": 5.5216, "loss/crossentropy": 2.500490665435791, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16109219193458557, "step": 25972 }, { "epoch": 0.8116875, "grad_norm": 3.25, "grad_norm_var": 0.027904256184895834, "learning_rate": 0.0001, "loss": 5.5786, "loss/crossentropy": 2.542115569114685, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1595092937350273, "step": 25974 }, { "epoch": 0.81175, "grad_norm": 3.0625, "grad_norm_var": 0.018245442708333334, "learning_rate": 0.0001, "loss": 5.5512, "loss/crossentropy": 2.539905309677124, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16011208295822144, "step": 25976 }, { "epoch": 0.8118125, "grad_norm": 2.90625, "grad_norm_var": 0.015165201822916667, "learning_rate": 0.0001, "loss": 5.6823, "loss/crossentropy": 2.6318790912628174, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1632421314716339, "step": 25978 }, { "epoch": 0.811875, "grad_norm": 2.75, "grad_norm_var": 0.0322906494140625, "learning_rate": 0.0001, "loss": 5.6499, "loss/crossentropy": 2.562356114387512, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16383300721645355, "step": 25980 }, { "epoch": 0.8119375, "grad_norm": 3.09375, "grad_norm_var": 0.032079060872395836, "learning_rate": 0.0001, "loss": 5.4906, "loss/crossentropy": 2.507889151573181, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15842348337173462, "step": 25982 }, { "epoch": 0.812, "grad_norm": 2.859375, "grad_norm_var": 0.03125, "learning_rate": 0.0001, "loss": 5.5122, "loss/crossentropy": 2.4938517808914185, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1580801010131836, "step": 25984 }, { "epoch": 0.8120625, "grad_norm": 3.171875, "grad_norm_var": 0.0365386962890625, "learning_rate": 0.0001, "loss": 5.709, "loss/crossentropy": 2.6493401527404785, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1637757271528244, "step": 25986 }, { "epoch": 0.812125, "grad_norm": 3.375, "grad_norm_var": 0.04219462076822917, "learning_rate": 0.0001, "loss": 5.5582, "loss/crossentropy": 2.431844115257263, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16536659747362137, "step": 25988 }, { "epoch": 0.8121875, "grad_norm": 3.0, "grad_norm_var": 0.039013671875, "learning_rate": 0.0001, "loss": 5.3936, "loss/crossentropy": 2.386796236038208, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15888264775276184, "step": 25990 }, { "epoch": 0.81225, "grad_norm": 2.921875, "grad_norm_var": 0.04033203125, "learning_rate": 0.0001, "loss": 5.4206, "loss/crossentropy": 2.414778470993042, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15996189415454865, "step": 25992 }, { "epoch": 0.8123125, "grad_norm": 2.984375, "grad_norm_var": 0.0384918212890625, "learning_rate": 0.0001, "loss": 5.3062, "loss/crossentropy": 2.351076602935791, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1529332399368286, "step": 25994 }, { "epoch": 0.812375, "grad_norm": 3.078125, "grad_norm_var": 0.020319620768229168, "learning_rate": 0.0001, "loss": 5.3583, "loss/crossentropy": 2.4164395332336426, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1535656899213791, "step": 25996 }, { "epoch": 0.8124375, "grad_norm": 3.03125, "grad_norm_var": 0.021312459309895834, "learning_rate": 0.0001, "loss": 5.9078, "loss/crossentropy": 2.7739639282226562, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16690263152122498, "step": 25998 }, { "epoch": 0.8125, "grad_norm": 3.140625, "grad_norm_var": 0.021305338541666666, "learning_rate": 0.0001, "loss": 5.6046, "loss/crossentropy": 2.5973480939865112, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1550179421901703, "step": 26000 }, { "epoch": 0.8125625, "grad_norm": 3.34375, "grad_norm_var": 0.020601399739583335, "learning_rate": 0.0001, "loss": 5.6036, "loss/crossentropy": 2.5490570068359375, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16209101676940918, "step": 26002 }, { "epoch": 0.812625, "grad_norm": 3.40625, "grad_norm_var": 0.023958333333333335, "learning_rate": 0.0001, "loss": 5.6212, "loss/crossentropy": 2.565270185470581, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15949514508247375, "step": 26004 }, { "epoch": 0.8126875, "grad_norm": 3.203125, "grad_norm_var": 0.026537068684895835, "learning_rate": 0.0001, "loss": 5.4849, "loss/crossentropy": 2.472626805305481, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15512911975383759, "step": 26006 }, { "epoch": 0.81275, "grad_norm": 3.0, "grad_norm_var": 0.031689453125, "learning_rate": 0.0001, "loss": 5.3621, "loss/crossentropy": 2.368017077445984, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.14863138645887375, "step": 26008 }, { "epoch": 0.8128125, "grad_norm": 2.953125, "grad_norm_var": 0.03704427083333333, "learning_rate": 0.0001, "loss": 5.7561, "loss/crossentropy": 2.593183994293213, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16981104016304016, "step": 26010 }, { "epoch": 0.812875, "grad_norm": 3.21875, "grad_norm_var": 0.03508199055989583, "learning_rate": 0.0001, "loss": 5.9786, "loss/crossentropy": 2.7332016229629517, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17375550419092178, "step": 26012 }, { "epoch": 0.8129375, "grad_norm": 3.1875, "grad_norm_var": 0.035033162434895834, "learning_rate": 0.0001, "loss": 5.4169, "loss/crossentropy": 2.36329984664917, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1596594601869583, "step": 26014 }, { "epoch": 0.813, "grad_norm": 2.84375, "grad_norm_var": 0.037385050455729166, "learning_rate": 0.0001, "loss": 5.3814, "loss/crossentropy": 2.4783889055252075, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.14811166375875473, "step": 26016 }, { "epoch": 0.8130625, "grad_norm": 3.25, "grad_norm_var": 0.03621317545572917, "learning_rate": 0.0001, "loss": 5.9081, "loss/crossentropy": 2.695285677909851, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17245175689458847, "step": 26018 }, { "epoch": 0.813125, "grad_norm": 2.921875, "grad_norm_var": 0.030354817708333332, "learning_rate": 0.0001, "loss": 5.761, "loss/crossentropy": 2.6396515369415283, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1660393923521042, "step": 26020 }, { "epoch": 0.8131875, "grad_norm": 3.234375, "grad_norm_var": 0.03125712076822917, "learning_rate": 0.0001, "loss": 5.8516, "loss/crossentropy": 2.702316403388977, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16882963478565216, "step": 26022 }, { "epoch": 0.81325, "grad_norm": 3.171875, "grad_norm_var": 0.030989583333333334, "learning_rate": 0.0001, "loss": 5.5435, "loss/crossentropy": 2.5398088693618774, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15662306547164917, "step": 26024 }, { "epoch": 0.8133125, "grad_norm": 3.59375, "grad_norm_var": 0.04273681640625, "learning_rate": 0.0001, "loss": 5.7943, "loss/crossentropy": 2.6339192390441895, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16525793820619583, "step": 26026 }, { "epoch": 0.813375, "grad_norm": 3.125, "grad_norm_var": 0.0432037353515625, "learning_rate": 0.0001, "loss": 5.9342, "loss/crossentropy": 2.775349497795105, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17018358409404755, "step": 26028 }, { "epoch": 0.8134375, "grad_norm": 3.0, "grad_norm_var": 0.04478251139322917, "learning_rate": 0.0001, "loss": 5.6793, "loss/crossentropy": 2.548333764076233, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1677834466099739, "step": 26030 }, { "epoch": 0.8135, "grad_norm": 2.96875, "grad_norm_var": 0.04029541015625, "learning_rate": 0.0001, "loss": 5.6844, "loss/crossentropy": 2.603028178215027, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1612669825553894, "step": 26032 }, { "epoch": 0.8135625, "grad_norm": 3.0, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 5.4034, "loss/crossentropy": 2.386218786239624, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15953043848276138, "step": 26034 }, { "epoch": 0.813625, "grad_norm": 3.046875, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 5.4987, "loss/crossentropy": 2.523100256919861, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.157719187438488, "step": 26036 }, { "epoch": 0.8136875, "grad_norm": 3.078125, "grad_norm_var": 0.036946614583333336, "learning_rate": 0.0001, "loss": 5.7258, "loss/crossentropy": 2.5546183586120605, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16711720824241638, "step": 26038 }, { "epoch": 0.81375, "grad_norm": 3.140625, "grad_norm_var": 0.02896728515625, "learning_rate": 0.0001, "loss": 5.867, "loss/crossentropy": 2.7451947927474976, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16335587948560715, "step": 26040 }, { "epoch": 0.8138125, "grad_norm": 2.859375, "grad_norm_var": 0.016991170247395833, "learning_rate": 0.0001, "loss": 5.3652, "loss/crossentropy": 2.373391032218933, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15542739629745483, "step": 26042 }, { "epoch": 0.813875, "grad_norm": 2.796875, "grad_norm_var": 0.020308430989583334, "learning_rate": 0.0001, "loss": 5.5794, "loss/crossentropy": 2.56760573387146, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15977203845977783, "step": 26044 }, { "epoch": 0.8139375, "grad_norm": 2.90625, "grad_norm_var": 0.018094889322916665, "learning_rate": 0.0001, "loss": 5.7479, "loss/crossentropy": 2.640484571456909, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1662125587463379, "step": 26046 }, { "epoch": 0.814, "grad_norm": 3.0625, "grad_norm_var": 0.017780558268229166, "learning_rate": 0.0001, "loss": 5.4578, "loss/crossentropy": 2.455212354660034, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15494395792484283, "step": 26048 }, { "epoch": 0.8140625, "grad_norm": 3.390625, "grad_norm_var": 0.04582926432291667, "learning_rate": 0.0001, "loss": 5.8634, "loss/crossentropy": 2.6449553966522217, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17145580798387527, "step": 26050 }, { "epoch": 0.814125, "grad_norm": 3.53125, "grad_norm_var": 0.053278605143229164, "learning_rate": 0.0001, "loss": 5.5121, "loss/crossentropy": 2.373735785484314, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.16149048507213593, "step": 26052 }, { "epoch": 0.8141875, "grad_norm": 3.265625, "grad_norm_var": 0.05279541015625, "learning_rate": 0.0001, "loss": 5.3452, "loss/crossentropy": 2.277380883693695, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1614658087491989, "step": 26054 }, { "epoch": 0.81425, "grad_norm": 2.875, "grad_norm_var": 0.058577473958333334, "learning_rate": 0.0001, "loss": 5.5921, "loss/crossentropy": 2.530734062194824, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16238905489444733, "step": 26056 }, { "epoch": 0.8143125, "grad_norm": 3.15625, "grad_norm_var": 0.06541341145833333, "learning_rate": 0.0001, "loss": 5.2169, "loss/crossentropy": 2.275343418121338, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15196342766284943, "step": 26058 }, { "epoch": 0.814375, "grad_norm": 2.9375, "grad_norm_var": 0.0645416259765625, "learning_rate": 0.0001, "loss": 5.4366, "loss/crossentropy": 2.3882514238357544, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15952109545469284, "step": 26060 }, { "epoch": 0.8144375, "grad_norm": 2.921875, "grad_norm_var": 0.0640289306640625, "learning_rate": 0.0001, "loss": 5.5776, "loss/crossentropy": 2.522711992263794, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1609606221318245, "step": 26062 }, { "epoch": 0.8145, "grad_norm": 3.28125, "grad_norm_var": 0.0617584228515625, "learning_rate": 0.0001, "loss": 5.7144, "loss/crossentropy": 2.5757981538772583, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16659953445196152, "step": 26064 }, { "epoch": 0.8145625, "grad_norm": 3.375, "grad_norm_var": 0.05034077962239583, "learning_rate": 0.0001, "loss": 5.3892, "loss/crossentropy": 2.3622301816940308, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1569949835538864, "step": 26066 }, { "epoch": 0.814625, "grad_norm": 2.84375, "grad_norm_var": 0.04413960774739583, "learning_rate": 0.0001, "loss": 5.6939, "loss/crossentropy": 2.5889511108398438, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16440598666667938, "step": 26068 }, { "epoch": 0.8146875, "grad_norm": 3.046875, "grad_norm_var": 0.04318033854166667, "learning_rate": 0.0001, "loss": 6.1497, "loss/crossentropy": 2.917039155960083, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17444155365228653, "step": 26070 }, { "epoch": 0.81475, "grad_norm": 3.0625, "grad_norm_var": 0.040751139322916664, "learning_rate": 0.0001, "loss": 5.2042, "loss/crossentropy": 2.2803984880447388, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1458997204899788, "step": 26072 }, { "epoch": 0.8148125, "grad_norm": 3.0625, "grad_norm_var": 0.032591756184895834, "learning_rate": 0.0001, "loss": 5.5049, "loss/crossentropy": 2.3861804008483887, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16265819221735, "step": 26074 }, { "epoch": 0.814875, "grad_norm": 3.4375, "grad_norm_var": 0.030915323893229166, "learning_rate": 0.0001, "loss": 5.6712, "loss/crossentropy": 2.4557416439056396, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17115310579538345, "step": 26076 }, { "epoch": 0.8149375, "grad_norm": 3.03125, "grad_norm_var": 0.06640218098958334, "learning_rate": 0.0001, "loss": 5.8032, "loss/crossentropy": 2.7046096324920654, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16533035784959793, "step": 26078 }, { "epoch": 0.815, "grad_norm": 3.6875, "grad_norm_var": 0.08331705729166666, "learning_rate": 0.0001, "loss": 5.6713, "loss/crossentropy": 2.5506922006607056, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1683076173067093, "step": 26080 }, { "epoch": 0.8150625, "grad_norm": 3.0625, "grad_norm_var": 0.08404947916666666, "learning_rate": 0.0001, "loss": 5.6315, "loss/crossentropy": 2.5789612531661987, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16189821809530258, "step": 26082 }, { "epoch": 0.815125, "grad_norm": 3.40625, "grad_norm_var": 0.07544657389322916, "learning_rate": 0.0001, "loss": 5.5345, "loss/crossentropy": 2.4583956003189087, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16191180050373077, "step": 26084 }, { "epoch": 0.8151875, "grad_norm": 3.0625, "grad_norm_var": 0.07571207682291667, "learning_rate": 0.0001, "loss": 5.69, "loss/crossentropy": 2.5922963619232178, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16367991268634796, "step": 26086 }, { "epoch": 0.81525, "grad_norm": 3.0625, "grad_norm_var": 0.07688395182291667, "learning_rate": 0.0001, "loss": 5.7573, "loss/crossentropy": 2.63303804397583, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16437966376543045, "step": 26088 }, { "epoch": 0.8153125, "grad_norm": 3.625, "grad_norm_var": 0.08284403483072916, "learning_rate": 0.0001, "loss": 5.9082, "loss/crossentropy": 2.746885895729065, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17004229128360748, "step": 26090 }, { "epoch": 0.815375, "grad_norm": 3.03125, "grad_norm_var": 0.09219462076822917, "learning_rate": 0.0001, "loss": 5.2256, "loss/crossentropy": 2.322092294692993, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15050920844078064, "step": 26092 }, { "epoch": 0.8154375, "grad_norm": 3.125, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 5.364, "loss/crossentropy": 2.370709538459778, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15519004315137863, "step": 26094 }, { "epoch": 0.8155, "grad_norm": 3.171875, "grad_norm_var": 0.03404032389322917, "learning_rate": 0.0001, "loss": 5.2883, "loss/crossentropy": 2.3643643856048584, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.14551495760679245, "step": 26096 }, { "epoch": 0.8155625, "grad_norm": 3.1875, "grad_norm_var": 0.030321248372395835, "learning_rate": 0.0001, "loss": 5.8893, "loss/crossentropy": 2.7373058795928955, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16989066451787949, "step": 26098 }, { "epoch": 0.815625, "grad_norm": 3.15625, "grad_norm_var": 0.02564697265625, "learning_rate": 0.0001, "loss": 5.6633, "loss/crossentropy": 2.583295702934265, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16269079595804214, "step": 26100 }, { "epoch": 0.8156875, "grad_norm": 2.9375, "grad_norm_var": 0.027730305989583332, "learning_rate": 0.0001, "loss": 5.359, "loss/crossentropy": 2.4208203554153442, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.14967238903045654, "step": 26102 }, { "epoch": 0.81575, "grad_norm": 2.921875, "grad_norm_var": 0.030589803059895834, "learning_rate": 0.0001, "loss": 5.5604, "loss/crossentropy": 2.5293065309524536, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15701773017644882, "step": 26104 }, { "epoch": 0.8158125, "grad_norm": 3.0, "grad_norm_var": 0.010965983072916666, "learning_rate": 0.0001, "loss": 5.4673, "loss/crossentropy": 2.474220871925354, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15516993403434753, "step": 26106 }, { "epoch": 0.815875, "grad_norm": 3.046875, "grad_norm_var": 0.0093658447265625, "learning_rate": 0.0001, "loss": 5.3832, "loss/crossentropy": 2.3631885051727295, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1578560248017311, "step": 26108 }, { "epoch": 0.8159375, "grad_norm": 3.296875, "grad_norm_var": 0.018359375, "learning_rate": 0.0001, "loss": 4.843, "loss/crossentropy": 2.020129442214966, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.14127205312252045, "step": 26110 }, { "epoch": 0.816, "grad_norm": 3.359375, "grad_norm_var": 0.0252838134765625, "learning_rate": 0.0001, "loss": 5.6984, "loss/crossentropy": 2.670602560043335, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16058924794197083, "step": 26112 }, { "epoch": 0.8160625, "grad_norm": 3.109375, "grad_norm_var": 0.024583943684895835, "learning_rate": 0.0001, "loss": 5.6156, "loss/crossentropy": 2.510475754737854, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16676156967878342, "step": 26114 }, { "epoch": 0.816125, "grad_norm": 3.703125, "grad_norm_var": 0.05015360514322917, "learning_rate": 0.0001, "loss": 5.7941, "loss/crossentropy": 2.5985480546951294, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1715070903301239, "step": 26116 }, { "epoch": 0.8161875, "grad_norm": 3.109375, "grad_norm_var": 0.048924763997395836, "learning_rate": 0.0001, "loss": 5.9325, "loss/crossentropy": 2.7597391605377197, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17001105099916458, "step": 26118 }, { "epoch": 0.81625, "grad_norm": 2.96875, "grad_norm_var": 0.050690714518229166, "learning_rate": 0.0001, "loss": 5.7365, "loss/crossentropy": 2.629301905632019, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16267205029726028, "step": 26120 }, { "epoch": 0.8163125, "grad_norm": 3.0, "grad_norm_var": 0.048173014322916666, "learning_rate": 0.0001, "loss": 5.4366, "loss/crossentropy": 2.4595502614974976, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15746787190437317, "step": 26122 }, { "epoch": 0.816375, "grad_norm": 2.828125, "grad_norm_var": 0.05196024576822917, "learning_rate": 0.0001, "loss": 5.6512, "loss/crossentropy": 2.585309863090515, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16362079232931137, "step": 26124 }, { "epoch": 0.8164375, "grad_norm": 3.5, "grad_norm_var": 0.05559488932291667, "learning_rate": 0.0001, "loss": 5.9476, "loss/crossentropy": 2.749737501144409, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17212870717048645, "step": 26126 }, { "epoch": 0.8165, "grad_norm": 2.96875, "grad_norm_var": 0.05279541015625, "learning_rate": 0.0001, "loss": 5.5055, "loss/crossentropy": 2.5037710666656494, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15407906472682953, "step": 26128 }, { "epoch": 0.8165625, "grad_norm": 3.140625, "grad_norm_var": 0.05406494140625, "learning_rate": 0.0001, "loss": 5.8976, "loss/crossentropy": 2.7795106172561646, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16884027421474457, "step": 26130 }, { "epoch": 0.816625, "grad_norm": 3.03125, "grad_norm_var": 0.0371002197265625, "learning_rate": 0.0001, "loss": 5.8581, "loss/crossentropy": 2.699077606201172, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16785462200641632, "step": 26132 }, { "epoch": 0.8166875, "grad_norm": 3.265625, "grad_norm_var": 0.03968098958333333, "learning_rate": 0.0001, "loss": 5.8573, "loss/crossentropy": 2.6877238750457764, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16890715807676315, "step": 26134 }, { "epoch": 0.81675, "grad_norm": 2.96875, "grad_norm_var": 0.04016825358072917, "learning_rate": 0.0001, "loss": 5.4469, "loss/crossentropy": 2.3794307708740234, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15947842597961426, "step": 26136 }, { "epoch": 0.8168125, "grad_norm": 2.9375, "grad_norm_var": 0.043440755208333334, "learning_rate": 0.0001, "loss": 5.3964, "loss/crossentropy": 2.3684866428375244, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15787289291620255, "step": 26138 }, { "epoch": 0.816875, "grad_norm": 3.015625, "grad_norm_var": 0.03884175618489583, "learning_rate": 0.0001, "loss": 5.5438, "loss/crossentropy": 2.532816171646118, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15656939148902893, "step": 26140 }, { "epoch": 0.8169375, "grad_norm": 3.21875, "grad_norm_var": 0.025288899739583332, "learning_rate": 0.0001, "loss": 5.4799, "loss/crossentropy": 2.3790860176086426, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16203491389751434, "step": 26142 }, { "epoch": 0.817, "grad_norm": 2.90625, "grad_norm_var": 0.024348958333333334, "learning_rate": 0.0001, "loss": 5.6526, "loss/crossentropy": 2.560869574546814, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16268550604581833, "step": 26144 }, { "epoch": 0.8170625, "grad_norm": 3.296875, "grad_norm_var": 0.031834920247395836, "learning_rate": 0.0001, "loss": 5.5227, "loss/crossentropy": 2.5373018980026245, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15791461616754532, "step": 26146 }, { "epoch": 0.817125, "grad_norm": 3.0625, "grad_norm_var": 0.025316365559895835, "learning_rate": 0.0001, "loss": 5.4378, "loss/crossentropy": 2.4222248792648315, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15741781145334244, "step": 26148 }, { "epoch": 0.8171875, "grad_norm": 2.984375, "grad_norm_var": 0.024470011393229168, "learning_rate": 0.0001, "loss": 5.6441, "loss/crossentropy": 2.5666340589523315, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16516925394535065, "step": 26150 }, { "epoch": 0.81725, "grad_norm": 3.015625, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 5.599, "loss/crossentropy": 2.5882219076156616, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1530270129442215, "step": 26152 }, { "epoch": 0.8173125, "grad_norm": 2.921875, "grad_norm_var": 0.05354715983072917, "learning_rate": 0.0001, "loss": 5.3402, "loss/crossentropy": 2.3540260791778564, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15291216224431992, "step": 26154 }, { "epoch": 0.817375, "grad_norm": 3.046875, "grad_norm_var": 0.054423014322916664, "learning_rate": 0.0001, "loss": 5.6191, "loss/crossentropy": 2.5458675622940063, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1604481339454651, "step": 26156 }, { "epoch": 0.8174375, "grad_norm": 3.0625, "grad_norm_var": 0.053059895833333336, "learning_rate": 0.0001, "loss": 5.5605, "loss/crossentropy": 2.513525128364563, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15899336338043213, "step": 26158 }, { "epoch": 0.8175, "grad_norm": 3.265625, "grad_norm_var": 0.0528228759765625, "learning_rate": 0.0001, "loss": 5.4506, "loss/crossentropy": 2.376926302909851, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16010113060474396, "step": 26160 }, { "epoch": 0.8175625, "grad_norm": 3.0625, "grad_norm_var": 0.04463602701822917, "learning_rate": 0.0001, "loss": 5.7315, "loss/crossentropy": 2.590888023376465, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1640632301568985, "step": 26162 }, { "epoch": 0.817625, "grad_norm": 2.9375, "grad_norm_var": 0.0499908447265625, "learning_rate": 0.0001, "loss": 5.3026, "loss/crossentropy": 2.4341611862182617, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.14817240089178085, "step": 26164 }, { "epoch": 0.8176875, "grad_norm": 2.890625, "grad_norm_var": 0.055322265625, "learning_rate": 0.0001, "loss": 5.4102, "loss/crossentropy": 2.4750607013702393, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1524970903992653, "step": 26166 }, { "epoch": 0.81775, "grad_norm": 3.15625, "grad_norm_var": 0.0527496337890625, "learning_rate": 0.0001, "loss": 5.691, "loss/crossentropy": 2.6213563680648804, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1620451584458351, "step": 26168 }, { "epoch": 0.8178125, "grad_norm": 3.34375, "grad_norm_var": 0.022554524739583335, "learning_rate": 0.0001, "loss": 5.7447, "loss/crossentropy": 2.6336199045181274, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16501329839229584, "step": 26170 }, { "epoch": 0.817875, "grad_norm": 3.203125, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 5.7543, "loss/crossentropy": 2.610470414161682, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16594266146421432, "step": 26172 }, { "epoch": 0.8179375, "grad_norm": 3.0625, "grad_norm_var": 0.023078409830729167, "learning_rate": 0.0001, "loss": 5.7982, "loss/crossentropy": 2.6537013053894043, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16913538426160812, "step": 26174 }, { "epoch": 0.818, "grad_norm": 3.28125, "grad_norm_var": 0.026590983072916668, "learning_rate": 0.0001, "loss": 5.8476, "loss/crossentropy": 2.6743801832199097, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1704486906528473, "step": 26176 }, { "epoch": 0.8180625, "grad_norm": 3.078125, "grad_norm_var": 0.026102701822916668, "learning_rate": 0.0001, "loss": 5.4365, "loss/crossentropy": 2.379012107849121, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16200068593025208, "step": 26178 }, { "epoch": 0.818125, "grad_norm": 2.96875, "grad_norm_var": 0.021068318684895834, "learning_rate": 0.0001, "loss": 5.6285, "loss/crossentropy": 2.5713045597076416, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1565016508102417, "step": 26180 }, { "epoch": 0.8181875, "grad_norm": 2.78125, "grad_norm_var": 0.024152628580729165, "learning_rate": 0.0001, "loss": 5.4027, "loss/crossentropy": 2.5099167823791504, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14865048229694366, "step": 26182 }, { "epoch": 0.81825, "grad_norm": 3.140625, "grad_norm_var": 0.0400054931640625, "learning_rate": 0.0001, "loss": 5.6903, "loss/crossentropy": 2.5764319896698, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16334353387355804, "step": 26184 }, { "epoch": 0.8183125, "grad_norm": 3.046875, "grad_norm_var": 0.03680013020833333, "learning_rate": 0.0001, "loss": 5.3066, "loss/crossentropy": 2.2904746532440186, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15474095940589905, "step": 26186 }, { "epoch": 0.818375, "grad_norm": 3.328125, "grad_norm_var": 0.06179097493489583, "learning_rate": 0.0001, "loss": 5.9701, "loss/crossentropy": 2.688890814781189, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1785091534256935, "step": 26188 }, { "epoch": 0.8184375, "grad_norm": 3.421875, "grad_norm_var": 0.06430562337239583, "learning_rate": 0.0001, "loss": 5.6174, "loss/crossentropy": 2.592252016067505, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16033123433589935, "step": 26190 }, { "epoch": 0.8185, "grad_norm": 3.140625, "grad_norm_var": 0.067333984375, "learning_rate": 0.0001, "loss": 5.6458, "loss/crossentropy": 2.599801540374756, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16241006553173065, "step": 26192 }, { "epoch": 0.8185625, "grad_norm": 3.640625, "grad_norm_var": 0.08297119140625, "learning_rate": 0.0001, "loss": 5.5937, "loss/crossentropy": 2.4990642070770264, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16376066952943802, "step": 26194 }, { "epoch": 0.818625, "grad_norm": 3.125, "grad_norm_var": 0.08010965983072917, "learning_rate": 0.0001, "loss": 5.5154, "loss/crossentropy": 2.4857006072998047, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1592223197221756, "step": 26196 }, { "epoch": 0.8186875, "grad_norm": 3.453125, "grad_norm_var": 0.05854390462239583, "learning_rate": 0.0001, "loss": 5.7373, "loss/crossentropy": 2.541812300682068, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17306101322174072, "step": 26198 }, { "epoch": 0.81875, "grad_norm": 3.078125, "grad_norm_var": 0.06412760416666667, "learning_rate": 0.0001, "loss": 5.3927, "loss/crossentropy": 2.4223748445510864, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1571846753358841, "step": 26200 }, { "epoch": 0.8188125, "grad_norm": 3.4375, "grad_norm_var": 0.06555989583333334, "learning_rate": 0.0001, "loss": 5.8268, "loss/crossentropy": 2.6781102418899536, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16526120901107788, "step": 26202 }, { "epoch": 0.818875, "grad_norm": 3.015625, "grad_norm_var": 0.05287984212239583, "learning_rate": 0.0001, "loss": 5.3714, "loss/crossentropy": 2.4879703521728516, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.14849692583084106, "step": 26204 }, { "epoch": 0.8189375, "grad_norm": 2.828125, "grad_norm_var": 0.05705973307291667, "learning_rate": 0.0001, "loss": 5.3034, "loss/crossentropy": 2.3397125005722046, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15652088820934296, "step": 26206 }, { "epoch": 0.819, "grad_norm": 2.859375, "grad_norm_var": 0.06199442545572917, "learning_rate": 0.0001, "loss": 5.4446, "loss/crossentropy": 2.4718334674835205, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1503976359963417, "step": 26208 }, { "epoch": 0.8190625, "grad_norm": 3.171875, "grad_norm_var": 0.03916015625, "learning_rate": 0.0001, "loss": 5.6137, "loss/crossentropy": 2.5755761861801147, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1600641831755638, "step": 26210 }, { "epoch": 0.819125, "grad_norm": 2.984375, "grad_norm_var": 0.038916015625, "learning_rate": 0.0001, "loss": 5.432, "loss/crossentropy": 2.433050751686096, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15575602650642395, "step": 26212 }, { "epoch": 0.8191875, "grad_norm": 3.0, "grad_norm_var": 0.023714192708333335, "learning_rate": 0.0001, "loss": 5.5737, "loss/crossentropy": 2.5103474855422974, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16297713667154312, "step": 26214 }, { "epoch": 0.81925, "grad_norm": 4.40625, "grad_norm_var": 0.14511311848958333, "learning_rate": 0.0001, "loss": 5.8492, "loss/crossentropy": 2.643450140953064, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17096712440252304, "step": 26216 }, { "epoch": 0.8193125, "grad_norm": 3.1875, "grad_norm_var": 0.13736979166666666, "learning_rate": 0.0001, "loss": 5.703, "loss/crossentropy": 2.5356518030166626, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16712727397680283, "step": 26218 }, { "epoch": 0.819375, "grad_norm": 2.765625, "grad_norm_var": 0.14334208170572918, "learning_rate": 0.0001, "loss": 5.5674, "loss/crossentropy": 2.5801234245300293, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15732340514659882, "step": 26220 }, { "epoch": 0.8194375, "grad_norm": 2.953125, "grad_norm_var": 0.27905171712239585, "learning_rate": 0.0001, "loss": 5.5043, "loss/crossentropy": 2.4334503412246704, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15903393179178238, "step": 26222 }, { "epoch": 0.8195, "grad_norm": 2.859375, "grad_norm_var": 0.27244364420572914, "learning_rate": 0.0001, "loss": 5.3334, "loss/crossentropy": 2.3872623443603516, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15008265525102615, "step": 26224 }, { "epoch": 0.8195625, "grad_norm": 3.296875, "grad_norm_var": 0.271826171875, "learning_rate": 0.0001, "loss": 5.7474, "loss/crossentropy": 2.575577139854431, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17264865338802338, "step": 26226 }, { "epoch": 0.819625, "grad_norm": 3.375, "grad_norm_var": 0.27444254557291664, "learning_rate": 0.0001, "loss": 5.469, "loss/crossentropy": 2.4319013357162476, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15761180222034454, "step": 26228 }, { "epoch": 0.8196875, "grad_norm": 3.265625, "grad_norm_var": 0.26929931640625, "learning_rate": 0.0001, "loss": 5.7769, "loss/crossentropy": 2.641359329223633, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16746356338262558, "step": 26230 }, { "epoch": 0.81975, "grad_norm": 3.0, "grad_norm_var": 0.1694976806640625, "learning_rate": 0.0001, "loss": 5.689, "loss/crossentropy": 2.6191372871398926, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1597168892621994, "step": 26232 }, { "epoch": 0.8198125, "grad_norm": 2.96875, "grad_norm_var": 0.1767730712890625, "learning_rate": 0.0001, "loss": 5.2413, "loss/crossentropy": 2.310689330101013, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15087038278579712, "step": 26234 }, { "epoch": 0.819875, "grad_norm": 3.0625, "grad_norm_var": 0.16747639973958334, "learning_rate": 0.0001, "loss": 5.6303, "loss/crossentropy": 2.503048062324524, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1658461093902588, "step": 26236 }, { "epoch": 0.8199375, "grad_norm": 2.765625, "grad_norm_var": 0.03764546712239583, "learning_rate": 0.0001, "loss": 5.5754, "loss/crossentropy": 2.5644038915634155, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15774518996477127, "step": 26238 }, { "epoch": 0.82, "grad_norm": 3.09375, "grad_norm_var": 0.03371480305989583, "learning_rate": 0.0001, "loss": 5.5648, "loss/crossentropy": 2.482118010520935, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16334488987922668, "step": 26240 }, { "epoch": 0.8200625, "grad_norm": 3.109375, "grad_norm_var": 0.033036295572916666, "learning_rate": 0.0001, "loss": 5.4303, "loss/crossentropy": 2.4456228017807007, "loss/hidden": 1.390625, "loss/jsd": 0.0, "loss/logits": 0.15940222144126892, "step": 26242 }, { "epoch": 0.820125, "grad_norm": 2.859375, "grad_norm_var": 0.030501302083333334, "learning_rate": 0.0001, "loss": 5.8435, "loss/crossentropy": 2.747655987739563, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1646673008799553, "step": 26244 }, { "epoch": 0.8201875, "grad_norm": 3.03125, "grad_norm_var": 0.030562337239583334, "learning_rate": 0.0001, "loss": 5.5265, "loss/crossentropy": 2.4170150756835938, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1629014015197754, "step": 26246 }, { "epoch": 0.82025, "grad_norm": 2.78125, "grad_norm_var": 0.0355865478515625, "learning_rate": 0.0001, "loss": 5.4901, "loss/crossentropy": 2.5667566061019897, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.15405111759901047, "step": 26248 }, { "epoch": 0.8203125, "grad_norm": 2.984375, "grad_norm_var": 0.0397857666015625, "learning_rate": 0.0001, "loss": 5.4788, "loss/crossentropy": 2.5163642168045044, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15484175086021423, "step": 26250 }, { "epoch": 0.820375, "grad_norm": 3.046875, "grad_norm_var": 0.034684244791666666, "learning_rate": 0.0001, "loss": 5.7484, "loss/crossentropy": 2.7177315950393677, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15853740274906158, "step": 26252 }, { "epoch": 0.8204375, "grad_norm": 2.828125, "grad_norm_var": 0.031298828125, "learning_rate": 0.0001, "loss": 5.4406, "loss/crossentropy": 2.4661972522735596, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15213170647621155, "step": 26254 }, { "epoch": 0.8205, "grad_norm": 2.90625, "grad_norm_var": 0.03665364583333333, "learning_rate": 0.0001, "loss": 5.4724, "loss/crossentropy": 2.4164193868637085, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16067221015691757, "step": 26256 }, { "epoch": 0.8205625, "grad_norm": 3.375, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 5.8206, "loss/crossentropy": 2.6252888441085815, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1695270612835884, "step": 26258 }, { "epoch": 0.820625, "grad_norm": 2.953125, "grad_norm_var": 0.0408599853515625, "learning_rate": 0.0001, "loss": 5.2859, "loss/crossentropy": 2.355344533920288, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14696674793958664, "step": 26260 }, { "epoch": 0.8206875, "grad_norm": 2.875, "grad_norm_var": 0.04544270833333333, "learning_rate": 0.0001, "loss": 5.494, "loss/crossentropy": 2.482057571411133, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15744779258966446, "step": 26262 }, { "epoch": 0.82075, "grad_norm": 2.953125, "grad_norm_var": 0.041380818684895834, "learning_rate": 0.0001, "loss": 5.4578, "loss/crossentropy": 2.4409984350204468, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16027343273162842, "step": 26264 }, { "epoch": 0.8208125, "grad_norm": 3.125, "grad_norm_var": 0.03461812337239583, "learning_rate": 0.0001, "loss": 5.5687, "loss/crossentropy": 2.4897130727767944, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16415006667375565, "step": 26266 }, { "epoch": 0.820875, "grad_norm": 2.875, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 5.6886, "loss/crossentropy": 2.5870476961135864, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16445458680391312, "step": 26268 }, { "epoch": 0.8209375, "grad_norm": 3.5, "grad_norm_var": 0.040038045247395834, "learning_rate": 0.0001, "loss": 5.8109, "loss/crossentropy": 2.6275731325149536, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17067311704158783, "step": 26270 }, { "epoch": 0.821, "grad_norm": 3.078125, "grad_norm_var": 0.036458333333333336, "learning_rate": 0.0001, "loss": 5.6087, "loss/crossentropy": 2.5324538946151733, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16309446096420288, "step": 26272 }, { "epoch": 0.8210625, "grad_norm": 3.078125, "grad_norm_var": 0.0330078125, "learning_rate": 0.0001, "loss": 6.1243, "loss/crossentropy": 2.8941376209259033, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1730210930109024, "step": 26274 }, { "epoch": 0.821125, "grad_norm": 3.3125, "grad_norm_var": 0.0441314697265625, "learning_rate": 0.0001, "loss": 5.3515, "loss/crossentropy": 2.359967350959778, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1499393805861473, "step": 26276 }, { "epoch": 0.8211875, "grad_norm": 2.734375, "grad_norm_var": 0.044970703125, "learning_rate": 0.0001, "loss": 5.8185, "loss/crossentropy": 2.7468066215515137, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16107572615146637, "step": 26278 }, { "epoch": 0.82125, "grad_norm": 3.125, "grad_norm_var": 0.044066365559895834, "learning_rate": 0.0001, "loss": 5.808, "loss/crossentropy": 2.70841383934021, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1634780615568161, "step": 26280 }, { "epoch": 0.8213125, "grad_norm": 2.90625, "grad_norm_var": 0.04649149576822917, "learning_rate": 0.0001, "loss": 5.2681, "loss/crossentropy": 2.286479353904724, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15206390619277954, "step": 26282 }, { "epoch": 0.821375, "grad_norm": 2.96875, "grad_norm_var": 0.04403889973958333, "learning_rate": 0.0001, "loss": 5.7495, "loss/crossentropy": 2.649899482727051, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16543245315551758, "step": 26284 }, { "epoch": 0.8214375, "grad_norm": 3.0, "grad_norm_var": 0.03518880208333333, "learning_rate": 0.0001, "loss": 5.33, "loss/crossentropy": 2.421257972717285, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.14829303324222565, "step": 26286 }, { "epoch": 0.8215, "grad_norm": 2.90625, "grad_norm_var": 0.03608296712239583, "learning_rate": 0.0001, "loss": 5.6065, "loss/crossentropy": 2.5276122093200684, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16219016909599304, "step": 26288 }, { "epoch": 0.8215625, "grad_norm": 2.84375, "grad_norm_var": 0.03323160807291667, "learning_rate": 0.0001, "loss": 5.6477, "loss/crossentropy": 2.637739658355713, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15568828582763672, "step": 26290 }, { "epoch": 0.821625, "grad_norm": 2.84375, "grad_norm_var": 0.022782389322916666, "learning_rate": 0.0001, "loss": 5.397, "loss/crossentropy": 2.4032782316207886, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15444639325141907, "step": 26292 }, { "epoch": 0.8216875, "grad_norm": 3.0625, "grad_norm_var": 0.017634073893229168, "learning_rate": 0.0001, "loss": 5.5938, "loss/crossentropy": 2.527271032333374, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16055792570114136, "step": 26294 }, { "epoch": 0.82175, "grad_norm": 3.109375, "grad_norm_var": 0.023958333333333335, "learning_rate": 0.0001, "loss": 5.5984, "loss/crossentropy": 2.5177271366119385, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16041366010904312, "step": 26296 }, { "epoch": 0.8218125, "grad_norm": 3.078125, "grad_norm_var": 0.021808878580729166, "learning_rate": 0.0001, "loss": 5.6578, "loss/crossentropy": 2.5493358373641968, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16397219896316528, "step": 26298 }, { "epoch": 0.821875, "grad_norm": 2.953125, "grad_norm_var": 0.015355428059895834, "learning_rate": 0.0001, "loss": 5.6355, "loss/crossentropy": 2.5864739418029785, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15919510275125504, "step": 26300 }, { "epoch": 0.8219375, "grad_norm": 2.953125, "grad_norm_var": 0.017317708333333334, "learning_rate": 0.0001, "loss": 5.4856, "loss/crossentropy": 2.3710405826568604, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16263197362422943, "step": 26302 }, { "epoch": 0.822, "grad_norm": 3.421875, "grad_norm_var": 0.027595011393229167, "learning_rate": 0.0001, "loss": 5.6385, "loss/crossentropy": 2.4434726238250732, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17184783518314362, "step": 26304 }, { "epoch": 0.8220625, "grad_norm": 3.4375, "grad_norm_var": 0.05213114420572917, "learning_rate": 0.0001, "loss": 6.1962, "loss/crossentropy": 2.876863956451416, "loss/hidden": 1.53125, "loss/jsd": 0.0, "loss/logits": 0.17880384624004364, "step": 26306 }, { "epoch": 0.822125, "grad_norm": 3.203125, "grad_norm_var": 0.04628804524739583, "learning_rate": 0.0001, "loss": 5.8211, "loss/crossentropy": 2.7134053707122803, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1658443659543991, "step": 26308 }, { "epoch": 0.8221875, "grad_norm": 3.25, "grad_norm_var": 0.04149983723958333, "learning_rate": 0.0001, "loss": 5.7838, "loss/crossentropy": 2.677572727203369, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16062747687101364, "step": 26310 }, { "epoch": 0.82225, "grad_norm": 3.15625, "grad_norm_var": 0.04345296223958333, "learning_rate": 0.0001, "loss": 5.3967, "loss/crossentropy": 2.3640542030334473, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15443439036607742, "step": 26312 }, { "epoch": 0.8223125, "grad_norm": 3.0625, "grad_norm_var": 0.04263916015625, "learning_rate": 0.0001, "loss": 5.7215, "loss/crossentropy": 2.6260221004486084, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1626776158809662, "step": 26314 }, { "epoch": 0.822375, "grad_norm": 3.171875, "grad_norm_var": 0.0373046875, "learning_rate": 0.0001, "loss": 5.8018, "loss/crossentropy": 2.510248303413391, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17876074463129044, "step": 26316 }, { "epoch": 0.8224375, "grad_norm": 2.78125, "grad_norm_var": 0.04501953125, "learning_rate": 0.0001, "loss": 5.2927, "loss/crossentropy": 2.311138153076172, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1536274254322052, "step": 26318 }, { "epoch": 0.8225, "grad_norm": 3.21875, "grad_norm_var": 0.041747029622395834, "learning_rate": 0.0001, "loss": 5.9675, "loss/crossentropy": 2.7409067153930664, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17421995103359222, "step": 26320 }, { "epoch": 0.8225625, "grad_norm": 3.25, "grad_norm_var": 0.01923828125, "learning_rate": 0.0001, "loss": 5.4253, "loss/crossentropy": 2.4497876167297363, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1537996083498001, "step": 26322 }, { "epoch": 0.822625, "grad_norm": 2.75, "grad_norm_var": 0.027730305989583332, "learning_rate": 0.0001, "loss": 5.7434, "loss/crossentropy": 2.674960494041443, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.162706196308136, "step": 26324 }, { "epoch": 0.8226875, "grad_norm": 3.5625, "grad_norm_var": 0.04254150390625, "learning_rate": 0.0001, "loss": 5.6512, "loss/crossentropy": 2.4605530500411987, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17062649875879288, "step": 26326 }, { "epoch": 0.82275, "grad_norm": 2.984375, "grad_norm_var": 0.04202067057291667, "learning_rate": 0.0001, "loss": 5.3649, "loss/crossentropy": 2.400713562965393, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15189073979854584, "step": 26328 }, { "epoch": 0.8228125, "grad_norm": 3.09375, "grad_norm_var": 0.052144368489583336, "learning_rate": 0.0001, "loss": 5.4342, "loss/crossentropy": 2.393681764602661, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15756287425756454, "step": 26330 }, { "epoch": 0.822875, "grad_norm": 3.078125, "grad_norm_var": 0.0521148681640625, "learning_rate": 0.0001, "loss": 5.6294, "loss/crossentropy": 2.5355751514434814, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16250969469547272, "step": 26332 }, { "epoch": 0.8229375, "grad_norm": 3.171875, "grad_norm_var": 0.043553670247395836, "learning_rate": 0.0001, "loss": 5.8098, "loss/crossentropy": 2.636785864830017, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1688680425286293, "step": 26334 }, { "epoch": 0.823, "grad_norm": 3.03125, "grad_norm_var": 0.04490458170572917, "learning_rate": 0.0001, "loss": 5.359, "loss/crossentropy": 2.3224642276763916, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15794599056243896, "step": 26336 }, { "epoch": 0.8230625, "grad_norm": 3.140625, "grad_norm_var": 0.044977823893229164, "learning_rate": 0.0001, "loss": 5.5461, "loss/crossentropy": 2.468558430671692, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16048458218574524, "step": 26338 }, { "epoch": 0.823125, "grad_norm": 3.03125, "grad_norm_var": 0.03583577473958333, "learning_rate": 0.0001, "loss": 5.6669, "loss/crossentropy": 2.578675150871277, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16273091733455658, "step": 26340 }, { "epoch": 0.8231875, "grad_norm": 3.109375, "grad_norm_var": 0.021415201822916667, "learning_rate": 0.0001, "loss": 5.6098, "loss/crossentropy": 2.4914400577545166, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1610499694943428, "step": 26342 }, { "epoch": 0.82325, "grad_norm": 3.140625, "grad_norm_var": 0.019071451822916665, "learning_rate": 0.0001, "loss": 5.7348, "loss/crossentropy": 2.608322024345398, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16342993825674057, "step": 26344 }, { "epoch": 0.8233125, "grad_norm": 3.125, "grad_norm_var": 0.008837890625, "learning_rate": 0.0001, "loss": 5.6239, "loss/crossentropy": 2.4591116905212402, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16999010741710663, "step": 26346 }, { "epoch": 0.823375, "grad_norm": 3.234375, "grad_norm_var": 0.00728759765625, "learning_rate": 0.0001, "loss": 5.8587, "loss/crossentropy": 2.7089649438858032, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16809836775064468, "step": 26348 }, { "epoch": 0.8234375, "grad_norm": 3.03125, "grad_norm_var": 0.0187408447265625, "learning_rate": 0.0001, "loss": 6.0662, "loss/crossentropy": 2.854896903038025, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17152578383684158, "step": 26350 }, { "epoch": 0.8235, "grad_norm": 3.328125, "grad_norm_var": 0.018876139322916666, "learning_rate": 0.0001, "loss": 5.9049, "loss/crossentropy": 2.756648540496826, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16951383650302887, "step": 26352 }, { "epoch": 0.8235625, "grad_norm": 3.265625, "grad_norm_var": 0.01802978515625, "learning_rate": 0.0001, "loss": 5.9413, "loss/crossentropy": 2.757227897644043, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16880160570144653, "step": 26354 }, { "epoch": 0.823625, "grad_norm": 3.21875, "grad_norm_var": 0.016650390625, "learning_rate": 0.0001, "loss": 5.5885, "loss/crossentropy": 2.4658830165863037, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16460354626178741, "step": 26356 }, { "epoch": 0.8236875, "grad_norm": 2.734375, "grad_norm_var": 0.0368316650390625, "learning_rate": 0.0001, "loss": 5.0838, "loss/crossentropy": 2.2466323375701904, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.14387667924165726, "step": 26358 }, { "epoch": 0.82375, "grad_norm": 3.3125, "grad_norm_var": 0.04722900390625, "learning_rate": 0.0001, "loss": 5.7701, "loss/crossentropy": 2.6672489643096924, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16848907619714737, "step": 26360 }, { "epoch": 0.8238125, "grad_norm": 6.21875, "grad_norm_var": 0.64293212890625, "learning_rate": 0.0001, "loss": 5.5528, "loss/crossentropy": 2.5347325801849365, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1580551490187645, "step": 26362 }, { "epoch": 0.823875, "grad_norm": 2.765625, "grad_norm_var": 0.6656565348307292, "learning_rate": 0.0001, "loss": 5.488, "loss/crossentropy": 2.4814869165420532, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1557292342185974, "step": 26364 }, { "epoch": 0.8239375, "grad_norm": 2.96875, "grad_norm_var": 0.6638631184895833, "learning_rate": 0.0001, "loss": 5.6194, "loss/crossentropy": 2.56308913230896, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16187691688537598, "step": 26366 }, { "epoch": 0.824, "grad_norm": 3.234375, "grad_norm_var": 0.6730784098307292, "learning_rate": 0.0001, "loss": 5.5467, "loss/crossentropy": 2.48854660987854, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16323573142290115, "step": 26368 }, { "epoch": 0.8240625, "grad_norm": 3.171875, "grad_norm_var": 0.6733306884765625, "learning_rate": 0.0001, "loss": 5.8946, "loss/crossentropy": 2.660510301589966, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17379587143659592, "step": 26370 }, { "epoch": 0.824125, "grad_norm": 2.921875, "grad_norm_var": 0.6805948893229167, "learning_rate": 0.0001, "loss": 5.6283, "loss/crossentropy": 2.585998773574829, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1597011610865593, "step": 26372 }, { "epoch": 0.8241875, "grad_norm": 3.109375, "grad_norm_var": 0.6638010660807292, "learning_rate": 0.0001, "loss": 5.731, "loss/crossentropy": 2.639747738838196, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16459402441978455, "step": 26374 }, { "epoch": 0.82425, "grad_norm": 3.15625, "grad_norm_var": 0.6514719645182292, "learning_rate": 0.0001, "loss": 5.5863, "loss/crossentropy": 2.5839895009994507, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15882544219493866, "step": 26376 }, { "epoch": 0.8243125, "grad_norm": 2.734375, "grad_norm_var": 0.02548828125, "learning_rate": 0.0001, "loss": 5.4645, "loss/crossentropy": 2.4918004274368286, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1566426083445549, "step": 26378 }, { "epoch": 0.824375, "grad_norm": 2.84375, "grad_norm_var": 0.023697916666666666, "learning_rate": 0.0001, "loss": 5.4738, "loss/crossentropy": 2.4789711236953735, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15729864686727524, "step": 26380 }, { "epoch": 0.8244375, "grad_norm": 2.703125, "grad_norm_var": 0.029899088541666667, "learning_rate": 0.0001, "loss": 5.4287, "loss/crossentropy": 2.518850564956665, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.14879833906888962, "step": 26382 }, { "epoch": 0.8245, "grad_norm": 3.09375, "grad_norm_var": 0.10966389973958333, "learning_rate": 0.0001, "loss": 5.7521, "loss/crossentropy": 2.5595513582229614, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.16456907987594604, "step": 26384 }, { "epoch": 0.8245625, "grad_norm": 3.390625, "grad_norm_var": 0.1145660400390625, "learning_rate": 0.0001, "loss": 6.0436, "loss/crossentropy": 2.7219513654708862, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.18020762503147125, "step": 26386 }, { "epoch": 0.824625, "grad_norm": 3.0625, "grad_norm_var": 0.11431376139322917, "learning_rate": 0.0001, "loss": 5.7164, "loss/crossentropy": 2.6323970556259155, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16113485395908356, "step": 26388 }, { "epoch": 0.8246875, "grad_norm": 2.984375, "grad_norm_var": 0.11272379557291666, "learning_rate": 0.0001, "loss": 5.6277, "loss/crossentropy": 2.5497812032699585, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16013897210359573, "step": 26390 }, { "epoch": 0.82475, "grad_norm": 3.046875, "grad_norm_var": 0.11663004557291666, "learning_rate": 0.0001, "loss": 5.6706, "loss/crossentropy": 2.611387848854065, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16022178530693054, "step": 26392 }, { "epoch": 0.8248125, "grad_norm": 3.0625, "grad_norm_var": 0.11233317057291667, "learning_rate": 0.0001, "loss": 5.9252, "loss/crossentropy": 2.758017063140869, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17101654410362244, "step": 26394 }, { "epoch": 0.824875, "grad_norm": 3.078125, "grad_norm_var": 0.10614827473958334, "learning_rate": 0.0001, "loss": 5.4983, "loss/crossentropy": 2.471684217453003, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1612573266029358, "step": 26396 }, { "epoch": 0.8249375, "grad_norm": 3.515625, "grad_norm_var": 0.10191650390625, "learning_rate": 0.0001, "loss": 5.9139, "loss/crossentropy": 2.7498886585235596, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1667894497513771, "step": 26398 }, { "epoch": 0.825, "grad_norm": 3.15625, "grad_norm_var": 0.033426920572916664, "learning_rate": 0.0001, "loss": 5.7226, "loss/crossentropy": 2.6924824714660645, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15847747027873993, "step": 26400 }, { "epoch": 0.8250625, "grad_norm": 2.921875, "grad_norm_var": 0.031305948893229164, "learning_rate": 0.0001, "loss": 5.6605, "loss/crossentropy": 2.630774140357971, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.16235052794218063, "step": 26402 }, { "epoch": 0.825125, "grad_norm": 7.4375, "grad_norm_var": 1.2342274983723958, "learning_rate": 0.0001, "loss": 5.7267, "loss/crossentropy": 2.5541563034057617, "loss/hidden": 1.5546875, "loss/jsd": 0.0, "loss/logits": 0.1617891490459442, "step": 26404 }, { "epoch": 0.8251875, "grad_norm": 2.9375, "grad_norm_var": 1.2349680582682292, "learning_rate": 0.0001, "loss": 5.9378, "loss/crossentropy": 2.739465355873108, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1733456403017044, "step": 26406 }, { "epoch": 0.82525, "grad_norm": 3.546875, "grad_norm_var": 1.213451131184896, "learning_rate": 0.0001, "loss": 5.4925, "loss/crossentropy": 2.345843553543091, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16466321051120758, "step": 26408 }, { "epoch": 0.8253125, "grad_norm": 2.984375, "grad_norm_var": 1.2509185791015625, "learning_rate": 0.0001, "loss": 5.3811, "loss/crossentropy": 2.4296364784240723, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1529592126607895, "step": 26410 }, { "epoch": 0.825375, "grad_norm": 2.828125, "grad_norm_var": 1.2613922119140626, "learning_rate": 0.0001, "loss": 5.5242, "loss/crossentropy": 2.5729185342788696, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15450449287891388, "step": 26412 }, { "epoch": 0.8254375, "grad_norm": 2.96875, "grad_norm_var": 1.2567708333333334, "learning_rate": 0.0001, "loss": 5.7273, "loss/crossentropy": 2.608040928840637, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16387560963630676, "step": 26414 }, { "epoch": 0.8255, "grad_norm": 2.96875, "grad_norm_var": 1.260326131184896, "learning_rate": 0.0001, "loss": 5.8647, "loss/crossentropy": 2.699707269668579, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1661122441291809, "step": 26416 }, { "epoch": 0.8255625, "grad_norm": 3.015625, "grad_norm_var": 1.2520172119140625, "learning_rate": 0.0001, "loss": 5.7293, "loss/crossentropy": 2.646604895591736, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16413111239671707, "step": 26418 }, { "epoch": 0.825625, "grad_norm": 3.296875, "grad_norm_var": 0.05634765625, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.508752465248108, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1710352897644043, "step": 26420 }, { "epoch": 0.8256875, "grad_norm": 3.015625, "grad_norm_var": 0.04879557291666667, "learning_rate": 0.0001, "loss": 5.6641, "loss/crossentropy": 2.589567542076111, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16136162728071213, "step": 26422 }, { "epoch": 0.82575, "grad_norm": 3.71875, "grad_norm_var": 0.06659749348958334, "learning_rate": 0.0001, "loss": 5.7171, "loss/crossentropy": 2.5890969038009644, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16514231264591217, "step": 26424 }, { "epoch": 0.8258125, "grad_norm": 3.234375, "grad_norm_var": 0.0551177978515625, "learning_rate": 0.0001, "loss": 5.5481, "loss/crossentropy": 2.4240739345550537, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1623988151550293, "step": 26426 }, { "epoch": 0.825875, "grad_norm": 2.859375, "grad_norm_var": 0.056916300455729166, "learning_rate": 0.0001, "loss": 5.7019, "loss/crossentropy": 2.5907323360443115, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16307149827480316, "step": 26428 }, { "epoch": 0.8259375, "grad_norm": 3.609375, "grad_norm_var": 0.06676025390625, "learning_rate": 0.0001, "loss": 5.8064, "loss/crossentropy": 2.57390820980072, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.16934466361999512, "step": 26430 }, { "epoch": 0.826, "grad_norm": 3.125, "grad_norm_var": 0.07798563639322917, "learning_rate": 0.0001, "loss": 5.3473, "loss/crossentropy": 2.319072723388672, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1563357710838318, "step": 26432 }, { "epoch": 0.8260625, "grad_norm": 3.46875, "grad_norm_var": 0.07955729166666667, "learning_rate": 0.0001, "loss": 5.9472, "loss/crossentropy": 2.8393198251724243, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16117888689041138, "step": 26434 }, { "epoch": 0.826125, "grad_norm": 3.25, "grad_norm_var": 0.0787506103515625, "learning_rate": 0.0001, "loss": 5.5096, "loss/crossentropy": 2.4227101802825928, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16493766009807587, "step": 26436 }, { "epoch": 0.8261875, "grad_norm": 3.03125, "grad_norm_var": 0.0774078369140625, "learning_rate": 0.0001, "loss": 5.612, "loss/crossentropy": 2.5441017150878906, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16030529141426086, "step": 26438 }, { "epoch": 0.82625, "grad_norm": 3.28125, "grad_norm_var": 0.05364583333333333, "learning_rate": 0.0001, "loss": 5.8929, "loss/crossentropy": 2.7285382747650146, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17034100741147995, "step": 26440 }, { "epoch": 0.8263125, "grad_norm": 3.234375, "grad_norm_var": 0.05396728515625, "learning_rate": 0.0001, "loss": 5.9828, "loss/crossentropy": 2.851389527320862, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1643085852265358, "step": 26442 }, { "epoch": 0.826375, "grad_norm": 3.0, "grad_norm_var": 0.04797770182291667, "learning_rate": 0.0001, "loss": 5.7981, "loss/crossentropy": 2.68903386592865, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1667705997824669, "step": 26444 }, { "epoch": 0.8264375, "grad_norm": 2.9375, "grad_norm_var": 0.04982808430989583, "learning_rate": 0.0001, "loss": 5.9516, "loss/crossentropy": 2.7996885776519775, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16909538954496384, "step": 26446 }, { "epoch": 0.8265, "grad_norm": 3.375, "grad_norm_var": 0.04019775390625, "learning_rate": 0.0001, "loss": 5.7933, "loss/crossentropy": 2.629693865776062, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16987202316522598, "step": 26448 }, { "epoch": 0.8265625, "grad_norm": 3.140625, "grad_norm_var": 0.033589680989583336, "learning_rate": 0.0001, "loss": 5.864, "loss/crossentropy": 2.723311424255371, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16797932237386703, "step": 26450 }, { "epoch": 0.826625, "grad_norm": 3.03125, "grad_norm_var": 0.033984375, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.5940492153167725, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1576242372393608, "step": 26452 }, { "epoch": 0.8266875, "grad_norm": 3.21875, "grad_norm_var": 0.03339436848958333, "learning_rate": 0.0001, "loss": 5.561, "loss/crossentropy": 2.5500476360321045, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15695472061634064, "step": 26454 }, { "epoch": 0.82675, "grad_norm": 3.015625, "grad_norm_var": 0.027978515625, "learning_rate": 0.0001, "loss": 5.3775, "loss/crossentropy": 2.3505324125289917, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15621038526296616, "step": 26456 }, { "epoch": 0.8268125, "grad_norm": 3.234375, "grad_norm_var": 0.029206339518229166, "learning_rate": 0.0001, "loss": 6.0761, "loss/crossentropy": 2.8190720081329346, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17570750415325165, "step": 26458 }, { "epoch": 0.826875, "grad_norm": 3.0, "grad_norm_var": 0.028620402018229168, "learning_rate": 0.0001, "loss": 5.5801, "loss/crossentropy": 2.554157018661499, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15806709975004196, "step": 26460 }, { "epoch": 0.8269375, "grad_norm": 2.96875, "grad_norm_var": 0.015184529622395833, "learning_rate": 0.0001, "loss": 5.7697, "loss/crossentropy": 2.625096917152405, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16876161843538284, "step": 26462 }, { "epoch": 0.827, "grad_norm": 3.140625, "grad_norm_var": 0.013916015625, "learning_rate": 0.0001, "loss": 5.8531, "loss/crossentropy": 2.731651186943054, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16605433821678162, "step": 26464 }, { "epoch": 0.8270625, "grad_norm": 3.234375, "grad_norm_var": 0.014518229166666667, "learning_rate": 0.0001, "loss": 5.939, "loss/crossentropy": 2.704955220222473, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17418386042118073, "step": 26466 }, { "epoch": 0.827125, "grad_norm": 2.953125, "grad_norm_var": 0.017699178059895834, "learning_rate": 0.0001, "loss": 5.872, "loss/crossentropy": 2.667886972427368, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17041241377592087, "step": 26468 }, { "epoch": 0.8271875, "grad_norm": 2.921875, "grad_norm_var": 0.01943359375, "learning_rate": 0.0001, "loss": 5.6555, "loss/crossentropy": 2.6146072149276733, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16034063696861267, "step": 26470 }, { "epoch": 0.82725, "grad_norm": 2.90625, "grad_norm_var": 0.0251373291015625, "learning_rate": 0.0001, "loss": 5.3293, "loss/crossentropy": 2.3477360010147095, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15401406586170197, "step": 26472 }, { "epoch": 0.8273125, "grad_norm": 3.359375, "grad_norm_var": 0.0262603759765625, "learning_rate": 0.0001, "loss": 5.5267, "loss/crossentropy": 2.5378081798553467, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15318354219198227, "step": 26474 }, { "epoch": 0.827375, "grad_norm": 2.890625, "grad_norm_var": 0.027765909830729168, "learning_rate": 0.0001, "loss": 5.2999, "loss/crossentropy": 2.333868145942688, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1520739421248436, "step": 26476 }, { "epoch": 0.8274375, "grad_norm": 3.328125, "grad_norm_var": 0.0291900634765625, "learning_rate": 0.0001, "loss": 5.9343, "loss/crossentropy": 2.6245245933532715, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.17707359045743942, "step": 26478 }, { "epoch": 0.8275, "grad_norm": 3.1875, "grad_norm_var": 0.03076171875, "learning_rate": 0.0001, "loss": 5.7936, "loss/crossentropy": 2.589395761489868, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1704207882285118, "step": 26480 }, { "epoch": 0.8275625, "grad_norm": 3.265625, "grad_norm_var": 0.03173828125, "learning_rate": 0.0001, "loss": 5.6295, "loss/crossentropy": 2.4367631673812866, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17122900485992432, "step": 26482 }, { "epoch": 0.827625, "grad_norm": 2.9375, "grad_norm_var": 0.03654683430989583, "learning_rate": 0.0001, "loss": 5.1427, "loss/crossentropy": 2.2844330072402954, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.1471521183848381, "step": 26484 }, { "epoch": 0.8276875, "grad_norm": 2.9375, "grad_norm_var": 0.036031087239583336, "learning_rate": 0.0001, "loss": 5.818, "loss/crossentropy": 2.7138901948928833, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1666596755385399, "step": 26486 }, { "epoch": 0.82775, "grad_norm": 3.25, "grad_norm_var": 0.0419921875, "learning_rate": 0.0001, "loss": 5.8464, "loss/crossentropy": 2.6777409315109253, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16608533263206482, "step": 26488 }, { "epoch": 0.8278125, "grad_norm": 3.265625, "grad_norm_var": 0.0384429931640625, "learning_rate": 0.0001, "loss": 5.7181, "loss/crossentropy": 2.6251548528671265, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16398146003484726, "step": 26490 }, { "epoch": 0.827875, "grad_norm": 2.859375, "grad_norm_var": 0.04639383951822917, "learning_rate": 0.0001, "loss": 5.3811, "loss/crossentropy": 2.5030300617218018, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14717745035886765, "step": 26492 }, { "epoch": 0.8279375, "grad_norm": 3.421875, "grad_norm_var": 0.056396484375, "learning_rate": 0.0001, "loss": 5.5396, "loss/crossentropy": 2.4696165323257446, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16090339422225952, "step": 26494 }, { "epoch": 0.828, "grad_norm": 2.8125, "grad_norm_var": 0.0555572509765625, "learning_rate": 0.0001, "loss": 5.5684, "loss/crossentropy": 2.5611249208450317, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15658611059188843, "step": 26496 }, { "epoch": 0.8280625, "grad_norm": 2.96875, "grad_norm_var": 0.061962890625, "learning_rate": 0.0001, "loss": 5.8567, "loss/crossentropy": 2.6480154991149902, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17243517935276031, "step": 26498 }, { "epoch": 0.828125, "grad_norm": 2.890625, "grad_norm_var": 0.05626627604166667, "learning_rate": 0.0001, "loss": 5.4199, "loss/crossentropy": 2.3727545738220215, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16252897679805756, "step": 26500 }, { "epoch": 0.8281875, "grad_norm": 2.9375, "grad_norm_var": 0.057306925455729164, "learning_rate": 0.0001, "loss": 5.6952, "loss/crossentropy": 2.6760060787200928, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15973235666751862, "step": 26502 }, { "epoch": 0.82825, "grad_norm": 3.0625, "grad_norm_var": 0.041901652018229166, "learning_rate": 0.0001, "loss": 5.6175, "loss/crossentropy": 2.581863760948181, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16059322655200958, "step": 26504 }, { "epoch": 0.8283125, "grad_norm": 2.859375, "grad_norm_var": 0.03951822916666667, "learning_rate": 0.0001, "loss": 5.4037, "loss/crossentropy": 2.4558279514312744, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15299250930547714, "step": 26506 }, { "epoch": 0.828375, "grad_norm": 3.15625, "grad_norm_var": 0.04651692708333333, "learning_rate": 0.0001, "loss": 5.2724, "loss/crossentropy": 2.3377411365509033, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15206074714660645, "step": 26508 }, { "epoch": 0.8284375, "grad_norm": 2.921875, "grad_norm_var": 0.0355621337890625, "learning_rate": 0.0001, "loss": 5.6686, "loss/crossentropy": 2.570454478263855, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.165279820561409, "step": 26510 }, { "epoch": 0.8285, "grad_norm": 2.578125, "grad_norm_var": 0.044287109375, "learning_rate": 0.0001, "loss": 5.2783, "loss/crossentropy": 2.3998725414276123, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1444854512810707, "step": 26512 }, { "epoch": 0.8285625, "grad_norm": 3.28125, "grad_norm_var": 0.0391754150390625, "learning_rate": 0.0001, "loss": 5.4749, "loss/crossentropy": 2.476272702217102, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.156505286693573, "step": 26514 }, { "epoch": 0.828625, "grad_norm": 3.296875, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 5.5233, "loss/crossentropy": 2.439882516860962, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1614651456475258, "step": 26516 }, { "epoch": 0.8286875, "grad_norm": 3.03125, "grad_norm_var": 0.04641011555989583, "learning_rate": 0.0001, "loss": 5.5137, "loss/crossentropy": 2.5422792434692383, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15534193813800812, "step": 26518 }, { "epoch": 0.82875, "grad_norm": 3.703125, "grad_norm_var": 0.07837626139322916, "learning_rate": 0.0001, "loss": 5.7862, "loss/crossentropy": 2.646515369415283, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1643586978316307, "step": 26520 }, { "epoch": 0.8288125, "grad_norm": 3.109375, "grad_norm_var": 0.0773101806640625, "learning_rate": 0.0001, "loss": 5.5665, "loss/crossentropy": 2.519219160079956, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15824301540851593, "step": 26522 }, { "epoch": 0.828875, "grad_norm": 3.03125, "grad_norm_var": 0.06526590983072916, "learning_rate": 0.0001, "loss": 5.5823, "loss/crossentropy": 2.5451369285583496, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15996336936950684, "step": 26524 }, { "epoch": 0.8289375, "grad_norm": 2.90625, "grad_norm_var": 0.0631988525390625, "learning_rate": 0.0001, "loss": 5.2962, "loss/crossentropy": 2.3923909664154053, "loss/hidden": 1.390625, "loss/jsd": 0.0, "loss/logits": 0.15131926536560059, "step": 26526 }, { "epoch": 0.829, "grad_norm": 3.265625, "grad_norm_var": 0.05148111979166667, "learning_rate": 0.0001, "loss": 5.4582, "loss/crossentropy": 2.4119802713394165, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1597050204873085, "step": 26528 }, { "epoch": 0.8290625, "grad_norm": 2.984375, "grad_norm_var": 0.044482421875, "learning_rate": 0.0001, "loss": 5.573, "loss/crossentropy": 2.570006012916565, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15811413526535034, "step": 26530 }, { "epoch": 0.829125, "grad_norm": 3.078125, "grad_norm_var": 0.04120686848958333, "learning_rate": 0.0001, "loss": 5.5177, "loss/crossentropy": 2.4561526775360107, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15615782141685486, "step": 26532 }, { "epoch": 0.8291875, "grad_norm": 3.140625, "grad_norm_var": 0.040339152018229164, "learning_rate": 0.0001, "loss": 5.7838, "loss/crossentropy": 2.690218448638916, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16483154892921448, "step": 26534 }, { "epoch": 0.82925, "grad_norm": 3.125, "grad_norm_var": 0.014069620768229167, "learning_rate": 0.0001, "loss": 5.6097, "loss/crossentropy": 2.4879987239837646, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.164124995470047, "step": 26536 }, { "epoch": 0.8293125, "grad_norm": 2.890625, "grad_norm_var": 0.0142730712890625, "learning_rate": 0.0001, "loss": 5.7149, "loss/crossentropy": 2.6120272874832153, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16809770464897156, "step": 26538 }, { "epoch": 0.829375, "grad_norm": 3.046875, "grad_norm_var": 0.016402180989583334, "learning_rate": 0.0001, "loss": 5.7451, "loss/crossentropy": 2.646128296852112, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16497544944286346, "step": 26540 }, { "epoch": 0.8294375, "grad_norm": 2.90625, "grad_norm_var": 0.016097005208333334, "learning_rate": 0.0001, "loss": 5.439, "loss/crossentropy": 2.3879107236862183, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16096339374780655, "step": 26542 }, { "epoch": 0.8295, "grad_norm": 2.921875, "grad_norm_var": 0.015949503580729166, "learning_rate": 0.0001, "loss": 5.8367, "loss/crossentropy": 2.6953009366989136, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16687103360891342, "step": 26544 }, { "epoch": 0.8295625, "grad_norm": 3.03125, "grad_norm_var": 0.014142862955729167, "learning_rate": 0.0001, "loss": 5.7148, "loss/crossentropy": 2.690555214881897, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16102024912834167, "step": 26546 }, { "epoch": 0.829625, "grad_norm": 2.90625, "grad_norm_var": 0.0148590087890625, "learning_rate": 0.0001, "loss": 5.6024, "loss/crossentropy": 2.5594701766967773, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16014951467514038, "step": 26548 }, { "epoch": 0.8296875, "grad_norm": 3.0, "grad_norm_var": 0.011751302083333333, "learning_rate": 0.0001, "loss": 5.3773, "loss/crossentropy": 2.4803178310394287, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.14829088747501373, "step": 26550 }, { "epoch": 0.82975, "grad_norm": 2.890625, "grad_norm_var": 0.013231404622395833, "learning_rate": 0.0001, "loss": 5.7185, "loss/crossentropy": 2.6627864837646484, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16221345216035843, "step": 26552 }, { "epoch": 0.8298125, "grad_norm": 3.078125, "grad_norm_var": 0.012238566080729167, "learning_rate": 0.0001, "loss": 5.7239, "loss/crossentropy": 2.6020004749298096, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16453495621681213, "step": 26554 }, { "epoch": 0.829875, "grad_norm": 3.109375, "grad_norm_var": 0.013395182291666667, "learning_rate": 0.0001, "loss": 5.7971, "loss/crossentropy": 2.7000378370285034, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16322240978479385, "step": 26556 }, { "epoch": 0.8299375, "grad_norm": 2.8125, "grad_norm_var": 0.015494791666666667, "learning_rate": 0.0001, "loss": 5.3251, "loss/crossentropy": 2.398852825164795, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15199558436870575, "step": 26558 }, { "epoch": 0.83, "grad_norm": 2.78125, "grad_norm_var": 0.014449055989583333, "learning_rate": 0.0001, "loss": 5.5648, "loss/crossentropy": 2.6085457801818848, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15422050654888153, "step": 26560 }, { "epoch": 0.8300625, "grad_norm": 3.171875, "grad_norm_var": 0.015946451822916666, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.5029183626174927, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16790633648633957, "step": 26562 }, { "epoch": 0.830125, "grad_norm": 3.328125, "grad_norm_var": 0.03160807291666667, "learning_rate": 0.0001, "loss": 5.4127, "loss/crossentropy": 2.3239113092422485, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16278766095638275, "step": 26564 }, { "epoch": 0.8301875, "grad_norm": 2.921875, "grad_norm_var": 0.03394775390625, "learning_rate": 0.0001, "loss": 5.442, "loss/crossentropy": 2.4539923667907715, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15426688641309738, "step": 26566 }, { "epoch": 0.83025, "grad_norm": 2.8125, "grad_norm_var": 0.035380045572916664, "learning_rate": 0.0001, "loss": 5.429, "loss/crossentropy": 2.4697686433792114, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15373125672340393, "step": 26568 }, { "epoch": 0.8303125, "grad_norm": 3.09375, "grad_norm_var": 0.03853759765625, "learning_rate": 0.0001, "loss": 5.6651, "loss/crossentropy": 2.653131365776062, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.16135179996490479, "step": 26570 }, { "epoch": 0.830375, "grad_norm": 2.890625, "grad_norm_var": 0.041422526041666664, "learning_rate": 0.0001, "loss": 5.437, "loss/crossentropy": 2.4077842235565186, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16033972054719925, "step": 26572 }, { "epoch": 0.8304375, "grad_norm": 3.265625, "grad_norm_var": 0.062352498372395836, "learning_rate": 0.0001, "loss": 6.1193, "loss/crossentropy": 2.7360039949417114, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18755102157592773, "step": 26574 }, { "epoch": 0.8305, "grad_norm": 3.21875, "grad_norm_var": 0.056029256184895834, "learning_rate": 0.0001, "loss": 5.4868, "loss/crossentropy": 2.4786585569381714, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15784411132335663, "step": 26576 }, { "epoch": 0.8305625, "grad_norm": 3.390625, "grad_norm_var": 0.0599761962890625, "learning_rate": 0.0001, "loss": 5.7899, "loss/crossentropy": 2.640892744064331, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.17076198756694794, "step": 26578 }, { "epoch": 0.830625, "grad_norm": 3.140625, "grad_norm_var": 0.05230712890625, "learning_rate": 0.0001, "loss": 5.9349, "loss/crossentropy": 2.810156226158142, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16364441066980362, "step": 26580 }, { "epoch": 0.8306875, "grad_norm": 2.9375, "grad_norm_var": 0.04895426432291667, "learning_rate": 0.0001, "loss": 5.5483, "loss/crossentropy": 2.5238953828811646, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15713290870189667, "step": 26582 }, { "epoch": 0.83075, "grad_norm": 2.921875, "grad_norm_var": 0.04453125, "learning_rate": 0.0001, "loss": 5.0923, "loss/crossentropy": 2.2778791189193726, "loss/hidden": 1.34375, "loss/jsd": 0.0, "loss/logits": 0.14706549048423767, "step": 26584 }, { "epoch": 0.8308125, "grad_norm": 3.046875, "grad_norm_var": 0.03824462890625, "learning_rate": 0.0001, "loss": 5.9693, "loss/crossentropy": 2.776100277900696, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1708783060312271, "step": 26586 }, { "epoch": 0.830875, "grad_norm": 3.0625, "grad_norm_var": 0.034130859375, "learning_rate": 0.0001, "loss": 5.5534, "loss/crossentropy": 2.4902278184890747, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15904773771762848, "step": 26588 }, { "epoch": 0.8309375, "grad_norm": 3.53125, "grad_norm_var": 0.04877827962239583, "learning_rate": 0.0001, "loss": 5.3848, "loss/crossentropy": 2.412701368331909, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1515059694647789, "step": 26590 }, { "epoch": 0.831, "grad_norm": 3.171875, "grad_norm_var": 0.04833577473958333, "learning_rate": 0.0001, "loss": 5.7852, "loss/crossentropy": 2.6917717456817627, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16129279136657715, "step": 26592 }, { "epoch": 0.8310625, "grad_norm": 2.6875, "grad_norm_var": 0.06445210774739583, "learning_rate": 0.0001, "loss": 5.378, "loss/crossentropy": 2.3800243139266968, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15644137561321259, "step": 26594 }, { "epoch": 0.831125, "grad_norm": 3.28125, "grad_norm_var": 0.06321614583333333, "learning_rate": 0.0001, "loss": 5.6866, "loss/crossentropy": 2.6062453985214233, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16350360214710236, "step": 26596 }, { "epoch": 0.8311875, "grad_norm": 3.015625, "grad_norm_var": 0.0645660400390625, "learning_rate": 0.0001, "loss": 5.4337, "loss/crossentropy": 2.4476349353790283, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15368564426898956, "step": 26598 }, { "epoch": 0.83125, "grad_norm": 3.1875, "grad_norm_var": 0.06405843098958333, "learning_rate": 0.0001, "loss": 5.7728, "loss/crossentropy": 2.641290545463562, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16823303699493408, "step": 26600 }, { "epoch": 0.8313125, "grad_norm": 2.890625, "grad_norm_var": 0.06389058430989583, "learning_rate": 0.0001, "loss": 5.6648, "loss/crossentropy": 2.550861358642578, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16530445218086243, "step": 26602 }, { "epoch": 0.831375, "grad_norm": 3.296875, "grad_norm_var": 0.06747945149739583, "learning_rate": 0.0001, "loss": 5.9078, "loss/crossentropy": 2.655044198036194, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1744927167892456, "step": 26604 }, { "epoch": 0.8314375, "grad_norm": 3.09375, "grad_norm_var": 0.03706766764322917, "learning_rate": 0.0001, "loss": 5.6888, "loss/crossentropy": 2.5507737398147583, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16810215264558792, "step": 26606 }, { "epoch": 0.8315, "grad_norm": 3.15625, "grad_norm_var": 0.036774698893229166, "learning_rate": 0.0001, "loss": 5.6882, "loss/crossentropy": 2.556751012802124, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16627078503370285, "step": 26608 }, { "epoch": 0.8315625, "grad_norm": 2.921875, "grad_norm_var": 0.018586222330729166, "learning_rate": 0.0001, "loss": 5.7332, "loss/crossentropy": 2.6276328563690186, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16134114563465118, "step": 26610 }, { "epoch": 0.831625, "grad_norm": 2.671875, "grad_norm_var": 0.0405426025390625, "learning_rate": 0.0001, "loss": 5.5495, "loss/crossentropy": 2.455042839050293, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16374597698450089, "step": 26612 }, { "epoch": 0.8316875, "grad_norm": 3.046875, "grad_norm_var": 0.03710530598958333, "learning_rate": 0.0001, "loss": 5.7492, "loss/crossentropy": 2.6724430322647095, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15962416678667068, "step": 26614 }, { "epoch": 0.83175, "grad_norm": 3.078125, "grad_norm_var": 0.0346343994140625, "learning_rate": 0.0001, "loss": 5.3511, "loss/crossentropy": 2.3925164937973022, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15132832527160645, "step": 26616 }, { "epoch": 0.8318125, "grad_norm": 3.09375, "grad_norm_var": 0.03276265462239583, "learning_rate": 0.0001, "loss": 6.0376, "loss/crossentropy": 2.8824446201324463, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16942650824785233, "step": 26618 }, { "epoch": 0.831875, "grad_norm": 3.046875, "grad_norm_var": 0.0291412353515625, "learning_rate": 0.0001, "loss": 5.5934, "loss/crossentropy": 2.4897799491882324, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16621798276901245, "step": 26620 }, { "epoch": 0.8319375, "grad_norm": 3.65625, "grad_norm_var": 0.05239156087239583, "learning_rate": 0.0001, "loss": 5.5129, "loss/crossentropy": 2.4113610982894897, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16288983821868896, "step": 26622 }, { "epoch": 0.832, "grad_norm": 3.109375, "grad_norm_var": 0.05408528645833333, "learning_rate": 0.0001, "loss": 5.6785, "loss/crossentropy": 2.59414279460907, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16586247086524963, "step": 26624 }, { "epoch": 0.8320625, "grad_norm": 3.125, "grad_norm_var": 0.05830790201822917, "learning_rate": 0.0001, "loss": 5.2388, "loss/crossentropy": 2.319770097732544, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.14776460826396942, "step": 26626 }, { "epoch": 0.832125, "grad_norm": 2.96875, "grad_norm_var": 0.035399373372395834, "learning_rate": 0.0001, "loss": 5.6976, "loss/crossentropy": 2.633196473121643, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16347172856330872, "step": 26628 }, { "epoch": 0.8321875, "grad_norm": 3.21875, "grad_norm_var": 0.03687744140625, "learning_rate": 0.0001, "loss": 5.8177, "loss/crossentropy": 2.6326440572738647, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1708531677722931, "step": 26630 }, { "epoch": 0.83225, "grad_norm": 3.234375, "grad_norm_var": 0.04069722493489583, "learning_rate": 0.0001, "loss": 5.6339, "loss/crossentropy": 2.51995050907135, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16451912373304367, "step": 26632 }, { "epoch": 0.8323125, "grad_norm": 2.90625, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 5.7602, "loss/crossentropy": 2.667533040046692, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16551541537046432, "step": 26634 }, { "epoch": 0.832375, "grad_norm": 2.765625, "grad_norm_var": 0.05290425618489583, "learning_rate": 0.0001, "loss": 5.5446, "loss/crossentropy": 2.634292483329773, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15001527965068817, "step": 26636 }, { "epoch": 0.8324375, "grad_norm": 3.109375, "grad_norm_var": 0.0281646728515625, "learning_rate": 0.0001, "loss": 5.6131, "loss/crossentropy": 2.544378161430359, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16312159597873688, "step": 26638 }, { "epoch": 0.8325, "grad_norm": 3.015625, "grad_norm_var": 0.026741536458333333, "learning_rate": 0.0001, "loss": 5.4294, "loss/crossentropy": 2.410798668861389, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15810702741146088, "step": 26640 }, { "epoch": 0.8325625, "grad_norm": 3.15625, "grad_norm_var": 0.027790323893229166, "learning_rate": 0.0001, "loss": 5.5651, "loss/crossentropy": 2.5781432390213013, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1565045714378357, "step": 26642 }, { "epoch": 0.832625, "grad_norm": 3.171875, "grad_norm_var": 0.028544108072916668, "learning_rate": 0.0001, "loss": 5.5522, "loss/crossentropy": 2.501611590385437, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16014138609170914, "step": 26644 }, { "epoch": 0.8326875, "grad_norm": 3.453125, "grad_norm_var": 0.038798014322916664, "learning_rate": 0.0001, "loss": 5.1769, "loss/crossentropy": 2.2129558324813843, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15225522220134735, "step": 26646 }, { "epoch": 0.83275, "grad_norm": 3.09375, "grad_norm_var": 0.0319000244140625, "learning_rate": 0.0001, "loss": 5.5751, "loss/crossentropy": 2.527201771736145, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15869136154651642, "step": 26648 }, { "epoch": 0.8328125, "grad_norm": 3.03125, "grad_norm_var": 0.038960774739583336, "learning_rate": 0.0001, "loss": 5.8576, "loss/crossentropy": 2.6822643280029297, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16988150775432587, "step": 26650 }, { "epoch": 0.832875, "grad_norm": 3.171875, "grad_norm_var": 0.030615234375, "learning_rate": 0.0001, "loss": 5.4277, "loss/crossentropy": 2.4621567726135254, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15593001246452332, "step": 26652 }, { "epoch": 0.8329375, "grad_norm": 3.09375, "grad_norm_var": 0.03381754557291667, "learning_rate": 0.0001, "loss": 5.7046, "loss/crossentropy": 2.5616977214813232, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16781005263328552, "step": 26654 }, { "epoch": 0.833, "grad_norm": 3.046875, "grad_norm_var": 0.03369140625, "learning_rate": 0.0001, "loss": 5.8725, "loss/crossentropy": 2.6961796283721924, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16958148777484894, "step": 26656 }, { "epoch": 0.8330625, "grad_norm": 3.140625, "grad_norm_var": 0.023851521809895835, "learning_rate": 0.0001, "loss": 5.6901, "loss/crossentropy": 2.6127448081970215, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16125278174877167, "step": 26658 }, { "epoch": 0.833125, "grad_norm": 3.421875, "grad_norm_var": 0.029173787434895834, "learning_rate": 0.0001, "loss": 5.7964, "loss/crossentropy": 2.625298857688904, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16749783605337143, "step": 26660 }, { "epoch": 0.8331875, "grad_norm": 3.546875, "grad_norm_var": 0.03072509765625, "learning_rate": 0.0001, "loss": 5.862, "loss/crossentropy": 2.6517386436462402, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1729767546057701, "step": 26662 }, { "epoch": 0.83325, "grad_norm": 3.109375, "grad_norm_var": 0.0299713134765625, "learning_rate": 0.0001, "loss": 5.6328, "loss/crossentropy": 2.574502468109131, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16363906115293503, "step": 26664 }, { "epoch": 0.8333125, "grad_norm": 3.265625, "grad_norm_var": 0.032469685872395834, "learning_rate": 0.0001, "loss": 5.4988, "loss/crossentropy": 2.5313034057617188, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15612319856882095, "step": 26666 }, { "epoch": 0.833375, "grad_norm": 3.140625, "grad_norm_var": 0.03329671223958333, "learning_rate": 0.0001, "loss": 5.6757, "loss/crossentropy": 2.530162215232849, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16651113331317902, "step": 26668 }, { "epoch": 0.8334375, "grad_norm": 3.1875, "grad_norm_var": 0.02779541015625, "learning_rate": 0.0001, "loss": 5.733, "loss/crossentropy": 2.65896737575531, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16443108767271042, "step": 26670 }, { "epoch": 0.8335, "grad_norm": 3.171875, "grad_norm_var": 0.028083292643229167, "learning_rate": 0.0001, "loss": 5.6405, "loss/crossentropy": 2.536733031272888, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16584675759077072, "step": 26672 }, { "epoch": 0.8335625, "grad_norm": 3.21875, "grad_norm_var": 0.03177083333333333, "learning_rate": 0.0001, "loss": 5.5154, "loss/crossentropy": 2.423623204231262, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15957237780094147, "step": 26674 }, { "epoch": 0.833625, "grad_norm": 3.0, "grad_norm_var": 0.028706868489583332, "learning_rate": 0.0001, "loss": 5.5164, "loss/crossentropy": 2.5392919778823853, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15708287060260773, "step": 26676 }, { "epoch": 0.8336875, "grad_norm": 3.25, "grad_norm_var": 0.0214752197265625, "learning_rate": 0.0001, "loss": 5.4774, "loss/crossentropy": 2.5430959463119507, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.14967641979455948, "step": 26678 }, { "epoch": 0.83375, "grad_norm": 3.515625, "grad_norm_var": 0.03196512858072917, "learning_rate": 0.0001, "loss": 5.4983, "loss/crossentropy": 2.4303911924362183, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16030970215797424, "step": 26680 }, { "epoch": 0.8338125, "grad_norm": 2.921875, "grad_norm_var": 0.028400675455729166, "learning_rate": 0.0001, "loss": 5.5658, "loss/crossentropy": 2.545443296432495, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1594526320695877, "step": 26682 }, { "epoch": 0.833875, "grad_norm": 3.734375, "grad_norm_var": 0.058268229166666664, "learning_rate": 0.0001, "loss": 5.4435, "loss/crossentropy": 2.3132035732269287, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16458793729543686, "step": 26684 }, { "epoch": 0.8339375, "grad_norm": 2.984375, "grad_norm_var": 0.062174479166666664, "learning_rate": 0.0001, "loss": 5.279, "loss/crossentropy": 2.3491846323013306, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14766483008861542, "step": 26686 }, { "epoch": 0.834, "grad_norm": 2.953125, "grad_norm_var": 0.0600738525390625, "learning_rate": 0.0001, "loss": 5.3633, "loss/crossentropy": 2.4419000148773193, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15151993930339813, "step": 26688 }, { "epoch": 0.8340625, "grad_norm": 2.796875, "grad_norm_var": 0.06399332682291667, "learning_rate": 0.0001, "loss": 5.5113, "loss/crossentropy": 2.4870275259017944, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15945614874362946, "step": 26690 }, { "epoch": 0.834125, "grad_norm": 2.640625, "grad_norm_var": 0.0761383056640625, "learning_rate": 0.0001, "loss": 5.4755, "loss/crossentropy": 2.4982056617736816, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15475623309612274, "step": 26692 }, { "epoch": 0.8341875, "grad_norm": 3.09375, "grad_norm_var": 0.07292378743489583, "learning_rate": 0.0001, "loss": 5.7732, "loss/crossentropy": 2.6511059999465942, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16767705231904984, "step": 26694 }, { "epoch": 0.83425, "grad_norm": 3.375, "grad_norm_var": 0.06965230305989584, "learning_rate": 0.0001, "loss": 5.3915, "loss/crossentropy": 2.3754031658172607, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15668748319149017, "step": 26696 }, { "epoch": 0.8343125, "grad_norm": 2.6875, "grad_norm_var": 0.07779541015625, "learning_rate": 0.0001, "loss": 5.2696, "loss/crossentropy": 2.4492520093917847, "loss/hidden": 1.375, "loss/jsd": 0.0, "loss/logits": 0.1445300504565239, "step": 26698 }, { "epoch": 0.834375, "grad_norm": 3.125, "grad_norm_var": 0.04003499348958333, "learning_rate": 0.0001, "loss": 5.5888, "loss/crossentropy": 2.5078903436660767, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1604384109377861, "step": 26700 }, { "epoch": 0.8344375, "grad_norm": 3.0625, "grad_norm_var": 0.0409088134765625, "learning_rate": 0.0001, "loss": 5.5608, "loss/crossentropy": 2.512549877166748, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15989995747804642, "step": 26702 }, { "epoch": 0.8345, "grad_norm": 3.078125, "grad_norm_var": 0.044432576497395834, "learning_rate": 0.0001, "loss": 5.9121, "loss/crossentropy": 2.7601088285446167, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16832401603460312, "step": 26704 }, { "epoch": 0.8345625, "grad_norm": 2.9375, "grad_norm_var": 0.03886617024739583, "learning_rate": 0.0001, "loss": 5.6269, "loss/crossentropy": 2.5564210414886475, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16329863667488098, "step": 26706 }, { "epoch": 0.834625, "grad_norm": 3.09375, "grad_norm_var": 0.031689453125, "learning_rate": 0.0001, "loss": 5.4439, "loss/crossentropy": 2.488960862159729, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15096069872379303, "step": 26708 }, { "epoch": 0.8346875, "grad_norm": 2.96875, "grad_norm_var": 0.030402628580729167, "learning_rate": 0.0001, "loss": 5.5835, "loss/crossentropy": 2.5353397130966187, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16067619621753693, "step": 26710 }, { "epoch": 0.83475, "grad_norm": 3.09375, "grad_norm_var": 0.05784403483072917, "learning_rate": 0.0001, "loss": 5.7989, "loss/crossentropy": 2.5801597833633423, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17226530611515045, "step": 26712 }, { "epoch": 0.8348125, "grad_norm": 3.234375, "grad_norm_var": 0.04716796875, "learning_rate": 0.0001, "loss": 5.8157, "loss/crossentropy": 2.6295214891433716, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1682291179895401, "step": 26714 }, { "epoch": 0.834875, "grad_norm": 3.109375, "grad_norm_var": 0.044091796875, "learning_rate": 0.0001, "loss": 5.8039, "loss/crossentropy": 2.604735016822815, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17030832171440125, "step": 26716 }, { "epoch": 0.8349375, "grad_norm": 3.09375, "grad_norm_var": 0.03968098958333333, "learning_rate": 0.0001, "loss": 5.8926, "loss/crossentropy": 2.7313610315322876, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1692444309592247, "step": 26718 }, { "epoch": 0.835, "grad_norm": 2.765625, "grad_norm_var": 0.04872945149739583, "learning_rate": 0.0001, "loss": 5.307, "loss/crossentropy": 2.4017326831817627, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.1522488072514534, "step": 26720 }, { "epoch": 0.8350625, "grad_norm": 3.171875, "grad_norm_var": 0.047606404622395834, "learning_rate": 0.0001, "loss": 5.6059, "loss/crossentropy": 2.5100516080856323, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16544535756111145, "step": 26722 }, { "epoch": 0.835125, "grad_norm": 2.859375, "grad_norm_var": 0.052473958333333334, "learning_rate": 0.0001, "loss": 5.5598, "loss/crossentropy": 2.4760489463806152, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1626765951514244, "step": 26724 }, { "epoch": 0.8351875, "grad_norm": 2.953125, "grad_norm_var": 0.05281473795572917, "learning_rate": 0.0001, "loss": 5.3427, "loss/crossentropy": 2.367329955101013, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15691465139389038, "step": 26726 }, { "epoch": 0.83525, "grad_norm": 2.859375, "grad_norm_var": 0.022587076822916666, "learning_rate": 0.0001, "loss": 5.3456, "loss/crossentropy": 2.42109215259552, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15026235580444336, "step": 26728 }, { "epoch": 0.8353125, "grad_norm": 2.875, "grad_norm_var": 0.026366170247395834, "learning_rate": 0.0001, "loss": 5.4864, "loss/crossentropy": 2.4277414083480835, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16212007403373718, "step": 26730 }, { "epoch": 0.835375, "grad_norm": 3.046875, "grad_norm_var": 0.026204427083333332, "learning_rate": 0.0001, "loss": 5.8032, "loss/crossentropy": 2.7227463722229004, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16390687227249146, "step": 26732 }, { "epoch": 0.8354375, "grad_norm": 3.859375, "grad_norm_var": 0.0723785400390625, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.5874462127685547, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16473715007305145, "step": 26734 }, { "epoch": 0.8355, "grad_norm": 3.140625, "grad_norm_var": 0.06676025390625, "learning_rate": 0.0001, "loss": 5.6737, "loss/crossentropy": 2.6122844219207764, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15809710323810577, "step": 26736 }, { "epoch": 0.8355625, "grad_norm": 2.9375, "grad_norm_var": 0.06702067057291666, "learning_rate": 0.0001, "loss": 5.4637, "loss/crossentropy": 2.457230806350708, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15767929702997208, "step": 26738 }, { "epoch": 0.835625, "grad_norm": 3.03125, "grad_norm_var": 0.063623046875, "learning_rate": 0.0001, "loss": 5.6956, "loss/crossentropy": 2.612622022628784, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16299007087945938, "step": 26740 }, { "epoch": 0.8356875, "grad_norm": 2.890625, "grad_norm_var": 0.06609598795572917, "learning_rate": 0.0001, "loss": 5.9292, "loss/crossentropy": 2.7688095569610596, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16916365921497345, "step": 26742 }, { "epoch": 0.83575, "grad_norm": 3.0, "grad_norm_var": 0.06223856608072917, "learning_rate": 0.0001, "loss": 5.5947, "loss/crossentropy": 2.5085628032684326, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15939905494451523, "step": 26744 }, { "epoch": 0.8358125, "grad_norm": 2.96875, "grad_norm_var": 0.05424702962239583, "learning_rate": 0.0001, "loss": 5.4353, "loss/crossentropy": 2.3988637924194336, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15637993812561035, "step": 26746 }, { "epoch": 0.835875, "grad_norm": 2.984375, "grad_norm_var": 0.05304361979166667, "learning_rate": 0.0001, "loss": 5.4795, "loss/crossentropy": 2.4259506464004517, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16160430014133453, "step": 26748 }, { "epoch": 0.8359375, "grad_norm": 3.453125, "grad_norm_var": 0.0195465087890625, "learning_rate": 0.0001, "loss": 5.7092, "loss/crossentropy": 2.5474666357040405, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17007629573345184, "step": 26750 }, { "epoch": 0.836, "grad_norm": 3.234375, "grad_norm_var": 0.024787394205729167, "learning_rate": 0.0001, "loss": 5.5902, "loss/crossentropy": 2.5298532247543335, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15642627328634262, "step": 26752 }, { "epoch": 0.8360625, "grad_norm": 3.09375, "grad_norm_var": 0.0242584228515625, "learning_rate": 0.0001, "loss": 5.6481, "loss/crossentropy": 2.5728485584259033, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15987379103899002, "step": 26754 }, { "epoch": 0.836125, "grad_norm": 3.140625, "grad_norm_var": 0.025223795572916666, "learning_rate": 0.0001, "loss": 5.5884, "loss/crossentropy": 2.4491742849349976, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1627497524023056, "step": 26756 }, { "epoch": 0.8361875, "grad_norm": 3.015625, "grad_norm_var": 0.021857706705729167, "learning_rate": 0.0001, "loss": 5.7338, "loss/crossentropy": 2.59290087223053, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16643692553043365, "step": 26758 }, { "epoch": 0.83625, "grad_norm": 3.21875, "grad_norm_var": 0.021773274739583334, "learning_rate": 0.0001, "loss": 5.6905, "loss/crossentropy": 2.5556634664535522, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16387174278497696, "step": 26760 }, { "epoch": 0.8363125, "grad_norm": 2.859375, "grad_norm_var": 0.023824055989583332, "learning_rate": 0.0001, "loss": 5.83, "loss/crossentropy": 2.696183443069458, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16533113270998, "step": 26762 }, { "epoch": 0.836375, "grad_norm": 2.703125, "grad_norm_var": 0.03271077473958333, "learning_rate": 0.0001, "loss": 5.1793, "loss/crossentropy": 2.3029249906539917, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14701150357723236, "step": 26764 }, { "epoch": 0.8364375, "grad_norm": 3.21875, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 5.9573, "loss/crossentropy": 2.7467890977859497, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1733977273106575, "step": 26766 }, { "epoch": 0.8365, "grad_norm": 3.0625, "grad_norm_var": 0.021695963541666665, "learning_rate": 0.0001, "loss": 5.5861, "loss/crossentropy": 2.5231176614761353, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15981249511241913, "step": 26768 }, { "epoch": 0.8365625, "grad_norm": 6.875, "grad_norm_var": 0.9330800374348959, "learning_rate": 0.0001, "loss": 5.8358, "loss/crossentropy": 2.6720504760742188, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17027679085731506, "step": 26770 }, { "epoch": 0.836625, "grad_norm": 3.0625, "grad_norm_var": 0.94205322265625, "learning_rate": 0.0001, "loss": 5.6374, "loss/crossentropy": 2.564761996269226, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16429024934768677, "step": 26772 }, { "epoch": 0.8366875, "grad_norm": 3.28125, "grad_norm_var": 0.9464192708333333, "learning_rate": 0.0001, "loss": 5.9044, "loss/crossentropy": 2.7284083366394043, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17033323645591736, "step": 26774 }, { "epoch": 0.83675, "grad_norm": 3.0, "grad_norm_var": 0.9486979166666667, "learning_rate": 0.0001, "loss": 5.6843, "loss/crossentropy": 2.6526867151260376, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1578490436077118, "step": 26776 }, { "epoch": 0.8368125, "grad_norm": 3.0625, "grad_norm_var": 0.9392161051432292, "learning_rate": 0.0001, "loss": 5.5223, "loss/crossentropy": 2.4756247997283936, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15740208327770233, "step": 26778 }, { "epoch": 0.836875, "grad_norm": 3.203125, "grad_norm_var": 0.9165201822916667, "learning_rate": 0.0001, "loss": 5.5949, "loss/crossentropy": 2.4976896047592163, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1651892215013504, "step": 26780 }, { "epoch": 0.8369375, "grad_norm": 3.328125, "grad_norm_var": 0.934912109375, "learning_rate": 0.0001, "loss": 5.6704, "loss/crossentropy": 2.551721930503845, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16460269689559937, "step": 26782 }, { "epoch": 0.837, "grad_norm": 3.265625, "grad_norm_var": 0.9153472900390625, "learning_rate": 0.0001, "loss": 6.0132, "loss/crossentropy": 2.706874132156372, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.18141163140535355, "step": 26784 }, { "epoch": 0.8370625, "grad_norm": 3.015625, "grad_norm_var": 0.038939412434895834, "learning_rate": 0.0001, "loss": 5.415, "loss/crossentropy": 2.47431743144989, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14953739196062088, "step": 26786 }, { "epoch": 0.837125, "grad_norm": 2.984375, "grad_norm_var": 0.0384674072265625, "learning_rate": 0.0001, "loss": 5.6568, "loss/crossentropy": 2.5297670364379883, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16738921403884888, "step": 26788 }, { "epoch": 0.8371875, "grad_norm": 2.921875, "grad_norm_var": 0.03527018229166667, "learning_rate": 0.0001, "loss": 5.4379, "loss/crossentropy": 2.397291421890259, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15875116735696793, "step": 26790 }, { "epoch": 0.83725, "grad_norm": 2.890625, "grad_norm_var": 0.03779296875, "learning_rate": 0.0001, "loss": 5.5707, "loss/crossentropy": 2.5675047636032104, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1577436849474907, "step": 26792 }, { "epoch": 0.8373125, "grad_norm": 2.96875, "grad_norm_var": 0.03433837890625, "learning_rate": 0.0001, "loss": 5.673, "loss/crossentropy": 2.5663158893585205, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16614055633544922, "step": 26794 }, { "epoch": 0.837375, "grad_norm": 3.109375, "grad_norm_var": 0.03267822265625, "learning_rate": 0.0001, "loss": 5.9705, "loss/crossentropy": 2.836636543273926, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16690289974212646, "step": 26796 }, { "epoch": 0.8374375, "grad_norm": 3.265625, "grad_norm_var": 0.02998046875, "learning_rate": 0.0001, "loss": 5.7814, "loss/crossentropy": 2.54093074798584, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1717006117105484, "step": 26798 }, { "epoch": 0.8375, "grad_norm": 3.09375, "grad_norm_var": 0.02125244140625, "learning_rate": 0.0001, "loss": 5.5185, "loss/crossentropy": 2.5173208713531494, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15090397745370865, "step": 26800 }, { "epoch": 0.8375625, "grad_norm": 3.3125, "grad_norm_var": 0.022370402018229166, "learning_rate": 0.0001, "loss": 5.5954, "loss/crossentropy": 2.535297393798828, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16264691948890686, "step": 26802 }, { "epoch": 0.837625, "grad_norm": 3.453125, "grad_norm_var": 0.032373046875, "learning_rate": 0.0001, "loss": 5.8944, "loss/crossentropy": 2.706671714782715, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16876784712076187, "step": 26804 }, { "epoch": 0.8376875, "grad_norm": 2.84375, "grad_norm_var": 0.042313639322916666, "learning_rate": 0.0001, "loss": 5.1955, "loss/crossentropy": 2.3233484029769897, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1434631049633026, "step": 26806 }, { "epoch": 0.83775, "grad_norm": 2.875, "grad_norm_var": 0.0427398681640625, "learning_rate": 0.0001, "loss": 5.6275, "loss/crossentropy": 2.5967692136764526, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16049602627754211, "step": 26808 }, { "epoch": 0.8378125, "grad_norm": 3.59375, "grad_norm_var": 0.058771769205729164, "learning_rate": 0.0001, "loss": 5.9516, "loss/crossentropy": 2.7301191091537476, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.170590378344059, "step": 26810 }, { "epoch": 0.837875, "grad_norm": 3.046875, "grad_norm_var": 0.061909993489583336, "learning_rate": 0.0001, "loss": 5.1865, "loss/crossentropy": 2.310048818588257, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14702502638101578, "step": 26812 }, { "epoch": 0.8379375, "grad_norm": 2.75, "grad_norm_var": 0.06623942057291667, "learning_rate": 0.0001, "loss": 5.2667, "loss/crossentropy": 2.4080445766448975, "loss/hidden": 1.3671875, "loss/jsd": 0.0, "loss/logits": 0.14914533495903015, "step": 26814 }, { "epoch": 0.838, "grad_norm": 3.015625, "grad_norm_var": 0.06544596354166667, "learning_rate": 0.0001, "loss": 5.9025, "loss/crossentropy": 2.729759693145752, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17001153528690338, "step": 26816 }, { "epoch": 0.8380625, "grad_norm": 3.296875, "grad_norm_var": 0.0635406494140625, "learning_rate": 0.0001, "loss": 5.3389, "loss/crossentropy": 2.3518441915512085, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15378347784280777, "step": 26818 }, { "epoch": 0.838125, "grad_norm": 4.84375, "grad_norm_var": 0.2645467122395833, "learning_rate": 0.0001, "loss": 5.9562, "loss/crossentropy": 2.596237540245056, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.18521567434072495, "step": 26820 }, { "epoch": 0.8381875, "grad_norm": 2.890625, "grad_norm_var": 0.2518056233723958, "learning_rate": 0.0001, "loss": 5.4601, "loss/crossentropy": 2.451179027557373, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15519140660762787, "step": 26822 }, { "epoch": 0.83825, "grad_norm": 2.90625, "grad_norm_var": 0.25244038899739585, "learning_rate": 0.0001, "loss": 5.4229, "loss/crossentropy": 2.4421818256378174, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15002215653657913, "step": 26824 }, { "epoch": 0.8383125, "grad_norm": 2.859375, "grad_norm_var": 0.24560445149739582, "learning_rate": 0.0001, "loss": 5.4341, "loss/crossentropy": 2.4391945600509644, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1526143178343773, "step": 26826 }, { "epoch": 0.838375, "grad_norm": 3.34375, "grad_norm_var": 0.24294331868489583, "learning_rate": 0.0001, "loss": 5.8242, "loss/crossentropy": 2.630326747894287, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17290284484624863, "step": 26828 }, { "epoch": 0.8384375, "grad_norm": 3.953125, "grad_norm_var": 0.24970296223958333, "learning_rate": 0.0001, "loss": 5.9347, "loss/crossentropy": 2.5525777339935303, "loss/hidden": 1.546875, "loss/jsd": 0.0, "loss/logits": 0.18352903425693512, "step": 26830 }, { "epoch": 0.8385, "grad_norm": 2.90625, "grad_norm_var": 0.28645731608072916, "learning_rate": 0.0001, "loss": 5.9471, "loss/crossentropy": 2.721782088279724, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17682532221078873, "step": 26832 }, { "epoch": 0.8385625, "grad_norm": 3.0, "grad_norm_var": 0.30128580729166665, "learning_rate": 0.0001, "loss": 5.5491, "loss/crossentropy": 2.541784167289734, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15502379834651947, "step": 26834 }, { "epoch": 0.838625, "grad_norm": 3.3125, "grad_norm_var": 0.15608622233072916, "learning_rate": 0.0001, "loss": 5.9573, "loss/crossentropy": 2.6573015451431274, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.17570366710424423, "step": 26836 }, { "epoch": 0.8386875, "grad_norm": 3.078125, "grad_norm_var": 0.14849344889322916, "learning_rate": 0.0001, "loss": 5.4387, "loss/crossentropy": 2.422448754310608, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15670091658830643, "step": 26838 }, { "epoch": 0.83875, "grad_norm": 3.078125, "grad_norm_var": 0.14223531087239583, "learning_rate": 0.0001, "loss": 5.3166, "loss/crossentropy": 2.348435401916504, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1534605398774147, "step": 26840 }, { "epoch": 0.8388125, "grad_norm": 2.96875, "grad_norm_var": 0.139013671875, "learning_rate": 0.0001, "loss": 5.2335, "loss/crossentropy": 2.250566601753235, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15376639366149902, "step": 26842 }, { "epoch": 0.838875, "grad_norm": 3.59375, "grad_norm_var": 0.14752197265625, "learning_rate": 0.0001, "loss": 5.7326, "loss/crossentropy": 2.6011509895324707, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16470693796873093, "step": 26844 }, { "epoch": 0.8389375, "grad_norm": 3.234375, "grad_norm_var": 0.14312744140625, "learning_rate": 0.0001, "loss": 5.6885, "loss/crossentropy": 2.4735031127929688, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17344865947961807, "step": 26846 }, { "epoch": 0.839, "grad_norm": 3.0, "grad_norm_var": 0.09752197265625, "learning_rate": 0.0001, "loss": 5.5867, "loss/crossentropy": 2.5408254861831665, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16083333641290665, "step": 26848 }, { "epoch": 0.8390625, "grad_norm": 2.890625, "grad_norm_var": 0.10227457682291667, "learning_rate": 0.0001, "loss": 5.4898, "loss/crossentropy": 2.5737812519073486, "loss/hidden": 1.390625, "loss/jsd": 0.0, "loss/logits": 0.15253794938325882, "step": 26850 }, { "epoch": 0.839125, "grad_norm": 3.296875, "grad_norm_var": 0.07851155598958333, "learning_rate": 0.0001, "loss": 5.6971, "loss/crossentropy": 2.6061235666275024, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16300751268863678, "step": 26852 }, { "epoch": 0.8391875, "grad_norm": 3.140625, "grad_norm_var": 0.07815348307291667, "learning_rate": 0.0001, "loss": 5.5509, "loss/crossentropy": 2.4616470336914062, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16009312123060226, "step": 26854 }, { "epoch": 0.83925, "grad_norm": 2.875, "grad_norm_var": 0.08430074055989584, "learning_rate": 0.0001, "loss": 5.688, "loss/crossentropy": 2.626276135444641, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1600799262523651, "step": 26856 }, { "epoch": 0.8393125, "grad_norm": 3.046875, "grad_norm_var": 0.08238016764322917, "learning_rate": 0.0001, "loss": 5.647, "loss/crossentropy": 2.580414891242981, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16095992922782898, "step": 26858 }, { "epoch": 0.839375, "grad_norm": 3.40625, "grad_norm_var": 0.07401936848958333, "learning_rate": 0.0001, "loss": 5.7336, "loss/crossentropy": 2.6484148502349854, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16242486238479614, "step": 26860 }, { "epoch": 0.8394375, "grad_norm": 3.078125, "grad_norm_var": 0.0338531494140625, "learning_rate": 0.0001, "loss": 5.7906, "loss/crossentropy": 2.6796486377716064, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16500383615493774, "step": 26862 }, { "epoch": 0.8395, "grad_norm": 3.0, "grad_norm_var": 0.9651194254557292, "learning_rate": 0.0001, "loss": 5.2826, "loss/crossentropy": 2.179580330848694, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.1591331586241722, "step": 26864 }, { "epoch": 0.8395625, "grad_norm": 2.796875, "grad_norm_var": 0.9588826497395834, "learning_rate": 0.0001, "loss": 5.5379, "loss/crossentropy": 2.5491769313812256, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15316757559776306, "step": 26866 }, { "epoch": 0.839625, "grad_norm": 2.890625, "grad_norm_var": 0.9625, "learning_rate": 0.0001, "loss": 5.6079, "loss/crossentropy": 2.5456382036209106, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16208944469690323, "step": 26868 }, { "epoch": 0.8396875, "grad_norm": 3.125, "grad_norm_var": 0.9618804931640625, "learning_rate": 0.0001, "loss": 5.6729, "loss/crossentropy": 2.5196443796157837, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16493035852909088, "step": 26870 }, { "epoch": 0.83975, "grad_norm": 2.90625, "grad_norm_var": 0.9611328125, "learning_rate": 0.0001, "loss": 5.7829, "loss/crossentropy": 2.6576212644577026, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16565701365470886, "step": 26872 }, { "epoch": 0.8398125, "grad_norm": 3.15625, "grad_norm_var": 0.9554921468098958, "learning_rate": 0.0001, "loss": 5.3669, "loss/crossentropy": 2.3514479398727417, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15544924139976501, "step": 26874 }, { "epoch": 0.839875, "grad_norm": 2.84375, "grad_norm_var": 0.9740234375, "learning_rate": 0.0001, "loss": 5.6978, "loss/crossentropy": 2.684528112411499, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15445105731487274, "step": 26876 }, { "epoch": 0.8399375, "grad_norm": 2.90625, "grad_norm_var": 0.9736887613932291, "learning_rate": 0.0001, "loss": 5.7286, "loss/crossentropy": 2.690067172050476, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15932255238294601, "step": 26878 }, { "epoch": 0.84, "grad_norm": 3.0625, "grad_norm_var": 0.03420817057291667, "learning_rate": 0.0001, "loss": 5.3626, "loss/crossentropy": 2.3932125568389893, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15358121693134308, "step": 26880 }, { "epoch": 0.8400625, "grad_norm": 2.9375, "grad_norm_var": 0.030744425455729165, "learning_rate": 0.0001, "loss": 5.8626, "loss/crossentropy": 2.7045029401779175, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1669803336262703, "step": 26882 }, { "epoch": 0.840125, "grad_norm": 3.15625, "grad_norm_var": 0.024925740559895833, "learning_rate": 0.0001, "loss": 5.8938, "loss/crossentropy": 2.766055107116699, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16316113620996475, "step": 26884 }, { "epoch": 0.8401875, "grad_norm": 3.25, "grad_norm_var": 0.0205718994140625, "learning_rate": 0.0001, "loss": 5.4507, "loss/crossentropy": 2.528176784515381, "loss/hidden": 1.39453125, "loss/jsd": 0.0, "loss/logits": 0.15280170738697052, "step": 26886 }, { "epoch": 0.84025, "grad_norm": 3.203125, "grad_norm_var": 0.0192291259765625, "learning_rate": 0.0001, "loss": 5.4286, "loss/crossentropy": 2.4476903676986694, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15668931603431702, "step": 26888 }, { "epoch": 0.8403125, "grad_norm": 3.015625, "grad_norm_var": 0.03310546875, "learning_rate": 0.0001, "loss": 5.9517, "loss/crossentropy": 2.68823504447937, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17751407623291016, "step": 26890 }, { "epoch": 0.840375, "grad_norm": 3.140625, "grad_norm_var": 0.026688639322916666, "learning_rate": 0.0001, "loss": 5.8172, "loss/crossentropy": 2.668814778327942, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16796687245368958, "step": 26892 }, { "epoch": 0.8404375, "grad_norm": 3.0625, "grad_norm_var": 0.0240142822265625, "learning_rate": 0.0001, "loss": 5.6673, "loss/crossentropy": 2.572339177131653, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16262523084878922, "step": 26894 }, { "epoch": 0.8405, "grad_norm": 3.21875, "grad_norm_var": 0.023485310872395835, "learning_rate": 0.0001, "loss": 5.7972, "loss/crossentropy": 2.6653146743774414, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16436417400836945, "step": 26896 }, { "epoch": 0.8405625, "grad_norm": 3.1875, "grad_norm_var": 0.019624837239583335, "learning_rate": 0.0001, "loss": 5.9304, "loss/crossentropy": 2.7217326164245605, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1732107624411583, "step": 26898 }, { "epoch": 0.840625, "grad_norm": 2.984375, "grad_norm_var": 0.021337890625, "learning_rate": 0.0001, "loss": 5.7385, "loss/crossentropy": 2.6471580266952515, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16382071375846863, "step": 26900 }, { "epoch": 0.8406875, "grad_norm": 2.96875, "grad_norm_var": 0.023395792643229166, "learning_rate": 0.0001, "loss": 5.707, "loss/crossentropy": 2.6310672760009766, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1654074341058731, "step": 26902 }, { "epoch": 0.84075, "grad_norm": 3.0625, "grad_norm_var": 0.023517862955729166, "learning_rate": 0.0001, "loss": 5.5822, "loss/crossentropy": 2.538162112236023, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16025841236114502, "step": 26904 }, { "epoch": 0.8408125, "grad_norm": 2.953125, "grad_norm_var": 0.011994425455729167, "learning_rate": 0.0001, "loss": 5.5235, "loss/crossentropy": 2.5428719520568848, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1558748334646225, "step": 26906 }, { "epoch": 0.840875, "grad_norm": 2.75, "grad_norm_var": 0.017268880208333334, "learning_rate": 0.0001, "loss": 5.6886, "loss/crossentropy": 2.6650702953338623, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15821386873722076, "step": 26908 }, { "epoch": 0.8409375, "grad_norm": 2.734375, "grad_norm_var": 0.03150634765625, "learning_rate": 0.0001, "loss": 5.5548, "loss/crossentropy": 2.48689067363739, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1606953889131546, "step": 26910 }, { "epoch": 0.841, "grad_norm": 3.265625, "grad_norm_var": 0.03775634765625, "learning_rate": 0.0001, "loss": 5.8837, "loss/crossentropy": 2.644471287727356, "loss/hidden": 1.55078125, "loss/jsd": 0.0, "loss/logits": 0.16884031891822815, "step": 26912 }, { "epoch": 0.8410625, "grad_norm": 3.09375, "grad_norm_var": 0.0431793212890625, "learning_rate": 0.0001, "loss": 5.7563, "loss/crossentropy": 2.681451916694641, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.161394402384758, "step": 26914 }, { "epoch": 0.841125, "grad_norm": 3.1875, "grad_norm_var": 0.046971638997395836, "learning_rate": 0.0001, "loss": 5.7341, "loss/crossentropy": 2.6222978830337524, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16625793278217316, "step": 26916 }, { "epoch": 0.8411875, "grad_norm": 3.15625, "grad_norm_var": 0.04518941243489583, "learning_rate": 0.0001, "loss": 5.8303, "loss/crossentropy": 2.6999884843826294, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16577009856700897, "step": 26918 }, { "epoch": 0.84125, "grad_norm": 3.15625, "grad_norm_var": 0.04499409993489583, "learning_rate": 0.0001, "loss": 5.4822, "loss/crossentropy": 2.466330647468567, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15275505185127258, "step": 26920 }, { "epoch": 0.8413125, "grad_norm": 2.859375, "grad_norm_var": 0.048094685872395834, "learning_rate": 0.0001, "loss": 5.5703, "loss/crossentropy": 2.5702513456344604, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15664538741111755, "step": 26922 }, { "epoch": 0.841375, "grad_norm": 3.015625, "grad_norm_var": 0.04749247233072917, "learning_rate": 0.0001, "loss": 5.2683, "loss/crossentropy": 2.370810389518738, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1499096229672432, "step": 26924 }, { "epoch": 0.8414375, "grad_norm": 3.0, "grad_norm_var": 0.0365875244140625, "learning_rate": 0.0001, "loss": 5.2108, "loss/crossentropy": 2.2897136211395264, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14679966866970062, "step": 26926 }, { "epoch": 0.8415, "grad_norm": 3.265625, "grad_norm_var": 0.027880859375, "learning_rate": 0.0001, "loss": 5.4859, "loss/crossentropy": 2.413581371307373, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16504260152578354, "step": 26928 }, { "epoch": 0.8415625, "grad_norm": 2.90625, "grad_norm_var": 0.02841796875, "learning_rate": 0.0001, "loss": 5.4686, "loss/crossentropy": 2.5018755197525024, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15565899014472961, "step": 26930 }, { "epoch": 0.841625, "grad_norm": 3.109375, "grad_norm_var": 0.019401041666666667, "learning_rate": 0.0001, "loss": 5.2209, "loss/crossentropy": 2.2474820613861084, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15281382948160172, "step": 26932 }, { "epoch": 0.8416875, "grad_norm": 2.78125, "grad_norm_var": 0.023726399739583334, "learning_rate": 0.0001, "loss": 5.2421, "loss/crossentropy": 2.306075096130371, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1498563215136528, "step": 26934 }, { "epoch": 0.84175, "grad_norm": 3.140625, "grad_norm_var": 0.024437459309895833, "learning_rate": 0.0001, "loss": 5.7416, "loss/crossentropy": 2.6036221981048584, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1665356457233429, "step": 26936 }, { "epoch": 0.8418125, "grad_norm": 3.0, "grad_norm_var": 0.03424479166666667, "learning_rate": 0.0001, "loss": 5.5409, "loss/crossentropy": 2.4251224994659424, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16508854925632477, "step": 26938 }, { "epoch": 0.841875, "grad_norm": 2.96875, "grad_norm_var": 0.030301920572916665, "learning_rate": 0.0001, "loss": 5.7345, "loss/crossentropy": 2.6435906887054443, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16377675533294678, "step": 26940 }, { "epoch": 0.8419375, "grad_norm": 3.8125, "grad_norm_var": 0.06806538899739584, "learning_rate": 0.0001, "loss": 5.6693, "loss/crossentropy": 2.5307010412216187, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16463905572891235, "step": 26942 }, { "epoch": 0.842, "grad_norm": 3.109375, "grad_norm_var": 0.06562398274739584, "learning_rate": 0.0001, "loss": 5.7085, "loss/crossentropy": 2.601099133491516, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16542357206344604, "step": 26944 }, { "epoch": 0.8420625, "grad_norm": 3.1875, "grad_norm_var": 0.05486653645833333, "learning_rate": 0.0001, "loss": 5.8455, "loss/crossentropy": 2.588975667953491, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17565176635980606, "step": 26946 }, { "epoch": 0.842125, "grad_norm": 2.9375, "grad_norm_var": 0.05418192545572917, "learning_rate": 0.0001, "loss": 5.7066, "loss/crossentropy": 2.621884822845459, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16550437361001968, "step": 26948 }, { "epoch": 0.8421875, "grad_norm": 3.265625, "grad_norm_var": 0.04524637858072917, "learning_rate": 0.0001, "loss": 5.706, "loss/crossentropy": 2.552005410194397, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16695862263441086, "step": 26950 }, { "epoch": 0.84225, "grad_norm": 2.890625, "grad_norm_var": 0.053609212239583336, "learning_rate": 0.0001, "loss": 5.6121, "loss/crossentropy": 2.542546510696411, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16555316001176834, "step": 26952 }, { "epoch": 0.8423125, "grad_norm": 3.609375, "grad_norm_var": 0.05915425618489583, "learning_rate": 0.0001, "loss": 5.7437, "loss/crossentropy": 2.591462731361389, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.17030170559883118, "step": 26954 }, { "epoch": 0.842375, "grad_norm": 2.84375, "grad_norm_var": 0.06367085774739584, "learning_rate": 0.0001, "loss": 5.5539, "loss/crossentropy": 2.514713168144226, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16016453504562378, "step": 26956 }, { "epoch": 0.8424375, "grad_norm": 3.1875, "grad_norm_var": 0.03808186848958333, "learning_rate": 0.0001, "loss": 5.8469, "loss/crossentropy": 2.7059032917022705, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16526933759450912, "step": 26958 }, { "epoch": 0.8425, "grad_norm": 3.0, "grad_norm_var": 0.0410552978515625, "learning_rate": 0.0001, "loss": 5.1531, "loss/crossentropy": 2.266539216041565, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.14725393056869507, "step": 26960 }, { "epoch": 0.8425625, "grad_norm": 3.234375, "grad_norm_var": 0.040087890625, "learning_rate": 0.0001, "loss": 5.8384, "loss/crossentropy": 2.7500157356262207, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1635221764445305, "step": 26962 }, { "epoch": 0.842625, "grad_norm": 3.171875, "grad_norm_var": 0.038069661458333334, "learning_rate": 0.0001, "loss": 5.4276, "loss/crossentropy": 2.4385040998458862, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15086626261472702, "step": 26964 }, { "epoch": 0.8426875, "grad_norm": 3.265625, "grad_norm_var": 0.04104410807291667, "learning_rate": 0.0001, "loss": 5.4244, "loss/crossentropy": 2.385402202606201, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.156633660197258, "step": 26966 }, { "epoch": 0.84275, "grad_norm": 2.890625, "grad_norm_var": 0.03535868326822917, "learning_rate": 0.0001, "loss": 5.5479, "loss/crossentropy": 2.523975968360901, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1598101630806923, "step": 26968 }, { "epoch": 0.8428125, "grad_norm": 3.265625, "grad_norm_var": 0.019383748372395832, "learning_rate": 0.0001, "loss": 5.4475, "loss/crossentropy": 2.3999253511428833, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15866780281066895, "step": 26970 }, { "epoch": 0.842875, "grad_norm": 3.546875, "grad_norm_var": 0.06405843098958333, "learning_rate": 0.0001, "loss": 6.2068, "loss/crossentropy": 2.8445491790771484, "loss/hidden": 1.54296875, "loss/jsd": 0.0, "loss/logits": 0.18193239718675613, "step": 26972 }, { "epoch": 0.8429375, "grad_norm": 3.0, "grad_norm_var": 0.07138671875, "learning_rate": 0.0001, "loss": 5.268, "loss/crossentropy": 2.3104175329208374, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15357424318790436, "step": 26974 }, { "epoch": 0.843, "grad_norm": 3.109375, "grad_norm_var": 0.07037353515625, "learning_rate": 0.0001, "loss": 5.7229, "loss/crossentropy": 2.6200207471847534, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16497711837291718, "step": 26976 }, { "epoch": 0.8430625, "grad_norm": 3.28125, "grad_norm_var": 0.07506103515625, "learning_rate": 0.0001, "loss": 5.5779, "loss/crossentropy": 2.4063092470169067, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.16637791693210602, "step": 26978 }, { "epoch": 0.843125, "grad_norm": 3.015625, "grad_norm_var": 0.07935791015625, "learning_rate": 0.0001, "loss": 5.3624, "loss/crossentropy": 2.3784230947494507, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15543343126773834, "step": 26980 }, { "epoch": 0.8431875, "grad_norm": 2.90625, "grad_norm_var": 0.07746988932291667, "learning_rate": 0.0001, "loss": 5.5087, "loss/crossentropy": 2.4464324712753296, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1632547602057457, "step": 26982 }, { "epoch": 0.84325, "grad_norm": 3.296875, "grad_norm_var": 0.07587788899739584, "learning_rate": 0.0001, "loss": 5.9727, "loss/crossentropy": 2.7504212856292725, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17261478304862976, "step": 26984 }, { "epoch": 0.8433125, "grad_norm": 2.765625, "grad_norm_var": 0.08612874348958334, "learning_rate": 0.0001, "loss": 5.4549, "loss/crossentropy": 2.4451680183410645, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15683747082948685, "step": 26986 }, { "epoch": 0.843375, "grad_norm": 2.984375, "grad_norm_var": 0.044041951497395836, "learning_rate": 0.0001, "loss": 5.6748, "loss/crossentropy": 2.5268555879592896, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16791684925556183, "step": 26988 }, { "epoch": 0.8434375, "grad_norm": 3.171875, "grad_norm_var": 0.041341145833333336, "learning_rate": 0.0001, "loss": 5.5076, "loss/crossentropy": 2.501393675804138, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1564769521355629, "step": 26990 }, { "epoch": 0.8435, "grad_norm": 3.0625, "grad_norm_var": 0.04091695149739583, "learning_rate": 0.0001, "loss": 5.4816, "loss/crossentropy": 2.4734781980514526, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1523718759417534, "step": 26992 }, { "epoch": 0.8435625, "grad_norm": 3.453125, "grad_norm_var": 0.03901265462239583, "learning_rate": 0.0001, "loss": 5.9334, "loss/crossentropy": 2.6941498517990112, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.17236074060201645, "step": 26994 }, { "epoch": 0.843625, "grad_norm": 2.9375, "grad_norm_var": 0.040013631184895836, "learning_rate": 0.0001, "loss": 5.3828, "loss/crossentropy": 2.34332537651062, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15667743980884552, "step": 26996 }, { "epoch": 0.8436875, "grad_norm": 3.4375, "grad_norm_var": 0.05001627604166667, "learning_rate": 0.0001, "loss": 5.437, "loss/crossentropy": 2.4447511434555054, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15665173530578613, "step": 26998 }, { "epoch": 0.84375, "grad_norm": 3.984375, "grad_norm_var": 0.0950347900390625, "learning_rate": 0.0001, "loss": 6.0429, "loss/crossentropy": 2.723562240600586, "loss/hidden": 1.5234375, "loss/jsd": 0.0, "loss/logits": 0.1795891523361206, "step": 27000 }, { "epoch": 0.8438125, "grad_norm": 3.09375, "grad_norm_var": 0.08463134765625, "learning_rate": 0.0001, "loss": 5.2803, "loss/crossentropy": 2.301753044128418, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1540997475385666, "step": 27002 }, { "epoch": 0.843875, "grad_norm": 3.1875, "grad_norm_var": 0.077197265625, "learning_rate": 0.0001, "loss": 5.842, "loss/crossentropy": 2.7305915355682373, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1615283042192459, "step": 27004 }, { "epoch": 0.8439375, "grad_norm": 3.59375, "grad_norm_var": 0.08388264973958333, "learning_rate": 0.0001, "loss": 5.5253, "loss/crossentropy": 2.4625903367996216, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16213227808475494, "step": 27006 }, { "epoch": 0.844, "grad_norm": 2.828125, "grad_norm_var": 0.08460286458333334, "learning_rate": 0.0001, "loss": 5.6316, "loss/crossentropy": 2.5852612257003784, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16010281443595886, "step": 27008 }, { "epoch": 0.8440625, "grad_norm": 3.03125, "grad_norm_var": 0.08253580729166667, "learning_rate": 0.0001, "loss": 5.6831, "loss/crossentropy": 2.568989872932434, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16336346417665482, "step": 27010 }, { "epoch": 0.844125, "grad_norm": 3.234375, "grad_norm_var": 0.08439839680989583, "learning_rate": 0.0001, "loss": 5.5188, "loss/crossentropy": 2.4843783378601074, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15968778729438782, "step": 27012 }, { "epoch": 0.8441875, "grad_norm": 3.53125, "grad_norm_var": 0.08073628743489583, "learning_rate": 0.0001, "loss": 5.8848, "loss/crossentropy": 2.715808629989624, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17119789123535156, "step": 27014 }, { "epoch": 0.84425, "grad_norm": 2.96875, "grad_norm_var": 0.04158426920572917, "learning_rate": 0.0001, "loss": 5.674, "loss/crossentropy": 2.5980119705200195, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1618957370519638, "step": 27016 }, { "epoch": 0.8443125, "grad_norm": 3.125, "grad_norm_var": 0.046686808268229164, "learning_rate": 0.0001, "loss": 5.4329, "loss/crossentropy": 2.3620887994766235, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16059454530477524, "step": 27018 }, { "epoch": 0.844375, "grad_norm": 3.0625, "grad_norm_var": 0.047053019205729164, "learning_rate": 0.0001, "loss": 5.5869, "loss/crossentropy": 2.491005778312683, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16310331225395203, "step": 27020 }, { "epoch": 0.8444375, "grad_norm": 3.03125, "grad_norm_var": 0.036188761393229164, "learning_rate": 0.0001, "loss": 5.7232, "loss/crossentropy": 2.5387020111083984, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16884002089500427, "step": 27022 }, { "epoch": 0.8445, "grad_norm": 3.03125, "grad_norm_var": 0.03412984212239583, "learning_rate": 0.0001, "loss": 5.6325, "loss/crossentropy": 2.481870174407959, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16623012721538544, "step": 27024 }, { "epoch": 0.8445625, "grad_norm": 2.921875, "grad_norm_var": 0.038060506184895836, "learning_rate": 0.0001, "loss": 5.4642, "loss/crossentropy": 2.429790735244751, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15968939661979675, "step": 27026 }, { "epoch": 0.844625, "grad_norm": 3.046875, "grad_norm_var": 0.0376861572265625, "learning_rate": 0.0001, "loss": 5.4997, "loss/crossentropy": 2.5007166862487793, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15693041682243347, "step": 27028 }, { "epoch": 0.8446875, "grad_norm": 4.125, "grad_norm_var": 0.09543863932291667, "learning_rate": 0.0001, "loss": 5.7831, "loss/crossentropy": 2.61792528629303, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16846979409456253, "step": 27030 }, { "epoch": 0.84475, "grad_norm": 2.953125, "grad_norm_var": 0.0945465087890625, "learning_rate": 0.0001, "loss": 5.874, "loss/crossentropy": 2.761651039123535, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16709166765213013, "step": 27032 }, { "epoch": 0.8448125, "grad_norm": 3.28125, "grad_norm_var": 0.09298502604166667, "learning_rate": 0.0001, "loss": 5.5276, "loss/crossentropy": 2.4779335260391235, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16043298691511154, "step": 27034 }, { "epoch": 0.844875, "grad_norm": 2.953125, "grad_norm_var": 0.09420166015625, "learning_rate": 0.0001, "loss": 5.4692, "loss/crossentropy": 2.4889973402023315, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1550476774573326, "step": 27036 }, { "epoch": 0.8449375, "grad_norm": 3.171875, "grad_norm_var": 0.093798828125, "learning_rate": 0.0001, "loss": 5.534, "loss/crossentropy": 2.4192898273468018, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16263999044895172, "step": 27038 }, { "epoch": 0.845, "grad_norm": 3.328125, "grad_norm_var": 0.09368082682291666, "learning_rate": 0.0001, "loss": 5.6515, "loss/crossentropy": 2.518969416618347, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16715529561042786, "step": 27040 }, { "epoch": 0.8450625, "grad_norm": 3.140625, "grad_norm_var": 0.09722900390625, "learning_rate": 0.0001, "loss": 5.5178, "loss/crossentropy": 2.487750291824341, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15808579325675964, "step": 27042 }, { "epoch": 0.845125, "grad_norm": 3.25, "grad_norm_var": 0.09236653645833333, "learning_rate": 0.0001, "loss": 5.8472, "loss/crossentropy": 2.698338747024536, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1707422360777855, "step": 27044 }, { "epoch": 0.8451875, "grad_norm": 2.875, "grad_norm_var": 0.031168619791666668, "learning_rate": 0.0001, "loss": 5.4016, "loss/crossentropy": 2.3835963010787964, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15570556372404099, "step": 27046 }, { "epoch": 0.84525, "grad_norm": 3.03125, "grad_norm_var": 0.030345662434895834, "learning_rate": 0.0001, "loss": 5.6156, "loss/crossentropy": 2.5234217643737793, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16390936076641083, "step": 27048 }, { "epoch": 0.8453125, "grad_norm": 3.09375, "grad_norm_var": 0.027144368489583334, "learning_rate": 0.0001, "loss": 5.5738, "loss/crossentropy": 2.48178231716156, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16466858237981796, "step": 27050 }, { "epoch": 0.845375, "grad_norm": 2.953125, "grad_norm_var": 0.027196248372395832, "learning_rate": 0.0001, "loss": 5.4942, "loss/crossentropy": 2.4780293703079224, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15787175297737122, "step": 27052 }, { "epoch": 0.8454375, "grad_norm": 3.1875, "grad_norm_var": 0.022337849934895834, "learning_rate": 0.0001, "loss": 5.8641, "loss/crossentropy": 2.692082405090332, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.17384406924247742, "step": 27054 }, { "epoch": 0.8455, "grad_norm": 3.25, "grad_norm_var": 0.02857666015625, "learning_rate": 0.0001, "loss": 5.7946, "loss/crossentropy": 2.5548572540283203, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.17436685413122177, "step": 27056 }, { "epoch": 0.8455625, "grad_norm": 3.0, "grad_norm_var": 0.02008056640625, "learning_rate": 0.0001, "loss": 5.2714, "loss/crossentropy": 2.349884033203125, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.14488628506660461, "step": 27058 }, { "epoch": 0.845625, "grad_norm": 2.953125, "grad_norm_var": 0.0199127197265625, "learning_rate": 0.0001, "loss": 5.511, "loss/crossentropy": 2.5070693492889404, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1574266254901886, "step": 27060 }, { "epoch": 0.8456875, "grad_norm": 3.0625, "grad_norm_var": 0.0166412353515625, "learning_rate": 0.0001, "loss": 5.7185, "loss/crossentropy": 2.5456626415252686, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1692354381084442, "step": 27062 }, { "epoch": 0.84575, "grad_norm": 3.296875, "grad_norm_var": 0.020637003580729167, "learning_rate": 0.0001, "loss": 5.7958, "loss/crossentropy": 2.549852728843689, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17263749986886978, "step": 27064 }, { "epoch": 0.8458125, "grad_norm": 2.890625, "grad_norm_var": 0.0243316650390625, "learning_rate": 0.0001, "loss": 5.5744, "loss/crossentropy": 2.5248301029205322, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16042758524417877, "step": 27066 }, { "epoch": 0.845875, "grad_norm": 3.09375, "grad_norm_var": 0.0237457275390625, "learning_rate": 0.0001, "loss": 5.8239, "loss/crossentropy": 2.7303425073623657, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16403896361589432, "step": 27068 }, { "epoch": 0.8459375, "grad_norm": 2.78125, "grad_norm_var": 0.030367024739583335, "learning_rate": 0.0001, "loss": 5.5004, "loss/crossentropy": 2.513803482055664, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15529655665159225, "step": 27070 }, { "epoch": 0.846, "grad_norm": 2.9375, "grad_norm_var": 0.020015462239583334, "learning_rate": 0.0001, "loss": 5.7383, "loss/crossentropy": 2.7543208599090576, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1577717885375023, "step": 27072 }, { "epoch": 0.8460625, "grad_norm": 3.0625, "grad_norm_var": 0.020992024739583334, "learning_rate": 0.0001, "loss": 5.9565, "loss/crossentropy": 2.8108898401260376, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17081285268068314, "step": 27074 }, { "epoch": 0.846125, "grad_norm": 2.953125, "grad_norm_var": 0.021068318684895834, "learning_rate": 0.0001, "loss": 5.6215, "loss/crossentropy": 2.528372049331665, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1604895070195198, "step": 27076 }, { "epoch": 0.8461875, "grad_norm": 2.875, "grad_norm_var": 0.022835286458333333, "learning_rate": 0.0001, "loss": 5.5454, "loss/crossentropy": 2.5339020490646362, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.157789446413517, "step": 27078 }, { "epoch": 0.84625, "grad_norm": 3.0, "grad_norm_var": 0.013704427083333333, "learning_rate": 0.0001, "loss": 5.4112, "loss/crossentropy": 2.400560140609741, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15379934757947922, "step": 27080 }, { "epoch": 0.8463125, "grad_norm": 3.0625, "grad_norm_var": 0.012702433268229167, "learning_rate": 0.0001, "loss": 5.3947, "loss/crossentropy": 2.4423404932022095, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1534384861588478, "step": 27082 }, { "epoch": 0.846375, "grad_norm": 3.109375, "grad_norm_var": 0.0214508056640625, "learning_rate": 0.0001, "loss": 5.6499, "loss/crossentropy": 2.490090489387512, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.167543925344944, "step": 27084 }, { "epoch": 0.8464375, "grad_norm": 3.109375, "grad_norm_var": 0.04205322265625, "learning_rate": 0.0001, "loss": 5.7125, "loss/crossentropy": 2.5722068548202515, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16637550294399261, "step": 27086 }, { "epoch": 0.8465, "grad_norm": 3.203125, "grad_norm_var": 0.04016825358072917, "learning_rate": 0.0001, "loss": 5.7551, "loss/crossentropy": 2.637491822242737, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1672314703464508, "step": 27088 }, { "epoch": 0.8465625, "grad_norm": 3.015625, "grad_norm_var": 0.04234110514322917, "learning_rate": 0.0001, "loss": 5.4526, "loss/crossentropy": 2.3890881538391113, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16103775799274445, "step": 27090 }, { "epoch": 0.846625, "grad_norm": 3.171875, "grad_norm_var": 0.03996988932291667, "learning_rate": 0.0001, "loss": 5.6648, "loss/crossentropy": 2.5229744911193848, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16848351806402206, "step": 27092 }, { "epoch": 0.8466875, "grad_norm": 3.03125, "grad_norm_var": 0.03902079264322917, "learning_rate": 0.0001, "loss": 5.8173, "loss/crossentropy": 2.644046187400818, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1673254668712616, "step": 27094 }, { "epoch": 0.84675, "grad_norm": 2.921875, "grad_norm_var": 0.041258748372395834, "learning_rate": 0.0001, "loss": 5.7519, "loss/crossentropy": 2.6347395181655884, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1664055809378624, "step": 27096 }, { "epoch": 0.8468125, "grad_norm": 3.203125, "grad_norm_var": 0.045653279622395834, "learning_rate": 0.0001, "loss": 5.6848, "loss/crossentropy": 2.592831015586853, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16231727600097656, "step": 27098 }, { "epoch": 0.846875, "grad_norm": 2.984375, "grad_norm_var": 0.052000935872395834, "learning_rate": 0.0001, "loss": 5.5228, "loss/crossentropy": 2.5616806745529175, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1500208005309105, "step": 27100 }, { "epoch": 0.8469375, "grad_norm": 3.171875, "grad_norm_var": 0.028739420572916667, "learning_rate": 0.0001, "loss": 5.8282, "loss/crossentropy": 2.6943808794021606, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16572364419698715, "step": 27102 }, { "epoch": 0.847, "grad_norm": 3.21875, "grad_norm_var": 0.03095703125, "learning_rate": 0.0001, "loss": 5.7187, "loss/crossentropy": 2.6862971782684326, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16183152049779892, "step": 27104 }, { "epoch": 0.8470625, "grad_norm": 3.0, "grad_norm_var": 0.028425089518229165, "learning_rate": 0.0001, "loss": 5.6099, "loss/crossentropy": 2.6098971366882324, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1535153165459633, "step": 27106 }, { "epoch": 0.847125, "grad_norm": 3.15625, "grad_norm_var": 0.032124837239583336, "learning_rate": 0.0001, "loss": 5.7344, "loss/crossentropy": 2.5661354064941406, "loss/hidden": 1.515625, "loss/jsd": 0.0, "loss/logits": 0.1652621030807495, "step": 27108 }, { "epoch": 0.8471875, "grad_norm": 2.953125, "grad_norm_var": 0.025614420572916668, "learning_rate": 0.0001, "loss": 5.6601, "loss/crossentropy": 2.5817039012908936, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16487447917461395, "step": 27110 }, { "epoch": 0.84725, "grad_norm": 3.03125, "grad_norm_var": 0.023958333333333335, "learning_rate": 0.0001, "loss": 5.8438, "loss/crossentropy": 2.6846741437911987, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.17060133069753647, "step": 27112 }, { "epoch": 0.8473125, "grad_norm": 3.015625, "grad_norm_var": 0.019270833333333334, "learning_rate": 0.0001, "loss": 5.8774, "loss/crossentropy": 2.711457371711731, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1685514897108078, "step": 27114 }, { "epoch": 0.847375, "grad_norm": 3.15625, "grad_norm_var": 0.018171183268229165, "learning_rate": 0.0001, "loss": 5.4803, "loss/crossentropy": 2.421882748603821, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16052868217229843, "step": 27116 }, { "epoch": 0.8474375, "grad_norm": 3.640625, "grad_norm_var": 0.0386383056640625, "learning_rate": 0.0001, "loss": 5.5062, "loss/crossentropy": 2.4428123235702515, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16220122575759888, "step": 27118 }, { "epoch": 0.8475, "grad_norm": 2.96875, "grad_norm_var": 0.0363189697265625, "learning_rate": 0.0001, "loss": 5.3899, "loss/crossentropy": 2.4994590282440186, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1491985023021698, "step": 27120 }, { "epoch": 0.8475625, "grad_norm": 3.03125, "grad_norm_var": 0.03611551920572917, "learning_rate": 0.0001, "loss": 5.9667, "loss/crossentropy": 2.7980575561523438, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16842877864837646, "step": 27122 }, { "epoch": 0.847625, "grad_norm": 3.140625, "grad_norm_var": 0.03232014973958333, "learning_rate": 0.0001, "loss": 5.6818, "loss/crossentropy": 2.566531181335449, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16425713896751404, "step": 27124 }, { "epoch": 0.8476875, "grad_norm": 2.953125, "grad_norm_var": 0.039839680989583334, "learning_rate": 0.0001, "loss": 5.1265, "loss/crossentropy": 2.329495072364807, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.14141619950532913, "step": 27126 }, { "epoch": 0.84775, "grad_norm": 3.171875, "grad_norm_var": 0.04466044108072917, "learning_rate": 0.0001, "loss": 5.5252, "loss/crossentropy": 2.5107333660125732, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15886925905942917, "step": 27128 }, { "epoch": 0.8478125, "grad_norm": 2.921875, "grad_norm_var": 0.048111979166666666, "learning_rate": 0.0001, "loss": 5.5582, "loss/crossentropy": 2.5598950386047363, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15256302058696747, "step": 27130 }, { "epoch": 0.847875, "grad_norm": 3.328125, "grad_norm_var": 0.04785868326822917, "learning_rate": 0.0001, "loss": 5.7948, "loss/crossentropy": 2.7138450145721436, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15848314762115479, "step": 27132 }, { "epoch": 0.8479375, "grad_norm": 3.140625, "grad_norm_var": 0.03396809895833333, "learning_rate": 0.0001, "loss": 5.8073, "loss/crossentropy": 2.602906346321106, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17277860641479492, "step": 27134 }, { "epoch": 0.848, "grad_norm": 3.03125, "grad_norm_var": 0.03310445149739583, "learning_rate": 0.0001, "loss": 5.6494, "loss/crossentropy": 2.6044774055480957, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16113029420375824, "step": 27136 }, { "epoch": 0.8480625, "grad_norm": 2.921875, "grad_norm_var": 0.03605855305989583, "learning_rate": 0.0001, "loss": 5.5735, "loss/crossentropy": 2.5667667388916016, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15926573425531387, "step": 27138 }, { "epoch": 0.848125, "grad_norm": 2.984375, "grad_norm_var": 0.038313802083333334, "learning_rate": 0.0001, "loss": 5.6048, "loss/crossentropy": 2.5968477725982666, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15938495099544525, "step": 27140 }, { "epoch": 0.8481875, "grad_norm": 3.203125, "grad_norm_var": 0.030760701497395834, "learning_rate": 0.0001, "loss": 5.7919, "loss/crossentropy": 2.604135513305664, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16956109553575516, "step": 27142 }, { "epoch": 0.84825, "grad_norm": 2.859375, "grad_norm_var": 0.031201171875, "learning_rate": 0.0001, "loss": 5.5577, "loss/crossentropy": 2.6035202741622925, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1543988510966301, "step": 27144 }, { "epoch": 0.8483125, "grad_norm": 3.25, "grad_norm_var": 0.03583577473958333, "learning_rate": 0.0001, "loss": 5.9122, "loss/crossentropy": 2.7238636016845703, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16961700469255447, "step": 27146 }, { "epoch": 0.848375, "grad_norm": 3.03125, "grad_norm_var": 0.03459879557291667, "learning_rate": 0.0001, "loss": 5.6259, "loss/crossentropy": 2.596389889717102, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15646573901176453, "step": 27148 }, { "epoch": 0.8484375, "grad_norm": 3.375, "grad_norm_var": 0.03264567057291667, "learning_rate": 0.0001, "loss": 6.0514, "loss/crossentropy": 2.76579213142395, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17856386303901672, "step": 27150 }, { "epoch": 0.8485, "grad_norm": 3.078125, "grad_norm_var": 0.032698567708333334, "learning_rate": 0.0001, "loss": 5.6132, "loss/crossentropy": 2.6267541646957397, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15645737200975418, "step": 27152 }, { "epoch": 0.8485625, "grad_norm": 3.203125, "grad_norm_var": 0.0305084228515625, "learning_rate": 0.0001, "loss": 5.5019, "loss/crossentropy": 2.4569544792175293, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15918374061584473, "step": 27154 }, { "epoch": 0.848625, "grad_norm": 3.25, "grad_norm_var": 0.03181050618489583, "learning_rate": 0.0001, "loss": 5.7746, "loss/crossentropy": 2.654120922088623, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16556436568498611, "step": 27156 }, { "epoch": 0.8486875, "grad_norm": 3.25, "grad_norm_var": 0.03177083333333333, "learning_rate": 0.0001, "loss": 5.6566, "loss/crossentropy": 2.6050009727478027, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16062700748443604, "step": 27158 }, { "epoch": 0.84875, "grad_norm": 3.140625, "grad_norm_var": 0.019950358072916667, "learning_rate": 0.0001, "loss": 5.5895, "loss/crossentropy": 2.555176854133606, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15890159457921982, "step": 27160 }, { "epoch": 0.8488125, "grad_norm": 3.171875, "grad_norm_var": 0.015555826822916667, "learning_rate": 0.0001, "loss": 5.6621, "loss/crossentropy": 2.5718332529067993, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1644948124885559, "step": 27162 }, { "epoch": 0.848875, "grad_norm": 2.96875, "grad_norm_var": 0.020710245768229166, "learning_rate": 0.0001, "loss": 5.7778, "loss/crossentropy": 2.684635043144226, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1647833213210106, "step": 27164 }, { "epoch": 0.8489375, "grad_norm": 2.828125, "grad_norm_var": 0.025569661458333334, "learning_rate": 0.0001, "loss": 5.5427, "loss/crossentropy": 2.5394362211227417, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15579772740602493, "step": 27166 }, { "epoch": 0.849, "grad_norm": 4.1875, "grad_norm_var": 0.09032796223958334, "learning_rate": 0.0001, "loss": 5.6074, "loss/crossentropy": 2.5032652616500854, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16158964484930038, "step": 27168 }, { "epoch": 0.8490625, "grad_norm": 3.5, "grad_norm_var": 0.097802734375, "learning_rate": 0.0001, "loss": 5.7246, "loss/crossentropy": 2.668478012084961, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1575625315308571, "step": 27170 }, { "epoch": 0.849125, "grad_norm": 3.0625, "grad_norm_var": 0.10175374348958334, "learning_rate": 0.0001, "loss": 5.4272, "loss/crossentropy": 2.40272319316864, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15674276649951935, "step": 27172 }, { "epoch": 0.8491875, "grad_norm": 2.734375, "grad_norm_var": 0.1138580322265625, "learning_rate": 0.0001, "loss": 5.4967, "loss/crossentropy": 2.505965232849121, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15415487438440323, "step": 27174 }, { "epoch": 0.84925, "grad_norm": 2.90625, "grad_norm_var": 0.1227447509765625, "learning_rate": 0.0001, "loss": 5.1261, "loss/crossentropy": 2.239875912666321, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1456497609615326, "step": 27176 }, { "epoch": 0.8493125, "grad_norm": 2.96875, "grad_norm_var": 0.1259429931640625, "learning_rate": 0.0001, "loss": 5.8369, "loss/crossentropy": 2.739956498146057, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16242799162864685, "step": 27178 }, { "epoch": 0.849375, "grad_norm": 2.8125, "grad_norm_var": 0.12740478515625, "learning_rate": 0.0001, "loss": 5.671, "loss/crossentropy": 2.7142287492752075, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1542678400874138, "step": 27180 }, { "epoch": 0.8494375, "grad_norm": 2.9375, "grad_norm_var": 0.1268707275390625, "learning_rate": 0.0001, "loss": 5.4992, "loss/crossentropy": 2.5635247230529785, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1521565392613411, "step": 27182 }, { "epoch": 0.8495, "grad_norm": 3.125, "grad_norm_var": 0.039061482747395834, "learning_rate": 0.0001, "loss": 5.7256, "loss/crossentropy": 2.661946415901184, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16300630569458008, "step": 27184 }, { "epoch": 0.8495625, "grad_norm": 3.0625, "grad_norm_var": 0.021061197916666666, "learning_rate": 0.0001, "loss": 5.6889, "loss/crossentropy": 2.6285845041275024, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15954901278018951, "step": 27186 }, { "epoch": 0.849625, "grad_norm": 3.078125, "grad_norm_var": 0.0217437744140625, "learning_rate": 0.0001, "loss": 5.5159, "loss/crossentropy": 2.4490103721618652, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16216076165437698, "step": 27188 }, { "epoch": 0.8496875, "grad_norm": 3.21875, "grad_norm_var": 0.029069010416666666, "learning_rate": 0.0001, "loss": 5.69, "loss/crossentropy": 2.5842500925064087, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1687745600938797, "step": 27190 }, { "epoch": 0.84975, "grad_norm": 3.40625, "grad_norm_var": 0.03280843098958333, "learning_rate": 0.0001, "loss": 5.6734, "loss/crossentropy": 2.562857985496521, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16535142809152603, "step": 27192 }, { "epoch": 0.8498125, "grad_norm": 3.25, "grad_norm_var": 0.044041951497395836, "learning_rate": 0.0001, "loss": 5.8665, "loss/crossentropy": 2.6680439710617065, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1710200533270836, "step": 27194 }, { "epoch": 0.849875, "grad_norm": 3.015625, "grad_norm_var": 0.0418853759765625, "learning_rate": 0.0001, "loss": 5.2359, "loss/crossentropy": 2.295186996459961, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15266850590705872, "step": 27196 }, { "epoch": 0.8499375, "grad_norm": 3.078125, "grad_norm_var": 0.038863118489583334, "learning_rate": 0.0001, "loss": 5.7304, "loss/crossentropy": 2.696526050567627, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15924210846424103, "step": 27198 }, { "epoch": 0.85, "grad_norm": 3.28125, "grad_norm_var": 0.042389933268229166, "learning_rate": 0.0001, "loss": 5.5092, "loss/crossentropy": 2.518649458885193, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15920644998550415, "step": 27200 }, { "epoch": 0.8500625, "grad_norm": 2.96875, "grad_norm_var": 0.04246317545572917, "learning_rate": 0.0001, "loss": 5.5849, "loss/crossentropy": 2.47367525100708, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16385667771100998, "step": 27202 }, { "epoch": 0.850125, "grad_norm": 3.078125, "grad_norm_var": 0.04265034993489583, "learning_rate": 0.0001, "loss": 5.6853, "loss/crossentropy": 2.605410099029541, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16306568682193756, "step": 27204 }, { "epoch": 0.8501875, "grad_norm": 2.953125, "grad_norm_var": 0.04045308430989583, "learning_rate": 0.0001, "loss": 5.6113, "loss/crossentropy": 2.5274053812026978, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16502834856510162, "step": 27206 }, { "epoch": 0.85025, "grad_norm": 3.0, "grad_norm_var": 0.03517964680989583, "learning_rate": 0.0001, "loss": 5.6351, "loss/crossentropy": 2.5945483446121216, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15679188072681427, "step": 27208 }, { "epoch": 0.8503125, "grad_norm": 3.1875, "grad_norm_var": 0.0193023681640625, "learning_rate": 0.0001, "loss": 5.7849, "loss/crossentropy": 2.6564905643463135, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16557051241397858, "step": 27210 }, { "epoch": 0.850375, "grad_norm": 3.21875, "grad_norm_var": 0.019498697916666665, "learning_rate": 0.0001, "loss": 6.0189, "loss/crossentropy": 2.771043300628662, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17595846205949783, "step": 27212 }, { "epoch": 0.8504375, "grad_norm": 2.90625, "grad_norm_var": 0.01832275390625, "learning_rate": 0.0001, "loss": 5.6694, "loss/crossentropy": 2.6642444133758545, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15481116622686386, "step": 27214 }, { "epoch": 0.8505, "grad_norm": 3.328125, "grad_norm_var": 0.0170806884765625, "learning_rate": 0.0001, "loss": 5.733, "loss/crossentropy": 2.5549402236938477, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16897690296173096, "step": 27216 }, { "epoch": 0.8505625, "grad_norm": 2.921875, "grad_norm_var": 0.0191558837890625, "learning_rate": 0.0001, "loss": 5.2818, "loss/crossentropy": 2.38115918636322, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15022436529397964, "step": 27218 }, { "epoch": 0.850625, "grad_norm": 3.09375, "grad_norm_var": 0.021219889322916668, "learning_rate": 0.0001, "loss": 6.0002, "loss/crossentropy": 2.7196956872940063, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17727024108171463, "step": 27220 }, { "epoch": 0.8506875, "grad_norm": 3.09375, "grad_norm_var": 0.018680826822916666, "learning_rate": 0.0001, "loss": 5.5349, "loss/crossentropy": 2.497388243675232, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16116993129253387, "step": 27222 }, { "epoch": 0.85075, "grad_norm": 3.25, "grad_norm_var": 0.017381795247395835, "learning_rate": 0.0001, "loss": 5.5134, "loss/crossentropy": 2.506207227706909, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15735848993062973, "step": 27224 }, { "epoch": 0.8508125, "grad_norm": 3.046875, "grad_norm_var": 0.018538411458333334, "learning_rate": 0.0001, "loss": 5.5864, "loss/crossentropy": 2.51308536529541, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16006861627101898, "step": 27226 }, { "epoch": 0.850875, "grad_norm": 3.21875, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 5.6591, "loss/crossentropy": 2.569067597389221, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16251851618289948, "step": 27228 }, { "epoch": 0.8509375, "grad_norm": 3.109375, "grad_norm_var": 0.018220011393229166, "learning_rate": 0.0001, "loss": 5.8602, "loss/crossentropy": 2.6755404472351074, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17042145133018494, "step": 27230 }, { "epoch": 0.851, "grad_norm": 3.25, "grad_norm_var": 0.0253814697265625, "learning_rate": 0.0001, "loss": 5.5924, "loss/crossentropy": 2.5000529289245605, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16197387874126434, "step": 27232 }, { "epoch": 0.8510625, "grad_norm": 2.875, "grad_norm_var": 0.0294921875, "learning_rate": 0.0001, "loss": 5.6575, "loss/crossentropy": 2.6328113079071045, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16066823154687881, "step": 27234 }, { "epoch": 0.851125, "grad_norm": 2.890625, "grad_norm_var": 0.02861328125, "learning_rate": 0.0001, "loss": 5.6378, "loss/crossentropy": 2.6340949535369873, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1554490625858307, "step": 27236 }, { "epoch": 0.8511875, "grad_norm": 3.078125, "grad_norm_var": 0.028783162434895832, "learning_rate": 0.0001, "loss": 5.4258, "loss/crossentropy": 2.425517439842224, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1527608558535576, "step": 27238 }, { "epoch": 0.85125, "grad_norm": 2.953125, "grad_norm_var": 0.02388916015625, "learning_rate": 0.0001, "loss": 5.5994, "loss/crossentropy": 2.536966323852539, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.15976081043481827, "step": 27240 }, { "epoch": 0.8513125, "grad_norm": 3.125, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 5.5573, "loss/crossentropy": 2.535452723503113, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15843620896339417, "step": 27242 }, { "epoch": 0.851375, "grad_norm": 3.078125, "grad_norm_var": 0.018285115559895832, "learning_rate": 0.0001, "loss": 5.674, "loss/crossentropy": 2.550739884376526, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16505959630012512, "step": 27244 }, { "epoch": 0.8514375, "grad_norm": 3.53125, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 5.8234, "loss/crossentropy": 2.652758240699768, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17096707224845886, "step": 27246 }, { "epoch": 0.8515, "grad_norm": 3.140625, "grad_norm_var": 0.03351949055989583, "learning_rate": 0.0001, "loss": 5.5629, "loss/crossentropy": 2.5765405893325806, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15645325928926468, "step": 27248 }, { "epoch": 0.8515625, "grad_norm": 3.1875, "grad_norm_var": 0.030182902018229166, "learning_rate": 0.0001, "loss": 5.894, "loss/crossentropy": 2.749528646469116, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16600635647773743, "step": 27250 }, { "epoch": 0.851625, "grad_norm": 2.9375, "grad_norm_var": 0.029166666666666667, "learning_rate": 0.0001, "loss": 5.3832, "loss/crossentropy": 2.4195252656936646, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15183337777853012, "step": 27252 }, { "epoch": 0.8516875, "grad_norm": 3.234375, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 5.7519, "loss/crossentropy": 2.588433623313904, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16674208641052246, "step": 27254 }, { "epoch": 0.85175, "grad_norm": 3.078125, "grad_norm_var": 0.0295806884765625, "learning_rate": 0.0001, "loss": 5.6704, "loss/crossentropy": 2.6264848709106445, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16220723092556, "step": 27256 }, { "epoch": 0.8518125, "grad_norm": 2.984375, "grad_norm_var": 0.03189697265625, "learning_rate": 0.0001, "loss": 5.6843, "loss/crossentropy": 2.5776803493499756, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16535252332687378, "step": 27258 }, { "epoch": 0.851875, "grad_norm": 3.34375, "grad_norm_var": 0.045458984375, "learning_rate": 0.0001, "loss": 5.6823, "loss/crossentropy": 2.604082226753235, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16173037886619568, "step": 27260 }, { "epoch": 0.8519375, "grad_norm": 2.828125, "grad_norm_var": 0.034684244791666666, "learning_rate": 0.0001, "loss": 5.6682, "loss/crossentropy": 2.6651251316070557, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15889926254749298, "step": 27262 }, { "epoch": 0.852, "grad_norm": 3.125, "grad_norm_var": 0.0301177978515625, "learning_rate": 0.0001, "loss": 5.3971, "loss/crossentropy": 2.373060882091522, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15943298488855362, "step": 27264 }, { "epoch": 0.8520625, "grad_norm": 2.921875, "grad_norm_var": 0.05545247395833333, "learning_rate": 0.0001, "loss": 5.702, "loss/crossentropy": 2.537823438644409, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.17266496270895004, "step": 27266 }, { "epoch": 0.852125, "grad_norm": 4.4375, "grad_norm_var": 0.1651275634765625, "learning_rate": 0.0001, "loss": 5.6176, "loss/crossentropy": 2.5497862100601196, "loss/hidden": 1.52734375, "loss/jsd": 0.0, "loss/logits": 0.15404419600963593, "step": 27268 }, { "epoch": 0.8521875, "grad_norm": 3.03125, "grad_norm_var": 0.16867574055989584, "learning_rate": 0.0001, "loss": 5.5756, "loss/crossentropy": 2.4981300830841064, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1604846939444542, "step": 27270 }, { "epoch": 0.85225, "grad_norm": 2.9375, "grad_norm_var": 0.17483317057291667, "learning_rate": 0.0001, "loss": 5.4375, "loss/crossentropy": 2.4712727069854736, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15443594008684158, "step": 27272 }, { "epoch": 0.8523125, "grad_norm": 3.359375, "grad_norm_var": 0.17137044270833332, "learning_rate": 0.0001, "loss": 5.6358, "loss/crossentropy": 2.556638479232788, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16026213765144348, "step": 27274 }, { "epoch": 0.852375, "grad_norm": 3.078125, "grad_norm_var": 0.15822652180989583, "learning_rate": 0.0001, "loss": 5.5691, "loss/crossentropy": 2.5130996704101562, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.15638145804405212, "step": 27276 }, { "epoch": 0.8524375, "grad_norm": 2.828125, "grad_norm_var": 0.15387369791666666, "learning_rate": 0.0001, "loss": 5.7646, "loss/crossentropy": 2.7054131031036377, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16138488054275513, "step": 27278 }, { "epoch": 0.8525, "grad_norm": 3.234375, "grad_norm_var": 0.15914306640625, "learning_rate": 0.0001, "loss": 5.8122, "loss/crossentropy": 2.655655264854431, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16917280107736588, "step": 27280 }, { "epoch": 0.8525625, "grad_norm": 2.921875, "grad_norm_var": 0.14888916015625, "learning_rate": 0.0001, "loss": 5.4202, "loss/crossentropy": 2.392878293991089, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1562449336051941, "step": 27282 }, { "epoch": 0.852625, "grad_norm": 2.703125, "grad_norm_var": 0.0577789306640625, "learning_rate": 0.0001, "loss": 5.3767, "loss/crossentropy": 2.54988694190979, "loss/hidden": 1.3671875, "loss/jsd": 0.0, "loss/logits": 0.14596494287252426, "step": 27284 }, { "epoch": 0.8526875, "grad_norm": 2.75, "grad_norm_var": 0.054133097330729164, "learning_rate": 0.0001, "loss": 5.3467, "loss/crossentropy": 2.472472071647644, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.1471906527876854, "step": 27286 }, { "epoch": 0.85275, "grad_norm": 3.171875, "grad_norm_var": 0.058268229166666664, "learning_rate": 0.0001, "loss": 5.504, "loss/crossentropy": 2.4292874336242676, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16098542511463165, "step": 27288 }, { "epoch": 0.8528125, "grad_norm": 2.96875, "grad_norm_var": 0.05670166015625, "learning_rate": 0.0001, "loss": 5.8281, "loss/crossentropy": 2.7193890810012817, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16711793839931488, "step": 27290 }, { "epoch": 0.852875, "grad_norm": 3.140625, "grad_norm_var": 0.0574615478515625, "learning_rate": 0.0001, "loss": 5.5502, "loss/crossentropy": 2.527142882347107, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15425408631563187, "step": 27292 }, { "epoch": 0.8529375, "grad_norm": 3.421875, "grad_norm_var": 0.0751129150390625, "learning_rate": 0.0001, "loss": 5.8299, "loss/crossentropy": 2.71413791179657, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16352686285972595, "step": 27294 }, { "epoch": 0.853, "grad_norm": 3.390625, "grad_norm_var": 0.07781575520833334, "learning_rate": 0.0001, "loss": 5.8199, "loss/crossentropy": 2.669779896736145, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.17087218165397644, "step": 27296 }, { "epoch": 0.8530625, "grad_norm": 2.90625, "grad_norm_var": 0.07008056640625, "learning_rate": 0.0001, "loss": 5.4575, "loss/crossentropy": 2.4028143882751465, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16015125811100006, "step": 27298 }, { "epoch": 0.853125, "grad_norm": 2.9375, "grad_norm_var": 0.0460845947265625, "learning_rate": 0.0001, "loss": 5.5584, "loss/crossentropy": 2.5087332725524902, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16199709475040436, "step": 27300 }, { "epoch": 0.8531875, "grad_norm": 3.234375, "grad_norm_var": 0.036214192708333336, "learning_rate": 0.0001, "loss": 5.3389, "loss/crossentropy": 2.3470911979675293, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15582001954317093, "step": 27302 }, { "epoch": 0.85325, "grad_norm": 3.40625, "grad_norm_var": 0.042073567708333336, "learning_rate": 0.0001, "loss": 5.7984, "loss/crossentropy": 2.6537342071533203, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16993840038776398, "step": 27304 }, { "epoch": 0.8533125, "grad_norm": 3.140625, "grad_norm_var": 0.037873331705729166, "learning_rate": 0.0001, "loss": 5.3756, "loss/crossentropy": 2.4142991304397583, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15550129860639572, "step": 27306 }, { "epoch": 0.853375, "grad_norm": 3.125, "grad_norm_var": 0.03782450358072917, "learning_rate": 0.0001, "loss": 5.794, "loss/crossentropy": 2.6065648794174194, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.16835353523492813, "step": 27308 }, { "epoch": 0.8534375, "grad_norm": 3.09375, "grad_norm_var": 0.03922119140625, "learning_rate": 0.0001, "loss": 5.3525, "loss/crossentropy": 2.394999146461487, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.14965371787548065, "step": 27310 }, { "epoch": 0.8535, "grad_norm": 2.875, "grad_norm_var": 0.03906962076822917, "learning_rate": 0.0001, "loss": 5.536, "loss/crossentropy": 2.5364619493484497, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1577666699886322, "step": 27312 }, { "epoch": 0.8535625, "grad_norm": 2.984375, "grad_norm_var": 0.04576822916666667, "learning_rate": 0.0001, "loss": 5.8537, "loss/crossentropy": 2.6952576637268066, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16740810126066208, "step": 27314 }, { "epoch": 0.853625, "grad_norm": 3.3125, "grad_norm_var": 0.043635050455729164, "learning_rate": 0.0001, "loss": 5.8972, "loss/crossentropy": 2.6425116062164307, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.17507479339838028, "step": 27316 }, { "epoch": 0.8536875, "grad_norm": 3.203125, "grad_norm_var": 0.04394124348958333, "learning_rate": 0.0001, "loss": 5.7404, "loss/crossentropy": 2.5958350896835327, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1687539964914322, "step": 27318 }, { "epoch": 0.85375, "grad_norm": 2.609375, "grad_norm_var": 0.056966145833333336, "learning_rate": 0.0001, "loss": 5.0976, "loss/crossentropy": 2.2704073190689087, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1381903663277626, "step": 27320 }, { "epoch": 0.8538125, "grad_norm": 3.109375, "grad_norm_var": 0.05510660807291667, "learning_rate": 0.0001, "loss": 5.3793, "loss/crossentropy": 2.3639668226242065, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1558261588215828, "step": 27322 }, { "epoch": 0.853875, "grad_norm": 3.40625, "grad_norm_var": 8.250065104166667, "learning_rate": 0.0001, "loss": 5.5607, "loss/crossentropy": 2.356520652770996, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17158974707126617, "step": 27324 }, { "epoch": 0.8539375, "grad_norm": 3.328125, "grad_norm_var": 8.179377237955729, "learning_rate": 0.0001, "loss": 6.0781, "loss/crossentropy": 2.8043962717056274, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.177764892578125, "step": 27326 }, { "epoch": 0.854, "grad_norm": 2.84375, "grad_norm_var": 8.200804646809896, "learning_rate": 0.0001, "loss": 5.4275, "loss/crossentropy": 2.44600248336792, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15322712063789368, "step": 27328 }, { "epoch": 0.8540625, "grad_norm": 3.109375, "grad_norm_var": 8.213004557291667, "learning_rate": 0.0001, "loss": 5.6881, "loss/crossentropy": 2.5942888259887695, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16602341830730438, "step": 27330 }, { "epoch": 0.854125, "grad_norm": 2.90625, "grad_norm_var": 8.2279296875, "learning_rate": 0.0001, "loss": 5.4666, "loss/crossentropy": 2.3722946643829346, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.1594339907169342, "step": 27332 }, { "epoch": 0.8541875, "grad_norm": 2.828125, "grad_norm_var": 8.229181925455729, "learning_rate": 0.0001, "loss": 5.5654, "loss/crossentropy": 2.539860963821411, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15685267001390457, "step": 27334 }, { "epoch": 0.85425, "grad_norm": 3.25, "grad_norm_var": 8.126528930664062, "learning_rate": 0.0001, "loss": 5.5007, "loss/crossentropy": 2.429807662963867, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16490648686885834, "step": 27336 }, { "epoch": 0.8543125, "grad_norm": 3.28125, "grad_norm_var": 8.13121337890625, "learning_rate": 0.0001, "loss": 5.7599, "loss/crossentropy": 2.6894830465316772, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16133538633584976, "step": 27338 }, { "epoch": 0.854375, "grad_norm": 2.875, "grad_norm_var": 0.046320597330729164, "learning_rate": 0.0001, "loss": 5.709, "loss/crossentropy": 2.6138765811920166, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16263822466135025, "step": 27340 }, { "epoch": 0.8544375, "grad_norm": 2.953125, "grad_norm_var": 0.040185546875, "learning_rate": 0.0001, "loss": 6.016, "loss/crossentropy": 2.7757010459899902, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.17793571949005127, "step": 27342 }, { "epoch": 0.8545, "grad_norm": 3.15625, "grad_norm_var": 0.0419097900390625, "learning_rate": 0.0001, "loss": 5.25, "loss/crossentropy": 2.3976263999938965, "loss/hidden": 1.37890625, "loss/jsd": 0.0, "loss/logits": 0.1473466232419014, "step": 27344 }, { "epoch": 0.8545625, "grad_norm": 3.515625, "grad_norm_var": 0.05858968098958333, "learning_rate": 0.0001, "loss": 5.6017, "loss/crossentropy": 2.5026453733444214, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16615450382232666, "step": 27346 }, { "epoch": 0.854625, "grad_norm": 3.03125, "grad_norm_var": 0.0456451416015625, "learning_rate": 0.0001, "loss": 5.6635, "loss/crossentropy": 2.5777475833892822, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1613086760044098, "step": 27348 }, { "epoch": 0.8546875, "grad_norm": 2.859375, "grad_norm_var": 0.1615234375, "learning_rate": 0.0001, "loss": 5.2415, "loss/crossentropy": 2.213112473487854, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1590876430273056, "step": 27350 }, { "epoch": 0.85475, "grad_norm": 3.90625, "grad_norm_var": 0.20972900390625, "learning_rate": 0.0001, "loss": 5.4069, "loss/crossentropy": 2.318873167037964, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1599702313542366, "step": 27352 }, { "epoch": 0.8548125, "grad_norm": 3.015625, "grad_norm_var": 0.20634358723958332, "learning_rate": 0.0001, "loss": 5.8637, "loss/crossentropy": 2.720026969909668, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16748729348182678, "step": 27354 }, { "epoch": 0.854875, "grad_norm": 3.265625, "grad_norm_var": 0.20331624348958333, "learning_rate": 0.0001, "loss": 5.7398, "loss/crossentropy": 2.6489644050598145, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16337642818689346, "step": 27356 }, { "epoch": 0.8549375, "grad_norm": 3.578125, "grad_norm_var": 0.20748697916666667, "learning_rate": 0.0001, "loss": 5.9554, "loss/crossentropy": 2.705081582069397, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1777697503566742, "step": 27358 }, { "epoch": 0.855, "grad_norm": 2.96875, "grad_norm_var": 0.198046875, "learning_rate": 0.0001, "loss": 5.7981, "loss/crossentropy": 2.6675941944122314, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16773658990859985, "step": 27360 }, { "epoch": 0.8550625, "grad_norm": 3.75, "grad_norm_var": 0.19925130208333333, "learning_rate": 0.0001, "loss": 5.967, "loss/crossentropy": 2.6779096126556396, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.17695285379886627, "step": 27362 }, { "epoch": 0.855125, "grad_norm": 3.03125, "grad_norm_var": 0.19638264973958333, "learning_rate": 0.0001, "loss": 5.6363, "loss/crossentropy": 2.5565619468688965, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1642194539308548, "step": 27364 }, { "epoch": 0.8551875, "grad_norm": 5.03125, "grad_norm_var": 0.2955403645833333, "learning_rate": 0.0001, "loss": 5.9018, "loss/crossentropy": 2.591494917869568, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.1802462637424469, "step": 27366 }, { "epoch": 0.85525, "grad_norm": 3.03125, "grad_norm_var": 0.2580362955729167, "learning_rate": 0.0001, "loss": 5.5429, "loss/crossentropy": 2.496594190597534, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16243939846754074, "step": 27368 }, { "epoch": 0.8553125, "grad_norm": 3.125, "grad_norm_var": 0.2589670817057292, "learning_rate": 0.0001, "loss": 5.6632, "loss/crossentropy": 2.5484431982040405, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16187050938606262, "step": 27370 }, { "epoch": 0.855375, "grad_norm": 2.84375, "grad_norm_var": 0.2725494384765625, "learning_rate": 0.0001, "loss": 5.2668, "loss/crossentropy": 2.3581648468971252, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1471143513917923, "step": 27372 }, { "epoch": 0.8554375, "grad_norm": 2.953125, "grad_norm_var": 0.348681640625, "learning_rate": 0.0001, "loss": 5.5454, "loss/crossentropy": 2.467699885368347, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1663680225610733, "step": 27374 }, { "epoch": 0.8555, "grad_norm": 2.953125, "grad_norm_var": 0.35290425618489585, "learning_rate": 0.0001, "loss": 5.4983, "loss/crossentropy": 2.4543418884277344, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1583036109805107, "step": 27376 }, { "epoch": 0.8555625, "grad_norm": 3.375, "grad_norm_var": 0.33666890462239585, "learning_rate": 0.0001, "loss": 5.8463, "loss/crossentropy": 2.632842183113098, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17212244868278503, "step": 27378 }, { "epoch": 0.855625, "grad_norm": 3.296875, "grad_norm_var": 0.33943583170572916, "learning_rate": 0.0001, "loss": 5.4097, "loss/crossentropy": 2.404983639717102, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15515553206205368, "step": 27380 }, { "epoch": 0.8556875, "grad_norm": 3.125, "grad_norm_var": 0.12312825520833333, "learning_rate": 0.0001, "loss": 5.2698, "loss/crossentropy": 2.2714738845825195, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15646881610155106, "step": 27382 }, { "epoch": 0.85575, "grad_norm": 3.078125, "grad_norm_var": 0.12574869791666668, "learning_rate": 0.0001, "loss": 5.9026, "loss/crossentropy": 2.703365683555603, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.17109481990337372, "step": 27384 }, { "epoch": 0.8558125, "grad_norm": 3.140625, "grad_norm_var": 0.12421773274739584, "learning_rate": 0.0001, "loss": 5.8108, "loss/crossentropy": 2.725627303123474, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16242322325706482, "step": 27386 }, { "epoch": 0.855875, "grad_norm": 2.9375, "grad_norm_var": 0.11724344889322917, "learning_rate": 0.0001, "loss": 5.445, "loss/crossentropy": 2.445341467857361, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15934404730796814, "step": 27388 }, { "epoch": 0.8559375, "grad_norm": 3.03125, "grad_norm_var": 0.024735514322916666, "learning_rate": 0.0001, "loss": 5.5114, "loss/crossentropy": 2.426816940307617, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.161977618932724, "step": 27390 }, { "epoch": 0.856, "grad_norm": 3.34375, "grad_norm_var": 0.0250152587890625, "learning_rate": 0.0001, "loss": 5.7785, "loss/crossentropy": 2.6386823654174805, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16749393939971924, "step": 27392 }, { "epoch": 0.8560625, "grad_norm": 3.046875, "grad_norm_var": 0.0224517822265625, "learning_rate": 0.0001, "loss": 5.9373, "loss/crossentropy": 2.7545653581619263, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17257516831159592, "step": 27394 }, { "epoch": 0.856125, "grad_norm": 2.875, "grad_norm_var": 0.02451171875, "learning_rate": 0.0001, "loss": 5.7075, "loss/crossentropy": 2.6612977981567383, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1636067032814026, "step": 27396 }, { "epoch": 0.8561875, "grad_norm": 2.84375, "grad_norm_var": 0.03731180826822917, "learning_rate": 0.0001, "loss": 5.3092, "loss/crossentropy": 2.444177985191345, "loss/hidden": 1.375, "loss/jsd": 0.0, "loss/logits": 0.1489994302392006, "step": 27398 }, { "epoch": 0.85625, "grad_norm": 3.078125, "grad_norm_var": 0.02828369140625, "learning_rate": 0.0001, "loss": 5.2391, "loss/crossentropy": 2.3072571754455566, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15334443002939224, "step": 27400 }, { "epoch": 0.8563125, "grad_norm": 3.171875, "grad_norm_var": 0.028153483072916666, "learning_rate": 0.0001, "loss": 5.7229, "loss/crossentropy": 2.60724675655365, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16507958620786667, "step": 27402 }, { "epoch": 0.856375, "grad_norm": 3.03125, "grad_norm_var": 0.0283355712890625, "learning_rate": 0.0001, "loss": 5.5167, "loss/crossentropy": 2.4488946199417114, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16186027228832245, "step": 27404 }, { "epoch": 0.8564375, "grad_norm": 2.890625, "grad_norm_var": 0.0293609619140625, "learning_rate": 0.0001, "loss": 5.6154, "loss/crossentropy": 2.5921465158462524, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15896372497081757, "step": 27406 }, { "epoch": 0.8565, "grad_norm": 3.125, "grad_norm_var": 0.022801717122395832, "learning_rate": 0.0001, "loss": 5.8056, "loss/crossentropy": 2.7032735347747803, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16569754481315613, "step": 27408 }, { "epoch": 0.8565625, "grad_norm": 3.296875, "grad_norm_var": 0.024137369791666665, "learning_rate": 0.0001, "loss": 5.3409, "loss/crossentropy": 2.3539106845855713, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15299847722053528, "step": 27410 }, { "epoch": 0.856625, "grad_norm": 2.921875, "grad_norm_var": 0.026627604166666666, "learning_rate": 0.0001, "loss": 5.541, "loss/crossentropy": 2.4986627101898193, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16048026829957962, "step": 27412 }, { "epoch": 0.8566875, "grad_norm": 3.171875, "grad_norm_var": 0.018257649739583333, "learning_rate": 0.0001, "loss": 5.537, "loss/crossentropy": 2.5085893869400024, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15752866864204407, "step": 27414 }, { "epoch": 0.85675, "grad_norm": 3.625, "grad_norm_var": 0.0424224853515625, "learning_rate": 0.0001, "loss": 5.4293, "loss/crossentropy": 2.3147988319396973, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16184009611606598, "step": 27416 }, { "epoch": 0.8568125, "grad_norm": 3.265625, "grad_norm_var": 0.04510091145833333, "learning_rate": 0.0001, "loss": 5.7798, "loss/crossentropy": 2.7156347036361694, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16266556829214096, "step": 27418 }, { "epoch": 0.856875, "grad_norm": 3.265625, "grad_norm_var": 0.0453277587890625, "learning_rate": 0.0001, "loss": 5.7641, "loss/crossentropy": 2.704028010368347, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1587410494685173, "step": 27420 }, { "epoch": 0.8569375, "grad_norm": 2.765625, "grad_norm_var": 0.04970601399739583, "learning_rate": 0.0001, "loss": 5.2174, "loss/crossentropy": 2.2957347631454468, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.15114806592464447, "step": 27422 }, { "epoch": 0.857, "grad_norm": 2.984375, "grad_norm_var": 0.06676025390625, "learning_rate": 0.0001, "loss": 5.4437, "loss/crossentropy": 2.395915150642395, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1614210456609726, "step": 27424 }, { "epoch": 0.8570625, "grad_norm": 2.9375, "grad_norm_var": 0.06992899576822917, "learning_rate": 0.0001, "loss": 5.9027, "loss/crossentropy": 2.7567888498306274, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16459564864635468, "step": 27426 }, { "epoch": 0.857125, "grad_norm": 2.890625, "grad_norm_var": 0.07231343587239583, "learning_rate": 0.0001, "loss": 5.4217, "loss/crossentropy": 2.4296735525131226, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15857718884944916, "step": 27428 }, { "epoch": 0.8571875, "grad_norm": 3.03125, "grad_norm_var": 0.06971028645833334, "learning_rate": 0.0001, "loss": 5.8225, "loss/crossentropy": 2.723418951034546, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16186387836933136, "step": 27430 }, { "epoch": 0.85725, "grad_norm": 3.234375, "grad_norm_var": 0.04876302083333333, "learning_rate": 0.0001, "loss": 5.795, "loss/crossentropy": 2.605363130569458, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17169761657714844, "step": 27432 }, { "epoch": 0.8573125, "grad_norm": 3.234375, "grad_norm_var": 0.04739176432291667, "learning_rate": 0.0001, "loss": 5.399, "loss/crossentropy": 2.381725788116455, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15485329180955887, "step": 27434 }, { "epoch": 0.857375, "grad_norm": 3.109375, "grad_norm_var": 0.047459920247395836, "learning_rate": 0.0001, "loss": 5.6726, "loss/crossentropy": 2.600713849067688, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16461395472288132, "step": 27436 }, { "epoch": 0.8574375, "grad_norm": 3.0, "grad_norm_var": 0.04155171712239583, "learning_rate": 0.0001, "loss": 5.623, "loss/crossentropy": 2.550519824028015, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16271229088306427, "step": 27438 }, { "epoch": 0.8575, "grad_norm": 3.125, "grad_norm_var": 0.027229817708333333, "learning_rate": 0.0001, "loss": 5.4119, "loss/crossentropy": 2.4456595182418823, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15131648629903793, "step": 27440 }, { "epoch": 0.8575625, "grad_norm": 2.921875, "grad_norm_var": 0.0182525634765625, "learning_rate": 0.0001, "loss": 5.4175, "loss/crossentropy": 2.394680976867676, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1585310995578766, "step": 27442 }, { "epoch": 0.857625, "grad_norm": 2.859375, "grad_norm_var": 0.019383748372395832, "learning_rate": 0.0001, "loss": 5.4036, "loss/crossentropy": 2.468693494796753, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14817877113819122, "step": 27444 }, { "epoch": 0.8576875, "grad_norm": 3.515625, "grad_norm_var": 0.033056640625, "learning_rate": 0.0001, "loss": 5.9536, "loss/crossentropy": 2.7569260597229004, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17396005988121033, "step": 27446 }, { "epoch": 0.85775, "grad_norm": 3.0, "grad_norm_var": 0.0317535400390625, "learning_rate": 0.0001, "loss": 5.6816, "loss/crossentropy": 2.5776615142822266, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16273970156908035, "step": 27448 }, { "epoch": 0.8578125, "grad_norm": 3.890625, "grad_norm_var": 0.073974609375, "learning_rate": 0.0001, "loss": 5.676, "loss/crossentropy": 2.4964919090270996, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16912566870450974, "step": 27450 }, { "epoch": 0.857875, "grad_norm": 3.453125, "grad_norm_var": 0.07975972493489583, "learning_rate": 0.0001, "loss": 5.459, "loss/crossentropy": 2.4694265127182007, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15481796860694885, "step": 27452 }, { "epoch": 0.8579375, "grad_norm": 3.28125, "grad_norm_var": 0.08210347493489584, "learning_rate": 0.0001, "loss": 5.7259, "loss/crossentropy": 2.6806472539901733, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16312392055988312, "step": 27454 }, { "epoch": 0.858, "grad_norm": 2.921875, "grad_norm_var": 0.08254292805989584, "learning_rate": 0.0001, "loss": 5.3031, "loss/crossentropy": 2.4080389738082886, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.15122409909963608, "step": 27456 }, { "epoch": 0.8580625, "grad_norm": 3.09375, "grad_norm_var": 0.07893880208333333, "learning_rate": 0.0001, "loss": 5.6916, "loss/crossentropy": 2.54345440864563, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16715845465660095, "step": 27458 }, { "epoch": 0.858125, "grad_norm": 3.21875, "grad_norm_var": 0.06770426432291667, "learning_rate": 0.0001, "loss": 5.7702, "loss/crossentropy": 2.634869337081909, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1670462042093277, "step": 27460 }, { "epoch": 0.8581875, "grad_norm": 3.15625, "grad_norm_var": 0.06020406087239583, "learning_rate": 0.0001, "loss": 5.6032, "loss/crossentropy": 2.5240001678466797, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1657312512397766, "step": 27462 }, { "epoch": 0.85825, "grad_norm": 3.015625, "grad_norm_var": 0.06500651041666666, "learning_rate": 0.0001, "loss": 5.5489, "loss/crossentropy": 2.4676438570022583, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16280968487262726, "step": 27464 }, { "epoch": 0.8583125, "grad_norm": 2.8125, "grad_norm_var": 0.043375651041666664, "learning_rate": 0.0001, "loss": 5.8323, "loss/crossentropy": 2.6945706605911255, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16728605329990387, "step": 27466 }, { "epoch": 0.858375, "grad_norm": 3.34375, "grad_norm_var": 0.1019683837890625, "learning_rate": 0.0001, "loss": 6.2779, "loss/crossentropy": 2.962168574333191, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1823587492108345, "step": 27468 }, { "epoch": 0.8584375, "grad_norm": 2.8125, "grad_norm_var": 0.1092437744140625, "learning_rate": 0.0001, "loss": 5.3642, "loss/crossentropy": 2.365685820579529, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1541442573070526, "step": 27470 }, { "epoch": 0.8585, "grad_norm": 3.046875, "grad_norm_var": 0.11240132649739583, "learning_rate": 0.0001, "loss": 5.3879, "loss/crossentropy": 2.463380217552185, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15183178335428238, "step": 27472 }, { "epoch": 0.8585625, "grad_norm": 3.09375, "grad_norm_var": 0.11122639973958333, "learning_rate": 0.0001, "loss": 5.5478, "loss/crossentropy": 2.5058140754699707, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.15732257813215256, "step": 27474 }, { "epoch": 0.858625, "grad_norm": 2.984375, "grad_norm_var": 0.11325581868489583, "learning_rate": 0.0001, "loss": 5.455, "loss/crossentropy": 2.4045562744140625, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16129323840141296, "step": 27476 }, { "epoch": 0.8586875, "grad_norm": 2.90625, "grad_norm_var": 0.1173980712890625, "learning_rate": 0.0001, "loss": 5.6568, "loss/crossentropy": 2.6127243041992188, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16261501610279083, "step": 27478 }, { "epoch": 0.85875, "grad_norm": 3.03125, "grad_norm_var": 0.11155192057291667, "learning_rate": 0.0001, "loss": 5.6026, "loss/crossentropy": 2.579832673072815, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15813258290290833, "step": 27480 }, { "epoch": 0.8588125, "grad_norm": 3.4375, "grad_norm_var": 0.10341389973958333, "learning_rate": 0.0001, "loss": 5.7871, "loss/crossentropy": 2.5788873434066772, "loss/hidden": 1.51171875, "loss/jsd": 0.0, "loss/logits": 0.16964523494243622, "step": 27482 }, { "epoch": 0.858875, "grad_norm": 3.0625, "grad_norm_var": 0.022484334309895833, "learning_rate": 0.0001, "loss": 5.4816, "loss/crossentropy": 2.4420281648635864, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15864020586013794, "step": 27484 }, { "epoch": 0.8589375, "grad_norm": 2.796875, "grad_norm_var": 0.024128214518229166, "learning_rate": 0.0001, "loss": 5.6988, "loss/crossentropy": 2.617542028427124, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16359714418649673, "step": 27486 }, { "epoch": 0.859, "grad_norm": 3.140625, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 5.684, "loss/crossentropy": 2.529650926589966, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16582664102315903, "step": 27488 }, { "epoch": 0.8590625, "grad_norm": 3.609375, "grad_norm_var": 0.046686808268229164, "learning_rate": 0.0001, "loss": 5.5575, "loss/crossentropy": 2.4721691608428955, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16321710497140884, "step": 27490 }, { "epoch": 0.859125, "grad_norm": 2.796875, "grad_norm_var": 0.05162760416666667, "learning_rate": 0.0001, "loss": 5.5237, "loss/crossentropy": 2.4864814281463623, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15645458549261093, "step": 27492 }, { "epoch": 0.8591875, "grad_norm": 2.828125, "grad_norm_var": 0.054052734375, "learning_rate": 0.0001, "loss": 5.3984, "loss/crossentropy": 2.463224768638611, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.15484201908111572, "step": 27494 }, { "epoch": 0.85925, "grad_norm": 3.046875, "grad_norm_var": 0.0532379150390625, "learning_rate": 0.0001, "loss": 5.3393, "loss/crossentropy": 2.3829842805862427, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1557861790060997, "step": 27496 }, { "epoch": 0.8593125, "grad_norm": 3.25, "grad_norm_var": 0.0466461181640625, "learning_rate": 0.0001, "loss": 5.5659, "loss/crossentropy": 2.4582934379577637, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16271311789751053, "step": 27498 }, { "epoch": 0.859375, "grad_norm": 2.953125, "grad_norm_var": 0.0477935791015625, "learning_rate": 0.0001, "loss": 5.2957, "loss/crossentropy": 2.3343403339385986, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15160369873046875, "step": 27500 }, { "epoch": 0.8594375, "grad_norm": 3.25, "grad_norm_var": 0.047118123372395834, "learning_rate": 0.0001, "loss": 5.8696, "loss/crossentropy": 2.621949076652527, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17710597068071365, "step": 27502 }, { "epoch": 0.8595, "grad_norm": 2.703125, "grad_norm_var": 0.05353902180989583, "learning_rate": 0.0001, "loss": 5.6026, "loss/crossentropy": 2.6189295053482056, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15657124668359756, "step": 27504 }, { "epoch": 0.8595625, "grad_norm": 3.28125, "grad_norm_var": 0.035445149739583334, "learning_rate": 0.0001, "loss": 5.238, "loss/crossentropy": 2.364015221595764, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14677252620458603, "step": 27506 }, { "epoch": 0.859625, "grad_norm": 3.0, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 5.9101, "loss/crossentropy": 2.6949862241744995, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17463178932666779, "step": 27508 }, { "epoch": 0.8596875, "grad_norm": 3.0625, "grad_norm_var": 0.033219401041666666, "learning_rate": 0.0001, "loss": 5.8926, "loss/crossentropy": 2.6635549068450928, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17368291318416595, "step": 27510 }, { "epoch": 0.85975, "grad_norm": 3.015625, "grad_norm_var": 0.03497721354166667, "learning_rate": 0.0001, "loss": 5.5182, "loss/crossentropy": 2.4884071350097656, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16000645607709885, "step": 27512 }, { "epoch": 0.8598125, "grad_norm": 2.984375, "grad_norm_var": 0.033524576822916666, "learning_rate": 0.0001, "loss": 5.6427, "loss/crossentropy": 2.646321177482605, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1566697359085083, "step": 27514 }, { "epoch": 0.859875, "grad_norm": 2.953125, "grad_norm_var": 0.03303934733072917, "learning_rate": 0.0001, "loss": 5.9071, "loss/crossentropy": 2.816299080848694, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16142502427101135, "step": 27516 }, { "epoch": 0.8599375, "grad_norm": 2.96875, "grad_norm_var": 0.026883951822916665, "learning_rate": 0.0001, "loss": 5.5468, "loss/crossentropy": 2.5224190950393677, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15517209470272064, "step": 27518 }, { "epoch": 0.86, "grad_norm": 3.140625, "grad_norm_var": 0.021317545572916666, "learning_rate": 0.0001, "loss": 5.3727, "loss/crossentropy": 2.3843857049942017, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15742884576320648, "step": 27520 }, { "epoch": 0.8600625, "grad_norm": 2.734375, "grad_norm_var": 0.017585245768229167, "learning_rate": 0.0001, "loss": 5.6011, "loss/crossentropy": 2.5978078842163086, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15774965286254883, "step": 27522 }, { "epoch": 0.860125, "grad_norm": 2.90625, "grad_norm_var": 0.015458170572916667, "learning_rate": 0.0001, "loss": 5.6141, "loss/crossentropy": 2.585380434989929, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.162246972322464, "step": 27524 }, { "epoch": 0.8601875, "grad_norm": 2.921875, "grad_norm_var": 0.01259765625, "learning_rate": 0.0001, "loss": 5.6191, "loss/crossentropy": 2.557377338409424, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16086407750844955, "step": 27526 }, { "epoch": 0.86025, "grad_norm": 2.984375, "grad_norm_var": 0.013792928059895833, "learning_rate": 0.0001, "loss": 5.6683, "loss/crossentropy": 2.6066770553588867, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16280128806829453, "step": 27528 }, { "epoch": 0.8603125, "grad_norm": 2.8125, "grad_norm_var": 0.01500244140625, "learning_rate": 0.0001, "loss": 5.5504, "loss/crossentropy": 2.544152617454529, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1592186689376831, "step": 27530 }, { "epoch": 0.860375, "grad_norm": 3.3125, "grad_norm_var": 0.02144775390625, "learning_rate": 0.0001, "loss": 5.6296, "loss/crossentropy": 2.54948353767395, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16387441009283066, "step": 27532 }, { "epoch": 0.8604375, "grad_norm": 3.328125, "grad_norm_var": 0.0269927978515625, "learning_rate": 0.0001, "loss": 5.7507, "loss/crossentropy": 2.6128547191619873, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16573945432901382, "step": 27534 }, { "epoch": 0.8605, "grad_norm": 2.890625, "grad_norm_var": 0.04712626139322917, "learning_rate": 0.0001, "loss": 5.9992, "loss/crossentropy": 2.6797434091567993, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.1823408454656601, "step": 27536 }, { "epoch": 0.8605625, "grad_norm": 3.421875, "grad_norm_var": 0.04597066243489583, "learning_rate": 0.0001, "loss": 5.7349, "loss/crossentropy": 2.512515425682068, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.17223601788282394, "step": 27538 }, { "epoch": 0.860625, "grad_norm": 3.09375, "grad_norm_var": 0.043431599934895836, "learning_rate": 0.0001, "loss": 5.6209, "loss/crossentropy": 2.54488742351532, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1591637134552002, "step": 27540 }, { "epoch": 0.8606875, "grad_norm": 3.0625, "grad_norm_var": 0.040526326497395834, "learning_rate": 0.0001, "loss": 5.8191, "loss/crossentropy": 2.6895852088928223, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16842305660247803, "step": 27542 }, { "epoch": 0.86075, "grad_norm": 3.015625, "grad_norm_var": 0.04138895670572917, "learning_rate": 0.0001, "loss": 5.655, "loss/crossentropy": 2.6419804096221924, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.15793906152248383, "step": 27544 }, { "epoch": 0.8608125, "grad_norm": 3.265625, "grad_norm_var": 0.03638407389322917, "learning_rate": 0.0001, "loss": 5.7652, "loss/crossentropy": 2.6107258796691895, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16583485901355743, "step": 27546 }, { "epoch": 0.860875, "grad_norm": 3.0, "grad_norm_var": 0.03551432291666667, "learning_rate": 0.0001, "loss": 5.3518, "loss/crossentropy": 2.3826801776885986, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15472815185785294, "step": 27548 }, { "epoch": 0.8609375, "grad_norm": 3.078125, "grad_norm_var": 0.034912109375, "learning_rate": 0.0001, "loss": 5.7562, "loss/crossentropy": 2.644647240638733, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16310906410217285, "step": 27550 }, { "epoch": 0.861, "grad_norm": 3.25, "grad_norm_var": 0.019115193684895834, "learning_rate": 0.0001, "loss": 5.51, "loss/crossentropy": 2.437491297721863, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16037526726722717, "step": 27552 }, { "epoch": 0.8610625, "grad_norm": 2.9375, "grad_norm_var": 0.0150390625, "learning_rate": 0.0001, "loss": 5.6361, "loss/crossentropy": 2.5380167961120605, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16606280207633972, "step": 27554 }, { "epoch": 0.861125, "grad_norm": 3.0, "grad_norm_var": 0.015327962239583333, "learning_rate": 0.0001, "loss": 5.5454, "loss/crossentropy": 2.4664628505706787, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.16648901253938675, "step": 27556 }, { "epoch": 0.8611875, "grad_norm": 2.890625, "grad_norm_var": 0.020921834309895835, "learning_rate": 0.0001, "loss": 5.6024, "loss/crossentropy": 2.569869041442871, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.1598953977227211, "step": 27558 }, { "epoch": 0.86125, "grad_norm": 3.453125, "grad_norm_var": 0.03203023274739583, "learning_rate": 0.0001, "loss": 5.9652, "loss/crossentropy": 2.7367547750473022, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.1740185245871544, "step": 27560 }, { "epoch": 0.8613125, "grad_norm": 2.953125, "grad_norm_var": 0.0314117431640625, "learning_rate": 0.0001, "loss": 5.9583, "loss/crossentropy": 2.7699532508850098, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1703985407948494, "step": 27562 }, { "epoch": 0.861375, "grad_norm": 3.015625, "grad_norm_var": 0.039383951822916666, "learning_rate": 0.0001, "loss": 5.4268, "loss/crossentropy": 2.501665711402893, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.1538463532924652, "step": 27564 }, { "epoch": 0.8614375, "grad_norm": 3.203125, "grad_norm_var": 0.038798014322916664, "learning_rate": 0.0001, "loss": 5.4313, "loss/crossentropy": 2.372091054916382, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16100051999092102, "step": 27566 }, { "epoch": 0.8615, "grad_norm": 2.9375, "grad_norm_var": 0.041014607747395834, "learning_rate": 0.0001, "loss": 5.579, "loss/crossentropy": 2.53856885433197, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1595095619559288, "step": 27568 }, { "epoch": 0.8615625, "grad_norm": 3.203125, "grad_norm_var": 0.040751139322916664, "learning_rate": 0.0001, "loss": 5.5701, "loss/crossentropy": 2.5165361166000366, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16082197427749634, "step": 27570 }, { "epoch": 0.861625, "grad_norm": 3.328125, "grad_norm_var": 0.046610514322916664, "learning_rate": 0.0001, "loss": 5.542, "loss/crossentropy": 2.5036321878433228, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15930962562561035, "step": 27572 }, { "epoch": 0.8616875, "grad_norm": 3.140625, "grad_norm_var": 0.04087626139322917, "learning_rate": 0.0001, "loss": 5.5689, "loss/crossentropy": 2.4433289766311646, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16607220470905304, "step": 27574 }, { "epoch": 0.86175, "grad_norm": 2.859375, "grad_norm_var": 0.037775675455729164, "learning_rate": 0.0001, "loss": 5.8773, "loss/crossentropy": 2.7035598754882812, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17167061567306519, "step": 27576 }, { "epoch": 0.8618125, "grad_norm": 3.03125, "grad_norm_var": 0.03623046875, "learning_rate": 0.0001, "loss": 5.4398, "loss/crossentropy": 2.4577486515045166, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15289676189422607, "step": 27578 }, { "epoch": 0.861875, "grad_norm": 3.09375, "grad_norm_var": 0.040608723958333336, "learning_rate": 0.0001, "loss": 5.6029, "loss/crossentropy": 2.5073471069335938, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.16267910599708557, "step": 27580 }, { "epoch": 0.8619375, "grad_norm": 3.375, "grad_norm_var": 0.04343973795572917, "learning_rate": 0.0001, "loss": 5.5533, "loss/crossentropy": 2.4675310850143433, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.15896278619766235, "step": 27582 }, { "epoch": 0.862, "grad_norm": 3.0, "grad_norm_var": 0.035187784830729166, "learning_rate": 0.0001, "loss": 5.2987, "loss/crossentropy": 2.3355921506881714, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1549062505364418, "step": 27584 }, { "epoch": 0.8620625, "grad_norm": 3.140625, "grad_norm_var": 0.0320220947265625, "learning_rate": 0.0001, "loss": 5.8557, "loss/crossentropy": 2.7173553705215454, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16618236899375916, "step": 27586 }, { "epoch": 0.862125, "grad_norm": 3.015625, "grad_norm_var": 0.033202107747395834, "learning_rate": 0.0001, "loss": 5.6455, "loss/crossentropy": 2.6528557538986206, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15629655122756958, "step": 27588 }, { "epoch": 0.8621875, "grad_norm": 2.875, "grad_norm_var": 0.03720703125, "learning_rate": 0.0001, "loss": 5.5023, "loss/crossentropy": 2.517209768295288, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15553860366344452, "step": 27590 }, { "epoch": 0.86225, "grad_norm": 2.8125, "grad_norm_var": 0.03209228515625, "learning_rate": 0.0001, "loss": 5.3081, "loss/crossentropy": 2.4127840995788574, "loss/hidden": 1.38671875, "loss/jsd": 0.0, "loss/logits": 0.15086285769939423, "step": 27592 }, { "epoch": 0.8623125, "grad_norm": 2.703125, "grad_norm_var": 0.03828125, "learning_rate": 0.0001, "loss": 5.3493, "loss/crossentropy": 2.449027419090271, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.14940007030963898, "step": 27594 }, { "epoch": 0.862375, "grad_norm": 2.828125, "grad_norm_var": 0.026595052083333334, "learning_rate": 0.0001, "loss": 5.4339, "loss/crossentropy": 2.473372459411621, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1495700106024742, "step": 27596 }, { "epoch": 0.8624375, "grad_norm": 3.03125, "grad_norm_var": 0.014867146809895834, "learning_rate": 0.0001, "loss": 5.4278, "loss/crossentropy": 2.488494396209717, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1540842279791832, "step": 27598 }, { "epoch": 0.8625, "grad_norm": 2.921875, "grad_norm_var": 0.01197509765625, "learning_rate": 0.0001, "loss": 5.5219, "loss/crossentropy": 2.4091190099716187, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16323351114988327, "step": 27600 }, { "epoch": 0.8625625, "grad_norm": 3.453125, "grad_norm_var": 0.027046712239583333, "learning_rate": 0.0001, "loss": 5.6252, "loss/crossentropy": 2.429298520088196, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16998374462127686, "step": 27602 }, { "epoch": 0.862625, "grad_norm": 2.90625, "grad_norm_var": 0.027668253580729166, "learning_rate": 0.0001, "loss": 5.4972, "loss/crossentropy": 2.4304704666137695, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16136115789413452, "step": 27604 }, { "epoch": 0.8626875, "grad_norm": 3.03125, "grad_norm_var": 0.0279296875, "learning_rate": 0.0001, "loss": 5.716, "loss/crossentropy": 2.5851006507873535, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1650448888540268, "step": 27606 }, { "epoch": 0.86275, "grad_norm": 3.0, "grad_norm_var": 0.026488240559895834, "learning_rate": 0.0001, "loss": 5.627, "loss/crossentropy": 2.5394445657730103, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.1634387969970703, "step": 27608 }, { "epoch": 0.8628125, "grad_norm": 2.890625, "grad_norm_var": 0.021956380208333334, "learning_rate": 0.0001, "loss": 5.5363, "loss/crossentropy": 2.5663862228393555, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15401819348335266, "step": 27610 }, { "epoch": 0.862875, "grad_norm": 2.96875, "grad_norm_var": 0.023726399739583334, "learning_rate": 0.0001, "loss": 5.5966, "loss/crossentropy": 2.5395649671554565, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15999913960695267, "step": 27612 }, { "epoch": 0.8629375, "grad_norm": 3.34375, "grad_norm_var": 0.0301422119140625, "learning_rate": 0.0001, "loss": 5.9448, "loss/crossentropy": 2.744190216064453, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.17161927372217178, "step": 27614 }, { "epoch": 0.863, "grad_norm": 3.359375, "grad_norm_var": 0.03528645833333333, "learning_rate": 0.0001, "loss": 5.3661, "loss/crossentropy": 2.300475001335144, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.15890540182590485, "step": 27616 }, { "epoch": 0.8630625, "grad_norm": 3.203125, "grad_norm_var": 0.027098592122395834, "learning_rate": 0.0001, "loss": 5.4225, "loss/crossentropy": 2.4023449420928955, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15826784819364548, "step": 27618 }, { "epoch": 0.863125, "grad_norm": 3.078125, "grad_norm_var": 0.024974568684895834, "learning_rate": 0.0001, "loss": 5.7417, "loss/crossentropy": 2.674994111061096, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1609710156917572, "step": 27620 }, { "epoch": 0.8631875, "grad_norm": 2.859375, "grad_norm_var": 0.0282135009765625, "learning_rate": 0.0001, "loss": 5.268, "loss/crossentropy": 2.3086975812911987, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1529662385582924, "step": 27622 }, { "epoch": 0.86325, "grad_norm": 3.484375, "grad_norm_var": 0.037430826822916666, "learning_rate": 0.0001, "loss": 5.7246, "loss/crossentropy": 2.5959867238998413, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16715864837169647, "step": 27624 }, { "epoch": 0.8633125, "grad_norm": 2.921875, "grad_norm_var": 0.03291015625, "learning_rate": 0.0001, "loss": 5.6348, "loss/crossentropy": 2.5706393718719482, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1571996808052063, "step": 27626 }, { "epoch": 0.863375, "grad_norm": 3.390625, "grad_norm_var": 0.03599853515625, "learning_rate": 0.0001, "loss": 5.7237, "loss/crossentropy": 2.5594701766967773, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.16877028346061707, "step": 27628 }, { "epoch": 0.8634375, "grad_norm": 3.09375, "grad_norm_var": 0.031151326497395833, "learning_rate": 0.0001, "loss": 5.5329, "loss/crossentropy": 2.47390353679657, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16019625216722488, "step": 27630 }, { "epoch": 0.8635, "grad_norm": 3.109375, "grad_norm_var": 0.025316365559895835, "learning_rate": 0.0001, "loss": 5.4281, "loss/crossentropy": 2.486311912536621, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.14847956597805023, "step": 27632 }, { "epoch": 0.8635625, "grad_norm": 2.953125, "grad_norm_var": 0.025755818684895834, "learning_rate": 0.0001, "loss": 5.5117, "loss/crossentropy": 2.4831504821777344, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1610625460743904, "step": 27634 }, { "epoch": 0.863625, "grad_norm": 3.046875, "grad_norm_var": 0.029938761393229166, "learning_rate": 0.0001, "loss": 5.7136, "loss/crossentropy": 2.6036019325256348, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1672532856464386, "step": 27636 }, { "epoch": 0.8636875, "grad_norm": 2.859375, "grad_norm_var": 0.0309967041015625, "learning_rate": 0.0001, "loss": 5.1623, "loss/crossentropy": 2.291244864463806, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1456953063607216, "step": 27638 }, { "epoch": 0.86375, "grad_norm": 2.84375, "grad_norm_var": 0.024803670247395833, "learning_rate": 0.0001, "loss": 5.4878, "loss/crossentropy": 2.5077956914901733, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1573728397488594, "step": 27640 }, { "epoch": 0.8638125, "grad_norm": 2.9375, "grad_norm_var": 0.024177042643229167, "learning_rate": 0.0001, "loss": 5.3704, "loss/crossentropy": 2.4454660415649414, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.15420855581760406, "step": 27642 }, { "epoch": 0.863875, "grad_norm": 3.109375, "grad_norm_var": 0.017560831705729165, "learning_rate": 0.0001, "loss": 5.6064, "loss/crossentropy": 2.6103018522262573, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15468335151672363, "step": 27644 }, { "epoch": 0.8639375, "grad_norm": 3.015625, "grad_norm_var": 0.01744384765625, "learning_rate": 0.0001, "loss": 5.5682, "loss/crossentropy": 2.5905452966690063, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15284481644630432, "step": 27646 }, { "epoch": 0.864, "grad_norm": 3.140625, "grad_norm_var": 0.017267862955729168, "learning_rate": 0.0001, "loss": 5.6378, "loss/crossentropy": 2.5828408002853394, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.161359004676342, "step": 27648 }, { "epoch": 0.8640625, "grad_norm": 3.109375, "grad_norm_var": 0.018244425455729168, "learning_rate": 0.0001, "loss": 5.549, "loss/crossentropy": 2.5506070852279663, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.157648004591465, "step": 27650 }, { "epoch": 0.864125, "grad_norm": 3.078125, "grad_norm_var": 0.013109334309895833, "learning_rate": 0.0001, "loss": 5.7008, "loss/crossentropy": 2.610270857810974, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16569805890321732, "step": 27652 }, { "epoch": 0.8641875, "grad_norm": 3.125, "grad_norm_var": 0.00875244140625, "learning_rate": 0.0001, "loss": 5.381, "loss/crossentropy": 2.4877549409866333, "loss/hidden": 1.3671875, "loss/jsd": 0.0, "loss/logits": 0.1526072770357132, "step": 27654 }, { "epoch": 0.86425, "grad_norm": 2.84375, "grad_norm_var": 0.017838541666666666, "learning_rate": 0.0001, "loss": 5.8142, "loss/crossentropy": 2.636727809906006, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17126110941171646, "step": 27656 }, { "epoch": 0.8643125, "grad_norm": 2.96875, "grad_norm_var": 0.024543253580729167, "learning_rate": 0.0001, "loss": 5.6218, "loss/crossentropy": 2.5793492794036865, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16088585555553436, "step": 27658 }, { "epoch": 0.864375, "grad_norm": 3.0, "grad_norm_var": 0.030403645833333333, "learning_rate": 0.0001, "loss": 5.6904, "loss/crossentropy": 2.505601406097412, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1704343557357788, "step": 27660 }, { "epoch": 0.8644375, "grad_norm": 3.0, "grad_norm_var": 0.029866536458333332, "learning_rate": 0.0001, "loss": 5.7843, "loss/crossentropy": 2.6765681505203247, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16468166559934616, "step": 27662 }, { "epoch": 0.8645, "grad_norm": 2.9375, "grad_norm_var": 0.04508056640625, "learning_rate": 0.0001, "loss": 5.6697, "loss/crossentropy": 2.5738131999969482, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16231893748044968, "step": 27664 }, { "epoch": 0.8645625, "grad_norm": 3.203125, "grad_norm_var": 0.04541015625, "learning_rate": 0.0001, "loss": 5.4086, "loss/crossentropy": 2.4533698558807373, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15137922018766403, "step": 27666 }, { "epoch": 0.864625, "grad_norm": 2.921875, "grad_norm_var": 0.0484771728515625, "learning_rate": 0.0001, "loss": 5.7024, "loss/crossentropy": 2.58325457572937, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16620948165655136, "step": 27668 }, { "epoch": 0.8646875, "grad_norm": 3.125, "grad_norm_var": 0.051691691080729164, "learning_rate": 0.0001, "loss": 5.7393, "loss/crossentropy": 2.6752718687057495, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16539110243320465, "step": 27670 }, { "epoch": 0.86475, "grad_norm": 3.109375, "grad_norm_var": 0.042578125, "learning_rate": 0.0001, "loss": 5.562, "loss/crossentropy": 2.531635284423828, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15929072350263596, "step": 27672 }, { "epoch": 0.8648125, "grad_norm": 3.0, "grad_norm_var": 0.07111714680989584, "learning_rate": 0.0001, "loss": 5.6956, "loss/crossentropy": 2.5755029916763306, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1674804836511612, "step": 27674 }, { "epoch": 0.864875, "grad_norm": 3.125, "grad_norm_var": 0.06580301920572916, "learning_rate": 0.0001, "loss": 5.3852, "loss/crossentropy": 2.3932870626449585, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15778710693120956, "step": 27676 }, { "epoch": 0.8649375, "grad_norm": 3.421875, "grad_norm_var": 0.0716705322265625, "learning_rate": 0.0001, "loss": 5.8202, "loss/crossentropy": 2.75151526927948, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16038886457681656, "step": 27678 }, { "epoch": 0.865, "grad_norm": 2.984375, "grad_norm_var": 0.06311442057291666, "learning_rate": 0.0001, "loss": 5.5359, "loss/crossentropy": 2.5368924140930176, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15771739929914474, "step": 27680 }, { "epoch": 0.8650625, "grad_norm": 3.28125, "grad_norm_var": 0.06184794108072917, "learning_rate": 0.0001, "loss": 5.4507, "loss/crossentropy": 2.4856276512145996, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15119660645723343, "step": 27682 }, { "epoch": 0.865125, "grad_norm": 3.3125, "grad_norm_var": 0.0642730712890625, "learning_rate": 0.0001, "loss": 5.6671, "loss/crossentropy": 2.5770288705825806, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16369254887104034, "step": 27684 }, { "epoch": 0.8651875, "grad_norm": 2.8125, "grad_norm_var": 0.06990559895833333, "learning_rate": 0.0001, "loss": 5.2007, "loss/crossentropy": 2.3456820249557495, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.14409735053777695, "step": 27686 }, { "epoch": 0.86525, "grad_norm": 3.265625, "grad_norm_var": 0.06848551432291666, "learning_rate": 0.0001, "loss": 5.7854, "loss/crossentropy": 2.645024299621582, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16560453176498413, "step": 27688 }, { "epoch": 0.8653125, "grad_norm": 3.0625, "grad_norm_var": 0.03316141764322917, "learning_rate": 0.0001, "loss": 5.7614, "loss/crossentropy": 2.680987596511841, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.16545983403921127, "step": 27690 }, { "epoch": 0.865375, "grad_norm": 3.03125, "grad_norm_var": 0.03179931640625, "learning_rate": 0.0001, "loss": 5.7494, "loss/crossentropy": 2.634661316871643, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16537588089704514, "step": 27692 }, { "epoch": 0.8654375, "grad_norm": 3.0625, "grad_norm_var": 0.022249348958333335, "learning_rate": 0.0001, "loss": 5.6987, "loss/crossentropy": 2.62981116771698, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16314272582530975, "step": 27694 }, { "epoch": 0.8655, "grad_norm": 2.96875, "grad_norm_var": 0.019806925455729166, "learning_rate": 0.0001, "loss": 5.5717, "loss/crossentropy": 2.5568922758102417, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.1573409140110016, "step": 27696 }, { "epoch": 0.8655625, "grad_norm": 2.984375, "grad_norm_var": 0.017756144205729168, "learning_rate": 0.0001, "loss": 5.3276, "loss/crossentropy": 2.3811562061309814, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1532423198223114, "step": 27698 }, { "epoch": 0.865625, "grad_norm": 3.078125, "grad_norm_var": 0.012157185872395834, "learning_rate": 0.0001, "loss": 5.4508, "loss/crossentropy": 2.5310704708099365, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15212911367416382, "step": 27700 }, { "epoch": 0.8656875, "grad_norm": 2.953125, "grad_norm_var": 0.0090972900390625, "learning_rate": 0.0001, "loss": 5.7516, "loss/crossentropy": 2.7092376947402954, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15853281319141388, "step": 27702 }, { "epoch": 0.86575, "grad_norm": 3.015625, "grad_norm_var": 0.0045806884765625, "learning_rate": 0.0001, "loss": 5.2488, "loss/crossentropy": 2.309325337409973, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.14941205829381943, "step": 27704 }, { "epoch": 0.8658125, "grad_norm": 2.9375, "grad_norm_var": 0.004084269205729167, "learning_rate": 0.0001, "loss": 5.4029, "loss/crossentropy": 2.487838864326477, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15126828849315643, "step": 27706 }, { "epoch": 0.865875, "grad_norm": 3.15625, "grad_norm_var": 0.0103424072265625, "learning_rate": 0.0001, "loss": 5.8147, "loss/crossentropy": 2.7337146997451782, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16474289447069168, "step": 27708 }, { "epoch": 0.8659375, "grad_norm": 2.859375, "grad_norm_var": 0.011139933268229167, "learning_rate": 0.0001, "loss": 5.6107, "loss/crossentropy": 2.599394202232361, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15738170593976974, "step": 27710 }, { "epoch": 0.866, "grad_norm": 2.828125, "grad_norm_var": 0.022004191080729166, "learning_rate": 0.0001, "loss": 5.7284, "loss/crossentropy": 2.6974366903305054, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16090577095746994, "step": 27712 }, { "epoch": 0.8660625, "grad_norm": 3.0, "grad_norm_var": 0.027164713541666666, "learning_rate": 0.0001, "loss": 5.2408, "loss/crossentropy": 2.3260730504989624, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1485009863972664, "step": 27714 }, { "epoch": 0.866125, "grad_norm": 3.234375, "grad_norm_var": 0.03365478515625, "learning_rate": 0.0001, "loss": 5.843, "loss/crossentropy": 2.6683355569839478, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.17176324129104614, "step": 27716 }, { "epoch": 0.8661875, "grad_norm": 3.078125, "grad_norm_var": 0.033991495768229164, "learning_rate": 0.0001, "loss": 5.5059, "loss/crossentropy": 2.5150938034057617, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15416057407855988, "step": 27718 }, { "epoch": 0.86625, "grad_norm": 3.03125, "grad_norm_var": 0.0335601806640625, "learning_rate": 0.0001, "loss": 5.325, "loss/crossentropy": 2.4102606773376465, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15084628015756607, "step": 27720 }, { "epoch": 0.8663125, "grad_norm": 3.125, "grad_norm_var": 0.03590087890625, "learning_rate": 0.0001, "loss": 5.6581, "loss/crossentropy": 2.6280406713485718, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1619897335767746, "step": 27722 }, { "epoch": 0.866375, "grad_norm": 3.234375, "grad_norm_var": 0.0326080322265625, "learning_rate": 0.0001, "loss": 5.4041, "loss/crossentropy": 2.382719874382019, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15760424733161926, "step": 27724 }, { "epoch": 0.8664375, "grad_norm": 3.125, "grad_norm_var": 0.031184895833333334, "learning_rate": 0.0001, "loss": 5.4625, "loss/crossentropy": 2.4307433366775513, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16020990163087845, "step": 27726 }, { "epoch": 0.8665, "grad_norm": 3.125, "grad_norm_var": 0.021907552083333334, "learning_rate": 0.0001, "loss": 5.6706, "loss/crossentropy": 2.555809736251831, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1669429913163185, "step": 27728 }, { "epoch": 0.8665625, "grad_norm": 2.765625, "grad_norm_var": 0.019896443684895834, "learning_rate": 0.0001, "loss": 5.4588, "loss/crossentropy": 2.4327945709228516, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.15728867053985596, "step": 27730 }, { "epoch": 0.866625, "grad_norm": 3.125, "grad_norm_var": 0.016600545247395834, "learning_rate": 0.0001, "loss": 5.6227, "loss/crossentropy": 2.5396151542663574, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16025733202695847, "step": 27732 }, { "epoch": 0.8666875, "grad_norm": 3.15625, "grad_norm_var": 0.018871053059895834, "learning_rate": 0.0001, "loss": 5.9417, "loss/crossentropy": 2.8445409536361694, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16518766433000565, "step": 27734 }, { "epoch": 0.86675, "grad_norm": 2.875, "grad_norm_var": 0.0204986572265625, "learning_rate": 0.0001, "loss": 5.5576, "loss/crossentropy": 2.480849266052246, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16158322989940643, "step": 27736 }, { "epoch": 0.8668125, "grad_norm": 3.140625, "grad_norm_var": 0.027730305989583332, "learning_rate": 0.0001, "loss": 5.6466, "loss/crossentropy": 2.617606520652771, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15915010124444962, "step": 27738 }, { "epoch": 0.866875, "grad_norm": 3.34375, "grad_norm_var": 0.030427042643229166, "learning_rate": 0.0001, "loss": 5.7358, "loss/crossentropy": 2.6470601558685303, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16316870599985123, "step": 27740 }, { "epoch": 0.8669375, "grad_norm": 3.203125, "grad_norm_var": 0.03192952473958333, "learning_rate": 0.0001, "loss": 5.6234, "loss/crossentropy": 2.565005898475647, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1620922088623047, "step": 27742 }, { "epoch": 0.867, "grad_norm": 2.984375, "grad_norm_var": 0.033528645833333336, "learning_rate": 0.0001, "loss": 5.4992, "loss/crossentropy": 2.540012001991272, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15607950091362, "step": 27744 }, { "epoch": 0.8670625, "grad_norm": 3.21875, "grad_norm_var": 0.028449503580729167, "learning_rate": 0.0001, "loss": 5.7067, "loss/crossentropy": 2.6245768070220947, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16289737820625305, "step": 27746 }, { "epoch": 0.867125, "grad_norm": 2.953125, "grad_norm_var": 0.0289703369140625, "learning_rate": 0.0001, "loss": 5.7415, "loss/crossentropy": 2.6854456663131714, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1606845110654831, "step": 27748 }, { "epoch": 0.8671875, "grad_norm": 3.1875, "grad_norm_var": 0.028841145833333335, "learning_rate": 0.0001, "loss": 5.7299, "loss/crossentropy": 2.5976706743240356, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1671339049935341, "step": 27750 }, { "epoch": 0.86725, "grad_norm": 2.9375, "grad_norm_var": 0.0320953369140625, "learning_rate": 0.0001, "loss": 5.4988, "loss/crossentropy": 2.3713111877441406, "loss/hidden": 1.51953125, "loss/jsd": 0.0, "loss/logits": 0.16079971939325333, "step": 27752 }, { "epoch": 0.8673125, "grad_norm": 3.75, "grad_norm_var": 0.04864908854166667, "learning_rate": 0.0001, "loss": 5.8481, "loss/crossentropy": 2.6818984746932983, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1701388955116272, "step": 27754 }, { "epoch": 0.867375, "grad_norm": 3.109375, "grad_norm_var": 0.04512430826822917, "learning_rate": 0.0001, "loss": 5.6312, "loss/crossentropy": 2.5963293313980103, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15974120795726776, "step": 27756 }, { "epoch": 0.8674375, "grad_norm": 2.984375, "grad_norm_var": 0.0453125, "learning_rate": 0.0001, "loss": 5.3357, "loss/crossentropy": 2.417480707168579, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.1511980965733528, "step": 27758 }, { "epoch": 0.8675, "grad_norm": 2.859375, "grad_norm_var": 0.0462310791015625, "learning_rate": 0.0001, "loss": 5.4992, "loss/crossentropy": 2.5086978673934937, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15608545392751694, "step": 27760 }, { "epoch": 0.8675625, "grad_norm": 3.015625, "grad_norm_var": 0.0462310791015625, "learning_rate": 0.0001, "loss": 5.3478, "loss/crossentropy": 2.4178558588027954, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15237433463335037, "step": 27762 }, { "epoch": 0.867625, "grad_norm": 3.078125, "grad_norm_var": 0.04551493326822917, "learning_rate": 0.0001, "loss": 5.8964, "loss/crossentropy": 2.801416039466858, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.1634017676115036, "step": 27764 }, { "epoch": 0.8676875, "grad_norm": 2.890625, "grad_norm_var": 0.0498687744140625, "learning_rate": 0.0001, "loss": 5.5029, "loss/crossentropy": 2.491839647293091, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.15930673480033875, "step": 27766 }, { "epoch": 0.86775, "grad_norm": 3.15625, "grad_norm_var": 0.05849507649739583, "learning_rate": 0.0001, "loss": 5.8077, "loss/crossentropy": 2.6288340091705322, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.16789048165082932, "step": 27768 }, { "epoch": 0.8678125, "grad_norm": 2.9375, "grad_norm_var": 0.04075419108072917, "learning_rate": 0.0001, "loss": 5.6089, "loss/crossentropy": 2.5784683227539062, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.16085676103830338, "step": 27770 }, { "epoch": 0.867875, "grad_norm": 3.0, "grad_norm_var": 0.0421295166015625, "learning_rate": 0.0001, "loss": 5.7072, "loss/crossentropy": 2.6747682094573975, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.160274475812912, "step": 27772 }, { "epoch": 0.8679375, "grad_norm": 3.0, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 5.783, "loss/crossentropy": 2.699275851249695, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.16189026832580566, "step": 27774 }, { "epoch": 0.868, "grad_norm": 3.21875, "grad_norm_var": 0.035807291666666664, "learning_rate": 0.0001, "loss": 5.8626, "loss/crossentropy": 2.6744813919067383, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.170765720307827, "step": 27776 }, { "epoch": 0.8680625, "grad_norm": 4.0, "grad_norm_var": 0.08801676432291666, "learning_rate": 0.0001, "loss": 5.8071, "loss/crossentropy": 2.5578606128692627, "loss/hidden": 1.5078125, "loss/jsd": 0.0, "loss/logits": 0.17414429783821106, "step": 27778 }, { "epoch": 0.868125, "grad_norm": 3.03125, "grad_norm_var": 0.08368733723958334, "learning_rate": 0.0001, "loss": 5.1385, "loss/crossentropy": 2.221524477005005, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.1518501415848732, "step": 27780 }, { "epoch": 0.8681875, "grad_norm": 3.15625, "grad_norm_var": 0.08029683430989583, "learning_rate": 0.0001, "loss": 5.5732, "loss/crossentropy": 2.4585397243499756, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1634185016155243, "step": 27782 }, { "epoch": 0.86825, "grad_norm": 3.15625, "grad_norm_var": 0.07093098958333334, "learning_rate": 0.0001, "loss": 5.6945, "loss/crossentropy": 2.5165436267852783, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.1697457879781723, "step": 27784 }, { "epoch": 0.8683125, "grad_norm": 2.9375, "grad_norm_var": 0.0662750244140625, "learning_rate": 0.0001, "loss": 5.8333, "loss/crossentropy": 2.7102712392807007, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1677682101726532, "step": 27786 }, { "epoch": 0.868375, "grad_norm": 3.0, "grad_norm_var": 0.06702372233072916, "learning_rate": 0.0001, "loss": 5.7649, "loss/crossentropy": 2.6999846696853638, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15922239422798157, "step": 27788 }, { "epoch": 0.8684375, "grad_norm": 3.0625, "grad_norm_var": 0.06606343587239584, "learning_rate": 0.0001, "loss": 5.8227, "loss/crossentropy": 2.7487266063690186, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15935276448726654, "step": 27790 }, { "epoch": 0.8685, "grad_norm": 3.421875, "grad_norm_var": 0.07010091145833333, "learning_rate": 0.0001, "loss": 5.5804, "loss/crossentropy": 2.501925468444824, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15902023017406464, "step": 27792 }, { "epoch": 0.8685625, "grad_norm": 3.0625, "grad_norm_var": 0.02359619140625, "learning_rate": 0.0001, "loss": 5.7561, "loss/crossentropy": 2.677756905555725, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16213340312242508, "step": 27794 }, { "epoch": 0.868625, "grad_norm": 2.71875, "grad_norm_var": 0.0349609375, "learning_rate": 0.0001, "loss": 5.6271, "loss/crossentropy": 2.5794237852096558, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1590612530708313, "step": 27796 }, { "epoch": 0.8686875, "grad_norm": 2.890625, "grad_norm_var": 0.03434956868489583, "learning_rate": 0.0001, "loss": 5.7891, "loss/crossentropy": 2.701713800430298, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16303890198469162, "step": 27798 }, { "epoch": 0.86875, "grad_norm": 3.1875, "grad_norm_var": 0.03466389973958333, "learning_rate": 0.0001, "loss": 5.489, "loss/crossentropy": 2.4837100505828857, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.15599625557661057, "step": 27800 }, { "epoch": 0.8688125, "grad_norm": 3.046875, "grad_norm_var": 0.032938639322916664, "learning_rate": 0.0001, "loss": 5.4444, "loss/crossentropy": 2.425615072250366, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15812792629003525, "step": 27802 }, { "epoch": 0.868875, "grad_norm": 2.96875, "grad_norm_var": 0.04101155598958333, "learning_rate": 0.0001, "loss": 5.4726, "loss/crossentropy": 2.5447142124176025, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15216126292943954, "step": 27804 }, { "epoch": 0.8689375, "grad_norm": 3.109375, "grad_norm_var": 0.041731770833333334, "learning_rate": 0.0001, "loss": 5.5897, "loss/crossentropy": 2.5250545740127563, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1627161130309105, "step": 27806 }, { "epoch": 0.869, "grad_norm": 2.859375, "grad_norm_var": 0.03648681640625, "learning_rate": 0.0001, "loss": 5.5572, "loss/crossentropy": 2.523909568786621, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15918756276369095, "step": 27808 }, { "epoch": 0.8690625, "grad_norm": 3.4375, "grad_norm_var": 0.03961181640625, "learning_rate": 0.0001, "loss": 5.5009, "loss/crossentropy": 2.442892074584961, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1581428349018097, "step": 27810 }, { "epoch": 0.869125, "grad_norm": 2.9375, "grad_norm_var": 0.0334136962890625, "learning_rate": 0.0001, "loss": 5.1772, "loss/crossentropy": 2.2754937410354614, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15032844245433807, "step": 27812 }, { "epoch": 0.8691875, "grad_norm": 3.140625, "grad_norm_var": 0.03062744140625, "learning_rate": 0.0001, "loss": 5.7346, "loss/crossentropy": 2.6316630840301514, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1653677299618721, "step": 27814 }, { "epoch": 0.86925, "grad_norm": 3.296875, "grad_norm_var": 0.0335113525390625, "learning_rate": 0.0001, "loss": 5.5422, "loss/crossentropy": 2.443118453025818, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1642000824213028, "step": 27816 }, { "epoch": 0.8693125, "grad_norm": 3.21875, "grad_norm_var": 0.03533528645833333, "learning_rate": 0.0001, "loss": 5.6963, "loss/crossentropy": 2.551207184791565, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.16880378127098083, "step": 27818 }, { "epoch": 0.869375, "grad_norm": 3.078125, "grad_norm_var": 0.028107706705729166, "learning_rate": 0.0001, "loss": 5.5252, "loss/crossentropy": 2.554909586906433, "loss/hidden": 1.3984375, "loss/jsd": 0.0, "loss/logits": 0.15718352049589157, "step": 27820 }, { "epoch": 0.8694375, "grad_norm": 3.375, "grad_norm_var": 0.04263916015625, "learning_rate": 0.0001, "loss": 5.7741, "loss/crossentropy": 2.5711824893951416, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17302384972572327, "step": 27822 }, { "epoch": 0.8695, "grad_norm": 2.875, "grad_norm_var": 0.0422760009765625, "learning_rate": 0.0001, "loss": 5.6609, "loss/crossentropy": 2.5747915506362915, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1636883169412613, "step": 27824 }, { "epoch": 0.8695625, "grad_norm": 3.078125, "grad_norm_var": 0.031769816080729166, "learning_rate": 0.0001, "loss": 5.5748, "loss/crossentropy": 2.5290616750717163, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1553570255637169, "step": 27826 }, { "epoch": 0.869625, "grad_norm": 3.015625, "grad_norm_var": 0.026073201497395834, "learning_rate": 0.0001, "loss": 5.6205, "loss/crossentropy": 2.5588048696517944, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.15890072286128998, "step": 27828 }, { "epoch": 0.8696875, "grad_norm": 2.96875, "grad_norm_var": 0.028023274739583333, "learning_rate": 0.0001, "loss": 5.5863, "loss/crossentropy": 2.535132646560669, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.16097809374332428, "step": 27830 }, { "epoch": 0.86975, "grad_norm": 3.359375, "grad_norm_var": 0.045633951822916664, "learning_rate": 0.0001, "loss": 5.5593, "loss/crossentropy": 2.5300434827804565, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15996113419532776, "step": 27832 }, { "epoch": 0.8698125, "grad_norm": 2.796875, "grad_norm_var": 0.0514556884765625, "learning_rate": 0.0001, "loss": 5.2111, "loss/crossentropy": 2.2957112789154053, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1470032036304474, "step": 27834 }, { "epoch": 0.869875, "grad_norm": 3.046875, "grad_norm_var": 0.05061747233072917, "learning_rate": 0.0001, "loss": 5.7635, "loss/crossentropy": 2.6749976873397827, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16276133060455322, "step": 27836 }, { "epoch": 0.8699375, "grad_norm": 3.0625, "grad_norm_var": 0.030223592122395834, "learning_rate": 0.0001, "loss": 5.5158, "loss/crossentropy": 2.498887300491333, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1602827087044716, "step": 27838 }, { "epoch": 0.87, "grad_norm": 3.125, "grad_norm_var": 0.0294097900390625, "learning_rate": 0.0001, "loss": 5.7285, "loss/crossentropy": 2.5993796586990356, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16447728872299194, "step": 27840 }, { "epoch": 0.8700625, "grad_norm": 2.96875, "grad_norm_var": 0.029588826497395835, "learning_rate": 0.0001, "loss": 5.9302, "loss/crossentropy": 2.816034197807312, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.1684448942542076, "step": 27842 }, { "epoch": 0.870125, "grad_norm": 2.859375, "grad_norm_var": 0.03131103515625, "learning_rate": 0.0001, "loss": 5.6262, "loss/crossentropy": 2.6053075790405273, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1599065214395523, "step": 27844 }, { "epoch": 0.8701875, "grad_norm": 2.9375, "grad_norm_var": 0.029499308268229166, "learning_rate": 0.0001, "loss": 5.5842, "loss/crossentropy": 2.5269936323165894, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.16119346767663956, "step": 27846 }, { "epoch": 0.87025, "grad_norm": 3.0625, "grad_norm_var": 0.021214803059895832, "learning_rate": 0.0001, "loss": 6.0234, "loss/crossentropy": 2.9078108072280884, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.1631210893392563, "step": 27848 }, { "epoch": 0.8703125, "grad_norm": 3.046875, "grad_norm_var": 0.018586222330729166, "learning_rate": 0.0001, "loss": 5.3611, "loss/crossentropy": 2.36064875125885, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.15512390434741974, "step": 27850 }, { "epoch": 0.870375, "grad_norm": 3.234375, "grad_norm_var": 0.028522745768229166, "learning_rate": 0.0001, "loss": 5.4436, "loss/crossentropy": 2.451321840286255, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1582152545452118, "step": 27852 }, { "epoch": 0.8704375, "grad_norm": 2.890625, "grad_norm_var": 0.0332672119140625, "learning_rate": 0.0001, "loss": 5.163, "loss/crossentropy": 2.3075605630874634, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.14335403591394424, "step": 27854 }, { "epoch": 0.8705, "grad_norm": 2.875, "grad_norm_var": 0.029637654622395832, "learning_rate": 0.0001, "loss": 5.7182, "loss/crossentropy": 2.655929207801819, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.16521596908569336, "step": 27856 }, { "epoch": 0.8705625, "grad_norm": 3.1875, "grad_norm_var": 0.035237630208333336, "learning_rate": 0.0001, "loss": 5.5307, "loss/crossentropy": 2.4572445154190063, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1659395396709442, "step": 27858 }, { "epoch": 0.870625, "grad_norm": 2.953125, "grad_norm_var": 0.03640034993489583, "learning_rate": 0.0001, "loss": 5.5832, "loss/crossentropy": 2.559994101524353, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.16052212566137314, "step": 27860 }, { "epoch": 0.8706875, "grad_norm": 3.40625, "grad_norm_var": 0.0455718994140625, "learning_rate": 0.0001, "loss": 5.6618, "loss/crossentropy": 2.6116913557052612, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.16203808784484863, "step": 27862 }, { "epoch": 0.87075, "grad_norm": 3.0625, "grad_norm_var": 0.039697265625, "learning_rate": 0.0001, "loss": 5.5123, "loss/crossentropy": 2.436099648475647, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.1607440710067749, "step": 27864 }, { "epoch": 0.8708125, "grad_norm": 3.015625, "grad_norm_var": 0.04202067057291667, "learning_rate": 0.0001, "loss": 5.4329, "loss/crossentropy": 2.4671441316604614, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15438677370548248, "step": 27866 }, { "epoch": 0.870875, "grad_norm": 3.140625, "grad_norm_var": 0.031615193684895834, "learning_rate": 0.0001, "loss": 5.5803, "loss/crossentropy": 2.4586938619613647, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16489533334970474, "step": 27868 }, { "epoch": 0.8709375, "grad_norm": 3.1875, "grad_norm_var": 0.027229817708333333, "learning_rate": 0.0001, "loss": 5.4877, "loss/crossentropy": 2.4380314350128174, "loss/hidden": 1.5, "loss/jsd": 0.0, "loss/logits": 0.15496237576007843, "step": 27870 }, { "epoch": 0.871, "grad_norm": 3.25, "grad_norm_var": 0.05241597493489583, "learning_rate": 0.0001, "loss": 5.6678, "loss/crossentropy": 2.560866951942444, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16460144519805908, "step": 27872 }, { "epoch": 0.8710625, "grad_norm": 3.0, "grad_norm_var": 0.05629781087239583, "learning_rate": 0.0001, "loss": 5.0139, "loss/crossentropy": 2.1779093146324158, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1390697881579399, "step": 27874 }, { "epoch": 0.871125, "grad_norm": 2.953125, "grad_norm_var": 0.0569976806640625, "learning_rate": 0.0001, "loss": 5.7047, "loss/crossentropy": 2.626152992248535, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16058950126171112, "step": 27876 }, { "epoch": 0.8711875, "grad_norm": 2.96875, "grad_norm_var": 0.054539998372395836, "learning_rate": 0.0001, "loss": 5.8489, "loss/crossentropy": 2.699462413787842, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.17041077464818954, "step": 27878 }, { "epoch": 0.87125, "grad_norm": 3.234375, "grad_norm_var": 0.05657145182291667, "learning_rate": 0.0001, "loss": 5.6879, "loss/crossentropy": 2.6047691106796265, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.1657308042049408, "step": 27880 }, { "epoch": 0.8713125, "grad_norm": 2.90625, "grad_norm_var": 0.05953369140625, "learning_rate": 0.0001, "loss": 5.5722, "loss/crossentropy": 2.595220685005188, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.1527717337012291, "step": 27882 }, { "epoch": 0.871375, "grad_norm": 3.21875, "grad_norm_var": 0.060498046875, "learning_rate": 0.0001, "loss": 5.7597, "loss/crossentropy": 2.5783873796463013, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.16930709779262543, "step": 27884 }, { "epoch": 0.8714375, "grad_norm": 3.109375, "grad_norm_var": 0.053023274739583334, "learning_rate": 0.0001, "loss": 5.5359, "loss/crossentropy": 2.4618382453918457, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.15936200320720673, "step": 27886 }, { "epoch": 0.8715, "grad_norm": 3.125, "grad_norm_var": 0.00953369140625, "learning_rate": 0.0001, "loss": 5.6499, "loss/crossentropy": 2.5861200094223022, "loss/hidden": 1.48828125, "loss/jsd": 0.0, "loss/logits": 0.15754631161689758, "step": 27888 }, { "epoch": 0.8715625, "grad_norm": 3.03125, "grad_norm_var": 0.011083984375, "learning_rate": 0.0001, "loss": 5.389, "loss/crossentropy": 2.3745484352111816, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1522216796875, "step": 27890 }, { "epoch": 0.871625, "grad_norm": 2.828125, "grad_norm_var": 0.014078776041666666, "learning_rate": 0.0001, "loss": 5.5995, "loss/crossentropy": 2.5988909006118774, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15748737752437592, "step": 27892 }, { "epoch": 0.8716875, "grad_norm": 2.921875, "grad_norm_var": 0.014274088541666667, "learning_rate": 0.0001, "loss": 5.5922, "loss/crossentropy": 2.5821398496627808, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1564764752984047, "step": 27894 }, { "epoch": 0.87175, "grad_norm": 3.09375, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 5.6847, "loss/crossentropy": 2.625950574874878, "loss/hidden": 1.4453125, "loss/jsd": 0.0, "loss/logits": 0.1613416001200676, "step": 27896 }, { "epoch": 0.8718125, "grad_norm": 2.984375, "grad_norm_var": 0.01207275390625, "learning_rate": 0.0001, "loss": 5.5418, "loss/crossentropy": 2.5476146936416626, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.1572282910346985, "step": 27898 }, { "epoch": 0.871875, "grad_norm": 2.859375, "grad_norm_var": 0.012272135416666666, "learning_rate": 0.0001, "loss": 5.416, "loss/crossentropy": 2.4924851655960083, "loss/hidden": 1.41015625, "loss/jsd": 0.0, "loss/logits": 0.1513383388519287, "step": 27900 }, { "epoch": 0.8719375, "grad_norm": 2.9375, "grad_norm_var": 0.0104644775390625, "learning_rate": 0.0001, "loss": 5.4087, "loss/crossentropy": 2.4145772457122803, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.1556660234928131, "step": 27902 }, { "epoch": 0.872, "grad_norm": 2.984375, "grad_norm_var": 0.048273722330729164, "learning_rate": 0.0001, "loss": 5.7284, "loss/crossentropy": 2.542745590209961, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.17129983752965927, "step": 27904 }, { "epoch": 0.8720625, "grad_norm": 3.046875, "grad_norm_var": 0.04576416015625, "learning_rate": 0.0001, "loss": 5.5258, "loss/crossentropy": 2.5590767860412598, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.1552676558494568, "step": 27906 }, { "epoch": 0.872125, "grad_norm": 3.21875, "grad_norm_var": 0.051854451497395836, "learning_rate": 0.0001, "loss": 5.6264, "loss/crossentropy": 2.5522128343582153, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.1601487547159195, "step": 27908 }, { "epoch": 0.8721875, "grad_norm": 2.828125, "grad_norm_var": 0.055887858072916664, "learning_rate": 0.0001, "loss": 5.3312, "loss/crossentropy": 2.476024866104126, "loss/hidden": 1.3828125, "loss/jsd": 0.0, "loss/logits": 0.14723709970712662, "step": 27910 }, { "epoch": 0.87225, "grad_norm": 2.828125, "grad_norm_var": 0.05878499348958333, "learning_rate": 0.0001, "loss": 5.571, "loss/crossentropy": 2.498559832572937, "loss/hidden": 1.5390625, "loss/jsd": 0.0, "loss/logits": 0.15333276242017746, "step": 27912 }, { "epoch": 0.8723125, "grad_norm": 2.890625, "grad_norm_var": 0.05969950358072917, "learning_rate": 0.0001, "loss": 5.4344, "loss/crossentropy": 2.528977632522583, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15030576288700104, "step": 27914 }, { "epoch": 0.872375, "grad_norm": 3.09375, "grad_norm_var": 0.05712483723958333, "learning_rate": 0.0001, "loss": 5.5819, "loss/crossentropy": 2.4894860982894897, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16432037949562073, "step": 27916 }, { "epoch": 0.8724375, "grad_norm": 2.921875, "grad_norm_var": 0.0864410400390625, "learning_rate": 0.0001, "loss": 5.7793, "loss/crossentropy": 2.604734182357788, "loss/hidden": 1.484375, "loss/jsd": 0.0, "loss/logits": 0.16901922971010208, "step": 27918 }, { "epoch": 0.8725, "grad_norm": 2.921875, "grad_norm_var": 0.05397847493489583, "learning_rate": 0.0001, "loss": 5.0961, "loss/crossentropy": 2.179744601249695, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.14632698148488998, "step": 27920 }, { "epoch": 0.8725625, "grad_norm": 3.234375, "grad_norm_var": 0.059407552083333336, "learning_rate": 0.0001, "loss": 5.4898, "loss/crossentropy": 2.4999831914901733, "loss/hidden": 1.421875, "loss/jsd": 0.0, "loss/logits": 0.15679404884576797, "step": 27922 }, { "epoch": 0.872625, "grad_norm": 3.15625, "grad_norm_var": 0.051569620768229164, "learning_rate": 0.0001, "loss": 5.3989, "loss/crossentropy": 2.3952577114105225, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.15426691621541977, "step": 27924 }, { "epoch": 0.8726875, "grad_norm": 3.484375, "grad_norm_var": 0.061986287434895836, "learning_rate": 0.0001, "loss": 5.4991, "loss/crossentropy": 2.388551950454712, "loss/hidden": 1.50390625, "loss/jsd": 0.0, "loss/logits": 0.1606658473610878, "step": 27926 }, { "epoch": 0.87275, "grad_norm": 3.3125, "grad_norm_var": 0.0607330322265625, "learning_rate": 0.0001, "loss": 5.8942, "loss/crossentropy": 2.658738136291504, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.17432327568531036, "step": 27928 }, { "epoch": 0.8728125, "grad_norm": 3.34375, "grad_norm_var": 0.060888671875, "learning_rate": 0.0001, "loss": 5.738, "loss/crossentropy": 2.5245888233184814, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.17368977516889572, "step": 27930 }, { "epoch": 0.872875, "grad_norm": 3.265625, "grad_norm_var": 0.06109619140625, "learning_rate": 0.0001, "loss": 5.7505, "loss/crossentropy": 2.649577260017395, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16087592393159866, "step": 27932 }, { "epoch": 0.8729375, "grad_norm": 3.125, "grad_norm_var": 0.03780924479166667, "learning_rate": 0.0001, "loss": 5.7636, "loss/crossentropy": 2.5796053409576416, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.1719168797135353, "step": 27934 }, { "epoch": 0.873, "grad_norm": 2.90625, "grad_norm_var": 0.04420166015625, "learning_rate": 0.0001, "loss": 5.4642, "loss/crossentropy": 2.502958655357361, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15550021082162857, "step": 27936 }, { "epoch": 0.8730625, "grad_norm": 3.015625, "grad_norm_var": 0.03916015625, "learning_rate": 0.0001, "loss": 5.7373, "loss/crossentropy": 2.6349422931671143, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16297203302383423, "step": 27938 }, { "epoch": 0.873125, "grad_norm": 3.0625, "grad_norm_var": 0.041748046875, "learning_rate": 0.0001, "loss": 5.7272, "loss/crossentropy": 2.6609092950820923, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16171126067638397, "step": 27940 }, { "epoch": 0.8731875, "grad_norm": 2.921875, "grad_norm_var": 0.02720947265625, "learning_rate": 0.0001, "loss": 5.6362, "loss/crossentropy": 2.5398753881454468, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.16041860729455948, "step": 27942 }, { "epoch": 0.87325, "grad_norm": 2.859375, "grad_norm_var": 0.026414998372395835, "learning_rate": 0.0001, "loss": 5.7588, "loss/crossentropy": 2.6977330446243286, "loss/hidden": 1.453125, "loss/jsd": 0.0, "loss/logits": 0.16079582273960114, "step": 27944 }, { "epoch": 0.8733125, "grad_norm": 2.859375, "grad_norm_var": 0.019189453125, "learning_rate": 0.0001, "loss": 5.2738, "loss/crossentropy": 2.329177975654602, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.15305107086896896, "step": 27946 }, { "epoch": 0.873375, "grad_norm": 3.53125, "grad_norm_var": 0.03835347493489583, "learning_rate": 0.0001, "loss": 5.7664, "loss/crossentropy": 2.633378744125366, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16525045782327652, "step": 27948 }, { "epoch": 0.8734375, "grad_norm": 3.125, "grad_norm_var": 0.0434478759765625, "learning_rate": 0.0001, "loss": 5.8757, "loss/crossentropy": 2.620681881904602, "loss/hidden": 1.46484375, "loss/jsd": 0.0, "loss/logits": 0.17902208864688873, "step": 27950 }, { "epoch": 0.8735, "grad_norm": 2.875, "grad_norm_var": 0.041413370768229166, "learning_rate": 0.0001, "loss": 5.3654, "loss/crossentropy": 2.3401836156845093, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15682092308998108, "step": 27952 }, { "epoch": 0.8735625, "grad_norm": 3.328125, "grad_norm_var": 0.04488525390625, "learning_rate": 0.0001, "loss": 5.5194, "loss/crossentropy": 2.421567916870117, "loss/hidden": 1.44921875, "loss/jsd": 0.0, "loss/logits": 0.16486230492591858, "step": 27954 }, { "epoch": 0.873625, "grad_norm": 2.984375, "grad_norm_var": 0.0427642822265625, "learning_rate": 0.0001, "loss": 5.7097, "loss/crossentropy": 2.6570467948913574, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.16151063144207, "step": 27956 }, { "epoch": 0.8736875, "grad_norm": 3.046875, "grad_norm_var": 0.04407450358072917, "learning_rate": 0.0001, "loss": 5.437, "loss/crossentropy": 2.421030282974243, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.159804105758667, "step": 27958 }, { "epoch": 0.87375, "grad_norm": 2.890625, "grad_norm_var": 0.043553670247395836, "learning_rate": 0.0001, "loss": 5.3645, "loss/crossentropy": 2.3721309900283813, "loss/hidden": 1.42578125, "loss/jsd": 0.0, "loss/logits": 0.15665796399116516, "step": 27960 }, { "epoch": 0.8738125, "grad_norm": 3.15625, "grad_norm_var": 0.03753255208333333, "learning_rate": 0.0001, "loss": 5.6812, "loss/crossentropy": 2.558328866958618, "loss/hidden": 1.4609375, "loss/jsd": 0.0, "loss/logits": 0.16619154810905457, "step": 27962 }, { "epoch": 0.873875, "grad_norm": 3.3125, "grad_norm_var": 57.6794179280599, "learning_rate": 0.0001, "loss": 6.8675, "loss/crossentropy": 2.5392621755599976, "loss/hidden": 2.33203125, "loss/jsd": 0.0, "loss/logits": 0.19961903989315033, "step": 27964 }, { "epoch": 0.8739375, "grad_norm": 3.25, "grad_norm_var": 57.57952067057292, "learning_rate": 0.0001, "loss": 6.1596, "loss/crossentropy": 2.856844186782837, "loss/hidden": 1.4921875, "loss/jsd": 0.0, "loss/logits": 0.1810523048043251, "step": 27966 }, { "epoch": 0.874, "grad_norm": 3.15625, "grad_norm_var": 57.439134724934895, "learning_rate": 0.0001, "loss": 5.7996, "loss/crossentropy": 2.6578762531280518, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.16612619161605835, "step": 27968 }, { "epoch": 0.8740625, "grad_norm": 2.984375, "grad_norm_var": 57.474690755208336, "learning_rate": 0.0001, "loss": 5.4024, "loss/crossentropy": 2.3677303791046143, "loss/hidden": 1.44140625, "loss/jsd": 0.0, "loss/logits": 0.15932901203632355, "step": 27970 }, { "epoch": 0.874125, "grad_norm": 2.875, "grad_norm_var": 57.57089436848958, "learning_rate": 0.0001, "loss": 5.2109, "loss/crossentropy": 2.308212399482727, "loss/hidden": 1.4140625, "loss/jsd": 0.0, "loss/logits": 0.14886417984962463, "step": 27972 }, { "epoch": 0.8741875, "grad_norm": 2.890625, "grad_norm_var": 57.66669514973958, "learning_rate": 0.0001, "loss": 5.3797, "loss/crossentropy": 2.4334518909454346, "loss/hidden": 1.40625, "loss/jsd": 0.0, "loss/logits": 0.15399521589279175, "step": 27974 }, { "epoch": 0.87425, "grad_norm": 3.5, "grad_norm_var": 57.528706868489586, "learning_rate": 0.0001, "loss": 5.5566, "loss/crossentropy": 2.4991366863250732, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.1600421965122223, "step": 27976 }, { "epoch": 0.8743125, "grad_norm": 3.375, "grad_norm_var": 57.4717763264974, "learning_rate": 0.0001, "loss": 6.0771, "loss/crossentropy": 2.8167072534561157, "loss/hidden": 1.4765625, "loss/jsd": 0.0, "loss/logits": 0.1783793643116951, "step": 27978 }, { "epoch": 0.874375, "grad_norm": 3.21875, "grad_norm_var": 0.06281636555989584, "learning_rate": 0.0001, "loss": 5.7711, "loss/crossentropy": 2.637932062149048, "loss/hidden": 1.49609375, "loss/jsd": 0.0, "loss/logits": 0.16370397806167603, "step": 27980 }, { "epoch": 0.8744375, "grad_norm": 2.796875, "grad_norm_var": 0.04554036458333333, "learning_rate": 0.0001, "loss": 5.3857, "loss/crossentropy": 2.453414559364319, "loss/hidden": 1.40234375, "loss/jsd": 0.0, "loss/logits": 0.15299443900585175, "step": 27982 }, { "epoch": 0.8745, "grad_norm": 3.1875, "grad_norm_var": 0.038623046875, "learning_rate": 0.0001, "loss": 5.8691, "loss/crossentropy": 2.6978119611740112, "loss/hidden": 1.46875, "loss/jsd": 0.0, "loss/logits": 0.17025170475244522, "step": 27984 }, { "epoch": 0.8745625, "grad_norm": 3.15625, "grad_norm_var": 0.98033447265625, "learning_rate": 0.0001, "loss": 5.6903, "loss/crossentropy": 2.477673649787903, "loss/hidden": 1.48046875, "loss/jsd": 0.0, "loss/logits": 0.17321471869945526, "step": 27986 }, { "epoch": 0.874625, "grad_norm": 3.109375, "grad_norm_var": 0.9660308837890625, "learning_rate": 0.0001, "loss": 5.6387, "loss/crossentropy": 2.6202985048294067, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15809043496847153, "step": 27988 }, { "epoch": 0.8746875, "grad_norm": 3.34375, "grad_norm_var": 0.9475870768229167, "learning_rate": 0.0001, "loss": 5.7237, "loss/crossentropy": 2.6030184030532837, "loss/hidden": 1.43359375, "loss/jsd": 0.0, "loss/logits": 0.16871295869350433, "step": 27990 }, { "epoch": 0.87475, "grad_norm": 2.984375, "grad_norm_var": 0.9591135660807292, "learning_rate": 0.0001, "loss": 5.7788, "loss/crossentropy": 2.6670650243759155, "loss/hidden": 1.47265625, "loss/jsd": 0.0, "loss/logits": 0.16391193866729736, "step": 27992 }, { "epoch": 0.8748125, "grad_norm": 3.078125, "grad_norm_var": 0.9908437093098958, "learning_rate": 0.0001, "loss": 5.3174, "loss/crossentropy": 2.3251583576202393, "loss/hidden": 1.45703125, "loss/jsd": 0.0, "loss/logits": 0.15352027863264084, "step": 27994 }, { "epoch": 0.874875, "grad_norm": 2.71875, "grad_norm_var": 1.0175120035807292, "learning_rate": 0.0001, "loss": 5.5982, "loss/crossentropy": 2.5926759243011475, "loss/hidden": 1.4375, "loss/jsd": 0.0, "loss/logits": 0.15680398046970367, "step": 27996 }, { "epoch": 0.8749375, "grad_norm": 2.96875, "grad_norm_var": 1.0045562744140626, "learning_rate": 0.0001, "loss": 5.5022, "loss/crossentropy": 2.51913321018219, "loss/hidden": 1.4296875, "loss/jsd": 0.0, "loss/logits": 0.15533510595560074, "step": 27998 }, { "epoch": 0.875, "grad_norm": 2.8125, "grad_norm_var": 1.0251617431640625, "learning_rate": 0.0001, "loss": 5.7521, "loss/crossentropy": 2.7920806407928467, "loss/hidden": 1.41796875, "loss/jsd": 0.0, "loss/logits": 0.1542009562253952, "step": 28000 } ], "logging_steps": 2, "max_steps": 32000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.61518537375744e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }