{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003333333333333333, "grad_norm": 5088.0, "learning_rate": 1.9e-05, "loss": 132.6959, "loss/crossentropy": 12.028920578956605, "loss/hidden": 18.7375, "loss/jsd": 0.0, "loss/logits": 10.20107181072235, "step": 10 }, { "epoch": 0.0006666666666666666, "grad_norm": 428.0, "grad_norm_var": 86465919.73333333, "learning_rate": 2.8000000000000003e-05, "loss": 97.5714, "loss/crossentropy": 8.78247936964035, "loss/hidden": 18.70625, "loss/jsd": 0.0, "loss/logits": 6.826563286781311, "step": 20 }, { "epoch": 0.001, "grad_norm": 206.0, "grad_norm_var": 183176.66666666666, "learning_rate": 3.7e-05, "loss": 87.3595, "loss/crossentropy": 8.069220972061157, "loss/hidden": 18.36875, "loss/jsd": 0.0, "loss/logits": 6.267633223533631, "step": 30 }, { "epoch": 0.0013333333333333333, "grad_norm": 1064.0, "grad_norm_var": 99002.91666666667, "learning_rate": 4.600000000000001e-05, "loss": 84.1061, "loss/crossentropy": 7.728456330299378, "loss/hidden": 17.69375, "loss/jsd": 0.0, "loss/logits": 5.882031416893005, "step": 40 }, { "epoch": 0.0016666666666666668, "grad_norm": 474.0, "grad_norm_var": 84834.06666666667, "learning_rate": 5.500000000000001e-05, "loss": 75.8277, "loss/crossentropy": 6.95980271100998, "loss/hidden": 17.3125, "loss/jsd": 0.0, "loss/logits": 5.054542422294617, "step": 50 }, { "epoch": 0.002, "grad_norm": 616.0, "grad_norm_var": 52564.2, "learning_rate": 6.400000000000001e-05, "loss": 60.9591, "loss/crossentropy": 5.805091935396194, "loss/hidden": 15.93125, "loss/jsd": 0.0, "loss/logits": 3.9220160007476808, "step": 60 }, { "epoch": 0.0023333333333333335, "grad_norm": 384.0, "grad_norm_var": 67375.4, "learning_rate": 7.3e-05, "loss": 41.3956, "loss/crossentropy": 4.246163284778595, "loss/hidden": 13.2265625, "loss/jsd": 0.0, "loss/logits": 2.3237137854099275, "step": 70 }, { "epoch": 0.0026666666666666666, "grad_norm": 272.0, "grad_norm_var": 11768.466666666667, "learning_rate": 8.200000000000001e-05, "loss": 28.7252, "loss/crossentropy": 3.3240436017513275, "loss/hidden": 11.096875, "loss/jsd": 0.0, "loss/logits": 1.4113391578197478, "step": 80 }, { "epoch": 0.003, "grad_norm": 298.0, "grad_norm_var": 2.1871038589218397e+17, "learning_rate": 9.1e-05, "loss": 24.0937, "loss/crossentropy": 3.320331507921219, "loss/hidden": 9.5125, "loss/jsd": 0.0, "loss/logits": 1.10816071331501, "step": 90 }, { "epoch": 0.0033333333333333335, "grad_norm": 288.0, "grad_norm_var": 5410.866666666667, "learning_rate": 0.0001, "loss": 21.4439, "loss/crossentropy": 2.901010638475418, "loss/hidden": 9.178125, "loss/jsd": 0.0, "loss/logits": 0.9687246754765511, "step": 100 }, { "epoch": 0.0036666666666666666, "grad_norm": 280.0, "grad_norm_var": 3854.6625, "learning_rate": 0.0001, "loss": 19.6349, "loss/crossentropy": 2.818925604224205, "loss/hidden": 8.39375, "loss/jsd": 0.0, "loss/logits": 0.8407707408070564, "step": 110 }, { "epoch": 0.004, "grad_norm": 222.0, "grad_norm_var": 1976.8958333333333, "learning_rate": 0.0001, "loss": 18.756, "loss/crossentropy": 2.66967076510191, "loss/hidden": 8.33125, "loss/jsd": 0.0, "loss/logits": 0.7849601306021213, "step": 120 }, { "epoch": 0.004333333333333333, "grad_norm": 163.0, "grad_norm_var": 1472.3833333333334, "learning_rate": 0.0001, "loss": 18.1448, "loss/crossentropy": 2.513835993409157, "loss/hidden": 8.1203125, "loss/jsd": 0.0, "loss/logits": 0.7554221481084824, "step": 130 }, { "epoch": 0.004666666666666667, "grad_norm": 239.0, "grad_norm_var": 1318.5958333333333, "learning_rate": 0.0001, "loss": 17.6846, "loss/crossentropy": 2.591602721810341, "loss/hidden": 7.6875, "loss/jsd": 0.0, "loss/logits": 0.7016006924211979, "step": 140 }, { "epoch": 0.005, "grad_norm": 214.0, "grad_norm_var": 11592.6625, "learning_rate": 0.0001, "loss": 17.3952, "loss/crossentropy": 2.6045392960309983, "loss/hidden": 7.6734375, "loss/jsd": 0.0, "loss/logits": 0.7216012105345726, "step": 150 }, { "epoch": 0.005333333333333333, "grad_norm": 188.0, "grad_norm_var": 1593.7625, "learning_rate": 0.0001, "loss": 16.5206, "loss/crossentropy": 2.59020614027977, "loss/hidden": 7.3015625, "loss/jsd": 0.0, "loss/logits": 0.6340146750211716, "step": 160 }, { "epoch": 0.005666666666666667, "grad_norm": 174.0, "grad_norm_var": 1288.140625, "learning_rate": 0.0001, "loss": 16.4628, "loss/crossentropy": 2.5054407477378846, "loss/hidden": 7.43359375, "loss/jsd": 0.0, "loss/logits": 0.667105832695961, "step": 170 }, { "epoch": 0.006, "grad_norm": 186.0, "grad_norm_var": 1223.015625, "learning_rate": 0.0001, "loss": 15.796, "loss/crossentropy": 2.3755379378795625, "loss/hidden": 7.1609375, "loss/jsd": 0.0, "loss/logits": 0.5874520897865295, "step": 180 }, { "epoch": 0.006333333333333333, "grad_norm": 107.5, "grad_norm_var": 1354.0239583333334, "learning_rate": 0.0001, "loss": 15.5922, "loss/crossentropy": 2.3762576043605805, "loss/hidden": 7.26484375, "loss/jsd": 0.0, "loss/logits": 0.6503637298941612, "step": 190 }, { "epoch": 0.006666666666666667, "grad_norm": 109.5, "grad_norm_var": 3071.4, "learning_rate": 0.0001, "loss": 15.2694, "loss/crossentropy": 2.2902157098054885, "loss/hidden": 7.0265625, "loss/jsd": 0.0, "loss/logits": 0.6195260547101498, "step": 200 }, { "epoch": 0.007, "grad_norm": 194.0, "grad_norm_var": 2804.890625, "learning_rate": 0.0001, "loss": 15.2413, "loss/crossentropy": 2.623681750893593, "loss/hidden": 6.796875, "loss/jsd": 0.0, "loss/logits": 0.6029888540506363, "step": 210 }, { "epoch": 0.007333333333333333, "grad_norm": 136.0, "grad_norm_var": 624.8989583333333, "learning_rate": 0.0001, "loss": 14.771, "loss/crossentropy": 2.319578641653061, "loss/hidden": 6.809375, "loss/jsd": 0.0, "loss/logits": 0.5596946202218532, "step": 220 }, { "epoch": 0.007666666666666666, "grad_norm": 96.0, "grad_norm_var": 524.8989583333333, "learning_rate": 0.0001, "loss": 14.4901, "loss/crossentropy": 2.2120961263775825, "loss/hidden": 6.6609375, "loss/jsd": 0.0, "loss/logits": 0.5178675353527069, "step": 230 }, { "epoch": 0.008, "grad_norm": 119.0, "grad_norm_var": 801.0822916666667, "learning_rate": 0.0001, "loss": 14.4657, "loss/crossentropy": 2.444407218694687, "loss/hidden": 6.40546875, "loss/jsd": 0.0, "loss/logits": 0.543676958233118, "step": 240 }, { "epoch": 0.008333333333333333, "grad_norm": 266.0, "grad_norm_var": 1661.15, "learning_rate": 0.0001, "loss": 14.384, "loss/crossentropy": 2.4827089831233025, "loss/hidden": 6.49375, "loss/jsd": 0.0, "loss/logits": 0.5518345102667809, "step": 250 }, { "epoch": 0.008666666666666666, "grad_norm": 171.0, "grad_norm_var": 1844.040625, "learning_rate": 0.0001, "loss": 14.1886, "loss/crossentropy": 2.3922463700175287, "loss/hidden": 6.5015625, "loss/jsd": 0.0, "loss/logits": 0.5299135472625494, "step": 260 }, { "epoch": 0.009, "grad_norm": 90.5, "grad_norm_var": 1093.9625, "learning_rate": 0.0001, "loss": 14.2358, "loss/crossentropy": 2.395447109639645, "loss/hidden": 6.4796875, "loss/jsd": 0.0, "loss/logits": 0.5689245201647282, "step": 270 }, { "epoch": 0.009333333333333334, "grad_norm": 123.0, "grad_norm_var": 597.1166666666667, "learning_rate": 0.0001, "loss": 13.9794, "loss/crossentropy": 2.240339662134647, "loss/hidden": 6.4296875, "loss/jsd": 0.0, "loss/logits": 0.5223498687148094, "step": 280 }, { "epoch": 0.009666666666666667, "grad_norm": 144.0, "grad_norm_var": 427.7, "learning_rate": 0.0001, "loss": 13.7849, "loss/crossentropy": 2.214311620593071, "loss/hidden": 6.26796875, "loss/jsd": 0.0, "loss/logits": 0.5229207530617714, "step": 290 }, { "epoch": 0.01, "grad_norm": 91.0, "grad_norm_var": 293.015625, "learning_rate": 0.0001, "loss": 13.5058, "loss/crossentropy": 2.431586265563965, "loss/hidden": 6.21015625, "loss/jsd": 0.0, "loss/logits": 0.5209170162677765, "step": 300 }, { "epoch": 0.010333333333333333, "grad_norm": 94.0, "grad_norm_var": 305.990625, "learning_rate": 0.0001, "loss": 13.5941, "loss/crossentropy": 2.4835646122694017, "loss/hidden": 5.96328125, "loss/jsd": 0.0, "loss/logits": 0.48731190487742426, "step": 310 }, { "epoch": 0.010666666666666666, "grad_norm": 80.0, "grad_norm_var": 289.59583333333336, "learning_rate": 0.0001, "loss": 13.3078, "loss/crossentropy": 2.441184702515602, "loss/hidden": 5.97421875, "loss/jsd": 0.0, "loss/logits": 0.46742800548672675, "step": 320 }, { "epoch": 0.011, "grad_norm": 102.0, "grad_norm_var": 257.42395833333336, "learning_rate": 0.0001, "loss": 13.0426, "loss/crossentropy": 2.2604060992598534, "loss/hidden": 6.03359375, "loss/jsd": 0.0, "loss/logits": 0.4528258040547371, "step": 330 }, { "epoch": 0.011333333333333334, "grad_norm": 96.5, "grad_norm_var": 2910.0958333333333, "learning_rate": 0.0001, "loss": 13.1213, "loss/crossentropy": 2.4144359961152078, "loss/hidden": 5.98203125, "loss/jsd": 0.0, "loss/logits": 0.488891564682126, "step": 340 }, { "epoch": 0.011666666666666667, "grad_norm": 96.5, "grad_norm_var": 3266.95, "learning_rate": 0.0001, "loss": 13.0347, "loss/crossentropy": 2.3632063284516334, "loss/hidden": 5.91640625, "loss/jsd": 0.0, "loss/logits": 0.4649609446525574, "step": 350 }, { "epoch": 0.012, "grad_norm": 106.5, "grad_norm_var": 655.115625, "learning_rate": 0.0001, "loss": 12.8798, "loss/crossentropy": 2.2149820044636725, "loss/hidden": 6.115625, "loss/jsd": 0.0, "loss/logits": 0.47544198893010614, "step": 360 }, { "epoch": 0.012333333333333333, "grad_norm": 84.5, "grad_norm_var": 172.790625, "learning_rate": 0.0001, "loss": 12.8471, "loss/crossentropy": 2.5810438305139543, "loss/hidden": 6.00078125, "loss/jsd": 0.0, "loss/logits": 0.48572778329253197, "step": 370 }, { "epoch": 0.012666666666666666, "grad_norm": 95.0, "grad_norm_var": 128.88333333333333, "learning_rate": 0.0001, "loss": 12.7666, "loss/crossentropy": 2.2736202508211134, "loss/hidden": 5.878125, "loss/jsd": 0.0, "loss/logits": 0.46869536861777306, "step": 380 }, { "epoch": 0.013, "grad_norm": 124.5, "grad_norm_var": 140.51666666666668, "learning_rate": 0.0001, "loss": 12.605, "loss/crossentropy": 2.276585566997528, "loss/hidden": 5.825, "loss/jsd": 0.0, "loss/logits": 0.45089508444070814, "step": 390 }, { "epoch": 0.013333333333333334, "grad_norm": 94.5, "grad_norm_var": 311.590625, "learning_rate": 0.0001, "loss": 12.5188, "loss/crossentropy": 2.4131533786654473, "loss/hidden": 5.66875, "loss/jsd": 0.0, "loss/logits": 0.4504088945686817, "step": 400 }, { "epoch": 0.013666666666666667, "grad_norm": 83.5, "grad_norm_var": 364.76666666666665, "learning_rate": 0.0001, "loss": 12.4151, "loss/crossentropy": 2.1796241596341135, "loss/hidden": 5.73671875, "loss/jsd": 0.0, "loss/logits": 0.435577293112874, "step": 410 }, { "epoch": 0.014, "grad_norm": 93.5, "grad_norm_var": 187.52395833333333, "learning_rate": 0.0001, "loss": 12.4676, "loss/crossentropy": 2.1736431539058687, "loss/hidden": 5.78359375, "loss/jsd": 0.0, "loss/logits": 0.4561117485165596, "step": 420 }, { "epoch": 0.014333333333333333, "grad_norm": 90.5, "grad_norm_var": 48.795833333333334, "learning_rate": 0.0001, "loss": 12.0711, "loss/crossentropy": 2.283279325067997, "loss/hidden": 5.45859375, "loss/jsd": 0.0, "loss/logits": 0.4215318731963634, "step": 430 }, { "epoch": 0.014666666666666666, "grad_norm": 89.5, "grad_norm_var": 49.42916666666667, "learning_rate": 0.0001, "loss": 12.1374, "loss/crossentropy": 2.291328126192093, "loss/hidden": 5.5125, "loss/jsd": 0.0, "loss/logits": 0.434352046251297, "step": 440 }, { "epoch": 0.015, "grad_norm": 92.0, "grad_norm_var": 156.45, "learning_rate": 0.0001, "loss": 12.1345, "loss/crossentropy": 2.4191192060709, "loss/hidden": 5.35546875, "loss/jsd": 0.0, "loss/logits": 0.4199687227606773, "step": 450 }, { "epoch": 0.015333333333333332, "grad_norm": 88.0, "grad_norm_var": 208.2625, "learning_rate": 0.0001, "loss": 12.1403, "loss/crossentropy": 2.1956121422350408, "loss/hidden": 5.4875, "loss/jsd": 0.0, "loss/logits": 0.4142579145729542, "step": 460 }, { "epoch": 0.015666666666666666, "grad_norm": 89.0, "grad_norm_var": 140.31666666666666, "learning_rate": 0.0001, "loss": 12.0963, "loss/crossentropy": 2.2322947554290296, "loss/hidden": 5.6171875, "loss/jsd": 0.0, "loss/logits": 0.4137666640803218, "step": 470 }, { "epoch": 0.016, "grad_norm": 76.0, "grad_norm_var": 64.23333333333333, "learning_rate": 0.0001, "loss": 12.219, "loss/crossentropy": 2.3545165613293646, "loss/hidden": 5.5296875, "loss/jsd": 0.0, "loss/logits": 0.4294994674623013, "step": 480 }, { "epoch": 0.01633333333333333, "grad_norm": 83.0, "grad_norm_var": 58.915625, "learning_rate": 0.0001, "loss": 11.7744, "loss/crossentropy": 2.3146368995308877, "loss/hidden": 5.428125, "loss/jsd": 0.0, "loss/logits": 0.4236735228449106, "step": 490 }, { "epoch": 0.016666666666666666, "grad_norm": 78.5, "grad_norm_var": 64.83229166666666, "learning_rate": 0.0001, "loss": 11.8087, "loss/crossentropy": 2.1017669927328826, "loss/hidden": 5.6140625, "loss/jsd": 0.0, "loss/logits": 0.4007458407431841, "step": 500 }, { "epoch": 0.017, "grad_norm": 79.0, "grad_norm_var": 105.6625, "learning_rate": 0.0001, "loss": 11.9731, "loss/crossentropy": 2.2921740829944612, "loss/hidden": 5.35390625, "loss/jsd": 0.0, "loss/logits": 0.4179579775780439, "step": 510 }, { "epoch": 0.017333333333333333, "grad_norm": 108.0, "grad_norm_var": 187.390625, "learning_rate": 0.0001, "loss": 11.6599, "loss/crossentropy": 2.2103257328271866, "loss/hidden": 5.28671875, "loss/jsd": 0.0, "loss/logits": 0.3849416717886925, "step": 520 }, { "epoch": 0.017666666666666667, "grad_norm": 81.5, "grad_norm_var": 107.10729166666667, "learning_rate": 0.0001, "loss": 12.1065, "loss/crossentropy": 2.397639387845993, "loss/hidden": 5.52265625, "loss/jsd": 0.0, "loss/logits": 0.45107794776558874, "step": 530 }, { "epoch": 0.018, "grad_norm": 129.0, "grad_norm_var": 193.940625, "learning_rate": 0.0001, "loss": 11.7656, "loss/crossentropy": 2.3040026426315308, "loss/hidden": 5.2296875, "loss/jsd": 0.0, "loss/logits": 0.39434340633451936, "step": 540 }, { "epoch": 0.018333333333333333, "grad_norm": 92.0, "grad_norm_var": 172.44895833333334, "learning_rate": 0.0001, "loss": 11.8646, "loss/crossentropy": 2.2055136799812316, "loss/hidden": 5.30703125, "loss/jsd": 0.0, "loss/logits": 0.3980072047561407, "step": 550 }, { "epoch": 0.018666666666666668, "grad_norm": 71.0, "grad_norm_var": 129.69895833333334, "learning_rate": 0.0001, "loss": 11.9471, "loss/crossentropy": 2.5020270466804506, "loss/hidden": 5.4015625, "loss/jsd": 0.0, "loss/logits": 0.44894293025135995, "step": 560 }, { "epoch": 0.019, "grad_norm": 84.0, "grad_norm_var": 93.34895833333333, "learning_rate": 0.0001, "loss": 11.7947, "loss/crossentropy": 2.094514015316963, "loss/hidden": 5.4953125, "loss/jsd": 0.0, "loss/logits": 0.40853818207979203, "step": 570 }, { "epoch": 0.019333333333333334, "grad_norm": 59.5, "grad_norm_var": 270.1322916666667, "learning_rate": 0.0001, "loss": 11.6187, "loss/crossentropy": 2.3205525130033493, "loss/hidden": 5.3140625, "loss/jsd": 0.0, "loss/logits": 0.4034106068313122, "step": 580 }, { "epoch": 0.019666666666666666, "grad_norm": 71.0, "grad_norm_var": 190.77395833333333, "learning_rate": 0.0001, "loss": 11.591, "loss/crossentropy": 2.110408242046833, "loss/hidden": 5.26328125, "loss/jsd": 0.0, "loss/logits": 0.3818111319094896, "step": 590 }, { "epoch": 0.02, "grad_norm": 78.5, "grad_norm_var": 245.09765625, "learning_rate": 0.0001, "loss": 11.5567, "loss/crossentropy": 2.301485204696655, "loss/hidden": 5.28671875, "loss/jsd": 0.0, "loss/logits": 0.39956560730934143, "step": 600 }, { "epoch": 0.02033333333333333, "grad_norm": 85.0, "grad_norm_var": 210.44895833333334, "learning_rate": 0.0001, "loss": 11.4252, "loss/crossentropy": 2.115160013735294, "loss/hidden": 5.18671875, "loss/jsd": 0.0, "loss/logits": 0.3820294298231602, "step": 610 }, { "epoch": 0.020666666666666667, "grad_norm": 76.5, "grad_norm_var": 312.940625, "learning_rate": 0.0001, "loss": 11.7005, "loss/crossentropy": 2.3295557737350463, "loss/hidden": 5.3875, "loss/jsd": 0.0, "loss/logits": 0.40171602740883827, "step": 620 }, { "epoch": 0.021, "grad_norm": 84.0, "grad_norm_var": 189.29557291666666, "learning_rate": 0.0001, "loss": 11.3593, "loss/crossentropy": 2.1354906648397445, "loss/hidden": 5.2125, "loss/jsd": 0.0, "loss/logits": 0.3748495355248451, "step": 630 }, { "epoch": 0.021333333333333333, "grad_norm": 109.0, "grad_norm_var": 1600.7072916666666, "learning_rate": 0.0001, "loss": 11.5477, "loss/crossentropy": 2.3208198621869087, "loss/hidden": 5.1359375, "loss/jsd": 0.0, "loss/logits": 0.38336952701210975, "step": 640 }, { "epoch": 0.021666666666666667, "grad_norm": 89.5, "grad_norm_var": 1528.8166666666666, "learning_rate": 0.0001, "loss": 11.4408, "loss/crossentropy": 2.3916768223047256, "loss/hidden": 5.29375, "loss/jsd": 0.0, "loss/logits": 0.39794068187475207, "step": 650 }, { "epoch": 0.022, "grad_norm": 4294967296.0, "grad_norm_var": 1.1529214644399553e+18, "learning_rate": 0.0001, "loss": 11.5534, "loss/crossentropy": 2.384619304537773, "loss/hidden": 5.54375, "loss/jsd": 0.0, "loss/logits": 0.44498190060257914, "step": 660 }, { "epoch": 0.022333333333333334, "grad_norm": 73.5, "grad_norm_var": 1.152921464574173e+18, "learning_rate": 0.0001, "loss": 11.4083, "loss/crossentropy": 2.305806961655617, "loss/hidden": 5.1078125, "loss/jsd": 0.0, "loss/logits": 0.3943803641945124, "step": 670 }, { "epoch": 0.02266666666666667, "grad_norm": 82.5, "grad_norm_var": 56.72890625, "learning_rate": 0.0001, "loss": 11.2602, "loss/crossentropy": 2.3092581436038015, "loss/hidden": 5.09765625, "loss/jsd": 0.0, "loss/logits": 0.39187341518700125, "step": 680 }, { "epoch": 0.023, "grad_norm": 65.0, "grad_norm_var": 117.25833333333334, "learning_rate": 0.0001, "loss": 11.1613, "loss/crossentropy": 2.324054108560085, "loss/hidden": 5.12578125, "loss/jsd": 0.0, "loss/logits": 0.3896127313375473, "step": 690 }, { "epoch": 0.023333333333333334, "grad_norm": 68.0, "grad_norm_var": 89.20833333333333, "learning_rate": 0.0001, "loss": 11.1875, "loss/crossentropy": 2.2932113975286486, "loss/hidden": 5.19453125, "loss/jsd": 0.0, "loss/logits": 0.40212590657174585, "step": 700 }, { "epoch": 0.023666666666666666, "grad_norm": 79.5, "grad_norm_var": 90.28515625, "learning_rate": 0.0001, "loss": 11.3541, "loss/crossentropy": 2.4318029940128327, "loss/hidden": 5.1109375, "loss/jsd": 0.0, "loss/logits": 0.4051614128053188, "step": 710 }, { "epoch": 0.024, "grad_norm": 73.0, "grad_norm_var": 89.22473958333333, "learning_rate": 0.0001, "loss": 11.1475, "loss/crossentropy": 2.167645823955536, "loss/hidden": 5.29296875, "loss/jsd": 0.0, "loss/logits": 0.43062909580767156, "step": 720 }, { "epoch": 0.024333333333333332, "grad_norm": 74.5, "grad_norm_var": 49.701822916666664, "learning_rate": 0.0001, "loss": 11.1669, "loss/crossentropy": 2.1876634269952775, "loss/hidden": 5.13671875, "loss/jsd": 0.0, "loss/logits": 0.3738254923373461, "step": 730 }, { "epoch": 0.024666666666666667, "grad_norm": 77.5, "grad_norm_var": 30.557291666666668, "learning_rate": 0.0001, "loss": 11.0079, "loss/crossentropy": 2.248200983554125, "loss/hidden": 5.09140625, "loss/jsd": 0.0, "loss/logits": 0.3778664033859968, "step": 740 }, { "epoch": 0.025, "grad_norm": 68.0, "grad_norm_var": 46.8625, "learning_rate": 0.0001, "loss": 11.0535, "loss/crossentropy": 2.162605920433998, "loss/hidden": 5.04921875, "loss/jsd": 0.0, "loss/logits": 0.38323657512664794, "step": 750 }, { "epoch": 0.025333333333333333, "grad_norm": 63.5, "grad_norm_var": 83.825, "learning_rate": 0.0001, "loss": 11.126, "loss/crossentropy": 2.2463886097073553, "loss/hidden": 5.05390625, "loss/jsd": 0.0, "loss/logits": 0.3759196888655424, "step": 760 }, { "epoch": 0.025666666666666667, "grad_norm": 67.5, "grad_norm_var": 39.149739583333336, "learning_rate": 0.0001, "loss": 11.2542, "loss/crossentropy": 2.368313530087471, "loss/hidden": 5.025, "loss/jsd": 0.0, "loss/logits": 0.40871408879756926, "step": 770 }, { "epoch": 0.026, "grad_norm": 76.0, "grad_norm_var": 37.065625, "learning_rate": 0.0001, "loss": 11.2333, "loss/crossentropy": 2.2050742127001284, "loss/hidden": 5.03359375, "loss/jsd": 0.0, "loss/logits": 0.3826916288584471, "step": 780 }, { "epoch": 0.026333333333333334, "grad_norm": 71.5, "grad_norm_var": 50.54765625, "learning_rate": 0.0001, "loss": 10.8321, "loss/crossentropy": 2.2104426354169844, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.34428741969168186, "step": 790 }, { "epoch": 0.02666666666666667, "grad_norm": 62.0, "grad_norm_var": 51.157291666666666, "learning_rate": 0.0001, "loss": 10.9576, "loss/crossentropy": 2.054627813398838, "loss/hidden": 5.0390625, "loss/jsd": 0.0, "loss/logits": 0.37033863496035335, "step": 800 }, { "epoch": 0.027, "grad_norm": 71.0, "grad_norm_var": 30.090625, "learning_rate": 0.0001, "loss": 10.8745, "loss/crossentropy": 2.171517415344715, "loss/hidden": 5.04921875, "loss/jsd": 0.0, "loss/logits": 0.36901252083480357, "step": 810 }, { "epoch": 0.027333333333333334, "grad_norm": 83.0, "grad_norm_var": 66.25833333333334, "learning_rate": 0.0001, "loss": 10.9984, "loss/crossentropy": 2.1388269782066347, "loss/hidden": 4.98828125, "loss/jsd": 0.0, "loss/logits": 0.3639502193778753, "step": 820 }, { "epoch": 0.027666666666666666, "grad_norm": 60.25, "grad_norm_var": 65.29895833333333, "learning_rate": 0.0001, "loss": 10.9877, "loss/crossentropy": 2.1383705154061317, "loss/hidden": 5.08203125, "loss/jsd": 0.0, "loss/logits": 0.35995694771409037, "step": 830 }, { "epoch": 0.028, "grad_norm": 60.0, "grad_norm_var": 28.024739583333332, "learning_rate": 0.0001, "loss": 10.8345, "loss/crossentropy": 2.2445564195513725, "loss/hidden": 4.8421875, "loss/jsd": 0.0, "loss/logits": 0.35285502672195435, "step": 840 }, { "epoch": 0.028333333333333332, "grad_norm": 106.0, "grad_norm_var": 137.91432291666666, "learning_rate": 0.0001, "loss": 10.9112, "loss/crossentropy": 2.26119641661644, "loss/hidden": 4.9140625, "loss/jsd": 0.0, "loss/logits": 0.3745645940303802, "step": 850 }, { "epoch": 0.028666666666666667, "grad_norm": 61.5, "grad_norm_var": 270.00807291666666, "learning_rate": 0.0001, "loss": 10.9954, "loss/crossentropy": 2.2785057038068772, "loss/hidden": 5.071875, "loss/jsd": 0.0, "loss/logits": 0.3852051142603159, "step": 860 }, { "epoch": 0.029, "grad_norm": 114.5, "grad_norm_var": 280.5541666666667, "learning_rate": 0.0001, "loss": 11.0732, "loss/crossentropy": 2.1508523888885973, "loss/hidden": 4.98984375, "loss/jsd": 0.0, "loss/logits": 0.36836351118981836, "step": 870 }, { "epoch": 0.029333333333333333, "grad_norm": 84.5, "grad_norm_var": 265.72265625, "learning_rate": 0.0001, "loss": 10.9948, "loss/crossentropy": 2.407784271240234, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.4067291602492332, "step": 880 }, { "epoch": 0.029666666666666668, "grad_norm": 68.5, "grad_norm_var": 199.90182291666667, "learning_rate": 0.0001, "loss": 11.052, "loss/crossentropy": 2.266117498278618, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.3621700868010521, "step": 890 }, { "epoch": 0.03, "grad_norm": 66.0, "grad_norm_var": 79.33307291666667, "learning_rate": 0.0001, "loss": 10.8697, "loss/crossentropy": 2.2878661900758743, "loss/hidden": 4.9578125, "loss/jsd": 0.0, "loss/logits": 0.38751605823636054, "step": 900 }, { "epoch": 0.030333333333333334, "grad_norm": 71.5, "grad_norm_var": 107.53333333333333, "learning_rate": 0.0001, "loss": 10.843, "loss/crossentropy": 2.1773984007537366, "loss/hidden": 4.98828125, "loss/jsd": 0.0, "loss/logits": 0.35708251409232616, "step": 910 }, { "epoch": 0.030666666666666665, "grad_norm": 67.0, "grad_norm_var": 161.60390625, "learning_rate": 0.0001, "loss": 10.7503, "loss/crossentropy": 2.1578031152486803, "loss/hidden": 4.833984375, "loss/jsd": 0.0, "loss/logits": 0.34460235945880413, "step": 920 }, { "epoch": 0.031, "grad_norm": 84.0, "grad_norm_var": 74.48307291666667, "learning_rate": 0.0001, "loss": 10.8098, "loss/crossentropy": 2.30833805501461, "loss/hidden": 4.9, "loss/jsd": 0.0, "loss/logits": 0.363369470089674, "step": 930 }, { "epoch": 0.03133333333333333, "grad_norm": 64.0, "grad_norm_var": 80.66848958333334, "learning_rate": 0.0001, "loss": 10.7164, "loss/crossentropy": 2.270130616426468, "loss/hidden": 4.9078125, "loss/jsd": 0.0, "loss/logits": 0.3668188262730837, "step": 940 }, { "epoch": 0.03166666666666667, "grad_norm": 61.75, "grad_norm_var": 31.870833333333334, "learning_rate": 0.0001, "loss": 10.7845, "loss/crossentropy": 2.313658607006073, "loss/hidden": 4.87734375, "loss/jsd": 0.0, "loss/logits": 0.38930617440491916, "step": 950 }, { "epoch": 0.032, "grad_norm": 65.5, "grad_norm_var": 29.598958333333332, "learning_rate": 0.0001, "loss": 10.6697, "loss/crossentropy": 2.3520640432834625, "loss/hidden": 4.84921875, "loss/jsd": 0.0, "loss/logits": 0.39863014221191406, "step": 960 }, { "epoch": 0.03233333333333333, "grad_norm": 5771362304.0, "grad_norm_var": 2.0817888831321674e+18, "learning_rate": 0.0001, "loss": 10.7484, "loss/crossentropy": 2.192959766089916, "loss/hidden": 4.798828125, "loss/jsd": 0.0, "loss/logits": 0.33885405771434307, "step": 970 }, { "epoch": 0.03266666666666666, "grad_norm": 51.25, "grad_norm_var": 2.081788882663244e+18, "learning_rate": 0.0001, "loss": 10.6531, "loss/crossentropy": 2.0819257736206054, "loss/hidden": 4.775, "loss/jsd": 0.0, "loss/logits": 0.3428235022351146, "step": 980 }, { "epoch": 0.033, "grad_norm": 63.0, "grad_norm_var": 35.618489583333336, "learning_rate": 0.0001, "loss": 10.8209, "loss/crossentropy": 2.250964765995741, "loss/hidden": 4.784375, "loss/jsd": 0.0, "loss/logits": 0.3942587487399578, "step": 990 }, { "epoch": 0.03333333333333333, "grad_norm": 52.25, "grad_norm_var": 62.21015625, "learning_rate": 0.0001, "loss": 10.6187, "loss/crossentropy": 2.1501222252845764, "loss/hidden": 4.93984375, "loss/jsd": 0.0, "loss/logits": 0.3645794134587049, "step": 1000 }, { "epoch": 0.033666666666666664, "grad_norm": 147.0, "grad_norm_var": 810.01640625, "learning_rate": 0.0001, "loss": 10.7409, "loss/crossentropy": 2.265306806564331, "loss/hidden": 4.85390625, "loss/jsd": 0.0, "loss/logits": 0.3604784071445465, "step": 1010 }, { "epoch": 0.034, "grad_norm": 75.0, "grad_norm_var": 562.55, "learning_rate": 0.0001, "loss": 10.6671, "loss/crossentropy": 2.1804181709885597, "loss/hidden": 4.74765625, "loss/jsd": 0.0, "loss/logits": 0.3643572922796011, "step": 1020 }, { "epoch": 0.034333333333333334, "grad_norm": 72.0, "grad_norm_var": 76.40598958333334, "learning_rate": 0.0001, "loss": 10.7713, "loss/crossentropy": 2.3172024488449097, "loss/hidden": 4.83671875, "loss/jsd": 0.0, "loss/logits": 0.37977495454251764, "step": 1030 }, { "epoch": 0.034666666666666665, "grad_norm": 57.25, "grad_norm_var": 179.57265625, "learning_rate": 0.0001, "loss": 10.7404, "loss/crossentropy": 2.346377784013748, "loss/hidden": 4.78046875, "loss/jsd": 0.0, "loss/logits": 0.36581903472542765, "step": 1040 }, { "epoch": 0.035, "grad_norm": 50.0, "grad_norm_var": 17.880989583333335, "learning_rate": 0.0001, "loss": 10.6162, "loss/crossentropy": 2.27249199450016, "loss/hidden": 4.8953125, "loss/jsd": 0.0, "loss/logits": 0.35631081983447077, "step": 1050 }, { "epoch": 0.035333333333333335, "grad_norm": 58.25, "grad_norm_var": 37.270572916666666, "learning_rate": 0.0001, "loss": 10.6205, "loss/crossentropy": 2.3702693939208985, "loss/hidden": 4.74921875, "loss/jsd": 0.0, "loss/logits": 0.3705620352178812, "step": 1060 }, { "epoch": 0.035666666666666666, "grad_norm": 59.75, "grad_norm_var": 36.85, "learning_rate": 0.0001, "loss": 10.6335, "loss/crossentropy": 2.2196707010269163, "loss/hidden": 4.776953125, "loss/jsd": 0.0, "loss/logits": 0.3813734740018845, "step": 1070 }, { "epoch": 0.036, "grad_norm": 71.5, "grad_norm_var": 35.77682291666667, "learning_rate": 0.0001, "loss": 10.6103, "loss/crossentropy": 2.0884764015674593, "loss/hidden": 4.862109375, "loss/jsd": 0.0, "loss/logits": 0.3436795238405466, "step": 1080 }, { "epoch": 0.036333333333333336, "grad_norm": 57.5, "grad_norm_var": 30.591666666666665, "learning_rate": 0.0001, "loss": 10.6457, "loss/crossentropy": 2.2101589158177375, "loss/hidden": 4.82109375, "loss/jsd": 0.0, "loss/logits": 0.3618529438972473, "step": 1090 }, { "epoch": 0.03666666666666667, "grad_norm": 55.0, "grad_norm_var": 23.695572916666666, "learning_rate": 0.0001, "loss": 10.6528, "loss/crossentropy": 2.122966104745865, "loss/hidden": 4.823828125, "loss/jsd": 0.0, "loss/logits": 0.3658104814589024, "step": 1100 }, { "epoch": 0.037, "grad_norm": 81.0, "grad_norm_var": 10727.665625, "learning_rate": 0.0001, "loss": 10.5432, "loss/crossentropy": 2.0415431298315525, "loss/hidden": 4.83515625, "loss/jsd": 0.0, "loss/logits": 0.3341557715088129, "step": 1110 }, { "epoch": 0.037333333333333336, "grad_norm": 50.0, "grad_norm_var": 98.640625, "learning_rate": 0.0001, "loss": 10.6684, "loss/crossentropy": 2.3639739483594893, "loss/hidden": 4.715234375, "loss/jsd": 0.0, "loss/logits": 0.3543642643839121, "step": 1120 }, { "epoch": 0.03766666666666667, "grad_norm": 51.25, "grad_norm_var": 404.8833333333333, "learning_rate": 0.0001, "loss": 10.5142, "loss/crossentropy": 2.2790277168154716, "loss/hidden": 4.78203125, "loss/jsd": 0.0, "loss/logits": 0.335112139955163, "step": 1130 }, { "epoch": 0.038, "grad_norm": 66.0, "grad_norm_var": 80.90729166666667, "learning_rate": 0.0001, "loss": 10.5643, "loss/crossentropy": 2.3700348407030107, "loss/hidden": 4.6640625, "loss/jsd": 0.0, "loss/logits": 0.35619163103401663, "step": 1140 }, { "epoch": 0.03833333333333333, "grad_norm": 52.0, "grad_norm_var": 128.78932291666666, "learning_rate": 0.0001, "loss": 10.547, "loss/crossentropy": 2.2056336015462876, "loss/hidden": 4.8265625, "loss/jsd": 0.0, "loss/logits": 0.3479484971612692, "step": 1150 }, { "epoch": 0.03866666666666667, "grad_norm": 57.75, "grad_norm_var": 47.465625, "learning_rate": 0.0001, "loss": 10.459, "loss/crossentropy": 2.33939877897501, "loss/hidden": 4.743359375, "loss/jsd": 0.0, "loss/logits": 0.3586887318640947, "step": 1160 }, { "epoch": 0.039, "grad_norm": 44.5, "grad_norm_var": 43.43515625, "learning_rate": 0.0001, "loss": 10.3386, "loss/crossentropy": 2.1996493458747866, "loss/hidden": 4.859375, "loss/jsd": 0.0, "loss/logits": 0.35360681600868704, "step": 1170 }, { "epoch": 0.03933333333333333, "grad_norm": 63.0, "grad_norm_var": 40115.25390625, "learning_rate": 0.0001, "loss": 10.4938, "loss/crossentropy": 2.345403802394867, "loss/hidden": 4.74375, "loss/jsd": 0.0, "loss/logits": 0.3767122995108366, "step": 1180 }, { "epoch": 0.03966666666666667, "grad_norm": 55.0, "grad_norm_var": 39818.41223958333, "learning_rate": 0.0001, "loss": 10.633, "loss/crossentropy": 2.179220561683178, "loss/hidden": 4.80234375, "loss/jsd": 0.0, "loss/logits": 0.36714412793517115, "step": 1190 }, { "epoch": 0.04, "grad_norm": 59.0, "grad_norm_var": 87.50416666666666, "learning_rate": 0.0001, "loss": 10.464, "loss/crossentropy": 2.324831709265709, "loss/hidden": 4.62578125, "loss/jsd": 0.0, "loss/logits": 0.3588165879249573, "step": 1200 }, { "epoch": 0.04033333333333333, "grad_norm": 52.5, "grad_norm_var": 22.1625, "learning_rate": 0.0001, "loss": 10.4303, "loss/crossentropy": 2.3655824601650237, "loss/hidden": 4.5671875, "loss/jsd": 0.0, "loss/logits": 0.34746520072221754, "step": 1210 }, { "epoch": 0.04066666666666666, "grad_norm": 59.0, "grad_norm_var": 25.076822916666668, "learning_rate": 0.0001, "loss": 10.3775, "loss/crossentropy": 2.3404723912477494, "loss/hidden": 4.560546875, "loss/jsd": 0.0, "loss/logits": 0.35012954138219354, "step": 1220 }, { "epoch": 0.041, "grad_norm": 55.75, "grad_norm_var": 33.32057291666667, "learning_rate": 0.0001, "loss": 10.2275, "loss/crossentropy": 2.3334707781672477, "loss/hidden": 4.620703125, "loss/jsd": 0.0, "loss/logits": 0.33375904690474273, "step": 1230 }, { "epoch": 0.04133333333333333, "grad_norm": 54.25, "grad_norm_var": 56.390625, "learning_rate": 0.0001, "loss": 10.2067, "loss/crossentropy": 2.2778391599655152, "loss/hidden": 4.7, "loss/jsd": 0.0, "loss/logits": 0.36180679500102997, "step": 1240 }, { "epoch": 0.041666666666666664, "grad_norm": 48.0, "grad_norm_var": 58.657291666666666, "learning_rate": 0.0001, "loss": 10.3731, "loss/crossentropy": 2.1620317712426185, "loss/hidden": 4.6765625, "loss/jsd": 0.0, "loss/logits": 0.3339271958917379, "step": 1250 }, { "epoch": 0.042, "grad_norm": 60.75, "grad_norm_var": 38.33932291666667, "learning_rate": 0.0001, "loss": 10.356, "loss/crossentropy": 2.1490842700004578, "loss/hidden": 4.6578125, "loss/jsd": 0.0, "loss/logits": 0.3494755119085312, "step": 1260 }, { "epoch": 0.042333333333333334, "grad_norm": 54.0, "grad_norm_var": 30.624739583333334, "learning_rate": 0.0001, "loss": 10.3963, "loss/crossentropy": 2.255769196152687, "loss/hidden": 4.75390625, "loss/jsd": 0.0, "loss/logits": 0.352866580337286, "step": 1270 }, { "epoch": 0.042666666666666665, "grad_norm": 46.25, "grad_norm_var": 41.215625, "learning_rate": 0.0001, "loss": 10.3482, "loss/crossentropy": 2.188653063029051, "loss/hidden": 4.7625, "loss/jsd": 0.0, "loss/logits": 0.35777630396187304, "step": 1280 }, { "epoch": 0.043, "grad_norm": 49.5, "grad_norm_var": 140.48333333333332, "learning_rate": 0.0001, "loss": 10.4993, "loss/crossentropy": 2.2085324838757514, "loss/hidden": 4.764453125, "loss/jsd": 0.0, "loss/logits": 0.35029419548809526, "step": 1290 }, { "epoch": 0.043333333333333335, "grad_norm": 68.5, "grad_norm_var": 87.10807291666667, "learning_rate": 0.0001, "loss": 10.387, "loss/crossentropy": 2.0001343421638014, "loss/hidden": 4.740625, "loss/jsd": 0.0, "loss/logits": 0.3210140850394964, "step": 1300 }, { "epoch": 0.043666666666666666, "grad_norm": 53.0, "grad_norm_var": 63.59348958333333, "learning_rate": 0.0001, "loss": 10.4462, "loss/crossentropy": 2.261654701828957, "loss/hidden": 4.64765625, "loss/jsd": 0.0, "loss/logits": 0.3344265431165695, "step": 1310 }, { "epoch": 0.044, "grad_norm": 51.25, "grad_norm_var": 141.92395833333333, "learning_rate": 0.0001, "loss": 10.3567, "loss/crossentropy": 2.2908728308975697, "loss/hidden": 4.7203125, "loss/jsd": 0.0, "loss/logits": 0.3544796362519264, "step": 1320 }, { "epoch": 0.044333333333333336, "grad_norm": 55.75, "grad_norm_var": 95.390625, "learning_rate": 0.0001, "loss": 10.2347, "loss/crossentropy": 2.342694191634655, "loss/hidden": 4.574609375, "loss/jsd": 0.0, "loss/logits": 0.33147499468177555, "step": 1330 }, { "epoch": 0.04466666666666667, "grad_norm": 63.0, "grad_norm_var": 143.86848958333334, "learning_rate": 0.0001, "loss": 10.2828, "loss/crossentropy": 2.1463702358305454, "loss/hidden": 4.641796875, "loss/jsd": 0.0, "loss/logits": 0.3379510557278991, "step": 1340 }, { "epoch": 0.045, "grad_norm": 58.0, "grad_norm_var": 145.73333333333332, "learning_rate": 0.0001, "loss": 10.4051, "loss/crossentropy": 2.1320372141897677, "loss/hidden": 4.6515625, "loss/jsd": 0.0, "loss/logits": 0.3248579815030098, "step": 1350 }, { "epoch": 0.04533333333333334, "grad_norm": 50.5, "grad_norm_var": 1564.9205729166667, "learning_rate": 0.0001, "loss": 10.3519, "loss/crossentropy": 2.390837848186493, "loss/hidden": 4.482421875, "loss/jsd": 0.0, "loss/logits": 0.33150205537676813, "step": 1360 }, { "epoch": 0.04566666666666667, "grad_norm": 57.0, "grad_norm_var": 86.76640625, "learning_rate": 0.0001, "loss": 10.2536, "loss/crossentropy": 2.238551476597786, "loss/hidden": 4.677734375, "loss/jsd": 0.0, "loss/logits": 0.32355707660317423, "step": 1370 }, { "epoch": 0.046, "grad_norm": 54.0, "grad_norm_var": 33.41848958333333, "learning_rate": 0.0001, "loss": 10.2308, "loss/crossentropy": 2.172594637423754, "loss/hidden": 4.6640625, "loss/jsd": 0.0, "loss/logits": 0.31351518500596287, "step": 1380 }, { "epoch": 0.04633333333333333, "grad_norm": 54.75, "grad_norm_var": 37.065625, "learning_rate": 0.0001, "loss": 10.1326, "loss/crossentropy": 2.299109023809433, "loss/hidden": 4.53359375, "loss/jsd": 0.0, "loss/logits": 0.3222783401608467, "step": 1390 }, { "epoch": 0.04666666666666667, "grad_norm": 55.5, "grad_norm_var": 97.47916666666667, "learning_rate": 0.0001, "loss": 10.4755, "loss/crossentropy": 2.1733594447374345, "loss/hidden": 4.76171875, "loss/jsd": 0.0, "loss/logits": 0.3652454580180347, "step": 1400 }, { "epoch": 0.047, "grad_norm": 54.25, "grad_norm_var": 66.57395833333334, "learning_rate": 0.0001, "loss": 10.3266, "loss/crossentropy": 2.2273528560996056, "loss/hidden": 4.498828125, "loss/jsd": 0.0, "loss/logits": 0.30771929658949376, "step": 1410 }, { "epoch": 0.04733333333333333, "grad_norm": 51.75, "grad_norm_var": 83.07395833333334, "learning_rate": 0.0001, "loss": 10.2202, "loss/crossentropy": 2.286280909180641, "loss/hidden": 4.652734375, "loss/jsd": 0.0, "loss/logits": 0.34912221878767014, "step": 1420 }, { "epoch": 0.04766666666666667, "grad_norm": 54.0, "grad_norm_var": 236.29557291666666, "learning_rate": 0.0001, "loss": 10.2577, "loss/crossentropy": 2.1812940359115602, "loss/hidden": 4.534765625, "loss/jsd": 0.0, "loss/logits": 0.33046910390257833, "step": 1430 }, { "epoch": 0.048, "grad_norm": 53.25, "grad_norm_var": 62.09765625, "learning_rate": 0.0001, "loss": 10.0937, "loss/crossentropy": 2.263388830423355, "loss/hidden": 4.46640625, "loss/jsd": 0.0, "loss/logits": 0.3291785676032305, "step": 1440 }, { "epoch": 0.04833333333333333, "grad_norm": 57.75, "grad_norm_var": 30.408072916666665, "learning_rate": 0.0001, "loss": 10.1197, "loss/crossentropy": 2.3318180561065676, "loss/hidden": 4.530078125, "loss/jsd": 0.0, "loss/logits": 0.32444748654961586, "step": 1450 }, { "epoch": 0.048666666666666664, "grad_norm": 46.25, "grad_norm_var": 51.42265625, "learning_rate": 0.0001, "loss": 10.2713, "loss/crossentropy": 2.0284368000924586, "loss/hidden": 4.673828125, "loss/jsd": 0.0, "loss/logits": 0.3348676819354296, "step": 1460 }, { "epoch": 0.049, "grad_norm": 47.5, "grad_norm_var": 12.757291666666667, "learning_rate": 0.0001, "loss": 10.1257, "loss/crossentropy": 2.2734193384647368, "loss/hidden": 4.6890625, "loss/jsd": 0.0, "loss/logits": 0.3383447080850601, "step": 1470 }, { "epoch": 0.04933333333333333, "grad_norm": 48.5, "grad_norm_var": 10.7625, "learning_rate": 0.0001, "loss": 10.1081, "loss/crossentropy": 2.1734183013439177, "loss/hidden": 4.680078125, "loss/jsd": 0.0, "loss/logits": 0.3549729684367776, "step": 1480 }, { "epoch": 0.049666666666666665, "grad_norm": 52.75, "grad_norm_var": 25.864322916666666, "learning_rate": 0.0001, "loss": 10.0498, "loss/crossentropy": 2.125520133972168, "loss/hidden": 4.56640625, "loss/jsd": 0.0, "loss/logits": 0.33357073105871676, "step": 1490 }, { "epoch": 0.05, "grad_norm": 41.75, "grad_norm_var": 39.50182291666667, "learning_rate": 0.0001, "loss": 10.0335, "loss/crossentropy": 1.9868148412555455, "loss/hidden": 4.60703125, "loss/jsd": 0.0, "loss/logits": 0.3306591158732772, "step": 1500 }, { "epoch": 0.050333333333333334, "grad_norm": 54.0, "grad_norm_var": 25.6875, "learning_rate": 0.0001, "loss": 10.2335, "loss/crossentropy": 2.1235173836350443, "loss/hidden": 4.740234375, "loss/jsd": 0.0, "loss/logits": 0.3614276949316263, "step": 1510 }, { "epoch": 0.050666666666666665, "grad_norm": 51.75, "grad_norm_var": 32.75416666666667, "learning_rate": 0.0001, "loss": 10.2879, "loss/crossentropy": 2.2561273902654646, "loss/hidden": 4.503125, "loss/jsd": 0.0, "loss/logits": 0.32090449519455433, "step": 1520 }, { "epoch": 0.051, "grad_norm": 47.75, "grad_norm_var": 37.532291666666666, "learning_rate": 0.0001, "loss": 10.0812, "loss/crossentropy": 2.144378663599491, "loss/hidden": 4.47890625, "loss/jsd": 0.0, "loss/logits": 0.29765736870467663, "step": 1530 }, { "epoch": 0.051333333333333335, "grad_norm": 46.5, "grad_norm_var": 24.870833333333334, "learning_rate": 0.0001, "loss": 10.0522, "loss/crossentropy": 2.2584436416625975, "loss/hidden": 4.6109375, "loss/jsd": 0.0, "loss/logits": 0.34671921730041505, "step": 1540 }, { "epoch": 0.051666666666666666, "grad_norm": 61.75, "grad_norm_var": 32.15807291666667, "learning_rate": 0.0001, "loss": 9.9727, "loss/crossentropy": 2.290890319645405, "loss/hidden": 4.437109375, "loss/jsd": 0.0, "loss/logits": 0.33176828771829603, "step": 1550 }, { "epoch": 0.052, "grad_norm": 52.5, "grad_norm_var": 23.875, "learning_rate": 0.0001, "loss": 10.0301, "loss/crossentropy": 2.1280356660485267, "loss/hidden": 4.634375, "loss/jsd": 0.0, "loss/logits": 0.30865246467292307, "step": 1560 }, { "epoch": 0.052333333333333336, "grad_norm": 46.25, "grad_norm_var": 23.5875, "learning_rate": 0.0001, "loss": 10.1737, "loss/crossentropy": 2.032886290922761, "loss/hidden": 4.7875, "loss/jsd": 0.0, "loss/logits": 0.3365877510979772, "step": 1570 }, { "epoch": 0.05266666666666667, "grad_norm": 5100273664.0, "grad_norm_var": 1.6257994331790162e+18, "learning_rate": 0.0001, "loss": 10.0954, "loss/crossentropy": 2.1190722532570363, "loss/hidden": 4.566015625, "loss/jsd": 0.0, "loss/logits": 0.3125073878094554, "step": 1580 }, { "epoch": 0.053, "grad_norm": 48.0, "grad_norm_var": 1.6257994343053266e+18, "learning_rate": 0.0001, "loss": 10.2018, "loss/crossentropy": 2.222577328979969, "loss/hidden": 4.548828125, "loss/jsd": 0.0, "loss/logits": 0.31913691386580467, "step": 1590 }, { "epoch": 0.05333333333333334, "grad_norm": 53.5, "grad_norm_var": 34.01015625, "learning_rate": 0.0001, "loss": 10.0954, "loss/crossentropy": 2.093307490646839, "loss/hidden": 4.612109375, "loss/jsd": 0.0, "loss/logits": 0.31212261263281105, "step": 1600 }, { "epoch": 0.05366666666666667, "grad_norm": 48.75, "grad_norm_var": 35.86015625, "learning_rate": 0.0001, "loss": 10.0662, "loss/crossentropy": 2.234019846469164, "loss/hidden": 4.5765625, "loss/jsd": 0.0, "loss/logits": 0.31956249102950096, "step": 1610 }, { "epoch": 0.054, "grad_norm": 46.5, "grad_norm_var": 8.089322916666667, "learning_rate": 0.0001, "loss": 10.1413, "loss/crossentropy": 2.334869381785393, "loss/hidden": 4.3875, "loss/jsd": 0.0, "loss/logits": 0.31581548042595387, "step": 1620 }, { "epoch": 0.05433333333333333, "grad_norm": 47.75, "grad_norm_var": 9.44140625, "learning_rate": 0.0001, "loss": 10.1052, "loss/crossentropy": 2.3633588939905166, "loss/hidden": 4.56875, "loss/jsd": 0.0, "loss/logits": 0.34331442005932333, "step": 1630 }, { "epoch": 0.05466666666666667, "grad_norm": 53.0, "grad_norm_var": 90.81640625, "learning_rate": 0.0001, "loss": 10.168, "loss/crossentropy": 2.4216663956642153, "loss/hidden": 4.409765625, "loss/jsd": 0.0, "loss/logits": 0.3401012416929007, "step": 1640 }, { "epoch": 0.055, "grad_norm": 51.75, "grad_norm_var": 39.29348958333333, "learning_rate": 0.0001, "loss": 10.0832, "loss/crossentropy": 2.063167358934879, "loss/hidden": 4.7140625, "loss/jsd": 0.0, "loss/logits": 0.3456306353211403, "step": 1650 }, { "epoch": 0.05533333333333333, "grad_norm": 46.75, "grad_norm_var": 26.27265625, "learning_rate": 0.0001, "loss": 10.072, "loss/crossentropy": 2.212946060299873, "loss/hidden": 4.4515625, "loss/jsd": 0.0, "loss/logits": 0.3171877060085535, "step": 1660 }, { "epoch": 0.05566666666666667, "grad_norm": 55.0, "grad_norm_var": 28.607291666666665, "learning_rate": 0.0001, "loss": 9.9379, "loss/crossentropy": 2.141632245481014, "loss/hidden": 4.5515625, "loss/jsd": 0.0, "loss/logits": 0.3336161907762289, "step": 1670 }, { "epoch": 0.056, "grad_norm": 50.25, "grad_norm_var": 24.514322916666668, "learning_rate": 0.0001, "loss": 10.1308, "loss/crossentropy": 2.2568211957812307, "loss/hidden": 4.49453125, "loss/jsd": 0.0, "loss/logits": 0.3477105274796486, "step": 1680 }, { "epoch": 0.05633333333333333, "grad_norm": 48.75, "grad_norm_var": 18.523958333333333, "learning_rate": 0.0001, "loss": 9.9396, "loss/crossentropy": 2.2201522469520567, "loss/hidden": 4.54453125, "loss/jsd": 0.0, "loss/logits": 0.34930348955094814, "step": 1690 }, { "epoch": 0.056666666666666664, "grad_norm": 49.75, "grad_norm_var": 18.798958333333335, "learning_rate": 0.0001, "loss": 9.8497, "loss/crossentropy": 2.0173508882522584, "loss/hidden": 4.617578125, "loss/jsd": 0.0, "loss/logits": 0.33352648206055163, "step": 1700 }, { "epoch": 0.057, "grad_norm": 53.5, "grad_norm_var": 53.06848958333333, "learning_rate": 0.0001, "loss": 10.0437, "loss/crossentropy": 2.2075788587331773, "loss/hidden": 4.520703125, "loss/jsd": 0.0, "loss/logits": 0.3259766954928637, "step": 1710 }, { "epoch": 0.05733333333333333, "grad_norm": 47.5, "grad_norm_var": 12.4875, "learning_rate": 0.0001, "loss": 10.0156, "loss/crossentropy": 2.2491456001996992, "loss/hidden": 4.46796875, "loss/jsd": 0.0, "loss/logits": 0.32154099717736245, "step": 1720 }, { "epoch": 0.057666666666666665, "grad_norm": 51.5, "grad_norm_var": 22.698958333333334, "learning_rate": 0.0001, "loss": 10.1127, "loss/crossentropy": 2.2360637068748472, "loss/hidden": 4.599609375, "loss/jsd": 0.0, "loss/logits": 0.34004257917404174, "step": 1730 }, { "epoch": 0.058, "grad_norm": 46.5, "grad_norm_var": 20.483072916666668, "learning_rate": 0.0001, "loss": 9.9202, "loss/crossentropy": 2.169334437698126, "loss/hidden": 4.485546875, "loss/jsd": 0.0, "loss/logits": 0.3213648945093155, "step": 1740 }, { "epoch": 0.058333333333333334, "grad_norm": 47.0, "grad_norm_var": 18.501822916666665, "learning_rate": 0.0001, "loss": 9.9663, "loss/crossentropy": 2.0624472610652447, "loss/hidden": 4.672265625, "loss/jsd": 0.0, "loss/logits": 0.33040957022458317, "step": 1750 }, { "epoch": 0.058666666666666666, "grad_norm": 58.75, "grad_norm_var": 20.214322916666667, "learning_rate": 0.0001, "loss": 9.903, "loss/crossentropy": 2.3360592156648634, "loss/hidden": 4.465625, "loss/jsd": 0.0, "loss/logits": 0.34421659298241136, "step": 1760 }, { "epoch": 0.059, "grad_norm": 52.75, "grad_norm_var": 22.5125, "learning_rate": 0.0001, "loss": 10.0066, "loss/crossentropy": 2.2802910655736923, "loss/hidden": 4.39609375, "loss/jsd": 0.0, "loss/logits": 0.32004439644515514, "step": 1770 }, { "epoch": 0.059333333333333335, "grad_norm": 44.25, "grad_norm_var": 27.908072916666665, "learning_rate": 0.0001, "loss": 9.8284, "loss/crossentropy": 2.3136008724570276, "loss/hidden": 4.3859375, "loss/jsd": 0.0, "loss/logits": 0.31898586712777616, "step": 1780 }, { "epoch": 0.059666666666666666, "grad_norm": 45.75, "grad_norm_var": 33.890625, "learning_rate": 0.0001, "loss": 9.9335, "loss/crossentropy": 2.2768970370292663, "loss/hidden": 4.32265625, "loss/jsd": 0.0, "loss/logits": 0.31204146817326545, "step": 1790 }, { "epoch": 0.06, "grad_norm": 45.5, "grad_norm_var": 22.015625, "learning_rate": 0.0001, "loss": 9.9846, "loss/crossentropy": 2.2742267102003098, "loss/hidden": 4.58046875, "loss/jsd": 0.0, "loss/logits": 0.3438062757253647, "step": 1800 }, { "epoch": 0.060333333333333336, "grad_norm": 44.25, "grad_norm_var": 409.8541666666667, "learning_rate": 0.0001, "loss": 9.9716, "loss/crossentropy": 2.2834268152713775, "loss/hidden": 4.514453125, "loss/jsd": 0.0, "loss/logits": 0.33578878715634347, "step": 1810 }, { "epoch": 0.06066666666666667, "grad_norm": 65.0, "grad_norm_var": 437.6489583333333, "learning_rate": 0.0001, "loss": 10.0204, "loss/crossentropy": 2.2084247410297393, "loss/hidden": 4.514453125, "loss/jsd": 0.0, "loss/logits": 0.3214238926768303, "step": 1820 }, { "epoch": 0.061, "grad_norm": 47.25, "grad_norm_var": 63.00390625, "learning_rate": 0.0001, "loss": 9.8694, "loss/crossentropy": 2.4278147757053374, "loss/hidden": 4.340625, "loss/jsd": 0.0, "loss/logits": 0.3172066226601601, "step": 1830 }, { "epoch": 0.06133333333333333, "grad_norm": 40.0, "grad_norm_var": 51.55807291666667, "learning_rate": 0.0001, "loss": 9.9545, "loss/crossentropy": 2.2956237584352492, "loss/hidden": 4.4421875, "loss/jsd": 0.0, "loss/logits": 0.33112434335052965, "step": 1840 }, { "epoch": 0.06166666666666667, "grad_norm": 44.5, "grad_norm_var": 62.907291666666666, "learning_rate": 0.0001, "loss": 9.9812, "loss/crossentropy": 2.143566229194403, "loss/hidden": 4.630078125, "loss/jsd": 0.0, "loss/logits": 0.3249357048422098, "step": 1850 }, { "epoch": 0.062, "grad_norm": 66.0, "grad_norm_var": 63.49166666666667, "learning_rate": 0.0001, "loss": 10.0621, "loss/crossentropy": 2.112970842421055, "loss/hidden": 4.344140625, "loss/jsd": 0.0, "loss/logits": 0.290663880482316, "step": 1860 }, { "epoch": 0.06233333333333333, "grad_norm": 48.0, "grad_norm_var": 91.80416666666666, "learning_rate": 0.0001, "loss": 9.9348, "loss/crossentropy": 2.1757256247103216, "loss/hidden": 4.416796875, "loss/jsd": 0.0, "loss/logits": 0.315375828742981, "step": 1870 }, { "epoch": 0.06266666666666666, "grad_norm": 48.25, "grad_norm_var": 2.0336566681475924e+18, "learning_rate": 0.0001, "loss": 10.1178, "loss/crossentropy": 2.209875613451004, "loss/hidden": 4.583203125, "loss/jsd": 0.0, "loss/logits": 0.3451205603778362, "step": 1880 }, { "epoch": 0.063, "grad_norm": 38.25, "grad_norm_var": 378.6205729166667, "learning_rate": 0.0001, "loss": 9.8686, "loss/crossentropy": 2.234115143120289, "loss/hidden": 4.44296875, "loss/jsd": 0.0, "loss/logits": 0.31297464594244956, "step": 1890 }, { "epoch": 0.06333333333333334, "grad_norm": 52.0, "grad_norm_var": 30.382291666666667, "learning_rate": 0.0001, "loss": 9.9681, "loss/crossentropy": 2.314877039194107, "loss/hidden": 4.561328125, "loss/jsd": 0.0, "loss/logits": 0.34244176670908927, "step": 1900 }, { "epoch": 0.06366666666666666, "grad_norm": 59.0, "grad_norm_var": 534.7708333333334, "learning_rate": 0.0001, "loss": 10.1051, "loss/crossentropy": 2.148053403198719, "loss/hidden": 4.525, "loss/jsd": 0.0, "loss/logits": 0.3273327838629484, "step": 1910 }, { "epoch": 0.064, "grad_norm": 47.25, "grad_norm_var": 553.11015625, "learning_rate": 0.0001, "loss": 9.9442, "loss/crossentropy": 2.3466587856411936, "loss/hidden": 4.334765625, "loss/jsd": 0.0, "loss/logits": 0.31107306741178037, "step": 1920 }, { "epoch": 0.06433333333333334, "grad_norm": 50.0, "grad_norm_var": 42.541666666666664, "learning_rate": 0.0001, "loss": 9.8719, "loss/crossentropy": 2.252930277585983, "loss/hidden": 4.3234375, "loss/jsd": 0.0, "loss/logits": 0.3164752185344696, "step": 1930 }, { "epoch": 0.06466666666666666, "grad_norm": 46.25, "grad_norm_var": 34.29348958333333, "learning_rate": 0.0001, "loss": 9.7802, "loss/crossentropy": 2.1432655058801173, "loss/hidden": 4.53671875, "loss/jsd": 0.0, "loss/logits": 0.30679955538362264, "step": 1940 }, { "epoch": 0.065, "grad_norm": 40.0, "grad_norm_var": 29.705989583333334, "learning_rate": 0.0001, "loss": 9.8725, "loss/crossentropy": 2.2932953238487244, "loss/hidden": 4.425, "loss/jsd": 0.0, "loss/logits": 0.311083947122097, "step": 1950 }, { "epoch": 0.06533333333333333, "grad_norm": 47.25, "grad_norm_var": 40.08515625, "learning_rate": 0.0001, "loss": 9.8434, "loss/crossentropy": 2.1042870871722696, "loss/hidden": 4.48125, "loss/jsd": 0.0, "loss/logits": 0.31923425998538735, "step": 1960 }, { "epoch": 0.06566666666666666, "grad_norm": 47.0, "grad_norm_var": 32.69583333333333, "learning_rate": 0.0001, "loss": 9.9911, "loss/crossentropy": 2.228940861672163, "loss/hidden": 4.470703125, "loss/jsd": 0.0, "loss/logits": 0.3330340197309852, "step": 1970 }, { "epoch": 0.066, "grad_norm": 49.5, "grad_norm_var": 19.858072916666668, "learning_rate": 0.0001, "loss": 9.8909, "loss/crossentropy": 2.365998923778534, "loss/hidden": 4.4140625, "loss/jsd": 0.0, "loss/logits": 0.3426622122526169, "step": 1980 }, { "epoch": 0.06633333333333333, "grad_norm": 70.0, "grad_norm_var": 43.84166666666667, "learning_rate": 0.0001, "loss": 9.8246, "loss/crossentropy": 2.21089443564415, "loss/hidden": 4.508203125, "loss/jsd": 0.0, "loss/logits": 0.33040032908320427, "step": 1990 }, { "epoch": 0.06666666666666667, "grad_norm": 46.0, "grad_norm_var": 200.29140625, "learning_rate": 0.0001, "loss": 9.9707, "loss/crossentropy": 2.161339648067951, "loss/hidden": 4.59375, "loss/jsd": 0.0, "loss/logits": 0.33537587746977804, "step": 2000 }, { "epoch": 0.067, "grad_norm": 41.0, "grad_norm_var": 34.44166666666667, "learning_rate": 0.0001, "loss": 9.8169, "loss/crossentropy": 2.162997691333294, "loss/hidden": 4.3140625, "loss/jsd": 0.0, "loss/logits": 0.3058626361191273, "step": 2010 }, { "epoch": 0.06733333333333333, "grad_norm": 48.75, "grad_norm_var": 776.290625, "learning_rate": 0.0001, "loss": 9.8411, "loss/crossentropy": 2.1134648233652116, "loss/hidden": 4.4953125, "loss/jsd": 0.0, "loss/logits": 0.3073283813893795, "step": 2020 }, { "epoch": 0.06766666666666667, "grad_norm": 44.0, "grad_norm_var": 17.798958333333335, "learning_rate": 0.0001, "loss": 9.8272, "loss/crossentropy": 2.1669696398079394, "loss/hidden": 4.311328125, "loss/jsd": 0.0, "loss/logits": 0.3024018405005336, "step": 2030 }, { "epoch": 0.068, "grad_norm": 52.75, "grad_norm_var": 39.25, "learning_rate": 0.0001, "loss": 9.8041, "loss/crossentropy": 2.1358415842056275, "loss/hidden": 4.31796875, "loss/jsd": 0.0, "loss/logits": 0.2974515471607447, "step": 2040 }, { "epoch": 0.06833333333333333, "grad_norm": 50.25, "grad_norm_var": 372.12682291666664, "learning_rate": 0.0001, "loss": 9.8634, "loss/crossentropy": 2.179739834368229, "loss/hidden": 4.478125, "loss/jsd": 0.0, "loss/logits": 0.31879689246416093, "step": 2050 }, { "epoch": 0.06866666666666667, "grad_norm": 55.5, "grad_norm_var": 1220.3247395833334, "learning_rate": 0.0001, "loss": 9.7749, "loss/crossentropy": 2.196292628347874, "loss/hidden": 4.4796875, "loss/jsd": 0.0, "loss/logits": 0.3256720818579197, "step": 2060 }, { "epoch": 0.069, "grad_norm": 63.75, "grad_norm_var": 930.6125, "learning_rate": 0.0001, "loss": 9.893, "loss/crossentropy": 2.2043800972402097, "loss/hidden": 4.47421875, "loss/jsd": 0.0, "loss/logits": 0.3335278692655265, "step": 2070 }, { "epoch": 0.06933333333333333, "grad_norm": 44.75, "grad_norm_var": 79.98723958333333, "learning_rate": 0.0001, "loss": 9.8999, "loss/crossentropy": 2.3483674988150596, "loss/hidden": 4.380078125, "loss/jsd": 0.0, "loss/logits": 0.33991905823349955, "step": 2080 }, { "epoch": 0.06966666666666667, "grad_norm": 47.0, "grad_norm_var": 29.093489583333334, "learning_rate": 0.0001, "loss": 9.9509, "loss/crossentropy": 2.418730080127716, "loss/hidden": 4.301953125, "loss/jsd": 0.0, "loss/logits": 0.31339589357376096, "step": 2090 }, { "epoch": 0.07, "grad_norm": 42.0, "grad_norm_var": 20.145572916666666, "learning_rate": 0.0001, "loss": 9.8714, "loss/crossentropy": 2.3859033226966857, "loss/hidden": 4.46953125, "loss/jsd": 0.0, "loss/logits": 0.33473276533186436, "step": 2100 }, { "epoch": 0.07033333333333333, "grad_norm": 45.75, "grad_norm_var": 18.748958333333334, "learning_rate": 0.0001, "loss": 9.6503, "loss/crossentropy": 2.0000314809381963, "loss/hidden": 4.35859375, "loss/jsd": 0.0, "loss/logits": 0.28535501156002285, "step": 2110 }, { "epoch": 0.07066666666666667, "grad_norm": 50.5, "grad_norm_var": 104.64348958333333, "learning_rate": 0.0001, "loss": 9.8653, "loss/crossentropy": 2.3264140084385874, "loss/hidden": 4.554296875, "loss/jsd": 0.0, "loss/logits": 0.3423323597759008, "step": 2120 }, { "epoch": 0.071, "grad_norm": 49.25, "grad_norm_var": 111.43932291666667, "learning_rate": 0.0001, "loss": 9.8327, "loss/crossentropy": 2.2525949284434317, "loss/hidden": 4.52578125, "loss/jsd": 0.0, "loss/logits": 0.3271168455481529, "step": 2130 }, { "epoch": 0.07133333333333333, "grad_norm": 48.25, "grad_norm_var": 34.56848958333333, "learning_rate": 0.0001, "loss": 9.7428, "loss/crossentropy": 2.1882525816559792, "loss/hidden": 4.42734375, "loss/jsd": 0.0, "loss/logits": 0.30069184843450786, "step": 2140 }, { "epoch": 0.07166666666666667, "grad_norm": 38.75, "grad_norm_var": 36.018489583333334, "learning_rate": 0.0001, "loss": 9.6677, "loss/crossentropy": 2.149352750182152, "loss/hidden": 4.32890625, "loss/jsd": 0.0, "loss/logits": 0.30152420345693826, "step": 2150 }, { "epoch": 0.072, "grad_norm": 47.5, "grad_norm_var": 41.665625, "learning_rate": 0.0001, "loss": 9.7964, "loss/crossentropy": 2.191788887232542, "loss/hidden": 4.412890625, "loss/jsd": 0.0, "loss/logits": 0.3204044759273529, "step": 2160 }, { "epoch": 0.07233333333333333, "grad_norm": 43.75, "grad_norm_var": 21.66015625, "learning_rate": 0.0001, "loss": 9.774, "loss/crossentropy": 2.057549092173576, "loss/hidden": 4.312890625, "loss/jsd": 0.0, "loss/logits": 0.29129388704895975, "step": 2170 }, { "epoch": 0.07266666666666667, "grad_norm": 69.5, "grad_norm_var": 1.4591662505790013e+18, "learning_rate": 0.0001, "loss": 9.8565, "loss/crossentropy": 2.1569569408893585, "loss/hidden": 4.54453125, "loss/jsd": 0.0, "loss/logits": 0.3287381026893854, "step": 2180 }, { "epoch": 0.073, "grad_norm": 41.5, "grad_norm_var": 1.459166249522037e+18, "learning_rate": 0.0001, "loss": 9.7733, "loss/crossentropy": 2.166168002039194, "loss/hidden": 4.42109375, "loss/jsd": 0.0, "loss/logits": 0.314485302567482, "step": 2190 }, { "epoch": 0.07333333333333333, "grad_norm": 51.5, "grad_norm_var": 15.843489583333334, "learning_rate": 0.0001, "loss": 10.0454, "loss/crossentropy": 2.2209738835692407, "loss/hidden": 4.4734375, "loss/jsd": 0.0, "loss/logits": 0.3463482726365328, "step": 2200 }, { "epoch": 0.07366666666666667, "grad_norm": 47.25, "grad_norm_var": 25.479166666666668, "learning_rate": 0.0001, "loss": 9.6539, "loss/crossentropy": 2.225794421136379, "loss/hidden": 4.46640625, "loss/jsd": 0.0, "loss/logits": 0.3246209166944027, "step": 2210 }, { "epoch": 0.074, "grad_norm": 48.75, "grad_norm_var": 23.154166666666665, "learning_rate": 0.0001, "loss": 9.9317, "loss/crossentropy": 2.207696130871773, "loss/hidden": 4.442578125, "loss/jsd": 0.0, "loss/logits": 0.33632578104734423, "step": 2220 }, { "epoch": 0.07433333333333333, "grad_norm": 44.0, "grad_norm_var": 17.530989583333334, "learning_rate": 0.0001, "loss": 9.8728, "loss/crossentropy": 1.9358359836041927, "loss/hidden": 4.57578125, "loss/jsd": 0.0, "loss/logits": 0.30893346965312957, "step": 2230 }, { "epoch": 0.07466666666666667, "grad_norm": 41.25, "grad_norm_var": 356.12682291666664, "learning_rate": 0.0001, "loss": 9.7122, "loss/crossentropy": 2.1984036192297935, "loss/hidden": 4.46171875, "loss/jsd": 0.0, "loss/logits": 0.30973851270973685, "step": 2240 }, { "epoch": 0.075, "grad_norm": 48.25, "grad_norm_var": 126.44166666666666, "learning_rate": 0.0001, "loss": 9.9046, "loss/crossentropy": 2.2213550955057144, "loss/hidden": 4.4125, "loss/jsd": 0.0, "loss/logits": 0.32458372712135314, "step": 2250 }, { "epoch": 0.07533333333333334, "grad_norm": 42.75, "grad_norm_var": 140.88515625, "learning_rate": 0.0001, "loss": 9.6263, "loss/crossentropy": 2.2533512063324452, "loss/hidden": 4.326953125, "loss/jsd": 0.0, "loss/logits": 0.2968948673456907, "step": 2260 }, { "epoch": 0.07566666666666666, "grad_norm": 40.75, "grad_norm_var": 30.940625, "learning_rate": 0.0001, "loss": 9.6899, "loss/crossentropy": 2.157477790862322, "loss/hidden": 4.3984375, "loss/jsd": 0.0, "loss/logits": 0.31756968759000304, "step": 2270 }, { "epoch": 0.076, "grad_norm": 40.25, "grad_norm_var": 48.38098958333333, "learning_rate": 0.0001, "loss": 9.767, "loss/crossentropy": 2.1325583457946777, "loss/hidden": 4.4203125, "loss/jsd": 0.0, "loss/logits": 0.3174896206706762, "step": 2280 }, { "epoch": 0.07633333333333334, "grad_norm": 44.0, "grad_norm_var": 292.840625, "learning_rate": 0.0001, "loss": 9.6629, "loss/crossentropy": 2.116827255487442, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.32861895225942134, "step": 2290 }, { "epoch": 0.07666666666666666, "grad_norm": 45.75, "grad_norm_var": 20.098958333333332, "learning_rate": 0.0001, "loss": 9.848, "loss/crossentropy": 2.2478859812021255, "loss/hidden": 4.432421875, "loss/jsd": 0.0, "loss/logits": 0.3231295388191938, "step": 2300 }, { "epoch": 0.077, "grad_norm": 44.5, "grad_norm_var": 21.03515625, "learning_rate": 0.0001, "loss": 9.7126, "loss/crossentropy": 2.2487671941518785, "loss/hidden": 4.370703125, "loss/jsd": 0.0, "loss/logits": 0.3190602418035269, "step": 2310 }, { "epoch": 0.07733333333333334, "grad_norm": 48.0, "grad_norm_var": 78.95, "learning_rate": 0.0001, "loss": 9.8802, "loss/crossentropy": 2.0769578374922277, "loss/hidden": 4.3765625, "loss/jsd": 0.0, "loss/logits": 0.2989305056631565, "step": 2320 }, { "epoch": 0.07766666666666666, "grad_norm": 47.0, "grad_norm_var": 88.0875, "learning_rate": 0.0001, "loss": 9.6776, "loss/crossentropy": 2.2335385888814927, "loss/hidden": 4.439453125, "loss/jsd": 0.0, "loss/logits": 0.3145127721130848, "step": 2330 }, { "epoch": 0.078, "grad_norm": 45.0, "grad_norm_var": 29.162239583333335, "learning_rate": 0.0001, "loss": 9.7538, "loss/crossentropy": 2.0275358721613883, "loss/hidden": 4.390234375, "loss/jsd": 0.0, "loss/logits": 0.3278283253312111, "step": 2340 }, { "epoch": 0.07833333333333334, "grad_norm": 46.0, "grad_norm_var": 15.825, "learning_rate": 0.0001, "loss": 9.7288, "loss/crossentropy": 2.2636055946350098, "loss/hidden": 4.3, "loss/jsd": 0.0, "loss/logits": 0.3025582984089851, "step": 2350 }, { "epoch": 0.07866666666666666, "grad_norm": 44.25, "grad_norm_var": 13.548958333333333, "learning_rate": 0.0001, "loss": 9.5923, "loss/crossentropy": 2.0806369572877883, "loss/hidden": 4.4328125, "loss/jsd": 0.0, "loss/logits": 0.3299530727788806, "step": 2360 }, { "epoch": 0.079, "grad_norm": 48.75, "grad_norm_var": 22.079166666666666, "learning_rate": 0.0001, "loss": 9.5852, "loss/crossentropy": 2.3268432706594466, "loss/hidden": 4.3140625, "loss/jsd": 0.0, "loss/logits": 0.30601568184792993, "step": 2370 }, { "epoch": 0.07933333333333334, "grad_norm": 42.75, "grad_norm_var": 20.670833333333334, "learning_rate": 0.0001, "loss": 9.5338, "loss/crossentropy": 2.1361410327255728, "loss/hidden": 4.1984375, "loss/jsd": 0.0, "loss/logits": 0.27998521625995637, "step": 2380 }, { "epoch": 0.07966666666666666, "grad_norm": 49.25, "grad_norm_var": 15.266666666666667, "learning_rate": 0.0001, "loss": 9.5332, "loss/crossentropy": 2.1129361763596535, "loss/hidden": 4.35390625, "loss/jsd": 0.0, "loss/logits": 0.29650157541036604, "step": 2390 }, { "epoch": 0.08, "grad_norm": 41.5, "grad_norm_var": 25.429166666666667, "learning_rate": 0.0001, "loss": 9.6695, "loss/crossentropy": 2.1574720084667205, "loss/hidden": 4.269921875, "loss/jsd": 0.0, "loss/logits": 0.2908360369503498, "step": 2400 }, { "epoch": 0.08033333333333334, "grad_norm": 47.75, "grad_norm_var": 27.645572916666666, "learning_rate": 0.0001, "loss": 9.6921, "loss/crossentropy": 2.1829341441392898, "loss/hidden": 4.381640625, "loss/jsd": 0.0, "loss/logits": 0.32286781407892706, "step": 2410 }, { "epoch": 0.08066666666666666, "grad_norm": 39.25, "grad_norm_var": 50.555989583333336, "learning_rate": 0.0001, "loss": 9.6315, "loss/crossentropy": 2.202047623693943, "loss/hidden": 4.3578125, "loss/jsd": 0.0, "loss/logits": 0.3005070973187685, "step": 2420 }, { "epoch": 0.081, "grad_norm": 55.0, "grad_norm_var": 59.32083333333333, "learning_rate": 0.0001, "loss": 9.6481, "loss/crossentropy": 2.036600667051971, "loss/hidden": 4.3484375, "loss/jsd": 0.0, "loss/logits": 0.29114639018662275, "step": 2430 }, { "epoch": 0.08133333333333333, "grad_norm": 46.75, "grad_norm_var": 23.999739583333334, "learning_rate": 0.0001, "loss": 9.6458, "loss/crossentropy": 2.2270599991083144, "loss/hidden": 4.397265625, "loss/jsd": 0.0, "loss/logits": 0.29978666119277475, "step": 2440 }, { "epoch": 0.08166666666666667, "grad_norm": 44.25, "grad_norm_var": 12.179166666666667, "learning_rate": 0.0001, "loss": 9.5471, "loss/crossentropy": 2.024305185675621, "loss/hidden": 4.57265625, "loss/jsd": 0.0, "loss/logits": 0.3118948549032211, "step": 2450 }, { "epoch": 0.082, "grad_norm": 44.75, "grad_norm_var": 34.35729166666667, "learning_rate": 0.0001, "loss": 9.6578, "loss/crossentropy": 2.293112243711948, "loss/hidden": 4.36640625, "loss/jsd": 0.0, "loss/logits": 0.3087532136589289, "step": 2460 }, { "epoch": 0.08233333333333333, "grad_norm": 45.5, "grad_norm_var": 54.340625, "learning_rate": 0.0001, "loss": 9.8106, "loss/crossentropy": 2.159098155796528, "loss/hidden": 4.462109375, "loss/jsd": 0.0, "loss/logits": 0.33799122273921967, "step": 2470 }, { "epoch": 0.08266666666666667, "grad_norm": 40.0, "grad_norm_var": 48.490625, "learning_rate": 0.0001, "loss": 9.6198, "loss/crossentropy": 2.1624063357710837, "loss/hidden": 4.332421875, "loss/jsd": 0.0, "loss/logits": 0.2944056583568454, "step": 2480 }, { "epoch": 0.083, "grad_norm": 40.75, "grad_norm_var": 10.579166666666667, "learning_rate": 0.0001, "loss": 9.3685, "loss/crossentropy": 1.9723315440118312, "loss/hidden": 4.365234375, "loss/jsd": 0.0, "loss/logits": 0.2733167437836528, "step": 2490 }, { "epoch": 0.08333333333333333, "grad_norm": 45.5, "grad_norm_var": 13.723958333333334, "learning_rate": 0.0001, "loss": 9.7588, "loss/crossentropy": 2.3331384271383286, "loss/hidden": 4.4953125, "loss/jsd": 0.0, "loss/logits": 0.3479126874357462, "step": 2500 }, { "epoch": 0.08366666666666667, "grad_norm": 86.5, "grad_norm_var": 176.58098958333332, "learning_rate": 0.0001, "loss": 9.604, "loss/crossentropy": 1.999950359016657, "loss/hidden": 4.355078125, "loss/jsd": 0.0, "loss/logits": 0.2928037021309137, "step": 2510 }, { "epoch": 0.084, "grad_norm": 49.75, "grad_norm_var": 139.44140625, "learning_rate": 0.0001, "loss": 9.6797, "loss/crossentropy": 2.1708203181624413, "loss/hidden": 4.387109375, "loss/jsd": 0.0, "loss/logits": 0.32291998714208603, "step": 2520 }, { "epoch": 0.08433333333333333, "grad_norm": 42.25, "grad_norm_var": 8.774739583333334, "learning_rate": 0.0001, "loss": 9.5592, "loss/crossentropy": 2.2025649711489677, "loss/hidden": 4.34453125, "loss/jsd": 0.0, "loss/logits": 0.29643226135522127, "step": 2530 }, { "epoch": 0.08466666666666667, "grad_norm": 46.25, "grad_norm_var": 9.9875, "learning_rate": 0.0001, "loss": 9.5765, "loss/crossentropy": 2.043403333425522, "loss/hidden": 4.23671875, "loss/jsd": 0.0, "loss/logits": 0.2749296611174941, "step": 2540 }, { "epoch": 0.085, "grad_norm": 48.75, "grad_norm_var": 31.115625, "learning_rate": 0.0001, "loss": 9.7494, "loss/crossentropy": 2.3262161046266554, "loss/hidden": 4.371875, "loss/jsd": 0.0, "loss/logits": 0.32624533101916314, "step": 2550 }, { "epoch": 0.08533333333333333, "grad_norm": 48.75, "grad_norm_var": 57.76015625, "learning_rate": 0.0001, "loss": 9.7786, "loss/crossentropy": 2.222296068072319, "loss/hidden": 4.425390625, "loss/jsd": 0.0, "loss/logits": 0.3247109234333038, "step": 2560 }, { "epoch": 0.08566666666666667, "grad_norm": 42.25, "grad_norm_var": 79.12265625, "learning_rate": 0.0001, "loss": 9.5314, "loss/crossentropy": 2.2209610506892203, "loss/hidden": 4.214453125, "loss/jsd": 0.0, "loss/logits": 0.3056815842166543, "step": 2570 }, { "epoch": 0.086, "grad_norm": 40.25, "grad_norm_var": 11.74765625, "learning_rate": 0.0001, "loss": 9.6404, "loss/crossentropy": 2.27297485768795, "loss/hidden": 4.391015625, "loss/jsd": 0.0, "loss/logits": 0.3162984177470207, "step": 2580 }, { "epoch": 0.08633333333333333, "grad_norm": 43.75, "grad_norm_var": 49.432291666666664, "learning_rate": 0.0001, "loss": 9.7363, "loss/crossentropy": 2.1449129566550256, "loss/hidden": 4.16640625, "loss/jsd": 0.0, "loss/logits": 0.2817653050646186, "step": 2590 }, { "epoch": 0.08666666666666667, "grad_norm": 50.25, "grad_norm_var": 38.78307291666667, "learning_rate": 0.0001, "loss": 9.7758, "loss/crossentropy": 2.1352466866374016, "loss/hidden": 4.255078125, "loss/jsd": 0.0, "loss/logits": 0.2975259907543659, "step": 2600 }, { "epoch": 0.087, "grad_norm": 44.0, "grad_norm_var": 567.5809895833333, "learning_rate": 0.0001, "loss": 9.8575, "loss/crossentropy": 2.139151658862829, "loss/hidden": 4.4125, "loss/jsd": 0.0, "loss/logits": 0.2895995612256229, "step": 2610 }, { "epoch": 0.08733333333333333, "grad_norm": 52.0, "grad_norm_var": 273.78932291666666, "learning_rate": 0.0001, "loss": 9.6349, "loss/crossentropy": 2.2437764003872873, "loss/hidden": 4.2875, "loss/jsd": 0.0, "loss/logits": 0.32964606285095216, "step": 2620 }, { "epoch": 0.08766666666666667, "grad_norm": 41.25, "grad_norm_var": 14.832291666666666, "learning_rate": 0.0001, "loss": 9.8805, "loss/crossentropy": 2.3176336243748663, "loss/hidden": 4.34296875, "loss/jsd": 0.0, "loss/logits": 0.3469231605529785, "step": 2630 }, { "epoch": 0.088, "grad_norm": 42.5, "grad_norm_var": 63.924739583333334, "learning_rate": 0.0001, "loss": 9.6733, "loss/crossentropy": 2.3317414090037345, "loss/hidden": 4.4453125, "loss/jsd": 0.0, "loss/logits": 0.3158239943906665, "step": 2640 }, { "epoch": 0.08833333333333333, "grad_norm": 42.0, "grad_norm_var": 47.2625, "learning_rate": 0.0001, "loss": 9.53, "loss/crossentropy": 2.085783836245537, "loss/hidden": 4.154296875, "loss/jsd": 0.0, "loss/logits": 0.30830717273056507, "step": 2650 }, { "epoch": 0.08866666666666667, "grad_norm": 44.75, "grad_norm_var": 10.5125, "learning_rate": 0.0001, "loss": 9.6251, "loss/crossentropy": 2.1499151602387427, "loss/hidden": 4.29765625, "loss/jsd": 0.0, "loss/logits": 0.2804956670850515, "step": 2660 }, { "epoch": 0.089, "grad_norm": 38.5, "grad_norm_var": 11.532291666666667, "learning_rate": 0.0001, "loss": 9.6324, "loss/crossentropy": 2.209009498357773, "loss/hidden": 4.35, "loss/jsd": 0.0, "loss/logits": 0.3170790944248438, "step": 2670 }, { "epoch": 0.08933333333333333, "grad_norm": 39.75, "grad_norm_var": 18.598958333333332, "learning_rate": 0.0001, "loss": 9.5224, "loss/crossentropy": 2.2298269629478455, "loss/hidden": 4.39765625, "loss/jsd": 0.0, "loss/logits": 0.32219739593565466, "step": 2680 }, { "epoch": 0.08966666666666667, "grad_norm": 46.0, "grad_norm_var": 15.04140625, "learning_rate": 0.0001, "loss": 9.6132, "loss/crossentropy": 1.9786877676844596, "loss/hidden": 4.453515625, "loss/jsd": 0.0, "loss/logits": 0.32063512736931443, "step": 2690 }, { "epoch": 0.09, "grad_norm": 48.5, "grad_norm_var": 16.370572916666667, "learning_rate": 0.0001, "loss": 9.5549, "loss/crossentropy": 1.9581148944795133, "loss/hidden": 4.3390625, "loss/jsd": 0.0, "loss/logits": 0.2741738385986537, "step": 2700 }, { "epoch": 0.09033333333333333, "grad_norm": 39.25, "grad_norm_var": 39.25390625, "learning_rate": 0.0001, "loss": 9.584, "loss/crossentropy": 2.3024230673909187, "loss/hidden": 4.284765625, "loss/jsd": 0.0, "loss/logits": 0.31776211857795716, "step": 2710 }, { "epoch": 0.09066666666666667, "grad_norm": 49.0, "grad_norm_var": 17.290625, "learning_rate": 0.0001, "loss": 9.5623, "loss/crossentropy": 2.2617855593562126, "loss/hidden": 4.403515625, "loss/jsd": 0.0, "loss/logits": 0.32258614907041194, "step": 2720 }, { "epoch": 0.091, "grad_norm": 42.5, "grad_norm_var": 13.182291666666666, "learning_rate": 0.0001, "loss": 9.6273, "loss/crossentropy": 2.079073026776314, "loss/hidden": 4.480859375, "loss/jsd": 0.0, "loss/logits": 0.3417033176869154, "step": 2730 }, { "epoch": 0.09133333333333334, "grad_norm": 42.75, "grad_norm_var": 13.691666666666666, "learning_rate": 0.0001, "loss": 9.5346, "loss/crossentropy": 2.0505348153412344, "loss/hidden": 4.206640625, "loss/jsd": 0.0, "loss/logits": 0.2786023462191224, "step": 2740 }, { "epoch": 0.09166666666666666, "grad_norm": 41.0, "grad_norm_var": 14.207291666666666, "learning_rate": 0.0001, "loss": 9.4132, "loss/crossentropy": 2.0336243584752083, "loss/hidden": 4.423828125, "loss/jsd": 0.0, "loss/logits": 0.3069038312882185, "step": 2750 }, { "epoch": 0.092, "grad_norm": 42.25, "grad_norm_var": 22.057291666666668, "learning_rate": 0.0001, "loss": 9.5053, "loss/crossentropy": 2.140151581168175, "loss/hidden": 4.319140625, "loss/jsd": 0.0, "loss/logits": 0.2916400883346796, "step": 2760 }, { "epoch": 0.09233333333333334, "grad_norm": 39.75, "grad_norm_var": 15.432291666666666, "learning_rate": 0.0001, "loss": 9.5438, "loss/crossentropy": 2.1993202224373816, "loss/hidden": 4.226953125, "loss/jsd": 0.0, "loss/logits": 0.28949977159500123, "step": 2770 }, { "epoch": 0.09266666666666666, "grad_norm": 41.75, "grad_norm_var": 12.848958333333334, "learning_rate": 0.0001, "loss": 9.3797, "loss/crossentropy": 2.1902914479374886, "loss/hidden": 4.280859375, "loss/jsd": 0.0, "loss/logits": 0.28133582808077334, "step": 2780 }, { "epoch": 0.093, "grad_norm": 46.25, "grad_norm_var": 19.35, "learning_rate": 0.0001, "loss": 9.5715, "loss/crossentropy": 2.249551972001791, "loss/hidden": 4.199609375, "loss/jsd": 0.0, "loss/logits": 0.2843886561691761, "step": 2790 }, { "epoch": 0.09333333333333334, "grad_norm": 41.0, "grad_norm_var": 20.740625, "learning_rate": 0.0001, "loss": 9.6114, "loss/crossentropy": 2.228690019249916, "loss/hidden": 4.262109375, "loss/jsd": 0.0, "loss/logits": 0.320504542812705, "step": 2800 }, { "epoch": 0.09366666666666666, "grad_norm": 43.0, "grad_norm_var": 8.529166666666667, "learning_rate": 0.0001, "loss": 9.5136, "loss/crossentropy": 2.291595259308815, "loss/hidden": 4.24375, "loss/jsd": 0.0, "loss/logits": 0.3068136487156153, "step": 2810 }, { "epoch": 0.094, "grad_norm": 35.25, "grad_norm_var": 74.32395833333334, "learning_rate": 0.0001, "loss": 9.51, "loss/crossentropy": 2.192962332069874, "loss/hidden": 4.22265625, "loss/jsd": 0.0, "loss/logits": 0.2983078990131617, "step": 2820 }, { "epoch": 0.09433333333333334, "grad_norm": 39.75, "grad_norm_var": 191.76640625, "learning_rate": 0.0001, "loss": 9.5953, "loss/crossentropy": 2.2489975869655607, "loss/hidden": 4.309375, "loss/jsd": 0.0, "loss/logits": 0.3156152920797467, "step": 2830 }, { "epoch": 0.09466666666666666, "grad_norm": 33.75, "grad_norm_var": 51.555989583333336, "learning_rate": 0.0001, "loss": 9.5037, "loss/crossentropy": 2.1171005085110663, "loss/hidden": 4.358984375, "loss/jsd": 0.0, "loss/logits": 0.29232311621308327, "step": 2840 }, { "epoch": 0.095, "grad_norm": 41.25, "grad_norm_var": 15.114322916666667, "learning_rate": 0.0001, "loss": 9.4713, "loss/crossentropy": 2.1562278002500532, "loss/hidden": 4.270703125, "loss/jsd": 0.0, "loss/logits": 0.32199123315513134, "step": 2850 }, { "epoch": 0.09533333333333334, "grad_norm": 50.25, "grad_norm_var": 15.640625, "learning_rate": 0.0001, "loss": 9.4132, "loss/crossentropy": 2.1052547857165336, "loss/hidden": 4.25390625, "loss/jsd": 0.0, "loss/logits": 0.28381253518164157, "step": 2860 }, { "epoch": 0.09566666666666666, "grad_norm": 38.75, "grad_norm_var": 13.95, "learning_rate": 0.0001, "loss": 9.5959, "loss/crossentropy": 2.2764726355671883, "loss/hidden": 4.232421875, "loss/jsd": 0.0, "loss/logits": 0.3202834574505687, "step": 2870 }, { "epoch": 0.096, "grad_norm": 36.5, "grad_norm_var": 6.601822916666666, "learning_rate": 0.0001, "loss": 9.4651, "loss/crossentropy": 2.2641511857509613, "loss/hidden": 4.339453125, "loss/jsd": 0.0, "loss/logits": 0.30131282322108743, "step": 2880 }, { "epoch": 0.09633333333333334, "grad_norm": 39.75, "grad_norm_var": 259.33229166666666, "learning_rate": 0.0001, "loss": 9.5955, "loss/crossentropy": 2.1798644959926605, "loss/hidden": 4.316015625, "loss/jsd": 0.0, "loss/logits": 0.321273997426033, "step": 2890 }, { "epoch": 0.09666666666666666, "grad_norm": 42.25, "grad_norm_var": 265.51015625, "learning_rate": 0.0001, "loss": 9.4789, "loss/crossentropy": 2.2448938064277173, "loss/hidden": 4.21953125, "loss/jsd": 0.0, "loss/logits": 0.295270549505949, "step": 2900 }, { "epoch": 0.097, "grad_norm": 39.5, "grad_norm_var": 22.404166666666665, "learning_rate": 0.0001, "loss": 9.4008, "loss/crossentropy": 2.1135250240564347, "loss/hidden": 4.376953125, "loss/jsd": 0.0, "loss/logits": 0.28801401853561404, "step": 2910 }, { "epoch": 0.09733333333333333, "grad_norm": 40.0, "grad_norm_var": 16.864322916666666, "learning_rate": 0.0001, "loss": 9.3394, "loss/crossentropy": 2.2067424938082696, "loss/hidden": 4.271484375, "loss/jsd": 0.0, "loss/logits": 0.28688893765211104, "step": 2920 }, { "epoch": 0.09766666666666667, "grad_norm": 46.25, "grad_norm_var": 16.148958333333333, "learning_rate": 0.0001, "loss": 9.5709, "loss/crossentropy": 2.375057080388069, "loss/hidden": 4.37734375, "loss/jsd": 0.0, "loss/logits": 0.3258885521441698, "step": 2930 }, { "epoch": 0.098, "grad_norm": 32.25, "grad_norm_var": 21.30390625, "learning_rate": 0.0001, "loss": 9.4697, "loss/crossentropy": 2.2150216475129128, "loss/hidden": 4.350390625, "loss/jsd": 0.0, "loss/logits": 0.301010762155056, "step": 2940 }, { "epoch": 0.09833333333333333, "grad_norm": 40.75, "grad_norm_var": 14.576822916666666, "learning_rate": 0.0001, "loss": 9.5142, "loss/crossentropy": 2.166199879348278, "loss/hidden": 4.278125, "loss/jsd": 0.0, "loss/logits": 0.30978226438164713, "step": 2950 }, { "epoch": 0.09866666666666667, "grad_norm": 43.25, "grad_norm_var": 6.07265625, "learning_rate": 0.0001, "loss": 9.4248, "loss/crossentropy": 2.0727218955755236, "loss/hidden": 4.240234375, "loss/jsd": 0.0, "loss/logits": 0.2816809505224228, "step": 2960 }, { "epoch": 0.099, "grad_norm": 43.0, "grad_norm_var": 9.79765625, "learning_rate": 0.0001, "loss": 9.296, "loss/crossentropy": 2.194628655910492, "loss/hidden": 4.23359375, "loss/jsd": 0.0, "loss/logits": 0.2876921635121107, "step": 2970 }, { "epoch": 0.09933333333333333, "grad_norm": 42.25, "grad_norm_var": 6.145572916666667, "learning_rate": 0.0001, "loss": 9.4016, "loss/crossentropy": 2.1081935077905656, "loss/hidden": 4.201171875, "loss/jsd": 0.0, "loss/logits": 0.2900088790804148, "step": 2980 }, { "epoch": 0.09966666666666667, "grad_norm": 38.75, "grad_norm_var": 5.890625, "learning_rate": 0.0001, "loss": 9.4648, "loss/crossentropy": 2.121137388050556, "loss/hidden": 4.300390625, "loss/jsd": 0.0, "loss/logits": 0.2918519277125597, "step": 2990 }, { "epoch": 0.1, "grad_norm": 44.75, "grad_norm_var": 328.47083333333336, "learning_rate": 0.0001, "loss": 9.6444, "loss/crossentropy": 2.127225194871426, "loss/hidden": 4.36328125, "loss/jsd": 0.0, "loss/logits": 0.30514415316283705, "step": 3000 }, { "epoch": 0.10033333333333333, "grad_norm": 39.5, "grad_norm_var": 307.89348958333335, "learning_rate": 0.0001, "loss": 9.6355, "loss/crossentropy": 2.250686952471733, "loss/hidden": 4.398828125, "loss/jsd": 0.0, "loss/logits": 0.30530925914645196, "step": 3010 }, { "epoch": 0.10066666666666667, "grad_norm": 36.75, "grad_norm_var": 20.974739583333335, "learning_rate": 0.0001, "loss": 9.5967, "loss/crossentropy": 2.182039903104305, "loss/hidden": 4.254296875, "loss/jsd": 0.0, "loss/logits": 0.29223496429622176, "step": 3020 }, { "epoch": 0.101, "grad_norm": 36.25, "grad_norm_var": 19.032291666666666, "learning_rate": 0.0001, "loss": 9.2889, "loss/crossentropy": 2.193696314841509, "loss/hidden": 4.347265625, "loss/jsd": 0.0, "loss/logits": 0.31729465052485467, "step": 3030 }, { "epoch": 0.10133333333333333, "grad_norm": 41.25, "grad_norm_var": 5.1625, "learning_rate": 0.0001, "loss": 9.2799, "loss/crossentropy": 1.9293710552155972, "loss/hidden": 4.2890625, "loss/jsd": 0.0, "loss/logits": 0.29089682549238205, "step": 3040 }, { "epoch": 0.10166666666666667, "grad_norm": 46.75, "grad_norm_var": 94.43932291666667, "learning_rate": 0.0001, "loss": 9.5891, "loss/crossentropy": 2.186572279036045, "loss/hidden": 4.351171875, "loss/jsd": 0.0, "loss/logits": 0.31275569424033167, "step": 3050 }, { "epoch": 0.102, "grad_norm": 39.5, "grad_norm_var": 96.21015625, "learning_rate": 0.0001, "loss": 9.4484, "loss/crossentropy": 2.1623737648129464, "loss/hidden": 4.15390625, "loss/jsd": 0.0, "loss/logits": 0.27935802303254603, "step": 3060 }, { "epoch": 0.10233333333333333, "grad_norm": 34.0, "grad_norm_var": 23.951822916666668, "learning_rate": 0.0001, "loss": 9.3534, "loss/crossentropy": 2.1772946141660214, "loss/hidden": 4.130859375, "loss/jsd": 0.0, "loss/logits": 0.2741739235818386, "step": 3070 }, { "epoch": 0.10266666666666667, "grad_norm": 40.5, "grad_norm_var": 1.9625138775123822e+18, "learning_rate": 0.0001, "loss": 9.5206, "loss/crossentropy": 2.2383458808064463, "loss/hidden": 4.276171875, "loss/jsd": 0.0, "loss/logits": 0.28973059728741646, "step": 3080 }, { "epoch": 0.103, "grad_norm": 40.25, "grad_norm_var": 2.9817910414744924e+18, "learning_rate": 0.0001, "loss": 9.55, "loss/crossentropy": 2.3438141733407973, "loss/hidden": 4.469140625, "loss/jsd": 0.0, "loss/logits": 0.3148787975311279, "step": 3090 }, { "epoch": 0.10333333333333333, "grad_norm": 40.25, "grad_norm_var": 1.2261049715428168e+18, "learning_rate": 0.0001, "loss": 9.5073, "loss/crossentropy": 2.3278582096099854, "loss/hidden": 4.212109375, "loss/jsd": 0.0, "loss/logits": 0.29142517112195493, "step": 3100 }, { "epoch": 0.10366666666666667, "grad_norm": 38.0, "grad_norm_var": 291.4830729166667, "learning_rate": 0.0001, "loss": 9.424, "loss/crossentropy": 2.1446138307452203, "loss/hidden": 4.2796875, "loss/jsd": 0.0, "loss/logits": 0.3030157912522554, "step": 3110 }, { "epoch": 0.104, "grad_norm": 42.5, "grad_norm_var": 567.9125, "learning_rate": 0.0001, "loss": 9.5702, "loss/crossentropy": 2.243132984638214, "loss/hidden": 4.39296875, "loss/jsd": 0.0, "loss/logits": 0.35173722021281717, "step": 3120 }, { "epoch": 0.10433333333333333, "grad_norm": 35.5, "grad_norm_var": 327.62395833333335, "learning_rate": 0.0001, "loss": 9.5972, "loss/crossentropy": 2.124360602349043, "loss/hidden": 4.216015625, "loss/jsd": 0.0, "loss/logits": 0.2907536863349378, "step": 3130 }, { "epoch": 0.10466666666666667, "grad_norm": 52.75, "grad_norm_var": 22.407291666666666, "learning_rate": 0.0001, "loss": 9.437, "loss/crossentropy": 2.2831270948052405, "loss/hidden": 4.153515625, "loss/jsd": 0.0, "loss/logits": 0.291528220102191, "step": 3140 }, { "epoch": 0.105, "grad_norm": 40.0, "grad_norm_var": 1.8926377153833818e+18, "learning_rate": 0.0001, "loss": 9.4417, "loss/crossentropy": 2.0570017248392105, "loss/hidden": 4.233984375, "loss/jsd": 0.0, "loss/logits": 0.28441670089960097, "step": 3150 }, { "epoch": 0.10533333333333333, "grad_norm": 39.75, "grad_norm_var": 7.590625, "learning_rate": 0.0001, "loss": 9.184, "loss/crossentropy": 2.362034395337105, "loss/hidden": 4.20234375, "loss/jsd": 0.0, "loss/logits": 0.30373654775321485, "step": 3160 }, { "epoch": 0.10566666666666667, "grad_norm": 40.75, "grad_norm_var": 9.939322916666667, "learning_rate": 0.0001, "loss": 9.4179, "loss/crossentropy": 2.137139005959034, "loss/hidden": 4.36796875, "loss/jsd": 0.0, "loss/logits": 0.30007231421768665, "step": 3170 }, { "epoch": 0.106, "grad_norm": 43.25, "grad_norm_var": 168.83229166666666, "learning_rate": 0.0001, "loss": 9.5242, "loss/crossentropy": 2.084735092520714, "loss/hidden": 4.266015625, "loss/jsd": 0.0, "loss/logits": 0.29063573814928534, "step": 3180 }, { "epoch": 0.10633333333333334, "grad_norm": 39.75, "grad_norm_var": 13.895833333333334, "learning_rate": 0.0001, "loss": 9.3732, "loss/crossentropy": 2.0884271055459975, "loss/hidden": 4.301953125, "loss/jsd": 0.0, "loss/logits": 0.292556369304657, "step": 3190 }, { "epoch": 0.10666666666666667, "grad_norm": 40.25, "grad_norm_var": 10.104166666666666, "learning_rate": 0.0001, "loss": 9.409, "loss/crossentropy": 2.1861984208226204, "loss/hidden": 4.359375, "loss/jsd": 0.0, "loss/logits": 0.3046982977539301, "step": 3200 }, { "epoch": 0.107, "grad_norm": 33.25, "grad_norm_var": 19.754166666666666, "learning_rate": 0.0001, "loss": 9.333, "loss/crossentropy": 2.1967382043600083, "loss/hidden": 4.280859375, "loss/jsd": 0.0, "loss/logits": 0.3142522796988487, "step": 3210 }, { "epoch": 0.10733333333333334, "grad_norm": 37.75, "grad_norm_var": 15.83515625, "learning_rate": 0.0001, "loss": 9.4087, "loss/crossentropy": 2.099241575598717, "loss/hidden": 4.223046875, "loss/jsd": 0.0, "loss/logits": 0.3019026231020689, "step": 3220 }, { "epoch": 0.10766666666666666, "grad_norm": 36.75, "grad_norm_var": 12.68515625, "learning_rate": 0.0001, "loss": 9.3162, "loss/crossentropy": 2.0056164607405664, "loss/hidden": 4.308984375, "loss/jsd": 0.0, "loss/logits": 0.2895685002207756, "step": 3230 }, { "epoch": 0.108, "grad_norm": 51.75, "grad_norm_var": 25.812239583333334, "learning_rate": 0.0001, "loss": 9.4161, "loss/crossentropy": 2.194224573671818, "loss/hidden": 4.276953125, "loss/jsd": 0.0, "loss/logits": 0.2994342103600502, "step": 3240 }, { "epoch": 0.10833333333333334, "grad_norm": 37.75, "grad_norm_var": 30.448958333333334, "learning_rate": 0.0001, "loss": 9.3774, "loss/crossentropy": 2.1500812068581583, "loss/hidden": 4.316796875, "loss/jsd": 0.0, "loss/logits": 0.297881081327796, "step": 3250 }, { "epoch": 0.10866666666666666, "grad_norm": 50.0, "grad_norm_var": 15.873958333333333, "learning_rate": 0.0001, "loss": 9.3746, "loss/crossentropy": 2.149769604206085, "loss/hidden": 4.337890625, "loss/jsd": 0.0, "loss/logits": 0.2857384353876114, "step": 3260 }, { "epoch": 0.109, "grad_norm": 49.25, "grad_norm_var": 23.241666666666667, "learning_rate": 0.0001, "loss": 9.4298, "loss/crossentropy": 2.1580600261688234, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.2893156711012125, "step": 3270 }, { "epoch": 0.10933333333333334, "grad_norm": 42.0, "grad_norm_var": 14.820572916666666, "learning_rate": 0.0001, "loss": 9.3562, "loss/crossentropy": 2.285036212205887, "loss/hidden": 4.28984375, "loss/jsd": 0.0, "loss/logits": 0.3082501322031021, "step": 3280 }, { "epoch": 0.10966666666666666, "grad_norm": 37.0, "grad_norm_var": 6.114322916666667, "learning_rate": 0.0001, "loss": 9.6223, "loss/crossentropy": 2.2134475603699686, "loss/hidden": 4.16015625, "loss/jsd": 0.0, "loss/logits": 0.28552069552242754, "step": 3290 }, { "epoch": 0.11, "grad_norm": 33.75, "grad_norm_var": 13.50390625, "learning_rate": 0.0001, "loss": 9.3289, "loss/crossentropy": 2.1722410164773462, "loss/hidden": 4.277734375, "loss/jsd": 0.0, "loss/logits": 0.2915722324512899, "step": 3300 }, { "epoch": 0.11033333333333334, "grad_norm": 39.5, "grad_norm_var": 12.824739583333333, "learning_rate": 0.0001, "loss": 9.3712, "loss/crossentropy": 1.9897394858300685, "loss/hidden": 4.20390625, "loss/jsd": 0.0, "loss/logits": 0.2664288356900215, "step": 3310 }, { "epoch": 0.11066666666666666, "grad_norm": 37.25, "grad_norm_var": 13.051822916666667, "learning_rate": 0.0001, "loss": 9.21, "loss/crossentropy": 2.109373450279236, "loss/hidden": 4.2296875, "loss/jsd": 0.0, "loss/logits": 0.2850793283432722, "step": 3320 }, { "epoch": 0.111, "grad_norm": 42.5, "grad_norm_var": 14.47890625, "learning_rate": 0.0001, "loss": 9.2866, "loss/crossentropy": 2.2932792961597444, "loss/hidden": 4.23203125, "loss/jsd": 0.0, "loss/logits": 0.30185060724616053, "step": 3330 }, { "epoch": 0.11133333333333334, "grad_norm": 39.25, "grad_norm_var": 8.583333333333334, "learning_rate": 0.0001, "loss": 9.427, "loss/crossentropy": 2.1133791759610174, "loss/hidden": 4.183984375, "loss/jsd": 0.0, "loss/logits": 0.29095215909183025, "step": 3340 }, { "epoch": 0.11166666666666666, "grad_norm": 36.25, "grad_norm_var": 12.520572916666667, "learning_rate": 0.0001, "loss": 9.3503, "loss/crossentropy": 2.032514417171478, "loss/hidden": 4.228515625, "loss/jsd": 0.0, "loss/logits": 0.2894793044775724, "step": 3350 }, { "epoch": 0.112, "grad_norm": 52.5, "grad_norm_var": 24.51640625, "learning_rate": 0.0001, "loss": 9.3076, "loss/crossentropy": 1.9752195596694946, "loss/hidden": 4.3046875, "loss/jsd": 0.0, "loss/logits": 0.2814787019044161, "step": 3360 }, { "epoch": 0.11233333333333333, "grad_norm": 34.75, "grad_norm_var": 28.245833333333334, "learning_rate": 0.0001, "loss": 9.4876, "loss/crossentropy": 2.3702859327197077, "loss/hidden": 4.1859375, "loss/jsd": 0.0, "loss/logits": 0.3126111339777708, "step": 3370 }, { "epoch": 0.11266666666666666, "grad_norm": 34.5, "grad_norm_var": 20.070572916666666, "learning_rate": 0.0001, "loss": 9.2654, "loss/crossentropy": 1.8932878598570824, "loss/hidden": 4.261328125, "loss/jsd": 0.0, "loss/logits": 0.2764943749643862, "step": 3380 }, { "epoch": 0.113, "grad_norm": 41.0, "grad_norm_var": 11.174739583333333, "learning_rate": 0.0001, "loss": 9.4906, "loss/crossentropy": 2.155991692841053, "loss/hidden": 4.19921875, "loss/jsd": 0.0, "loss/logits": 0.2793617382645607, "step": 3390 }, { "epoch": 0.11333333333333333, "grad_norm": 39.75, "grad_norm_var": 7.01015625, "learning_rate": 0.0001, "loss": 9.3088, "loss/crossentropy": 2.093905381858349, "loss/hidden": 4.49609375, "loss/jsd": 0.0, "loss/logits": 0.3261349702253938, "step": 3400 }, { "epoch": 0.11366666666666667, "grad_norm": 39.75, "grad_norm_var": 46.25, "learning_rate": 0.0001, "loss": 9.486, "loss/crossentropy": 2.1121586173772813, "loss/hidden": 4.215625, "loss/jsd": 0.0, "loss/logits": 0.2922355983406305, "step": 3410 }, { "epoch": 0.114, "grad_norm": 35.5, "grad_norm_var": 5.224739583333333, "learning_rate": 0.0001, "loss": 9.3474, "loss/crossentropy": 2.177844299376011, "loss/hidden": 4.23671875, "loss/jsd": 0.0, "loss/logits": 0.29271903187036513, "step": 3420 }, { "epoch": 0.11433333333333333, "grad_norm": 49.75, "grad_norm_var": 15.948958333333334, "learning_rate": 0.0001, "loss": 9.4016, "loss/crossentropy": 2.2300315856933595, "loss/hidden": 4.13828125, "loss/jsd": 0.0, "loss/logits": 0.29194765314459803, "step": 3430 }, { "epoch": 0.11466666666666667, "grad_norm": 40.75, "grad_norm_var": 20.995833333333334, "learning_rate": 0.0001, "loss": 9.2609, "loss/crossentropy": 2.072411538660526, "loss/hidden": 4.219140625, "loss/jsd": 0.0, "loss/logits": 0.2944797810167074, "step": 3440 }, { "epoch": 0.115, "grad_norm": 35.25, "grad_norm_var": 23.179166666666667, "learning_rate": 0.0001, "loss": 9.3101, "loss/crossentropy": 2.106787271797657, "loss/hidden": 4.253515625, "loss/jsd": 0.0, "loss/logits": 0.2934326458722353, "step": 3450 }, { "epoch": 0.11533333333333333, "grad_norm": 35.0, "grad_norm_var": 21.362239583333334, "learning_rate": 0.0001, "loss": 9.3648, "loss/crossentropy": 2.1898166716098784, "loss/hidden": 4.171484375, "loss/jsd": 0.0, "loss/logits": 0.29939313270151613, "step": 3460 }, { "epoch": 0.11566666666666667, "grad_norm": 41.0, "grad_norm_var": 10.940625, "learning_rate": 0.0001, "loss": 9.2795, "loss/crossentropy": 2.3480966717004774, "loss/hidden": 4.236328125, "loss/jsd": 0.0, "loss/logits": 0.304421117156744, "step": 3470 }, { "epoch": 0.116, "grad_norm": 39.75, "grad_norm_var": 5.11015625, "learning_rate": 0.0001, "loss": 9.361, "loss/crossentropy": 2.061821439862251, "loss/hidden": 4.258203125, "loss/jsd": 0.0, "loss/logits": 0.280943001806736, "step": 3480 }, { "epoch": 0.11633333333333333, "grad_norm": 38.75, "grad_norm_var": 5.082291666666666, "learning_rate": 0.0001, "loss": 9.2472, "loss/crossentropy": 2.089048261940479, "loss/hidden": 4.203515625, "loss/jsd": 0.0, "loss/logits": 0.27816532738506794, "step": 3490 }, { "epoch": 0.11666666666666667, "grad_norm": 39.25, "grad_norm_var": 5.648958333333334, "learning_rate": 0.0001, "loss": 9.3, "loss/crossentropy": 2.0424555987119675, "loss/hidden": 4.278515625, "loss/jsd": 0.0, "loss/logits": 0.2886748146265745, "step": 3500 }, { "epoch": 0.117, "grad_norm": 41.25, "grad_norm_var": 5.230989583333334, "learning_rate": 0.0001, "loss": 9.3444, "loss/crossentropy": 2.2320514231920243, "loss/hidden": 4.188671875, "loss/jsd": 0.0, "loss/logits": 0.29398479498922825, "step": 3510 }, { "epoch": 0.11733333333333333, "grad_norm": 32.0, "grad_norm_var": 14.45, "learning_rate": 0.0001, "loss": 9.352, "loss/crossentropy": 2.0569834411144257, "loss/hidden": 4.1828125, "loss/jsd": 0.0, "loss/logits": 0.28648389838635924, "step": 3520 }, { "epoch": 0.11766666666666667, "grad_norm": 40.5, "grad_norm_var": 15.245572916666667, "learning_rate": 0.0001, "loss": 9.2835, "loss/crossentropy": 2.1193760722875594, "loss/hidden": 4.11171875, "loss/jsd": 0.0, "loss/logits": 0.277429373562336, "step": 3530 }, { "epoch": 0.118, "grad_norm": 40.25, "grad_norm_var": 9.59140625, "learning_rate": 0.0001, "loss": 9.2573, "loss/crossentropy": 2.1044032208621504, "loss/hidden": 4.258984375, "loss/jsd": 0.0, "loss/logits": 0.2949466461315751, "step": 3540 }, { "epoch": 0.11833333333333333, "grad_norm": 43.0, "grad_norm_var": 15.15, "learning_rate": 0.0001, "loss": 9.1153, "loss/crossentropy": 2.07734075859189, "loss/hidden": 4.13203125, "loss/jsd": 0.0, "loss/logits": 0.2726396777667105, "step": 3550 }, { "epoch": 0.11866666666666667, "grad_norm": 41.75, "grad_norm_var": 9.605989583333333, "learning_rate": 0.0001, "loss": 9.189, "loss/crossentropy": 2.196866528689861, "loss/hidden": 4.25546875, "loss/jsd": 0.0, "loss/logits": 0.3004810862243176, "step": 3560 }, { "epoch": 0.119, "grad_norm": 36.75, "grad_norm_var": 47.27682291666667, "learning_rate": 0.0001, "loss": 9.089, "loss/crossentropy": 2.297177466750145, "loss/hidden": 4.2296875, "loss/jsd": 0.0, "loss/logits": 0.3052255652844906, "step": 3570 }, { "epoch": 0.11933333333333333, "grad_norm": 37.75, "grad_norm_var": 321.41432291666666, "learning_rate": 0.0001, "loss": 9.2827, "loss/crossentropy": 2.129167598485947, "loss/hidden": 4.31171875, "loss/jsd": 0.0, "loss/logits": 0.304591753706336, "step": 3580 }, { "epoch": 0.11966666666666667, "grad_norm": 40.25, "grad_norm_var": 345.54973958333335, "learning_rate": 0.0001, "loss": 9.3048, "loss/crossentropy": 2.1689872413873674, "loss/hidden": 4.12578125, "loss/jsd": 0.0, "loss/logits": 0.27596075274050236, "step": 3590 }, { "epoch": 0.12, "grad_norm": 40.0, "grad_norm_var": 17.140625, "learning_rate": 0.0001, "loss": 9.2268, "loss/crossentropy": 1.9578171581029893, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.28636636175215247, "step": 3600 }, { "epoch": 0.12033333333333333, "grad_norm": 41.0, "grad_norm_var": 68.175, "learning_rate": 0.0001, "loss": 9.1787, "loss/crossentropy": 2.067359810322523, "loss/hidden": 4.250390625, "loss/jsd": 0.0, "loss/logits": 0.28545970730483533, "step": 3610 }, { "epoch": 0.12066666666666667, "grad_norm": 32.75, "grad_norm_var": 29.920572916666668, "learning_rate": 0.0001, "loss": 9.2359, "loss/crossentropy": 2.2047273397445677, "loss/hidden": 4.180078125, "loss/jsd": 0.0, "loss/logits": 0.28760180845856664, "step": 3620 }, { "epoch": 0.121, "grad_norm": 38.25, "grad_norm_var": 13.314322916666667, "learning_rate": 0.0001, "loss": 9.0908, "loss/crossentropy": 2.1291835106909276, "loss/hidden": 4.183984375, "loss/jsd": 0.0, "loss/logits": 0.2917652137577534, "step": 3630 }, { "epoch": 0.12133333333333333, "grad_norm": 41.25, "grad_norm_var": 11.356184895833334, "learning_rate": 0.0001, "loss": 9.2854, "loss/crossentropy": 2.2793887823820116, "loss/hidden": 4.24765625, "loss/jsd": 0.0, "loss/logits": 0.30008267536759375, "step": 3640 }, { "epoch": 0.12166666666666667, "grad_norm": 36.5, "grad_norm_var": 56.16920572916667, "learning_rate": 0.0001, "loss": 9.2821, "loss/crossentropy": 2.2533405125141144, "loss/hidden": 4.32109375, "loss/jsd": 0.0, "loss/logits": 0.30765612684190274, "step": 3650 }, { "epoch": 0.122, "grad_norm": 42.5, "grad_norm_var": 53.973958333333336, "learning_rate": 0.0001, "loss": 9.2816, "loss/crossentropy": 2.294741216301918, "loss/hidden": 4.243359375, "loss/jsd": 0.0, "loss/logits": 0.29128187969326974, "step": 3660 }, { "epoch": 0.12233333333333334, "grad_norm": 37.0, "grad_norm_var": 18.740625, "learning_rate": 0.0001, "loss": 9.2943, "loss/crossentropy": 2.073512817919254, "loss/hidden": 4.188671875, "loss/jsd": 0.0, "loss/logits": 0.2968620590865612, "step": 3670 }, { "epoch": 0.12266666666666666, "grad_norm": 43.0, "grad_norm_var": 13.92890625, "learning_rate": 0.0001, "loss": 9.1891, "loss/crossentropy": 2.0943071067333223, "loss/hidden": 4.151171875, "loss/jsd": 0.0, "loss/logits": 0.2801366148516536, "step": 3680 }, { "epoch": 0.123, "grad_norm": 40.5, "grad_norm_var": 11.00390625, "learning_rate": 0.0001, "loss": 9.1903, "loss/crossentropy": 2.0213806182146072, "loss/hidden": 4.13359375, "loss/jsd": 0.0, "loss/logits": 0.27214346677064893, "step": 3690 }, { "epoch": 0.12333333333333334, "grad_norm": 50.0, "grad_norm_var": 1.5413569484339108e+18, "learning_rate": 0.0001, "loss": 9.2765, "loss/crossentropy": 2.103906211256981, "loss/hidden": 4.205078125, "loss/jsd": 0.0, "loss/logits": 0.30408407263457776, "step": 3700 }, { "epoch": 0.12366666666666666, "grad_norm": 34.5, "grad_norm_var": 1.541356947316548e+18, "learning_rate": 0.0001, "loss": 9.2341, "loss/crossentropy": 2.2875989854335783, "loss/hidden": 4.12421875, "loss/jsd": 0.0, "loss/logits": 0.29328348860144615, "step": 3710 }, { "epoch": 0.124, "grad_norm": 40.25, "grad_norm_var": 20.280989583333334, "learning_rate": 0.0001, "loss": 9.3043, "loss/crossentropy": 2.302832932770252, "loss/hidden": 4.191796875, "loss/jsd": 0.0, "loss/logits": 0.30611949125304816, "step": 3720 }, { "epoch": 0.12433333333333334, "grad_norm": 31.75, "grad_norm_var": 43.416666666666664, "learning_rate": 0.0001, "loss": 9.1562, "loss/crossentropy": 2.1152502104640005, "loss/hidden": 4.129296875, "loss/jsd": 0.0, "loss/logits": 0.28165129497647284, "step": 3730 }, { "epoch": 0.12466666666666666, "grad_norm": 38.5, "grad_norm_var": 33.608333333333334, "learning_rate": 0.0001, "loss": 9.3006, "loss/crossentropy": 2.320794602483511, "loss/hidden": 4.10390625, "loss/jsd": 0.0, "loss/logits": 0.2863430541008711, "step": 3740 }, { "epoch": 0.125, "grad_norm": 34.5, "grad_norm_var": 6.52890625, "learning_rate": 0.0001, "loss": 9.275, "loss/crossentropy": 2.0785109654068945, "loss/hidden": 4.22265625, "loss/jsd": 0.0, "loss/logits": 0.3000879239290953, "step": 3750 }, { "epoch": 0.12533333333333332, "grad_norm": 34.25, "grad_norm_var": 22.823958333333334, "learning_rate": 0.0001, "loss": 9.3243, "loss/crossentropy": 2.1262876391410828, "loss/hidden": 4.30859375, "loss/jsd": 0.0, "loss/logits": 0.3010214529931545, "step": 3760 }, { "epoch": 0.12566666666666668, "grad_norm": 37.75, "grad_norm_var": 21.157291666666666, "learning_rate": 0.0001, "loss": 9.2353, "loss/crossentropy": 2.2993004634976386, "loss/hidden": 4.254296875, "loss/jsd": 0.0, "loss/logits": 0.3304103210568428, "step": 3770 }, { "epoch": 0.126, "grad_norm": 33.0, "grad_norm_var": 8.33515625, "learning_rate": 0.0001, "loss": 9.2457, "loss/crossentropy": 2.1940866082906725, "loss/hidden": 4.199609375, "loss/jsd": 0.0, "loss/logits": 0.294326201826334, "step": 3780 }, { "epoch": 0.12633333333333333, "grad_norm": 38.5, "grad_norm_var": 135.71015625, "learning_rate": 0.0001, "loss": 9.2567, "loss/crossentropy": 2.13921734392643, "loss/hidden": 4.177734375, "loss/jsd": 0.0, "loss/logits": 0.28696890603750946, "step": 3790 }, { "epoch": 0.12666666666666668, "grad_norm": 36.0, "grad_norm_var": 4.058333333333334, "learning_rate": 0.0001, "loss": 9.0874, "loss/crossentropy": 2.2284460753202437, "loss/hidden": 4.209375, "loss/jsd": 0.0, "loss/logits": 0.3025734366849065, "step": 3800 }, { "epoch": 0.127, "grad_norm": 34.5, "grad_norm_var": 9.889322916666666, "learning_rate": 0.0001, "loss": 9.2163, "loss/crossentropy": 2.299591761827469, "loss/hidden": 4.055859375, "loss/jsd": 0.0, "loss/logits": 0.296408361941576, "step": 3810 }, { "epoch": 0.12733333333333333, "grad_norm": 37.75, "grad_norm_var": 11.239322916666667, "learning_rate": 0.0001, "loss": 9.1761, "loss/crossentropy": 2.2842276841402054, "loss/hidden": 4.11875, "loss/jsd": 0.0, "loss/logits": 0.2857844788581133, "step": 3820 }, { "epoch": 0.12766666666666668, "grad_norm": 42.0, "grad_norm_var": 10.793489583333333, "learning_rate": 0.0001, "loss": 9.2579, "loss/crossentropy": 2.2464538693428038, "loss/hidden": 4.245703125, "loss/jsd": 0.0, "loss/logits": 0.2956688392907381, "step": 3830 }, { "epoch": 0.128, "grad_norm": 35.0, "grad_norm_var": 16.21015625, "learning_rate": 0.0001, "loss": 9.1472, "loss/crossentropy": 2.228955328464508, "loss/hidden": 4.26640625, "loss/jsd": 0.0, "loss/logits": 0.30142183881253004, "step": 3840 }, { "epoch": 0.12833333333333333, "grad_norm": 41.0, "grad_norm_var": 4.840625, "learning_rate": 0.0001, "loss": 9.2928, "loss/crossentropy": 2.223462516069412, "loss/hidden": 4.230078125, "loss/jsd": 0.0, "loss/logits": 0.29133280031383035, "step": 3850 }, { "epoch": 0.12866666666666668, "grad_norm": 36.5, "grad_norm_var": 20.939322916666665, "learning_rate": 0.0001, "loss": 9.0495, "loss/crossentropy": 2.2790828943252563, "loss/hidden": 4.126953125, "loss/jsd": 0.0, "loss/logits": 0.2985575716942549, "step": 3860 }, { "epoch": 0.129, "grad_norm": 40.25, "grad_norm_var": 20.11640625, "learning_rate": 0.0001, "loss": 9.1951, "loss/crossentropy": 2.2769517719745638, "loss/hidden": 4.061328125, "loss/jsd": 0.0, "loss/logits": 0.2779423680156469, "step": 3870 }, { "epoch": 0.12933333333333333, "grad_norm": 32.0, "grad_norm_var": 7.958072916666667, "learning_rate": 0.0001, "loss": 9.1347, "loss/crossentropy": 2.097635033726692, "loss/hidden": 4.102734375, "loss/jsd": 0.0, "loss/logits": 0.2780100252479315, "step": 3880 }, { "epoch": 0.12966666666666668, "grad_norm": 33.25, "grad_norm_var": 8.944205729166667, "learning_rate": 0.0001, "loss": 9.0803, "loss/crossentropy": 2.1658859461545945, "loss/hidden": 4.18359375, "loss/jsd": 0.0, "loss/logits": 0.27850373424589636, "step": 3890 }, { "epoch": 0.13, "grad_norm": 33.25, "grad_norm_var": 10.558268229166666, "learning_rate": 0.0001, "loss": 9.167, "loss/crossentropy": 2.1380916953086855, "loss/hidden": 4.279296875, "loss/jsd": 0.0, "loss/logits": 0.2888460006564856, "step": 3900 }, { "epoch": 0.13033333333333333, "grad_norm": 41.75, "grad_norm_var": 6.151822916666666, "learning_rate": 0.0001, "loss": 9.2188, "loss/crossentropy": 2.246109126508236, "loss/hidden": 4.090234375, "loss/jsd": 0.0, "loss/logits": 0.2838183153420687, "step": 3910 }, { "epoch": 0.13066666666666665, "grad_norm": 38.0, "grad_norm_var": 3.626822916666667, "learning_rate": 0.0001, "loss": 9.0951, "loss/crossentropy": 1.9172726094722747, "loss/hidden": 4.18359375, "loss/jsd": 0.0, "loss/logits": 0.27877611964941024, "step": 3920 }, { "epoch": 0.131, "grad_norm": 36.25, "grad_norm_var": 7.6875, "learning_rate": 0.0001, "loss": 9.123, "loss/crossentropy": 2.216602808237076, "loss/hidden": 4.17421875, "loss/jsd": 0.0, "loss/logits": 0.29801386743783953, "step": 3930 }, { "epoch": 0.13133333333333333, "grad_norm": 37.75, "grad_norm_var": 15.170247395833334, "learning_rate": 0.0001, "loss": 9.0372, "loss/crossentropy": 2.0827317383140325, "loss/hidden": 4.1515625, "loss/jsd": 0.0, "loss/logits": 0.25494101997464896, "step": 3940 }, { "epoch": 0.13166666666666665, "grad_norm": 39.25, "grad_norm_var": 16.780143229166665, "learning_rate": 0.0001, "loss": 9.1713, "loss/crossentropy": 2.0472889255732296, "loss/hidden": 4.125390625, "loss/jsd": 0.0, "loss/logits": 0.2605790663510561, "step": 3950 }, { "epoch": 0.132, "grad_norm": 35.0, "grad_norm_var": 24.373958333333334, "learning_rate": 0.0001, "loss": 9.1236, "loss/crossentropy": 2.2061238437891006, "loss/hidden": 4.06953125, "loss/jsd": 0.0, "loss/logits": 0.27440296970307826, "step": 3960 }, { "epoch": 0.13233333333333333, "grad_norm": 37.75, "grad_norm_var": 17.36015625, "learning_rate": 0.0001, "loss": 9.0615, "loss/crossentropy": 2.067065991461277, "loss/hidden": 4.19609375, "loss/jsd": 0.0, "loss/logits": 0.2724686389788985, "step": 3970 }, { "epoch": 0.13266666666666665, "grad_norm": 35.5, "grad_norm_var": 13.895833333333334, "learning_rate": 0.0001, "loss": 9.2604, "loss/crossentropy": 2.3754075884819033, "loss/hidden": 4.234765625, "loss/jsd": 0.0, "loss/logits": 0.3084482606500387, "step": 3980 }, { "epoch": 0.133, "grad_norm": 36.5, "grad_norm_var": 15.72265625, "learning_rate": 0.0001, "loss": 9.0579, "loss/crossentropy": 2.0445886969566347, "loss/hidden": 4.1140625, "loss/jsd": 0.0, "loss/logits": 0.28532980997115376, "step": 3990 }, { "epoch": 0.13333333333333333, "grad_norm": 38.0, "grad_norm_var": 4.351822916666666, "learning_rate": 0.0001, "loss": 8.9463, "loss/crossentropy": 2.154660718142986, "loss/hidden": 4.159375, "loss/jsd": 0.0, "loss/logits": 0.29097947776317595, "step": 4000 }, { "epoch": 0.13366666666666666, "grad_norm": 39.25, "grad_norm_var": 10.275, "learning_rate": 0.0001, "loss": 9.1226, "loss/crossentropy": 2.0323362797498703, "loss/hidden": 4.0609375, "loss/jsd": 0.0, "loss/logits": 0.25012299232184887, "step": 4010 }, { "epoch": 0.134, "grad_norm": 35.75, "grad_norm_var": 5.273958333333334, "learning_rate": 0.0001, "loss": 9.2453, "loss/crossentropy": 2.1468474209308623, "loss/hidden": 4.19765625, "loss/jsd": 0.0, "loss/logits": 0.2808349967002869, "step": 4020 }, { "epoch": 0.13433333333333333, "grad_norm": 32.0, "grad_norm_var": 21.4875, "learning_rate": 0.0001, "loss": 9.227, "loss/crossentropy": 2.1914032608270646, "loss/hidden": 4.169140625, "loss/jsd": 0.0, "loss/logits": 0.2951745491474867, "step": 4030 }, { "epoch": 0.13466666666666666, "grad_norm": 38.25, "grad_norm_var": 15.52265625, "learning_rate": 0.0001, "loss": 9.1906, "loss/crossentropy": 2.374897816777229, "loss/hidden": 4.23515625, "loss/jsd": 0.0, "loss/logits": 0.29634634144604205, "step": 4040 }, { "epoch": 0.135, "grad_norm": 38.5, "grad_norm_var": 8.176822916666667, "learning_rate": 0.0001, "loss": 9.2041, "loss/crossentropy": 2.213481293618679, "loss/hidden": 4.249609375, "loss/jsd": 0.0, "loss/logits": 0.30703250467777254, "step": 4050 }, { "epoch": 0.13533333333333333, "grad_norm": 199.0, "grad_norm_var": 1616.4239583333333, "learning_rate": 0.0001, "loss": 9.4101, "loss/crossentropy": 2.3092150717973707, "loss/hidden": 4.17890625, "loss/jsd": 0.0, "loss/logits": 0.298384091258049, "step": 4060 }, { "epoch": 0.13566666666666666, "grad_norm": 37.25, "grad_norm_var": 1616.6239583333333, "learning_rate": 0.0001, "loss": 9.3071, "loss/crossentropy": 2.219760200381279, "loss/hidden": 4.290625, "loss/jsd": 0.0, "loss/logits": 0.3202596869319677, "step": 4070 }, { "epoch": 0.136, "grad_norm": 40.0, "grad_norm_var": 9.732291666666667, "learning_rate": 0.0001, "loss": 9.3305, "loss/crossentropy": 2.289233188331127, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.29089505709707736, "step": 4080 }, { "epoch": 0.13633333333333333, "grad_norm": 38.0, "grad_norm_var": 151.53932291666666, "learning_rate": 0.0001, "loss": 9.2719, "loss/crossentropy": 2.3422865584492683, "loss/hidden": 4.199609375, "loss/jsd": 0.0, "loss/logits": 0.2982410844415426, "step": 4090 }, { "epoch": 0.13666666666666666, "grad_norm": 34.5, "grad_norm_var": 8.22265625, "learning_rate": 0.0001, "loss": 9.154, "loss/crossentropy": 2.0941394165158274, "loss/hidden": 4.131640625, "loss/jsd": 0.0, "loss/logits": 0.28526539970189335, "step": 4100 }, { "epoch": 0.137, "grad_norm": 42.5, "grad_norm_var": 13.489322916666667, "learning_rate": 0.0001, "loss": 9.2834, "loss/crossentropy": 2.336969590187073, "loss/hidden": 4.314453125, "loss/jsd": 0.0, "loss/logits": 0.3301690086722374, "step": 4110 }, { "epoch": 0.13733333333333334, "grad_norm": 38.75, "grad_norm_var": 75.96640625, "learning_rate": 0.0001, "loss": 9.2545, "loss/crossentropy": 2.354858273267746, "loss/hidden": 4.16953125, "loss/jsd": 0.0, "loss/logits": 0.31785392127931117, "step": 4120 }, { "epoch": 0.13766666666666666, "grad_norm": 32.25, "grad_norm_var": 74.625, "learning_rate": 0.0001, "loss": 9.017, "loss/crossentropy": 2.242242157459259, "loss/hidden": 4.15703125, "loss/jsd": 0.0, "loss/logits": 0.27648379243910315, "step": 4130 }, { "epoch": 0.138, "grad_norm": 38.25, "grad_norm_var": 12.843489583333334, "learning_rate": 0.0001, "loss": 9.2173, "loss/crossentropy": 2.240199755132198, "loss/hidden": 4.201953125, "loss/jsd": 0.0, "loss/logits": 0.2927041232585907, "step": 4140 }, { "epoch": 0.13833333333333334, "grad_norm": 33.75, "grad_norm_var": 25.674739583333334, "learning_rate": 0.0001, "loss": 9.1547, "loss/crossentropy": 2.1676330864429474, "loss/hidden": 4.211328125, "loss/jsd": 0.0, "loss/logits": 0.2752385437488556, "step": 4150 }, { "epoch": 0.13866666666666666, "grad_norm": 36.5, "grad_norm_var": 4.523958333333334, "learning_rate": 0.0001, "loss": 9.08, "loss/crossentropy": 1.9810823224484921, "loss/hidden": 4.034765625, "loss/jsd": 0.0, "loss/logits": 0.2782739015296102, "step": 4160 }, { "epoch": 0.139, "grad_norm": 37.75, "grad_norm_var": 6.179166666666666, "learning_rate": 0.0001, "loss": 9.0211, "loss/crossentropy": 2.088325909897685, "loss/hidden": 4.081640625, "loss/jsd": 0.0, "loss/logits": 0.2718936084304005, "step": 4170 }, { "epoch": 0.13933333333333334, "grad_norm": 36.25, "grad_norm_var": 4.368489583333333, "learning_rate": 0.0001, "loss": 9.2223, "loss/crossentropy": 2.345874647796154, "loss/hidden": 4.141015625, "loss/jsd": 0.0, "loss/logits": 0.29183666668832303, "step": 4180 }, { "epoch": 0.13966666666666666, "grad_norm": 35.25, "grad_norm_var": 6.133333333333334, "learning_rate": 0.0001, "loss": 8.9914, "loss/crossentropy": 2.076162505149841, "loss/hidden": 4.159375, "loss/jsd": 0.0, "loss/logits": 0.29356912411749364, "step": 4190 }, { "epoch": 0.14, "grad_norm": 34.25, "grad_norm_var": 6.707291666666666, "learning_rate": 0.0001, "loss": 8.9754, "loss/crossentropy": 2.1641511037945746, "loss/hidden": 4.112109375, "loss/jsd": 0.0, "loss/logits": 0.2857985034584999, "step": 4200 }, { "epoch": 0.14033333333333334, "grad_norm": 36.75, "grad_norm_var": 4.530989583333334, "learning_rate": 0.0001, "loss": 9.1635, "loss/crossentropy": 2.104039117693901, "loss/hidden": 4.08671875, "loss/jsd": 0.0, "loss/logits": 0.26876664757728574, "step": 4210 }, { "epoch": 0.14066666666666666, "grad_norm": 36.75, "grad_norm_var": 4.38515625, "learning_rate": 0.0001, "loss": 8.998, "loss/crossentropy": 2.1638469099998474, "loss/hidden": 4.260546875, "loss/jsd": 0.0, "loss/logits": 0.29361540265381336, "step": 4220 }, { "epoch": 0.141, "grad_norm": 37.25, "grad_norm_var": 2.720572916666667, "learning_rate": 0.0001, "loss": 9.1357, "loss/crossentropy": 2.1269990049302576, "loss/hidden": 3.925, "loss/jsd": 0.0, "loss/logits": 0.24402263071388006, "step": 4230 }, { "epoch": 0.14133333333333334, "grad_norm": 38.5, "grad_norm_var": 9.082291666666666, "learning_rate": 0.0001, "loss": 9.0115, "loss/crossentropy": 2.08816104978323, "loss/hidden": 3.98203125, "loss/jsd": 0.0, "loss/logits": 0.2717157419770956, "step": 4240 }, { "epoch": 0.14166666666666666, "grad_norm": 39.75, "grad_norm_var": 10.212955729166667, "learning_rate": 0.0001, "loss": 9.0041, "loss/crossentropy": 2.144673664495349, "loss/hidden": 4.121875, "loss/jsd": 0.0, "loss/logits": 0.26794750997796657, "step": 4250 }, { "epoch": 0.142, "grad_norm": 39.75, "grad_norm_var": 17.005143229166666, "learning_rate": 0.0001, "loss": 9.0036, "loss/crossentropy": 2.198061776161194, "loss/hidden": 4.16171875, "loss/jsd": 0.0, "loss/logits": 0.28953395783901215, "step": 4260 }, { "epoch": 0.14233333333333334, "grad_norm": 44.0, "grad_norm_var": 11.192122395833334, "learning_rate": 0.0001, "loss": 8.9185, "loss/crossentropy": 2.0459933675825597, "loss/hidden": 4.1234375, "loss/jsd": 0.0, "loss/logits": 0.26404702849686146, "step": 4270 }, { "epoch": 0.14266666666666666, "grad_norm": 34.0, "grad_norm_var": 14.708333333333334, "learning_rate": 0.0001, "loss": 9.0252, "loss/crossentropy": 2.142607557028532, "loss/hidden": 4.148828125, "loss/jsd": 0.0, "loss/logits": 0.2864328293129802, "step": 4280 }, { "epoch": 0.143, "grad_norm": 37.0, "grad_norm_var": 1981.7268229166666, "learning_rate": 0.0001, "loss": 9.2393, "loss/crossentropy": 2.346937409043312, "loss/hidden": 4.025390625, "loss/jsd": 0.0, "loss/logits": 0.29519574213773014, "step": 4290 }, { "epoch": 0.14333333333333334, "grad_norm": 33.75, "grad_norm_var": 40.1125, "learning_rate": 0.0001, "loss": 9.0816, "loss/crossentropy": 2.2757950969040395, "loss/hidden": 4.086328125, "loss/jsd": 0.0, "loss/logits": 0.2885062342509627, "step": 4300 }, { "epoch": 0.14366666666666666, "grad_norm": 36.5, "grad_norm_var": 863.840625, "learning_rate": 0.0001, "loss": 9.1514, "loss/crossentropy": 2.225219927728176, "loss/hidden": 4.180078125, "loss/jsd": 0.0, "loss/logits": 0.29695004131644964, "step": 4310 }, { "epoch": 0.144, "grad_norm": 33.75, "grad_norm_var": 886.9458333333333, "learning_rate": 0.0001, "loss": 8.9425, "loss/crossentropy": 2.143592892587185, "loss/hidden": 4.075390625, "loss/jsd": 0.0, "loss/logits": 0.280602141469717, "step": 4320 }, { "epoch": 0.14433333333333334, "grad_norm": 36.0, "grad_norm_var": 14.0375, "learning_rate": 0.0001, "loss": 9.0048, "loss/crossentropy": 2.196325662732124, "loss/hidden": 4.073828125, "loss/jsd": 0.0, "loss/logits": 0.28205970898270605, "step": 4330 }, { "epoch": 0.14466666666666667, "grad_norm": 34.0, "grad_norm_var": 17.295833333333334, "learning_rate": 0.0001, "loss": 9.0125, "loss/crossentropy": 2.2133986562490464, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.2758210487663746, "step": 4340 }, { "epoch": 0.145, "grad_norm": 35.25, "grad_norm_var": 17.939322916666665, "learning_rate": 0.0001, "loss": 8.9665, "loss/crossentropy": 2.2748197615146637, "loss/hidden": 4.061328125, "loss/jsd": 0.0, "loss/logits": 0.27680257745087145, "step": 4350 }, { "epoch": 0.14533333333333334, "grad_norm": 32.5, "grad_norm_var": 9.857291666666667, "learning_rate": 0.0001, "loss": 9.0182, "loss/crossentropy": 2.108441038429737, "loss/hidden": 4.170703125, "loss/jsd": 0.0, "loss/logits": 0.2624512787908316, "step": 4360 }, { "epoch": 0.14566666666666667, "grad_norm": 44.25, "grad_norm_var": 15.3947265625, "learning_rate": 0.0001, "loss": 8.9951, "loss/crossentropy": 2.038861893117428, "loss/hidden": 4.069140625, "loss/jsd": 0.0, "loss/logits": 0.26383627485483885, "step": 4370 }, { "epoch": 0.146, "grad_norm": 38.25, "grad_norm_var": 14.3884765625, "learning_rate": 0.0001, "loss": 9.0622, "loss/crossentropy": 2.1965081602334977, "loss/hidden": 4.124609375, "loss/jsd": 0.0, "loss/logits": 0.29782434441149236, "step": 4380 }, { "epoch": 0.14633333333333334, "grad_norm": 32.5, "grad_norm_var": 127.815625, "learning_rate": 0.0001, "loss": 9.0006, "loss/crossentropy": 2.0395655959844587, "loss/hidden": 4.130078125, "loss/jsd": 0.0, "loss/logits": 0.26886530220508575, "step": 4390 }, { "epoch": 0.14666666666666667, "grad_norm": 37.0, "grad_norm_var": 10.148958333333333, "learning_rate": 0.0001, "loss": 9.0164, "loss/crossentropy": 2.228328402340412, "loss/hidden": 4.174609375, "loss/jsd": 0.0, "loss/logits": 0.28383949398994446, "step": 4400 }, { "epoch": 0.147, "grad_norm": 38.75, "grad_norm_var": 4.183072916666666, "learning_rate": 0.0001, "loss": 8.9809, "loss/crossentropy": 2.1509799271821977, "loss/hidden": 4.066015625, "loss/jsd": 0.0, "loss/logits": 0.26196911465376616, "step": 4410 }, { "epoch": 0.14733333333333334, "grad_norm": 37.25, "grad_norm_var": 5.3353515625, "learning_rate": 0.0001, "loss": 9.056, "loss/crossentropy": 2.217263324558735, "loss/hidden": 4.2015625, "loss/jsd": 0.0, "loss/logits": 0.28669508136808874, "step": 4420 }, { "epoch": 0.14766666666666667, "grad_norm": 35.5, "grad_norm_var": 6.494205729166667, "learning_rate": 0.0001, "loss": 9.0301, "loss/crossentropy": 2.15300203114748, "loss/hidden": 4.10703125, "loss/jsd": 0.0, "loss/logits": 0.27612753622233865, "step": 4430 }, { "epoch": 0.148, "grad_norm": 35.5, "grad_norm_var": 136.375, "learning_rate": 0.0001, "loss": 9.0967, "loss/crossentropy": 2.093575692176819, "loss/hidden": 4.1453125, "loss/jsd": 0.0, "loss/logits": 0.29885905496776105, "step": 4440 }, { "epoch": 0.14833333333333334, "grad_norm": 39.25, "grad_norm_var": 13.1119140625, "learning_rate": 0.0001, "loss": 9.0634, "loss/crossentropy": 2.216056075692177, "loss/hidden": 4.144921875, "loss/jsd": 0.0, "loss/logits": 0.30221954099833964, "step": 4450 }, { "epoch": 0.14866666666666667, "grad_norm": 33.5, "grad_norm_var": 11.009830729166667, "learning_rate": 0.0001, "loss": 9.2091, "loss/crossentropy": 2.2127518743276595, "loss/hidden": 4.108984375, "loss/jsd": 0.0, "loss/logits": 0.2779125362634659, "step": 4460 }, { "epoch": 0.149, "grad_norm": 39.5, "grad_norm_var": 8.137239583333333, "learning_rate": 0.0001, "loss": 9.086, "loss/crossentropy": 2.192157284915447, "loss/hidden": 4.0703125, "loss/jsd": 0.0, "loss/logits": 0.27739516645669937, "step": 4470 }, { "epoch": 0.14933333333333335, "grad_norm": 40.0, "grad_norm_var": 7.815625, "learning_rate": 0.0001, "loss": 9.0644, "loss/crossentropy": 2.1084683328866958, "loss/hidden": 4.044921875, "loss/jsd": 0.0, "loss/logits": 0.26986431013792755, "step": 4480 }, { "epoch": 0.14966666666666667, "grad_norm": 35.75, "grad_norm_var": 10.302018229166666, "learning_rate": 0.0001, "loss": 9.0806, "loss/crossentropy": 2.0632939770817758, "loss/hidden": 4.218359375, "loss/jsd": 0.0, "loss/logits": 0.3002682067453861, "step": 4490 }, { "epoch": 0.15, "grad_norm": 39.5, "grad_norm_var": 11.279166666666667, "learning_rate": 0.0001, "loss": 9.1169, "loss/crossentropy": 2.2110743284225465, "loss/hidden": 4.1859375, "loss/jsd": 0.0, "loss/logits": 0.2930508263409138, "step": 4500 }, { "epoch": 0.15033333333333335, "grad_norm": 38.5, "grad_norm_var": 10.308072916666667, "learning_rate": 0.0001, "loss": 9.1674, "loss/crossentropy": 2.3797917008399962, "loss/hidden": 4.17109375, "loss/jsd": 0.0, "loss/logits": 0.3082386564463377, "step": 4510 }, { "epoch": 0.15066666666666667, "grad_norm": 33.75, "grad_norm_var": 11.398958333333333, "learning_rate": 0.0001, "loss": 8.9182, "loss/crossentropy": 2.099128992110491, "loss/hidden": 4.08125, "loss/jsd": 0.0, "loss/logits": 0.26461148345842955, "step": 4520 }, { "epoch": 0.151, "grad_norm": 30.25, "grad_norm_var": 18.073893229166668, "learning_rate": 0.0001, "loss": 9.0073, "loss/crossentropy": 2.121720698475838, "loss/hidden": 4.0703125, "loss/jsd": 0.0, "loss/logits": 0.26439094692468645, "step": 4530 }, { "epoch": 0.15133333333333332, "grad_norm": 51.0, "grad_norm_var": 40.3994140625, "learning_rate": 0.0001, "loss": 9.0847, "loss/crossentropy": 2.1047763034701346, "loss/hidden": 4.223828125, "loss/jsd": 0.0, "loss/logits": 0.30176166333258153, "step": 4540 }, { "epoch": 0.15166666666666667, "grad_norm": 40.25, "grad_norm_var": 30.270833333333332, "learning_rate": 0.0001, "loss": 8.9126, "loss/crossentropy": 2.112935496866703, "loss/hidden": 3.990234375, "loss/jsd": 0.0, "loss/logits": 0.24414603877812624, "step": 4550 }, { "epoch": 0.152, "grad_norm": 34.5, "grad_norm_var": 9.594205729166667, "learning_rate": 0.0001, "loss": 8.823, "loss/crossentropy": 2.057783196866512, "loss/hidden": 4.141796875, "loss/jsd": 0.0, "loss/logits": 0.27383373510092496, "step": 4560 }, { "epoch": 0.15233333333333332, "grad_norm": 33.5, "grad_norm_var": 7.947330729166667, "learning_rate": 0.0001, "loss": 8.9423, "loss/crossentropy": 2.2479268461465836, "loss/hidden": 3.940625, "loss/jsd": 0.0, "loss/logits": 0.26401854380965234, "step": 4570 }, { "epoch": 0.15266666666666667, "grad_norm": 40.25, "grad_norm_var": 8.190625, "learning_rate": 0.0001, "loss": 8.8901, "loss/crossentropy": 2.228538802266121, "loss/hidden": 4.050390625, "loss/jsd": 0.0, "loss/logits": 0.27664305865764616, "step": 4580 }, { "epoch": 0.153, "grad_norm": 32.5, "grad_norm_var": 21.790625, "learning_rate": 0.0001, "loss": 9.0083, "loss/crossentropy": 2.119870799779892, "loss/hidden": 4.17265625, "loss/jsd": 0.0, "loss/logits": 0.27753249146044257, "step": 4590 }, { "epoch": 0.15333333333333332, "grad_norm": 56.5, "grad_norm_var": 470.8247395833333, "learning_rate": 0.0001, "loss": 9.1533, "loss/crossentropy": 2.0553093053400517, "loss/hidden": 4.362109375, "loss/jsd": 0.0, "loss/logits": 0.3021026611328125, "step": 4600 }, { "epoch": 0.15366666666666667, "grad_norm": 41.75, "grad_norm_var": 463.7455729166667, "learning_rate": 0.0001, "loss": 8.9574, "loss/crossentropy": 2.0937911093235018, "loss/hidden": 4.11953125, "loss/jsd": 0.0, "loss/logits": 0.2679052516818047, "step": 4610 }, { "epoch": 0.154, "grad_norm": 35.0, "grad_norm_var": 6.95, "learning_rate": 0.0001, "loss": 9.0725, "loss/crossentropy": 2.1097617581486703, "loss/hidden": 4.151171875, "loss/jsd": 0.0, "loss/logits": 0.3005022447556257, "step": 4620 }, { "epoch": 0.15433333333333332, "grad_norm": 34.0, "grad_norm_var": 6.76875, "learning_rate": 0.0001, "loss": 8.8903, "loss/crossentropy": 2.187983478605747, "loss/hidden": 4.08125, "loss/jsd": 0.0, "loss/logits": 0.26959136240184306, "step": 4630 }, { "epoch": 0.15466666666666667, "grad_norm": 36.75, "grad_norm_var": 6.955143229166667, "learning_rate": 0.0001, "loss": 8.9611, "loss/crossentropy": 2.096404644846916, "loss/hidden": 4.02265625, "loss/jsd": 0.0, "loss/logits": 0.2623141951858997, "step": 4640 }, { "epoch": 0.155, "grad_norm": 33.75, "grad_norm_var": 8.134309895833333, "learning_rate": 0.0001, "loss": 8.8924, "loss/crossentropy": 1.9974856853485108, "loss/hidden": 4.131640625, "loss/jsd": 0.0, "loss/logits": 0.27121835108846426, "step": 4650 }, { "epoch": 0.15533333333333332, "grad_norm": 36.75, "grad_norm_var": 7.055989583333333, "learning_rate": 0.0001, "loss": 9.1254, "loss/crossentropy": 2.255522921681404, "loss/hidden": 4.053125, "loss/jsd": 0.0, "loss/logits": 0.2813701078295708, "step": 4660 }, { "epoch": 0.15566666666666668, "grad_norm": 33.5, "grad_norm_var": 15.758333333333333, "learning_rate": 0.0001, "loss": 8.9782, "loss/crossentropy": 2.1178917974233626, "loss/hidden": 4.116796875, "loss/jsd": 0.0, "loss/logits": 0.27898423206061124, "step": 4670 }, { "epoch": 0.156, "grad_norm": 37.25, "grad_norm_var": 8.62890625, "learning_rate": 0.0001, "loss": 8.9927, "loss/crossentropy": 2.2914595365524293, "loss/hidden": 4.09921875, "loss/jsd": 0.0, "loss/logits": 0.28754087798297406, "step": 4680 }, { "epoch": 0.15633333333333332, "grad_norm": 37.5, "grad_norm_var": 8.773958333333333, "learning_rate": 0.0001, "loss": 8.8989, "loss/crossentropy": 2.075756361335516, "loss/hidden": 4.016015625, "loss/jsd": 0.0, "loss/logits": 0.27427870500832796, "step": 4690 }, { "epoch": 0.15666666666666668, "grad_norm": 43.5, "grad_norm_var": 41.68307291666667, "learning_rate": 0.0001, "loss": 9.0313, "loss/crossentropy": 2.114923672378063, "loss/hidden": 4.1328125, "loss/jsd": 0.0, "loss/logits": 0.2685729030519724, "step": 4700 }, { "epoch": 0.157, "grad_norm": 32.25, "grad_norm_var": 23.582291666666666, "learning_rate": 0.0001, "loss": 8.9486, "loss/crossentropy": 2.1114466533064844, "loss/hidden": 4.230859375, "loss/jsd": 0.0, "loss/logits": 0.28084823917597534, "step": 4710 }, { "epoch": 0.15733333333333333, "grad_norm": 40.0, "grad_norm_var": 17.48515625, "learning_rate": 0.0001, "loss": 8.993, "loss/crossentropy": 2.1299724817276, "loss/hidden": 4.07265625, "loss/jsd": 0.0, "loss/logits": 0.2675957553088665, "step": 4720 }, { "epoch": 0.15766666666666668, "grad_norm": 45.75, "grad_norm_var": 16.35, "learning_rate": 0.0001, "loss": 8.9449, "loss/crossentropy": 2.253695184737444, "loss/hidden": 4.016796875, "loss/jsd": 0.0, "loss/logits": 0.2566886018961668, "step": 4730 }, { "epoch": 0.158, "grad_norm": 35.5, "grad_norm_var": 25.812239583333334, "learning_rate": 0.0001, "loss": 9.1158, "loss/crossentropy": 2.2754971385002136, "loss/hidden": 4.198046875, "loss/jsd": 0.0, "loss/logits": 0.28829921074211595, "step": 4740 }, { "epoch": 0.15833333333333333, "grad_norm": 40.0, "grad_norm_var": 44.77682291666667, "learning_rate": 0.0001, "loss": 8.8164, "loss/crossentropy": 2.0852062880992888, "loss/hidden": 4.101171875, "loss/jsd": 0.0, "loss/logits": 0.2623613655567169, "step": 4750 }, { "epoch": 0.15866666666666668, "grad_norm": 34.25, "grad_norm_var": 18.587239583333332, "learning_rate": 0.0001, "loss": 8.8431, "loss/crossentropy": 2.2237706407904625, "loss/hidden": 4.155078125, "loss/jsd": 0.0, "loss/logits": 0.3078520778566599, "step": 4760 }, { "epoch": 0.159, "grad_norm": 39.5, "grad_norm_var": 10.715625, "learning_rate": 0.0001, "loss": 8.8808, "loss/crossentropy": 2.139245317876339, "loss/hidden": 4.1, "loss/jsd": 0.0, "loss/logits": 0.2770055137574673, "step": 4770 }, { "epoch": 0.15933333333333333, "grad_norm": 38.5, "grad_norm_var": 11.529166666666667, "learning_rate": 0.0001, "loss": 9.0282, "loss/crossentropy": 2.153920599073172, "loss/hidden": 4.14765625, "loss/jsd": 0.0, "loss/logits": 0.2666714245453477, "step": 4780 }, { "epoch": 0.15966666666666668, "grad_norm": 37.25, "grad_norm_var": 3.2831240942886323e+18, "learning_rate": 0.0001, "loss": 9.2047, "loss/crossentropy": 2.181428015232086, "loss/hidden": 4.17421875, "loss/jsd": 0.0, "loss/logits": 0.2813129939138889, "step": 4790 }, { "epoch": 0.16, "grad_norm": 33.75, "grad_norm_var": 6.765625, "learning_rate": 0.0001, "loss": 9.0916, "loss/crossentropy": 2.148864021897316, "loss/hidden": 4.162890625, "loss/jsd": 0.0, "loss/logits": 0.28246700279414655, "step": 4800 }, { "epoch": 0.16033333333333333, "grad_norm": 32.0, "grad_norm_var": 5.557291666666667, "learning_rate": 0.0001, "loss": 8.7833, "loss/crossentropy": 2.20737906396389, "loss/hidden": 4.041796875, "loss/jsd": 0.0, "loss/logits": 0.27596224322915075, "step": 4810 }, { "epoch": 0.16066666666666668, "grad_norm": 35.0, "grad_norm_var": 5.990625, "learning_rate": 0.0001, "loss": 8.9472, "loss/crossentropy": 2.0124839752912522, "loss/hidden": 4.1796875, "loss/jsd": 0.0, "loss/logits": 0.2861068371683359, "step": 4820 }, { "epoch": 0.161, "grad_norm": 33.25, "grad_norm_var": 7.820247395833333, "learning_rate": 0.0001, "loss": 8.9053, "loss/crossentropy": 2.158030679821968, "loss/hidden": 3.937109375, "loss/jsd": 0.0, "loss/logits": 0.26078329905867575, "step": 4830 }, { "epoch": 0.16133333333333333, "grad_norm": 34.5, "grad_norm_var": 10.298372395833333, "learning_rate": 0.0001, "loss": 8.9212, "loss/crossentropy": 2.046878896653652, "loss/hidden": 4.169921875, "loss/jsd": 0.0, "loss/logits": 0.277280671428889, "step": 4840 }, { "epoch": 0.16166666666666665, "grad_norm": 36.0, "grad_norm_var": 8.980989583333333, "learning_rate": 0.0001, "loss": 8.8992, "loss/crossentropy": 2.2036330491304397, "loss/hidden": 3.940625, "loss/jsd": 0.0, "loss/logits": 0.25981649905443194, "step": 4850 }, { "epoch": 0.162, "grad_norm": 37.5, "grad_norm_var": 6.245833333333334, "learning_rate": 0.0001, "loss": 8.8779, "loss/crossentropy": 2.001459051668644, "loss/hidden": 4.107421875, "loss/jsd": 0.0, "loss/logits": 0.2751261981204152, "step": 4860 }, { "epoch": 0.16233333333333333, "grad_norm": 34.0, "grad_norm_var": 24.165625, "learning_rate": 0.0001, "loss": 9.0285, "loss/crossentropy": 2.236599923670292, "loss/hidden": 3.955078125, "loss/jsd": 0.0, "loss/logits": 0.2600332200527191, "step": 4870 }, { "epoch": 0.16266666666666665, "grad_norm": 37.0, "grad_norm_var": 6.662239583333333, "learning_rate": 0.0001, "loss": 8.9864, "loss/crossentropy": 2.0701673187315466, "loss/hidden": 4.109765625, "loss/jsd": 0.0, "loss/logits": 0.27043242640793325, "step": 4880 }, { "epoch": 0.163, "grad_norm": 58.0, "grad_norm_var": 1.8014398247254098e+18, "learning_rate": 0.0001, "loss": 8.9593, "loss/crossentropy": 2.128389702364802, "loss/hidden": 3.9984375, "loss/jsd": 0.0, "loss/logits": 0.2562539763748646, "step": 4890 }, { "epoch": 0.16333333333333333, "grad_norm": 33.25, "grad_norm_var": 1.801439823002949e+18, "learning_rate": 0.0001, "loss": 9.0252, "loss/crossentropy": 2.0912427961826325, "loss/hidden": 4.151171875, "loss/jsd": 0.0, "loss/logits": 0.2807155104354024, "step": 4900 }, { "epoch": 0.16366666666666665, "grad_norm": 41.25, "grad_norm_var": 27.223958333333332, "learning_rate": 0.0001, "loss": 8.7847, "loss/crossentropy": 1.889319808036089, "loss/hidden": 4.073046875, "loss/jsd": 0.0, "loss/logits": 0.2481829353608191, "step": 4910 }, { "epoch": 0.164, "grad_norm": 32.75, "grad_norm_var": 7.939322916666667, "learning_rate": 0.0001, "loss": 8.744, "loss/crossentropy": 2.2621133089065553, "loss/hidden": 4.037109375, "loss/jsd": 0.0, "loss/logits": 0.2557232953608036, "step": 4920 }, { "epoch": 0.16433333333333333, "grad_norm": 35.75, "grad_norm_var": 7.367122395833333, "learning_rate": 0.0001, "loss": 8.9305, "loss/crossentropy": 2.1436791688203813, "loss/hidden": 4.185546875, "loss/jsd": 0.0, "loss/logits": 0.28117387779057024, "step": 4930 }, { "epoch": 0.16466666666666666, "grad_norm": 34.0, "grad_norm_var": 6.180989583333333, "learning_rate": 0.0001, "loss": 8.8538, "loss/crossentropy": 2.124467818439007, "loss/hidden": 4.153125, "loss/jsd": 0.0, "loss/logits": 0.27935762144625187, "step": 4940 }, { "epoch": 0.165, "grad_norm": 34.5, "grad_norm_var": 3.9208333333333334, "learning_rate": 0.0001, "loss": 9.0238, "loss/crossentropy": 2.1031661182641983, "loss/hidden": 4.13359375, "loss/jsd": 0.0, "loss/logits": 0.2820569805800915, "step": 4950 }, { "epoch": 0.16533333333333333, "grad_norm": 36.75, "grad_norm_var": 7.434309895833334, "learning_rate": 0.0001, "loss": 8.9877, "loss/crossentropy": 2.1274607971310617, "loss/hidden": 4.0453125, "loss/jsd": 0.0, "loss/logits": 0.25978016797453163, "step": 4960 }, { "epoch": 0.16566666666666666, "grad_norm": 35.75, "grad_norm_var": 9.023372395833333, "learning_rate": 0.0001, "loss": 8.9786, "loss/crossentropy": 2.265768840909004, "loss/hidden": 3.933984375, "loss/jsd": 0.0, "loss/logits": 0.2593372922390699, "step": 4970 }, { "epoch": 0.166, "grad_norm": 35.25, "grad_norm_var": 2.4580729166666666, "learning_rate": 0.0001, "loss": 8.9312, "loss/crossentropy": 2.077937413752079, "loss/hidden": 4.08984375, "loss/jsd": 0.0, "loss/logits": 0.27119250893592833, "step": 4980 }, { "epoch": 0.16633333333333333, "grad_norm": 34.25, "grad_norm_var": 300.88743489583334, "learning_rate": 0.0001, "loss": 8.8076, "loss/crossentropy": 2.2653584659099577, "loss/hidden": 4.03515625, "loss/jsd": 0.0, "loss/logits": 0.27666972354054453, "step": 4990 }, { "epoch": 0.16666666666666666, "grad_norm": 35.25, "grad_norm_var": 6.1087890625, "learning_rate": 0.0001, "loss": 8.9492, "loss/crossentropy": 2.1622142292559148, "loss/hidden": 4.054296875, "loss/jsd": 0.0, "loss/logits": 0.2614580035209656, "step": 5000 }, { "epoch": 0.167, "grad_norm": 36.5, "grad_norm_var": 6.083072916666667, "learning_rate": 0.0001, "loss": 8.8944, "loss/crossentropy": 2.125202904641628, "loss/hidden": 4.095703125, "loss/jsd": 0.0, "loss/logits": 0.27341773808002473, "step": 5010 }, { "epoch": 0.16733333333333333, "grad_norm": 35.25, "grad_norm_var": 12.689322916666667, "learning_rate": 0.0001, "loss": 9.0034, "loss/crossentropy": 2.2574457243084907, "loss/hidden": 4.0109375, "loss/jsd": 0.0, "loss/logits": 0.27399955950677396, "step": 5020 }, { "epoch": 0.16766666666666666, "grad_norm": 42.0, "grad_norm_var": 12.624739583333334, "learning_rate": 0.0001, "loss": 9.0668, "loss/crossentropy": 2.0945447117090223, "loss/hidden": 4.031640625, "loss/jsd": 0.0, "loss/logits": 0.27295216396450994, "step": 5030 }, { "epoch": 0.168, "grad_norm": 35.0, "grad_norm_var": 20.599739583333335, "learning_rate": 0.0001, "loss": 8.9586, "loss/crossentropy": 2.2430111899971963, "loss/hidden": 4.026953125, "loss/jsd": 0.0, "loss/logits": 0.27031025942415, "step": 5040 }, { "epoch": 0.16833333333333333, "grad_norm": 36.5, "grad_norm_var": 17.99765625, "learning_rate": 0.0001, "loss": 9.1637, "loss/crossentropy": 2.1593209132552147, "loss/hidden": 4.162109375, "loss/jsd": 0.0, "loss/logits": 0.2839784752577543, "step": 5050 }, { "epoch": 0.16866666666666666, "grad_norm": 32.25, "grad_norm_var": 9.022330729166667, "learning_rate": 0.0001, "loss": 8.8245, "loss/crossentropy": 2.1471748799085617, "loss/hidden": 4.0046875, "loss/jsd": 0.0, "loss/logits": 0.26835247687995434, "step": 5060 }, { "epoch": 0.169, "grad_norm": 36.5, "grad_norm_var": 87.60390625, "learning_rate": 0.0001, "loss": 8.8161, "loss/crossentropy": 2.0496731594204904, "loss/hidden": 4.125390625, "loss/jsd": 0.0, "loss/logits": 0.2703436575829983, "step": 5070 }, { "epoch": 0.16933333333333334, "grad_norm": 34.5, "grad_norm_var": 100.34557291666667, "learning_rate": 0.0001, "loss": 9.0138, "loss/crossentropy": 2.1915629282593727, "loss/hidden": 4.15546875, "loss/jsd": 0.0, "loss/logits": 0.2765682227909565, "step": 5080 }, { "epoch": 0.16966666666666666, "grad_norm": 37.0, "grad_norm_var": 3.41640625, "learning_rate": 0.0001, "loss": 8.9229, "loss/crossentropy": 2.257983461022377, "loss/hidden": 4.04921875, "loss/jsd": 0.0, "loss/logits": 0.262128459662199, "step": 5090 }, { "epoch": 0.17, "grad_norm": 35.75, "grad_norm_var": 15.81640625, "learning_rate": 0.0001, "loss": 8.8926, "loss/crossentropy": 2.187410834431648, "loss/hidden": 4.071484375, "loss/jsd": 0.0, "loss/logits": 0.2799120504409075, "step": 5100 }, { "epoch": 0.17033333333333334, "grad_norm": 35.75, "grad_norm_var": 5.523958333333334, "learning_rate": 0.0001, "loss": 8.8704, "loss/crossentropy": 2.0666673690080644, "loss/hidden": 4.118359375, "loss/jsd": 0.0, "loss/logits": 0.2736505573615432, "step": 5110 }, { "epoch": 0.17066666666666666, "grad_norm": 36.75, "grad_norm_var": 16.540625, "learning_rate": 0.0001, "loss": 8.784, "loss/crossentropy": 2.055477923154831, "loss/hidden": 4.126953125, "loss/jsd": 0.0, "loss/logits": 0.27156198769807816, "step": 5120 }, { "epoch": 0.171, "grad_norm": 41.75, "grad_norm_var": 20.34765625, "learning_rate": 0.0001, "loss": 8.9327, "loss/crossentropy": 2.037408724427223, "loss/hidden": 4.025390625, "loss/jsd": 0.0, "loss/logits": 0.2697664858773351, "step": 5130 }, { "epoch": 0.17133333333333334, "grad_norm": 36.75, "grad_norm_var": 17.532747395833333, "learning_rate": 0.0001, "loss": 8.8144, "loss/crossentropy": 2.1504238069057466, "loss/hidden": 3.99453125, "loss/jsd": 0.0, "loss/logits": 0.2602078752592206, "step": 5140 }, { "epoch": 0.17166666666666666, "grad_norm": 50.25, "grad_norm_var": 22.230989583333333, "learning_rate": 0.0001, "loss": 8.7907, "loss/crossentropy": 2.0753002099692823, "loss/hidden": 3.951953125, "loss/jsd": 0.0, "loss/logits": 0.24998050797730684, "step": 5150 }, { "epoch": 0.172, "grad_norm": 33.5, "grad_norm_var": 18.791666666666668, "learning_rate": 0.0001, "loss": 8.8137, "loss/crossentropy": 2.179745650291443, "loss/hidden": 3.980078125, "loss/jsd": 0.0, "loss/logits": 0.259993402659893, "step": 5160 }, { "epoch": 0.17233333333333334, "grad_norm": 33.25, "grad_norm_var": 6.505989583333333, "learning_rate": 0.0001, "loss": 8.8, "loss/crossentropy": 2.090472859144211, "loss/hidden": 3.967578125, "loss/jsd": 0.0, "loss/logits": 0.26382889300584794, "step": 5170 }, { "epoch": 0.17266666666666666, "grad_norm": 32.75, "grad_norm_var": 9.782291666666667, "learning_rate": 0.0001, "loss": 8.8922, "loss/crossentropy": 2.0885345190763474, "loss/hidden": 4.10078125, "loss/jsd": 0.0, "loss/logits": 0.2721933271735907, "step": 5180 }, { "epoch": 0.173, "grad_norm": 33.25, "grad_norm_var": 12.4244140625, "learning_rate": 0.0001, "loss": 8.9584, "loss/crossentropy": 2.141716684401035, "loss/hidden": 4.02578125, "loss/jsd": 0.0, "loss/logits": 0.2675710514187813, "step": 5190 }, { "epoch": 0.17333333333333334, "grad_norm": 32.5, "grad_norm_var": 9.2931640625, "learning_rate": 0.0001, "loss": 8.9231, "loss/crossentropy": 2.0917196184396745, "loss/hidden": 4.080859375, "loss/jsd": 0.0, "loss/logits": 0.29290595967322586, "step": 5200 }, { "epoch": 0.17366666666666666, "grad_norm": 48.25, "grad_norm_var": 11236.739322916666, "learning_rate": 0.0001, "loss": 9.1762, "loss/crossentropy": 2.257916547358036, "loss/hidden": 4.270703125, "loss/jsd": 0.0, "loss/logits": 0.3100666496902704, "step": 5210 }, { "epoch": 0.174, "grad_norm": 43.0, "grad_norm_var": 11218.0125, "learning_rate": 0.0001, "loss": 9.0758, "loss/crossentropy": 2.14205886721611, "loss/hidden": 4.219140625, "loss/jsd": 0.0, "loss/logits": 0.3077359441667795, "step": 5220 }, { "epoch": 0.17433333333333334, "grad_norm": 35.75, "grad_norm_var": 17.448958333333334, "learning_rate": 0.0001, "loss": 8.8649, "loss/crossentropy": 2.228261913359165, "loss/hidden": 4.16484375, "loss/jsd": 0.0, "loss/logits": 0.31307811439037325, "step": 5230 }, { "epoch": 0.17466666666666666, "grad_norm": 37.75, "grad_norm_var": 10.398958333333333, "learning_rate": 0.0001, "loss": 8.8652, "loss/crossentropy": 2.1450634144246576, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.28023948706686497, "step": 5240 }, { "epoch": 0.175, "grad_norm": 35.0, "grad_norm_var": 9.215559895833334, "learning_rate": 0.0001, "loss": 8.8716, "loss/crossentropy": 2.114921988546848, "loss/hidden": 4.04375, "loss/jsd": 0.0, "loss/logits": 0.27058052010834216, "step": 5250 }, { "epoch": 0.17533333333333334, "grad_norm": 33.25, "grad_norm_var": 7.984309895833333, "learning_rate": 0.0001, "loss": 8.9649, "loss/crossentropy": 2.2500276297330855, "loss/hidden": 3.951171875, "loss/jsd": 0.0, "loss/logits": 0.2647197004407644, "step": 5260 }, { "epoch": 0.17566666666666667, "grad_norm": 33.75, "grad_norm_var": 5.643489583333333, "learning_rate": 0.0001, "loss": 8.7341, "loss/crossentropy": 2.175753255933523, "loss/hidden": 4.11171875, "loss/jsd": 0.0, "loss/logits": 0.27517074095085264, "step": 5270 }, { "epoch": 0.176, "grad_norm": 36.0, "grad_norm_var": 10.556184895833333, "learning_rate": 0.0001, "loss": 8.9871, "loss/crossentropy": 2.175162176787853, "loss/hidden": 4.271484375, "loss/jsd": 0.0, "loss/logits": 0.3377554725855589, "step": 5280 }, { "epoch": 0.17633333333333334, "grad_norm": 34.5, "grad_norm_var": 10.546809895833333, "learning_rate": 0.0001, "loss": 8.922, "loss/crossentropy": 2.143188028782606, "loss/hidden": 4.023828125, "loss/jsd": 0.0, "loss/logits": 0.2618455559015274, "step": 5290 }, { "epoch": 0.17666666666666667, "grad_norm": 31.625, "grad_norm_var": 19.506705729166665, "learning_rate": 0.0001, "loss": 8.7904, "loss/crossentropy": 2.0671297401189803, "loss/hidden": 4.09765625, "loss/jsd": 0.0, "loss/logits": 0.25971175618469716, "step": 5300 }, { "epoch": 0.177, "grad_norm": 42.75, "grad_norm_var": 15.354622395833333, "learning_rate": 0.0001, "loss": 8.9056, "loss/crossentropy": 2.221310918033123, "loss/hidden": 4.06796875, "loss/jsd": 0.0, "loss/logits": 0.278064251691103, "step": 5310 }, { "epoch": 0.17733333333333334, "grad_norm": 33.5, "grad_norm_var": 9.375, "learning_rate": 0.0001, "loss": 8.7992, "loss/crossentropy": 2.0719747349619864, "loss/hidden": 4.105859375, "loss/jsd": 0.0, "loss/logits": 0.2616250865161419, "step": 5320 }, { "epoch": 0.17766666666666667, "grad_norm": 32.0, "grad_norm_var": 5.623958333333333, "learning_rate": 0.0001, "loss": 8.7475, "loss/crossentropy": 2.0968958541750906, "loss/hidden": 4.021484375, "loss/jsd": 0.0, "loss/logits": 0.2842200789600611, "step": 5330 }, { "epoch": 0.178, "grad_norm": 38.5, "grad_norm_var": 7.82265625, "learning_rate": 0.0001, "loss": 8.8437, "loss/crossentropy": 2.1536238461732866, "loss/hidden": 3.87421875, "loss/jsd": 0.0, "loss/logits": 0.24708390831947327, "step": 5340 }, { "epoch": 0.17833333333333334, "grad_norm": 30.875, "grad_norm_var": 11.901497395833333, "learning_rate": 0.0001, "loss": 8.7492, "loss/crossentropy": 2.1353225603699686, "loss/hidden": 3.94453125, "loss/jsd": 0.0, "loss/logits": 0.2444358326494694, "step": 5350 }, { "epoch": 0.17866666666666667, "grad_norm": 34.25, "grad_norm_var": 21.014518229166665, "learning_rate": 0.0001, "loss": 8.9323, "loss/crossentropy": 2.148284465074539, "loss/hidden": 4.20703125, "loss/jsd": 0.0, "loss/logits": 0.2887397948652506, "step": 5360 }, { "epoch": 0.179, "grad_norm": 35.5, "grad_norm_var": 7.264518229166667, "learning_rate": 0.0001, "loss": 8.7864, "loss/crossentropy": 1.94754306524992, "loss/hidden": 4.208984375, "loss/jsd": 0.0, "loss/logits": 0.2864205963909626, "step": 5370 }, { "epoch": 0.17933333333333334, "grad_norm": 32.75, "grad_norm_var": 17.825455729166666, "learning_rate": 0.0001, "loss": 8.7709, "loss/crossentropy": 2.111784017086029, "loss/hidden": 4.049609375, "loss/jsd": 0.0, "loss/logits": 0.2593661729246378, "step": 5380 }, { "epoch": 0.17966666666666667, "grad_norm": 35.25, "grad_norm_var": 19.040625, "learning_rate": 0.0001, "loss": 8.9434, "loss/crossentropy": 2.1750704884529113, "loss/hidden": 4.074609375, "loss/jsd": 0.0, "loss/logits": 0.27710040137171743, "step": 5390 }, { "epoch": 0.18, "grad_norm": 30.375, "grad_norm_var": 366.95520833333336, "learning_rate": 0.0001, "loss": 8.848, "loss/crossentropy": 2.1083780497312548, "loss/hidden": 3.966015625, "loss/jsd": 0.0, "loss/logits": 0.25344079583883283, "step": 5400 }, { "epoch": 0.18033333333333335, "grad_norm": 34.75, "grad_norm_var": 379.9759765625, "learning_rate": 0.0001, "loss": 9.0775, "loss/crossentropy": 2.1748669266700746, "loss/hidden": 4.206640625, "loss/jsd": 0.0, "loss/logits": 0.32070644982159136, "step": 5410 }, { "epoch": 0.18066666666666667, "grad_norm": 43.0, "grad_norm_var": 7.449739583333334, "learning_rate": 0.0001, "loss": 9.0152, "loss/crossentropy": 2.113310632109642, "loss/hidden": 4.053125, "loss/jsd": 0.0, "loss/logits": 0.25953211821615696, "step": 5420 }, { "epoch": 0.181, "grad_norm": 34.5, "grad_norm_var": 11.134375, "learning_rate": 0.0001, "loss": 8.7841, "loss/crossentropy": 2.132595753669739, "loss/hidden": 3.9265625, "loss/jsd": 0.0, "loss/logits": 0.2528179976157844, "step": 5430 }, { "epoch": 0.18133333333333335, "grad_norm": 33.5, "grad_norm_var": 3.84765625, "learning_rate": 0.0001, "loss": 8.8766, "loss/crossentropy": 2.1760675475001334, "loss/hidden": 4.0890625, "loss/jsd": 0.0, "loss/logits": 0.2604052824899554, "step": 5440 }, { "epoch": 0.18166666666666667, "grad_norm": 33.75, "grad_norm_var": 4.217643229166667, "learning_rate": 0.0001, "loss": 8.9479, "loss/crossentropy": 2.10125227868557, "loss/hidden": 4.119140625, "loss/jsd": 0.0, "loss/logits": 0.275428506731987, "step": 5450 }, { "epoch": 0.182, "grad_norm": 36.75, "grad_norm_var": 3.2603515625, "learning_rate": 0.0001, "loss": 8.8121, "loss/crossentropy": 2.0651059225201607, "loss/hidden": 3.9671875, "loss/jsd": 0.0, "loss/logits": 0.26424469240009785, "step": 5460 }, { "epoch": 0.18233333333333332, "grad_norm": 41.5, "grad_norm_var": 55.25807291666667, "learning_rate": 0.0001, "loss": 8.9347, "loss/crossentropy": 2.1086655259132385, "loss/hidden": 4.075, "loss/jsd": 0.0, "loss/logits": 0.2710310023277998, "step": 5470 }, { "epoch": 0.18266666666666667, "grad_norm": 31.25, "grad_norm_var": 128.47858072916668, "learning_rate": 0.0001, "loss": 9.0253, "loss/crossentropy": 2.069367530941963, "loss/hidden": 4.250390625, "loss/jsd": 0.0, "loss/logits": 0.26114317737519743, "step": 5480 }, { "epoch": 0.183, "grad_norm": 34.25, "grad_norm_var": 97.15670572916666, "learning_rate": 0.0001, "loss": 8.9139, "loss/crossentropy": 2.1907971248030664, "loss/hidden": 4.216015625, "loss/jsd": 0.0, "loss/logits": 0.301979548484087, "step": 5490 }, { "epoch": 0.18333333333333332, "grad_norm": 33.0, "grad_norm_var": 1.647261910309955e+18, "learning_rate": 0.0001, "loss": 8.9109, "loss/crossentropy": 2.268464684486389, "loss/hidden": 4.323828125, "loss/jsd": 0.0, "loss/logits": 0.3525669999420643, "step": 5500 }, { "epoch": 0.18366666666666667, "grad_norm": 32.0, "grad_norm_var": 1.6472619102618255e+18, "learning_rate": 0.0001, "loss": 8.7968, "loss/crossentropy": 2.2150946646928786, "loss/hidden": 3.966015625, "loss/jsd": 0.0, "loss/logits": 0.25991535745561123, "step": 5510 }, { "epoch": 0.184, "grad_norm": 46.0, "grad_norm_var": 40.7556640625, "learning_rate": 0.0001, "loss": 8.8022, "loss/crossentropy": 1.9760248348116876, "loss/hidden": 3.994140625, "loss/jsd": 0.0, "loss/logits": 0.26316012144088746, "step": 5520 }, { "epoch": 0.18433333333333332, "grad_norm": 33.25, "grad_norm_var": 38.213541666666664, "learning_rate": 0.0001, "loss": 8.9662, "loss/crossentropy": 2.1715099826455115, "loss/hidden": 4.0765625, "loss/jsd": 0.0, "loss/logits": 0.2730803471058607, "step": 5530 }, { "epoch": 0.18466666666666667, "grad_norm": 37.0, "grad_norm_var": 16.27265625, "learning_rate": 0.0001, "loss": 8.8223, "loss/crossentropy": 2.039857251942158, "loss/hidden": 3.984765625, "loss/jsd": 0.0, "loss/logits": 0.2515922043472528, "step": 5540 }, { "epoch": 0.185, "grad_norm": 35.25, "grad_norm_var": 25.537239583333335, "learning_rate": 0.0001, "loss": 8.9861, "loss/crossentropy": 2.2859031215310095, "loss/hidden": 4.180078125, "loss/jsd": 0.0, "loss/logits": 0.29738733656704425, "step": 5550 }, { "epoch": 0.18533333333333332, "grad_norm": 36.75, "grad_norm_var": 30.639322916666668, "learning_rate": 0.0001, "loss": 8.6376, "loss/crossentropy": 2.193627268075943, "loss/hidden": 4.059765625, "loss/jsd": 0.0, "loss/logits": 0.2650878496468067, "step": 5560 }, { "epoch": 0.18566666666666667, "grad_norm": 45.5, "grad_norm_var": 41.41223958333333, "learning_rate": 0.0001, "loss": 8.7447, "loss/crossentropy": 2.255605274438858, "loss/hidden": 4.147265625, "loss/jsd": 0.0, "loss/logits": 0.29602186791598795, "step": 5570 }, { "epoch": 0.186, "grad_norm": 33.25, "grad_norm_var": 23.320572916666666, "learning_rate": 0.0001, "loss": 8.7573, "loss/crossentropy": 1.9119407512247562, "loss/hidden": 4.146484375, "loss/jsd": 0.0, "loss/logits": 0.25584258073940874, "step": 5580 }, { "epoch": 0.18633333333333332, "grad_norm": 33.0, "grad_norm_var": 12.671875, "learning_rate": 0.0001, "loss": 8.8568, "loss/crossentropy": 2.20727731436491, "loss/hidden": 4.006640625, "loss/jsd": 0.0, "loss/logits": 0.2666482891887426, "step": 5590 }, { "epoch": 0.18666666666666668, "grad_norm": 34.5, "grad_norm_var": 17.099739583333335, "learning_rate": 0.0001, "loss": 8.7688, "loss/crossentropy": 2.250939354300499, "loss/hidden": 4.031640625, "loss/jsd": 0.0, "loss/logits": 0.27695418410003186, "step": 5600 }, { "epoch": 0.187, "grad_norm": 34.25, "grad_norm_var": 55.0259765625, "learning_rate": 0.0001, "loss": 8.7403, "loss/crossentropy": 2.123698775470257, "loss/hidden": 4.128515625, "loss/jsd": 0.0, "loss/logits": 0.266297522559762, "step": 5610 }, { "epoch": 0.18733333333333332, "grad_norm": 37.5, "grad_norm_var": 9.993684895833333, "learning_rate": 0.0001, "loss": 8.7692, "loss/crossentropy": 1.985411663353443, "loss/hidden": 4.025390625, "loss/jsd": 0.0, "loss/logits": 0.24248458091169595, "step": 5620 }, { "epoch": 0.18766666666666668, "grad_norm": 33.25, "grad_norm_var": 9.817122395833334, "learning_rate": 0.0001, "loss": 8.8696, "loss/crossentropy": 2.1818307891488073, "loss/hidden": 4.0328125, "loss/jsd": 0.0, "loss/logits": 0.26730893813073636, "step": 5630 }, { "epoch": 0.188, "grad_norm": 34.75, "grad_norm_var": 2.0837890625, "learning_rate": 0.0001, "loss": 8.6818, "loss/crossentropy": 1.9562100693583488, "loss/hidden": 3.86953125, "loss/jsd": 0.0, "loss/logits": 0.2361563365906477, "step": 5640 }, { "epoch": 0.18833333333333332, "grad_norm": 35.0, "grad_norm_var": 8.556705729166667, "learning_rate": 0.0001, "loss": 8.7822, "loss/crossentropy": 2.2117529645562173, "loss/hidden": 4.125390625, "loss/jsd": 0.0, "loss/logits": 0.27936047930270436, "step": 5650 }, { "epoch": 0.18866666666666668, "grad_norm": 37.0, "grad_norm_var": 18.499934895833334, "learning_rate": 0.0001, "loss": 8.8269, "loss/crossentropy": 2.1983281478285788, "loss/hidden": 4.016015625, "loss/jsd": 0.0, "loss/logits": 0.27177377715706824, "step": 5660 }, { "epoch": 0.189, "grad_norm": 33.5, "grad_norm_var": 53.44348958333333, "learning_rate": 0.0001, "loss": 8.9078, "loss/crossentropy": 2.1987064227461817, "loss/hidden": 4.033984375, "loss/jsd": 0.0, "loss/logits": 0.2688568111509085, "step": 5670 }, { "epoch": 0.18933333333333333, "grad_norm": 39.5, "grad_norm_var": 79.27682291666666, "learning_rate": 0.0001, "loss": 8.9224, "loss/crossentropy": 2.0661555171012878, "loss/hidden": 4.004296875, "loss/jsd": 0.0, "loss/logits": 0.26280424017459153, "step": 5680 }, { "epoch": 0.18966666666666668, "grad_norm": 34.5, "grad_norm_var": 76.48515625, "learning_rate": 0.0001, "loss": 8.8074, "loss/crossentropy": 2.2006511926651, "loss/hidden": 3.94453125, "loss/jsd": 0.0, "loss/logits": 0.2695758603513241, "step": 5690 }, { "epoch": 0.19, "grad_norm": 29.625, "grad_norm_var": 14.560872395833334, "learning_rate": 0.0001, "loss": 8.7091, "loss/crossentropy": 2.1932696878910063, "loss/hidden": 3.92578125, "loss/jsd": 0.0, "loss/logits": 0.2508305963128805, "step": 5700 }, { "epoch": 0.19033333333333333, "grad_norm": 30.125, "grad_norm_var": 29.80390625, "learning_rate": 0.0001, "loss": 8.8264, "loss/crossentropy": 1.9624864727258682, "loss/hidden": 4.17421875, "loss/jsd": 0.0, "loss/logits": 0.270268096216023, "step": 5710 }, { "epoch": 0.19066666666666668, "grad_norm": 34.5, "grad_norm_var": 10.429622395833333, "learning_rate": 0.0001, "loss": 8.7384, "loss/crossentropy": 2.1305344730615614, "loss/hidden": 3.949609375, "loss/jsd": 0.0, "loss/logits": 0.25091919098049403, "step": 5720 }, { "epoch": 0.191, "grad_norm": 31.125, "grad_norm_var": 10.763997395833334, "learning_rate": 0.0001, "loss": 8.7249, "loss/crossentropy": 2.138452613353729, "loss/hidden": 4.002734375, "loss/jsd": 0.0, "loss/logits": 0.2561824645847082, "step": 5730 }, { "epoch": 0.19133333333333333, "grad_norm": 36.25, "grad_norm_var": 10.508268229166667, "learning_rate": 0.0001, "loss": 8.7972, "loss/crossentropy": 2.291805052757263, "loss/hidden": 4.128125, "loss/jsd": 0.0, "loss/logits": 0.2823994573205709, "step": 5740 }, { "epoch": 0.19166666666666668, "grad_norm": 33.75, "grad_norm_var": 6.726822916666666, "learning_rate": 0.0001, "loss": 8.7028, "loss/crossentropy": 2.0102537497878075, "loss/hidden": 3.940234375, "loss/jsd": 0.0, "loss/logits": 0.2453432971611619, "step": 5750 }, { "epoch": 0.192, "grad_norm": 33.5, "grad_norm_var": 6.389518229166667, "learning_rate": 0.0001, "loss": 8.7465, "loss/crossentropy": 2.1731634236872197, "loss/hidden": 3.894921875, "loss/jsd": 0.0, "loss/logits": 0.24393599089235068, "step": 5760 }, { "epoch": 0.19233333333333333, "grad_norm": 40.5, "grad_norm_var": 9.5837890625, "learning_rate": 0.0001, "loss": 8.7436, "loss/crossentropy": 1.9870410725474357, "loss/hidden": 4.0390625, "loss/jsd": 0.0, "loss/logits": 0.25361278727650644, "step": 5770 }, { "epoch": 0.19266666666666668, "grad_norm": 42.0, "grad_norm_var": 14.058333333333334, "learning_rate": 0.0001, "loss": 8.7938, "loss/crossentropy": 1.954255884513259, "loss/hidden": 4.16328125, "loss/jsd": 0.0, "loss/logits": 0.27968817451037464, "step": 5780 }, { "epoch": 0.193, "grad_norm": 53.5, "grad_norm_var": 26.7869140625, "learning_rate": 0.0001, "loss": 8.8003, "loss/crossentropy": 2.2312229365110396, "loss/hidden": 3.932421875, "loss/jsd": 0.0, "loss/logits": 0.25941078290343283, "step": 5790 }, { "epoch": 0.19333333333333333, "grad_norm": 30.375, "grad_norm_var": 26.9681640625, "learning_rate": 0.0001, "loss": 8.6889, "loss/crossentropy": 2.2241889007389544, "loss/hidden": 4.01640625, "loss/jsd": 0.0, "loss/logits": 0.25501362718641757, "step": 5800 }, { "epoch": 0.19366666666666665, "grad_norm": 34.25, "grad_norm_var": 6.756705729166667, "learning_rate": 0.0001, "loss": 8.6629, "loss/crossentropy": 2.168029661476612, "loss/hidden": 4.10390625, "loss/jsd": 0.0, "loss/logits": 0.2718012981116772, "step": 5810 }, { "epoch": 0.194, "grad_norm": 32.75, "grad_norm_var": 4.945833333333334, "learning_rate": 0.0001, "loss": 8.7136, "loss/crossentropy": 2.1019906878471373, "loss/hidden": 3.940234375, "loss/jsd": 0.0, "loss/logits": 0.25093156583607196, "step": 5820 }, { "epoch": 0.19433333333333333, "grad_norm": 35.75, "grad_norm_var": 9.555989583333334, "learning_rate": 0.0001, "loss": 8.6915, "loss/crossentropy": 1.9436087012290955, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.25613378193229436, "step": 5830 }, { "epoch": 0.19466666666666665, "grad_norm": 33.25, "grad_norm_var": 28.499739583333334, "learning_rate": 0.0001, "loss": 8.8635, "loss/crossentropy": 2.1651584833860396, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.2904574632644653, "step": 5840 }, { "epoch": 0.195, "grad_norm": 36.0, "grad_norm_var": 21.8806640625, "learning_rate": 0.0001, "loss": 8.8006, "loss/crossentropy": 2.1804804012179373, "loss/hidden": 3.935546875, "loss/jsd": 0.0, "loss/logits": 0.27036979123950006, "step": 5850 }, { "epoch": 0.19533333333333333, "grad_norm": 34.25, "grad_norm_var": 5.287434895833333, "learning_rate": 0.0001, "loss": 8.7782, "loss/crossentropy": 2.218150386214256, "loss/hidden": 4.070703125, "loss/jsd": 0.0, "loss/logits": 0.2726810619235039, "step": 5860 }, { "epoch": 0.19566666666666666, "grad_norm": 34.0, "grad_norm_var": 6.895572916666667, "learning_rate": 0.0001, "loss": 8.7639, "loss/crossentropy": 1.987610936909914, "loss/hidden": 3.981640625, "loss/jsd": 0.0, "loss/logits": 0.2631619516760111, "step": 5870 }, { "epoch": 0.196, "grad_norm": 34.25, "grad_norm_var": 7.1416015625, "learning_rate": 0.0001, "loss": 8.8358, "loss/crossentropy": 2.0127531036734583, "loss/hidden": 3.98828125, "loss/jsd": 0.0, "loss/logits": 0.24521693456918, "step": 5880 }, { "epoch": 0.19633333333333333, "grad_norm": 37.0, "grad_norm_var": 2.460724587808127e+18, "learning_rate": 0.0001, "loss": 8.7964, "loss/crossentropy": 2.158496895432472, "loss/hidden": 4.165625, "loss/jsd": 0.0, "loss/logits": 0.27088434621691704, "step": 5890 }, { "epoch": 0.19666666666666666, "grad_norm": 37.5, "grad_norm_var": 2.460724587657796e+18, "learning_rate": 0.0001, "loss": 8.7617, "loss/crossentropy": 2.0347786456346513, "loss/hidden": 3.994921875, "loss/jsd": 0.0, "loss/logits": 0.260778752155602, "step": 5900 }, { "epoch": 0.197, "grad_norm": 34.75, "grad_norm_var": 3.6489583333333333, "learning_rate": 0.0001, "loss": 8.6594, "loss/crossentropy": 1.9388806536793708, "loss/hidden": 4.1, "loss/jsd": 0.0, "loss/logits": 0.25395537763834, "step": 5910 }, { "epoch": 0.19733333333333333, "grad_norm": 32.75, "grad_norm_var": 6.189518229166667, "learning_rate": 0.0001, "loss": 8.7194, "loss/crossentropy": 2.3467346012592314, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.27363166082650425, "step": 5920 }, { "epoch": 0.19766666666666666, "grad_norm": 28.25, "grad_norm_var": 6.940559895833333, "learning_rate": 0.0001, "loss": 8.7149, "loss/crossentropy": 2.1685155972838404, "loss/hidden": 4.040625, "loss/jsd": 0.0, "loss/logits": 0.2792276293039322, "step": 5930 }, { "epoch": 0.198, "grad_norm": 44.0, "grad_norm_var": 14.130143229166666, "learning_rate": 0.0001, "loss": 8.9314, "loss/crossentropy": 2.12467120885849, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.2669179428368807, "step": 5940 }, { "epoch": 0.19833333333333333, "grad_norm": 28.5, "grad_norm_var": 16.2056640625, "learning_rate": 0.0001, "loss": 8.8839, "loss/crossentropy": 2.1688647463917734, "loss/hidden": 3.9515625, "loss/jsd": 0.0, "loss/logits": 0.26754092089831827, "step": 5950 }, { "epoch": 0.19866666666666666, "grad_norm": 36.0, "grad_norm_var": 20.826041666666665, "learning_rate": 0.0001, "loss": 8.8604, "loss/crossentropy": 2.182598438858986, "loss/hidden": 4.00234375, "loss/jsd": 0.0, "loss/logits": 0.2616792509332299, "step": 5960 }, { "epoch": 0.199, "grad_norm": 34.0, "grad_norm_var": 18.274934895833333, "learning_rate": 0.0001, "loss": 8.7711, "loss/crossentropy": 2.132971841096878, "loss/hidden": 4.112890625, "loss/jsd": 0.0, "loss/logits": 0.3083286764100194, "step": 5970 }, { "epoch": 0.19933333333333333, "grad_norm": 29.75, "grad_norm_var": 7.208333333333333, "learning_rate": 0.0001, "loss": 8.7614, "loss/crossentropy": 2.1833253771066667, "loss/hidden": 3.871484375, "loss/jsd": 0.0, "loss/logits": 0.24754995480179787, "step": 5980 }, { "epoch": 0.19966666666666666, "grad_norm": 31.0, "grad_norm_var": 30.2509765625, "learning_rate": 0.0001, "loss": 8.6977, "loss/crossentropy": 2.2232595421373844, "loss/hidden": 3.9734375, "loss/jsd": 0.0, "loss/logits": 0.2633102308958769, "step": 5990 }, { "epoch": 0.2, "grad_norm": 38.25, "grad_norm_var": 27.7150390625, "learning_rate": 0.0001, "loss": 8.8047, "loss/crossentropy": 2.0881299793720247, "loss/hidden": 4.12109375, "loss/jsd": 0.0, "loss/logits": 0.2889344684779644, "step": 6000 }, { "epoch": 0.20033333333333334, "grad_norm": 30.375, "grad_norm_var": 8.056705729166667, "learning_rate": 0.0001, "loss": 8.6569, "loss/crossentropy": 2.0547440201044083, "loss/hidden": 3.99921875, "loss/jsd": 0.0, "loss/logits": 0.24832881577312946, "step": 6010 }, { "epoch": 0.20066666666666666, "grad_norm": 38.0, "grad_norm_var": 7.056184895833334, "learning_rate": 0.0001, "loss": 8.7794, "loss/crossentropy": 2.1193090736866, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.2597900453954935, "step": 6020 }, { "epoch": 0.201, "grad_norm": 32.25, "grad_norm_var": 5.3025390625, "learning_rate": 0.0001, "loss": 8.6794, "loss/crossentropy": 2.2494138766080143, "loss/hidden": 3.8703125, "loss/jsd": 0.0, "loss/logits": 0.2641505628824234, "step": 6030 }, { "epoch": 0.20133333333333334, "grad_norm": 4932501504.0, "grad_norm_var": 1.5205981723933279e+18, "learning_rate": 0.0001, "loss": 8.9463, "loss/crossentropy": 2.3256581157445906, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.26595249325037, "step": 6040 }, { "epoch": 0.20166666666666666, "grad_norm": 37.0, "grad_norm_var": 1.52059817108827e+18, "learning_rate": 0.0001, "loss": 8.8264, "loss/crossentropy": 2.053881608694792, "loss/hidden": 3.95546875, "loss/jsd": 0.0, "loss/logits": 0.24859177209436895, "step": 6050 }, { "epoch": 0.202, "grad_norm": 31.125, "grad_norm_var": 4.933268229166667, "learning_rate": 0.0001, "loss": 8.7041, "loss/crossentropy": 2.1878477543592454, "loss/hidden": 3.91875, "loss/jsd": 0.0, "loss/logits": 0.2409949317574501, "step": 6060 }, { "epoch": 0.20233333333333334, "grad_norm": 33.0, "grad_norm_var": 351.74140625, "learning_rate": 0.0001, "loss": 8.7973, "loss/crossentropy": 2.1990287870168688, "loss/hidden": 4.031640625, "loss/jsd": 0.0, "loss/logits": 0.2801033824682236, "step": 6070 }, { "epoch": 0.20266666666666666, "grad_norm": 30.375, "grad_norm_var": 63.73125, "learning_rate": 0.0001, "loss": 8.7133, "loss/crossentropy": 2.2455021925270557, "loss/hidden": 3.989453125, "loss/jsd": 0.0, "loss/logits": 0.27408372741192577, "step": 6080 }, { "epoch": 0.203, "grad_norm": 41.5, "grad_norm_var": 15.795572916666666, "learning_rate": 0.0001, "loss": 8.7392, "loss/crossentropy": 2.005758151039481, "loss/hidden": 3.888671875, "loss/jsd": 0.0, "loss/logits": 0.24863583762198688, "step": 6090 }, { "epoch": 0.20333333333333334, "grad_norm": 31.125, "grad_norm_var": 6.892122395833334, "learning_rate": 0.0001, "loss": 8.6557, "loss/crossentropy": 2.0756575793027876, "loss/hidden": 3.941796875, "loss/jsd": 0.0, "loss/logits": 0.25259528737515213, "step": 6100 }, { "epoch": 0.20366666666666666, "grad_norm": 35.75, "grad_norm_var": 3.395768229166667, "learning_rate": 0.0001, "loss": 8.8129, "loss/crossentropy": 2.1710173338651657, "loss/hidden": 4.084375, "loss/jsd": 0.0, "loss/logits": 0.24820818062871694, "step": 6110 }, { "epoch": 0.204, "grad_norm": 33.75, "grad_norm_var": 14.166080729166667, "learning_rate": 0.0001, "loss": 8.6818, "loss/crossentropy": 2.2217075169086455, "loss/hidden": 4.032421875, "loss/jsd": 0.0, "loss/logits": 0.2922105029225349, "step": 6120 }, { "epoch": 0.20433333333333334, "grad_norm": 36.75, "grad_norm_var": 15.454166666666667, "learning_rate": 0.0001, "loss": 8.7605, "loss/crossentropy": 2.103864422440529, "loss/hidden": 3.80625, "loss/jsd": 0.0, "loss/logits": 0.2378301707096398, "step": 6130 }, { "epoch": 0.20466666666666666, "grad_norm": 34.25, "grad_norm_var": 5.6337890625, "learning_rate": 0.0001, "loss": 8.5937, "loss/crossentropy": 2.066901922225952, "loss/hidden": 3.96796875, "loss/jsd": 0.0, "loss/logits": 0.24710494233295321, "step": 6140 }, { "epoch": 0.205, "grad_norm": 39.0, "grad_norm_var": 13.94765625, "learning_rate": 0.0001, "loss": 8.7255, "loss/crossentropy": 2.068124470114708, "loss/hidden": 3.91015625, "loss/jsd": 0.0, "loss/logits": 0.25081984605640173, "step": 6150 }, { "epoch": 0.20533333333333334, "grad_norm": 30.125, "grad_norm_var": 23.54140625, "learning_rate": 0.0001, "loss": 8.759, "loss/crossentropy": 2.2413846030831337, "loss/hidden": 4.1171875, "loss/jsd": 0.0, "loss/logits": 0.30093136746436355, "step": 6160 }, { "epoch": 0.20566666666666666, "grad_norm": 41.0, "grad_norm_var": 8.30390625, "learning_rate": 0.0001, "loss": 8.6887, "loss/crossentropy": 2.060081334412098, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.24737574942409993, "step": 6170 }, { "epoch": 0.206, "grad_norm": 33.5, "grad_norm_var": 9.3087890625, "learning_rate": 0.0001, "loss": 8.7567, "loss/crossentropy": 2.1451657354831695, "loss/hidden": 4.12265625, "loss/jsd": 0.0, "loss/logits": 0.2983713150024414, "step": 6180 }, { "epoch": 0.20633333333333334, "grad_norm": 40.75, "grad_norm_var": 76.71399739583333, "learning_rate": 0.0001, "loss": 8.715, "loss/crossentropy": 2.2015042565762997, "loss/hidden": 4.01796875, "loss/jsd": 0.0, "loss/logits": 0.27279378157109024, "step": 6190 }, { "epoch": 0.20666666666666667, "grad_norm": 34.75, "grad_norm_var": 80.88958333333333, "learning_rate": 0.0001, "loss": 8.6619, "loss/crossentropy": 2.127535600960255, "loss/hidden": 3.954296875, "loss/jsd": 0.0, "loss/logits": 0.2594041220843792, "step": 6200 }, { "epoch": 0.207, "grad_norm": 32.0, "grad_norm_var": 7.032747395833334, "learning_rate": 0.0001, "loss": 8.7577, "loss/crossentropy": 2.1385881870985033, "loss/hidden": 3.964453125, "loss/jsd": 0.0, "loss/logits": 0.2612423226237297, "step": 6210 }, { "epoch": 0.20733333333333334, "grad_norm": 32.5, "grad_norm_var": 11.701041666666667, "learning_rate": 0.0001, "loss": 8.688, "loss/crossentropy": 2.0760431602597236, "loss/hidden": 3.905859375, "loss/jsd": 0.0, "loss/logits": 0.2630361717194319, "step": 6220 }, { "epoch": 0.20766666666666667, "grad_norm": 37.25, "grad_norm_var": 22.809375, "learning_rate": 0.0001, "loss": 8.5522, "loss/crossentropy": 2.135862450301647, "loss/hidden": 3.986328125, "loss/jsd": 0.0, "loss/logits": 0.2494693139567971, "step": 6230 }, { "epoch": 0.208, "grad_norm": 34.5, "grad_norm_var": 16.30390625, "learning_rate": 0.0001, "loss": 8.6282, "loss/crossentropy": 2.0777343571186067, "loss/hidden": 4.07109375, "loss/jsd": 0.0, "loss/logits": 0.2899452358484268, "step": 6240 }, { "epoch": 0.20833333333333334, "grad_norm": 30.5, "grad_norm_var": 5.9728515625, "learning_rate": 0.0001, "loss": 8.7862, "loss/crossentropy": 2.2019074261188507, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.25109023675322534, "step": 6250 }, { "epoch": 0.20866666666666667, "grad_norm": 31.5, "grad_norm_var": 5.348372395833334, "learning_rate": 0.0001, "loss": 8.6853, "loss/crossentropy": 2.2050928086042405, "loss/hidden": 3.96953125, "loss/jsd": 0.0, "loss/logits": 0.2619493114762008, "step": 6260 }, { "epoch": 0.209, "grad_norm": 31.875, "grad_norm_var": 4.7322265625, "learning_rate": 0.0001, "loss": 8.5978, "loss/crossentropy": 2.322963085025549, "loss/hidden": 3.87578125, "loss/jsd": 0.0, "loss/logits": 0.26414704993367194, "step": 6270 }, { "epoch": 0.20933333333333334, "grad_norm": 6845104128.0, "grad_norm_var": 2.9284656289268367e+18, "learning_rate": 0.0001, "loss": 8.7111, "loss/crossentropy": 1.9891023762524127, "loss/hidden": 4.237890625, "loss/jsd": 0.0, "loss/logits": 0.24320064708590508, "step": 6280 }, { "epoch": 0.20966666666666667, "grad_norm": 42.25, "grad_norm_var": 2.928465627087215e+18, "learning_rate": 0.0001, "loss": 8.8478, "loss/crossentropy": 2.1030918568372727, "loss/hidden": 3.91796875, "loss/jsd": 0.0, "loss/logits": 0.2577778071165085, "step": 6290 }, { "epoch": 0.21, "grad_norm": 33.75, "grad_norm_var": 9.833333333333334, "learning_rate": 0.0001, "loss": 8.7841, "loss/crossentropy": 2.1194095268845556, "loss/hidden": 3.93515625, "loss/jsd": 0.0, "loss/logits": 0.25438457299023864, "step": 6300 }, { "epoch": 0.21033333333333334, "grad_norm": 38.0, "grad_norm_var": 5.684830729166666, "learning_rate": 0.0001, "loss": 8.6073, "loss/crossentropy": 2.1838410973548887, "loss/hidden": 3.905078125, "loss/jsd": 0.0, "loss/logits": 0.2590842802077532, "step": 6310 }, { "epoch": 0.21066666666666667, "grad_norm": 32.25, "grad_norm_var": 14.130989583333333, "learning_rate": 0.0001, "loss": 8.6961, "loss/crossentropy": 2.1097051329910754, "loss/hidden": 3.87265625, "loss/jsd": 0.0, "loss/logits": 0.2509492003358901, "step": 6320 }, { "epoch": 0.211, "grad_norm": 32.75, "grad_norm_var": 7.970572916666667, "learning_rate": 0.0001, "loss": 8.758, "loss/crossentropy": 2.1945007756352424, "loss/hidden": 3.985546875, "loss/jsd": 0.0, "loss/logits": 0.25893947295844555, "step": 6330 }, { "epoch": 0.21133333333333335, "grad_norm": 36.0, "grad_norm_var": 1.7789921967526118e+18, "learning_rate": 0.0001, "loss": 8.905, "loss/crossentropy": 2.058439862728119, "loss/hidden": 4.17421875, "loss/jsd": 0.0, "loss/logits": 0.26833444014191626, "step": 6340 }, { "epoch": 0.21166666666666667, "grad_norm": 32.0, "grad_norm_var": 104.33229166666666, "learning_rate": 0.0001, "loss": 8.6526, "loss/crossentropy": 1.890061966329813, "loss/hidden": 4.045703125, "loss/jsd": 0.0, "loss/logits": 0.2450747612863779, "step": 6350 }, { "epoch": 0.212, "grad_norm": 29.625, "grad_norm_var": 22.006184895833332, "learning_rate": 0.0001, "loss": 8.7864, "loss/crossentropy": 2.1768174074590205, "loss/hidden": 4.052734375, "loss/jsd": 0.0, "loss/logits": 0.24862836562097074, "step": 6360 }, { "epoch": 0.21233333333333335, "grad_norm": 34.75, "grad_norm_var": 99.80598958333333, "learning_rate": 0.0001, "loss": 8.6528, "loss/crossentropy": 2.2797497868537904, "loss/hidden": 3.957421875, "loss/jsd": 0.0, "loss/logits": 0.26709459256380796, "step": 6370 }, { "epoch": 0.21266666666666667, "grad_norm": 32.5, "grad_norm_var": 86.98274739583333, "learning_rate": 0.0001, "loss": 8.6966, "loss/crossentropy": 2.178654319047928, "loss/hidden": 4.08046875, "loss/jsd": 0.0, "loss/logits": 0.2537883473560214, "step": 6380 }, { "epoch": 0.213, "grad_norm": 29.625, "grad_norm_var": 13.905208333333333, "learning_rate": 0.0001, "loss": 8.705, "loss/crossentropy": 2.0725951939821243, "loss/hidden": 3.972265625, "loss/jsd": 0.0, "loss/logits": 0.25900917164981363, "step": 6390 }, { "epoch": 0.21333333333333335, "grad_norm": 34.75, "grad_norm_var": 6.137239583333334, "learning_rate": 0.0001, "loss": 8.7598, "loss/crossentropy": 2.3110455900430678, "loss/hidden": 4.031640625, "loss/jsd": 0.0, "loss/logits": 0.28035171553492544, "step": 6400 }, { "epoch": 0.21366666666666667, "grad_norm": 35.0, "grad_norm_var": 10.5744140625, "learning_rate": 0.0001, "loss": 8.7293, "loss/crossentropy": 2.2058258563280106, "loss/hidden": 3.8546875, "loss/jsd": 0.0, "loss/logits": 0.2550595965236425, "step": 6410 }, { "epoch": 0.214, "grad_norm": 35.25, "grad_norm_var": 8.562239583333334, "learning_rate": 0.0001, "loss": 8.7376, "loss/crossentropy": 2.150018022954464, "loss/hidden": 3.87890625, "loss/jsd": 0.0, "loss/logits": 0.2502748826518655, "step": 6420 }, { "epoch": 0.21433333333333332, "grad_norm": 34.5, "grad_norm_var": 1.9860874111488097e+18, "learning_rate": 0.0001, "loss": 8.7269, "loss/crossentropy": 2.2726529754698275, "loss/hidden": 3.92578125, "loss/jsd": 0.0, "loss/logits": 0.2701293595135212, "step": 6430 }, { "epoch": 0.21466666666666667, "grad_norm": 34.0, "grad_norm_var": 6.182747395833333, "learning_rate": 0.0001, "loss": 8.5577, "loss/crossentropy": 2.217649821192026, "loss/hidden": 3.965234375, "loss/jsd": 0.0, "loss/logits": 0.2551640780642629, "step": 6440 }, { "epoch": 0.215, "grad_norm": 39.5, "grad_norm_var": 8.399739583333334, "learning_rate": 0.0001, "loss": 8.5535, "loss/crossentropy": 2.080750811100006, "loss/hidden": 3.88671875, "loss/jsd": 0.0, "loss/logits": 0.23864807337522506, "step": 6450 }, { "epoch": 0.21533333333333332, "grad_norm": 31.125, "grad_norm_var": 5.915559895833334, "learning_rate": 0.0001, "loss": 8.5244, "loss/crossentropy": 2.1134666696190836, "loss/hidden": 3.774609375, "loss/jsd": 0.0, "loss/logits": 0.23039107713848353, "step": 6460 }, { "epoch": 0.21566666666666667, "grad_norm": 34.5, "grad_norm_var": 12.49765625, "learning_rate": 0.0001, "loss": 8.6924, "loss/crossentropy": 2.2306397944688796, "loss/hidden": 4.040234375, "loss/jsd": 0.0, "loss/logits": 0.28511182554066183, "step": 6470 }, { "epoch": 0.216, "grad_norm": 38.75, "grad_norm_var": 8.626497395833333, "learning_rate": 0.0001, "loss": 8.7111, "loss/crossentropy": 2.084462544322014, "loss/hidden": 4.030859375, "loss/jsd": 0.0, "loss/logits": 0.2595676215365529, "step": 6480 }, { "epoch": 0.21633333333333332, "grad_norm": 36.5, "grad_norm_var": 12.863997395833334, "learning_rate": 0.0001, "loss": 8.6783, "loss/crossentropy": 2.2381670981645585, "loss/hidden": 3.99375, "loss/jsd": 0.0, "loss/logits": 0.272398603707552, "step": 6490 }, { "epoch": 0.21666666666666667, "grad_norm": 34.75, "grad_norm_var": 11.4994140625, "learning_rate": 0.0001, "loss": 8.575, "loss/crossentropy": 2.169520039856434, "loss/hidden": 3.975390625, "loss/jsd": 0.0, "loss/logits": 0.26597979068756106, "step": 6500 }, { "epoch": 0.217, "grad_norm": 31.375, "grad_norm_var": 14.201822916666666, "learning_rate": 0.0001, "loss": 8.7438, "loss/crossentropy": 2.3124695271253586, "loss/hidden": 3.96484375, "loss/jsd": 0.0, "loss/logits": 0.28047500401735304, "step": 6510 }, { "epoch": 0.21733333333333332, "grad_norm": 30.625, "grad_norm_var": 18.00625, "learning_rate": 0.0001, "loss": 8.6633, "loss/crossentropy": 2.23331324160099, "loss/hidden": 3.9703125, "loss/jsd": 0.0, "loss/logits": 0.2535081097856164, "step": 6520 }, { "epoch": 0.21766666666666667, "grad_norm": 32.5, "grad_norm_var": 13.676041666666666, "learning_rate": 0.0001, "loss": 8.6314, "loss/crossentropy": 2.073501707613468, "loss/hidden": 4.06171875, "loss/jsd": 0.0, "loss/logits": 0.2697511712089181, "step": 6530 }, { "epoch": 0.218, "grad_norm": 36.0, "grad_norm_var": 18.245572916666667, "learning_rate": 0.0001, "loss": 8.5764, "loss/crossentropy": 2.1756048664450645, "loss/hidden": 3.879296875, "loss/jsd": 0.0, "loss/logits": 0.24116889759898186, "step": 6540 }, { "epoch": 0.21833333333333332, "grad_norm": 31.625, "grad_norm_var": 8.464322916666667, "learning_rate": 0.0001, "loss": 8.6286, "loss/crossentropy": 1.9958701081573964, "loss/hidden": 3.88671875, "loss/jsd": 0.0, "loss/logits": 0.24323785230517386, "step": 6550 }, { "epoch": 0.21866666666666668, "grad_norm": 33.5, "grad_norm_var": 6.9931640625, "learning_rate": 0.0001, "loss": 8.6222, "loss/crossentropy": 2.1122898295521737, "loss/hidden": 3.958984375, "loss/jsd": 0.0, "loss/logits": 0.27048107255250214, "step": 6560 }, { "epoch": 0.219, "grad_norm": 31.5, "grad_norm_var": 7.69765625, "learning_rate": 0.0001, "loss": 8.6798, "loss/crossentropy": 2.167936125397682, "loss/hidden": 3.95, "loss/jsd": 0.0, "loss/logits": 0.2612238049507141, "step": 6570 }, { "epoch": 0.21933333333333332, "grad_norm": 30.75, "grad_norm_var": 12.91015625, "learning_rate": 0.0001, "loss": 8.6542, "loss/crossentropy": 2.060488347709179, "loss/hidden": 3.866015625, "loss/jsd": 0.0, "loss/logits": 0.23970827981829643, "step": 6580 }, { "epoch": 0.21966666666666668, "grad_norm": 34.5, "grad_norm_var": 12.857747395833334, "learning_rate": 0.0001, "loss": 8.7149, "loss/crossentropy": 2.2109495267271995, "loss/hidden": 3.906640625, "loss/jsd": 0.0, "loss/logits": 0.2613677404820919, "step": 6590 }, { "epoch": 0.22, "grad_norm": 34.5, "grad_norm_var": 2.7032856480426143e+18, "learning_rate": 0.0001, "loss": 8.6825, "loss/crossentropy": 2.1443465147167444, "loss/hidden": 4.11484375, "loss/jsd": 0.0, "loss/logits": 0.2631368327885866, "step": 6600 }, { "epoch": 0.22033333333333333, "grad_norm": 33.25, "grad_norm_var": 37.95774739583333, "learning_rate": 0.0001, "loss": 8.6734, "loss/crossentropy": 2.1459231124259532, "loss/hidden": 3.95625, "loss/jsd": 0.0, "loss/logits": 0.2577024588827044, "step": 6610 }, { "epoch": 0.22066666666666668, "grad_norm": 30.5, "grad_norm_var": 5.280989583333334, "learning_rate": 0.0001, "loss": 8.716, "loss/crossentropy": 2.4239099472761154, "loss/hidden": 3.99921875, "loss/jsd": 0.0, "loss/logits": 0.28959855400025847, "step": 6620 }, { "epoch": 0.221, "grad_norm": 31.625, "grad_norm_var": 6.0494140625, "learning_rate": 0.0001, "loss": 8.5097, "loss/crossentropy": 1.954162660241127, "loss/hidden": 3.962890625, "loss/jsd": 0.0, "loss/logits": 0.2510778192430735, "step": 6630 }, { "epoch": 0.22133333333333333, "grad_norm": 32.5, "grad_norm_var": 4.501041666666667, "learning_rate": 0.0001, "loss": 8.5961, "loss/crossentropy": 2.131689856946468, "loss/hidden": 3.87421875, "loss/jsd": 0.0, "loss/logits": 0.24224275033921003, "step": 6640 }, { "epoch": 0.22166666666666668, "grad_norm": 34.25, "grad_norm_var": 10.9759765625, "learning_rate": 0.0001, "loss": 8.5582, "loss/crossentropy": 2.0610960900783537, "loss/hidden": 3.917578125, "loss/jsd": 0.0, "loss/logits": 0.24408777449280022, "step": 6650 }, { "epoch": 0.222, "grad_norm": 34.75, "grad_norm_var": 7.430989583333333, "learning_rate": 0.0001, "loss": 8.6865, "loss/crossentropy": 2.12496095597744, "loss/hidden": 3.811328125, "loss/jsd": 0.0, "loss/logits": 0.2433070670813322, "step": 6660 }, { "epoch": 0.22233333333333333, "grad_norm": 32.25, "grad_norm_var": 10.940559895833333, "learning_rate": 0.0001, "loss": 8.57, "loss/crossentropy": 2.1832097455859185, "loss/hidden": 3.923828125, "loss/jsd": 0.0, "loss/logits": 0.2531871374696493, "step": 6670 }, { "epoch": 0.22266666666666668, "grad_norm": 31.0, "grad_norm_var": 11.620572916666667, "learning_rate": 0.0001, "loss": 8.6083, "loss/crossentropy": 2.2909538954496385, "loss/hidden": 3.934765625, "loss/jsd": 0.0, "loss/logits": 0.26815793141722677, "step": 6680 }, { "epoch": 0.223, "grad_norm": 32.5, "grad_norm_var": 9.41875, "learning_rate": 0.0001, "loss": 8.6731, "loss/crossentropy": 2.208388736844063, "loss/hidden": 3.87265625, "loss/jsd": 0.0, "loss/logits": 0.25899204462766645, "step": 6690 }, { "epoch": 0.22333333333333333, "grad_norm": 49.25, "grad_norm_var": 24.110872395833333, "learning_rate": 0.0001, "loss": 8.7066, "loss/crossentropy": 2.1818058155477047, "loss/hidden": 4.038671875, "loss/jsd": 0.0, "loss/logits": 0.2769562091678381, "step": 6700 }, { "epoch": 0.22366666666666668, "grad_norm": 31.125, "grad_norm_var": 38.39375, "learning_rate": 0.0001, "loss": 8.7785, "loss/crossentropy": 2.1179177343845366, "loss/hidden": 4.037109375, "loss/jsd": 0.0, "loss/logits": 0.2616199808195233, "step": 6710 }, { "epoch": 0.224, "grad_norm": 31.875, "grad_norm_var": 5.8947265625, "learning_rate": 0.0001, "loss": 8.6151, "loss/crossentropy": 2.0090429857373238, "loss/hidden": 3.823046875, "loss/jsd": 0.0, "loss/logits": 0.2248888023197651, "step": 6720 }, { "epoch": 0.22433333333333333, "grad_norm": 31.0, "grad_norm_var": 55.145572916666666, "learning_rate": 0.0001, "loss": 8.7802, "loss/crossentropy": 2.1233505457639694, "loss/hidden": 3.835546875, "loss/jsd": 0.0, "loss/logits": 0.2540145181119442, "step": 6730 }, { "epoch": 0.22466666666666665, "grad_norm": 32.75, "grad_norm_var": 2.668684895833333, "learning_rate": 0.0001, "loss": 8.6411, "loss/crossentropy": 2.240196964144707, "loss/hidden": 4.01953125, "loss/jsd": 0.0, "loss/logits": 0.26511474009603264, "step": 6740 }, { "epoch": 0.225, "grad_norm": 39.25, "grad_norm_var": 7.196875, "learning_rate": 0.0001, "loss": 8.4853, "loss/crossentropy": 2.1406675301492215, "loss/hidden": 3.93125, "loss/jsd": 0.0, "loss/logits": 0.24782155379652976, "step": 6750 }, { "epoch": 0.22533333333333333, "grad_norm": 31.0, "grad_norm_var": 7.8353515625, "learning_rate": 0.0001, "loss": 8.6089, "loss/crossentropy": 2.071791734546423, "loss/hidden": 3.878515625, "loss/jsd": 0.0, "loss/logits": 0.25682480856776235, "step": 6760 }, { "epoch": 0.22566666666666665, "grad_norm": 48.0, "grad_norm_var": 1.824028194531967e+18, "learning_rate": 0.0001, "loss": 8.8247, "loss/crossentropy": 2.240962551534176, "loss/hidden": 3.891796875, "loss/jsd": 0.0, "loss/logits": 0.25645633824169634, "step": 6770 }, { "epoch": 0.226, "grad_norm": 29.5, "grad_norm_var": 3.556337769787687e+18, "learning_rate": 0.0001, "loss": 8.7193, "loss/crossentropy": 1.969706627726555, "loss/hidden": 4.058203125, "loss/jsd": 0.0, "loss/logits": 0.2760220758616924, "step": 6780 }, { "epoch": 0.22633333333333333, "grad_norm": 37.5, "grad_norm_var": 1.986087411876941e+18, "learning_rate": 0.0001, "loss": 8.718, "loss/crossentropy": 2.0835460133850576, "loss/hidden": 4.033203125, "loss/jsd": 0.0, "loss/logits": 0.27087474074214696, "step": 6790 }, { "epoch": 0.22666666666666666, "grad_norm": 37.0, "grad_norm_var": 12.514583333333333, "learning_rate": 0.0001, "loss": 8.5738, "loss/crossentropy": 2.0835996329784394, "loss/hidden": 4.028125, "loss/jsd": 0.0, "loss/logits": 0.26487845852971076, "step": 6800 }, { "epoch": 0.227, "grad_norm": 33.75, "grad_norm_var": 13.1212890625, "learning_rate": 0.0001, "loss": 8.6835, "loss/crossentropy": 2.174676289409399, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.24940601829439402, "step": 6810 }, { "epoch": 0.22733333333333333, "grad_norm": 30.0, "grad_norm_var": 30.77890625, "learning_rate": 0.0001, "loss": 8.7017, "loss/crossentropy": 1.9795321062207223, "loss/hidden": 3.940625, "loss/jsd": 0.0, "loss/logits": 0.2277604851871729, "step": 6820 }, { "epoch": 0.22766666666666666, "grad_norm": 36.0, "grad_norm_var": 29.243489583333332, "learning_rate": 0.0001, "loss": 8.6398, "loss/crossentropy": 2.0192734390497207, "loss/hidden": 3.87421875, "loss/jsd": 0.0, "loss/logits": 0.27642418537288904, "step": 6830 }, { "epoch": 0.228, "grad_norm": 31.875, "grad_norm_var": 8.983268229166667, "learning_rate": 0.0001, "loss": 8.6125, "loss/crossentropy": 2.0098339319229126, "loss/hidden": 4.141015625, "loss/jsd": 0.0, "loss/logits": 0.2576570626348257, "step": 6840 }, { "epoch": 0.22833333333333333, "grad_norm": 31.25, "grad_norm_var": 16.34765625, "learning_rate": 0.0001, "loss": 8.5918, "loss/crossentropy": 2.0969722121953964, "loss/hidden": 4.009765625, "loss/jsd": 0.0, "loss/logits": 0.27101105730980635, "step": 6850 }, { "epoch": 0.22866666666666666, "grad_norm": 30.375, "grad_norm_var": 13.826822916666666, "learning_rate": 0.0001, "loss": 8.5323, "loss/crossentropy": 2.1093384474515915, "loss/hidden": 3.993359375, "loss/jsd": 0.0, "loss/logits": 0.26533141303807495, "step": 6860 }, { "epoch": 0.229, "grad_norm": 38.75, "grad_norm_var": 5.7994140625, "learning_rate": 0.0001, "loss": 8.6041, "loss/crossentropy": 2.2051602229475975, "loss/hidden": 4.044921875, "loss/jsd": 0.0, "loss/logits": 0.27079470865428446, "step": 6870 }, { "epoch": 0.22933333333333333, "grad_norm": 36.25, "grad_norm_var": 6.357291666666667, "learning_rate": 0.0001, "loss": 8.5977, "loss/crossentropy": 2.13309033960104, "loss/hidden": 3.709765625, "loss/jsd": 0.0, "loss/logits": 0.23595739863812923, "step": 6880 }, { "epoch": 0.22966666666666666, "grad_norm": 32.0, "grad_norm_var": 5.145833333333333, "learning_rate": 0.0001, "loss": 8.485, "loss/crossentropy": 2.205070769786835, "loss/hidden": 3.733984375, "loss/jsd": 0.0, "loss/logits": 0.23339474331587554, "step": 6890 }, { "epoch": 0.23, "grad_norm": 32.25, "grad_norm_var": 12.338541666666666, "learning_rate": 0.0001, "loss": 8.5976, "loss/crossentropy": 2.162086985260248, "loss/hidden": 3.76953125, "loss/jsd": 0.0, "loss/logits": 0.23039890434592963, "step": 6900 }, { "epoch": 0.23033333333333333, "grad_norm": 41.25, "grad_norm_var": 15.233268229166667, "learning_rate": 0.0001, "loss": 8.5384, "loss/crossentropy": 2.0678157053887842, "loss/hidden": 3.982421875, "loss/jsd": 0.0, "loss/logits": 0.2481289473362267, "step": 6910 }, { "epoch": 0.23066666666666666, "grad_norm": 44.5, "grad_norm_var": 12.778580729166666, "learning_rate": 0.0001, "loss": 8.6622, "loss/crossentropy": 2.114139196276665, "loss/hidden": 3.91328125, "loss/jsd": 0.0, "loss/logits": 0.25735178850591184, "step": 6920 }, { "epoch": 0.231, "grad_norm": 34.25, "grad_norm_var": 31.0625, "learning_rate": 0.0001, "loss": 8.607, "loss/crossentropy": 2.0753188371658324, "loss/hidden": 4.03359375, "loss/jsd": 0.0, "loss/logits": 0.24515043962746857, "step": 6930 }, { "epoch": 0.23133333333333334, "grad_norm": 36.5, "grad_norm_var": 3.3962890625, "learning_rate": 0.0001, "loss": 8.4551, "loss/crossentropy": 2.1153680123388767, "loss/hidden": 3.856640625, "loss/jsd": 0.0, "loss/logits": 0.2439242374151945, "step": 6940 }, { "epoch": 0.23166666666666666, "grad_norm": 34.25, "grad_norm_var": 7.212955729166667, "learning_rate": 0.0001, "loss": 8.5571, "loss/crossentropy": 2.2401276588439942, "loss/hidden": 3.83671875, "loss/jsd": 0.0, "loss/logits": 0.2469344925135374, "step": 6950 }, { "epoch": 0.232, "grad_norm": 33.0, "grad_norm_var": 14.067643229166666, "learning_rate": 0.0001, "loss": 8.5338, "loss/crossentropy": 2.1172975957393647, "loss/hidden": 4.00703125, "loss/jsd": 0.0, "loss/logits": 0.2541731720790267, "step": 6960 }, { "epoch": 0.23233333333333334, "grad_norm": 40.75, "grad_norm_var": 8.700455729166666, "learning_rate": 0.0001, "loss": 8.5392, "loss/crossentropy": 2.144708326458931, "loss/hidden": 3.851171875, "loss/jsd": 0.0, "loss/logits": 0.2414156835526228, "step": 6970 }, { "epoch": 0.23266666666666666, "grad_norm": 37.75, "grad_norm_var": 7.82265625, "learning_rate": 0.0001, "loss": 8.4426, "loss/crossentropy": 2.066808733344078, "loss/hidden": 3.741015625, "loss/jsd": 0.0, "loss/logits": 0.23481324184685945, "step": 6980 }, { "epoch": 0.233, "grad_norm": 36.0, "grad_norm_var": 6.967708333333333, "learning_rate": 0.0001, "loss": 8.6706, "loss/crossentropy": 2.0547366201877595, "loss/hidden": 3.919140625, "loss/jsd": 0.0, "loss/logits": 0.2535815857350826, "step": 6990 }, { "epoch": 0.23333333333333334, "grad_norm": 35.75, "grad_norm_var": 16.675455729166668, "learning_rate": 0.0001, "loss": 8.5713, "loss/crossentropy": 2.146591657400131, "loss/hidden": 3.892578125, "loss/jsd": 0.0, "loss/logits": 0.26058912873268125, "step": 7000 }, { "epoch": 0.23366666666666666, "grad_norm": 35.0, "grad_norm_var": 14.667122395833333, "learning_rate": 0.0001, "loss": 8.7385, "loss/crossentropy": 2.028310924768448, "loss/hidden": 3.966015625, "loss/jsd": 0.0, "loss/logits": 0.254487244784832, "step": 7010 }, { "epoch": 0.234, "grad_norm": 33.5, "grad_norm_var": 5.967643229166667, "learning_rate": 0.0001, "loss": 8.7735, "loss/crossentropy": 2.1410273112356664, "loss/hidden": 3.881640625, "loss/jsd": 0.0, "loss/logits": 0.24528108015656472, "step": 7020 }, { "epoch": 0.23433333333333334, "grad_norm": 33.25, "grad_norm_var": 3.5994140625, "learning_rate": 0.0001, "loss": 8.5423, "loss/crossentropy": 2.0229434952139855, "loss/hidden": 3.830078125, "loss/jsd": 0.0, "loss/logits": 0.23408238925039768, "step": 7030 }, { "epoch": 0.23466666666666666, "grad_norm": 31.75, "grad_norm_var": 2.78125, "learning_rate": 0.0001, "loss": 8.6283, "loss/crossentropy": 2.1765091590583325, "loss/hidden": 3.98203125, "loss/jsd": 0.0, "loss/logits": 0.26205482967197896, "step": 7040 }, { "epoch": 0.235, "grad_norm": 31.75, "grad_norm_var": 5.094205729166666, "learning_rate": 0.0001, "loss": 8.5875, "loss/crossentropy": 2.1615469992160796, "loss/hidden": 3.816015625, "loss/jsd": 0.0, "loss/logits": 0.2432116275653243, "step": 7050 }, { "epoch": 0.23533333333333334, "grad_norm": 33.5, "grad_norm_var": 8.434830729166666, "learning_rate": 0.0001, "loss": 8.6217, "loss/crossentropy": 2.0332022219896317, "loss/hidden": 3.891015625, "loss/jsd": 0.0, "loss/logits": 0.24271235838532448, "step": 7060 }, { "epoch": 0.23566666666666666, "grad_norm": 33.0, "grad_norm_var": 80.83541666666666, "learning_rate": 0.0001, "loss": 8.6128, "loss/crossentropy": 2.094720220565796, "loss/hidden": 3.84296875, "loss/jsd": 0.0, "loss/logits": 0.22856017146259547, "step": 7070 }, { "epoch": 0.236, "grad_norm": 31.5, "grad_norm_var": 1.5458333333333334, "learning_rate": 0.0001, "loss": 8.5501, "loss/crossentropy": 2.1444082021713258, "loss/hidden": 3.938671875, "loss/jsd": 0.0, "loss/logits": 0.25706543773412704, "step": 7080 }, { "epoch": 0.23633333333333334, "grad_norm": 32.5, "grad_norm_var": 5.824739583333334, "learning_rate": 0.0001, "loss": 8.7055, "loss/crossentropy": 2.269250899553299, "loss/hidden": 4.0234375, "loss/jsd": 0.0, "loss/logits": 0.2765824764966965, "step": 7090 }, { "epoch": 0.23666666666666666, "grad_norm": 36.75, "grad_norm_var": 16.210416666666667, "learning_rate": 0.0001, "loss": 8.6437, "loss/crossentropy": 2.0246976539492607, "loss/hidden": 3.973828125, "loss/jsd": 0.0, "loss/logits": 0.24731771647930145, "step": 7100 }, { "epoch": 0.237, "grad_norm": 30.125, "grad_norm_var": 11.0166015625, "learning_rate": 0.0001, "loss": 8.5064, "loss/crossentropy": 2.1460629656910895, "loss/hidden": 3.94375, "loss/jsd": 0.0, "loss/logits": 0.24730791207402944, "step": 7110 }, { "epoch": 0.23733333333333334, "grad_norm": 32.75, "grad_norm_var": 15.567122395833334, "learning_rate": 0.0001, "loss": 8.6119, "loss/crossentropy": 2.0084538377821444, "loss/hidden": 3.947265625, "loss/jsd": 0.0, "loss/logits": 0.2514385598711669, "step": 7120 }, { "epoch": 0.23766666666666666, "grad_norm": 32.75, "grad_norm_var": 8.709830729166667, "learning_rate": 0.0001, "loss": 8.5552, "loss/crossentropy": 2.188428722321987, "loss/hidden": 3.987890625, "loss/jsd": 0.0, "loss/logits": 0.2734973944723606, "step": 7130 }, { "epoch": 0.238, "grad_norm": 33.5, "grad_norm_var": 9.23125, "learning_rate": 0.0001, "loss": 8.6669, "loss/crossentropy": 2.163307761400938, "loss/hidden": 3.998046875, "loss/jsd": 0.0, "loss/logits": 0.26316353445872664, "step": 7140 }, { "epoch": 0.23833333333333334, "grad_norm": 32.5, "grad_norm_var": 7.6744140625, "learning_rate": 0.0001, "loss": 8.6865, "loss/crossentropy": 2.0473650604486466, "loss/hidden": 3.98984375, "loss/jsd": 0.0, "loss/logits": 0.2502355322241783, "step": 7150 }, { "epoch": 0.23866666666666667, "grad_norm": 37.25, "grad_norm_var": 6.209375, "learning_rate": 0.0001, "loss": 8.5554, "loss/crossentropy": 2.10597411096096, "loss/hidden": 3.9953125, "loss/jsd": 0.0, "loss/logits": 0.2646968217566609, "step": 7160 }, { "epoch": 0.239, "grad_norm": 32.5, "grad_norm_var": 7.327083333333333, "learning_rate": 0.0001, "loss": 8.5166, "loss/crossentropy": 2.2311090558767317, "loss/hidden": 3.971875, "loss/jsd": 0.0, "loss/logits": 0.2651707552373409, "step": 7170 }, { "epoch": 0.23933333333333334, "grad_norm": 33.75, "grad_norm_var": 2.708268229166667, "learning_rate": 0.0001, "loss": 8.5943, "loss/crossentropy": 2.1325844526290894, "loss/hidden": 3.856640625, "loss/jsd": 0.0, "loss/logits": 0.25004746317863463, "step": 7180 }, { "epoch": 0.23966666666666667, "grad_norm": 33.0, "grad_norm_var": 5.72265625, "learning_rate": 0.0001, "loss": 8.5078, "loss/crossentropy": 2.127976506203413, "loss/hidden": 4.0015625, "loss/jsd": 0.0, "loss/logits": 0.25839042402803897, "step": 7190 }, { "epoch": 0.24, "grad_norm": 34.75, "grad_norm_var": 4.226822916666666, "learning_rate": 0.0001, "loss": 8.5393, "loss/crossentropy": 2.1524706527590753, "loss/hidden": 3.78359375, "loss/jsd": 0.0, "loss/logits": 0.24184909779578448, "step": 7200 }, { "epoch": 0.24033333333333334, "grad_norm": 38.0, "grad_norm_var": 24.615559895833332, "learning_rate": 0.0001, "loss": 8.7409, "loss/crossentropy": 2.0844357013702393, "loss/hidden": 4.113671875, "loss/jsd": 0.0, "loss/logits": 0.25061873607337476, "step": 7210 }, { "epoch": 0.24066666666666667, "grad_norm": 34.5, "grad_norm_var": 4.8791015625, "learning_rate": 0.0001, "loss": 8.5332, "loss/crossentropy": 2.2334757328033445, "loss/hidden": 3.786328125, "loss/jsd": 0.0, "loss/logits": 0.24229202494025232, "step": 7220 }, { "epoch": 0.241, "grad_norm": 39.75, "grad_norm_var": 5.398372395833333, "learning_rate": 0.0001, "loss": 8.6905, "loss/crossentropy": 2.095822374522686, "loss/hidden": 4.03359375, "loss/jsd": 0.0, "loss/logits": 0.276114359125495, "step": 7230 }, { "epoch": 0.24133333333333334, "grad_norm": 33.5, "grad_norm_var": 6.189322916666667, "learning_rate": 0.0001, "loss": 8.621, "loss/crossentropy": 2.083872254192829, "loss/hidden": 3.843359375, "loss/jsd": 0.0, "loss/logits": 0.25903829988092186, "step": 7240 }, { "epoch": 0.24166666666666667, "grad_norm": 40.0, "grad_norm_var": 9.3556640625, "learning_rate": 0.0001, "loss": 8.8226, "loss/crossentropy": 2.1576643377542495, "loss/hidden": 3.93046875, "loss/jsd": 0.0, "loss/logits": 0.277049720287323, "step": 7250 }, { "epoch": 0.242, "grad_norm": 46.0, "grad_norm_var": 42.713541666666664, "learning_rate": 0.0001, "loss": 8.5405, "loss/crossentropy": 2.1249560177326203, "loss/hidden": 3.844921875, "loss/jsd": 0.0, "loss/logits": 0.24358038194477558, "step": 7260 }, { "epoch": 0.24233333333333335, "grad_norm": 32.0, "grad_norm_var": 43.81666666666667, "learning_rate": 0.0001, "loss": 8.6632, "loss/crossentropy": 2.271902731060982, "loss/hidden": 3.877734375, "loss/jsd": 0.0, "loss/logits": 0.2662301120348275, "step": 7270 }, { "epoch": 0.24266666666666667, "grad_norm": 29.25, "grad_norm_var": 3.8811848958333335, "learning_rate": 0.0001, "loss": 8.4007, "loss/crossentropy": 2.037639981508255, "loss/hidden": 3.926953125, "loss/jsd": 0.0, "loss/logits": 0.2521992586553097, "step": 7280 }, { "epoch": 0.243, "grad_norm": 37.75, "grad_norm_var": 11.527018229166666, "learning_rate": 0.0001, "loss": 8.7671, "loss/crossentropy": 2.0688220985233783, "loss/hidden": 4.000390625, "loss/jsd": 0.0, "loss/logits": 0.2653772059828043, "step": 7290 }, { "epoch": 0.24333333333333335, "grad_norm": 34.75, "grad_norm_var": 9.0103515625, "learning_rate": 0.0001, "loss": 8.6682, "loss/crossentropy": 1.9946186635643244, "loss/hidden": 3.946875, "loss/jsd": 0.0, "loss/logits": 0.2328943044412881, "step": 7300 }, { "epoch": 0.24366666666666667, "grad_norm": 31.5, "grad_norm_var": 7.376497395833334, "learning_rate": 0.0001, "loss": 8.5699, "loss/crossentropy": 2.139767034351826, "loss/hidden": 3.88828125, "loss/jsd": 0.0, "loss/logits": 0.2668099632486701, "step": 7310 }, { "epoch": 0.244, "grad_norm": 30.875, "grad_norm_var": 5.427018229166666, "learning_rate": 0.0001, "loss": 8.6389, "loss/crossentropy": 2.0648159228265284, "loss/hidden": 3.88359375, "loss/jsd": 0.0, "loss/logits": 0.2502809874713421, "step": 7320 }, { "epoch": 0.24433333333333335, "grad_norm": 31.25, "grad_norm_var": 7.09140625, "learning_rate": 0.0001, "loss": 8.6659, "loss/crossentropy": 2.106768397986889, "loss/hidden": 3.928125, "loss/jsd": 0.0, "loss/logits": 0.2518464956432581, "step": 7330 }, { "epoch": 0.24466666666666667, "grad_norm": 31.75, "grad_norm_var": 11.3603515625, "learning_rate": 0.0001, "loss": 8.5392, "loss/crossentropy": 2.1911858722567557, "loss/hidden": 3.996484375, "loss/jsd": 0.0, "loss/logits": 0.26760652028024196, "step": 7340 }, { "epoch": 0.245, "grad_norm": 33.75, "grad_norm_var": 19.9259765625, "learning_rate": 0.0001, "loss": 8.5007, "loss/crossentropy": 2.066247297823429, "loss/hidden": 3.937109375, "loss/jsd": 0.0, "loss/logits": 0.2681329587474465, "step": 7350 }, { "epoch": 0.24533333333333332, "grad_norm": 28.75, "grad_norm_var": 3.015230290082031e+18, "learning_rate": 0.0001, "loss": 8.6882, "loss/crossentropy": 2.286055992543697, "loss/hidden": 3.84609375, "loss/jsd": 0.0, "loss/logits": 0.262431076541543, "step": 7360 }, { "epoch": 0.24566666666666667, "grad_norm": 33.5, "grad_norm_var": 15.270247395833334, "learning_rate": 0.0001, "loss": 8.4911, "loss/crossentropy": 2.203142321109772, "loss/hidden": 4.007421875, "loss/jsd": 0.0, "loss/logits": 0.25816880762577055, "step": 7370 }, { "epoch": 0.246, "grad_norm": 31.25, "grad_norm_var": 6.605208333333334, "learning_rate": 0.0001, "loss": 8.5749, "loss/crossentropy": 2.168962088227272, "loss/hidden": 3.844140625, "loss/jsd": 0.0, "loss/logits": 0.24683325868099928, "step": 7380 }, { "epoch": 0.24633333333333332, "grad_norm": 29.125, "grad_norm_var": 8.370572916666667, "learning_rate": 0.0001, "loss": 8.6485, "loss/crossentropy": 2.201642544567585, "loss/hidden": 3.88984375, "loss/jsd": 0.0, "loss/logits": 0.25863207802176474, "step": 7390 }, { "epoch": 0.24666666666666667, "grad_norm": 31.5, "grad_norm_var": 3.2416015625, "learning_rate": 0.0001, "loss": 8.4782, "loss/crossentropy": 1.960936988890171, "loss/hidden": 3.831640625, "loss/jsd": 0.0, "loss/logits": 0.22650879565626383, "step": 7400 }, { "epoch": 0.247, "grad_norm": 38.25, "grad_norm_var": 5.086393229166666, "learning_rate": 0.0001, "loss": 8.6146, "loss/crossentropy": 2.148800623416901, "loss/hidden": 3.9265625, "loss/jsd": 0.0, "loss/logits": 0.27121419459581375, "step": 7410 }, { "epoch": 0.24733333333333332, "grad_norm": 32.25, "grad_norm_var": 4.398372395833333, "learning_rate": 0.0001, "loss": 8.5397, "loss/crossentropy": 2.1694126784801484, "loss/hidden": 3.891796875, "loss/jsd": 0.0, "loss/logits": 0.24342003595083953, "step": 7420 }, { "epoch": 0.24766666666666667, "grad_norm": 32.75, "grad_norm_var": 9.516666666666667, "learning_rate": 0.0001, "loss": 8.6136, "loss/crossentropy": 2.1182317078113555, "loss/hidden": 3.844921875, "loss/jsd": 0.0, "loss/logits": 0.24156781807541847, "step": 7430 }, { "epoch": 0.248, "grad_norm": 36.5, "grad_norm_var": 7.377018229166667, "learning_rate": 0.0001, "loss": 8.5314, "loss/crossentropy": 2.1088318437337876, "loss/hidden": 3.822265625, "loss/jsd": 0.0, "loss/logits": 0.250948965921998, "step": 7440 }, { "epoch": 0.24833333333333332, "grad_norm": 34.25, "grad_norm_var": 3248.1872395833334, "learning_rate": 0.0001, "loss": 8.6694, "loss/crossentropy": 2.2178388088941574, "loss/hidden": 3.946875, "loss/jsd": 0.0, "loss/logits": 0.25901649333536625, "step": 7450 }, { "epoch": 0.24866666666666667, "grad_norm": 31.125, "grad_norm_var": 3290.9358723958335, "learning_rate": 0.0001, "loss": 8.5781, "loss/crossentropy": 2.2698897421360016, "loss/hidden": 3.926171875, "loss/jsd": 0.0, "loss/logits": 0.2536152143031359, "step": 7460 }, { "epoch": 0.249, "grad_norm": 33.0, "grad_norm_var": 1772.1875, "learning_rate": 0.0001, "loss": 8.5832, "loss/crossentropy": 2.1621989846229552, "loss/hidden": 3.84609375, "loss/jsd": 0.0, "loss/logits": 0.24833030719310045, "step": 7470 }, { "epoch": 0.24933333333333332, "grad_norm": 39.25, "grad_norm_var": 2.814749738438492e+18, "learning_rate": 0.0001, "loss": 8.6171, "loss/crossentropy": 2.14983384013176, "loss/hidden": 4.019921875, "loss/jsd": 0.0, "loss/logits": 0.27834896706044676, "step": 7480 }, { "epoch": 0.24966666666666668, "grad_norm": 31.25, "grad_norm_var": 2.81474973868316e+18, "learning_rate": 0.0001, "loss": 8.6016, "loss/crossentropy": 2.168473194539547, "loss/hidden": 4.10546875, "loss/jsd": 0.0, "loss/logits": 0.29676152374595405, "step": 7490 }, { "epoch": 0.25, "grad_norm": 33.0, "grad_norm_var": 3567.3869140625, "learning_rate": 0.0001, "loss": 8.6921, "loss/crossentropy": 2.042189783602953, "loss/hidden": 3.898828125, "loss/jsd": 0.0, "loss/logits": 0.2364582633599639, "step": 7500 }, { "epoch": 0.25033333333333335, "grad_norm": 30.25, "grad_norm_var": 5.686458333333333, "learning_rate": 0.0001, "loss": 8.6958, "loss/crossentropy": 2.062736430764198, "loss/hidden": 3.84609375, "loss/jsd": 0.0, "loss/logits": 0.240386860165745, "step": 7510 }, { "epoch": 0.25066666666666665, "grad_norm": 31.375, "grad_norm_var": 17.955989583333334, "learning_rate": 0.0001, "loss": 8.515, "loss/crossentropy": 2.0180068641901014, "loss/hidden": 3.877734375, "loss/jsd": 0.0, "loss/logits": 0.23197599165141583, "step": 7520 }, { "epoch": 0.251, "grad_norm": 33.75, "grad_norm_var": 2.0817889033199114e+18, "learning_rate": 0.0001, "loss": 8.5997, "loss/crossentropy": 2.106365057826042, "loss/hidden": 3.99921875, "loss/jsd": 0.0, "loss/logits": 0.24268077537417412, "step": 7530 }, { "epoch": 0.25133333333333335, "grad_norm": 32.5, "grad_norm_var": 8.407291666666667, "learning_rate": 0.0001, "loss": 8.4314, "loss/crossentropy": 1.9716456033289433, "loss/hidden": 3.909375, "loss/jsd": 0.0, "loss/logits": 0.2563434375450015, "step": 7540 }, { "epoch": 0.25166666666666665, "grad_norm": 30.375, "grad_norm_var": 9.754166666666666, "learning_rate": 0.0001, "loss": 8.4357, "loss/crossentropy": 2.214118207991123, "loss/hidden": 3.816796875, "loss/jsd": 0.0, "loss/logits": 0.2364793760702014, "step": 7550 }, { "epoch": 0.252, "grad_norm": 32.0, "grad_norm_var": 11.27265625, "learning_rate": 0.0001, "loss": 8.5187, "loss/crossentropy": 2.0577072143554687, "loss/hidden": 3.916015625, "loss/jsd": 0.0, "loss/logits": 0.2660173770040274, "step": 7560 }, { "epoch": 0.25233333333333335, "grad_norm": 34.25, "grad_norm_var": 6.864518229166666, "learning_rate": 0.0001, "loss": 8.5382, "loss/crossentropy": 2.28112398609519, "loss/hidden": 3.882421875, "loss/jsd": 0.0, "loss/logits": 0.2548609297722578, "step": 7570 }, { "epoch": 0.25266666666666665, "grad_norm": 31.0, "grad_norm_var": 7.536458333333333, "learning_rate": 0.0001, "loss": 8.5838, "loss/crossentropy": 2.148515190184116, "loss/hidden": 3.8375, "loss/jsd": 0.0, "loss/logits": 0.24244300834834576, "step": 7580 }, { "epoch": 0.253, "grad_norm": 32.0, "grad_norm_var": 74.91087239583334, "learning_rate": 0.0001, "loss": 8.5155, "loss/crossentropy": 2.1178071200847626, "loss/hidden": 3.883984375, "loss/jsd": 0.0, "loss/logits": 0.25539283007383345, "step": 7590 }, { "epoch": 0.25333333333333335, "grad_norm": 34.75, "grad_norm_var": 7.797330729166666, "learning_rate": 0.0001, "loss": 8.5274, "loss/crossentropy": 2.0491638466715814, "loss/hidden": 3.93828125, "loss/jsd": 0.0, "loss/logits": 0.24984916690737008, "step": 7600 }, { "epoch": 0.25366666666666665, "grad_norm": 34.0, "grad_norm_var": 4.676497395833334, "learning_rate": 0.0001, "loss": 8.4896, "loss/crossentropy": 1.989129790663719, "loss/hidden": 3.969921875, "loss/jsd": 0.0, "loss/logits": 0.2540574911981821, "step": 7610 }, { "epoch": 0.254, "grad_norm": 35.0, "grad_norm_var": 10.124934895833333, "learning_rate": 0.0001, "loss": 8.5499, "loss/crossentropy": 2.1711443960666656, "loss/hidden": 3.880859375, "loss/jsd": 0.0, "loss/logits": 0.2434135077521205, "step": 7620 }, { "epoch": 0.25433333333333336, "grad_norm": 31.75, "grad_norm_var": 10.613541666666666, "learning_rate": 0.0001, "loss": 8.4787, "loss/crossentropy": 2.111149328947067, "loss/hidden": 4.075, "loss/jsd": 0.0, "loss/logits": 0.28135603088885547, "step": 7630 }, { "epoch": 0.25466666666666665, "grad_norm": 32.25, "grad_norm_var": 4.476041666666666, "learning_rate": 0.0001, "loss": 8.5342, "loss/crossentropy": 2.0875839471817015, "loss/hidden": 3.80234375, "loss/jsd": 0.0, "loss/logits": 0.2278506338596344, "step": 7640 }, { "epoch": 0.255, "grad_norm": 31.75, "grad_norm_var": 2.718489583333333, "learning_rate": 0.0001, "loss": 8.5779, "loss/crossentropy": 2.2961817413568495, "loss/hidden": 3.91171875, "loss/jsd": 0.0, "loss/logits": 0.2761024903506041, "step": 7650 }, { "epoch": 0.25533333333333336, "grad_norm": 30.875, "grad_norm_var": 4.695572916666666, "learning_rate": 0.0001, "loss": 8.4936, "loss/crossentropy": 2.064536126330495, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.2472051708959043, "step": 7660 }, { "epoch": 0.25566666666666665, "grad_norm": 31.5, "grad_norm_var": 31.70390625, "learning_rate": 0.0001, "loss": 8.5492, "loss/crossentropy": 2.0761756777763365, "loss/hidden": 3.91953125, "loss/jsd": 0.0, "loss/logits": 0.2548640869557858, "step": 7670 }, { "epoch": 0.256, "grad_norm": 31.75, "grad_norm_var": 11.212239583333334, "learning_rate": 0.0001, "loss": 8.5113, "loss/crossentropy": 2.1705673079937697, "loss/hidden": 3.92109375, "loss/jsd": 0.0, "loss/logits": 0.24495558133348821, "step": 7680 }, { "epoch": 0.25633333333333336, "grad_norm": 35.0, "grad_norm_var": 5.424739583333333, "learning_rate": 0.0001, "loss": 8.4703, "loss/crossentropy": 2.2372745871543884, "loss/hidden": 3.875390625, "loss/jsd": 0.0, "loss/logits": 0.24880910199135542, "step": 7690 }, { "epoch": 0.25666666666666665, "grad_norm": 31.375, "grad_norm_var": 4.406705729166666, "learning_rate": 0.0001, "loss": 8.4214, "loss/crossentropy": 2.114253217726946, "loss/hidden": 3.767578125, "loss/jsd": 0.0, "loss/logits": 0.23989613354206085, "step": 7700 }, { "epoch": 0.257, "grad_norm": 32.5, "grad_norm_var": 5.330989583333333, "learning_rate": 0.0001, "loss": 8.5299, "loss/crossentropy": 2.1180673211812975, "loss/hidden": 3.901171875, "loss/jsd": 0.0, "loss/logits": 0.26575036309659483, "step": 7710 }, { "epoch": 0.25733333333333336, "grad_norm": 29.75, "grad_norm_var": 7.863541666666666, "learning_rate": 0.0001, "loss": 8.5739, "loss/crossentropy": 2.1078746899962426, "loss/hidden": 3.819140625, "loss/jsd": 0.0, "loss/logits": 0.235656151548028, "step": 7720 }, { "epoch": 0.25766666666666665, "grad_norm": 32.5, "grad_norm_var": 8.870572916666667, "learning_rate": 0.0001, "loss": 8.5049, "loss/crossentropy": 2.2084743842482566, "loss/hidden": 3.92265625, "loss/jsd": 0.0, "loss/logits": 0.24384659044444562, "step": 7730 }, { "epoch": 0.258, "grad_norm": 34.25, "grad_norm_var": 2.2997395833333334, "learning_rate": 0.0001, "loss": 8.3794, "loss/crossentropy": 2.1690520867705345, "loss/hidden": 3.808203125, "loss/jsd": 0.0, "loss/logits": 0.2447241246700287, "step": 7740 }, { "epoch": 0.25833333333333336, "grad_norm": 34.5, "grad_norm_var": 7.211458333333334, "learning_rate": 0.0001, "loss": 8.566, "loss/crossentropy": 2.237781625241041, "loss/hidden": 3.851171875, "loss/jsd": 0.0, "loss/logits": 0.23708969578146935, "step": 7750 }, { "epoch": 0.25866666666666666, "grad_norm": 31.875, "grad_norm_var": 51.203125, "learning_rate": 0.0001, "loss": 8.4896, "loss/crossentropy": 2.0946034505963325, "loss/hidden": 3.991796875, "loss/jsd": 0.0, "loss/logits": 0.25754191167652607, "step": 7760 }, { "epoch": 0.259, "grad_norm": 30.375, "grad_norm_var": 5.221875, "learning_rate": 0.0001, "loss": 8.5675, "loss/crossentropy": 2.2402331814169885, "loss/hidden": 3.866015625, "loss/jsd": 0.0, "loss/logits": 0.2628116441890597, "step": 7770 }, { "epoch": 0.25933333333333336, "grad_norm": 40.25, "grad_norm_var": 17.0900390625, "learning_rate": 0.0001, "loss": 8.5823, "loss/crossentropy": 2.1165684029459952, "loss/hidden": 3.88125, "loss/jsd": 0.0, "loss/logits": 0.2519789934158325, "step": 7780 }, { "epoch": 0.25966666666666666, "grad_norm": 33.75, "grad_norm_var": 8.1728515625, "learning_rate": 0.0001, "loss": 8.5378, "loss/crossentropy": 2.0959367021918296, "loss/hidden": 3.931640625, "loss/jsd": 0.0, "loss/logits": 0.23852321952581407, "step": 7790 }, { "epoch": 0.26, "grad_norm": 36.5, "grad_norm_var": 4.790625, "learning_rate": 0.0001, "loss": 8.3399, "loss/crossentropy": 1.9469283685088157, "loss/hidden": 3.78046875, "loss/jsd": 0.0, "loss/logits": 0.22438753712922335, "step": 7800 }, { "epoch": 0.26033333333333336, "grad_norm": 32.25, "grad_norm_var": 4.2625, "learning_rate": 0.0001, "loss": 8.5059, "loss/crossentropy": 2.1072940029203893, "loss/hidden": 3.945703125, "loss/jsd": 0.0, "loss/logits": 0.2441211288794875, "step": 7810 }, { "epoch": 0.26066666666666666, "grad_norm": 29.5, "grad_norm_var": 17.897330729166665, "learning_rate": 0.0001, "loss": 8.4184, "loss/crossentropy": 1.8887931071221828, "loss/hidden": 3.984765625, "loss/jsd": 0.0, "loss/logits": 0.24000756442546844, "step": 7820 }, { "epoch": 0.261, "grad_norm": 33.25, "grad_norm_var": 21.8119140625, "learning_rate": 0.0001, "loss": 8.4814, "loss/crossentropy": 2.1927064463496206, "loss/hidden": 3.80546875, "loss/jsd": 0.0, "loss/logits": 0.23618043400347233, "step": 7830 }, { "epoch": 0.2613333333333333, "grad_norm": 37.75, "grad_norm_var": 12.579622395833333, "learning_rate": 0.0001, "loss": 8.5882, "loss/crossentropy": 2.0585607342422008, "loss/hidden": 3.861328125, "loss/jsd": 0.0, "loss/logits": 0.24375837668776512, "step": 7840 }, { "epoch": 0.26166666666666666, "grad_norm": 33.0, "grad_norm_var": 6.588997395833333, "learning_rate": 0.0001, "loss": 8.5555, "loss/crossentropy": 2.0930290199816226, "loss/hidden": 3.85234375, "loss/jsd": 0.0, "loss/logits": 0.2372116858139634, "step": 7850 }, { "epoch": 0.262, "grad_norm": 32.75, "grad_norm_var": 4.052018229166666, "learning_rate": 0.0001, "loss": 8.5284, "loss/crossentropy": 2.2643882423639297, "loss/hidden": 3.848046875, "loss/jsd": 0.0, "loss/logits": 0.2545361390337348, "step": 7860 }, { "epoch": 0.2623333333333333, "grad_norm": 37.25, "grad_norm_var": 32.57265625, "learning_rate": 0.0001, "loss": 8.4368, "loss/crossentropy": 2.151304465532303, "loss/hidden": 3.903515625, "loss/jsd": 0.0, "loss/logits": 0.2559200949966908, "step": 7870 }, { "epoch": 0.26266666666666666, "grad_norm": 32.75, "grad_norm_var": 9.16640625, "learning_rate": 0.0001, "loss": 8.6239, "loss/crossentropy": 2.215903551876545, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.2483787966892123, "step": 7880 }, { "epoch": 0.263, "grad_norm": 31.375, "grad_norm_var": 5.636393229166667, "learning_rate": 0.0001, "loss": 8.522, "loss/crossentropy": 2.2514882028102874, "loss/hidden": 3.9765625, "loss/jsd": 0.0, "loss/logits": 0.2960167687386274, "step": 7890 }, { "epoch": 0.2633333333333333, "grad_norm": 30.75, "grad_norm_var": 4.056184895833334, "learning_rate": 0.0001, "loss": 8.4697, "loss/crossentropy": 2.1326026201248167, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.2860325377434492, "step": 7900 }, { "epoch": 0.26366666666666666, "grad_norm": 31.75, "grad_norm_var": 3.7244140625, "learning_rate": 0.0001, "loss": 8.6586, "loss/crossentropy": 2.2379645466804505, "loss/hidden": 3.812109375, "loss/jsd": 0.0, "loss/logits": 0.24608665630221366, "step": 7910 }, { "epoch": 0.264, "grad_norm": 32.5, "grad_norm_var": 6.681184895833334, "learning_rate": 0.0001, "loss": 8.5366, "loss/crossentropy": 2.066775370389223, "loss/hidden": 4.01796875, "loss/jsd": 0.0, "loss/logits": 0.2635978292673826, "step": 7920 }, { "epoch": 0.2643333333333333, "grad_norm": 32.25, "grad_norm_var": 6.824934895833334, "learning_rate": 0.0001, "loss": 8.5181, "loss/crossentropy": 2.0588886007666587, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.2531863532960415, "step": 7930 }, { "epoch": 0.26466666666666666, "grad_norm": 29.25, "grad_norm_var": 2.8147497390536566e+18, "learning_rate": 0.0001, "loss": 8.5683, "loss/crossentropy": 2.1350580543279647, "loss/hidden": 3.85703125, "loss/jsd": 0.0, "loss/logits": 0.24998535066843033, "step": 7940 }, { "epoch": 0.265, "grad_norm": 34.25, "grad_norm_var": 2.814749738836951e+18, "learning_rate": 0.0001, "loss": 8.5132, "loss/crossentropy": 2.0762300439178945, "loss/hidden": 3.84609375, "loss/jsd": 0.0, "loss/logits": 0.2441089889034629, "step": 7950 }, { "epoch": 0.2653333333333333, "grad_norm": 31.875, "grad_norm_var": 8.36640625, "learning_rate": 0.0001, "loss": 8.4256, "loss/crossentropy": 2.1040103793144227, "loss/hidden": 3.84296875, "loss/jsd": 0.0, "loss/logits": 0.2428693912923336, "step": 7960 }, { "epoch": 0.26566666666666666, "grad_norm": 29.375, "grad_norm_var": 8.192122395833334, "learning_rate": 0.0001, "loss": 8.6841, "loss/crossentropy": 2.0118587724864483, "loss/hidden": 3.854296875, "loss/jsd": 0.0, "loss/logits": 0.21796635556966065, "step": 7970 }, { "epoch": 0.266, "grad_norm": 31.625, "grad_norm_var": 7.3525390625, "learning_rate": 0.0001, "loss": 8.4692, "loss/crossentropy": 2.134692121297121, "loss/hidden": 3.940625, "loss/jsd": 0.0, "loss/logits": 0.24534992277622222, "step": 7980 }, { "epoch": 0.2663333333333333, "grad_norm": 35.0, "grad_norm_var": 4.893684895833333, "learning_rate": 0.0001, "loss": 8.5476, "loss/crossentropy": 2.161470866203308, "loss/hidden": 4.1234375, "loss/jsd": 0.0, "loss/logits": 0.2858205262571573, "step": 7990 }, { "epoch": 0.26666666666666666, "grad_norm": 44.25, "grad_norm_var": 24.559830729166666, "learning_rate": 0.0001, "loss": 8.5115, "loss/crossentropy": 1.9794396072626115, "loss/hidden": 3.883984375, "loss/jsd": 0.0, "loss/logits": 0.253316623903811, "step": 8000 }, { "epoch": 0.267, "grad_norm": 34.0, "grad_norm_var": 22.160416666666666, "learning_rate": 0.0001, "loss": 8.5737, "loss/crossentropy": 2.203532671928406, "loss/hidden": 3.905859375, "loss/jsd": 0.0, "loss/logits": 0.26553509533405306, "step": 8010 }, { "epoch": 0.2673333333333333, "grad_norm": 33.0, "grad_norm_var": 9.372916666666667, "learning_rate": 0.0001, "loss": 8.5596, "loss/crossentropy": 2.116582728922367, "loss/hidden": 3.955078125, "loss/jsd": 0.0, "loss/logits": 0.28444691337645056, "step": 8020 }, { "epoch": 0.26766666666666666, "grad_norm": 33.0, "grad_norm_var": 4.118489583333333, "learning_rate": 0.0001, "loss": 8.7085, "loss/crossentropy": 2.136409956216812, "loss/hidden": 3.94609375, "loss/jsd": 0.0, "loss/logits": 0.26452013887465, "step": 8030 }, { "epoch": 0.268, "grad_norm": 34.25, "grad_norm_var": 5.120572916666666, "learning_rate": 0.0001, "loss": 8.7193, "loss/crossentropy": 2.17041220664978, "loss/hidden": 3.9, "loss/jsd": 0.0, "loss/logits": 0.25961695313453675, "step": 8040 }, { "epoch": 0.2683333333333333, "grad_norm": 38.75, "grad_norm_var": 11.583072916666667, "learning_rate": 0.0001, "loss": 8.55, "loss/crossentropy": 2.1484374403953552, "loss/hidden": 3.784375, "loss/jsd": 0.0, "loss/logits": 0.2425002339296043, "step": 8050 }, { "epoch": 0.26866666666666666, "grad_norm": 32.0, "grad_norm_var": 10.539583333333333, "learning_rate": 0.0001, "loss": 8.4497, "loss/crossentropy": 2.201520799845457, "loss/hidden": 3.866796875, "loss/jsd": 0.0, "loss/logits": 0.23861566837877035, "step": 8060 }, { "epoch": 0.269, "grad_norm": 33.75, "grad_norm_var": 7.472916666666666, "learning_rate": 0.0001, "loss": 8.6457, "loss/crossentropy": 2.260037848353386, "loss/hidden": 3.873828125, "loss/jsd": 0.0, "loss/logits": 0.2488136703148484, "step": 8070 }, { "epoch": 0.2693333333333333, "grad_norm": 29.25, "grad_norm_var": 4.987239583333333, "learning_rate": 0.0001, "loss": 8.5122, "loss/crossentropy": 2.239154724776745, "loss/hidden": 3.805078125, "loss/jsd": 0.0, "loss/logits": 0.24010765701532363, "step": 8080 }, { "epoch": 0.26966666666666667, "grad_norm": 38.0, "grad_norm_var": 4.78515625, "learning_rate": 0.0001, "loss": 8.5795, "loss/crossentropy": 2.123927664756775, "loss/hidden": 4.057421875, "loss/jsd": 0.0, "loss/logits": 0.27356666754931214, "step": 8090 }, { "epoch": 0.27, "grad_norm": 29.75, "grad_norm_var": 2.6757714700654346e+18, "learning_rate": 0.0001, "loss": 8.5442, "loss/crossentropy": 2.1999975204467774, "loss/hidden": 3.831640625, "loss/jsd": 0.0, "loss/logits": 0.2607791792601347, "step": 8100 }, { "epoch": 0.2703333333333333, "grad_norm": 52.25, "grad_norm_var": 2.675771468504629e+18, "learning_rate": 0.0001, "loss": 8.5682, "loss/crossentropy": 2.1693030931055546, "loss/hidden": 3.843359375, "loss/jsd": 0.0, "loss/logits": 0.251168143004179, "step": 8110 }, { "epoch": 0.27066666666666667, "grad_norm": 32.25, "grad_norm_var": 31.667708333333334, "learning_rate": 0.0001, "loss": 8.3905, "loss/crossentropy": 2.1287291169166567, "loss/hidden": 3.902734375, "loss/jsd": 0.0, "loss/logits": 0.24765251912176608, "step": 8120 }, { "epoch": 0.271, "grad_norm": 30.0, "grad_norm_var": 8.573958333333334, "learning_rate": 0.0001, "loss": 8.4201, "loss/crossentropy": 2.0576461493968963, "loss/hidden": 3.87734375, "loss/jsd": 0.0, "loss/logits": 0.241207036934793, "step": 8130 }, { "epoch": 0.2713333333333333, "grad_norm": 43.75, "grad_norm_var": 2.7309405654549356e+18, "learning_rate": 0.0001, "loss": 8.4172, "loss/crossentropy": 2.0151774257421495, "loss/hidden": 3.844921875, "loss/jsd": 0.0, "loss/logits": 0.2428217800334096, "step": 8140 }, { "epoch": 0.27166666666666667, "grad_norm": 32.75, "grad_norm_var": 2.7309405653723075e+18, "learning_rate": 0.0001, "loss": 8.5564, "loss/crossentropy": 2.226572999358177, "loss/hidden": 3.91328125, "loss/jsd": 0.0, "loss/logits": 0.26639420036226513, "step": 8150 }, { "epoch": 0.272, "grad_norm": 31.5, "grad_norm_var": 1.8499348958333333, "learning_rate": 0.0001, "loss": 8.5586, "loss/crossentropy": 2.336693507432938, "loss/hidden": 4.00859375, "loss/jsd": 0.0, "loss/logits": 0.28926637172698977, "step": 8160 }, { "epoch": 0.2723333333333333, "grad_norm": 31.75, "grad_norm_var": 2.5447265625, "learning_rate": 0.0001, "loss": 8.4725, "loss/crossentropy": 2.2132105618715285, "loss/hidden": 3.820703125, "loss/jsd": 0.0, "loss/logits": 0.24436241313815116, "step": 8170 }, { "epoch": 0.27266666666666667, "grad_norm": 30.375, "grad_norm_var": 2.330894253998281e+18, "learning_rate": 0.0001, "loss": 8.5414, "loss/crossentropy": 1.975562959909439, "loss/hidden": 3.817578125, "loss/jsd": 0.0, "loss/logits": 0.22787413820624353, "step": 8180 }, { "epoch": 0.273, "grad_norm": 31.875, "grad_norm_var": 40.270833333333336, "learning_rate": 0.0001, "loss": 8.4846, "loss/crossentropy": 2.0327411964535713, "loss/hidden": 3.84453125, "loss/jsd": 0.0, "loss/logits": 0.242176865786314, "step": 8190 }, { "epoch": 0.2733333333333333, "grad_norm": 29.0, "grad_norm_var": 50.523372395833334, "learning_rate": 0.0001, "loss": 8.4559, "loss/crossentropy": 2.1319633327424525, "loss/hidden": 3.97421875, "loss/jsd": 0.0, "loss/logits": 0.2501418372616172, "step": 8200 }, { "epoch": 0.27366666666666667, "grad_norm": 34.0, "grad_norm_var": 18.219205729166667, "learning_rate": 0.0001, "loss": 8.5216, "loss/crossentropy": 2.2018288552761076, "loss/hidden": 3.9546875, "loss/jsd": 0.0, "loss/logits": 0.2552491381764412, "step": 8210 }, { "epoch": 0.274, "grad_norm": 33.5, "grad_norm_var": 4.381705729166667, "learning_rate": 0.0001, "loss": 8.4802, "loss/crossentropy": 2.0432650595903397, "loss/hidden": 3.86640625, "loss/jsd": 0.0, "loss/logits": 0.2392191395163536, "step": 8220 }, { "epoch": 0.2743333333333333, "grad_norm": 30.25, "grad_norm_var": 2.1, "learning_rate": 0.0001, "loss": 8.4893, "loss/crossentropy": 2.1175171703100206, "loss/hidden": 3.949609375, "loss/jsd": 0.0, "loss/logits": 0.26016080360859634, "step": 8230 }, { "epoch": 0.27466666666666667, "grad_norm": 32.0, "grad_norm_var": 4.23125, "learning_rate": 0.0001, "loss": 8.4556, "loss/crossentropy": 2.103234487399459, "loss/hidden": 3.947265625, "loss/jsd": 0.0, "loss/logits": 0.24362556543201208, "step": 8240 }, { "epoch": 0.275, "grad_norm": 29.5, "grad_norm_var": 1.3207509407140326e+18, "learning_rate": 0.0001, "loss": 8.3986, "loss/crossentropy": 2.1267191670835017, "loss/hidden": 3.93359375, "loss/jsd": 0.0, "loss/logits": 0.24572798358276488, "step": 8250 }, { "epoch": 0.2753333333333333, "grad_norm": 29.0, "grad_norm_var": 6.517643229166667, "learning_rate": 0.0001, "loss": 8.4968, "loss/crossentropy": 2.0798249572515486, "loss/hidden": 3.844140625, "loss/jsd": 0.0, "loss/logits": 0.23399676829576493, "step": 8260 }, { "epoch": 0.27566666666666667, "grad_norm": 33.75, "grad_norm_var": 5.4759765625, "learning_rate": 0.0001, "loss": 8.4268, "loss/crossentropy": 2.0175932213664054, "loss/hidden": 3.88203125, "loss/jsd": 0.0, "loss/logits": 0.2511680208146572, "step": 8270 }, { "epoch": 0.276, "grad_norm": 33.25, "grad_norm_var": 8.170247395833334, "learning_rate": 0.0001, "loss": 8.4686, "loss/crossentropy": 2.050298312306404, "loss/hidden": 3.818359375, "loss/jsd": 0.0, "loss/logits": 0.24749082382768392, "step": 8280 }, { "epoch": 0.2763333333333333, "grad_norm": 46.5, "grad_norm_var": 2.2546849066262026e+18, "learning_rate": 0.0001, "loss": 8.5837, "loss/crossentropy": 2.107571153342724, "loss/hidden": 3.908984375, "loss/jsd": 0.0, "loss/logits": 0.24962719343602657, "step": 8290 }, { "epoch": 0.27666666666666667, "grad_norm": 31.625, "grad_norm_var": 2.2546849069640538e+18, "learning_rate": 0.0001, "loss": 8.4953, "loss/crossentropy": 2.1752505511045457, "loss/hidden": 3.854296875, "loss/jsd": 0.0, "loss/logits": 0.24640031829476355, "step": 8300 }, { "epoch": 0.277, "grad_norm": 31.25, "grad_norm_var": 7.98515625, "learning_rate": 0.0001, "loss": 8.5105, "loss/crossentropy": 2.1649673506617546, "loss/hidden": 3.9171875, "loss/jsd": 0.0, "loss/logits": 0.24632014315575362, "step": 8310 }, { "epoch": 0.2773333333333333, "grad_norm": 36.25, "grad_norm_var": 5.1103515625, "learning_rate": 0.0001, "loss": 8.6269, "loss/crossentropy": 2.1055419132113458, "loss/hidden": 3.809375, "loss/jsd": 0.0, "loss/logits": 0.24666682127863168, "step": 8320 }, { "epoch": 0.2776666666666667, "grad_norm": 35.25, "grad_norm_var": 4.3712890625, "learning_rate": 0.0001, "loss": 8.5265, "loss/crossentropy": 2.051154574751854, "loss/hidden": 4.063671875, "loss/jsd": 0.0, "loss/logits": 0.2739197164773941, "step": 8330 }, { "epoch": 0.278, "grad_norm": 27.375, "grad_norm_var": 4.311458333333333, "learning_rate": 0.0001, "loss": 8.467, "loss/crossentropy": 1.931150709837675, "loss/hidden": 3.9625, "loss/jsd": 0.0, "loss/logits": 0.26600636430084706, "step": 8340 }, { "epoch": 0.2783333333333333, "grad_norm": 41.25, "grad_norm_var": 31.12265625, "learning_rate": 0.0001, "loss": 8.4761, "loss/crossentropy": 2.0970492526888846, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.2257708402350545, "step": 8350 }, { "epoch": 0.2786666666666667, "grad_norm": 32.75, "grad_norm_var": 3.313593760837691e+18, "learning_rate": 0.0001, "loss": 8.5959, "loss/crossentropy": 2.107315970212221, "loss/hidden": 3.95546875, "loss/jsd": 0.0, "loss/logits": 0.2386183949187398, "step": 8360 }, { "epoch": 0.279, "grad_norm": 31.25, "grad_norm_var": 3.313593762134675e+18, "learning_rate": 0.0001, "loss": 8.5496, "loss/crossentropy": 2.1075058460235594, "loss/hidden": 3.875390625, "loss/jsd": 0.0, "loss/logits": 0.25149064473807814, "step": 8370 }, { "epoch": 0.2793333333333333, "grad_norm": 36.25, "grad_norm_var": 20.053059895833332, "learning_rate": 0.0001, "loss": 8.5338, "loss/crossentropy": 2.059878170490265, "loss/hidden": 3.878515625, "loss/jsd": 0.0, "loss/logits": 0.23977196607738732, "step": 8380 }, { "epoch": 0.2796666666666667, "grad_norm": 30.875, "grad_norm_var": 19.945768229166667, "learning_rate": 0.0001, "loss": 8.5962, "loss/crossentropy": 2.052942344546318, "loss/hidden": 3.8734375, "loss/jsd": 0.0, "loss/logits": 0.24317781031131744, "step": 8390 }, { "epoch": 0.28, "grad_norm": 30.25, "grad_norm_var": 17.1150390625, "learning_rate": 0.0001, "loss": 8.4453, "loss/crossentropy": 2.103861276805401, "loss/hidden": 3.9421875, "loss/jsd": 0.0, "loss/logits": 0.2519053351134062, "step": 8400 }, { "epoch": 0.2803333333333333, "grad_norm": 31.75, "grad_norm_var": 16.060872395833332, "learning_rate": 0.0001, "loss": 8.4694, "loss/crossentropy": 2.0992368295788766, "loss/hidden": 3.91484375, "loss/jsd": 0.0, "loss/logits": 0.2527425540611148, "step": 8410 }, { "epoch": 0.2806666666666667, "grad_norm": 30.625, "grad_norm_var": 1.9884765625, "learning_rate": 0.0001, "loss": 8.4347, "loss/crossentropy": 2.264920949935913, "loss/hidden": 4.038671875, "loss/jsd": 0.0, "loss/logits": 0.27555460929870607, "step": 8420 }, { "epoch": 0.281, "grad_norm": 32.25, "grad_norm_var": 1.3729166666666666, "learning_rate": 0.0001, "loss": 8.4686, "loss/crossentropy": 2.0547826454043387, "loss/hidden": 3.818359375, "loss/jsd": 0.0, "loss/logits": 0.2389751397073269, "step": 8430 }, { "epoch": 0.2813333333333333, "grad_norm": 33.25, "grad_norm_var": 3.923958333333333, "learning_rate": 0.0001, "loss": 8.4972, "loss/crossentropy": 2.141887503862381, "loss/hidden": 3.865234375, "loss/jsd": 0.0, "loss/logits": 0.23841603249311447, "step": 8440 }, { "epoch": 0.2816666666666667, "grad_norm": 29.375, "grad_norm_var": 5.348958333333333, "learning_rate": 0.0001, "loss": 8.5568, "loss/crossentropy": 2.070135848224163, "loss/hidden": 3.938671875, "loss/jsd": 0.0, "loss/logits": 0.2530257642269135, "step": 8450 }, { "epoch": 0.282, "grad_norm": 28.75, "grad_norm_var": 7.098372395833334, "learning_rate": 0.0001, "loss": 8.4062, "loss/crossentropy": 2.1277823865413668, "loss/hidden": 3.954296875, "loss/jsd": 0.0, "loss/logits": 0.25204644426703454, "step": 8460 }, { "epoch": 0.2823333333333333, "grad_norm": 33.25, "grad_norm_var": 4.593489583333334, "learning_rate": 0.0001, "loss": 8.4317, "loss/crossentropy": 2.169590988755226, "loss/hidden": 3.957421875, "loss/jsd": 0.0, "loss/logits": 0.26035118848085403, "step": 8470 }, { "epoch": 0.2826666666666667, "grad_norm": 30.0, "grad_norm_var": 3.678580729166667, "learning_rate": 0.0001, "loss": 8.4946, "loss/crossentropy": 2.287361499667168, "loss/hidden": 3.77109375, "loss/jsd": 0.0, "loss/logits": 0.23725899122655392, "step": 8480 }, { "epoch": 0.283, "grad_norm": 54.25, "grad_norm_var": 35.12265625, "learning_rate": 0.0001, "loss": 8.5028, "loss/crossentropy": 2.078622847050428, "loss/hidden": 3.809375, "loss/jsd": 0.0, "loss/logits": 0.2394928558729589, "step": 8490 }, { "epoch": 0.2833333333333333, "grad_norm": 30.375, "grad_norm_var": 35.994791666666664, "learning_rate": 0.0001, "loss": 8.3967, "loss/crossentropy": 1.993205615878105, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.25120790507644414, "step": 8500 }, { "epoch": 0.2836666666666667, "grad_norm": 30.375, "grad_norm_var": 54.930989583333336, "learning_rate": 0.0001, "loss": 8.4862, "loss/crossentropy": 2.1012352854013443, "loss/hidden": 4.03984375, "loss/jsd": 0.0, "loss/logits": 0.26620072200894357, "step": 8510 }, { "epoch": 0.284, "grad_norm": 32.25, "grad_norm_var": 53.98098958333333, "learning_rate": 0.0001, "loss": 8.4943, "loss/crossentropy": 2.0601254284381865, "loss/hidden": 3.88046875, "loss/jsd": 0.0, "loss/logits": 0.24300396777689456, "step": 8520 }, { "epoch": 0.2843333333333333, "grad_norm": 45.75, "grad_norm_var": 19.379622395833334, "learning_rate": 0.0001, "loss": 8.5667, "loss/crossentropy": 2.150593836605549, "loss/hidden": 3.97578125, "loss/jsd": 0.0, "loss/logits": 0.2611148880794644, "step": 8530 }, { "epoch": 0.2846666666666667, "grad_norm": 30.0, "grad_norm_var": 17.92890625, "learning_rate": 0.0001, "loss": 8.5768, "loss/crossentropy": 2.158891648054123, "loss/hidden": 3.895703125, "loss/jsd": 0.0, "loss/logits": 0.2555501349270344, "step": 8540 }, { "epoch": 0.285, "grad_norm": 30.75, "grad_norm_var": 6.314583333333333, "learning_rate": 0.0001, "loss": 8.4415, "loss/crossentropy": 2.1268053114414216, "loss/hidden": 3.8828125, "loss/jsd": 0.0, "loss/logits": 0.2675272192806005, "step": 8550 }, { "epoch": 0.2853333333333333, "grad_norm": 30.125, "grad_norm_var": 5.924739583333333, "learning_rate": 0.0001, "loss": 8.3905, "loss/crossentropy": 1.8700345799326896, "loss/hidden": 3.9625, "loss/jsd": 0.0, "loss/logits": 0.22793399412184953, "step": 8560 }, { "epoch": 0.2856666666666667, "grad_norm": 34.75, "grad_norm_var": 28.47265625, "learning_rate": 0.0001, "loss": 8.5967, "loss/crossentropy": 2.2868270367383956, "loss/hidden": 3.873046875, "loss/jsd": 0.0, "loss/logits": 0.2460212778300047, "step": 8570 }, { "epoch": 0.286, "grad_norm": 31.375, "grad_norm_var": 25.4125, "learning_rate": 0.0001, "loss": 8.5052, "loss/crossentropy": 1.9612272754311562, "loss/hidden": 4.025, "loss/jsd": 0.0, "loss/logits": 0.24744862429797648, "step": 8580 }, { "epoch": 0.28633333333333333, "grad_norm": 31.875, "grad_norm_var": 19.161458333333332, "learning_rate": 0.0001, "loss": 8.5026, "loss/crossentropy": 1.9540777966380118, "loss/hidden": 3.840625, "loss/jsd": 0.0, "loss/logits": 0.22912114206701517, "step": 8590 }, { "epoch": 0.2866666666666667, "grad_norm": 32.75, "grad_norm_var": 1.7705729166666666, "learning_rate": 0.0001, "loss": 8.4321, "loss/crossentropy": 2.183373187482357, "loss/hidden": 3.820703125, "loss/jsd": 0.0, "loss/logits": 0.24152661189436914, "step": 8600 }, { "epoch": 0.287, "grad_norm": 39.0, "grad_norm_var": 14.377018229166667, "learning_rate": 0.0001, "loss": 8.6251, "loss/crossentropy": 2.0541095778346063, "loss/hidden": 4.001953125, "loss/jsd": 0.0, "loss/logits": 0.27750074546784165, "step": 8610 }, { "epoch": 0.28733333333333333, "grad_norm": 30.625, "grad_norm_var": 21.0462890625, "learning_rate": 0.0001, "loss": 8.4404, "loss/crossentropy": 2.1556227087974547, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.23535772711038588, "step": 8620 }, { "epoch": 0.2876666666666667, "grad_norm": 31.625, "grad_norm_var": 6.3337890625, "learning_rate": 0.0001, "loss": 8.5323, "loss/crossentropy": 2.2881136484444142, "loss/hidden": 3.823046875, "loss/jsd": 0.0, "loss/logits": 0.26564789917320014, "step": 8630 }, { "epoch": 0.288, "grad_norm": 29.875, "grad_norm_var": 5.957747395833334, "learning_rate": 0.0001, "loss": 8.4478, "loss/crossentropy": 2.1415953427553176, "loss/hidden": 3.861328125, "loss/jsd": 0.0, "loss/logits": 0.24978371188044549, "step": 8640 }, { "epoch": 0.28833333333333333, "grad_norm": 32.75, "grad_norm_var": 8.3916015625, "learning_rate": 0.0001, "loss": 8.4509, "loss/crossentropy": 2.0754496946930887, "loss/hidden": 3.883203125, "loss/jsd": 0.0, "loss/logits": 0.2334285033866763, "step": 8650 }, { "epoch": 0.2886666666666667, "grad_norm": 39.0, "grad_norm_var": 12.662239583333333, "learning_rate": 0.0001, "loss": 8.5949, "loss/crossentropy": 2.2709645599126818, "loss/hidden": 3.901953125, "loss/jsd": 0.0, "loss/logits": 0.2653419800102711, "step": 8660 }, { "epoch": 0.289, "grad_norm": 30.375, "grad_norm_var": 11.949739583333333, "learning_rate": 0.0001, "loss": 8.4114, "loss/crossentropy": 2.142335993051529, "loss/hidden": 3.825390625, "loss/jsd": 0.0, "loss/logits": 0.2271432813256979, "step": 8670 }, { "epoch": 0.28933333333333333, "grad_norm": 30.75, "grad_norm_var": 13.236393229166667, "learning_rate": 0.0001, "loss": 8.4548, "loss/crossentropy": 1.991011817008257, "loss/hidden": 3.896875, "loss/jsd": 0.0, "loss/logits": 0.2274771448224783, "step": 8680 }, { "epoch": 0.2896666666666667, "grad_norm": 31.875, "grad_norm_var": 21.591666666666665, "learning_rate": 0.0001, "loss": 8.5036, "loss/crossentropy": 2.067255499958992, "loss/hidden": 3.975, "loss/jsd": 0.0, "loss/logits": 0.25000386498868465, "step": 8690 }, { "epoch": 0.29, "grad_norm": 31.625, "grad_norm_var": 13.7697265625, "learning_rate": 0.0001, "loss": 8.4065, "loss/crossentropy": 2.032663035392761, "loss/hidden": 3.9734375, "loss/jsd": 0.0, "loss/logits": 0.26009538136422633, "step": 8700 }, { "epoch": 0.29033333333333333, "grad_norm": 28.625, "grad_norm_var": 9.573372395833333, "learning_rate": 0.0001, "loss": 8.4264, "loss/crossentropy": 2.129157376289368, "loss/hidden": 3.781640625, "loss/jsd": 0.0, "loss/logits": 0.2426974017173052, "step": 8710 }, { "epoch": 0.2906666666666667, "grad_norm": 31.125, "grad_norm_var": 9.345247395833333, "learning_rate": 0.0001, "loss": 8.4556, "loss/crossentropy": 2.1253123968839644, "loss/hidden": 3.79296875, "loss/jsd": 0.0, "loss/logits": 0.2316815422847867, "step": 8720 }, { "epoch": 0.291, "grad_norm": 30.375, "grad_norm_var": 288.3697265625, "learning_rate": 0.0001, "loss": 8.3447, "loss/crossentropy": 2.0766125731170177, "loss/hidden": 3.826171875, "loss/jsd": 0.0, "loss/logits": 0.2288608182221651, "step": 8730 }, { "epoch": 0.29133333333333333, "grad_norm": 30.0, "grad_norm_var": 291.34140625, "learning_rate": 0.0001, "loss": 8.4282, "loss/crossentropy": 2.0344169199466706, "loss/hidden": 3.881640625, "loss/jsd": 0.0, "loss/logits": 0.2401146437972784, "step": 8740 }, { "epoch": 0.2916666666666667, "grad_norm": 31.375, "grad_norm_var": 8.283333333333333, "learning_rate": 0.0001, "loss": 8.5577, "loss/crossentropy": 2.0992552161216738, "loss/hidden": 3.89296875, "loss/jsd": 0.0, "loss/logits": 0.2575317870825529, "step": 8750 }, { "epoch": 0.292, "grad_norm": 27.75, "grad_norm_var": 18.577083333333334, "learning_rate": 0.0001, "loss": 8.572, "loss/crossentropy": 2.1781798616051673, "loss/hidden": 3.837890625, "loss/jsd": 0.0, "loss/logits": 0.24388179033994675, "step": 8760 }, { "epoch": 0.29233333333333333, "grad_norm": 31.125, "grad_norm_var": 20.822330729166666, "learning_rate": 0.0001, "loss": 8.3706, "loss/crossentropy": 2.0903132036328316, "loss/hidden": 3.845703125, "loss/jsd": 0.0, "loss/logits": 0.22936930637806655, "step": 8770 }, { "epoch": 0.2926666666666667, "grad_norm": 32.25, "grad_norm_var": 12.4869140625, "learning_rate": 0.0001, "loss": 8.4817, "loss/crossentropy": 2.1733203932642935, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.23095921371132136, "step": 8780 }, { "epoch": 0.293, "grad_norm": 33.25, "grad_norm_var": 17.363997395833334, "learning_rate": 0.0001, "loss": 8.5659, "loss/crossentropy": 2.088620986789465, "loss/hidden": 3.82578125, "loss/jsd": 0.0, "loss/logits": 0.2334995089098811, "step": 8790 }, { "epoch": 0.29333333333333333, "grad_norm": 33.75, "grad_norm_var": 11.377018229166667, "learning_rate": 0.0001, "loss": 8.5218, "loss/crossentropy": 1.9903348997235297, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.24103877376765012, "step": 8800 }, { "epoch": 0.2936666666666667, "grad_norm": 31.625, "grad_norm_var": 23.551041666666666, "learning_rate": 0.0001, "loss": 8.4445, "loss/crossentropy": 2.0936522856354713, "loss/hidden": 3.83125, "loss/jsd": 0.0, "loss/logits": 0.2551413768902421, "step": 8810 }, { "epoch": 0.294, "grad_norm": 29.625, "grad_norm_var": 9.57890625, "learning_rate": 0.0001, "loss": 8.4923, "loss/crossentropy": 2.3439304143190385, "loss/hidden": 3.803515625, "loss/jsd": 0.0, "loss/logits": 0.2454788561910391, "step": 8820 }, { "epoch": 0.29433333333333334, "grad_norm": 35.5, "grad_norm_var": 5.115559895833333, "learning_rate": 0.0001, "loss": 8.506, "loss/crossentropy": 2.0696601763367655, "loss/hidden": 3.872265625, "loss/jsd": 0.0, "loss/logits": 0.2583671987056732, "step": 8830 }, { "epoch": 0.2946666666666667, "grad_norm": 30.625, "grad_norm_var": 10.8509765625, "learning_rate": 0.0001, "loss": 8.4108, "loss/crossentropy": 2.0794931963086127, "loss/hidden": 3.76328125, "loss/jsd": 0.0, "loss/logits": 0.2228974211961031, "step": 8840 }, { "epoch": 0.295, "grad_norm": 30.25, "grad_norm_var": 10.792708333333334, "learning_rate": 0.0001, "loss": 8.4583, "loss/crossentropy": 2.083049839735031, "loss/hidden": 3.909765625, "loss/jsd": 0.0, "loss/logits": 0.243487436324358, "step": 8850 }, { "epoch": 0.29533333333333334, "grad_norm": 33.25, "grad_norm_var": 24.915625, "learning_rate": 0.0001, "loss": 8.5509, "loss/crossentropy": 2.260637935996056, "loss/hidden": 3.880859375, "loss/jsd": 0.0, "loss/logits": 0.24824294932186602, "step": 8860 }, { "epoch": 0.2956666666666667, "grad_norm": 29.0, "grad_norm_var": 29.515559895833334, "learning_rate": 0.0001, "loss": 8.3554, "loss/crossentropy": 2.0667995259165766, "loss/hidden": 3.91875, "loss/jsd": 0.0, "loss/logits": 0.24629948288202286, "step": 8870 }, { "epoch": 0.296, "grad_norm": 35.25, "grad_norm_var": 11.530208333333333, "learning_rate": 0.0001, "loss": 8.575, "loss/crossentropy": 2.1920250236988066, "loss/hidden": 3.991796875, "loss/jsd": 0.0, "loss/logits": 0.2532145943492651, "step": 8880 }, { "epoch": 0.29633333333333334, "grad_norm": 32.75, "grad_norm_var": 9.559375, "learning_rate": 0.0001, "loss": 8.3751, "loss/crossentropy": 2.1635709404945374, "loss/hidden": 3.77265625, "loss/jsd": 0.0, "loss/logits": 0.23346599154174327, "step": 8890 }, { "epoch": 0.2966666666666667, "grad_norm": 33.25, "grad_norm_var": 6.438997395833334, "learning_rate": 0.0001, "loss": 8.48, "loss/crossentropy": 2.2036055833101273, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.22973880134522914, "step": 8900 }, { "epoch": 0.297, "grad_norm": 32.5, "grad_norm_var": 5.424739583333333, "learning_rate": 0.0001, "loss": 8.4154, "loss/crossentropy": 2.319879895448685, "loss/hidden": 3.826171875, "loss/jsd": 0.0, "loss/logits": 0.24667385257780552, "step": 8910 }, { "epoch": 0.29733333333333334, "grad_norm": 30.25, "grad_norm_var": 9.153059895833334, "learning_rate": 0.0001, "loss": 8.4004, "loss/crossentropy": 2.2534718930721285, "loss/hidden": 3.858984375, "loss/jsd": 0.0, "loss/logits": 0.25190291851758956, "step": 8920 }, { "epoch": 0.2976666666666667, "grad_norm": 30.375, "grad_norm_var": 6.745833333333334, "learning_rate": 0.0001, "loss": 8.4038, "loss/crossentropy": 2.002136807143688, "loss/hidden": 3.998828125, "loss/jsd": 0.0, "loss/logits": 0.2326902337372303, "step": 8930 }, { "epoch": 0.298, "grad_norm": 30.75, "grad_norm_var": 2.4567057291666665, "learning_rate": 0.0001, "loss": 8.4458, "loss/crossentropy": 2.013306239247322, "loss/hidden": 3.84921875, "loss/jsd": 0.0, "loss/logits": 0.22715070880949498, "step": 8940 }, { "epoch": 0.29833333333333334, "grad_norm": 33.0, "grad_norm_var": 4.373372395833333, "learning_rate": 0.0001, "loss": 8.447, "loss/crossentropy": 2.2556902036070823, "loss/hidden": 3.84921875, "loss/jsd": 0.0, "loss/logits": 0.2423699676990509, "step": 8950 }, { "epoch": 0.2986666666666667, "grad_norm": 29.5, "grad_norm_var": 43.209309895833336, "learning_rate": 0.0001, "loss": 8.4247, "loss/crossentropy": 2.2286648035049437, "loss/hidden": 3.874609375, "loss/jsd": 0.0, "loss/logits": 0.2592891216278076, "step": 8960 }, { "epoch": 0.299, "grad_norm": 30.375, "grad_norm_var": 7.9244140625, "learning_rate": 0.0001, "loss": 8.4795, "loss/crossentropy": 2.003975507616997, "loss/hidden": 3.9015625, "loss/jsd": 0.0, "loss/logits": 0.22656005583703517, "step": 8970 }, { "epoch": 0.29933333333333334, "grad_norm": 29.875, "grad_norm_var": 86.21979166666667, "learning_rate": 0.0001, "loss": 8.3252, "loss/crossentropy": 2.1622410126030447, "loss/hidden": 3.7859375, "loss/jsd": 0.0, "loss/logits": 0.25566081050783396, "step": 8980 }, { "epoch": 0.2996666666666667, "grad_norm": 30.625, "grad_norm_var": 28.685872395833332, "learning_rate": 0.0001, "loss": 8.4789, "loss/crossentropy": 2.012830953299999, "loss/hidden": 3.932421875, "loss/jsd": 0.0, "loss/logits": 0.22984218932688236, "step": 8990 }, { "epoch": 0.3, "grad_norm": 34.5, "grad_norm_var": 53.1634765625, "learning_rate": 0.0001, "loss": 8.5887, "loss/crossentropy": 2.114059830456972, "loss/hidden": 3.7703125, "loss/jsd": 0.0, "loss/logits": 0.23531355792656541, "step": 9000 }, { "epoch": 0.30033333333333334, "grad_norm": 30.875, "grad_norm_var": 16.183333333333334, "learning_rate": 0.0001, "loss": 8.433, "loss/crossentropy": 2.0997436851263047, "loss/hidden": 3.858984375, "loss/jsd": 0.0, "loss/logits": 0.2503551162779331, "step": 9010 }, { "epoch": 0.3006666666666667, "grad_norm": 32.25, "grad_norm_var": 6.585872395833333, "learning_rate": 0.0001, "loss": 8.4156, "loss/crossentropy": 1.9693719133734704, "loss/hidden": 3.89375, "loss/jsd": 0.0, "loss/logits": 0.24317112397402524, "step": 9020 }, { "epoch": 0.301, "grad_norm": 32.75, "grad_norm_var": 6.77265625, "learning_rate": 0.0001, "loss": 8.4748, "loss/crossentropy": 2.2020839557051657, "loss/hidden": 3.834375, "loss/jsd": 0.0, "loss/logits": 0.24702335204929113, "step": 9030 }, { "epoch": 0.30133333333333334, "grad_norm": 32.75, "grad_norm_var": 39.889322916666664, "learning_rate": 0.0001, "loss": 8.5507, "loss/crossentropy": 2.1863808527588846, "loss/hidden": 3.866015625, "loss/jsd": 0.0, "loss/logits": 0.2480563845485449, "step": 9040 }, { "epoch": 0.3016666666666667, "grad_norm": 30.875, "grad_norm_var": 3.921875, "learning_rate": 0.0001, "loss": 8.468, "loss/crossentropy": 2.190934830904007, "loss/hidden": 3.825390625, "loss/jsd": 0.0, "loss/logits": 0.23958997726440429, "step": 9050 }, { "epoch": 0.302, "grad_norm": 32.0, "grad_norm_var": 3.06875, "learning_rate": 0.0001, "loss": 8.6069, "loss/crossentropy": 2.0262902580201625, "loss/hidden": 3.857421875, "loss/jsd": 0.0, "loss/logits": 0.238032066822052, "step": 9060 }, { "epoch": 0.30233333333333334, "grad_norm": 31.125, "grad_norm_var": 10.905143229166667, "learning_rate": 0.0001, "loss": 8.5303, "loss/crossentropy": 2.250582979619503, "loss/hidden": 3.889453125, "loss/jsd": 0.0, "loss/logits": 0.2632708761841059, "step": 9070 }, { "epoch": 0.30266666666666664, "grad_norm": 32.0, "grad_norm_var": 7.757747395833333, "learning_rate": 0.0001, "loss": 8.3925, "loss/crossentropy": 2.2143336325883864, "loss/hidden": 3.739453125, "loss/jsd": 0.0, "loss/logits": 0.24901481308043003, "step": 9080 }, { "epoch": 0.303, "grad_norm": 29.0, "grad_norm_var": 4.017122395833334, "learning_rate": 0.0001, "loss": 8.2542, "loss/crossentropy": 1.9929497942328454, "loss/hidden": 3.869921875, "loss/jsd": 0.0, "loss/logits": 0.21925227269530295, "step": 9090 }, { "epoch": 0.30333333333333334, "grad_norm": 35.25, "grad_norm_var": 5.552083333333333, "learning_rate": 0.0001, "loss": 8.4873, "loss/crossentropy": 2.0095451258122923, "loss/hidden": 3.928515625, "loss/jsd": 0.0, "loss/logits": 0.2549102198332548, "step": 9100 }, { "epoch": 0.30366666666666664, "grad_norm": 31.875, "grad_norm_var": 382.7212890625, "learning_rate": 0.0001, "loss": 8.2141, "loss/crossentropy": 2.1979493319988253, "loss/hidden": 3.853515625, "loss/jsd": 0.0, "loss/logits": 0.24009186886250972, "step": 9110 }, { "epoch": 0.304, "grad_norm": 27.375, "grad_norm_var": 8.060416666666667, "learning_rate": 0.0001, "loss": 8.4104, "loss/crossentropy": 1.9613987788558007, "loss/hidden": 3.8875, "loss/jsd": 0.0, "loss/logits": 0.24363567791879176, "step": 9120 }, { "epoch": 0.30433333333333334, "grad_norm": 31.75, "grad_norm_var": 4.2900390625, "learning_rate": 0.0001, "loss": 8.2729, "loss/crossentropy": 2.1863881021738054, "loss/hidden": 3.91875, "loss/jsd": 0.0, "loss/logits": 0.2690675131976604, "step": 9130 }, { "epoch": 0.30466666666666664, "grad_norm": 33.75, "grad_norm_var": 6.741080729166667, "learning_rate": 0.0001, "loss": 8.5514, "loss/crossentropy": 2.2427688628435134, "loss/hidden": 3.885546875, "loss/jsd": 0.0, "loss/logits": 0.26169066652655604, "step": 9140 }, { "epoch": 0.305, "grad_norm": 37.0, "grad_norm_var": 285.70598958333335, "learning_rate": 0.0001, "loss": 8.6121, "loss/crossentropy": 1.990150697529316, "loss/hidden": 3.994921875, "loss/jsd": 0.0, "loss/logits": 0.24852563850581647, "step": 9150 }, { "epoch": 0.30533333333333335, "grad_norm": 29.125, "grad_norm_var": 306.2962890625, "learning_rate": 0.0001, "loss": 8.3866, "loss/crossentropy": 2.1101179368793965, "loss/hidden": 3.81328125, "loss/jsd": 0.0, "loss/logits": 0.22837954824790357, "step": 9160 }, { "epoch": 0.30566666666666664, "grad_norm": 32.25, "grad_norm_var": 2.093489583333333, "learning_rate": 0.0001, "loss": 8.4458, "loss/crossentropy": 2.197181521356106, "loss/hidden": 3.775390625, "loss/jsd": 0.0, "loss/logits": 0.22719864509999751, "step": 9170 }, { "epoch": 0.306, "grad_norm": 33.5, "grad_norm_var": 7.672916666666667, "learning_rate": 0.0001, "loss": 8.4434, "loss/crossentropy": 2.16142196059227, "loss/hidden": 3.771875, "loss/jsd": 0.0, "loss/logits": 0.23021659553050994, "step": 9180 }, { "epoch": 0.30633333333333335, "grad_norm": 31.75, "grad_norm_var": 9.355989583333333, "learning_rate": 0.0001, "loss": 8.3877, "loss/crossentropy": 2.226871684193611, "loss/hidden": 3.780859375, "loss/jsd": 0.0, "loss/logits": 0.24033361952751875, "step": 9190 }, { "epoch": 0.30666666666666664, "grad_norm": 33.75, "grad_norm_var": 9.868489583333334, "learning_rate": 0.0001, "loss": 8.3055, "loss/crossentropy": 2.077723103761673, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.22728441171348096, "step": 9200 }, { "epoch": 0.307, "grad_norm": 30.125, "grad_norm_var": 2.35390625, "learning_rate": 0.0001, "loss": 8.3532, "loss/crossentropy": 1.9681759729981423, "loss/hidden": 3.755859375, "loss/jsd": 0.0, "loss/logits": 0.21074463604018093, "step": 9210 }, { "epoch": 0.30733333333333335, "grad_norm": 29.0, "grad_norm_var": 3.51015625, "learning_rate": 0.0001, "loss": 8.3401, "loss/crossentropy": 2.04879729449749, "loss/hidden": 3.869921875, "loss/jsd": 0.0, "loss/logits": 0.2396139794960618, "step": 9220 }, { "epoch": 0.30766666666666664, "grad_norm": 30.0, "grad_norm_var": 4.790625, "learning_rate": 0.0001, "loss": 8.4286, "loss/crossentropy": 2.1882148049771786, "loss/hidden": 3.808984375, "loss/jsd": 0.0, "loss/logits": 0.24543070700019598, "step": 9230 }, { "epoch": 0.308, "grad_norm": 31.625, "grad_norm_var": 2.678125, "learning_rate": 0.0001, "loss": 8.4211, "loss/crossentropy": 2.2667997002601625, "loss/hidden": 3.884375, "loss/jsd": 0.0, "loss/logits": 0.24577980488538742, "step": 9240 }, { "epoch": 0.30833333333333335, "grad_norm": 29.75, "grad_norm_var": 7.91875, "learning_rate": 0.0001, "loss": 8.219, "loss/crossentropy": 1.994037589430809, "loss/hidden": 3.87890625, "loss/jsd": 0.0, "loss/logits": 0.24291819017380475, "step": 9250 }, { "epoch": 0.30866666666666664, "grad_norm": 31.125, "grad_norm_var": 8.796809895833333, "learning_rate": 0.0001, "loss": 8.3915, "loss/crossentropy": 2.081589598953724, "loss/hidden": 3.825390625, "loss/jsd": 0.0, "loss/logits": 0.2480682110413909, "step": 9260 }, { "epoch": 0.309, "grad_norm": 32.75, "grad_norm_var": 3.5931640625, "learning_rate": 0.0001, "loss": 8.3068, "loss/crossentropy": 2.1044846177101135, "loss/hidden": 3.815625, "loss/jsd": 0.0, "loss/logits": 0.24079927131533624, "step": 9270 }, { "epoch": 0.30933333333333335, "grad_norm": 29.875, "grad_norm_var": 2.27890625, "learning_rate": 0.0001, "loss": 8.2079, "loss/crossentropy": 2.1612381815910338, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.24531424194574356, "step": 9280 }, { "epoch": 0.30966666666666665, "grad_norm": 37.5, "grad_norm_var": 5.51640625, "learning_rate": 0.0001, "loss": 8.4078, "loss/crossentropy": 2.2455517396330835, "loss/hidden": 3.787890625, "loss/jsd": 0.0, "loss/logits": 0.2453090760856867, "step": 9290 }, { "epoch": 0.31, "grad_norm": 30.5, "grad_norm_var": 6.86015625, "learning_rate": 0.0001, "loss": 8.3582, "loss/crossentropy": 2.20194024592638, "loss/hidden": 3.809375, "loss/jsd": 0.0, "loss/logits": 0.23695877343416213, "step": 9300 }, { "epoch": 0.31033333333333335, "grad_norm": 30.0, "grad_norm_var": 1.6389973958333333, "learning_rate": 0.0001, "loss": 8.3247, "loss/crossentropy": 2.0889609307050705, "loss/hidden": 3.86796875, "loss/jsd": 0.0, "loss/logits": 0.2596073430031538, "step": 9310 }, { "epoch": 0.31066666666666665, "grad_norm": 30.5, "grad_norm_var": 2.6884765625, "learning_rate": 0.0001, "loss": 8.352, "loss/crossentropy": 2.1579457476735113, "loss/hidden": 3.731640625, "loss/jsd": 0.0, "loss/logits": 0.2405384209007025, "step": 9320 }, { "epoch": 0.311, "grad_norm": 33.25, "grad_norm_var": 4.2119140625, "learning_rate": 0.0001, "loss": 8.4274, "loss/crossentropy": 2.1878247916698457, "loss/hidden": 3.809765625, "loss/jsd": 0.0, "loss/logits": 0.25121636800467967, "step": 9330 }, { "epoch": 0.31133333333333335, "grad_norm": 31.75, "grad_norm_var": 10.9603515625, "learning_rate": 0.0001, "loss": 8.4321, "loss/crossentropy": 1.912748570740223, "loss/hidden": 3.85703125, "loss/jsd": 0.0, "loss/logits": 0.23128144443035126, "step": 9340 }, { "epoch": 0.31166666666666665, "grad_norm": 33.25, "grad_norm_var": 2.4077473958333333, "learning_rate": 0.0001, "loss": 8.3519, "loss/crossentropy": 2.1460745990276338, "loss/hidden": 3.698046875, "loss/jsd": 0.0, "loss/logits": 0.21920422809198498, "step": 9350 }, { "epoch": 0.312, "grad_norm": 33.25, "grad_norm_var": 6.1134765625, "learning_rate": 0.0001, "loss": 8.464, "loss/crossentropy": 2.0449389033019543, "loss/hidden": 3.89140625, "loss/jsd": 0.0, "loss/logits": 0.23441595807671547, "step": 9360 }, { "epoch": 0.31233333333333335, "grad_norm": 31.875, "grad_norm_var": 9.36015625, "learning_rate": 0.0001, "loss": 8.4169, "loss/crossentropy": 2.0924183890223502, "loss/hidden": 3.936328125, "loss/jsd": 0.0, "loss/logits": 0.25249840430915355, "step": 9370 }, { "epoch": 0.31266666666666665, "grad_norm": 30.625, "grad_norm_var": 2.5635416666666666, "learning_rate": 0.0001, "loss": 8.3858, "loss/crossentropy": 2.085674402490258, "loss/hidden": 3.870703125, "loss/jsd": 0.0, "loss/logits": 0.2478548888117075, "step": 9380 }, { "epoch": 0.313, "grad_norm": 31.625, "grad_norm_var": 1.5947916666666666, "learning_rate": 0.0001, "loss": 8.3522, "loss/crossentropy": 2.216167467832565, "loss/hidden": 3.952734375, "loss/jsd": 0.0, "loss/logits": 0.2461514551192522, "step": 9390 }, { "epoch": 0.31333333333333335, "grad_norm": 27.25, "grad_norm_var": 4.6212890625, "learning_rate": 0.0001, "loss": 8.3455, "loss/crossentropy": 2.0884574115276338, "loss/hidden": 3.975, "loss/jsd": 0.0, "loss/logits": 0.23749772738665342, "step": 9400 }, { "epoch": 0.31366666666666665, "grad_norm": 31.5, "grad_norm_var": 4.935872395833333, "learning_rate": 0.0001, "loss": 8.4968, "loss/crossentropy": 2.0512664087116717, "loss/hidden": 3.9125, "loss/jsd": 0.0, "loss/logits": 0.24940967485308646, "step": 9410 }, { "epoch": 0.314, "grad_norm": 32.75, "grad_norm_var": 3.8374348958333333, "learning_rate": 0.0001, "loss": 8.507, "loss/crossentropy": 2.1305520750582216, "loss/hidden": 3.708203125, "loss/jsd": 0.0, "loss/logits": 0.22748439833521844, "step": 9420 }, { "epoch": 0.31433333333333335, "grad_norm": 33.25, "grad_norm_var": 3.1197265625, "learning_rate": 0.0001, "loss": 8.3505, "loss/crossentropy": 2.1478428706526755, "loss/hidden": 3.8171875, "loss/jsd": 0.0, "loss/logits": 0.26068285927176477, "step": 9430 }, { "epoch": 0.31466666666666665, "grad_norm": 31.0, "grad_norm_var": 10.255989583333333, "learning_rate": 0.0001, "loss": 8.4617, "loss/crossentropy": 2.257087817788124, "loss/hidden": 3.928515625, "loss/jsd": 0.0, "loss/logits": 0.24564366415143013, "step": 9440 }, { "epoch": 0.315, "grad_norm": 30.625, "grad_norm_var": 3.189322916666667, "learning_rate": 0.0001, "loss": 8.5139, "loss/crossentropy": 2.2028581954538824, "loss/hidden": 3.878125, "loss/jsd": 0.0, "loss/logits": 0.24985253605991603, "step": 9450 }, { "epoch": 0.31533333333333335, "grad_norm": 32.25, "grad_norm_var": 12.878059895833333, "learning_rate": 0.0001, "loss": 8.3581, "loss/crossentropy": 2.2839835971593856, "loss/hidden": 3.7609375, "loss/jsd": 0.0, "loss/logits": 0.24003779105842113, "step": 9460 }, { "epoch": 0.31566666666666665, "grad_norm": 27.125, "grad_norm_var": 13.624934895833333, "learning_rate": 0.0001, "loss": 8.3315, "loss/crossentropy": 2.1846924126148224, "loss/hidden": 3.828515625, "loss/jsd": 0.0, "loss/logits": 0.2577309591695666, "step": 9470 }, { "epoch": 0.316, "grad_norm": 30.875, "grad_norm_var": 11.371809895833334, "learning_rate": 0.0001, "loss": 8.3005, "loss/crossentropy": 2.1467300802469254, "loss/hidden": 3.784765625, "loss/jsd": 0.0, "loss/logits": 0.2385113213211298, "step": 9480 }, { "epoch": 0.31633333333333336, "grad_norm": 32.25, "grad_norm_var": 6.6697265625, "learning_rate": 0.0001, "loss": 8.4251, "loss/crossentropy": 2.0814339205622674, "loss/hidden": 3.93984375, "loss/jsd": 0.0, "loss/logits": 0.2647860247641802, "step": 9490 }, { "epoch": 0.31666666666666665, "grad_norm": 40.0, "grad_norm_var": 10.087955729166667, "learning_rate": 0.0001, "loss": 8.2889, "loss/crossentropy": 2.0348707735538483, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.23109398372471332, "step": 9500 }, { "epoch": 0.317, "grad_norm": 31.25, "grad_norm_var": 8.975, "learning_rate": 0.0001, "loss": 8.3453, "loss/crossentropy": 2.011518883705139, "loss/hidden": 3.915234375, "loss/jsd": 0.0, "loss/logits": 0.24540937803685664, "step": 9510 }, { "epoch": 0.31733333333333336, "grad_norm": 30.125, "grad_norm_var": 2.7587890625, "learning_rate": 0.0001, "loss": 8.3162, "loss/crossentropy": 2.1556458704173567, "loss/hidden": 3.7796875, "loss/jsd": 0.0, "loss/logits": 0.2444867927581072, "step": 9520 }, { "epoch": 0.31766666666666665, "grad_norm": 32.75, "grad_norm_var": 3.5035807291666665, "learning_rate": 0.0001, "loss": 8.3472, "loss/crossentropy": 2.081064721941948, "loss/hidden": 3.833984375, "loss/jsd": 0.0, "loss/logits": 0.24423375371843575, "step": 9530 }, { "epoch": 0.318, "grad_norm": 30.0, "grad_norm_var": 9.223372395833334, "learning_rate": 0.0001, "loss": 8.3239, "loss/crossentropy": 2.0272342920303346, "loss/hidden": 4.003515625, "loss/jsd": 0.0, "loss/logits": 0.24455212838947774, "step": 9540 }, { "epoch": 0.31833333333333336, "grad_norm": 34.5, "grad_norm_var": 2.3824041777970545e+18, "learning_rate": 0.0001, "loss": 8.4159, "loss/crossentropy": 2.2179324753582477, "loss/hidden": 3.867578125, "loss/jsd": 0.0, "loss/logits": 0.23996016178280116, "step": 9550 }, { "epoch": 0.31866666666666665, "grad_norm": 30.75, "grad_norm_var": 2.382404178144343e+18, "learning_rate": 0.0001, "loss": 8.3233, "loss/crossentropy": 2.08397556617856, "loss/hidden": 3.85234375, "loss/jsd": 0.0, "loss/logits": 0.22639973247423767, "step": 9560 }, { "epoch": 0.319, "grad_norm": 34.25, "grad_norm_var": 99.1587890625, "learning_rate": 0.0001, "loss": 8.3621, "loss/crossentropy": 2.1781677812337876, "loss/hidden": 3.758984375, "loss/jsd": 0.0, "loss/logits": 0.23908968791365623, "step": 9570 }, { "epoch": 0.31933333333333336, "grad_norm": 31.625, "grad_norm_var": 3.3208333333333333, "learning_rate": 0.0001, "loss": 8.3194, "loss/crossentropy": 2.165592886507511, "loss/hidden": 3.739453125, "loss/jsd": 0.0, "loss/logits": 0.23889986649155617, "step": 9580 }, { "epoch": 0.31966666666666665, "grad_norm": 31.75, "grad_norm_var": 4.983072916666667, "learning_rate": 0.0001, "loss": 8.3029, "loss/crossentropy": 2.0493919394910334, "loss/hidden": 3.8265625, "loss/jsd": 0.0, "loss/logits": 0.21441856175661086, "step": 9590 }, { "epoch": 0.32, "grad_norm": 32.25, "grad_norm_var": 3.874739583333333, "learning_rate": 0.0001, "loss": 8.2748, "loss/crossentropy": 2.035661220550537, "loss/hidden": 3.841796875, "loss/jsd": 0.0, "loss/logits": 0.24448039066046476, "step": 9600 }, { "epoch": 0.32033333333333336, "grad_norm": 28.375, "grad_norm_var": 42.1072265625, "learning_rate": 0.0001, "loss": 8.3015, "loss/crossentropy": 2.126982606202364, "loss/hidden": 3.825390625, "loss/jsd": 0.0, "loss/logits": 0.2457258015871048, "step": 9610 }, { "epoch": 0.32066666666666666, "grad_norm": 28.375, "grad_norm_var": 28.330989583333334, "learning_rate": 0.0001, "loss": 8.2735, "loss/crossentropy": 2.0983067765831946, "loss/hidden": 3.830078125, "loss/jsd": 0.0, "loss/logits": 0.23017309829592705, "step": 9620 }, { "epoch": 0.321, "grad_norm": 31.125, "grad_norm_var": 40.84264322916667, "learning_rate": 0.0001, "loss": 8.3467, "loss/crossentropy": 2.183681347966194, "loss/hidden": 3.85546875, "loss/jsd": 0.0, "loss/logits": 0.24658725559711456, "step": 9630 }, { "epoch": 0.32133333333333336, "grad_norm": 31.625, "grad_norm_var": 27.297916666666666, "learning_rate": 0.0001, "loss": 8.2972, "loss/crossentropy": 2.162729802727699, "loss/hidden": 3.79609375, "loss/jsd": 0.0, "loss/logits": 0.23747125826776028, "step": 9640 }, { "epoch": 0.32166666666666666, "grad_norm": 35.0, "grad_norm_var": 5.288541666666666, "learning_rate": 0.0001, "loss": 8.434, "loss/crossentropy": 2.1427035361528395, "loss/hidden": 3.78671875, "loss/jsd": 0.0, "loss/logits": 0.249039919488132, "step": 9650 }, { "epoch": 0.322, "grad_norm": 28.875, "grad_norm_var": 6.216666666666667, "learning_rate": 0.0001, "loss": 8.2608, "loss/crossentropy": 2.2181741327047346, "loss/hidden": 3.7984375, "loss/jsd": 0.0, "loss/logits": 0.2580121297389269, "step": 9660 }, { "epoch": 0.32233333333333336, "grad_norm": 32.0, "grad_norm_var": 2.3535807291666666, "learning_rate": 0.0001, "loss": 8.3163, "loss/crossentropy": 2.1240747086703777, "loss/hidden": 3.805859375, "loss/jsd": 0.0, "loss/logits": 0.23749625347554684, "step": 9670 }, { "epoch": 0.32266666666666666, "grad_norm": 31.0, "grad_norm_var": 10.026041666666666, "learning_rate": 0.0001, "loss": 8.3716, "loss/crossentropy": 2.178921973705292, "loss/hidden": 3.826953125, "loss/jsd": 0.0, "loss/logits": 0.23335804082453251, "step": 9680 }, { "epoch": 0.323, "grad_norm": 29.875, "grad_norm_var": 3.7514973958333333, "learning_rate": 0.0001, "loss": 8.2053, "loss/crossentropy": 1.9151308126747608, "loss/hidden": 3.718359375, "loss/jsd": 0.0, "loss/logits": 0.22061082273721694, "step": 9690 }, { "epoch": 0.3233333333333333, "grad_norm": 32.25, "grad_norm_var": 6.75, "learning_rate": 0.0001, "loss": 8.2783, "loss/crossentropy": 2.1763371601700783, "loss/hidden": 3.832421875, "loss/jsd": 0.0, "loss/logits": 0.24064910151064395, "step": 9700 }, { "epoch": 0.32366666666666666, "grad_norm": 35.5, "grad_norm_var": 11.44765625, "learning_rate": 0.0001, "loss": 8.5377, "loss/crossentropy": 2.058997410535812, "loss/hidden": 3.8796875, "loss/jsd": 0.0, "loss/logits": 0.23803395926952362, "step": 9710 }, { "epoch": 0.324, "grad_norm": 29.0, "grad_norm_var": 4.698372395833333, "learning_rate": 0.0001, "loss": 8.1845, "loss/crossentropy": 2.0421560525894167, "loss/hidden": 3.766015625, "loss/jsd": 0.0, "loss/logits": 0.22774729747325181, "step": 9720 }, { "epoch": 0.3243333333333333, "grad_norm": 29.5, "grad_norm_var": 31.884309895833333, "learning_rate": 0.0001, "loss": 8.2768, "loss/crossentropy": 2.1532857537269594, "loss/hidden": 3.821875, "loss/jsd": 0.0, "loss/logits": 0.24113734550774096, "step": 9730 }, { "epoch": 0.32466666666666666, "grad_norm": 31.0, "grad_norm_var": 7.264518229166667, "learning_rate": 0.0001, "loss": 8.4747, "loss/crossentropy": 2.0760410211980345, "loss/hidden": 3.835546875, "loss/jsd": 0.0, "loss/logits": 0.23573338724672793, "step": 9740 }, { "epoch": 0.325, "grad_norm": 29.375, "grad_norm_var": 6.7337890625, "learning_rate": 0.0001, "loss": 8.3715, "loss/crossentropy": 2.2192045249044896, "loss/hidden": 3.683203125, "loss/jsd": 0.0, "loss/logits": 0.21446713693439962, "step": 9750 }, { "epoch": 0.3253333333333333, "grad_norm": 31.25, "grad_norm_var": 8.4291015625, "learning_rate": 0.0001, "loss": 8.3085, "loss/crossentropy": 2.195969696342945, "loss/hidden": 3.7796875, "loss/jsd": 0.0, "loss/logits": 0.24359578415751457, "step": 9760 }, { "epoch": 0.32566666666666666, "grad_norm": 29.375, "grad_norm_var": 19.993489583333332, "learning_rate": 0.0001, "loss": 8.3417, "loss/crossentropy": 2.0906290262937546, "loss/hidden": 3.74921875, "loss/jsd": 0.0, "loss/logits": 0.2237309933640063, "step": 9770 }, { "epoch": 0.326, "grad_norm": 36.75, "grad_norm_var": 21.134830729166666, "learning_rate": 0.0001, "loss": 8.5136, "loss/crossentropy": 2.0349312365055083, "loss/hidden": 4.02265625, "loss/jsd": 0.0, "loss/logits": 0.25023720134049654, "step": 9780 }, { "epoch": 0.3263333333333333, "grad_norm": 33.5, "grad_norm_var": 6.266666666666667, "learning_rate": 0.0001, "loss": 8.3713, "loss/crossentropy": 2.0479774929583074, "loss/hidden": 3.88203125, "loss/jsd": 0.0, "loss/logits": 0.2510897688567638, "step": 9790 }, { "epoch": 0.32666666666666666, "grad_norm": 29.25, "grad_norm_var": 3.5947916666666666, "learning_rate": 0.0001, "loss": 8.3595, "loss/crossentropy": 2.1898303270339965, "loss/hidden": 3.8953125, "loss/jsd": 0.0, "loss/logits": 0.25962252635508776, "step": 9800 }, { "epoch": 0.327, "grad_norm": 36.5, "grad_norm_var": 6.862434895833333, "learning_rate": 0.0001, "loss": 8.3717, "loss/crossentropy": 2.0998566307127478, "loss/hidden": 3.752734375, "loss/jsd": 0.0, "loss/logits": 0.2255854407325387, "step": 9810 }, { "epoch": 0.3273333333333333, "grad_norm": 30.125, "grad_norm_var": 7.448893229166667, "learning_rate": 0.0001, "loss": 8.3888, "loss/crossentropy": 2.15216224193573, "loss/hidden": 3.88203125, "loss/jsd": 0.0, "loss/logits": 0.24142319150269032, "step": 9820 }, { "epoch": 0.32766666666666666, "grad_norm": 32.5, "grad_norm_var": 5.168684895833334, "learning_rate": 0.0001, "loss": 8.2977, "loss/crossentropy": 2.0397166281938555, "loss/hidden": 3.85859375, "loss/jsd": 0.0, "loss/logits": 0.24393697939813136, "step": 9830 }, { "epoch": 0.328, "grad_norm": 30.125, "grad_norm_var": 3.12890625, "learning_rate": 0.0001, "loss": 8.3692, "loss/crossentropy": 2.008080554753542, "loss/hidden": 3.7828125, "loss/jsd": 0.0, "loss/logits": 0.22885626852512359, "step": 9840 }, { "epoch": 0.3283333333333333, "grad_norm": 30.5, "grad_norm_var": 2.624739583333333, "learning_rate": 0.0001, "loss": 8.3227, "loss/crossentropy": 2.1162449195981026, "loss/hidden": 3.86953125, "loss/jsd": 0.0, "loss/logits": 0.2397829968482256, "step": 9850 }, { "epoch": 0.32866666666666666, "grad_norm": 29.5, "grad_norm_var": 5.151041666666667, "learning_rate": 0.0001, "loss": 8.347, "loss/crossentropy": 2.2286925226449967, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.24280091263353826, "step": 9860 }, { "epoch": 0.329, "grad_norm": 30.375, "grad_norm_var": 8.0072265625, "learning_rate": 0.0001, "loss": 8.3725, "loss/crossentropy": 2.1761581540107726, "loss/hidden": 3.762890625, "loss/jsd": 0.0, "loss/logits": 0.22693675048649312, "step": 9870 }, { "epoch": 0.3293333333333333, "grad_norm": 29.125, "grad_norm_var": 2.90390625, "learning_rate": 0.0001, "loss": 8.3481, "loss/crossentropy": 2.080559401214123, "loss/hidden": 3.91015625, "loss/jsd": 0.0, "loss/logits": 0.23833436965942384, "step": 9880 }, { "epoch": 0.32966666666666666, "grad_norm": 28.625, "grad_norm_var": 10.793489583333333, "learning_rate": 0.0001, "loss": 8.2629, "loss/crossentropy": 2.1615509897470475, "loss/hidden": 3.900390625, "loss/jsd": 0.0, "loss/logits": 0.2443408088758588, "step": 9890 }, { "epoch": 0.33, "grad_norm": 29.0, "grad_norm_var": 13.769205729166666, "learning_rate": 0.0001, "loss": 8.4146, "loss/crossentropy": 2.0900515958666803, "loss/hidden": 3.8265625, "loss/jsd": 0.0, "loss/logits": 0.23338977247476578, "step": 9900 }, { "epoch": 0.3303333333333333, "grad_norm": 29.875, "grad_norm_var": 5.7009765625, "learning_rate": 0.0001, "loss": 8.3165, "loss/crossentropy": 2.0612318471074103, "loss/hidden": 3.8640625, "loss/jsd": 0.0, "loss/logits": 0.2572694033384323, "step": 9910 }, { "epoch": 0.33066666666666666, "grad_norm": 38.5, "grad_norm_var": 9.492122395833333, "learning_rate": 0.0001, "loss": 8.3621, "loss/crossentropy": 2.1355748385190965, "loss/hidden": 3.861328125, "loss/jsd": 0.0, "loss/logits": 0.2555574133992195, "step": 9920 }, { "epoch": 0.331, "grad_norm": 31.875, "grad_norm_var": 8.562434895833333, "learning_rate": 0.0001, "loss": 8.2093, "loss/crossentropy": 2.032300639897585, "loss/hidden": 3.8375, "loss/jsd": 0.0, "loss/logits": 0.25119177643209695, "step": 9930 }, { "epoch": 0.3313333333333333, "grad_norm": 33.25, "grad_norm_var": 1.31640625, "learning_rate": 0.0001, "loss": 8.3696, "loss/crossentropy": 2.0900708585977554, "loss/hidden": 3.874609375, "loss/jsd": 0.0, "loss/logits": 0.23378594107925893, "step": 9940 }, { "epoch": 0.33166666666666667, "grad_norm": 34.5, "grad_norm_var": 55.3228515625, "learning_rate": 0.0001, "loss": 8.3493, "loss/crossentropy": 1.9557788401842118, "loss/hidden": 3.908984375, "loss/jsd": 0.0, "loss/logits": 0.24859324246644973, "step": 9950 }, { "epoch": 0.332, "grad_norm": 30.625, "grad_norm_var": 58.469791666666666, "learning_rate": 0.0001, "loss": 8.3664, "loss/crossentropy": 2.041897915303707, "loss/hidden": 3.775, "loss/jsd": 0.0, "loss/logits": 0.229884634912014, "step": 9960 }, { "epoch": 0.3323333333333333, "grad_norm": 33.5, "grad_norm_var": 3.70390625, "learning_rate": 0.0001, "loss": 8.4056, "loss/crossentropy": 2.10531293079257, "loss/hidden": 3.849609375, "loss/jsd": 0.0, "loss/logits": 0.23604489639401435, "step": 9970 }, { "epoch": 0.33266666666666667, "grad_norm": 33.0, "grad_norm_var": 15.415559895833333, "learning_rate": 0.0001, "loss": 8.2762, "loss/crossentropy": 2.3318694084882736, "loss/hidden": 3.791015625, "loss/jsd": 0.0, "loss/logits": 0.24015157260000705, "step": 9980 }, { "epoch": 0.333, "grad_norm": 30.625, "grad_norm_var": 3.520768229166667, "learning_rate": 0.0001, "loss": 8.3135, "loss/crossentropy": 2.2009642593562604, "loss/hidden": 3.73671875, "loss/jsd": 0.0, "loss/logits": 0.22714318558573723, "step": 9990 }, { "epoch": 0.3333333333333333, "grad_norm": 28.5, "grad_norm_var": 3.31875, "learning_rate": 0.0001, "loss": 8.0996, "loss/crossentropy": 1.9759045481681823, "loss/hidden": 3.86015625, "loss/jsd": 0.0, "loss/logits": 0.2291174167767167, "step": 10000 }, { "epoch": 0.33366666666666667, "grad_norm": 32.25, "grad_norm_var": 7.280143229166667, "learning_rate": 0.0001, "loss": 8.3198, "loss/crossentropy": 2.250378394126892, "loss/hidden": 3.8390625, "loss/jsd": 0.0, "loss/logits": 0.24800184294581412, "step": 10010 }, { "epoch": 0.334, "grad_norm": 37.5, "grad_norm_var": 1.892637721373547e+18, "learning_rate": 0.0001, "loss": 8.5697, "loss/crossentropy": 2.0878022998571395, "loss/hidden": 3.894140625, "loss/jsd": 0.0, "loss/logits": 0.24692457877099513, "step": 10020 }, { "epoch": 0.3343333333333333, "grad_norm": 34.25, "grad_norm_var": 17.612239583333334, "learning_rate": 0.0001, "loss": 8.4287, "loss/crossentropy": 2.1991532504558564, "loss/hidden": 3.73828125, "loss/jsd": 0.0, "loss/logits": 0.22239614240825176, "step": 10030 }, { "epoch": 0.33466666666666667, "grad_norm": 33.75, "grad_norm_var": 14.96015625, "learning_rate": 0.0001, "loss": 8.3611, "loss/crossentropy": 2.1013733208179475, "loss/hidden": 3.845703125, "loss/jsd": 0.0, "loss/logits": 0.2412415651604533, "step": 10040 }, { "epoch": 0.335, "grad_norm": 31.625, "grad_norm_var": 2.1184895833333335, "learning_rate": 0.0001, "loss": 8.3079, "loss/crossentropy": 2.1323711395263674, "loss/hidden": 3.775390625, "loss/jsd": 0.0, "loss/logits": 0.2319201186299324, "step": 10050 }, { "epoch": 0.3353333333333333, "grad_norm": 28.5, "grad_norm_var": 4.184309895833334, "learning_rate": 0.0001, "loss": 8.3487, "loss/crossentropy": 2.1746664479374886, "loss/hidden": 3.869921875, "loss/jsd": 0.0, "loss/logits": 0.24217009954154492, "step": 10060 }, { "epoch": 0.33566666666666667, "grad_norm": 31.0, "grad_norm_var": 4.6712890625, "learning_rate": 0.0001, "loss": 8.4495, "loss/crossentropy": 2.1526307716965674, "loss/hidden": 3.76875, "loss/jsd": 0.0, "loss/logits": 0.24897574950009585, "step": 10070 }, { "epoch": 0.336, "grad_norm": 33.25, "grad_norm_var": 3.486458333333333, "learning_rate": 0.0001, "loss": 8.3761, "loss/crossentropy": 2.175819969177246, "loss/hidden": 3.89453125, "loss/jsd": 0.0, "loss/logits": 0.2495252525433898, "step": 10080 }, { "epoch": 0.3363333333333333, "grad_norm": 31.375, "grad_norm_var": 3.7577473958333334, "learning_rate": 0.0001, "loss": 8.2976, "loss/crossentropy": 2.1154340267181397, "loss/hidden": 3.886328125, "loss/jsd": 0.0, "loss/logits": 0.2435309149324894, "step": 10090 }, { "epoch": 0.33666666666666667, "grad_norm": 28.25, "grad_norm_var": 5.289583333333334, "learning_rate": 0.0001, "loss": 8.2897, "loss/crossentropy": 1.9862043529748916, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.22356732599437237, "step": 10100 }, { "epoch": 0.337, "grad_norm": 30.125, "grad_norm_var": 4.676497395833334, "learning_rate": 0.0001, "loss": 8.4647, "loss/crossentropy": 2.149342668801546, "loss/hidden": 3.8328125, "loss/jsd": 0.0, "loss/logits": 0.25235841386020186, "step": 10110 }, { "epoch": 0.3373333333333333, "grad_norm": 41.75, "grad_norm_var": 70.57057291666666, "learning_rate": 0.0001, "loss": 8.4197, "loss/crossentropy": 2.3661131516098974, "loss/hidden": 3.769140625, "loss/jsd": 0.0, "loss/logits": 0.25782175101339816, "step": 10120 }, { "epoch": 0.33766666666666667, "grad_norm": 30.75, "grad_norm_var": 295.07604166666664, "learning_rate": 0.0001, "loss": 8.2983, "loss/crossentropy": 2.2677551925182344, "loss/hidden": 3.787890625, "loss/jsd": 0.0, "loss/logits": 0.24740365371108056, "step": 10130 }, { "epoch": 0.338, "grad_norm": 35.5, "grad_norm_var": 284.6176432291667, "learning_rate": 0.0001, "loss": 8.4841, "loss/crossentropy": 2.134307199716568, "loss/hidden": 3.8296875, "loss/jsd": 0.0, "loss/logits": 0.23806515689939262, "step": 10140 }, { "epoch": 0.3383333333333333, "grad_norm": 32.5, "grad_norm_var": 4.538997395833333, "learning_rate": 0.0001, "loss": 8.3621, "loss/crossentropy": 2.3126337975263596, "loss/hidden": 3.85703125, "loss/jsd": 0.0, "loss/logits": 0.26148965023458004, "step": 10150 }, { "epoch": 0.33866666666666667, "grad_norm": 37.0, "grad_norm_var": 5.0353515625, "learning_rate": 0.0001, "loss": 8.2753, "loss/crossentropy": 2.0247806657105683, "loss/hidden": 3.89375, "loss/jsd": 0.0, "loss/logits": 0.23136391188018024, "step": 10160 }, { "epoch": 0.339, "grad_norm": 27.75, "grad_norm_var": 7.939518229166667, "learning_rate": 0.0001, "loss": 8.2568, "loss/crossentropy": 2.1857830375432967, "loss/hidden": 3.741796875, "loss/jsd": 0.0, "loss/logits": 0.23809754736721517, "step": 10170 }, { "epoch": 0.3393333333333333, "grad_norm": 31.75, "grad_norm_var": 6.262239583333334, "learning_rate": 0.0001, "loss": 8.321, "loss/crossentropy": 2.045456614345312, "loss/hidden": 3.776953125, "loss/jsd": 0.0, "loss/logits": 0.23576115854084492, "step": 10180 }, { "epoch": 0.3396666666666667, "grad_norm": 30.5, "grad_norm_var": 1.8822265625, "learning_rate": 0.0001, "loss": 8.2247, "loss/crossentropy": 2.035719431936741, "loss/hidden": 3.757421875, "loss/jsd": 0.0, "loss/logits": 0.2281944528222084, "step": 10190 }, { "epoch": 0.34, "grad_norm": 31.375, "grad_norm_var": 7.2744140625, "learning_rate": 0.0001, "loss": 8.2969, "loss/crossentropy": 2.130661930143833, "loss/hidden": 3.798046875, "loss/jsd": 0.0, "loss/logits": 0.23119777366518973, "step": 10200 }, { "epoch": 0.3403333333333333, "grad_norm": 38.0, "grad_norm_var": 9.555989583333334, "learning_rate": 0.0001, "loss": 8.3353, "loss/crossentropy": 2.007842856645584, "loss/hidden": 3.954296875, "loss/jsd": 0.0, "loss/logits": 0.2692394644021988, "step": 10210 }, { "epoch": 0.3406666666666667, "grad_norm": 29.75, "grad_norm_var": 6.705208333333333, "learning_rate": 0.0001, "loss": 8.3837, "loss/crossentropy": 2.097426188737154, "loss/hidden": 3.803515625, "loss/jsd": 0.0, "loss/logits": 0.23504080064594746, "step": 10220 }, { "epoch": 0.341, "grad_norm": 34.75, "grad_norm_var": 5.630143229166666, "learning_rate": 0.0001, "loss": 8.3845, "loss/crossentropy": 2.030475867539644, "loss/hidden": 3.895703125, "loss/jsd": 0.0, "loss/logits": 0.2574477320536971, "step": 10230 }, { "epoch": 0.3413333333333333, "grad_norm": 29.75, "grad_norm_var": 4.81640625, "learning_rate": 0.0001, "loss": 8.3345, "loss/crossentropy": 2.1374482408165933, "loss/hidden": 3.755859375, "loss/jsd": 0.0, "loss/logits": 0.23863786160945893, "step": 10240 }, { "epoch": 0.3416666666666667, "grad_norm": 31.625, "grad_norm_var": 3.5625, "learning_rate": 0.0001, "loss": 8.4214, "loss/crossentropy": 1.949062729626894, "loss/hidden": 3.873828125, "loss/jsd": 0.0, "loss/logits": 0.24437980260699987, "step": 10250 }, { "epoch": 0.342, "grad_norm": 31.75, "grad_norm_var": 3.8447265625, "learning_rate": 0.0001, "loss": 8.4116, "loss/crossentropy": 2.06142435669899, "loss/hidden": 3.733984375, "loss/jsd": 0.0, "loss/logits": 0.21934528928250074, "step": 10260 }, { "epoch": 0.3423333333333333, "grad_norm": 29.25, "grad_norm_var": 4.991080729166667, "learning_rate": 0.0001, "loss": 8.3459, "loss/crossentropy": 2.0533831655979156, "loss/hidden": 3.821484375, "loss/jsd": 0.0, "loss/logits": 0.237164250575006, "step": 10270 }, { "epoch": 0.3426666666666667, "grad_norm": 29.875, "grad_norm_var": 8.5125, "learning_rate": 0.0001, "loss": 8.4675, "loss/crossentropy": 2.220197274535894, "loss/hidden": 3.89140625, "loss/jsd": 0.0, "loss/logits": 0.253680607303977, "step": 10280 }, { "epoch": 0.343, "grad_norm": 31.625, "grad_norm_var": 4.987239583333333, "learning_rate": 0.0001, "loss": 8.2682, "loss/crossentropy": 2.1430200926959513, "loss/hidden": 3.75703125, "loss/jsd": 0.0, "loss/logits": 0.22555206064134836, "step": 10290 }, { "epoch": 0.3433333333333333, "grad_norm": 28.375, "grad_norm_var": 4.96640625, "learning_rate": 0.0001, "loss": 8.3617, "loss/crossentropy": 2.2133467949926855, "loss/hidden": 3.903125, "loss/jsd": 0.0, "loss/logits": 0.26156550645828247, "step": 10300 }, { "epoch": 0.3436666666666667, "grad_norm": 29.375, "grad_norm_var": 7.2212890625, "learning_rate": 0.0001, "loss": 8.2326, "loss/crossentropy": 2.02518198415637, "loss/hidden": 3.685546875, "loss/jsd": 0.0, "loss/logits": 0.21950181983411313, "step": 10310 }, { "epoch": 0.344, "grad_norm": 29.875, "grad_norm_var": 8.96015625, "learning_rate": 0.0001, "loss": 8.2243, "loss/crossentropy": 2.1252332836389543, "loss/hidden": 3.783203125, "loss/jsd": 0.0, "loss/logits": 0.23033270034939052, "step": 10320 }, { "epoch": 0.3443333333333333, "grad_norm": 28.25, "grad_norm_var": 7.640625, "learning_rate": 0.0001, "loss": 8.4185, "loss/crossentropy": 2.0800456672906877, "loss/hidden": 3.892578125, "loss/jsd": 0.0, "loss/logits": 0.25349009446799753, "step": 10330 }, { "epoch": 0.3446666666666667, "grad_norm": 6408896512.0, "grad_norm_var": 2.5671221312037934e+18, "learning_rate": 0.0001, "loss": 8.4544, "loss/crossentropy": 2.2498678863048553, "loss/hidden": 3.81015625, "loss/jsd": 0.0, "loss/logits": 0.24522239342331886, "step": 10340 }, { "epoch": 0.345, "grad_norm": 29.75, "grad_norm_var": 2.56712212992869e+18, "learning_rate": 0.0001, "loss": 8.332, "loss/crossentropy": 2.067190906405449, "loss/hidden": 3.79765625, "loss/jsd": 0.0, "loss/logits": 0.22377760540693997, "step": 10350 }, { "epoch": 0.3453333333333333, "grad_norm": 30.875, "grad_norm_var": 35.8744140625, "learning_rate": 0.0001, "loss": 8.3931, "loss/crossentropy": 2.3343340516090394, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.23258175030350686, "step": 10360 }, { "epoch": 0.3456666666666667, "grad_norm": 30.125, "grad_norm_var": 5.64765625, "learning_rate": 0.0001, "loss": 8.2511, "loss/crossentropy": 2.0171373963356016, "loss/hidden": 3.93359375, "loss/jsd": 0.0, "loss/logits": 0.24750286806374788, "step": 10370 }, { "epoch": 0.346, "grad_norm": 34.0, "grad_norm_var": 2117.798372395833, "learning_rate": 0.0001, "loss": 8.4717, "loss/crossentropy": 2.027893168479204, "loss/hidden": 4.028125, "loss/jsd": 0.0, "loss/logits": 0.256832991912961, "step": 10380 }, { "epoch": 0.3463333333333333, "grad_norm": 33.75, "grad_norm_var": 2119.6337890625, "learning_rate": 0.0001, "loss": 8.4619, "loss/crossentropy": 2.106752772629261, "loss/hidden": 3.865625, "loss/jsd": 0.0, "loss/logits": 0.2447080912068486, "step": 10390 }, { "epoch": 0.3466666666666667, "grad_norm": 32.5, "grad_norm_var": 10.728059895833333, "learning_rate": 0.0001, "loss": 8.4674, "loss/crossentropy": 2.150819255411625, "loss/hidden": 3.839453125, "loss/jsd": 0.0, "loss/logits": 0.2522192716598511, "step": 10400 }, { "epoch": 0.347, "grad_norm": 29.25, "grad_norm_var": 19.118489583333332, "learning_rate": 0.0001, "loss": 8.3815, "loss/crossentropy": 2.1638945795595648, "loss/hidden": 3.760546875, "loss/jsd": 0.0, "loss/logits": 0.2254624404013157, "step": 10410 }, { "epoch": 0.3473333333333333, "grad_norm": 27.625, "grad_norm_var": 5.62890625, "learning_rate": 0.0001, "loss": 8.2479, "loss/crossentropy": 2.0657031178474425, "loss/hidden": 3.637890625, "loss/jsd": 0.0, "loss/logits": 0.21996993869543074, "step": 10420 }, { "epoch": 0.3476666666666667, "grad_norm": 28.375, "grad_norm_var": 5.833072916666667, "learning_rate": 0.0001, "loss": 8.3344, "loss/crossentropy": 2.1299983762204646, "loss/hidden": 3.780859375, "loss/jsd": 0.0, "loss/logits": 0.21561280181631445, "step": 10430 }, { "epoch": 0.348, "grad_norm": 28.375, "grad_norm_var": 8.4150390625, "learning_rate": 0.0001, "loss": 8.3951, "loss/crossentropy": 2.1073717825114726, "loss/hidden": 3.808203125, "loss/jsd": 0.0, "loss/logits": 0.21989304553717376, "step": 10440 }, { "epoch": 0.34833333333333333, "grad_norm": 34.0, "grad_norm_var": 3.081184895833333, "learning_rate": 0.0001, "loss": 8.2967, "loss/crossentropy": 2.077288343012333, "loss/hidden": 3.78359375, "loss/jsd": 0.0, "loss/logits": 0.23938167467713356, "step": 10450 }, { "epoch": 0.3486666666666667, "grad_norm": 31.25, "grad_norm_var": 2.089583333333333, "learning_rate": 0.0001, "loss": 8.3636, "loss/crossentropy": 1.9518108278512956, "loss/hidden": 3.88125, "loss/jsd": 0.0, "loss/logits": 0.2389603516086936, "step": 10460 }, { "epoch": 0.349, "grad_norm": 28.625, "grad_norm_var": 1.7738932291666667, "learning_rate": 0.0001, "loss": 8.4079, "loss/crossentropy": 2.0541266784071923, "loss/hidden": 3.709375, "loss/jsd": 0.0, "loss/logits": 0.21510923374444246, "step": 10470 }, { "epoch": 0.34933333333333333, "grad_norm": 29.5, "grad_norm_var": 3.5259765625, "learning_rate": 0.0001, "loss": 8.3548, "loss/crossentropy": 1.9732642628252506, "loss/hidden": 3.90234375, "loss/jsd": 0.0, "loss/logits": 0.23084475416690112, "step": 10480 }, { "epoch": 0.3496666666666667, "grad_norm": 31.25, "grad_norm_var": 2.0927083333333334, "learning_rate": 0.0001, "loss": 8.245, "loss/crossentropy": 2.0425802804529667, "loss/hidden": 3.7640625, "loss/jsd": 0.0, "loss/logits": 0.2300992401316762, "step": 10490 }, { "epoch": 0.35, "grad_norm": 27.375, "grad_norm_var": 3.1681640625, "learning_rate": 0.0001, "loss": 8.3515, "loss/crossentropy": 2.1685428470373154, "loss/hidden": 3.887109375, "loss/jsd": 0.0, "loss/logits": 0.2667371932417154, "step": 10500 }, { "epoch": 0.35033333333333333, "grad_norm": 30.375, "grad_norm_var": 8.442708333333334, "learning_rate": 0.0001, "loss": 8.2839, "loss/crossentropy": 2.0770496785640717, "loss/hidden": 3.8609375, "loss/jsd": 0.0, "loss/logits": 0.23902000039815902, "step": 10510 }, { "epoch": 0.3506666666666667, "grad_norm": 32.0, "grad_norm_var": 8.070768229166667, "learning_rate": 0.0001, "loss": 8.2154, "loss/crossentropy": 2.2012623459100724, "loss/hidden": 3.900390625, "loss/jsd": 0.0, "loss/logits": 0.26947569735348226, "step": 10520 }, { "epoch": 0.351, "grad_norm": 31.875, "grad_norm_var": 6.551822916666667, "learning_rate": 0.0001, "loss": 8.5346, "loss/crossentropy": 2.1797248646616936, "loss/hidden": 3.939453125, "loss/jsd": 0.0, "loss/logits": 0.2714757215231657, "step": 10530 }, { "epoch": 0.35133333333333333, "grad_norm": 28.875, "grad_norm_var": 9.1759765625, "learning_rate": 0.0001, "loss": 8.271, "loss/crossentropy": 2.2226035714149477, "loss/hidden": 3.975, "loss/jsd": 0.0, "loss/logits": 0.2569460779428482, "step": 10540 }, { "epoch": 0.3516666666666667, "grad_norm": 34.0, "grad_norm_var": 9.7556640625, "learning_rate": 0.0001, "loss": 8.4072, "loss/crossentropy": 2.0563524261116983, "loss/hidden": 3.73046875, "loss/jsd": 0.0, "loss/logits": 0.22892842460423707, "step": 10550 }, { "epoch": 0.352, "grad_norm": 38.5, "grad_norm_var": 13.52265625, "learning_rate": 0.0001, "loss": 8.3639, "loss/crossentropy": 2.146382841467857, "loss/hidden": 3.823046875, "loss/jsd": 0.0, "loss/logits": 0.2412761567160487, "step": 10560 }, { "epoch": 0.35233333333333333, "grad_norm": 31.0, "grad_norm_var": 14.380143229166666, "learning_rate": 0.0001, "loss": 8.4058, "loss/crossentropy": 2.0560551561415195, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.22143340446054935, "step": 10570 }, { "epoch": 0.3526666666666667, "grad_norm": 31.375, "grad_norm_var": 6.331705729166667, "learning_rate": 0.0001, "loss": 8.4463, "loss/crossentropy": 1.9261908829212189, "loss/hidden": 3.851953125, "loss/jsd": 0.0, "loss/logits": 0.23094973117113113, "step": 10580 }, { "epoch": 0.353, "grad_norm": 31.125, "grad_norm_var": 765.9447916666667, "learning_rate": 0.0001, "loss": 8.561, "loss/crossentropy": 2.198425108194351, "loss/hidden": 3.90390625, "loss/jsd": 0.0, "loss/logits": 0.32166901491582395, "step": 10590 }, { "epoch": 0.35333333333333333, "grad_norm": 34.0, "grad_norm_var": 68.19557291666666, "learning_rate": 0.0001, "loss": 8.3153, "loss/crossentropy": 2.0817312106490133, "loss/hidden": 3.9421875, "loss/jsd": 0.0, "loss/logits": 0.24946041442453862, "step": 10600 }, { "epoch": 0.3536666666666667, "grad_norm": 32.75, "grad_norm_var": 3.442122395833333, "learning_rate": 0.0001, "loss": 8.2396, "loss/crossentropy": 2.0021930016577243, "loss/hidden": 3.927734375, "loss/jsd": 0.0, "loss/logits": 0.23456851877272128, "step": 10610 }, { "epoch": 0.354, "grad_norm": 31.25, "grad_norm_var": 2.9041015625, "learning_rate": 0.0001, "loss": 8.5516, "loss/crossentropy": 2.3048987478017806, "loss/hidden": 3.7765625, "loss/jsd": 0.0, "loss/logits": 0.2382359316572547, "step": 10620 }, { "epoch": 0.35433333333333333, "grad_norm": 29.625, "grad_norm_var": 2.2348307291666667, "learning_rate": 0.0001, "loss": 8.2786, "loss/crossentropy": 2.1949386417865755, "loss/hidden": 3.804296875, "loss/jsd": 0.0, "loss/logits": 0.2493376847356558, "step": 10630 }, { "epoch": 0.3546666666666667, "grad_norm": 29.375, "grad_norm_var": 3.965625, "learning_rate": 0.0001, "loss": 8.3273, "loss/crossentropy": 2.0018108122050764, "loss/hidden": 3.7875, "loss/jsd": 0.0, "loss/logits": 0.21743584834039212, "step": 10640 }, { "epoch": 0.355, "grad_norm": 31.75, "grad_norm_var": 44.281184895833334, "learning_rate": 0.0001, "loss": 8.2713, "loss/crossentropy": 2.1437037006020545, "loss/hidden": 3.738671875, "loss/jsd": 0.0, "loss/logits": 0.23090928047895432, "step": 10650 }, { "epoch": 0.35533333333333333, "grad_norm": 30.5, "grad_norm_var": 3.9192575434663747e+18, "learning_rate": 0.0001, "loss": 8.478, "loss/crossentropy": 2.2309256963431836, "loss/hidden": 3.974609375, "loss/jsd": 0.0, "loss/logits": 0.2605215635150671, "step": 10660 }, { "epoch": 0.3556666666666667, "grad_norm": 29.375, "grad_norm_var": 12.66015625, "learning_rate": 0.0001, "loss": 8.3008, "loss/crossentropy": 2.087778661772609, "loss/hidden": 3.8953125, "loss/jsd": 0.0, "loss/logits": 0.24346806921530514, "step": 10670 }, { "epoch": 0.356, "grad_norm": 28.375, "grad_norm_var": 3.5434895833333333, "learning_rate": 0.0001, "loss": 8.4067, "loss/crossentropy": 2.0724117450416086, "loss/hidden": 3.762890625, "loss/jsd": 0.0, "loss/logits": 0.23170766066759824, "step": 10680 }, { "epoch": 0.35633333333333334, "grad_norm": 28.25, "grad_norm_var": 3.880143229166667, "learning_rate": 0.0001, "loss": 8.3141, "loss/crossentropy": 2.0804959647357464, "loss/hidden": 3.932421875, "loss/jsd": 0.0, "loss/logits": 0.26033860705792905, "step": 10690 }, { "epoch": 0.3566666666666667, "grad_norm": 31.375, "grad_norm_var": 2.2768229166666667, "learning_rate": 0.0001, "loss": 8.2118, "loss/crossentropy": 2.1898273028433324, "loss/hidden": 3.808203125, "loss/jsd": 0.0, "loss/logits": 0.23994966400787235, "step": 10700 }, { "epoch": 0.357, "grad_norm": 30.625, "grad_norm_var": 38.139322916666664, "learning_rate": 0.0001, "loss": 8.3548, "loss/crossentropy": 2.1299043744802475, "loss/hidden": 3.891796875, "loss/jsd": 0.0, "loss/logits": 0.25432286225259304, "step": 10710 }, { "epoch": 0.35733333333333334, "grad_norm": 30.5, "grad_norm_var": 2.2309895833333333, "learning_rate": 0.0001, "loss": 8.3522, "loss/crossentropy": 2.230180199444294, "loss/hidden": 3.7171875, "loss/jsd": 0.0, "loss/logits": 0.22721530161798, "step": 10720 }, { "epoch": 0.3576666666666667, "grad_norm": 30.375, "grad_norm_var": 7.738541666666666, "learning_rate": 0.0001, "loss": 8.23, "loss/crossentropy": 2.0474620938301085, "loss/hidden": 3.76640625, "loss/jsd": 0.0, "loss/logits": 0.2256452445872128, "step": 10730 }, { "epoch": 0.358, "grad_norm": 31.25, "grad_norm_var": 3.0249348958333333, "learning_rate": 0.0001, "loss": 8.3739, "loss/crossentropy": 2.152415704727173, "loss/hidden": 3.884765625, "loss/jsd": 0.0, "loss/logits": 0.2414580164477229, "step": 10740 }, { "epoch": 0.35833333333333334, "grad_norm": 35.5, "grad_norm_var": 4.0625, "learning_rate": 0.0001, "loss": 8.205, "loss/crossentropy": 2.3166534900665283, "loss/hidden": 3.880859375, "loss/jsd": 0.0, "loss/logits": 0.2524943361058831, "step": 10750 }, { "epoch": 0.3586666666666667, "grad_norm": 32.25, "grad_norm_var": 3.746875, "learning_rate": 0.0001, "loss": 8.3312, "loss/crossentropy": 2.152712790668011, "loss/hidden": 3.825, "loss/jsd": 0.0, "loss/logits": 0.2455398654565215, "step": 10760 }, { "epoch": 0.359, "grad_norm": 29.0, "grad_norm_var": 2.4900390625, "learning_rate": 0.0001, "loss": 8.3316, "loss/crossentropy": 2.1520379945635795, "loss/hidden": 3.7921875, "loss/jsd": 0.0, "loss/logits": 0.23001325819641352, "step": 10770 }, { "epoch": 0.35933333333333334, "grad_norm": 30.5, "grad_norm_var": 2.9306640625, "learning_rate": 0.0001, "loss": 8.2616, "loss/crossentropy": 1.945100226998329, "loss/hidden": 3.8171875, "loss/jsd": 0.0, "loss/logits": 0.23169657299295068, "step": 10780 }, { "epoch": 0.3596666666666667, "grad_norm": 59.0, "grad_norm_var": 58.0666015625, "learning_rate": 0.0001, "loss": 8.3974, "loss/crossentropy": 2.103043520450592, "loss/hidden": 3.84921875, "loss/jsd": 0.0, "loss/logits": 0.24081441648304464, "step": 10790 }, { "epoch": 0.36, "grad_norm": 29.0, "grad_norm_var": 59.16087239583333, "learning_rate": 0.0001, "loss": 8.2643, "loss/crossentropy": 1.9263419553637504, "loss/hidden": 3.848046875, "loss/jsd": 0.0, "loss/logits": 0.22828295342624189, "step": 10800 }, { "epoch": 0.36033333333333334, "grad_norm": 41.75, "grad_norm_var": 14.025, "learning_rate": 0.0001, "loss": 8.3654, "loss/crossentropy": 2.2317003183066846, "loss/hidden": 3.78828125, "loss/jsd": 0.0, "loss/logits": 0.23634406868368388, "step": 10810 }, { "epoch": 0.3606666666666667, "grad_norm": 28.75, "grad_norm_var": 11.925, "learning_rate": 0.0001, "loss": 8.3209, "loss/crossentropy": 2.0176270991563796, "loss/hidden": 3.842578125, "loss/jsd": 0.0, "loss/logits": 0.2656209450215101, "step": 10820 }, { "epoch": 0.361, "grad_norm": 29.875, "grad_norm_var": 3.6561848958333334, "learning_rate": 0.0001, "loss": 8.3232, "loss/crossentropy": 2.127125210314989, "loss/hidden": 3.6890625, "loss/jsd": 0.0, "loss/logits": 0.23250760212540628, "step": 10830 }, { "epoch": 0.36133333333333334, "grad_norm": 27.625, "grad_norm_var": 14.4447265625, "learning_rate": 0.0001, "loss": 8.3106, "loss/crossentropy": 2.065077592432499, "loss/hidden": 3.804296875, "loss/jsd": 0.0, "loss/logits": 0.22122176755219697, "step": 10840 }, { "epoch": 0.3616666666666667, "grad_norm": 32.75, "grad_norm_var": 14.9212890625, "learning_rate": 0.0001, "loss": 8.1455, "loss/crossentropy": 1.9944854885339738, "loss/hidden": 3.878515625, "loss/jsd": 0.0, "loss/logits": 0.2446516625583172, "step": 10850 }, { "epoch": 0.362, "grad_norm": 28.125, "grad_norm_var": 16.002018229166666, "learning_rate": 0.0001, "loss": 8.1836, "loss/crossentropy": 2.1445549219846725, "loss/hidden": 3.805078125, "loss/jsd": 0.0, "loss/logits": 0.24592317193746566, "step": 10860 }, { "epoch": 0.36233333333333334, "grad_norm": 32.0, "grad_norm_var": 4.886393229166667, "learning_rate": 0.0001, "loss": 8.3139, "loss/crossentropy": 2.0830292530357837, "loss/hidden": 3.759765625, "loss/jsd": 0.0, "loss/logits": 0.2338992802426219, "step": 10870 }, { "epoch": 0.3626666666666667, "grad_norm": 28.25, "grad_norm_var": 8.9744140625, "learning_rate": 0.0001, "loss": 8.3703, "loss/crossentropy": 2.074661585688591, "loss/hidden": 3.763671875, "loss/jsd": 0.0, "loss/logits": 0.2267523631453514, "step": 10880 }, { "epoch": 0.363, "grad_norm": 30.0, "grad_norm_var": 6.71015625, "learning_rate": 0.0001, "loss": 8.2861, "loss/crossentropy": 2.218615745007992, "loss/hidden": 3.708984375, "loss/jsd": 0.0, "loss/logits": 0.2329158153384924, "step": 10890 }, { "epoch": 0.36333333333333334, "grad_norm": 31.5, "grad_norm_var": 2.10390625, "learning_rate": 0.0001, "loss": 8.4114, "loss/crossentropy": 2.2118052423000334, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.24035598244518042, "step": 10900 }, { "epoch": 0.3636666666666667, "grad_norm": 32.25, "grad_norm_var": 6.5916015625, "learning_rate": 0.0001, "loss": 8.253, "loss/crossentropy": 1.9713818281888962, "loss/hidden": 3.73828125, "loss/jsd": 0.0, "loss/logits": 0.21810249648988247, "step": 10910 }, { "epoch": 0.364, "grad_norm": 28.125, "grad_norm_var": 3.3604166666666666, "learning_rate": 0.0001, "loss": 8.402, "loss/crossentropy": 2.087474272400141, "loss/hidden": 3.903515625, "loss/jsd": 0.0, "loss/logits": 0.25297512784600257, "step": 10920 }, { "epoch": 0.36433333333333334, "grad_norm": 29.0, "grad_norm_var": 2.8676432291666667, "learning_rate": 0.0001, "loss": 8.3376, "loss/crossentropy": 2.055064349621534, "loss/hidden": 3.871484375, "loss/jsd": 0.0, "loss/logits": 0.23781403247267008, "step": 10930 }, { "epoch": 0.36466666666666664, "grad_norm": 29.25, "grad_norm_var": 3.84140625, "learning_rate": 0.0001, "loss": 8.3393, "loss/crossentropy": 2.240050254762173, "loss/hidden": 3.837890625, "loss/jsd": 0.0, "loss/logits": 0.26323652667924763, "step": 10940 }, { "epoch": 0.365, "grad_norm": 31.875, "grad_norm_var": 5.914322916666666, "learning_rate": 0.0001, "loss": 8.4046, "loss/crossentropy": 2.0274403050541876, "loss/hidden": 3.884765625, "loss/jsd": 0.0, "loss/logits": 0.255847645457834, "step": 10950 }, { "epoch": 0.36533333333333334, "grad_norm": 31.625, "grad_norm_var": 4.284375, "learning_rate": 0.0001, "loss": 8.2417, "loss/crossentropy": 2.148882707953453, "loss/hidden": 3.7375, "loss/jsd": 0.0, "loss/logits": 0.22276342548429967, "step": 10960 }, { "epoch": 0.36566666666666664, "grad_norm": 34.75, "grad_norm_var": 4.983072916666667, "learning_rate": 0.0001, "loss": 8.2975, "loss/crossentropy": 2.1014214023947715, "loss/hidden": 3.73359375, "loss/jsd": 0.0, "loss/logits": 0.2284602228552103, "step": 10970 }, { "epoch": 0.366, "grad_norm": 32.0, "grad_norm_var": 2.1797421953052575e+18, "learning_rate": 0.0001, "loss": 8.3736, "loss/crossentropy": 2.01134799271822, "loss/hidden": 3.832421875, "loss/jsd": 0.0, "loss/logits": 0.23655065074563025, "step": 10980 }, { "epoch": 0.36633333333333334, "grad_norm": 29.75, "grad_norm_var": 8.880989583333333, "learning_rate": 0.0001, "loss": 8.3184, "loss/crossentropy": 2.019283553212881, "loss/hidden": 3.891015625, "loss/jsd": 0.0, "loss/logits": 0.24222001079469918, "step": 10990 }, { "epoch": 0.36666666666666664, "grad_norm": 34.75, "grad_norm_var": 4.905989583333334, "learning_rate": 0.0001, "loss": 8.1939, "loss/crossentropy": 2.0408636704087257, "loss/hidden": 3.95625, "loss/jsd": 0.0, "loss/logits": 0.257934401743114, "step": 11000 }, { "epoch": 0.367, "grad_norm": 31.125, "grad_norm_var": 6.711393229166666, "learning_rate": 0.0001, "loss": 8.3125, "loss/crossentropy": 1.987822836637497, "loss/hidden": 3.8765625, "loss/jsd": 0.0, "loss/logits": 0.24592833667993547, "step": 11010 }, { "epoch": 0.36733333333333335, "grad_norm": 28.0, "grad_norm_var": 10.1884765625, "learning_rate": 0.0001, "loss": 8.2068, "loss/crossentropy": 1.934238361567259, "loss/hidden": 3.6390625, "loss/jsd": 0.0, "loss/logits": 0.20752198286354542, "step": 11020 }, { "epoch": 0.36766666666666664, "grad_norm": 30.75, "grad_norm_var": 14.797330729166667, "learning_rate": 0.0001, "loss": 8.3522, "loss/crossentropy": 2.091546893119812, "loss/hidden": 3.8109375, "loss/jsd": 0.0, "loss/logits": 0.22276817485690117, "step": 11030 }, { "epoch": 0.368, "grad_norm": 32.25, "grad_norm_var": 10.873372395833334, "learning_rate": 0.0001, "loss": 8.26, "loss/crossentropy": 2.046231422573328, "loss/hidden": 3.66796875, "loss/jsd": 0.0, "loss/logits": 0.218076004460454, "step": 11040 }, { "epoch": 0.36833333333333335, "grad_norm": 29.625, "grad_norm_var": 9.6994140625, "learning_rate": 0.0001, "loss": 8.2113, "loss/crossentropy": 2.1118013873696326, "loss/hidden": 3.73828125, "loss/jsd": 0.0, "loss/logits": 0.22814447209239005, "step": 11050 }, { "epoch": 0.36866666666666664, "grad_norm": 32.75, "grad_norm_var": 5.477018229166666, "learning_rate": 0.0001, "loss": 8.3915, "loss/crossentropy": 2.138951501250267, "loss/hidden": 3.796484375, "loss/jsd": 0.0, "loss/logits": 0.2464024931192398, "step": 11060 }, { "epoch": 0.369, "grad_norm": 32.5, "grad_norm_var": 4.268489583333333, "learning_rate": 0.0001, "loss": 8.2656, "loss/crossentropy": 2.0544290356338024, "loss/hidden": 3.797265625, "loss/jsd": 0.0, "loss/logits": 0.21736350897699594, "step": 11070 }, { "epoch": 0.36933333333333335, "grad_norm": 30.625, "grad_norm_var": 2.41640625, "learning_rate": 0.0001, "loss": 8.2015, "loss/crossentropy": 2.0753560826182365, "loss/hidden": 3.994921875, "loss/jsd": 0.0, "loss/logits": 0.25065676774829626, "step": 11080 }, { "epoch": 0.36966666666666664, "grad_norm": 32.5, "grad_norm_var": 2.3811848958333335, "learning_rate": 0.0001, "loss": 8.3318, "loss/crossentropy": 2.3124043948948385, "loss/hidden": 3.866015625, "loss/jsd": 0.0, "loss/logits": 0.25761293675750496, "step": 11090 }, { "epoch": 0.37, "grad_norm": 29.375, "grad_norm_var": 75.17057291666667, "learning_rate": 0.0001, "loss": 8.3787, "loss/crossentropy": 2.0899481564760207, "loss/hidden": 3.833984375, "loss/jsd": 0.0, "loss/logits": 0.24109712056815624, "step": 11100 }, { "epoch": 0.37033333333333335, "grad_norm": 29.5, "grad_norm_var": 87.39973958333333, "learning_rate": 0.0001, "loss": 8.2869, "loss/crossentropy": 2.068728582933545, "loss/hidden": 3.794921875, "loss/jsd": 0.0, "loss/logits": 0.21987394848838449, "step": 11110 }, { "epoch": 0.37066666666666664, "grad_norm": 30.875, "grad_norm_var": 20.74765625, "learning_rate": 0.0001, "loss": 8.3058, "loss/crossentropy": 2.1063128843903542, "loss/hidden": 3.7140625, "loss/jsd": 0.0, "loss/logits": 0.22465858031064273, "step": 11120 }, { "epoch": 0.371, "grad_norm": 31.125, "grad_norm_var": 34.37805989583333, "learning_rate": 0.0001, "loss": 8.2634, "loss/crossentropy": 2.128019214421511, "loss/hidden": 3.806640625, "loss/jsd": 0.0, "loss/logits": 0.2511159829795361, "step": 11130 }, { "epoch": 0.37133333333333335, "grad_norm": 28.875, "grad_norm_var": 216.6681640625, "learning_rate": 0.0001, "loss": 8.3359, "loss/crossentropy": 2.0605734646320344, "loss/hidden": 3.862109375, "loss/jsd": 0.0, "loss/logits": 0.23643396981060505, "step": 11140 }, { "epoch": 0.37166666666666665, "grad_norm": 29.75, "grad_norm_var": 218.5197265625, "learning_rate": 0.0001, "loss": 8.3266, "loss/crossentropy": 1.8941866405308248, "loss/hidden": 4.06171875, "loss/jsd": 0.0, "loss/logits": 0.2489516731351614, "step": 11150 }, { "epoch": 0.372, "grad_norm": 31.75, "grad_norm_var": 2.745247395833333, "learning_rate": 0.0001, "loss": 8.2646, "loss/crossentropy": 2.023110543191433, "loss/hidden": 3.8078125, "loss/jsd": 0.0, "loss/logits": 0.23438771143555642, "step": 11160 }, { "epoch": 0.37233333333333335, "grad_norm": 34.5, "grad_norm_var": 5.59140625, "learning_rate": 0.0001, "loss": 8.293, "loss/crossentropy": 2.115256902575493, "loss/hidden": 3.8171875, "loss/jsd": 0.0, "loss/logits": 0.23501987420022488, "step": 11170 }, { "epoch": 0.37266666666666665, "grad_norm": 33.0, "grad_norm_var": 5.412239583333333, "learning_rate": 0.0001, "loss": 8.2813, "loss/crossentropy": 2.183938892185688, "loss/hidden": 3.8171875, "loss/jsd": 0.0, "loss/logits": 0.23923480361700059, "step": 11180 }, { "epoch": 0.373, "grad_norm": 29.5, "grad_norm_var": 4.455989583333333, "learning_rate": 0.0001, "loss": 8.2506, "loss/crossentropy": 2.09958486109972, "loss/hidden": 3.836328125, "loss/jsd": 0.0, "loss/logits": 0.24175492376089097, "step": 11190 }, { "epoch": 0.37333333333333335, "grad_norm": 29.375, "grad_norm_var": 3.034375, "learning_rate": 0.0001, "loss": 8.1771, "loss/crossentropy": 2.150550900399685, "loss/hidden": 3.712109375, "loss/jsd": 0.0, "loss/logits": 0.2199710313230753, "step": 11200 }, { "epoch": 0.37366666666666665, "grad_norm": 39.0, "grad_norm_var": 7.083072916666667, "learning_rate": 0.0001, "loss": 8.3038, "loss/crossentropy": 2.160426365584135, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.22729940414428712, "step": 11210 }, { "epoch": 0.374, "grad_norm": 29.625, "grad_norm_var": 8.17890625, "learning_rate": 0.0001, "loss": 8.3813, "loss/crossentropy": 2.2687479466199876, "loss/hidden": 3.866796875, "loss/jsd": 0.0, "loss/logits": 0.2643072698265314, "step": 11220 }, { "epoch": 0.37433333333333335, "grad_norm": 33.25, "grad_norm_var": 4.737434895833333, "learning_rate": 0.0001, "loss": 8.2897, "loss/crossentropy": 1.9880081087350845, "loss/hidden": 3.82265625, "loss/jsd": 0.0, "loss/logits": 0.23600170221179723, "step": 11230 }, { "epoch": 0.37466666666666665, "grad_norm": 30.25, "grad_norm_var": 2.0268229166666667, "learning_rate": 0.0001, "loss": 8.2243, "loss/crossentropy": 2.157718874514103, "loss/hidden": 3.725390625, "loss/jsd": 0.0, "loss/logits": 0.22776034101843834, "step": 11240 }, { "epoch": 0.375, "grad_norm": 31.0, "grad_norm_var": 1.9270182291666667, "learning_rate": 0.0001, "loss": 8.2751, "loss/crossentropy": 2.1132646039128304, "loss/hidden": 3.725, "loss/jsd": 0.0, "loss/logits": 0.22339010071009396, "step": 11250 }, { "epoch": 0.37533333333333335, "grad_norm": 36.75, "grad_norm_var": 7.060872395833333, "learning_rate": 0.0001, "loss": 8.4215, "loss/crossentropy": 2.206932783126831, "loss/hidden": 3.808203125, "loss/jsd": 0.0, "loss/logits": 0.2356038186699152, "step": 11260 }, { "epoch": 0.37566666666666665, "grad_norm": 40.5, "grad_norm_var": 14.249739583333334, "learning_rate": 0.0001, "loss": 8.3276, "loss/crossentropy": 2.2165247052907944, "loss/hidden": 3.765234375, "loss/jsd": 0.0, "loss/logits": 0.23299887999892235, "step": 11270 }, { "epoch": 0.376, "grad_norm": 34.25, "grad_norm_var": 23.989518229166666, "learning_rate": 0.0001, "loss": 8.202, "loss/crossentropy": 2.1586028307676317, "loss/hidden": 3.811328125, "loss/jsd": 0.0, "loss/logits": 0.23767958618700505, "step": 11280 }, { "epoch": 0.37633333333333335, "grad_norm": 28.25, "grad_norm_var": 6.742122395833333, "learning_rate": 0.0001, "loss": 8.2606, "loss/crossentropy": 2.0374640226364136, "loss/hidden": 3.73203125, "loss/jsd": 0.0, "loss/logits": 0.21241307370364665, "step": 11290 }, { "epoch": 0.37666666666666665, "grad_norm": 32.25, "grad_norm_var": 1.2348307291666667, "learning_rate": 0.0001, "loss": 8.2161, "loss/crossentropy": 2.06580873131752, "loss/hidden": 3.783984375, "loss/jsd": 0.0, "loss/logits": 0.25028328634798525, "step": 11300 }, { "epoch": 0.377, "grad_norm": 28.875, "grad_norm_var": 31.407747395833333, "learning_rate": 0.0001, "loss": 8.1998, "loss/crossentropy": 2.120278796553612, "loss/hidden": 3.77578125, "loss/jsd": 0.0, "loss/logits": 0.24368874300271273, "step": 11310 }, { "epoch": 0.37733333333333335, "grad_norm": 31.125, "grad_norm_var": 14.039322916666666, "learning_rate": 0.0001, "loss": 8.3848, "loss/crossentropy": 2.0408181294798853, "loss/hidden": 3.86796875, "loss/jsd": 0.0, "loss/logits": 0.23626107163727283, "step": 11320 }, { "epoch": 0.37766666666666665, "grad_norm": 28.25, "grad_norm_var": 8.001497395833333, "learning_rate": 0.0001, "loss": 8.1908, "loss/crossentropy": 2.026593156158924, "loss/hidden": 3.78359375, "loss/jsd": 0.0, "loss/logits": 0.21986434515565634, "step": 11330 }, { "epoch": 0.378, "grad_norm": 28.375, "grad_norm_var": 3.3416015625, "learning_rate": 0.0001, "loss": 8.1949, "loss/crossentropy": 2.04437660574913, "loss/hidden": 3.7546875, "loss/jsd": 0.0, "loss/logits": 0.2269467730075121, "step": 11340 }, { "epoch": 0.37833333333333335, "grad_norm": 28.0, "grad_norm_var": 3.505989583333333, "learning_rate": 0.0001, "loss": 8.2826, "loss/crossentropy": 2.074940774589777, "loss/hidden": 3.762109375, "loss/jsd": 0.0, "loss/logits": 0.2203363472595811, "step": 11350 }, { "epoch": 0.37866666666666665, "grad_norm": 31.375, "grad_norm_var": 3.77890625, "learning_rate": 0.0001, "loss": 8.0388, "loss/crossentropy": 2.0113267697393895, "loss/hidden": 3.8703125, "loss/jsd": 0.0, "loss/logits": 0.2317951550707221, "step": 11360 }, { "epoch": 0.379, "grad_norm": 30.75, "grad_norm_var": 3.2884765625, "learning_rate": 0.0001, "loss": 8.212, "loss/crossentropy": 2.1366314753890037, "loss/hidden": 3.79921875, "loss/jsd": 0.0, "loss/logits": 0.2496491651982069, "step": 11370 }, { "epoch": 0.37933333333333336, "grad_norm": 31.75, "grad_norm_var": 8.070572916666666, "learning_rate": 0.0001, "loss": 8.256, "loss/crossentropy": 1.991414950788021, "loss/hidden": 3.87890625, "loss/jsd": 0.0, "loss/logits": 0.23439864348620176, "step": 11380 }, { "epoch": 0.37966666666666665, "grad_norm": 30.625, "grad_norm_var": 6.458268229166666, "learning_rate": 0.0001, "loss": 8.2754, "loss/crossentropy": 2.055978857725859, "loss/hidden": 3.683984375, "loss/jsd": 0.0, "loss/logits": 0.20942887850105762, "step": 11390 }, { "epoch": 0.38, "grad_norm": 29.75, "grad_norm_var": 2.795768229166667, "learning_rate": 0.0001, "loss": 8.2368, "loss/crossentropy": 2.2677480787038804, "loss/hidden": 3.74296875, "loss/jsd": 0.0, "loss/logits": 0.24482562113553286, "step": 11400 }, { "epoch": 0.38033333333333336, "grad_norm": 29.875, "grad_norm_var": 6.1806640625, "learning_rate": 0.0001, "loss": 8.3087, "loss/crossentropy": 2.010858987271786, "loss/hidden": 3.876171875, "loss/jsd": 0.0, "loss/logits": 0.2329118952155113, "step": 11410 }, { "epoch": 0.38066666666666665, "grad_norm": 29.5, "grad_norm_var": 3.107291666666667, "learning_rate": 0.0001, "loss": 8.3397, "loss/crossentropy": 2.19465371966362, "loss/hidden": 3.771484375, "loss/jsd": 0.0, "loss/logits": 0.24298481158912183, "step": 11420 }, { "epoch": 0.381, "grad_norm": 29.625, "grad_norm_var": 3.9291015625, "learning_rate": 0.0001, "loss": 8.3047, "loss/crossentropy": 2.1756315886974336, "loss/hidden": 3.7765625, "loss/jsd": 0.0, "loss/logits": 0.23532975129783154, "step": 11430 }, { "epoch": 0.38133333333333336, "grad_norm": 32.75, "grad_norm_var": 2.5580729166666667, "learning_rate": 0.0001, "loss": 8.3201, "loss/crossentropy": 2.1976757258176804, "loss/hidden": 3.82734375, "loss/jsd": 0.0, "loss/logits": 0.2476160578429699, "step": 11440 }, { "epoch": 0.38166666666666665, "grad_norm": 5838471168.0, "grad_norm_var": 2.130484076579337e+18, "learning_rate": 0.0001, "loss": 8.2311, "loss/crossentropy": 2.0670726232230665, "loss/hidden": 3.984765625, "loss/jsd": 0.0, "loss/logits": 0.22541351839900017, "step": 11450 }, { "epoch": 0.382, "grad_norm": 29.0, "grad_norm_var": 2.1304840749737574e+18, "learning_rate": 0.0001, "loss": 8.1004, "loss/crossentropy": 2.2240239530801773, "loss/hidden": 3.746484375, "loss/jsd": 0.0, "loss/logits": 0.23139581680297852, "step": 11460 }, { "epoch": 0.38233333333333336, "grad_norm": 29.5, "grad_norm_var": 35.31555989583333, "learning_rate": 0.0001, "loss": 8.2446, "loss/crossentropy": 2.029728998243809, "loss/hidden": 3.733203125, "loss/jsd": 0.0, "loss/logits": 0.22069137105718256, "step": 11470 }, { "epoch": 0.38266666666666665, "grad_norm": 30.125, "grad_norm_var": 2.24765625, "learning_rate": 0.0001, "loss": 8.1198, "loss/crossentropy": 1.9788580626249312, "loss/hidden": 3.730078125, "loss/jsd": 0.0, "loss/logits": 0.21759489141404628, "step": 11480 }, { "epoch": 0.383, "grad_norm": 31.5, "grad_norm_var": 2.2822916666666666, "learning_rate": 0.0001, "loss": 8.1322, "loss/crossentropy": 2.1184426814317705, "loss/hidden": 3.819140625, "loss/jsd": 0.0, "loss/logits": 0.23590471846982836, "step": 11490 }, { "epoch": 0.38333333333333336, "grad_norm": 29.25, "grad_norm_var": 2.4567057291666665, "learning_rate": 0.0001, "loss": 8.3504, "loss/crossentropy": 2.023914510011673, "loss/hidden": 3.8296875, "loss/jsd": 0.0, "loss/logits": 0.23768907226622105, "step": 11500 }, { "epoch": 0.38366666666666666, "grad_norm": 30.5, "grad_norm_var": 34.16875, "learning_rate": 0.0001, "loss": 8.3174, "loss/crossentropy": 1.9882623553276062, "loss/hidden": 3.70859375, "loss/jsd": 0.0, "loss/logits": 0.20902501344680785, "step": 11510 }, { "epoch": 0.384, "grad_norm": 31.875, "grad_norm_var": 37.2625, "learning_rate": 0.0001, "loss": 8.3973, "loss/crossentropy": 1.9671563521027564, "loss/hidden": 3.816796875, "loss/jsd": 0.0, "loss/logits": 0.21824515145272017, "step": 11520 }, { "epoch": 0.38433333333333336, "grad_norm": 36.0, "grad_norm_var": 9.521809895833334, "learning_rate": 0.0001, "loss": 8.2993, "loss/crossentropy": 2.104415476322174, "loss/hidden": 3.745703125, "loss/jsd": 0.0, "loss/logits": 0.24023280292749405, "step": 11530 }, { "epoch": 0.38466666666666666, "grad_norm": 30.375, "grad_norm_var": 6.6275390625, "learning_rate": 0.0001, "loss": 8.2753, "loss/crossentropy": 2.1653599768877028, "loss/hidden": 3.816796875, "loss/jsd": 0.0, "loss/logits": 0.22879305072128772, "step": 11540 }, { "epoch": 0.385, "grad_norm": 30.125, "grad_norm_var": 6.420572916666667, "learning_rate": 0.0001, "loss": 8.2014, "loss/crossentropy": 2.0401563957333564, "loss/hidden": 3.77890625, "loss/jsd": 0.0, "loss/logits": 0.23115058969706298, "step": 11550 }, { "epoch": 0.38533333333333336, "grad_norm": 29.25, "grad_norm_var": 10.065559895833333, "learning_rate": 0.0001, "loss": 8.3626, "loss/crossentropy": 2.2447339951992036, "loss/hidden": 3.770703125, "loss/jsd": 0.0, "loss/logits": 0.2418015170842409, "step": 11560 }, { "epoch": 0.38566666666666666, "grad_norm": 32.75, "grad_norm_var": 3.3020833333333335, "learning_rate": 0.0001, "loss": 8.4421, "loss/crossentropy": 2.1326376996934413, "loss/hidden": 3.803515625, "loss/jsd": 0.0, "loss/logits": 0.24543905295431614, "step": 11570 }, { "epoch": 0.386, "grad_norm": 30.625, "grad_norm_var": 3.28125, "learning_rate": 0.0001, "loss": 8.2912, "loss/crossentropy": 2.099239933490753, "loss/hidden": 3.776953125, "loss/jsd": 0.0, "loss/logits": 0.235281278192997, "step": 11580 }, { "epoch": 0.3863333333333333, "grad_norm": 35.0, "grad_norm_var": 12.728580729166667, "learning_rate": 0.0001, "loss": 8.3226, "loss/crossentropy": 2.091626935452223, "loss/hidden": 3.803515625, "loss/jsd": 0.0, "loss/logits": 0.22663789633661507, "step": 11590 }, { "epoch": 0.38666666666666666, "grad_norm": 31.375, "grad_norm_var": 4.16015625, "learning_rate": 0.0001, "loss": 8.2711, "loss/crossentropy": 1.935661745071411, "loss/hidden": 3.773046875, "loss/jsd": 0.0, "loss/logits": 0.22390243038535118, "step": 11600 }, { "epoch": 0.387, "grad_norm": 42.25, "grad_norm_var": 18.571809895833333, "learning_rate": 0.0001, "loss": 8.3944, "loss/crossentropy": 2.2436830163002015, "loss/hidden": 3.830078125, "loss/jsd": 0.0, "loss/logits": 0.26995023861527445, "step": 11610 }, { "epoch": 0.3873333333333333, "grad_norm": 36.25, "grad_norm_var": 19.981705729166666, "learning_rate": 0.0001, "loss": 8.2244, "loss/crossentropy": 2.096161872893572, "loss/hidden": 3.787109375, "loss/jsd": 0.0, "loss/logits": 0.25067653246223925, "step": 11620 }, { "epoch": 0.38766666666666666, "grad_norm": 31.375, "grad_norm_var": 7.373893229166667, "learning_rate": 0.0001, "loss": 8.3287, "loss/crossentropy": 2.2269149988889696, "loss/hidden": 3.855859375, "loss/jsd": 0.0, "loss/logits": 0.2346229925751686, "step": 11630 }, { "epoch": 0.388, "grad_norm": 31.75, "grad_norm_var": 4.336458333333334, "learning_rate": 0.0001, "loss": 8.3193, "loss/crossentropy": 2.1625199913978577, "loss/hidden": 3.838671875, "loss/jsd": 0.0, "loss/logits": 0.24539714939892293, "step": 11640 }, { "epoch": 0.3883333333333333, "grad_norm": 33.5, "grad_norm_var": 24.7009765625, "learning_rate": 0.0001, "loss": 8.4532, "loss/crossentropy": 2.1474158462136983, "loss/hidden": 3.900390625, "loss/jsd": 0.0, "loss/logits": 0.24705803375691177, "step": 11650 }, { "epoch": 0.38866666666666666, "grad_norm": 30.0, "grad_norm_var": 4.01875, "learning_rate": 0.0001, "loss": 8.1543, "loss/crossentropy": 2.1757007278501987, "loss/hidden": 3.7453125, "loss/jsd": 0.0, "loss/logits": 0.23021320514380933, "step": 11660 }, { "epoch": 0.389, "grad_norm": 32.25, "grad_norm_var": 13.9056640625, "learning_rate": 0.0001, "loss": 8.3819, "loss/crossentropy": 2.0909801930189134, "loss/hidden": 3.8921875, "loss/jsd": 0.0, "loss/logits": 0.25553538724780084, "step": 11670 }, { "epoch": 0.3893333333333333, "grad_norm": 33.25, "grad_norm_var": 13.77265625, "learning_rate": 0.0001, "loss": 8.342, "loss/crossentropy": 2.209575629234314, "loss/hidden": 3.7796875, "loss/jsd": 0.0, "loss/logits": 0.24838075898587703, "step": 11680 }, { "epoch": 0.38966666666666666, "grad_norm": 32.0, "grad_norm_var": 2.9744140625, "learning_rate": 0.0001, "loss": 8.249, "loss/crossentropy": 2.150561396032572, "loss/hidden": 3.758984375, "loss/jsd": 0.0, "loss/logits": 0.23041013162583113, "step": 11690 }, { "epoch": 0.39, "grad_norm": 28.5, "grad_norm_var": 2.283072916666667, "learning_rate": 0.0001, "loss": 8.3467, "loss/crossentropy": 1.9362058877944945, "loss/hidden": 3.861328125, "loss/jsd": 0.0, "loss/logits": 0.2275281075388193, "step": 11700 }, { "epoch": 0.3903333333333333, "grad_norm": 30.25, "grad_norm_var": 9.784375, "learning_rate": 0.0001, "loss": 8.2697, "loss/crossentropy": 2.184156297147274, "loss/hidden": 3.823046875, "loss/jsd": 0.0, "loss/logits": 0.227661694213748, "step": 11710 }, { "epoch": 0.39066666666666666, "grad_norm": 30.0, "grad_norm_var": 7.74140625, "learning_rate": 0.0001, "loss": 8.2916, "loss/crossentropy": 2.005098359286785, "loss/hidden": 3.771875, "loss/jsd": 0.0, "loss/logits": 0.22573864944279193, "step": 11720 }, { "epoch": 0.391, "grad_norm": 39.0, "grad_norm_var": 8.258333333333333, "learning_rate": 0.0001, "loss": 8.2232, "loss/crossentropy": 1.9749820090830326, "loss/hidden": 3.86328125, "loss/jsd": 0.0, "loss/logits": 0.2378404688090086, "step": 11730 }, { "epoch": 0.3913333333333333, "grad_norm": 29.5, "grad_norm_var": 9.064322916666667, "learning_rate": 0.0001, "loss": 8.2125, "loss/crossentropy": 2.0756162479519844, "loss/hidden": 3.665234375, "loss/jsd": 0.0, "loss/logits": 0.22730147559195757, "step": 11740 }, { "epoch": 0.39166666666666666, "grad_norm": 30.75, "grad_norm_var": 6.217708333333333, "learning_rate": 0.0001, "loss": 8.2565, "loss/crossentropy": 2.1849037185311317, "loss/hidden": 3.685546875, "loss/jsd": 0.0, "loss/logits": 0.2365235961973667, "step": 11750 }, { "epoch": 0.392, "grad_norm": 30.25, "grad_norm_var": 2.765625, "learning_rate": 0.0001, "loss": 8.2262, "loss/crossentropy": 2.0965361006557943, "loss/hidden": 3.73515625, "loss/jsd": 0.0, "loss/logits": 0.2350387828424573, "step": 11760 }, { "epoch": 0.3923333333333333, "grad_norm": 29.625, "grad_norm_var": 1.9030598958333333, "learning_rate": 0.0001, "loss": 8.2198, "loss/crossentropy": 2.1148156762123107, "loss/hidden": 3.747265625, "loss/jsd": 0.0, "loss/logits": 0.24504089988768102, "step": 11770 }, { "epoch": 0.39266666666666666, "grad_norm": 30.75, "grad_norm_var": 1.6375, "learning_rate": 0.0001, "loss": 8.1997, "loss/crossentropy": 1.9177335992455482, "loss/hidden": 3.763671875, "loss/jsd": 0.0, "loss/logits": 0.22061962708830835, "step": 11780 }, { "epoch": 0.393, "grad_norm": 30.0, "grad_norm_var": 2.4176432291666665, "learning_rate": 0.0001, "loss": 8.1247, "loss/crossentropy": 1.9748799093067646, "loss/hidden": 3.68125, "loss/jsd": 0.0, "loss/logits": 0.21441790107637643, "step": 11790 }, { "epoch": 0.3933333333333333, "grad_norm": 28.25, "grad_norm_var": 4.838997395833333, "learning_rate": 0.0001, "loss": 8.167, "loss/crossentropy": 2.2271966516971586, "loss/hidden": 3.799609375, "loss/jsd": 0.0, "loss/logits": 0.25528619475662706, "step": 11800 }, { "epoch": 0.39366666666666666, "grad_norm": 33.75, "grad_norm_var": 5.270768229166666, "learning_rate": 0.0001, "loss": 8.2939, "loss/crossentropy": 2.261426217854023, "loss/hidden": 3.82578125, "loss/jsd": 0.0, "loss/logits": 0.2428593784570694, "step": 11810 }, { "epoch": 0.394, "grad_norm": 34.25, "grad_norm_var": 10.937434895833333, "learning_rate": 0.0001, "loss": 8.4139, "loss/crossentropy": 1.9368678316473962, "loss/hidden": 3.75390625, "loss/jsd": 0.0, "loss/logits": 0.22458538115024568, "step": 11820 }, { "epoch": 0.3943333333333333, "grad_norm": 28.5, "grad_norm_var": 44247710555649.48, "learning_rate": 0.0001, "loss": 8.368, "loss/crossentropy": 2.1160020515322686, "loss/hidden": 3.730078125, "loss/jsd": 0.0, "loss/logits": 0.2263868011534214, "step": 11830 }, { "epoch": 0.39466666666666667, "grad_norm": 27.625, "grad_norm_var": 96.88098958333333, "learning_rate": 0.0001, "loss": 8.2835, "loss/crossentropy": 2.120930030941963, "loss/hidden": 3.69921875, "loss/jsd": 0.0, "loss/logits": 0.21769896019250154, "step": 11840 }, { "epoch": 0.395, "grad_norm": 30.375, "grad_norm_var": 4.537955729166667, "learning_rate": 0.0001, "loss": 8.0825, "loss/crossentropy": 2.06858219653368, "loss/hidden": 3.687890625, "loss/jsd": 0.0, "loss/logits": 0.21344130001962186, "step": 11850 }, { "epoch": 0.3953333333333333, "grad_norm": 33.0, "grad_norm_var": 12.7337890625, "learning_rate": 0.0001, "loss": 8.3254, "loss/crossentropy": 2.1121141463518143, "loss/hidden": 3.840234375, "loss/jsd": 0.0, "loss/logits": 0.23941405918449163, "step": 11860 }, { "epoch": 0.39566666666666667, "grad_norm": 32.0, "grad_norm_var": 2.2372395833333334, "learning_rate": 0.0001, "loss": 8.3672, "loss/crossentropy": 2.1878841519355774, "loss/hidden": 3.74921875, "loss/jsd": 0.0, "loss/logits": 0.22403320614248515, "step": 11870 }, { "epoch": 0.396, "grad_norm": 30.25, "grad_norm_var": 3.2372395833333334, "learning_rate": 0.0001, "loss": 8.4055, "loss/crossentropy": 2.1782354429364204, "loss/hidden": 3.84921875, "loss/jsd": 0.0, "loss/logits": 0.2412811905145645, "step": 11880 }, { "epoch": 0.3963333333333333, "grad_norm": 29.0, "grad_norm_var": 5.423958333333333, "learning_rate": 0.0001, "loss": 8.162, "loss/crossentropy": 1.9386512018740176, "loss/hidden": 3.79765625, "loss/jsd": 0.0, "loss/logits": 0.23406942784786225, "step": 11890 }, { "epoch": 0.39666666666666667, "grad_norm": 29.625, "grad_norm_var": 7.05, "learning_rate": 0.0001, "loss": 8.3352, "loss/crossentropy": 2.165928477048874, "loss/hidden": 3.92890625, "loss/jsd": 0.0, "loss/logits": 0.2521603927016258, "step": 11900 }, { "epoch": 0.397, "grad_norm": 30.875, "grad_norm_var": 5.3822265625, "learning_rate": 0.0001, "loss": 8.2937, "loss/crossentropy": 2.1375706143677236, "loss/hidden": 3.666796875, "loss/jsd": 0.0, "loss/logits": 0.22246642410755157, "step": 11910 }, { "epoch": 0.3973333333333333, "grad_norm": 29.5, "grad_norm_var": 2.851822916666667, "learning_rate": 0.0001, "loss": 8.2117, "loss/crossentropy": 2.0515445560216903, "loss/hidden": 3.798828125, "loss/jsd": 0.0, "loss/logits": 0.2385630363598466, "step": 11920 }, { "epoch": 0.39766666666666667, "grad_norm": 32.0, "grad_norm_var": 2.3583333333333334, "learning_rate": 0.0001, "loss": 8.1148, "loss/crossentropy": 2.233632105588913, "loss/hidden": 3.760546875, "loss/jsd": 0.0, "loss/logits": 0.24521742388606071, "step": 11930 }, { "epoch": 0.398, "grad_norm": 31.75, "grad_norm_var": 4.1587890625, "learning_rate": 0.0001, "loss": 8.3179, "loss/crossentropy": 2.2155081748962404, "loss/hidden": 3.841796875, "loss/jsd": 0.0, "loss/logits": 0.2516425810754299, "step": 11940 }, { "epoch": 0.3983333333333333, "grad_norm": 31.875, "grad_norm_var": 6.0322265625, "learning_rate": 0.0001, "loss": 8.3005, "loss/crossentropy": 2.1274189479649066, "loss/hidden": 3.746484375, "loss/jsd": 0.0, "loss/logits": 0.24990401780232788, "step": 11950 }, { "epoch": 0.39866666666666667, "grad_norm": 53.75, "grad_norm_var": 36.696875, "learning_rate": 0.0001, "loss": 8.0809, "loss/crossentropy": 2.0530085660517217, "loss/hidden": 3.639453125, "loss/jsd": 0.0, "loss/logits": 0.21376553494483233, "step": 11960 }, { "epoch": 0.399, "grad_norm": 29.875, "grad_norm_var": 36.781184895833334, "learning_rate": 0.0001, "loss": 8.1538, "loss/crossentropy": 2.0867031171917914, "loss/hidden": 3.821484375, "loss/jsd": 0.0, "loss/logits": 0.23751907888799906, "step": 11970 }, { "epoch": 0.3993333333333333, "grad_norm": 28.25, "grad_norm_var": 2.7080729166666666, "learning_rate": 0.0001, "loss": 8.0891, "loss/crossentropy": 2.2460917532444, "loss/hidden": 3.734765625, "loss/jsd": 0.0, "loss/logits": 0.22936972938477992, "step": 11980 }, { "epoch": 0.39966666666666667, "grad_norm": 29.5, "grad_norm_var": 14.363997395833334, "learning_rate": 0.0001, "loss": 8.2289, "loss/crossentropy": 2.11966609954834, "loss/hidden": 3.81796875, "loss/jsd": 0.0, "loss/logits": 0.25193934664130213, "step": 11990 }, { "epoch": 0.4, "grad_norm": 31.25, "grad_norm_var": 11.431705729166667, "learning_rate": 0.0001, "loss": 8.3615, "loss/crossentropy": 1.9913646757602692, "loss/hidden": 3.76875, "loss/jsd": 0.0, "loss/logits": 0.2379540206864476, "step": 12000 }, { "epoch": 0.4003333333333333, "grad_norm": 34.25, "grad_norm_var": 2.90625, "learning_rate": 0.0001, "loss": 8.3603, "loss/crossentropy": 2.265652423352003, "loss/hidden": 3.737890625, "loss/jsd": 0.0, "loss/logits": 0.23408753713592886, "step": 12010 }, { "epoch": 0.40066666666666667, "grad_norm": 33.75, "grad_norm_var": 5.2181640625, "learning_rate": 0.0001, "loss": 8.2743, "loss/crossentropy": 2.255320507287979, "loss/hidden": 3.74375, "loss/jsd": 0.0, "loss/logits": 0.23977783247828482, "step": 12020 }, { "epoch": 0.401, "grad_norm": 33.25, "grad_norm_var": 4.6869140625, "learning_rate": 0.0001, "loss": 8.2277, "loss/crossentropy": 2.155760329961777, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.246426010876894, "step": 12030 }, { "epoch": 0.4013333333333333, "grad_norm": 26.25, "grad_norm_var": 7.045768229166667, "learning_rate": 0.0001, "loss": 8.1739, "loss/crossentropy": 2.1292715579271317, "loss/hidden": 3.857421875, "loss/jsd": 0.0, "loss/logits": 0.23238162267953158, "step": 12040 }, { "epoch": 0.40166666666666667, "grad_norm": 32.0, "grad_norm_var": 8.898958333333333, "learning_rate": 0.0001, "loss": 8.1614, "loss/crossentropy": 2.148128533363342, "loss/hidden": 3.73203125, "loss/jsd": 0.0, "loss/logits": 0.23012944478541614, "step": 12050 }, { "epoch": 0.402, "grad_norm": 30.5, "grad_norm_var": 3.3577473958333335, "learning_rate": 0.0001, "loss": 8.2708, "loss/crossentropy": 1.872607284784317, "loss/hidden": 3.90234375, "loss/jsd": 0.0, "loss/logits": 0.2496652290225029, "step": 12060 }, { "epoch": 0.4023333333333333, "grad_norm": 30.125, "grad_norm_var": 2.1304840760927977e+18, "learning_rate": 0.0001, "loss": 8.3292, "loss/crossentropy": 2.3182139307260514, "loss/hidden": 3.66015625, "loss/jsd": 0.0, "loss/logits": 0.238917101547122, "step": 12070 }, { "epoch": 0.4026666666666667, "grad_norm": 29.375, "grad_norm_var": 2.1304840765610918e+18, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 1.9857861787080764, "loss/hidden": 3.871484375, "loss/jsd": 0.0, "loss/logits": 0.23468630164861679, "step": 12080 }, { "epoch": 0.403, "grad_norm": 32.25, "grad_norm_var": 14.543489583333333, "learning_rate": 0.0001, "loss": 8.1162, "loss/crossentropy": 2.062743777036667, "loss/hidden": 3.77421875, "loss/jsd": 0.0, "loss/logits": 0.23012079745531083, "step": 12090 }, { "epoch": 0.4033333333333333, "grad_norm": 31.0, "grad_norm_var": 14.9791015625, "learning_rate": 0.0001, "loss": 8.1531, "loss/crossentropy": 2.027921313047409, "loss/hidden": 3.718359375, "loss/jsd": 0.0, "loss/logits": 0.22394589530304074, "step": 12100 }, { "epoch": 0.4036666666666667, "grad_norm": 40.0, "grad_norm_var": 14.040559895833333, "learning_rate": 0.0001, "loss": 8.3209, "loss/crossentropy": 2.006143531948328, "loss/hidden": 3.65703125, "loss/jsd": 0.0, "loss/logits": 0.21074291467666625, "step": 12110 }, { "epoch": 0.404, "grad_norm": 28.25, "grad_norm_var": 16.646875, "learning_rate": 0.0001, "loss": 8.2199, "loss/crossentropy": 2.064042943716049, "loss/hidden": 3.8078125, "loss/jsd": 0.0, "loss/logits": 0.22763095535337924, "step": 12120 }, { "epoch": 0.4043333333333333, "grad_norm": 30.5, "grad_norm_var": 1.8330729166666666, "learning_rate": 0.0001, "loss": 8.1802, "loss/crossentropy": 2.1694528847932815, "loss/hidden": 3.924609375, "loss/jsd": 0.0, "loss/logits": 0.23536618407815696, "step": 12130 }, { "epoch": 0.4046666666666667, "grad_norm": 31.625, "grad_norm_var": 2.0556640625, "learning_rate": 0.0001, "loss": 8.1972, "loss/crossentropy": 2.119773244857788, "loss/hidden": 3.8078125, "loss/jsd": 0.0, "loss/logits": 0.24443610161542892, "step": 12140 }, { "epoch": 0.405, "grad_norm": 29.625, "grad_norm_var": 2.2260416666666667, "learning_rate": 0.0001, "loss": 8.2187, "loss/crossentropy": 2.132966651767492, "loss/hidden": 3.776171875, "loss/jsd": 0.0, "loss/logits": 0.23649816121906042, "step": 12150 }, { "epoch": 0.4053333333333333, "grad_norm": 25.625, "grad_norm_var": 21.917122395833335, "learning_rate": 0.0001, "loss": 8.2396, "loss/crossentropy": 2.133436472713947, "loss/hidden": 3.783203125, "loss/jsd": 0.0, "loss/logits": 0.2467921631410718, "step": 12160 }, { "epoch": 0.4056666666666667, "grad_norm": 29.5, "grad_norm_var": 87.078125, "learning_rate": 0.0001, "loss": 8.2219, "loss/crossentropy": 2.2029434219002724, "loss/hidden": 3.776171875, "loss/jsd": 0.0, "loss/logits": 0.2552025170996785, "step": 12170 }, { "epoch": 0.406, "grad_norm": 33.25, "grad_norm_var": 96.19270833333333, "learning_rate": 0.0001, "loss": 8.2495, "loss/crossentropy": 2.253942059725523, "loss/hidden": 3.66796875, "loss/jsd": 0.0, "loss/logits": 0.22677662651985883, "step": 12180 }, { "epoch": 0.4063333333333333, "grad_norm": 28.375, "grad_norm_var": 4.829166666666667, "learning_rate": 0.0001, "loss": 8.2984, "loss/crossentropy": 1.909020482003689, "loss/hidden": 4.008984375, "loss/jsd": 0.0, "loss/logits": 0.22903512194752693, "step": 12190 }, { "epoch": 0.4066666666666667, "grad_norm": 27.25, "grad_norm_var": 43.521809895833336, "learning_rate": 0.0001, "loss": 8.0987, "loss/crossentropy": 2.0227720350027085, "loss/hidden": 3.803125, "loss/jsd": 0.0, "loss/logits": 0.2277982523664832, "step": 12200 }, { "epoch": 0.407, "grad_norm": 54.0, "grad_norm_var": 40.82962239583333, "learning_rate": 0.0001, "loss": 8.1799, "loss/crossentropy": 2.0709748052060606, "loss/hidden": 3.848828125, "loss/jsd": 0.0, "loss/logits": 0.22707768445834517, "step": 12210 }, { "epoch": 0.4073333333333333, "grad_norm": 30.875, "grad_norm_var": 35.93723958333333, "learning_rate": 0.0001, "loss": 8.2627, "loss/crossentropy": 2.0857065066695215, "loss/hidden": 3.755859375, "loss/jsd": 0.0, "loss/logits": 0.23003701977431773, "step": 12220 }, { "epoch": 0.4076666666666667, "grad_norm": 29.625, "grad_norm_var": 1.0643229166666666, "learning_rate": 0.0001, "loss": 8.2114, "loss/crossentropy": 2.136391428112984, "loss/hidden": 3.7546875, "loss/jsd": 0.0, "loss/logits": 0.23175083976238967, "step": 12230 }, { "epoch": 0.408, "grad_norm": 31.375, "grad_norm_var": 4.687239583333334, "learning_rate": 0.0001, "loss": 8.193, "loss/crossentropy": 2.0665802858769893, "loss/hidden": 3.741015625, "loss/jsd": 0.0, "loss/logits": 0.23225973546504974, "step": 12240 }, { "epoch": 0.4083333333333333, "grad_norm": 32.0, "grad_norm_var": 11.190559895833333, "learning_rate": 0.0001, "loss": 8.2347, "loss/crossentropy": 2.0192012012004854, "loss/hidden": 3.7859375, "loss/jsd": 0.0, "loss/logits": 0.2294670270755887, "step": 12250 }, { "epoch": 0.4086666666666667, "grad_norm": 30.0, "grad_norm_var": 5.26875, "learning_rate": 0.0001, "loss": 8.3742, "loss/crossentropy": 2.1292088687419892, "loss/hidden": 3.85234375, "loss/jsd": 0.0, "loss/logits": 0.24639325439929963, "step": 12260 }, { "epoch": 0.409, "grad_norm": 29.75, "grad_norm_var": 6.730989583333334, "learning_rate": 0.0001, "loss": 8.1186, "loss/crossentropy": 2.134373862296343, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.2399260677397251, "step": 12270 }, { "epoch": 0.4093333333333333, "grad_norm": 31.25, "grad_norm_var": 2.959830729166667, "learning_rate": 0.0001, "loss": 8.2645, "loss/crossentropy": 2.1076954215765, "loss/hidden": 3.67734375, "loss/jsd": 0.0, "loss/logits": 0.22277417313307524, "step": 12280 }, { "epoch": 0.4096666666666667, "grad_norm": 30.75, "grad_norm_var": 1.9390810930048315e+18, "learning_rate": 0.0001, "loss": 8.2145, "loss/crossentropy": 1.9754585176706314, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.22273671329021455, "step": 12290 }, { "epoch": 0.41, "grad_norm": 31.75, "grad_norm_var": 1.9390810922215452e+18, "learning_rate": 0.0001, "loss": 8.255, "loss/crossentropy": 2.21117245554924, "loss/hidden": 3.76953125, "loss/jsd": 0.0, "loss/logits": 0.22954177036881446, "step": 12300 }, { "epoch": 0.4103333333333333, "grad_norm": 28.75, "grad_norm_var": 11.9619140625, "learning_rate": 0.0001, "loss": 8.3073, "loss/crossentropy": 2.0862128123641015, "loss/hidden": 3.717578125, "loss/jsd": 0.0, "loss/logits": 0.23974426425993442, "step": 12310 }, { "epoch": 0.4106666666666667, "grad_norm": 30.875, "grad_norm_var": 5.587239583333333, "learning_rate": 0.0001, "loss": 8.1389, "loss/crossentropy": 1.9903224393725396, "loss/hidden": 3.753515625, "loss/jsd": 0.0, "loss/logits": 0.22904759608209133, "step": 12320 }, { "epoch": 0.411, "grad_norm": 31.75, "grad_norm_var": 5.352083333333334, "learning_rate": 0.0001, "loss": 8.1885, "loss/crossentropy": 2.1092441350221636, "loss/hidden": 3.68671875, "loss/jsd": 0.0, "loss/logits": 0.2219138015061617, "step": 12330 }, { "epoch": 0.41133333333333333, "grad_norm": 28.0, "grad_norm_var": 4.24140625, "learning_rate": 0.0001, "loss": 8.2663, "loss/crossentropy": 2.008776394277811, "loss/hidden": 3.861328125, "loss/jsd": 0.0, "loss/logits": 0.23205508813261985, "step": 12340 }, { "epoch": 0.4116666666666667, "grad_norm": 29.75, "grad_norm_var": 2.79765625, "learning_rate": 0.0001, "loss": 8.2388, "loss/crossentropy": 2.10398950278759, "loss/hidden": 3.7484375, "loss/jsd": 0.0, "loss/logits": 0.2270292304456234, "step": 12350 }, { "epoch": 0.412, "grad_norm": 30.5, "grad_norm_var": 13.00625, "learning_rate": 0.0001, "loss": 8.1708, "loss/crossentropy": 1.9869476959109307, "loss/hidden": 3.760546875, "loss/jsd": 0.0, "loss/logits": 0.2278031835332513, "step": 12360 }, { "epoch": 0.41233333333333333, "grad_norm": 28.875, "grad_norm_var": 15.693684895833334, "learning_rate": 0.0001, "loss": 8.2568, "loss/crossentropy": 2.1288417890667914, "loss/hidden": 3.849609375, "loss/jsd": 0.0, "loss/logits": 0.2364706289023161, "step": 12370 }, { "epoch": 0.4126666666666667, "grad_norm": 28.375, "grad_norm_var": 13.906184895833333, "learning_rate": 0.0001, "loss": 8.203, "loss/crossentropy": 2.0824377298355103, "loss/hidden": 3.742578125, "loss/jsd": 0.0, "loss/logits": 0.219917696993798, "step": 12380 }, { "epoch": 0.413, "grad_norm": 32.0, "grad_norm_var": 13.921875, "learning_rate": 0.0001, "loss": 8.2617, "loss/crossentropy": 2.155314549803734, "loss/hidden": 3.882421875, "loss/jsd": 0.0, "loss/logits": 0.24771923571825027, "step": 12390 }, { "epoch": 0.41333333333333333, "grad_norm": 36.0, "grad_norm_var": 12.428125, "learning_rate": 0.0001, "loss": 8.092, "loss/crossentropy": 1.9445300944149495, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.22118018846958876, "step": 12400 }, { "epoch": 0.4136666666666667, "grad_norm": 31.5, "grad_norm_var": 5.564518229166667, "learning_rate": 0.0001, "loss": 8.2348, "loss/crossentropy": 2.07020313590765, "loss/hidden": 3.790625, "loss/jsd": 0.0, "loss/logits": 0.22232855744659902, "step": 12410 }, { "epoch": 0.414, "grad_norm": 35.25, "grad_norm_var": 8.139322916666666, "learning_rate": 0.0001, "loss": 8.3157, "loss/crossentropy": 2.108063217997551, "loss/hidden": 3.845703125, "loss/jsd": 0.0, "loss/logits": 0.2458704814314842, "step": 12420 }, { "epoch": 0.41433333333333333, "grad_norm": 29.5, "grad_norm_var": 5.182291666666667, "learning_rate": 0.0001, "loss": 8.0417, "loss/crossentropy": 2.181876909732819, "loss/hidden": 3.773828125, "loss/jsd": 0.0, "loss/logits": 0.22876899931579828, "step": 12430 }, { "epoch": 0.4146666666666667, "grad_norm": 28.375, "grad_norm_var": 1.6083333333333334, "learning_rate": 0.0001, "loss": 8.1716, "loss/crossentropy": 2.095130206644535, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.2197817573323846, "step": 12440 }, { "epoch": 0.415, "grad_norm": 29.625, "grad_norm_var": 5.591666666666667, "learning_rate": 0.0001, "loss": 8.0813, "loss/crossentropy": 2.0588746845722197, "loss/hidden": 3.812109375, "loss/jsd": 0.0, "loss/logits": 0.232548057846725, "step": 12450 }, { "epoch": 0.41533333333333333, "grad_norm": 32.0, "grad_norm_var": 4.77265625, "learning_rate": 0.0001, "loss": 8.3514, "loss/crossentropy": 2.0703504741191865, "loss/hidden": 3.870703125, "loss/jsd": 0.0, "loss/logits": 0.25821793731302023, "step": 12460 }, { "epoch": 0.4156666666666667, "grad_norm": 27.75, "grad_norm_var": 42.38854166666667, "learning_rate": 0.0001, "loss": 8.2507, "loss/crossentropy": 2.1307108625769615, "loss/hidden": 3.871484375, "loss/jsd": 0.0, "loss/logits": 0.24973033610731363, "step": 12470 }, { "epoch": 0.416, "grad_norm": 30.875, "grad_norm_var": 34.26640625, "learning_rate": 0.0001, "loss": 8.1867, "loss/crossentropy": 2.117819709330797, "loss/hidden": 3.648828125, "loss/jsd": 0.0, "loss/logits": 0.22720290068536997, "step": 12480 }, { "epoch": 0.41633333333333333, "grad_norm": 30.25, "grad_norm_var": 12.4416015625, "learning_rate": 0.0001, "loss": 8.2697, "loss/crossentropy": 2.109950542449951, "loss/hidden": 3.687890625, "loss/jsd": 0.0, "loss/logits": 0.21628530863672496, "step": 12490 }, { "epoch": 0.4166666666666667, "grad_norm": 31.0, "grad_norm_var": 10.474739583333333, "learning_rate": 0.0001, "loss": 8.105, "loss/crossentropy": 1.9980547428131104, "loss/hidden": 3.808984375, "loss/jsd": 0.0, "loss/logits": 0.21721296831965448, "step": 12500 }, { "epoch": 0.417, "grad_norm": 29.625, "grad_norm_var": 3.655143229166667, "learning_rate": 0.0001, "loss": 8.0715, "loss/crossentropy": 2.131350800395012, "loss/hidden": 3.78984375, "loss/jsd": 0.0, "loss/logits": 0.23274635933339596, "step": 12510 }, { "epoch": 0.41733333333333333, "grad_norm": 35.5, "grad_norm_var": 4.509830729166667, "learning_rate": 0.0001, "loss": 8.1491, "loss/crossentropy": 2.1224256813526154, "loss/hidden": 3.827734375, "loss/jsd": 0.0, "loss/logits": 0.25079987831413747, "step": 12520 }, { "epoch": 0.4176666666666667, "grad_norm": 29.5, "grad_norm_var": 4.074739583333334, "learning_rate": 0.0001, "loss": 8.2616, "loss/crossentropy": 2.0152181297540666, "loss/hidden": 3.7921875, "loss/jsd": 0.0, "loss/logits": 0.23770801294595004, "step": 12530 }, { "epoch": 0.418, "grad_norm": 29.625, "grad_norm_var": 1.6389973958333333, "learning_rate": 0.0001, "loss": 8.1426, "loss/crossentropy": 1.915038924664259, "loss/hidden": 3.79140625, "loss/jsd": 0.0, "loss/logits": 0.21815692326053976, "step": 12540 }, { "epoch": 0.41833333333333333, "grad_norm": 32.5, "grad_norm_var": 2.895247395833333, "learning_rate": 0.0001, "loss": 8.1342, "loss/crossentropy": 2.038285069167614, "loss/hidden": 3.685546875, "loss/jsd": 0.0, "loss/logits": 0.2054979182779789, "step": 12550 }, { "epoch": 0.4186666666666667, "grad_norm": 28.75, "grad_norm_var": 4.293489583333334, "learning_rate": 0.0001, "loss": 8.2244, "loss/crossentropy": 2.0995118111371993, "loss/hidden": 3.823046875, "loss/jsd": 0.0, "loss/logits": 0.22938680201768874, "step": 12560 }, { "epoch": 0.419, "grad_norm": 31.75, "grad_norm_var": 3.577018229166667, "learning_rate": 0.0001, "loss": 8.2214, "loss/crossentropy": 2.1287095353007315, "loss/hidden": 3.70703125, "loss/jsd": 0.0, "loss/logits": 0.23023095317184925, "step": 12570 }, { "epoch": 0.41933333333333334, "grad_norm": 27.25, "grad_norm_var": 6.601497395833333, "learning_rate": 0.0001, "loss": 8.0257, "loss/crossentropy": 2.163818618655205, "loss/hidden": 3.804296875, "loss/jsd": 0.0, "loss/logits": 0.2438998742029071, "step": 12580 }, { "epoch": 0.4196666666666667, "grad_norm": 30.75, "grad_norm_var": 12.27265625, "learning_rate": 0.0001, "loss": 8.1848, "loss/crossentropy": 2.00474643856287, "loss/hidden": 3.778515625, "loss/jsd": 0.0, "loss/logits": 0.22157613541930915, "step": 12590 }, { "epoch": 0.42, "grad_norm": 32.5, "grad_norm_var": 3.70390625, "learning_rate": 0.0001, "loss": 8.1346, "loss/crossentropy": 2.164738741517067, "loss/hidden": 3.70546875, "loss/jsd": 0.0, "loss/logits": 0.23143419064581394, "step": 12600 }, { "epoch": 0.42033333333333334, "grad_norm": 31.75, "grad_norm_var": 1.8309895833333334, "learning_rate": 0.0001, "loss": 8.016, "loss/crossentropy": 2.0151191845536234, "loss/hidden": 3.86796875, "loss/jsd": 0.0, "loss/logits": 0.22970234788954258, "step": 12610 }, { "epoch": 0.4206666666666667, "grad_norm": 30.5, "grad_norm_var": 1.6122395833333334, "learning_rate": 0.0001, "loss": 8.2133, "loss/crossentropy": 2.1432757824659348, "loss/hidden": 3.746484375, "loss/jsd": 0.0, "loss/logits": 0.22702465616166592, "step": 12620 }, { "epoch": 0.421, "grad_norm": 32.25, "grad_norm_var": 4.889322916666667, "learning_rate": 0.0001, "loss": 8.0972, "loss/crossentropy": 2.182205152511597, "loss/hidden": 3.598046875, "loss/jsd": 0.0, "loss/logits": 0.20892607383430004, "step": 12630 }, { "epoch": 0.42133333333333334, "grad_norm": 34.5, "grad_norm_var": 8.5041015625, "learning_rate": 0.0001, "loss": 8.0624, "loss/crossentropy": 2.151243197917938, "loss/hidden": 3.69765625, "loss/jsd": 0.0, "loss/logits": 0.22152379900217056, "step": 12640 }, { "epoch": 0.4216666666666667, "grad_norm": 31.625, "grad_norm_var": 2.472916666666667, "learning_rate": 0.0001, "loss": 8.1911, "loss/crossentropy": 2.1016953229904174, "loss/hidden": 3.759765625, "loss/jsd": 0.0, "loss/logits": 0.23559323363006116, "step": 12650 }, { "epoch": 0.422, "grad_norm": 30.75, "grad_norm_var": 7.993684895833334, "learning_rate": 0.0001, "loss": 8.165, "loss/crossentropy": 1.9417916133999824, "loss/hidden": 3.772265625, "loss/jsd": 0.0, "loss/logits": 0.2225018298253417, "step": 12660 }, { "epoch": 0.42233333333333334, "grad_norm": 30.0, "grad_norm_var": 16.370572916666667, "learning_rate": 0.0001, "loss": 8.0033, "loss/crossentropy": 1.9000740669667722, "loss/hidden": 3.640234375, "loss/jsd": 0.0, "loss/logits": 0.20819500964134932, "step": 12670 }, { "epoch": 0.4226666666666667, "grad_norm": 30.0, "grad_norm_var": 3.5973307291666665, "learning_rate": 0.0001, "loss": 8.006, "loss/crossentropy": 2.1980207815766333, "loss/hidden": 3.790625, "loss/jsd": 0.0, "loss/logits": 0.22516860738396643, "step": 12680 }, { "epoch": 0.423, "grad_norm": 30.625, "grad_norm_var": 2.098893229166667, "learning_rate": 0.0001, "loss": 8.1445, "loss/crossentropy": 2.3503684222698213, "loss/hidden": 3.687109375, "loss/jsd": 0.0, "loss/logits": 0.236199764162302, "step": 12690 }, { "epoch": 0.42333333333333334, "grad_norm": 32.75, "grad_norm_var": 1.47890625, "learning_rate": 0.0001, "loss": 7.9247, "loss/crossentropy": 2.023277834057808, "loss/hidden": 3.72890625, "loss/jsd": 0.0, "loss/logits": 0.2181034479290247, "step": 12700 }, { "epoch": 0.4236666666666667, "grad_norm": 31.0, "grad_norm_var": 3.9884765625, "learning_rate": 0.0001, "loss": 8.1299, "loss/crossentropy": 2.138180735707283, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.2189898299984634, "step": 12710 }, { "epoch": 0.424, "grad_norm": 30.75, "grad_norm_var": 1.6708333333333334, "learning_rate": 0.0001, "loss": 7.9982, "loss/crossentropy": 2.0362440764904024, "loss/hidden": 3.639453125, "loss/jsd": 0.0, "loss/logits": 0.20388144720345736, "step": 12720 }, { "epoch": 0.42433333333333334, "grad_norm": 30.5, "grad_norm_var": 11.117708333333333, "learning_rate": 0.0001, "loss": 8.1727, "loss/crossentropy": 2.1874516278505327, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.21905291676521302, "step": 12730 }, { "epoch": 0.4246666666666667, "grad_norm": 41.0, "grad_norm_var": 20.251822916666665, "learning_rate": 0.0001, "loss": 8.0777, "loss/crossentropy": 2.2824623227119445, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.22527546025812625, "step": 12740 }, { "epoch": 0.425, "grad_norm": 29.5, "grad_norm_var": 25.060416666666665, "learning_rate": 0.0001, "loss": 8.1515, "loss/crossentropy": 2.0866646379232408, "loss/hidden": 3.727734375, "loss/jsd": 0.0, "loss/logits": 0.2159626353532076, "step": 12750 }, { "epoch": 0.42533333333333334, "grad_norm": 32.5, "grad_norm_var": 25.107747395833332, "learning_rate": 0.0001, "loss": 7.8544, "loss/crossentropy": 2.126218634843826, "loss/hidden": 3.794140625, "loss/jsd": 0.0, "loss/logits": 0.23769555874168874, "step": 12760 }, { "epoch": 0.4256666666666667, "grad_norm": 36.5, "grad_norm_var": 39.00625, "learning_rate": 0.0001, "loss": 8.0843, "loss/crossentropy": 2.1776451751589776, "loss/hidden": 3.627734375, "loss/jsd": 0.0, "loss/logits": 0.22230409383773803, "step": 12770 }, { "epoch": 0.426, "grad_norm": 31.375, "grad_norm_var": 13.474934895833334, "learning_rate": 0.0001, "loss": 7.99, "loss/crossentropy": 2.101397790014744, "loss/hidden": 3.725390625, "loss/jsd": 0.0, "loss/logits": 0.231523541174829, "step": 12780 }, { "epoch": 0.42633333333333334, "grad_norm": 35.25, "grad_norm_var": 13.215625, "learning_rate": 0.0001, "loss": 7.9369, "loss/crossentropy": 2.1546214550733565, "loss/hidden": 3.74296875, "loss/jsd": 0.0, "loss/logits": 0.22521314695477485, "step": 12790 }, { "epoch": 0.4266666666666667, "grad_norm": 28.875, "grad_norm_var": 17.4791015625, "learning_rate": 0.0001, "loss": 8.0401, "loss/crossentropy": 2.232549238950014, "loss/hidden": 3.86640625, "loss/jsd": 0.0, "loss/logits": 0.2322084965184331, "step": 12800 }, { "epoch": 0.427, "grad_norm": 30.25, "grad_norm_var": 8.587239583333334, "learning_rate": 0.0001, "loss": 8.1028, "loss/crossentropy": 2.1313828572630884, "loss/hidden": 3.651171875, "loss/jsd": 0.0, "loss/logits": 0.21781184244900942, "step": 12810 }, { "epoch": 0.42733333333333334, "grad_norm": 27.625, "grad_norm_var": 10.428125, "learning_rate": 0.0001, "loss": 8.0773, "loss/crossentropy": 2.243235859274864, "loss/hidden": 3.6453125, "loss/jsd": 0.0, "loss/logits": 0.21965202130377293, "step": 12820 }, { "epoch": 0.42766666666666664, "grad_norm": 37.0, "grad_norm_var": 14.7791015625, "learning_rate": 0.0001, "loss": 8.1248, "loss/crossentropy": 2.162339176237583, "loss/hidden": 3.74296875, "loss/jsd": 0.0, "loss/logits": 0.23140791803598404, "step": 12830 }, { "epoch": 0.428, "grad_norm": 29.625, "grad_norm_var": 12.1447265625, "learning_rate": 0.0001, "loss": 8.1194, "loss/crossentropy": 2.1178503066301344, "loss/hidden": 3.759375, "loss/jsd": 0.0, "loss/logits": 0.243923881649971, "step": 12840 }, { "epoch": 0.42833333333333334, "grad_norm": 49.0, "grad_norm_var": 25.64140625, "learning_rate": 0.0001, "loss": 8.0362, "loss/crossentropy": 2.1236428640782834, "loss/hidden": 3.683984375, "loss/jsd": 0.0, "loss/logits": 0.21966639999300241, "step": 12850 }, { "epoch": 0.42866666666666664, "grad_norm": 30.375, "grad_norm_var": 23.0181640625, "learning_rate": 0.0001, "loss": 8.0254, "loss/crossentropy": 2.1352688685059547, "loss/hidden": 3.699609375, "loss/jsd": 0.0, "loss/logits": 0.2124914363026619, "step": 12860 }, { "epoch": 0.429, "grad_norm": 29.875, "grad_norm_var": 4.287239583333333, "learning_rate": 0.0001, "loss": 8.0469, "loss/crossentropy": 2.0519951224327087, "loss/hidden": 3.68828125, "loss/jsd": 0.0, "loss/logits": 0.22668614089488984, "step": 12870 }, { "epoch": 0.42933333333333334, "grad_norm": 29.875, "grad_norm_var": 2.6497395833333335, "learning_rate": 0.0001, "loss": 8.0736, "loss/crossentropy": 2.089827132225037, "loss/hidden": 3.68046875, "loss/jsd": 0.0, "loss/logits": 0.2167070461437106, "step": 12880 }, { "epoch": 0.42966666666666664, "grad_norm": 27.125, "grad_norm_var": 6.01875, "learning_rate": 0.0001, "loss": 7.9577, "loss/crossentropy": 2.1272108972072603, "loss/hidden": 3.645703125, "loss/jsd": 0.0, "loss/logits": 0.22290566843003035, "step": 12890 }, { "epoch": 0.43, "grad_norm": 26.75, "grad_norm_var": 7.3875, "learning_rate": 0.0001, "loss": 7.9521, "loss/crossentropy": 2.169513902813196, "loss/hidden": 3.675, "loss/jsd": 0.0, "loss/logits": 0.21037282003089786, "step": 12900 }, { "epoch": 0.43033333333333335, "grad_norm": 32.25, "grad_norm_var": 3.3, "learning_rate": 0.0001, "loss": 7.9838, "loss/crossentropy": 2.093190697580576, "loss/hidden": 3.71484375, "loss/jsd": 0.0, "loss/logits": 0.21419555507600307, "step": 12910 }, { "epoch": 0.43066666666666664, "grad_norm": 33.5, "grad_norm_var": 4.025, "learning_rate": 0.0001, "loss": 8.1292, "loss/crossentropy": 2.128591850399971, "loss/hidden": 3.81328125, "loss/jsd": 0.0, "loss/logits": 0.2363378331065178, "step": 12920 }, { "epoch": 0.431, "grad_norm": 33.0, "grad_norm_var": 3.588997395833333, "learning_rate": 0.0001, "loss": 8.0753, "loss/crossentropy": 2.07633658349514, "loss/hidden": 3.728125, "loss/jsd": 0.0, "loss/logits": 0.23476697821170092, "step": 12930 }, { "epoch": 0.43133333333333335, "grad_norm": 34.0, "grad_norm_var": 4.872330729166666, "learning_rate": 0.0001, "loss": 8.0557, "loss/crossentropy": 2.173259836435318, "loss/hidden": 3.8171875, "loss/jsd": 0.0, "loss/logits": 0.23491751477122308, "step": 12940 }, { "epoch": 0.43166666666666664, "grad_norm": 33.5, "grad_norm_var": 4.478059895833334, "learning_rate": 0.0001, "loss": 8.0571, "loss/crossentropy": 2.076581171154976, "loss/hidden": 3.694140625, "loss/jsd": 0.0, "loss/logits": 0.21625892743468283, "step": 12950 }, { "epoch": 0.432, "grad_norm": 41.5, "grad_norm_var": 2.540311638953689e+18, "learning_rate": 0.0001, "loss": 8.2307, "loss/crossentropy": 2.2042708441615106, "loss/hidden": 3.641015625, "loss/jsd": 0.0, "loss/logits": 0.23953549321740866, "step": 12960 }, { "epoch": 0.43233333333333335, "grad_norm": 33.5, "grad_norm_var": 2.5403116395912233e+18, "learning_rate": 0.0001, "loss": 8.1052, "loss/crossentropy": 2.1010278701782226, "loss/hidden": 3.75625, "loss/jsd": 0.0, "loss/logits": 0.23639172241091727, "step": 12970 }, { "epoch": 0.43266666666666664, "grad_norm": 33.25, "grad_norm_var": 3.4994140625, "learning_rate": 0.0001, "loss": 8.0009, "loss/crossentropy": 2.057431307435036, "loss/hidden": 3.7015625, "loss/jsd": 0.0, "loss/logits": 0.23213282637298108, "step": 12980 }, { "epoch": 0.433, "grad_norm": 29.0, "grad_norm_var": 94.1931640625, "learning_rate": 0.0001, "loss": 8.0487, "loss/crossentropy": 2.0685934379696844, "loss/hidden": 3.766796875, "loss/jsd": 0.0, "loss/logits": 0.23443159088492393, "step": 12990 }, { "epoch": 0.43333333333333335, "grad_norm": 31.0, "grad_norm_var": 92.73326822916667, "learning_rate": 0.0001, "loss": 8.1327, "loss/crossentropy": 2.1575643092393877, "loss/hidden": 3.73984375, "loss/jsd": 0.0, "loss/logits": 0.2321011306717992, "step": 13000 }, { "epoch": 0.43366666666666664, "grad_norm": 30.0, "grad_norm_var": 6.570572916666666, "learning_rate": 0.0001, "loss": 8.0488, "loss/crossentropy": 1.9470253214240074, "loss/hidden": 3.684765625, "loss/jsd": 0.0, "loss/logits": 0.20168767049908637, "step": 13010 }, { "epoch": 0.434, "grad_norm": 31.875, "grad_norm_var": 6.384830729166667, "learning_rate": 0.0001, "loss": 7.9454, "loss/crossentropy": 1.8895531304180622, "loss/hidden": 3.55, "loss/jsd": 0.0, "loss/logits": 0.19299599220976232, "step": 13020 }, { "epoch": 0.43433333333333335, "grad_norm": 30.875, "grad_norm_var": 2.934375, "learning_rate": 0.0001, "loss": 8.0835, "loss/crossentropy": 2.1535514682531356, "loss/hidden": 3.66796875, "loss/jsd": 0.0, "loss/logits": 0.22370851337909697, "step": 13030 }, { "epoch": 0.43466666666666665, "grad_norm": 30.625, "grad_norm_var": 5.705989583333333, "learning_rate": 0.0001, "loss": 7.979, "loss/crossentropy": 2.1214609906077384, "loss/hidden": 3.688671875, "loss/jsd": 0.0, "loss/logits": 0.21629442609846591, "step": 13040 }, { "epoch": 0.435, "grad_norm": 31.75, "grad_norm_var": 6.716666666666667, "learning_rate": 0.0001, "loss": 7.9544, "loss/crossentropy": 1.9861764639616013, "loss/hidden": 3.801171875, "loss/jsd": 0.0, "loss/logits": 0.23933750428259373, "step": 13050 }, { "epoch": 0.43533333333333335, "grad_norm": 30.625, "grad_norm_var": 4.792643229166667, "learning_rate": 0.0001, "loss": 8.1328, "loss/crossentropy": 2.0345228269696234, "loss/hidden": 3.725390625, "loss/jsd": 0.0, "loss/logits": 0.2294387150555849, "step": 13060 }, { "epoch": 0.43566666666666665, "grad_norm": 31.25, "grad_norm_var": 9.701822916666666, "learning_rate": 0.0001, "loss": 7.9392, "loss/crossentropy": 2.0680396020412446, "loss/hidden": 3.887109375, "loss/jsd": 0.0, "loss/logits": 0.23335713148117065, "step": 13070 }, { "epoch": 0.436, "grad_norm": 34.75, "grad_norm_var": 5.339322916666666, "learning_rate": 0.0001, "loss": 8.1304, "loss/crossentropy": 2.0736231788992883, "loss/hidden": 3.753125, "loss/jsd": 0.0, "loss/logits": 0.23877801094204187, "step": 13080 }, { "epoch": 0.43633333333333335, "grad_norm": 30.75, "grad_norm_var": 4.684025051839935e+18, "learning_rate": 0.0001, "loss": 8.1914, "loss/crossentropy": 2.2048259407281874, "loss/hidden": 3.637109375, "loss/jsd": 0.0, "loss/logits": 0.23145930115133523, "step": 13090 }, { "epoch": 0.43666666666666665, "grad_norm": 31.625, "grad_norm_var": 21.812955729166667, "learning_rate": 0.0001, "loss": 8.0019, "loss/crossentropy": 2.1128788188099863, "loss/hidden": 3.742578125, "loss/jsd": 0.0, "loss/logits": 0.2392245376482606, "step": 13100 }, { "epoch": 0.437, "grad_norm": 32.25, "grad_norm_var": 9.125, "learning_rate": 0.0001, "loss": 8.0213, "loss/crossentropy": 2.0316510528326033, "loss/hidden": 3.684375, "loss/jsd": 0.0, "loss/logits": 0.21703706961125135, "step": 13110 }, { "epoch": 0.43733333333333335, "grad_norm": 30.625, "grad_norm_var": 13.457747395833334, "learning_rate": 0.0001, "loss": 8.1833, "loss/crossentropy": 1.937141789495945, "loss/hidden": 3.655078125, "loss/jsd": 0.0, "loss/logits": 0.2118115139193833, "step": 13120 }, { "epoch": 0.43766666666666665, "grad_norm": 32.75, "grad_norm_var": 12.8369140625, "learning_rate": 0.0001, "loss": 8.0128, "loss/crossentropy": 2.057337316870689, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.2210848169401288, "step": 13130 }, { "epoch": 0.438, "grad_norm": 34.25, "grad_norm_var": 8.7587890625, "learning_rate": 0.0001, "loss": 8.0243, "loss/crossentropy": 2.086243689060211, "loss/hidden": 3.696484375, "loss/jsd": 0.0, "loss/logits": 0.2156898221001029, "step": 13140 }, { "epoch": 0.43833333333333335, "grad_norm": 31.875, "grad_norm_var": 7.94765625, "learning_rate": 0.0001, "loss": 8.1641, "loss/crossentropy": 2.2972807347774507, "loss/hidden": 3.756640625, "loss/jsd": 0.0, "loss/logits": 0.24422766156494619, "step": 13150 }, { "epoch": 0.43866666666666665, "grad_norm": 30.5, "grad_norm_var": 5.2791015625, "learning_rate": 0.0001, "loss": 8.0809, "loss/crossentropy": 2.2393217980861664, "loss/hidden": 3.623828125, "loss/jsd": 0.0, "loss/logits": 0.22840084582567216, "step": 13160 }, { "epoch": 0.439, "grad_norm": 32.0, "grad_norm_var": 4.076497395833333, "learning_rate": 0.0001, "loss": 8.0278, "loss/crossentropy": 2.149459010362625, "loss/hidden": 3.697265625, "loss/jsd": 0.0, "loss/logits": 0.2299880154430866, "step": 13170 }, { "epoch": 0.43933333333333335, "grad_norm": 29.0, "grad_norm_var": 2.039322916666667, "learning_rate": 0.0001, "loss": 8.0435, "loss/crossentropy": 2.022595777362585, "loss/hidden": 3.65390625, "loss/jsd": 0.0, "loss/logits": 0.21843499960377813, "step": 13180 }, { "epoch": 0.43966666666666665, "grad_norm": 32.5, "grad_norm_var": 8.3072265625, "learning_rate": 0.0001, "loss": 8.0339, "loss/crossentropy": 1.966893842816353, "loss/hidden": 3.7140625, "loss/jsd": 0.0, "loss/logits": 0.23092244230210782, "step": 13190 }, { "epoch": 0.44, "grad_norm": 30.125, "grad_norm_var": 1.8580729166666667, "learning_rate": 0.0001, "loss": 8.1766, "loss/crossentropy": 2.2937954008579253, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.24365076944231986, "step": 13200 }, { "epoch": 0.44033333333333335, "grad_norm": 42.0, "grad_norm_var": 8.8447265625, "learning_rate": 0.0001, "loss": 8.0685, "loss/crossentropy": 2.203465947508812, "loss/hidden": 3.625390625, "loss/jsd": 0.0, "loss/logits": 0.22517771869897843, "step": 13210 }, { "epoch": 0.44066666666666665, "grad_norm": 32.25, "grad_norm_var": 8.635872395833333, "learning_rate": 0.0001, "loss": 8.0298, "loss/crossentropy": 2.14248249232769, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.21284189969301223, "step": 13220 }, { "epoch": 0.441, "grad_norm": 27.75, "grad_norm_var": 4.1634765625, "learning_rate": 0.0001, "loss": 7.9522, "loss/crossentropy": 2.112074154615402, "loss/hidden": 3.856640625, "loss/jsd": 0.0, "loss/logits": 0.23715684618800878, "step": 13230 }, { "epoch": 0.44133333333333336, "grad_norm": 32.25, "grad_norm_var": 6.068489583333333, "learning_rate": 0.0001, "loss": 7.9992, "loss/crossentropy": 2.0217273235321045, "loss/hidden": 3.68984375, "loss/jsd": 0.0, "loss/logits": 0.22903131749480962, "step": 13240 }, { "epoch": 0.44166666666666665, "grad_norm": 30.5, "grad_norm_var": 11.684375, "learning_rate": 0.0001, "loss": 8.0119, "loss/crossentropy": 2.080713841319084, "loss/hidden": 3.68515625, "loss/jsd": 0.0, "loss/logits": 0.22152625005692245, "step": 13250 }, { "epoch": 0.442, "grad_norm": 37.0, "grad_norm_var": 1240.8749348958333, "learning_rate": 0.0001, "loss": 8.1225, "loss/crossentropy": 2.0688235819339753, "loss/hidden": 3.74296875, "loss/jsd": 0.0, "loss/logits": 0.22032655104994775, "step": 13260 }, { "epoch": 0.44233333333333336, "grad_norm": 32.75, "grad_norm_var": 1244.62890625, "learning_rate": 0.0001, "loss": 7.9617, "loss/crossentropy": 2.0978364631533624, "loss/hidden": 3.712109375, "loss/jsd": 0.0, "loss/logits": 0.2254788476973772, "step": 13270 }, { "epoch": 0.44266666666666665, "grad_norm": 34.0, "grad_norm_var": 5.268684895833333, "learning_rate": 0.0001, "loss": 8.0872, "loss/crossentropy": 2.0726170748472215, "loss/hidden": 3.562890625, "loss/jsd": 0.0, "loss/logits": 0.19704890344291925, "step": 13280 }, { "epoch": 0.443, "grad_norm": 29.75, "grad_norm_var": 6.375455729166666, "learning_rate": 0.0001, "loss": 7.9701, "loss/crossentropy": 2.2288454949855803, "loss/hidden": 3.67890625, "loss/jsd": 0.0, "loss/logits": 0.21981892809271814, "step": 13290 }, { "epoch": 0.44333333333333336, "grad_norm": 32.5, "grad_norm_var": 4.275, "learning_rate": 0.0001, "loss": 8.1481, "loss/crossentropy": 2.1981609016656876, "loss/hidden": 3.7671875, "loss/jsd": 0.0, "loss/logits": 0.2299773920327425, "step": 13300 }, { "epoch": 0.44366666666666665, "grad_norm": 31.125, "grad_norm_var": 5.218489583333334, "learning_rate": 0.0001, "loss": 8.1545, "loss/crossentropy": 2.1825950175523756, "loss/hidden": 3.631640625, "loss/jsd": 0.0, "loss/logits": 0.22280107364058493, "step": 13310 }, { "epoch": 0.444, "grad_norm": 31.375, "grad_norm_var": 5.693684895833333, "learning_rate": 0.0001, "loss": 8.0126, "loss/crossentropy": 2.1816250920295714, "loss/hidden": 3.725, "loss/jsd": 0.0, "loss/logits": 0.23028801158070564, "step": 13320 }, { "epoch": 0.44433333333333336, "grad_norm": 35.25, "grad_norm_var": 5.933072916666666, "learning_rate": 0.0001, "loss": 8.0725, "loss/crossentropy": 2.0108284398913385, "loss/hidden": 3.71484375, "loss/jsd": 0.0, "loss/logits": 0.2190965536981821, "step": 13330 }, { "epoch": 0.44466666666666665, "grad_norm": 30.5, "grad_norm_var": 9.8134765625, "learning_rate": 0.0001, "loss": 7.9707, "loss/crossentropy": 2.1558491311967374, "loss/hidden": 3.773828125, "loss/jsd": 0.0, "loss/logits": 0.20952776670455933, "step": 13340 }, { "epoch": 0.445, "grad_norm": 29.875, "grad_norm_var": 10.82890625, "learning_rate": 0.0001, "loss": 8.1556, "loss/crossentropy": 2.0820034801959992, "loss/hidden": 3.894921875, "loss/jsd": 0.0, "loss/logits": 0.25748000014573336, "step": 13350 }, { "epoch": 0.44533333333333336, "grad_norm": 32.0, "grad_norm_var": 20.269791666666666, "learning_rate": 0.0001, "loss": 8.1191, "loss/crossentropy": 2.0460492126643657, "loss/hidden": 3.7515625, "loss/jsd": 0.0, "loss/logits": 0.22600278463214635, "step": 13360 }, { "epoch": 0.44566666666666666, "grad_norm": 27.625, "grad_norm_var": 14.958072916666667, "learning_rate": 0.0001, "loss": 8.042, "loss/crossentropy": 2.1917740404605865, "loss/hidden": 3.61875, "loss/jsd": 0.0, "loss/logits": 0.20980511526577175, "step": 13370 }, { "epoch": 0.446, "grad_norm": 33.25, "grad_norm_var": 16.076497395833332, "learning_rate": 0.0001, "loss": 8.0894, "loss/crossentropy": 2.2024613440036775, "loss/hidden": 3.649609375, "loss/jsd": 0.0, "loss/logits": 0.2355958294123411, "step": 13380 }, { "epoch": 0.44633333333333336, "grad_norm": 33.5, "grad_norm_var": 17.540625, "learning_rate": 0.0001, "loss": 7.9545, "loss/crossentropy": 1.9565787248313427, "loss/hidden": 3.79765625, "loss/jsd": 0.0, "loss/logits": 0.21949218455702066, "step": 13390 }, { "epoch": 0.44666666666666666, "grad_norm": 33.75, "grad_norm_var": 20.5400390625, "learning_rate": 0.0001, "loss": 7.9703, "loss/crossentropy": 2.0982728376984596, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.20750523190945386, "step": 13400 }, { "epoch": 0.447, "grad_norm": 30.0, "grad_norm_var": 5.753580729166667, "learning_rate": 0.0001, "loss": 7.9816, "loss/crossentropy": 2.1122142657637597, "loss/hidden": 3.613671875, "loss/jsd": 0.0, "loss/logits": 0.22008793614804745, "step": 13410 }, { "epoch": 0.44733333333333336, "grad_norm": 35.25, "grad_norm_var": 5.796875, "learning_rate": 0.0001, "loss": 8.0286, "loss/crossentropy": 2.07668551504612, "loss/hidden": 3.783203125, "loss/jsd": 0.0, "loss/logits": 0.21882363129407167, "step": 13420 }, { "epoch": 0.44766666666666666, "grad_norm": 28.125, "grad_norm_var": 7.4822265625, "learning_rate": 0.0001, "loss": 7.8724, "loss/crossentropy": 2.0558082655072214, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.20779120028018952, "step": 13430 }, { "epoch": 0.448, "grad_norm": 31.25, "grad_norm_var": 43.58743489583333, "learning_rate": 0.0001, "loss": 8.0085, "loss/crossentropy": 2.0572322949767115, "loss/hidden": 3.719140625, "loss/jsd": 0.0, "loss/logits": 0.22204263061285018, "step": 13440 }, { "epoch": 0.4483333333333333, "grad_norm": 31.125, "grad_norm_var": 23.96640625, "learning_rate": 0.0001, "loss": 7.9379, "loss/crossentropy": 2.07759770154953, "loss/hidden": 3.603515625, "loss/jsd": 0.0, "loss/logits": 0.21129580233246087, "step": 13450 }, { "epoch": 0.44866666666666666, "grad_norm": 33.25, "grad_norm_var": 18.164322916666666, "learning_rate": 0.0001, "loss": 8.0347, "loss/crossentropy": 2.015417565405369, "loss/hidden": 3.744140625, "loss/jsd": 0.0, "loss/logits": 0.22753252387046813, "step": 13460 }, { "epoch": 0.449, "grad_norm": 29.25, "grad_norm_var": 25.124739583333334, "learning_rate": 0.0001, "loss": 8.1289, "loss/crossentropy": 2.05652796626091, "loss/hidden": 3.781640625, "loss/jsd": 0.0, "loss/logits": 0.24078646618872881, "step": 13470 }, { "epoch": 0.4493333333333333, "grad_norm": 29.625, "grad_norm_var": 24.924934895833335, "learning_rate": 0.0001, "loss": 8.0484, "loss/crossentropy": 2.1728298760950566, "loss/hidden": 3.636328125, "loss/jsd": 0.0, "loss/logits": 0.2125161023810506, "step": 13480 }, { "epoch": 0.44966666666666666, "grad_norm": 30.875, "grad_norm_var": 17.8150390625, "learning_rate": 0.0001, "loss": 7.8773, "loss/crossentropy": 2.021086546033621, "loss/hidden": 3.686328125, "loss/jsd": 0.0, "loss/logits": 0.2219966731965542, "step": 13490 }, { "epoch": 0.45, "grad_norm": 43.25, "grad_norm_var": 20.687955729166667, "learning_rate": 0.0001, "loss": 8.0709, "loss/crossentropy": 1.9942471355199813, "loss/hidden": 3.686328125, "loss/jsd": 0.0, "loss/logits": 0.24962616860866546, "step": 13500 }, { "epoch": 0.4503333333333333, "grad_norm": 30.375, "grad_norm_var": 19.948372395833335, "learning_rate": 0.0001, "loss": 8.031, "loss/crossentropy": 2.1690925747156142, "loss/hidden": 3.634765625, "loss/jsd": 0.0, "loss/logits": 0.2215597040951252, "step": 13510 }, { "epoch": 0.45066666666666666, "grad_norm": 32.25, "grad_norm_var": 11.6369140625, "learning_rate": 0.0001, "loss": 7.948, "loss/crossentropy": 2.134739102423191, "loss/hidden": 3.713671875, "loss/jsd": 0.0, "loss/logits": 0.2251646015793085, "step": 13520 }, { "epoch": 0.451, "grad_norm": 38.25, "grad_norm_var": 11.8212890625, "learning_rate": 0.0001, "loss": 7.8974, "loss/crossentropy": 2.251084867119789, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.214023519679904, "step": 13530 }, { "epoch": 0.4513333333333333, "grad_norm": 26.5, "grad_norm_var": 9.022916666666667, "learning_rate": 0.0001, "loss": 7.8569, "loss/crossentropy": 2.065364643931389, "loss/hidden": 3.59296875, "loss/jsd": 0.0, "loss/logits": 0.2112195746973157, "step": 13540 }, { "epoch": 0.45166666666666666, "grad_norm": 31.875, "grad_norm_var": 6.6087890625, "learning_rate": 0.0001, "loss": 8.094, "loss/crossentropy": 2.207910177111626, "loss/hidden": 3.665234375, "loss/jsd": 0.0, "loss/logits": 0.2230087785050273, "step": 13550 }, { "epoch": 0.452, "grad_norm": 32.25, "grad_norm_var": 2.4344770479298447e+18, "learning_rate": 0.0001, "loss": 8.0482, "loss/crossentropy": 2.0523271694779397, "loss/hidden": 3.74296875, "loss/jsd": 0.0, "loss/logits": 0.2225760780274868, "step": 13560 }, { "epoch": 0.4523333333333333, "grad_norm": 29.125, "grad_norm_var": 2.4344770485864627e+18, "learning_rate": 0.0001, "loss": 8.008, "loss/crossentropy": 2.1910267025232315, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.2098704855889082, "step": 13570 }, { "epoch": 0.45266666666666666, "grad_norm": 29.75, "grad_norm_var": 10.664583333333333, "learning_rate": 0.0001, "loss": 8.1107, "loss/crossentropy": 2.2485829517245293, "loss/hidden": 3.705078125, "loss/jsd": 0.0, "loss/logits": 0.23670779224485158, "step": 13580 }, { "epoch": 0.453, "grad_norm": 29.375, "grad_norm_var": 11.153125, "learning_rate": 0.0001, "loss": 8.0307, "loss/crossentropy": 2.176164289563894, "loss/hidden": 3.590625, "loss/jsd": 0.0, "loss/logits": 0.216172288171947, "step": 13590 }, { "epoch": 0.4533333333333333, "grad_norm": 28.5, "grad_norm_var": 4.143489583333333, "learning_rate": 0.0001, "loss": 7.9631, "loss/crossentropy": 2.0201455272734163, "loss/hidden": 3.5984375, "loss/jsd": 0.0, "loss/logits": 0.20862020272761583, "step": 13600 }, { "epoch": 0.45366666666666666, "grad_norm": 32.25, "grad_norm_var": 13.143489583333333, "learning_rate": 0.0001, "loss": 8.0374, "loss/crossentropy": 2.041334181651473, "loss/hidden": 3.77578125, "loss/jsd": 0.0, "loss/logits": 0.22034290386363864, "step": 13610 }, { "epoch": 0.454, "grad_norm": 29.75, "grad_norm_var": 2.6327473958333334, "learning_rate": 0.0001, "loss": 8.0312, "loss/crossentropy": 2.2032358795404434, "loss/hidden": 3.691796875, "loss/jsd": 0.0, "loss/logits": 0.23516745883971452, "step": 13620 }, { "epoch": 0.4543333333333333, "grad_norm": 34.0, "grad_norm_var": 10.5134765625, "learning_rate": 0.0001, "loss": 8.0251, "loss/crossentropy": 2.212277019023895, "loss/hidden": 3.70703125, "loss/jsd": 0.0, "loss/logits": 0.24109804332256318, "step": 13630 }, { "epoch": 0.45466666666666666, "grad_norm": 29.375, "grad_norm_var": 10.6587890625, "learning_rate": 0.0001, "loss": 7.9654, "loss/crossentropy": 2.0316985830664636, "loss/hidden": 3.719921875, "loss/jsd": 0.0, "loss/logits": 0.22388876546174288, "step": 13640 }, { "epoch": 0.455, "grad_norm": 27.25, "grad_norm_var": 15.3166015625, "learning_rate": 0.0001, "loss": 7.9874, "loss/crossentropy": 1.9860161900520326, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.2077900541946292, "step": 13650 }, { "epoch": 0.4553333333333333, "grad_norm": 30.125, "grad_norm_var": 22.734830729166667, "learning_rate": 0.0001, "loss": 8.0461, "loss/crossentropy": 2.2462501987814902, "loss/hidden": 3.64921875, "loss/jsd": 0.0, "loss/logits": 0.2214917227625847, "step": 13660 }, { "epoch": 0.45566666666666666, "grad_norm": 30.875, "grad_norm_var": 10.469205729166667, "learning_rate": 0.0001, "loss": 7.9203, "loss/crossentropy": 1.9449552714824676, "loss/hidden": 3.7953125, "loss/jsd": 0.0, "loss/logits": 0.21775004733353853, "step": 13670 }, { "epoch": 0.456, "grad_norm": 32.5, "grad_norm_var": 3.26640625, "learning_rate": 0.0001, "loss": 8.0556, "loss/crossentropy": 2.0170522332191467, "loss/hidden": 3.7125, "loss/jsd": 0.0, "loss/logits": 0.2343472855165601, "step": 13680 }, { "epoch": 0.4563333333333333, "grad_norm": 32.75, "grad_norm_var": 4.356184895833334, "learning_rate": 0.0001, "loss": 7.9995, "loss/crossentropy": 2.104297934472561, "loss/hidden": 3.646875, "loss/jsd": 0.0, "loss/logits": 0.21948921959847212, "step": 13690 }, { "epoch": 0.45666666666666667, "grad_norm": 30.875, "grad_norm_var": 3.4082682291666666, "learning_rate": 0.0001, "loss": 7.9706, "loss/crossentropy": 2.0912899121642115, "loss/hidden": 3.707421875, "loss/jsd": 0.0, "loss/logits": 0.21979312859475614, "step": 13700 }, { "epoch": 0.457, "grad_norm": 30.75, "grad_norm_var": 3.9270833333333335, "learning_rate": 0.0001, "loss": 7.9165, "loss/crossentropy": 2.121154861152172, "loss/hidden": 3.65546875, "loss/jsd": 0.0, "loss/logits": 0.2177841143682599, "step": 13710 }, { "epoch": 0.4573333333333333, "grad_norm": 36.5, "grad_norm_var": 5.361393229166667, "learning_rate": 0.0001, "loss": 8.0071, "loss/crossentropy": 2.136076480150223, "loss/hidden": 3.635546875, "loss/jsd": 0.0, "loss/logits": 0.23270511198788882, "step": 13720 }, { "epoch": 0.45766666666666667, "grad_norm": 29.625, "grad_norm_var": 7.4634765625, "learning_rate": 0.0001, "loss": 8.0277, "loss/crossentropy": 2.091973701864481, "loss/hidden": 3.5796875, "loss/jsd": 0.0, "loss/logits": 0.20484627303667366, "step": 13730 }, { "epoch": 0.458, "grad_norm": 30.625, "grad_norm_var": 3.123893229166667, "learning_rate": 0.0001, "loss": 7.9874, "loss/crossentropy": 2.2389348953962327, "loss/hidden": 3.6984375, "loss/jsd": 0.0, "loss/logits": 0.2297331139445305, "step": 13740 }, { "epoch": 0.4583333333333333, "grad_norm": 30.875, "grad_norm_var": 3.8634765625, "learning_rate": 0.0001, "loss": 8.2201, "loss/crossentropy": 2.1332207426428793, "loss/hidden": 3.782421875, "loss/jsd": 0.0, "loss/logits": 0.24505181200802326, "step": 13750 }, { "epoch": 0.45866666666666667, "grad_norm": 31.125, "grad_norm_var": 2.3067057291666666, "learning_rate": 0.0001, "loss": 7.8715, "loss/crossentropy": 1.9607113853096962, "loss/hidden": 3.636328125, "loss/jsd": 0.0, "loss/logits": 0.21614569872617723, "step": 13760 }, { "epoch": 0.459, "grad_norm": 30.75, "grad_norm_var": 3.914322916666667, "learning_rate": 0.0001, "loss": 8.024, "loss/crossentropy": 2.0893411085009577, "loss/hidden": 3.710546875, "loss/jsd": 0.0, "loss/logits": 0.22553631979972125, "step": 13770 }, { "epoch": 0.4593333333333333, "grad_norm": 35.5, "grad_norm_var": 31.7822265625, "learning_rate": 0.0001, "loss": 8.0481, "loss/crossentropy": 2.132881796360016, "loss/hidden": 3.697265625, "loss/jsd": 0.0, "loss/logits": 0.23121243454515933, "step": 13780 }, { "epoch": 0.45966666666666667, "grad_norm": 30.875, "grad_norm_var": 30.0931640625, "learning_rate": 0.0001, "loss": 8.2036, "loss/crossentropy": 2.3052436083555223, "loss/hidden": 3.640234375, "loss/jsd": 0.0, "loss/logits": 0.23779372237622737, "step": 13790 }, { "epoch": 0.46, "grad_norm": 31.375, "grad_norm_var": 3.6020182291666667, "learning_rate": 0.0001, "loss": 8.0591, "loss/crossentropy": 2.032654400169849, "loss/hidden": 3.706640625, "loss/jsd": 0.0, "loss/logits": 0.22621268928050994, "step": 13800 }, { "epoch": 0.4603333333333333, "grad_norm": 30.125, "grad_norm_var": 13.4212890625, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 2.011768199503422, "loss/hidden": 3.775390625, "loss/jsd": 0.0, "loss/logits": 0.21962493509054185, "step": 13810 }, { "epoch": 0.46066666666666667, "grad_norm": 33.5, "grad_norm_var": 13.245247395833333, "learning_rate": 0.0001, "loss": 7.9673, "loss/crossentropy": 2.0957317486405374, "loss/hidden": 3.58203125, "loss/jsd": 0.0, "loss/logits": 0.21697306856513024, "step": 13820 }, { "epoch": 0.461, "grad_norm": 30.875, "grad_norm_var": 5.01640625, "learning_rate": 0.0001, "loss": 8.0855, "loss/crossentropy": 2.2701291263103487, "loss/hidden": 3.691796875, "loss/jsd": 0.0, "loss/logits": 0.2501831637695432, "step": 13830 }, { "epoch": 0.4613333333333333, "grad_norm": 38.5, "grad_norm_var": 6.1462890625, "learning_rate": 0.0001, "loss": 7.9714, "loss/crossentropy": 2.0974492438137533, "loss/hidden": 3.861328125, "loss/jsd": 0.0, "loss/logits": 0.2554807654581964, "step": 13840 }, { "epoch": 0.46166666666666667, "grad_norm": 31.75, "grad_norm_var": 5.338997395833333, "learning_rate": 0.0001, "loss": 7.9723, "loss/crossentropy": 2.0560192227363587, "loss/hidden": 3.7453125, "loss/jsd": 0.0, "loss/logits": 0.22991488091647624, "step": 13850 }, { "epoch": 0.462, "grad_norm": 33.75, "grad_norm_var": 3.0452473958333335, "learning_rate": 0.0001, "loss": 8.1789, "loss/crossentropy": 2.1576769910752773, "loss/hidden": 3.63671875, "loss/jsd": 0.0, "loss/logits": 0.23487880751490592, "step": 13860 }, { "epoch": 0.4623333333333333, "grad_norm": 36.0, "grad_norm_var": 5.680208333333334, "learning_rate": 0.0001, "loss": 7.9999, "loss/crossentropy": 2.061297869682312, "loss/hidden": 3.701171875, "loss/jsd": 0.0, "loss/logits": 0.24054675735533237, "step": 13870 }, { "epoch": 0.46266666666666667, "grad_norm": 29.75, "grad_norm_var": 2.9879557291666665, "learning_rate": 0.0001, "loss": 8.0911, "loss/crossentropy": 2.0008441783487796, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.22590927435085179, "step": 13880 }, { "epoch": 0.463, "grad_norm": 31.0, "grad_norm_var": 4.104622395833333, "learning_rate": 0.0001, "loss": 7.9195, "loss/crossentropy": 2.132499638199806, "loss/hidden": 3.634765625, "loss/jsd": 0.0, "loss/logits": 0.2145272171124816, "step": 13890 }, { "epoch": 0.4633333333333333, "grad_norm": 35.25, "grad_norm_var": 5.206705729166667, "learning_rate": 0.0001, "loss": 8.0658, "loss/crossentropy": 2.259749516099691, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.2152292856015265, "step": 13900 }, { "epoch": 0.46366666666666667, "grad_norm": 31.75, "grad_norm_var": 3.825455729166667, "learning_rate": 0.0001, "loss": 7.924, "loss/crossentropy": 2.1031736478209497, "loss/hidden": 3.691796875, "loss/jsd": 0.0, "loss/logits": 0.23380641918629408, "step": 13910 }, { "epoch": 0.464, "grad_norm": 32.75, "grad_norm_var": 3.6497395833333335, "learning_rate": 0.0001, "loss": 8.152, "loss/crossentropy": 2.0122231051325796, "loss/hidden": 3.760546875, "loss/jsd": 0.0, "loss/logits": 0.22742101289331912, "step": 13920 }, { "epoch": 0.4643333333333333, "grad_norm": 33.25, "grad_norm_var": 4.8625, "learning_rate": 0.0001, "loss": 8.0922, "loss/crossentropy": 2.1263721615076063, "loss/hidden": 3.7046875, "loss/jsd": 0.0, "loss/logits": 0.22606851048767568, "step": 13930 }, { "epoch": 0.4646666666666667, "grad_norm": 30.5, "grad_norm_var": 2.9322265625, "learning_rate": 0.0001, "loss": 8.1061, "loss/crossentropy": 2.079650565981865, "loss/hidden": 3.73203125, "loss/jsd": 0.0, "loss/logits": 0.2313553038984537, "step": 13940 }, { "epoch": 0.465, "grad_norm": 30.125, "grad_norm_var": 5.832747395833334, "learning_rate": 0.0001, "loss": 7.9564, "loss/crossentropy": 2.284359359741211, "loss/hidden": 3.587109375, "loss/jsd": 0.0, "loss/logits": 0.22488110959529878, "step": 13950 }, { "epoch": 0.4653333333333333, "grad_norm": 30.75, "grad_norm_var": 2.8223307291666666, "learning_rate": 0.0001, "loss": 8.0784, "loss/crossentropy": 1.9363142460584641, "loss/hidden": 3.709375, "loss/jsd": 0.0, "loss/logits": 0.22093999302014708, "step": 13960 }, { "epoch": 0.4656666666666667, "grad_norm": 29.625, "grad_norm_var": 16.656184895833334, "learning_rate": 0.0001, "loss": 8.1271, "loss/crossentropy": 2.150686714053154, "loss/hidden": 3.700390625, "loss/jsd": 0.0, "loss/logits": 0.2396129213273525, "step": 13970 }, { "epoch": 0.466, "grad_norm": 30.5, "grad_norm_var": 1.6785807291666666, "learning_rate": 0.0001, "loss": 8.0096, "loss/crossentropy": 2.039792370796204, "loss/hidden": 3.678515625, "loss/jsd": 0.0, "loss/logits": 0.23292775694280862, "step": 13980 }, { "epoch": 0.4663333333333333, "grad_norm": 30.75, "grad_norm_var": 3.801497395833333, "learning_rate": 0.0001, "loss": 7.9907, "loss/crossentropy": 2.0111122995615007, "loss/hidden": 3.690234375, "loss/jsd": 0.0, "loss/logits": 0.22928319536149502, "step": 13990 }, { "epoch": 0.4666666666666667, "grad_norm": 38.0, "grad_norm_var": 5.3822265625, "learning_rate": 0.0001, "loss": 7.9428, "loss/crossentropy": 2.140259427577257, "loss/hidden": 3.747265625, "loss/jsd": 0.0, "loss/logits": 0.23131428118795155, "step": 14000 }, { "epoch": 0.467, "grad_norm": 36.25, "grad_norm_var": 6.494791666666667, "learning_rate": 0.0001, "loss": 8.0925, "loss/crossentropy": 2.0670676976442337, "loss/hidden": 3.81953125, "loss/jsd": 0.0, "loss/logits": 0.2178192425519228, "step": 14010 }, { "epoch": 0.4673333333333333, "grad_norm": 34.25, "grad_norm_var": 4.870572916666666, "learning_rate": 0.0001, "loss": 8.002, "loss/crossentropy": 2.1283276975154877, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.2105252131819725, "step": 14020 }, { "epoch": 0.4676666666666667, "grad_norm": 29.5, "grad_norm_var": 2.8785807291666665, "learning_rate": 0.0001, "loss": 7.9375, "loss/crossentropy": 2.189284147322178, "loss/hidden": 3.700390625, "loss/jsd": 0.0, "loss/logits": 0.23593165911734104, "step": 14030 }, { "epoch": 0.468, "grad_norm": 33.0, "grad_norm_var": 1.7705729166666666, "learning_rate": 0.0001, "loss": 7.8595, "loss/crossentropy": 2.0938302144408225, "loss/hidden": 3.683203125, "loss/jsd": 0.0, "loss/logits": 0.2286398086696863, "step": 14040 }, { "epoch": 0.4683333333333333, "grad_norm": 32.25, "grad_norm_var": 7.680208333333334, "learning_rate": 0.0001, "loss": 7.8761, "loss/crossentropy": 2.1059680595993995, "loss/hidden": 3.79921875, "loss/jsd": 0.0, "loss/logits": 0.22907451894134284, "step": 14050 }, { "epoch": 0.4686666666666667, "grad_norm": 31.375, "grad_norm_var": 15.355208333333334, "learning_rate": 0.0001, "loss": 8.0853, "loss/crossentropy": 2.011585946381092, "loss/hidden": 3.675390625, "loss/jsd": 0.0, "loss/logits": 0.23615019097924234, "step": 14060 }, { "epoch": 0.469, "grad_norm": 31.875, "grad_norm_var": 10.226041666666667, "learning_rate": 0.0001, "loss": 8.0453, "loss/crossentropy": 2.1759623274207116, "loss/hidden": 3.6734375, "loss/jsd": 0.0, "loss/logits": 0.23897310364991425, "step": 14070 }, { "epoch": 0.4693333333333333, "grad_norm": 28.875, "grad_norm_var": 3.2926432291666665, "learning_rate": 0.0001, "loss": 7.9501, "loss/crossentropy": 2.1067129537463187, "loss/hidden": 3.684765625, "loss/jsd": 0.0, "loss/logits": 0.21687035337090493, "step": 14080 }, { "epoch": 0.4696666666666667, "grad_norm": 32.75, "grad_norm_var": 10.553125, "learning_rate": 0.0001, "loss": 7.8951, "loss/crossentropy": 2.073711508512497, "loss/hidden": 3.58359375, "loss/jsd": 0.0, "loss/logits": 0.20740561783313752, "step": 14090 }, { "epoch": 0.47, "grad_norm": 42.75, "grad_norm_var": 11.9791015625, "learning_rate": 0.0001, "loss": 8.0888, "loss/crossentropy": 2.1998244017362594, "loss/hidden": 3.66171875, "loss/jsd": 0.0, "loss/logits": 0.24048520512878896, "step": 14100 }, { "epoch": 0.4703333333333333, "grad_norm": 35.75, "grad_norm_var": 12.2572265625, "learning_rate": 0.0001, "loss": 7.9691, "loss/crossentropy": 1.9739395514130593, "loss/hidden": 3.6375, "loss/jsd": 0.0, "loss/logits": 0.20827601440250873, "step": 14110 }, { "epoch": 0.4706666666666667, "grad_norm": 30.375, "grad_norm_var": 7.487239583333333, "learning_rate": 0.0001, "loss": 8.0045, "loss/crossentropy": 1.9031679958105088, "loss/hidden": 3.577734375, "loss/jsd": 0.0, "loss/logits": 0.22086440566927196, "step": 14120 }, { "epoch": 0.471, "grad_norm": 29.375, "grad_norm_var": 2.3363932291666667, "learning_rate": 0.0001, "loss": 7.9829, "loss/crossentropy": 2.1658167608082293, "loss/hidden": 3.58125, "loss/jsd": 0.0, "loss/logits": 0.21573278903961182, "step": 14130 }, { "epoch": 0.4713333333333333, "grad_norm": 31.125, "grad_norm_var": 5.509375, "learning_rate": 0.0001, "loss": 8.0107, "loss/crossentropy": 2.1494806349277495, "loss/hidden": 3.708203125, "loss/jsd": 0.0, "loss/logits": 0.23198096118867398, "step": 14140 }, { "epoch": 0.4716666666666667, "grad_norm": 32.75, "grad_norm_var": 7.82890625, "learning_rate": 0.0001, "loss": 7.9608, "loss/crossentropy": 2.176995001733303, "loss/hidden": 3.669921875, "loss/jsd": 0.0, "loss/logits": 0.22184601295739412, "step": 14150 }, { "epoch": 0.472, "grad_norm": 31.5, "grad_norm_var": 7.715559895833334, "learning_rate": 0.0001, "loss": 7.9168, "loss/crossentropy": 2.1046732500195504, "loss/hidden": 3.767578125, "loss/jsd": 0.0, "loss/logits": 0.2493480734527111, "step": 14160 }, { "epoch": 0.4723333333333333, "grad_norm": 34.5, "grad_norm_var": 4.651822916666666, "learning_rate": 0.0001, "loss": 8.0195, "loss/crossentropy": 2.0825782030820847, "loss/hidden": 3.859765625, "loss/jsd": 0.0, "loss/logits": 0.21776366755366325, "step": 14170 }, { "epoch": 0.4726666666666667, "grad_norm": 30.25, "grad_norm_var": 2.594073359300323e+18, "learning_rate": 0.0001, "loss": 7.9941, "loss/crossentropy": 2.0367950215935706, "loss/hidden": 3.656640625, "loss/jsd": 0.0, "loss/logits": 0.21370016261935235, "step": 14180 }, { "epoch": 0.473, "grad_norm": 31.375, "grad_norm_var": 8.9119140625, "learning_rate": 0.0001, "loss": 7.9755, "loss/crossentropy": 2.044953337311745, "loss/hidden": 3.748828125, "loss/jsd": 0.0, "loss/logits": 0.23148317448794842, "step": 14190 }, { "epoch": 0.47333333333333333, "grad_norm": 33.0, "grad_norm_var": 3.4613932291666667, "learning_rate": 0.0001, "loss": 8.0085, "loss/crossentropy": 2.0775508999824526, "loss/hidden": 3.666796875, "loss/jsd": 0.0, "loss/logits": 0.21293406821787358, "step": 14200 }, { "epoch": 0.4736666666666667, "grad_norm": 34.0, "grad_norm_var": 3.6113932291666666, "learning_rate": 0.0001, "loss": 7.9466, "loss/crossentropy": 2.060066529363394, "loss/hidden": 3.63125, "loss/jsd": 0.0, "loss/logits": 0.21972225215286018, "step": 14210 }, { "epoch": 0.474, "grad_norm": 29.625, "grad_norm_var": 14.949739583333333, "learning_rate": 0.0001, "loss": 7.9651, "loss/crossentropy": 2.1454847924411298, "loss/hidden": 3.675390625, "loss/jsd": 0.0, "loss/logits": 0.23835664317011834, "step": 14220 }, { "epoch": 0.47433333333333333, "grad_norm": 30.625, "grad_norm_var": 23.971809895833335, "learning_rate": 0.0001, "loss": 7.9706, "loss/crossentropy": 2.0558887153863905, "loss/hidden": 3.663671875, "loss/jsd": 0.0, "loss/logits": 0.22023830823600293, "step": 14230 }, { "epoch": 0.4746666666666667, "grad_norm": 30.75, "grad_norm_var": 5.146875, "learning_rate": 0.0001, "loss": 7.9699, "loss/crossentropy": 2.0701268397271635, "loss/hidden": 3.599609375, "loss/jsd": 0.0, "loss/logits": 0.21326845940202474, "step": 14240 }, { "epoch": 0.475, "grad_norm": 31.875, "grad_norm_var": 6.955989583333333, "learning_rate": 0.0001, "loss": 7.911, "loss/crossentropy": 2.0048009738326074, "loss/hidden": 3.685546875, "loss/jsd": 0.0, "loss/logits": 0.20537266619503497, "step": 14250 }, { "epoch": 0.47533333333333333, "grad_norm": 31.0, "grad_norm_var": 1.5291015625, "learning_rate": 0.0001, "loss": 8.0106, "loss/crossentropy": 1.9276894822716713, "loss/hidden": 3.639453125, "loss/jsd": 0.0, "loss/logits": 0.19781601782888175, "step": 14260 }, { "epoch": 0.4756666666666667, "grad_norm": 29.625, "grad_norm_var": 4.012955729166666, "learning_rate": 0.0001, "loss": 7.7983, "loss/crossentropy": 2.127088063955307, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.19607614930719136, "step": 14270 }, { "epoch": 0.476, "grad_norm": 32.25, "grad_norm_var": 4.188997395833334, "learning_rate": 0.0001, "loss": 7.859, "loss/crossentropy": 2.0276701495051386, "loss/hidden": 3.63515625, "loss/jsd": 0.0, "loss/logits": 0.22934147100895644, "step": 14280 }, { "epoch": 0.47633333333333333, "grad_norm": 28.875, "grad_norm_var": 15.313997395833333, "learning_rate": 0.0001, "loss": 7.9605, "loss/crossentropy": 2.1230327248573304, "loss/hidden": 3.80703125, "loss/jsd": 0.0, "loss/logits": 0.24875117875635624, "step": 14290 }, { "epoch": 0.4766666666666667, "grad_norm": 30.75, "grad_norm_var": 19.182291666666668, "learning_rate": 0.0001, "loss": 7.9298, "loss/crossentropy": 2.065612518787384, "loss/hidden": 3.6734375, "loss/jsd": 0.0, "loss/logits": 0.22124754767864943, "step": 14300 }, { "epoch": 0.477, "grad_norm": 30.75, "grad_norm_var": 10.710872395833333, "learning_rate": 0.0001, "loss": 7.812, "loss/crossentropy": 2.04611222743988, "loss/hidden": 3.694921875, "loss/jsd": 0.0, "loss/logits": 0.20459628701210023, "step": 14310 }, { "epoch": 0.47733333333333333, "grad_norm": 31.25, "grad_norm_var": 4.3697265625, "learning_rate": 0.0001, "loss": 8.1206, "loss/crossentropy": 2.121537686884403, "loss/hidden": 3.793359375, "loss/jsd": 0.0, "loss/logits": 0.2447736568748951, "step": 14320 }, { "epoch": 0.4776666666666667, "grad_norm": 35.0, "grad_norm_var": 6.0625, "learning_rate": 0.0001, "loss": 7.918, "loss/crossentropy": 1.9219128280878066, "loss/hidden": 3.623828125, "loss/jsd": 0.0, "loss/logits": 0.2076597328297794, "step": 14330 }, { "epoch": 0.478, "grad_norm": 33.5, "grad_norm_var": 9.618489583333334, "learning_rate": 0.0001, "loss": 7.8741, "loss/crossentropy": 2.039739317446947, "loss/hidden": 3.623046875, "loss/jsd": 0.0, "loss/logits": 0.21342823561280966, "step": 14340 }, { "epoch": 0.47833333333333333, "grad_norm": 30.0, "grad_norm_var": 10.0306640625, "learning_rate": 0.0001, "loss": 7.9885, "loss/crossentropy": 2.163815528154373, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.20537124276161195, "step": 14350 }, { "epoch": 0.4786666666666667, "grad_norm": 32.25, "grad_norm_var": 3.658072916666667, "learning_rate": 0.0001, "loss": 8.0153, "loss/crossentropy": 2.087558428943157, "loss/hidden": 3.76640625, "loss/jsd": 0.0, "loss/logits": 0.2245228797197342, "step": 14360 }, { "epoch": 0.479, "grad_norm": 31.0, "grad_norm_var": 1.5530598958333333, "learning_rate": 0.0001, "loss": 7.9594, "loss/crossentropy": 2.2264768585562704, "loss/hidden": 3.705078125, "loss/jsd": 0.0, "loss/logits": 0.23378158863633872, "step": 14370 }, { "epoch": 0.47933333333333333, "grad_norm": 30.25, "grad_norm_var": 1.8177083333333333, "learning_rate": 0.0001, "loss": 7.9091, "loss/crossentropy": 2.1510671019554137, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.21997169237583875, "step": 14380 }, { "epoch": 0.4796666666666667, "grad_norm": 30.0, "grad_norm_var": 4.183072916666666, "learning_rate": 0.0001, "loss": 8.1278, "loss/crossentropy": 1.996177999675274, "loss/hidden": 3.83203125, "loss/jsd": 0.0, "loss/logits": 0.22189988046884537, "step": 14390 }, { "epoch": 0.48, "grad_norm": 33.5, "grad_norm_var": 6.430143229166666, "learning_rate": 0.0001, "loss": 8.0033, "loss/crossentropy": 2.106117682904005, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.20855927262455226, "step": 14400 }, { "epoch": 0.48033333333333333, "grad_norm": 31.125, "grad_norm_var": 10.13515625, "learning_rate": 0.0001, "loss": 7.9176, "loss/crossentropy": 2.1830692276358605, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.2149069756269455, "step": 14410 }, { "epoch": 0.4806666666666667, "grad_norm": 31.875, "grad_norm_var": 8.387434895833334, "learning_rate": 0.0001, "loss": 8.0137, "loss/crossentropy": 2.2273536786437034, "loss/hidden": 3.727734375, "loss/jsd": 0.0, "loss/logits": 0.23837392879649996, "step": 14420 }, { "epoch": 0.481, "grad_norm": 29.375, "grad_norm_var": 7.53125, "learning_rate": 0.0001, "loss": 7.8473, "loss/crossentropy": 2.0939138531684875, "loss/hidden": 3.64453125, "loss/jsd": 0.0, "loss/logits": 0.20872913114726543, "step": 14430 }, { "epoch": 0.48133333333333334, "grad_norm": 30.375, "grad_norm_var": 5.299934895833333, "learning_rate": 0.0001, "loss": 7.9815, "loss/crossentropy": 2.166853901743889, "loss/hidden": 3.783203125, "loss/jsd": 0.0, "loss/logits": 0.24097004868090152, "step": 14440 }, { "epoch": 0.4816666666666667, "grad_norm": 32.0, "grad_norm_var": 3.1468098958333335, "learning_rate": 0.0001, "loss": 8.0419, "loss/crossentropy": 2.1047842048108576, "loss/hidden": 3.592578125, "loss/jsd": 0.0, "loss/logits": 0.20935575142502785, "step": 14450 }, { "epoch": 0.482, "grad_norm": 30.5, "grad_norm_var": 3.626030986454421e+18, "learning_rate": 0.0001, "loss": 7.9523, "loss/crossentropy": 2.057080474495888, "loss/hidden": 3.55078125, "loss/jsd": 0.0, "loss/logits": 0.2086722683161497, "step": 14460 }, { "epoch": 0.48233333333333334, "grad_norm": 30.0, "grad_norm_var": 103.56223958333334, "learning_rate": 0.0001, "loss": 7.8493, "loss/crossentropy": 2.046734869480133, "loss/hidden": 3.613671875, "loss/jsd": 0.0, "loss/logits": 0.21412031259387732, "step": 14470 }, { "epoch": 0.4826666666666667, "grad_norm": 31.875, "grad_norm_var": 101.18307291666666, "learning_rate": 0.0001, "loss": 7.9051, "loss/crossentropy": 2.065974645316601, "loss/hidden": 3.724609375, "loss/jsd": 0.0, "loss/logits": 0.23375277630984784, "step": 14480 }, { "epoch": 0.483, "grad_norm": 31.25, "grad_norm_var": 5.0541015625, "learning_rate": 0.0001, "loss": 7.8899, "loss/crossentropy": 2.0884762033820152, "loss/hidden": 3.63203125, "loss/jsd": 0.0, "loss/logits": 0.20585475508123635, "step": 14490 }, { "epoch": 0.48333333333333334, "grad_norm": 32.25, "grad_norm_var": 2.6499348958333333, "learning_rate": 0.0001, "loss": 7.9974, "loss/crossentropy": 2.2915369153022764, "loss/hidden": 3.62890625, "loss/jsd": 0.0, "loss/logits": 0.2142224058508873, "step": 14500 }, { "epoch": 0.4836666666666667, "grad_norm": 32.25, "grad_norm_var": 6.114322916666667, "learning_rate": 0.0001, "loss": 8.0747, "loss/crossentropy": 2.240130066871643, "loss/hidden": 3.709375, "loss/jsd": 0.0, "loss/logits": 0.23856233302503824, "step": 14510 }, { "epoch": 0.484, "grad_norm": 30.125, "grad_norm_var": 6.818684895833333, "learning_rate": 0.0001, "loss": 8.0631, "loss/crossentropy": 2.109334260225296, "loss/hidden": 3.775390625, "loss/jsd": 0.0, "loss/logits": 0.22316622659564017, "step": 14520 }, { "epoch": 0.48433333333333334, "grad_norm": 29.75, "grad_norm_var": 2.5233723958333334, "learning_rate": 0.0001, "loss": 7.9728, "loss/crossentropy": 2.1873391047120094, "loss/hidden": 3.580078125, "loss/jsd": 0.0, "loss/logits": 0.213110950961709, "step": 14530 }, { "epoch": 0.4846666666666667, "grad_norm": 177.0, "grad_norm_var": 1326.0431640625, "learning_rate": 0.0001, "loss": 7.9814, "loss/crossentropy": 2.0492543891072272, "loss/hidden": 3.687890625, "loss/jsd": 0.0, "loss/logits": 0.22612165659666061, "step": 14540 }, { "epoch": 0.485, "grad_norm": 35.0, "grad_norm_var": 1308.9384765625, "learning_rate": 0.0001, "loss": 7.9937, "loss/crossentropy": 2.0765153512358667, "loss/hidden": 3.716796875, "loss/jsd": 0.0, "loss/logits": 0.21763208881020546, "step": 14550 }, { "epoch": 0.48533333333333334, "grad_norm": 35.25, "grad_norm_var": 7.351822916666666, "learning_rate": 0.0001, "loss": 7.9428, "loss/crossentropy": 2.065848244726658, "loss/hidden": 3.709375, "loss/jsd": 0.0, "loss/logits": 0.21093793530017138, "step": 14560 }, { "epoch": 0.4856666666666667, "grad_norm": 34.75, "grad_norm_var": 4.43125, "learning_rate": 0.0001, "loss": 8.1182, "loss/crossentropy": 2.021928811073303, "loss/hidden": 3.63046875, "loss/jsd": 0.0, "loss/logits": 0.2118746515363455, "step": 14570 }, { "epoch": 0.486, "grad_norm": 35.25, "grad_norm_var": 6.9916015625, "learning_rate": 0.0001, "loss": 8.0468, "loss/crossentropy": 1.9547654077410699, "loss/hidden": 3.819140625, "loss/jsd": 0.0, "loss/logits": 0.23913611695170403, "step": 14580 }, { "epoch": 0.48633333333333334, "grad_norm": 33.25, "grad_norm_var": 833.4160807291667, "learning_rate": 0.0001, "loss": 8.0743, "loss/crossentropy": 1.9865302249789238, "loss/hidden": 3.90703125, "loss/jsd": 0.0, "loss/logits": 0.23207287043333052, "step": 14590 }, { "epoch": 0.4866666666666667, "grad_norm": 32.75, "grad_norm_var": 11.039322916666666, "learning_rate": 0.0001, "loss": 8.1016, "loss/crossentropy": 2.2250148117542268, "loss/hidden": 3.704296875, "loss/jsd": 0.0, "loss/logits": 0.21750411633402109, "step": 14600 }, { "epoch": 0.487, "grad_norm": 33.0, "grad_norm_var": 3.9218098958333334, "learning_rate": 0.0001, "loss": 7.9989, "loss/crossentropy": 2.0343859881162643, "loss/hidden": 3.581640625, "loss/jsd": 0.0, "loss/logits": 0.20473443409428, "step": 14610 }, { "epoch": 0.48733333333333334, "grad_norm": 33.5, "grad_norm_var": 102.97858072916667, "learning_rate": 0.0001, "loss": 8.0228, "loss/crossentropy": 2.1920588284730913, "loss/hidden": 3.72578125, "loss/jsd": 0.0, "loss/logits": 0.2238582916557789, "step": 14620 }, { "epoch": 0.4876666666666667, "grad_norm": 32.75, "grad_norm_var": 106.72057291666667, "learning_rate": 0.0001, "loss": 7.9914, "loss/crossentropy": 2.131534478068352, "loss/hidden": 3.618359375, "loss/jsd": 0.0, "loss/logits": 0.21192469485104085, "step": 14630 }, { "epoch": 0.488, "grad_norm": 37.0, "grad_norm_var": 130.6650390625, "learning_rate": 0.0001, "loss": 8.0006, "loss/crossentropy": 1.9975043579936027, "loss/hidden": 3.716796875, "loss/jsd": 0.0, "loss/logits": 0.20322852581739426, "step": 14640 }, { "epoch": 0.48833333333333334, "grad_norm": 30.375, "grad_norm_var": 125.80833333333334, "learning_rate": 0.0001, "loss": 7.9713, "loss/crossentropy": 2.1270855344831943, "loss/hidden": 3.663671875, "loss/jsd": 0.0, "loss/logits": 0.220531555917114, "step": 14650 }, { "epoch": 0.4886666666666667, "grad_norm": 31.5, "grad_norm_var": 3.64765625, "learning_rate": 0.0001, "loss": 8.0117, "loss/crossentropy": 2.089583569765091, "loss/hidden": 3.6625, "loss/jsd": 0.0, "loss/logits": 0.22326747328042984, "step": 14660 }, { "epoch": 0.489, "grad_norm": 30.75, "grad_norm_var": 4.487434895833333, "learning_rate": 0.0001, "loss": 7.9531, "loss/crossentropy": 1.9965915471315383, "loss/hidden": 3.665625, "loss/jsd": 0.0, "loss/logits": 0.21961635909974575, "step": 14670 }, { "epoch": 0.48933333333333334, "grad_norm": 38.0, "grad_norm_var": 24.405989583333334, "learning_rate": 0.0001, "loss": 8.0753, "loss/crossentropy": 2.1705081194639204, "loss/hidden": 3.66484375, "loss/jsd": 0.0, "loss/logits": 0.21931095998734235, "step": 14680 }, { "epoch": 0.48966666666666664, "grad_norm": 36.0, "grad_norm_var": 12.178125, "learning_rate": 0.0001, "loss": 7.943, "loss/crossentropy": 2.050249530375004, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.2148410253226757, "step": 14690 }, { "epoch": 0.49, "grad_norm": 30.125, "grad_norm_var": 4.697916666666667, "learning_rate": 0.0001, "loss": 7.9524, "loss/crossentropy": 2.0615778759121897, "loss/hidden": 3.587890625, "loss/jsd": 0.0, "loss/logits": 0.21635166741907597, "step": 14700 }, { "epoch": 0.49033333333333334, "grad_norm": 37.0, "grad_norm_var": 4.417643229166667, "learning_rate": 0.0001, "loss": 8.0631, "loss/crossentropy": 2.2596657291054725, "loss/hidden": 3.695703125, "loss/jsd": 0.0, "loss/logits": 0.23083409201353788, "step": 14710 }, { "epoch": 0.49066666666666664, "grad_norm": 39.5, "grad_norm_var": 16.4875, "learning_rate": 0.0001, "loss": 7.9577, "loss/crossentropy": 2.0465092822909354, "loss/hidden": 3.67890625, "loss/jsd": 0.0, "loss/logits": 0.22049062736332417, "step": 14720 }, { "epoch": 0.491, "grad_norm": 30.0, "grad_norm_var": 8.873893229166667, "learning_rate": 0.0001, "loss": 7.995, "loss/crossentropy": 2.16557691693306, "loss/hidden": 3.664453125, "loss/jsd": 0.0, "loss/logits": 0.21666408702731133, "step": 14730 }, { "epoch": 0.49133333333333334, "grad_norm": 33.75, "grad_norm_var": 5.620247395833333, "learning_rate": 0.0001, "loss": 7.9573, "loss/crossentropy": 2.158591315150261, "loss/hidden": 3.630859375, "loss/jsd": 0.0, "loss/logits": 0.22307645455002784, "step": 14740 }, { "epoch": 0.49166666666666664, "grad_norm": 31.5, "grad_norm_var": 9.358268229166667, "learning_rate": 0.0001, "loss": 7.9671, "loss/crossentropy": 2.0228962182998655, "loss/hidden": 3.733203125, "loss/jsd": 0.0, "loss/logits": 0.21425336729735137, "step": 14750 }, { "epoch": 0.492, "grad_norm": 44.75, "grad_norm_var": 16.083333333333332, "learning_rate": 0.0001, "loss": 7.9912, "loss/crossentropy": 2.1796020001173018, "loss/hidden": 3.570703125, "loss/jsd": 0.0, "loss/logits": 0.2131647277623415, "step": 14760 }, { "epoch": 0.49233333333333335, "grad_norm": 38.0, "grad_norm_var": 18.555989583333332, "learning_rate": 0.0001, "loss": 8.0265, "loss/crossentropy": 2.079108493030071, "loss/hidden": 3.732421875, "loss/jsd": 0.0, "loss/logits": 0.21902708765119314, "step": 14770 }, { "epoch": 0.49266666666666664, "grad_norm": 30.875, "grad_norm_var": 5.920247395833333, "learning_rate": 0.0001, "loss": 8.1693, "loss/crossentropy": 2.2386298209428785, "loss/hidden": 3.790234375, "loss/jsd": 0.0, "loss/logits": 0.24830550476908683, "step": 14780 }, { "epoch": 0.493, "grad_norm": 37.5, "grad_norm_var": 14.933333333333334, "learning_rate": 0.0001, "loss": 7.9356, "loss/crossentropy": 2.29894140958786, "loss/hidden": 3.671484375, "loss/jsd": 0.0, "loss/logits": 0.21857519987970592, "step": 14790 }, { "epoch": 0.49333333333333335, "grad_norm": 38.5, "grad_norm_var": 19.596809895833335, "learning_rate": 0.0001, "loss": 7.993, "loss/crossentropy": 2.003948637843132, "loss/hidden": 3.584765625, "loss/jsd": 0.0, "loss/logits": 0.20476762484759092, "step": 14800 }, { "epoch": 0.49366666666666664, "grad_norm": 31.625, "grad_norm_var": 9.97890625, "learning_rate": 0.0001, "loss": 8.0814, "loss/crossentropy": 2.151717406511307, "loss/hidden": 3.7, "loss/jsd": 0.0, "loss/logits": 0.22604979574680328, "step": 14810 }, { "epoch": 0.494, "grad_norm": 37.5, "grad_norm_var": 8.230989583333333, "learning_rate": 0.0001, "loss": 8.1202, "loss/crossentropy": 1.9646364904940128, "loss/hidden": 3.797265625, "loss/jsd": 0.0, "loss/logits": 0.24005853114649653, "step": 14820 }, { "epoch": 0.49433333333333335, "grad_norm": 29.875, "grad_norm_var": 10.67890625, "learning_rate": 0.0001, "loss": 8.032, "loss/crossentropy": 2.162761890888214, "loss/hidden": 3.69375, "loss/jsd": 0.0, "loss/logits": 0.2281369637697935, "step": 14830 }, { "epoch": 0.49466666666666664, "grad_norm": 29.25, "grad_norm_var": 17.162955729166665, "learning_rate": 0.0001, "loss": 7.9308, "loss/crossentropy": 1.809071047604084, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.20106423925608397, "step": 14840 }, { "epoch": 0.495, "grad_norm": 31.25, "grad_norm_var": 14.433268229166666, "learning_rate": 0.0001, "loss": 7.8974, "loss/crossentropy": 2.1457689851522446, "loss/hidden": 3.73984375, "loss/jsd": 0.0, "loss/logits": 0.24656938333064318, "step": 14850 }, { "epoch": 0.49533333333333335, "grad_norm": 29.125, "grad_norm_var": 12.724739583333333, "learning_rate": 0.0001, "loss": 8.0257, "loss/crossentropy": 2.097585503757, "loss/hidden": 3.728125, "loss/jsd": 0.0, "loss/logits": 0.2211755273863673, "step": 14860 }, { "epoch": 0.49566666666666664, "grad_norm": 34.25, "grad_norm_var": 11.1931640625, "learning_rate": 0.0001, "loss": 8.0109, "loss/crossentropy": 2.1149248749017717, "loss/hidden": 3.628515625, "loss/jsd": 0.0, "loss/logits": 0.20364532712846994, "step": 14870 }, { "epoch": 0.496, "grad_norm": 29.625, "grad_norm_var": 2.5940733597902177e+18, "learning_rate": 0.0001, "loss": 8.023, "loss/crossentropy": 2.096835497021675, "loss/hidden": 3.75546875, "loss/jsd": 0.0, "loss/logits": 0.21487231757491826, "step": 14880 }, { "epoch": 0.49633333333333335, "grad_norm": 27.625, "grad_norm_var": 7.139583333333333, "learning_rate": 0.0001, "loss": 7.9587, "loss/crossentropy": 1.9992181949317456, "loss/hidden": 3.6421875, "loss/jsd": 0.0, "loss/logits": 0.20439089126884938, "step": 14890 }, { "epoch": 0.49666666666666665, "grad_norm": 31.0, "grad_norm_var": 11.989583333333334, "learning_rate": 0.0001, "loss": 8.0151, "loss/crossentropy": 1.997377061843872, "loss/hidden": 3.794921875, "loss/jsd": 0.0, "loss/logits": 0.22013774681836368, "step": 14900 }, { "epoch": 0.497, "grad_norm": 31.625, "grad_norm_var": 6.09765625, "learning_rate": 0.0001, "loss": 8.0484, "loss/crossentropy": 2.2603927135467528, "loss/hidden": 3.6703125, "loss/jsd": 0.0, "loss/logits": 0.22995477840304374, "step": 14910 }, { "epoch": 0.49733333333333335, "grad_norm": 28.25, "grad_norm_var": 2.544791666666667, "learning_rate": 0.0001, "loss": 7.9737, "loss/crossentropy": 2.167400282621384, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.20647088065743446, "step": 14920 }, { "epoch": 0.49766666666666665, "grad_norm": 28.25, "grad_norm_var": 17.2556640625, "learning_rate": 0.0001, "loss": 7.9713, "loss/crossentropy": 1.9748560175299645, "loss/hidden": 3.684765625, "loss/jsd": 0.0, "loss/logits": 0.21687341015785933, "step": 14930 }, { "epoch": 0.498, "grad_norm": 34.75, "grad_norm_var": 15.188997395833333, "learning_rate": 0.0001, "loss": 7.9405, "loss/crossentropy": 2.1966336160898208, "loss/hidden": 3.56796875, "loss/jsd": 0.0, "loss/logits": 0.20776706095784903, "step": 14940 }, { "epoch": 0.49833333333333335, "grad_norm": 7079985152.0, "grad_norm_var": 3.1328868311327135e+18, "learning_rate": 0.0001, "loss": 7.9715, "loss/crossentropy": 2.108351056277752, "loss/hidden": 3.567578125, "loss/jsd": 0.0, "loss/logits": 0.21284203305840493, "step": 14950 }, { "epoch": 0.49866666666666665, "grad_norm": 32.5, "grad_norm_var": 3.1328868312875884e+18, "learning_rate": 0.0001, "loss": 8.0864, "loss/crossentropy": 2.0066813334822653, "loss/hidden": 3.6734375, "loss/jsd": 0.0, "loss/logits": 0.226681593246758, "step": 14960 }, { "epoch": 0.499, "grad_norm": 33.5, "grad_norm_var": 2.80390625, "learning_rate": 0.0001, "loss": 8.0583, "loss/crossentropy": 1.9094878628849983, "loss/hidden": 3.566015625, "loss/jsd": 0.0, "loss/logits": 0.21297882869839668, "step": 14970 }, { "epoch": 0.49933333333333335, "grad_norm": 31.375, "grad_norm_var": 3.4322916666666665, "learning_rate": 0.0001, "loss": 7.9844, "loss/crossentropy": 2.138418735563755, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.218233341909945, "step": 14980 }, { "epoch": 0.49966666666666665, "grad_norm": 32.25, "grad_norm_var": 2.3619140625, "learning_rate": 0.0001, "loss": 7.9809, "loss/crossentropy": 2.0114166542887686, "loss/hidden": 3.661328125, "loss/jsd": 0.0, "loss/logits": 0.21623858716338873, "step": 14990 }, { "epoch": 0.5, "grad_norm": 41.0, "grad_norm_var": 246.77180989583334, "learning_rate": 0.0001, "loss": 8.0603, "loss/crossentropy": 2.034164222329855, "loss/hidden": 3.712109375, "loss/jsd": 0.0, "loss/logits": 0.2103666251525283, "step": 15000 }, { "epoch": 0.5003333333333333, "grad_norm": 30.5, "grad_norm_var": 248.26640625, "learning_rate": 0.0001, "loss": 8.1163, "loss/crossentropy": 2.1978528052568436, "loss/hidden": 3.641796875, "loss/jsd": 0.0, "loss/logits": 0.2373816639184952, "step": 15010 }, { "epoch": 0.5006666666666667, "grad_norm": 31.375, "grad_norm_var": 2.21015625, "learning_rate": 0.0001, "loss": 7.9552, "loss/crossentropy": 2.03947726637125, "loss/hidden": 3.6125, "loss/jsd": 0.0, "loss/logits": 0.2085746269673109, "step": 15020 }, { "epoch": 0.501, "grad_norm": 32.75, "grad_norm_var": 6.3572265625, "learning_rate": 0.0001, "loss": 8.1111, "loss/crossentropy": 2.008555364608765, "loss/hidden": 3.66171875, "loss/jsd": 0.0, "loss/logits": 0.20857672542333602, "step": 15030 }, { "epoch": 0.5013333333333333, "grad_norm": 29.875, "grad_norm_var": 4.277018229166667, "learning_rate": 0.0001, "loss": 7.942, "loss/crossentropy": 2.0552697718143462, "loss/hidden": 3.753125, "loss/jsd": 0.0, "loss/logits": 0.21989797037094833, "step": 15040 }, { "epoch": 0.5016666666666667, "grad_norm": 169.0, "grad_norm_var": 1190.8098307291666, "learning_rate": 0.0001, "loss": 8.0327, "loss/crossentropy": 2.1051773697137834, "loss/hidden": 3.775390625, "loss/jsd": 0.0, "loss/logits": 0.2577056746929884, "step": 15050 }, { "epoch": 0.502, "grad_norm": 31.25, "grad_norm_var": 1164.67890625, "learning_rate": 0.0001, "loss": 8.0819, "loss/crossentropy": 2.186911401152611, "loss/hidden": 3.72421875, "loss/jsd": 0.0, "loss/logits": 0.21980911456048488, "step": 15060 }, { "epoch": 0.5023333333333333, "grad_norm": 28.0, "grad_norm_var": 4.070833333333334, "learning_rate": 0.0001, "loss": 8.0812, "loss/crossentropy": 2.112710100412369, "loss/hidden": 3.774609375, "loss/jsd": 0.0, "loss/logits": 0.23789308052510022, "step": 15070 }, { "epoch": 0.5026666666666667, "grad_norm": 29.0, "grad_norm_var": 7.726497395833333, "learning_rate": 0.0001, "loss": 8.0101, "loss/crossentropy": 2.1419308796525, "loss/hidden": 3.6109375, "loss/jsd": 0.0, "loss/logits": 0.21457258183509112, "step": 15080 }, { "epoch": 0.503, "grad_norm": 38.0, "grad_norm_var": 12497.576041666667, "learning_rate": 0.0001, "loss": 8.1218, "loss/crossentropy": 2.134933979809284, "loss/hidden": 3.790234375, "loss/jsd": 0.0, "loss/logits": 0.23342457674443723, "step": 15090 }, { "epoch": 0.5033333333333333, "grad_norm": 29.5, "grad_norm_var": 20.352083333333333, "learning_rate": 0.0001, "loss": 7.9202, "loss/crossentropy": 2.3430344820022584, "loss/hidden": 3.519140625, "loss/jsd": 0.0, "loss/logits": 0.2118502750992775, "step": 15100 }, { "epoch": 0.5036666666666667, "grad_norm": 35.25, "grad_norm_var": 22.322330729166666, "learning_rate": 0.0001, "loss": 7.9041, "loss/crossentropy": 1.9799678571522237, "loss/hidden": 3.635546875, "loss/jsd": 0.0, "loss/logits": 0.21085582114756107, "step": 15110 }, { "epoch": 0.504, "grad_norm": 30.625, "grad_norm_var": 16.030143229166665, "learning_rate": 0.0001, "loss": 7.9714, "loss/crossentropy": 1.9788647621870041, "loss/hidden": 3.731640625, "loss/jsd": 0.0, "loss/logits": 0.20716245826333762, "step": 15120 }, { "epoch": 0.5043333333333333, "grad_norm": 32.25, "grad_norm_var": 14.680989583333334, "learning_rate": 0.0001, "loss": 7.8813, "loss/crossentropy": 2.1622576892375944, "loss/hidden": 3.61875, "loss/jsd": 0.0, "loss/logits": 0.21591525189578534, "step": 15130 }, { "epoch": 0.5046666666666667, "grad_norm": 34.25, "grad_norm_var": 12.263541666666667, "learning_rate": 0.0001, "loss": 8.0432, "loss/crossentropy": 2.158085845410824, "loss/hidden": 3.726953125, "loss/jsd": 0.0, "loss/logits": 0.22357744220644235, "step": 15140 }, { "epoch": 0.505, "grad_norm": 37.0, "grad_norm_var": 1032.2447265625, "learning_rate": 0.0001, "loss": 8.0755, "loss/crossentropy": 2.2892831161618235, "loss/hidden": 3.76328125, "loss/jsd": 0.0, "loss/logits": 0.23992609437555074, "step": 15150 }, { "epoch": 0.5053333333333333, "grad_norm": 34.0, "grad_norm_var": 1034.0916666666667, "learning_rate": 0.0001, "loss": 7.9403, "loss/crossentropy": 2.1311425492167473, "loss/hidden": 3.624609375, "loss/jsd": 0.0, "loss/logits": 0.21165461391210555, "step": 15160 }, { "epoch": 0.5056666666666667, "grad_norm": 29.25, "grad_norm_var": 3.64375, "learning_rate": 0.0001, "loss": 7.9553, "loss/crossentropy": 2.171247933804989, "loss/hidden": 3.805859375, "loss/jsd": 0.0, "loss/logits": 0.23857482858002185, "step": 15170 }, { "epoch": 0.506, "grad_norm": 28.75, "grad_norm_var": 2.582291666666667, "learning_rate": 0.0001, "loss": 7.9445, "loss/crossentropy": 1.846039692312479, "loss/hidden": 3.678515625, "loss/jsd": 0.0, "loss/logits": 0.19486570674926043, "step": 15180 }, { "epoch": 0.5063333333333333, "grad_norm": 34.75, "grad_norm_var": 6.375455729166666, "learning_rate": 0.0001, "loss": 8.0507, "loss/crossentropy": 2.1973116233944894, "loss/hidden": 3.7453125, "loss/jsd": 0.0, "loss/logits": 0.2503014124929905, "step": 15190 }, { "epoch": 0.5066666666666667, "grad_norm": 30.0, "grad_norm_var": 13.45625, "learning_rate": 0.0001, "loss": 8.0027, "loss/crossentropy": 1.9380142621695995, "loss/hidden": 3.798828125, "loss/jsd": 0.0, "loss/logits": 0.23612585961818694, "step": 15200 }, { "epoch": 0.507, "grad_norm": 31.0, "grad_norm_var": 4.994205729166667, "learning_rate": 0.0001, "loss": 7.9165, "loss/crossentropy": 1.953425794839859, "loss/hidden": 3.744921875, "loss/jsd": 0.0, "loss/logits": 0.24761936087161301, "step": 15210 }, { "epoch": 0.5073333333333333, "grad_norm": 33.0, "grad_norm_var": 6.151822916666666, "learning_rate": 0.0001, "loss": 7.947, "loss/crossentropy": 2.066864788532257, "loss/hidden": 3.654296875, "loss/jsd": 0.0, "loss/logits": 0.21229154095053673, "step": 15220 }, { "epoch": 0.5076666666666667, "grad_norm": 33.0, "grad_norm_var": 13.453059895833333, "learning_rate": 0.0001, "loss": 7.9878, "loss/crossentropy": 2.2750732988119124, "loss/hidden": 3.66640625, "loss/jsd": 0.0, "loss/logits": 0.226811171323061, "step": 15230 }, { "epoch": 0.508, "grad_norm": 30.375, "grad_norm_var": 9.880143229166666, "learning_rate": 0.0001, "loss": 7.8715, "loss/crossentropy": 2.0473168551921845, "loss/hidden": 3.66015625, "loss/jsd": 0.0, "loss/logits": 0.20602242182940245, "step": 15240 }, { "epoch": 0.5083333333333333, "grad_norm": 34.25, "grad_norm_var": 7.327018229166667, "learning_rate": 0.0001, "loss": 8.1526, "loss/crossentropy": 2.1135044425725935, "loss/hidden": 3.73125, "loss/jsd": 0.0, "loss/logits": 0.25442443899810313, "step": 15250 }, { "epoch": 0.5086666666666667, "grad_norm": 33.25, "grad_norm_var": 5.2322265625, "learning_rate": 0.0001, "loss": 7.9502, "loss/crossentropy": 2.002006813138723, "loss/hidden": 3.751953125, "loss/jsd": 0.0, "loss/logits": 0.22036314904689788, "step": 15260 }, { "epoch": 0.509, "grad_norm": 31.5, "grad_norm_var": 5.1212890625, "learning_rate": 0.0001, "loss": 7.9084, "loss/crossentropy": 2.3665783375501634, "loss/hidden": 3.685546875, "loss/jsd": 0.0, "loss/logits": 0.23151662331074477, "step": 15270 }, { "epoch": 0.5093333333333333, "grad_norm": 39.0, "grad_norm_var": 7.746809895833334, "learning_rate": 0.0001, "loss": 7.9309, "loss/crossentropy": 1.96308908239007, "loss/hidden": 3.65703125, "loss/jsd": 0.0, "loss/logits": 0.21435827864333987, "step": 15280 }, { "epoch": 0.5096666666666667, "grad_norm": 31.25, "grad_norm_var": 12.887239583333333, "learning_rate": 0.0001, "loss": 7.8462, "loss/crossentropy": 2.0475788712501526, "loss/hidden": 3.5890625, "loss/jsd": 0.0, "loss/logits": 0.209853470697999, "step": 15290 }, { "epoch": 0.51, "grad_norm": 34.5, "grad_norm_var": 8.1666015625, "learning_rate": 0.0001, "loss": 7.9855, "loss/crossentropy": 2.048559895157814, "loss/hidden": 3.743359375, "loss/jsd": 0.0, "loss/logits": 0.22797312829643487, "step": 15300 }, { "epoch": 0.5103333333333333, "grad_norm": 30.25, "grad_norm_var": 4.605989583333334, "learning_rate": 0.0001, "loss": 7.9321, "loss/crossentropy": 2.081040045619011, "loss/hidden": 3.7390625, "loss/jsd": 0.0, "loss/logits": 0.22966741789132356, "step": 15310 }, { "epoch": 0.5106666666666667, "grad_norm": 33.75, "grad_norm_var": 5.2681640625, "learning_rate": 0.0001, "loss": 8.0778, "loss/crossentropy": 2.1107777029275896, "loss/hidden": 3.535546875, "loss/jsd": 0.0, "loss/logits": 0.20291287880390882, "step": 15320 }, { "epoch": 0.511, "grad_norm": 29.5, "grad_norm_var": 4.077083333333333, "learning_rate": 0.0001, "loss": 7.9746, "loss/crossentropy": 2.1766668647527694, "loss/hidden": 3.625390625, "loss/jsd": 0.0, "loss/logits": 0.21749209128320218, "step": 15330 }, { "epoch": 0.5113333333333333, "grad_norm": 35.75, "grad_norm_var": 6.962239583333333, "learning_rate": 0.0001, "loss": 8.039, "loss/crossentropy": 2.0856088645756246, "loss/hidden": 3.663671875, "loss/jsd": 0.0, "loss/logits": 0.22033254262059926, "step": 15340 }, { "epoch": 0.5116666666666667, "grad_norm": 30.375, "grad_norm_var": 6.554166666666666, "learning_rate": 0.0001, "loss": 7.9891, "loss/crossentropy": 2.093947410583496, "loss/hidden": 3.667578125, "loss/jsd": 0.0, "loss/logits": 0.22944947555661202, "step": 15350 }, { "epoch": 0.512, "grad_norm": 41.25, "grad_norm_var": 9.861393229166667, "learning_rate": 0.0001, "loss": 7.9193, "loss/crossentropy": 2.1564169749617577, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.2197624057531357, "step": 15360 }, { "epoch": 0.5123333333333333, "grad_norm": 33.25, "grad_norm_var": 10.402018229166666, "learning_rate": 0.0001, "loss": 7.9731, "loss/crossentropy": 2.135643947124481, "loss/hidden": 3.63203125, "loss/jsd": 0.0, "loss/logits": 0.21202728524804115, "step": 15370 }, { "epoch": 0.5126666666666667, "grad_norm": 30.0, "grad_norm_var": 4.05323963237086e+18, "learning_rate": 0.0001, "loss": 7.9059, "loss/crossentropy": 1.9712642952799797, "loss/hidden": 3.63125, "loss/jsd": 0.0, "loss/logits": 0.19932207949459552, "step": 15380 }, { "epoch": 0.513, "grad_norm": 32.0, "grad_norm_var": 4.053239631984984e+18, "learning_rate": 0.0001, "loss": 8.0131, "loss/crossentropy": 2.114562599360943, "loss/hidden": 3.741796875, "loss/jsd": 0.0, "loss/logits": 0.24068702533841133, "step": 15390 }, { "epoch": 0.5133333333333333, "grad_norm": 30.5, "grad_norm_var": 15.04140625, "learning_rate": 0.0001, "loss": 7.9017, "loss/crossentropy": 2.1701153457164764, "loss/hidden": 3.631640625, "loss/jsd": 0.0, "loss/logits": 0.21901333723217248, "step": 15400 }, { "epoch": 0.5136666666666667, "grad_norm": 29.25, "grad_norm_var": 8.6369140625, "learning_rate": 0.0001, "loss": 7.9558, "loss/crossentropy": 2.1302370369434356, "loss/hidden": 3.73359375, "loss/jsd": 0.0, "loss/logits": 0.22752520255744457, "step": 15410 }, { "epoch": 0.514, "grad_norm": 37.5, "grad_norm_var": 10.702018229166667, "learning_rate": 0.0001, "loss": 8.0334, "loss/crossentropy": 2.076037485152483, "loss/hidden": 3.653125, "loss/jsd": 0.0, "loss/logits": 0.2252957560122013, "step": 15420 }, { "epoch": 0.5143333333333333, "grad_norm": 33.0, "grad_norm_var": 11.815625, "learning_rate": 0.0001, "loss": 7.9922, "loss/crossentropy": 2.090342365950346, "loss/hidden": 3.630859375, "loss/jsd": 0.0, "loss/logits": 0.2138144064694643, "step": 15430 }, { "epoch": 0.5146666666666667, "grad_norm": 39.0, "grad_norm_var": 28.45390625, "learning_rate": 0.0001, "loss": 8.0114, "loss/crossentropy": 2.163317432999611, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.20781796853989362, "step": 15440 }, { "epoch": 0.515, "grad_norm": 34.5, "grad_norm_var": 10.485872395833333, "learning_rate": 0.0001, "loss": 7.9288, "loss/crossentropy": 2.043715859204531, "loss/hidden": 3.691015625, "loss/jsd": 0.0, "loss/logits": 0.2161100683733821, "step": 15450 }, { "epoch": 0.5153333333333333, "grad_norm": 30.375, "grad_norm_var": 5.3556640625, "learning_rate": 0.0001, "loss": 8.0563, "loss/crossentropy": 2.294564101099968, "loss/hidden": 3.6375, "loss/jsd": 0.0, "loss/logits": 0.23372058011591434, "step": 15460 }, { "epoch": 0.5156666666666667, "grad_norm": 28.25, "grad_norm_var": 7.530989583333334, "learning_rate": 0.0001, "loss": 7.8509, "loss/crossentropy": 1.954894269257784, "loss/hidden": 3.632421875, "loss/jsd": 0.0, "loss/logits": 0.2037217952311039, "step": 15470 }, { "epoch": 0.516, "grad_norm": 31.375, "grad_norm_var": 16.819791666666667, "learning_rate": 0.0001, "loss": 7.7803, "loss/crossentropy": 2.1873670905828475, "loss/hidden": 3.649609375, "loss/jsd": 0.0, "loss/logits": 0.22097154781222345, "step": 15480 }, { "epoch": 0.5163333333333333, "grad_norm": 27.875, "grad_norm_var": 16.633072916666666, "learning_rate": 0.0001, "loss": 7.9936, "loss/crossentropy": 2.12020967900753, "loss/hidden": 3.719140625, "loss/jsd": 0.0, "loss/logits": 0.2284359024837613, "step": 15490 }, { "epoch": 0.5166666666666667, "grad_norm": 33.0, "grad_norm_var": 13.01875, "learning_rate": 0.0001, "loss": 8.0295, "loss/crossentropy": 2.144003964960575, "loss/hidden": 3.5765625, "loss/jsd": 0.0, "loss/logits": 0.2127245606854558, "step": 15500 }, { "epoch": 0.517, "grad_norm": 6140461056.0, "grad_norm_var": 2.3565788491207936e+18, "learning_rate": 0.0001, "loss": 7.9846, "loss/crossentropy": 2.0483295105397703, "loss/hidden": 3.84296875, "loss/jsd": 0.0, "loss/logits": 0.21562441848218442, "step": 15510 }, { "epoch": 0.5173333333333333, "grad_norm": 27.625, "grad_norm_var": 2.3565788492039455e+18, "learning_rate": 0.0001, "loss": 7.8762, "loss/crossentropy": 2.045905639976263, "loss/hidden": 3.5796875, "loss/jsd": 0.0, "loss/logits": 0.21305822925642132, "step": 15520 }, { "epoch": 0.5176666666666667, "grad_norm": 30.25, "grad_norm_var": 7.0087890625, "learning_rate": 0.0001, "loss": 7.9002, "loss/crossentropy": 2.0140881910920143, "loss/hidden": 3.78203125, "loss/jsd": 0.0, "loss/logits": 0.2302077604457736, "step": 15530 }, { "epoch": 0.518, "grad_norm": 31.375, "grad_norm_var": 8.4869140625, "learning_rate": 0.0001, "loss": 7.9189, "loss/crossentropy": 2.144743651151657, "loss/hidden": 3.662890625, "loss/jsd": 0.0, "loss/logits": 0.22745619527995586, "step": 15540 }, { "epoch": 0.5183333333333333, "grad_norm": 34.5, "grad_norm_var": 16.46640625, "learning_rate": 0.0001, "loss": 8.0041, "loss/crossentropy": 2.162237875163555, "loss/hidden": 3.667578125, "loss/jsd": 0.0, "loss/logits": 0.22348958030343055, "step": 15550 }, { "epoch": 0.5186666666666667, "grad_norm": 33.0, "grad_norm_var": 10.215625, "learning_rate": 0.0001, "loss": 7.9595, "loss/crossentropy": 2.1002556174993514, "loss/hidden": 3.69921875, "loss/jsd": 0.0, "loss/logits": 0.221992826461792, "step": 15560 }, { "epoch": 0.519, "grad_norm": 28.5, "grad_norm_var": 11.612239583333333, "learning_rate": 0.0001, "loss": 7.9435, "loss/crossentropy": 2.1175016567111014, "loss/hidden": 3.697265625, "loss/jsd": 0.0, "loss/logits": 0.2175672125071287, "step": 15570 }, { "epoch": 0.5193333333333333, "grad_norm": 51.75, "grad_norm_var": 32.14479166666667, "learning_rate": 0.0001, "loss": 7.8969, "loss/crossentropy": 2.1000607915222647, "loss/hidden": 3.578515625, "loss/jsd": 0.0, "loss/logits": 0.22374147176742554, "step": 15580 }, { "epoch": 0.5196666666666667, "grad_norm": 35.25, "grad_norm_var": 216.00826822916667, "learning_rate": 0.0001, "loss": 7.9352, "loss/crossentropy": 2.062793227285147, "loss/hidden": 3.69765625, "loss/jsd": 0.0, "loss/logits": 0.22876775842159985, "step": 15590 }, { "epoch": 0.52, "grad_norm": 29.875, "grad_norm_var": 209.046875, "learning_rate": 0.0001, "loss": 7.9366, "loss/crossentropy": 2.120285242795944, "loss/hidden": 3.648046875, "loss/jsd": 0.0, "loss/logits": 0.21306953616440297, "step": 15600 }, { "epoch": 0.5203333333333333, "grad_norm": 31.125, "grad_norm_var": 6.2775390625, "learning_rate": 0.0001, "loss": 7.8629, "loss/crossentropy": 2.0886681511998177, "loss/hidden": 3.725, "loss/jsd": 0.0, "loss/logits": 0.23063711524009706, "step": 15610 }, { "epoch": 0.5206666666666667, "grad_norm": 42.75, "grad_norm_var": 2.3565788485707105e+18, "learning_rate": 0.0001, "loss": 7.9075, "loss/crossentropy": 2.059058104455471, "loss/hidden": 3.790625, "loss/jsd": 0.0, "loss/logits": 0.2093208000063896, "step": 15620 }, { "epoch": 0.521, "grad_norm": 29.0, "grad_norm_var": 2.356578849024849e+18, "learning_rate": 0.0001, "loss": 8.0243, "loss/crossentropy": 2.0345511339604854, "loss/hidden": 3.615625, "loss/jsd": 0.0, "loss/logits": 0.20063564516603946, "step": 15630 }, { "epoch": 0.5213333333333333, "grad_norm": 34.0, "grad_norm_var": 994.1535807291667, "learning_rate": 0.0001, "loss": 7.9865, "loss/crossentropy": 1.9753425560891629, "loss/hidden": 3.65390625, "loss/jsd": 0.0, "loss/logits": 0.21483086440712212, "step": 15640 }, { "epoch": 0.5216666666666666, "grad_norm": 30.125, "grad_norm_var": 23.786393229166666, "learning_rate": 0.0001, "loss": 7.9327, "loss/crossentropy": 2.0674639120697975, "loss/hidden": 3.81171875, "loss/jsd": 0.0, "loss/logits": 0.2364231664687395, "step": 15650 }, { "epoch": 0.522, "grad_norm": 28.875, "grad_norm_var": 258.47083333333336, "learning_rate": 0.0001, "loss": 7.9931, "loss/crossentropy": 1.968726746737957, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.21212349347770215, "step": 15660 }, { "epoch": 0.5223333333333333, "grad_norm": 40.25, "grad_norm_var": 11.349739583333333, "learning_rate": 0.0001, "loss": 7.7757, "loss/crossentropy": 2.2387484058737757, "loss/hidden": 3.62109375, "loss/jsd": 0.0, "loss/logits": 0.21647185329347848, "step": 15670 }, { "epoch": 0.5226666666666666, "grad_norm": 27.5, "grad_norm_var": 13.4103515625, "learning_rate": 0.0001, "loss": 7.79, "loss/crossentropy": 2.063428644835949, "loss/hidden": 3.5640625, "loss/jsd": 0.0, "loss/logits": 0.21321867797523736, "step": 15680 }, { "epoch": 0.523, "grad_norm": 31.875, "grad_norm_var": 13.243489583333334, "learning_rate": 0.0001, "loss": 7.9336, "loss/crossentropy": 2.1281570941209793, "loss/hidden": 3.611328125, "loss/jsd": 0.0, "loss/logits": 0.22930383421480655, "step": 15690 }, { "epoch": 0.5233333333333333, "grad_norm": 30.5, "grad_norm_var": 13.87890625, "learning_rate": 0.0001, "loss": 7.9178, "loss/crossentropy": 2.07712532132864, "loss/hidden": 3.64375, "loss/jsd": 0.0, "loss/logits": 0.21440593730658292, "step": 15700 }, { "epoch": 0.5236666666666666, "grad_norm": 33.0, "grad_norm_var": 62.56848958333333, "learning_rate": 0.0001, "loss": 7.8597, "loss/crossentropy": 2.111837570369244, "loss/hidden": 3.63984375, "loss/jsd": 0.0, "loss/logits": 0.2352095700800419, "step": 15710 }, { "epoch": 0.524, "grad_norm": 30.25, "grad_norm_var": 59.56295572916667, "learning_rate": 0.0001, "loss": 7.8544, "loss/crossentropy": 1.9604034937918187, "loss/hidden": 3.65390625, "loss/jsd": 0.0, "loss/logits": 0.21035183649510145, "step": 15720 }, { "epoch": 0.5243333333333333, "grad_norm": 30.25, "grad_norm_var": 19.3744140625, "learning_rate": 0.0001, "loss": 8.0273, "loss/crossentropy": 2.1459563463926314, "loss/hidden": 3.573828125, "loss/jsd": 0.0, "loss/logits": 0.21267954409122466, "step": 15730 }, { "epoch": 0.5246666666666666, "grad_norm": 30.0, "grad_norm_var": 23.273372395833334, "learning_rate": 0.0001, "loss": 7.9939, "loss/crossentropy": 2.1543472737073897, "loss/hidden": 3.731640625, "loss/jsd": 0.0, "loss/logits": 0.2248241593129933, "step": 15740 }, { "epoch": 0.525, "grad_norm": 36.25, "grad_norm_var": 5.14765625, "learning_rate": 0.0001, "loss": 7.9856, "loss/crossentropy": 2.0191838264465334, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.22251855283975602, "step": 15750 }, { "epoch": 0.5253333333333333, "grad_norm": 34.75, "grad_norm_var": 6.240625, "learning_rate": 0.0001, "loss": 8.0418, "loss/crossentropy": 2.22621650993824, "loss/hidden": 3.591796875, "loss/jsd": 0.0, "loss/logits": 0.21183471530675888, "step": 15760 }, { "epoch": 0.5256666666666666, "grad_norm": 34.0, "grad_norm_var": 8.637434895833334, "learning_rate": 0.0001, "loss": 7.9608, "loss/crossentropy": 2.0110961377620695, "loss/hidden": 3.655078125, "loss/jsd": 0.0, "loss/logits": 0.22007959876209499, "step": 15770 }, { "epoch": 0.526, "grad_norm": 34.75, "grad_norm_var": 9.070572916666666, "learning_rate": 0.0001, "loss": 7.9034, "loss/crossentropy": 1.9873220488429069, "loss/hidden": 3.691015625, "loss/jsd": 0.0, "loss/logits": 0.2183900134637952, "step": 15780 }, { "epoch": 0.5263333333333333, "grad_norm": 30.375, "grad_norm_var": 5.1853515625, "learning_rate": 0.0001, "loss": 7.9344, "loss/crossentropy": 1.9563458181917668, "loss/hidden": 3.67890625, "loss/jsd": 0.0, "loss/logits": 0.20562523752450942, "step": 15790 }, { "epoch": 0.5266666666666666, "grad_norm": 30.25, "grad_norm_var": 7.5353515625, "learning_rate": 0.0001, "loss": 8.0172, "loss/crossentropy": 2.1347015112638474, "loss/hidden": 3.6546875, "loss/jsd": 0.0, "loss/logits": 0.22778294049203396, "step": 15800 }, { "epoch": 0.527, "grad_norm": 29.625, "grad_norm_var": 8.6978515625, "learning_rate": 0.0001, "loss": 7.9092, "loss/crossentropy": 2.0619646534323692, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.2228071277961135, "step": 15810 }, { "epoch": 0.5273333333333333, "grad_norm": 30.5, "grad_norm_var": 3.0259765625, "learning_rate": 0.0001, "loss": 7.9127, "loss/crossentropy": 2.079516027867794, "loss/hidden": 3.61953125, "loss/jsd": 0.0, "loss/logits": 0.20713965147733687, "step": 15820 }, { "epoch": 0.5276666666666666, "grad_norm": 30.875, "grad_norm_var": 4.55390625, "learning_rate": 0.0001, "loss": 7.9888, "loss/crossentropy": 2.1745656400918962, "loss/hidden": 3.707421875, "loss/jsd": 0.0, "loss/logits": 0.22695780582726002, "step": 15830 }, { "epoch": 0.528, "grad_norm": 29.25, "grad_norm_var": 5.205208333333333, "learning_rate": 0.0001, "loss": 7.9376, "loss/crossentropy": 2.1931827545166014, "loss/hidden": 3.61171875, "loss/jsd": 0.0, "loss/logits": 0.21837961710989476, "step": 15840 }, { "epoch": 0.5283333333333333, "grad_norm": 37.0, "grad_norm_var": 5.242708333333334, "learning_rate": 0.0001, "loss": 7.973, "loss/crossentropy": 2.1738994657993316, "loss/hidden": 3.61953125, "loss/jsd": 0.0, "loss/logits": 0.21239091642200947, "step": 15850 }, { "epoch": 0.5286666666666666, "grad_norm": 32.0, "grad_norm_var": 2.6757714712718213e+18, "learning_rate": 0.0001, "loss": 7.8871, "loss/crossentropy": 2.017245587706566, "loss/hidden": 3.933203125, "loss/jsd": 0.0, "loss/logits": 0.2155450826510787, "step": 15860 }, { "epoch": 0.529, "grad_norm": 36.5, "grad_norm_var": 5.8291015625, "learning_rate": 0.0001, "loss": 8.0041, "loss/crossentropy": 2.173825052380562, "loss/hidden": 3.65390625, "loss/jsd": 0.0, "loss/logits": 0.22582603432238102, "step": 15870 }, { "epoch": 0.5293333333333333, "grad_norm": 30.375, "grad_norm_var": 5.437434895833333, "learning_rate": 0.0001, "loss": 7.8263, "loss/crossentropy": 1.9266249172389507, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.18464618194848298, "step": 15880 }, { "epoch": 0.5296666666666666, "grad_norm": 32.5, "grad_norm_var": 5.3228515625, "learning_rate": 0.0001, "loss": 7.8421, "loss/crossentropy": 2.084079180657864, "loss/hidden": 3.684375, "loss/jsd": 0.0, "loss/logits": 0.21603441275656224, "step": 15890 }, { "epoch": 0.53, "grad_norm": 31.75, "grad_norm_var": 145.74479166666666, "learning_rate": 0.0001, "loss": 7.9261, "loss/crossentropy": 2.1224375024437903, "loss/hidden": 3.7546875, "loss/jsd": 0.0, "loss/logits": 0.2454449266195297, "step": 15900 }, { "epoch": 0.5303333333333333, "grad_norm": 32.75, "grad_norm_var": 4.843684895833333, "learning_rate": 0.0001, "loss": 8.0083, "loss/crossentropy": 2.198984383046627, "loss/hidden": 3.601171875, "loss/jsd": 0.0, "loss/logits": 0.22642315719276668, "step": 15910 }, { "epoch": 0.5306666666666666, "grad_norm": 32.0, "grad_norm_var": 2.9103515625, "learning_rate": 0.0001, "loss": 8.0344, "loss/crossentropy": 2.021199995279312, "loss/hidden": 3.823046875, "loss/jsd": 0.0, "loss/logits": 0.23787404000759124, "step": 15920 }, { "epoch": 0.531, "grad_norm": 34.0, "grad_norm_var": 3.325455729166667, "learning_rate": 0.0001, "loss": 7.9977, "loss/crossentropy": 2.1375532552599905, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.21390576139092446, "step": 15930 }, { "epoch": 0.5313333333333333, "grad_norm": 30.875, "grad_norm_var": 3.0072916666666667, "learning_rate": 0.0001, "loss": 7.8004, "loss/crossentropy": 2.2248755604028703, "loss/hidden": 3.55859375, "loss/jsd": 0.0, "loss/logits": 0.21414327146485448, "step": 15940 }, { "epoch": 0.5316666666666666, "grad_norm": 28.625, "grad_norm_var": 7.127083333333333, "learning_rate": 0.0001, "loss": 7.8167, "loss/crossentropy": 2.024421763420105, "loss/hidden": 3.63671875, "loss/jsd": 0.0, "loss/logits": 0.2099402707070112, "step": 15950 }, { "epoch": 0.532, "grad_norm": 28.75, "grad_norm_var": 7.255989583333333, "learning_rate": 0.0001, "loss": 7.9159, "loss/crossentropy": 2.0035487972199917, "loss/hidden": 3.780078125, "loss/jsd": 0.0, "loss/logits": 0.23340979432687164, "step": 15960 }, { "epoch": 0.5323333333333333, "grad_norm": 31.75, "grad_norm_var": 28.76640625, "learning_rate": 0.0001, "loss": 8.009, "loss/crossentropy": 2.155665622651577, "loss/hidden": 3.703515625, "loss/jsd": 0.0, "loss/logits": 0.23035227973014116, "step": 15970 }, { "epoch": 0.5326666666666666, "grad_norm": 45.25, "grad_norm_var": 19.4119140625, "learning_rate": 0.0001, "loss": 8.0717, "loss/crossentropy": 2.0463739298284054, "loss/hidden": 3.700390625, "loss/jsd": 0.0, "loss/logits": 0.24000506065785884, "step": 15980 }, { "epoch": 0.533, "grad_norm": 32.75, "grad_norm_var": 20.661458333333332, "learning_rate": 0.0001, "loss": 7.9986, "loss/crossentropy": 2.107135473191738, "loss/hidden": 3.735546875, "loss/jsd": 0.0, "loss/logits": 0.22552732955664395, "step": 15990 }, { "epoch": 0.5333333333333333, "grad_norm": 28.625, "grad_norm_var": 13.914583333333333, "learning_rate": 0.0001, "loss": 7.9784, "loss/crossentropy": 2.142227107286453, "loss/hidden": 3.600390625, "loss/jsd": 0.0, "loss/logits": 0.21058713737875223, "step": 16000 }, { "epoch": 0.5336666666666666, "grad_norm": 33.0, "grad_norm_var": 18.480208333333334, "learning_rate": 0.0001, "loss": 7.9253, "loss/crossentropy": 2.209307189285755, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.22212357576936484, "step": 16010 }, { "epoch": 0.534, "grad_norm": 27.625, "grad_norm_var": 7.594791666666667, "learning_rate": 0.0001, "loss": 7.8662, "loss/crossentropy": 2.009366624057293, "loss/hidden": 3.6296875, "loss/jsd": 0.0, "loss/logits": 0.22330178935080766, "step": 16020 }, { "epoch": 0.5343333333333333, "grad_norm": 29.625, "grad_norm_var": 11.910872395833334, "learning_rate": 0.0001, "loss": 8.0186, "loss/crossentropy": 2.1544925197958946, "loss/hidden": 3.6453125, "loss/jsd": 0.0, "loss/logits": 0.22340870313346387, "step": 16030 }, { "epoch": 0.5346666666666666, "grad_norm": 30.75, "grad_norm_var": 14.717122395833334, "learning_rate": 0.0001, "loss": 7.8662, "loss/crossentropy": 2.2394671350717545, "loss/hidden": 3.53671875, "loss/jsd": 0.0, "loss/logits": 0.21254578419029713, "step": 16040 }, { "epoch": 0.535, "grad_norm": 32.5, "grad_norm_var": 4.167708333333334, "learning_rate": 0.0001, "loss": 7.9273, "loss/crossentropy": 2.0182781517505646, "loss/hidden": 3.722265625, "loss/jsd": 0.0, "loss/logits": 0.23184970542788505, "step": 16050 }, { "epoch": 0.5353333333333333, "grad_norm": 34.75, "grad_norm_var": 3.883072916666667, "learning_rate": 0.0001, "loss": 7.9456, "loss/crossentropy": 2.200615034997463, "loss/hidden": 3.752734375, "loss/jsd": 0.0, "loss/logits": 0.2355814663693309, "step": 16060 }, { "epoch": 0.5356666666666666, "grad_norm": 31.0, "grad_norm_var": 10.541666666666666, "learning_rate": 0.0001, "loss": 7.9085, "loss/crossentropy": 1.7848753452301025, "loss/hidden": 3.684375, "loss/jsd": 0.0, "loss/logits": 0.2178065821528435, "step": 16070 }, { "epoch": 0.536, "grad_norm": 30.375, "grad_norm_var": 9.554622395833333, "learning_rate": 0.0001, "loss": 7.9787, "loss/crossentropy": 2.126433804631233, "loss/hidden": 3.66640625, "loss/jsd": 0.0, "loss/logits": 0.2092861395329237, "step": 16080 }, { "epoch": 0.5363333333333333, "grad_norm": 31.875, "grad_norm_var": 2.609375, "learning_rate": 0.0001, "loss": 7.8776, "loss/crossentropy": 2.0730943456292152, "loss/hidden": 3.683984375, "loss/jsd": 0.0, "loss/logits": 0.22322714999318122, "step": 16090 }, { "epoch": 0.5366666666666666, "grad_norm": 31.5, "grad_norm_var": 4.9822265625, "learning_rate": 0.0001, "loss": 7.9642, "loss/crossentropy": 2.1567077368497847, "loss/hidden": 3.841015625, "loss/jsd": 0.0, "loss/logits": 0.24368763864040374, "step": 16100 }, { "epoch": 0.537, "grad_norm": 27.875, "grad_norm_var": 2.101822916666667, "learning_rate": 0.0001, "loss": 7.9671, "loss/crossentropy": 2.0234420910477637, "loss/hidden": 3.687109375, "loss/jsd": 0.0, "loss/logits": 0.20963803213089705, "step": 16110 }, { "epoch": 0.5373333333333333, "grad_norm": 31.125, "grad_norm_var": 2.290625, "learning_rate": 0.0001, "loss": 7.8891, "loss/crossentropy": 2.162296248972416, "loss/hidden": 3.69765625, "loss/jsd": 0.0, "loss/logits": 0.21969048418104647, "step": 16120 }, { "epoch": 0.5376666666666666, "grad_norm": 28.125, "grad_norm_var": 4.483072916666667, "learning_rate": 0.0001, "loss": 7.7938, "loss/crossentropy": 2.0626881010830402, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.20465954188257457, "step": 16130 }, { "epoch": 0.538, "grad_norm": 31.25, "grad_norm_var": 2.715625, "learning_rate": 0.0001, "loss": 7.9009, "loss/crossentropy": 2.1950813859701155, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.21106129735708237, "step": 16140 }, { "epoch": 0.5383333333333333, "grad_norm": 33.75, "grad_norm_var": 2.15, "learning_rate": 0.0001, "loss": 7.8827, "loss/crossentropy": 2.0365090548992155, "loss/hidden": 3.5890625, "loss/jsd": 0.0, "loss/logits": 0.19862626306712627, "step": 16150 }, { "epoch": 0.5386666666666666, "grad_norm": 29.875, "grad_norm_var": 2.46015625, "learning_rate": 0.0001, "loss": 8.0171, "loss/crossentropy": 2.0772834539413454, "loss/hidden": 3.652734375, "loss/jsd": 0.0, "loss/logits": 0.22482867874205112, "step": 16160 }, { "epoch": 0.539, "grad_norm": 31.5, "grad_norm_var": 13.727083333333333, "learning_rate": 0.0001, "loss": 7.9269, "loss/crossentropy": 2.0035090267658235, "loss/hidden": 3.639453125, "loss/jsd": 0.0, "loss/logits": 0.21533504147082566, "step": 16170 }, { "epoch": 0.5393333333333333, "grad_norm": 29.625, "grad_norm_var": 3.4058471876790845e+18, "learning_rate": 0.0001, "loss": 7.9526, "loss/crossentropy": 2.0684109210968016, "loss/hidden": 3.809765625, "loss/jsd": 0.0, "loss/logits": 0.24036269690841436, "step": 16180 }, { "epoch": 0.5396666666666666, "grad_norm": 29.75, "grad_norm_var": 1.0208333333333333, "learning_rate": 0.0001, "loss": 7.9665, "loss/crossentropy": 2.0767479740083217, "loss/hidden": 3.6546875, "loss/jsd": 0.0, "loss/logits": 0.22391563206911086, "step": 16190 }, { "epoch": 0.54, "grad_norm": 32.75, "grad_norm_var": 0.9249348958333333, "learning_rate": 0.0001, "loss": 7.8065, "loss/crossentropy": 2.070406360924244, "loss/hidden": 3.697265625, "loss/jsd": 0.0, "loss/logits": 0.22352562863379716, "step": 16200 }, { "epoch": 0.5403333333333333, "grad_norm": 31.125, "grad_norm_var": 2.8259765625, "learning_rate": 0.0001, "loss": 7.7915, "loss/crossentropy": 2.135431842878461, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.20899291220121086, "step": 16210 }, { "epoch": 0.5406666666666666, "grad_norm": 27.75, "grad_norm_var": 5.374934895833333, "learning_rate": 0.0001, "loss": 7.8726, "loss/crossentropy": 1.9553619243204594, "loss/hidden": 3.6859375, "loss/jsd": 0.0, "loss/logits": 0.21065005520358682, "step": 16220 }, { "epoch": 0.541, "grad_norm": 32.25, "grad_norm_var": 2.9205729166666665, "learning_rate": 0.0001, "loss": 7.9361, "loss/crossentropy": 2.235548512637615, "loss/hidden": 3.632421875, "loss/jsd": 0.0, "loss/logits": 0.23148855995386838, "step": 16230 }, { "epoch": 0.5413333333333333, "grad_norm": 28.75, "grad_norm_var": 5.443489583333333, "learning_rate": 0.0001, "loss": 7.9547, "loss/crossentropy": 2.0689282923936845, "loss/hidden": 3.73359375, "loss/jsd": 0.0, "loss/logits": 0.22652086950838565, "step": 16240 }, { "epoch": 0.5416666666666666, "grad_norm": 31.875, "grad_norm_var": 4.34765625, "learning_rate": 0.0001, "loss": 7.903, "loss/crossentropy": 1.9587130278348923, "loss/hidden": 3.82578125, "loss/jsd": 0.0, "loss/logits": 0.24109735526144505, "step": 16250 }, { "epoch": 0.542, "grad_norm": 31.0, "grad_norm_var": 2.956184895833333, "learning_rate": 0.0001, "loss": 7.9763, "loss/crossentropy": 2.231505811214447, "loss/hidden": 3.670703125, "loss/jsd": 0.0, "loss/logits": 0.22332519851624966, "step": 16260 }, { "epoch": 0.5423333333333333, "grad_norm": 30.875, "grad_norm_var": 2.87890625, "learning_rate": 0.0001, "loss": 7.9534, "loss/crossentropy": 2.21473398655653, "loss/hidden": 3.684765625, "loss/jsd": 0.0, "loss/logits": 0.2405968852341175, "step": 16270 }, { "epoch": 0.5426666666666666, "grad_norm": 31.125, "grad_norm_var": 2.044205729166667, "learning_rate": 0.0001, "loss": 7.8368, "loss/crossentropy": 2.0223120510578156, "loss/hidden": 3.600390625, "loss/jsd": 0.0, "loss/logits": 0.20706812832504512, "step": 16280 }, { "epoch": 0.543, "grad_norm": 31.125, "grad_norm_var": 3.6705729166666665, "learning_rate": 0.0001, "loss": 8.1192, "loss/crossentropy": 2.2137394294142725, "loss/hidden": 3.80234375, "loss/jsd": 0.0, "loss/logits": 0.24676046203821897, "step": 16290 }, { "epoch": 0.5433333333333333, "grad_norm": 31.875, "grad_norm_var": 2.3997395833333335, "learning_rate": 0.0001, "loss": 8.0137, "loss/crossentropy": 2.1324679240584374, "loss/hidden": 3.715234375, "loss/jsd": 0.0, "loss/logits": 0.21872996147722007, "step": 16300 }, { "epoch": 0.5436666666666666, "grad_norm": 30.875, "grad_norm_var": 6.243489583333333, "learning_rate": 0.0001, "loss": 8.0561, "loss/crossentropy": 2.1422536253929136, "loss/hidden": 3.705078125, "loss/jsd": 0.0, "loss/logits": 0.24038595696911216, "step": 16310 }, { "epoch": 0.544, "grad_norm": 28.875, "grad_norm_var": 15.54375, "learning_rate": 0.0001, "loss": 7.9935, "loss/crossentropy": 1.994092260301113, "loss/hidden": 3.68359375, "loss/jsd": 0.0, "loss/logits": 0.20646399296820164, "step": 16320 }, { "epoch": 0.5443333333333333, "grad_norm": 34.5, "grad_norm_var": 8.833072916666667, "learning_rate": 0.0001, "loss": 7.8759, "loss/crossentropy": 2.1263050198554994, "loss/hidden": 3.73359375, "loss/jsd": 0.0, "loss/logits": 0.23247295394539833, "step": 16330 }, { "epoch": 0.5446666666666666, "grad_norm": 32.25, "grad_norm_var": 3.8926432291666666, "learning_rate": 0.0001, "loss": 7.8947, "loss/crossentropy": 2.1508326224982737, "loss/hidden": 3.615625, "loss/jsd": 0.0, "loss/logits": 0.22677485179156065, "step": 16340 }, { "epoch": 0.545, "grad_norm": 33.75, "grad_norm_var": 3.565625, "learning_rate": 0.0001, "loss": 7.9352, "loss/crossentropy": 2.146810656785965, "loss/hidden": 3.7203125, "loss/jsd": 0.0, "loss/logits": 0.22902542352676392, "step": 16350 }, { "epoch": 0.5453333333333333, "grad_norm": 32.25, "grad_norm_var": 2.5322916666666666, "learning_rate": 0.0001, "loss": 7.895, "loss/crossentropy": 2.1730375468730925, "loss/hidden": 3.6609375, "loss/jsd": 0.0, "loss/logits": 0.22302960231900215, "step": 16360 }, { "epoch": 0.5456666666666666, "grad_norm": 28.875, "grad_norm_var": 2.1747395833333334, "learning_rate": 0.0001, "loss": 7.965, "loss/crossentropy": 2.1157036066055297, "loss/hidden": 3.708203125, "loss/jsd": 0.0, "loss/logits": 0.24063412323594094, "step": 16370 }, { "epoch": 0.546, "grad_norm": 29.75, "grad_norm_var": 6.789322916666666, "learning_rate": 0.0001, "loss": 7.9686, "loss/crossentropy": 2.04927616417408, "loss/hidden": 3.698828125, "loss/jsd": 0.0, "loss/logits": 0.21224019005894662, "step": 16380 }, { "epoch": 0.5463333333333333, "grad_norm": 31.125, "grad_norm_var": 5.550455729166667, "learning_rate": 0.0001, "loss": 7.9747, "loss/crossentropy": 2.068675779551268, "loss/hidden": 3.584765625, "loss/jsd": 0.0, "loss/logits": 0.2084495802409947, "step": 16390 }, { "epoch": 0.5466666666666666, "grad_norm": 30.875, "grad_norm_var": 1.5639973958333333, "learning_rate": 0.0001, "loss": 7.988, "loss/crossentropy": 2.0940013602375984, "loss/hidden": 3.691796875, "loss/jsd": 0.0, "loss/logits": 0.22360229659825565, "step": 16400 }, { "epoch": 0.547, "grad_norm": 31.625, "grad_norm_var": 1.7455729166666667, "learning_rate": 0.0001, "loss": 8.0907, "loss/crossentropy": 2.1774737536907196, "loss/hidden": 3.6859375, "loss/jsd": 0.0, "loss/logits": 0.22327901497483255, "step": 16410 }, { "epoch": 0.5473333333333333, "grad_norm": 29.875, "grad_norm_var": 4.50625, "learning_rate": 0.0001, "loss": 8.0022, "loss/crossentropy": 2.1608075901865957, "loss/hidden": 3.700390625, "loss/jsd": 0.0, "loss/logits": 0.25244998149573805, "step": 16420 }, { "epoch": 0.5476666666666666, "grad_norm": 31.875, "grad_norm_var": 3.2671223958333333, "learning_rate": 0.0001, "loss": 8.1234, "loss/crossentropy": 1.982553929835558, "loss/hidden": 3.565625, "loss/jsd": 0.0, "loss/logits": 0.20166566986590623, "step": 16430 }, { "epoch": 0.548, "grad_norm": 32.5, "grad_norm_var": 1.7520833333333334, "learning_rate": 0.0001, "loss": 8.037, "loss/crossentropy": 2.0213874965906142, "loss/hidden": 3.787109375, "loss/jsd": 0.0, "loss/logits": 0.23120209593325852, "step": 16440 }, { "epoch": 0.5483333333333333, "grad_norm": 32.5, "grad_norm_var": 2.7625, "learning_rate": 0.0001, "loss": 7.9356, "loss/crossentropy": 2.0151995003223417, "loss/hidden": 3.6828125, "loss/jsd": 0.0, "loss/logits": 0.21384722124785185, "step": 16450 }, { "epoch": 0.5486666666666666, "grad_norm": 30.75, "grad_norm_var": 2.8580729166666665, "learning_rate": 0.0001, "loss": 8.0426, "loss/crossentropy": 2.258426922559738, "loss/hidden": 3.57109375, "loss/jsd": 0.0, "loss/logits": 0.21724292561411856, "step": 16460 }, { "epoch": 0.549, "grad_norm": 30.625, "grad_norm_var": 2.6211653255110676e+18, "learning_rate": 0.0001, "loss": 7.8979, "loss/crossentropy": 2.1004390507936477, "loss/hidden": 3.63125, "loss/jsd": 0.0, "loss/logits": 0.2168941769748926, "step": 16470 }, { "epoch": 0.5493333333333333, "grad_norm": 32.0, "grad_norm_var": 21.358072916666668, "learning_rate": 0.0001, "loss": 7.9368, "loss/crossentropy": 2.234841299057007, "loss/hidden": 3.671484375, "loss/jsd": 0.0, "loss/logits": 0.22458531036973, "step": 16480 }, { "epoch": 0.5496666666666666, "grad_norm": 31.0, "grad_norm_var": 19.398372395833334, "learning_rate": 0.0001, "loss": 7.7756, "loss/crossentropy": 1.9220365844666958, "loss/hidden": 3.650390625, "loss/jsd": 0.0, "loss/logits": 0.21966882292181253, "step": 16490 }, { "epoch": 0.55, "grad_norm": 29.25, "grad_norm_var": 10.578059895833333, "learning_rate": 0.0001, "loss": 7.9483, "loss/crossentropy": 2.137505892664194, "loss/hidden": 3.684375, "loss/jsd": 0.0, "loss/logits": 0.20756643787026405, "step": 16500 }, { "epoch": 0.5503333333333333, "grad_norm": 30.5, "grad_norm_var": 4.569791666666666, "learning_rate": 0.0001, "loss": 7.842, "loss/crossentropy": 2.163301798701286, "loss/hidden": 3.641015625, "loss/jsd": 0.0, "loss/logits": 0.21592859141528606, "step": 16510 }, { "epoch": 0.5506666666666666, "grad_norm": 30.25, "grad_norm_var": 1.9749348958333333, "learning_rate": 0.0001, "loss": 7.9053, "loss/crossentropy": 1.8833866529166698, "loss/hidden": 3.783203125, "loss/jsd": 0.0, "loss/logits": 0.22102598939090967, "step": 16520 }, { "epoch": 0.551, "grad_norm": 30.5, "grad_norm_var": 2.0176432291666666, "learning_rate": 0.0001, "loss": 7.8003, "loss/crossentropy": 1.942694688588381, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.21318345218896867, "step": 16530 }, { "epoch": 0.5513333333333333, "grad_norm": 33.0, "grad_norm_var": 2.539322916666667, "learning_rate": 0.0001, "loss": 7.7828, "loss/crossentropy": 2.0692317470908166, "loss/hidden": 3.541015625, "loss/jsd": 0.0, "loss/logits": 0.20164419133216144, "step": 16540 }, { "epoch": 0.5516666666666666, "grad_norm": 29.5, "grad_norm_var": 2.595572916666667, "learning_rate": 0.0001, "loss": 7.9161, "loss/crossentropy": 2.046563369035721, "loss/hidden": 3.6890625, "loss/jsd": 0.0, "loss/logits": 0.22405224461108447, "step": 16550 }, { "epoch": 0.552, "grad_norm": 34.75, "grad_norm_var": 3.570572916666667, "learning_rate": 0.0001, "loss": 7.9911, "loss/crossentropy": 2.018775662779808, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.21541824340820312, "step": 16560 }, { "epoch": 0.5523333333333333, "grad_norm": 29.875, "grad_norm_var": 3.6080729166666665, "learning_rate": 0.0001, "loss": 7.9031, "loss/crossentropy": 2.25411321669817, "loss/hidden": 3.666015625, "loss/jsd": 0.0, "loss/logits": 0.2227392230182886, "step": 16570 }, { "epoch": 0.5526666666666666, "grad_norm": 30.25, "grad_norm_var": 2.8603515625, "learning_rate": 0.0001, "loss": 7.8556, "loss/crossentropy": 2.0392999947071075, "loss/hidden": 3.684375, "loss/jsd": 0.0, "loss/logits": 0.2251502934843302, "step": 16580 }, { "epoch": 0.553, "grad_norm": 30.75, "grad_norm_var": 3.153125, "learning_rate": 0.0001, "loss": 7.7396, "loss/crossentropy": 1.9471760131418705, "loss/hidden": 3.534765625, "loss/jsd": 0.0, "loss/logits": 0.1881936783902347, "step": 16590 }, { "epoch": 0.5533333333333333, "grad_norm": 32.25, "grad_norm_var": 4.223883836303868e+18, "learning_rate": 0.0001, "loss": 7.9172, "loss/crossentropy": 2.0934195905923842, "loss/hidden": 3.65703125, "loss/jsd": 0.0, "loss/logits": 0.217781463265419, "step": 16600 }, { "epoch": 0.5536666666666666, "grad_norm": 30.125, "grad_norm_var": 4.223883835918516e+18, "learning_rate": 0.0001, "loss": 7.8719, "loss/crossentropy": 2.053102213144302, "loss/hidden": 3.6890625, "loss/jsd": 0.0, "loss/logits": 0.22234885692596434, "step": 16610 }, { "epoch": 0.554, "grad_norm": 32.0, "grad_norm_var": 4.234309895833333, "learning_rate": 0.0001, "loss": 7.9785, "loss/crossentropy": 2.0579131454229356, "loss/hidden": 3.679296875, "loss/jsd": 0.0, "loss/logits": 0.22978297639638184, "step": 16620 }, { "epoch": 0.5543333333333333, "grad_norm": 29.0, "grad_norm_var": 6.4478515625, "learning_rate": 0.0001, "loss": 7.9221, "loss/crossentropy": 2.098356659710407, "loss/hidden": 3.72421875, "loss/jsd": 0.0, "loss/logits": 0.22436762768775226, "step": 16630 }, { "epoch": 0.5546666666666666, "grad_norm": 29.25, "grad_norm_var": 3.4427083333333335, "learning_rate": 0.0001, "loss": 7.8154, "loss/crossentropy": 1.9613312944769858, "loss/hidden": 3.689453125, "loss/jsd": 0.0, "loss/logits": 0.21493730675429107, "step": 16640 }, { "epoch": 0.555, "grad_norm": 29.5, "grad_norm_var": 1.4729166666666667, "learning_rate": 0.0001, "loss": 7.9289, "loss/crossentropy": 1.970650000870228, "loss/hidden": 3.795703125, "loss/jsd": 0.0, "loss/logits": 0.22653428725898267, "step": 16650 }, { "epoch": 0.5553333333333333, "grad_norm": 31.75, "grad_norm_var": 2.425, "learning_rate": 0.0001, "loss": 7.8909, "loss/crossentropy": 2.1313917048275473, "loss/hidden": 3.67421875, "loss/jsd": 0.0, "loss/logits": 0.2265587305650115, "step": 16660 }, { "epoch": 0.5556666666666666, "grad_norm": 32.5, "grad_norm_var": 3.2223307291666665, "learning_rate": 0.0001, "loss": 7.8742, "loss/crossentropy": 2.010394226014614, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.19476988408714532, "step": 16670 }, { "epoch": 0.556, "grad_norm": 33.25, "grad_norm_var": 2.989518229166667, "learning_rate": 0.0001, "loss": 7.9211, "loss/crossentropy": 2.206305223703384, "loss/hidden": 3.595703125, "loss/jsd": 0.0, "loss/logits": 0.22279220167547464, "step": 16680 }, { "epoch": 0.5563333333333333, "grad_norm": 29.5, "grad_norm_var": 7.698372395833333, "learning_rate": 0.0001, "loss": 7.8795, "loss/crossentropy": 2.1127166926860808, "loss/hidden": 3.58203125, "loss/jsd": 0.0, "loss/logits": 0.2053209213539958, "step": 16690 }, { "epoch": 0.5566666666666666, "grad_norm": 30.25, "grad_norm_var": 3.4082682291666666, "learning_rate": 0.0001, "loss": 7.9118, "loss/crossentropy": 1.980313377827406, "loss/hidden": 3.665234375, "loss/jsd": 0.0, "loss/logits": 0.19750017933547498, "step": 16700 }, { "epoch": 0.557, "grad_norm": 32.0, "grad_norm_var": 4.745572916666666, "learning_rate": 0.0001, "loss": 7.8491, "loss/crossentropy": 2.187881177663803, "loss/hidden": 3.666796875, "loss/jsd": 0.0, "loss/logits": 0.2327125236392021, "step": 16710 }, { "epoch": 0.5573333333333333, "grad_norm": 30.125, "grad_norm_var": 3.0895182291666665, "learning_rate": 0.0001, "loss": 7.8578, "loss/crossentropy": 2.0527888640761374, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.24014483857899904, "step": 16720 }, { "epoch": 0.5576666666666666, "grad_norm": 29.125, "grad_norm_var": 2.315625, "learning_rate": 0.0001, "loss": 7.7581, "loss/crossentropy": 1.989125171303749, "loss/hidden": 3.584765625, "loss/jsd": 0.0, "loss/logits": 0.20171179957687854, "step": 16730 }, { "epoch": 0.558, "grad_norm": 34.0, "grad_norm_var": 3.3030598958333335, "learning_rate": 0.0001, "loss": 7.8911, "loss/crossentropy": 1.909910400211811, "loss/hidden": 3.766015625, "loss/jsd": 0.0, "loss/logits": 0.2186163429170847, "step": 16740 }, { "epoch": 0.5583333333333333, "grad_norm": 30.875, "grad_norm_var": 3.330208333333333, "learning_rate": 0.0001, "loss": 7.8985, "loss/crossentropy": 2.0753150559961795, "loss/hidden": 3.61640625, "loss/jsd": 0.0, "loss/logits": 0.21899790465831756, "step": 16750 }, { "epoch": 0.5586666666666666, "grad_norm": 30.875, "grad_norm_var": 2.187239583333333, "learning_rate": 0.0001, "loss": 7.8861, "loss/crossentropy": 2.1016260892152787, "loss/hidden": 3.615625, "loss/jsd": 0.0, "loss/logits": 0.210488342307508, "step": 16760 }, { "epoch": 0.559, "grad_norm": 31.0, "grad_norm_var": 2.8421223958333335, "learning_rate": 0.0001, "loss": 8.0766, "loss/crossentropy": 2.208987255394459, "loss/hidden": 3.822265625, "loss/jsd": 0.0, "loss/logits": 0.245903043076396, "step": 16770 }, { "epoch": 0.5593333333333333, "grad_norm": 36.25, "grad_norm_var": 320.3322265625, "learning_rate": 0.0001, "loss": 7.9084, "loss/crossentropy": 2.0669932678341865, "loss/hidden": 3.662109375, "loss/jsd": 0.0, "loss/logits": 0.21224353071302177, "step": 16780 }, { "epoch": 0.5596666666666666, "grad_norm": 43.5, "grad_norm_var": 305.8958333333333, "learning_rate": 0.0001, "loss": 8.0134, "loss/crossentropy": 2.2563454896211623, "loss/hidden": 3.6125, "loss/jsd": 0.0, "loss/logits": 0.22311148904263972, "step": 16790 }, { "epoch": 0.56, "grad_norm": 32.75, "grad_norm_var": 20.851497395833334, "learning_rate": 0.0001, "loss": 8.0603, "loss/crossentropy": 2.2054395377635956, "loss/hidden": 3.583203125, "loss/jsd": 0.0, "loss/logits": 0.20811190865933896, "step": 16800 }, { "epoch": 0.5603333333333333, "grad_norm": 28.25, "grad_norm_var": 2.448958333333333, "learning_rate": 0.0001, "loss": 7.7593, "loss/crossentropy": 2.236998660862446, "loss/hidden": 3.611328125, "loss/jsd": 0.0, "loss/logits": 0.22802891582250595, "step": 16810 }, { "epoch": 0.5606666666666666, "grad_norm": 31.5, "grad_norm_var": 3.7239583333333335, "learning_rate": 0.0001, "loss": 7.7787, "loss/crossentropy": 1.884455829113722, "loss/hidden": 3.53828125, "loss/jsd": 0.0, "loss/logits": 0.19550493340939284, "step": 16820 }, { "epoch": 0.561, "grad_norm": 32.25, "grad_norm_var": 3.5822265625, "learning_rate": 0.0001, "loss": 7.9028, "loss/crossentropy": 2.0985020123422147, "loss/hidden": 3.604296875, "loss/jsd": 0.0, "loss/logits": 0.2114247432909906, "step": 16830 }, { "epoch": 0.5613333333333334, "grad_norm": 29.75, "grad_norm_var": 2.0468098958333334, "learning_rate": 0.0001, "loss": 7.9145, "loss/crossentropy": 1.935218346118927, "loss/hidden": 3.733203125, "loss/jsd": 0.0, "loss/logits": 0.22361029600724577, "step": 16840 }, { "epoch": 0.5616666666666666, "grad_norm": 29.75, "grad_norm_var": 2.8337890625, "learning_rate": 0.0001, "loss": 7.8986, "loss/crossentropy": 1.9841939061880112, "loss/hidden": 3.7, "loss/jsd": 0.0, "loss/logits": 0.20547962225973607, "step": 16850 }, { "epoch": 0.562, "grad_norm": 33.0, "grad_norm_var": 10.559830729166666, "learning_rate": 0.0001, "loss": 7.9661, "loss/crossentropy": 2.242799472808838, "loss/hidden": 3.653515625, "loss/jsd": 0.0, "loss/logits": 0.24114026986062526, "step": 16860 }, { "epoch": 0.5623333333333334, "grad_norm": 30.75, "grad_norm_var": 10.02265625, "learning_rate": 0.0001, "loss": 7.9501, "loss/crossentropy": 2.272703301906586, "loss/hidden": 3.5921875, "loss/jsd": 0.0, "loss/logits": 0.219810495339334, "step": 16870 }, { "epoch": 0.5626666666666666, "grad_norm": 28.75, "grad_norm_var": 8.073893229166666, "learning_rate": 0.0001, "loss": 7.8381, "loss/crossentropy": 1.99768578261137, "loss/hidden": 3.56796875, "loss/jsd": 0.0, "loss/logits": 0.229691500402987, "step": 16880 }, { "epoch": 0.563, "grad_norm": 30.625, "grad_norm_var": 3.9358723958333335, "learning_rate": 0.0001, "loss": 7.7505, "loss/crossentropy": 1.958486919105053, "loss/hidden": 3.501171875, "loss/jsd": 0.0, "loss/logits": 0.19877009009942412, "step": 16890 }, { "epoch": 0.5633333333333334, "grad_norm": 31.625, "grad_norm_var": 12.167643229166666, "learning_rate": 0.0001, "loss": 7.9282, "loss/crossentropy": 2.2899662777781487, "loss/hidden": 3.60703125, "loss/jsd": 0.0, "loss/logits": 0.22736097928136587, "step": 16900 }, { "epoch": 0.5636666666666666, "grad_norm": 31.625, "grad_norm_var": 1.9051432291666666, "learning_rate": 0.0001, "loss": 7.9337, "loss/crossentropy": 2.046753417700529, "loss/hidden": 3.581640625, "loss/jsd": 0.0, "loss/logits": 0.20713044237345457, "step": 16910 }, { "epoch": 0.564, "grad_norm": 31.5, "grad_norm_var": 12.576041666666667, "learning_rate": 0.0001, "loss": 7.8576, "loss/crossentropy": 2.137194776535034, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.22013001535087823, "step": 16920 }, { "epoch": 0.5643333333333334, "grad_norm": 31.25, "grad_norm_var": 15.774934895833333, "learning_rate": 0.0001, "loss": 7.913, "loss/crossentropy": 1.9944169409573078, "loss/hidden": 3.628125, "loss/jsd": 0.0, "loss/logits": 0.21307808943092824, "step": 16930 }, { "epoch": 0.5646666666666667, "grad_norm": 30.375, "grad_norm_var": 5.337955729166667, "learning_rate": 0.0001, "loss": 7.886, "loss/crossentropy": 2.018931347131729, "loss/hidden": 3.6046875, "loss/jsd": 0.0, "loss/logits": 0.21659799050539733, "step": 16940 }, { "epoch": 0.565, "grad_norm": 30.375, "grad_norm_var": 7.773958333333334, "learning_rate": 0.0001, "loss": 7.8816, "loss/crossentropy": 2.0590699821710587, "loss/hidden": 3.665234375, "loss/jsd": 0.0, "loss/logits": 0.20541296005249024, "step": 16950 }, { "epoch": 0.5653333333333334, "grad_norm": 31.5, "grad_norm_var": 5.34140625, "learning_rate": 0.0001, "loss": 7.9332, "loss/crossentropy": 2.048498646169901, "loss/hidden": 3.7015625, "loss/jsd": 0.0, "loss/logits": 0.21477829590439795, "step": 16960 }, { "epoch": 0.5656666666666667, "grad_norm": 28.875, "grad_norm_var": 2.6676432291666665, "learning_rate": 0.0001, "loss": 7.864, "loss/crossentropy": 2.0648021958768368, "loss/hidden": 3.712109375, "loss/jsd": 0.0, "loss/logits": 0.2224846264347434, "step": 16970 }, { "epoch": 0.566, "grad_norm": 33.5, "grad_norm_var": 2.24765625, "learning_rate": 0.0001, "loss": 7.7163, "loss/crossentropy": 2.107031860947609, "loss/hidden": 3.53671875, "loss/jsd": 0.0, "loss/logits": 0.20737082306295634, "step": 16980 }, { "epoch": 0.5663333333333334, "grad_norm": 32.0, "grad_norm_var": 7.234830729166666, "learning_rate": 0.0001, "loss": 8.0486, "loss/crossentropy": 2.139879457652569, "loss/hidden": 3.6375, "loss/jsd": 0.0, "loss/logits": 0.23415213711559774, "step": 16990 }, { "epoch": 0.5666666666666667, "grad_norm": 28.375, "grad_norm_var": 26.10625, "learning_rate": 0.0001, "loss": 7.9511, "loss/crossentropy": 2.0089096277952194, "loss/hidden": 3.590234375, "loss/jsd": 0.0, "loss/logits": 0.20133000621572136, "step": 17000 }, { "epoch": 0.567, "grad_norm": 32.5, "grad_norm_var": 28.608072916666668, "learning_rate": 0.0001, "loss": 7.8054, "loss/crossentropy": 1.9827151045203208, "loss/hidden": 3.64296875, "loss/jsd": 0.0, "loss/logits": 0.2022013606503606, "step": 17010 }, { "epoch": 0.5673333333333334, "grad_norm": 30.5, "grad_norm_var": 519.3660807291667, "learning_rate": 0.0001, "loss": 8.0306, "loss/crossentropy": 2.227891056239605, "loss/hidden": 3.672265625, "loss/jsd": 0.0, "loss/logits": 0.23921767324209214, "step": 17020 }, { "epoch": 0.5676666666666667, "grad_norm": 30.75, "grad_norm_var": 517.3622395833333, "learning_rate": 0.0001, "loss": 7.7689, "loss/crossentropy": 1.9673498637974263, "loss/hidden": 3.55078125, "loss/jsd": 0.0, "loss/logits": 0.20032266862690448, "step": 17030 }, { "epoch": 0.568, "grad_norm": 35.0, "grad_norm_var": 5.939583333333333, "learning_rate": 0.0001, "loss": 7.8396, "loss/crossentropy": 2.0264181800186636, "loss/hidden": 3.62734375, "loss/jsd": 0.0, "loss/logits": 0.22799574863165617, "step": 17040 }, { "epoch": 0.5683333333333334, "grad_norm": 30.75, "grad_norm_var": 4.541666666666667, "learning_rate": 0.0001, "loss": 7.8507, "loss/crossentropy": 2.1397699415683746, "loss/hidden": 3.669140625, "loss/jsd": 0.0, "loss/logits": 0.22351325545459985, "step": 17050 }, { "epoch": 0.5686666666666667, "grad_norm": 33.0, "grad_norm_var": 3.1244140625, "learning_rate": 0.0001, "loss": 7.9212, "loss/crossentropy": 2.0302638575434684, "loss/hidden": 3.717578125, "loss/jsd": 0.0, "loss/logits": 0.21876529976725578, "step": 17060 }, { "epoch": 0.569, "grad_norm": 36.75, "grad_norm_var": 6.9625, "learning_rate": 0.0001, "loss": 7.9896, "loss/crossentropy": 2.150805290043354, "loss/hidden": 3.660546875, "loss/jsd": 0.0, "loss/logits": 0.22219337709248066, "step": 17070 }, { "epoch": 0.5693333333333334, "grad_norm": 32.5, "grad_norm_var": 3.405847187879013e+18, "learning_rate": 0.0001, "loss": 8.0392, "loss/crossentropy": 2.158428954333067, "loss/hidden": 3.6515625, "loss/jsd": 0.0, "loss/logits": 0.22352358270436526, "step": 17080 }, { "epoch": 0.5696666666666667, "grad_norm": 31.25, "grad_norm_var": 3.405847188432661e+18, "learning_rate": 0.0001, "loss": 7.9175, "loss/crossentropy": 2.0420360594987867, "loss/hidden": 3.608203125, "loss/jsd": 0.0, "loss/logits": 0.20006783921271562, "step": 17090 }, { "epoch": 0.57, "grad_norm": 28.75, "grad_norm_var": 3.9212890625, "learning_rate": 0.0001, "loss": 7.9392, "loss/crossentropy": 2.012987617403269, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.21622594874352216, "step": 17100 }, { "epoch": 0.5703333333333334, "grad_norm": 33.75, "grad_norm_var": 4.261393229166667, "learning_rate": 0.0001, "loss": 7.891, "loss/crossentropy": 1.951448941975832, "loss/hidden": 3.6390625, "loss/jsd": 0.0, "loss/logits": 0.22058595921844243, "step": 17110 }, { "epoch": 0.5706666666666667, "grad_norm": 29.25, "grad_norm_var": 1.7567057291666666, "learning_rate": 0.0001, "loss": 7.8458, "loss/crossentropy": 2.106409525871277, "loss/hidden": 3.5765625, "loss/jsd": 0.0, "loss/logits": 0.20967069901525975, "step": 17120 }, { "epoch": 0.571, "grad_norm": 30.25, "grad_norm_var": 1.96875, "learning_rate": 0.0001, "loss": 7.7126, "loss/crossentropy": 2.0033893555402758, "loss/hidden": 3.51640625, "loss/jsd": 0.0, "loss/logits": 0.1892759721726179, "step": 17130 }, { "epoch": 0.5713333333333334, "grad_norm": 28.5, "grad_norm_var": 2.662239583333333, "learning_rate": 0.0001, "loss": 7.8506, "loss/crossentropy": 2.1353479593992235, "loss/hidden": 3.612890625, "loss/jsd": 0.0, "loss/logits": 0.21052795574069022, "step": 17140 }, { "epoch": 0.5716666666666667, "grad_norm": 32.5, "grad_norm_var": 2.648398029737392e+18, "learning_rate": 0.0001, "loss": 7.9248, "loss/crossentropy": 2.06460902094841, "loss/hidden": 3.50859375, "loss/jsd": 0.0, "loss/logits": 0.20914022997021675, "step": 17150 }, { "epoch": 0.572, "grad_norm": 30.5, "grad_norm_var": 2.648398030388348e+18, "learning_rate": 0.0001, "loss": 7.9167, "loss/crossentropy": 2.0927888706326483, "loss/hidden": 3.646484375, "loss/jsd": 0.0, "loss/logits": 0.23212119033560158, "step": 17160 }, { "epoch": 0.5723333333333334, "grad_norm": 29.375, "grad_norm_var": 5.845833333333333, "learning_rate": 0.0001, "loss": 7.7342, "loss/crossentropy": 1.9288011252880097, "loss/hidden": 3.569140625, "loss/jsd": 0.0, "loss/logits": 0.19227788979187607, "step": 17170 }, { "epoch": 0.5726666666666667, "grad_norm": 31.75, "grad_norm_var": 5.4416015625, "learning_rate": 0.0001, "loss": 7.8135, "loss/crossentropy": 2.0256782703101637, "loss/hidden": 3.76328125, "loss/jsd": 0.0, "loss/logits": 0.22770339585840702, "step": 17180 }, { "epoch": 0.573, "grad_norm": 30.75, "grad_norm_var": 3.3447265625, "learning_rate": 0.0001, "loss": 7.8143, "loss/crossentropy": 2.1895001590251923, "loss/hidden": 3.615625, "loss/jsd": 0.0, "loss/logits": 0.21934428252279758, "step": 17190 }, { "epoch": 0.5733333333333334, "grad_norm": 28.25, "grad_norm_var": 3.7504557291666667, "learning_rate": 0.0001, "loss": 7.7612, "loss/crossentropy": 2.141167312860489, "loss/hidden": 3.6140625, "loss/jsd": 0.0, "loss/logits": 0.2146891826763749, "step": 17200 }, { "epoch": 0.5736666666666667, "grad_norm": 33.25, "grad_norm_var": 4.369791666666667, "learning_rate": 0.0001, "loss": 7.7998, "loss/crossentropy": 2.129530963301659, "loss/hidden": 3.57578125, "loss/jsd": 0.0, "loss/logits": 0.21841056421399116, "step": 17210 }, { "epoch": 0.574, "grad_norm": 34.25, "grad_norm_var": 5.690559895833333, "learning_rate": 0.0001, "loss": 7.7933, "loss/crossentropy": 2.0486621774733065, "loss/hidden": 3.641015625, "loss/jsd": 0.0, "loss/logits": 0.23156200405210256, "step": 17220 }, { "epoch": 0.5743333333333334, "grad_norm": 32.5, "grad_norm_var": 1.8551432291666667, "learning_rate": 0.0001, "loss": 7.7678, "loss/crossentropy": 1.8859696760773659, "loss/hidden": 3.709765625, "loss/jsd": 0.0, "loss/logits": 0.22125184228643774, "step": 17230 }, { "epoch": 0.5746666666666667, "grad_norm": 30.75, "grad_norm_var": 0.84140625, "learning_rate": 0.0001, "loss": 7.9437, "loss/crossentropy": 2.071402122825384, "loss/hidden": 3.684375, "loss/jsd": 0.0, "loss/logits": 0.21905634058639406, "step": 17240 }, { "epoch": 0.575, "grad_norm": 30.375, "grad_norm_var": 1.1559895833333333, "learning_rate": 0.0001, "loss": 7.9371, "loss/crossentropy": 2.094747845083475, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.21283579124137758, "step": 17250 }, { "epoch": 0.5753333333333334, "grad_norm": 30.875, "grad_norm_var": 1.7791015625, "learning_rate": 0.0001, "loss": 7.8497, "loss/crossentropy": 2.0071739844977854, "loss/hidden": 3.646875, "loss/jsd": 0.0, "loss/logits": 0.2293556292541325, "step": 17260 }, { "epoch": 0.5756666666666667, "grad_norm": 36.75, "grad_norm_var": 5.1978515625, "learning_rate": 0.0001, "loss": 7.8883, "loss/crossentropy": 2.0315137624740602, "loss/hidden": 3.611328125, "loss/jsd": 0.0, "loss/logits": 0.21106694657355546, "step": 17270 }, { "epoch": 0.576, "grad_norm": 32.75, "grad_norm_var": 4.042122395833333, "learning_rate": 0.0001, "loss": 7.7853, "loss/crossentropy": 2.161001367866993, "loss/hidden": 3.676953125, "loss/jsd": 0.0, "loss/logits": 0.22858329731971025, "step": 17280 }, { "epoch": 0.5763333333333334, "grad_norm": 29.75, "grad_norm_var": 2.7973307291666667, "learning_rate": 0.0001, "loss": 7.8583, "loss/crossentropy": 2.130069175362587, "loss/hidden": 3.63359375, "loss/jsd": 0.0, "loss/logits": 0.2083722459152341, "step": 17290 }, { "epoch": 0.5766666666666667, "grad_norm": 31.25, "grad_norm_var": 3.4238932291666666, "learning_rate": 0.0001, "loss": 7.8721, "loss/crossentropy": 2.082015645503998, "loss/hidden": 3.614453125, "loss/jsd": 0.0, "loss/logits": 0.20946309231221677, "step": 17300 }, { "epoch": 0.577, "grad_norm": 30.75, "grad_norm_var": 3.5875, "learning_rate": 0.0001, "loss": 7.9545, "loss/crossentropy": 2.096890838444233, "loss/hidden": 3.560546875, "loss/jsd": 0.0, "loss/logits": 0.20209384206682443, "step": 17310 }, { "epoch": 0.5773333333333334, "grad_norm": 34.75, "grad_norm_var": 4.995572916666666, "learning_rate": 0.0001, "loss": 7.8557, "loss/crossentropy": 2.106398382782936, "loss/hidden": 3.70546875, "loss/jsd": 0.0, "loss/logits": 0.21345611102879047, "step": 17320 }, { "epoch": 0.5776666666666667, "grad_norm": 34.0, "grad_norm_var": 2.546875, "learning_rate": 0.0001, "loss": 7.8012, "loss/crossentropy": 2.096838581562042, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.2053753226995468, "step": 17330 }, { "epoch": 0.578, "grad_norm": 26.875, "grad_norm_var": 2.9572464468024796e+18, "learning_rate": 0.0001, "loss": 7.8668, "loss/crossentropy": 2.1166788838803767, "loss/hidden": 3.55546875, "loss/jsd": 0.0, "loss/logits": 0.20833127275109292, "step": 17340 }, { "epoch": 0.5783333333333334, "grad_norm": 29.625, "grad_norm_var": 7.27265625, "learning_rate": 0.0001, "loss": 7.7414, "loss/crossentropy": 1.9886778131127358, "loss/hidden": 3.65390625, "loss/jsd": 0.0, "loss/logits": 0.21065135411918162, "step": 17350 }, { "epoch": 0.5786666666666667, "grad_norm": 29.5, "grad_norm_var": 11.183268229166666, "learning_rate": 0.0001, "loss": 7.9642, "loss/crossentropy": 2.0476474441587924, "loss/hidden": 3.547265625, "loss/jsd": 0.0, "loss/logits": 0.21347164940088986, "step": 17360 }, { "epoch": 0.579, "grad_norm": 32.75, "grad_norm_var": 10.703059895833333, "learning_rate": 0.0001, "loss": 7.8541, "loss/crossentropy": 2.146274469792843, "loss/hidden": 3.580859375, "loss/jsd": 0.0, "loss/logits": 0.24069792926311492, "step": 17370 }, { "epoch": 0.5793333333333334, "grad_norm": 33.0, "grad_norm_var": 3.1455729166666666, "learning_rate": 0.0001, "loss": 7.8856, "loss/crossentropy": 2.0081627793610095, "loss/hidden": 3.589453125, "loss/jsd": 0.0, "loss/logits": 0.20462132934480906, "step": 17380 }, { "epoch": 0.5796666666666667, "grad_norm": 30.625, "grad_norm_var": 4.042708333333334, "learning_rate": 0.0001, "loss": 7.8438, "loss/crossentropy": 2.133964368700981, "loss/hidden": 3.604296875, "loss/jsd": 0.0, "loss/logits": 0.2076242446899414, "step": 17390 }, { "epoch": 0.58, "grad_norm": 33.25, "grad_norm_var": 1.9681640625, "learning_rate": 0.0001, "loss": 7.883, "loss/crossentropy": 1.9912319853901863, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.22393401358276604, "step": 17400 }, { "epoch": 0.5803333333333334, "grad_norm": 28.625, "grad_norm_var": 4.534309895833333, "learning_rate": 0.0001, "loss": 7.8925, "loss/crossentropy": 2.1712723806500436, "loss/hidden": 3.639453125, "loss/jsd": 0.0, "loss/logits": 0.2111651472747326, "step": 17410 }, { "epoch": 0.5806666666666667, "grad_norm": 31.125, "grad_norm_var": 2.540311639717402e+18, "learning_rate": 0.0001, "loss": 7.8283, "loss/crossentropy": 1.96769128292799, "loss/hidden": 3.621484375, "loss/jsd": 0.0, "loss/logits": 0.21129720862954854, "step": 17420 }, { "epoch": 0.581, "grad_norm": 34.0, "grad_norm_var": 6.0900390625, "learning_rate": 0.0001, "loss": 7.9127, "loss/crossentropy": 2.0979452088475226, "loss/hidden": 3.63671875, "loss/jsd": 0.0, "loss/logits": 0.23280210494995118, "step": 17430 }, { "epoch": 0.5813333333333334, "grad_norm": 31.125, "grad_norm_var": 3.7671223958333333, "learning_rate": 0.0001, "loss": 7.9495, "loss/crossentropy": 2.3015418693423273, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.2063061658293009, "step": 17440 }, { "epoch": 0.5816666666666667, "grad_norm": 31.125, "grad_norm_var": 8.673958333333333, "learning_rate": 0.0001, "loss": 7.977, "loss/crossentropy": 2.1084218993782997, "loss/hidden": 3.688671875, "loss/jsd": 0.0, "loss/logits": 0.2300797041505575, "step": 17450 }, { "epoch": 0.582, "grad_norm": 29.875, "grad_norm_var": 18.838997395833335, "learning_rate": 0.0001, "loss": 7.8748, "loss/crossentropy": 2.125347241014242, "loss/hidden": 3.650390625, "loss/jsd": 0.0, "loss/logits": 0.21882319189608096, "step": 17460 }, { "epoch": 0.5823333333333334, "grad_norm": 31.625, "grad_norm_var": 13.420247395833334, "learning_rate": 0.0001, "loss": 7.7885, "loss/crossentropy": 1.9236932694911957, "loss/hidden": 3.629296875, "loss/jsd": 0.0, "loss/logits": 0.20307795237749815, "step": 17470 }, { "epoch": 0.5826666666666667, "grad_norm": 31.625, "grad_norm_var": 6.923372395833334, "learning_rate": 0.0001, "loss": 7.9531, "loss/crossentropy": 2.0547634214162827, "loss/hidden": 3.689453125, "loss/jsd": 0.0, "loss/logits": 0.20383700206875802, "step": 17480 }, { "epoch": 0.583, "grad_norm": 30.625, "grad_norm_var": 8.875, "learning_rate": 0.0001, "loss": 7.8942, "loss/crossentropy": 2.1729307577013968, "loss/hidden": 3.663671875, "loss/jsd": 0.0, "loss/logits": 0.22479334622621536, "step": 17490 }, { "epoch": 0.5833333333333334, "grad_norm": 30.25, "grad_norm_var": 5.0056640625, "learning_rate": 0.0001, "loss": 7.9187, "loss/crossentropy": 2.1915037110447884, "loss/hidden": 3.710546875, "loss/jsd": 0.0, "loss/logits": 0.22519682794809343, "step": 17500 }, { "epoch": 0.5836666666666667, "grad_norm": 35.0, "grad_norm_var": 12.376822916666667, "learning_rate": 0.0001, "loss": 7.8727, "loss/crossentropy": 2.2274638898670673, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.20867663882672788, "step": 17510 }, { "epoch": 0.584, "grad_norm": 31.375, "grad_norm_var": 12.6072265625, "learning_rate": 0.0001, "loss": 7.8598, "loss/crossentropy": 1.9983490526676178, "loss/hidden": 3.81796875, "loss/jsd": 0.0, "loss/logits": 0.23317093290388585, "step": 17520 }, { "epoch": 0.5843333333333334, "grad_norm": 28.625, "grad_norm_var": 5.734309895833333, "learning_rate": 0.0001, "loss": 7.8956, "loss/crossentropy": 2.036000092327595, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.2117432462051511, "step": 17530 }, { "epoch": 0.5846666666666667, "grad_norm": 29.5, "grad_norm_var": 3.980208333333333, "learning_rate": 0.0001, "loss": 7.8556, "loss/crossentropy": 2.1683146730065346, "loss/hidden": 3.63984375, "loss/jsd": 0.0, "loss/logits": 0.21113013792783022, "step": 17540 }, { "epoch": 0.585, "grad_norm": 36.5, "grad_norm_var": 65.62473958333334, "learning_rate": 0.0001, "loss": 7.8502, "loss/crossentropy": 1.9271365851163864, "loss/hidden": 3.673828125, "loss/jsd": 0.0, "loss/logits": 0.21881148554384708, "step": 17550 }, { "epoch": 0.5853333333333334, "grad_norm": 30.125, "grad_norm_var": 64.79264322916667, "learning_rate": 0.0001, "loss": 7.853, "loss/crossentropy": 2.1277351498603823, "loss/hidden": 3.63046875, "loss/jsd": 0.0, "loss/logits": 0.21536952927708625, "step": 17560 }, { "epoch": 0.5856666666666667, "grad_norm": 34.25, "grad_norm_var": 2.48125, "learning_rate": 0.0001, "loss": 7.8684, "loss/crossentropy": 2.2192533940076826, "loss/hidden": 3.625390625, "loss/jsd": 0.0, "loss/logits": 0.24672232531011104, "step": 17570 }, { "epoch": 0.586, "grad_norm": 30.5, "grad_norm_var": 3.376822916666667, "learning_rate": 0.0001, "loss": 7.825, "loss/crossentropy": 1.9819466196000577, "loss/hidden": 3.597265625, "loss/jsd": 0.0, "loss/logits": 0.2152025356888771, "step": 17580 }, { "epoch": 0.5863333333333334, "grad_norm": 32.0, "grad_norm_var": 3.51015625, "learning_rate": 0.0001, "loss": 7.7497, "loss/crossentropy": 2.1631928592920304, "loss/hidden": 3.626171875, "loss/jsd": 0.0, "loss/logits": 0.21683248728513718, "step": 17590 }, { "epoch": 0.5866666666666667, "grad_norm": 29.75, "grad_norm_var": 5.7322265625, "learning_rate": 0.0001, "loss": 7.7934, "loss/crossentropy": 2.0797213554382323, "loss/hidden": 3.641015625, "loss/jsd": 0.0, "loss/logits": 0.21361089181154966, "step": 17600 }, { "epoch": 0.587, "grad_norm": 30.75, "grad_norm_var": 4.33125, "learning_rate": 0.0001, "loss": 7.9096, "loss/crossentropy": 2.1399587824940682, "loss/hidden": 3.652734375, "loss/jsd": 0.0, "loss/logits": 0.22931497786194086, "step": 17610 }, { "epoch": 0.5873333333333334, "grad_norm": 32.5, "grad_norm_var": 2.1080729166666665, "learning_rate": 0.0001, "loss": 8.0199, "loss/crossentropy": 2.082290044426918, "loss/hidden": 3.6421875, "loss/jsd": 0.0, "loss/logits": 0.21377197336405515, "step": 17620 }, { "epoch": 0.5876666666666667, "grad_norm": 29.0, "grad_norm_var": 3.7447916666666665, "learning_rate": 0.0001, "loss": 7.8002, "loss/crossentropy": 1.9901188783347608, "loss/hidden": 3.667578125, "loss/jsd": 0.0, "loss/logits": 0.22527266927063466, "step": 17630 }, { "epoch": 0.588, "grad_norm": 51.0, "grad_norm_var": 32.34557291666667, "learning_rate": 0.0001, "loss": 7.9691, "loss/crossentropy": 2.2055923312902452, "loss/hidden": 3.60859375, "loss/jsd": 0.0, "loss/logits": 0.2215042721480131, "step": 17640 }, { "epoch": 0.5883333333333334, "grad_norm": 33.0, "grad_norm_var": 27.630143229166666, "learning_rate": 0.0001, "loss": 7.8568, "loss/crossentropy": 1.9692361816763877, "loss/hidden": 3.622265625, "loss/jsd": 0.0, "loss/logits": 0.20123363118618726, "step": 17650 }, { "epoch": 0.5886666666666667, "grad_norm": 31.25, "grad_norm_var": 2.424934895833333, "learning_rate": 0.0001, "loss": 7.9636, "loss/crossentropy": 2.0752515137195586, "loss/hidden": 3.627734375, "loss/jsd": 0.0, "loss/logits": 0.21257804036140443, "step": 17660 }, { "epoch": 0.589, "grad_norm": 31.375, "grad_norm_var": 3.3559895833333333, "learning_rate": 0.0001, "loss": 7.8761, "loss/crossentropy": 2.0425384148955343, "loss/hidden": 3.70078125, "loss/jsd": 0.0, "loss/logits": 0.21111119659617544, "step": 17670 }, { "epoch": 0.5893333333333334, "grad_norm": 5301600256.0, "grad_norm_var": 1.7566853087264507e+18, "learning_rate": 0.0001, "loss": 8.0848, "loss/crossentropy": 2.2280178025364874, "loss/hidden": 3.89609375, "loss/jsd": 0.0, "loss/logits": 0.29019313380122186, "step": 17680 }, { "epoch": 0.5896666666666667, "grad_norm": 30.75, "grad_norm_var": 1.756685307947778e+18, "learning_rate": 0.0001, "loss": 7.9978, "loss/crossentropy": 2.1417148754000666, "loss/hidden": 3.7203125, "loss/jsd": 0.0, "loss/logits": 0.2305316347628832, "step": 17690 }, { "epoch": 0.59, "grad_norm": 29.0, "grad_norm_var": 1.6520182291666667, "learning_rate": 0.0001, "loss": 7.732, "loss/crossentropy": 1.9635935053229332, "loss/hidden": 3.571875, "loss/jsd": 0.0, "loss/logits": 0.19552220217883587, "step": 17700 }, { "epoch": 0.5903333333333334, "grad_norm": 31.125, "grad_norm_var": 4.158072916666667, "learning_rate": 0.0001, "loss": 8.101, "loss/crossentropy": 1.9116744890809059, "loss/hidden": 3.78671875, "loss/jsd": 0.0, "loss/logits": 0.21419992987066508, "step": 17710 }, { "epoch": 0.5906666666666667, "grad_norm": 32.25, "grad_norm_var": 2.2223307291666665, "learning_rate": 0.0001, "loss": 7.75, "loss/crossentropy": 2.117913420498371, "loss/hidden": 3.62421875, "loss/jsd": 0.0, "loss/logits": 0.20440428592264653, "step": 17720 }, { "epoch": 0.591, "grad_norm": 31.375, "grad_norm_var": 4.03125, "learning_rate": 0.0001, "loss": 7.9153, "loss/crossentropy": 2.0854917369782924, "loss/hidden": 3.64609375, "loss/jsd": 0.0, "loss/logits": 0.21392401214689016, "step": 17730 }, { "epoch": 0.5913333333333334, "grad_norm": 29.5, "grad_norm_var": 3.948958333333333, "learning_rate": 0.0001, "loss": 7.7836, "loss/crossentropy": 1.9979670539498329, "loss/hidden": 3.59453125, "loss/jsd": 0.0, "loss/logits": 0.20232196077704429, "step": 17740 }, { "epoch": 0.5916666666666667, "grad_norm": 28.0, "grad_norm_var": 2.9452473958333334, "learning_rate": 0.0001, "loss": 7.7919, "loss/crossentropy": 2.124793681502342, "loss/hidden": 3.6234375, "loss/jsd": 0.0, "loss/logits": 0.2240230105817318, "step": 17750 }, { "epoch": 0.592, "grad_norm": 30.5, "grad_norm_var": 1.1768229166666666, "learning_rate": 0.0001, "loss": 7.8956, "loss/crossentropy": 2.3084448873996735, "loss/hidden": 3.575, "loss/jsd": 0.0, "loss/logits": 0.21302221789956094, "step": 17760 }, { "epoch": 0.5923333333333334, "grad_norm": 32.5, "grad_norm_var": 4.325, "learning_rate": 0.0001, "loss": 7.7268, "loss/crossentropy": 2.060881958901882, "loss/hidden": 3.60546875, "loss/jsd": 0.0, "loss/logits": 0.20148423872888088, "step": 17770 }, { "epoch": 0.5926666666666667, "grad_norm": 31.875, "grad_norm_var": 6.21640625, "learning_rate": 0.0001, "loss": 7.881, "loss/crossentropy": 2.123748776316643, "loss/hidden": 3.608984375, "loss/jsd": 0.0, "loss/logits": 0.22054104711860417, "step": 17780 }, { "epoch": 0.593, "grad_norm": 59.0, "grad_norm_var": 53.02649739583333, "learning_rate": 0.0001, "loss": 7.8065, "loss/crossentropy": 2.0521097406744957, "loss/hidden": 3.47421875, "loss/jsd": 0.0, "loss/logits": 0.18986649625003338, "step": 17790 }, { "epoch": 0.5933333333333334, "grad_norm": 30.125, "grad_norm_var": 50.04479166666667, "learning_rate": 0.0001, "loss": 7.9947, "loss/crossentropy": 1.9684382773935796, "loss/hidden": 3.701953125, "loss/jsd": 0.0, "loss/logits": 0.2199052505195141, "step": 17800 }, { "epoch": 0.5936666666666667, "grad_norm": 31.25, "grad_norm_var": 2.84140625, "learning_rate": 0.0001, "loss": 7.8549, "loss/crossentropy": 2.2951119184494018, "loss/hidden": 3.68203125, "loss/jsd": 0.0, "loss/logits": 0.23184156119823457, "step": 17810 }, { "epoch": 0.594, "grad_norm": 29.5, "grad_norm_var": 1.8166015625, "learning_rate": 0.0001, "loss": 7.9599, "loss/crossentropy": 2.1213736176490783, "loss/hidden": 3.627734375, "loss/jsd": 0.0, "loss/logits": 0.22608150485903025, "step": 17820 }, { "epoch": 0.5943333333333334, "grad_norm": 29.75, "grad_norm_var": 2.29140625, "learning_rate": 0.0001, "loss": 7.9982, "loss/crossentropy": 2.08011159747839, "loss/hidden": 3.68203125, "loss/jsd": 0.0, "loss/logits": 0.21864136941730977, "step": 17830 }, { "epoch": 0.5946666666666667, "grad_norm": 30.375, "grad_norm_var": 2.092643229166667, "learning_rate": 0.0001, "loss": 7.9028, "loss/crossentropy": 2.084545207023621, "loss/hidden": 3.675390625, "loss/jsd": 0.0, "loss/logits": 0.2244249342009425, "step": 17840 }, { "epoch": 0.595, "grad_norm": 31.375, "grad_norm_var": 2.0020182291666666, "learning_rate": 0.0001, "loss": 7.9467, "loss/crossentropy": 1.9666382275521754, "loss/hidden": 3.605078125, "loss/jsd": 0.0, "loss/logits": 0.21182227209210397, "step": 17850 }, { "epoch": 0.5953333333333334, "grad_norm": 29.25, "grad_norm_var": 1.9979166666666666, "learning_rate": 0.0001, "loss": 7.7981, "loss/crossentropy": 2.0992372572422027, "loss/hidden": 3.65078125, "loss/jsd": 0.0, "loss/logits": 0.20616454482078553, "step": 17860 }, { "epoch": 0.5956666666666667, "grad_norm": 31.5, "grad_norm_var": 3.7270833333333333, "learning_rate": 0.0001, "loss": 7.8429, "loss/crossentropy": 2.115238733589649, "loss/hidden": 3.659765625, "loss/jsd": 0.0, "loss/logits": 0.22112161125987767, "step": 17870 }, { "epoch": 0.596, "grad_norm": 38.5, "grad_norm_var": 49.32076822916667, "learning_rate": 0.0001, "loss": 7.9198, "loss/crossentropy": 2.004359558224678, "loss/hidden": 3.55078125, "loss/jsd": 0.0, "loss/logits": 0.20139172691851853, "step": 17880 }, { "epoch": 0.5963333333333334, "grad_norm": 62.75, "grad_norm_var": 542.1577473958333, "learning_rate": 0.0001, "loss": 8.0141, "loss/crossentropy": 2.0738891914486883, "loss/hidden": 3.722265625, "loss/jsd": 0.0, "loss/logits": 0.27466350272297857, "step": 17890 }, { "epoch": 0.5966666666666667, "grad_norm": 28.875, "grad_norm_var": 123.88170572916667, "learning_rate": 0.0001, "loss": 7.773, "loss/crossentropy": 2.1576938211917875, "loss/hidden": 3.61015625, "loss/jsd": 0.0, "loss/logits": 0.21197507828474044, "step": 17900 }, { "epoch": 0.597, "grad_norm": 29.875, "grad_norm_var": 4.25625, "learning_rate": 0.0001, "loss": 7.7789, "loss/crossentropy": 2.0792277440428735, "loss/hidden": 3.653125, "loss/jsd": 0.0, "loss/logits": 0.21021516602486373, "step": 17910 }, { "epoch": 0.5973333333333334, "grad_norm": 28.5, "grad_norm_var": 4.112955729166667, "learning_rate": 0.0001, "loss": 7.791, "loss/crossentropy": 2.0338504150509835, "loss/hidden": 3.58515625, "loss/jsd": 0.0, "loss/logits": 0.21274630688130855, "step": 17920 }, { "epoch": 0.5976666666666667, "grad_norm": 34.0, "grad_norm_var": 4.971875, "learning_rate": 0.0001, "loss": 7.7827, "loss/crossentropy": 1.9480732060968875, "loss/hidden": 3.644921875, "loss/jsd": 0.0, "loss/logits": 0.1969135446473956, "step": 17930 }, { "epoch": 0.598, "grad_norm": 33.25, "grad_norm_var": 2.4208333333333334, "learning_rate": 0.0001, "loss": 7.7746, "loss/crossentropy": 2.045719124376774, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.2005929106846452, "step": 17940 }, { "epoch": 0.5983333333333334, "grad_norm": 33.75, "grad_norm_var": 7.9587890625, "learning_rate": 0.0001, "loss": 7.8875, "loss/crossentropy": 2.07333282828331, "loss/hidden": 3.62890625, "loss/jsd": 0.0, "loss/logits": 0.20832881294190883, "step": 17950 }, { "epoch": 0.5986666666666667, "grad_norm": 33.75, "grad_norm_var": 8.305989583333334, "learning_rate": 0.0001, "loss": 7.7515, "loss/crossentropy": 2.0196738630533217, "loss/hidden": 3.61015625, "loss/jsd": 0.0, "loss/logits": 0.21534995995461942, "step": 17960 }, { "epoch": 0.599, "grad_norm": 29.125, "grad_norm_var": 10.612434895833333, "learning_rate": 0.0001, "loss": 7.8834, "loss/crossentropy": 2.0255559869110584, "loss/hidden": 3.601171875, "loss/jsd": 0.0, "loss/logits": 0.19980104472488164, "step": 17970 }, { "epoch": 0.5993333333333334, "grad_norm": 33.0, "grad_norm_var": 5.305989583333333, "learning_rate": 0.0001, "loss": 7.8593, "loss/crossentropy": 2.0782414257526396, "loss/hidden": 3.631640625, "loss/jsd": 0.0, "loss/logits": 0.2156868301331997, "step": 17980 }, { "epoch": 0.5996666666666667, "grad_norm": 29.375, "grad_norm_var": 3.23125, "learning_rate": 0.0001, "loss": 7.8168, "loss/crossentropy": 2.1622331708669664, "loss/hidden": 3.577734375, "loss/jsd": 0.0, "loss/logits": 0.2100482653826475, "step": 17990 }, { "epoch": 0.6, "grad_norm": 30.375, "grad_norm_var": 5.889322916666667, "learning_rate": 0.0001, "loss": 7.9181, "loss/crossentropy": 2.0450053200125695, "loss/hidden": 3.779296875, "loss/jsd": 0.0, "loss/logits": 0.22763790674507617, "step": 18000 }, { "epoch": 0.6003333333333334, "grad_norm": 30.625, "grad_norm_var": 4.785416666666666, "learning_rate": 0.0001, "loss": 7.7476, "loss/crossentropy": 2.1718004338443277, "loss/hidden": 3.62421875, "loss/jsd": 0.0, "loss/logits": 0.21642480613663792, "step": 18010 }, { "epoch": 0.6006666666666667, "grad_norm": 38.0, "grad_norm_var": 5.748372395833333, "learning_rate": 0.0001, "loss": 8.012, "loss/crossentropy": 2.207152932882309, "loss/hidden": 3.66328125, "loss/jsd": 0.0, "loss/logits": 0.24362048767507077, "step": 18020 }, { "epoch": 0.601, "grad_norm": 33.0, "grad_norm_var": 8.595768229166667, "learning_rate": 0.0001, "loss": 7.9312, "loss/crossentropy": 2.0700930416584016, "loss/hidden": 3.679296875, "loss/jsd": 0.0, "loss/logits": 0.20959441419690847, "step": 18030 }, { "epoch": 0.6013333333333334, "grad_norm": 36.75, "grad_norm_var": 6.439322916666667, "learning_rate": 0.0001, "loss": 7.8615, "loss/crossentropy": 1.945305197685957, "loss/hidden": 3.768359375, "loss/jsd": 0.0, "loss/logits": 0.23298689387738705, "step": 18040 }, { "epoch": 0.6016666666666667, "grad_norm": 33.5, "grad_norm_var": 4.405208333333333, "learning_rate": 0.0001, "loss": 7.7177, "loss/crossentropy": 2.1207897052168847, "loss/hidden": 3.72421875, "loss/jsd": 0.0, "loss/logits": 0.21883321572095155, "step": 18050 }, { "epoch": 0.602, "grad_norm": 30.125, "grad_norm_var": 3.4468098958333333, "learning_rate": 0.0001, "loss": 7.8522, "loss/crossentropy": 1.9687151461839676, "loss/hidden": 3.64453125, "loss/jsd": 0.0, "loss/logits": 0.21371175833046435, "step": 18060 }, { "epoch": 0.6023333333333334, "grad_norm": 30.375, "grad_norm_var": 3.4989583333333334, "learning_rate": 0.0001, "loss": 7.7965, "loss/crossentropy": 2.0616400502622128, "loss/hidden": 3.56953125, "loss/jsd": 0.0, "loss/logits": 0.19918912472203373, "step": 18070 }, { "epoch": 0.6026666666666667, "grad_norm": 30.375, "grad_norm_var": 2.894791666666667, "learning_rate": 0.0001, "loss": 7.8959, "loss/crossentropy": 2.110419492423534, "loss/hidden": 3.580859375, "loss/jsd": 0.0, "loss/logits": 0.21065182648599148, "step": 18080 }, { "epoch": 0.603, "grad_norm": 37.75, "grad_norm_var": 30.475455729166665, "learning_rate": 0.0001, "loss": 7.8809, "loss/crossentropy": 2.0502669103443623, "loss/hidden": 3.59609375, "loss/jsd": 0.0, "loss/logits": 0.2201010322198272, "step": 18090 }, { "epoch": 0.6033333333333334, "grad_norm": 31.75, "grad_norm_var": 7.795572916666667, "learning_rate": 0.0001, "loss": 7.8007, "loss/crossentropy": 1.998288343846798, "loss/hidden": 3.690234375, "loss/jsd": 0.0, "loss/logits": 0.20705017000436782, "step": 18100 }, { "epoch": 0.6036666666666667, "grad_norm": 29.125, "grad_norm_var": 4.1125, "learning_rate": 0.0001, "loss": 7.919, "loss/crossentropy": 2.1517566978931426, "loss/hidden": 3.579296875, "loss/jsd": 0.0, "loss/logits": 0.22426084131002427, "step": 18110 }, { "epoch": 0.604, "grad_norm": 30.0, "grad_norm_var": 4.998893229166667, "learning_rate": 0.0001, "loss": 7.7191, "loss/crossentropy": 2.1819751486182213, "loss/hidden": 3.59296875, "loss/jsd": 0.0, "loss/logits": 0.2131780631840229, "step": 18120 }, { "epoch": 0.6043333333333333, "grad_norm": 28.125, "grad_norm_var": 3.2264973958333334, "learning_rate": 0.0001, "loss": 7.8286, "loss/crossentropy": 2.0302142813801765, "loss/hidden": 3.49140625, "loss/jsd": 0.0, "loss/logits": 0.21191238649189473, "step": 18130 }, { "epoch": 0.6046666666666667, "grad_norm": 27.75, "grad_norm_var": 4.076041666666667, "learning_rate": 0.0001, "loss": 7.8356, "loss/crossentropy": 2.1582538709044456, "loss/hidden": 3.681640625, "loss/jsd": 0.0, "loss/logits": 0.22549642771482467, "step": 18140 }, { "epoch": 0.605, "grad_norm": 32.25, "grad_norm_var": 3.0893229166666667, "learning_rate": 0.0001, "loss": 7.9474, "loss/crossentropy": 1.9815901264548301, "loss/hidden": 3.696875, "loss/jsd": 0.0, "loss/logits": 0.20591741409152747, "step": 18150 }, { "epoch": 0.6053333333333333, "grad_norm": 29.75, "grad_norm_var": 5.76015625, "learning_rate": 0.0001, "loss": 7.8622, "loss/crossentropy": 2.011575572192669, "loss/hidden": 3.656640625, "loss/jsd": 0.0, "loss/logits": 0.21709777507930994, "step": 18160 }, { "epoch": 0.6056666666666667, "grad_norm": 31.125, "grad_norm_var": 1.6330729166666667, "learning_rate": 0.0001, "loss": 7.8002, "loss/crossentropy": 1.9985271662473678, "loss/hidden": 3.662890625, "loss/jsd": 0.0, "loss/logits": 0.2164109718054533, "step": 18170 }, { "epoch": 0.606, "grad_norm": 28.875, "grad_norm_var": 2.1587890625, "learning_rate": 0.0001, "loss": 7.8748, "loss/crossentropy": 1.9468450605869294, "loss/hidden": 3.795703125, "loss/jsd": 0.0, "loss/logits": 0.22511965408921242, "step": 18180 }, { "epoch": 0.6063333333333333, "grad_norm": 31.875, "grad_norm_var": 8.801041666666666, "learning_rate": 0.0001, "loss": 7.7904, "loss/crossentropy": 1.8904692113399506, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.19247866850346326, "step": 18190 }, { "epoch": 0.6066666666666667, "grad_norm": 30.125, "grad_norm_var": 1.32265625, "learning_rate": 0.0001, "loss": 7.9876, "loss/crossentropy": 2.1818495839834213, "loss/hidden": 3.655859375, "loss/jsd": 0.0, "loss/logits": 0.2132304223254323, "step": 18200 }, { "epoch": 0.607, "grad_norm": 29.125, "grad_norm_var": 1.7712890625, "learning_rate": 0.0001, "loss": 7.7898, "loss/crossentropy": 2.0314568877220154, "loss/hidden": 3.555859375, "loss/jsd": 0.0, "loss/logits": 0.20160603299736976, "step": 18210 }, { "epoch": 0.6073333333333333, "grad_norm": 29.0, "grad_norm_var": 6.9947265625, "learning_rate": 0.0001, "loss": 7.7958, "loss/crossentropy": 2.127374881505966, "loss/hidden": 3.6359375, "loss/jsd": 0.0, "loss/logits": 0.2240319259464741, "step": 18220 }, { "epoch": 0.6076666666666667, "grad_norm": 30.375, "grad_norm_var": 7.166080729166667, "learning_rate": 0.0001, "loss": 7.8324, "loss/crossentropy": 2.0251319468021394, "loss/hidden": 3.702734375, "loss/jsd": 0.0, "loss/logits": 0.23137976359575987, "step": 18230 }, { "epoch": 0.608, "grad_norm": 34.75, "grad_norm_var": 3.998893229166667, "learning_rate": 0.0001, "loss": 7.9401, "loss/crossentropy": 2.006920612603426, "loss/hidden": 3.603125, "loss/jsd": 0.0, "loss/logits": 0.20304168649017812, "step": 18240 }, { "epoch": 0.6083333333333333, "grad_norm": 32.5, "grad_norm_var": 2.3916666666666666, "learning_rate": 0.0001, "loss": 7.7267, "loss/crossentropy": 2.189460189640522, "loss/hidden": 3.564453125, "loss/jsd": 0.0, "loss/logits": 0.20912937633693218, "step": 18250 }, { "epoch": 0.6086666666666667, "grad_norm": 28.875, "grad_norm_var": 2.653059895833333, "learning_rate": 0.0001, "loss": 7.8839, "loss/crossentropy": 2.109463243186474, "loss/hidden": 3.65, "loss/jsd": 0.0, "loss/logits": 0.22287122681736946, "step": 18260 }, { "epoch": 0.609, "grad_norm": 29.625, "grad_norm_var": 5.537434895833333, "learning_rate": 0.0001, "loss": 7.9143, "loss/crossentropy": 1.9888273879885674, "loss/hidden": 3.60546875, "loss/jsd": 0.0, "loss/logits": 0.19882734641432762, "step": 18270 }, { "epoch": 0.6093333333333333, "grad_norm": 29.875, "grad_norm_var": 6.3697265625, "learning_rate": 0.0001, "loss": 7.8695, "loss/crossentropy": 2.1051917299628258, "loss/hidden": 3.73203125, "loss/jsd": 0.0, "loss/logits": 0.22681960612535476, "step": 18280 }, { "epoch": 0.6096666666666667, "grad_norm": 30.125, "grad_norm_var": 4.312239583333334, "learning_rate": 0.0001, "loss": 7.7899, "loss/crossentropy": 1.9936149284243583, "loss/hidden": 3.71953125, "loss/jsd": 0.0, "loss/logits": 0.22636721413582564, "step": 18290 }, { "epoch": 0.61, "grad_norm": 32.75, "grad_norm_var": 3.8655598958333335, "learning_rate": 0.0001, "loss": 7.9113, "loss/crossentropy": 2.1669924929738045, "loss/hidden": 3.64375, "loss/jsd": 0.0, "loss/logits": 0.21989998165518046, "step": 18300 }, { "epoch": 0.6103333333333333, "grad_norm": 33.25, "grad_norm_var": 4.784830729166667, "learning_rate": 0.0001, "loss": 7.8107, "loss/crossentropy": 2.1663272455334663, "loss/hidden": 3.603125, "loss/jsd": 0.0, "loss/logits": 0.22850660514086485, "step": 18310 }, { "epoch": 0.6106666666666667, "grad_norm": 29.5, "grad_norm_var": 15.930989583333334, "learning_rate": 0.0001, "loss": 7.921, "loss/crossentropy": 2.106405158340931, "loss/hidden": 3.580859375, "loss/jsd": 0.0, "loss/logits": 0.22974986005574466, "step": 18320 }, { "epoch": 0.611, "grad_norm": 31.25, "grad_norm_var": 18.299739583333334, "learning_rate": 0.0001, "loss": 7.8336, "loss/crossentropy": 2.130492168664932, "loss/hidden": 3.606640625, "loss/jsd": 0.0, "loss/logits": 0.2170414287596941, "step": 18330 }, { "epoch": 0.6113333333333333, "grad_norm": 29.875, "grad_norm_var": 7.1650390625, "learning_rate": 0.0001, "loss": 7.8573, "loss/crossentropy": 2.3027665317058563, "loss/hidden": 3.6, "loss/jsd": 0.0, "loss/logits": 0.22886360697448255, "step": 18340 }, { "epoch": 0.6116666666666667, "grad_norm": 31.875, "grad_norm_var": 6.0697265625, "learning_rate": 0.0001, "loss": 7.7478, "loss/crossentropy": 2.0985746681690216, "loss/hidden": 3.57734375, "loss/jsd": 0.0, "loss/logits": 0.20908834878355265, "step": 18350 }, { "epoch": 0.612, "grad_norm": 30.5, "grad_norm_var": 6.792643229166667, "learning_rate": 0.0001, "loss": 7.8184, "loss/crossentropy": 2.152451690286398, "loss/hidden": 3.6, "loss/jsd": 0.0, "loss/logits": 0.21251746444031597, "step": 18360 }, { "epoch": 0.6123333333333333, "grad_norm": 27.25, "grad_norm_var": 2.3962890625, "learning_rate": 0.0001, "loss": 7.8537, "loss/crossentropy": 2.0875338673591615, "loss/hidden": 3.631640625, "loss/jsd": 0.0, "loss/logits": 0.23585428856313229, "step": 18370 }, { "epoch": 0.6126666666666667, "grad_norm": 29.75, "grad_norm_var": 4.625, "learning_rate": 0.0001, "loss": 7.9296, "loss/crossentropy": 2.0466977350413798, "loss/hidden": 3.60859375, "loss/jsd": 0.0, "loss/logits": 0.2116424733772874, "step": 18380 }, { "epoch": 0.613, "grad_norm": 29.875, "grad_norm_var": 2.3478515625, "learning_rate": 0.0001, "loss": 7.9076, "loss/crossentropy": 2.0156113907694815, "loss/hidden": 3.674609375, "loss/jsd": 0.0, "loss/logits": 0.21385385412722827, "step": 18390 }, { "epoch": 0.6133333333333333, "grad_norm": 31.0, "grad_norm_var": 3.7384765625, "learning_rate": 0.0001, "loss": 7.9007, "loss/crossentropy": 2.1177249431610106, "loss/hidden": 3.64453125, "loss/jsd": 0.0, "loss/logits": 0.2171054555103183, "step": 18400 }, { "epoch": 0.6136666666666667, "grad_norm": 30.875, "grad_norm_var": 5.0056640625, "learning_rate": 0.0001, "loss": 7.8653, "loss/crossentropy": 2.0109619170427324, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.24297788869589568, "step": 18410 }, { "epoch": 0.614, "grad_norm": 29.875, "grad_norm_var": 6.345247395833334, "learning_rate": 0.0001, "loss": 7.8787, "loss/crossentropy": 2.008924402296543, "loss/hidden": 3.815625, "loss/jsd": 0.0, "loss/logits": 0.23545185066759586, "step": 18420 }, { "epoch": 0.6143333333333333, "grad_norm": 31.875, "grad_norm_var": 1.9572265625, "learning_rate": 0.0001, "loss": 7.7924, "loss/crossentropy": 2.1681634426116942, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.2179112009704113, "step": 18430 }, { "epoch": 0.6146666666666667, "grad_norm": 29.5, "grad_norm_var": 2.56015625, "learning_rate": 0.0001, "loss": 7.8609, "loss/crossentropy": 2.100195789337158, "loss/hidden": 3.582421875, "loss/jsd": 0.0, "loss/logits": 0.21292860489338636, "step": 18440 }, { "epoch": 0.615, "grad_norm": 30.5, "grad_norm_var": 1.2983723958333333, "learning_rate": 0.0001, "loss": 7.967, "loss/crossentropy": 2.2022788748145103, "loss/hidden": 3.609765625, "loss/jsd": 0.0, "loss/logits": 0.2205308698117733, "step": 18450 }, { "epoch": 0.6153333333333333, "grad_norm": 39.25, "grad_norm_var": 5.6375, "learning_rate": 0.0001, "loss": 7.914, "loss/crossentropy": 2.1412271529436113, "loss/hidden": 3.605859375, "loss/jsd": 0.0, "loss/logits": 0.21023106891661883, "step": 18460 }, { "epoch": 0.6156666666666667, "grad_norm": 31.0, "grad_norm_var": 3.313593764281145e+18, "learning_rate": 0.0001, "loss": 7.8255, "loss/crossentropy": 2.116056500375271, "loss/hidden": 3.68359375, "loss/jsd": 0.0, "loss/logits": 0.22026809379458429, "step": 18470 }, { "epoch": 0.616, "grad_norm": 34.5, "grad_norm_var": 3.3135937641066967e+18, "learning_rate": 0.0001, "loss": 7.8251, "loss/crossentropy": 2.1441701896488667, "loss/hidden": 3.804296875, "loss/jsd": 0.0, "loss/logits": 0.23796399366110563, "step": 18480 }, { "epoch": 0.6163333333333333, "grad_norm": 30.5, "grad_norm_var": 3.0306640625, "learning_rate": 0.0001, "loss": 7.8743, "loss/crossentropy": 2.05239050835371, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.20417920276522636, "step": 18490 }, { "epoch": 0.6166666666666667, "grad_norm": 29.75, "grad_norm_var": 7.375, "learning_rate": 0.0001, "loss": 7.8488, "loss/crossentropy": 1.98827982544899, "loss/hidden": 3.65546875, "loss/jsd": 0.0, "loss/logits": 0.21382159925997257, "step": 18500 }, { "epoch": 0.617, "grad_norm": 31.125, "grad_norm_var": 9.1916015625, "learning_rate": 0.0001, "loss": 7.9309, "loss/crossentropy": 2.108893929421902, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.2015895338729024, "step": 18510 }, { "epoch": 0.6173333333333333, "grad_norm": 30.375, "grad_norm_var": 4.250455729166666, "learning_rate": 0.0001, "loss": 7.8394, "loss/crossentropy": 2.2074019432067873, "loss/hidden": 3.61875, "loss/jsd": 0.0, "loss/logits": 0.21143595930188894, "step": 18520 }, { "epoch": 0.6176666666666667, "grad_norm": 30.5, "grad_norm_var": 3.97265625, "learning_rate": 0.0001, "loss": 7.8296, "loss/crossentropy": 2.0721921652555464, "loss/hidden": 3.702734375, "loss/jsd": 0.0, "loss/logits": 0.23150747194886206, "step": 18530 }, { "epoch": 0.618, "grad_norm": 32.5, "grad_norm_var": 3.901497395833333, "learning_rate": 0.0001, "loss": 7.8179, "loss/crossentropy": 2.12041699886322, "loss/hidden": 3.6109375, "loss/jsd": 0.0, "loss/logits": 0.20933685936033725, "step": 18540 }, { "epoch": 0.6183333333333333, "grad_norm": 32.25, "grad_norm_var": 22.773893229166667, "learning_rate": 0.0001, "loss": 7.8798, "loss/crossentropy": 2.073936428129673, "loss/hidden": 3.55625, "loss/jsd": 0.0, "loss/logits": 0.21363019309937953, "step": 18550 }, { "epoch": 0.6186666666666667, "grad_norm": 31.375, "grad_norm_var": 21.6416015625, "learning_rate": 0.0001, "loss": 7.7396, "loss/crossentropy": 1.969584984332323, "loss/hidden": 3.607421875, "loss/jsd": 0.0, "loss/logits": 0.20250753909349442, "step": 18560 }, { "epoch": 0.619, "grad_norm": 36.5, "grad_norm_var": 3.5582682291666665, "learning_rate": 0.0001, "loss": 7.881, "loss/crossentropy": 2.3890004098415374, "loss/hidden": 3.58359375, "loss/jsd": 0.0, "loss/logits": 0.21717921365052462, "step": 18570 }, { "epoch": 0.6193333333333333, "grad_norm": 31.875, "grad_norm_var": 5.0962890625, "learning_rate": 0.0001, "loss": 7.8276, "loss/crossentropy": 1.9156524941325188, "loss/hidden": 3.602734375, "loss/jsd": 0.0, "loss/logits": 0.22258005812764167, "step": 18580 }, { "epoch": 0.6196666666666667, "grad_norm": 34.25, "grad_norm_var": 3.2916015625, "learning_rate": 0.0001, "loss": 7.8671, "loss/crossentropy": 2.1052445240318773, "loss/hidden": 3.541015625, "loss/jsd": 0.0, "loss/logits": 0.20677947774529457, "step": 18590 }, { "epoch": 0.62, "grad_norm": 29.25, "grad_norm_var": 4.991080729166667, "learning_rate": 0.0001, "loss": 7.8464, "loss/crossentropy": 1.9568229861557485, "loss/hidden": 3.6140625, "loss/jsd": 0.0, "loss/logits": 0.22466170443221928, "step": 18600 }, { "epoch": 0.6203333333333333, "grad_norm": 29.875, "grad_norm_var": 3.1832682291666665, "learning_rate": 0.0001, "loss": 7.8225, "loss/crossentropy": 2.082220788300037, "loss/hidden": 3.5328125, "loss/jsd": 0.0, "loss/logits": 0.20434877574443816, "step": 18610 }, { "epoch": 0.6206666666666667, "grad_norm": 29.625, "grad_norm_var": 11.320833333333333, "learning_rate": 0.0001, "loss": 7.9303, "loss/crossentropy": 2.120234587043524, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.23275276124477387, "step": 18620 }, { "epoch": 0.621, "grad_norm": 29.375, "grad_norm_var": 2.448372395833333, "learning_rate": 0.0001, "loss": 7.8128, "loss/crossentropy": 2.058797413110733, "loss/hidden": 3.676171875, "loss/jsd": 0.0, "loss/logits": 0.2265950959175825, "step": 18630 }, { "epoch": 0.6213333333333333, "grad_norm": 29.5, "grad_norm_var": 9.9197265625, "learning_rate": 0.0001, "loss": 7.8673, "loss/crossentropy": 1.9986446030437945, "loss/hidden": 3.60078125, "loss/jsd": 0.0, "loss/logits": 0.19868890419602395, "step": 18640 }, { "epoch": 0.6216666666666667, "grad_norm": 33.0, "grad_norm_var": 2.082747395833333, "learning_rate": 0.0001, "loss": 7.9691, "loss/crossentropy": 2.2711413890123366, "loss/hidden": 3.621875, "loss/jsd": 0.0, "loss/logits": 0.22637809179723262, "step": 18650 }, { "epoch": 0.622, "grad_norm": 29.0, "grad_norm_var": 3.0853515625, "learning_rate": 0.0001, "loss": 7.9306, "loss/crossentropy": 2.130834940075874, "loss/hidden": 3.587109375, "loss/jsd": 0.0, "loss/logits": 0.21620727181434632, "step": 18660 }, { "epoch": 0.6223333333333333, "grad_norm": 29.0, "grad_norm_var": 4.4884765625, "learning_rate": 0.0001, "loss": 7.9184, "loss/crossentropy": 2.132387759536505, "loss/hidden": 3.719921875, "loss/jsd": 0.0, "loss/logits": 0.2213591465726495, "step": 18670 }, { "epoch": 0.6226666666666667, "grad_norm": 37.5, "grad_norm_var": 5.250455729166666, "learning_rate": 0.0001, "loss": 7.8612, "loss/crossentropy": 2.172727197408676, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.21677472554147242, "step": 18680 }, { "epoch": 0.623, "grad_norm": 32.75, "grad_norm_var": 5.864583333333333, "learning_rate": 0.0001, "loss": 7.8139, "loss/crossentropy": 2.1482373237609864, "loss/hidden": 3.638671875, "loss/jsd": 0.0, "loss/logits": 0.22568456567823886, "step": 18690 }, { "epoch": 0.6233333333333333, "grad_norm": 33.25, "grad_norm_var": 9.995768229166666, "learning_rate": 0.0001, "loss": 7.667, "loss/crossentropy": 2.1058941036462784, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.2003629505634308, "step": 18700 }, { "epoch": 0.6236666666666667, "grad_norm": 31.25, "grad_norm_var": 3.0462890625, "learning_rate": 0.0001, "loss": 7.8472, "loss/crossentropy": 2.0365977115929126, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.21015565544366838, "step": 18710 }, { "epoch": 0.624, "grad_norm": 31.125, "grad_norm_var": 4.335416666666666, "learning_rate": 0.0001, "loss": 7.7206, "loss/crossentropy": 2.099413389712572, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.20410673916339875, "step": 18720 }, { "epoch": 0.6243333333333333, "grad_norm": 27.875, "grad_norm_var": 11.883072916666666, "learning_rate": 0.0001, "loss": 7.9741, "loss/crossentropy": 2.0666474759578706, "loss/hidden": 3.708984375, "loss/jsd": 0.0, "loss/logits": 0.21149700321257114, "step": 18730 }, { "epoch": 0.6246666666666667, "grad_norm": 32.25, "grad_norm_var": 8.562239583333334, "learning_rate": 0.0001, "loss": 7.8691, "loss/crossentropy": 2.0636663090437652, "loss/hidden": 3.5921875, "loss/jsd": 0.0, "loss/logits": 0.21715150568634273, "step": 18740 }, { "epoch": 0.625, "grad_norm": 37.75, "grad_norm_var": 8.626822916666667, "learning_rate": 0.0001, "loss": 7.7649, "loss/crossentropy": 2.0067582026124002, "loss/hidden": 3.5578125, "loss/jsd": 0.0, "loss/logits": 0.204201880376786, "step": 18750 }, { "epoch": 0.6253333333333333, "grad_norm": 31.75, "grad_norm_var": 9.3634765625, "learning_rate": 0.0001, "loss": 7.8405, "loss/crossentropy": 2.0874268427491187, "loss/hidden": 3.680859375, "loss/jsd": 0.0, "loss/logits": 0.2324770163744688, "step": 18760 }, { "epoch": 0.6256666666666667, "grad_norm": 28.75, "grad_norm_var": 2.8872395833333333, "learning_rate": 0.0001, "loss": 7.8715, "loss/crossentropy": 1.892872792482376, "loss/hidden": 3.663671875, "loss/jsd": 0.0, "loss/logits": 0.21375333461910487, "step": 18770 }, { "epoch": 0.626, "grad_norm": 30.625, "grad_norm_var": 3.6372395833333333, "learning_rate": 0.0001, "loss": 7.9126, "loss/crossentropy": 2.181600275635719, "loss/hidden": 3.6828125, "loss/jsd": 0.0, "loss/logits": 0.23229926731437445, "step": 18780 }, { "epoch": 0.6263333333333333, "grad_norm": 31.375, "grad_norm_var": 532.3809895833333, "learning_rate": 0.0001, "loss": 7.8422, "loss/crossentropy": 2.1315030232071877, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.20269421245902777, "step": 18790 }, { "epoch": 0.6266666666666667, "grad_norm": 31.125, "grad_norm_var": 24.878125, "learning_rate": 0.0001, "loss": 7.9348, "loss/crossentropy": 2.173051218688488, "loss/hidden": 3.5890625, "loss/jsd": 0.0, "loss/logits": 0.2082651512697339, "step": 18800 }, { "epoch": 0.627, "grad_norm": 30.375, "grad_norm_var": 3.22265625, "learning_rate": 0.0001, "loss": 7.9028, "loss/crossentropy": 2.1568052619695663, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.21820257026702167, "step": 18810 }, { "epoch": 0.6273333333333333, "grad_norm": 31.25, "grad_norm_var": 3.409309895833333, "learning_rate": 0.0001, "loss": 7.9375, "loss/crossentropy": 1.8293808348476888, "loss/hidden": 3.6078125, "loss/jsd": 0.0, "loss/logits": 0.19814650276675821, "step": 18820 }, { "epoch": 0.6276666666666667, "grad_norm": 28.875, "grad_norm_var": 21.13125, "learning_rate": 0.0001, "loss": 7.9111, "loss/crossentropy": 2.19702318161726, "loss/hidden": 3.63125, "loss/jsd": 0.0, "loss/logits": 0.21540935784578324, "step": 18830 }, { "epoch": 0.628, "grad_norm": 32.25, "grad_norm_var": 20.922916666666666, "learning_rate": 0.0001, "loss": 7.8251, "loss/crossentropy": 2.1570638747885824, "loss/hidden": 3.54609375, "loss/jsd": 0.0, "loss/logits": 0.20867985542863607, "step": 18840 }, { "epoch": 0.6283333333333333, "grad_norm": 30.375, "grad_norm_var": 4.72890625, "learning_rate": 0.0001, "loss": 7.8015, "loss/crossentropy": 2.1077527910470963, "loss/hidden": 3.674609375, "loss/jsd": 0.0, "loss/logits": 0.22238324768841267, "step": 18850 }, { "epoch": 0.6286666666666667, "grad_norm": 32.75, "grad_norm_var": 12.74765625, "learning_rate": 0.0001, "loss": 7.8461, "loss/crossentropy": 2.0374200642108917, "loss/hidden": 3.715234375, "loss/jsd": 0.0, "loss/logits": 0.21136217713356018, "step": 18860 }, { "epoch": 0.629, "grad_norm": 30.625, "grad_norm_var": 6.53515625, "learning_rate": 0.0001, "loss": 7.9069, "loss/crossentropy": 2.0224805563688277, "loss/hidden": 3.51015625, "loss/jsd": 0.0, "loss/logits": 0.2152063086628914, "step": 18870 }, { "epoch": 0.6293333333333333, "grad_norm": 37.0, "grad_norm_var": 10.820572916666666, "learning_rate": 0.0001, "loss": 7.9014, "loss/crossentropy": 2.063296413421631, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.20050969813019037, "step": 18880 }, { "epoch": 0.6296666666666667, "grad_norm": 31.625, "grad_norm_var": 7.643684895833333, "learning_rate": 0.0001, "loss": 7.8825, "loss/crossentropy": 2.2169769048690795, "loss/hidden": 3.5921875, "loss/jsd": 0.0, "loss/logits": 0.21663882099092008, "step": 18890 }, { "epoch": 0.63, "grad_norm": 29.5, "grad_norm_var": 5.8775390625, "learning_rate": 0.0001, "loss": 7.8653, "loss/crossentropy": 2.0760796412825586, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.1870915897190571, "step": 18900 }, { "epoch": 0.6303333333333333, "grad_norm": 29.75, "grad_norm_var": 1.6059895833333333, "learning_rate": 0.0001, "loss": 7.9256, "loss/crossentropy": 2.2268298670649527, "loss/hidden": 3.6609375, "loss/jsd": 0.0, "loss/logits": 0.21423916518688202, "step": 18910 }, { "epoch": 0.6306666666666667, "grad_norm": 33.75, "grad_norm_var": 6.870572916666666, "learning_rate": 0.0001, "loss": 7.9247, "loss/crossentropy": 2.192274183034897, "loss/hidden": 3.730078125, "loss/jsd": 0.0, "loss/logits": 0.2382037065923214, "step": 18920 }, { "epoch": 0.631, "grad_norm": 34.5, "grad_norm_var": 4.92265625, "learning_rate": 0.0001, "loss": 7.9498, "loss/crossentropy": 2.1766092889010906, "loss/hidden": 3.615234375, "loss/jsd": 0.0, "loss/logits": 0.20994122456759215, "step": 18930 }, { "epoch": 0.6313333333333333, "grad_norm": 31.25, "grad_norm_var": 3.9643229166666667, "learning_rate": 0.0001, "loss": 7.8339, "loss/crossentropy": 2.1794259466230868, "loss/hidden": 3.58125, "loss/jsd": 0.0, "loss/logits": 0.21326899696141483, "step": 18940 }, { "epoch": 0.6316666666666667, "grad_norm": 30.375, "grad_norm_var": 4.389518229166667, "learning_rate": 0.0001, "loss": 7.8916, "loss/crossentropy": 2.1451626420021057, "loss/hidden": 3.74765625, "loss/jsd": 0.0, "loss/logits": 0.22022609002888202, "step": 18950 }, { "epoch": 0.632, "grad_norm": 28.5, "grad_norm_var": 5.9275390625, "learning_rate": 0.0001, "loss": 7.885, "loss/crossentropy": 2.143310196697712, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.21481310818344354, "step": 18960 }, { "epoch": 0.6323333333333333, "grad_norm": 33.5, "grad_norm_var": 111.21041666666666, "learning_rate": 0.0001, "loss": 7.8308, "loss/crossentropy": 2.0815651670098303, "loss/hidden": 3.738671875, "loss/jsd": 0.0, "loss/logits": 0.2265876606106758, "step": 18970 }, { "epoch": 0.6326666666666667, "grad_norm": 81.0, "grad_norm_var": 249.68515625, "learning_rate": 0.0001, "loss": 7.8743, "loss/crossentropy": 2.1671969212591646, "loss/hidden": 3.51796875, "loss/jsd": 0.0, "loss/logits": 0.20081726741045713, "step": 18980 }, { "epoch": 0.633, "grad_norm": 29.25, "grad_norm_var": 167.84368489583332, "learning_rate": 0.0001, "loss": 7.7065, "loss/crossentropy": 2.221551278233528, "loss/hidden": 3.54296875, "loss/jsd": 0.0, "loss/logits": 0.20150368921458722, "step": 18990 }, { "epoch": 0.6333333333333333, "grad_norm": 31.75, "grad_norm_var": 2.2393229166666666, "learning_rate": 0.0001, "loss": 7.8872, "loss/crossentropy": 2.1742024421691895, "loss/hidden": 3.70078125, "loss/jsd": 0.0, "loss/logits": 0.2297104850411415, "step": 19000 }, { "epoch": 0.6336666666666667, "grad_norm": 30.125, "grad_norm_var": 1.6155598958333333, "learning_rate": 0.0001, "loss": 7.8937, "loss/crossentropy": 2.013886445760727, "loss/hidden": 3.634765625, "loss/jsd": 0.0, "loss/logits": 0.2087485622614622, "step": 19010 }, { "epoch": 0.634, "grad_norm": 29.25, "grad_norm_var": 3.457747395833333, "learning_rate": 0.0001, "loss": 7.8267, "loss/crossentropy": 2.125851184129715, "loss/hidden": 3.680078125, "loss/jsd": 0.0, "loss/logits": 0.2237798146903515, "step": 19020 }, { "epoch": 0.6343333333333333, "grad_norm": 30.625, "grad_norm_var": 1.234375, "learning_rate": 0.0001, "loss": 7.8299, "loss/crossentropy": 2.154933376610279, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.19935899265110493, "step": 19030 }, { "epoch": 0.6346666666666667, "grad_norm": 28.875, "grad_norm_var": 1.6129557291666667, "learning_rate": 0.0001, "loss": 7.8469, "loss/crossentropy": 2.093220832943916, "loss/hidden": 3.62734375, "loss/jsd": 0.0, "loss/logits": 0.2095857124775648, "step": 19040 }, { "epoch": 0.635, "grad_norm": 32.5, "grad_norm_var": 3.49765625, "learning_rate": 0.0001, "loss": 7.8074, "loss/crossentropy": 2.1023634552955626, "loss/hidden": 3.59140625, "loss/jsd": 0.0, "loss/logits": 0.1920078145340085, "step": 19050 }, { "epoch": 0.6353333333333333, "grad_norm": 30.875, "grad_norm_var": 2.9302083333333333, "learning_rate": 0.0001, "loss": 7.836, "loss/crossentropy": 2.2064808174967765, "loss/hidden": 3.49609375, "loss/jsd": 0.0, "loss/logits": 0.20856849979609252, "step": 19060 }, { "epoch": 0.6356666666666667, "grad_norm": 31.875, "grad_norm_var": 14.134375, "learning_rate": 0.0001, "loss": 7.9529, "loss/crossentropy": 2.022094827145338, "loss/hidden": 3.588671875, "loss/jsd": 0.0, "loss/logits": 0.21211060788482428, "step": 19070 }, { "epoch": 0.636, "grad_norm": 30.625, "grad_norm_var": 7.253125, "learning_rate": 0.0001, "loss": 7.9353, "loss/crossentropy": 2.1081229224801064, "loss/hidden": 3.61875, "loss/jsd": 0.0, "loss/logits": 0.20318191722035409, "step": 19080 }, { "epoch": 0.6363333333333333, "grad_norm": 31.0, "grad_norm_var": 7.374934895833333, "learning_rate": 0.0001, "loss": 7.9194, "loss/crossentropy": 2.179345028847456, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.2187561433762312, "step": 19090 }, { "epoch": 0.6366666666666667, "grad_norm": 30.75, "grad_norm_var": 1.6650390625, "learning_rate": 0.0001, "loss": 7.7918, "loss/crossentropy": 2.073818951845169, "loss/hidden": 3.637890625, "loss/jsd": 0.0, "loss/logits": 0.22620401345193386, "step": 19100 }, { "epoch": 0.637, "grad_norm": 33.5, "grad_norm_var": 13.125, "learning_rate": 0.0001, "loss": 7.8342, "loss/crossentropy": 2.1005424194037916, "loss/hidden": 3.555859375, "loss/jsd": 0.0, "loss/logits": 0.197703623957932, "step": 19110 }, { "epoch": 0.6373333333333333, "grad_norm": 29.5, "grad_norm_var": 13.416666666666666, "learning_rate": 0.0001, "loss": 7.712, "loss/crossentropy": 1.9248533256351947, "loss/hidden": 3.691015625, "loss/jsd": 0.0, "loss/logits": 0.22173443883657457, "step": 19120 }, { "epoch": 0.6376666666666667, "grad_norm": 32.75, "grad_norm_var": 7.163608467333465e+18, "learning_rate": 0.0001, "loss": 7.8861, "loss/crossentropy": 2.133715681731701, "loss/hidden": 3.56328125, "loss/jsd": 0.0, "loss/logits": 0.20394432581961155, "step": 19130 }, { "epoch": 0.638, "grad_norm": 32.0, "grad_norm_var": 3.9525419601292687e+18, "learning_rate": 0.0001, "loss": 7.9626, "loss/crossentropy": 2.325319290161133, "loss/hidden": 3.668359375, "loss/jsd": 0.0, "loss/logits": 0.23943714387714862, "step": 19140 }, { "epoch": 0.6383333333333333, "grad_norm": 31.375, "grad_norm_var": 1.5083333333333333, "learning_rate": 0.0001, "loss": 7.8766, "loss/crossentropy": 2.0861736297607423, "loss/hidden": 3.673046875, "loss/jsd": 0.0, "loss/logits": 0.2469935854896903, "step": 19150 }, { "epoch": 0.6386666666666667, "grad_norm": 29.25, "grad_norm_var": 4.2587890625, "learning_rate": 0.0001, "loss": 7.7649, "loss/crossentropy": 2.025897032767534, "loss/hidden": 3.508984375, "loss/jsd": 0.0, "loss/logits": 0.19602714721113443, "step": 19160 }, { "epoch": 0.639, "grad_norm": 310.0, "grad_norm_var": 4881.70390625, "learning_rate": 0.0001, "loss": 8.0063, "loss/crossentropy": 2.2703626573085787, "loss/hidden": 3.65078125, "loss/jsd": 0.0, "loss/logits": 0.22815561573952436, "step": 19170 }, { "epoch": 0.6393333333333333, "grad_norm": 30.75, "grad_norm_var": 4814.96875, "learning_rate": 0.0001, "loss": 7.9568, "loss/crossentropy": 2.130793032050133, "loss/hidden": 3.572265625, "loss/jsd": 0.0, "loss/logits": 0.21214349009096622, "step": 19180 }, { "epoch": 0.6396666666666667, "grad_norm": 40.0, "grad_norm_var": 18.782291666666666, "learning_rate": 0.0001, "loss": 7.8414, "loss/crossentropy": 1.9671615742146968, "loss/hidden": 3.69296875, "loss/jsd": 0.0, "loss/logits": 0.21655476819723846, "step": 19190 }, { "epoch": 0.64, "grad_norm": 28.5, "grad_norm_var": 7.598958333333333, "learning_rate": 0.0001, "loss": 7.6977, "loss/crossentropy": 1.9905995845794677, "loss/hidden": 3.551953125, "loss/jsd": 0.0, "loss/logits": 0.20198220126330851, "step": 19200 }, { "epoch": 0.6403333333333333, "grad_norm": 29.875, "grad_norm_var": 14.686393229166667, "learning_rate": 0.0001, "loss": 7.7702, "loss/crossentropy": 2.1015154205262663, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.1957567347213626, "step": 19210 }, { "epoch": 0.6406666666666667, "grad_norm": 29.875, "grad_norm_var": 13.4166015625, "learning_rate": 0.0001, "loss": 7.8531, "loss/crossentropy": 2.102202596515417, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.20049150586128234, "step": 19220 }, { "epoch": 0.641, "grad_norm": 29.625, "grad_norm_var": 4.143489583333333, "learning_rate": 0.0001, "loss": 7.7549, "loss/crossentropy": 2.0605901539325715, "loss/hidden": 3.601171875, "loss/jsd": 0.0, "loss/logits": 0.21827564649283887, "step": 19230 }, { "epoch": 0.6413333333333333, "grad_norm": 28.875, "grad_norm_var": 26.48515625, "learning_rate": 0.0001, "loss": 7.8123, "loss/crossentropy": 2.0212767884135245, "loss/hidden": 3.65234375, "loss/jsd": 0.0, "loss/logits": 0.2223007800988853, "step": 19240 }, { "epoch": 0.6416666666666667, "grad_norm": 31.375, "grad_norm_var": 26.699739583333333, "learning_rate": 0.0001, "loss": 7.937, "loss/crossentropy": 2.0694237641990183, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.20976187251508235, "step": 19250 }, { "epoch": 0.642, "grad_norm": 29.625, "grad_norm_var": 5.612239583333333, "learning_rate": 0.0001, "loss": 7.805, "loss/crossentropy": 2.163316985964775, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.20325905755162238, "step": 19260 }, { "epoch": 0.6423333333333333, "grad_norm": 30.875, "grad_norm_var": 4.861458333333333, "learning_rate": 0.0001, "loss": 7.7719, "loss/crossentropy": 2.0881971672177313, "loss/hidden": 3.61953125, "loss/jsd": 0.0, "loss/logits": 0.21606105621904134, "step": 19270 }, { "epoch": 0.6426666666666667, "grad_norm": 29.0, "grad_norm_var": 9.8556640625, "learning_rate": 0.0001, "loss": 7.8889, "loss/crossentropy": 2.171047043800354, "loss/hidden": 3.62421875, "loss/jsd": 0.0, "loss/logits": 0.22386016957461835, "step": 19280 }, { "epoch": 0.643, "grad_norm": 30.5, "grad_norm_var": 5.2375, "learning_rate": 0.0001, "loss": 7.7399, "loss/crossentropy": 2.09329297542572, "loss/hidden": 3.684765625, "loss/jsd": 0.0, "loss/logits": 0.218923170119524, "step": 19290 }, { "epoch": 0.6433333333333333, "grad_norm": 27.25, "grad_norm_var": 7.108072916666667, "learning_rate": 0.0001, "loss": 7.7758, "loss/crossentropy": 2.0641403660178184, "loss/hidden": 3.558984375, "loss/jsd": 0.0, "loss/logits": 0.20265798550099134, "step": 19300 }, { "epoch": 0.6436666666666667, "grad_norm": 35.75, "grad_norm_var": 4.120833333333334, "learning_rate": 0.0001, "loss": 7.8368, "loss/crossentropy": 2.1241923332214356, "loss/hidden": 3.4953125, "loss/jsd": 0.0, "loss/logits": 0.19217857848852873, "step": 19310 }, { "epoch": 0.644, "grad_norm": 36.75, "grad_norm_var": 27.8978515625, "learning_rate": 0.0001, "loss": 7.8321, "loss/crossentropy": 2.0242124810814857, "loss/hidden": 3.682421875, "loss/jsd": 0.0, "loss/logits": 0.19688315968960524, "step": 19320 }, { "epoch": 0.6443333333333333, "grad_norm": 28.375, "grad_norm_var": 7.702083333333333, "learning_rate": 0.0001, "loss": 7.7452, "loss/crossentropy": 2.093150442838669, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.1998631376773119, "step": 19330 }, { "epoch": 0.6446666666666667, "grad_norm": 36.75, "grad_norm_var": 9.519791666666666, "learning_rate": 0.0001, "loss": 7.8062, "loss/crossentropy": 2.1420770615339277, "loss/hidden": 3.5640625, "loss/jsd": 0.0, "loss/logits": 0.20922097396105527, "step": 19340 }, { "epoch": 0.645, "grad_norm": 33.0, "grad_norm_var": 9.233333333333333, "learning_rate": 0.0001, "loss": 7.8477, "loss/crossentropy": 2.1200323194265365, "loss/hidden": 3.65234375, "loss/jsd": 0.0, "loss/logits": 0.23046185187995433, "step": 19350 }, { "epoch": 0.6453333333333333, "grad_norm": 29.375, "grad_norm_var": 6.540559895833334, "learning_rate": 0.0001, "loss": 7.7967, "loss/crossentropy": 2.054234591126442, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2208017086610198, "step": 19360 }, { "epoch": 0.6456666666666667, "grad_norm": 30.25, "grad_norm_var": 2.4139973958333334, "learning_rate": 0.0001, "loss": 7.8385, "loss/crossentropy": 2.0128255039453506, "loss/hidden": 3.786328125, "loss/jsd": 0.0, "loss/logits": 0.22819863110780716, "step": 19370 }, { "epoch": 0.646, "grad_norm": 32.0, "grad_norm_var": 1.6363932291666667, "learning_rate": 0.0001, "loss": 7.7009, "loss/crossentropy": 2.024762587249279, "loss/hidden": 3.61640625, "loss/jsd": 0.0, "loss/logits": 0.20730710867792368, "step": 19380 }, { "epoch": 0.6463333333333333, "grad_norm": 36.0, "grad_norm_var": 3.693489583333333, "learning_rate": 0.0001, "loss": 7.7849, "loss/crossentropy": 1.9413291484117507, "loss/hidden": 3.58984375, "loss/jsd": 0.0, "loss/logits": 0.20006432849913836, "step": 19390 }, { "epoch": 0.6466666666666666, "grad_norm": 28.0, "grad_norm_var": 6.737239583333333, "learning_rate": 0.0001, "loss": 7.7068, "loss/crossentropy": 2.219281970709562, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.2173599960282445, "step": 19400 }, { "epoch": 0.647, "grad_norm": 29.5, "grad_norm_var": 5.31640625, "learning_rate": 0.0001, "loss": 7.9205, "loss/crossentropy": 2.0932188779115677, "loss/hidden": 3.573046875, "loss/jsd": 0.0, "loss/logits": 0.2144004687666893, "step": 19410 }, { "epoch": 0.6473333333333333, "grad_norm": 33.75, "grad_norm_var": 2.167708333333333, "learning_rate": 0.0001, "loss": 7.9694, "loss/crossentropy": 2.187512440979481, "loss/hidden": 3.666015625, "loss/jsd": 0.0, "loss/logits": 0.21288955435156823, "step": 19420 }, { "epoch": 0.6476666666666666, "grad_norm": 32.0, "grad_norm_var": 19.3244140625, "learning_rate": 0.0001, "loss": 7.9391, "loss/crossentropy": 2.156757637858391, "loss/hidden": 3.61875, "loss/jsd": 0.0, "loss/logits": 0.21084595024585723, "step": 19430 }, { "epoch": 0.648, "grad_norm": 29.875, "grad_norm_var": 4.955143229166667, "learning_rate": 0.0001, "loss": 7.8517, "loss/crossentropy": 2.218225100636482, "loss/hidden": 3.56015625, "loss/jsd": 0.0, "loss/logits": 0.21003954205662012, "step": 19440 }, { "epoch": 0.6483333333333333, "grad_norm": 32.25, "grad_norm_var": 2.2905598958333333, "learning_rate": 0.0001, "loss": 7.8119, "loss/crossentropy": 2.1156990200281145, "loss/hidden": 3.6578125, "loss/jsd": 0.0, "loss/logits": 0.21486669424921273, "step": 19450 }, { "epoch": 0.6486666666666666, "grad_norm": 30.875, "grad_norm_var": 3.03515625, "learning_rate": 0.0001, "loss": 7.7407, "loss/crossentropy": 2.0934729874134064, "loss/hidden": 3.61328125, "loss/jsd": 0.0, "loss/logits": 0.20924665350466967, "step": 19460 }, { "epoch": 0.649, "grad_norm": 31.25, "grad_norm_var": 1.3832682291666667, "learning_rate": 0.0001, "loss": 7.7772, "loss/crossentropy": 2.167033377289772, "loss/hidden": 3.596484375, "loss/jsd": 0.0, "loss/logits": 0.21081157084554433, "step": 19470 }, { "epoch": 0.6493333333333333, "grad_norm": 30.25, "grad_norm_var": 6.167643229166667, "learning_rate": 0.0001, "loss": 7.8026, "loss/crossentropy": 2.0574812293052673, "loss/hidden": 3.6078125, "loss/jsd": 0.0, "loss/logits": 0.21090691294521094, "step": 19480 }, { "epoch": 0.6496666666666666, "grad_norm": 30.125, "grad_norm_var": 131.05625, "learning_rate": 0.0001, "loss": 7.8775, "loss/crossentropy": 2.037951024621725, "loss/hidden": 3.607421875, "loss/jsd": 0.0, "loss/logits": 0.2193830787204206, "step": 19490 }, { "epoch": 0.65, "grad_norm": 32.5, "grad_norm_var": 1.7166666666666666, "learning_rate": 0.0001, "loss": 7.9099, "loss/crossentropy": 2.1335637837648393, "loss/hidden": 3.621484375, "loss/jsd": 0.0, "loss/logits": 0.22563568409532309, "step": 19500 }, { "epoch": 0.6503333333333333, "grad_norm": 31.0, "grad_norm_var": 7.759309895833334, "learning_rate": 0.0001, "loss": 7.6947, "loss/crossentropy": 2.0860137730836867, "loss/hidden": 3.56640625, "loss/jsd": 0.0, "loss/logits": 0.20874691233038903, "step": 19510 }, { "epoch": 0.6506666666666666, "grad_norm": 32.0, "grad_norm_var": 20.720247395833333, "learning_rate": 0.0001, "loss": 7.9359, "loss/crossentropy": 2.176954896748066, "loss/hidden": 3.58359375, "loss/jsd": 0.0, "loss/logits": 0.21417912952601909, "step": 19520 }, { "epoch": 0.651, "grad_norm": 29.25, "grad_norm_var": 16.797916666666666, "learning_rate": 0.0001, "loss": 7.7905, "loss/crossentropy": 2.112550212442875, "loss/hidden": 3.535546875, "loss/jsd": 0.0, "loss/logits": 0.21057379432022572, "step": 19530 }, { "epoch": 0.6513333333333333, "grad_norm": 35.25, "grad_norm_var": 5.109375, "learning_rate": 0.0001, "loss": 7.7381, "loss/crossentropy": 2.0441600404679776, "loss/hidden": 3.759765625, "loss/jsd": 0.0, "loss/logits": 0.2118663378059864, "step": 19540 }, { "epoch": 0.6516666666666666, "grad_norm": 29.25, "grad_norm_var": 3.8622395833333334, "learning_rate": 0.0001, "loss": 7.8123, "loss/crossentropy": 2.015673951804638, "loss/hidden": 3.516015625, "loss/jsd": 0.0, "loss/logits": 0.2035064060240984, "step": 19550 }, { "epoch": 0.652, "grad_norm": 30.5, "grad_norm_var": 4.835416666666666, "learning_rate": 0.0001, "loss": 7.9088, "loss/crossentropy": 2.1083447858691216, "loss/hidden": 3.686328125, "loss/jsd": 0.0, "loss/logits": 0.22277338355779647, "step": 19560 }, { "epoch": 0.6523333333333333, "grad_norm": 29.5, "grad_norm_var": 7.986458333333333, "learning_rate": 0.0001, "loss": 7.7413, "loss/crossentropy": 2.2140139706432818, "loss/hidden": 3.536328125, "loss/jsd": 0.0, "loss/logits": 0.19835165999829768, "step": 19570 }, { "epoch": 0.6526666666666666, "grad_norm": 46.25, "grad_norm_var": 22.6853515625, "learning_rate": 0.0001, "loss": 7.8156, "loss/crossentropy": 2.098857820034027, "loss/hidden": 3.628125, "loss/jsd": 0.0, "loss/logits": 0.2128402628004551, "step": 19580 }, { "epoch": 0.653, "grad_norm": 29.0, "grad_norm_var": 3.4058471867640274e+18, "learning_rate": 0.0001, "loss": 7.941, "loss/crossentropy": 2.161790570616722, "loss/hidden": 3.558203125, "loss/jsd": 0.0, "loss/logits": 0.21066656708717346, "step": 19590 }, { "epoch": 0.6533333333333333, "grad_norm": 30.5, "grad_norm_var": 31.020572916666666, "learning_rate": 0.0001, "loss": 7.8532, "loss/crossentropy": 2.1808153837919235, "loss/hidden": 3.630859375, "loss/jsd": 0.0, "loss/logits": 0.21154428124427796, "step": 19600 }, { "epoch": 0.6536666666666666, "grad_norm": 30.0, "grad_norm_var": 2.294791666666667, "learning_rate": 0.0001, "loss": 7.7141, "loss/crossentropy": 2.084941604733467, "loss/hidden": 3.51171875, "loss/jsd": 0.0, "loss/logits": 0.2059195751324296, "step": 19610 }, { "epoch": 0.654, "grad_norm": 29.875, "grad_norm_var": 2.474739583333333, "learning_rate": 0.0001, "loss": 7.7995, "loss/crossentropy": 1.9402640502899886, "loss/hidden": 3.602734375, "loss/jsd": 0.0, "loss/logits": 0.19613281125202775, "step": 19620 }, { "epoch": 0.6543333333333333, "grad_norm": 30.125, "grad_norm_var": 123.0462890625, "learning_rate": 0.0001, "loss": 7.8662, "loss/crossentropy": 2.056548047810793, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.19428260792046786, "step": 19630 }, { "epoch": 0.6546666666666666, "grad_norm": 5469372416.0, "grad_norm_var": 3.757034120500845e+18, "learning_rate": 0.0001, "loss": 7.8375, "loss/crossentropy": 2.173163182288408, "loss/hidden": 3.60703125, "loss/jsd": 0.0, "loss/logits": 0.21105932276695966, "step": 19640 }, { "epoch": 0.655, "grad_norm": 30.125, "grad_norm_var": 3.757034118776007e+18, "learning_rate": 0.0001, "loss": 7.7935, "loss/crossentropy": 2.129123020917177, "loss/hidden": 3.625390625, "loss/jsd": 0.0, "loss/logits": 0.21240208223462104, "step": 19650 }, { "epoch": 0.6553333333333333, "grad_norm": 29.25, "grad_norm_var": 4.189518229166667, "learning_rate": 0.0001, "loss": 7.6158, "loss/crossentropy": 2.080010825395584, "loss/hidden": 3.594921875, "loss/jsd": 0.0, "loss/logits": 0.20815913137048483, "step": 19660 }, { "epoch": 0.6556666666666666, "grad_norm": 27.25, "grad_norm_var": 6.65, "learning_rate": 0.0001, "loss": 7.6821, "loss/crossentropy": 1.9834842666983605, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.19239332657307387, "step": 19670 }, { "epoch": 0.656, "grad_norm": 49.0, "grad_norm_var": 26.429622395833334, "learning_rate": 0.0001, "loss": 7.7928, "loss/crossentropy": 2.0769309490919112, "loss/hidden": 3.56640625, "loss/jsd": 0.0, "loss/logits": 0.2172299936413765, "step": 19680 }, { "epoch": 0.6563333333333333, "grad_norm": 31.0, "grad_norm_var": 24.833333333333332, "learning_rate": 0.0001, "loss": 7.7632, "loss/crossentropy": 2.0378880076110364, "loss/hidden": 3.68359375, "loss/jsd": 0.0, "loss/logits": 0.21062599224969744, "step": 19690 }, { "epoch": 0.6566666666666666, "grad_norm": 33.5, "grad_norm_var": 3.2285807291666666, "learning_rate": 0.0001, "loss": 7.8179, "loss/crossentropy": 2.0788474015891554, "loss/hidden": 3.703515625, "loss/jsd": 0.0, "loss/logits": 0.20390022164210678, "step": 19700 }, { "epoch": 0.657, "grad_norm": 32.0, "grad_norm_var": 2.20625, "learning_rate": 0.0001, "loss": 7.796, "loss/crossentropy": 1.9499784991145135, "loss/hidden": 3.72890625, "loss/jsd": 0.0, "loss/logits": 0.21807905454188586, "step": 19710 }, { "epoch": 0.6573333333333333, "grad_norm": 29.125, "grad_norm_var": 2.9082682291666666, "learning_rate": 0.0001, "loss": 7.7345, "loss/crossentropy": 2.05738410204649, "loss/hidden": 3.614453125, "loss/jsd": 0.0, "loss/logits": 0.2033387843519449, "step": 19720 }, { "epoch": 0.6576666666666666, "grad_norm": 31.75, "grad_norm_var": 6.730989583333334, "learning_rate": 0.0001, "loss": 7.8437, "loss/crossentropy": 2.076332356035709, "loss/hidden": 3.60703125, "loss/jsd": 0.0, "loss/logits": 0.2070010544732213, "step": 19730 }, { "epoch": 0.658, "grad_norm": 31.5, "grad_norm_var": 1.5077473958333334, "learning_rate": 0.0001, "loss": 7.7077, "loss/crossentropy": 2.165965069830418, "loss/hidden": 3.56328125, "loss/jsd": 0.0, "loss/logits": 0.20955195166170598, "step": 19740 }, { "epoch": 0.6583333333333333, "grad_norm": 29.875, "grad_norm_var": 1.075, "learning_rate": 0.0001, "loss": 7.8811, "loss/crossentropy": 2.1568801373243334, "loss/hidden": 3.615625, "loss/jsd": 0.0, "loss/logits": 0.21716065630316733, "step": 19750 }, { "epoch": 0.6586666666666666, "grad_norm": 28.625, "grad_norm_var": 6.21015625, "learning_rate": 0.0001, "loss": 7.8508, "loss/crossentropy": 2.0843460261821747, "loss/hidden": 3.658984375, "loss/jsd": 0.0, "loss/logits": 0.23474157508462667, "step": 19760 }, { "epoch": 0.659, "grad_norm": 29.75, "grad_norm_var": 5.1494140625, "learning_rate": 0.0001, "loss": 7.792, "loss/crossentropy": 2.084406663477421, "loss/hidden": 3.51796875, "loss/jsd": 0.0, "loss/logits": 0.20483186282217503, "step": 19770 }, { "epoch": 0.6593333333333333, "grad_norm": 29.25, "grad_norm_var": 2.3197265625, "learning_rate": 0.0001, "loss": 7.7708, "loss/crossentropy": 2.0922622852027417, "loss/hidden": 3.5796875, "loss/jsd": 0.0, "loss/logits": 0.20298854364082217, "step": 19780 }, { "epoch": 0.6596666666666666, "grad_norm": 31.375, "grad_norm_var": 3.8041015625, "learning_rate": 0.0001, "loss": 7.9193, "loss/crossentropy": 1.9548385262489318, "loss/hidden": 3.68125, "loss/jsd": 0.0, "loss/logits": 0.2344514699652791, "step": 19790 }, { "epoch": 0.66, "grad_norm": 31.875, "grad_norm_var": 3.436393229166667, "learning_rate": 0.0001, "loss": 7.9176, "loss/crossentropy": 2.099733465909958, "loss/hidden": 3.55234375, "loss/jsd": 0.0, "loss/logits": 0.20575151350349188, "step": 19800 }, { "epoch": 0.6603333333333333, "grad_norm": 29.125, "grad_norm_var": 13.2375, "learning_rate": 0.0001, "loss": 7.7812, "loss/crossentropy": 2.038871665298939, "loss/hidden": 3.61484375, "loss/jsd": 0.0, "loss/logits": 0.2236450683325529, "step": 19810 }, { "epoch": 0.6606666666666666, "grad_norm": 30.75, "grad_norm_var": 13.39765625, "learning_rate": 0.0001, "loss": 7.7482, "loss/crossentropy": 2.075606144964695, "loss/hidden": 3.666796875, "loss/jsd": 0.0, "loss/logits": 0.20769685432314872, "step": 19820 }, { "epoch": 0.661, "grad_norm": 37.0, "grad_norm_var": 6.546875, "learning_rate": 0.0001, "loss": 7.7292, "loss/crossentropy": 2.247103089094162, "loss/hidden": 3.686328125, "loss/jsd": 0.0, "loss/logits": 0.2320896226912737, "step": 19830 }, { "epoch": 0.6613333333333333, "grad_norm": 29.0, "grad_norm_var": 18.645247395833334, "learning_rate": 0.0001, "loss": 7.782, "loss/crossentropy": 2.1111979484558105, "loss/hidden": 3.64453125, "loss/jsd": 0.0, "loss/logits": 0.2169733637943864, "step": 19840 }, { "epoch": 0.6616666666666666, "grad_norm": 34.25, "grad_norm_var": 20.564518229166666, "learning_rate": 0.0001, "loss": 7.805, "loss/crossentropy": 2.1517412811517715, "loss/hidden": 3.656640625, "loss/jsd": 0.0, "loss/logits": 0.22375482488423587, "step": 19850 }, { "epoch": 0.662, "grad_norm": 30.75, "grad_norm_var": 6.059375, "learning_rate": 0.0001, "loss": 7.6481, "loss/crossentropy": 1.8939931578934193, "loss/hidden": 3.6609375, "loss/jsd": 0.0, "loss/logits": 0.2128385290503502, "step": 19860 }, { "epoch": 0.6623333333333333, "grad_norm": 30.5, "grad_norm_var": 7.92890625, "learning_rate": 0.0001, "loss": 7.7487, "loss/crossentropy": 2.2018162116408346, "loss/hidden": 3.61875, "loss/jsd": 0.0, "loss/logits": 0.22412298023700714, "step": 19870 }, { "epoch": 0.6626666666666666, "grad_norm": 32.75, "grad_norm_var": 9.864322916666667, "learning_rate": 0.0001, "loss": 7.7606, "loss/crossentropy": 2.1517828926444054, "loss/hidden": 3.54140625, "loss/jsd": 0.0, "loss/logits": 0.2149300893768668, "step": 19880 }, { "epoch": 0.663, "grad_norm": 30.0, "grad_norm_var": 3.3268229166666665, "learning_rate": 0.0001, "loss": 7.9545, "loss/crossentropy": 2.1691304370760918, "loss/hidden": 3.662890625, "loss/jsd": 0.0, "loss/logits": 0.21897413935512305, "step": 19890 }, { "epoch": 0.6633333333333333, "grad_norm": 35.5, "grad_norm_var": 11.280143229166667, "learning_rate": 0.0001, "loss": 7.8222, "loss/crossentropy": 2.1076577827334404, "loss/hidden": 3.65078125, "loss/jsd": 0.0, "loss/logits": 0.21274937596172094, "step": 19900 }, { "epoch": 0.6636666666666666, "grad_norm": 29.625, "grad_norm_var": 13.858072916666666, "learning_rate": 0.0001, "loss": 7.8676, "loss/crossentropy": 2.1668387286365034, "loss/hidden": 3.5890625, "loss/jsd": 0.0, "loss/logits": 0.21026035211980343, "step": 19910 }, { "epoch": 0.664, "grad_norm": 30.125, "grad_norm_var": 1.4061848958333334, "learning_rate": 0.0001, "loss": 7.7496, "loss/crossentropy": 2.052803510427475, "loss/hidden": 3.642578125, "loss/jsd": 0.0, "loss/logits": 0.20628087930381298, "step": 19920 }, { "epoch": 0.6643333333333333, "grad_norm": 31.5, "grad_norm_var": 5.010872395833333, "learning_rate": 0.0001, "loss": 7.8597, "loss/crossentropy": 2.2376641765236855, "loss/hidden": 3.6359375, "loss/jsd": 0.0, "loss/logits": 0.22267442829906942, "step": 19930 }, { "epoch": 0.6646666666666666, "grad_norm": 30.875, "grad_norm_var": 7.891666666666667, "learning_rate": 0.0001, "loss": 7.7664, "loss/crossentropy": 2.1567267000675203, "loss/hidden": 3.590625, "loss/jsd": 0.0, "loss/logits": 0.20887317396700383, "step": 19940 }, { "epoch": 0.665, "grad_norm": 29.375, "grad_norm_var": 33.853580729166666, "learning_rate": 0.0001, "loss": 7.7958, "loss/crossentropy": 2.0834827691316606, "loss/hidden": 3.5921875, "loss/jsd": 0.0, "loss/logits": 0.21954385321587325, "step": 19950 }, { "epoch": 0.6653333333333333, "grad_norm": 32.75, "grad_norm_var": 35.7337890625, "learning_rate": 0.0001, "loss": 7.855, "loss/crossentropy": 2.0476011231541635, "loss/hidden": 3.689453125, "loss/jsd": 0.0, "loss/logits": 0.23169058002531528, "step": 19960 }, { "epoch": 0.6656666666666666, "grad_norm": 31.5, "grad_norm_var": 8.638997395833334, "learning_rate": 0.0001, "loss": 7.741, "loss/crossentropy": 2.2387500554323196, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2047835446894169, "step": 19970 }, { "epoch": 0.666, "grad_norm": 38.25, "grad_norm_var": 11.805208333333333, "learning_rate": 0.0001, "loss": 7.7604, "loss/crossentropy": 2.0881649285554884, "loss/hidden": 3.614453125, "loss/jsd": 0.0, "loss/logits": 0.19943447094410657, "step": 19980 }, { "epoch": 0.6663333333333333, "grad_norm": 38.0, "grad_norm_var": 10.564518229166667, "learning_rate": 0.0001, "loss": 7.7988, "loss/crossentropy": 2.0458758428692816, "loss/hidden": 3.673046875, "loss/jsd": 0.0, "loss/logits": 0.22079507317394018, "step": 19990 }, { "epoch": 0.6666666666666666, "grad_norm": 35.0, "grad_norm_var": 8.631184895833334, "learning_rate": 0.0001, "loss": 7.7353, "loss/crossentropy": 2.1835017532110212, "loss/hidden": 3.6046875, "loss/jsd": 0.0, "loss/logits": 0.21447087433189155, "step": 20000 }, { "epoch": 0.667, "grad_norm": 27.75, "grad_norm_var": 2.8713262106311393e+18, "learning_rate": 9.999977793408362e-05, "loss": 7.7009, "loss/crossentropy": 2.1651393327862025, "loss/hidden": 3.6609375, "loss/jsd": 0.0, "loss/logits": 0.22053546169772745, "step": 20010 }, { "epoch": 0.6673333333333333, "grad_norm": 29.0, "grad_norm_var": 10.5337890625, "learning_rate": 9.999911173852618e-05, "loss": 7.7892, "loss/crossentropy": 2.049077409505844, "loss/hidden": 3.62890625, "loss/jsd": 0.0, "loss/logits": 0.21919873766601086, "step": 20020 }, { "epoch": 0.6676666666666666, "grad_norm": 33.25, "grad_norm_var": 5.30390625, "learning_rate": 9.999800141990274e-05, "loss": 7.7849, "loss/crossentropy": 2.100401471555233, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.1976662116125226, "step": 20030 }, { "epoch": 0.668, "grad_norm": 30.75, "grad_norm_var": 4.901041666666667, "learning_rate": 9.999644698917173e-05, "loss": 7.7615, "loss/crossentropy": 2.0303331464529037, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.20121301785111428, "step": 20040 }, { "epoch": 0.6683333333333333, "grad_norm": 6710886400.0, "grad_norm_var": 6.481711869968061e+18, "learning_rate": 9.999444846167473e-05, "loss": 7.9027, "loss/crossentropy": 2.11352252215147, "loss/hidden": 3.676953125, "loss/jsd": 0.0, "loss/logits": 0.23072675410658122, "step": 20050 }, { "epoch": 0.6686666666666666, "grad_norm": 29.875, "grad_norm_var": 6.481711869365549e+18, "learning_rate": 9.99920058571364e-05, "loss": 7.8042, "loss/crossentropy": 2.0778122201561926, "loss/hidden": 3.620703125, "loss/jsd": 0.0, "loss/logits": 0.20652580186724662, "step": 20060 }, { "epoch": 0.669, "grad_norm": 30.125, "grad_norm_var": 6.4056640625, "learning_rate": 9.99891191996643e-05, "loss": 7.6094, "loss/crossentropy": 2.1776298195123673, "loss/hidden": 3.56796875, "loss/jsd": 0.0, "loss/logits": 0.20591478087008, "step": 20070 }, { "epoch": 0.6693333333333333, "grad_norm": 28.75, "grad_norm_var": 8.021875, "learning_rate": 9.99857885177485e-05, "loss": 7.7139, "loss/crossentropy": 2.0494499459862707, "loss/hidden": 3.546484375, "loss/jsd": 0.0, "loss/logits": 0.19295884054154158, "step": 20080 }, { "epoch": 0.6696666666666666, "grad_norm": 32.5, "grad_norm_var": 5.2994140625, "learning_rate": 9.998201384426155e-05, "loss": 7.7195, "loss/crossentropy": 1.9814274668693543, "loss/hidden": 3.580078125, "loss/jsd": 0.0, "loss/logits": 0.20319805517792702, "step": 20090 }, { "epoch": 0.67, "grad_norm": 28.25, "grad_norm_var": 7.0119140625, "learning_rate": 9.997779521645793e-05, "loss": 7.8642, "loss/crossentropy": 2.0849661231040955, "loss/hidden": 3.614453125, "loss/jsd": 0.0, "loss/logits": 0.21840104656293988, "step": 20100 }, { "epoch": 0.6703333333333333, "grad_norm": 30.875, "grad_norm_var": 6.142122395833334, "learning_rate": 9.997313267597378e-05, "loss": 7.8638, "loss/crossentropy": 2.0836787208914758, "loss/hidden": 3.576953125, "loss/jsd": 0.0, "loss/logits": 0.2186410004273057, "step": 20110 }, { "epoch": 0.6706666666666666, "grad_norm": 30.375, "grad_norm_var": 12.291080729166667, "learning_rate": 9.996802626882653e-05, "loss": 7.7777, "loss/crossentropy": 2.03349449634552, "loss/hidden": 3.59609375, "loss/jsd": 0.0, "loss/logits": 0.21252898061648012, "step": 20120 }, { "epoch": 0.671, "grad_norm": 28.875, "grad_norm_var": 12.816080729166666, "learning_rate": 9.99624760454143e-05, "loss": 7.7899, "loss/crossentropy": 2.032351566851139, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.21746116746217012, "step": 20130 }, { "epoch": 0.6713333333333333, "grad_norm": 27.75, "grad_norm_var": 4.3947265625, "learning_rate": 9.995648206051563e-05, "loss": 7.8777, "loss/crossentropy": 2.148711860179901, "loss/hidden": 3.753125, "loss/jsd": 0.0, "loss/logits": 0.22083626296371223, "step": 20140 }, { "epoch": 0.6716666666666666, "grad_norm": 31.75, "grad_norm_var": 2.348372395833333, "learning_rate": 9.995004437328867e-05, "loss": 7.7507, "loss/crossentropy": 1.838768770545721, "loss/hidden": 3.54765625, "loss/jsd": 0.0, "loss/logits": 0.18891090219840406, "step": 20150 }, { "epoch": 0.672, "grad_norm": 30.75, "grad_norm_var": 3.405989583333333, "learning_rate": 9.99431630472708e-05, "loss": 7.7609, "loss/crossentropy": 1.9995342150330544, "loss/hidden": 3.71796875, "loss/jsd": 0.0, "loss/logits": 0.22888598432764412, "step": 20160 }, { "epoch": 0.6723333333333333, "grad_norm": 29.25, "grad_norm_var": 115.0259765625, "learning_rate": 9.993583815037793e-05, "loss": 7.8473, "loss/crossentropy": 2.0745826318860052, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.1968067741021514, "step": 20170 }, { "epoch": 0.6726666666666666, "grad_norm": 30.5, "grad_norm_var": 119.87682291666667, "learning_rate": 9.992806975490389e-05, "loss": 7.6878, "loss/crossentropy": 2.186102945357561, "loss/hidden": 3.604296875, "loss/jsd": 0.0, "loss/logits": 0.21814998863264917, "step": 20180 }, { "epoch": 0.673, "grad_norm": 30.375, "grad_norm_var": 3.3150390625, "learning_rate": 9.991985793751955e-05, "loss": 7.869, "loss/crossentropy": 2.1551688984036446, "loss/hidden": 3.740234375, "loss/jsd": 0.0, "loss/logits": 0.2181480558589101, "step": 20190 }, { "epoch": 0.6733333333333333, "grad_norm": 31.125, "grad_norm_var": 2.3955729166666666, "learning_rate": 9.991120277927223e-05, "loss": 7.7436, "loss/crossentropy": 2.1464362293481827, "loss/hidden": 3.53671875, "loss/jsd": 0.0, "loss/logits": 0.20534380227327348, "step": 20200 }, { "epoch": 0.6736666666666666, "grad_norm": 33.75, "grad_norm_var": 2.9457682291666667, "learning_rate": 9.990210436558488e-05, "loss": 7.7295, "loss/crossentropy": 2.1891664013266565, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.20168747715651988, "step": 20210 }, { "epoch": 0.674, "grad_norm": 30.25, "grad_norm_var": 4.6306640625, "learning_rate": 9.989256278625514e-05, "loss": 7.6581, "loss/crossentropy": 1.9678195029497147, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.1858463604003191, "step": 20220 }, { "epoch": 0.6743333333333333, "grad_norm": 33.0, "grad_norm_var": 5.518684895833333, "learning_rate": 9.988257813545458e-05, "loss": 7.7895, "loss/crossentropy": 2.106415245682001, "loss/hidden": 3.66953125, "loss/jsd": 0.0, "loss/logits": 0.22159097539260983, "step": 20230 }, { "epoch": 0.6746666666666666, "grad_norm": 30.75, "grad_norm_var": 2.7979166666666666, "learning_rate": 9.987215051172763e-05, "loss": 7.8636, "loss/crossentropy": 2.040626636892557, "loss/hidden": 3.558984375, "loss/jsd": 0.0, "loss/logits": 0.21652155686169863, "step": 20240 }, { "epoch": 0.675, "grad_norm": 31.0, "grad_norm_var": 3.1832682291666665, "learning_rate": 9.986128001799077e-05, "loss": 7.7925, "loss/crossentropy": 2.1772810891270638, "loss/hidden": 3.684375, "loss/jsd": 0.0, "loss/logits": 0.2191918555647135, "step": 20250 }, { "epoch": 0.6753333333333333, "grad_norm": 31.875, "grad_norm_var": 3.9124348958333335, "learning_rate": 9.984996676153134e-05, "loss": 7.695, "loss/crossentropy": 2.180080857872963, "loss/hidden": 3.647265625, "loss/jsd": 0.0, "loss/logits": 0.22119200490415097, "step": 20260 }, { "epoch": 0.6756666666666666, "grad_norm": 30.0, "grad_norm_var": 4.589518229166667, "learning_rate": 9.983821085400665e-05, "loss": 7.9482, "loss/crossentropy": 2.102330905199051, "loss/hidden": 3.6609375, "loss/jsd": 0.0, "loss/logits": 0.2217390850186348, "step": 20270 }, { "epoch": 0.676, "grad_norm": 31.0, "grad_norm_var": 3.888541666666667, "learning_rate": 9.982601241144277e-05, "loss": 7.7288, "loss/crossentropy": 1.9200996845960616, "loss/hidden": 3.636328125, "loss/jsd": 0.0, "loss/logits": 0.19473480042070151, "step": 20280 }, { "epoch": 0.6763333333333333, "grad_norm": 27.25, "grad_norm_var": 13.608072916666666, "learning_rate": 9.981337155423336e-05, "loss": 7.876, "loss/crossentropy": 2.160831370949745, "loss/hidden": 3.55, "loss/jsd": 0.0, "loss/logits": 0.21856002546846867, "step": 20290 }, { "epoch": 0.6766666666666666, "grad_norm": 29.0, "grad_norm_var": 3.2994140625, "learning_rate": 9.980028840713861e-05, "loss": 7.7397, "loss/crossentropy": 1.8465786181390285, "loss/hidden": 3.592578125, "loss/jsd": 0.0, "loss/logits": 0.19492179118096828, "step": 20300 }, { "epoch": 0.677, "grad_norm": 31.125, "grad_norm_var": 2.7372395833333334, "learning_rate": 9.978676309928389e-05, "loss": 7.7692, "loss/crossentropy": 2.069914309307933, "loss/hidden": 3.591015625, "loss/jsd": 0.0, "loss/logits": 0.20638742656446993, "step": 20310 }, { "epoch": 0.6773333333333333, "grad_norm": 32.25, "grad_norm_var": 2.5479166666666666, "learning_rate": 9.977279576415853e-05, "loss": 8.0062, "loss/crossentropy": 2.1518563136458395, "loss/hidden": 3.65, "loss/jsd": 0.0, "loss/logits": 0.21888233721256256, "step": 20320 }, { "epoch": 0.6776666666666666, "grad_norm": 32.5, "grad_norm_var": 1.9254557291666667, "learning_rate": 9.975838653961446e-05, "loss": 7.7995, "loss/crossentropy": 2.1415294885635374, "loss/hidden": 3.61796875, "loss/jsd": 0.0, "loss/logits": 0.2294903416186571, "step": 20330 }, { "epoch": 0.678, "grad_norm": 30.125, "grad_norm_var": 3.7520182291666666, "learning_rate": 9.974353556786496e-05, "loss": 7.7562, "loss/crossentropy": 2.060848282277584, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.2027845649048686, "step": 20340 }, { "epoch": 0.6783333333333333, "grad_norm": 33.25, "grad_norm_var": 1.8582682291666666, "learning_rate": 9.97282429954831e-05, "loss": 7.8245, "loss/crossentropy": 2.0787692457437514, "loss/hidden": 3.639453125, "loss/jsd": 0.0, "loss/logits": 0.21248381081968545, "step": 20350 }, { "epoch": 0.6786666666666666, "grad_norm": 29.25, "grad_norm_var": 1.2686848958333334, "learning_rate": 9.971250897340038e-05, "loss": 7.7767, "loss/crossentropy": 2.1000698655843735, "loss/hidden": 3.746484375, "loss/jsd": 0.0, "loss/logits": 0.2223961053416133, "step": 20360 }, { "epoch": 0.679, "grad_norm": 30.0, "grad_norm_var": 2.798958333333333, "learning_rate": 9.969633365690528e-05, "loss": 7.7855, "loss/crossentropy": 2.2210501074790954, "loss/hidden": 3.634765625, "loss/jsd": 0.0, "loss/logits": 0.2190061157569289, "step": 20370 }, { "epoch": 0.6793333333333333, "grad_norm": 31.5, "grad_norm_var": 10.903580729166666, "learning_rate": 9.967971720564162e-05, "loss": 7.8671, "loss/crossentropy": 2.25355578660965, "loss/hidden": 3.623828125, "loss/jsd": 0.0, "loss/logits": 0.22110446617007257, "step": 20380 }, { "epoch": 0.6796666666666666, "grad_norm": 32.25, "grad_norm_var": 10.023958333333333, "learning_rate": 9.966265978360708e-05, "loss": 7.9619, "loss/crossentropy": 2.2148331478238106, "loss/hidden": 3.708984375, "loss/jsd": 0.0, "loss/logits": 0.2353464813902974, "step": 20390 }, { "epoch": 0.68, "grad_norm": 29.25, "grad_norm_var": 3.583268229166667, "learning_rate": 9.964516155915151e-05, "loss": 7.7277, "loss/crossentropy": 2.0867557391524314, "loss/hidden": 3.665234375, "loss/jsd": 0.0, "loss/logits": 0.217846536077559, "step": 20400 }, { "epoch": 0.6803333333333333, "grad_norm": 29.625, "grad_norm_var": 2.0872395833333335, "learning_rate": 9.962722270497534e-05, "loss": 7.81, "loss/crossentropy": 2.1222174257040023, "loss/hidden": 3.691796875, "loss/jsd": 0.0, "loss/logits": 0.2225545782595873, "step": 20410 }, { "epoch": 0.6806666666666666, "grad_norm": 32.5, "grad_norm_var": 3.3593098958333334, "learning_rate": 9.960884339812781e-05, "loss": 7.8604, "loss/crossentropy": 2.0085777252912522, "loss/hidden": 3.63359375, "loss/jsd": 0.0, "loss/logits": 0.2187149330973625, "step": 20420 }, { "epoch": 0.681, "grad_norm": 29.0, "grad_norm_var": 306.26875, "learning_rate": 9.959002382000524e-05, "loss": 7.7649, "loss/crossentropy": 1.929932000488043, "loss/hidden": 3.672265625, "loss/jsd": 0.0, "loss/logits": 0.20384540902450682, "step": 20430 }, { "epoch": 0.6813333333333333, "grad_norm": 31.625, "grad_norm_var": 299.79765625, "learning_rate": 9.95707641563493e-05, "loss": 7.8215, "loss/crossentropy": 2.055840089917183, "loss/hidden": 3.61953125, "loss/jsd": 0.0, "loss/logits": 0.19900662265717983, "step": 20440 }, { "epoch": 0.6816666666666666, "grad_norm": 34.25, "grad_norm_var": 22.658072916666665, "learning_rate": 9.95510645972451e-05, "loss": 7.7012, "loss/crossentropy": 2.1488531097769736, "loss/hidden": 3.63359375, "loss/jsd": 0.0, "loss/logits": 0.2117614457383752, "step": 20450 }, { "epoch": 0.682, "grad_norm": 33.5, "grad_norm_var": 26.0291015625, "learning_rate": 9.95309253371193e-05, "loss": 7.717, "loss/crossentropy": 2.2348560094833374, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.1998794011771679, "step": 20460 }, { "epoch": 0.6823333333333333, "grad_norm": 29.125, "grad_norm_var": 2.5136418842933724e+18, "learning_rate": 9.951034657473828e-05, "loss": 7.7369, "loss/crossentropy": 2.160574886202812, "loss/hidden": 3.598046875, "loss/jsd": 0.0, "loss/logits": 0.20629960373044015, "step": 20470 }, { "epoch": 0.6826666666666666, "grad_norm": 28.5, "grad_norm_var": 65.0134765625, "learning_rate": 9.948932851320614e-05, "loss": 7.7652, "loss/crossentropy": 2.1478210985660553, "loss/hidden": 3.62265625, "loss/jsd": 0.0, "loss/logits": 0.21312507782131435, "step": 20480 }, { "epoch": 0.683, "grad_norm": 28.0, "grad_norm_var": 73.778125, "learning_rate": 9.946787135996263e-05, "loss": 7.6988, "loss/crossentropy": 1.978414911031723, "loss/hidden": 3.618359375, "loss/jsd": 0.0, "loss/logits": 0.22136187348514796, "step": 20490 }, { "epoch": 0.6833333333333333, "grad_norm": 30.25, "grad_norm_var": 13.6369140625, "learning_rate": 9.94459753267812e-05, "loss": 7.9084, "loss/crossentropy": 2.1078326418995856, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.20318037196993827, "step": 20500 }, { "epoch": 0.6836666666666666, "grad_norm": 28.625, "grad_norm_var": 15.780143229166667, "learning_rate": 9.942364062976687e-05, "loss": 7.8032, "loss/crossentropy": 2.143668609857559, "loss/hidden": 3.65703125, "loss/jsd": 0.0, "loss/logits": 0.21416857857257127, "step": 20510 }, { "epoch": 0.684, "grad_norm": 33.0, "grad_norm_var": 3.283124096840447e+18, "learning_rate": 9.940086748935406e-05, "loss": 7.8061, "loss/crossentropy": 1.9484010234475135, "loss/hidden": 3.91875, "loss/jsd": 0.0, "loss/logits": 0.22513604983687402, "step": 20520 }, { "epoch": 0.6843333333333333, "grad_norm": 36.75, "grad_norm_var": 3.2831240977690655e+18, "learning_rate": 9.937765613030451e-05, "loss": 7.7232, "loss/crossentropy": 2.0105784103274345, "loss/hidden": 3.689453125, "loss/jsd": 0.0, "loss/logits": 0.20260654278099538, "step": 20530 }, { "epoch": 0.6846666666666666, "grad_norm": 34.75, "grad_norm_var": 11.262239583333333, "learning_rate": 9.935400678170492e-05, "loss": 7.808, "loss/crossentropy": 2.044108145684004, "loss/hidden": 3.599609375, "loss/jsd": 0.0, "loss/logits": 0.20465944344177842, "step": 20540 }, { "epoch": 0.685, "grad_norm": 32.0, "grad_norm_var": 13.480143229166666, "learning_rate": 9.932991967696483e-05, "loss": 7.7901, "loss/crossentropy": 2.155579614639282, "loss/hidden": 3.554296875, "loss/jsd": 0.0, "loss/logits": 0.21164779588580132, "step": 20550 }, { "epoch": 0.6853333333333333, "grad_norm": 49.25, "grad_norm_var": 34.08274739583333, "learning_rate": 9.930539505381426e-05, "loss": 7.8068, "loss/crossentropy": 2.2071971163153647, "loss/hidden": 3.643359375, "loss/jsd": 0.0, "loss/logits": 0.21646333318203687, "step": 20560 }, { "epoch": 0.6856666666666666, "grad_norm": 30.0, "grad_norm_var": 25.3353515625, "learning_rate": 9.928043315430128e-05, "loss": 7.7163, "loss/crossentropy": 2.0667000114917755, "loss/hidden": 3.5875, "loss/jsd": 0.0, "loss/logits": 0.21009525340050458, "step": 20570 }, { "epoch": 0.686, "grad_norm": 28.75, "grad_norm_var": 8.79140625, "learning_rate": 9.925503422478984e-05, "loss": 7.7736, "loss/crossentropy": 2.207004964351654, "loss/hidden": 3.572265625, "loss/jsd": 0.0, "loss/logits": 0.21299835238605738, "step": 20580 }, { "epoch": 0.6863333333333334, "grad_norm": 31.875, "grad_norm_var": 18.999934895833334, "learning_rate": 9.922919851595707e-05, "loss": 7.864, "loss/crossentropy": 2.0951112896203994, "loss/hidden": 3.70546875, "loss/jsd": 0.0, "loss/logits": 0.2267284881323576, "step": 20590 }, { "epoch": 0.6866666666666666, "grad_norm": 33.5, "grad_norm_var": 20.902083333333334, "learning_rate": 9.920292628279099e-05, "loss": 7.7329, "loss/crossentropy": 2.0619683027267457, "loss/hidden": 3.67265625, "loss/jsd": 0.0, "loss/logits": 0.21196486745029688, "step": 20600 }, { "epoch": 0.687, "grad_norm": 31.5, "grad_norm_var": 9.529166666666667, "learning_rate": 9.917621778458796e-05, "loss": 7.7423, "loss/crossentropy": 2.003011184930801, "loss/hidden": 3.54765625, "loss/jsd": 0.0, "loss/logits": 0.19809650387614966, "step": 20610 }, { "epoch": 0.6873333333333334, "grad_norm": 31.125, "grad_norm_var": 2.4936848958333333, "learning_rate": 9.914907328495003e-05, "loss": 7.7587, "loss/crossentropy": 2.047277623414993, "loss/hidden": 3.625390625, "loss/jsd": 0.0, "loss/logits": 0.20881472658365965, "step": 20620 }, { "epoch": 0.6876666666666666, "grad_norm": 38.75, "grad_norm_var": 7.1369140625, "learning_rate": 9.91214930517825e-05, "loss": 7.8442, "loss/crossentropy": 2.047130857408047, "loss/hidden": 3.61953125, "loss/jsd": 0.0, "loss/logits": 0.2019510269165039, "step": 20630 }, { "epoch": 0.688, "grad_norm": 28.0, "grad_norm_var": 8.914518229166667, "learning_rate": 9.909347735729111e-05, "loss": 7.7182, "loss/crossentropy": 2.101368544995785, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.22426690720021725, "step": 20640 }, { "epoch": 0.6883333333333334, "grad_norm": 30.25, "grad_norm_var": 4.451497395833333, "learning_rate": 9.906502647797946e-05, "loss": 7.8233, "loss/crossentropy": 1.987765783816576, "loss/hidden": 3.694921875, "loss/jsd": 0.0, "loss/logits": 0.2175267556682229, "step": 20650 }, { "epoch": 0.6886666666666666, "grad_norm": 27.375, "grad_norm_var": 21.977083333333333, "learning_rate": 9.903614069464625e-05, "loss": 7.7182, "loss/crossentropy": 2.124682963639498, "loss/hidden": 3.592578125, "loss/jsd": 0.0, "loss/logits": 0.20821518804877998, "step": 20660 }, { "epoch": 0.689, "grad_norm": 31.75, "grad_norm_var": 7.99375, "learning_rate": 9.900682029238249e-05, "loss": 7.8463, "loss/crossentropy": 2.0248075053095818, "loss/hidden": 3.645703125, "loss/jsd": 0.0, "loss/logits": 0.22646026099100708, "step": 20670 }, { "epoch": 0.6893333333333334, "grad_norm": 31.625, "grad_norm_var": 5.612955729166667, "learning_rate": 9.897706556056872e-05, "loss": 7.9115, "loss/crossentropy": 2.0372205063700677, "loss/hidden": 3.763671875, "loss/jsd": 0.0, "loss/logits": 0.23688461929559707, "step": 20680 }, { "epoch": 0.6896666666666667, "grad_norm": 30.0, "grad_norm_var": 3.0962890625, "learning_rate": 9.894687679287211e-05, "loss": 7.7103, "loss/crossentropy": 1.9503981947898865, "loss/hidden": 3.5734375, "loss/jsd": 0.0, "loss/logits": 0.20171856675297023, "step": 20690 }, { "epoch": 0.69, "grad_norm": 32.25, "grad_norm_var": 1.8863932291666667, "learning_rate": 9.891625428724363e-05, "loss": 7.8373, "loss/crossentropy": 1.9779121212661266, "loss/hidden": 3.706640625, "loss/jsd": 0.0, "loss/logits": 0.20387490205466746, "step": 20700 }, { "epoch": 0.6903333333333334, "grad_norm": 28.75, "grad_norm_var": 2.567122395833333, "learning_rate": 9.888519834591505e-05, "loss": 7.8038, "loss/crossentropy": 2.0854588031768797, "loss/hidden": 3.56171875, "loss/jsd": 0.0, "loss/logits": 0.19318762868642808, "step": 20710 }, { "epoch": 0.6906666666666667, "grad_norm": 28.625, "grad_norm_var": 5.96640625, "learning_rate": 9.885370927539598e-05, "loss": 7.8213, "loss/crossentropy": 2.0958469033241274, "loss/hidden": 3.70078125, "loss/jsd": 0.0, "loss/logits": 0.21331228129565716, "step": 20720 }, { "epoch": 0.691, "grad_norm": 27.75, "grad_norm_var": 4.359375, "learning_rate": 9.88217873864708e-05, "loss": 7.7409, "loss/crossentropy": 2.0466203540563583, "loss/hidden": 3.627734375, "loss/jsd": 0.0, "loss/logits": 0.20199444703757763, "step": 20730 }, { "epoch": 0.6913333333333334, "grad_norm": 36.5, "grad_norm_var": 4.928059895833333, "learning_rate": 9.878943299419571e-05, "loss": 7.7546, "loss/crossentropy": 1.9727531932294369, "loss/hidden": 3.6375, "loss/jsd": 0.0, "loss/logits": 0.20264990702271463, "step": 20740 }, { "epoch": 0.6916666666666667, "grad_norm": 30.125, "grad_norm_var": 7.667643229166667, "learning_rate": 9.875664641789545e-05, "loss": 7.7905, "loss/crossentropy": 1.9583543412387372, "loss/hidden": 3.551953125, "loss/jsd": 0.0, "loss/logits": 0.21098938062787057, "step": 20750 }, { "epoch": 0.692, "grad_norm": 37.25, "grad_norm_var": 2.155042766665373e+18, "learning_rate": 9.872342798116033e-05, "loss": 7.9216, "loss/crossentropy": 2.1403904750943186, "loss/hidden": 3.613671875, "loss/jsd": 0.0, "loss/logits": 0.20952636245638132, "step": 20760 }, { "epoch": 0.6923333333333334, "grad_norm": 8019509248.0, "grad_norm_var": 5.782152745791608e+18, "learning_rate": 9.86897780118429e-05, "loss": 7.6639, "loss/crossentropy": 2.111561615765095, "loss/hidden": 3.617578125, "loss/jsd": 0.0, "loss/logits": 0.2071863466873765, "step": 20770 }, { "epoch": 0.6926666666666667, "grad_norm": 27.375, "grad_norm_var": 4.0195330041945523e+18, "learning_rate": 9.865569684205477e-05, "loss": 7.8907, "loss/crossentropy": 2.1496111884713174, "loss/hidden": 3.567578125, "loss/jsd": 0.0, "loss/logits": 0.2130032055079937, "step": 20780 }, { "epoch": 0.693, "grad_norm": 30.75, "grad_norm_var": 4.995572916666666, "learning_rate": 9.862118480816331e-05, "loss": 7.8467, "loss/crossentropy": 2.1802233815193177, "loss/hidden": 3.70703125, "loss/jsd": 0.0, "loss/logits": 0.22255988270044327, "step": 20790 }, { "epoch": 0.6933333333333334, "grad_norm": 28.25, "grad_norm_var": 4.992643229166666, "learning_rate": 9.858624225078841e-05, "loss": 7.695, "loss/crossentropy": 2.0942075878381727, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.2092249434441328, "step": 20800 }, { "epoch": 0.6936666666666667, "grad_norm": 31.5, "grad_norm_var": 5.141666666666667, "learning_rate": 9.855086951479894e-05, "loss": 7.7499, "loss/crossentropy": 2.094904583692551, "loss/hidden": 3.546484375, "loss/jsd": 0.0, "loss/logits": 0.20001194700598718, "step": 20810 }, { "epoch": 0.694, "grad_norm": 7616856064.0, "grad_norm_var": 3.626030989921677e+18, "learning_rate": 9.851506694930958e-05, "loss": 7.8424, "loss/crossentropy": 2.24151945784688, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.21622586157172918, "step": 20820 }, { "epoch": 0.6943333333333334, "grad_norm": 35.0, "grad_norm_var": 3.6260309876366203e+18, "learning_rate": 9.847883490767716e-05, "loss": 7.7868, "loss/crossentropy": 2.077386400103569, "loss/hidden": 3.5921875, "loss/jsd": 0.0, "loss/logits": 0.21033733878284694, "step": 20830 }, { "epoch": 0.6946666666666667, "grad_norm": 31.25, "grad_norm_var": 25.595572916666665, "learning_rate": 9.844217374749732e-05, "loss": 7.7393, "loss/crossentropy": 2.1060422867536546, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.19694673204794527, "step": 20840 }, { "epoch": 0.695, "grad_norm": 30.125, "grad_norm_var": 2.786672612471944e+18, "learning_rate": 9.840508383060093e-05, "loss": 7.7705, "loss/crossentropy": 2.167571856081486, "loss/hidden": 3.524609375, "loss/jsd": 0.0, "loss/logits": 0.2048779834061861, "step": 20850 }, { "epoch": 0.6953333333333334, "grad_norm": 31.5, "grad_norm_var": 2.7866726123189217e+18, "learning_rate": 9.836756552305044e-05, "loss": 7.7599, "loss/crossentropy": 2.1368554055690767, "loss/hidden": 3.569921875, "loss/jsd": 0.0, "loss/logits": 0.20562508180737496, "step": 20860 }, { "epoch": 0.6956666666666667, "grad_norm": 28.75, "grad_norm_var": 3.823372395833333, "learning_rate": 9.832961919513646e-05, "loss": 7.7132, "loss/crossentropy": 2.0987789154052736, "loss/hidden": 3.653515625, "loss/jsd": 0.0, "loss/logits": 0.21177561171352863, "step": 20870 }, { "epoch": 0.696, "grad_norm": 30.5, "grad_norm_var": 8.401822916666667, "learning_rate": 9.829124522137386e-05, "loss": 7.7658, "loss/crossentropy": 2.0699412554502485, "loss/hidden": 3.548828125, "loss/jsd": 0.0, "loss/logits": 0.19380738902837039, "step": 20880 }, { "epoch": 0.6963333333333334, "grad_norm": 28.125, "grad_norm_var": 3.7056640625, "learning_rate": 9.825244398049834e-05, "loss": 7.8394, "loss/crossentropy": 2.1802071809768675, "loss/hidden": 3.55234375, "loss/jsd": 0.0, "loss/logits": 0.20943715162575244, "step": 20890 }, { "epoch": 0.6966666666666667, "grad_norm": 30.75, "grad_norm_var": 2.9302083333333333, "learning_rate": 9.821321585546244e-05, "loss": 7.801, "loss/crossentropy": 2.187861883640289, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.20320057701319455, "step": 20900 }, { "epoch": 0.697, "grad_norm": 33.75, "grad_norm_var": 2.439322916666667, "learning_rate": 9.817356123343193e-05, "loss": 7.8392, "loss/crossentropy": 2.143737843632698, "loss/hidden": 3.59609375, "loss/jsd": 0.0, "loss/logits": 0.21037317141890527, "step": 20910 }, { "epoch": 0.6973333333333334, "grad_norm": 34.5, "grad_norm_var": 5.75625, "learning_rate": 9.813348050578191e-05, "loss": 7.6046, "loss/crossentropy": 2.01915595009923, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.19461457710713148, "step": 20920 }, { "epoch": 0.6976666666666667, "grad_norm": 29.625, "grad_norm_var": 10.854166666666666, "learning_rate": 9.8092974068093e-05, "loss": 7.6391, "loss/crossentropy": 2.104771558940411, "loss/hidden": 3.622265625, "loss/jsd": 0.0, "loss/logits": 0.20720904739573598, "step": 20930 }, { "epoch": 0.698, "grad_norm": 31.625, "grad_norm_var": 28.412434895833332, "learning_rate": 9.805204232014738e-05, "loss": 7.7715, "loss/crossentropy": 2.0513715844601395, "loss/hidden": 3.775, "loss/jsd": 0.0, "loss/logits": 0.20735556054860355, "step": 20940 }, { "epoch": 0.6983333333333334, "grad_norm": 29.625, "grad_norm_var": 28.428059895833332, "learning_rate": 9.801068566592485e-05, "loss": 7.7446, "loss/crossentropy": 2.141914916783571, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.20847442476078867, "step": 20950 }, { "epoch": 0.6986666666666667, "grad_norm": 28.75, "grad_norm_var": 4.476822916666666, "learning_rate": 9.796890451359894e-05, "loss": 7.8283, "loss/crossentropy": 2.0166680470108984, "loss/hidden": 3.70078125, "loss/jsd": 0.0, "loss/logits": 0.2056173078715801, "step": 20960 }, { "epoch": 0.699, "grad_norm": 28.625, "grad_norm_var": 4.481705729166666, "learning_rate": 9.792669927553271e-05, "loss": 7.6997, "loss/crossentropy": 2.0283680982887744, "loss/hidden": 3.576171875, "loss/jsd": 0.0, "loss/logits": 0.2040084034204483, "step": 20970 }, { "epoch": 0.6993333333333334, "grad_norm": 32.25, "grad_norm_var": 15.808072916666667, "learning_rate": 9.788407036827486e-05, "loss": 7.8985, "loss/crossentropy": 2.110441932082176, "loss/hidden": 3.62734375, "loss/jsd": 0.0, "loss/logits": 0.2083257043734193, "step": 20980 }, { "epoch": 0.6996666666666667, "grad_norm": 31.0, "grad_norm_var": 13.953125, "learning_rate": 9.784101821255546e-05, "loss": 7.8639, "loss/crossentropy": 2.1061757408082484, "loss/hidden": 3.59765625, "loss/jsd": 0.0, "loss/logits": 0.2161664988845587, "step": 20990 }, { "epoch": 0.7, "grad_norm": 30.25, "grad_norm_var": 2.733268229166667, "learning_rate": 9.779754323328192e-05, "loss": 7.6809, "loss/crossentropy": 1.968663776665926, "loss/hidden": 3.59453125, "loss/jsd": 0.0, "loss/logits": 0.20843339152634144, "step": 21000 }, { "epoch": 0.7003333333333334, "grad_norm": 33.75, "grad_norm_var": 4.762239583333334, "learning_rate": 9.775364585953473e-05, "loss": 7.7791, "loss/crossentropy": 2.0461377993226053, "loss/hidden": 3.669140625, "loss/jsd": 0.0, "loss/logits": 0.20731903873384, "step": 21010 }, { "epoch": 0.7006666666666667, "grad_norm": 28.75, "grad_norm_var": 5.0916015625, "learning_rate": 9.770932652456326e-05, "loss": 7.7952, "loss/crossentropy": 2.0643589437007903, "loss/hidden": 3.619140625, "loss/jsd": 0.0, "loss/logits": 0.2156803973019123, "step": 21020 }, { "epoch": 0.701, "grad_norm": 28.875, "grad_norm_var": 5.80625, "learning_rate": 9.766458566578143e-05, "loss": 7.773, "loss/crossentropy": 2.0716071873903275, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.2185486238449812, "step": 21030 }, { "epoch": 0.7013333333333334, "grad_norm": 38.0, "grad_norm_var": 6.7322265625, "learning_rate": 9.76194237247635e-05, "loss": 7.728, "loss/crossentropy": 2.182681308686733, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.1979642266407609, "step": 21040 }, { "epoch": 0.7016666666666667, "grad_norm": 30.25, "grad_norm_var": 6.1759765625, "learning_rate": 9.757384114723954e-05, "loss": 7.7438, "loss/crossentropy": 2.0533548787236215, "loss/hidden": 3.613671875, "loss/jsd": 0.0, "loss/logits": 0.21022803112864494, "step": 21050 }, { "epoch": 0.702, "grad_norm": 29.5, "grad_norm_var": 3.6103515625, "learning_rate": 9.752783838309123e-05, "loss": 7.7947, "loss/crossentropy": 2.148482698202133, "loss/hidden": 3.63828125, "loss/jsd": 0.0, "loss/logits": 0.2157578205689788, "step": 21060 }, { "epoch": 0.7023333333333334, "grad_norm": 27.875, "grad_norm_var": 2.6416666666666666, "learning_rate": 9.748141588634725e-05, "loss": 7.7997, "loss/crossentropy": 1.9508283972740172, "loss/hidden": 3.637890625, "loss/jsd": 0.0, "loss/logits": 0.20378196705132723, "step": 21070 }, { "epoch": 0.7026666666666667, "grad_norm": 31.75, "grad_norm_var": 23.627083333333335, "learning_rate": 9.743457411517892e-05, "loss": 7.8502, "loss/crossentropy": 2.063946034014225, "loss/hidden": 3.628515625, "loss/jsd": 0.0, "loss/logits": 0.22184212561696767, "step": 21080 }, { "epoch": 0.703, "grad_norm": 34.0, "grad_norm_var": 2.7603515625, "learning_rate": 9.738731353189558e-05, "loss": 7.7532, "loss/crossentropy": 1.9890851199626922, "loss/hidden": 3.553125, "loss/jsd": 0.0, "loss/logits": 0.19615612691268325, "step": 21090 }, { "epoch": 0.7033333333333334, "grad_norm": 30.0, "grad_norm_var": 2.9686848958333334, "learning_rate": 9.733963460294015e-05, "loss": 7.8208, "loss/crossentropy": 1.9296169601380826, "loss/hidden": 3.633984375, "loss/jsd": 0.0, "loss/logits": 0.2059748636558652, "step": 21100 }, { "epoch": 0.7036666666666667, "grad_norm": 31.25, "grad_norm_var": 41.9806640625, "learning_rate": 9.729153779888439e-05, "loss": 7.7385, "loss/crossentropy": 1.9927678152918815, "loss/hidden": 3.566796875, "loss/jsd": 0.0, "loss/logits": 0.19540305892005563, "step": 21110 }, { "epoch": 0.704, "grad_norm": 28.75, "grad_norm_var": 2.295768229166667, "learning_rate": 9.724302359442434e-05, "loss": 7.8334, "loss/crossentropy": 2.0166243493556975, "loss/hidden": 3.549609375, "loss/jsd": 0.0, "loss/logits": 0.20116372499614954, "step": 21120 }, { "epoch": 0.7043333333333334, "grad_norm": 29.125, "grad_norm_var": 1.8184895833333334, "learning_rate": 9.719409246837561e-05, "loss": 7.8302, "loss/crossentropy": 1.9066402643918992, "loss/hidden": 3.54140625, "loss/jsd": 0.0, "loss/logits": 0.1955884052440524, "step": 21130 }, { "epoch": 0.7046666666666667, "grad_norm": 29.625, "grad_norm_var": 14619.393489583334, "learning_rate": 9.714474490366866e-05, "loss": 7.917, "loss/crossentropy": 1.9063568994402886, "loss/hidden": 3.60859375, "loss/jsd": 0.0, "loss/logits": 0.1982713321223855, "step": 21140 }, { "epoch": 0.705, "grad_norm": 28.375, "grad_norm_var": 14504.148958333333, "learning_rate": 9.709498138734405e-05, "loss": 7.7312, "loss/crossentropy": 1.9017325207591056, "loss/hidden": 3.636328125, "loss/jsd": 0.0, "loss/logits": 0.19890787806361915, "step": 21150 }, { "epoch": 0.7053333333333334, "grad_norm": 28.875, "grad_norm_var": 19.811458333333334, "learning_rate": 9.704480241054755e-05, "loss": 7.6471, "loss/crossentropy": 1.9751385256648064, "loss/hidden": 3.6046875, "loss/jsd": 0.0, "loss/logits": 0.20594419166445732, "step": 21160 }, { "epoch": 0.7056666666666667, "grad_norm": 28.875, "grad_norm_var": 0.9884765625, "learning_rate": 9.699420846852544e-05, "loss": 7.7681, "loss/crossentropy": 2.0237425029277802, "loss/hidden": 3.624609375, "loss/jsd": 0.0, "loss/logits": 0.2027163729071617, "step": 21170 }, { "epoch": 0.706, "grad_norm": 29.5, "grad_norm_var": 0.8358723958333333, "learning_rate": 9.694320006061949e-05, "loss": 7.706, "loss/crossentropy": 1.9728805258870126, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.20415272628888487, "step": 21180 }, { "epoch": 0.7063333333333334, "grad_norm": 31.75, "grad_norm_var": 1.5285807291666667, "learning_rate": 9.689177769026211e-05, "loss": 7.7188, "loss/crossentropy": 2.114629329741001, "loss/hidden": 3.634375, "loss/jsd": 0.0, "loss/logits": 0.2147956196218729, "step": 21190 }, { "epoch": 0.7066666666666667, "grad_norm": 31.625, "grad_norm_var": 14.480989583333333, "learning_rate": 9.683994186497132e-05, "loss": 7.8607, "loss/crossentropy": 2.210577738285065, "loss/hidden": 3.62578125, "loss/jsd": 0.0, "loss/logits": 0.22065361309796572, "step": 21200 }, { "epoch": 0.707, "grad_norm": 28.75, "grad_norm_var": 15.8478515625, "learning_rate": 9.678769309634579e-05, "loss": 7.7987, "loss/crossentropy": 2.1382463231682776, "loss/hidden": 3.630078125, "loss/jsd": 0.0, "loss/logits": 0.20682547576725482, "step": 21210 }, { "epoch": 0.7073333333333334, "grad_norm": 50.0, "grad_norm_var": 27.705989583333334, "learning_rate": 9.673503190005977e-05, "loss": 7.7034, "loss/crossentropy": 1.9322528079152108, "loss/hidden": 3.531640625, "loss/jsd": 0.0, "loss/logits": 0.19704128410667182, "step": 21220 }, { "epoch": 0.7076666666666667, "grad_norm": 30.0, "grad_norm_var": 34.70358072916667, "learning_rate": 9.6681958795858e-05, "loss": 7.7049, "loss/crossentropy": 2.050449796766043, "loss/hidden": 3.621484375, "loss/jsd": 0.0, "loss/logits": 0.23184078792110085, "step": 21230 }, { "epoch": 0.708, "grad_norm": 31.375, "grad_norm_var": 2.04140625, "learning_rate": 9.66284743075506e-05, "loss": 7.8191, "loss/crossentropy": 2.2278542831540107, "loss/hidden": 3.62265625, "loss/jsd": 0.0, "loss/logits": 0.22254880461841822, "step": 21240 }, { "epoch": 0.7083333333333334, "grad_norm": 31.375, "grad_norm_var": 6.119791666666667, "learning_rate": 9.657457896300791e-05, "loss": 7.7321, "loss/crossentropy": 2.144664096832275, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.20551785565912722, "step": 21250 }, { "epoch": 0.7086666666666667, "grad_norm": 31.0, "grad_norm_var": 9.594791666666667, "learning_rate": 9.652027329415517e-05, "loss": 7.7328, "loss/crossentropy": 1.999681544303894, "loss/hidden": 3.6890625, "loss/jsd": 0.0, "loss/logits": 0.2114706352353096, "step": 21260 }, { "epoch": 0.709, "grad_norm": 28.375, "grad_norm_var": 268.0978515625, "learning_rate": 9.646555783696743e-05, "loss": 7.8, "loss/crossentropy": 2.0811921998858454, "loss/hidden": 3.57421875, "loss/jsd": 0.0, "loss/logits": 0.21599450353533028, "step": 21270 }, { "epoch": 0.7093333333333334, "grad_norm": 28.5, "grad_norm_var": 303.81458333333336, "learning_rate": 9.641043313146417e-05, "loss": 7.9496, "loss/crossentropy": 2.1750354193151, "loss/hidden": 3.63046875, "loss/jsd": 0.0, "loss/logits": 0.2116277886554599, "step": 21280 }, { "epoch": 0.7096666666666667, "grad_norm": 29.375, "grad_norm_var": 169.38125, "learning_rate": 9.635489972170397e-05, "loss": 7.6372, "loss/crossentropy": 2.006293947994709, "loss/hidden": 3.588671875, "loss/jsd": 0.0, "loss/logits": 0.19880817979574203, "step": 21290 }, { "epoch": 0.71, "grad_norm": 28.75, "grad_norm_var": 14.073958333333334, "learning_rate": 9.629895815577916e-05, "loss": 7.7662, "loss/crossentropy": 2.0567120373249055, "loss/hidden": 3.612890625, "loss/jsd": 0.0, "loss/logits": 0.2118833553045988, "step": 21300 }, { "epoch": 0.7103333333333334, "grad_norm": 28.75, "grad_norm_var": 14.435416666666667, "learning_rate": 9.62426089858104e-05, "loss": 7.7359, "loss/crossentropy": 2.0882872194051743, "loss/hidden": 3.63359375, "loss/jsd": 0.0, "loss/logits": 0.19910078253597022, "step": 21310 }, { "epoch": 0.7106666666666667, "grad_norm": 31.125, "grad_norm_var": 8.761393229166666, "learning_rate": 9.618585276794129e-05, "loss": 7.7853, "loss/crossentropy": 2.0896404944360256, "loss/hidden": 3.64921875, "loss/jsd": 0.0, "loss/logits": 0.22682881793007253, "step": 21320 }, { "epoch": 0.711, "grad_norm": 33.25, "grad_norm_var": 23.874739583333334, "learning_rate": 9.612869006233275e-05, "loss": 7.9966, "loss/crossentropy": 2.1692073047161102, "loss/hidden": 3.61796875, "loss/jsd": 0.0, "loss/logits": 0.2034358810633421, "step": 21330 }, { "epoch": 0.7113333333333334, "grad_norm": 32.25, "grad_norm_var": 18.7791015625, "learning_rate": 9.607112143315763e-05, "loss": 7.8539, "loss/crossentropy": 2.0817694112658502, "loss/hidden": 3.56171875, "loss/jsd": 0.0, "loss/logits": 0.2055924255400896, "step": 21340 }, { "epoch": 0.7116666666666667, "grad_norm": 31.625, "grad_norm_var": 9.392643229166667, "learning_rate": 9.601314744859504e-05, "loss": 7.5976, "loss/crossentropy": 2.005814277380705, "loss/hidden": 3.581640625, "loss/jsd": 0.0, "loss/logits": 0.2026340899989009, "step": 21350 }, { "epoch": 0.712, "grad_norm": 31.625, "grad_norm_var": 3.4900390625, "learning_rate": 9.595476868082481e-05, "loss": 7.8636, "loss/crossentropy": 1.9804232444614172, "loss/hidden": 3.69453125, "loss/jsd": 0.0, "loss/logits": 0.21351769701577722, "step": 21360 }, { "epoch": 0.7123333333333334, "grad_norm": 31.0, "grad_norm_var": 2.987434895833333, "learning_rate": 9.589598570602181e-05, "loss": 7.858, "loss/crossentropy": 2.1446668222546577, "loss/hidden": 3.575390625, "loss/jsd": 0.0, "loss/logits": 0.20793895637616516, "step": 21370 }, { "epoch": 0.7126666666666667, "grad_norm": 30.25, "grad_norm_var": 120.17545572916667, "learning_rate": 9.583679910435026e-05, "loss": 7.8126, "loss/crossentropy": 2.034381502121687, "loss/hidden": 3.66328125, "loss/jsd": 0.0, "loss/logits": 0.20835008025169371, "step": 21380 }, { "epoch": 0.713, "grad_norm": 34.25, "grad_norm_var": 112.87708333333333, "learning_rate": 9.577720945995803e-05, "loss": 7.8897, "loss/crossentropy": 2.081572139263153, "loss/hidden": 3.615625, "loss/jsd": 0.0, "loss/logits": 0.21697744037956, "step": 21390 }, { "epoch": 0.7133333333333334, "grad_norm": 28.875, "grad_norm_var": 119.82493489583334, "learning_rate": 9.571721736097089e-05, "loss": 7.6045, "loss/crossentropy": 2.171599693596363, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.19405515491962433, "step": 21400 }, { "epoch": 0.7136666666666667, "grad_norm": 30.0, "grad_norm_var": 20.370247395833335, "learning_rate": 9.565682339948657e-05, "loss": 7.8137, "loss/crossentropy": 2.099457284808159, "loss/hidden": 3.64609375, "loss/jsd": 0.0, "loss/logits": 0.20623879097402095, "step": 21410 }, { "epoch": 0.714, "grad_norm": 29.0, "grad_norm_var": 5.495768229166667, "learning_rate": 9.559602817156913e-05, "loss": 7.6481, "loss/crossentropy": 2.1219607055187226, "loss/hidden": 3.60234375, "loss/jsd": 0.0, "loss/logits": 0.20431581195443868, "step": 21420 }, { "epoch": 0.7143333333333334, "grad_norm": 29.25, "grad_norm_var": 2.7, "learning_rate": 9.553483227724292e-05, "loss": 7.7096, "loss/crossentropy": 2.0621854946017266, "loss/hidden": 3.661328125, "loss/jsd": 0.0, "loss/logits": 0.21732638962566853, "step": 21430 }, { "epoch": 0.7146666666666667, "grad_norm": 30.0, "grad_norm_var": 2.5171223958333333, "learning_rate": 9.54732363204867e-05, "loss": 7.5899, "loss/crossentropy": 2.1101118355989454, "loss/hidden": 3.526171875, "loss/jsd": 0.0, "loss/logits": 0.20486003924161195, "step": 21440 }, { "epoch": 0.715, "grad_norm": 29.0, "grad_norm_var": 4.364518229166666, "learning_rate": 9.54112409092277e-05, "loss": 7.7143, "loss/crossentropy": 2.039649748057127, "loss/hidden": 3.55, "loss/jsd": 0.0, "loss/logits": 0.20175143275409937, "step": 21450 }, { "epoch": 0.7153333333333334, "grad_norm": 30.75, "grad_norm_var": 3.331184895833333, "learning_rate": 9.534884665533563e-05, "loss": 7.6551, "loss/crossentropy": 2.087866473197937, "loss/hidden": 3.572265625, "loss/jsd": 0.0, "loss/logits": 0.20887105632573366, "step": 21460 }, { "epoch": 0.7156666666666667, "grad_norm": 31.375, "grad_norm_var": 3.495247395833333, "learning_rate": 9.528605417461653e-05, "loss": 7.6824, "loss/crossentropy": 1.9937364026904105, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19099010657519103, "step": 21470 }, { "epoch": 0.716, "grad_norm": 28.625, "grad_norm_var": 2.4607245902591734e+18, "learning_rate": 9.522286408680687e-05, "loss": 7.7331, "loss/crossentropy": 2.0533931560814382, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.19130754433572292, "step": 21480 }, { "epoch": 0.7163333333333334, "grad_norm": 28.875, "grad_norm_var": 6.831430466374188e+18, "learning_rate": 9.51592770155673e-05, "loss": 7.6974, "loss/crossentropy": 2.0633395805954935, "loss/hidden": 3.654296875, "loss/jsd": 0.0, "loss/logits": 0.220349186565727, "step": 21490 }, { "epoch": 0.7166666666666667, "grad_norm": 54.25, "grad_norm_var": 38.76640625, "learning_rate": 9.509529358847655e-05, "loss": 7.6228, "loss/crossentropy": 1.8947391845285892, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.19983619190752505, "step": 21500 }, { "epoch": 0.717, "grad_norm": 31.75, "grad_norm_var": 38.83326822916667, "learning_rate": 9.503091443702522e-05, "loss": 7.7001, "loss/crossentropy": 2.0377241536974906, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.19533433131873607, "step": 21510 }, { "epoch": 0.7173333333333334, "grad_norm": 29.625, "grad_norm_var": 3.3018229166666666, "learning_rate": 9.496614019660951e-05, "loss": 7.803, "loss/crossentropy": 2.1326826021075247, "loss/hidden": 3.651171875, "loss/jsd": 0.0, "loss/logits": 0.2108999377116561, "step": 21520 }, { "epoch": 0.7176666666666667, "grad_norm": 33.75, "grad_norm_var": 6.46640625, "learning_rate": 9.490097150652505e-05, "loss": 7.7089, "loss/crossentropy": 1.8233733780682087, "loss/hidden": 3.501171875, "loss/jsd": 0.0, "loss/logits": 0.17455916814506053, "step": 21530 }, { "epoch": 0.718, "grad_norm": 29.125, "grad_norm_var": 5.584309895833333, "learning_rate": 9.483540900996049e-05, "loss": 7.7816, "loss/crossentropy": 2.222658357024193, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.20714823082089423, "step": 21540 }, { "epoch": 0.7183333333333334, "grad_norm": 28.125, "grad_norm_var": 7.22890625, "learning_rate": 9.476945335399122e-05, "loss": 7.76, "loss/crossentropy": 2.0847948037087916, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19452479667961597, "step": 21550 }, { "epoch": 0.7186666666666667, "grad_norm": 59.0, "grad_norm_var": 2.899825550166275e+18, "learning_rate": 9.47031051895729e-05, "loss": 7.7464, "loss/crossentropy": 2.0613483414053917, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.2091040827333927, "step": 21560 }, { "epoch": 0.719, "grad_norm": 157.0, "grad_norm_var": 1006.6166015625, "learning_rate": 9.463636517153517e-05, "loss": 7.8657, "loss/crossentropy": 2.310398209095001, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.2095829950645566, "step": 21570 }, { "epoch": 0.7193333333333334, "grad_norm": 27.25, "grad_norm_var": 999.0166015625, "learning_rate": 9.456923395857503e-05, "loss": 7.7325, "loss/crossentropy": 1.9690666690468788, "loss/hidden": 3.622265625, "loss/jsd": 0.0, "loss/logits": 0.2094871997833252, "step": 21580 }, { "epoch": 0.7196666666666667, "grad_norm": 31.125, "grad_norm_var": 12.08515625, "learning_rate": 9.450171221325049e-05, "loss": 7.6418, "loss/crossentropy": 1.9917885288596153, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.1998442027717829, "step": 21590 }, { "epoch": 0.72, "grad_norm": 28.25, "grad_norm_var": 5.362434895833333, "learning_rate": 9.443380060197387e-05, "loss": 7.7914, "loss/crossentropy": 2.0604188472032545, "loss/hidden": 3.528515625, "loss/jsd": 0.0, "loss/logits": 0.20265566650778055, "step": 21600 }, { "epoch": 0.7203333333333334, "grad_norm": 114.5, "grad_norm_var": 447.34973958333336, "learning_rate": 9.436549979500539e-05, "loss": 7.693, "loss/crossentropy": 2.003858245909214, "loss/hidden": 3.709375, "loss/jsd": 0.0, "loss/logits": 0.22414386905729772, "step": 21610 }, { "epoch": 0.7206666666666667, "grad_norm": 26.625, "grad_norm_var": 453.5893229166667, "learning_rate": 9.42968104664464e-05, "loss": 7.6233, "loss/crossentropy": 2.0694996997714044, "loss/hidden": 3.617578125, "loss/jsd": 0.0, "loss/logits": 0.205183663405478, "step": 21620 }, { "epoch": 0.721, "grad_norm": 33.25, "grad_norm_var": 13.485416666666667, "learning_rate": 9.422773329423292e-05, "loss": 7.7632, "loss/crossentropy": 2.0694395408034323, "loss/hidden": 3.654296875, "loss/jsd": 0.0, "loss/logits": 0.20871220286935568, "step": 21630 }, { "epoch": 0.7213333333333334, "grad_norm": 32.5, "grad_norm_var": 3.9759765625, "learning_rate": 9.415826896012865e-05, "loss": 7.6489, "loss/crossentropy": 1.975140118598938, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.20482220761477948, "step": 21640 }, { "epoch": 0.7216666666666667, "grad_norm": 29.375, "grad_norm_var": 3.7978515625, "learning_rate": 9.408841814971861e-05, "loss": 7.6203, "loss/crossentropy": 2.039801698923111, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.19393778946250678, "step": 21650 }, { "epoch": 0.722, "grad_norm": 31.125, "grad_norm_var": 15.802018229166666, "learning_rate": 9.401818155240205e-05, "loss": 7.7741, "loss/crossentropy": 2.119242195785046, "loss/hidden": 3.559765625, "loss/jsd": 0.0, "loss/logits": 0.20300054959952832, "step": 21660 }, { "epoch": 0.7223333333333334, "grad_norm": 28.125, "grad_norm_var": 15.137239583333333, "learning_rate": 9.394755986138586e-05, "loss": 7.7166, "loss/crossentropy": 1.8970364406704903, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.20859413947910072, "step": 21670 }, { "epoch": 0.7226666666666667, "grad_norm": 27.125, "grad_norm_var": 2.8337890625, "learning_rate": 9.387655377367758e-05, "loss": 7.6903, "loss/crossentropy": 2.132764849066734, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.2067256074398756, "step": 21680 }, { "epoch": 0.723, "grad_norm": 28.5, "grad_norm_var": 21.7197265625, "learning_rate": 9.380516399007868e-05, "loss": 7.6587, "loss/crossentropy": 1.9144255444407463, "loss/hidden": 3.6828125, "loss/jsd": 0.0, "loss/logits": 0.2001453947275877, "step": 21690 }, { "epoch": 0.7233333333333334, "grad_norm": 30.0, "grad_norm_var": 5.901822916666666, "learning_rate": 9.373339121517747e-05, "loss": 7.7691, "loss/crossentropy": 2.2400635674595835, "loss/hidden": 3.60234375, "loss/jsd": 0.0, "loss/logits": 0.2141938941553235, "step": 21700 }, { "epoch": 0.7236666666666667, "grad_norm": 28.125, "grad_norm_var": 3.468053159442855e+18, "learning_rate": 9.366123615734227e-05, "loss": 7.7074, "loss/crossentropy": 2.0811144724488257, "loss/hidden": 3.775, "loss/jsd": 0.0, "loss/logits": 0.1989194665104151, "step": 21710 }, { "epoch": 0.724, "grad_norm": 28.125, "grad_norm_var": 5.074739583333334, "learning_rate": 9.358869952871436e-05, "loss": 7.6993, "loss/crossentropy": 2.1098939001560213, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.20545907951891423, "step": 21720 }, { "epoch": 0.7243333333333334, "grad_norm": 30.875, "grad_norm_var": 5.959375, "learning_rate": 9.351578204520099e-05, "loss": 7.5884, "loss/crossentropy": 1.8791275858879088, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.20307840630412102, "step": 21730 }, { "epoch": 0.7246666666666667, "grad_norm": 27.25, "grad_norm_var": 3.1426432291666666, "learning_rate": 9.344248442646829e-05, "loss": 7.6224, "loss/crossentropy": 2.0195631198585033, "loss/hidden": 3.648828125, "loss/jsd": 0.0, "loss/logits": 0.19278614791110157, "step": 21740 }, { "epoch": 0.725, "grad_norm": 28.25, "grad_norm_var": 2.1442057291666665, "learning_rate": 9.336880739593416e-05, "loss": 7.535, "loss/crossentropy": 2.1503761291503904, "loss/hidden": 3.5609375, "loss/jsd": 0.0, "loss/logits": 0.2019724454730749, "step": 21750 }, { "epoch": 0.7253333333333334, "grad_norm": 25.125, "grad_norm_var": 5.632291666666666, "learning_rate": 9.329475168076114e-05, "loss": 7.6548, "loss/crossentropy": 2.0659672379493714, "loss/hidden": 3.661328125, "loss/jsd": 0.0, "loss/logits": 0.21330699287354946, "step": 21760 }, { "epoch": 0.7256666666666667, "grad_norm": 30.5, "grad_norm_var": 9.28125, "learning_rate": 9.322031801184925e-05, "loss": 7.6069, "loss/crossentropy": 2.1512291483581065, "loss/hidden": 3.644140625, "loss/jsd": 0.0, "loss/logits": 0.21503518372774125, "step": 21770 }, { "epoch": 0.726, "grad_norm": 29.875, "grad_norm_var": 7.08125, "learning_rate": 9.314550712382875e-05, "loss": 7.6104, "loss/crossentropy": 2.112006691843271, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.19404661422595382, "step": 21780 }, { "epoch": 0.7263333333333334, "grad_norm": 28.75, "grad_norm_var": 3.4056640625, "learning_rate": 9.307031975505291e-05, "loss": 7.672, "loss/crossentropy": 2.076837729662657, "loss/hidden": 3.55859375, "loss/jsd": 0.0, "loss/logits": 0.1984027072787285, "step": 21790 }, { "epoch": 0.7266666666666667, "grad_norm": 28.75, "grad_norm_var": 1.8900390625, "learning_rate": 9.299475664759069e-05, "loss": 7.6271, "loss/crossentropy": 2.0399362690746785, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.19836630206555128, "step": 21800 }, { "epoch": 0.727, "grad_norm": 33.0, "grad_norm_var": 1.9166666666666667, "learning_rate": 9.291881854721946e-05, "loss": 7.6745, "loss/crossentropy": 2.128536182641983, "loss/hidden": 3.63671875, "loss/jsd": 0.0, "loss/logits": 0.21894673127681016, "step": 21810 }, { "epoch": 0.7273333333333334, "grad_norm": 29.125, "grad_norm_var": 1.7624348958333333, "learning_rate": 9.28425062034176e-05, "loss": 7.6159, "loss/crossentropy": 2.0389281518757345, "loss/hidden": 3.598828125, "loss/jsd": 0.0, "loss/logits": 0.21084144692867995, "step": 21820 }, { "epoch": 0.7276666666666667, "grad_norm": 29.125, "grad_norm_var": 2.925, "learning_rate": 9.276582036935717e-05, "loss": 7.6311, "loss/crossentropy": 2.02659250497818, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.21077424064278602, "step": 21830 }, { "epoch": 0.728, "grad_norm": 29.375, "grad_norm_var": 3.093489583333333, "learning_rate": 9.268876180189639e-05, "loss": 7.627, "loss/crossentropy": 2.014843727648258, "loss/hidden": 3.561328125, "loss/jsd": 0.0, "loss/logits": 0.21272912081331014, "step": 21840 }, { "epoch": 0.7283333333333334, "grad_norm": 28.125, "grad_norm_var": 5.7712890625, "learning_rate": 9.261133126157218e-05, "loss": 7.7058, "loss/crossentropy": 2.11625085696578, "loss/hidden": 3.67109375, "loss/jsd": 0.0, "loss/logits": 0.2178550474345684, "step": 21850 }, { "epoch": 0.7286666666666667, "grad_norm": 29.125, "grad_norm_var": 1.3955729166666666, "learning_rate": 9.253352951259271e-05, "loss": 7.7112, "loss/crossentropy": 2.1387905418872832, "loss/hidden": 3.59140625, "loss/jsd": 0.0, "loss/logits": 0.2076106144115329, "step": 21860 }, { "epoch": 0.729, "grad_norm": 29.75, "grad_norm_var": 1.2997395833333334, "learning_rate": 9.245535732282986e-05, "loss": 7.6483, "loss/crossentropy": 2.1955421969294546, "loss/hidden": 3.541796875, "loss/jsd": 0.0, "loss/logits": 0.21497300919145346, "step": 21870 }, { "epoch": 0.7293333333333333, "grad_norm": 28.375, "grad_norm_var": 2.2643229166666665, "learning_rate": 9.237681546381157e-05, "loss": 7.6649, "loss/crossentropy": 2.0390606805682183, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.1902286982163787, "step": 21880 }, { "epoch": 0.7296666666666667, "grad_norm": 30.625, "grad_norm_var": 2.988997395833333, "learning_rate": 9.229790471071429e-05, "loss": 7.7516, "loss/crossentropy": 2.15336195230484, "loss/hidden": 3.5890625, "loss/jsd": 0.0, "loss/logits": 0.2047728981822729, "step": 21890 }, { "epoch": 0.73, "grad_norm": 30.0, "grad_norm_var": 1.6747395833333334, "learning_rate": 9.221862584235528e-05, "loss": 7.7046, "loss/crossentropy": 2.0997920632362366, "loss/hidden": 3.599609375, "loss/jsd": 0.0, "loss/logits": 0.20689391866326332, "step": 21900 }, { "epoch": 0.7303333333333333, "grad_norm": 31.375, "grad_norm_var": 2.26640625, "learning_rate": 9.213897964118499e-05, "loss": 7.5344, "loss/crossentropy": 2.067221947014332, "loss/hidden": 3.539453125, "loss/jsd": 0.0, "loss/logits": 0.19843466561287643, "step": 21910 }, { "epoch": 0.7306666666666667, "grad_norm": 29.125, "grad_norm_var": 6.4853515625, "learning_rate": 9.205896689327923e-05, "loss": 7.737, "loss/crossentropy": 2.1428965769708155, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.2188850357197225, "step": 21920 }, { "epoch": 0.731, "grad_norm": 31.5, "grad_norm_var": 7.158333333333333, "learning_rate": 9.197858838833157e-05, "loss": 7.6879, "loss/crossentropy": 1.9083328664302825, "loss/hidden": 3.662109375, "loss/jsd": 0.0, "loss/logits": 0.20349522549659013, "step": 21930 }, { "epoch": 0.7313333333333333, "grad_norm": 31.125, "grad_norm_var": 2.8212890625, "learning_rate": 9.189784491964536e-05, "loss": 7.6476, "loss/crossentropy": 1.9690759629011154, "loss/hidden": 3.66640625, "loss/jsd": 0.0, "loss/logits": 0.21341784493997693, "step": 21940 }, { "epoch": 0.7316666666666667, "grad_norm": 26.875, "grad_norm_var": 3.5497395833333334, "learning_rate": 9.181673728412605e-05, "loss": 7.6774, "loss/crossentropy": 2.0544747814536093, "loss/hidden": 3.682421875, "loss/jsd": 0.0, "loss/logits": 0.21329349987208843, "step": 21950 }, { "epoch": 0.732, "grad_norm": 27.875, "grad_norm_var": 30.809830729166666, "learning_rate": 9.173526628227329e-05, "loss": 7.6327, "loss/crossentropy": 2.000888040661812, "loss/hidden": 3.57109375, "loss/jsd": 0.0, "loss/logits": 0.20834028851240874, "step": 21960 }, { "epoch": 0.7323333333333333, "grad_norm": 30.5, "grad_norm_var": 3.89765625, "learning_rate": 9.165343271817292e-05, "loss": 7.7272, "loss/crossentropy": 2.2294601082801817, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.20684596356004475, "step": 21970 }, { "epoch": 0.7326666666666667, "grad_norm": 35.75, "grad_norm_var": 3.42265625, "learning_rate": 9.157123739948924e-05, "loss": 7.6675, "loss/crossentropy": 2.032812249660492, "loss/hidden": 3.5453125, "loss/jsd": 0.0, "loss/logits": 0.21877431515604256, "step": 21980 }, { "epoch": 0.733, "grad_norm": 31.625, "grad_norm_var": 3.84140625, "learning_rate": 9.148868113745681e-05, "loss": 7.7739, "loss/crossentropy": 1.9926821939647197, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.20471897087991237, "step": 21990 }, { "epoch": 0.7333333333333333, "grad_norm": 29.375, "grad_norm_var": 4.79140625, "learning_rate": 9.140576474687264e-05, "loss": 7.6612, "loss/crossentropy": 2.0667272046208383, "loss/hidden": 3.5640625, "loss/jsd": 0.0, "loss/logits": 0.2066561786457896, "step": 22000 }, { "epoch": 0.7336666666666667, "grad_norm": 29.375, "grad_norm_var": 5.839518229166667, "learning_rate": 9.132248904608801e-05, "loss": 7.678, "loss/crossentropy": 2.0169097036123276, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.1938495047390461, "step": 22010 }, { "epoch": 0.734, "grad_norm": 28.25, "grad_norm_var": 1.7587890625, "learning_rate": 9.123885485700049e-05, "loss": 7.5806, "loss/crossentropy": 2.1955192267894743, "loss/hidden": 3.57734375, "loss/jsd": 0.0, "loss/logits": 0.21994279995560645, "step": 22020 }, { "epoch": 0.7343333333333333, "grad_norm": 29.625, "grad_norm_var": 908.9122395833333, "learning_rate": 9.115486300504575e-05, "loss": 7.7442, "loss/crossentropy": 2.1037651874125003, "loss/hidden": 3.608984375, "loss/jsd": 0.0, "loss/logits": 0.20334150791168212, "step": 22030 }, { "epoch": 0.7346666666666667, "grad_norm": 27.875, "grad_norm_var": 9.395572916666667, "learning_rate": 9.107051431918944e-05, "loss": 7.7365, "loss/crossentropy": 2.210801270604134, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.19594964981079102, "step": 22040 }, { "epoch": 0.735, "grad_norm": 28.25, "grad_norm_var": 3.0942057291666667, "learning_rate": 9.098580963191908e-05, "loss": 7.7258, "loss/crossentropy": 2.0567986249923704, "loss/hidden": 3.58203125, "loss/jsd": 0.0, "loss/logits": 0.19552275333553554, "step": 22050 }, { "epoch": 0.7353333333333333, "grad_norm": 28.0, "grad_norm_var": 1.80390625, "learning_rate": 9.09007497792357e-05, "loss": 7.6263, "loss/crossentropy": 2.0325539082288744, "loss/hidden": 3.584375, "loss/jsd": 0.0, "loss/logits": 0.19655003491789103, "step": 22060 }, { "epoch": 0.7356666666666667, "grad_norm": 29.375, "grad_norm_var": 1.70390625, "learning_rate": 9.08153356006457e-05, "loss": 7.6428, "loss/crossentropy": 1.9758184522390365, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.180863216239959, "step": 22070 }, { "epoch": 0.736, "grad_norm": 28.75, "grad_norm_var": 3.6546223958333335, "learning_rate": 9.07295679391526e-05, "loss": 7.6479, "loss/crossentropy": 2.0911218881607057, "loss/hidden": 3.5734375, "loss/jsd": 0.0, "loss/logits": 0.21416922919452192, "step": 22080 }, { "epoch": 0.7363333333333333, "grad_norm": 29.375, "grad_norm_var": 2.2997395833333334, "learning_rate": 9.064344764124852e-05, "loss": 7.5806, "loss/crossentropy": 1.9327972888946534, "loss/hidden": 3.623828125, "loss/jsd": 0.0, "loss/logits": 0.19096189700067043, "step": 22090 }, { "epoch": 0.7366666666666667, "grad_norm": 26.625, "grad_norm_var": 2.6122395833333334, "learning_rate": 9.055697555690608e-05, "loss": 7.6489, "loss/crossentropy": 2.1428242295980455, "loss/hidden": 3.5375, "loss/jsd": 0.0, "loss/logits": 0.21426594704389573, "step": 22100 }, { "epoch": 0.737, "grad_norm": 28.125, "grad_norm_var": 2.536393229166667, "learning_rate": 9.047015253956981e-05, "loss": 7.6099, "loss/crossentropy": 2.2596158146858216, "loss/hidden": 3.55703125, "loss/jsd": 0.0, "loss/logits": 0.21648634187877178, "step": 22110 }, { "epoch": 0.7373333333333333, "grad_norm": 28.375, "grad_norm_var": 2.1442057291666665, "learning_rate": 9.038297944614785e-05, "loss": 7.7341, "loss/crossentropy": 2.044772403687239, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.18932544207200408, "step": 22120 }, { "epoch": 0.7376666666666667, "grad_norm": 32.5, "grad_norm_var": 3.3999348958333333, "learning_rate": 9.029545713700346e-05, "loss": 7.6648, "loss/crossentropy": 1.961163030564785, "loss/hidden": 3.614453125, "loss/jsd": 0.0, "loss/logits": 0.1884706408716738, "step": 22130 }, { "epoch": 0.738, "grad_norm": 28.0, "grad_norm_var": 3.909375, "learning_rate": 9.020758647594646e-05, "loss": 7.5784, "loss/crossentropy": 1.8796802014112473, "loss/hidden": 3.5046875, "loss/jsd": 0.0, "loss/logits": 0.18157810363918542, "step": 22140 }, { "epoch": 0.7383333333333333, "grad_norm": 28.0, "grad_norm_var": 3.0444333217627853e+18, "learning_rate": 9.011936833022484e-05, "loss": 7.7373, "loss/crossentropy": 2.1361525490880013, "loss/hidden": 3.5359375, "loss/jsd": 0.0, "loss/logits": 0.20192783158272504, "step": 22150 }, { "epoch": 0.7386666666666667, "grad_norm": 32.0, "grad_norm_var": 5.5259765625, "learning_rate": 9.003080357051607e-05, "loss": 7.5862, "loss/crossentropy": 2.1341183722019195, "loss/hidden": 3.521484375, "loss/jsd": 0.0, "loss/logits": 0.21967477165162563, "step": 22160 }, { "epoch": 0.739, "grad_norm": 28.5, "grad_norm_var": 1.7645182291666666, "learning_rate": 8.994189307091854e-05, "loss": 7.6545, "loss/crossentropy": 2.0633115977048875, "loss/hidden": 3.61796875, "loss/jsd": 0.0, "loss/logits": 0.193378933891654, "step": 22170 }, { "epoch": 0.7393333333333333, "grad_norm": 32.25, "grad_norm_var": 1.4718098958333334, "learning_rate": 8.985263770894302e-05, "loss": 7.7163, "loss/crossentropy": 2.150431227684021, "loss/hidden": 3.693359375, "loss/jsd": 0.0, "loss/logits": 0.21285873763263224, "step": 22180 }, { "epoch": 0.7396666666666667, "grad_norm": 28.625, "grad_norm_var": 16.570768229166667, "learning_rate": 8.97630383655039e-05, "loss": 7.6727, "loss/crossentropy": 1.9807396337389946, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.19658283134922386, "step": 22190 }, { "epoch": 0.74, "grad_norm": 27.25, "grad_norm_var": 19.762239583333333, "learning_rate": 8.967309592491052e-05, "loss": 7.5625, "loss/crossentropy": 2.1757678367197513, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.2128155424259603, "step": 22200 }, { "epoch": 0.7403333333333333, "grad_norm": 30.375, "grad_norm_var": 1.4488932291666667, "learning_rate": 8.958281127485845e-05, "loss": 7.5472, "loss/crossentropy": 2.061194130033255, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.1889759385958314, "step": 22210 }, { "epoch": 0.7406666666666667, "grad_norm": 27.625, "grad_norm_var": 2.7952473958333335, "learning_rate": 8.949218530642075e-05, "loss": 7.6151, "loss/crossentropy": 2.0728229813277723, "loss/hidden": 3.4953125, "loss/jsd": 0.0, "loss/logits": 0.1952021485194564, "step": 22220 }, { "epoch": 0.741, "grad_norm": 29.75, "grad_norm_var": 32.40807291666667, "learning_rate": 8.940121891403912e-05, "loss": 7.5998, "loss/crossentropy": 2.088392072916031, "loss/hidden": 3.605078125, "loss/jsd": 0.0, "loss/logits": 0.1989578979089856, "step": 22230 }, { "epoch": 0.7413333333333333, "grad_norm": 29.125, "grad_norm_var": 16.971809895833335, "learning_rate": 8.930991299551515e-05, "loss": 7.678, "loss/crossentropy": 2.089249915629625, "loss/hidden": 3.569140625, "loss/jsd": 0.0, "loss/logits": 0.20038176514208317, "step": 22240 }, { "epoch": 0.7416666666666667, "grad_norm": 29.75, "grad_norm_var": 3.230208333333333, "learning_rate": 8.921826845200139e-05, "loss": 7.5722, "loss/crossentropy": 2.1239826932549475, "loss/hidden": 3.61953125, "loss/jsd": 0.0, "loss/logits": 0.2005884603597224, "step": 22250 }, { "epoch": 0.742, "grad_norm": 30.625, "grad_norm_var": 2.9854166666666666, "learning_rate": 8.91262861879925e-05, "loss": 7.6025, "loss/crossentropy": 2.14085738658905, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2137705808505416, "step": 22260 }, { "epoch": 0.7423333333333333, "grad_norm": 27.25, "grad_norm_var": 2.343489583333333, "learning_rate": 8.903396711131624e-05, "loss": 7.6046, "loss/crossentropy": 1.8913337871432305, "loss/hidden": 3.537890625, "loss/jsd": 0.0, "loss/logits": 0.1923683611676097, "step": 22270 }, { "epoch": 0.7426666666666667, "grad_norm": 29.5, "grad_norm_var": 1.8832682291666667, "learning_rate": 8.894131213312467e-05, "loss": 7.5532, "loss/crossentropy": 1.9501267954707147, "loss/hidden": 3.5484375, "loss/jsd": 0.0, "loss/logits": 0.19819272067397833, "step": 22280 }, { "epoch": 0.743, "grad_norm": 28.875, "grad_norm_var": 2.819205729166667, "learning_rate": 8.884832216788501e-05, "loss": 7.6744, "loss/crossentropy": 2.220875917375088, "loss/hidden": 3.469921875, "loss/jsd": 0.0, "loss/logits": 0.19938996117562055, "step": 22290 }, { "epoch": 0.7433333333333333, "grad_norm": 30.0, "grad_norm_var": 1.6025390625, "learning_rate": 8.875499813337069e-05, "loss": 7.5482, "loss/crossentropy": 2.1203695118427275, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.19239178942516447, "step": 22300 }, { "epoch": 0.7436666666666667, "grad_norm": 26.625, "grad_norm_var": 8.838541666666666, "learning_rate": 8.866134095065222e-05, "loss": 7.6747, "loss/crossentropy": 2.07410399466753, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.1968332275748253, "step": 22310 }, { "epoch": 0.744, "grad_norm": 29.625, "grad_norm_var": 8.917122395833333, "learning_rate": 8.85673515440882e-05, "loss": 7.5404, "loss/crossentropy": 2.104242541640997, "loss/hidden": 3.562890625, "loss/jsd": 0.0, "loss/logits": 0.20138515261933207, "step": 22320 }, { "epoch": 0.7443333333333333, "grad_norm": 27.25, "grad_norm_var": 2.3087890625, "learning_rate": 8.847303084131613e-05, "loss": 7.679, "loss/crossentropy": 2.076655426621437, "loss/hidden": 3.521484375, "loss/jsd": 0.0, "loss/logits": 0.20225820317864418, "step": 22330 }, { "epoch": 0.7446666666666667, "grad_norm": 28.25, "grad_norm_var": 1.4270182291666667, "learning_rate": 8.837837977324328e-05, "loss": 7.5549, "loss/crossentropy": 1.97199331484735, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.18509145381394773, "step": 22340 }, { "epoch": 0.745, "grad_norm": 29.625, "grad_norm_var": 0.9, "learning_rate": 8.828339927403745e-05, "loss": 7.5815, "loss/crossentropy": 2.1016619965434074, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.2104162724688649, "step": 22350 }, { "epoch": 0.7453333333333333, "grad_norm": 27.25, "grad_norm_var": 1.4723307291666667, "learning_rate": 8.818809028111783e-05, "loss": 7.7285, "loss/crossentropy": 2.034029767662287, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.18854013606905937, "step": 22360 }, { "epoch": 0.7456666666666667, "grad_norm": 28.5, "grad_norm_var": 11.351822916666666, "learning_rate": 8.809245373514572e-05, "loss": 7.5918, "loss/crossentropy": 2.0507765501737594, "loss/hidden": 3.602734375, "loss/jsd": 0.0, "loss/logits": 0.21991799995303155, "step": 22370 }, { "epoch": 0.746, "grad_norm": 27.875, "grad_norm_var": 10.278059895833334, "learning_rate": 8.799649058001521e-05, "loss": 7.6398, "loss/crossentropy": 2.0420378386974334, "loss/hidden": 3.62265625, "loss/jsd": 0.0, "loss/logits": 0.19834638610482216, "step": 22380 }, { "epoch": 0.7463333333333333, "grad_norm": 29.875, "grad_norm_var": 139.56139322916667, "learning_rate": 8.79002017628439e-05, "loss": 7.6281, "loss/crossentropy": 2.114726561307907, "loss/hidden": 3.511328125, "loss/jsd": 0.0, "loss/logits": 0.1945993335917592, "step": 22390 }, { "epoch": 0.7466666666666667, "grad_norm": 27.25, "grad_norm_var": 143.01764322916668, "learning_rate": 8.780358823396352e-05, "loss": 7.6033, "loss/crossentropy": 2.1299582980573177, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.21268012626096605, "step": 22400 }, { "epoch": 0.747, "grad_norm": 28.5, "grad_norm_var": 24.453059895833334, "learning_rate": 8.770665094691064e-05, "loss": 7.6004, "loss/crossentropy": 2.09407639503479, "loss/hidden": 3.528125, "loss/jsd": 0.0, "loss/logits": 0.20005517825484276, "step": 22410 }, { "epoch": 0.7473333333333333, "grad_norm": 68.5, "grad_norm_var": 109.85807291666667, "learning_rate": 8.76093908584171e-05, "loss": 7.6905, "loss/crossentropy": 2.253283692896366, "loss/hidden": 3.551171875, "loss/jsd": 0.0, "loss/logits": 0.21519504617899657, "step": 22420 }, { "epoch": 0.7476666666666667, "grad_norm": 29.875, "grad_norm_var": 100.96920572916666, "learning_rate": 8.751180892840074e-05, "loss": 7.53, "loss/crossentropy": 1.9626074001193046, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.18422856479883193, "step": 22430 }, { "epoch": 0.748, "grad_norm": 27.25, "grad_norm_var": 3.2129557291666666, "learning_rate": 8.741390611995581e-05, "loss": 7.4322, "loss/crossentropy": 1.8774556368589401, "loss/hidden": 3.54375, "loss/jsd": 0.0, "loss/logits": 0.21066911737434565, "step": 22440 }, { "epoch": 0.7483333333333333, "grad_norm": 30.0, "grad_norm_var": 3.0580729166666667, "learning_rate": 8.731568339934349e-05, "loss": 7.611, "loss/crossentropy": 2.0886680990457536, "loss/hidden": 3.53984375, "loss/jsd": 0.0, "loss/logits": 0.20858072843402625, "step": 22450 }, { "epoch": 0.7486666666666667, "grad_norm": 26.625, "grad_norm_var": 3.0931640625, "learning_rate": 8.72171417359824e-05, "loss": 7.4627, "loss/crossentropy": 2.0577007859945295, "loss/hidden": 3.546484375, "loss/jsd": 0.0, "loss/logits": 0.19628361649811268, "step": 22460 }, { "epoch": 0.749, "grad_norm": 30.625, "grad_norm_var": 4.778059895833334, "learning_rate": 8.711828210243896e-05, "loss": 7.5627, "loss/crossentropy": 2.1247499108314516, "loss/hidden": 3.594140625, "loss/jsd": 0.0, "loss/logits": 0.2208824411034584, "step": 22470 }, { "epoch": 0.7493333333333333, "grad_norm": 28.75, "grad_norm_var": 3.2431640625, "learning_rate": 8.701910547441786e-05, "loss": 7.5314, "loss/crossentropy": 2.1249613009393213, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.18236859384924173, "step": 22480 }, { "epoch": 0.7496666666666667, "grad_norm": 28.875, "grad_norm_var": 0.9884765625, "learning_rate": 8.691961283075233e-05, "loss": 7.7002, "loss/crossentropy": 2.0473913952708243, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.19657958708703518, "step": 22490 }, { "epoch": 0.75, "grad_norm": 27.875, "grad_norm_var": 1.3979166666666667, "learning_rate": 8.681980515339464e-05, "loss": 7.545, "loss/crossentropy": 2.0645889565348625, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.21940149031579495, "step": 22500 }, { "epoch": 0.7503333333333333, "grad_norm": 29.625, "grad_norm_var": 0.8614583333333333, "learning_rate": 8.671968342740627e-05, "loss": 7.5906, "loss/crossentropy": 2.134204125404358, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.1961730806156993, "step": 22510 }, { "epoch": 0.7506666666666667, "grad_norm": 27.125, "grad_norm_var": 2.3247395833333333, "learning_rate": 8.661924864094822e-05, "loss": 7.6235, "loss/crossentropy": 1.9457140512764455, "loss/hidden": 3.56953125, "loss/jsd": 0.0, "loss/logits": 0.1988594863563776, "step": 22520 }, { "epoch": 0.751, "grad_norm": 31.0, "grad_norm_var": 6.930208333333334, "learning_rate": 8.65185017852713e-05, "loss": 7.5477, "loss/crossentropy": 2.175088369846344, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.19464628268033266, "step": 22530 }, { "epoch": 0.7513333333333333, "grad_norm": 27.875, "grad_norm_var": 6.657291666666667, "learning_rate": 8.641744385470628e-05, "loss": 7.5943, "loss/crossentropy": 1.9843165129423141, "loss/hidden": 3.610546875, "loss/jsd": 0.0, "loss/logits": 0.20345317116007208, "step": 22540 }, { "epoch": 0.7516666666666667, "grad_norm": 30.25, "grad_norm_var": 1.2, "learning_rate": 8.631607584665414e-05, "loss": 7.5538, "loss/crossentropy": 2.139354394376278, "loss/hidden": 3.590234375, "loss/jsd": 0.0, "loss/logits": 0.2131027102470398, "step": 22550 }, { "epoch": 0.752, "grad_norm": 30.875, "grad_norm_var": 3.14140625, "learning_rate": 8.621439876157622e-05, "loss": 7.5358, "loss/crossentropy": 1.9600604437291622, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.19278565216809512, "step": 22560 }, { "epoch": 0.7523333333333333, "grad_norm": 31.75, "grad_norm_var": 3.1697265625, "learning_rate": 8.611241360298429e-05, "loss": 7.601, "loss/crossentropy": 2.1606590077281, "loss/hidden": 3.551171875, "loss/jsd": 0.0, "loss/logits": 0.20311268288642167, "step": 22570 }, { "epoch": 0.7526666666666667, "grad_norm": 27.75, "grad_norm_var": 1.6947916666666667, "learning_rate": 8.601012137743069e-05, "loss": 7.4441, "loss/crossentropy": 1.9806740552186965, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.1898936064913869, "step": 22580 }, { "epoch": 0.753, "grad_norm": 26.75, "grad_norm_var": 1.121875, "learning_rate": 8.590752309449837e-05, "loss": 7.5579, "loss/crossentropy": 2.1649864450097085, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.1936817906796932, "step": 22590 }, { "epoch": 0.7533333333333333, "grad_norm": 28.875, "grad_norm_var": 1.2957682291666666, "learning_rate": 8.5804619766791e-05, "loss": 7.4623, "loss/crossentropy": 1.9912237107753754, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.17677724361419678, "step": 22600 }, { "epoch": 0.7536666666666667, "grad_norm": 34.5, "grad_norm_var": 4.583072916666667, "learning_rate": 8.570141240992285e-05, "loss": 7.5052, "loss/crossentropy": 1.9258248887956142, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.1718863126821816, "step": 22610 }, { "epoch": 0.754, "grad_norm": 27.25, "grad_norm_var": 4.39765625, "learning_rate": 8.559790204250887e-05, "loss": 7.6286, "loss/crossentropy": 1.920270534604788, "loss/hidden": 3.58203125, "loss/jsd": 0.0, "loss/logits": 0.21592878960072995, "step": 22620 }, { "epoch": 0.7543333333333333, "grad_norm": 28.375, "grad_norm_var": 3.350455729166667, "learning_rate": 8.549408968615461e-05, "loss": 7.4976, "loss/crossentropy": 1.9794688627123833, "loss/hidden": 3.555859375, "loss/jsd": 0.0, "loss/logits": 0.20622843131422997, "step": 22630 }, { "epoch": 0.7546666666666667, "grad_norm": 27.75, "grad_norm_var": 3.265625, "learning_rate": 8.53899763654461e-05, "loss": 7.5832, "loss/crossentropy": 2.1832057766616346, "loss/hidden": 3.578515625, "loss/jsd": 0.0, "loss/logits": 0.20718610547482968, "step": 22640 }, { "epoch": 0.755, "grad_norm": 30.625, "grad_norm_var": 7.5572265625, "learning_rate": 8.52855631079398e-05, "loss": 7.5354, "loss/crossentropy": 2.0273820132017137, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.1982576385140419, "step": 22650 }, { "epoch": 0.7553333333333333, "grad_norm": 28.0, "grad_norm_var": 7.424739583333333, "learning_rate": 8.51808509441524e-05, "loss": 7.6259, "loss/crossentropy": 2.224448761343956, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.21609959285706282, "step": 22660 }, { "epoch": 0.7556666666666667, "grad_norm": 27.0, "grad_norm_var": 1.3926432291666666, "learning_rate": 8.507584090755069e-05, "loss": 7.541, "loss/crossentropy": 2.052679204940796, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.2042101456783712, "step": 22670 }, { "epoch": 0.756, "grad_norm": 29.25, "grad_norm_var": 1.3041015625, "learning_rate": 8.497053403454133e-05, "loss": 7.4877, "loss/crossentropy": 2.0474780216813087, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.19255555672571062, "step": 22680 }, { "epoch": 0.7563333333333333, "grad_norm": 27.375, "grad_norm_var": 1.6796223958333334, "learning_rate": 8.486493136446064e-05, "loss": 7.6495, "loss/crossentropy": 2.1654643058776855, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.19368699844926596, "step": 22690 }, { "epoch": 0.7566666666666667, "grad_norm": 35.75, "grad_norm_var": 8.53515625, "learning_rate": 8.475903393956434e-05, "loss": 7.699, "loss/crossentropy": 2.2046103149652483, "loss/hidden": 3.51484375, "loss/jsd": 0.0, "loss/logits": 0.20576203987002373, "step": 22700 }, { "epoch": 0.757, "grad_norm": 28.375, "grad_norm_var": 4.378059895833333, "learning_rate": 8.465284280501728e-05, "loss": 7.673, "loss/crossentropy": 2.0582919239997866, "loss/hidden": 3.466796875, "loss/jsd": 0.0, "loss/logits": 0.19977389723062516, "step": 22710 }, { "epoch": 0.7573333333333333, "grad_norm": 27.25, "grad_norm_var": 1.103125, "learning_rate": 8.454635900888305e-05, "loss": 7.503, "loss/crossentropy": 2.1073424354195596, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.2041914898902178, "step": 22720 }, { "epoch": 0.7576666666666667, "grad_norm": 28.5, "grad_norm_var": 3.9934895833333335, "learning_rate": 8.443958360211376e-05, "loss": 7.5862, "loss/crossentropy": 1.996552325040102, "loss/hidden": 3.50234375, "loss/jsd": 0.0, "loss/logits": 0.19313392527401446, "step": 22730 }, { "epoch": 0.758, "grad_norm": 25.375, "grad_norm_var": 12.517643229166667, "learning_rate": 8.433251763853955e-05, "loss": 7.5433, "loss/crossentropy": 2.0218321952968834, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.17924583591520787, "step": 22740 }, { "epoch": 0.7583333333333333, "grad_norm": 30.0, "grad_norm_var": 13.1822265625, "learning_rate": 8.422516217485826e-05, "loss": 7.4756, "loss/crossentropy": 1.9910611510276794, "loss/hidden": 3.531640625, "loss/jsd": 0.0, "loss/logits": 0.19828474670648574, "step": 22750 }, { "epoch": 0.7586666666666667, "grad_norm": 30.5, "grad_norm_var": 4.151497395833333, "learning_rate": 8.4117518270625e-05, "loss": 7.457, "loss/crossentropy": 2.0528080210089685, "loss/hidden": 3.537890625, "loss/jsd": 0.0, "loss/logits": 0.19192924145609142, "step": 22760 }, { "epoch": 0.759, "grad_norm": 27.5, "grad_norm_var": 3.5759765625, "learning_rate": 8.400958698824161e-05, "loss": 7.5715, "loss/crossentropy": 1.9694609761238098, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.20183660499751568, "step": 22770 }, { "epoch": 0.7593333333333333, "grad_norm": 27.875, "grad_norm_var": 3.09375, "learning_rate": 8.390136939294631e-05, "loss": 7.526, "loss/crossentropy": 2.0170742586255073, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.21597461104393006, "step": 22780 }, { "epoch": 0.7596666666666667, "grad_norm": 28.125, "grad_norm_var": 5.36640625, "learning_rate": 8.379286655280302e-05, "loss": 7.6393, "loss/crossentropy": 2.0312787666916847, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.19151955414563418, "step": 22790 }, { "epoch": 0.76, "grad_norm": 29.625, "grad_norm_var": 3.6395833333333334, "learning_rate": 8.368407953869104e-05, "loss": 7.4895, "loss/crossentropy": 2.09163758456707, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.19382547289133073, "step": 22800 }, { "epoch": 0.7603333333333333, "grad_norm": 30.5, "grad_norm_var": 2.655143229166667, "learning_rate": 8.357500942429424e-05, "loss": 7.5593, "loss/crossentropy": 2.095877369493246, "loss/hidden": 3.53515625, "loss/jsd": 0.0, "loss/logits": 0.20801592376083136, "step": 22810 }, { "epoch": 0.7606666666666667, "grad_norm": 29.5, "grad_norm_var": 1.4619140625, "learning_rate": 8.34656572860906e-05, "loss": 7.7253, "loss/crossentropy": 2.164764193445444, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.20626930426806211, "step": 22820 }, { "epoch": 0.761, "grad_norm": 28.5, "grad_norm_var": 0.8291666666666667, "learning_rate": 8.335602420334162e-05, "loss": 7.5984, "loss/crossentropy": 2.2208146676421165, "loss/hidden": 3.469921875, "loss/jsd": 0.0, "loss/logits": 0.1930639874190092, "step": 22830 }, { "epoch": 0.7613333333333333, "grad_norm": 27.375, "grad_norm_var": 1.4035807291666667, "learning_rate": 8.324611125808153e-05, "loss": 7.323, "loss/crossentropy": 2.047605223953724, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.1907837161794305, "step": 22840 }, { "epoch": 0.7616666666666667, "grad_norm": 26.0, "grad_norm_var": 1.9572916666666667, "learning_rate": 8.313591953510675e-05, "loss": 7.4896, "loss/crossentropy": 1.9626680858433248, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.18827831279486418, "step": 22850 }, { "epoch": 0.762, "grad_norm": 30.75, "grad_norm_var": 1.478125, "learning_rate": 8.302545012196506e-05, "loss": 7.4469, "loss/crossentropy": 2.024528782069683, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.1860325404442847, "step": 22860 }, { "epoch": 0.7623333333333333, "grad_norm": 27.5, "grad_norm_var": 1.5614583333333334, "learning_rate": 8.291470410894503e-05, "loss": 7.4275, "loss/crossentropy": 2.144756194204092, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.18779438687488437, "step": 22870 }, { "epoch": 0.7626666666666667, "grad_norm": 27.25, "grad_norm_var": 0.7822916666666667, "learning_rate": 8.280368258906505e-05, "loss": 7.4683, "loss/crossentropy": 1.9406724080443383, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.1975240783765912, "step": 22880 }, { "epoch": 0.763, "grad_norm": 25.875, "grad_norm_var": 1.4681640625, "learning_rate": 8.269238665806273e-05, "loss": 7.5113, "loss/crossentropy": 2.1385065048933027, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.19636416397988796, "step": 22890 }, { "epoch": 0.7633333333333333, "grad_norm": 27.625, "grad_norm_var": 8.433268229166666, "learning_rate": 8.258081741438395e-05, "loss": 7.4728, "loss/crossentropy": 2.0857705280184744, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.2008900310844183, "step": 22900 }, { "epoch": 0.7636666666666667, "grad_norm": 30.625, "grad_norm_var": 1.9275390625, "learning_rate": 8.246897595917212e-05, "loss": 7.5127, "loss/crossentropy": 2.101105071604252, "loss/hidden": 3.49140625, "loss/jsd": 0.0, "loss/logits": 0.2026148896664381, "step": 22910 }, { "epoch": 0.764, "grad_norm": 28.75, "grad_norm_var": 1.1254557291666667, "learning_rate": 8.235686339625725e-05, "loss": 7.4431, "loss/crossentropy": 2.023197513818741, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.20429725479334593, "step": 22920 }, { "epoch": 0.7643333333333333, "grad_norm": 29.625, "grad_norm_var": 1.9676432291666666, "learning_rate": 8.224448083214506e-05, "loss": 7.4991, "loss/crossentropy": 2.096772846579552, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.19150063805282116, "step": 22930 }, { "epoch": 0.7646666666666667, "grad_norm": 27.25, "grad_norm_var": 1.6228515625, "learning_rate": 8.213182937600612e-05, "loss": 7.4012, "loss/crossentropy": 1.9597499519586563, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.17938691582530736, "step": 22940 }, { "epoch": 0.765, "grad_norm": 30.125, "grad_norm_var": 2.763541666666667, "learning_rate": 8.201891013966478e-05, "loss": 7.4707, "loss/crossentropy": 1.9817360505461692, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.19509880822151898, "step": 22950 }, { "epoch": 0.7653333333333333, "grad_norm": 35.25, "grad_norm_var": 9.289518229166667, "learning_rate": 8.190572423758835e-05, "loss": 7.5923, "loss/crossentropy": 2.0987179767340423, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.18913752851076424, "step": 22960 }, { "epoch": 0.7656666666666667, "grad_norm": 27.0, "grad_norm_var": 8.620247395833333, "learning_rate": 8.179227278687598e-05, "loss": 7.5594, "loss/crossentropy": 2.007404398918152, "loss/hidden": 3.62578125, "loss/jsd": 0.0, "loss/logits": 0.22985132094472646, "step": 22970 }, { "epoch": 0.766, "grad_norm": 32.0, "grad_norm_var": 2.2978515625, "learning_rate": 8.167855690724767e-05, "loss": 7.4219, "loss/crossentropy": 1.8726444259285926, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.1758563483133912, "step": 22980 }, { "epoch": 0.7663333333333333, "grad_norm": 26.625, "grad_norm_var": 3.0171223958333333, "learning_rate": 8.156457772103326e-05, "loss": 7.3819, "loss/crossentropy": 1.9945810578763485, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.17547337915748357, "step": 22990 }, { "epoch": 0.7666666666666667, "grad_norm": 28.0, "grad_norm_var": 5.90390625, "learning_rate": 8.14503363531613e-05, "loss": 7.4977, "loss/crossentropy": 1.9299149721860887, "loss/hidden": 3.561328125, "loss/jsd": 0.0, "loss/logits": 0.19388887714594602, "step": 23000 }, { "epoch": 0.767, "grad_norm": 27.625, "grad_norm_var": 1.2747395833333333, "learning_rate": 8.133583393114797e-05, "loss": 7.5107, "loss/crossentropy": 2.1192862302064897, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.19927413761615753, "step": 23010 }, { "epoch": 0.7673333333333333, "grad_norm": 28.25, "grad_norm_var": 2.528580729166667, "learning_rate": 8.122107158508592e-05, "loss": 7.5196, "loss/crossentropy": 2.044304075837135, "loss/hidden": 3.53359375, "loss/jsd": 0.0, "loss/logits": 0.20039083026349544, "step": 23020 }, { "epoch": 0.7676666666666667, "grad_norm": 27.25, "grad_norm_var": 3.283333333333333, "learning_rate": 8.110605044763323e-05, "loss": 7.5047, "loss/crossentropy": 2.1309088692069054, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.19504800960421562, "step": 23030 }, { "epoch": 0.768, "grad_norm": 28.125, "grad_norm_var": 5.66640625, "learning_rate": 8.099077165400204e-05, "loss": 7.533, "loss/crossentropy": 1.9686566561460495, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.18667809749022127, "step": 23040 }, { "epoch": 0.7683333333333333, "grad_norm": 29.75, "grad_norm_var": 5.320768229166666, "learning_rate": 8.087523634194755e-05, "loss": 7.4886, "loss/crossentropy": 2.005880794674158, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.19481752207502723, "step": 23050 }, { "epoch": 0.7686666666666667, "grad_norm": 26.375, "grad_norm_var": 1.2410807291666666, "learning_rate": 8.075944565175659e-05, "loss": 7.5662, "loss/crossentropy": 2.192203278839588, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.21429934445768595, "step": 23060 }, { "epoch": 0.769, "grad_norm": 26.25, "grad_norm_var": 1.6624348958333333, "learning_rate": 8.064340072623657e-05, "loss": 7.3836, "loss/crossentropy": 2.021069821715355, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.21521367449313403, "step": 23070 }, { "epoch": 0.7693333333333333, "grad_norm": 28.625, "grad_norm_var": 39.1353515625, "learning_rate": 8.052710271070405e-05, "loss": 7.4727, "loss/crossentropy": 1.9759119272232055, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.18273938745260238, "step": 23080 }, { "epoch": 0.7696666666666667, "grad_norm": 26.125, "grad_norm_var": 113.128125, "learning_rate": 8.041055275297348e-05, "loss": 7.467, "loss/crossentropy": 1.9765710644423962, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.19071924965828657, "step": 23090 }, { "epoch": 0.77, "grad_norm": 25.25, "grad_norm_var": 111.61399739583334, "learning_rate": 8.029375200334588e-05, "loss": 7.3353, "loss/crossentropy": 2.1736841291189193, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.17423349283635617, "step": 23100 }, { "epoch": 0.7703333333333333, "grad_norm": 26.0, "grad_norm_var": 1.8122395833333333, "learning_rate": 8.017670161459752e-05, "loss": 7.3313, "loss/crossentropy": 2.2096520021557806, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.19355475530028343, "step": 23110 }, { "epoch": 0.7706666666666667, "grad_norm": 28.25, "grad_norm_var": 2.364518229166667, "learning_rate": 8.005940274196846e-05, "loss": 7.4944, "loss/crossentropy": 2.2522154793143274, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.19156052991747857, "step": 23120 }, { "epoch": 0.771, "grad_norm": 27.25, "grad_norm_var": 3.044205729166667, "learning_rate": 7.994185654315124e-05, "loss": 7.3322, "loss/crossentropy": 2.017530345916748, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.17398178149014712, "step": 23130 }, { "epoch": 0.7713333333333333, "grad_norm": 25.25, "grad_norm_var": 6.49140625, "learning_rate": 7.982406417827936e-05, "loss": 7.4188, "loss/crossentropy": 2.13538076877594, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.19466998036950828, "step": 23140 }, { "epoch": 0.7716666666666666, "grad_norm": 25.625, "grad_norm_var": 7.2697265625, "learning_rate": 7.970602680991594e-05, "loss": 7.4274, "loss/crossentropy": 1.9886194601655007, "loss/hidden": 3.508984375, "loss/jsd": 0.0, "loss/logits": 0.187652344442904, "step": 23150 }, { "epoch": 0.772, "grad_norm": 28.625, "grad_norm_var": 2.081184895833333, "learning_rate": 7.958774560304213e-05, "loss": 7.4564, "loss/crossentropy": 2.0018317684531213, "loss/hidden": 3.51796875, "loss/jsd": 0.0, "loss/logits": 0.18953299205750226, "step": 23160 }, { "epoch": 0.7723333333333333, "grad_norm": 29.875, "grad_norm_var": 3.2228515625, "learning_rate": 7.946922172504567e-05, "loss": 7.6272, "loss/crossentropy": 2.1628062181174754, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.20114805568009614, "step": 23170 }, { "epoch": 0.7726666666666666, "grad_norm": 29.25, "grad_norm_var": 2.3030598958333335, "learning_rate": 7.935045634570941e-05, "loss": 7.454, "loss/crossentropy": 1.9677003532648087, "loss/hidden": 3.47265625, "loss/jsd": 0.0, "loss/logits": 0.20842795697972177, "step": 23180 }, { "epoch": 0.773, "grad_norm": 27.0, "grad_norm_var": 2.3705729166666667, "learning_rate": 7.923145063719972e-05, "loss": 7.3973, "loss/crossentropy": 1.9138947121798993, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.18487755134701728, "step": 23190 }, { "epoch": 0.7733333333333333, "grad_norm": 27.5, "grad_norm_var": 1.2983723958333333, "learning_rate": 7.911220577405484e-05, "loss": 7.4438, "loss/crossentropy": 2.0332114972174167, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.19924609856680037, "step": 23200 }, { "epoch": 0.7736666666666666, "grad_norm": 24.375, "grad_norm_var": 7.162434895833333, "learning_rate": 7.89927229331735e-05, "loss": 7.4012, "loss/crossentropy": 2.2687218472361566, "loss/hidden": 3.284765625, "loss/jsd": 0.0, "loss/logits": 0.18496205024421214, "step": 23210 }, { "epoch": 0.774, "grad_norm": 26.625, "grad_norm_var": 2.1125, "learning_rate": 7.887300329380304e-05, "loss": 7.4174, "loss/crossentropy": 2.0132935985922815, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.18549591191112996, "step": 23220 }, { "epoch": 0.7743333333333333, "grad_norm": 26.125, "grad_norm_var": 2.008072916666667, "learning_rate": 7.8753048037528e-05, "loss": 7.3578, "loss/crossentropy": 2.038771292567253, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.1932824071496725, "step": 23230 }, { "epoch": 0.7746666666666666, "grad_norm": 32.5, "grad_norm_var": 4.643489583333333, "learning_rate": 7.863285834825832e-05, "loss": 7.399, "loss/crossentropy": 2.0289094880223275, "loss/hidden": 3.521484375, "loss/jsd": 0.0, "loss/logits": 0.19404235538095235, "step": 23240 }, { "epoch": 0.775, "grad_norm": 29.125, "grad_norm_var": 8.295572916666666, "learning_rate": 7.85124354122177e-05, "loss": 7.4128, "loss/crossentropy": 2.110806605219841, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.1978354908525944, "step": 23250 }, { "epoch": 0.7753333333333333, "grad_norm": 27.125, "grad_norm_var": 2.4707682291666666, "learning_rate": 7.839178041793193e-05, "loss": 7.4052, "loss/crossentropy": 2.0687429390847685, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.2014083441346884, "step": 23260 }, { "epoch": 0.7756666666666666, "grad_norm": 27.375, "grad_norm_var": 1.3455729166666666, "learning_rate": 7.827089455621707e-05, "loss": 7.4279, "loss/crossentropy": 2.0050750449299812, "loss/hidden": 3.269921875, "loss/jsd": 0.0, "loss/logits": 0.1765454810112715, "step": 23270 }, { "epoch": 0.776, "grad_norm": 28.0, "grad_norm_var": 1.1393229166666667, "learning_rate": 7.814977902016779e-05, "loss": 7.4339, "loss/crossentropy": 2.0278828397393225, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.1700163958594203, "step": 23280 }, { "epoch": 0.7763333333333333, "grad_norm": 26.5, "grad_norm_var": 1.82890625, "learning_rate": 7.802843500514553e-05, "loss": 7.4093, "loss/crossentropy": 1.9275280833244324, "loss/hidden": 3.5328125, "loss/jsd": 0.0, "loss/logits": 0.19342173589393497, "step": 23290 }, { "epoch": 0.7766666666666666, "grad_norm": 27.625, "grad_norm_var": 1.4684895833333333, "learning_rate": 7.790686370876671e-05, "loss": 7.582, "loss/crossentropy": 2.128317493200302, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.1935270931571722, "step": 23300 }, { "epoch": 0.777, "grad_norm": 26.5, "grad_norm_var": 1.24140625, "learning_rate": 7.778506633089096e-05, "loss": 7.3099, "loss/crossentropy": 2.1446583211421966, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.1842292295768857, "step": 23310 }, { "epoch": 0.7773333333333333, "grad_norm": 29.875, "grad_norm_var": 1.647261914673709e+18, "learning_rate": 7.766304407360924e-05, "loss": 7.5152, "loss/crossentropy": 2.117107591032982, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.18693095725029707, "step": 23320 }, { "epoch": 0.7776666666666666, "grad_norm": 27.25, "grad_norm_var": 1.647261914636275e+18, "learning_rate": 7.754079814123195e-05, "loss": 7.3871, "loss/crossentropy": 1.9594203799962997, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.18217467218637468, "step": 23330 }, { "epoch": 0.778, "grad_norm": 25.625, "grad_norm_var": 1.3916015625, "learning_rate": 7.741832974027709e-05, "loss": 7.3621, "loss/crossentropy": 1.9576537497341633, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.17676338767632843, "step": 23340 }, { "epoch": 0.7783333333333333, "grad_norm": 26.0, "grad_norm_var": 1.1270833333333334, "learning_rate": 7.729564007945835e-05, "loss": 7.345, "loss/crossentropy": 1.9789281010627746, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.1813190994784236, "step": 23350 }, { "epoch": 0.7786666666666666, "grad_norm": 25.75, "grad_norm_var": 3.4680531624457667e+18, "learning_rate": 7.717273036967312e-05, "loss": 7.3519, "loss/crossentropy": 2.2810946226119997, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.1951879994943738, "step": 23360 }, { "epoch": 0.779, "grad_norm": 28.375, "grad_norm_var": 1.7999348958333334, "learning_rate": 7.704960182399065e-05, "loss": 7.3035, "loss/crossentropy": 2.106377599388361, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.18016488589346408, "step": 23370 }, { "epoch": 0.7793333333333333, "grad_norm": 27.125, "grad_norm_var": 1.2702473958333333, "learning_rate": 7.692625565763996e-05, "loss": 7.3742, "loss/crossentropy": 1.963225745409727, "loss/hidden": 3.4859375, "loss/jsd": 0.0, "loss/logits": 0.201007841527462, "step": 23380 }, { "epoch": 0.7796666666666666, "grad_norm": 27.25, "grad_norm_var": 1.5384765625, "learning_rate": 7.680269308799791e-05, "loss": 7.279, "loss/crossentropy": 2.0159963831305503, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.187699238024652, "step": 23390 }, { "epoch": 0.78, "grad_norm": 25.625, "grad_norm_var": 0.9184895833333333, "learning_rate": 7.667891533457719e-05, "loss": 7.419, "loss/crossentropy": 2.0797833621501924, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.1819260410964489, "step": 23400 }, { "epoch": 0.7803333333333333, "grad_norm": 27.375, "grad_norm_var": 1.5462890625, "learning_rate": 7.655492361901425e-05, "loss": 7.4127, "loss/crossentropy": 1.9969025284051896, "loss/hidden": 3.566796875, "loss/jsd": 0.0, "loss/logits": 0.20141962189227341, "step": 23410 }, { "epoch": 0.7806666666666666, "grad_norm": 24.625, "grad_norm_var": 2.5885416666666665, "learning_rate": 7.643071916505726e-05, "loss": 7.2426, "loss/crossentropy": 2.1777842193841934, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.19313708059489726, "step": 23420 }, { "epoch": 0.781, "grad_norm": 28.875, "grad_norm_var": 2.540311642526537e+18, "learning_rate": 7.630630319855406e-05, "loss": 7.4692, "loss/crossentropy": 1.98684598878026, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.17943298360332846, "step": 23430 }, { "epoch": 0.7813333333333333, "grad_norm": 25.875, "grad_norm_var": 1.8872395833333333, "learning_rate": 7.618167694743998e-05, "loss": 7.3491, "loss/crossentropy": 1.9924467638134957, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.19452919848263264, "step": 23440 }, { "epoch": 0.7816666666666666, "grad_norm": 26.375, "grad_norm_var": 0.8955729166666667, "learning_rate": 7.60568416417258e-05, "loss": 7.4007, "loss/crossentropy": 2.1785530865192415, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.1901299361139536, "step": 23450 }, { "epoch": 0.782, "grad_norm": 26.375, "grad_norm_var": 1.4455729166666667, "learning_rate": 7.593179851348563e-05, "loss": 7.3419, "loss/crossentropy": 2.048526135832071, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.1861328760161996, "step": 23460 }, { "epoch": 0.7823333333333333, "grad_norm": 26.75, "grad_norm_var": 3.459375, "learning_rate": 7.580654879684464e-05, "loss": 7.4101, "loss/crossentropy": 1.9338685415685177, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.19165972275659443, "step": 23470 }, { "epoch": 0.7826666666666666, "grad_norm": 27.125, "grad_norm_var": 4.140625, "learning_rate": 7.568109372796697e-05, "loss": 7.3926, "loss/crossentropy": 2.115327002480626, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.187891862122342, "step": 23480 }, { "epoch": 0.783, "grad_norm": 28.25, "grad_norm_var": 2.3053504071336287e+18, "learning_rate": 7.555543454504348e-05, "loss": 7.3786, "loss/crossentropy": 2.1964672222733497, "loss/hidden": 3.740234375, "loss/jsd": 0.0, "loss/logits": 0.1924813449382782, "step": 23490 }, { "epoch": 0.7833333333333333, "grad_norm": 27.375, "grad_norm_var": 1.5384765625, "learning_rate": 7.542957248827961e-05, "loss": 7.3937, "loss/crossentropy": 2.072338564693928, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.20393363032490014, "step": 23500 }, { "epoch": 0.7836666666666666, "grad_norm": 26.0, "grad_norm_var": 1.6676432291666667, "learning_rate": 7.530350879988304e-05, "loss": 7.2573, "loss/crossentropy": 2.1300232261419296, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.19609809312969445, "step": 23510 }, { "epoch": 0.784, "grad_norm": 26.875, "grad_norm_var": 1.9218098958333334, "learning_rate": 7.517724472405146e-05, "loss": 7.3829, "loss/crossentropy": 2.1858866199851037, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.18005712442100047, "step": 23520 }, { "epoch": 0.7843333333333333, "grad_norm": 27.5, "grad_norm_var": 2.6322265625, "learning_rate": 7.505078150696035e-05, "loss": 7.259, "loss/crossentropy": 2.11143764257431, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.17099386416375636, "step": 23530 }, { "epoch": 0.7846666666666666, "grad_norm": 27.875, "grad_norm_var": 19.049739583333334, "learning_rate": 7.492412039675058e-05, "loss": 7.3895, "loss/crossentropy": 2.0830544363707304, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.184346787026152, "step": 23540 }, { "epoch": 0.785, "grad_norm": 28.0, "grad_norm_var": 19.7978515625, "learning_rate": 7.479726264351618e-05, "loss": 7.3045, "loss/crossentropy": 2.0409920796751977, "loss/hidden": 3.45234375, "loss/jsd": 0.0, "loss/logits": 0.190955501049757, "step": 23550 }, { "epoch": 0.7853333333333333, "grad_norm": 27.0, "grad_norm_var": 1.64140625, "learning_rate": 7.4670209499292e-05, "loss": 7.3454, "loss/crossentropy": 1.9914120055735112, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.1819184892810881, "step": 23560 }, { "epoch": 0.7856666666666666, "grad_norm": 28.125, "grad_norm_var": 1.2546223958333333, "learning_rate": 7.454296221804121e-05, "loss": 7.2387, "loss/crossentropy": 2.0846676357090472, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.18154996410012245, "step": 23570 }, { "epoch": 0.786, "grad_norm": 26.625, "grad_norm_var": 2.4833333333333334, "learning_rate": 7.441552205564317e-05, "loss": 7.3623, "loss/crossentropy": 2.1405218988656998, "loss/hidden": 3.52890625, "loss/jsd": 0.0, "loss/logits": 0.21327570956200362, "step": 23580 }, { "epoch": 0.7863333333333333, "grad_norm": 27.0, "grad_norm_var": 1.6942057291666666, "learning_rate": 7.428789026988078e-05, "loss": 7.3275, "loss/crossentropy": 2.189077128469944, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.183055036701262, "step": 23590 }, { "epoch": 0.7866666666666666, "grad_norm": 26.5, "grad_norm_var": 1.03125, "learning_rate": 7.416006812042828e-05, "loss": 7.2771, "loss/crossentropy": 2.1273641705513002, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.1837273458018899, "step": 23600 }, { "epoch": 0.787, "grad_norm": 25.25, "grad_norm_var": 1.1322916666666667, "learning_rate": 7.403205686883864e-05, "loss": 7.1614, "loss/crossentropy": 2.0558855824172495, "loss/hidden": 3.311328125, "loss/jsd": 0.0, "loss/logits": 0.17631138348951936, "step": 23610 }, { "epoch": 0.7873333333333333, "grad_norm": 26.75, "grad_norm_var": 1.50390625, "learning_rate": 7.39038577785313e-05, "loss": 7.3046, "loss/crossentropy": 1.9993584722280502, "loss/hidden": 3.648828125, "loss/jsd": 0.0, "loss/logits": 0.21630566865205764, "step": 23620 }, { "epoch": 0.7876666666666666, "grad_norm": 26.375, "grad_norm_var": 1.0979166666666667, "learning_rate": 7.377547211477946e-05, "loss": 7.4785, "loss/crossentropy": 2.142679235339165, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.18648772593587637, "step": 23630 }, { "epoch": 0.788, "grad_norm": 28.5, "grad_norm_var": 2.067122395833333, "learning_rate": 7.36469011446978e-05, "loss": 7.4104, "loss/crossentropy": 2.191077730059624, "loss/hidden": 3.295703125, "loss/jsd": 0.0, "loss/logits": 0.17883399985730647, "step": 23640 }, { "epoch": 0.7883333333333333, "grad_norm": 26.125, "grad_norm_var": 1.7035807291666667, "learning_rate": 7.35181461372299e-05, "loss": 7.3389, "loss/crossentropy": 2.063406619429588, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.20122182425111532, "step": 23650 }, { "epoch": 0.7886666666666666, "grad_norm": 25.5, "grad_norm_var": 18.208072916666666, "learning_rate": 7.338920836313572e-05, "loss": 7.2588, "loss/crossentropy": 2.062808007001877, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.19022527448832988, "step": 23660 }, { "epoch": 0.789, "grad_norm": 27.375, "grad_norm_var": 1.4186848958333333, "learning_rate": 7.326008909497901e-05, "loss": 7.2722, "loss/crossentropy": 2.0678359627723695, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.19251144528388978, "step": 23670 }, { "epoch": 0.7893333333333333, "grad_norm": 27.5, "grad_norm_var": 2.3308942610148623e+18, "learning_rate": 7.313078960711483e-05, "loss": 7.3772, "loss/crossentropy": 2.3504543006420135, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.1857314633205533, "step": 23680 }, { "epoch": 0.7896666666666666, "grad_norm": 26.875, "grad_norm_var": 2.330894261447435e+18, "learning_rate": 7.300131117567692e-05, "loss": 7.2372, "loss/crossentropy": 2.0384044095873834, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.1833371376618743, "step": 23690 }, { "epoch": 0.79, "grad_norm": 26.375, "grad_norm_var": 2.9447916666666667, "learning_rate": 7.287165507856512e-05, "loss": 7.3968, "loss/crossentropy": 2.0169043824076653, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.17834869027137756, "step": 23700 }, { "epoch": 0.7903333333333333, "grad_norm": 25.875, "grad_norm_var": 2.8291015625, "learning_rate": 7.27418225954328e-05, "loss": 7.209, "loss/crossentropy": 2.0671774983406066, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.1780660256743431, "step": 23710 }, { "epoch": 0.7906666666666666, "grad_norm": 24.875, "grad_norm_var": 0.7567057291666667, "learning_rate": 7.261181500767413e-05, "loss": 7.2326, "loss/crossentropy": 1.840275975316763, "loss/hidden": 3.2921875, "loss/jsd": 0.0, "loss/logits": 0.1688528038561344, "step": 23720 }, { "epoch": 0.791, "grad_norm": 28.25, "grad_norm_var": 1.8389973958333334, "learning_rate": 7.248163359841148e-05, "loss": 7.2749, "loss/crossentropy": 1.863051414489746, "loss/hidden": 3.254296875, "loss/jsd": 0.0, "loss/logits": 0.15819047279655934, "step": 23730 }, { "epoch": 0.7913333333333333, "grad_norm": 25.875, "grad_norm_var": 1.14140625, "learning_rate": 7.235127965248285e-05, "loss": 7.3715, "loss/crossentropy": 2.033174179494381, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.18322906009852885, "step": 23740 }, { "epoch": 0.7916666666666666, "grad_norm": 26.375, "grad_norm_var": 0.790625, "learning_rate": 7.222075445642904e-05, "loss": 7.3671, "loss/crossentropy": 2.1419862687587736, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.1809433963149786, "step": 23750 }, { "epoch": 0.792, "grad_norm": 25.0, "grad_norm_var": 0.6431640625, "learning_rate": 7.209005929848107e-05, "loss": 7.3336, "loss/crossentropy": 2.0215205609798432, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.1783350331708789, "step": 23760 }, { "epoch": 0.7923333333333333, "grad_norm": 26.25, "grad_norm_var": 1.0122395833333333, "learning_rate": 7.195919546854732e-05, "loss": 7.2975, "loss/crossentropy": 2.16986320912838, "loss/hidden": 3.305078125, "loss/jsd": 0.0, "loss/logits": 0.17684876844286918, "step": 23770 }, { "epoch": 0.7926666666666666, "grad_norm": 26.125, "grad_norm_var": 6.220572916666667, "learning_rate": 7.182816425820101e-05, "loss": 7.286, "loss/crossentropy": 2.0275315180420876, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.1788380341604352, "step": 23780 }, { "epoch": 0.793, "grad_norm": 27.875, "grad_norm_var": 1.3247395833333333, "learning_rate": 7.16969669606673e-05, "loss": 7.4463, "loss/crossentropy": 2.0448300421237944, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.1782231353223324, "step": 23790 }, { "epoch": 0.7933333333333333, "grad_norm": 27.5, "grad_norm_var": 0.9686848958333333, "learning_rate": 7.156560487081053e-05, "loss": 7.2929, "loss/crossentropy": 2.0439211681485174, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.18058539805933832, "step": 23800 }, { "epoch": 0.7936666666666666, "grad_norm": 25.875, "grad_norm_var": 1.1385416666666666, "learning_rate": 7.143407928512146e-05, "loss": 7.1752, "loss/crossentropy": 2.0834071934223175, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.1776286605745554, "step": 23810 }, { "epoch": 0.794, "grad_norm": 29.0, "grad_norm_var": 2.6458333333333335, "learning_rate": 7.130239150170455e-05, "loss": 7.3533, "loss/crossentropy": 2.069356369972229, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.18835821226239205, "step": 23820 }, { "epoch": 0.7943333333333333, "grad_norm": 26.375, "grad_norm_var": 2.9332682291666665, "learning_rate": 7.117054282026508e-05, "loss": 7.3786, "loss/crossentropy": 2.15442588403821, "loss/hidden": 3.30234375, "loss/jsd": 0.0, "loss/logits": 0.17487594103440643, "step": 23830 }, { "epoch": 0.7946666666666666, "grad_norm": 26.0, "grad_norm_var": 1.1457682291666667, "learning_rate": 7.103853454209628e-05, "loss": 7.2191, "loss/crossentropy": 2.086764992028475, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.17575874989852308, "step": 23840 }, { "epoch": 0.795, "grad_norm": 25.625, "grad_norm_var": 1.2385416666666667, "learning_rate": 7.090636797006658e-05, "loss": 7.3092, "loss/crossentropy": 2.035817837715149, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.1798616824671626, "step": 23850 }, { "epoch": 0.7953333333333333, "grad_norm": 23.625, "grad_norm_var": 2.1634765625, "learning_rate": 7.077404440860666e-05, "loss": 7.2022, "loss/crossentropy": 2.0340130746364595, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.18757070507854223, "step": 23860 }, { "epoch": 0.7956666666666666, "grad_norm": 24.5, "grad_norm_var": 2.4514973958333335, "learning_rate": 7.064156516369666e-05, "loss": 7.3237, "loss/crossentropy": 2.046588622033596, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.18935591662302614, "step": 23870 }, { "epoch": 0.796, "grad_norm": 27.0, "grad_norm_var": 1.1660807291666666, "learning_rate": 7.050893154285327e-05, "loss": 7.331, "loss/crossentropy": 2.1309110179543493, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.17215617671608924, "step": 23880 }, { "epoch": 0.7963333333333333, "grad_norm": 26.5, "grad_norm_var": 0.9455729166666667, "learning_rate": 7.037614485511676e-05, "loss": 7.251, "loss/crossentropy": 2.123047386109829, "loss/hidden": 3.3, "loss/jsd": 0.0, "loss/logits": 0.1786116823554039, "step": 23890 }, { "epoch": 0.7966666666666666, "grad_norm": 27.875, "grad_norm_var": 1.2010416666666666, "learning_rate": 7.024320641103812e-05, "loss": 7.2897, "loss/crossentropy": 2.1215869694948197, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.1894577570259571, "step": 23900 }, { "epoch": 0.797, "grad_norm": 25.375, "grad_norm_var": 1.1927083333333333, "learning_rate": 7.011011752266612e-05, "loss": 7.112, "loss/crossentropy": 2.1410455122590064, "loss/hidden": 3.294921875, "loss/jsd": 0.0, "loss/logits": 0.18200043272227048, "step": 23910 }, { "epoch": 0.7973333333333333, "grad_norm": 23.75, "grad_norm_var": 0.8145833333333333, "learning_rate": 6.99768795035344e-05, "loss": 7.1817, "loss/crossentropy": 1.9395751819014548, "loss/hidden": 3.3234375, "loss/jsd": 0.0, "loss/logits": 0.17033605417236686, "step": 23920 }, { "epoch": 0.7976666666666666, "grad_norm": 23.25, "grad_norm_var": 2.395768229166667, "learning_rate": 6.984349366864839e-05, "loss": 7.1585, "loss/crossentropy": 1.8782190293073655, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.1784798389300704, "step": 23930 }, { "epoch": 0.798, "grad_norm": 25.125, "grad_norm_var": 2.3053504077789222e+18, "learning_rate": 6.97099613344724e-05, "loss": 7.308, "loss/crossentropy": 2.1117516651749613, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.1816192871890962, "step": 23940 }, { "epoch": 0.7983333333333333, "grad_norm": 24.625, "grad_norm_var": 2.305350407854839e+18, "learning_rate": 6.957628381891673e-05, "loss": 7.3415, "loss/crossentropy": 2.0898191846907137, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.17528143543750047, "step": 23950 }, { "epoch": 0.7986666666666666, "grad_norm": 23.75, "grad_norm_var": 2.019791666666667, "learning_rate": 6.944246244132443e-05, "loss": 7.1587, "loss/crossentropy": 2.0945689618587493, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.1783385954797268, "step": 23960 }, { "epoch": 0.799, "grad_norm": 25.625, "grad_norm_var": 2.0619140625, "learning_rate": 6.930849852245848e-05, "loss": 7.3629, "loss/crossentropy": 2.0345364600419997, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.18947122450917958, "step": 23970 }, { "epoch": 0.7993333333333333, "grad_norm": 30.375, "grad_norm_var": 2.7583333333333333, "learning_rate": 6.917439338448872e-05, "loss": 7.2008, "loss/crossentropy": 2.0551148861646653, "loss/hidden": 3.27734375, "loss/jsd": 0.0, "loss/logits": 0.17965832073241472, "step": 23980 }, { "epoch": 0.7996666666666666, "grad_norm": 26.625, "grad_norm_var": 21.099739583333335, "learning_rate": 6.904014835097867e-05, "loss": 7.2763, "loss/crossentropy": 2.044001418352127, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.18677353039383887, "step": 23990 }, { "epoch": 0.8, "grad_norm": 25.375, "grad_norm_var": 0.9718098958333333, "learning_rate": 6.890576474687263e-05, "loss": 7.211, "loss/crossentropy": 2.2494696259498594, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.18474820386618376, "step": 24000 }, { "epoch": 0.8003333333333333, "grad_norm": 25.25, "grad_norm_var": 1.2018229166666667, "learning_rate": 6.877124389848254e-05, "loss": 7.2794, "loss/crossentropy": 2.300140696763992, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.19267369732260703, "step": 24010 }, { "epoch": 0.8006666666666666, "grad_norm": 22.75, "grad_norm_var": 2.919791666666667, "learning_rate": 6.863658713347484e-05, "loss": 7.2944, "loss/crossentropy": 2.0973946295678614, "loss/hidden": 3.3421875, "loss/jsd": 0.0, "loss/logits": 0.18365285200998188, "step": 24020 }, { "epoch": 0.801, "grad_norm": 25.5, "grad_norm_var": 2.533072916666667, "learning_rate": 6.850179578085744e-05, "loss": 7.226, "loss/crossentropy": 1.9838631860911846, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.17135070022195578, "step": 24030 }, { "epoch": 0.8013333333333333, "grad_norm": 25.875, "grad_norm_var": 0.7624348958333333, "learning_rate": 6.836687117096657e-05, "loss": 7.1629, "loss/crossentropy": 2.1537132054567336, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.17650238294154405, "step": 24040 }, { "epoch": 0.8016666666666666, "grad_norm": 25.75, "grad_norm_var": 0.9291666666666667, "learning_rate": 6.823181463545368e-05, "loss": 7.2053, "loss/crossentropy": 2.0979036509990694, "loss/hidden": 3.3296875, "loss/jsd": 0.0, "loss/logits": 0.17083593588322402, "step": 24050 }, { "epoch": 0.802, "grad_norm": 26.75, "grad_norm_var": 1.4358723958333333, "learning_rate": 6.809662750727222e-05, "loss": 7.3477, "loss/crossentropy": 2.0106400445103647, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.17500849366188048, "step": 24060 }, { "epoch": 0.8023333333333333, "grad_norm": 27.75, "grad_norm_var": 3.159375, "learning_rate": 6.796131112066461e-05, "loss": 7.1892, "loss/crossentropy": 2.011872109770775, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.16934235505759715, "step": 24070 }, { "epoch": 0.8026666666666666, "grad_norm": 25.25, "grad_norm_var": 3.3551432291666665, "learning_rate": 6.782586681114894e-05, "loss": 7.2536, "loss/crossentropy": 1.9991176337003709, "loss/hidden": 3.26875, "loss/jsd": 0.0, "loss/logits": 0.173070646263659, "step": 24080 }, { "epoch": 0.803, "grad_norm": 25.125, "grad_norm_var": 0.7145833333333333, "learning_rate": 6.769029591550581e-05, "loss": 7.2212, "loss/crossentropy": 2.190063714981079, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.19409491550177335, "step": 24090 }, { "epoch": 0.8033333333333333, "grad_norm": 27.25, "grad_norm_var": 1.0520833333333333, "learning_rate": 6.755459977176533e-05, "loss": 7.226, "loss/crossentropy": 2.0515845850110055, "loss/hidden": 3.2640625, "loss/jsd": 0.0, "loss/logits": 0.17549332650378346, "step": 24100 }, { "epoch": 0.8036666666666666, "grad_norm": 23.875, "grad_norm_var": 1.3291666666666666, "learning_rate": 6.741877971919357e-05, "loss": 7.3119, "loss/crossentropy": 2.1062870398163795, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.1822298699989915, "step": 24110 }, { "epoch": 0.804, "grad_norm": 25.125, "grad_norm_var": 2.460724593429193e+18, "learning_rate": 6.728283709827963e-05, "loss": 7.2493, "loss/crossentropy": 2.1633963331580164, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.19363478161394596, "step": 24120 }, { "epoch": 0.8043333333333333, "grad_norm": 24.375, "grad_norm_var": 22.76640625, "learning_rate": 6.714677325072235e-05, "loss": 7.2432, "loss/crossentropy": 2.1364134401082993, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.1911363998427987, "step": 24130 }, { "epoch": 0.8046666666666666, "grad_norm": 27.125, "grad_norm_var": 1.0369140625, "learning_rate": 6.701058951941691e-05, "loss": 7.2875, "loss/crossentropy": 2.144788406789303, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.19746586456894874, "step": 24140 }, { "epoch": 0.805, "grad_norm": 28.5, "grad_norm_var": 4.764322916666667, "learning_rate": 6.687428724844179e-05, "loss": 7.143, "loss/crossentropy": 1.9672424003481865, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.1883099837228656, "step": 24150 }, { "epoch": 0.8053333333333333, "grad_norm": 26.125, "grad_norm_var": 5.80390625, "learning_rate": 6.673786778304537e-05, "loss": 7.1432, "loss/crossentropy": 1.940127792209387, "loss/hidden": 3.219140625, "loss/jsd": 0.0, "loss/logits": 0.15666389614343643, "step": 24160 }, { "epoch": 0.8056666666666666, "grad_norm": 25.0, "grad_norm_var": 373.32890625, "learning_rate": 6.66013324696327e-05, "loss": 7.2992, "loss/crossentropy": 2.203534686565399, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.17815530616790057, "step": 24170 }, { "epoch": 0.806, "grad_norm": 25.25, "grad_norm_var": 0.9864583333333333, "learning_rate": 6.646468265575219e-05, "loss": 7.2117, "loss/crossentropy": 2.0774502992630004, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.17327735256403684, "step": 24180 }, { "epoch": 0.8063333333333333, "grad_norm": 26.875, "grad_norm_var": 0.8587890625, "learning_rate": 6.632791969008237e-05, "loss": 7.2638, "loss/crossentropy": 2.0702610716223715, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.18914096765220165, "step": 24190 }, { "epoch": 0.8066666666666666, "grad_norm": 26.0, "grad_norm_var": 2.709830729166667, "learning_rate": 6.619104492241848e-05, "loss": 7.2388, "loss/crossentropy": 2.02001933157444, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.200399025157094, "step": 24200 }, { "epoch": 0.807, "grad_norm": 24.75, "grad_norm_var": 1.3978515625, "learning_rate": 6.60540597036592e-05, "loss": 7.1491, "loss/crossentropy": 2.1803820818662643, "loss/hidden": 3.303125, "loss/jsd": 0.0, "loss/logits": 0.17881411854177715, "step": 24210 }, { "epoch": 0.8073333333333333, "grad_norm": 23.125, "grad_norm_var": 0.85390625, "learning_rate": 6.591696538579334e-05, "loss": 7.1335, "loss/crossentropy": 2.097467389702797, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.17912068534642459, "step": 24220 }, { "epoch": 0.8076666666666666, "grad_norm": 23.75, "grad_norm_var": 3.476822916666667, "learning_rate": 6.577976332188649e-05, "loss": 7.1903, "loss/crossentropy": 2.0106175623834135, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.19294406063854694, "step": 24230 }, { "epoch": 0.808, "grad_norm": 25.875, "grad_norm_var": 1.7233723958333333, "learning_rate": 6.564245486606762e-05, "loss": 7.123, "loss/crossentropy": 2.056936872005463, "loss/hidden": 3.296484375, "loss/jsd": 0.0, "loss/logits": 0.16803365563973785, "step": 24240 }, { "epoch": 0.8083333333333333, "grad_norm": 22.875, "grad_norm_var": 1.75, "learning_rate": 6.550504137351576e-05, "loss": 7.0223, "loss/crossentropy": 2.026827494055033, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.18043983895331622, "step": 24250 }, { "epoch": 0.8086666666666666, "grad_norm": 26.125, "grad_norm_var": 52.36451822916667, "learning_rate": 6.536752420044659e-05, "loss": 7.0505, "loss/crossentropy": 2.047990356385708, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.16095355469733477, "step": 24260 }, { "epoch": 0.809, "grad_norm": 24.75, "grad_norm_var": 51.09791666666667, "learning_rate": 6.522990470409909e-05, "loss": 7.1775, "loss/crossentropy": 2.093078485131264, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.17120972126722336, "step": 24270 }, { "epoch": 0.8093333333333333, "grad_norm": 23.875, "grad_norm_var": 1.6280598958333334, "learning_rate": 6.509218424272216e-05, "loss": 7.1735, "loss/crossentropy": 2.125036987662315, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.19020479824393988, "step": 24280 }, { "epoch": 0.8096666666666666, "grad_norm": 23.375, "grad_norm_var": 63.603125, "learning_rate": 6.495436417556113e-05, "loss": 7.1969, "loss/crossentropy": 2.22238949239254, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.19780659209936857, "step": 24290 }, { "epoch": 0.81, "grad_norm": 24.25, "grad_norm_var": 11.257291666666667, "learning_rate": 6.481644586284442e-05, "loss": 7.153, "loss/crossentropy": 2.0770174629986284, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.18271327950060368, "step": 24300 }, { "epoch": 0.8103333333333333, "grad_norm": 24.625, "grad_norm_var": 0.8643229166666667, "learning_rate": 6.46784306657701e-05, "loss": 7.1539, "loss/crossentropy": 1.8965822540223598, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.17321830820292233, "step": 24310 }, { "epoch": 0.8106666666666666, "grad_norm": 25.25, "grad_norm_var": 1.559375, "learning_rate": 6.454031994649247e-05, "loss": 7.1674, "loss/crossentropy": 2.2343272864818573, "loss/hidden": 3.285546875, "loss/jsd": 0.0, "loss/logits": 0.17637284398078917, "step": 24320 }, { "epoch": 0.811, "grad_norm": 196.0, "grad_norm_var": 1813.99375, "learning_rate": 6.440211506810852e-05, "loss": 7.2401, "loss/crossentropy": 2.118020176887512, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.18733559399843216, "step": 24330 }, { "epoch": 0.8113333333333334, "grad_norm": 24.375, "grad_norm_var": 1817.9728515625, "learning_rate": 6.426381739464466e-05, "loss": 7.1611, "loss/crossentropy": 2.0687691517174245, "loss/hidden": 3.313671875, "loss/jsd": 0.0, "loss/logits": 0.18494962928816677, "step": 24340 }, { "epoch": 0.8116666666666666, "grad_norm": 24.25, "grad_norm_var": 2.7593098958333333, "learning_rate": 6.412542829104307e-05, "loss": 7.1438, "loss/crossentropy": 2.1410858571529388, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.1797313429415226, "step": 24350 }, { "epoch": 0.812, "grad_norm": 23.875, "grad_norm_var": 1.2186848958333334, "learning_rate": 6.398694912314831e-05, "loss": 7.1624, "loss/crossentropy": 2.146064803004265, "loss/hidden": 3.25234375, "loss/jsd": 0.0, "loss/logits": 0.17778887879103422, "step": 24360 }, { "epoch": 0.8123333333333334, "grad_norm": 24.25, "grad_norm_var": 0.9973307291666667, "learning_rate": 6.38483812576939e-05, "loss": 7.0275, "loss/crossentropy": 2.1565047204494476, "loss/hidden": 3.282421875, "loss/jsd": 0.0, "loss/logits": 0.17124725691974163, "step": 24370 }, { "epoch": 0.8126666666666666, "grad_norm": 23.0, "grad_norm_var": 0.5893229166666667, "learning_rate": 6.370972606228872e-05, "loss": 7.1234, "loss/crossentropy": 2.0467451363801956, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.1872189924120903, "step": 24380 }, { "epoch": 0.813, "grad_norm": 24.75, "grad_norm_var": 0.5145833333333333, "learning_rate": 6.357098490540355e-05, "loss": 7.1545, "loss/crossentropy": 2.3109287858009337, "loss/hidden": 3.271875, "loss/jsd": 0.0, "loss/logits": 0.1918891828507185, "step": 24390 }, { "epoch": 0.8133333333333334, "grad_norm": 23.375, "grad_norm_var": 0.9212890625, "learning_rate": 6.343215915635762e-05, "loss": 7.1035, "loss/crossentropy": 1.956181785464287, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.1630766457878053, "step": 24400 }, { "epoch": 0.8136666666666666, "grad_norm": 25.0, "grad_norm_var": 0.709375, "learning_rate": 6.329325018530501e-05, "loss": 7.0542, "loss/crossentropy": 1.9934518307447433, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.16185767110437155, "step": 24410 }, { "epoch": 0.814, "grad_norm": 23.125, "grad_norm_var": 5.9931640625, "learning_rate": 6.315425936322118e-05, "loss": 7.0992, "loss/crossentropy": 2.089629125595093, "loss/hidden": 3.338671875, "loss/jsd": 0.0, "loss/logits": 0.1817337304353714, "step": 24420 }, { "epoch": 0.8143333333333334, "grad_norm": 24.5, "grad_norm_var": 5.65, "learning_rate": 6.301518806188946e-05, "loss": 7.0823, "loss/crossentropy": 2.0498588502407076, "loss/hidden": 3.274609375, "loss/jsd": 0.0, "loss/logits": 0.17004711236804723, "step": 24430 }, { "epoch": 0.8146666666666667, "grad_norm": 27.0, "grad_norm_var": 1.5229166666666667, "learning_rate": 6.287603765388743e-05, "loss": 7.2639, "loss/crossentropy": 2.05265491604805, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.16803640704602002, "step": 24440 }, { "epoch": 0.815, "grad_norm": 26.125, "grad_norm_var": 2.348372395833333, "learning_rate": 6.273680951257342e-05, "loss": 7.1844, "loss/crossentropy": 2.1695328533649443, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.18274456169456244, "step": 24450 }, { "epoch": 0.8153333333333334, "grad_norm": 26.625, "grad_norm_var": 1.5375, "learning_rate": 6.259750501207302e-05, "loss": 7.18, "loss/crossentropy": 2.132620003819466, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.17444018907845021, "step": 24460 }, { "epoch": 0.8156666666666667, "grad_norm": 23.875, "grad_norm_var": 0.8046223958333333, "learning_rate": 6.245812552726538e-05, "loss": 7.1452, "loss/crossentropy": 2.058624839782715, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.1701804917305708, "step": 24470 }, { "epoch": 0.816, "grad_norm": 25.5, "grad_norm_var": 1.4427083333333333, "learning_rate": 6.231867243376977e-05, "loss": 7.0437, "loss/crossentropy": 2.1087178610265256, "loss/hidden": 3.196875, "loss/jsd": 0.0, "loss/logits": 0.16274321246892215, "step": 24480 }, { "epoch": 0.8163333333333334, "grad_norm": 25.125, "grad_norm_var": 1.4010416666666667, "learning_rate": 6.217914710793189e-05, "loss": 6.9728, "loss/crossentropy": 1.9185438066720963, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.1684217657893896, "step": 24490 }, { "epoch": 0.8166666666666667, "grad_norm": 24.5, "grad_norm_var": 0.509375, "learning_rate": 6.203955092681039e-05, "loss": 7.0605, "loss/crossentropy": 2.115895939618349, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.18307006321847438, "step": 24500 }, { "epoch": 0.817, "grad_norm": 23.5, "grad_norm_var": 1.1372395833333333, "learning_rate": 6.189988526816323e-05, "loss": 7.0337, "loss/crossentropy": 2.096031680703163, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.1889460150152445, "step": 24510 }, { "epoch": 0.8173333333333334, "grad_norm": 24.0, "grad_norm_var": 1.6379557291666667, "learning_rate": 6.176015151043407e-05, "loss": 7.2482, "loss/crossentropy": 2.0701269775629045, "loss/hidden": 3.2984375, "loss/jsd": 0.0, "loss/logits": 0.1742158493027091, "step": 24520 }, { "epoch": 0.8176666666666667, "grad_norm": 24.25, "grad_norm_var": 2.842967612297891e+18, "learning_rate": 6.16203510327387e-05, "loss": 7.0644, "loss/crossentropy": 2.0755894117057325, "loss/hidden": 3.527734375, "loss/jsd": 0.0, "loss/logits": 0.16671819221228362, "step": 24530 }, { "epoch": 0.818, "grad_norm": 25.125, "grad_norm_var": 3.0311848958333334, "learning_rate": 6.148048521485134e-05, "loss": 7.0163, "loss/crossentropy": 2.0209048211574556, "loss/hidden": 3.2140625, "loss/jsd": 0.0, "loss/logits": 0.15955890230834485, "step": 24540 }, { "epoch": 0.8183333333333334, "grad_norm": 24.125, "grad_norm_var": 2.5462890625, "learning_rate": 6.134055543719121e-05, "loss": 7.0369, "loss/crossentropy": 1.9814843587577342, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.16640124581754207, "step": 24550 }, { "epoch": 0.8186666666666667, "grad_norm": 23.5, "grad_norm_var": 0.7041015625, "learning_rate": 6.120056308080872e-05, "loss": 6.9686, "loss/crossentropy": 2.104407861828804, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.18350574728101493, "step": 24560 }, { "epoch": 0.819, "grad_norm": 24.0, "grad_norm_var": 1.1535807291666667, "learning_rate": 6.106050952737186e-05, "loss": 6.955, "loss/crossentropy": 2.0084666229784487, "loss/hidden": 3.27734375, "loss/jsd": 0.0, "loss/logits": 0.1645617727190256, "step": 24570 }, { "epoch": 0.8193333333333334, "grad_norm": 25.375, "grad_norm_var": 1.9801432291666667, "learning_rate": 6.0920396159152716e-05, "loss": 7.0885, "loss/crossentropy": 2.1339985907077788, "loss/hidden": 3.247265625, "loss/jsd": 0.0, "loss/logits": 0.17009613076224922, "step": 24580 }, { "epoch": 0.8196666666666667, "grad_norm": 24.5, "grad_norm_var": 2.2291666666666665, "learning_rate": 6.078022435901364e-05, "loss": 7.0881, "loss/crossentropy": 2.0973347425460815, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.18192722033709288, "step": 24590 }, { "epoch": 0.82, "grad_norm": 24.125, "grad_norm_var": 1.88125, "learning_rate": 6.06399955103937e-05, "loss": 7.169, "loss/crossentropy": 1.9335032500326634, "loss/hidden": 3.326171875, "loss/jsd": 0.0, "loss/logits": 0.17163177412003278, "step": 24600 }, { "epoch": 0.8203333333333334, "grad_norm": 22.375, "grad_norm_var": 1.2608723958333334, "learning_rate": 6.049971099729502e-05, "loss": 7.1524, "loss/crossentropy": 2.197794906795025, "loss/hidden": 3.253515625, "loss/jsd": 0.0, "loss/logits": 0.18091356940567493, "step": 24610 }, { "epoch": 0.8206666666666667, "grad_norm": 25.0, "grad_norm_var": 3.07377709315215e+18, "learning_rate": 6.035937220426915e-05, "loss": 7.0232, "loss/crossentropy": 2.0041457399725915, "loss/hidden": 3.27578125, "loss/jsd": 0.0, "loss/logits": 0.1686849119141698, "step": 24620 }, { "epoch": 0.821, "grad_norm": 23.25, "grad_norm_var": 3.073777092874557e+18, "learning_rate": 6.0218980516403265e-05, "loss": 7.1444, "loss/crossentropy": 1.9922945663332938, "loss/hidden": 3.185546875, "loss/jsd": 0.0, "loss/logits": 0.15499509871006012, "step": 24630 }, { "epoch": 0.8213333333333334, "grad_norm": 25.0, "grad_norm_var": 0.7931640625, "learning_rate": 6.007853731930667e-05, "loss": 7.0009, "loss/crossentropy": 2.0803733453154565, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.2030915966257453, "step": 24640 }, { "epoch": 0.8216666666666667, "grad_norm": 24.375, "grad_norm_var": 1.1309895833333334, "learning_rate": 5.993804399909704e-05, "loss": 7.0855, "loss/crossentropy": 2.0983700484037398, "loss/hidden": 3.289453125, "loss/jsd": 0.0, "loss/logits": 0.1855665436014533, "step": 24650 }, { "epoch": 0.822, "grad_norm": 24.25, "grad_norm_var": 0.6684895833333333, "learning_rate": 5.97975019423867e-05, "loss": 7.0473, "loss/crossentropy": 2.035327473282814, "loss/hidden": 3.296484375, "loss/jsd": 0.0, "loss/logits": 0.168466529622674, "step": 24660 }, { "epoch": 0.8223333333333334, "grad_norm": 25.125, "grad_norm_var": 0.9580729166666667, "learning_rate": 5.9656912536269015e-05, "loss": 7.1414, "loss/crossentropy": 2.1208567664027216, "loss/hidden": 3.266796875, "loss/jsd": 0.0, "loss/logits": 0.17649125978350638, "step": 24670 }, { "epoch": 0.8226666666666667, "grad_norm": 22.875, "grad_norm_var": 1.2436848958333333, "learning_rate": 5.951627716830467e-05, "loss": 7.0739, "loss/crossentropy": 2.098179739713669, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.16574389152228833, "step": 24680 }, { "epoch": 0.823, "grad_norm": 24.75, "grad_norm_var": 3.40625, "learning_rate": 5.937559722650799e-05, "loss": 7.0056, "loss/crossentropy": 1.9866340532898903, "loss/hidden": 3.21015625, "loss/jsd": 0.0, "loss/logits": 0.15324429739266635, "step": 24690 }, { "epoch": 0.8233333333333334, "grad_norm": 23.75, "grad_norm_var": 1.1768229166666666, "learning_rate": 5.923487409933316e-05, "loss": 7.0449, "loss/crossentropy": 2.0189765483140945, "loss/hidden": 3.226953125, "loss/jsd": 0.0, "loss/logits": 0.1633864961564541, "step": 24700 }, { "epoch": 0.8236666666666667, "grad_norm": 24.75, "grad_norm_var": 2.551041666666667, "learning_rate": 5.909410917566066e-05, "loss": 7.1172, "loss/crossentropy": 1.9779780194163323, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.16957656461745502, "step": 24710 }, { "epoch": 0.824, "grad_norm": 23.0, "grad_norm_var": 2.0374348958333335, "learning_rate": 5.8953303844783456e-05, "loss": 7.0346, "loss/crossentropy": 2.046160864830017, "loss/hidden": 3.26171875, "loss/jsd": 0.0, "loss/logits": 0.17642345037311316, "step": 24720 }, { "epoch": 0.8243333333333334, "grad_norm": 25.625, "grad_norm_var": 3.0152302976861993e+18, "learning_rate": 5.881245949639331e-05, "loss": 7.1386, "loss/crossentropy": 2.0644598096609115, "loss/hidden": 3.26953125, "loss/jsd": 0.0, "loss/logits": 0.16755576469004155, "step": 24730 }, { "epoch": 0.8246666666666667, "grad_norm": 22.625, "grad_norm_var": 1.5059895833333334, "learning_rate": 5.86715775205671e-05, "loss": 6.9054, "loss/crossentropy": 1.9000537507236004, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.16331597846001386, "step": 24740 }, { "epoch": 0.825, "grad_norm": 23.375, "grad_norm_var": 1.2119140625, "learning_rate": 5.8530659307753036e-05, "loss": 7.0357, "loss/crossentropy": 1.8462681017816067, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.1659678179770708, "step": 24750 }, { "epoch": 0.8253333333333334, "grad_norm": 23.125, "grad_norm_var": 1.2677083333333334, "learning_rate": 5.838970624875698e-05, "loss": 7.0251, "loss/crossentropy": 2.1098924592137336, "loss/hidden": 3.2328125, "loss/jsd": 0.0, "loss/logits": 0.16376893278211355, "step": 24760 }, { "epoch": 0.8256666666666667, "grad_norm": 26.25, "grad_norm_var": 1.5848307291666666, "learning_rate": 5.824871973472874e-05, "loss": 7.0887, "loss/crossentropy": 1.9424061939120292, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.16495383866131305, "step": 24770 }, { "epoch": 0.826, "grad_norm": 4898947072.0, "grad_norm_var": 1.4999801359186788e+18, "learning_rate": 5.8107701157148277e-05, "loss": 7.0898, "loss/crossentropy": 2.0248693346977236, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.17345572579652072, "step": 24780 }, { "epoch": 0.8263333333333334, "grad_norm": 25.625, "grad_norm_var": 1.4999801358166175e+18, "learning_rate": 5.796665190781201e-05, "loss": 6.9785, "loss/crossentropy": 1.9292976334691048, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.17304837796837091, "step": 24790 }, { "epoch": 0.8266666666666667, "grad_norm": 23.5, "grad_norm_var": 32.87024739583333, "learning_rate": 5.782557337881911e-05, "loss": 7.1747, "loss/crossentropy": 1.8966447107493878, "loss/hidden": 3.32578125, "loss/jsd": 0.0, "loss/logits": 0.16593249971047044, "step": 24800 }, { "epoch": 0.827, "grad_norm": 5066719232.0, "grad_norm_var": 1.6044777186466813e+18, "learning_rate": 5.768446696255769e-05, "loss": 7.1285, "loss/crossentropy": 1.9656455472111702, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.17714616544544698, "step": 24810 }, { "epoch": 0.8273333333333334, "grad_norm": 22.375, "grad_norm_var": 1.6044777203778104e+18, "learning_rate": 5.754333405169111e-05, "loss": 6.9896, "loss/crossentropy": 2.0341189607977865, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.17659138450399042, "step": 24820 }, { "epoch": 0.8276666666666667, "grad_norm": 25.25, "grad_norm_var": 2.066666666666667, "learning_rate": 5.740217603914423e-05, "loss": 7.0219, "loss/crossentropy": 1.9185968987643718, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.1869355977512896, "step": 24830 }, { "epoch": 0.828, "grad_norm": 24.625, "grad_norm_var": 0.9889973958333333, "learning_rate": 5.726099431808963e-05, "loss": 7.1146, "loss/crossentropy": 1.8310720384120942, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.16051160339266063, "step": 24840 }, { "epoch": 0.8283333333333334, "grad_norm": 24.875, "grad_norm_var": 0.5910807291666667, "learning_rate": 5.7119790281933914e-05, "loss": 7.0388, "loss/crossentropy": 1.9527421653270722, "loss/hidden": 3.23203125, "loss/jsd": 0.0, "loss/logits": 0.16238325983285903, "step": 24850 }, { "epoch": 0.8286666666666667, "grad_norm": 25.25, "grad_norm_var": 89.24583333333334, "learning_rate": 5.6978565324303926e-05, "loss": 7.0605, "loss/crossentropy": 2.168795867264271, "loss/hidden": 3.24453125, "loss/jsd": 0.0, "loss/logits": 0.16646072771400214, "step": 24860 }, { "epoch": 0.829, "grad_norm": 22.875, "grad_norm_var": 89.57057291666666, "learning_rate": 5.683732083903296e-05, "loss": 7.0862, "loss/crossentropy": 1.9611302673816682, "loss/hidden": 3.26796875, "loss/jsd": 0.0, "loss/logits": 0.1669299216940999, "step": 24870 }, { "epoch": 0.8293333333333334, "grad_norm": 23.25, "grad_norm_var": 1.1806640625, "learning_rate": 5.669605822014706e-05, "loss": 7.0333, "loss/crossentropy": 1.7856550820171833, "loss/hidden": 3.243359375, "loss/jsd": 0.0, "loss/logits": 0.15327755445614458, "step": 24880 }, { "epoch": 0.8296666666666667, "grad_norm": 24.75, "grad_norm_var": 0.8072916666666666, "learning_rate": 5.655477886185126e-05, "loss": 7.0163, "loss/crossentropy": 2.0259492844343185, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.16790905371308326, "step": 24890 }, { "epoch": 0.83, "grad_norm": 24.5, "grad_norm_var": 0.8129557291666667, "learning_rate": 5.641348415851577e-05, "loss": 6.97, "loss/crossentropy": 1.9602381430566311, "loss/hidden": 3.240625, "loss/jsd": 0.0, "loss/logits": 0.1515656548552215, "step": 24900 }, { "epoch": 0.8303333333333334, "grad_norm": 23.875, "grad_norm_var": 0.9968098958333333, "learning_rate": 5.62721755046623e-05, "loss": 6.9703, "loss/crossentropy": 2.0206805035471915, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.18460522294044496, "step": 24910 }, { "epoch": 0.8306666666666667, "grad_norm": 23.875, "grad_norm_var": 0.8238932291666666, "learning_rate": 5.61308542949502e-05, "loss": 7.0159, "loss/crossentropy": 1.9096168451011182, "loss/hidden": 3.27421875, "loss/jsd": 0.0, "loss/logits": 0.16171670304611324, "step": 24920 }, { "epoch": 0.831, "grad_norm": 24.75, "grad_norm_var": 0.5723307291666667, "learning_rate": 5.598952192416274e-05, "loss": 7.0482, "loss/crossentropy": 1.9502201959490777, "loss/hidden": 3.251953125, "loss/jsd": 0.0, "loss/logits": 0.1745383620262146, "step": 24930 }, { "epoch": 0.8313333333333334, "grad_norm": 26.875, "grad_norm_var": 1.8233723958333334, "learning_rate": 5.584817978719338e-05, "loss": 7.0207, "loss/crossentropy": 2.020615467429161, "loss/hidden": 3.23203125, "loss/jsd": 0.0, "loss/logits": 0.1621037432923913, "step": 24940 }, { "epoch": 0.8316666666666667, "grad_norm": 23.5, "grad_norm_var": 0.9497395833333333, "learning_rate": 5.570682927903194e-05, "loss": 7.0464, "loss/crossentropy": 1.9485878251492976, "loss/hidden": 3.305078125, "loss/jsd": 0.0, "loss/logits": 0.18183569833636284, "step": 24950 }, { "epoch": 0.832, "grad_norm": 22.625, "grad_norm_var": 0.57265625, "learning_rate": 5.556547179475088e-05, "loss": 6.9525, "loss/crossentropy": 2.003249977529049, "loss/hidden": 3.172265625, "loss/jsd": 0.0, "loss/logits": 0.15923047866672277, "step": 24960 }, { "epoch": 0.8323333333333334, "grad_norm": 24.375, "grad_norm_var": 0.4988932291666667, "learning_rate": 5.54241087294915e-05, "loss": 7.0322, "loss/crossentropy": 1.9297384425997735, "loss/hidden": 3.25859375, "loss/jsd": 0.0, "loss/logits": 0.1763775937259197, "step": 24970 }, { "epoch": 0.8326666666666667, "grad_norm": 21.75, "grad_norm_var": 1.0848307291666666, "learning_rate": 5.528274147845016e-05, "loss": 6.994, "loss/crossentropy": 2.1030173070728777, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.17390758330002426, "step": 24980 }, { "epoch": 0.833, "grad_norm": 23.25, "grad_norm_var": 1.94765625, "learning_rate": 5.514137143686459e-05, "loss": 7.0266, "loss/crossentropy": 2.0625434547662733, "loss/hidden": 3.228125, "loss/jsd": 0.0, "loss/logits": 0.1578001905232668, "step": 24990 }, { "epoch": 0.8333333333333334, "grad_norm": 22.625, "grad_norm_var": 0.7910807291666667, "learning_rate": 5.500000000000001e-05, "loss": 6.9959, "loss/crossentropy": 2.1564994513988496, "loss/hidden": 3.234765625, "loss/jsd": 0.0, "loss/logits": 0.16419483684003353, "step": 25000 }, { "epoch": 0.8336666666666667, "grad_norm": 22.125, "grad_norm_var": 1.3046223958333334, "learning_rate": 5.485862856313543e-05, "loss": 6.9643, "loss/crossentropy": 1.9692196190357207, "loss/hidden": 3.23046875, "loss/jsd": 0.0, "loss/logits": 0.17642919681966304, "step": 25010 }, { "epoch": 0.834, "grad_norm": 25.0, "grad_norm_var": 1.1301432291666667, "learning_rate": 5.4717258521549855e-05, "loss": 7.0045, "loss/crossentropy": 1.896971306949854, "loss/hidden": 3.316015625, "loss/jsd": 0.0, "loss/logits": 0.16555657889693975, "step": 25020 }, { "epoch": 0.8343333333333334, "grad_norm": 23.0, "grad_norm_var": 5.253125, "learning_rate": 5.4575891270508526e-05, "loss": 7.0076, "loss/crossentropy": 1.9228805772960187, "loss/hidden": 3.23671875, "loss/jsd": 0.0, "loss/logits": 0.1604012963362038, "step": 25030 }, { "epoch": 0.8346666666666667, "grad_norm": 23.875, "grad_norm_var": 4.692708333333333, "learning_rate": 5.443452820524913e-05, "loss": 7.1268, "loss/crossentropy": 1.9976628370583058, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18949662614613771, "step": 25040 }, { "epoch": 0.835, "grad_norm": 23.25, "grad_norm_var": 0.9931640625, "learning_rate": 5.429317072096808e-05, "loss": 6.9171, "loss/crossentropy": 1.95694150775671, "loss/hidden": 3.30703125, "loss/jsd": 0.0, "loss/logits": 0.17416810244321823, "step": 25050 }, { "epoch": 0.8353333333333334, "grad_norm": 23.0, "grad_norm_var": 1.4254557291666667, "learning_rate": 5.4151820212806633e-05, "loss": 6.9901, "loss/crossentropy": 1.967911347746849, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.16883484926074743, "step": 25060 }, { "epoch": 0.8356666666666667, "grad_norm": 24.5, "grad_norm_var": 1.0718098958333333, "learning_rate": 5.401047807583728e-05, "loss": 7.0916, "loss/crossentropy": 1.9769475132226944, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.1776350039988756, "step": 25070 }, { "epoch": 0.836, "grad_norm": 24.0, "grad_norm_var": 0.6205729166666667, "learning_rate": 5.3869145705049814e-05, "loss": 7.1016, "loss/crossentropy": 2.0979168742895125, "loss/hidden": 3.24453125, "loss/jsd": 0.0, "loss/logits": 0.17412376143038272, "step": 25080 }, { "epoch": 0.8363333333333334, "grad_norm": 22.625, "grad_norm_var": 1.0655598958333334, "learning_rate": 5.372782449533771e-05, "loss": 7.0239, "loss/crossentropy": 2.2201668590307237, "loss/hidden": 3.15703125, "loss/jsd": 0.0, "loss/logits": 0.16662366669625045, "step": 25090 }, { "epoch": 0.8366666666666667, "grad_norm": 24.5, "grad_norm_var": 0.7947916666666667, "learning_rate": 5.358651584148423e-05, "loss": 7.0296, "loss/crossentropy": 1.9929725021123885, "loss/hidden": 3.25859375, "loss/jsd": 0.0, "loss/logits": 0.16217339746654033, "step": 25100 }, { "epoch": 0.837, "grad_norm": 23.5, "grad_norm_var": 0.9895182291666667, "learning_rate": 5.344522113814875e-05, "loss": 6.9488, "loss/crossentropy": 1.940374694764614, "loss/hidden": 3.068359375, "loss/jsd": 0.0, "loss/logits": 0.15328829986974596, "step": 25110 }, { "epoch": 0.8373333333333334, "grad_norm": 24.625, "grad_norm_var": 1.365625, "learning_rate": 5.330394177985295e-05, "loss": 7.0492, "loss/crossentropy": 1.9210307955741883, "loss/hidden": 3.316796875, "loss/jsd": 0.0, "loss/logits": 0.16172695737332105, "step": 25120 }, { "epoch": 0.8376666666666667, "grad_norm": 23.5, "grad_norm_var": 1.39140625, "learning_rate": 5.316267916096705e-05, "loss": 6.838, "loss/crossentropy": 2.1268305659294127, "loss/hidden": 3.221484375, "loss/jsd": 0.0, "loss/logits": 0.1686540162190795, "step": 25130 }, { "epoch": 0.838, "grad_norm": 23.125, "grad_norm_var": 1.3035807291666666, "learning_rate": 5.302143467569609e-05, "loss": 7.0213, "loss/crossentropy": 2.2016376689076425, "loss/hidden": 3.239453125, "loss/jsd": 0.0, "loss/logits": 0.185344104655087, "step": 25140 }, { "epoch": 0.8383333333333334, "grad_norm": 23.625, "grad_norm_var": 0.7619140625, "learning_rate": 5.288020971806609e-05, "loss": 7.0272, "loss/crossentropy": 1.9791180558502675, "loss/hidden": 3.294140625, "loss/jsd": 0.0, "loss/logits": 0.16557303946465254, "step": 25150 }, { "epoch": 0.8386666666666667, "grad_norm": 21.75, "grad_norm_var": 0.9837890625, "learning_rate": 5.273900568191038e-05, "loss": 6.871, "loss/crossentropy": 2.078375779092312, "loss/hidden": 3.248046875, "loss/jsd": 0.0, "loss/logits": 0.1593981696292758, "step": 25160 }, { "epoch": 0.839, "grad_norm": 22.375, "grad_norm_var": 1.7968098958333334, "learning_rate": 5.259782396085579e-05, "loss": 6.963, "loss/crossentropy": 1.9973531074821949, "loss/hidden": 3.166796875, "loss/jsd": 0.0, "loss/logits": 0.15650712680071593, "step": 25170 }, { "epoch": 0.8393333333333334, "grad_norm": 22.375, "grad_norm_var": 1.8041666666666667, "learning_rate": 5.24566659483089e-05, "loss": 6.8987, "loss/crossentropy": 1.9296558193862439, "loss/hidden": 3.121875, "loss/jsd": 0.0, "loss/logits": 0.1516895718872547, "step": 25180 }, { "epoch": 0.8396666666666667, "grad_norm": 21.75, "grad_norm_var": 0.7635416666666667, "learning_rate": 5.231553303744232e-05, "loss": 6.9936, "loss/crossentropy": 2.114676037430763, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.17353012934327125, "step": 25190 }, { "epoch": 0.84, "grad_norm": 22.5, "grad_norm_var": 0.4205729166666667, "learning_rate": 5.2174426621180906e-05, "loss": 6.9546, "loss/crossentropy": 2.0760419577360154, "loss/hidden": 3.226171875, "loss/jsd": 0.0, "loss/logits": 0.16167073398828508, "step": 25200 }, { "epoch": 0.8403333333333334, "grad_norm": 22.125, "grad_norm_var": 2.5940733667158523e+18, "learning_rate": 5.2033348092187996e-05, "loss": 6.8677, "loss/crossentropy": 1.9620779484510422, "loss/hidden": 3.191796875, "loss/jsd": 0.0, "loss/logits": 0.1550289398059249, "step": 25210 }, { "epoch": 0.8406666666666667, "grad_norm": 22.125, "grad_norm_var": 2.594073366910468e+18, "learning_rate": 5.189229884285174e-05, "loss": 6.9043, "loss/crossentropy": 1.9976623475551605, "loss/hidden": 3.2375, "loss/jsd": 0.0, "loss/logits": 0.16729694679379464, "step": 25220 }, { "epoch": 0.841, "grad_norm": 25.25, "grad_norm_var": 2.8238932291666665, "learning_rate": 5.175128026527128e-05, "loss": 7.0103, "loss/crossentropy": 2.192066043615341, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.20732564926147462, "step": 25230 }, { "epoch": 0.8413333333333334, "grad_norm": 23.375, "grad_norm_var": 2.5020182291666666, "learning_rate": 5.161029375124303e-05, "loss": 6.9139, "loss/crossentropy": 1.877561804652214, "loss/hidden": 3.178515625, "loss/jsd": 0.0, "loss/logits": 0.15400900933891534, "step": 25240 }, { "epoch": 0.8416666666666667, "grad_norm": 21.25, "grad_norm_var": 1.5604166666666666, "learning_rate": 5.1469340692246995e-05, "loss": 6.9029, "loss/crossentropy": 2.0022835403680803, "loss/hidden": 3.1765625, "loss/jsd": 0.0, "loss/logits": 0.16735202725976706, "step": 25250 }, { "epoch": 0.842, "grad_norm": 22.125, "grad_norm_var": 4.509375, "learning_rate": 5.1328422479432915e-05, "loss": 6.9624, "loss/crossentropy": 1.8773959062993526, "loss/hidden": 3.25546875, "loss/jsd": 0.0, "loss/logits": 0.15194975724443793, "step": 25260 }, { "epoch": 0.8423333333333334, "grad_norm": 21.75, "grad_norm_var": 1.1247395833333333, "learning_rate": 5.11875405036067e-05, "loss": 6.9817, "loss/crossentropy": 2.165113839507103, "loss/hidden": 3.247265625, "loss/jsd": 0.0, "loss/logits": 0.16993321236222983, "step": 25270 }, { "epoch": 0.8426666666666667, "grad_norm": 22.125, "grad_norm_var": 0.9184895833333333, "learning_rate": 5.104669615521657e-05, "loss": 6.923, "loss/crossentropy": 2.084018699079752, "loss/hidden": 3.15546875, "loss/jsd": 0.0, "loss/logits": 0.16481912517920136, "step": 25280 }, { "epoch": 0.843, "grad_norm": 22.375, "grad_norm_var": 1.2830729166666666, "learning_rate": 5.090589082433935e-05, "loss": 6.9785, "loss/crossentropy": 2.2565275222063064, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.17223725598305464, "step": 25290 }, { "epoch": 0.8433333333333334, "grad_norm": 23.0, "grad_norm_var": 1.3186848958333333, "learning_rate": 5.076512590066685e-05, "loss": 7.0165, "loss/crossentropy": 2.0910873889923094, "loss/hidden": 3.279296875, "loss/jsd": 0.0, "loss/logits": 0.16965348087251186, "step": 25300 }, { "epoch": 0.8436666666666667, "grad_norm": 22.25, "grad_norm_var": 1.065625, "learning_rate": 5.062440277349203e-05, "loss": 6.9454, "loss/crossentropy": 2.2037932582199575, "loss/hidden": 3.130078125, "loss/jsd": 0.0, "loss/logits": 0.16220169235020876, "step": 25310 }, { "epoch": 0.844, "grad_norm": 22.75, "grad_norm_var": 2.582747395833333, "learning_rate": 5.048372283169532e-05, "loss": 6.9965, "loss/crossentropy": 2.1661527663469315, "loss/hidden": 3.18984375, "loss/jsd": 0.0, "loss/logits": 0.16337131895124912, "step": 25320 }, { "epoch": 0.8443333333333334, "grad_norm": 23.0, "grad_norm_var": 1.9525390625, "learning_rate": 5.0343087463730996e-05, "loss": 6.9872, "loss/crossentropy": 2.1112076193094254, "loss/hidden": 3.278125, "loss/jsd": 0.0, "loss/logits": 0.16945380419492723, "step": 25330 }, { "epoch": 0.8446666666666667, "grad_norm": 23.0, "grad_norm_var": 0.6872395833333333, "learning_rate": 5.020249805761331e-05, "loss": 7.0327, "loss/crossentropy": 1.9474051117897033, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.1742366042919457, "step": 25340 }, { "epoch": 0.845, "grad_norm": 22.25, "grad_norm_var": 0.9494140625, "learning_rate": 5.006195600090297e-05, "loss": 7.0176, "loss/crossentropy": 2.0254900440573693, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.1812945833429694, "step": 25350 }, { "epoch": 0.8453333333333334, "grad_norm": 22.5, "grad_norm_var": 1.2004557291666667, "learning_rate": 4.992146268069333e-05, "loss": 6.899, "loss/crossentropy": 1.9700914964079856, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.17236013878136874, "step": 25360 }, { "epoch": 0.8456666666666667, "grad_norm": 21.375, "grad_norm_var": 1.1629557291666666, "learning_rate": 4.9781019483596746e-05, "loss": 6.9598, "loss/crossentropy": 2.0217429384589196, "loss/hidden": 3.219921875, "loss/jsd": 0.0, "loss/logits": 0.1671298835426569, "step": 25370 }, { "epoch": 0.846, "grad_norm": 23.25, "grad_norm_var": 1.7785807291666667, "learning_rate": 4.9640627795730866e-05, "loss": 6.88, "loss/crossentropy": 1.943567543849349, "loss/hidden": 3.249609375, "loss/jsd": 0.0, "loss/logits": 0.1804880647920072, "step": 25380 }, { "epoch": 0.8463333333333334, "grad_norm": 24.875, "grad_norm_var": 2.3009765625, "learning_rate": 4.9500289002704984e-05, "loss": 6.8981, "loss/crossentropy": 2.006009988486767, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.17958016656339168, "step": 25390 }, { "epoch": 0.8466666666666667, "grad_norm": 22.5, "grad_norm_var": 4.074934895833334, "learning_rate": 4.936000448960631e-05, "loss": 7.0513, "loss/crossentropy": 2.2360173970460893, "loss/hidden": 3.230078125, "loss/jsd": 0.0, "loss/logits": 0.18158553242683412, "step": 25400 }, { "epoch": 0.847, "grad_norm": 24.0, "grad_norm_var": 2.7462890625, "learning_rate": 4.9219775640986366e-05, "loss": 6.924, "loss/crossentropy": 1.8591318547725677, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.1813932742923498, "step": 25410 }, { "epoch": 0.8473333333333334, "grad_norm": 22.75, "grad_norm_var": 1.3337890625, "learning_rate": 4.907960384084729e-05, "loss": 6.9502, "loss/crossentropy": 2.1037135615944864, "loss/hidden": 3.21953125, "loss/jsd": 0.0, "loss/logits": 0.16334721986204387, "step": 25420 }, { "epoch": 0.8476666666666667, "grad_norm": 22.375, "grad_norm_var": 1.86015625, "learning_rate": 4.8939490472628136e-05, "loss": 6.8949, "loss/crossentropy": 1.9349641531705857, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.16478215027600526, "step": 25430 }, { "epoch": 0.848, "grad_norm": 22.75, "grad_norm_var": 0.5947265625, "learning_rate": 4.87994369191913e-05, "loss": 6.9594, "loss/crossentropy": 2.1374287590384484, "loss/hidden": 3.138671875, "loss/jsd": 0.0, "loss/logits": 0.1534802021458745, "step": 25440 }, { "epoch": 0.8483333333333334, "grad_norm": 22.75, "grad_norm_var": 0.9608723958333333, "learning_rate": 4.865944456280879e-05, "loss": 6.876, "loss/crossentropy": 1.952788008749485, "loss/hidden": 3.270703125, "loss/jsd": 0.0, "loss/logits": 0.16467729359865188, "step": 25450 }, { "epoch": 0.8486666666666667, "grad_norm": 23.375, "grad_norm_var": 11.014518229166667, "learning_rate": 4.851951478514866e-05, "loss": 7.014, "loss/crossentropy": 2.1284357413649557, "loss/hidden": 3.238671875, "loss/jsd": 0.0, "loss/logits": 0.16996914581395686, "step": 25460 }, { "epoch": 0.849, "grad_norm": 23.25, "grad_norm_var": 11.498958333333333, "learning_rate": 4.837964896726132e-05, "loss": 6.9063, "loss/crossentropy": 2.096763235330582, "loss/hidden": 3.18984375, "loss/jsd": 0.0, "loss/logits": 0.162611080147326, "step": 25470 }, { "epoch": 0.8493333333333334, "grad_norm": 20.125, "grad_norm_var": 1.0518229166666666, "learning_rate": 4.823984848956593e-05, "loss": 6.8635, "loss/crossentropy": 2.0680422112345695, "loss/hidden": 3.23984375, "loss/jsd": 0.0, "loss/logits": 0.15710455570369958, "step": 25480 }, { "epoch": 0.8496666666666667, "grad_norm": 20.75, "grad_norm_var": 1.9317057291666666, "learning_rate": 4.810011473183677e-05, "loss": 6.8994, "loss/crossentropy": 2.0600294291973116, "loss/hidden": 3.29375, "loss/jsd": 0.0, "loss/logits": 0.16484030187129975, "step": 25490 }, { "epoch": 0.85, "grad_norm": 23.25, "grad_norm_var": 8.0212890625, "learning_rate": 4.7960449073189606e-05, "loss": 6.9659, "loss/crossentropy": 1.971318671107292, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.18275586236268282, "step": 25500 }, { "epoch": 0.8503333333333334, "grad_norm": 21.875, "grad_norm_var": 0.8061848958333333, "learning_rate": 4.7820852892068114e-05, "loss": 6.9837, "loss/crossentropy": 1.809413194656372, "loss/hidden": 3.2796875, "loss/jsd": 0.0, "loss/logits": 0.15677141044288873, "step": 25510 }, { "epoch": 0.8506666666666667, "grad_norm": 22.5, "grad_norm_var": 0.7009765625, "learning_rate": 4.768132756623024e-05, "loss": 6.8624, "loss/crossentropy": 1.8316463023424148, "loss/hidden": 3.30859375, "loss/jsd": 0.0, "loss/logits": 0.1573034648783505, "step": 25520 }, { "epoch": 0.851, "grad_norm": 21.0, "grad_norm_var": 0.953125, "learning_rate": 4.754187447273461e-05, "loss": 6.8507, "loss/crossentropy": 1.9352269530296327, "loss/hidden": 3.27578125, "loss/jsd": 0.0, "loss/logits": 0.15878485683351756, "step": 25530 }, { "epoch": 0.8513333333333334, "grad_norm": 22.25, "grad_norm_var": 1.61015625, "learning_rate": 4.740249498792698e-05, "loss": 6.8568, "loss/crossentropy": 2.010533457994461, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.17276135310530663, "step": 25540 }, { "epoch": 0.8516666666666667, "grad_norm": 21.375, "grad_norm_var": 0.8395833333333333, "learning_rate": 4.7263190487426564e-05, "loss": 6.9387, "loss/crossentropy": 2.134304754436016, "loss/hidden": 3.274609375, "loss/jsd": 0.0, "loss/logits": 0.1872491927817464, "step": 25550 }, { "epoch": 0.852, "grad_norm": 21.75, "grad_norm_var": 0.9747395833333333, "learning_rate": 4.7123962346112584e-05, "loss": 6.886, "loss/crossentropy": 2.072511524707079, "loss/hidden": 3.144921875, "loss/jsd": 0.0, "loss/logits": 0.1534841218031943, "step": 25560 }, { "epoch": 0.8523333333333334, "grad_norm": 23.375, "grad_norm_var": 0.7905598958333333, "learning_rate": 4.698481193811054e-05, "loss": 6.8584, "loss/crossentropy": 2.1463774725794793, "loss/hidden": 3.184375, "loss/jsd": 0.0, "loss/logits": 0.16341485381126403, "step": 25570 }, { "epoch": 0.8526666666666667, "grad_norm": 21.125, "grad_norm_var": 2.0010416666666666, "learning_rate": 4.684574063677881e-05, "loss": 6.9106, "loss/crossentropy": 2.065951754152775, "loss/hidden": 3.2859375, "loss/jsd": 0.0, "loss/logits": 0.16066975481808185, "step": 25580 }, { "epoch": 0.853, "grad_norm": 29.5, "grad_norm_var": 5.664518229166666, "learning_rate": 4.6706749814694997e-05, "loss": 6.7972, "loss/crossentropy": 2.0351175434887407, "loss/hidden": 3.193359375, "loss/jsd": 0.0, "loss/logits": 0.16005632225424052, "step": 25590 }, { "epoch": 0.8533333333333334, "grad_norm": 21.75, "grad_norm_var": 6.993684895833334, "learning_rate": 4.6567840843642384e-05, "loss": 6.9496, "loss/crossentropy": 2.0523830361664297, "loss/hidden": 3.146875, "loss/jsd": 0.0, "loss/logits": 0.15730505622923374, "step": 25600 }, { "epoch": 0.8536666666666667, "grad_norm": 23.0, "grad_norm_var": 7.039518229166666, "learning_rate": 4.642901509459646e-05, "loss": 6.7785, "loss/crossentropy": 2.1218873113393784, "loss/hidden": 3.20703125, "loss/jsd": 0.0, "loss/logits": 0.16152856182307004, "step": 25610 }, { "epoch": 0.854, "grad_norm": 21.625, "grad_norm_var": 6.968489583333334, "learning_rate": 4.629027393771129e-05, "loss": 6.8866, "loss/crossentropy": 1.9210114896297454, "loss/hidden": 3.155078125, "loss/jsd": 0.0, "loss/logits": 0.14758066833019257, "step": 25620 }, { "epoch": 0.8543333333333333, "grad_norm": 23.5, "grad_norm_var": 1.2405598958333333, "learning_rate": 4.61516187423061e-05, "loss": 6.899, "loss/crossentropy": 2.0598912209272386, "loss/hidden": 3.251953125, "loss/jsd": 0.0, "loss/logits": 0.1691096406430006, "step": 25630 }, { "epoch": 0.8546666666666667, "grad_norm": 23.875, "grad_norm_var": 1.0854166666666667, "learning_rate": 4.601305087685169e-05, "loss": 6.9173, "loss/crossentropy": 2.205300694704056, "loss/hidden": 3.23671875, "loss/jsd": 0.0, "loss/logits": 0.17890902925282717, "step": 25640 }, { "epoch": 0.855, "grad_norm": 22.0, "grad_norm_var": 1.8083333333333333, "learning_rate": 4.587457170895696e-05, "loss": 6.868, "loss/crossentropy": 2.148914474248886, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.17795586232095956, "step": 25650 }, { "epoch": 0.8553333333333333, "grad_norm": 21.25, "grad_norm_var": 2.0822265625, "learning_rate": 4.573618260535536e-05, "loss": 6.8924, "loss/crossentropy": 1.9687716513872147, "loss/hidden": 3.313671875, "loss/jsd": 0.0, "loss/logits": 0.18561361059546472, "step": 25660 }, { "epoch": 0.8556666666666667, "grad_norm": 21.5, "grad_norm_var": 284.1080729166667, "learning_rate": 4.559788493189149e-05, "loss": 6.8702, "loss/crossentropy": 2.0562238790094853, "loss/hidden": 3.18359375, "loss/jsd": 0.0, "loss/logits": 0.1623332142829895, "step": 25670 }, { "epoch": 0.856, "grad_norm": 20.5, "grad_norm_var": 282.97682291666666, "learning_rate": 4.545968005350756e-05, "loss": 6.8716, "loss/crossentropy": 2.110888344049454, "loss/hidden": 3.22578125, "loss/jsd": 0.0, "loss/logits": 0.16720662415027618, "step": 25680 }, { "epoch": 0.8563333333333333, "grad_norm": 21.25, "grad_norm_var": 0.49765625, "learning_rate": 4.5321569334229916e-05, "loss": 6.8537, "loss/crossentropy": 1.988807225972414, "loss/hidden": 3.259375, "loss/jsd": 0.0, "loss/logits": 0.16363519094884396, "step": 25690 }, { "epoch": 0.8566666666666667, "grad_norm": 21.5, "grad_norm_var": 4.025, "learning_rate": 4.5183554137155606e-05, "loss": 6.84, "loss/crossentropy": 1.9043106943368913, "loss/hidden": 3.165625, "loss/jsd": 0.0, "loss/logits": 0.14644915759563445, "step": 25700 }, { "epoch": 0.857, "grad_norm": 24.0, "grad_norm_var": 2.8580729166666665, "learning_rate": 4.504563582443889e-05, "loss": 6.964, "loss/crossentropy": 2.051722328364849, "loss/hidden": 3.206640625, "loss/jsd": 0.0, "loss/logits": 0.17551100347191095, "step": 25710 }, { "epoch": 0.8573333333333333, "grad_norm": 21.625, "grad_norm_var": 0.96640625, "learning_rate": 4.490781575727786e-05, "loss": 6.8005, "loss/crossentropy": 2.009612035751343, "loss/hidden": 3.187109375, "loss/jsd": 0.0, "loss/logits": 0.14919841345399618, "step": 25720 }, { "epoch": 0.8576666666666667, "grad_norm": 22.125, "grad_norm_var": 0.5322916666666667, "learning_rate": 4.4770095295900924e-05, "loss": 6.8377, "loss/crossentropy": 2.0377252414822578, "loss/hidden": 3.158203125, "loss/jsd": 0.0, "loss/logits": 0.18529058247804642, "step": 25730 }, { "epoch": 0.858, "grad_norm": 5771362304.0, "grad_norm_var": 2.0817889116463037e+18, "learning_rate": 4.463247579955344e-05, "loss": 7.0199, "loss/crossentropy": 1.9959793724119663, "loss/hidden": 3.283203125, "loss/jsd": 0.0, "loss/logits": 0.16647218465805053, "step": 25740 }, { "epoch": 0.8583333333333333, "grad_norm": 21.125, "grad_norm_var": 2.081788911934872e+18, "learning_rate": 4.4494958626484276e-05, "loss": 6.8499, "loss/crossentropy": 2.0159687541425226, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.15637603402137756, "step": 25750 }, { "epoch": 0.8586666666666667, "grad_norm": 22.125, "grad_norm_var": 1.3205729166666667, "learning_rate": 4.43575451339324e-05, "loss": 6.9875, "loss/crossentropy": 2.023764471709728, "loss/hidden": 3.19609375, "loss/jsd": 0.0, "loss/logits": 0.15618936270475386, "step": 25760 }, { "epoch": 0.859, "grad_norm": 20.75, "grad_norm_var": 4.481184895833334, "learning_rate": 4.4220236678113536e-05, "loss": 6.8386, "loss/crossentropy": 2.0717529535293577, "loss/hidden": 3.173828125, "loss/jsd": 0.0, "loss/logits": 0.1514882566407323, "step": 25770 }, { "epoch": 0.8593333333333333, "grad_norm": 22.875, "grad_norm_var": 4.6025390625, "learning_rate": 4.4083034614206674e-05, "loss": 6.9052, "loss/crossentropy": 2.154834459722042, "loss/hidden": 3.181640625, "loss/jsd": 0.0, "loss/logits": 0.15938506573438643, "step": 25780 }, { "epoch": 0.8596666666666667, "grad_norm": 22.5, "grad_norm_var": 1.3811848958333333, "learning_rate": 4.3945940296340824e-05, "loss": 6.9774, "loss/crossentropy": 2.141025458276272, "loss/hidden": 3.224609375, "loss/jsd": 0.0, "loss/logits": 0.1737861094996333, "step": 25790 }, { "epoch": 0.86, "grad_norm": 21.125, "grad_norm_var": 1.7874348958333333, "learning_rate": 4.380895507758155e-05, "loss": 6.8555, "loss/crossentropy": 1.9630529195070268, "loss/hidden": 3.222265625, "loss/jsd": 0.0, "loss/logits": 0.16939648147672415, "step": 25800 }, { "epoch": 0.8603333333333333, "grad_norm": 20.375, "grad_norm_var": 1.5978515625, "learning_rate": 4.367208030991764e-05, "loss": 6.8227, "loss/crossentropy": 1.893832840025425, "loss/hidden": 3.196484375, "loss/jsd": 0.0, "loss/logits": 0.15906968284398318, "step": 25810 }, { "epoch": 0.8606666666666667, "grad_norm": 22.25, "grad_norm_var": 1.7520182291666666, "learning_rate": 4.353531734424782e-05, "loss": 6.9535, "loss/crossentropy": 1.9903650164604187, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.17482327315956353, "step": 25820 }, { "epoch": 0.861, "grad_norm": 22.375, "grad_norm_var": 0.61640625, "learning_rate": 4.3398667530367306e-05, "loss": 6.8628, "loss/crossentropy": 2.053640615940094, "loss/hidden": 3.285546875, "loss/jsd": 0.0, "loss/logits": 0.19147922191768885, "step": 25830 }, { "epoch": 0.8613333333333333, "grad_norm": 24.625, "grad_norm_var": 56.209309895833336, "learning_rate": 4.3262132216954656e-05, "loss": 6.9087, "loss/crossentropy": 2.0847674936056135, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.17146144881844522, "step": 25840 }, { "epoch": 0.8616666666666667, "grad_norm": 20.75, "grad_norm_var": 3.0233723958333334, "learning_rate": 4.312571275155823e-05, "loss": 6.8788, "loss/crossentropy": 2.0731761664152146, "loss/hidden": 3.144921875, "loss/jsd": 0.0, "loss/logits": 0.16782324127852916, "step": 25850 }, { "epoch": 0.862, "grad_norm": 22.875, "grad_norm_var": 2.468489583333333, "learning_rate": 4.2989410480583116e-05, "loss": 6.8898, "loss/crossentropy": 1.9479005321860314, "loss/hidden": 3.233984375, "loss/jsd": 0.0, "loss/logits": 0.15610639620572328, "step": 25860 }, { "epoch": 0.8623333333333333, "grad_norm": 26.0, "grad_norm_var": 489.34583333333336, "learning_rate": 4.285322674927768e-05, "loss": 6.858, "loss/crossentropy": 1.9504839967936278, "loss/hidden": 3.195703125, "loss/jsd": 0.0, "loss/logits": 0.16356785856187345, "step": 25870 }, { "epoch": 0.8626666666666667, "grad_norm": 20.375, "grad_norm_var": 2.1858723958333335, "learning_rate": 4.271716290172038e-05, "loss": 6.9462, "loss/crossentropy": 2.123225097358227, "loss/hidden": 3.27109375, "loss/jsd": 0.0, "loss/logits": 0.16322279013693333, "step": 25880 }, { "epoch": 0.863, "grad_norm": 21.5, "grad_norm_var": 1.52265625, "learning_rate": 4.258122028080646e-05, "loss": 6.8839, "loss/crossentropy": 2.133731837570667, "loss/hidden": 3.16640625, "loss/jsd": 0.0, "loss/logits": 0.16982710380107163, "step": 25890 }, { "epoch": 0.8633333333333333, "grad_norm": 21.375, "grad_norm_var": 1.2479166666666666, "learning_rate": 4.2445400228234686e-05, "loss": 6.8131, "loss/crossentropy": 2.0845893740653993, "loss/hidden": 3.18203125, "loss/jsd": 0.0, "loss/logits": 0.17494960688054562, "step": 25900 }, { "epoch": 0.8636666666666667, "grad_norm": 24.625, "grad_norm_var": 1.909375, "learning_rate": 4.230970408449418e-05, "loss": 6.8778, "loss/crossentropy": 2.051340754330158, "loss/hidden": 3.279296875, "loss/jsd": 0.0, "loss/logits": 0.17781901303678752, "step": 25910 }, { "epoch": 0.864, "grad_norm": 20.875, "grad_norm_var": 2.33515625, "learning_rate": 4.217413318885108e-05, "loss": 6.8758, "loss/crossentropy": 2.0643552422523497, "loss/hidden": 3.170703125, "loss/jsd": 0.0, "loss/logits": 0.1577897410839796, "step": 25920 }, { "epoch": 0.8643333333333333, "grad_norm": 24.875, "grad_norm_var": 1.6145182291666667, "learning_rate": 4.203868887933541e-05, "loss": 6.8634, "loss/crossentropy": 2.0819766454398634, "loss/hidden": 3.286328125, "loss/jsd": 0.0, "loss/logits": 0.16645964570343494, "step": 25930 }, { "epoch": 0.8646666666666667, "grad_norm": 22.125, "grad_norm_var": 1.2541666666666667, "learning_rate": 4.190337249272778e-05, "loss": 6.7763, "loss/crossentropy": 2.0638196393847466, "loss/hidden": 3.2515625, "loss/jsd": 0.0, "loss/logits": 0.1723222305998206, "step": 25940 }, { "epoch": 0.865, "grad_norm": 21.375, "grad_norm_var": 0.8973307291666667, "learning_rate": 4.176818536454633e-05, "loss": 6.9368, "loss/crossentropy": 1.9603420421481133, "loss/hidden": 3.14609375, "loss/jsd": 0.0, "loss/logits": 0.14999181237071751, "step": 25950 }, { "epoch": 0.8653333333333333, "grad_norm": 21.625, "grad_norm_var": 0.8802083333333334, "learning_rate": 4.163312882903344e-05, "loss": 6.7771, "loss/crossentropy": 2.026094362139702, "loss/hidden": 3.26875, "loss/jsd": 0.0, "loss/logits": 0.15913072023540736, "step": 25960 }, { "epoch": 0.8656666666666667, "grad_norm": 24.5, "grad_norm_var": 1.1759765625, "learning_rate": 4.1498204219142575e-05, "loss": 6.8418, "loss/crossentropy": 2.0223079532384873, "loss/hidden": 3.153125, "loss/jsd": 0.0, "loss/logits": 0.1593662802129984, "step": 25970 }, { "epoch": 0.866, "grad_norm": 21.25, "grad_norm_var": 1.265625, "learning_rate": 4.1363412866525185e-05, "loss": 6.8294, "loss/crossentropy": 2.1484047800302504, "loss/hidden": 3.16875, "loss/jsd": 0.0, "loss/logits": 0.15932908514514565, "step": 25980 }, { "epoch": 0.8663333333333333, "grad_norm": 21.75, "grad_norm_var": 0.91640625, "learning_rate": 4.1228756101517475e-05, "loss": 6.7557, "loss/crossentropy": 1.8646988950669765, "loss/hidden": 3.26953125, "loss/jsd": 0.0, "loss/logits": 0.17402277877554298, "step": 25990 }, { "epoch": 0.8666666666666667, "grad_norm": 21.875, "grad_norm_var": 0.76640625, "learning_rate": 4.109423525312738e-05, "loss": 6.9098, "loss/crossentropy": 2.212962034344673, "loss/hidden": 3.174609375, "loss/jsd": 0.0, "loss/logits": 0.16355629544705153, "step": 26000 }, { "epoch": 0.867, "grad_norm": 23.0, "grad_norm_var": 0.5978515625, "learning_rate": 4.0959851649021344e-05, "loss": 6.9753, "loss/crossentropy": 1.9954494401812553, "loss/hidden": 3.270703125, "loss/jsd": 0.0, "loss/logits": 0.17405376564711333, "step": 26010 }, { "epoch": 0.8673333333333333, "grad_norm": 22.375, "grad_norm_var": 0.75390625, "learning_rate": 4.0825606615511305e-05, "loss": 7.0159, "loss/crossentropy": 2.1162398613989355, "loss/hidden": 3.087890625, "loss/jsd": 0.0, "loss/logits": 0.14784672670066357, "step": 26020 }, { "epoch": 0.8676666666666667, "grad_norm": 21.0, "grad_norm_var": 2.594073367574846e+18, "learning_rate": 4.069150147754151e-05, "loss": 6.9345, "loss/crossentropy": 1.9555442228913307, "loss/hidden": 3.191796875, "loss/jsd": 0.0, "loss/logits": 0.157724441960454, "step": 26030 }, { "epoch": 0.868, "grad_norm": 22.25, "grad_norm_var": 0.5895833333333333, "learning_rate": 4.0557537558675583e-05, "loss": 7.0644, "loss/crossentropy": 2.1746340721845625, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.1773978678509593, "step": 26040 }, { "epoch": 0.8683333333333333, "grad_norm": 24.25, "grad_norm_var": 1.3176432291666667, "learning_rate": 4.042371618108329e-05, "loss": 6.7896, "loss/crossentropy": 1.9868990987539292, "loss/hidden": 3.216796875, "loss/jsd": 0.0, "loss/logits": 0.15773731619119644, "step": 26050 }, { "epoch": 0.8686666666666667, "grad_norm": 20.875, "grad_norm_var": 1.96015625, "learning_rate": 4.0290038665527596e-05, "loss": 6.8032, "loss/crossentropy": 2.1249034658074377, "loss/hidden": 3.162109375, "loss/jsd": 0.0, "loss/logits": 0.1534424176439643, "step": 26060 }, { "epoch": 0.869, "grad_norm": 24.125, "grad_norm_var": 2.8893229166666665, "learning_rate": 4.015650633135163e-05, "loss": 6.861, "loss/crossentropy": 2.0854105949401855, "loss/hidden": 3.12265625, "loss/jsd": 0.0, "loss/logits": 0.16376592293381692, "step": 26070 }, { "epoch": 0.8693333333333333, "grad_norm": 20.375, "grad_norm_var": 1.4309895833333333, "learning_rate": 4.00231204964656e-05, "loss": 6.8508, "loss/crossentropy": 2.0845695704221727, "loss/hidden": 3.164453125, "loss/jsd": 0.0, "loss/logits": 0.15720440819859505, "step": 26080 }, { "epoch": 0.8696666666666667, "grad_norm": 23.375, "grad_norm_var": 1.2332682291666666, "learning_rate": 3.9889882477333874e-05, "loss": 6.7856, "loss/crossentropy": 1.9149738550186157, "loss/hidden": 3.256640625, "loss/jsd": 0.0, "loss/logits": 0.15268718972802162, "step": 26090 }, { "epoch": 0.87, "grad_norm": 22.625, "grad_norm_var": 2.383072916666667, "learning_rate": 3.9756793588961896e-05, "loss": 6.895, "loss/crossentropy": 2.0855982795357706, "loss/hidden": 3.18046875, "loss/jsd": 0.0, "loss/logits": 0.15859134048223494, "step": 26100 }, { "epoch": 0.8703333333333333, "grad_norm": 23.875, "grad_norm_var": 1.2754557291666666, "learning_rate": 3.962385514488326e-05, "loss": 6.8528, "loss/crossentropy": 2.095623345673084, "loss/hidden": 3.217578125, "loss/jsd": 0.0, "loss/logits": 0.1637012053281069, "step": 26110 }, { "epoch": 0.8706666666666667, "grad_norm": 21.375, "grad_norm_var": 1.7705729166666666, "learning_rate": 3.949106845714674e-05, "loss": 6.8451, "loss/crossentropy": 1.8034477911889553, "loss/hidden": 3.085546875, "loss/jsd": 0.0, "loss/logits": 0.14811227219179274, "step": 26120 }, { "epoch": 0.871, "grad_norm": 22.375, "grad_norm_var": 14.495247395833333, "learning_rate": 3.9358434836303336e-05, "loss": 6.8776, "loss/crossentropy": 1.918418012559414, "loss/hidden": 3.282421875, "loss/jsd": 0.0, "loss/logits": 0.16268355417996644, "step": 26130 }, { "epoch": 0.8713333333333333, "grad_norm": 21.0, "grad_norm_var": 1.4895833333333333, "learning_rate": 3.922595559139336e-05, "loss": 6.8094, "loss/crossentropy": 2.0094055980443954, "loss/hidden": 3.13984375, "loss/jsd": 0.0, "loss/logits": 0.14857212770730258, "step": 26140 }, { "epoch": 0.8716666666666667, "grad_norm": 21.75, "grad_norm_var": 1.16640625, "learning_rate": 3.9093632029933435e-05, "loss": 6.8686, "loss/crossentropy": 1.9957379199564458, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.14984978251159192, "step": 26150 }, { "epoch": 0.872, "grad_norm": 20.75, "grad_norm_var": 0.4947916666666667, "learning_rate": 3.896146545790372e-05, "loss": 6.7922, "loss/crossentropy": 2.0107031047344206, "loss/hidden": 3.17421875, "loss/jsd": 0.0, "loss/logits": 0.15825871471315622, "step": 26160 }, { "epoch": 0.8723333333333333, "grad_norm": 22.375, "grad_norm_var": 0.7457682291666666, "learning_rate": 3.882945717973493e-05, "loss": 6.873, "loss/crossentropy": 1.9550271481275558, "loss/hidden": 3.1921875, "loss/jsd": 0.0, "loss/logits": 0.15566041497513652, "step": 26170 }, { "epoch": 0.8726666666666667, "grad_norm": 21.75, "grad_norm_var": 0.5197916666666667, "learning_rate": 3.8697608498295445e-05, "loss": 6.8371, "loss/crossentropy": 2.016065427660942, "loss/hidden": 3.176953125, "loss/jsd": 0.0, "loss/logits": 0.1619328921660781, "step": 26180 }, { "epoch": 0.873, "grad_norm": 20.625, "grad_norm_var": 0.97890625, "learning_rate": 3.856592071487856e-05, "loss": 6.8235, "loss/crossentropy": 2.003811553120613, "loss/hidden": 3.15234375, "loss/jsd": 0.0, "loss/logits": 0.1609561923891306, "step": 26190 }, { "epoch": 0.8733333333333333, "grad_norm": 23.0, "grad_norm_var": 1.50390625, "learning_rate": 3.843439512918949e-05, "loss": 6.8469, "loss/crossentropy": 2.0764830335974693, "loss/hidden": 3.256640625, "loss/jsd": 0.0, "loss/logits": 0.16151853874325753, "step": 26200 }, { "epoch": 0.8736666666666667, "grad_norm": 21.5, "grad_norm_var": 1.196875, "learning_rate": 3.830303303933271e-05, "loss": 6.7814, "loss/crossentropy": 2.1526197090744974, "loss/hidden": 3.108984375, "loss/jsd": 0.0, "loss/logits": 0.1596878958866, "step": 26210 }, { "epoch": 0.874, "grad_norm": 20.75, "grad_norm_var": 2.0893229166666667, "learning_rate": 3.817183574179899e-05, "loss": 6.9767, "loss/crossentropy": 2.1804853290319444, "loss/hidden": 3.18671875, "loss/jsd": 0.0, "loss/logits": 0.16332378438673914, "step": 26220 }, { "epoch": 0.8743333333333333, "grad_norm": 22.75, "grad_norm_var": 1.4337890625, "learning_rate": 3.804080453145269e-05, "loss": 6.8338, "loss/crossentropy": 2.0908204093575478, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.16485908310860395, "step": 26230 }, { "epoch": 0.8746666666666667, "grad_norm": 21.875, "grad_norm_var": 0.5223307291666667, "learning_rate": 3.790994070151895e-05, "loss": 6.8804, "loss/crossentropy": 2.14748295545578, "loss/hidden": 3.276953125, "loss/jsd": 0.0, "loss/logits": 0.17930770702660084, "step": 26240 }, { "epoch": 0.875, "grad_norm": 20.25, "grad_norm_var": 0.7072916666666667, "learning_rate": 3.777924554357096e-05, "loss": 6.7229, "loss/crossentropy": 2.000728341937065, "loss/hidden": 3.21328125, "loss/jsd": 0.0, "loss/logits": 0.1487280648201704, "step": 26250 }, { "epoch": 0.8753333333333333, "grad_norm": 21.125, "grad_norm_var": 2.13515625, "learning_rate": 3.7648720347517166e-05, "loss": 6.7624, "loss/crossentropy": 1.9981106102466584, "loss/hidden": 3.25703125, "loss/jsd": 0.0, "loss/logits": 0.15397127764299512, "step": 26260 }, { "epoch": 0.8756666666666667, "grad_norm": 20.375, "grad_norm_var": 1.5372395833333334, "learning_rate": 3.7518366401588536e-05, "loss": 6.8671, "loss/crossentropy": 2.229478067159653, "loss/hidden": 3.126171875, "loss/jsd": 0.0, "loss/logits": 0.15968595184385775, "step": 26270 }, { "epoch": 0.876, "grad_norm": 21.625, "grad_norm_var": 1.2525390625, "learning_rate": 3.738818499232589e-05, "loss": 6.786, "loss/crossentropy": 2.017266020178795, "loss/hidden": 3.18828125, "loss/jsd": 0.0, "loss/logits": 0.16191368382424115, "step": 26280 }, { "epoch": 0.8763333333333333, "grad_norm": 21.75, "grad_norm_var": 1.1780598958333333, "learning_rate": 3.725817740456721e-05, "loss": 7.0003, "loss/crossentropy": 2.014402036368847, "loss/hidden": 3.177734375, "loss/jsd": 0.0, "loss/logits": 0.15464963000267745, "step": 26290 }, { "epoch": 0.8766666666666667, "grad_norm": 20.5, "grad_norm_var": 1.271875, "learning_rate": 3.712834492143488e-05, "loss": 6.856, "loss/crossentropy": 1.9306327871978284, "loss/hidden": 3.261328125, "loss/jsd": 0.0, "loss/logits": 0.15594524987973274, "step": 26300 }, { "epoch": 0.877, "grad_norm": 21.0, "grad_norm_var": 0.88515625, "learning_rate": 3.699868882432309e-05, "loss": 6.8967, "loss/crossentropy": 1.9130300246179104, "loss/hidden": 3.09765625, "loss/jsd": 0.0, "loss/logits": 0.13649473995901645, "step": 26310 }, { "epoch": 0.8773333333333333, "grad_norm": 21.5, "grad_norm_var": 2.0336566906021478e+18, "learning_rate": 3.686921039288519e-05, "loss": 7.0264, "loss/crossentropy": 2.181557595729828, "loss/hidden": 3.279296875, "loss/jsd": 0.0, "loss/logits": 0.16082917023450136, "step": 26320 }, { "epoch": 0.8776666666666667, "grad_norm": 22.0, "grad_norm_var": 2.0336566913151795e+18, "learning_rate": 3.673991090502101e-05, "loss": 6.8328, "loss/crossentropy": 1.9753350079059602, "loss/hidden": 3.13828125, "loss/jsd": 0.0, "loss/logits": 0.1500071782618761, "step": 26330 }, { "epoch": 0.878, "grad_norm": 22.5, "grad_norm_var": 1.1697916666666666, "learning_rate": 3.661079163686431e-05, "loss": 6.8732, "loss/crossentropy": 1.9919006183743477, "loss/hidden": 3.105859375, "loss/jsd": 0.0, "loss/logits": 0.1555755365639925, "step": 26340 }, { "epoch": 0.8783333333333333, "grad_norm": 23.25, "grad_norm_var": 1.1916015625, "learning_rate": 3.648185386277011e-05, "loss": 6.8612, "loss/crossentropy": 2.0109338417649267, "loss/hidden": 3.1859375, "loss/jsd": 0.0, "loss/logits": 0.15537302363663913, "step": 26350 }, { "epoch": 0.8786666666666667, "grad_norm": 21.25, "grad_norm_var": 1.0379557291666666, "learning_rate": 3.6353098855302215e-05, "loss": 6.7091, "loss/crossentropy": 1.9878887504339218, "loss/hidden": 3.0875, "loss/jsd": 0.0, "loss/logits": 0.1486053698696196, "step": 26360 }, { "epoch": 0.879, "grad_norm": 21.5, "grad_norm_var": 0.5676432291666667, "learning_rate": 3.622452788522057e-05, "loss": 6.8374, "loss/crossentropy": 1.9497860811650753, "loss/hidden": 3.23046875, "loss/jsd": 0.0, "loss/logits": 0.1838926389813423, "step": 26370 }, { "epoch": 0.8793333333333333, "grad_norm": 21.5, "grad_norm_var": 0.9098307291666666, "learning_rate": 3.609614222146872e-05, "loss": 6.8293, "loss/crossentropy": 2.1610096618533134, "loss/hidden": 3.177734375, "loss/jsd": 0.0, "loss/logits": 0.1660961801186204, "step": 26380 }, { "epoch": 0.8796666666666667, "grad_norm": 20.375, "grad_norm_var": 0.99140625, "learning_rate": 3.596794313116136e-05, "loss": 6.8184, "loss/crossentropy": 2.0240518391132354, "loss/hidden": 3.149609375, "loss/jsd": 0.0, "loss/logits": 0.16600796654820443, "step": 26390 }, { "epoch": 0.88, "grad_norm": 24.25, "grad_norm_var": 1.4854166666666666, "learning_rate": 3.583993187957173e-05, "loss": 6.8498, "loss/crossentropy": 1.9888987004756928, "loss/hidden": 3.249609375, "loss/jsd": 0.0, "loss/logits": 0.1616065276786685, "step": 26400 }, { "epoch": 0.8803333333333333, "grad_norm": 20.625, "grad_norm_var": 1.8791666666666667, "learning_rate": 3.571210973011924e-05, "loss": 6.8116, "loss/crossentropy": 1.8744437299668788, "loss/hidden": 3.220703125, "loss/jsd": 0.0, "loss/logits": 0.15525809191167356, "step": 26410 }, { "epoch": 0.8806666666666667, "grad_norm": 22.0, "grad_norm_var": 98.81608072916667, "learning_rate": 3.5584477944356845e-05, "loss": 6.9597, "loss/crossentropy": 2.1749876379966735, "loss/hidden": 3.2375, "loss/jsd": 0.0, "loss/logits": 0.16630711518228053, "step": 26420 }, { "epoch": 0.881, "grad_norm": 21.5, "grad_norm_var": 98.72265625, "learning_rate": 3.5457037781958805e-05, "loss": 6.9383, "loss/crossentropy": 2.053211937844753, "loss/hidden": 3.176171875, "loss/jsd": 0.0, "loss/logits": 0.15977289276197554, "step": 26430 }, { "epoch": 0.8813333333333333, "grad_norm": 23.125, "grad_norm_var": 1.06015625, "learning_rate": 3.532979050070804e-05, "loss": 6.8057, "loss/crossentropy": 1.9794712126255036, "loss/hidden": 3.29609375, "loss/jsd": 0.0, "loss/logits": 0.16630720421671868, "step": 26440 }, { "epoch": 0.8816666666666667, "grad_norm": 20.5, "grad_norm_var": 4.742122395833333, "learning_rate": 3.520273735648382e-05, "loss": 6.7564, "loss/crossentropy": 2.0068790689110756, "loss/hidden": 3.21328125, "loss/jsd": 0.0, "loss/logits": 0.15001734271645545, "step": 26450 }, { "epoch": 0.882, "grad_norm": 20.875, "grad_norm_var": 0.8375, "learning_rate": 3.507587960324944e-05, "loss": 6.9896, "loss/crossentropy": 1.996173518896103, "loss/hidden": 3.202734375, "loss/jsd": 0.0, "loss/logits": 0.16484985016286374, "step": 26460 }, { "epoch": 0.8823333333333333, "grad_norm": 22.375, "grad_norm_var": 0.9186848958333333, "learning_rate": 3.494921849303967e-05, "loss": 6.9035, "loss/crossentropy": 2.044399265944958, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.17791436351835727, "step": 26470 }, { "epoch": 0.8826666666666667, "grad_norm": 20.0, "grad_norm_var": 1.3457682291666666, "learning_rate": 3.482275527594856e-05, "loss": 6.7077, "loss/crossentropy": 1.9672799199819564, "loss/hidden": 3.09609375, "loss/jsd": 0.0, "loss/logits": 0.14841998741030693, "step": 26480 }, { "epoch": 0.883, "grad_norm": 23.625, "grad_norm_var": 1.25390625, "learning_rate": 3.469649120011697e-05, "loss": 6.714, "loss/crossentropy": 2.0286851942539217, "loss/hidden": 3.160546875, "loss/jsd": 0.0, "loss/logits": 0.15806122818030416, "step": 26490 }, { "epoch": 0.8833333333333333, "grad_norm": 20.375, "grad_norm_var": 1.3455729166666666, "learning_rate": 3.45704275117204e-05, "loss": 6.8575, "loss/crossentropy": 2.0830163829028607, "loss/hidden": 3.194140625, "loss/jsd": 0.0, "loss/logits": 0.15631103357300163, "step": 26500 }, { "epoch": 0.8836666666666667, "grad_norm": 20.875, "grad_norm_var": 1.9254557291666667, "learning_rate": 3.444456545495652e-05, "loss": 6.8168, "loss/crossentropy": 1.9875051081180573, "loss/hidden": 3.215234375, "loss/jsd": 0.0, "loss/logits": 0.15791778452694416, "step": 26510 }, { "epoch": 0.884, "grad_norm": 21.125, "grad_norm_var": 1.2979166666666666, "learning_rate": 3.431890627203305e-05, "loss": 6.8636, "loss/crossentropy": 2.145845976471901, "loss/hidden": 3.14921875, "loss/jsd": 0.0, "loss/logits": 0.16650803480297327, "step": 26520 }, { "epoch": 0.8843333333333333, "grad_norm": 23.75, "grad_norm_var": 1.3427083333333334, "learning_rate": 3.419345120315538e-05, "loss": 6.8361, "loss/crossentropy": 2.108425536751747, "loss/hidden": 3.209765625, "loss/jsd": 0.0, "loss/logits": 0.16783894039690495, "step": 26530 }, { "epoch": 0.8846666666666667, "grad_norm": 21.5, "grad_norm_var": 1.2098307291666666, "learning_rate": 3.4068201486514376e-05, "loss": 6.9219, "loss/crossentropy": 2.0194236926734446, "loss/hidden": 3.217578125, "loss/jsd": 0.0, "loss/logits": 0.16996841207146646, "step": 26540 }, { "epoch": 0.885, "grad_norm": 21.375, "grad_norm_var": 4.703580729166666, "learning_rate": 3.394315835827421e-05, "loss": 6.9036, "loss/crossentropy": 2.0967435270547865, "loss/hidden": 3.17265625, "loss/jsd": 0.0, "loss/logits": 0.15975359827280045, "step": 26550 }, { "epoch": 0.8853333333333333, "grad_norm": 27.875, "grad_norm_var": 3.9785807291666666, "learning_rate": 3.381832305256004e-05, "loss": 6.9396, "loss/crossentropy": 2.10302966684103, "loss/hidden": 3.077734375, "loss/jsd": 0.0, "loss/logits": 0.17214497793465852, "step": 26560 }, { "epoch": 0.8856666666666667, "grad_norm": 20.5, "grad_norm_var": 1.9625138908734423e+18, "learning_rate": 3.3693696801445954e-05, "loss": 6.9578, "loss/crossentropy": 2.2149000599980355, "loss/hidden": 3.213671875, "loss/jsd": 0.0, "loss/logits": 0.17492201793938875, "step": 26570 }, { "epoch": 0.886, "grad_norm": 22.125, "grad_norm_var": 1.9625138910719027e+18, "learning_rate": 3.356928083494274e-05, "loss": 6.9037, "loss/crossentropy": 2.0742943078279494, "loss/hidden": 3.119140625, "loss/jsd": 0.0, "loss/logits": 0.15730843115597964, "step": 26580 }, { "epoch": 0.8863333333333333, "grad_norm": 20.75, "grad_norm_var": 2.6059895833333333, "learning_rate": 3.344507638098576e-05, "loss": 6.7874, "loss/crossentropy": 1.9956744939088822, "loss/hidden": 3.201953125, "loss/jsd": 0.0, "loss/logits": 0.15899997791275383, "step": 26590 }, { "epoch": 0.8866666666666667, "grad_norm": 20.375, "grad_norm_var": 0.6603515625, "learning_rate": 3.3321084665422807e-05, "loss": 6.8337, "loss/crossentropy": 1.9816527277231217, "loss/hidden": 3.175, "loss/jsd": 0.0, "loss/logits": 0.16425166334956884, "step": 26600 }, { "epoch": 0.887, "grad_norm": 20.5, "grad_norm_var": 1.3455729166666666, "learning_rate": 3.319730691200209e-05, "loss": 6.8578, "loss/crossentropy": 1.8931610018014908, "loss/hidden": 3.1734375, "loss/jsd": 0.0, "loss/logits": 0.15017597610130906, "step": 26610 }, { "epoch": 0.8873333333333333, "grad_norm": 23.625, "grad_norm_var": 2.134375, "learning_rate": 3.307374434236003e-05, "loss": 6.7593, "loss/crossentropy": 2.0294124722480773, "loss/hidden": 3.161328125, "loss/jsd": 0.0, "loss/logits": 0.15351739330217243, "step": 26620 }, { "epoch": 0.8876666666666667, "grad_norm": 22.625, "grad_norm_var": 19.565625, "learning_rate": 3.295039817600936e-05, "loss": 6.8753, "loss/crossentropy": 2.105466166138649, "loss/hidden": 3.216796875, "loss/jsd": 0.0, "loss/logits": 0.18532855240628124, "step": 26630 }, { "epoch": 0.888, "grad_norm": 21.25, "grad_norm_var": 15.985416666666667, "learning_rate": 3.2827269630326885e-05, "loss": 6.7157, "loss/crossentropy": 2.0061062544584276, "loss/hidden": 3.219921875, "loss/jsd": 0.0, "loss/logits": 0.15896273953840137, "step": 26640 }, { "epoch": 0.8883333333333333, "grad_norm": 21.75, "grad_norm_var": 9.217122395833334, "learning_rate": 3.270435992054166e-05, "loss": 6.9179, "loss/crossentropy": 2.0659444093704225, "loss/hidden": 3.2515625, "loss/jsd": 0.0, "loss/logits": 0.16416719797998666, "step": 26650 }, { "epoch": 0.8886666666666667, "grad_norm": 26.375, "grad_norm_var": 3.4905598958333335, "learning_rate": 3.258167025972292e-05, "loss": 6.851, "loss/crossentropy": 2.0915834248065948, "loss/hidden": 3.230078125, "loss/jsd": 0.0, "loss/logits": 0.15449760612100363, "step": 26660 }, { "epoch": 0.889, "grad_norm": 29.75, "grad_norm_var": 10.816080729166666, "learning_rate": 3.245920185876805e-05, "loss": 6.9643, "loss/crossentropy": 1.9756429754197598, "loss/hidden": 3.17578125, "loss/jsd": 0.0, "loss/logits": 0.15356689458712935, "step": 26670 }, { "epoch": 0.8893333333333333, "grad_norm": 26.75, "grad_norm_var": 11.518489583333333, "learning_rate": 3.233695592639077e-05, "loss": 6.9679, "loss/crossentropy": 1.9231618136167525, "loss/hidden": 3.290625, "loss/jsd": 0.0, "loss/logits": 0.15716882031410934, "step": 26680 }, { "epoch": 0.8896666666666667, "grad_norm": 22.25, "grad_norm_var": 13.483072916666666, "learning_rate": 3.221493366910903e-05, "loss": 6.8899, "loss/crossentropy": 1.9402207165956498, "loss/hidden": 3.172265625, "loss/jsd": 0.0, "loss/logits": 0.15298937689512968, "step": 26690 }, { "epoch": 0.89, "grad_norm": 23.25, "grad_norm_var": 12.5056640625, "learning_rate": 3.2093136291233296e-05, "loss": 6.8965, "loss/crossentropy": 1.9652688920497894, "loss/hidden": 3.266015625, "loss/jsd": 0.0, "loss/logits": 0.15768850333988665, "step": 26700 }, { "epoch": 0.8903333333333333, "grad_norm": 21.5, "grad_norm_var": 4.339518229166667, "learning_rate": 3.197156499485447e-05, "loss": 6.8229, "loss/crossentropy": 2.013945384323597, "loss/hidden": 3.150390625, "loss/jsd": 0.0, "loss/logits": 0.15189841520041228, "step": 26710 }, { "epoch": 0.8906666666666667, "grad_norm": 25.25, "grad_norm_var": 2.01640625, "learning_rate": 3.185022097983221e-05, "loss": 6.8226, "loss/crossentropy": 2.015190437436104, "loss/hidden": 3.262109375, "loss/jsd": 0.0, "loss/logits": 0.16249268716201187, "step": 26720 }, { "epoch": 0.891, "grad_norm": 26.875, "grad_norm_var": 4.39375, "learning_rate": 3.172910544378294e-05, "loss": 6.9557, "loss/crossentropy": 2.177249902486801, "loss/hidden": 3.132421875, "loss/jsd": 0.0, "loss/logits": 0.15406437516212462, "step": 26730 }, { "epoch": 0.8913333333333333, "grad_norm": 24.375, "grad_norm_var": 4.626497395833334, "learning_rate": 3.160821958206807e-05, "loss": 6.9043, "loss/crossentropy": 2.0702683687210084, "loss/hidden": 3.2078125, "loss/jsd": 0.0, "loss/logits": 0.1650611654855311, "step": 26740 }, { "epoch": 0.8916666666666667, "grad_norm": 23.0, "grad_norm_var": 3.2301432291666665, "learning_rate": 3.1487564587782306e-05, "loss": 6.9284, "loss/crossentropy": 2.2026931807398795, "loss/hidden": 3.140234375, "loss/jsd": 0.0, "loss/logits": 0.17769969888031484, "step": 26750 }, { "epoch": 0.892, "grad_norm": 22.875, "grad_norm_var": 3.2577473958333334, "learning_rate": 3.1367141651741694e-05, "loss": 6.838, "loss/crossentropy": 1.979924051463604, "loss/hidden": 3.16875, "loss/jsd": 0.0, "loss/logits": 0.16592128686606883, "step": 26760 }, { "epoch": 0.8923333333333333, "grad_norm": 7348420608.0, "grad_norm_var": 3.374955317819448e+18, "learning_rate": 3.124695196247202e-05, "loss": 6.9768, "loss/crossentropy": 2.240220108628273, "loss/hidden": 3.144140625, "loss/jsd": 0.0, "loss/logits": 0.16037558643147348, "step": 26770 }, { "epoch": 0.8926666666666667, "grad_norm": 22.75, "grad_norm_var": 3.374955318079704e+18, "learning_rate": 3.112699670619696e-05, "loss": 6.8575, "loss/crossentropy": 2.125392961502075, "loss/hidden": 3.223046875, "loss/jsd": 0.0, "loss/logits": 0.16047360915690662, "step": 26780 }, { "epoch": 0.893, "grad_norm": 24.625, "grad_norm_var": 9.473958333333334, "learning_rate": 3.100727706682651e-05, "loss": 6.954, "loss/crossentropy": 2.044107362627983, "loss/hidden": 3.1625, "loss/jsd": 0.0, "loss/logits": 0.16029341490939258, "step": 26790 }, { "epoch": 0.8933333333333333, "grad_norm": 24.125, "grad_norm_var": 8.245572916666667, "learning_rate": 3.088779422594514e-05, "loss": 6.9426, "loss/crossentropy": 1.9640724688768387, "loss/hidden": 3.24453125, "loss/jsd": 0.0, "loss/logits": 0.15900588724762202, "step": 26800 }, { "epoch": 0.8936666666666667, "grad_norm": 20.875, "grad_norm_var": 4.214322916666666, "learning_rate": 3.0768549362800294e-05, "loss": 6.9375, "loss/crossentropy": 2.050625918060541, "loss/hidden": 3.1703125, "loss/jsd": 0.0, "loss/logits": 0.16425505680963398, "step": 26810 }, { "epoch": 0.894, "grad_norm": 23.5, "grad_norm_var": 3.2604166666666665, "learning_rate": 3.064954365429059e-05, "loss": 6.9182, "loss/crossentropy": 1.9688321188092233, "loss/hidden": 3.227734375, "loss/jsd": 0.0, "loss/logits": 0.17398671787232162, "step": 26820 }, { "epoch": 0.8943333333333333, "grad_norm": 22.25, "grad_norm_var": 3.302018229166667, "learning_rate": 3.053077827495433e-05, "loss": 6.8169, "loss/crossentropy": 2.088005256652832, "loss/hidden": 3.047265625, "loss/jsd": 0.0, "loss/logits": 0.14643877744674683, "step": 26830 }, { "epoch": 0.8946666666666667, "grad_norm": 21.625, "grad_norm_var": 2.1304840813595853e+18, "learning_rate": 3.0412254396957896e-05, "loss": 6.8132, "loss/crossentropy": 2.170038291811943, "loss/hidden": 3.19765625, "loss/jsd": 0.0, "loss/logits": 0.16484488490968943, "step": 26840 }, { "epoch": 0.895, "grad_norm": 26.0, "grad_norm_var": 2.1304840811710513e+18, "learning_rate": 3.0293973190084068e-05, "loss": 6.7694, "loss/crossentropy": 1.9106760919094086, "loss/hidden": 3.17421875, "loss/jsd": 0.0, "loss/logits": 0.14406620375812054, "step": 26850 }, { "epoch": 0.8953333333333333, "grad_norm": 26.0, "grad_norm_var": 2.5884765625, "learning_rate": 3.0175935821720648e-05, "loss": 6.8457, "loss/crossentropy": 2.27715582549572, "loss/hidden": 3.127734375, "loss/jsd": 0.0, "loss/logits": 0.158475461602211, "step": 26860 }, { "epoch": 0.8956666666666667, "grad_norm": 26.375, "grad_norm_var": 3.62890625, "learning_rate": 3.0058143456848765e-05, "loss": 6.7258, "loss/crossentropy": 1.9482488855719566, "loss/hidden": 3.14921875, "loss/jsd": 0.0, "loss/logits": 0.1498094605281949, "step": 26870 }, { "epoch": 0.896, "grad_norm": 21.125, "grad_norm_var": 1.4389704356790623e+18, "learning_rate": 2.994059725803156e-05, "loss": 6.7777, "loss/crossentropy": 2.0152445122599603, "loss/hidden": 3.187109375, "loss/jsd": 0.0, "loss/logits": 0.15422911364585162, "step": 26880 }, { "epoch": 0.8963333333333333, "grad_norm": 23.5, "grad_norm_var": 1.4389704356690657e+18, "learning_rate": 2.9823298385402492e-05, "loss": 6.8206, "loss/crossentropy": 2.0900501251220702, "loss/hidden": 3.166015625, "loss/jsd": 0.0, "loss/logits": 0.16731840167194606, "step": 26890 }, { "epoch": 0.8966666666666666, "grad_norm": 22.5, "grad_norm_var": 2.513641891262733e+18, "learning_rate": 2.9706247996654137e-05, "loss": 6.876, "loss/crossentropy": 1.9369783684611321, "loss/hidden": 3.293359375, "loss/jsd": 0.0, "loss/logits": 0.16292606424540282, "step": 26900 }, { "epoch": 0.897, "grad_norm": 26.5, "grad_norm_var": 6.4962890625, "learning_rate": 2.958944724702654e-05, "loss": 6.7905, "loss/crossentropy": 1.9939923129975796, "loss/hidden": 3.150390625, "loss/jsd": 0.0, "loss/logits": 0.15159244257956744, "step": 26910 }, { "epoch": 0.8973333333333333, "grad_norm": 26.75, "grad_norm_var": 2.792122395833333, "learning_rate": 2.947289728929597e-05, "loss": 6.8971, "loss/crossentropy": 2.078751567006111, "loss/hidden": 3.190234375, "loss/jsd": 0.0, "loss/logits": 0.17754473332315684, "step": 26920 }, { "epoch": 0.8976666666666666, "grad_norm": 25.0, "grad_norm_var": 1.6082682291666666, "learning_rate": 2.935659927376343e-05, "loss": 6.8012, "loss/crossentropy": 2.0356945395469666, "loss/hidden": 3.208984375, "loss/jsd": 0.0, "loss/logits": 0.16562622915953398, "step": 26930 }, { "epoch": 0.898, "grad_norm": 26.625, "grad_norm_var": 3.0184895833333334, "learning_rate": 2.924055434824342e-05, "loss": 6.7869, "loss/crossentropy": 2.1356831192970276, "loss/hidden": 3.251953125, "loss/jsd": 0.0, "loss/logits": 0.17853607889264822, "step": 26940 }, { "epoch": 0.8983333333333333, "grad_norm": 22.125, "grad_norm_var": 11.651822916666667, "learning_rate": 2.9124763658052478e-05, "loss": 6.8149, "loss/crossentropy": 1.9252381205558777, "loss/hidden": 3.224609375, "loss/jsd": 0.0, "loss/logits": 0.16029497589915992, "step": 26950 }, { "epoch": 0.8986666666666666, "grad_norm": 22.75, "grad_norm_var": 3.1830729166666667, "learning_rate": 2.900922834599797e-05, "loss": 6.9226, "loss/crossentropy": 2.205397879332304, "loss/hidden": 3.209765625, "loss/jsd": 0.0, "loss/logits": 0.15344799063168466, "step": 26960 }, { "epoch": 0.899, "grad_norm": 23.875, "grad_norm_var": 2.919205729166667, "learning_rate": 2.8893949552366796e-05, "loss": 6.8206, "loss/crossentropy": 2.09553968757391, "loss/hidden": 3.194140625, "loss/jsd": 0.0, "loss/logits": 0.16381179327145218, "step": 26970 }, { "epoch": 0.8993333333333333, "grad_norm": 25.875, "grad_norm_var": 3.3749348958333334, "learning_rate": 2.8778928414914085e-05, "loss": 6.8139, "loss/crossentropy": 2.029978536069393, "loss/hidden": 3.2390625, "loss/jsd": 0.0, "loss/logits": 0.1680966019630432, "step": 26980 }, { "epoch": 0.8996666666666666, "grad_norm": 24.875, "grad_norm_var": 2.4541015625, "learning_rate": 2.8664166068852062e-05, "loss": 6.8405, "loss/crossentropy": 1.9429209612309932, "loss/hidden": 3.27578125, "loss/jsd": 0.0, "loss/logits": 0.16744533190503716, "step": 26990 }, { "epoch": 0.9, "grad_norm": 21.75, "grad_norm_var": 3.388541666666667, "learning_rate": 2.854966364683872e-05, "loss": 6.8216, "loss/crossentropy": 1.9241836979985236, "loss/hidden": 3.226953125, "loss/jsd": 0.0, "loss/logits": 0.16613443605601788, "step": 27000 }, { "epoch": 0.9003333333333333, "grad_norm": 26.125, "grad_norm_var": 2.4205729166666665, "learning_rate": 2.843542227896676e-05, "loss": 6.8825, "loss/crossentropy": 2.012719841301441, "loss/hidden": 3.23515625, "loss/jsd": 0.0, "loss/logits": 0.15762700429186224, "step": 27010 }, { "epoch": 0.9006666666666666, "grad_norm": 22.625, "grad_norm_var": 2.0228515625, "learning_rate": 2.8321443092752338e-05, "loss": 6.7563, "loss/crossentropy": 1.9895868554711342, "loss/hidden": 3.196484375, "loss/jsd": 0.0, "loss/logits": 0.17497619222849609, "step": 27020 }, { "epoch": 0.901, "grad_norm": 24.0, "grad_norm_var": 3.278059895833333, "learning_rate": 2.8207727213124035e-05, "loss": 6.7559, "loss/crossentropy": 1.9740510500967503, "loss/hidden": 3.12265625, "loss/jsd": 0.0, "loss/logits": 0.1432420744560659, "step": 27030 }, { "epoch": 0.9013333333333333, "grad_norm": 24.125, "grad_norm_var": 2.713997395833333, "learning_rate": 2.809427576241167e-05, "loss": 6.8997, "loss/crossentropy": 2.1658532321453094, "loss/hidden": 3.162109375, "loss/jsd": 0.0, "loss/logits": 0.1637007687240839, "step": 27040 }, { "epoch": 0.9016666666666666, "grad_norm": 21.125, "grad_norm_var": 3.376822916666667, "learning_rate": 2.798108986033523e-05, "loss": 6.9438, "loss/crossentropy": 2.1903593868017195, "loss/hidden": 3.13359375, "loss/jsd": 0.0, "loss/logits": 0.1633994322270155, "step": 27050 }, { "epoch": 0.902, "grad_norm": 21.875, "grad_norm_var": 5.095572916666667, "learning_rate": 2.7868170623993905e-05, "loss": 7.0145, "loss/crossentropy": 2.0305363297462464, "loss/hidden": 3.247265625, "loss/jsd": 0.0, "loss/logits": 0.17958665620535613, "step": 27060 }, { "epoch": 0.9023333333333333, "grad_norm": 25.625, "grad_norm_var": 3.6702473958333335, "learning_rate": 2.7755519167854944e-05, "loss": 6.7408, "loss/crossentropy": 1.86053267121315, "loss/hidden": 3.155078125, "loss/jsd": 0.0, "loss/logits": 0.14872891837731003, "step": 27070 }, { "epoch": 0.9026666666666666, "grad_norm": 21.0, "grad_norm_var": 4.24375, "learning_rate": 2.764313660374277e-05, "loss": 6.8407, "loss/crossentropy": 2.007214891910553, "loss/hidden": 3.1125, "loss/jsd": 0.0, "loss/logits": 0.15470210947096347, "step": 27080 }, { "epoch": 0.903, "grad_norm": 23.375, "grad_norm_var": 6.474739583333333, "learning_rate": 2.753102404082789e-05, "loss": 6.9169, "loss/crossentropy": 2.1241619139909744, "loss/hidden": 3.139453125, "loss/jsd": 0.0, "loss/logits": 0.15222108382731675, "step": 27090 }, { "epoch": 0.9033333333333333, "grad_norm": 23.625, "grad_norm_var": 2.134375, "learning_rate": 2.741918258561607e-05, "loss": 6.7749, "loss/crossentropy": 1.9174664333462714, "loss/hidden": 3.1, "loss/jsd": 0.0, "loss/logits": 0.14461091123521327, "step": 27100 }, { "epoch": 0.9036666666666666, "grad_norm": 25.625, "grad_norm_var": 1.6541015625, "learning_rate": 2.7307613341937282e-05, "loss": 6.8602, "loss/crossentropy": 2.0042121566832067, "loss/hidden": 3.255859375, "loss/jsd": 0.0, "loss/logits": 0.1733078501187265, "step": 27110 }, { "epoch": 0.904, "grad_norm": 25.875, "grad_norm_var": 3.5332682291666666, "learning_rate": 2.7196317410934964e-05, "loss": 6.886, "loss/crossentropy": 2.019241477549076, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.19402055349200964, "step": 27120 }, { "epoch": 0.9043333333333333, "grad_norm": 22.875, "grad_norm_var": 2.234375, "learning_rate": 2.7085295891054997e-05, "loss": 6.9037, "loss/crossentropy": 2.0694904938340186, "loss/hidden": 3.2640625, "loss/jsd": 0.0, "loss/logits": 0.16915742177516221, "step": 27130 }, { "epoch": 0.9046666666666666, "grad_norm": 28.375, "grad_norm_var": 8.305143229166667, "learning_rate": 2.697454987803495e-05, "loss": 6.8822, "loss/crossentropy": 2.0155827552080154, "loss/hidden": 3.23671875, "loss/jsd": 0.0, "loss/logits": 0.16962535195052625, "step": 27140 }, { "epoch": 0.905, "grad_norm": 22.0, "grad_norm_var": 10.370768229166666, "learning_rate": 2.6864080464893282e-05, "loss": 6.7997, "loss/crossentropy": 2.08170278519392, "loss/hidden": 3.153515625, "loss/jsd": 0.0, "loss/logits": 0.1523496536538005, "step": 27150 }, { "epoch": 0.9053333333333333, "grad_norm": 20.125, "grad_norm_var": 1.3744140625, "learning_rate": 2.6753888741918488e-05, "loss": 6.9908, "loss/crossentropy": 2.0863103806972503, "loss/hidden": 3.188671875, "loss/jsd": 0.0, "loss/logits": 0.15311094475910067, "step": 27160 }, { "epoch": 0.9056666666666666, "grad_norm": 22.0, "grad_norm_var": 1.1864583333333334, "learning_rate": 2.6643975796658406e-05, "loss": 6.8451, "loss/crossentropy": 2.150225210189819, "loss/hidden": 3.04375, "loss/jsd": 0.0, "loss/logits": 0.14159671682864428, "step": 27170 }, { "epoch": 0.906, "grad_norm": 22.75, "grad_norm_var": 0.8052083333333333, "learning_rate": 2.65343427139094e-05, "loss": 6.8341, "loss/crossentropy": 2.0567213878035546, "loss/hidden": 3.133984375, "loss/jsd": 0.0, "loss/logits": 0.15704135950654746, "step": 27180 }, { "epoch": 0.9063333333333333, "grad_norm": 21.25, "grad_norm_var": 0.8254557291666667, "learning_rate": 2.642499057570578e-05, "loss": 6.8178, "loss/crossentropy": 2.0047308802604675, "loss/hidden": 3.1453125, "loss/jsd": 0.0, "loss/logits": 0.15476641841232777, "step": 27190 }, { "epoch": 0.9066666666666666, "grad_norm": 20.625, "grad_norm_var": 0.9639973958333333, "learning_rate": 2.6315920461308964e-05, "loss": 6.8925, "loss/crossentropy": 2.0975175350904465, "loss/hidden": 3.23984375, "loss/jsd": 0.0, "loss/logits": 0.17165146991610528, "step": 27200 }, { "epoch": 0.907, "grad_norm": 20.75, "grad_norm_var": 1.4910807291666666, "learning_rate": 2.620713344719698e-05, "loss": 6.9033, "loss/crossentropy": 2.116207906603813, "loss/hidden": 3.19609375, "loss/jsd": 0.0, "loss/logits": 0.17307401802390815, "step": 27210 }, { "epoch": 0.9073333333333333, "grad_norm": 19.75, "grad_norm_var": 1.5059895833333334, "learning_rate": 2.6098630607053704e-05, "loss": 6.8249, "loss/crossentropy": 2.1922834485769274, "loss/hidden": 3.1828125, "loss/jsd": 0.0, "loss/logits": 0.16770193502306938, "step": 27220 }, { "epoch": 0.9076666666666666, "grad_norm": 21.75, "grad_norm_var": 0.8363932291666667, "learning_rate": 2.5990413011758396e-05, "loss": 6.9028, "loss/crossentropy": 2.190079639852047, "loss/hidden": 3.20546875, "loss/jsd": 0.0, "loss/logits": 0.17454652497544884, "step": 27230 }, { "epoch": 0.908, "grad_norm": 21.625, "grad_norm_var": 0.7738932291666667, "learning_rate": 2.588248172937502e-05, "loss": 6.7325, "loss/crossentropy": 1.9603225111961364, "loss/hidden": 3.145703125, "loss/jsd": 0.0, "loss/logits": 0.14061546474695205, "step": 27240 }, { "epoch": 0.9083333333333333, "grad_norm": 21.0, "grad_norm_var": 0.3322265625, "learning_rate": 2.577483782514174e-05, "loss": 6.8603, "loss/crossentropy": 2.165058287978172, "loss/hidden": 3.1921875, "loss/jsd": 0.0, "loss/logits": 0.16146605722606183, "step": 27250 }, { "epoch": 0.9086666666666666, "grad_norm": 21.5, "grad_norm_var": 0.8113932291666667, "learning_rate": 2.5667482361460467e-05, "loss": 6.8768, "loss/crossentropy": 2.0848546117544173, "loss/hidden": 3.243359375, "loss/jsd": 0.0, "loss/logits": 0.16068812049925327, "step": 27260 }, { "epoch": 0.909, "grad_norm": 21.125, "grad_norm_var": 0.6427083333333333, "learning_rate": 2.5560416397886257e-05, "loss": 6.9293, "loss/crossentropy": 1.8594784066081047, "loss/hidden": 3.20703125, "loss/jsd": 0.0, "loss/logits": 0.1557474084198475, "step": 27270 }, { "epoch": 0.9093333333333333, "grad_norm": 22.375, "grad_norm_var": 0.4705729166666667, "learning_rate": 2.5453640991116967e-05, "loss": 6.8813, "loss/crossentropy": 2.048447531461716, "loss/hidden": 3.159375, "loss/jsd": 0.0, "loss/logits": 0.1582455337047577, "step": 27280 }, { "epoch": 0.9096666666666666, "grad_norm": 22.375, "grad_norm_var": 6.491666666666666, "learning_rate": 2.5347157194982742e-05, "loss": 6.8006, "loss/crossentropy": 2.028676262497902, "loss/hidden": 3.18125, "loss/jsd": 0.0, "loss/logits": 0.15210597179830074, "step": 27290 }, { "epoch": 0.91, "grad_norm": 20.5, "grad_norm_var": 1.2197916666666666, "learning_rate": 2.5240966060435677e-05, "loss": 6.908, "loss/crossentropy": 2.1295453563332556, "loss/hidden": 3.267578125, "loss/jsd": 0.0, "loss/logits": 0.18300293069332838, "step": 27300 }, { "epoch": 0.9103333333333333, "grad_norm": 20.5, "grad_norm_var": 0.5468098958333333, "learning_rate": 2.5135068635539366e-05, "loss": 6.7928, "loss/crossentropy": 2.176609678566456, "loss/hidden": 3.1765625, "loss/jsd": 0.0, "loss/logits": 0.1436314729042351, "step": 27310 }, { "epoch": 0.9106666666666666, "grad_norm": 21.0, "grad_norm_var": 1.2052083333333334, "learning_rate": 2.5029465965458683e-05, "loss": 6.8852, "loss/crossentropy": 1.9842437624931335, "loss/hidden": 3.241015625, "loss/jsd": 0.0, "loss/logits": 0.15974466726183892, "step": 27320 }, { "epoch": 0.911, "grad_norm": 22.0, "grad_norm_var": 1.2155598958333333, "learning_rate": 2.4924159092449325e-05, "loss": 6.8875, "loss/crossentropy": 1.9043216429650784, "loss/hidden": 3.2078125, "loss/jsd": 0.0, "loss/logits": 0.17278967509046197, "step": 27330 }, { "epoch": 0.9113333333333333, "grad_norm": 21.75, "grad_norm_var": 5.60625, "learning_rate": 2.48191490558476e-05, "loss": 6.8304, "loss/crossentropy": 2.0150970712304117, "loss/hidden": 3.262109375, "loss/jsd": 0.0, "loss/logits": 0.16625587958842517, "step": 27340 }, { "epoch": 0.9116666666666666, "grad_norm": 21.5, "grad_norm_var": 8.317708333333334, "learning_rate": 2.4714436892060213e-05, "loss": 6.8042, "loss/crossentropy": 2.058341934531927, "loss/hidden": 3.1359375, "loss/jsd": 0.0, "loss/logits": 0.154982496984303, "step": 27350 }, { "epoch": 0.912, "grad_norm": 20.75, "grad_norm_var": 3.8749348958333334, "learning_rate": 2.46100236345539e-05, "loss": 6.7786, "loss/crossentropy": 1.8252541318535804, "loss/hidden": 3.261328125, "loss/jsd": 0.0, "loss/logits": 0.1562123046256602, "step": 27360 }, { "epoch": 0.9123333333333333, "grad_norm": 21.875, "grad_norm_var": 0.9858723958333333, "learning_rate": 2.4505910313845408e-05, "loss": 6.8645, "loss/crossentropy": 1.9364535629749298, "loss/hidden": 3.12578125, "loss/jsd": 0.0, "loss/logits": 0.1512385666370392, "step": 27370 }, { "epoch": 0.9126666666666666, "grad_norm": 21.75, "grad_norm_var": 1.0983723958333333, "learning_rate": 2.440209795749114e-05, "loss": 6.8863, "loss/crossentropy": 1.9228644296526909, "loss/hidden": 3.23984375, "loss/jsd": 0.0, "loss/logits": 0.16667801439762114, "step": 27380 }, { "epoch": 0.913, "grad_norm": 20.875, "grad_norm_var": 0.9931640625, "learning_rate": 2.4298587590077164e-05, "loss": 6.9802, "loss/crossentropy": 1.9965920761227607, "loss/hidden": 3.22109375, "loss/jsd": 0.0, "loss/logits": 0.16229025460779667, "step": 27390 }, { "epoch": 0.9133333333333333, "grad_norm": 20.25, "grad_norm_var": 0.5082682291666667, "learning_rate": 2.4195380233209008e-05, "loss": 6.6642, "loss/crossentropy": 1.916854026913643, "loss/hidden": 3.20078125, "loss/jsd": 0.0, "loss/logits": 0.15667275432497263, "step": 27400 }, { "epoch": 0.9136666666666666, "grad_norm": 25.25, "grad_norm_var": 2.068489583333333, "learning_rate": 2.4092476905501634e-05, "loss": 6.9134, "loss/crossentropy": 2.0995171763002873, "loss/hidden": 3.16640625, "loss/jsd": 0.0, "loss/logits": 0.15996734565123916, "step": 27410 }, { "epoch": 0.914, "grad_norm": 23.0, "grad_norm_var": 2.5400390625, "learning_rate": 2.398987862256933e-05, "loss": 6.979, "loss/crossentropy": 2.112964731827378, "loss/hidden": 3.340234375, "loss/jsd": 0.0, "loss/logits": 0.1689059093594551, "step": 27420 }, { "epoch": 0.9143333333333333, "grad_norm": 21.75, "grad_norm_var": 1.621875, "learning_rate": 2.3887586397015716e-05, "loss": 6.9236, "loss/crossentropy": 2.0072560638189314, "loss/hidden": 3.2296875, "loss/jsd": 0.0, "loss/logits": 0.1832346895709634, "step": 27430 }, { "epoch": 0.9146666666666666, "grad_norm": 26.375, "grad_norm_var": 2.6416015625, "learning_rate": 2.3785601238423787e-05, "loss": 6.8881, "loss/crossentropy": 2.0381537839770316, "loss/hidden": 3.209765625, "loss/jsd": 0.0, "loss/logits": 0.1642130235210061, "step": 27440 }, { "epoch": 0.915, "grad_norm": 21.875, "grad_norm_var": 2.71015625, "learning_rate": 2.3683924153345856e-05, "loss": 6.898, "loss/crossentropy": 2.050332149863243, "loss/hidden": 3.220703125, "loss/jsd": 0.0, "loss/logits": 0.15957360472530127, "step": 27450 }, { "epoch": 0.9153333333333333, "grad_norm": 20.75, "grad_norm_var": 1.1238932291666666, "learning_rate": 2.358255614529374e-05, "loss": 6.7788, "loss/crossentropy": 1.9567649722099305, "loss/hidden": 3.21640625, "loss/jsd": 0.0, "loss/logits": 0.1502897882834077, "step": 27460 }, { "epoch": 0.9156666666666666, "grad_norm": 20.875, "grad_norm_var": 0.890625, "learning_rate": 2.3481498214728717e-05, "loss": 6.7887, "loss/crossentropy": 1.9293017938733101, "loss/hidden": 3.262890625, "loss/jsd": 0.0, "loss/logits": 0.16474825162440537, "step": 27470 }, { "epoch": 0.916, "grad_norm": 20.25, "grad_norm_var": 0.6587890625, "learning_rate": 2.3380751359051795e-05, "loss": 6.8496, "loss/crossentropy": 2.057722179591656, "loss/hidden": 3.166015625, "loss/jsd": 0.0, "loss/logits": 0.1477236093953252, "step": 27480 }, { "epoch": 0.9163333333333333, "grad_norm": 21.0, "grad_norm_var": 0.6978515625, "learning_rate": 2.3280316572593735e-05, "loss": 6.851, "loss/crossentropy": 2.0436879307031632, "loss/hidden": 3.100390625, "loss/jsd": 0.0, "loss/logits": 0.15924023166298867, "step": 27490 }, { "epoch": 0.9166666666666666, "grad_norm": 21.5, "grad_norm_var": 0.82890625, "learning_rate": 2.3180194846605367e-05, "loss": 6.8103, "loss/crossentropy": 2.0646505132317543, "loss/hidden": 3.11796875, "loss/jsd": 0.0, "loss/logits": 0.14654937675222754, "step": 27500 }, { "epoch": 0.917, "grad_norm": 21.0, "grad_norm_var": 0.8551432291666666, "learning_rate": 2.3080387169247687e-05, "loss": 6.8039, "loss/crossentropy": 2.101382979750633, "loss/hidden": 3.2671875, "loss/jsd": 0.0, "loss/logits": 0.16098164729773998, "step": 27510 }, { "epoch": 0.9173333333333333, "grad_norm": 21.375, "grad_norm_var": 1.1291015625, "learning_rate": 2.298089452558216e-05, "loss": 6.7319, "loss/crossentropy": 1.9046258434653283, "loss/hidden": 3.130078125, "loss/jsd": 0.0, "loss/logits": 0.15497801061719657, "step": 27520 }, { "epoch": 0.9176666666666666, "grad_norm": 22.0, "grad_norm_var": 0.8749348958333333, "learning_rate": 2.288171789756105e-05, "loss": 6.8369, "loss/crossentropy": 2.108339750766754, "loss/hidden": 3.21171875, "loss/jsd": 0.0, "loss/logits": 0.17621326725929976, "step": 27530 }, { "epoch": 0.918, "grad_norm": 20.875, "grad_norm_var": 1.2478515625, "learning_rate": 2.2782858264017598e-05, "loss": 6.8024, "loss/crossentropy": 2.042202705144882, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.15201376751065254, "step": 27540 }, { "epoch": 0.9183333333333333, "grad_norm": 21.125, "grad_norm_var": 1.3363932291666667, "learning_rate": 2.268431660065651e-05, "loss": 6.7997, "loss/crossentropy": 1.8682068414986133, "loss/hidden": 3.153125, "loss/jsd": 0.0, "loss/logits": 0.15471092467196285, "step": 27550 }, { "epoch": 0.9186666666666666, "grad_norm": 21.5, "grad_norm_var": 0.43723958333333335, "learning_rate": 2.258609388004419e-05, "loss": 6.7734, "loss/crossentropy": 1.9644837513566018, "loss/hidden": 3.20390625, "loss/jsd": 0.0, "loss/logits": 0.16138502229005097, "step": 27560 }, { "epoch": 0.919, "grad_norm": 21.75, "grad_norm_var": 0.6681640625, "learning_rate": 2.2488191071599263e-05, "loss": 6.799, "loss/crossentropy": 2.0926445186138154, "loss/hidden": 3.29765625, "loss/jsd": 0.0, "loss/logits": 0.19344071615487338, "step": 27570 }, { "epoch": 0.9193333333333333, "grad_norm": 21.0, "grad_norm_var": 0.7291015625, "learning_rate": 2.2390609141582902e-05, "loss": 6.7563, "loss/crossentropy": 2.0497403740882874, "loss/hidden": 3.1421875, "loss/jsd": 0.0, "loss/logits": 0.15796293318271637, "step": 27580 }, { "epoch": 0.9196666666666666, "grad_norm": 20.5, "grad_norm_var": 8.709375, "learning_rate": 2.229334905308938e-05, "loss": 6.679, "loss/crossentropy": 1.8748921178281308, "loss/hidden": 3.208984375, "loss/jsd": 0.0, "loss/logits": 0.14360655695199967, "step": 27590 }, { "epoch": 0.92, "grad_norm": 21.75, "grad_norm_var": 7.952018229166667, "learning_rate": 2.219641176603649e-05, "loss": 6.8976, "loss/crossentropy": 1.9149666860699655, "loss/hidden": 3.216796875, "loss/jsd": 0.0, "loss/logits": 0.1568117355927825, "step": 27600 }, { "epoch": 0.9203333333333333, "grad_norm": 24.0, "grad_norm_var": 3.1035807291666666, "learning_rate": 2.2099798237156116e-05, "loss": 6.8551, "loss/crossentropy": 2.1548288121819494, "loss/hidden": 3.271875, "loss/jsd": 0.0, "loss/logits": 0.1809004159644246, "step": 27610 }, { "epoch": 0.9206666666666666, "grad_norm": 21.5, "grad_norm_var": 3.595833333333333, "learning_rate": 2.200350941998481e-05, "loss": 6.8465, "loss/crossentropy": 2.0366897195577622, "loss/hidden": 3.103515625, "loss/jsd": 0.0, "loss/logits": 0.15638676267117263, "step": 27620 }, { "epoch": 0.921, "grad_norm": 20.125, "grad_norm_var": 0.8080729166666667, "learning_rate": 2.1907546264854283e-05, "loss": 6.9391, "loss/crossentropy": 1.9020028218626976, "loss/hidden": 3.107421875, "loss/jsd": 0.0, "loss/logits": 0.14446177333593369, "step": 27630 }, { "epoch": 0.9213333333333333, "grad_norm": 21.0, "grad_norm_var": 0.878125, "learning_rate": 2.181190971888218e-05, "loss": 6.8741, "loss/crossentropy": 2.1242057621479034, "loss/hidden": 3.25078125, "loss/jsd": 0.0, "loss/logits": 0.16890477053821087, "step": 27640 }, { "epoch": 0.9216666666666666, "grad_norm": 21.5, "grad_norm_var": 0.5712890625, "learning_rate": 2.1716600725962562e-05, "loss": 6.854, "loss/crossentropy": 2.003288094699383, "loss/hidden": 3.13203125, "loss/jsd": 0.0, "loss/logits": 0.15149989314377307, "step": 27650 }, { "epoch": 0.922, "grad_norm": 21.75, "grad_norm_var": 1.2197265625, "learning_rate": 2.1621620226756745e-05, "loss": 6.8234, "loss/crossentropy": 1.8800167009234428, "loss/hidden": 3.2140625, "loss/jsd": 0.0, "loss/logits": 0.154136617295444, "step": 27660 }, { "epoch": 0.9223333333333333, "grad_norm": 20.75, "grad_norm_var": 0.8014973958333333, "learning_rate": 2.1526969158683875e-05, "loss": 6.7598, "loss/crossentropy": 2.1428465634584426, "loss/hidden": 3.107421875, "loss/jsd": 0.0, "loss/logits": 0.1558793431147933, "step": 27670 }, { "epoch": 0.9226666666666666, "grad_norm": 20.75, "grad_norm_var": 1.2302083333333333, "learning_rate": 2.1432648455911808e-05, "loss": 6.8209, "loss/crossentropy": 1.9693931117653847, "loss/hidden": 3.183984375, "loss/jsd": 0.0, "loss/logits": 0.16253619380295276, "step": 27680 }, { "epoch": 0.923, "grad_norm": 23.0, "grad_norm_var": 1.6561848958333334, "learning_rate": 2.1338659049347798e-05, "loss": 6.871, "loss/crossentropy": 2.2309056654572488, "loss/hidden": 3.18046875, "loss/jsd": 0.0, "loss/logits": 0.16542233377695084, "step": 27690 }, { "epoch": 0.9233333333333333, "grad_norm": 22.375, "grad_norm_var": 0.6228515625, "learning_rate": 2.1245001866629322e-05, "loss": 6.8937, "loss/crossentropy": 2.0580638118088244, "loss/hidden": 3.295703125, "loss/jsd": 0.0, "loss/logits": 0.1742462942842394, "step": 27700 }, { "epoch": 0.9236666666666666, "grad_norm": 23.375, "grad_norm_var": 2.3160807291666665, "learning_rate": 2.1151677832114996e-05, "loss": 6.9408, "loss/crossentropy": 1.997247189283371, "loss/hidden": 3.149609375, "loss/jsd": 0.0, "loss/logits": 0.14960271613672377, "step": 27710 }, { "epoch": 0.924, "grad_norm": 23.625, "grad_norm_var": 2.661393229166667, "learning_rate": 2.1058687866875328e-05, "loss": 6.8154, "loss/crossentropy": 1.9769588127732276, "loss/hidden": 3.16875, "loss/jsd": 0.0, "loss/logits": 0.15400861240923405, "step": 27720 }, { "epoch": 0.9243333333333333, "grad_norm": 21.875, "grad_norm_var": 10.42265625, "learning_rate": 2.0966032888683773e-05, "loss": 6.8957, "loss/crossentropy": 1.9895775854587554, "loss/hidden": 3.198046875, "loss/jsd": 0.0, "loss/logits": 0.15623829020187258, "step": 27730 }, { "epoch": 0.9246666666666666, "grad_norm": 22.25, "grad_norm_var": 1.3666666666666667, "learning_rate": 2.0873713812007517e-05, "loss": 6.9308, "loss/crossentropy": 2.0563116490840914, "loss/hidden": 3.275, "loss/jsd": 0.0, "loss/logits": 0.16867623366415502, "step": 27740 }, { "epoch": 0.925, "grad_norm": 20.875, "grad_norm_var": 0.6372395833333333, "learning_rate": 2.0781731547998614e-05, "loss": 6.8815, "loss/crossentropy": 1.8822642505168914, "loss/hidden": 3.246875, "loss/jsd": 0.0, "loss/logits": 0.16076278118416668, "step": 27750 }, { "epoch": 0.9253333333333333, "grad_norm": 19.875, "grad_norm_var": 0.9455729166666667, "learning_rate": 2.0690087004484844e-05, "loss": 6.801, "loss/crossentropy": 2.086822558939457, "loss/hidden": 3.212109375, "loss/jsd": 0.0, "loss/logits": 0.16068840138614177, "step": 27760 }, { "epoch": 0.9256666666666666, "grad_norm": 23.75, "grad_norm_var": 1.9806640625, "learning_rate": 2.0598781085960883e-05, "loss": 6.8743, "loss/crossentropy": 2.00646168962121, "loss/hidden": 3.18203125, "loss/jsd": 0.0, "loss/logits": 0.15596114667132496, "step": 27770 }, { "epoch": 0.926, "grad_norm": 21.5, "grad_norm_var": 1.0457682291666666, "learning_rate": 2.0507814693579263e-05, "loss": 6.8621, "loss/crossentropy": 1.9436368495225906, "loss/hidden": 3.1109375, "loss/jsd": 0.0, "loss/logits": 0.14514606250450016, "step": 27780 }, { "epoch": 0.9263333333333333, "grad_norm": 21.125, "grad_norm_var": 1.446875, "learning_rate": 2.0417188725141557e-05, "loss": 6.8546, "loss/crossentropy": 2.0124169424176217, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.16959600700065494, "step": 27790 }, { "epoch": 0.9266666666666666, "grad_norm": 21.125, "grad_norm_var": 1.4510416666666666, "learning_rate": 2.0326904075089492e-05, "loss": 6.8477, "loss/crossentropy": 2.062483602762222, "loss/hidden": 3.134375, "loss/jsd": 0.0, "loss/logits": 0.1529495507478714, "step": 27800 }, { "epoch": 0.927, "grad_norm": 20.25, "grad_norm_var": 1.040625, "learning_rate": 2.02369616344961e-05, "loss": 6.8063, "loss/crossentropy": 2.018744045495987, "loss/hidden": 3.1296875, "loss/jsd": 0.0, "loss/logits": 0.15262581091374158, "step": 27810 }, { "epoch": 0.9273333333333333, "grad_norm": 22.5, "grad_norm_var": 0.7434895833333334, "learning_rate": 2.0147362291056983e-05, "loss": 6.8214, "loss/crossentropy": 1.9554542362689973, "loss/hidden": 3.255078125, "loss/jsd": 0.0, "loss/logits": 0.15628779772669077, "step": 27820 }, { "epoch": 0.9276666666666666, "grad_norm": 20.375, "grad_norm_var": 0.6375, "learning_rate": 2.005810692908146e-05, "loss": 6.773, "loss/crossentropy": 1.8790936447679996, "loss/hidden": 3.12265625, "loss/jsd": 0.0, "loss/logits": 0.1430271876975894, "step": 27830 }, { "epoch": 0.928, "grad_norm": 22.75, "grad_norm_var": 0.7018229166666666, "learning_rate": 1.996919642948395e-05, "loss": 6.9103, "loss/crossentropy": 1.9114558339118957, "loss/hidden": 3.26953125, "loss/jsd": 0.0, "loss/logits": 0.16306452695280313, "step": 27840 }, { "epoch": 0.9283333333333333, "grad_norm": 22.875, "grad_norm_var": 2.474739583333333, "learning_rate": 1.9880631669775164e-05, "loss": 6.9135, "loss/crossentropy": 1.938335907459259, "loss/hidden": 3.210546875, "loss/jsd": 0.0, "loss/logits": 0.15777956116944553, "step": 27850 }, { "epoch": 0.9286666666666666, "grad_norm": 21.25, "grad_norm_var": 0.9306640625, "learning_rate": 1.9792413524053538e-05, "loss": 6.8582, "loss/crossentropy": 2.0509339734911918, "loss/hidden": 3.16640625, "loss/jsd": 0.0, "loss/logits": 0.17118664290755986, "step": 27860 }, { "epoch": 0.929, "grad_norm": 23.125, "grad_norm_var": 1.2134765625, "learning_rate": 1.970454286299654e-05, "loss": 6.8609, "loss/crossentropy": 2.035054676234722, "loss/hidden": 3.24453125, "loss/jsd": 0.0, "loss/logits": 0.17292858399450778, "step": 27870 }, { "epoch": 0.9293333333333333, "grad_norm": 21.75, "grad_norm_var": 0.65, "learning_rate": 1.961702055385215e-05, "loss": 6.9531, "loss/crossentropy": 2.035239374637604, "loss/hidden": 3.10625, "loss/jsd": 0.0, "loss/logits": 0.15701537095010282, "step": 27880 }, { "epoch": 0.9296666666666666, "grad_norm": 21.5, "grad_norm_var": 0.8754557291666667, "learning_rate": 1.9529847460430206e-05, "loss": 6.7717, "loss/crossentropy": 2.0914264246821404, "loss/hidden": 3.19921875, "loss/jsd": 0.0, "loss/logits": 0.1649886442348361, "step": 27890 }, { "epoch": 0.93, "grad_norm": 20.75, "grad_norm_var": 2.5403116477197844e+18, "learning_rate": 1.944302444309393e-05, "loss": 6.8185, "loss/crossentropy": 1.963211180269718, "loss/hidden": 3.149609375, "loss/jsd": 0.0, "loss/logits": 0.15954519156366587, "step": 27900 }, { "epoch": 0.9303333333333333, "grad_norm": 19.875, "grad_norm_var": 2.540311647507273e+18, "learning_rate": 1.9356552358751486e-05, "loss": 6.8701, "loss/crossentropy": 1.9134356677532196, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.16030828636139632, "step": 27910 }, { "epoch": 0.9306666666666666, "grad_norm": 20.75, "grad_norm_var": 1.9268229166666666, "learning_rate": 1.927043206084741e-05, "loss": 6.8608, "loss/crossentropy": 2.0048422425985337, "loss/hidden": 3.26015625, "loss/jsd": 0.0, "loss/logits": 0.16555739659816027, "step": 27920 }, { "epoch": 0.931, "grad_norm": 21.75, "grad_norm_var": 1.1997395833333333, "learning_rate": 1.918466439935429e-05, "loss": 6.8752, "loss/crossentropy": 2.1238563142716886, "loss/hidden": 3.2640625, "loss/jsd": 0.0, "loss/logits": 0.16962270541116595, "step": 27930 }, { "epoch": 0.9313333333333333, "grad_norm": 21.875, "grad_norm_var": 1.2989583333333334, "learning_rate": 1.9099250220764303e-05, "loss": 6.8435, "loss/crossentropy": 2.0242248825728892, "loss/hidden": 3.190234375, "loss/jsd": 0.0, "loss/logits": 0.17032607905566693, "step": 27940 }, { "epoch": 0.9316666666666666, "grad_norm": 22.0, "grad_norm_var": 0.5895182291666666, "learning_rate": 1.9014190368080926e-05, "loss": 7.0145, "loss/crossentropy": 2.1218235939741135, "loss/hidden": 3.275390625, "loss/jsd": 0.0, "loss/logits": 0.17224793788045645, "step": 27950 }, { "epoch": 0.932, "grad_norm": 22.375, "grad_norm_var": 0.7541666666666667, "learning_rate": 1.892948568081055e-05, "loss": 6.8804, "loss/crossentropy": 2.163966727256775, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.1737080292776227, "step": 27960 }, { "epoch": 0.9323333333333333, "grad_norm": 22.5, "grad_norm_var": 0.7546223958333333, "learning_rate": 1.884513699495426e-05, "loss": 6.8348, "loss/crossentropy": 2.0167593225836753, "loss/hidden": 3.251953125, "loss/jsd": 0.0, "loss/logits": 0.15947899948805572, "step": 27970 }, { "epoch": 0.9326666666666666, "grad_norm": 21.875, "grad_norm_var": 0.39837239583333334, "learning_rate": 1.8761145142999516e-05, "loss": 6.8929, "loss/crossentropy": 1.9491732098162173, "loss/hidden": 3.207421875, "loss/jsd": 0.0, "loss/logits": 0.17612145710736513, "step": 27980 }, { "epoch": 0.933, "grad_norm": 26.0, "grad_norm_var": 1.6577473958333333, "learning_rate": 1.8677510953911987e-05, "loss": 6.9866, "loss/crossentropy": 2.0607218489050867, "loss/hidden": 3.302734375, "loss/jsd": 0.0, "loss/logits": 0.17759426180273294, "step": 27990 }, { "epoch": 0.9333333333333333, "grad_norm": 21.875, "grad_norm_var": 1.38515625, "learning_rate": 1.8594235253127375e-05, "loss": 7.0145, "loss/crossentropy": 2.211387987434864, "loss/hidden": 3.19296875, "loss/jsd": 0.0, "loss/logits": 0.1805992743000388, "step": 28000 }, { "epoch": 0.9336666666666666, "grad_norm": 22.875, "grad_norm_var": 0.5228515625, "learning_rate": 1.851131886254319e-05, "loss": 6.8231, "loss/crossentropy": 2.094671034812927, "loss/hidden": 3.12109375, "loss/jsd": 0.0, "loss/logits": 0.15865894313901663, "step": 28010 }, { "epoch": 0.934, "grad_norm": 20.25, "grad_norm_var": 0.6889973958333333, "learning_rate": 1.8428762600510772e-05, "loss": 6.8635, "loss/crossentropy": 2.0300868436694146, "loss/hidden": 3.245703125, "loss/jsd": 0.0, "loss/logits": 0.1639298925176263, "step": 28020 }, { "epoch": 0.9343333333333333, "grad_norm": 25.125, "grad_norm_var": 2.202018229166667, "learning_rate": 1.8346567281827077e-05, "loss": 6.7595, "loss/crossentropy": 1.9703581586480141, "loss/hidden": 3.2296875, "loss/jsd": 0.0, "loss/logits": 0.17280979938805102, "step": 28030 }, { "epoch": 0.9346666666666666, "grad_norm": 21.625, "grad_norm_var": 2.2697265625, "learning_rate": 1.8264733717726722e-05, "loss": 6.864, "loss/crossentropy": 1.8735784053802491, "loss/hidden": 3.290625, "loss/jsd": 0.0, "loss/logits": 0.17170735779218377, "step": 28040 }, { "epoch": 0.935, "grad_norm": 21.0, "grad_norm_var": 0.7077473958333333, "learning_rate": 1.818326271587394e-05, "loss": 6.871, "loss/crossentropy": 1.9842637002468109, "loss/hidden": 3.215234375, "loss/jsd": 0.0, "loss/logits": 0.16598169598728418, "step": 28050 }, { "epoch": 0.9353333333333333, "grad_norm": 22.125, "grad_norm_var": 1.0958333333333334, "learning_rate": 1.8102155080354642e-05, "loss": 6.8726, "loss/crossentropy": 2.063341203331947, "loss/hidden": 3.079296875, "loss/jsd": 0.0, "loss/logits": 0.15235913041979074, "step": 28060 }, { "epoch": 0.9356666666666666, "grad_norm": 23.5, "grad_norm_var": 1.3186848958333333, "learning_rate": 1.8021411611668444e-05, "loss": 6.8173, "loss/crossentropy": 2.0359160229563713, "loss/hidden": 3.2375, "loss/jsd": 0.0, "loss/logits": 0.16284253299236298, "step": 28070 }, { "epoch": 0.936, "grad_norm": 21.75, "grad_norm_var": 1.9625138911536218e+18, "learning_rate": 1.7941033106720768e-05, "loss": 6.8429, "loss/crossentropy": 1.9865235716104508, "loss/hidden": 3.175, "loss/jsd": 0.0, "loss/logits": 0.15688623264431953, "step": 28080 }, { "epoch": 0.9363333333333334, "grad_norm": 21.625, "grad_norm_var": 1.962513891247015e+18, "learning_rate": 1.7861020358815024e-05, "loss": 6.9292, "loss/crossentropy": 2.052123652398586, "loss/hidden": 3.27734375, "loss/jsd": 0.0, "loss/logits": 0.1626156263053417, "step": 28090 }, { "epoch": 0.9366666666666666, "grad_norm": 21.625, "grad_norm_var": 0.5916015625, "learning_rate": 1.7781374157644715e-05, "loss": 6.8609, "loss/crossentropy": 2.0096867479383946, "loss/hidden": 3.196484375, "loss/jsd": 0.0, "loss/logits": 0.15360062830150129, "step": 28100 }, { "epoch": 0.937, "grad_norm": 21.875, "grad_norm_var": 0.3650390625, "learning_rate": 1.7702095289285717e-05, "loss": 6.8755, "loss/crossentropy": 2.258693332970142, "loss/hidden": 3.0671875, "loss/jsd": 0.0, "loss/logits": 0.15729560470208526, "step": 28110 }, { "epoch": 0.9373333333333334, "grad_norm": 21.25, "grad_norm_var": 0.6020182291666667, "learning_rate": 1.7623184536188424e-05, "loss": 6.926, "loss/crossentropy": 1.9955579489469528, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.16989528406411408, "step": 28120 }, { "epoch": 0.9376666666666666, "grad_norm": 21.375, "grad_norm_var": 0.6176432291666667, "learning_rate": 1.7544642677170152e-05, "loss": 6.9319, "loss/crossentropy": 2.0671664133667944, "loss/hidden": 3.248046875, "loss/jsd": 0.0, "loss/logits": 0.1739020137116313, "step": 28130 }, { "epoch": 0.938, "grad_norm": 20.5, "grad_norm_var": 0.6077473958333334, "learning_rate": 1.74664704874073e-05, "loss": 6.8389, "loss/crossentropy": 1.8632956266403198, "loss/hidden": 3.19453125, "loss/jsd": 0.0, "loss/logits": 0.1709655337035656, "step": 28140 }, { "epoch": 0.9383333333333334, "grad_norm": 22.625, "grad_norm_var": 0.9457682291666667, "learning_rate": 1.738866873842785e-05, "loss": 6.8875, "loss/crossentropy": 2.1864455230534077, "loss/hidden": 3.13828125, "loss/jsd": 0.0, "loss/logits": 0.1634229407645762, "step": 28150 }, { "epoch": 0.9386666666666666, "grad_norm": 21.5, "grad_norm_var": 0.7875, "learning_rate": 1.7311238198103627e-05, "loss": 6.8575, "loss/crossentropy": 1.8006783843040466, "loss/hidden": 3.20859375, "loss/jsd": 0.0, "loss/logits": 0.14259467422962188, "step": 28160 }, { "epoch": 0.939, "grad_norm": 21.5, "grad_norm_var": 0.4369140625, "learning_rate": 1.7234179630642834e-05, "loss": 6.7653, "loss/crossentropy": 2.0104843035340307, "loss/hidden": 3.162109375, "loss/jsd": 0.0, "loss/logits": 0.15113328117877245, "step": 28170 }, { "epoch": 0.9393333333333334, "grad_norm": 20.25, "grad_norm_var": 1709.6525390625, "learning_rate": 1.7157493796582398e-05, "loss": 6.8128, "loss/crossentropy": 1.9046835117042065, "loss/hidden": 3.2140625, "loss/jsd": 0.0, "loss/logits": 0.15331623200327157, "step": 28180 }, { "epoch": 0.9396666666666667, "grad_norm": 21.75, "grad_norm_var": 1700.040625, "learning_rate": 1.708118145278056e-05, "loss": 6.8447, "loss/crossentropy": 1.9132866755127906, "loss/hidden": 3.25859375, "loss/jsd": 0.0, "loss/logits": 0.16736448789015412, "step": 28190 }, { "epoch": 0.94, "grad_norm": 21.5, "grad_norm_var": 2.4009765625, "learning_rate": 1.7005243352409334e-05, "loss": 6.8374, "loss/crossentropy": 2.1414462864398955, "loss/hidden": 3.1203125, "loss/jsd": 0.0, "loss/logits": 0.157612294703722, "step": 28200 }, { "epoch": 0.9403333333333334, "grad_norm": 21.125, "grad_norm_var": 2.1567057291666667, "learning_rate": 1.692968024494711e-05, "loss": 6.7885, "loss/crossentropy": 1.9627116709947585, "loss/hidden": 3.141015625, "loss/jsd": 0.0, "loss/logits": 0.14800271224230527, "step": 28210 }, { "epoch": 0.9406666666666667, "grad_norm": 22.25, "grad_norm_var": 0.9718098958333333, "learning_rate": 1.6854492876171264e-05, "loss": 6.8826, "loss/crossentropy": 1.9164805084466934, "loss/hidden": 3.2640625, "loss/jsd": 0.0, "loss/logits": 0.1629214364103973, "step": 28220 }, { "epoch": 0.941, "grad_norm": 21.375, "grad_norm_var": 1.3614583333333334, "learning_rate": 1.677968198815076e-05, "loss": 6.731, "loss/crossentropy": 1.940926407277584, "loss/hidden": 3.167578125, "loss/jsd": 0.0, "loss/logits": 0.1472564697265625, "step": 28230 }, { "epoch": 0.9413333333333334, "grad_norm": 21.25, "grad_norm_var": 1.4962890625, "learning_rate": 1.6705248319238876e-05, "loss": 6.9204, "loss/crossentropy": 2.1460791036486624, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.15511545334011317, "step": 28240 }, { "epoch": 0.9416666666666667, "grad_norm": 20.625, "grad_norm_var": 1.2593098958333333, "learning_rate": 1.6631192604065855e-05, "loss": 6.7155, "loss/crossentropy": 2.0769747786223887, "loss/hidden": 3.1859375, "loss/jsd": 0.0, "loss/logits": 0.1538231515791267, "step": 28250 }, { "epoch": 0.942, "grad_norm": 22.75, "grad_norm_var": 1.3264973958333333, "learning_rate": 1.6557515573531724e-05, "loss": 6.913, "loss/crossentropy": 2.01955421641469, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.1581470502540469, "step": 28260 }, { "epoch": 0.9423333333333334, "grad_norm": 21.0, "grad_norm_var": 1.2729166666666667, "learning_rate": 1.6484217954799018e-05, "loss": 6.7413, "loss/crossentropy": 1.9801385045051574, "loss/hidden": 3.24609375, "loss/jsd": 0.0, "loss/logits": 0.16784953828901053, "step": 28270 }, { "epoch": 0.9426666666666667, "grad_norm": 20.5, "grad_norm_var": 0.51015625, "learning_rate": 1.6411300471285656e-05, "loss": 6.8198, "loss/crossentropy": 2.2034427911043166, "loss/hidden": 3.079296875, "loss/jsd": 0.0, "loss/logits": 0.15067932959645985, "step": 28280 }, { "epoch": 0.943, "grad_norm": 23.875, "grad_norm_var": 1.01015625, "learning_rate": 1.6338763842657757e-05, "loss": 6.9264, "loss/crossentropy": 1.979648907482624, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.1757309900596738, "step": 28290 }, { "epoch": 0.9433333333333334, "grad_norm": 22.375, "grad_norm_var": 2.1809895833333335, "learning_rate": 1.6266608784822544e-05, "loss": 6.7541, "loss/crossentropy": 2.0895672395825384, "loss/hidden": 3.17265625, "loss/jsd": 0.0, "loss/logits": 0.16868185754865408, "step": 28300 }, { "epoch": 0.9436666666666667, "grad_norm": 20.625, "grad_norm_var": 0.7218098958333333, "learning_rate": 1.6194836009921332e-05, "loss": 6.7872, "loss/crossentropy": 1.9959282279014587, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.1535317923873663, "step": 28310 }, { "epoch": 0.944, "grad_norm": 21.5, "grad_norm_var": 0.9622395833333334, "learning_rate": 1.6123446226322414e-05, "loss": 6.7534, "loss/crossentropy": 2.1295602142810823, "loss/hidden": 3.19921875, "loss/jsd": 0.0, "loss/logits": 0.16768959537148476, "step": 28320 }, { "epoch": 0.9443333333333334, "grad_norm": 21.125, "grad_norm_var": 1.1333333333333333, "learning_rate": 1.6052440138614155e-05, "loss": 6.9399, "loss/crossentropy": 2.0156208984553814, "loss/hidden": 3.18828125, "loss/jsd": 0.0, "loss/logits": 0.15371856791898608, "step": 28330 }, { "epoch": 0.9446666666666667, "grad_norm": 21.0, "grad_norm_var": 1.05, "learning_rate": 1.598181844759795e-05, "loss": 6.8335, "loss/crossentropy": 2.0985936269164087, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.1626562364399433, "step": 28340 }, { "epoch": 0.945, "grad_norm": 20.75, "grad_norm_var": 1.15, "learning_rate": 1.5911581850281403e-05, "loss": 6.7963, "loss/crossentropy": 1.8603245675563813, "loss/hidden": 3.183203125, "loss/jsd": 0.0, "loss/logits": 0.14134703744202853, "step": 28350 }, { "epoch": 0.9453333333333334, "grad_norm": 21.5, "grad_norm_var": 0.72890625, "learning_rate": 1.5841731039871348e-05, "loss": 6.7295, "loss/crossentropy": 2.0212074637413027, "loss/hidden": 3.219921875, "loss/jsd": 0.0, "loss/logits": 0.15525523126125335, "step": 28360 }, { "epoch": 0.9456666666666667, "grad_norm": 23.625, "grad_norm_var": 1.4372395833333333, "learning_rate": 1.5772266705767108e-05, "loss": 6.8022, "loss/crossentropy": 2.0861593782901764, "loss/hidden": 3.192578125, "loss/jsd": 0.0, "loss/logits": 0.17841291818767785, "step": 28370 }, { "epoch": 0.946, "grad_norm": 22.375, "grad_norm_var": 2.3707682291666665, "learning_rate": 1.5703189533553605e-05, "loss": 6.9794, "loss/crossentropy": 2.1092930763959883, "loss/hidden": 3.28515625, "loss/jsd": 0.0, "loss/logits": 0.18949546683579682, "step": 28380 }, { "epoch": 0.9463333333333334, "grad_norm": 21.375, "grad_norm_var": 0.9843098958333333, "learning_rate": 1.563450020499463e-05, "loss": 6.8461, "loss/crossentropy": 2.025676953792572, "loss/hidden": 3.21015625, "loss/jsd": 0.0, "loss/logits": 0.17395553570240735, "step": 28390 }, { "epoch": 0.9466666666666667, "grad_norm": 21.25, "grad_norm_var": 3.162652818595678e+18, "learning_rate": 1.556619939802615e-05, "loss": 6.8821, "loss/crossentropy": 2.098123352229595, "loss/hidden": 3.2015625, "loss/jsd": 0.0, "loss/logits": 0.1534987824037671, "step": 28400 }, { "epoch": 0.947, "grad_norm": 22.0, "grad_norm_var": 0.74140625, "learning_rate": 1.549828778674953e-05, "loss": 6.9398, "loss/crossentropy": 2.09067225754261, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.17208856642246245, "step": 28410 }, { "epoch": 0.9473333333333334, "grad_norm": 22.75, "grad_norm_var": 0.55390625, "learning_rate": 1.5430766041424978e-05, "loss": 6.8388, "loss/crossentropy": 1.8563894510269165, "loss/hidden": 3.2703125, "loss/jsd": 0.0, "loss/logits": 0.15164101766422391, "step": 28420 }, { "epoch": 0.9476666666666667, "grad_norm": 23.125, "grad_norm_var": 0.8442057291666667, "learning_rate": 1.536363482846484e-05, "loss": 6.8602, "loss/crossentropy": 1.9244048327207566, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.15772407911717892, "step": 28430 }, { "epoch": 0.948, "grad_norm": 20.25, "grad_norm_var": 94.33743489583334, "learning_rate": 1.529689481042711e-05, "loss": 6.8611, "loss/crossentropy": 1.9303564444184302, "loss/hidden": 3.19921875, "loss/jsd": 0.0, "loss/logits": 0.1556209173053503, "step": 28440 }, { "epoch": 0.9483333333333334, "grad_norm": 22.125, "grad_norm_var": 94.33483072916667, "learning_rate": 1.5230546646008795e-05, "loss": 6.8918, "loss/crossentropy": 2.0158408626914026, "loss/hidden": 3.240234375, "loss/jsd": 0.0, "loss/logits": 0.15476850140839815, "step": 28450 }, { "epoch": 0.9486666666666667, "grad_norm": 21.875, "grad_norm_var": 1.2551432291666667, "learning_rate": 1.516459099003952e-05, "loss": 6.802, "loss/crossentropy": 2.104232335090637, "loss/hidden": 3.19375, "loss/jsd": 0.0, "loss/logits": 0.16216706801205874, "step": 28460 }, { "epoch": 0.949, "grad_norm": 23.375, "grad_norm_var": 2.8147497485817175e+18, "learning_rate": 1.5099028493474956e-05, "loss": 6.8888, "loss/crossentropy": 2.0458613131195307, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.17042651497758926, "step": 28470 }, { "epoch": 0.9493333333333334, "grad_norm": 21.5, "grad_norm_var": 2.814749748588708e+18, "learning_rate": 1.50338598033905e-05, "loss": 6.8804, "loss/crossentropy": 2.069540320336819, "loss/hidden": 3.151953125, "loss/jsd": 0.0, "loss/logits": 0.15929017215967178, "step": 28480 }, { "epoch": 0.9496666666666667, "grad_norm": 21.75, "grad_norm_var": 2.448958333333333, "learning_rate": 1.49690855629748e-05, "loss": 6.8573, "loss/crossentropy": 2.0447287276387214, "loss/hidden": 3.244921875, "loss/jsd": 0.0, "loss/logits": 0.17378965076059102, "step": 28490 }, { "epoch": 0.95, "grad_norm": 23.125, "grad_norm_var": 4.262239583333334, "learning_rate": 1.490470641152345e-05, "loss": 6.8061, "loss/crossentropy": 2.057085025310516, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.15816838014870882, "step": 28500 }, { "epoch": 0.9503333333333334, "grad_norm": 22.0, "grad_norm_var": 5.236458333333333, "learning_rate": 1.4840722984432701e-05, "loss": 6.8545, "loss/crossentropy": 1.9297899812459947, "loss/hidden": 3.16015625, "loss/jsd": 0.0, "loss/logits": 0.15209535052999854, "step": 28510 }, { "epoch": 0.9506666666666667, "grad_norm": 21.75, "grad_norm_var": 2.0322916666666666, "learning_rate": 1.4777135913193132e-05, "loss": 6.8245, "loss/crossentropy": 2.107157987356186, "loss/hidden": 3.212890625, "loss/jsd": 0.0, "loss/logits": 0.16366199534386397, "step": 28520 }, { "epoch": 0.951, "grad_norm": 22.0, "grad_norm_var": 0.25833333333333336, "learning_rate": 1.471394582538348e-05, "loss": 6.8325, "loss/crossentropy": 2.0403877660632133, "loss/hidden": 3.17265625, "loss/jsd": 0.0, "loss/logits": 0.16406202521175145, "step": 28530 }, { "epoch": 0.9513333333333334, "grad_norm": 24.125, "grad_norm_var": 3.139518229166667, "learning_rate": 1.4651153344664387e-05, "loss": 6.9738, "loss/crossentropy": 2.258484014868736, "loss/hidden": 3.1625, "loss/jsd": 0.0, "loss/logits": 0.15992612596601247, "step": 28540 }, { "epoch": 0.9516666666666667, "grad_norm": 20.375, "grad_norm_var": 1.6613932291666667, "learning_rate": 1.4588759090772302e-05, "loss": 6.8308, "loss/crossentropy": 2.0535311087965966, "loss/hidden": 3.146484375, "loss/jsd": 0.0, "loss/logits": 0.167077792994678, "step": 28550 }, { "epoch": 0.952, "grad_norm": 20.75, "grad_norm_var": 2.121809895833333, "learning_rate": 1.4526763679513303e-05, "loss": 6.9378, "loss/crossentropy": 2.1362095795571805, "loss/hidden": 3.16875, "loss/jsd": 0.0, "loss/logits": 0.15281093278899788, "step": 28560 }, { "epoch": 0.9523333333333334, "grad_norm": 21.875, "grad_norm_var": 1.9955729166666667, "learning_rate": 1.446516772275709e-05, "loss": 6.8711, "loss/crossentropy": 1.9070044673979283, "loss/hidden": 3.177734375, "loss/jsd": 0.0, "loss/logits": 0.16114689372479915, "step": 28570 }, { "epoch": 0.9526666666666667, "grad_norm": 22.25, "grad_norm_var": 0.5455729166666666, "learning_rate": 1.440397182843088e-05, "loss": 6.8352, "loss/crossentropy": 1.896916215121746, "loss/hidden": 3.27734375, "loss/jsd": 0.0, "loss/logits": 0.16120940092951058, "step": 28580 }, { "epoch": 0.953, "grad_norm": 22.75, "grad_norm_var": 0.5768229166666666, "learning_rate": 1.4343176600513433e-05, "loss": 6.9722, "loss/crossentropy": 2.0961402654647827, "loss/hidden": 3.2859375, "loss/jsd": 0.0, "loss/logits": 0.18284041043370963, "step": 28590 }, { "epoch": 0.9533333333333334, "grad_norm": 21.875, "grad_norm_var": 0.6259765625, "learning_rate": 1.428278263902913e-05, "loss": 6.8938, "loss/crossentropy": 1.9410855919122696, "loss/hidden": 3.192578125, "loss/jsd": 0.0, "loss/logits": 0.15987232998013495, "step": 28600 }, { "epoch": 0.9536666666666667, "grad_norm": 22.625, "grad_norm_var": 0.6947265625, "learning_rate": 1.422279054004196e-05, "loss": 6.7791, "loss/crossentropy": 2.0120147198438643, "loss/hidden": 3.147265625, "loss/jsd": 0.0, "loss/logits": 0.15940133705735207, "step": 28610 }, { "epoch": 0.954, "grad_norm": 24.875, "grad_norm_var": 1.1875, "learning_rate": 1.4163200895649742e-05, "loss": 6.9824, "loss/crossentropy": 1.899172729998827, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.14629402589052914, "step": 28620 }, { "epoch": 0.9543333333333334, "grad_norm": 22.625, "grad_norm_var": 2.0936848958333334, "learning_rate": 1.4104014293978196e-05, "loss": 6.8647, "loss/crossentropy": 2.0003262996673583, "loss/hidden": 3.188671875, "loss/jsd": 0.0, "loss/logits": 0.1569441094994545, "step": 28630 }, { "epoch": 0.9546666666666667, "grad_norm": 22.625, "grad_norm_var": 1.5327473958333333, "learning_rate": 1.4045231319175198e-05, "loss": 6.9435, "loss/crossentropy": 2.0988379955291747, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.17967260386794806, "step": 28640 }, { "epoch": 0.955, "grad_norm": 21.375, "grad_norm_var": 1.1369140625, "learning_rate": 1.3986852551404964e-05, "loss": 6.8599, "loss/crossentropy": 2.1141707748174667, "loss/hidden": 3.1359375, "loss/jsd": 0.0, "loss/logits": 0.15787138119339944, "step": 28650 }, { "epoch": 0.9553333333333334, "grad_norm": 23.75, "grad_norm_var": 0.8989583333333333, "learning_rate": 1.3928878566842376e-05, "loss": 6.8908, "loss/crossentropy": 2.0345154732465742, "loss/hidden": 3.225, "loss/jsd": 0.0, "loss/logits": 0.18289269097149372, "step": 28660 }, { "epoch": 0.9556666666666667, "grad_norm": 22.75, "grad_norm_var": 0.9889973958333333, "learning_rate": 1.3871309937667253e-05, "loss": 6.9933, "loss/crossentropy": 2.1303988128900526, "loss/hidden": 3.21171875, "loss/jsd": 0.0, "loss/logits": 0.17532578259706497, "step": 28670 }, { "epoch": 0.956, "grad_norm": 22.125, "grad_norm_var": 1.1150390625, "learning_rate": 1.3814147232058714e-05, "loss": 6.6818, "loss/crossentropy": 1.7408723145723344, "loss/hidden": 3.17265625, "loss/jsd": 0.0, "loss/logits": 0.15916957296431064, "step": 28680 }, { "epoch": 0.9563333333333334, "grad_norm": 22.5, "grad_norm_var": 1.0160807291666667, "learning_rate": 1.3757391014189596e-05, "loss": 6.9554, "loss/crossentropy": 1.9530368164181708, "loss/hidden": 3.181640625, "loss/jsd": 0.0, "loss/logits": 0.18459425549954175, "step": 28690 }, { "epoch": 0.9566666666666667, "grad_norm": 21.5, "grad_norm_var": 1.0666015625, "learning_rate": 1.3701041844220849e-05, "loss": 6.9349, "loss/crossentropy": 1.9662514954805375, "loss/hidden": 3.18671875, "loss/jsd": 0.0, "loss/logits": 0.17320307586342096, "step": 28700 }, { "epoch": 0.957, "grad_norm": 22.125, "grad_norm_var": 1.0697265625, "learning_rate": 1.3645100278296047e-05, "loss": 6.937, "loss/crossentropy": 2.046416383981705, "loss/hidden": 3.233984375, "loss/jsd": 0.0, "loss/logits": 0.19815693870186807, "step": 28710 }, { "epoch": 0.9573333333333334, "grad_norm": 21.5, "grad_norm_var": 1.4759765625, "learning_rate": 1.3589566868535836e-05, "loss": 6.8148, "loss/crossentropy": 2.0860094636678697, "loss/hidden": 3.12734375, "loss/jsd": 0.0, "loss/logits": 0.15798233803361655, "step": 28720 }, { "epoch": 0.9576666666666667, "grad_norm": 20.75, "grad_norm_var": 1.5302083333333334, "learning_rate": 1.3534442163032574e-05, "loss": 6.8177, "loss/crossentropy": 2.1211801931262015, "loss/hidden": 3.222265625, "loss/jsd": 0.0, "loss/logits": 0.1808505615219474, "step": 28730 }, { "epoch": 0.958, "grad_norm": 22.375, "grad_norm_var": 0.9270833333333334, "learning_rate": 1.347972670584483e-05, "loss": 6.8425, "loss/crossentropy": 1.9814658090472221, "loss/hidden": 3.116796875, "loss/jsd": 0.0, "loss/logits": 0.15671081114560365, "step": 28740 }, { "epoch": 0.9583333333333334, "grad_norm": 20.75, "grad_norm_var": 0.8885416666666667, "learning_rate": 1.3425421036992098e-05, "loss": 6.7837, "loss/crossentropy": 1.9576505310833454, "loss/hidden": 3.153515625, "loss/jsd": 0.0, "loss/logits": 0.1521891091018915, "step": 28750 }, { "epoch": 0.9586666666666667, "grad_norm": 20.625, "grad_norm_var": 0.6791666666666667, "learning_rate": 1.3371525692449394e-05, "loss": 6.9583, "loss/crossentropy": 2.1901199877262116, "loss/hidden": 3.2046875, "loss/jsd": 0.0, "loss/logits": 0.16546592973172664, "step": 28760 }, { "epoch": 0.959, "grad_norm": 22.25, "grad_norm_var": 2.3824041859454684e+18, "learning_rate": 1.3318041204142004e-05, "loss": 6.9222, "loss/crossentropy": 1.9837070412933826, "loss/hidden": 3.20234375, "loss/jsd": 0.0, "loss/logits": 0.16084651360288263, "step": 28770 }, { "epoch": 0.9593333333333334, "grad_norm": 20.375, "grad_norm_var": 1.9385416666666666, "learning_rate": 1.3264968099940245e-05, "loss": 6.9218, "loss/crossentropy": 2.1181742370128633, "loss/hidden": 3.244140625, "loss/jsd": 0.0, "loss/logits": 0.18821782916784285, "step": 28780 }, { "epoch": 0.9596666666666667, "grad_norm": 24.0, "grad_norm_var": 2.1869140625, "learning_rate": 1.321230690365422e-05, "loss": 6.8798, "loss/crossentropy": 2.0393978893756866, "loss/hidden": 3.191796875, "loss/jsd": 0.0, "loss/logits": 0.15889321286231278, "step": 28790 }, { "epoch": 0.96, "grad_norm": 22.25, "grad_norm_var": 1.2372395833333334, "learning_rate": 1.3160058135028691e-05, "loss": 6.8816, "loss/crossentropy": 1.9314091876149178, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.154315375816077, "step": 28800 }, { "epoch": 0.9603333333333334, "grad_norm": 21.125, "grad_norm_var": 1.59765625, "learning_rate": 1.3108222309737892e-05, "loss": 6.9788, "loss/crossentropy": 1.9973760724067688, "loss/hidden": 3.198828125, "loss/jsd": 0.0, "loss/logits": 0.1759620831348002, "step": 28810 }, { "epoch": 0.9606666666666667, "grad_norm": 22.375, "grad_norm_var": 1.0561848958333333, "learning_rate": 1.305679993938051e-05, "loss": 6.8864, "loss/crossentropy": 2.043069842457771, "loss/hidden": 3.166796875, "loss/jsd": 0.0, "loss/logits": 0.15617891773581505, "step": 28820 }, { "epoch": 0.961, "grad_norm": 22.125, "grad_norm_var": 12.6650390625, "learning_rate": 1.3005791531474562e-05, "loss": 6.8934, "loss/crossentropy": 2.0857333853840827, "loss/hidden": 3.164453125, "loss/jsd": 0.0, "loss/logits": 0.1577781980857253, "step": 28830 }, { "epoch": 0.9613333333333334, "grad_norm": 21.625, "grad_norm_var": 12.689322916666667, "learning_rate": 1.2955197589452462e-05, "loss": 6.8934, "loss/crossentropy": 1.9508272759616374, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.16099842144176363, "step": 28840 }, { "epoch": 0.9616666666666667, "grad_norm": 23.25, "grad_norm_var": 0.4393229166666667, "learning_rate": 1.2905018612655975e-05, "loss": 6.86, "loss/crossentropy": 2.013827832788229, "loss/hidden": 3.207421875, "loss/jsd": 0.0, "loss/logits": 0.16024797260761262, "step": 28850 }, { "epoch": 0.962, "grad_norm": 21.5, "grad_norm_var": 0.6760416666666667, "learning_rate": 1.2855255096331348e-05, "loss": 6.9056, "loss/crossentropy": 2.1667084366083147, "loss/hidden": 3.098046875, "loss/jsd": 0.0, "loss/logits": 0.15538214575499296, "step": 28860 }, { "epoch": 0.9623333333333334, "grad_norm": 22.75, "grad_norm_var": 0.5228515625, "learning_rate": 1.2805907531624403e-05, "loss": 6.7589, "loss/crossentropy": 1.8827613063156605, "loss/hidden": 3.068359375, "loss/jsd": 0.0, "loss/logits": 0.14002714012749493, "step": 28870 }, { "epoch": 0.9626666666666667, "grad_norm": 23.125, "grad_norm_var": 1.7014973958333333, "learning_rate": 1.2756976405575668e-05, "loss": 6.8948, "loss/crossentropy": 1.9622853726148606, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.1584384061396122, "step": 28880 }, { "epoch": 0.963, "grad_norm": 21.875, "grad_norm_var": 1.6854166666666666, "learning_rate": 1.2708462201115617e-05, "loss": 6.7792, "loss/crossentropy": 1.8909013763070106, "loss/hidden": 3.235546875, "loss/jsd": 0.0, "loss/logits": 0.16017456604167818, "step": 28890 }, { "epoch": 0.9633333333333334, "grad_norm": 22.375, "grad_norm_var": 0.5988932291666667, "learning_rate": 1.2660365397059856e-05, "loss": 6.7184, "loss/crossentropy": 1.9674251511693002, "loss/hidden": 3.17890625, "loss/jsd": 0.0, "loss/logits": 0.1554909586906433, "step": 28900 }, { "epoch": 0.9636666666666667, "grad_norm": 23.125, "grad_norm_var": 0.99140625, "learning_rate": 1.2612686468104426e-05, "loss": 6.8514, "loss/crossentropy": 2.0045790046453478, "loss/hidden": 3.18046875, "loss/jsd": 0.0, "loss/logits": 0.16668143030256033, "step": 28910 }, { "epoch": 0.964, "grad_norm": 22.125, "grad_norm_var": 1.246875, "learning_rate": 1.2565425884821096e-05, "loss": 6.9451, "loss/crossentropy": 2.1058658018708227, "loss/hidden": 3.23671875, "loss/jsd": 0.0, "loss/logits": 0.17369681475684046, "step": 28920 }, { "epoch": 0.9643333333333334, "grad_norm": 22.125, "grad_norm_var": 0.9249348958333333, "learning_rate": 1.2518584113652767e-05, "loss": 6.92, "loss/crossentropy": 2.0629913471639156, "loss/hidden": 3.19453125, "loss/jsd": 0.0, "loss/logits": 0.16003647521138192, "step": 28930 }, { "epoch": 0.9646666666666667, "grad_norm": 23.75, "grad_norm_var": 0.7143229166666667, "learning_rate": 1.247216161690879e-05, "loss": 6.9623, "loss/crossentropy": 2.127345842123032, "loss/hidden": 3.251171875, "loss/jsd": 0.0, "loss/logits": 0.17870207615196704, "step": 28940 }, { "epoch": 0.965, "grad_norm": 22.0, "grad_norm_var": 0.7660807291666667, "learning_rate": 1.2426158852760462e-05, "loss": 6.7875, "loss/crossentropy": 1.906770334392786, "loss/hidden": 3.098828125, "loss/jsd": 0.0, "loss/logits": 0.14323475370183586, "step": 28950 }, { "epoch": 0.9653333333333334, "grad_norm": 22.75, "grad_norm_var": 2.2660807291666667, "learning_rate": 1.2380576275236511e-05, "loss": 6.8731, "loss/crossentropy": 2.055904617905617, "loss/hidden": 3.18046875, "loss/jsd": 0.0, "loss/logits": 0.15869035460054876, "step": 28960 }, { "epoch": 0.9656666666666667, "grad_norm": 24.125, "grad_norm_var": 1.1556640625, "learning_rate": 1.2335414334218561e-05, "loss": 7.0247, "loss/crossentropy": 2.0497659265995027, "loss/hidden": 3.180859375, "loss/jsd": 0.0, "loss/logits": 0.1614781607873738, "step": 28970 }, { "epoch": 0.966, "grad_norm": 22.25, "grad_norm_var": 1.1497395833333333, "learning_rate": 1.229067347543675e-05, "loss": 6.8011, "loss/crossentropy": 2.0531945556402205, "loss/hidden": 3.221875, "loss/jsd": 0.0, "loss/logits": 0.16420614402741193, "step": 28980 }, { "epoch": 0.9663333333333334, "grad_norm": 21.625, "grad_norm_var": 0.8541666666666666, "learning_rate": 1.224635414046527e-05, "loss": 6.8427, "loss/crossentropy": 1.9432912215590477, "loss/hidden": 3.16328125, "loss/jsd": 0.0, "loss/logits": 0.16180085185915233, "step": 28990 }, { "epoch": 0.9666666666666667, "grad_norm": 20.875, "grad_norm_var": 0.6416666666666667, "learning_rate": 1.2202456766718093e-05, "loss": 6.7551, "loss/crossentropy": 2.1089743584394456, "loss/hidden": 3.159375, "loss/jsd": 0.0, "loss/logits": 0.1651729150209576, "step": 29000 }, { "epoch": 0.967, "grad_norm": 21.0, "grad_norm_var": 1.4041666666666666, "learning_rate": 1.2158981787444552e-05, "loss": 6.8471, "loss/crossentropy": 1.9657625079154968, "loss/hidden": 3.24453125, "loss/jsd": 0.0, "loss/logits": 0.16529466435313225, "step": 29010 }, { "epoch": 0.9673333333333334, "grad_norm": 22.25, "grad_norm_var": 0.9593098958333334, "learning_rate": 1.2115929631725158e-05, "loss": 6.8562, "loss/crossentropy": 1.982128444686532, "loss/hidden": 3.14296875, "loss/jsd": 0.0, "loss/logits": 0.15026735952123998, "step": 29020 }, { "epoch": 0.9676666666666667, "grad_norm": 24.0, "grad_norm_var": 1.4122395833333334, "learning_rate": 1.2073300724467295e-05, "loss": 6.7686, "loss/crossentropy": 2.0765829384326935, "loss/hidden": 3.253125, "loss/jsd": 0.0, "loss/logits": 0.16304893530905246, "step": 29030 }, { "epoch": 0.968, "grad_norm": 22.375, "grad_norm_var": 2.3053504105878477e+18, "learning_rate": 1.2031095486401069e-05, "loss": 6.9941, "loss/crossentropy": 2.093338930606842, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.16548265926539898, "step": 29040 }, { "epoch": 0.9683333333333334, "grad_norm": 26.5, "grad_norm_var": 2.305350410347444e+18, "learning_rate": 1.1989314334075145e-05, "loss": 6.9207, "loss/crossentropy": 2.0706306278705595, "loss/hidden": 3.175, "loss/jsd": 0.0, "loss/logits": 0.1563433837145567, "step": 29050 }, { "epoch": 0.9686666666666667, "grad_norm": 23.5, "grad_norm_var": 2.209830729166667, "learning_rate": 1.1947957679852627e-05, "loss": 6.9284, "loss/crossentropy": 1.939845222979784, "loss/hidden": 3.170703125, "loss/jsd": 0.0, "loss/logits": 0.15329579524695874, "step": 29060 }, { "epoch": 0.969, "grad_norm": 22.5, "grad_norm_var": 0.6143229166666667, "learning_rate": 1.1907025931907e-05, "loss": 6.8166, "loss/crossentropy": 2.0235880702733993, "loss/hidden": 3.131640625, "loss/jsd": 0.0, "loss/logits": 0.15207564570009707, "step": 29070 }, { "epoch": 0.9693333333333334, "grad_norm": 23.125, "grad_norm_var": 1.1895182291666666, "learning_rate": 1.1866519494218084e-05, "loss": 6.9347, "loss/crossentropy": 2.039486038684845, "loss/hidden": 3.239453125, "loss/jsd": 0.0, "loss/logits": 0.18337175534106792, "step": 29080 }, { "epoch": 0.9696666666666667, "grad_norm": 22.75, "grad_norm_var": 1.1238932291666666, "learning_rate": 1.1826438766568076e-05, "loss": 6.8713, "loss/crossentropy": 2.1116551235318184, "loss/hidden": 3.141015625, "loss/jsd": 0.0, "loss/logits": 0.16442451104521752, "step": 29090 }, { "epoch": 0.97, "grad_norm": 22.375, "grad_norm_var": 1.9671223958333333, "learning_rate": 1.1786784144537563e-05, "loss": 6.8258, "loss/crossentropy": 2.092792363464832, "loss/hidden": 3.242578125, "loss/jsd": 0.0, "loss/logits": 0.1660682398825884, "step": 29100 }, { "epoch": 0.9703333333333334, "grad_norm": 21.75, "grad_norm_var": 1.4775390625, "learning_rate": 1.1747556019501665e-05, "loss": 6.8014, "loss/crossentropy": 2.1302355214953423, "loss/hidden": 3.139453125, "loss/jsd": 0.0, "loss/logits": 0.1670895716175437, "step": 29110 }, { "epoch": 0.9706666666666667, "grad_norm": 21.5, "grad_norm_var": 1.8885416666666666, "learning_rate": 1.1708754778626134e-05, "loss": 6.9092, "loss/crossentropy": 2.0474965393543245, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.19082598555833102, "step": 29120 }, { "epoch": 0.971, "grad_norm": 24.375, "grad_norm_var": 2.9284656384458097e+18, "learning_rate": 1.1670380804863557e-05, "loss": 7.0317, "loss/crossentropy": 2.060644108057022, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.18089290745556355, "step": 29130 }, { "epoch": 0.9713333333333334, "grad_norm": 22.5, "grad_norm_var": 0.46868489583333334, "learning_rate": 1.1632434476949564e-05, "loss": 6.9555, "loss/crossentropy": 2.125787417590618, "loss/hidden": 3.22421875, "loss/jsd": 0.0, "loss/logits": 0.17629719469696284, "step": 29140 }, { "epoch": 0.9716666666666667, "grad_norm": 22.5, "grad_norm_var": 0.6518229166666667, "learning_rate": 1.1594916169399088e-05, "loss": 6.8684, "loss/crossentropy": 2.202189776301384, "loss/hidden": 3.119921875, "loss/jsd": 0.0, "loss/logits": 0.15515435487031937, "step": 29150 }, { "epoch": 0.972, "grad_norm": 22.25, "grad_norm_var": 0.8455729166666667, "learning_rate": 1.1557826252502677e-05, "loss": 6.8047, "loss/crossentropy": 2.0502734132111073, "loss/hidden": 3.1578125, "loss/jsd": 0.0, "loss/logits": 0.1607513885013759, "step": 29160 }, { "epoch": 0.9723333333333334, "grad_norm": 22.0, "grad_norm_var": 0.8205729166666667, "learning_rate": 1.1521165092322836e-05, "loss": 6.8834, "loss/crossentropy": 2.01381069123745, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.16493042316287757, "step": 29170 }, { "epoch": 0.9726666666666667, "grad_norm": 25.0, "grad_norm_var": 0.646875, "learning_rate": 1.1484933050690425e-05, "loss": 6.9093, "loss/crossentropy": 2.077186991274357, "loss/hidden": 3.162890625, "loss/jsd": 0.0, "loss/logits": 0.1712075762450695, "step": 29180 }, { "epoch": 0.973, "grad_norm": 22.375, "grad_norm_var": 0.759375, "learning_rate": 1.1449130485201056e-05, "loss": 6.8158, "loss/crossentropy": 1.9148377593606711, "loss/hidden": 3.103125, "loss/jsd": 0.0, "loss/logits": 0.14505248717032374, "step": 29190 }, { "epoch": 0.9733333333333334, "grad_norm": 23.25, "grad_norm_var": 0.3634765625, "learning_rate": 1.1413757749211602e-05, "loss": 6.8682, "loss/crossentropy": 2.138897517323494, "loss/hidden": 3.0640625, "loss/jsd": 0.0, "loss/logits": 0.1604563297703862, "step": 29200 }, { "epoch": 0.9736666666666667, "grad_norm": 23.375, "grad_norm_var": 0.4837890625, "learning_rate": 1.1378815191836679e-05, "loss": 6.8282, "loss/crossentropy": 2.014554353058338, "loss/hidden": 3.26796875, "loss/jsd": 0.0, "loss/logits": 0.16188111137598754, "step": 29210 }, { "epoch": 0.974, "grad_norm": 22.0, "grad_norm_var": 2894.9393229166667, "learning_rate": 1.1344303157945242e-05, "loss": 6.9728, "loss/crossentropy": 2.0439544051885603, "loss/hidden": 3.188671875, "loss/jsd": 0.0, "loss/logits": 0.1644747108221054, "step": 29220 }, { "epoch": 0.9743333333333334, "grad_norm": 21.625, "grad_norm_var": 2892.2497395833334, "learning_rate": 1.1310221988157106e-05, "loss": 6.8651, "loss/crossentropy": 2.073014111816883, "loss/hidden": 3.30546875, "loss/jsd": 0.0, "loss/logits": 0.1810118304565549, "step": 29230 }, { "epoch": 0.9746666666666667, "grad_norm": 23.25, "grad_norm_var": 0.915625, "learning_rate": 1.1276572018839673e-05, "loss": 6.9769, "loss/crossentropy": 2.0511143311858175, "loss/hidden": 3.195703125, "loss/jsd": 0.0, "loss/logits": 0.15696678645908832, "step": 29240 }, { "epoch": 0.975, "grad_norm": 22.75, "grad_norm_var": 0.38014322916666665, "learning_rate": 1.1243353582104556e-05, "loss": 7.0323, "loss/crossentropy": 2.084176428616047, "loss/hidden": 3.24765625, "loss/jsd": 0.0, "loss/logits": 0.17536051329225302, "step": 29250 }, { "epoch": 0.9753333333333334, "grad_norm": 25.5, "grad_norm_var": 1.859375, "learning_rate": 1.1210567005804302e-05, "loss": 6.8833, "loss/crossentropy": 2.0693555802106856, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.1687312951311469, "step": 29260 }, { "epoch": 0.9756666666666667, "grad_norm": 22.375, "grad_norm_var": 2.265625, "learning_rate": 1.1178212613529202e-05, "loss": 6.7959, "loss/crossentropy": 2.032899996638298, "loss/hidden": 3.148828125, "loss/jsd": 0.0, "loss/logits": 0.15500539531931282, "step": 29270 }, { "epoch": 0.976, "grad_norm": 22.5, "grad_norm_var": 3.958268229166667, "learning_rate": 1.1146290724604024e-05, "loss": 6.9032, "loss/crossentropy": 2.0004256799817086, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.16235917941667138, "step": 29280 }, { "epoch": 0.9763333333333334, "grad_norm": 21.875, "grad_norm_var": 3.7122395833333335, "learning_rate": 1.1114801654084949e-05, "loss": 6.8216, "loss/crossentropy": 1.98628860861063, "loss/hidden": 3.196875, "loss/jsd": 0.0, "loss/logits": 0.16457067504525186, "step": 29290 }, { "epoch": 0.9766666666666667, "grad_norm": 23.0, "grad_norm_var": 0.884375, "learning_rate": 1.1083745712756367e-05, "loss": 6.9611, "loss/crossentropy": 2.2105645328760146, "loss/hidden": 3.204296875, "loss/jsd": 0.0, "loss/logits": 0.17289726454764603, "step": 29300 }, { "epoch": 0.977, "grad_norm": 23.375, "grad_norm_var": 0.46041666666666664, "learning_rate": 1.1053123207127896e-05, "loss": 6.9226, "loss/crossentropy": 2.0838935345411302, "loss/hidden": 3.16015625, "loss/jsd": 0.0, "loss/logits": 0.15632101874798537, "step": 29310 }, { "epoch": 0.9773333333333334, "grad_norm": 24.25, "grad_norm_var": 0.7247395833333333, "learning_rate": 1.1022934439431295e-05, "loss": 6.8949, "loss/crossentropy": 1.9940695136785507, "loss/hidden": 3.185546875, "loss/jsd": 0.0, "loss/logits": 0.1685311601497233, "step": 29320 }, { "epoch": 0.9776666666666667, "grad_norm": 22.25, "grad_norm_var": 0.5684895833333333, "learning_rate": 1.0993179707617519e-05, "loss": 6.8931, "loss/crossentropy": 2.195150835812092, "loss/hidden": 3.161328125, "loss/jsd": 0.0, "loss/logits": 0.16006924994289876, "step": 29330 }, { "epoch": 0.978, "grad_norm": 22.375, "grad_norm_var": 0.4952473958333333, "learning_rate": 1.0963859305353758e-05, "loss": 6.954, "loss/crossentropy": 2.0411001086235045, "loss/hidden": 3.287890625, "loss/jsd": 0.0, "loss/logits": 0.17475655488669872, "step": 29340 }, { "epoch": 0.9783333333333334, "grad_norm": 23.375, "grad_norm_var": 6.3931640625, "learning_rate": 1.0934973522020538e-05, "loss": 6.9526, "loss/crossentropy": 2.057637444138527, "loss/hidden": 3.314453125, "loss/jsd": 0.0, "loss/logits": 0.1770678885281086, "step": 29350 }, { "epoch": 0.9786666666666667, "grad_norm": 21.125, "grad_norm_var": 2.4233723958333333, "learning_rate": 1.0906522642708893e-05, "loss": 6.897, "loss/crossentropy": 2.0793089002370833, "loss/hidden": 3.180859375, "loss/jsd": 0.0, "loss/logits": 0.16001901477575303, "step": 29360 }, { "epoch": 0.979, "grad_norm": 23.375, "grad_norm_var": 2.24765625, "learning_rate": 1.0878506948217503e-05, "loss": 6.9443, "loss/crossentropy": 2.04323640614748, "loss/hidden": 3.2484375, "loss/jsd": 0.0, "loss/logits": 0.16657722741365433, "step": 29370 }, { "epoch": 0.9793333333333333, "grad_norm": 23.0, "grad_norm_var": 0.4520182291666667, "learning_rate": 1.0850926715049972e-05, "loss": 6.8765, "loss/crossentropy": 1.9868069365620613, "loss/hidden": 3.137109375, "loss/jsd": 0.0, "loss/logits": 0.1584441527724266, "step": 29380 }, { "epoch": 0.9796666666666667, "grad_norm": 22.25, "grad_norm_var": 0.7582682291666667, "learning_rate": 1.0823782215412054e-05, "loss": 6.9489, "loss/crossentropy": 1.9985451444983482, "loss/hidden": 3.283984375, "loss/jsd": 0.0, "loss/logits": 0.15724884811788797, "step": 29390 }, { "epoch": 0.98, "grad_norm": 22.5, "grad_norm_var": 0.6583333333333333, "learning_rate": 1.0797073717209014e-05, "loss": 6.8264, "loss/crossentropy": 2.087764638662338, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.1608564306050539, "step": 29400 }, { "epoch": 0.9803333333333333, "grad_norm": 23.75, "grad_norm_var": 1.7789922062113964e+18, "learning_rate": 1.0770801484042939e-05, "loss": 6.8378, "loss/crossentropy": 1.9793974101543426, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.1507678169757128, "step": 29410 }, { "epoch": 0.9806666666666667, "grad_norm": 23.5, "grad_norm_var": 0.8333333333333334, "learning_rate": 1.0744965775210168e-05, "loss": 6.9311, "loss/crossentropy": 2.0206541672348974, "loss/hidden": 3.29609375, "loss/jsd": 0.0, "loss/logits": 0.18002614229917527, "step": 29420 }, { "epoch": 0.981, "grad_norm": 21.625, "grad_norm_var": 1.0957682291666666, "learning_rate": 1.0719566845698715e-05, "loss": 6.9871, "loss/crossentropy": 1.9728245690464974, "loss/hidden": 3.217578125, "loss/jsd": 0.0, "loss/logits": 0.16548432894051074, "step": 29430 }, { "epoch": 0.9813333333333333, "grad_norm": 28.5, "grad_norm_var": 2.74375, "learning_rate": 1.0694604946185762e-05, "loss": 6.9848, "loss/crossentropy": 2.042478208243847, "loss/hidden": 3.17890625, "loss/jsd": 0.0, "loss/logits": 0.17351799383759497, "step": 29440 }, { "epoch": 0.9816666666666667, "grad_norm": 23.75, "grad_norm_var": 2.24140625, "learning_rate": 1.0670080323035176e-05, "loss": 6.9812, "loss/crossentropy": 2.0702026799321174, "loss/hidden": 3.1515625, "loss/jsd": 0.0, "loss/logits": 0.15049307681620122, "step": 29450 }, { "epoch": 0.982, "grad_norm": 23.125, "grad_norm_var": 0.77265625, "learning_rate": 1.0645993218295088e-05, "loss": 6.8362, "loss/crossentropy": 1.9153663486242294, "loss/hidden": 3.1890625, "loss/jsd": 0.0, "loss/logits": 0.16654033735394477, "step": 29460 }, { "epoch": 0.9823333333333333, "grad_norm": 24.125, "grad_norm_var": 0.9309895833333334, "learning_rate": 1.0622343869695508e-05, "loss": 6.8102, "loss/crossentropy": 1.9985662505030632, "loss/hidden": 3.265234375, "loss/jsd": 0.0, "loss/logits": 0.16704850597307086, "step": 29470 }, { "epoch": 0.9826666666666667, "grad_norm": 23.25, "grad_norm_var": 1.1309895833333334, "learning_rate": 1.0599132510645939e-05, "loss": 6.8195, "loss/crossentropy": 2.0518441289663314, "loss/hidden": 3.111328125, "loss/jsd": 0.0, "loss/logits": 0.14787574112415314, "step": 29480 }, { "epoch": 0.983, "grad_norm": 25.625, "grad_norm_var": 2.0879557291666666, "learning_rate": 1.057635937023314e-05, "loss": 6.9664, "loss/crossentropy": 2.0693943217396735, "loss/hidden": 3.16015625, "loss/jsd": 0.0, "loss/logits": 0.15590767320245505, "step": 29490 }, { "epoch": 0.9833333333333333, "grad_norm": 23.875, "grad_norm_var": 0.9830729166666666, "learning_rate": 1.0554024673218807e-05, "loss": 6.9185, "loss/crossentropy": 1.8844229593873023, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.17540524620562792, "step": 29500 }, { "epoch": 0.9836666666666667, "grad_norm": 23.25, "grad_norm_var": 1.0747395833333333, "learning_rate": 1.053212864003738e-05, "loss": 6.869, "loss/crossentropy": 1.8106282196938992, "loss/hidden": 3.278515625, "loss/jsd": 0.0, "loss/logits": 0.15563625153154134, "step": 29510 }, { "epoch": 0.984, "grad_norm": 22.5, "grad_norm_var": 0.5796223958333333, "learning_rate": 1.0510671486793873e-05, "loss": 6.9721, "loss/crossentropy": 2.120162781327963, "loss/hidden": 3.1421875, "loss/jsd": 0.0, "loss/logits": 0.15654509966261684, "step": 29520 }, { "epoch": 0.9843333333333333, "grad_norm": 24.875, "grad_norm_var": 1.2080729166666666, "learning_rate": 1.0489653425261721e-05, "loss": 6.9396, "loss/crossentropy": 2.093233019858599, "loss/hidden": 3.149609375, "loss/jsd": 0.0, "loss/logits": 0.15795425418764353, "step": 29530 }, { "epoch": 0.9846666666666667, "grad_norm": 22.25, "grad_norm_var": 1.6087890625, "learning_rate": 1.046907466288071e-05, "loss": 6.7796, "loss/crossentropy": 2.213361156731844, "loss/hidden": 3.19765625, "loss/jsd": 0.0, "loss/logits": 0.16810417706146835, "step": 29540 }, { "epoch": 0.985, "grad_norm": 22.125, "grad_norm_var": 1.1510416666666667, "learning_rate": 1.0448935402754912e-05, "loss": 6.8736, "loss/crossentropy": 2.0324639290571214, "loss/hidden": 3.151171875, "loss/jsd": 0.0, "loss/logits": 0.16171343475580216, "step": 29550 }, { "epoch": 0.9853333333333333, "grad_norm": 38.25, "grad_norm_var": 33.201822916666664, "learning_rate": 1.0429235843650698e-05, "loss": 6.9544, "loss/crossentropy": 2.1379873633384703, "loss/hidden": 3.222265625, "loss/jsd": 0.0, "loss/logits": 0.16771480459719895, "step": 29560 }, { "epoch": 0.9856666666666667, "grad_norm": 22.0, "grad_norm_var": 33.7134765625, "learning_rate": 1.0409976179994762e-05, "loss": 6.8317, "loss/crossentropy": 1.8709135249257087, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.15978736570104957, "step": 29570 }, { "epoch": 0.986, "grad_norm": 23.625, "grad_norm_var": 0.9660807291666667, "learning_rate": 1.039115660187221e-05, "loss": 6.8523, "loss/crossentropy": 1.9445186778903008, "loss/hidden": 3.201953125, "loss/jsd": 0.0, "loss/logits": 0.15582914650440216, "step": 29580 }, { "epoch": 0.9863333333333333, "grad_norm": 25.5, "grad_norm_var": 2.6393229166666665, "learning_rate": 1.0372777295024676e-05, "loss": 6.8136, "loss/crossentropy": 1.7506938025355339, "loss/hidden": 3.274609375, "loss/jsd": 0.0, "loss/logits": 0.16162756085395813, "step": 29590 }, { "epoch": 0.9866666666666667, "grad_norm": 22.375, "grad_norm_var": 3.03125, "learning_rate": 1.0354838440848503e-05, "loss": 6.848, "loss/crossentropy": 2.032399223744869, "loss/hidden": 3.1625, "loss/jsd": 0.0, "loss/logits": 0.16586268395185472, "step": 29600 }, { "epoch": 0.987, "grad_norm": 22.625, "grad_norm_var": 0.3900390625, "learning_rate": 1.0337340216392933e-05, "loss": 6.8192, "loss/crossentropy": 1.934443362057209, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.180020921677351, "step": 29610 }, { "epoch": 0.9873333333333333, "grad_norm": 25.25, "grad_norm_var": 1.2455729166666667, "learning_rate": 1.032028279435839e-05, "loss": 6.9186, "loss/crossentropy": 2.2124100014567376, "loss/hidden": 3.185546875, "loss/jsd": 0.0, "loss/logits": 0.18372708857059478, "step": 29620 }, { "epoch": 0.9876666666666667, "grad_norm": 22.5, "grad_norm_var": 1.2145833333333333, "learning_rate": 1.030366634309473e-05, "loss": 6.8804, "loss/crossentropy": 2.0187501519918443, "loss/hidden": 3.20703125, "loss/jsd": 0.0, "loss/logits": 0.17060858262702822, "step": 29630 }, { "epoch": 0.988, "grad_norm": 23.125, "grad_norm_var": 1.0729166666666667, "learning_rate": 1.0287491026599623e-05, "loss": 6.892, "loss/crossentropy": 2.02512718886137, "loss/hidden": 3.1984375, "loss/jsd": 0.0, "loss/logits": 0.17388947010040284, "step": 29640 }, { "epoch": 0.9883333333333333, "grad_norm": 21.125, "grad_norm_var": 1.2705729166666666, "learning_rate": 1.0271757004516918e-05, "loss": 6.8015, "loss/crossentropy": 2.1398928314447403, "loss/hidden": 3.289453125, "loss/jsd": 0.0, "loss/logits": 0.1810132971033454, "step": 29650 }, { "epoch": 0.9886666666666667, "grad_norm": 23.875, "grad_norm_var": 2.3364583333333333, "learning_rate": 1.0256464432135048e-05, "loss": 7.0092, "loss/crossentropy": 2.1282053992152212, "loss/hidden": 3.175390625, "loss/jsd": 0.0, "loss/logits": 0.16205706167966127, "step": 29660 }, { "epoch": 0.989, "grad_norm": 25.75, "grad_norm_var": 1.3434895833333333, "learning_rate": 1.0241613460385547e-05, "loss": 6.9536, "loss/crossentropy": 2.082467722892761, "loss/hidden": 3.23515625, "loss/jsd": 0.0, "loss/logits": 0.17200557347387074, "step": 29670 }, { "epoch": 0.9893333333333333, "grad_norm": 22.75, "grad_norm_var": 1.3639973958333333, "learning_rate": 1.0227204235841493e-05, "loss": 7.0005, "loss/crossentropy": 2.0898186802864074, "loss/hidden": 3.2125, "loss/jsd": 0.0, "loss/logits": 0.18023983463644982, "step": 29680 }, { "epoch": 0.9896666666666667, "grad_norm": 23.0, "grad_norm_var": 1.6541015625, "learning_rate": 1.0213236900716126e-05, "loss": 6.853, "loss/crossentropy": 2.187703275680542, "loss/hidden": 3.0921875, "loss/jsd": 0.0, "loss/logits": 0.16371893137693405, "step": 29690 }, { "epoch": 0.99, "grad_norm": 22.875, "grad_norm_var": 0.7145182291666666, "learning_rate": 1.01997115928614e-05, "loss": 6.8488, "loss/crossentropy": 2.0195549950003624, "loss/hidden": 3.162109375, "loss/jsd": 0.0, "loss/logits": 0.16155789233744144, "step": 29700 }, { "epoch": 0.9903333333333333, "grad_norm": 24.5, "grad_norm_var": 0.5082682291666667, "learning_rate": 1.0186628445766647e-05, "loss": 6.8828, "loss/crossentropy": 2.007741495221853, "loss/hidden": 3.296484375, "loss/jsd": 0.0, "loss/logits": 0.17947645513340832, "step": 29710 }, { "epoch": 0.9906666666666667, "grad_norm": 22.25, "grad_norm_var": 0.8624348958333333, "learning_rate": 1.0173987588557237e-05, "loss": 6.9354, "loss/crossentropy": 2.135732203722, "loss/hidden": 3.221875, "loss/jsd": 0.0, "loss/logits": 0.16441609486937522, "step": 29720 }, { "epoch": 0.991, "grad_norm": 22.75, "grad_norm_var": 0.9004557291666667, "learning_rate": 1.0161789145993343e-05, "loss": 6.9097, "loss/crossentropy": 1.8655649699270724, "loss/hidden": 3.1390625, "loss/jsd": 0.0, "loss/logits": 0.15107152182608843, "step": 29730 }, { "epoch": 0.9913333333333333, "grad_norm": 22.0, "grad_norm_var": 1.1739583333333334, "learning_rate": 1.0150033238468656e-05, "loss": 6.9158, "loss/crossentropy": 1.901015117764473, "loss/hidden": 3.223828125, "loss/jsd": 0.0, "loss/logits": 0.1663993639871478, "step": 29740 }, { "epoch": 0.9916666666666667, "grad_norm": 23.75, "grad_norm_var": 0.9447916666666667, "learning_rate": 1.0138719982009242e-05, "loss": 6.8547, "loss/crossentropy": 2.1790758818387985, "loss/hidden": 3.03671875, "loss/jsd": 0.0, "loss/logits": 0.15662378910928965, "step": 29750 }, { "epoch": 0.992, "grad_norm": 23.875, "grad_norm_var": 0.5333333333333333, "learning_rate": 1.0127849488272375e-05, "loss": 6.9303, "loss/crossentropy": 1.960058543086052, "loss/hidden": 3.29765625, "loss/jsd": 0.0, "loss/logits": 0.16203988939523697, "step": 29760 }, { "epoch": 0.9923333333333333, "grad_norm": 23.25, "grad_norm_var": 1.4504557291666667, "learning_rate": 1.0117421864545435e-05, "loss": 6.9141, "loss/crossentropy": 2.013607097789645, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.1687733193859458, "step": 29770 }, { "epoch": 0.9926666666666667, "grad_norm": 25.0, "grad_norm_var": 0.8330729166666667, "learning_rate": 1.0107437213744867e-05, "loss": 6.8548, "loss/crossentropy": 1.9799937024712562, "loss/hidden": 3.115234375, "loss/jsd": 0.0, "loss/logits": 0.1619122765958309, "step": 29780 }, { "epoch": 0.993, "grad_norm": 22.625, "grad_norm_var": 0.8077473958333333, "learning_rate": 1.0097895634415135e-05, "loss": 6.8267, "loss/crossentropy": 2.14156903848052, "loss/hidden": 3.17109375, "loss/jsd": 0.0, "loss/logits": 0.16840279754251242, "step": 29790 }, { "epoch": 0.9933333333333333, "grad_norm": 24.25, "grad_norm_var": 2.7143229166666667, "learning_rate": 1.008879722072778e-05, "loss": 6.9818, "loss/crossentropy": 2.0121023267507554, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.17791486158967018, "step": 29800 }, { "epoch": 0.9936666666666667, "grad_norm": 23.375, "grad_norm_var": 1.2275390625, "learning_rate": 1.008014206248047e-05, "loss": 6.9668, "loss/crossentropy": 2.126334875077009, "loss/hidden": 3.2703125, "loss/jsd": 0.0, "loss/logits": 0.17075007781386375, "step": 29810 }, { "epoch": 0.994, "grad_norm": 22.5, "grad_norm_var": 0.6520182291666666, "learning_rate": 1.0071930245096125e-05, "loss": 6.9318, "loss/crossentropy": 2.3043752014636993, "loss/hidden": 3.238671875, "loss/jsd": 0.0, "loss/logits": 0.17486888822168112, "step": 29820 }, { "epoch": 0.9943333333333333, "grad_norm": 23.25, "grad_norm_var": 0.6082682291666667, "learning_rate": 1.0064161849622065e-05, "loss": 6.8321, "loss/crossentropy": 2.0536348327994345, "loss/hidden": 3.244921875, "loss/jsd": 0.0, "loss/logits": 0.16649605836719275, "step": 29830 }, { "epoch": 0.9946666666666667, "grad_norm": 22.125, "grad_norm_var": 1.2979166666666666, "learning_rate": 1.0056836952729215e-05, "loss": 6.9919, "loss/crossentropy": 1.991570144891739, "loss/hidden": 3.15234375, "loss/jsd": 0.0, "loss/logits": 0.19409504476934672, "step": 29840 }, { "epoch": 0.995, "grad_norm": 23.625, "grad_norm_var": 1.1014973958333334, "learning_rate": 1.0049955626711355e-05, "loss": 6.9519, "loss/crossentropy": 2.031588687002659, "loss/hidden": 3.28984375, "loss/jsd": 0.0, "loss/logits": 0.1794443614780903, "step": 29850 }, { "epoch": 0.9953333333333333, "grad_norm": 23.5, "grad_norm_var": 0.8025390625, "learning_rate": 1.004351793948439e-05, "loss": 6.8755, "loss/crossentropy": 2.027684749662876, "loss/hidden": 3.226953125, "loss/jsd": 0.0, "loss/logits": 0.17099270056933163, "step": 29860 }, { "epoch": 0.9956666666666667, "grad_norm": 23.75, "grad_norm_var": 0.9811848958333333, "learning_rate": 1.0037523954585697e-05, "loss": 6.8869, "loss/crossentropy": 1.9910648241639137, "loss/hidden": 3.235546875, "loss/jsd": 0.0, "loss/logits": 0.156367249134928, "step": 29870 }, { "epoch": 0.996, "grad_norm": 23.25, "grad_norm_var": 1.2125, "learning_rate": 1.0031973731173486e-05, "loss": 6.716, "loss/crossentropy": 1.9186164811253548, "loss/hidden": 3.153125, "loss/jsd": 0.0, "loss/logits": 0.15821984894573687, "step": 29880 }, { "epoch": 0.9963333333333333, "grad_norm": 24.625, "grad_norm_var": 0.98125, "learning_rate": 1.002686732402622e-05, "loss": 6.9111, "loss/crossentropy": 2.014424833655357, "loss/hidden": 3.20234375, "loss/jsd": 0.0, "loss/logits": 0.16852001175284387, "step": 29890 }, { "epoch": 0.9966666666666667, "grad_norm": 24.125, "grad_norm_var": 1.0712890625, "learning_rate": 1.002220478354208e-05, "loss": 6.9157, "loss/crossentropy": 2.054695198684931, "loss/hidden": 3.137109375, "loss/jsd": 0.0, "loss/logits": 0.15729085877537727, "step": 29900 }, { "epoch": 0.997, "grad_norm": 23.0, "grad_norm_var": 0.97265625, "learning_rate": 1.0017986155738457e-05, "loss": 6.8971, "loss/crossentropy": 1.8903593212366103, "loss/hidden": 3.093359375, "loss/jsd": 0.0, "loss/logits": 0.14642833340913058, "step": 29910 }, { "epoch": 0.9973333333333333, "grad_norm": 22.75, "grad_norm_var": 1.9572916666666667, "learning_rate": 1.0014211482251503e-05, "loss": 7.0028, "loss/crossentropy": 2.0199310213327406, "loss/hidden": 3.270703125, "loss/jsd": 0.0, "loss/logits": 0.17333223409950732, "step": 29920 }, { "epoch": 0.9976666666666667, "grad_norm": 27.5, "grad_norm_var": 2.383072916666667, "learning_rate": 1.0010880800335719e-05, "loss": 6.9141, "loss/crossentropy": 2.015287238359451, "loss/hidden": 3.228515625, "loss/jsd": 0.0, "loss/logits": 0.17535847648978234, "step": 29930 }, { "epoch": 0.998, "grad_norm": 23.375, "grad_norm_var": 2.2910807291666666, "learning_rate": 1.0007994142863597e-05, "loss": 6.8943, "loss/crossentropy": 2.1094014227390288, "loss/hidden": 3.202734375, "loss/jsd": 0.0, "loss/logits": 0.16332617327570914, "step": 29940 }, { "epoch": 0.9983333333333333, "grad_norm": 21.75, "grad_norm_var": 1.0893229166666667, "learning_rate": 1.0005551538325275e-05, "loss": 6.7901, "loss/crossentropy": 1.9851688370108604, "loss/hidden": 3.212890625, "loss/jsd": 0.0, "loss/logits": 0.1485843539237976, "step": 29950 }, { "epoch": 0.9986666666666667, "grad_norm": 24.25, "grad_norm_var": 0.675, "learning_rate": 1.0003553010828276e-05, "loss": 6.8926, "loss/crossentropy": 2.0727066323161125, "loss/hidden": 3.213671875, "loss/jsd": 0.0, "loss/logits": 0.16851818263530732, "step": 29960 }, { "epoch": 0.999, "grad_norm": 23.5, "grad_norm_var": 0.8747395833333333, "learning_rate": 1.000199858009726e-05, "loss": 6.8944, "loss/crossentropy": 2.1489027053117753, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.1805833499878645, "step": 29970 }, { "epoch": 0.9993333333333333, "grad_norm": 22.625, "grad_norm_var": 0.4759765625, "learning_rate": 1.0000888261473831e-05, "loss": 6.957, "loss/crossentropy": 2.0706566661596297, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.18294469746761025, "step": 29980 }, { "epoch": 0.9996666666666667, "grad_norm": 23.5, "grad_norm_var": 2.99140625, "learning_rate": 1.0000222065916382e-05, "loss": 6.9522, "loss/crossentropy": 2.0081850692629812, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.177884781640023, "step": 29990 }, { "epoch": 1.0, "grad_norm": 21.125, "grad_norm_var": 1.1768229166666666, "learning_rate": 1e-05, "loss": 6.8942, "loss/crossentropy": 2.0051390439271928, "loss/hidden": 3.146875, "loss/jsd": 0.0, "loss/logits": 0.15861098784953356, "step": 30000 } ], "logging_steps": 10, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.57253009602642e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }