{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 408.0, "learning_rate": 1.18e-05, "loss": 99.4106, "loss/crossentropy": 9.463456630706787, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.299906253814697, "step": 2 }, { "epoch": 0.00025, "grad_norm": 394.0, "learning_rate": 1.3600000000000002e-05, "loss": 98.4945, "loss/crossentropy": 9.324356079101562, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 7.335062265396118, "step": 4 }, { "epoch": 0.000375, "grad_norm": 400.0, "learning_rate": 1.54e-05, "loss": 98.8084, "loss/crossentropy": 9.356433391571045, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 7.372653961181641, "step": 6 }, { "epoch": 0.0005, "grad_norm": 200.0, "learning_rate": 1.72e-05, "loss": 95.0285, "loss/crossentropy": 8.909065246582031, "loss/hidden": 16.5625, "loss/jsd": 0.0, "loss/logits": 6.833947658538818, "step": 8 }, { "epoch": 0.000625, "grad_norm": 168.0, "learning_rate": 1.9e-05, "loss": 91.8288, "loss/crossentropy": 8.619627475738525, "loss/hidden": 16.5, "loss/jsd": 0.0, "loss/logits": 6.534380674362183, "step": 10 }, { "epoch": 0.00075, "grad_norm": 142.0, "learning_rate": 2.0800000000000004e-05, "loss": 87.6764, "loss/crossentropy": 8.3522367477417, "loss/hidden": 16.3125, "loss/jsd": 0.0, "loss/logits": 6.020437479019165, "step": 12 }, { "epoch": 0.000875, "grad_norm": 120.0, "learning_rate": 2.2600000000000004e-05, "loss": 86.5162, "loss/crossentropy": 8.181874752044678, "loss/hidden": 16.25, "loss/jsd": 0.0, "loss/logits": 6.170382499694824, "step": 14 }, { "epoch": 0.001, "grad_norm": 102.0, "grad_norm_var": 17309.115625, "learning_rate": 2.4400000000000004e-05, "loss": 83.2473, "loss/crossentropy": 7.804777383804321, "loss/hidden": 15.875, "loss/jsd": 0.0, "loss/logits": 5.761872053146362, "step": 16 }, { "epoch": 0.001125, "grad_norm": 98.0, "grad_norm_var": 16404.073958333334, "learning_rate": 2.6200000000000003e-05, "loss": 82.994, "loss/crossentropy": 8.070159673690796, "loss/hidden": 15.4375, "loss/jsd": 0.0, "loss/logits": 6.122724771499634, "step": 18 }, { "epoch": 0.00125, "grad_norm": 91.0, "grad_norm_var": 13484.980989583333, "learning_rate": 2.8000000000000003e-05, "loss": 75.5942, "loss/crossentropy": 7.523748397827148, "loss/hidden": 15.21875, "loss/jsd": 0.0, "loss/logits": 5.568537950515747, "step": 20 }, { "epoch": 0.001375, "grad_norm": 42.75, "grad_norm_var": 7560.057291666667, "learning_rate": 2.9800000000000006e-05, "loss": 74.5522, "loss/crossentropy": 7.2410197257995605, "loss/hidden": 15.0, "loss/jsd": 0.0, "loss/logits": 5.393213272094727, "step": 22 }, { "epoch": 0.0015, "grad_norm": 47.25, "grad_norm_var": 7167.35, "learning_rate": 3.16e-05, "loss": 70.721, "loss/crossentropy": 6.929592132568359, "loss/hidden": 15.0, "loss/jsd": 0.0, "loss/logits": 5.017954587936401, "step": 24 }, { "epoch": 0.001625, "grad_norm": 68.0, "grad_norm_var": 7036.907291666666, "learning_rate": 3.3400000000000005e-05, "loss": 67.2375, "loss/crossentropy": 6.483344078063965, "loss/hidden": 14.90625, "loss/jsd": 0.0, "loss/logits": 4.4856719970703125, "step": 26 }, { "epoch": 0.00175, "grad_norm": 85.5, "grad_norm_var": 7041.079166666666, "learning_rate": 3.520000000000001e-05, "loss": 62.0161, "loss/crossentropy": 6.2141289710998535, "loss/hidden": 13.9375, "loss/jsd": 0.0, "loss/logits": 3.9922406673431396, "step": 28 }, { "epoch": 0.001875, "grad_norm": 55.0, "grad_norm_var": 7087.145833333333, "learning_rate": 3.7e-05, "loss": 58.0482, "loss/crossentropy": 5.753799676895142, "loss/hidden": 13.625, "loss/jsd": 0.0, "loss/logits": 3.7023929357528687, "step": 30 }, { "epoch": 0.002, "grad_norm": 61.75, "grad_norm_var": 7260.804166666667, "learning_rate": 3.88e-05, "loss": 54.3741, "loss/crossentropy": 5.369441032409668, "loss/hidden": 13.40625, "loss/jsd": 0.0, "loss/logits": 3.5642011165618896, "step": 32 }, { "epoch": 0.002125, "grad_norm": 64.5, "grad_norm_var": 891.0416666666666, "learning_rate": 4.0600000000000004e-05, "loss": 49.2896, "loss/crossentropy": 5.303982257843018, "loss/hidden": 13.0, "loss/jsd": 0.0, "loss/logits": 3.1789742708206177, "step": 34 }, { "epoch": 0.00225, "grad_norm": 59.5, "grad_norm_var": 854.5059895833333, "learning_rate": 4.240000000000001e-05, "loss": 45.052, "loss/crossentropy": 4.87110447883606, "loss/hidden": 12.40625, "loss/jsd": 0.0, "loss/logits": 2.8510398864746094, "step": 36 }, { "epoch": 0.002375, "grad_norm": 55.0, "grad_norm_var": 841.62890625, "learning_rate": 4.420000000000001e-05, "loss": 41.4091, "loss/crossentropy": 4.75779914855957, "loss/hidden": 11.84375, "loss/jsd": 0.0, "loss/logits": 2.4686522483825684, "step": 38 }, { "epoch": 0.0025, "grad_norm": 43.75, "grad_norm_var": 855.7958333333333, "learning_rate": 4.600000000000001e-05, "loss": 38.7329, "loss/crossentropy": 4.4945924282073975, "loss/hidden": 11.4375, "loss/jsd": 0.0, "loss/logits": 2.3037325143814087, "step": 40 }, { "epoch": 0.002625, "grad_norm": 45.25, "grad_norm_var": 894.5875, "learning_rate": 4.78e-05, "loss": 36.1262, "loss/crossentropy": 4.092380046844482, "loss/hidden": 11.03125, "loss/jsd": 0.0, "loss/logits": 2.2024214267730713, "step": 42 }, { "epoch": 0.00275, "grad_norm": 30.875, "grad_norm_var": 165.4806640625, "learning_rate": 4.96e-05, "loss": 34.0356, "loss/crossentropy": 3.9454950094223022, "loss/hidden": 10.5, "loss/jsd": 0.0, "loss/logits": 1.9096736907958984, "step": 44 }, { "epoch": 0.002875, "grad_norm": 31.5, "grad_norm_var": 190.79368489583334, "learning_rate": 5.14e-05, "loss": 31.1559, "loss/crossentropy": 3.7383527755737305, "loss/hidden": 9.96875, "loss/jsd": 0.0, "loss/logits": 1.6771507859230042, "step": 46 }, { "epoch": 0.003, "grad_norm": 28.125, "grad_norm_var": 214.92604166666666, "learning_rate": 5.3200000000000006e-05, "loss": 30.2863, "loss/crossentropy": 3.717210292816162, "loss/hidden": 9.90625, "loss/jsd": 0.0, "loss/logits": 1.7804301381111145, "step": 48 }, { "epoch": 0.003125, "grad_norm": 26.375, "grad_norm_var": 156.2681640625, "learning_rate": 5.500000000000001e-05, "loss": 28.8697, "loss/crossentropy": 3.8603330850601196, "loss/hidden": 9.40625, "loss/jsd": 0.0, "loss/logits": 1.5492290258407593, "step": 50 }, { "epoch": 0.00325, "grad_norm": 23.125, "grad_norm_var": 97.6994140625, "learning_rate": 5.680000000000001e-05, "loss": 27.3189, "loss/crossentropy": 3.6423943042755127, "loss/hidden": 9.28125, "loss/jsd": 0.0, "loss/logits": 1.4414083361625671, "step": 52 }, { "epoch": 0.003375, "grad_norm": 31.75, "grad_norm_var": 54.634375, "learning_rate": 5.860000000000001e-05, "loss": 26.6009, "loss/crossentropy": 3.383611798286438, "loss/hidden": 9.0, "loss/jsd": 0.0, "loss/logits": 1.3774727582931519, "step": 54 }, { "epoch": 0.0035, "grad_norm": 17.5, "grad_norm_var": 56.4431640625, "learning_rate": 6.040000000000001e-05, "loss": 25.9301, "loss/crossentropy": 3.3406291007995605, "loss/hidden": 8.90625, "loss/jsd": 0.0, "loss/logits": 1.3027867674827576, "step": 56 }, { "epoch": 0.003625, "grad_norm": 19.625, "grad_norm_var": 805.69140625, "learning_rate": 6.220000000000001e-05, "loss": 24.8141, "loss/crossentropy": 3.271217703819275, "loss/hidden": 8.71875, "loss/jsd": 0.0, "loss/logits": 1.3078763484954834, "step": 58 }, { "epoch": 0.00375, "grad_norm": 19.875, "grad_norm_var": 835.2473307291667, "learning_rate": 6.400000000000001e-05, "loss": 24.7392, "loss/crossentropy": 3.1293649673461914, "loss/hidden": 8.4375, "loss/jsd": 0.0, "loss/logits": 1.280139446258545, "step": 60 }, { "epoch": 0.003875, "grad_norm": 19.25, "grad_norm_var": 852.6686848958333, "learning_rate": 6.58e-05, "loss": 22.9728, "loss/crossentropy": 3.0743978023529053, "loss/hidden": 8.109375, "loss/jsd": 0.0, "loss/logits": 1.1214115023612976, "step": 62 }, { "epoch": 0.004, "grad_norm": 22.625, "grad_norm_var": 859.1119140625, "learning_rate": 6.76e-05, "loss": 23.0224, "loss/crossentropy": 3.3769785165786743, "loss/hidden": 8.25, "loss/jsd": 0.0, "loss/logits": 1.1677427291870117, "step": 64 }, { "epoch": 0.004125, "grad_norm": 18.25, "grad_norm_var": 869.5059895833333, "learning_rate": 6.94e-05, "loss": 22.2622, "loss/crossentropy": 3.1320669651031494, "loss/hidden": 7.8125, "loss/jsd": 0.0, "loss/logits": 1.08091539144516, "step": 66 }, { "epoch": 0.00425, "grad_norm": 20.0, "grad_norm_var": 884.07265625, "learning_rate": 7.120000000000001e-05, "loss": 21.2449, "loss/crossentropy": 3.257786750793457, "loss/hidden": 7.65625, "loss/jsd": 0.0, "loss/logits": 1.028902530670166, "step": 68 }, { "epoch": 0.004375, "grad_norm": 13.6875, "grad_norm_var": 899.974853515625, "learning_rate": 7.3e-05, "loss": 20.7807, "loss/crossentropy": 2.906672239303589, "loss/hidden": 7.609375, "loss/jsd": 0.0, "loss/logits": 0.9820225834846497, "step": 70 }, { "epoch": 0.0045, "grad_norm": 19.5, "grad_norm_var": 901.5062337239583, "learning_rate": 7.48e-05, "loss": 20.4968, "loss/crossentropy": 3.15093994140625, "loss/hidden": 7.578125, "loss/jsd": 0.0, "loss/logits": 1.0488424897193909, "step": 72 }, { "epoch": 0.004625, "grad_norm": 13.75, "grad_norm_var": 11.296207682291667, "learning_rate": 7.66e-05, "loss": 19.8961, "loss/crossentropy": 3.124137759208679, "loss/hidden": 7.34375, "loss/jsd": 0.0, "loss/logits": 0.9181300401687622, "step": 74 }, { "epoch": 0.00475, "grad_norm": 21.375, "grad_norm_var": 11.601676432291667, "learning_rate": 7.840000000000001e-05, "loss": 19.5071, "loss/crossentropy": 3.0490193367004395, "loss/hidden": 7.34375, "loss/jsd": 0.0, "loss/logits": 0.9544317126274109, "step": 76 }, { "epoch": 0.004875, "grad_norm": 16.625, "grad_norm_var": 11.332405598958333, "learning_rate": 8.020000000000001e-05, "loss": 19.1397, "loss/crossentropy": 2.9865410327911377, "loss/hidden": 7.234375, "loss/jsd": 0.0, "loss/logits": 0.9217109084129333, "step": 78 }, { "epoch": 0.005, "grad_norm": 13.375, "grad_norm_var": 10.337955729166667, "learning_rate": 8.200000000000001e-05, "loss": 18.751, "loss/crossentropy": 2.766746163368225, "loss/hidden": 7.109375, "loss/jsd": 0.0, "loss/logits": 0.8647720515727997, "step": 80 }, { "epoch": 0.005125, "grad_norm": 16.25, "grad_norm_var": 11.413802083333334, "learning_rate": 8.38e-05, "loss": 18.6336, "loss/crossentropy": 3.0917201042175293, "loss/hidden": 6.875, "loss/jsd": 0.0, "loss/logits": 0.8534270823001862, "step": 82 }, { "epoch": 0.00525, "grad_norm": 14.5, "grad_norm_var": 11.0328125, "learning_rate": 8.560000000000001e-05, "loss": 18.3569, "loss/crossentropy": 2.9698469638824463, "loss/hidden": 7.03125, "loss/jsd": 0.0, "loss/logits": 0.8276518881320953, "step": 84 }, { "epoch": 0.005375, "grad_norm": 11.625, "grad_norm_var": 11.439567057291667, "learning_rate": 8.740000000000001e-05, "loss": 18.0427, "loss/crossentropy": 2.968225121498108, "loss/hidden": 6.8125, "loss/jsd": 0.0, "loss/logits": 0.8551926612854004, "step": 86 }, { "epoch": 0.0055, "grad_norm": 15.875, "grad_norm_var": 11.452978515625, "learning_rate": 8.92e-05, "loss": 17.7372, "loss/crossentropy": 2.954566717147827, "loss/hidden": 6.65625, "loss/jsd": 0.0, "loss/logits": 0.8068905174732208, "step": 88 }, { "epoch": 0.005625, "grad_norm": 13.75, "grad_norm_var": 11.686572265625, "learning_rate": 9.1e-05, "loss": 17.6758, "loss/crossentropy": 3.231398582458496, "loss/hidden": 6.5625, "loss/jsd": 0.0, "loss/logits": 0.8298341929912567, "step": 90 }, { "epoch": 0.00575, "grad_norm": 11.0625, "grad_norm_var": 11.1984375, "learning_rate": 9.28e-05, "loss": 17.011, "loss/crossentropy": 2.629736542701721, "loss/hidden": 6.5, "loss/jsd": 0.0, "loss/logits": 0.7391310036182404, "step": 92 }, { "epoch": 0.005875, "grad_norm": 14.8125, "grad_norm_var": 10.108186848958333, "learning_rate": 9.46e-05, "loss": 17.0497, "loss/crossentropy": 2.803489089012146, "loss/hidden": 6.5625, "loss/jsd": 0.0, "loss/logits": 0.8144381940364838, "step": 94 }, { "epoch": 0.006, "grad_norm": 12.5625, "grad_norm_var": 10.022916666666667, "learning_rate": 9.64e-05, "loss": 16.5939, "loss/crossentropy": 3.019722819328308, "loss/hidden": 6.46875, "loss/jsd": 0.0, "loss/logits": 0.8095101118087769, "step": 96 }, { "epoch": 0.006125, "grad_norm": 15.5, "grad_norm_var": 4.770686848958333, "learning_rate": 9.82e-05, "loss": 16.5934, "loss/crossentropy": 2.80954647064209, "loss/hidden": 6.421875, "loss/jsd": 0.0, "loss/logits": 0.7483513355255127, "step": 98 }, { "epoch": 0.00625, "grad_norm": 13.5625, "grad_norm_var": 6.648893229166666, "learning_rate": 0.0001, "loss": 16.4383, "loss/crossentropy": 2.4182586669921875, "loss/hidden": 6.21875, "loss/jsd": 0.0, "loss/logits": 0.6813479959964752, "step": 100 }, { "epoch": 0.006375, "grad_norm": 15.6875, "grad_norm_var": 6.005322265625, "learning_rate": 0.0001, "loss": 16.3773, "loss/crossentropy": 2.767289161682129, "loss/hidden": 6.40625, "loss/jsd": 0.0, "loss/logits": 0.7437765300273895, "step": 102 }, { "epoch": 0.0065, "grad_norm": 13.9375, "grad_norm_var": 6.213004557291667, "learning_rate": 0.0001, "loss": 16.6123, "loss/crossentropy": 2.8596930503845215, "loss/hidden": 6.328125, "loss/jsd": 0.0, "loss/logits": 0.7475454211235046, "step": 104 }, { "epoch": 0.006625, "grad_norm": 11.1875, "grad_norm_var": 6.547900390625, "learning_rate": 0.0001, "loss": 16.4102, "loss/crossentropy": 3.031221389770508, "loss/hidden": 6.296875, "loss/jsd": 0.0, "loss/logits": 0.8019546568393707, "step": 106 }, { "epoch": 0.00675, "grad_norm": 12.8125, "grad_norm_var": 6.1900390625, "learning_rate": 0.0001, "loss": 15.9361, "loss/crossentropy": 2.9293311834335327, "loss/hidden": 6.125, "loss/jsd": 0.0, "loss/logits": 0.7174897193908691, "step": 108 }, { "epoch": 0.006875, "grad_norm": 13.5625, "grad_norm_var": 6.680452473958334, "learning_rate": 0.0001, "loss": 15.6893, "loss/crossentropy": 3.064392924308777, "loss/hidden": 6.03125, "loss/jsd": 0.0, "loss/logits": 0.7035545110702515, "step": 110 }, { "epoch": 0.007, "grad_norm": 11.125, "grad_norm_var": 6.978059895833334, "learning_rate": 0.0001, "loss": 15.9123, "loss/crossentropy": 2.668829083442688, "loss/hidden": 6.1875, "loss/jsd": 0.0, "loss/logits": 0.6724480390548706, "step": 112 }, { "epoch": 0.007125, "grad_norm": 14.1875, "grad_norm_var": 6.252083333333333, "learning_rate": 0.0001, "loss": 15.4118, "loss/crossentropy": 2.6381983757019043, "loss/hidden": 6.140625, "loss/jsd": 0.0, "loss/logits": 0.6702737212181091, "step": 114 }, { "epoch": 0.00725, "grad_norm": 10.0625, "grad_norm_var": 3.822379557291667, "learning_rate": 0.0001, "loss": 14.9318, "loss/crossentropy": 2.583546996116638, "loss/hidden": 5.96875, "loss/jsd": 0.0, "loss/logits": 0.65639927983284, "step": 116 }, { "epoch": 0.007375, "grad_norm": 10.3125, "grad_norm_var": 2.2548014322916665, "learning_rate": 0.0001, "loss": 15.1022, "loss/crossentropy": 2.70282781124115, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 0.6296679973602295, "step": 118 }, { "epoch": 0.0075, "grad_norm": 10.8125, "grad_norm_var": 2.323681640625, "learning_rate": 0.0001, "loss": 14.9772, "loss/crossentropy": 2.714564800262451, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 0.631663054227829, "step": 120 }, { "epoch": 0.007625, "grad_norm": 9.75, "grad_norm_var": 2.398811848958333, "learning_rate": 0.0001, "loss": 15.0945, "loss/crossentropy": 2.716395616531372, "loss/hidden": 5.84375, "loss/jsd": 0.0, "loss/logits": 0.648982048034668, "step": 122 }, { "epoch": 0.00775, "grad_norm": 13.3125, "grad_norm_var": 2.604150390625, "learning_rate": 0.0001, "loss": 15.3371, "loss/crossentropy": 2.7478760480880737, "loss/hidden": 5.890625, "loss/jsd": 0.0, "loss/logits": 0.6630505919456482, "step": 124 }, { "epoch": 0.007875, "grad_norm": 10.5625, "grad_norm_var": 2.456884765625, "learning_rate": 0.0001, "loss": 15.198, "loss/crossentropy": 2.681770443916321, "loss/hidden": 5.71875, "loss/jsd": 0.0, "loss/logits": 0.631014883518219, "step": 126 }, { "epoch": 0.008, "grad_norm": 9.8125, "grad_norm_var": 2.166650390625, "learning_rate": 0.0001, "loss": 14.9414, "loss/crossentropy": 2.5563963651657104, "loss/hidden": 5.796875, "loss/jsd": 0.0, "loss/logits": 0.6471997797489166, "step": 128 }, { "epoch": 0.008125, "grad_norm": 12.25, "grad_norm_var": 1.3471354166666667, "learning_rate": 0.0001, "loss": 14.2076, "loss/crossentropy": 2.4711976051330566, "loss/hidden": 5.59375, "loss/jsd": 0.0, "loss/logits": 0.6036389470100403, "step": 130 }, { "epoch": 0.00825, "grad_norm": 8.0625, "grad_norm_var": 1.8124348958333334, "learning_rate": 0.0001, "loss": 14.3162, "loss/crossentropy": 2.717278480529785, "loss/hidden": 5.71875, "loss/jsd": 0.0, "loss/logits": 0.5794410407543182, "step": 132 }, { "epoch": 0.008375, "grad_norm": 9.3125, "grad_norm_var": 1.694384765625, "learning_rate": 0.0001, "loss": 14.3277, "loss/crossentropy": 2.708119750022888, "loss/hidden": 5.609375, "loss/jsd": 0.0, "loss/logits": 0.6186929643154144, "step": 134 }, { "epoch": 0.0085, "grad_norm": 9.4375, "grad_norm_var": 1.8212890625, "learning_rate": 0.0001, "loss": 14.342, "loss/crossentropy": 2.626417875289917, "loss/hidden": 5.609375, "loss/jsd": 0.0, "loss/logits": 0.6479573547840118, "step": 136 }, { "epoch": 0.008625, "grad_norm": 9.375, "grad_norm_var": 1.9150390625, "learning_rate": 0.0001, "loss": 14.3908, "loss/crossentropy": 2.669326066970825, "loss/hidden": 5.59375, "loss/jsd": 0.0, "loss/logits": 0.5919320583343506, "step": 138 }, { "epoch": 0.00875, "grad_norm": 9.875, "grad_norm_var": 1.1773274739583333, "learning_rate": 0.0001, "loss": 14.3644, "loss/crossentropy": 2.523828148841858, "loss/hidden": 5.5625, "loss/jsd": 0.0, "loss/logits": 0.5997762680053711, "step": 140 }, { "epoch": 0.008875, "grad_norm": 10.3125, "grad_norm_var": 1.1572265625, "learning_rate": 0.0001, "loss": 13.9086, "loss/crossentropy": 2.4612866640090942, "loss/hidden": 5.5625, "loss/jsd": 0.0, "loss/logits": 0.5790427327156067, "step": 142 }, { "epoch": 0.009, "grad_norm": 9.3125, "grad_norm_var": 1.0247395833333333, "learning_rate": 0.0001, "loss": 14.3827, "loss/crossentropy": 2.3406189680099487, "loss/hidden": 5.53125, "loss/jsd": 0.0, "loss/logits": 0.5852385461330414, "step": 144 }, { "epoch": 0.009125, "grad_norm": 8.0625, "grad_norm_var": 0.7343098958333333, "learning_rate": 0.0001, "loss": 14.0124, "loss/crossentropy": 2.3046847581863403, "loss/hidden": 5.453125, "loss/jsd": 0.0, "loss/logits": 0.5159327685832977, "step": 146 }, { "epoch": 0.00925, "grad_norm": 14.1875, "grad_norm_var": 1.8917805989583334, "learning_rate": 0.0001, "loss": 14.2854, "loss/crossentropy": 2.673864483833313, "loss/hidden": 5.421875, "loss/jsd": 0.0, "loss/logits": 0.568247377872467, "step": 148 }, { "epoch": 0.009375, "grad_norm": 7.59375, "grad_norm_var": 2.16685791015625, "learning_rate": 0.0001, "loss": 13.7424, "loss/crossentropy": 2.4085657596588135, "loss/hidden": 5.328125, "loss/jsd": 0.0, "loss/logits": 0.576050728559494, "step": 150 }, { "epoch": 0.0095, "grad_norm": 9.625, "grad_norm_var": 2.378759765625, "learning_rate": 0.0001, "loss": 13.8437, "loss/crossentropy": 2.6601611375808716, "loss/hidden": 5.40625, "loss/jsd": 0.0, "loss/logits": 0.5511062890291214, "step": 152 }, { "epoch": 0.009625, "grad_norm": 8.375, "grad_norm_var": 2.7024576822916666, "learning_rate": 0.0001, "loss": 13.9772, "loss/crossentropy": 2.6024333238601685, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.6262157559394836, "step": 154 }, { "epoch": 0.00975, "grad_norm": 7.875, "grad_norm_var": 3.062744140625, "learning_rate": 0.0001, "loss": 13.9552, "loss/crossentropy": 2.8262619972229004, "loss/hidden": 5.40625, "loss/jsd": 0.0, "loss/logits": 0.5858268141746521, "step": 156 }, { "epoch": 0.009875, "grad_norm": 7.46875, "grad_norm_var": 3.2928670247395835, "learning_rate": 0.0001, "loss": 13.5471, "loss/crossentropy": 2.4514461755752563, "loss/hidden": 5.4375, "loss/jsd": 0.0, "loss/logits": 0.5691545605659485, "step": 158 }, { "epoch": 0.01, "grad_norm": 10.125, "grad_norm_var": 3.296317545572917, "learning_rate": 0.0001, "loss": 13.7877, "loss/crossentropy": 2.5704935789108276, "loss/hidden": 5.390625, "loss/jsd": 0.0, "loss/logits": 0.5797623991966248, "step": 160 }, { "epoch": 0.010125, "grad_norm": 8.9375, "grad_norm_var": 3.193648274739583, "learning_rate": 0.0001, "loss": 13.5383, "loss/crossentropy": 2.5421087741851807, "loss/hidden": 5.296875, "loss/jsd": 0.0, "loss/logits": 0.5653413534164429, "step": 162 }, { "epoch": 0.01025, "grad_norm": 9.375, "grad_norm_var": 1.48746337890625, "learning_rate": 0.0001, "loss": 13.5876, "loss/crossentropy": 2.5322933197021484, "loss/hidden": 5.34375, "loss/jsd": 0.0, "loss/logits": 0.5471592545509338, "step": 164 }, { "epoch": 0.010375, "grad_norm": 7.71875, "grad_norm_var": 1.2683878580729167, "learning_rate": 0.0001, "loss": 13.3671, "loss/crossentropy": 2.5972307920455933, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5242535173892975, "step": 166 }, { "epoch": 0.0105, "grad_norm": 7.71875, "grad_norm_var": 1.3320149739583333, "learning_rate": 0.0001, "loss": 13.727, "loss/crossentropy": 2.654882788658142, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5885663330554962, "step": 168 }, { "epoch": 0.010625, "grad_norm": 8.6875, "grad_norm_var": 0.7624348958333333, "learning_rate": 0.0001, "loss": 13.6388, "loss/crossentropy": 2.577346444129944, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5597317218780518, "step": 170 }, { "epoch": 0.01075, "grad_norm": 7.78125, "grad_norm_var": 0.7722615559895833, "learning_rate": 0.0001, "loss": 13.3253, "loss/crossentropy": 2.513775587081909, "loss/hidden": 5.25, "loss/jsd": 0.0, "loss/logits": 0.5256665050983429, "step": 172 }, { "epoch": 0.010875, "grad_norm": 7.75, "grad_norm_var": 0.7695149739583333, "learning_rate": 0.0001, "loss": 13.1859, "loss/crossentropy": 2.507006049156189, "loss/hidden": 5.171875, "loss/jsd": 0.0, "loss/logits": 0.5298479795455933, "step": 174 }, { "epoch": 0.011, "grad_norm": 10.5, "grad_norm_var": 0.8642578125, "learning_rate": 0.0001, "loss": 13.3871, "loss/crossentropy": 2.546314835548401, "loss/hidden": 5.203125, "loss/jsd": 0.0, "loss/logits": 0.5244591236114502, "step": 176 }, { "epoch": 0.011125, "grad_norm": 7.71875, "grad_norm_var": 1.03707275390625, "learning_rate": 0.0001, "loss": 13.6056, "loss/crossentropy": 2.5370965003967285, "loss/hidden": 5.15625, "loss/jsd": 0.0, "loss/logits": 0.5421458780765533, "step": 178 }, { "epoch": 0.01125, "grad_norm": 7.8125, "grad_norm_var": 1.2237589518229166, "learning_rate": 0.0001, "loss": 13.6754, "loss/crossentropy": 2.702122926712036, "loss/hidden": 5.265625, "loss/jsd": 0.0, "loss/logits": 0.5821575820446014, "step": 180 }, { "epoch": 0.011375, "grad_norm": 10.1875, "grad_norm_var": 1.307666015625, "learning_rate": 0.0001, "loss": 13.4621, "loss/crossentropy": 2.6636409759521484, "loss/hidden": 5.1875, "loss/jsd": 0.0, "loss/logits": 0.5116533488035202, "step": 182 }, { "epoch": 0.0115, "grad_norm": 7.375, "grad_norm_var": 1.208984375, "learning_rate": 0.0001, "loss": 13.4015, "loss/crossentropy": 2.608703851699829, "loss/hidden": 5.203125, "loss/jsd": 0.0, "loss/logits": 0.5401088297367096, "step": 184 }, { "epoch": 0.011625, "grad_norm": 8.5625, "grad_norm_var": 1.2628214518229166, "learning_rate": 0.0001, "loss": 13.3483, "loss/crossentropy": 2.604197859764099, "loss/hidden": 5.296875, "loss/jsd": 0.0, "loss/logits": 0.5501621663570404, "step": 186 }, { "epoch": 0.01175, "grad_norm": 8.0625, "grad_norm_var": 1.2360514322916667, "learning_rate": 0.0001, "loss": 13.2514, "loss/crossentropy": 2.384081482887268, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.5311485826969147, "step": 188 }, { "epoch": 0.011875, "grad_norm": 7.5, "grad_norm_var": 1.4585896809895833, "learning_rate": 0.0001, "loss": 12.9879, "loss/crossentropy": 2.501517176628113, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5271838307380676, "step": 190 }, { "epoch": 0.012, "grad_norm": 9.125, "grad_norm_var": 1.30015869140625, "learning_rate": 0.0001, "loss": 13.1282, "loss/crossentropy": 2.4795455932617188, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.5566309094429016, "step": 192 }, { "epoch": 0.012125, "grad_norm": 7.5625, "grad_norm_var": 1.1060506184895833, "learning_rate": 0.0001, "loss": 13.0203, "loss/crossentropy": 2.615292191505432, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5351596176624298, "step": 194 }, { "epoch": 0.01225, "grad_norm": 8.4375, "grad_norm_var": 0.74791259765625, "learning_rate": 0.0001, "loss": 13.2183, "loss/crossentropy": 2.777504324913025, "loss/hidden": 5.0625, "loss/jsd": 0.0, "loss/logits": 0.5449462532997131, "step": 196 }, { "epoch": 0.012375, "grad_norm": 8.25, "grad_norm_var": 0.42310791015625, "learning_rate": 0.0001, "loss": 13.2077, "loss/crossentropy": 2.8624597787857056, "loss/hidden": 5.109375, "loss/jsd": 0.0, "loss/logits": 0.546259343624115, "step": 198 }, { "epoch": 0.0125, "grad_norm": 8.1875, "grad_norm_var": 0.5727498372395833, "learning_rate": 0.0001, "loss": 13.1209, "loss/crossentropy": 2.6138558387756348, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.5038184821605682, "step": 200 }, { "epoch": 0.012625, "grad_norm": 7.625, "grad_norm_var": 0.5636678059895833, "learning_rate": 0.0001, "loss": 12.7479, "loss/crossentropy": 2.6792138814926147, "loss/hidden": 5.046875, "loss/jsd": 0.0, "loss/logits": 0.5708663761615753, "step": 202 }, { "epoch": 0.01275, "grad_norm": 10.1875, "grad_norm_var": 0.7628743489583333, "learning_rate": 0.0001, "loss": 12.7616, "loss/crossentropy": 2.436267137527466, "loss/hidden": 5.015625, "loss/jsd": 0.0, "loss/logits": 0.56131911277771, "step": 204 }, { "epoch": 0.012875, "grad_norm": 8.3125, "grad_norm_var": 1.3605428059895834, "learning_rate": 0.0001, "loss": 12.5898, "loss/crossentropy": 2.3147178888320923, "loss/hidden": 5.0, "loss/jsd": 0.0, "loss/logits": 0.48664502799510956, "step": 206 }, { "epoch": 0.013, "grad_norm": 6.3125, "grad_norm_var": 1.7156209309895833, "learning_rate": 0.0001, "loss": 12.1952, "loss/crossentropy": 2.36074697971344, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.4715626835823059, "step": 208 }, { "epoch": 0.013125, "grad_norm": 7.65625, "grad_norm_var": 1.71051025390625, "learning_rate": 0.0001, "loss": 12.6705, "loss/crossentropy": 2.585902214050293, "loss/hidden": 5.03125, "loss/jsd": 0.0, "loss/logits": 0.5059193968772888, "step": 210 }, { "epoch": 0.01325, "grad_norm": 7.5625, "grad_norm_var": 1.7617146809895834, "learning_rate": 0.0001, "loss": 12.6794, "loss/crossentropy": 2.485123038291931, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.5168211758136749, "step": 212 }, { "epoch": 0.013375, "grad_norm": 6.28125, "grad_norm_var": 1.9130167643229166, "learning_rate": 0.0001, "loss": 12.3766, "loss/crossentropy": 2.730853319168091, "loss/hidden": 4.96875, "loss/jsd": 0.0, "loss/logits": 0.5259307771921158, "step": 214 }, { "epoch": 0.0135, "grad_norm": 7.5, "grad_norm_var": 1.82261962890625, "learning_rate": 0.0001, "loss": 12.2796, "loss/crossentropy": 2.5561338663101196, "loss/hidden": 4.875, "loss/jsd": 0.0, "loss/logits": 0.5048320591449738, "step": 216 }, { "epoch": 0.013625, "grad_norm": 7.03125, "grad_norm_var": 1.89547119140625, "learning_rate": 0.0001, "loss": 12.4252, "loss/crossentropy": 2.579685091972351, "loss/hidden": 4.953125, "loss/jsd": 0.0, "loss/logits": 0.4986896812915802, "step": 218 }, { "epoch": 0.01375, "grad_norm": 6.875, "grad_norm_var": 1.7075520833333333, "learning_rate": 0.0001, "loss": 12.2975, "loss/crossentropy": 2.646655321121216, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.488240122795105, "step": 220 }, { "epoch": 0.013875, "grad_norm": 7.15625, "grad_norm_var": 0.49394124348958335, "learning_rate": 0.0001, "loss": 12.4559, "loss/crossentropy": 2.576615571975708, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.4858951270580292, "step": 222 }, { "epoch": 0.014, "grad_norm": 7.28125, "grad_norm_var": 0.4115193684895833, "learning_rate": 0.0001, "loss": 12.0337, "loss/crossentropy": 2.532575011253357, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.5103432983160019, "step": 224 }, { "epoch": 0.014125, "grad_norm": 6.375, "grad_norm_var": 0.3961588541666667, "learning_rate": 0.0001, "loss": 12.3373, "loss/crossentropy": 2.6340404748916626, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.4943613111972809, "step": 226 }, { "epoch": 0.01425, "grad_norm": 10.1875, "grad_norm_var": 1.01871337890625, "learning_rate": 0.0001, "loss": 12.8549, "loss/crossentropy": 2.454265832901001, "loss/hidden": 4.703125, "loss/jsd": 0.0, "loss/logits": 0.45377302169799805, "step": 228 }, { "epoch": 0.014375, "grad_norm": 7.65625, "grad_norm_var": 0.9704427083333333, "learning_rate": 0.0001, "loss": 12.2599, "loss/crossentropy": 2.6582183837890625, "loss/hidden": 4.921875, "loss/jsd": 0.0, "loss/logits": 0.5020380616188049, "step": 230 }, { "epoch": 0.0145, "grad_norm": 7.5, "grad_norm_var": 1.1243326822916666, "learning_rate": 0.0001, "loss": 12.6468, "loss/crossentropy": 2.657613158226013, "loss/hidden": 4.90625, "loss/jsd": 0.0, "loss/logits": 0.5409696698188782, "step": 232 }, { "epoch": 0.014625, "grad_norm": 7.0625, "grad_norm_var": 1.0277628580729166, "learning_rate": 0.0001, "loss": 12.4568, "loss/crossentropy": 2.5104974508285522, "loss/hidden": 4.859375, "loss/jsd": 0.0, "loss/logits": 0.4923330545425415, "step": 234 }, { "epoch": 0.01475, "grad_norm": 7.59375, "grad_norm_var": 0.8995930989583333, "learning_rate": 0.0001, "loss": 11.9849, "loss/crossentropy": 2.52110493183136, "loss/hidden": 4.828125, "loss/jsd": 0.0, "loss/logits": 0.4490865021944046, "step": 236 }, { "epoch": 0.014875, "grad_norm": 6.84375, "grad_norm_var": 1.0208984375, "learning_rate": 0.0001, "loss": 12.546, "loss/crossentropy": 2.8875030279159546, "loss/hidden": 4.890625, "loss/jsd": 0.0, "loss/logits": 0.5406565517187119, "step": 238 }, { "epoch": 0.015, "grad_norm": 6.0, "grad_norm_var": 1.1588175455729166, "learning_rate": 0.0001, "loss": 12.2478, "loss/crossentropy": 2.5239052772521973, "loss/hidden": 4.84375, "loss/jsd": 0.0, "loss/logits": 0.4846698194742203, "step": 240 }, { "epoch": 0.015125, "grad_norm": 6.40625, "grad_norm_var": 1.221728515625, "learning_rate": 0.0001, "loss": 12.3295, "loss/crossentropy": 2.4903002977371216, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.48552611470222473, "step": 242 }, { "epoch": 0.01525, "grad_norm": 6.9375, "grad_norm_var": 0.6412760416666666, "learning_rate": 0.0001, "loss": 12.0959, "loss/crossentropy": 2.723002791404724, "loss/hidden": 4.796875, "loss/jsd": 0.0, "loss/logits": 0.4821483790874481, "step": 244 }, { "epoch": 0.015375, "grad_norm": 5.90625, "grad_norm_var": 0.7720662434895833, "learning_rate": 0.0001, "loss": 12.21, "loss/crossentropy": 2.433968424797058, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.47880972921848297, "step": 246 }, { "epoch": 0.0155, "grad_norm": 8.5, "grad_norm_var": 0.6696248372395833, "learning_rate": 0.0001, "loss": 12.3585, "loss/crossentropy": 2.672105550765991, "loss/hidden": 4.859375, "loss/jsd": 0.0, "loss/logits": 0.5268009155988693, "step": 248 }, { "epoch": 0.015625, "grad_norm": 6.4375, "grad_norm_var": 0.7093587239583333, "learning_rate": 0.0001, "loss": 11.961, "loss/crossentropy": 2.404228091239929, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.45947059988975525, "step": 250 }, { "epoch": 0.01575, "grad_norm": 6.5625, "grad_norm_var": 0.6781209309895834, "learning_rate": 0.0001, "loss": 11.9045, "loss/crossentropy": 2.5632474422454834, "loss/hidden": 4.78125, "loss/jsd": 0.0, "loss/logits": 0.49997828900814056, "step": 252 }, { "epoch": 0.015875, "grad_norm": 6.28125, "grad_norm_var": 0.38396809895833334, "learning_rate": 0.0001, "loss": 11.9424, "loss/crossentropy": 2.7031558752059937, "loss/hidden": 4.8125, "loss/jsd": 0.0, "loss/logits": 0.487554132938385, "step": 254 }, { "epoch": 0.016, "grad_norm": 7.15625, "grad_norm_var": 0.36288655598958336, "learning_rate": 0.0001, "loss": 12.2919, "loss/crossentropy": 2.5938860177993774, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.5034405440092087, "step": 256 }, { "epoch": 0.016125, "grad_norm": 5.96875, "grad_norm_var": 0.37994384765625, "learning_rate": 0.0001, "loss": 12.0851, "loss/crossentropy": 2.503898859024048, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.45617610216140747, "step": 258 }, { "epoch": 0.01625, "grad_norm": 7.25, "grad_norm_var": 0.39628499348958335, "learning_rate": 0.0001, "loss": 12.2487, "loss/crossentropy": 2.721401333808899, "loss/hidden": 4.765625, "loss/jsd": 0.0, "loss/logits": 0.5100695192813873, "step": 260 }, { "epoch": 0.016375, "grad_norm": 6.5, "grad_norm_var": 0.3338541666666667, "learning_rate": 0.0001, "loss": 11.701, "loss/crossentropy": 2.6579989194869995, "loss/hidden": 4.671875, "loss/jsd": 0.0, "loss/logits": 0.4803234338760376, "step": 262 }, { "epoch": 0.0165, "grad_norm": 6.3125, "grad_norm_var": 0.11678059895833333, "learning_rate": 0.0001, "loss": 11.7867, "loss/crossentropy": 2.5109288692474365, "loss/hidden": 4.734375, "loss/jsd": 0.0, "loss/logits": 0.4530385881662369, "step": 264 }, { "epoch": 0.016625, "grad_norm": 6.875, "grad_norm_var": 0.12444254557291666, "learning_rate": 0.0001, "loss": 12.4496, "loss/crossentropy": 2.5854159593582153, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4961777627468109, "step": 266 }, { "epoch": 0.01675, "grad_norm": 6.0625, "grad_norm_var": 0.17303059895833334, "learning_rate": 0.0001, "loss": 11.7866, "loss/crossentropy": 2.411824584007263, "loss/hidden": 4.5, "loss/jsd": 0.0, "loss/logits": 0.43849293887615204, "step": 268 }, { "epoch": 0.016875, "grad_norm": 6.875, "grad_norm_var": 0.17593994140625, "learning_rate": 0.0001, "loss": 11.6888, "loss/crossentropy": 2.568202257156372, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4507843852043152, "step": 270 }, { "epoch": 0.017, "grad_norm": 6.8125, "grad_norm_var": 0.16825764973958332, "learning_rate": 0.0001, "loss": 12.0805, "loss/crossentropy": 2.734534502029419, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.47038406133651733, "step": 272 }, { "epoch": 0.017125, "grad_norm": 6.0625, "grad_norm_var": 0.18333333333333332, "learning_rate": 0.0001, "loss": 11.6923, "loss/crossentropy": 2.545106887817383, "loss/hidden": 4.65625, "loss/jsd": 0.0, "loss/logits": 0.4470268785953522, "step": 274 }, { "epoch": 0.01725, "grad_norm": 6.75, "grad_norm_var": 0.15310872395833333, "learning_rate": 0.0001, "loss": 11.8355, "loss/crossentropy": 2.5739216804504395, "loss/hidden": 4.6875, "loss/jsd": 0.0, "loss/logits": 0.5115124732255936, "step": 276 }, { "epoch": 0.017375, "grad_norm": 6.25, "grad_norm_var": 0.15276285807291667, "learning_rate": 0.0001, "loss": 12.1692, "loss/crossentropy": 2.8947519063949585, "loss/hidden": 4.71875, "loss/jsd": 0.0, "loss/logits": 0.5023118555545807, "step": 278 }, { "epoch": 0.0175, "grad_norm": 6.34375, "grad_norm_var": 0.16066080729166668, "learning_rate": 0.0001, "loss": 11.3566, "loss/crossentropy": 2.4826793670654297, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.45133158564567566, "step": 280 }, { "epoch": 0.017625, "grad_norm": 6.03125, "grad_norm_var": 0.142041015625, "learning_rate": 0.0001, "loss": 11.841, "loss/crossentropy": 2.4146469831466675, "loss/hidden": 4.640625, "loss/jsd": 0.0, "loss/logits": 0.4536858946084976, "step": 282 }, { "epoch": 0.01775, "grad_norm": 7.28125, "grad_norm_var": 0.18800455729166668, "learning_rate": 0.0001, "loss": 11.688, "loss/crossentropy": 2.450984477996826, "loss/hidden": 4.671875, "loss/jsd": 0.0, "loss/logits": 0.4384435713291168, "step": 284 }, { "epoch": 0.017875, "grad_norm": 6.25, "grad_norm_var": 0.18196207682291668, "learning_rate": 0.0001, "loss": 11.8491, "loss/crossentropy": 2.374552607536316, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.42816005647182465, "step": 286 }, { "epoch": 0.018, "grad_norm": 5.625, "grad_norm_var": 0.16256510416666667, "learning_rate": 0.0001, "loss": 11.6167, "loss/crossentropy": 2.4289149045944214, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.4989480674266815, "step": 288 }, { "epoch": 0.018125, "grad_norm": 6.46875, "grad_norm_var": 0.16741129557291667, "learning_rate": 0.0001, "loss": 11.5302, "loss/crossentropy": 2.5054534673690796, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.45670510828495026, "step": 290 }, { "epoch": 0.01825, "grad_norm": 5.46875, "grad_norm_var": 0.18970947265625, "learning_rate": 0.0001, "loss": 11.3159, "loss/crossentropy": 2.421309471130371, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4126213788986206, "step": 292 }, { "epoch": 0.018375, "grad_norm": 6.71875, "grad_norm_var": 0.21484375, "learning_rate": 0.0001, "loss": 11.7242, "loss/crossentropy": 2.6865302324295044, "loss/hidden": 4.609375, "loss/jsd": 0.0, "loss/logits": 0.48639141023159027, "step": 294 }, { "epoch": 0.0185, "grad_norm": 6.71875, "grad_norm_var": 0.23710530598958332, "learning_rate": 0.0001, "loss": 11.7623, "loss/crossentropy": 2.530078172683716, "loss/hidden": 4.625, "loss/jsd": 0.0, "loss/logits": 0.4684665650129318, "step": 296 }, { "epoch": 0.018625, "grad_norm": 5.8125, "grad_norm_var": 0.24947916666666667, "learning_rate": 0.0001, "loss": 11.4115, "loss/crossentropy": 2.2663209438323975, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4574204683303833, "step": 298 }, { "epoch": 0.01875, "grad_norm": 6.0, "grad_norm_var": 0.17994384765625, "learning_rate": 0.0001, "loss": 11.4217, "loss/crossentropy": 2.260213851928711, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.48128053545951843, "step": 300 }, { "epoch": 0.018875, "grad_norm": 6.0, "grad_norm_var": 0.15660400390625, "learning_rate": 0.0001, "loss": 11.3582, "loss/crossentropy": 2.445768356323242, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.44302406907081604, "step": 302 }, { "epoch": 0.019, "grad_norm": 7.40625, "grad_norm_var": 0.284619140625, "learning_rate": 0.0001, "loss": 11.3538, "loss/crossentropy": 2.497570037841797, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.42873415350914, "step": 304 }, { "epoch": 0.019125, "grad_norm": 5.375, "grad_norm_var": 0.33358968098958336, "learning_rate": 0.0001, "loss": 11.704, "loss/crossentropy": 2.4702342748641968, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4643610119819641, "step": 306 }, { "epoch": 0.01925, "grad_norm": 5.875, "grad_norm_var": 0.3282389322916667, "learning_rate": 0.0001, "loss": 11.5569, "loss/crossentropy": 2.613131880760193, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.4228426665067673, "step": 308 }, { "epoch": 0.019375, "grad_norm": 5.9375, "grad_norm_var": 0.32916259765625, "learning_rate": 0.0001, "loss": 11.82, "loss/crossentropy": 2.655569911003113, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4577495604753494, "step": 310 }, { "epoch": 0.0195, "grad_norm": 5.5, "grad_norm_var": 0.396875, "learning_rate": 0.0001, "loss": 11.2265, "loss/crossentropy": 2.5523258447647095, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.4802166670560837, "step": 312 }, { "epoch": 0.019625, "grad_norm": 6.0625, "grad_norm_var": 0.4364420572916667, "learning_rate": 0.0001, "loss": 11.369, "loss/crossentropy": 2.7622017860412598, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.4416071027517319, "step": 314 }, { "epoch": 0.01975, "grad_norm": 6.46875, "grad_norm_var": 0.43814697265625, "learning_rate": 0.0001, "loss": 11.5738, "loss/crossentropy": 2.592649459838867, "loss/hidden": 4.515625, "loss/jsd": 0.0, "loss/logits": 0.4513923078775406, "step": 316 }, { "epoch": 0.019875, "grad_norm": 6.90625, "grad_norm_var": 0.48209228515625, "learning_rate": 0.0001, "loss": 11.6672, "loss/crossentropy": 2.4992705583572388, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4300658255815506, "step": 318 }, { "epoch": 0.02, "grad_norm": 5.84375, "grad_norm_var": 0.34729410807291666, "learning_rate": 0.0001, "loss": 11.2722, "loss/crossentropy": 2.493373394012451, "loss/hidden": 4.421875, "loss/jsd": 0.0, "loss/logits": 0.442849725484848, "step": 320 }, { "epoch": 0.020125, "grad_norm": 6.28125, "grad_norm_var": 0.30530192057291666, "learning_rate": 0.0001, "loss": 11.5529, "loss/crossentropy": 2.547645926475525, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.48772141337394714, "step": 322 }, { "epoch": 0.02025, "grad_norm": 5.6875, "grad_norm_var": 0.30276285807291664, "learning_rate": 0.0001, "loss": 11.7243, "loss/crossentropy": 2.523668885231018, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.46751105785369873, "step": 324 }, { "epoch": 0.020375, "grad_norm": 5.5, "grad_norm_var": 0.25611979166666665, "learning_rate": 0.0001, "loss": 11.3933, "loss/crossentropy": 2.300553321838379, "loss/hidden": 4.453125, "loss/jsd": 0.0, "loss/logits": 0.4517511576414108, "step": 326 }, { "epoch": 0.0205, "grad_norm": 6.46875, "grad_norm_var": 0.2505859375, "learning_rate": 0.0001, "loss": 11.1293, "loss/crossentropy": 2.2729378938674927, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.40413306653499603, "step": 328 }, { "epoch": 0.020625, "grad_norm": 6.0, "grad_norm_var": 0.21888020833333333, "learning_rate": 0.0001, "loss": 11.1315, "loss/crossentropy": 2.4203063249588013, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4310520142316818, "step": 330 }, { "epoch": 0.02075, "grad_norm": 5.65625, "grad_norm_var": 0.23023681640625, "learning_rate": 0.0001, "loss": 11.1679, "loss/crossentropy": 2.399292469024658, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.42272189259529114, "step": 332 }, { "epoch": 0.020875, "grad_norm": 6.625, "grad_norm_var": 0.326953125, "learning_rate": 0.0001, "loss": 11.5921, "loss/crossentropy": 2.7525601387023926, "loss/hidden": 4.578125, "loss/jsd": 0.0, "loss/logits": 0.4559982270002365, "step": 334 }, { "epoch": 0.021, "grad_norm": 4.71875, "grad_norm_var": 0.3798136393229167, "learning_rate": 0.0001, "loss": 11.1814, "loss/crossentropy": 2.5639878511428833, "loss/hidden": 4.4375, "loss/jsd": 0.0, "loss/logits": 0.4693567156791687, "step": 336 }, { "epoch": 0.021125, "grad_norm": 5.15625, "grad_norm_var": 0.3953125, "learning_rate": 0.0001, "loss": 11.0082, "loss/crossentropy": 2.3304390907287598, "loss/hidden": 4.40625, "loss/jsd": 0.0, "loss/logits": 0.4606510251760483, "step": 338 }, { "epoch": 0.02125, "grad_norm": 6.59375, "grad_norm_var": 0.4320597330729167, "learning_rate": 0.0001, "loss": 11.3795, "loss/crossentropy": 2.3963816165924072, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.4275331646203995, "step": 340 }, { "epoch": 0.021375, "grad_norm": 5.34375, "grad_norm_var": 0.53834228515625, "learning_rate": 0.0001, "loss": 11.5201, "loss/crossentropy": 2.6739399433135986, "loss/hidden": 4.5625, "loss/jsd": 0.0, "loss/logits": 0.5361433923244476, "step": 342 }, { "epoch": 0.0215, "grad_norm": 6.28125, "grad_norm_var": 0.532421875, "learning_rate": 0.0001, "loss": 11.2029, "loss/crossentropy": 2.246910810470581, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.40490400791168213, "step": 344 }, { "epoch": 0.021625, "grad_norm": 5.0625, "grad_norm_var": 0.5819010416666667, "learning_rate": 0.0001, "loss": 11.0779, "loss/crossentropy": 2.273064136505127, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.4033001661300659, "step": 346 }, { "epoch": 0.02175, "grad_norm": 6.1875, "grad_norm_var": 0.559375, "learning_rate": 0.0001, "loss": 11.326, "loss/crossentropy": 2.379375696182251, "loss/hidden": 4.34375, "loss/jsd": 0.0, "loss/logits": 0.4155968874692917, "step": 348 }, { "epoch": 0.021875, "grad_norm": 5.5, "grad_norm_var": 0.3761678059895833, "learning_rate": 0.0001, "loss": 11.0718, "loss/crossentropy": 2.4657520055770874, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4061667025089264, "step": 350 }, { "epoch": 0.022, "grad_norm": 5.8125, "grad_norm_var": 0.30662434895833335, "learning_rate": 0.0001, "loss": 11.0622, "loss/crossentropy": 2.480490565299988, "loss/hidden": 4.46875, "loss/jsd": 0.0, "loss/logits": 0.43048132956027985, "step": 352 }, { "epoch": 0.022125, "grad_norm": 5.96875, "grad_norm_var": 0.28765869140625, "learning_rate": 0.0001, "loss": 11.5931, "loss/crossentropy": 2.6550657749176025, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4658609926700592, "step": 354 }, { "epoch": 0.02225, "grad_norm": 5.5625, "grad_norm_var": 0.26099853515625, "learning_rate": 0.0001, "loss": 11.0385, "loss/crossentropy": 2.5135127305984497, "loss/hidden": 4.484375, "loss/jsd": 0.0, "loss/logits": 0.4206026792526245, "step": 356 }, { "epoch": 0.022375, "grad_norm": 5.28125, "grad_norm_var": 0.15416259765625, "learning_rate": 0.0001, "loss": 11.2004, "loss/crossentropy": 2.5652741193771362, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.41297411918640137, "step": 358 }, { "epoch": 0.0225, "grad_norm": 6.0, "grad_norm_var": 0.13004150390625, "learning_rate": 0.0001, "loss": 10.9409, "loss/crossentropy": 2.472624897956848, "loss/hidden": 4.390625, "loss/jsd": 0.0, "loss/logits": 0.4368878901004791, "step": 360 }, { "epoch": 0.022625, "grad_norm": 6.34375, "grad_norm_var": 0.12721354166666668, "learning_rate": 0.0001, "loss": 11.2239, "loss/crossentropy": 2.3381993770599365, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.40575116872787476, "step": 362 }, { "epoch": 0.02275, "grad_norm": 6.0, "grad_norm_var": 0.11730143229166666, "learning_rate": 0.0001, "loss": 11.1714, "loss/crossentropy": 2.4883733987808228, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.44851288199424744, "step": 364 }, { "epoch": 0.022875, "grad_norm": 5.4375, "grad_norm_var": 0.160791015625, "learning_rate": 0.0001, "loss": 11.059, "loss/crossentropy": 2.4785518646240234, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4333874583244324, "step": 366 }, { "epoch": 0.023, "grad_norm": 5.53125, "grad_norm_var": 0.16568603515625, "learning_rate": 0.0001, "loss": 10.9517, "loss/crossentropy": 2.678915023803711, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.41231468319892883, "step": 368 }, { "epoch": 0.023125, "grad_norm": 5.15625, "grad_norm_var": 0.22571207682291666, "learning_rate": 0.0001, "loss": 10.9966, "loss/crossentropy": 2.273219585418701, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.42474931478500366, "step": 370 }, { "epoch": 0.02325, "grad_norm": 5.875, "grad_norm_var": 0.21131184895833333, "learning_rate": 0.0001, "loss": 11.0027, "loss/crossentropy": 2.3425220251083374, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.42784735560417175, "step": 372 }, { "epoch": 0.023375, "grad_norm": 5.28125, "grad_norm_var": 0.26174723307291664, "learning_rate": 0.0001, "loss": 11.0737, "loss/crossentropy": 2.5677989721298218, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.42918097972869873, "step": 374 }, { "epoch": 0.0235, "grad_norm": 5.6875, "grad_norm_var": 0.25816650390625, "learning_rate": 0.0001, "loss": 11.175, "loss/crossentropy": 2.4179869890213013, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3684898316860199, "step": 376 }, { "epoch": 0.023625, "grad_norm": 5.375, "grad_norm_var": 0.20123291015625, "learning_rate": 0.0001, "loss": 10.9388, "loss/crossentropy": 2.52616810798645, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.43494799733161926, "step": 378 }, { "epoch": 0.02375, "grad_norm": 5.4375, "grad_norm_var": 0.18599853515625, "learning_rate": 0.0001, "loss": 10.9656, "loss/crossentropy": 2.2381917238235474, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4034838378429413, "step": 380 }, { "epoch": 0.023875, "grad_norm": 5.40625, "grad_norm_var": 0.16360677083333333, "learning_rate": 0.0001, "loss": 10.9349, "loss/crossentropy": 2.5972191095352173, "loss/hidden": 4.296875, "loss/jsd": 0.0, "loss/logits": 0.41289472579956055, "step": 382 }, { "epoch": 0.024, "grad_norm": 6.9375, "grad_norm_var": 0.2912109375, "learning_rate": 0.0001, "loss": 10.8274, "loss/crossentropy": 2.310404658317566, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.4219682365655899, "step": 384 }, { "epoch": 0.024125, "grad_norm": 5.03125, "grad_norm_var": 0.25735677083333336, "learning_rate": 0.0001, "loss": 10.7633, "loss/crossentropy": 2.5043996572494507, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3986252546310425, "step": 386 }, { "epoch": 0.02425, "grad_norm": 5.46875, "grad_norm_var": 0.2723307291666667, "learning_rate": 0.0001, "loss": 10.7861, "loss/crossentropy": 2.4293577671051025, "loss/hidden": 4.328125, "loss/jsd": 0.0, "loss/logits": 0.422202467918396, "step": 388 }, { "epoch": 0.024375, "grad_norm": 4.59375, "grad_norm_var": 0.29833577473958334, "learning_rate": 0.0001, "loss": 11.0955, "loss/crossentropy": 2.5617754459381104, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4159541577100754, "step": 390 }, { "epoch": 0.0245, "grad_norm": 6.375, "grad_norm_var": 0.3329264322916667, "learning_rate": 0.0001, "loss": 10.8782, "loss/crossentropy": 2.6902949810028076, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4204605668783188, "step": 392 }, { "epoch": 0.024625, "grad_norm": 4.75, "grad_norm_var": 0.3837890625, "learning_rate": 0.0001, "loss": 10.5797, "loss/crossentropy": 2.579855442047119, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.3951130211353302, "step": 394 }, { "epoch": 0.02475, "grad_norm": 5.28125, "grad_norm_var": 0.3661295572916667, "learning_rate": 0.0001, "loss": 10.9375, "loss/crossentropy": 2.575868844985962, "loss/hidden": 4.3125, "loss/jsd": 0.0, "loss/logits": 0.4067578613758087, "step": 396 }, { "epoch": 0.024875, "grad_norm": 5.53125, "grad_norm_var": 0.3630859375, "learning_rate": 0.0001, "loss": 11.1795, "loss/crossentropy": 2.3657878637313843, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4360276013612747, "step": 398 }, { "epoch": 0.025, "grad_norm": 5.0, "grad_norm_var": 0.2508748372395833, "learning_rate": 0.0001, "loss": 10.8242, "loss/crossentropy": 2.3945010900497437, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.4413905292749405, "step": 400 }, { "epoch": 0.025125, "grad_norm": 5.25, "grad_norm_var": 0.2595703125, "learning_rate": 0.0001, "loss": 10.5581, "loss/crossentropy": 2.1977953910827637, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3550948351621628, "step": 402 }, { "epoch": 0.02525, "grad_norm": 4.875, "grad_norm_var": 0.23834635416666666, "learning_rate": 0.0001, "loss": 10.6479, "loss/crossentropy": 2.3345136642456055, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.38985343277454376, "step": 404 }, { "epoch": 0.025375, "grad_norm": 5.46875, "grad_norm_var": 0.15650634765625, "learning_rate": 0.0001, "loss": 10.6649, "loss/crossentropy": 2.44900119304657, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.4164131134748459, "step": 406 }, { "epoch": 0.0255, "grad_norm": 4.75, "grad_norm_var": 0.083203125, "learning_rate": 0.0001, "loss": 10.6422, "loss/crossentropy": 2.4326157569885254, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.409572958946228, "step": 408 }, { "epoch": 0.025625, "grad_norm": 5.21875, "grad_norm_var": 0.06988525390625, "learning_rate": 0.0001, "loss": 10.6566, "loss/crossentropy": 2.4960622787475586, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.41588515043258667, "step": 410 }, { "epoch": 0.02575, "grad_norm": 5.3125, "grad_norm_var": 0.06795247395833333, "learning_rate": 0.0001, "loss": 10.7553, "loss/crossentropy": 2.590168833732605, "loss/hidden": 4.265625, "loss/jsd": 0.0, "loss/logits": 0.4012999087572098, "step": 412 }, { "epoch": 0.025875, "grad_norm": 5.65625, "grad_norm_var": 0.6649739583333333, "learning_rate": 0.0001, "loss": 11.135, "loss/crossentropy": 2.545465111732483, "loss/hidden": 4.375, "loss/jsd": 0.0, "loss/logits": 0.4800722002983093, "step": 414 }, { "epoch": 0.026, "grad_norm": 5.21875, "grad_norm_var": 0.6665974934895833, "learning_rate": 0.0001, "loss": 10.9887, "loss/crossentropy": 2.4133318662643433, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.4137228727340698, "step": 416 }, { "epoch": 0.026125, "grad_norm": 5.09375, "grad_norm_var": 0.6603474934895833, "learning_rate": 0.0001, "loss": 10.5496, "loss/crossentropy": 2.1947737336158752, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.4033205509185791, "step": 418 }, { "epoch": 0.02625, "grad_norm": 5.375, "grad_norm_var": 0.6346354166666667, "learning_rate": 0.0001, "loss": 10.7508, "loss/crossentropy": 2.1539926528930664, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.40337586402893066, "step": 420 }, { "epoch": 0.026375, "grad_norm": 6.75, "grad_norm_var": 0.7378743489583334, "learning_rate": 0.0001, "loss": 11.0461, "loss/crossentropy": 2.524567127227783, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4650000333786011, "step": 422 }, { "epoch": 0.0265, "grad_norm": 5.3125, "grad_norm_var": 0.6646443684895833, "learning_rate": 0.0001, "loss": 11.0872, "loss/crossentropy": 2.333670735359192, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.4344635456800461, "step": 424 }, { "epoch": 0.026625, "grad_norm": 5.3125, "grad_norm_var": 0.6597005208333333, "learning_rate": 0.0001, "loss": 10.8727, "loss/crossentropy": 2.4718040227890015, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.42122460901737213, "step": 426 }, { "epoch": 0.02675, "grad_norm": 4.75, "grad_norm_var": 0.67672119140625, "learning_rate": 0.0001, "loss": 10.7434, "loss/crossentropy": 2.60198974609375, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.41160446405410767, "step": 428 }, { "epoch": 0.026875, "grad_norm": 5.0625, "grad_norm_var": 0.20933837890625, "learning_rate": 0.0001, "loss": 10.79, "loss/crossentropy": 2.614295244216919, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4241054952144623, "step": 430 }, { "epoch": 0.027, "grad_norm": 5.65625, "grad_norm_var": 0.202587890625, "learning_rate": 0.0001, "loss": 10.7823, "loss/crossentropy": 2.578159213066101, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.40188899636268616, "step": 432 }, { "epoch": 0.027125, "grad_norm": 5.09375, "grad_norm_var": 0.2130859375, "learning_rate": 0.0001, "loss": 10.5528, "loss/crossentropy": 2.068563759326935, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.4180772006511688, "step": 434 }, { "epoch": 0.02725, "grad_norm": 6.0, "grad_norm_var": 0.23834228515625, "learning_rate": 0.0001, "loss": 10.6453, "loss/crossentropy": 2.596070408821106, "loss/hidden": 4.25, "loss/jsd": 0.0, "loss/logits": 0.4149198830127716, "step": 436 }, { "epoch": 0.027375, "grad_norm": 4.75, "grad_norm_var": 0.151025390625, "learning_rate": 0.0001, "loss": 10.5426, "loss/crossentropy": 2.305434226989746, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.384622722864151, "step": 438 }, { "epoch": 0.0275, "grad_norm": 4.96875, "grad_norm_var": 0.1396484375, "learning_rate": 0.0001, "loss": 10.7981, "loss/crossentropy": 2.502975344657898, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.37728238105773926, "step": 440 }, { "epoch": 0.027625, "grad_norm": 5.25, "grad_norm_var": 0.142822265625, "learning_rate": 0.0001, "loss": 10.4751, "loss/crossentropy": 2.305208921432495, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.38094912469387054, "step": 442 }, { "epoch": 0.02775, "grad_norm": 5.625, "grad_norm_var": 0.13279622395833332, "learning_rate": 0.0001, "loss": 10.7068, "loss/crossentropy": 2.6903436183929443, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3932228535413742, "step": 444 }, { "epoch": 0.027875, "grad_norm": 6.53125, "grad_norm_var": 0.229541015625, "learning_rate": 0.0001, "loss": 10.9159, "loss/crossentropy": 2.724982738494873, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.48244524002075195, "step": 446 }, { "epoch": 0.028, "grad_norm": 4.71875, "grad_norm_var": 0.26022135416666664, "learning_rate": 0.0001, "loss": 10.5689, "loss/crossentropy": 2.1659945249557495, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.38983820378780365, "step": 448 }, { "epoch": 0.028125, "grad_norm": 6.09375, "grad_norm_var": 0.3133748372395833, "learning_rate": 0.0001, "loss": 10.4284, "loss/crossentropy": 2.4513895511627197, "loss/hidden": 4.28125, "loss/jsd": 0.0, "loss/logits": 0.3891524076461792, "step": 450 }, { "epoch": 0.02825, "grad_norm": 5.28125, "grad_norm_var": 0.28541666666666665, "learning_rate": 0.0001, "loss": 10.6392, "loss/crossentropy": 2.3762803077697754, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.4033055752515793, "step": 452 }, { "epoch": 0.028375, "grad_norm": 6.625, "grad_norm_var": 0.87955322265625, "learning_rate": 0.0001, "loss": 10.9979, "loss/crossentropy": 2.5074435472488403, "loss/hidden": 4.234375, "loss/jsd": 0.0, "loss/logits": 0.4134407788515091, "step": 454 }, { "epoch": 0.0285, "grad_norm": 5.3125, "grad_norm_var": 0.85484619140625, "learning_rate": 0.0001, "loss": 10.7103, "loss/crossentropy": 2.4922858476638794, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.40840162336826324, "step": 456 }, { "epoch": 0.028625, "grad_norm": 4.96875, "grad_norm_var": 0.9095662434895834, "learning_rate": 0.0001, "loss": 10.4182, "loss/crossentropy": 2.332331657409668, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.37381240725517273, "step": 458 }, { "epoch": 0.02875, "grad_norm": 4.8125, "grad_norm_var": 0.9465983072916667, "learning_rate": 0.0001, "loss": 10.5666, "loss/crossentropy": 2.6399567127227783, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.38797518610954285, "step": 460 }, { "epoch": 0.028875, "grad_norm": 5.3125, "grad_norm_var": 0.889697265625, "learning_rate": 0.0001, "loss": 10.5769, "loss/crossentropy": 2.302717089653015, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3878500908613205, "step": 462 }, { "epoch": 0.029, "grad_norm": 4.96875, "grad_norm_var": 0.8904256184895833, "learning_rate": 0.0001, "loss": 10.4066, "loss/crossentropy": 2.3778563737869263, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.38498103618621826, "step": 464 }, { "epoch": 0.029125, "grad_norm": 4.78125, "grad_norm_var": 0.854931640625, "learning_rate": 0.0001, "loss": 10.6144, "loss/crossentropy": 2.0649060010910034, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3695131242275238, "step": 466 }, { "epoch": 0.02925, "grad_norm": 4.90625, "grad_norm_var": 0.8793619791666667, "learning_rate": 0.0001, "loss": 10.7395, "loss/crossentropy": 2.8441067934036255, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3995439410209656, "step": 468 }, { "epoch": 0.029375, "grad_norm": 6.46875, "grad_norm_var": 0.209765625, "learning_rate": 0.0001, "loss": 10.6913, "loss/crossentropy": 2.557037353515625, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.410599485039711, "step": 470 }, { "epoch": 0.0295, "grad_norm": 5.375, "grad_norm_var": 0.223828125, "learning_rate": 0.0001, "loss": 10.6722, "loss/crossentropy": 2.367073655128479, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3934127390384674, "step": 472 }, { "epoch": 0.029625, "grad_norm": 5.03125, "grad_norm_var": 0.21222330729166666, "learning_rate": 0.0001, "loss": 10.6489, "loss/crossentropy": 2.4897983074188232, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.42430558800697327, "step": 474 }, { "epoch": 0.02975, "grad_norm": 5.5, "grad_norm_var": 0.20976155598958332, "learning_rate": 0.0001, "loss": 10.6471, "loss/crossentropy": 2.4441006183624268, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.4049318730831146, "step": 476 }, { "epoch": 0.029875, "grad_norm": 4.46875, "grad_norm_var": 0.24075520833333333, "learning_rate": 0.0001, "loss": 10.9236, "loss/crossentropy": 2.4001717567443848, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.3930549919605255, "step": 478 }, { "epoch": 0.03, "grad_norm": 4.9375, "grad_norm_var": 0.22499593098958334, "learning_rate": 0.0001, "loss": 10.4589, "loss/crossentropy": 2.5291205644607544, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.43080681562423706, "step": 480 }, { "epoch": 0.030125, "grad_norm": 6.40625, "grad_norm_var": 0.30774332682291666, "learning_rate": 0.0001, "loss": 10.4734, "loss/crossentropy": 2.388319194316864, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.3897434026002884, "step": 482 }, { "epoch": 0.03025, "grad_norm": 5.5625, "grad_norm_var": 0.29670817057291665, "learning_rate": 0.0001, "loss": 10.4658, "loss/crossentropy": 2.2806296348571777, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.3782193958759308, "step": 484 }, { "epoch": 0.030375, "grad_norm": 7.34375, "grad_norm_var": 1.065625, "learning_rate": 0.0001, "loss": 10.5857, "loss/crossentropy": 2.3721553087234497, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.4088495075702667, "step": 486 }, { "epoch": 0.0305, "grad_norm": 4.59375, "grad_norm_var": 1.071728515625, "learning_rate": 0.0001, "loss": 10.3529, "loss/crossentropy": 2.214341640472412, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.37406377494335175, "step": 488 }, { "epoch": 0.030625, "grad_norm": 5.0, "grad_norm_var": 1.05445556640625, "learning_rate": 0.0001, "loss": 10.5074, "loss/crossentropy": 2.5398401021957397, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4116057902574539, "step": 490 }, { "epoch": 0.03075, "grad_norm": 5.28125, "grad_norm_var": 1.0507649739583333, "learning_rate": 0.0001, "loss": 10.5333, "loss/crossentropy": 2.6637160778045654, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.40726278722286224, "step": 492 }, { "epoch": 0.030875, "grad_norm": 4.625, "grad_norm_var": 1.0790974934895834, "learning_rate": 0.0001, "loss": 10.3532, "loss/crossentropy": 2.311811923980713, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3635952025651932, "step": 494 }, { "epoch": 0.031, "grad_norm": 5.4375, "grad_norm_var": 1.1916300455729167, "learning_rate": 0.0001, "loss": 10.7405, "loss/crossentropy": 2.5883506536483765, "loss/hidden": 4.1875, "loss/jsd": 0.0, "loss/logits": 0.4075654149055481, "step": 496 }, { "epoch": 0.031125, "grad_norm": 7.46875, "grad_norm_var": 1.3474894205729167, "learning_rate": 0.0001, "loss": 10.7376, "loss/crossentropy": 2.5056556463241577, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.4202713221311569, "step": 498 }, { "epoch": 0.03125, "grad_norm": 5.53125, "grad_norm_var": 1.3410807291666667, "learning_rate": 0.0001, "loss": 10.6792, "loss/crossentropy": 2.5138707160949707, "loss/hidden": 4.21875, "loss/jsd": 0.0, "loss/logits": 0.44250351190567017, "step": 500 }, { "epoch": 0.031375, "grad_norm": 4.84375, "grad_norm_var": 0.64498291015625, "learning_rate": 0.0001, "loss": 10.5416, "loss/crossentropy": 2.6356310844421387, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.40963026881217957, "step": 502 }, { "epoch": 0.0315, "grad_norm": 4.40625, "grad_norm_var": 0.8839152018229167, "learning_rate": 0.0001, "loss": 10.5626, "loss/crossentropy": 2.4822793006896973, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.3620312958955765, "step": 504 }, { "epoch": 0.031625, "grad_norm": 5.40625, "grad_norm_var": 0.86373291015625, "learning_rate": 0.0001, "loss": 10.5343, "loss/crossentropy": 2.581249713897705, "loss/hidden": 4.171875, "loss/jsd": 0.0, "loss/logits": 0.39074860513210297, "step": 506 }, { "epoch": 0.03175, "grad_norm": 4.65625, "grad_norm_var": 0.991259765625, "learning_rate": 0.0001, "loss": 10.3297, "loss/crossentropy": 2.4520576000213623, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.38373272120952606, "step": 508 }, { "epoch": 0.031875, "grad_norm": 5.03125, "grad_norm_var": 0.913134765625, "learning_rate": 0.0001, "loss": 10.6075, "loss/crossentropy": 2.5660005807876587, "loss/hidden": 4.203125, "loss/jsd": 0.0, "loss/logits": 0.44344601035118103, "step": 510 }, { "epoch": 0.032, "grad_norm": 4.90625, "grad_norm_var": 0.7636027018229167, "learning_rate": 0.0001, "loss": 10.4046, "loss/crossentropy": 2.527319550514221, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.37920865416526794, "step": 512 }, { "epoch": 0.032125, "grad_norm": 4.5, "grad_norm_var": 0.46835530598958336, "learning_rate": 0.0001, "loss": 10.0486, "loss/crossentropy": 2.509230613708496, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3975517302751541, "step": 514 }, { "epoch": 0.03225, "grad_norm": 4.78125, "grad_norm_var": 0.45974934895833336, "learning_rate": 0.0001, "loss": 10.3748, "loss/crossentropy": 2.296907901763916, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.37533947825431824, "step": 516 }, { "epoch": 0.032375, "grad_norm": 5.21875, "grad_norm_var": 0.45927327473958335, "learning_rate": 0.0001, "loss": 10.661, "loss/crossentropy": 2.587345600128174, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.44121941924095154, "step": 518 }, { "epoch": 0.0325, "grad_norm": 5.25, "grad_norm_var": 0.121337890625, "learning_rate": 0.0001, "loss": 10.3292, "loss/crossentropy": 2.4897454977035522, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.37143947184085846, "step": 520 }, { "epoch": 0.032625, "grad_norm": 5.3125, "grad_norm_var": 0.121337890625, "learning_rate": 0.0001, "loss": 10.5275, "loss/crossentropy": 2.3746429681777954, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.36811254918575287, "step": 522 }, { "epoch": 0.03275, "grad_norm": 4.90625, "grad_norm_var": 0.09269205729166667, "learning_rate": 0.0001, "loss": 10.4857, "loss/crossentropy": 2.516595959663391, "loss/hidden": 4.15625, "loss/jsd": 0.0, "loss/logits": 0.3954416811466217, "step": 524 }, { "epoch": 0.032875, "grad_norm": 4.59375, "grad_norm_var": 0.12864176432291666, "learning_rate": 0.0001, "loss": 10.4063, "loss/crossentropy": 2.1858848333358765, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.3671702444553375, "step": 526 }, { "epoch": 0.033, "grad_norm": 4.71875, "grad_norm_var": 0.12877604166666667, "learning_rate": 0.0001, "loss": 10.4706, "loss/crossentropy": 2.3672547340393066, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.42084239423274994, "step": 528 }, { "epoch": 0.033125, "grad_norm": 5.375, "grad_norm_var": 0.15623372395833332, "learning_rate": 0.0001, "loss": 10.5682, "loss/crossentropy": 2.4755560159683228, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.38027940690517426, "step": 530 }, { "epoch": 0.03325, "grad_norm": 4.84375, "grad_norm_var": 0.16353759765625, "learning_rate": 0.0001, "loss": 10.3109, "loss/crossentropy": 2.4052809476852417, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.37095922231674194, "step": 532 }, { "epoch": 0.033375, "grad_norm": 5.5625, "grad_norm_var": 0.185546875, "learning_rate": 0.0001, "loss": 10.6675, "loss/crossentropy": 2.7321990728378296, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.40198850631713867, "step": 534 }, { "epoch": 0.0335, "grad_norm": 4.5, "grad_norm_var": 0.24308268229166666, "learning_rate": 0.0001, "loss": 10.4618, "loss/crossentropy": 2.609284520149231, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3769884407520294, "step": 536 }, { "epoch": 0.033625, "grad_norm": 4.625, "grad_norm_var": 0.229931640625, "learning_rate": 0.0001, "loss": 10.2295, "loss/crossentropy": 2.4272639751434326, "loss/hidden": 4.0390625, "loss/jsd": 0.0, "loss/logits": 0.38766802847385406, "step": 538 }, { "epoch": 0.03375, "grad_norm": 4.90625, "grad_norm_var": 0.22877604166666668, "learning_rate": 0.0001, "loss": 10.3895, "loss/crossentropy": 2.3962435722351074, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.3932725340127945, "step": 540 }, { "epoch": 0.033875, "grad_norm": 5.21875, "grad_norm_var": 0.20539957682291668, "learning_rate": 0.0001, "loss": 10.1794, "loss/crossentropy": 2.4086058139801025, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.37670063972473145, "step": 542 }, { "epoch": 0.034, "grad_norm": 4.90625, "grad_norm_var": 0.21051025390625, "learning_rate": 0.0001, "loss": 10.5333, "loss/crossentropy": 2.406098246574402, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.3975420147180557, "step": 544 }, { "epoch": 0.034125, "grad_norm": 4.46875, "grad_norm_var": 0.19163004557291666, "learning_rate": 0.0001, "loss": 10.4258, "loss/crossentropy": 2.6447004079818726, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.40193626284599304, "step": 546 }, { "epoch": 0.03425, "grad_norm": 5.09375, "grad_norm_var": 0.17701416015625, "learning_rate": 0.0001, "loss": 10.5984, "loss/crossentropy": 2.570632576942444, "loss/hidden": 4.078125, "loss/jsd": 0.0, "loss/logits": 0.4150645583868027, "step": 548 }, { "epoch": 0.034375, "grad_norm": 5.1875, "grad_norm_var": 0.15758056640625, "learning_rate": 0.0001, "loss": 10.1492, "loss/crossentropy": 2.250994086265564, "loss/hidden": 4.0078125, "loss/jsd": 0.0, "loss/logits": 0.37618446350097656, "step": 550 }, { "epoch": 0.0345, "grad_norm": 4.59375, "grad_norm_var": 0.09928385416666667, "learning_rate": 0.0001, "loss": 10.2205, "loss/crossentropy": 2.2985726594924927, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.38329558074474335, "step": 552 }, { "epoch": 0.034625, "grad_norm": 5.03125, "grad_norm_var": 0.084765625, "learning_rate": 0.0001, "loss": 10.2594, "loss/crossentropy": 2.409608244895935, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3573233038187027, "step": 554 }, { "epoch": 0.03475, "grad_norm": 5.0, "grad_norm_var": 0.08723958333333333, "learning_rate": 0.0001, "loss": 10.4628, "loss/crossentropy": 2.2365437746047974, "loss/hidden": 4.03125, "loss/jsd": 0.0, "loss/logits": 0.36603498458862305, "step": 556 }, { "epoch": 0.034875, "grad_norm": 5.28125, "grad_norm_var": 0.107275390625, "learning_rate": 0.0001, "loss": 10.4873, "loss/crossentropy": 2.30200457572937, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.38180042803287506, "step": 558 }, { "epoch": 0.035, "grad_norm": 4.78125, "grad_norm_var": 0.156494140625, "learning_rate": 0.0001, "loss": 10.5403, "loss/crossentropy": 2.4553037881851196, "loss/hidden": 4.109375, "loss/jsd": 0.0, "loss/logits": 0.3985973298549652, "step": 560 }, { "epoch": 0.035125, "grad_norm": 4.875, "grad_norm_var": 0.15087483723958334, "learning_rate": 0.0001, "loss": 10.4663, "loss/crossentropy": 2.484908103942871, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.39745810627937317, "step": 562 }, { "epoch": 0.03525, "grad_norm": 4.65625, "grad_norm_var": 0.15787760416666666, "learning_rate": 0.0001, "loss": 10.2568, "loss/crossentropy": 2.386349678039551, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3671696186065674, "step": 564 }, { "epoch": 0.035375, "grad_norm": 4.5, "grad_norm_var": 0.15891520182291666, "learning_rate": 0.0001, "loss": 10.2005, "loss/crossentropy": 2.796995162963867, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3951940983533859, "step": 566 }, { "epoch": 0.0355, "grad_norm": 4.625, "grad_norm_var": 0.14308268229166668, "learning_rate": 0.0001, "loss": 10.3384, "loss/crossentropy": 2.167185425758362, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.3679066449403763, "step": 568 }, { "epoch": 0.035625, "grad_norm": 4.3125, "grad_norm_var": 0.16002197265625, "learning_rate": 0.0001, "loss": 10.1328, "loss/crossentropy": 2.331982374191284, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.35966630280017853, "step": 570 }, { "epoch": 0.03575, "grad_norm": 4.9375, "grad_norm_var": 0.16490885416666667, "learning_rate": 0.0001, "loss": 10.2677, "loss/crossentropy": 2.545064091682434, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3585609495639801, "step": 572 }, { "epoch": 0.035875, "grad_norm": 4.5, "grad_norm_var": 0.13590087890625, "learning_rate": 0.0001, "loss": 10.1833, "loss/crossentropy": 2.673481822013855, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.3861938863992691, "step": 574 }, { "epoch": 0.036, "grad_norm": 4.40625, "grad_norm_var": 0.06877848307291666, "learning_rate": 0.0001, "loss": 9.8329, "loss/crossentropy": 2.2054319381713867, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.34671929478645325, "step": 576 }, { "epoch": 0.036125, "grad_norm": 4.40625, "grad_norm_var": 0.04698893229166667, "learning_rate": 0.0001, "loss": 9.8525, "loss/crossentropy": 2.3083781003952026, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3572017103433609, "step": 578 }, { "epoch": 0.03625, "grad_norm": 4.84375, "grad_norm_var": 0.12235921223958333, "learning_rate": 0.0001, "loss": 10.1868, "loss/crossentropy": 2.5364460945129395, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.35016657412052155, "step": 580 }, { "epoch": 0.036375, "grad_norm": 4.6875, "grad_norm_var": 0.11933186848958334, "learning_rate": 0.0001, "loss": 10.1112, "loss/crossentropy": 2.1111658811569214, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.35201945900917053, "step": 582 }, { "epoch": 0.0365, "grad_norm": 4.28125, "grad_norm_var": 0.13157145182291666, "learning_rate": 0.0001, "loss": 10.0743, "loss/crossentropy": 2.2730716466903687, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.3673980087041855, "step": 584 }, { "epoch": 0.036625, "grad_norm": 4.4375, "grad_norm_var": 0.11443684895833334, "learning_rate": 0.0001, "loss": 10.1844, "loss/crossentropy": 2.423087477684021, "loss/hidden": 4.0625, "loss/jsd": 0.0, "loss/logits": 0.3773965388536453, "step": 586 }, { "epoch": 0.03675, "grad_norm": 5.09375, "grad_norm_var": 0.11886393229166667, "learning_rate": 0.0001, "loss": 9.9989, "loss/crossentropy": 2.429610013961792, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.37359054386615753, "step": 588 }, { "epoch": 0.036875, "grad_norm": 4.34375, "grad_norm_var": 0.15779622395833334, "learning_rate": 0.0001, "loss": 10.2243, "loss/crossentropy": 2.705647587776184, "loss/hidden": 4.140625, "loss/jsd": 0.0, "loss/logits": 0.418183371424675, "step": 590 }, { "epoch": 0.037, "grad_norm": 4.375, "grad_norm_var": 0.16418863932291666, "learning_rate": 0.0001, "loss": 10.3141, "loss/crossentropy": 2.5757263898849487, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.38670916855335236, "step": 592 }, { "epoch": 0.037125, "grad_norm": 4.0, "grad_norm_var": 0.20058186848958334, "learning_rate": 0.0001, "loss": 9.958, "loss/crossentropy": 2.660887598991394, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3619972914457321, "step": 594 }, { "epoch": 0.03725, "grad_norm": 4.34375, "grad_norm_var": 0.11832275390625, "learning_rate": 0.0001, "loss": 10.0076, "loss/crossentropy": 2.2089375257492065, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.37448957562446594, "step": 596 }, { "epoch": 0.037375, "grad_norm": 4.9375, "grad_norm_var": 0.1216796875, "learning_rate": 0.0001, "loss": 10.307, "loss/crossentropy": 2.52751886844635, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.3705534487962723, "step": 598 }, { "epoch": 0.0375, "grad_norm": 5.25, "grad_norm_var": 0.14667561848958333, "learning_rate": 0.0001, "loss": 9.986, "loss/crossentropy": 1.9802033305168152, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.34747298061847687, "step": 600 }, { "epoch": 0.037625, "grad_norm": 4.90625, "grad_norm_var": 0.16565348307291666, "learning_rate": 0.0001, "loss": 9.8585, "loss/crossentropy": 2.3861488103866577, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3828616738319397, "step": 602 }, { "epoch": 0.03775, "grad_norm": 17.125, "grad_norm_var": 9.949593098958333, "learning_rate": 0.0001, "loss": 10.9595, "loss/crossentropy": 2.5604605674743652, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.3666844367980957, "step": 604 }, { "epoch": 0.037875, "grad_norm": 5.15625, "grad_norm_var": 9.885091145833334, "learning_rate": 0.0001, "loss": 9.9475, "loss/crossentropy": 2.2230526208877563, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.3571869730949402, "step": 606 }, { "epoch": 0.038, "grad_norm": 4.53125, "grad_norm_var": 9.79449462890625, "learning_rate": 0.0001, "loss": 10.3505, "loss/crossentropy": 2.500959277153015, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.37224216759204865, "step": 608 }, { "epoch": 0.038125, "grad_norm": 4.875, "grad_norm_var": 9.682796223958333, "learning_rate": 0.0001, "loss": 9.9394, "loss/crossentropy": 2.41185462474823, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3980567008256912, "step": 610 }, { "epoch": 0.03825, "grad_norm": 4.65625, "grad_norm_var": 9.60631103515625, "learning_rate": 0.0001, "loss": 10.2283, "loss/crossentropy": 1.9943309426307678, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.3476145565509796, "step": 612 }, { "epoch": 0.038375, "grad_norm": 4.9375, "grad_norm_var": 9.56656494140625, "learning_rate": 0.0001, "loss": 10.4794, "loss/crossentropy": 2.747882127761841, "loss/hidden": 4.046875, "loss/jsd": 0.0, "loss/logits": 0.4243037551641464, "step": 614 }, { "epoch": 0.0385, "grad_norm": 5.4375, "grad_norm_var": 9.51558837890625, "learning_rate": 0.0001, "loss": 10.3842, "loss/crossentropy": 2.56972599029541, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.4048524647951126, "step": 616 }, { "epoch": 0.038625, "grad_norm": 4.40625, "grad_norm_var": 9.47301025390625, "learning_rate": 0.0001, "loss": 9.9875, "loss/crossentropy": 2.5478017330169678, "loss/hidden": 3.9375, "loss/jsd": 0.0, "loss/logits": 0.36699262261390686, "step": 618 }, { "epoch": 0.03875, "grad_norm": 4.3125, "grad_norm_var": 0.20136311848958333, "learning_rate": 0.0001, "loss": 9.9678, "loss/crossentropy": 2.3284069299697876, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3594726324081421, "step": 620 }, { "epoch": 0.038875, "grad_norm": 4.5625, "grad_norm_var": 0.15087483723958334, "learning_rate": 0.0001, "loss": 10.0477, "loss/crossentropy": 2.486843466758728, "loss/hidden": 3.9921875, "loss/jsd": 0.0, "loss/logits": 0.3659539967775345, "step": 622 }, { "epoch": 0.039, "grad_norm": 5.03125, "grad_norm_var": 0.15201416015625, "learning_rate": 0.0001, "loss": 10.1201, "loss/crossentropy": 2.581569790840149, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.3597100079059601, "step": 624 }, { "epoch": 0.039125, "grad_norm": 4.40625, "grad_norm_var": 0.13865559895833332, "learning_rate": 0.0001, "loss": 10.007, "loss/crossentropy": 2.284889340400696, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.3608839064836502, "step": 626 }, { "epoch": 0.03925, "grad_norm": 4.71875, "grad_norm_var": 0.13800455729166666, "learning_rate": 0.0001, "loss": 10.2784, "loss/crossentropy": 2.603570342063904, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3870154470205307, "step": 628 }, { "epoch": 0.039375, "grad_norm": 4.15625, "grad_norm_var": 0.14263916015625, "learning_rate": 0.0001, "loss": 9.9924, "loss/crossentropy": 2.538639187812805, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.36513669788837433, "step": 630 }, { "epoch": 0.0395, "grad_norm": 4.3125, "grad_norm_var": 0.07198893229166667, "learning_rate": 0.0001, "loss": 9.8024, "loss/crossentropy": 2.3725547790527344, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.3675404489040375, "step": 632 }, { "epoch": 0.039625, "grad_norm": 4.9375, "grad_norm_var": 0.09101155598958334, "learning_rate": 0.0001, "loss": 10.1525, "loss/crossentropy": 2.455158233642578, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.39550477266311646, "step": 634 }, { "epoch": 0.03975, "grad_norm": 4.8125, "grad_norm_var": 0.08709309895833334, "learning_rate": 0.0001, "loss": 9.9672, "loss/crossentropy": 2.4096927642822266, "loss/hidden": 4.0, "loss/jsd": 0.0, "loss/logits": 0.37770088016986847, "step": 636 }, { "epoch": 0.039875, "grad_norm": 4.1875, "grad_norm_var": 0.09761962890625, "learning_rate": 0.0001, "loss": 10.0311, "loss/crossentropy": 2.3568464517593384, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.3562964200973511, "step": 638 }, { "epoch": 0.04, "grad_norm": 4.34375, "grad_norm_var": 0.09308268229166666, "learning_rate": 0.0001, "loss": 9.9439, "loss/crossentropy": 2.312902569770813, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.38113781809806824, "step": 640 }, { "epoch": 0.040125, "grad_norm": 4.34375, "grad_norm_var": 0.09225260416666667, "learning_rate": 0.0001, "loss": 9.8767, "loss/crossentropy": 2.607424736022949, "loss/hidden": 3.9453125, "loss/jsd": 0.0, "loss/logits": 0.37369397282600403, "step": 642 }, { "epoch": 0.04025, "grad_norm": 6.84375, "grad_norm_var": 0.45631103515625, "learning_rate": 0.0001, "loss": 10.4975, "loss/crossentropy": 2.197988510131836, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3591903746128082, "step": 644 }, { "epoch": 0.040375, "grad_norm": 4.9375, "grad_norm_var": 0.45435791015625, "learning_rate": 0.0001, "loss": 10.0203, "loss/crossentropy": 2.617353677749634, "loss/hidden": 3.96875, "loss/jsd": 0.0, "loss/logits": 0.38244011998176575, "step": 646 }, { "epoch": 0.0405, "grad_norm": 5.25, "grad_norm_var": 0.43922119140625, "learning_rate": 0.0001, "loss": 9.9007, "loss/crossentropy": 2.3076788187026978, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3668941855430603, "step": 648 }, { "epoch": 0.040625, "grad_norm": 4.59375, "grad_norm_var": 0.437744140625, "learning_rate": 0.0001, "loss": 10.1593, "loss/crossentropy": 2.3939337730407715, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.3384999781847, "step": 650 }, { "epoch": 0.04075, "grad_norm": 4.25, "grad_norm_var": 0.45924072265625, "learning_rate": 0.0001, "loss": 9.905, "loss/crossentropy": 2.3927820920944214, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.35453103482723236, "step": 652 }, { "epoch": 0.040875, "grad_norm": 4.25, "grad_norm_var": 0.4364217122395833, "learning_rate": 0.0001, "loss": 10.0505, "loss/crossentropy": 2.327817440032959, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.35317686200141907, "step": 654 }, { "epoch": 0.041, "grad_norm": 4.28125, "grad_norm_var": 0.4278483072916667, "learning_rate": 0.0001, "loss": 10.0239, "loss/crossentropy": 2.6120766401290894, "loss/hidden": 3.8984375, "loss/jsd": 0.0, "loss/logits": 0.3679347038269043, "step": 656 }, { "epoch": 0.041125, "grad_norm": 5.15625, "grad_norm_var": 0.41848958333333336, "learning_rate": 0.0001, "loss": 10.1713, "loss/crossentropy": 2.545218348503113, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.37261858582496643, "step": 658 }, { "epoch": 0.04125, "grad_norm": 5.8125, "grad_norm_var": 0.5816243489583334, "learning_rate": 0.0001, "loss": 10.3016, "loss/crossentropy": 2.606196641921997, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.39967936277389526, "step": 660 }, { "epoch": 0.041375, "grad_norm": 4.15625, "grad_norm_var": 0.5972615559895833, "learning_rate": 0.0001, "loss": 9.9053, "loss/crossentropy": 2.30068039894104, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3504374176263809, "step": 662 }, { "epoch": 0.0415, "grad_norm": 7.1875, "grad_norm_var": 0.9585774739583334, "learning_rate": 0.0001, "loss": 10.2778, "loss/crossentropy": 2.5713064670562744, "loss/hidden": 4.09375, "loss/jsd": 0.0, "loss/logits": 0.43116797506809235, "step": 664 }, { "epoch": 0.041625, "grad_norm": 5.6875, "grad_norm_var": 0.9890462239583333, "learning_rate": 0.0001, "loss": 10.1834, "loss/crossentropy": 2.442265272140503, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.36182868480682373, "step": 666 }, { "epoch": 0.04175, "grad_norm": 4.375, "grad_norm_var": 0.9586222330729167, "learning_rate": 0.0001, "loss": 10.064, "loss/crossentropy": 2.4865576028823853, "loss/hidden": 3.953125, "loss/jsd": 0.0, "loss/logits": 0.37539851665496826, "step": 668 }, { "epoch": 0.041875, "grad_norm": 4.375, "grad_norm_var": 0.97222900390625, "learning_rate": 0.0001, "loss": 10.1095, "loss/crossentropy": 2.382893681526184, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3759250193834305, "step": 670 }, { "epoch": 0.042, "grad_norm": 4.46875, "grad_norm_var": 0.96640625, "learning_rate": 0.0001, "loss": 10.0076, "loss/crossentropy": 2.407153367996216, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3480299711227417, "step": 672 }, { "epoch": 0.042125, "grad_norm": 4.40625, "grad_norm_var": 1.0254557291666666, "learning_rate": 0.0001, "loss": 10.209, "loss/crossentropy": 2.239955425262451, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.38244734704494476, "step": 674 }, { "epoch": 0.04225, "grad_norm": 5.75, "grad_norm_var": 0.6648722330729167, "learning_rate": 0.0001, "loss": 10.3096, "loss/crossentropy": 2.3780544996261597, "loss/hidden": 4.125, "loss/jsd": 0.0, "loss/logits": 0.40292245149612427, "step": 676 }, { "epoch": 0.042375, "grad_norm": 4.375, "grad_norm_var": 0.6613240559895833, "learning_rate": 0.0001, "loss": 9.7608, "loss/crossentropy": 2.2625954151153564, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3428070992231369, "step": 678 }, { "epoch": 0.0425, "grad_norm": 5.03125, "grad_norm_var": 0.2589803059895833, "learning_rate": 0.0001, "loss": 9.8559, "loss/crossentropy": 2.404393434524536, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.35663238167762756, "step": 680 }, { "epoch": 0.042625, "grad_norm": 4.625, "grad_norm_var": 0.146728515625, "learning_rate": 0.0001, "loss": 10.0169, "loss/crossentropy": 2.354637026786804, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.38327428698539734, "step": 682 }, { "epoch": 0.04275, "grad_norm": 5.5, "grad_norm_var": 0.20545247395833333, "learning_rate": 0.0001, "loss": 10.3256, "loss/crossentropy": 2.3648595809936523, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3722570538520813, "step": 684 }, { "epoch": 0.042875, "grad_norm": 11.625, "grad_norm_var": 3.2235677083333334, "learning_rate": 0.0001, "loss": 10.0052, "loss/crossentropy": 2.255233407020569, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.3729362040758133, "step": 686 }, { "epoch": 0.043, "grad_norm": 6.75, "grad_norm_var": 3.73131103515625, "learning_rate": 0.0001, "loss": 10.9058, "loss/crossentropy": 2.6284834146499634, "loss/hidden": 3.921875, "loss/jsd": 0.0, "loss/logits": 0.40316125750541687, "step": 688 }, { "epoch": 0.043125, "grad_norm": 4.84375, "grad_norm_var": 3.603706868489583, "learning_rate": 0.0001, "loss": 10.2674, "loss/crossentropy": 2.111253321170807, "loss/hidden": 3.828125, "loss/jsd": 0.0, "loss/logits": 0.33769945800304413, "step": 690 }, { "epoch": 0.04325, "grad_norm": 4.3125, "grad_norm_var": 3.69654541015625, "learning_rate": 0.0001, "loss": 9.9638, "loss/crossentropy": 2.461041808128357, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3787677586078644, "step": 692 }, { "epoch": 0.043375, "grad_norm": 5.0, "grad_norm_var": 3.5995442708333334, "learning_rate": 0.0001, "loss": 10.4949, "loss/crossentropy": 2.615231990814209, "loss/hidden": 4.015625, "loss/jsd": 0.0, "loss/logits": 0.4022253602743149, "step": 694 }, { "epoch": 0.0435, "grad_norm": 4.15625, "grad_norm_var": 3.6969889322916667, "learning_rate": 0.0001, "loss": 9.9086, "loss/crossentropy": 2.005247116088867, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.325066015124321, "step": 696 }, { "epoch": 0.043625, "grad_norm": 4.59375, "grad_norm_var": 3.723921712239583, "learning_rate": 0.0001, "loss": 9.6101, "loss/crossentropy": 2.4810917377471924, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3531768321990967, "step": 698 }, { "epoch": 0.04375, "grad_norm": 5.34375, "grad_norm_var": 3.715104166666667, "learning_rate": 0.0001, "loss": 9.931, "loss/crossentropy": 2.4594435691833496, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.3658078759908676, "step": 700 }, { "epoch": 0.043875, "grad_norm": 4.21875, "grad_norm_var": 1.0486979166666666, "learning_rate": 0.0001, "loss": 9.831, "loss/crossentropy": 2.4843145608901978, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.3550899028778076, "step": 702 }, { "epoch": 0.044, "grad_norm": 4.53125, "grad_norm_var": 0.1243560791015625, "learning_rate": 0.0001, "loss": 9.8716, "loss/crossentropy": 2.630019426345825, "loss/hidden": 3.9609375, "loss/jsd": 0.0, "loss/logits": 0.3656453341245651, "step": 704 }, { "epoch": 0.044125, "grad_norm": 4.71875, "grad_norm_var": 0.12111714680989584, "learning_rate": 0.0001, "loss": 10.0534, "loss/crossentropy": 2.42562735080719, "loss/hidden": 3.890625, "loss/jsd": 0.0, "loss/logits": 0.3582864999771118, "step": 706 }, { "epoch": 0.04425, "grad_norm": 4.21875, "grad_norm_var": 0.13979390462239583, "learning_rate": 0.0001, "loss": 9.9731, "loss/crossentropy": 2.4668819904327393, "loss/hidden": 3.9296875, "loss/jsd": 0.0, "loss/logits": 0.40261900424957275, "step": 708 }, { "epoch": 0.044375, "grad_norm": 4.34375, "grad_norm_var": 0.12498270670572917, "learning_rate": 0.0001, "loss": 9.74, "loss/crossentropy": 2.565619111061096, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3622848391532898, "step": 710 }, { "epoch": 0.0445, "grad_norm": 4.8125, "grad_norm_var": 0.1250396728515625, "learning_rate": 0.0001, "loss": 9.886, "loss/crossentropy": 2.53997802734375, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.36765336990356445, "step": 712 }, { "epoch": 0.044625, "grad_norm": 4.3125, "grad_norm_var": 0.13161519368489583, "learning_rate": 0.0001, "loss": 9.7684, "loss/crossentropy": 2.4978381395339966, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.33836664259433746, "step": 714 }, { "epoch": 0.04475, "grad_norm": 4.125, "grad_norm_var": 0.08456929524739583, "learning_rate": 0.0001, "loss": 9.78, "loss/crossentropy": 2.5785136222839355, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.36389264464378357, "step": 716 }, { "epoch": 0.044875, "grad_norm": 4.5, "grad_norm_var": 0.0640289306640625, "learning_rate": 0.0001, "loss": 9.7739, "loss/crossentropy": 2.398101568222046, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.36110346019268036, "step": 718 }, { "epoch": 0.045, "grad_norm": 3.90625, "grad_norm_var": 0.07908426920572917, "learning_rate": 0.0001, "loss": 9.7225, "loss/crossentropy": 2.591996669769287, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.36347195506095886, "step": 720 }, { "epoch": 0.045125, "grad_norm": 4.28125, "grad_norm_var": 0.0589752197265625, "learning_rate": 0.0001, "loss": 9.7784, "loss/crossentropy": 2.527889609336853, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.36425699293613434, "step": 722 }, { "epoch": 0.04525, "grad_norm": 4.125, "grad_norm_var": 0.06559244791666667, "learning_rate": 0.0001, "loss": 9.2434, "loss/crossentropy": 2.3371061086654663, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3342244625091553, "step": 724 }, { "epoch": 0.045375, "grad_norm": 4.21875, "grad_norm_var": 0.06256103515625, "learning_rate": 0.0001, "loss": 9.7158, "loss/crossentropy": 2.479300379753113, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.362941637635231, "step": 726 }, { "epoch": 0.0455, "grad_norm": 4.15625, "grad_norm_var": 0.04332275390625, "learning_rate": 0.0001, "loss": 9.7492, "loss/crossentropy": 2.529498338699341, "loss/hidden": 3.859375, "loss/jsd": 0.0, "loss/logits": 0.35392117500305176, "step": 728 }, { "epoch": 0.045625, "grad_norm": 5.21875, "grad_norm_var": 0.11004231770833334, "learning_rate": 0.0001, "loss": 10.1377, "loss/crossentropy": 2.6024062633514404, "loss/hidden": 3.90625, "loss/jsd": 0.0, "loss/logits": 0.375337615609169, "step": 730 }, { "epoch": 0.04575, "grad_norm": 4.15625, "grad_norm_var": 0.10641276041666667, "learning_rate": 0.0001, "loss": 9.4994, "loss/crossentropy": 2.29757559299469, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.32550813257694244, "step": 732 }, { "epoch": 0.045875, "grad_norm": 4.125, "grad_norm_var": 0.10484619140625, "learning_rate": 0.0001, "loss": 9.6179, "loss/crossentropy": 2.2457196712493896, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3559332340955734, "step": 734 }, { "epoch": 0.046, "grad_norm": 4.9375, "grad_norm_var": 0.17092997233072918, "learning_rate": 0.0001, "loss": 9.7065, "loss/crossentropy": 2.2584201097488403, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3422286808490753, "step": 736 }, { "epoch": 0.046125, "grad_norm": 3.96875, "grad_norm_var": 0.18339742024739583, "learning_rate": 0.0001, "loss": 9.7707, "loss/crossentropy": 2.2206841707229614, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3323328047990799, "step": 738 }, { "epoch": 0.04625, "grad_norm": 4.15625, "grad_norm_var": 0.18538004557291668, "learning_rate": 0.0001, "loss": 9.668, "loss/crossentropy": 2.2953317165374756, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3379151374101639, "step": 740 }, { "epoch": 0.046375, "grad_norm": 4.4375, "grad_norm_var": 0.18704020182291667, "learning_rate": 0.0001, "loss": 9.8679, "loss/crossentropy": 2.5212793350219727, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3425360172986984, "step": 742 }, { "epoch": 0.0465, "grad_norm": 4.5, "grad_norm_var": 0.18088785807291666, "learning_rate": 0.0001, "loss": 9.8863, "loss/crossentropy": 2.3973162174224854, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3348172605037689, "step": 744 }, { "epoch": 0.046625, "grad_norm": 4.34375, "grad_norm_var": 0.14060872395833332, "learning_rate": 0.0001, "loss": 9.5977, "loss/crossentropy": 2.435719132423401, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.3557274043560028, "step": 746 }, { "epoch": 0.04675, "grad_norm": 4.09375, "grad_norm_var": 0.14296468098958334, "learning_rate": 0.0001, "loss": 9.6008, "loss/crossentropy": 2.349796175956726, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3267946243286133, "step": 748 }, { "epoch": 0.046875, "grad_norm": 4.34375, "grad_norm_var": 0.13899332682291668, "learning_rate": 0.0001, "loss": 9.7616, "loss/crossentropy": 2.2053170204162598, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.34219157695770264, "step": 750 }, { "epoch": 0.047, "grad_norm": 4.125, "grad_norm_var": 0.052718098958333334, "learning_rate": 0.0001, "loss": 9.7818, "loss/crossentropy": 2.4950714111328125, "loss/hidden": 3.8515625, "loss/jsd": 0.0, "loss/logits": 0.36529192328453064, "step": 752 }, { "epoch": 0.047125, "grad_norm": 4.125, "grad_norm_var": 0.03909098307291667, "learning_rate": 0.0001, "loss": 9.655, "loss/crossentropy": 2.20254647731781, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.31340985000133514, "step": 754 }, { "epoch": 0.04725, "grad_norm": 4.40625, "grad_norm_var": 0.03970947265625, "learning_rate": 0.0001, "loss": 9.4693, "loss/crossentropy": 2.384778141975403, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.33873017132282257, "step": 756 }, { "epoch": 0.047375, "grad_norm": 4.125, "grad_norm_var": 0.044709269205729166, "learning_rate": 0.0001, "loss": 9.6639, "loss/crossentropy": 2.4044910669326782, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3298284709453583, "step": 758 }, { "epoch": 0.0475, "grad_norm": 4.5625, "grad_norm_var": 0.05974019368489583, "learning_rate": 0.0001, "loss": 9.8048, "loss/crossentropy": 2.125362753868103, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.31744199991226196, "step": 760 }, { "epoch": 0.047625, "grad_norm": 4.25, "grad_norm_var": 0.05503641764322917, "learning_rate": 0.0001, "loss": 9.5929, "loss/crossentropy": 2.4581737518310547, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3449174612760544, "step": 762 }, { "epoch": 0.04775, "grad_norm": 4.1875, "grad_norm_var": 0.05503641764322917, "learning_rate": 0.0001, "loss": 9.5925, "loss/crossentropy": 2.319092273712158, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.33179551362991333, "step": 764 }, { "epoch": 0.047875, "grad_norm": 4.25, "grad_norm_var": 0.0488677978515625, "learning_rate": 0.0001, "loss": 9.7524, "loss/crossentropy": 2.6482656002044678, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3577173054218292, "step": 766 }, { "epoch": 0.048, "grad_norm": 3.765625, "grad_norm_var": 0.06024983723958333, "learning_rate": 0.0001, "loss": 9.6826, "loss/crossentropy": 2.3165204524993896, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.32748541235923767, "step": 768 }, { "epoch": 0.048125, "grad_norm": 4.375, "grad_norm_var": 0.058821614583333334, "learning_rate": 0.0001, "loss": 9.6115, "loss/crossentropy": 2.3324743509292603, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3622802048921585, "step": 770 }, { "epoch": 0.04825, "grad_norm": 4.0625, "grad_norm_var": 0.054911295572916664, "learning_rate": 0.0001, "loss": 9.7171, "loss/crossentropy": 2.293349862098694, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.341478168964386, "step": 772 }, { "epoch": 0.048375, "grad_norm": 3.8125, "grad_norm_var": 0.06122639973958333, "learning_rate": 0.0001, "loss": 9.5262, "loss/crossentropy": 2.6875650882720947, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.35754619538784027, "step": 774 }, { "epoch": 0.0485, "grad_norm": 4.15625, "grad_norm_var": 0.03291015625, "learning_rate": 0.0001, "loss": 9.6137, "loss/crossentropy": 2.319961667060852, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3247549384832382, "step": 776 }, { "epoch": 0.048625, "grad_norm": 4.5, "grad_norm_var": 0.049117024739583334, "learning_rate": 0.0001, "loss": 9.7145, "loss/crossentropy": 2.2820927500724792, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.36693014204502106, "step": 778 }, { "epoch": 0.04875, "grad_norm": 4.4375, "grad_norm_var": 0.054520670572916666, "learning_rate": 0.0001, "loss": 9.6572, "loss/crossentropy": 2.323164224624634, "loss/hidden": 3.8046875, "loss/jsd": 0.0, "loss/logits": 0.3539418578147888, "step": 780 }, { "epoch": 0.048875, "grad_norm": 4.34375, "grad_norm_var": 0.05587565104166667, "learning_rate": 0.0001, "loss": 9.6451, "loss/crossentropy": 2.234144926071167, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3302062600851059, "step": 782 }, { "epoch": 0.049, "grad_norm": 4.15625, "grad_norm_var": 0.044530232747395836, "learning_rate": 0.0001, "loss": 9.6092, "loss/crossentropy": 2.0940393209457397, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.33372722566127777, "step": 784 }, { "epoch": 0.049125, "grad_norm": 3.796875, "grad_norm_var": 0.06750895182291666, "learning_rate": 0.0001, "loss": 9.4303, "loss/crossentropy": 2.165642499923706, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3112798035144806, "step": 786 }, { "epoch": 0.04925, "grad_norm": 4.875, "grad_norm_var": 0.09644775390625, "learning_rate": 0.0001, "loss": 9.8704, "loss/crossentropy": 2.4933313131332397, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3489539921283722, "step": 788 }, { "epoch": 0.049375, "grad_norm": 3.578125, "grad_norm_var": 0.10728759765625, "learning_rate": 0.0001, "loss": 9.4931, "loss/crossentropy": 2.1803754568099976, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3406267315149307, "step": 790 }, { "epoch": 0.0495, "grad_norm": 3.9375, "grad_norm_var": 0.11580301920572916, "learning_rate": 0.0001, "loss": 9.7055, "loss/crossentropy": 2.444434642791748, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.34211790561676025, "step": 792 }, { "epoch": 0.049625, "grad_norm": 4.78125, "grad_norm_var": 0.12771708170572918, "learning_rate": 0.0001, "loss": 9.6484, "loss/crossentropy": 2.041724741458893, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3167402148246765, "step": 794 }, { "epoch": 0.04975, "grad_norm": 4.0625, "grad_norm_var": 0.12382405598958333, "learning_rate": 0.0001, "loss": 9.616, "loss/crossentropy": 2.430485963821411, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.359543040394783, "step": 796 }, { "epoch": 0.049875, "grad_norm": 3.9375, "grad_norm_var": 0.22916666666666666, "learning_rate": 0.0001, "loss": 9.8588, "loss/crossentropy": 2.421727180480957, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3639049679040909, "step": 798 }, { "epoch": 0.05, "grad_norm": 3.9375, "grad_norm_var": 0.23111979166666666, "learning_rate": 0.0001, "loss": 9.5411, "loss/crossentropy": 2.173619508743286, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.34150390326976776, "step": 800 }, { "epoch": 0.050125, "grad_norm": 4.0, "grad_norm_var": 0.2822825113932292, "learning_rate": 0.0001, "loss": 9.6478, "loss/crossentropy": 2.5516271591186523, "loss/hidden": 3.984375, "loss/jsd": 0.0, "loss/logits": 0.35007384419441223, "step": 802 }, { "epoch": 0.05025, "grad_norm": 4.4375, "grad_norm_var": 0.25654195149739584, "learning_rate": 0.0001, "loss": 9.6222, "loss/crossentropy": 2.5491143465042114, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.3419859707355499, "step": 804 }, { "epoch": 0.050375, "grad_norm": 4.4375, "grad_norm_var": 0.23424072265625, "learning_rate": 0.0001, "loss": 9.6289, "loss/crossentropy": 2.5568002462387085, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.38842974603176117, "step": 806 }, { "epoch": 0.0505, "grad_norm": 4.03125, "grad_norm_var": 0.22262369791666667, "learning_rate": 0.0001, "loss": 9.5115, "loss/crossentropy": 2.328479051589966, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.33506618440151215, "step": 808 }, { "epoch": 0.050625, "grad_norm": 3.96875, "grad_norm_var": 0.21578369140625, "learning_rate": 0.0001, "loss": 9.6195, "loss/crossentropy": 2.2139497995376587, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3508079797029495, "step": 810 }, { "epoch": 0.05075, "grad_norm": 3.875, "grad_norm_var": 0.2201812744140625, "learning_rate": 0.0001, "loss": 9.3948, "loss/crossentropy": 2.3431121110916138, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.34683074057102203, "step": 812 }, { "epoch": 0.050875, "grad_norm": 4.4375, "grad_norm_var": 0.12883199055989583, "learning_rate": 0.0001, "loss": 9.9172, "loss/crossentropy": 2.356547713279724, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3437325358390808, "step": 814 }, { "epoch": 0.051, "grad_norm": 4.21875, "grad_norm_var": 0.1194732666015625, "learning_rate": 0.0001, "loss": 9.5043, "loss/crossentropy": 2.438585638999939, "loss/hidden": 3.9140625, "loss/jsd": 0.0, "loss/logits": 0.34822140634059906, "step": 816 }, { "epoch": 0.051125, "grad_norm": 4.125, "grad_norm_var": 0.04539286295572917, "learning_rate": 0.0001, "loss": 9.398, "loss/crossentropy": 2.232826828956604, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.33623330295085907, "step": 818 }, { "epoch": 0.05125, "grad_norm": 4.6875, "grad_norm_var": 0.05720113118489583, "learning_rate": 0.0001, "loss": 9.5585, "loss/crossentropy": 2.3401767015457153, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.33902715146541595, "step": 820 }, { "epoch": 0.051375, "grad_norm": 4.875, "grad_norm_var": 0.07072652180989583, "learning_rate": 0.0001, "loss": 9.4945, "loss/crossentropy": 2.1463793516159058, "loss/hidden": 3.84375, "loss/jsd": 0.0, "loss/logits": 0.3293427973985672, "step": 822 }, { "epoch": 0.0515, "grad_norm": 4.09375, "grad_norm_var": 0.06341145833333334, "learning_rate": 0.0001, "loss": 9.4966, "loss/crossentropy": 2.2964788675308228, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3311201483011246, "step": 824 }, { "epoch": 0.051625, "grad_norm": 4.46875, "grad_norm_var": 0.06575520833333333, "learning_rate": 0.0001, "loss": 9.6328, "loss/crossentropy": 2.4484344720840454, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3489266186952591, "step": 826 }, { "epoch": 0.05175, "grad_norm": 4.03125, "grad_norm_var": 0.054541015625, "learning_rate": 0.0001, "loss": 9.4653, "loss/crossentropy": 2.2709579467773438, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3134896159172058, "step": 828 }, { "epoch": 0.051875, "grad_norm": 3.8125, "grad_norm_var": 0.08080952962239583, "learning_rate": 0.0001, "loss": 9.5952, "loss/crossentropy": 2.3467541933059692, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.33979402482509613, "step": 830 }, { "epoch": 0.052, "grad_norm": 4.375, "grad_norm_var": 0.09925028483072916, "learning_rate": 0.0001, "loss": 9.5125, "loss/crossentropy": 2.3023515939712524, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.31126780807971954, "step": 832 }, { "epoch": 0.052125, "grad_norm": 4.0, "grad_norm_var": 0.10331929524739583, "learning_rate": 0.0001, "loss": 9.6691, "loss/crossentropy": 2.2762893438339233, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.33598124980926514, "step": 834 }, { "epoch": 0.05225, "grad_norm": 4.625, "grad_norm_var": 0.10900065104166666, "learning_rate": 0.0001, "loss": 9.6829, "loss/crossentropy": 2.4334222078323364, "loss/hidden": 3.8203125, "loss/jsd": 0.0, "loss/logits": 0.3680105209350586, "step": 836 }, { "epoch": 0.052375, "grad_norm": 4.25, "grad_norm_var": 0.07909749348958334, "learning_rate": 0.0001, "loss": 9.585, "loss/crossentropy": 2.4400514364242554, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.34038496017456055, "step": 838 }, { "epoch": 0.0525, "grad_norm": 3.921875, "grad_norm_var": 0.08203837076822916, "learning_rate": 0.0001, "loss": 9.4398, "loss/crossentropy": 2.413126230239868, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.36697521805763245, "step": 840 }, { "epoch": 0.052625, "grad_norm": 4.3125, "grad_norm_var": 0.06610921223958334, "learning_rate": 0.0001, "loss": 9.9185, "loss/crossentropy": 2.547673225402832, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.36152128875255585, "step": 842 }, { "epoch": 0.05275, "grad_norm": 3.75, "grad_norm_var": 0.07534077962239584, "learning_rate": 0.0001, "loss": 9.3408, "loss/crossentropy": 2.469245195388794, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.34433713555336, "step": 844 }, { "epoch": 0.052875, "grad_norm": 4.53125, "grad_norm_var": 0.08121337890625, "learning_rate": 0.0001, "loss": 9.6554, "loss/crossentropy": 2.5020763874053955, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3375013768672943, "step": 846 }, { "epoch": 0.053, "grad_norm": 3.6875, "grad_norm_var": 0.082080078125, "learning_rate": 0.0001, "loss": 9.4903, "loss/crossentropy": 2.294095277786255, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.32213783264160156, "step": 848 }, { "epoch": 0.053125, "grad_norm": 3.953125, "grad_norm_var": 0.07913411458333333, "learning_rate": 0.0001, "loss": 9.4497, "loss/crossentropy": 2.2911940813064575, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.31579577922821045, "step": 850 }, { "epoch": 0.05325, "grad_norm": 4.46875, "grad_norm_var": 0.06796468098958333, "learning_rate": 0.0001, "loss": 9.4926, "loss/crossentropy": 2.4447981119155884, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.33244381844997406, "step": 852 }, { "epoch": 0.053375, "grad_norm": 3.96875, "grad_norm_var": 0.06682942708333334, "learning_rate": 0.0001, "loss": 9.2842, "loss/crossentropy": 2.3211770057678223, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.33892732858657837, "step": 854 }, { "epoch": 0.0535, "grad_norm": 3.96875, "grad_norm_var": 0.06518452962239583, "learning_rate": 0.0001, "loss": 9.7257, "loss/crossentropy": 2.362980604171753, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3292359262704849, "step": 856 }, { "epoch": 0.053625, "grad_norm": 4.375, "grad_norm_var": 0.0662750244140625, "learning_rate": 0.0001, "loss": 9.2672, "loss/crossentropy": 2.178554058074951, "loss/hidden": 3.8671875, "loss/jsd": 0.0, "loss/logits": 0.3312181234359741, "step": 858 }, { "epoch": 0.05375, "grad_norm": 3.890625, "grad_norm_var": 0.06602274576822917, "learning_rate": 0.0001, "loss": 9.5052, "loss/crossentropy": 2.524282932281494, "loss/hidden": 3.7578125, "loss/jsd": 0.0, "loss/logits": 0.34150634706020355, "step": 860 }, { "epoch": 0.053875, "grad_norm": 3.921875, "grad_norm_var": 0.06665751139322916, "learning_rate": 0.0001, "loss": 9.4067, "loss/crossentropy": 2.345171332359314, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3292695879936218, "step": 862 }, { "epoch": 0.054, "grad_norm": 4.0625, "grad_norm_var": 0.05427144368489583, "learning_rate": 0.0001, "loss": 9.4267, "loss/crossentropy": 2.5034204721450806, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.34620046615600586, "step": 864 }, { "epoch": 0.054125, "grad_norm": 3.921875, "grad_norm_var": 0.05933837890625, "learning_rate": 0.0001, "loss": 9.6655, "loss/crossentropy": 2.609255313873291, "loss/hidden": 3.875, "loss/jsd": 0.0, "loss/logits": 0.38452892005443573, "step": 866 }, { "epoch": 0.05425, "grad_norm": 4.0625, "grad_norm_var": 0.054686482747395834, "learning_rate": 0.0001, "loss": 9.5649, "loss/crossentropy": 2.795304298400879, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3581002801656723, "step": 868 }, { "epoch": 0.054375, "grad_norm": 3.859375, "grad_norm_var": 0.18343098958333334, "learning_rate": 0.0001, "loss": 9.3991, "loss/crossentropy": 2.3437917232513428, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3472553938627243, "step": 870 }, { "epoch": 0.0545, "grad_norm": 5.375, "grad_norm_var": 0.2687459309895833, "learning_rate": 0.0001, "loss": 9.6406, "loss/crossentropy": 2.4949249029159546, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3250335156917572, "step": 872 }, { "epoch": 0.054625, "grad_norm": 3.9375, "grad_norm_var": 0.2698313395182292, "learning_rate": 0.0001, "loss": 9.4352, "loss/crossentropy": 2.646947979927063, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3375275284051895, "step": 874 }, { "epoch": 0.05475, "grad_norm": 4.4375, "grad_norm_var": 0.26324462890625, "learning_rate": 0.0001, "loss": 9.6272, "loss/crossentropy": 2.342916250228882, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.335741251707077, "step": 876 }, { "epoch": 0.054875, "grad_norm": 4.25, "grad_norm_var": 0.226904296875, "learning_rate": 0.0001, "loss": 9.521, "loss/crossentropy": 2.309291124343872, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.34289687871932983, "step": 878 }, { "epoch": 0.055, "grad_norm": 4.0625, "grad_norm_var": 0.22008056640625, "learning_rate": 0.0001, "loss": 9.6429, "loss/crossentropy": 2.363176941871643, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.31846094131469727, "step": 880 }, { "epoch": 0.055125, "grad_norm": 4.28125, "grad_norm_var": 0.21479390462239584, "learning_rate": 0.0001, "loss": 9.4278, "loss/crossentropy": 2.295005202293396, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.35759809613227844, "step": 882 }, { "epoch": 0.05525, "grad_norm": 4.09375, "grad_norm_var": 0.22467041015625, "learning_rate": 0.0001, "loss": 9.3845, "loss/crossentropy": 2.2038002014160156, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3040274381637573, "step": 884 }, { "epoch": 0.055375, "grad_norm": 4.375, "grad_norm_var": 0.11887919108072917, "learning_rate": 0.0001, "loss": 9.6735, "loss/crossentropy": 2.581911325454712, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3485991954803467, "step": 886 }, { "epoch": 0.0555, "grad_norm": 3.875, "grad_norm_var": 0.04658915201822917, "learning_rate": 0.0001, "loss": 9.399, "loss/crossentropy": 2.504314661026001, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3332435041666031, "step": 888 }, { "epoch": 0.055625, "grad_norm": 3.859375, "grad_norm_var": 0.05237630208333333, "learning_rate": 0.0001, "loss": 9.6586, "loss/crossentropy": 2.4632703065872192, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.3592112809419632, "step": 890 }, { "epoch": 0.05575, "grad_norm": 4.4375, "grad_norm_var": 0.05908203125, "learning_rate": 0.0001, "loss": 9.5926, "loss/crossentropy": 2.600472331047058, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3421791195869446, "step": 892 }, { "epoch": 0.055875, "grad_norm": 3.78125, "grad_norm_var": 0.06067301432291667, "learning_rate": 0.0001, "loss": 9.3855, "loss/crossentropy": 2.1859222650527954, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.29888148605823517, "step": 894 }, { "epoch": 0.056, "grad_norm": 3.9375, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 9.4589, "loss/crossentropy": 2.3843421936035156, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3476228564977646, "step": 896 }, { "epoch": 0.056125, "grad_norm": 4.0, "grad_norm_var": 0.056477864583333336, "learning_rate": 0.0001, "loss": 9.3348, "loss/crossentropy": 2.3973230123519897, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3175063878297806, "step": 898 }, { "epoch": 0.05625, "grad_norm": 4.0, "grad_norm_var": 0.06223856608072917, "learning_rate": 0.0001, "loss": 9.5278, "loss/crossentropy": 2.350813627243042, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3350457102060318, "step": 900 }, { "epoch": 0.056375, "grad_norm": 4.5, "grad_norm_var": 0.0687652587890625, "learning_rate": 0.0001, "loss": 9.6984, "loss/crossentropy": 2.3941330909729004, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.33253586292266846, "step": 902 }, { "epoch": 0.0565, "grad_norm": 3.984375, "grad_norm_var": 0.09226888020833333, "learning_rate": 0.0001, "loss": 9.686, "loss/crossentropy": 2.4759925603866577, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.3297637850046158, "step": 904 }, { "epoch": 0.056625, "grad_norm": 4.34375, "grad_norm_var": 0.08549702962239583, "learning_rate": 0.0001, "loss": 9.5501, "loss/crossentropy": 2.643571376800537, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.3501765877008438, "step": 906 }, { "epoch": 0.05675, "grad_norm": 4.0, "grad_norm_var": 0.06550191243489584, "learning_rate": 0.0001, "loss": 9.5891, "loss/crossentropy": 2.432242512702942, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3715456426143646, "step": 908 }, { "epoch": 0.056875, "grad_norm": 4.03125, "grad_norm_var": 0.059403483072916666, "learning_rate": 0.0001, "loss": 9.4375, "loss/crossentropy": 2.525635004043579, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.35497090220451355, "step": 910 }, { "epoch": 0.057, "grad_norm": 4.34375, "grad_norm_var": 0.0610504150390625, "learning_rate": 0.0001, "loss": 9.663, "loss/crossentropy": 2.6299631595611572, "loss/hidden": 3.8125, "loss/jsd": 0.0, "loss/logits": 0.34647491574287415, "step": 912 }, { "epoch": 0.057125, "grad_norm": 4.3125, "grad_norm_var": 0.1096343994140625, "learning_rate": 0.0001, "loss": 9.2623, "loss/crossentropy": 2.2141228914260864, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.333205983042717, "step": 914 }, { "epoch": 0.05725, "grad_norm": 3.796875, "grad_norm_var": 0.1134765625, "learning_rate": 0.0001, "loss": 9.5108, "loss/crossentropy": 2.331273913383484, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.32548947632312775, "step": 916 }, { "epoch": 0.057375, "grad_norm": 3.765625, "grad_norm_var": 0.1252838134765625, "learning_rate": 0.0001, "loss": 9.3267, "loss/crossentropy": 2.20035183429718, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.31392902135849, "step": 918 }, { "epoch": 0.0575, "grad_norm": 4.53125, "grad_norm_var": 0.11096598307291666, "learning_rate": 0.0001, "loss": 9.4211, "loss/crossentropy": 2.4890637397766113, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3301192671060562, "step": 920 }, { "epoch": 0.057625, "grad_norm": 4.65625, "grad_norm_var": 0.1294921875, "learning_rate": 0.0001, "loss": 9.5324, "loss/crossentropy": 2.5583585500717163, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3586796224117279, "step": 922 }, { "epoch": 0.05775, "grad_norm": 4.03125, "grad_norm_var": 0.13170572916666667, "learning_rate": 0.0001, "loss": 9.7246, "loss/crossentropy": 2.417726516723633, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.3519165962934494, "step": 924 }, { "epoch": 0.057875, "grad_norm": 4.1875, "grad_norm_var": 0.12793680826822917, "learning_rate": 0.0001, "loss": 9.3623, "loss/crossentropy": 2.4320497512817383, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.35020147264003754, "step": 926 }, { "epoch": 0.058, "grad_norm": 4.4375, "grad_norm_var": 0.14709879557291666, "learning_rate": 0.0001, "loss": 9.2962, "loss/crossentropy": 2.2121816873550415, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.32885268330574036, "step": 928 }, { "epoch": 0.058125, "grad_norm": 3.984375, "grad_norm_var": 0.08567606608072917, "learning_rate": 0.0001, "loss": 9.3099, "loss/crossentropy": 2.382021903991699, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.32922065258026123, "step": 930 }, { "epoch": 0.05825, "grad_norm": 3.890625, "grad_norm_var": 0.08528645833333333, "learning_rate": 0.0001, "loss": 9.4542, "loss/crossentropy": 2.2859452962875366, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3154686838388443, "step": 932 }, { "epoch": 0.058375, "grad_norm": 4.03125, "grad_norm_var": 0.07997945149739584, "learning_rate": 0.0001, "loss": 9.3008, "loss/crossentropy": 2.3658918142318726, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3251184970140457, "step": 934 }, { "epoch": 0.0585, "grad_norm": 3.75, "grad_norm_var": 0.07141825358072916, "learning_rate": 0.0001, "loss": 9.3537, "loss/crossentropy": 2.199766993522644, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3467453122138977, "step": 936 }, { "epoch": 0.058625, "grad_norm": 3.828125, "grad_norm_var": 0.0466796875, "learning_rate": 0.0001, "loss": 9.6814, "loss/crossentropy": 2.6493847370147705, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3443215787410736, "step": 938 }, { "epoch": 0.05875, "grad_norm": 4.0, "grad_norm_var": 0.04973551432291667, "learning_rate": 0.0001, "loss": 9.3843, "loss/crossentropy": 2.501352071762085, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3372286558151245, "step": 940 }, { "epoch": 0.058875, "grad_norm": 3.640625, "grad_norm_var": 0.06533203125, "learning_rate": 0.0001, "loss": 9.1373, "loss/crossentropy": 2.271903336048126, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3119770586490631, "step": 942 }, { "epoch": 0.059, "grad_norm": 3.9375, "grad_norm_var": 0.06785481770833333, "learning_rate": 0.0001, "loss": 9.5648, "loss/crossentropy": 2.495382308959961, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.35206887125968933, "step": 944 }, { "epoch": 0.059125, "grad_norm": 3.875, "grad_norm_var": 0.07141825358072916, "learning_rate": 0.0001, "loss": 9.4638, "loss/crossentropy": 2.582332134246826, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3388357609510422, "step": 946 }, { "epoch": 0.05925, "grad_norm": 4.0625, "grad_norm_var": 0.069580078125, "learning_rate": 0.0001, "loss": 9.3972, "loss/crossentropy": 2.7968443632125854, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.34314611554145813, "step": 948 }, { "epoch": 0.059375, "grad_norm": 3.953125, "grad_norm_var": 0.0709136962890625, "learning_rate": 0.0001, "loss": 9.5731, "loss/crossentropy": 2.361037015914917, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.31498147547245026, "step": 950 }, { "epoch": 0.0595, "grad_norm": 3.75, "grad_norm_var": 0.07116597493489583, "learning_rate": 0.0001, "loss": 9.4971, "loss/crossentropy": 2.493962287902832, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.3326618820428848, "step": 952 }, { "epoch": 0.059625, "grad_norm": 3.734375, "grad_norm_var": 0.07270406087239584, "learning_rate": 0.0001, "loss": 9.5408, "loss/crossentropy": 2.4259244203567505, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3625700771808624, "step": 954 }, { "epoch": 0.05975, "grad_norm": 4.9375, "grad_norm_var": 0.12129618326822916, "learning_rate": 0.0001, "loss": 9.4081, "loss/crossentropy": 2.146342396736145, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3270118683576584, "step": 956 }, { "epoch": 0.059875, "grad_norm": 3.65625, "grad_norm_var": 0.1236968994140625, "learning_rate": 0.0001, "loss": 9.3212, "loss/crossentropy": 2.3437579870224, "loss/hidden": 3.765625, "loss/jsd": 0.0, "loss/logits": 0.34136760234832764, "step": 958 }, { "epoch": 0.06, "grad_norm": 3.921875, "grad_norm_var": 0.11343485514322917, "learning_rate": 0.0001, "loss": 9.4308, "loss/crossentropy": 2.330640196800232, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3233296573162079, "step": 960 }, { "epoch": 0.060125, "grad_norm": 3.765625, "grad_norm_var": 0.11761067708333334, "learning_rate": 0.0001, "loss": 9.1553, "loss/crossentropy": 2.392879366874695, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.33645670115947723, "step": 962 }, { "epoch": 0.06025, "grad_norm": 3.484375, "grad_norm_var": 0.13583984375, "learning_rate": 0.0001, "loss": 9.2888, "loss/crossentropy": 2.2503827810287476, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3347347527742386, "step": 964 }, { "epoch": 0.060375, "grad_norm": 3.984375, "grad_norm_var": 0.13542378743489583, "learning_rate": 0.0001, "loss": 9.368, "loss/crossentropy": 2.4098986387252808, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3416828215122223, "step": 966 }, { "epoch": 0.0605, "grad_norm": 4.3125, "grad_norm_var": 0.13982645670572916, "learning_rate": 0.0001, "loss": 9.4573, "loss/crossentropy": 2.1524184942245483, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.30603043735027313, "step": 968 }, { "epoch": 0.060625, "grad_norm": 3.65625, "grad_norm_var": 0.14397786458333334, "learning_rate": 0.0001, "loss": 9.2709, "loss/crossentropy": 2.2244198322296143, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.317968025803566, "step": 970 }, { "epoch": 0.06075, "grad_norm": 3.5, "grad_norm_var": 0.08929036458333334, "learning_rate": 0.0001, "loss": 9.3768, "loss/crossentropy": 2.4069145917892456, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3366214781999588, "step": 972 }, { "epoch": 0.060875, "grad_norm": 3.765625, "grad_norm_var": 0.05461324055989583, "learning_rate": 0.0001, "loss": 9.0551, "loss/crossentropy": 2.486254572868347, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2998432517051697, "step": 974 }, { "epoch": 0.061, "grad_norm": 4.59375, "grad_norm_var": 0.09365132649739584, "learning_rate": 0.0001, "loss": 9.3255, "loss/crossentropy": 2.443332552909851, "loss/hidden": 3.734375, "loss/jsd": 0.0, "loss/logits": 0.3492431044578552, "step": 976 }, { "epoch": 0.061125, "grad_norm": 4.0, "grad_norm_var": 0.09325764973958334, "learning_rate": 0.0001, "loss": 9.3259, "loss/crossentropy": 2.4328815937042236, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3370583653450012, "step": 978 }, { "epoch": 0.06125, "grad_norm": 3.859375, "grad_norm_var": 0.09388020833333334, "learning_rate": 0.0001, "loss": 9.1005, "loss/crossentropy": 2.373159170150757, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.31204234063625336, "step": 980 }, { "epoch": 0.061375, "grad_norm": 3.828125, "grad_norm_var": 0.09207255045572917, "learning_rate": 0.0001, "loss": 9.2474, "loss/crossentropy": 2.319578170776367, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3269862085580826, "step": 982 }, { "epoch": 0.0615, "grad_norm": 4.0625, "grad_norm_var": 0.08448893229166667, "learning_rate": 0.0001, "loss": 9.1644, "loss/crossentropy": 2.042439818382263, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.2966649830341339, "step": 984 }, { "epoch": 0.061625, "grad_norm": 3.890625, "grad_norm_var": 0.07712300618489583, "learning_rate": 0.0001, "loss": 9.354, "loss/crossentropy": 2.3522753715515137, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3372476100921631, "step": 986 }, { "epoch": 0.06175, "grad_norm": 3.890625, "grad_norm_var": 0.06454671223958333, "learning_rate": 0.0001, "loss": 9.4464, "loss/crossentropy": 2.3970394134521484, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3193608969449997, "step": 988 }, { "epoch": 0.061875, "grad_norm": 4.0, "grad_norm_var": 0.04579976399739583, "learning_rate": 0.0001, "loss": 9.5096, "loss/crossentropy": 2.4436655044555664, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3359542489051819, "step": 990 }, { "epoch": 0.062, "grad_norm": 4.0625, "grad_norm_var": 0.020531209309895833, "learning_rate": 0.0001, "loss": 9.6125, "loss/crossentropy": 2.5487223863601685, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3535960167646408, "step": 992 }, { "epoch": 0.062125, "grad_norm": 3.78125, "grad_norm_var": 0.023322550455729167, "learning_rate": 0.0001, "loss": 9.3969, "loss/crossentropy": 2.2758008241653442, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3227398693561554, "step": 994 }, { "epoch": 0.06225, "grad_norm": 4.0625, "grad_norm_var": 0.01099853515625, "learning_rate": 0.0001, "loss": 9.2286, "loss/crossentropy": 2.065541088581085, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.2921288013458252, "step": 996 }, { "epoch": 0.062375, "grad_norm": 4.15625, "grad_norm_var": 0.015135701497395833, "learning_rate": 0.0001, "loss": 9.565, "loss/crossentropy": 2.5905497074127197, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3195539563894272, "step": 998 }, { "epoch": 0.0625, "grad_norm": 4.09375, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 9.4557, "loss/crossentropy": 2.491615653038025, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.348484605550766, "step": 1000 }, { "epoch": 0.062625, "grad_norm": 3.828125, "grad_norm_var": 0.018309529622395834, "learning_rate": 0.0001, "loss": 9.4021, "loss/crossentropy": 2.271575689315796, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3233788311481476, "step": 1002 }, { "epoch": 0.06275, "grad_norm": 3.984375, "grad_norm_var": 0.017585245768229167, "learning_rate": 0.0001, "loss": 9.2456, "loss/crossentropy": 2.1161099076271057, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3034187853336334, "step": 1004 }, { "epoch": 0.062875, "grad_norm": 3.609375, "grad_norm_var": 0.025520833333333333, "learning_rate": 0.0001, "loss": 9.3095, "loss/crossentropy": 2.369056463241577, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3199651837348938, "step": 1006 }, { "epoch": 0.063, "grad_norm": 4.1875, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 9.267, "loss/crossentropy": 2.327115058898926, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.34022311866283417, "step": 1008 }, { "epoch": 0.063125, "grad_norm": 3.828125, "grad_norm_var": 0.030631510416666667, "learning_rate": 0.0001, "loss": 9.3698, "loss/crossentropy": 2.7188167572021484, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.33434274792671204, "step": 1010 }, { "epoch": 0.06325, "grad_norm": 4.21875, "grad_norm_var": 0.036295572916666664, "learning_rate": 0.0001, "loss": 9.5306, "loss/crossentropy": 2.2543774843215942, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3080340623855591, "step": 1012 }, { "epoch": 0.063375, "grad_norm": 3.9375, "grad_norm_var": 0.03961181640625, "learning_rate": 0.0001, "loss": 9.2311, "loss/crossentropy": 2.137963056564331, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3146566152572632, "step": 1014 }, { "epoch": 0.0635, "grad_norm": 3.875, "grad_norm_var": 0.03809305826822917, "learning_rate": 0.0001, "loss": 9.3595, "loss/crossentropy": 2.378168821334839, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3253423571586609, "step": 1016 }, { "epoch": 0.063625, "grad_norm": 3.90625, "grad_norm_var": 0.0347808837890625, "learning_rate": 0.0001, "loss": 9.1484, "loss/crossentropy": 2.2595038414001465, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3158324211835861, "step": 1018 }, { "epoch": 0.06375, "grad_norm": 3.9375, "grad_norm_var": 0.03615620930989583, "learning_rate": 0.0001, "loss": 9.2288, "loss/crossentropy": 2.4348455667495728, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3402617871761322, "step": 1020 }, { "epoch": 0.063875, "grad_norm": 4.21875, "grad_norm_var": 0.040299479166666666, "learning_rate": 0.0001, "loss": 9.4738, "loss/crossentropy": 2.4196285009384155, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3641415983438492, "step": 1022 }, { "epoch": 0.064, "grad_norm": 4.0, "grad_norm_var": 0.03557535807291667, "learning_rate": 0.0001, "loss": 9.3856, "loss/crossentropy": 2.7573235034942627, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3504706919193268, "step": 1024 }, { "epoch": 0.064125, "grad_norm": 3.921875, "grad_norm_var": 0.03811442057291667, "learning_rate": 0.0001, "loss": 9.4086, "loss/crossentropy": 2.39897620677948, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3188930004835129, "step": 1026 }, { "epoch": 0.06425, "grad_norm": 3.765625, "grad_norm_var": 0.044417317708333334, "learning_rate": 0.0001, "loss": 8.9954, "loss/crossentropy": 2.37536883354187, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.30067528784275055, "step": 1028 }, { "epoch": 0.064375, "grad_norm": 3.859375, "grad_norm_var": 0.05654195149739583, "learning_rate": 0.0001, "loss": 9.5045, "loss/crossentropy": 2.5579299926757812, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.34567518532276154, "step": 1030 }, { "epoch": 0.0645, "grad_norm": 3.84375, "grad_norm_var": 0.0560699462890625, "learning_rate": 0.0001, "loss": 9.497, "loss/crossentropy": 2.4573644399642944, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3271123617887497, "step": 1032 }, { "epoch": 0.064625, "grad_norm": 3.734375, "grad_norm_var": 0.13196512858072917, "learning_rate": 0.0001, "loss": 9.2878, "loss/crossentropy": 2.369649648666382, "loss/hidden": 3.78125, "loss/jsd": 0.0, "loss/logits": 0.3534861207008362, "step": 1034 }, { "epoch": 0.06475, "grad_norm": 4.34375, "grad_norm_var": 0.1352447509765625, "learning_rate": 0.0001, "loss": 9.2811, "loss/crossentropy": 2.366884231567383, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3179771304130554, "step": 1036 }, { "epoch": 0.064875, "grad_norm": 4.28125, "grad_norm_var": 0.13723042805989583, "learning_rate": 0.0001, "loss": 9.245, "loss/crossentropy": 2.385068416595459, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.33948729932308197, "step": 1038 }, { "epoch": 0.065, "grad_norm": 3.953125, "grad_norm_var": 0.13835347493489583, "learning_rate": 0.0001, "loss": 9.3012, "loss/crossentropy": 2.4422179460525513, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.3261062651872635, "step": 1040 }, { "epoch": 0.065125, "grad_norm": 3.890625, "grad_norm_var": 0.1297760009765625, "learning_rate": 0.0001, "loss": 9.2994, "loss/crossentropy": 2.393699288368225, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.31233613193035126, "step": 1042 }, { "epoch": 0.06525, "grad_norm": 3.90625, "grad_norm_var": 0.10129292805989583, "learning_rate": 0.0001, "loss": 9.4722, "loss/crossentropy": 2.4751322269439697, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.34922046959400177, "step": 1044 }, { "epoch": 0.065375, "grad_norm": 4.09375, "grad_norm_var": 0.10408528645833333, "learning_rate": 0.0001, "loss": 9.2301, "loss/crossentropy": 2.3055585622787476, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.332491010427475, "step": 1046 }, { "epoch": 0.0655, "grad_norm": 4.0, "grad_norm_var": 0.1025390625, "learning_rate": 0.0001, "loss": 9.356, "loss/crossentropy": 2.3319748640060425, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3209776282310486, "step": 1048 }, { "epoch": 0.065625, "grad_norm": 4.0625, "grad_norm_var": 0.0375396728515625, "learning_rate": 0.0001, "loss": 9.3369, "loss/crossentropy": 2.2262803316116333, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.3301280736923218, "step": 1050 }, { "epoch": 0.06575, "grad_norm": 3.765625, "grad_norm_var": 0.041239420572916664, "learning_rate": 0.0001, "loss": 9.0379, "loss/crossentropy": 2.3847025632858276, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30600911378860474, "step": 1052 }, { "epoch": 0.065875, "grad_norm": 3.953125, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 9.1241, "loss/crossentropy": 2.2389484643936157, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31373530626296997, "step": 1054 }, { "epoch": 0.066, "grad_norm": 3.953125, "grad_norm_var": 0.04392801920572917, "learning_rate": 0.0001, "loss": 9.324, "loss/crossentropy": 2.4310184717178345, "loss/hidden": 3.7109375, "loss/jsd": 0.0, "loss/logits": 0.3465813100337982, "step": 1056 }, { "epoch": 0.066125, "grad_norm": 3.546875, "grad_norm_var": 0.055475870768229164, "learning_rate": 0.0001, "loss": 8.9162, "loss/crossentropy": 2.394460439682007, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3166656345129013, "step": 1058 }, { "epoch": 0.06625, "grad_norm": 3.703125, "grad_norm_var": 0.052855428059895834, "learning_rate": 0.0001, "loss": 9.4262, "loss/crossentropy": 2.5295032262802124, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3500918596982956, "step": 1060 }, { "epoch": 0.066375, "grad_norm": 4.15625, "grad_norm_var": 0.051920572916666664, "learning_rate": 0.0001, "loss": 9.3493, "loss/crossentropy": 2.2424673438072205, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3131362199783325, "step": 1062 }, { "epoch": 0.0665, "grad_norm": 3.65625, "grad_norm_var": 0.04260660807291667, "learning_rate": 0.0001, "loss": 9.1694, "loss/crossentropy": 2.303846836090088, "loss/hidden": 3.75, "loss/jsd": 0.0, "loss/logits": 0.3290853351354599, "step": 1064 }, { "epoch": 0.066625, "grad_norm": 3.734375, "grad_norm_var": 0.030060831705729166, "learning_rate": 0.0001, "loss": 9.1424, "loss/crossentropy": 2.417978286743164, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3248021900653839, "step": 1066 }, { "epoch": 0.06675, "grad_norm": 3.8125, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 9.2021, "loss/crossentropy": 2.1610294580459595, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.31622791290283203, "step": 1068 }, { "epoch": 0.066875, "grad_norm": 3.59375, "grad_norm_var": 0.029792277018229167, "learning_rate": 0.0001, "loss": 9.122, "loss/crossentropy": 2.254276156425476, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.29598017036914825, "step": 1070 }, { "epoch": 0.067, "grad_norm": 4.375, "grad_norm_var": 0.04895833333333333, "learning_rate": 0.0001, "loss": 9.5952, "loss/crossentropy": 2.485300302505493, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3379819989204407, "step": 1072 }, { "epoch": 0.067125, "grad_norm": 4.125, "grad_norm_var": 0.0529937744140625, "learning_rate": 0.0001, "loss": 9.3245, "loss/crossentropy": 2.337808847427368, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3292604982852936, "step": 1074 }, { "epoch": 0.06725, "grad_norm": 3.828125, "grad_norm_var": 0.05022786458333333, "learning_rate": 0.0001, "loss": 9.4473, "loss/crossentropy": 2.394913077354431, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.34427976608276367, "step": 1076 }, { "epoch": 0.067375, "grad_norm": 4.03125, "grad_norm_var": 0.05392252604166667, "learning_rate": 0.0001, "loss": 9.4114, "loss/crossentropy": 2.4923205375671387, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3431336283683777, "step": 1078 }, { "epoch": 0.0675, "grad_norm": 4.0, "grad_norm_var": 0.05250244140625, "learning_rate": 0.0001, "loss": 9.3676, "loss/crossentropy": 2.405529022216797, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.33384498953819275, "step": 1080 }, { "epoch": 0.067625, "grad_norm": 4.1875, "grad_norm_var": 0.06123758951822917, "learning_rate": 0.0001, "loss": 9.0372, "loss/crossentropy": 2.446845769882202, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.30565811693668365, "step": 1082 }, { "epoch": 0.06775, "grad_norm": 4.375, "grad_norm_var": 0.06928609212239584, "learning_rate": 0.0001, "loss": 9.4077, "loss/crossentropy": 2.481779932975769, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3120981603860855, "step": 1084 }, { "epoch": 0.067875, "grad_norm": 3.625, "grad_norm_var": 0.07662353515625, "learning_rate": 0.0001, "loss": 9.3184, "loss/crossentropy": 2.2164746522903442, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.29565760493278503, "step": 1086 }, { "epoch": 0.068, "grad_norm": 4.3125, "grad_norm_var": 0.0718170166015625, "learning_rate": 0.0001, "loss": 9.4184, "loss/crossentropy": 2.53956139087677, "loss/hidden": 3.65625, "loss/jsd": 0.0, "loss/logits": 0.34126120805740356, "step": 1088 }, { "epoch": 0.068125, "grad_norm": 3.75, "grad_norm_var": 0.06967671712239583, "learning_rate": 0.0001, "loss": 9.3542, "loss/crossentropy": 2.4937098026275635, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.32088300585746765, "step": 1090 }, { "epoch": 0.06825, "grad_norm": 3.828125, "grad_norm_var": 0.06809895833333333, "learning_rate": 0.0001, "loss": 9.1749, "loss/crossentropy": 2.387190341949463, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.33582253754138947, "step": 1092 }, { "epoch": 0.068375, "grad_norm": 3.890625, "grad_norm_var": 0.061747233072916664, "learning_rate": 0.0001, "loss": 9.4676, "loss/crossentropy": 2.626872181892395, "loss/hidden": 3.7421875, "loss/jsd": 0.0, "loss/logits": 0.38073016703128815, "step": 1094 }, { "epoch": 0.0685, "grad_norm": 4.21875, "grad_norm_var": 0.06539306640625, "learning_rate": 0.0001, "loss": 9.3993, "loss/crossentropy": 2.449720859527588, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.338313028216362, "step": 1096 }, { "epoch": 0.068625, "grad_norm": 3.84375, "grad_norm_var": 0.12389322916666666, "learning_rate": 0.0001, "loss": 9.3835, "loss/crossentropy": 2.2490886449813843, "loss/hidden": 3.7890625, "loss/jsd": 0.0, "loss/logits": 0.3486269563436508, "step": 1098 }, { "epoch": 0.06875, "grad_norm": 3.8125, "grad_norm_var": 0.11367899576822917, "learning_rate": 0.0001, "loss": 9.0693, "loss/crossentropy": 2.353515625, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3053932934999466, "step": 1100 }, { "epoch": 0.068875, "grad_norm": 3.953125, "grad_norm_var": 0.09433186848958333, "learning_rate": 0.0001, "loss": 9.2217, "loss/crossentropy": 2.1353343725204468, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31745678186416626, "step": 1102 }, { "epoch": 0.069, "grad_norm": 6.28125, "grad_norm_var": 0.4137980143229167, "learning_rate": 0.0001, "loss": 9.3661, "loss/crossentropy": 1.9615400433540344, "loss/hidden": 3.796875, "loss/jsd": 0.0, "loss/logits": 0.3933947682380676, "step": 1104 }, { "epoch": 0.069125, "grad_norm": 4.125, "grad_norm_var": 0.40084228515625, "learning_rate": 0.0001, "loss": 9.4, "loss/crossentropy": 2.4881348609924316, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.35493068397045135, "step": 1106 }, { "epoch": 0.06925, "grad_norm": 6.09375, "grad_norm_var": 0.6248372395833334, "learning_rate": 0.0001, "loss": 9.611, "loss/crossentropy": 2.47869610786438, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3342919647693634, "step": 1108 }, { "epoch": 0.069375, "grad_norm": 4.03125, "grad_norm_var": 0.628662109375, "learning_rate": 0.0001, "loss": 9.2084, "loss/crossentropy": 2.07807195186615, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.31260576844215393, "step": 1110 }, { "epoch": 0.0695, "grad_norm": 3.953125, "grad_norm_var": 0.6350260416666667, "learning_rate": 0.0001, "loss": 9.0765, "loss/crossentropy": 2.4429653882980347, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.34172505140304565, "step": 1112 }, { "epoch": 0.069625, "grad_norm": 3.75, "grad_norm_var": 0.6293253580729167, "learning_rate": 0.0001, "loss": 9.4565, "loss/crossentropy": 2.2236061096191406, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.3183263838291168, "step": 1114 }, { "epoch": 0.06975, "grad_norm": 4.03125, "grad_norm_var": 0.611962890625, "learning_rate": 0.0001, "loss": 9.4014, "loss/crossentropy": 2.490137457847595, "loss/hidden": 3.671875, "loss/jsd": 0.0, "loss/logits": 0.334211602807045, "step": 1116 }, { "epoch": 0.069875, "grad_norm": 3.671875, "grad_norm_var": 0.62593994140625, "learning_rate": 0.0001, "loss": 9.2132, "loss/crossentropy": 2.503127098083496, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3488759547472, "step": 1118 }, { "epoch": 0.07, "grad_norm": 4.0, "grad_norm_var": 0.38203125, "learning_rate": 0.0001, "loss": 9.0646, "loss/crossentropy": 2.588125228881836, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.32440805435180664, "step": 1120 }, { "epoch": 0.070125, "grad_norm": 5.1875, "grad_norm_var": 0.44537760416666666, "learning_rate": 0.0001, "loss": 9.6197, "loss/crossentropy": 2.4455296993255615, "loss/hidden": 3.71875, "loss/jsd": 0.0, "loss/logits": 0.33201858401298523, "step": 1122 }, { "epoch": 0.07025, "grad_norm": 3.953125, "grad_norm_var": 0.1257232666015625, "learning_rate": 0.0001, "loss": 9.3346, "loss/crossentropy": 2.5608623027801514, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3407406657934189, "step": 1124 }, { "epoch": 0.070375, "grad_norm": 3.9375, "grad_norm_var": 0.12796122233072918, "learning_rate": 0.0001, "loss": 9.4367, "loss/crossentropy": 2.4927167892456055, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3708173930644989, "step": 1126 }, { "epoch": 0.0705, "grad_norm": 3.796875, "grad_norm_var": 0.14039306640625, "learning_rate": 0.0001, "loss": 9.3054, "loss/crossentropy": 2.3557698726654053, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30249859392642975, "step": 1128 }, { "epoch": 0.070625, "grad_norm": 4.0, "grad_norm_var": 0.13619791666666667, "learning_rate": 0.0001, "loss": 9.3421, "loss/crossentropy": 2.2762213945388794, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.31596677005290985, "step": 1130 }, { "epoch": 0.07075, "grad_norm": 3.484375, "grad_norm_var": 0.15147196451822917, "learning_rate": 0.0001, "loss": 9.1813, "loss/crossentropy": 2.3421590328216553, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3331995755434036, "step": 1132 }, { "epoch": 0.070875, "grad_norm": 4.34375, "grad_norm_var": 0.87095947265625, "learning_rate": 0.0001, "loss": 9.3163, "loss/crossentropy": 2.263655185699463, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3255555182695389, "step": 1134 }, { "epoch": 0.071, "grad_norm": 3.9375, "grad_norm_var": 0.8688761393229166, "learning_rate": 0.0001, "loss": 9.2828, "loss/crossentropy": 2.5721757411956787, "loss/hidden": 3.6953125, "loss/jsd": 0.0, "loss/logits": 0.346492663025856, "step": 1136 }, { "epoch": 0.071125, "grad_norm": 3.96875, "grad_norm_var": 0.7981608072916667, "learning_rate": 0.0001, "loss": 9.1755, "loss/crossentropy": 2.3569506406784058, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3098555654287338, "step": 1138 }, { "epoch": 0.07125, "grad_norm": 3.671875, "grad_norm_var": 0.8078684488932292, "learning_rate": 0.0001, "loss": 9.3109, "loss/crossentropy": 2.5194830894470215, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.33717748522758484, "step": 1140 }, { "epoch": 0.071375, "grad_norm": 3.765625, "grad_norm_var": 0.8381011962890625, "learning_rate": 0.0001, "loss": 9.0862, "loss/crossentropy": 2.352966547012329, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31706100702285767, "step": 1142 }, { "epoch": 0.0715, "grad_norm": 4.03125, "grad_norm_var": 0.8229777018229166, "learning_rate": 0.0001, "loss": 9.0614, "loss/crossentropy": 2.36809766292572, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.30267640948295593, "step": 1144 }, { "epoch": 0.071625, "grad_norm": 3.765625, "grad_norm_var": 0.8300120035807291, "learning_rate": 0.0001, "loss": 9.2628, "loss/crossentropy": 2.2535144090652466, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3255711644887924, "step": 1146 }, { "epoch": 0.07175, "grad_norm": 4.25, "grad_norm_var": 0.8011027018229167, "learning_rate": 0.0001, "loss": 9.4743, "loss/crossentropy": 2.443315029144287, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3342190533876419, "step": 1148 }, { "epoch": 0.071875, "grad_norm": 3.578125, "grad_norm_var": 0.049860636393229164, "learning_rate": 0.0001, "loss": 9.0595, "loss/crossentropy": 2.3228014707565308, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3196340799331665, "step": 1150 }, { "epoch": 0.072, "grad_norm": 4.1875, "grad_norm_var": 0.05780843098958333, "learning_rate": 0.0001, "loss": 9.1367, "loss/crossentropy": 2.449334144592285, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3382084518671036, "step": 1152 }, { "epoch": 0.072125, "grad_norm": 4.40625, "grad_norm_var": 0.07054036458333333, "learning_rate": 0.0001, "loss": 9.4249, "loss/crossentropy": 2.2319095134735107, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3391086161136627, "step": 1154 }, { "epoch": 0.07225, "grad_norm": 4.4375, "grad_norm_var": 0.10032145182291667, "learning_rate": 0.0001, "loss": 9.2504, "loss/crossentropy": 2.5388150215148926, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3299136161804199, "step": 1156 }, { "epoch": 0.072375, "grad_norm": 3.453125, "grad_norm_var": 0.10137430826822917, "learning_rate": 0.0001, "loss": 8.8611, "loss/crossentropy": 2.317844271659851, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.28633375465869904, "step": 1158 }, { "epoch": 0.0725, "grad_norm": 3.828125, "grad_norm_var": 0.10498046875, "learning_rate": 0.0001, "loss": 9.2766, "loss/crossentropy": 2.5527602434158325, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.34664320945739746, "step": 1160 }, { "epoch": 0.072625, "grad_norm": 3.546875, "grad_norm_var": 0.11223856608072917, "learning_rate": 0.0001, "loss": 8.9242, "loss/crossentropy": 2.2007336616516113, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3022945523262024, "step": 1162 }, { "epoch": 0.07275, "grad_norm": 3.953125, "grad_norm_var": 0.1042388916015625, "learning_rate": 0.0001, "loss": 9.1566, "loss/crossentropy": 2.435725212097168, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2984570860862732, "step": 1164 }, { "epoch": 0.072875, "grad_norm": 3.765625, "grad_norm_var": 0.09881083170572917, "learning_rate": 0.0001, "loss": 9.3371, "loss/crossentropy": 2.3465352058410645, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.33409954607486725, "step": 1166 }, { "epoch": 0.073, "grad_norm": 3.65625, "grad_norm_var": 0.09781494140625, "learning_rate": 0.0001, "loss": 9.2003, "loss/crossentropy": 2.35371732711792, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3324984610080719, "step": 1168 }, { "epoch": 0.073125, "grad_norm": 3.9375, "grad_norm_var": 0.07768452962239583, "learning_rate": 0.0001, "loss": 9.2635, "loss/crossentropy": 2.619705319404602, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.33950985968112946, "step": 1170 }, { "epoch": 0.07325, "grad_norm": 3.515625, "grad_norm_var": 0.021922810872395834, "learning_rate": 0.0001, "loss": 9.1711, "loss/crossentropy": 2.089954376220703, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3136076331138611, "step": 1172 }, { "epoch": 0.073375, "grad_norm": 4.25, "grad_norm_var": 0.037474568684895834, "learning_rate": 0.0001, "loss": 9.357, "loss/crossentropy": 2.347720980644226, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3133264631032944, "step": 1174 }, { "epoch": 0.0735, "grad_norm": 3.734375, "grad_norm_var": 0.0374176025390625, "learning_rate": 0.0001, "loss": 9.0205, "loss/crossentropy": 2.1896666288375854, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.31842848658561707, "step": 1176 }, { "epoch": 0.073625, "grad_norm": 3.640625, "grad_norm_var": 0.035139973958333334, "learning_rate": 0.0001, "loss": 9.314, "loss/crossentropy": 2.5638844966888428, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3261349946260452, "step": 1178 }, { "epoch": 0.07375, "grad_norm": 3.703125, "grad_norm_var": 0.03573811848958333, "learning_rate": 0.0001, "loss": 9.0808, "loss/crossentropy": 2.2707711458206177, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3127121925354004, "step": 1180 }, { "epoch": 0.073875, "grad_norm": 3.734375, "grad_norm_var": 0.03573811848958333, "learning_rate": 0.0001, "loss": 9.2911, "loss/crossentropy": 2.4895883798599243, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3251212239265442, "step": 1182 }, { "epoch": 0.074, "grad_norm": 4.4375, "grad_norm_var": 0.06560872395833334, "learning_rate": 0.0001, "loss": 9.4185, "loss/crossentropy": 2.573387026786804, "loss/hidden": 3.8359375, "loss/jsd": 0.0, "loss/logits": 0.4014684557914734, "step": 1184 }, { "epoch": 0.074125, "grad_norm": 3.984375, "grad_norm_var": 0.06645406087239583, "learning_rate": 0.0001, "loss": 9.3622, "loss/crossentropy": 2.5062469244003296, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.3334721326828003, "step": 1186 }, { "epoch": 0.07425, "grad_norm": 3.578125, "grad_norm_var": 0.0643707275390625, "learning_rate": 0.0001, "loss": 8.7469, "loss/crossentropy": 2.2451168298721313, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2801739275455475, "step": 1188 }, { "epoch": 0.074375, "grad_norm": 3.84375, "grad_norm_var": 0.05558980305989583, "learning_rate": 0.0001, "loss": 9.3391, "loss/crossentropy": 2.5548393726348877, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.30821627378463745, "step": 1190 }, { "epoch": 0.0745, "grad_norm": 3.71875, "grad_norm_var": 0.05677083333333333, "learning_rate": 0.0001, "loss": 9.166, "loss/crossentropy": 2.4481059312820435, "loss/hidden": 3.7265625, "loss/jsd": 0.0, "loss/logits": 0.37263134121894836, "step": 1192 }, { "epoch": 0.074625, "grad_norm": 3.609375, "grad_norm_var": 0.05585835774739583, "learning_rate": 0.0001, "loss": 9.3454, "loss/crossentropy": 2.0716623067855835, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.305528461933136, "step": 1194 }, { "epoch": 0.07475, "grad_norm": 3.75, "grad_norm_var": 0.05478108723958333, "learning_rate": 0.0001, "loss": 9.1304, "loss/crossentropy": 2.4684784412384033, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.33831028640270233, "step": 1196 }, { "epoch": 0.074875, "grad_norm": 3.953125, "grad_norm_var": 0.05436909993489583, "learning_rate": 0.0001, "loss": 9.0607, "loss/crossentropy": 2.307690143585205, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3136231005191803, "step": 1198 }, { "epoch": 0.075, "grad_norm": 3.765625, "grad_norm_var": 0.03463134765625, "learning_rate": 0.0001, "loss": 9.526, "loss/crossentropy": 2.3969805240631104, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.33685119450092316, "step": 1200 }, { "epoch": 0.075125, "grad_norm": 4.125, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 9.3325, "loss/crossentropy": 2.4884891510009766, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.32767656445503235, "step": 1202 }, { "epoch": 0.07525, "grad_norm": 4.125, "grad_norm_var": 0.043440755208333334, "learning_rate": 0.0001, "loss": 9.1195, "loss/crossentropy": 2.2092502117156982, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.3058091998100281, "step": 1204 }, { "epoch": 0.075375, "grad_norm": 3.640625, "grad_norm_var": 0.04667867024739583, "learning_rate": 0.0001, "loss": 9.4251, "loss/crossentropy": 2.6787761449813843, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.3413826525211334, "step": 1206 }, { "epoch": 0.0755, "grad_norm": 3.90625, "grad_norm_var": 0.05217183430989583, "learning_rate": 0.0001, "loss": 8.9467, "loss/crossentropy": 2.5071334838867188, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3084910959005356, "step": 1208 }, { "epoch": 0.075625, "grad_norm": 3.421875, "grad_norm_var": 0.06331380208333333, "learning_rate": 0.0001, "loss": 9.3112, "loss/crossentropy": 2.4085506200790405, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31537193059921265, "step": 1210 }, { "epoch": 0.07575, "grad_norm": 4.125, "grad_norm_var": 0.06617838541666667, "learning_rate": 0.0001, "loss": 9.025, "loss/crossentropy": 2.370519518852234, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3043440580368042, "step": 1212 }, { "epoch": 0.075875, "grad_norm": 3.484375, "grad_norm_var": 0.081591796875, "learning_rate": 0.0001, "loss": 8.7902, "loss/crossentropy": 2.093766450881958, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2689971700310707, "step": 1214 }, { "epoch": 0.076, "grad_norm": 3.75, "grad_norm_var": 0.068896484375, "learning_rate": 0.0001, "loss": 9.2334, "loss/crossentropy": 2.459157109260559, "loss/hidden": 3.703125, "loss/jsd": 0.0, "loss/logits": 0.3207740783691406, "step": 1216 }, { "epoch": 0.076125, "grad_norm": 3.390625, "grad_norm_var": 0.06848551432291666, "learning_rate": 0.0001, "loss": 9.2117, "loss/crossentropy": 2.3601810932159424, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3157646358013153, "step": 1218 }, { "epoch": 0.07625, "grad_norm": 3.5625, "grad_norm_var": 0.03772379557291667, "learning_rate": 0.0001, "loss": 9.262, "loss/crossentropy": 2.4909332990646362, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.30380991101264954, "step": 1220 }, { "epoch": 0.076375, "grad_norm": 3.765625, "grad_norm_var": 0.0382720947265625, "learning_rate": 0.0001, "loss": 9.1849, "loss/crossentropy": 2.6187103986740112, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.3528241813182831, "step": 1222 }, { "epoch": 0.0765, "grad_norm": 3.53125, "grad_norm_var": 0.0363677978515625, "learning_rate": 0.0001, "loss": 8.9057, "loss/crossentropy": 2.1186509132385254, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.32527799904346466, "step": 1224 }, { "epoch": 0.076625, "grad_norm": 3.671875, "grad_norm_var": 0.032160441080729164, "learning_rate": 0.0001, "loss": 9.3395, "loss/crossentropy": 2.339973211288452, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31841570138931274, "step": 1226 }, { "epoch": 0.07675, "grad_norm": 4.625, "grad_norm_var": 0.0916015625, "learning_rate": 0.0001, "loss": 9.3554, "loss/crossentropy": 2.5320791006088257, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.316191166639328, "step": 1228 }, { "epoch": 0.076875, "grad_norm": 3.671875, "grad_norm_var": 0.0856353759765625, "learning_rate": 0.0001, "loss": 8.8309, "loss/crossentropy": 2.2061994075775146, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3057589828968048, "step": 1230 }, { "epoch": 0.077, "grad_norm": 3.40625, "grad_norm_var": 0.10224202473958334, "learning_rate": 0.0001, "loss": 8.9979, "loss/crossentropy": 2.3670225143432617, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30261367559432983, "step": 1232 }, { "epoch": 0.077125, "grad_norm": 3.46875, "grad_norm_var": 0.09973958333333334, "learning_rate": 0.0001, "loss": 9.1629, "loss/crossentropy": 2.434694290161133, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3391081690788269, "step": 1234 }, { "epoch": 0.07725, "grad_norm": 3.859375, "grad_norm_var": 0.10047098795572916, "learning_rate": 0.0001, "loss": 9.0251, "loss/crossentropy": 2.3634976148605347, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3024301528930664, "step": 1236 }, { "epoch": 0.077375, "grad_norm": 3.734375, "grad_norm_var": 0.17688802083333333, "learning_rate": 0.0001, "loss": 9.3899, "loss/crossentropy": 2.307217240333557, "loss/hidden": 3.6875, "loss/jsd": 0.0, "loss/logits": 0.3346613794565201, "step": 1238 }, { "epoch": 0.0775, "grad_norm": 3.75, "grad_norm_var": 0.16782124837239584, "learning_rate": 0.0001, "loss": 9.0753, "loss/crossentropy": 2.18193256855011, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3203928619623184, "step": 1240 }, { "epoch": 0.077625, "grad_norm": 3.53125, "grad_norm_var": 0.17125244140625, "learning_rate": 0.0001, "loss": 8.8909, "loss/crossentropy": 2.4916880130767822, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.302780881524086, "step": 1242 }, { "epoch": 0.07775, "grad_norm": 12.9375, "grad_norm_var": 6.389351399739583, "learning_rate": 0.0001, "loss": 9.83, "loss/crossentropy": 2.3125792741775513, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.33007092773914337, "step": 1244 }, { "epoch": 0.077875, "grad_norm": 4.625, "grad_norm_var": 46.981346638997394, "learning_rate": 0.0001, "loss": 9.6705, "loss/crossentropy": 2.3381155729293823, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.4021202623844147, "step": 1246 }, { "epoch": 0.078, "grad_norm": 3.84375, "grad_norm_var": 46.28162333170573, "learning_rate": 0.0001, "loss": 9.3695, "loss/crossentropy": 2.440253973007202, "loss/hidden": 3.6796875, "loss/jsd": 0.0, "loss/logits": 0.3352806717157364, "step": 1248 }, { "epoch": 0.078125, "grad_norm": 3.625, "grad_norm_var": 46.24781494140625, "learning_rate": 0.0001, "loss": 9.1471, "loss/crossentropy": 2.3579354286193848, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.30118629336357117, "step": 1250 }, { "epoch": 0.07825, "grad_norm": 3.578125, "grad_norm_var": 46.2870595296224, "learning_rate": 0.0001, "loss": 9.1803, "loss/crossentropy": 2.203198552131653, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.32657940685749054, "step": 1252 }, { "epoch": 0.078375, "grad_norm": 3.765625, "grad_norm_var": 46.582112630208336, "learning_rate": 0.0001, "loss": 9.1639, "loss/crossentropy": 2.3819659948349, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3207147717475891, "step": 1254 }, { "epoch": 0.0785, "grad_norm": 3.71875, "grad_norm_var": 46.60387369791667, "learning_rate": 0.0001, "loss": 9.3193, "loss/crossentropy": 2.3398979902267456, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.31140096485614777, "step": 1256 }, { "epoch": 0.078625, "grad_norm": 3.75, "grad_norm_var": 46.48379618326823, "learning_rate": 0.0001, "loss": 9.2251, "loss/crossentropy": 2.349861264228821, "loss/hidden": 3.6640625, "loss/jsd": 0.0, "loss/logits": 0.30966590344905853, "step": 1258 }, { "epoch": 0.07875, "grad_norm": 3.59375, "grad_norm_var": 43.54053446451823, "learning_rate": 0.0001, "loss": 9.0854, "loss/crossentropy": 2.361837148666382, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3148733079433441, "step": 1260 }, { "epoch": 0.078875, "grad_norm": 3.6875, "grad_norm_var": 0.15891825358072917, "learning_rate": 0.0001, "loss": 9.0902, "loss/crossentropy": 2.3919564485549927, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3144863545894623, "step": 1262 }, { "epoch": 0.079, "grad_norm": 4.09375, "grad_norm_var": 0.0195709228515625, "learning_rate": 0.0001, "loss": 9.1359, "loss/crossentropy": 2.456905484199524, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.34087100625038147, "step": 1264 }, { "epoch": 0.079125, "grad_norm": 4.09375, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 9.2885, "loss/crossentropy": 2.256826400756836, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30895647406578064, "step": 1266 }, { "epoch": 0.07925, "grad_norm": 3.765625, "grad_norm_var": 0.21834208170572916, "learning_rate": 0.0001, "loss": 9.2081, "loss/crossentropy": 2.0980992913246155, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.33980080485343933, "step": 1268 }, { "epoch": 0.079375, "grad_norm": 3.828125, "grad_norm_var": 0.22421875, "learning_rate": 0.0001, "loss": 8.9036, "loss/crossentropy": 2.381914973258972, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3259545713663101, "step": 1270 }, { "epoch": 0.0795, "grad_norm": 4.0625, "grad_norm_var": 0.22944234212239584, "learning_rate": 0.0001, "loss": 9.1066, "loss/crossentropy": 2.3197826147079468, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3119906038045883, "step": 1272 }, { "epoch": 0.079625, "grad_norm": 4.25, "grad_norm_var": 0.23908589680989584, "learning_rate": 0.0001, "loss": 9.189, "loss/crossentropy": 2.385148286819458, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.324929416179657, "step": 1274 }, { "epoch": 0.07975, "grad_norm": 3.84375, "grad_norm_var": 0.2331207275390625, "learning_rate": 0.0001, "loss": 9.428, "loss/crossentropy": 2.334364414215088, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.32778996229171753, "step": 1276 }, { "epoch": 0.079875, "grad_norm": 3.421875, "grad_norm_var": 0.24498291015625, "learning_rate": 0.0001, "loss": 8.8294, "loss/crossentropy": 2.047423243522644, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2832287549972534, "step": 1278 }, { "epoch": 0.08, "grad_norm": 3.78125, "grad_norm_var": 0.23700764973958333, "learning_rate": 0.0001, "loss": 8.8972, "loss/crossentropy": 2.3135321140289307, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.2986491918563843, "step": 1280 }, { "epoch": 0.080125, "grad_norm": 3.609375, "grad_norm_var": 0.24149983723958332, "learning_rate": 0.0001, "loss": 9.1365, "loss/crossentropy": 2.4352437257766724, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3260779529809952, "step": 1282 }, { "epoch": 0.08025, "grad_norm": 3.4375, "grad_norm_var": 0.06049702962239583, "learning_rate": 0.0001, "loss": 8.73, "loss/crossentropy": 2.3086354732513428, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3060029596090317, "step": 1284 }, { "epoch": 0.080375, "grad_norm": 3.796875, "grad_norm_var": 0.05864156087239583, "learning_rate": 0.0001, "loss": 8.9689, "loss/crossentropy": 2.385580539703369, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31818532943725586, "step": 1286 }, { "epoch": 0.0805, "grad_norm": 3.46875, "grad_norm_var": 0.056982421875, "learning_rate": 0.0001, "loss": 9.0075, "loss/crossentropy": 2.6208995580673218, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.3344912976026535, "step": 1288 }, { "epoch": 0.080625, "grad_norm": 3.578125, "grad_norm_var": 0.039286295572916664, "learning_rate": 0.0001, "loss": 8.9292, "loss/crossentropy": 2.2690337896347046, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3385104089975357, "step": 1290 }, { "epoch": 0.08075, "grad_norm": 3.5, "grad_norm_var": 0.02431640625, "learning_rate": 0.0001, "loss": 8.6751, "loss/crossentropy": 2.236150622367859, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3237984627485275, "step": 1292 }, { "epoch": 0.080875, "grad_norm": 4.09375, "grad_norm_var": 0.03980712890625, "learning_rate": 0.0001, "loss": 9.0435, "loss/crossentropy": 2.26598197221756, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.31261470913887024, "step": 1294 }, { "epoch": 0.081, "grad_norm": 3.84375, "grad_norm_var": 0.039305623372395834, "learning_rate": 0.0001, "loss": 9.2676, "loss/crossentropy": 2.382360100746155, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.31899434328079224, "step": 1296 }, { "epoch": 0.081125, "grad_norm": 3.78125, "grad_norm_var": 0.041991170247395834, "learning_rate": 0.0001, "loss": 9.036, "loss/crossentropy": 2.253870368003845, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3281244486570358, "step": 1298 }, { "epoch": 0.08125, "grad_norm": 3.640625, "grad_norm_var": 0.03935546875, "learning_rate": 0.0001, "loss": 8.9191, "loss/crossentropy": 2.225800395011902, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32163363695144653, "step": 1300 }, { "epoch": 0.081375, "grad_norm": 3.78125, "grad_norm_var": 0.036844889322916664, "learning_rate": 0.0001, "loss": 9.0865, "loss/crossentropy": 2.4340925216674805, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3115888386964798, "step": 1302 }, { "epoch": 0.0815, "grad_norm": 3.8125, "grad_norm_var": 0.03950907389322917, "learning_rate": 0.0001, "loss": 9.1363, "loss/crossentropy": 2.3242989778518677, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.31321677565574646, "step": 1304 }, { "epoch": 0.081625, "grad_norm": 3.453125, "grad_norm_var": 0.04324544270833333, "learning_rate": 0.0001, "loss": 8.9641, "loss/crossentropy": 2.333641767501831, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3083188980817795, "step": 1306 }, { "epoch": 0.08175, "grad_norm": 3.8125, "grad_norm_var": 0.041552734375, "learning_rate": 0.0001, "loss": 8.829, "loss/crossentropy": 2.2750980854034424, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.30787941813468933, "step": 1308 }, { "epoch": 0.081875, "grad_norm": 3.59375, "grad_norm_var": 0.0253326416015625, "learning_rate": 0.0001, "loss": 8.8402, "loss/crossentropy": 2.2719295024871826, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2763071060180664, "step": 1310 }, { "epoch": 0.082, "grad_norm": 3.65625, "grad_norm_var": 0.021533203125, "learning_rate": 0.0001, "loss": 9.0406, "loss/crossentropy": 2.6696739196777344, "loss/hidden": 3.6328125, "loss/jsd": 0.0, "loss/logits": 0.32642635703086853, "step": 1312 }, { "epoch": 0.082125, "grad_norm": 3.40625, "grad_norm_var": 0.023346964518229166, "learning_rate": 0.0001, "loss": 8.929, "loss/crossentropy": 2.4896806478500366, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.32756727933883667, "step": 1314 }, { "epoch": 0.08225, "grad_norm": 3.78125, "grad_norm_var": 0.025617472330729165, "learning_rate": 0.0001, "loss": 9.2108, "loss/crossentropy": 2.384082555770874, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.31247396767139435, "step": 1316 }, { "epoch": 0.082375, "grad_norm": 3.71875, "grad_norm_var": 0.02974853515625, "learning_rate": 0.0001, "loss": 9.2751, "loss/crossentropy": 2.357794165611267, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31041818857192993, "step": 1318 }, { "epoch": 0.0825, "grad_norm": 3.90625, "grad_norm_var": 0.028645833333333332, "learning_rate": 0.0001, "loss": 8.9023, "loss/crossentropy": 2.4072389602661133, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.33170604705810547, "step": 1320 }, { "epoch": 0.082625, "grad_norm": 3.53125, "grad_norm_var": 0.034357706705729164, "learning_rate": 0.0001, "loss": 8.9485, "loss/crossentropy": 2.239370107650757, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.29322461783885956, "step": 1322 }, { "epoch": 0.08275, "grad_norm": 3.546875, "grad_norm_var": 0.03426005045572917, "learning_rate": 0.0001, "loss": 9.035, "loss/crossentropy": 2.5135509967803955, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3100414276123047, "step": 1324 }, { "epoch": 0.082875, "grad_norm": 3.953125, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 8.9812, "loss/crossentropy": 2.1832433342933655, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3234194219112396, "step": 1326 }, { "epoch": 0.083, "grad_norm": 3.546875, "grad_norm_var": 0.04663798014322917, "learning_rate": 0.0001, "loss": 8.8827, "loss/crossentropy": 2.4349963665008545, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30108973383903503, "step": 1328 }, { "epoch": 0.083125, "grad_norm": 3.6875, "grad_norm_var": 0.042023722330729166, "learning_rate": 0.0001, "loss": 9.2632, "loss/crossentropy": 2.4420056343078613, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3511925935745239, "step": 1330 }, { "epoch": 0.08325, "grad_norm": 3.484375, "grad_norm_var": 0.04169820149739583, "learning_rate": 0.0001, "loss": 8.6792, "loss/crossentropy": 2.1621901988983154, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2893373519182205, "step": 1332 }, { "epoch": 0.083375, "grad_norm": 3.59375, "grad_norm_var": 0.036554972330729164, "learning_rate": 0.0001, "loss": 8.8182, "loss/crossentropy": 2.2495174407958984, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29064077138900757, "step": 1334 }, { "epoch": 0.0835, "grad_norm": 3.84375, "grad_norm_var": 0.036896769205729166, "learning_rate": 0.0001, "loss": 8.9092, "loss/crossentropy": 2.2303038835525513, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3096499443054199, "step": 1336 }, { "epoch": 0.083625, "grad_norm": 4.53125, "grad_norm_var": 0.08069661458333334, "learning_rate": 0.0001, "loss": 9.1106, "loss/crossentropy": 2.56923508644104, "loss/hidden": 3.625, "loss/jsd": 0.0, "loss/logits": 0.31639058887958527, "step": 1338 }, { "epoch": 0.08375, "grad_norm": 3.484375, "grad_norm_var": 0.0837066650390625, "learning_rate": 0.0001, "loss": 8.833, "loss/crossentropy": 2.2457833290100098, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2985825538635254, "step": 1340 }, { "epoch": 0.083875, "grad_norm": 3.359375, "grad_norm_var": 0.08302408854166667, "learning_rate": 0.0001, "loss": 8.9599, "loss/crossentropy": 2.405690312385559, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30246302485466003, "step": 1342 }, { "epoch": 0.084, "grad_norm": 3.65625, "grad_norm_var": 0.08238525390625, "learning_rate": 0.0001, "loss": 8.9099, "loss/crossentropy": 2.360277771949768, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3205329477787018, "step": 1344 }, { "epoch": 0.084125, "grad_norm": 3.578125, "grad_norm_var": 0.08232421875, "learning_rate": 0.0001, "loss": 9.0251, "loss/crossentropy": 2.4718183279037476, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.322445809841156, "step": 1346 }, { "epoch": 0.08425, "grad_norm": 3.78125, "grad_norm_var": 0.08196512858072917, "learning_rate": 0.0001, "loss": 8.9674, "loss/crossentropy": 2.1336525678634644, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2891063690185547, "step": 1348 }, { "epoch": 0.084375, "grad_norm": 3.671875, "grad_norm_var": 0.08079020182291667, "learning_rate": 0.0001, "loss": 8.7962, "loss/crossentropy": 2.443579316139221, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.32270699739456177, "step": 1350 }, { "epoch": 0.0845, "grad_norm": 3.4375, "grad_norm_var": 0.07742513020833333, "learning_rate": 0.0001, "loss": 8.8363, "loss/crossentropy": 2.47275173664093, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31648723781108856, "step": 1352 }, { "epoch": 0.084625, "grad_norm": 3.5625, "grad_norm_var": 0.024039713541666667, "learning_rate": 0.0001, "loss": 8.7411, "loss/crossentropy": 2.4415959119796753, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.286969318985939, "step": 1354 }, { "epoch": 0.08475, "grad_norm": 3.71875, "grad_norm_var": 0.018973795572916667, "learning_rate": 0.0001, "loss": 8.8614, "loss/crossentropy": 2.2840429544448853, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.2909524738788605, "step": 1356 }, { "epoch": 0.084875, "grad_norm": 3.609375, "grad_norm_var": 0.016942342122395832, "learning_rate": 0.0001, "loss": 8.7727, "loss/crossentropy": 2.0146145820617676, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.30589647591114044, "step": 1358 }, { "epoch": 0.085, "grad_norm": 3.921875, "grad_norm_var": 0.021906534830729168, "learning_rate": 0.0001, "loss": 9.0598, "loss/crossentropy": 2.3537509441375732, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.32555970549583435, "step": 1360 }, { "epoch": 0.085125, "grad_norm": 4.25, "grad_norm_var": 0.048924763997395836, "learning_rate": 0.0001, "loss": 9.1544, "loss/crossentropy": 2.626122832298279, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3384062200784683, "step": 1362 }, { "epoch": 0.08525, "grad_norm": 3.53125, "grad_norm_var": 0.04871419270833333, "learning_rate": 0.0001, "loss": 8.8179, "loss/crossentropy": 2.288867473602295, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3119974285364151, "step": 1364 }, { "epoch": 0.085375, "grad_norm": 4.6875, "grad_norm_var": 0.12111002604166667, "learning_rate": 0.0001, "loss": 9.0527, "loss/crossentropy": 2.3565372228622437, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.309598833322525, "step": 1366 }, { "epoch": 0.0855, "grad_norm": 3.65625, "grad_norm_var": 0.11802978515625, "learning_rate": 0.0001, "loss": 9.0603, "loss/crossentropy": 2.361824154853821, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.32866616547107697, "step": 1368 }, { "epoch": 0.085625, "grad_norm": 3.78125, "grad_norm_var": 0.1112457275390625, "learning_rate": 0.0001, "loss": 9.1643, "loss/crossentropy": 2.2250397205352783, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3138546347618103, "step": 1370 }, { "epoch": 0.08575, "grad_norm": 3.90625, "grad_norm_var": 0.11529541015625, "learning_rate": 0.0001, "loss": 9.2611, "loss/crossentropy": 2.5242254734039307, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.33204028010368347, "step": 1372 }, { "epoch": 0.085875, "grad_norm": 3.5, "grad_norm_var": 0.11106669108072917, "learning_rate": 0.0001, "loss": 8.9304, "loss/crossentropy": 2.1475495100021362, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3122798055410385, "step": 1374 }, { "epoch": 0.086, "grad_norm": 3.609375, "grad_norm_var": 0.11819661458333333, "learning_rate": 0.0001, "loss": 8.9845, "loss/crossentropy": 2.3691786527633667, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.33019253611564636, "step": 1376 }, { "epoch": 0.086125, "grad_norm": 3.875, "grad_norm_var": 0.09970703125, "learning_rate": 0.0001, "loss": 8.8909, "loss/crossentropy": 2.491925835609436, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.31182408332824707, "step": 1378 }, { "epoch": 0.08625, "grad_norm": 4.03125, "grad_norm_var": 0.0946929931640625, "learning_rate": 0.0001, "loss": 9.0368, "loss/crossentropy": 2.531667709350586, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.31533099710941315, "step": 1380 }, { "epoch": 0.086375, "grad_norm": 3.484375, "grad_norm_var": 0.0412750244140625, "learning_rate": 0.0001, "loss": 8.9587, "loss/crossentropy": 2.4312853813171387, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3191726803779602, "step": 1382 }, { "epoch": 0.0865, "grad_norm": 3.53125, "grad_norm_var": 0.037007649739583336, "learning_rate": 0.0001, "loss": 8.9345, "loss/crossentropy": 2.3028546571731567, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29265616834163666, "step": 1384 }, { "epoch": 0.086625, "grad_norm": 3.84375, "grad_norm_var": 0.03870442708333333, "learning_rate": 0.0001, "loss": 9.0492, "loss/crossentropy": 2.363860607147217, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.3737258017063141, "step": 1386 }, { "epoch": 0.08675, "grad_norm": 3.625, "grad_norm_var": 0.031245930989583334, "learning_rate": 0.0001, "loss": 8.9288, "loss/crossentropy": 2.317563533782959, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3253827840089798, "step": 1388 }, { "epoch": 0.086875, "grad_norm": 3.734375, "grad_norm_var": 0.027018229166666668, "learning_rate": 0.0001, "loss": 8.9841, "loss/crossentropy": 2.1043782234191895, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.32640159130096436, "step": 1390 }, { "epoch": 0.087, "grad_norm": 3.8125, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 9.0368, "loss/crossentropy": 2.4259365797042847, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.30903398990631104, "step": 1392 }, { "epoch": 0.087125, "grad_norm": 3.828125, "grad_norm_var": 0.019205729166666668, "learning_rate": 0.0001, "loss": 9.0183, "loss/crossentropy": 2.4740456342697144, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3079090714454651, "step": 1394 }, { "epoch": 0.08725, "grad_norm": 4.125, "grad_norm_var": 0.023177083333333334, "learning_rate": 0.0001, "loss": 9.0947, "loss/crossentropy": 2.3828792572021484, "loss/hidden": 3.640625, "loss/jsd": 0.0, "loss/logits": 0.3404559940099716, "step": 1396 }, { "epoch": 0.087375, "grad_norm": 4.21875, "grad_norm_var": 0.03254292805989583, "learning_rate": 0.0001, "loss": 8.9672, "loss/crossentropy": 2.445147395133972, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3140483498573303, "step": 1398 }, { "epoch": 0.0875, "grad_norm": 3.40625, "grad_norm_var": 0.041792805989583334, "learning_rate": 0.0001, "loss": 8.9434, "loss/crossentropy": 2.4013755321502686, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.31101924180984497, "step": 1400 }, { "epoch": 0.087625, "grad_norm": 3.375, "grad_norm_var": 0.05074462890625, "learning_rate": 0.0001, "loss": 8.81, "loss/crossentropy": 2.1755102276802063, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2810830771923065, "step": 1402 }, { "epoch": 0.08775, "grad_norm": 3.5, "grad_norm_var": 0.06453450520833333, "learning_rate": 0.0001, "loss": 8.95, "loss/crossentropy": 2.576531767845154, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.33312951028347015, "step": 1404 }, { "epoch": 0.087875, "grad_norm": 3.515625, "grad_norm_var": 0.06685791015625, "learning_rate": 0.0001, "loss": 9.0711, "loss/crossentropy": 2.418603301048279, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.327798455953598, "step": 1406 }, { "epoch": 0.088, "grad_norm": 3.609375, "grad_norm_var": 0.0658355712890625, "learning_rate": 0.0001, "loss": 9.2586, "loss/crossentropy": 2.6568247079849243, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3096371293067932, "step": 1408 }, { "epoch": 0.088125, "grad_norm": 3.8125, "grad_norm_var": 0.0717437744140625, "learning_rate": 0.0001, "loss": 8.864, "loss/crossentropy": 2.04031902551651, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3162307143211365, "step": 1410 }, { "epoch": 0.08825, "grad_norm": 4.09375, "grad_norm_var": 0.06897786458333334, "learning_rate": 0.0001, "loss": 9.1645, "loss/crossentropy": 2.5326061248779297, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.33072784543037415, "step": 1412 }, { "epoch": 0.088375, "grad_norm": 3.640625, "grad_norm_var": 0.04348551432291667, "learning_rate": 0.0001, "loss": 9.1564, "loss/crossentropy": 2.619969367980957, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32924045622348785, "step": 1414 }, { "epoch": 0.0885, "grad_norm": 3.984375, "grad_norm_var": 0.051488240559895836, "learning_rate": 0.0001, "loss": 9.0071, "loss/crossentropy": 2.535549759864807, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3203812837600708, "step": 1416 }, { "epoch": 0.088625, "grad_norm": 3.765625, "grad_norm_var": 0.05335286458333333, "learning_rate": 0.0001, "loss": 9.0727, "loss/crossentropy": 2.4984034299850464, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.30122537910938263, "step": 1418 }, { "epoch": 0.08875, "grad_norm": 3.46875, "grad_norm_var": 0.04179585774739583, "learning_rate": 0.0001, "loss": 8.8205, "loss/crossentropy": 2.3398995399475098, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.30119411647319794, "step": 1420 }, { "epoch": 0.088875, "grad_norm": 3.59375, "grad_norm_var": 0.03720296223958333, "learning_rate": 0.0001, "loss": 8.8969, "loss/crossentropy": 2.265345811843872, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2919570058584213, "step": 1422 }, { "epoch": 0.089, "grad_norm": 3.625, "grad_norm_var": 0.03683268229166667, "learning_rate": 0.0001, "loss": 8.9852, "loss/crossentropy": 2.3537477254867554, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31161582469940186, "step": 1424 }, { "epoch": 0.089125, "grad_norm": 3.46875, "grad_norm_var": 0.029442342122395833, "learning_rate": 0.0001, "loss": 8.9986, "loss/crossentropy": 2.4521849155426025, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.29275740683078766, "step": 1426 }, { "epoch": 0.08925, "grad_norm": 3.71875, "grad_norm_var": 0.021434529622395834, "learning_rate": 0.0001, "loss": 8.9503, "loss/crossentropy": 2.2590759992599487, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.3151112347841263, "step": 1428 }, { "epoch": 0.089375, "grad_norm": 3.625, "grad_norm_var": 0.02408447265625, "learning_rate": 0.0001, "loss": 9.0334, "loss/crossentropy": 2.431664228439331, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3339419513940811, "step": 1430 }, { "epoch": 0.0895, "grad_norm": 3.796875, "grad_norm_var": 0.021467081705729165, "learning_rate": 0.0001, "loss": 9.0671, "loss/crossentropy": 2.2652631998062134, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3316595107316971, "step": 1432 }, { "epoch": 0.089625, "grad_norm": 3.78125, "grad_norm_var": 0.028034464518229166, "learning_rate": 0.0001, "loss": 8.8634, "loss/crossentropy": 2.3894190788269043, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30771124362945557, "step": 1434 }, { "epoch": 0.08975, "grad_norm": 3.4375, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 8.8962, "loss/crossentropy": 2.174624800682068, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2859848737716675, "step": 1436 }, { "epoch": 0.089875, "grad_norm": 3.703125, "grad_norm_var": 0.0318267822265625, "learning_rate": 0.0001, "loss": 9.0001, "loss/crossentropy": 2.4872666597366333, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29487521946430206, "step": 1438 }, { "epoch": 0.09, "grad_norm": 3.703125, "grad_norm_var": 0.03215230305989583, "learning_rate": 0.0001, "loss": 8.8425, "loss/crossentropy": 2.276022791862488, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28920166194438934, "step": 1440 }, { "epoch": 0.090125, "grad_norm": 3.421875, "grad_norm_var": 0.03479715983072917, "learning_rate": 0.0001, "loss": 8.966, "loss/crossentropy": 2.4065338373184204, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.2963782846927643, "step": 1442 }, { "epoch": 0.09025, "grad_norm": 3.359375, "grad_norm_var": 0.041243489583333334, "learning_rate": 0.0001, "loss": 8.9235, "loss/crossentropy": 2.52418851852417, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3200630098581314, "step": 1444 }, { "epoch": 0.090375, "grad_norm": 3.296875, "grad_norm_var": 0.04114176432291667, "learning_rate": 0.0001, "loss": 8.8564, "loss/crossentropy": 2.454889178276062, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3129142075777054, "step": 1446 }, { "epoch": 0.0905, "grad_norm": 3.75, "grad_norm_var": 0.039839680989583334, "learning_rate": 0.0001, "loss": 8.903, "loss/crossentropy": 2.3371113538742065, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.294046014547348, "step": 1448 }, { "epoch": 0.090625, "grad_norm": 4.1875, "grad_norm_var": 0.052643839518229166, "learning_rate": 0.0001, "loss": 8.8963, "loss/crossentropy": 2.4554747343063354, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2924615442752838, "step": 1450 }, { "epoch": 0.09075, "grad_norm": 3.46875, "grad_norm_var": 0.0498687744140625, "learning_rate": 0.0001, "loss": 8.9145, "loss/crossentropy": 2.2903236150741577, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.2915424406528473, "step": 1452 }, { "epoch": 0.090875, "grad_norm": 3.5, "grad_norm_var": 0.05797119140625, "learning_rate": 0.0001, "loss": 8.8917, "loss/crossentropy": 2.3047916889190674, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.29666368663311005, "step": 1454 }, { "epoch": 0.091, "grad_norm": 3.578125, "grad_norm_var": 0.055517578125, "learning_rate": 0.0001, "loss": 8.815, "loss/crossentropy": 2.681693911552429, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3014596551656723, "step": 1456 }, { "epoch": 0.091125, "grad_norm": 3.90625, "grad_norm_var": 0.8339914957682292, "learning_rate": 0.0001, "loss": 9.1305, "loss/crossentropy": 2.4676531553268433, "loss/hidden": 3.7734375, "loss/jsd": 0.0, "loss/logits": 0.37058068811893463, "step": 1458 }, { "epoch": 0.09125, "grad_norm": 3.5625, "grad_norm_var": 0.8101847330729167, "learning_rate": 0.0001, "loss": 8.9791, "loss/crossentropy": 2.41966712474823, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31916917860507965, "step": 1460 }, { "epoch": 0.091375, "grad_norm": 4.09375, "grad_norm_var": 0.7877349853515625, "learning_rate": 0.0001, "loss": 8.874, "loss/crossentropy": 2.092151641845703, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3130563199520111, "step": 1462 }, { "epoch": 0.0915, "grad_norm": 3.46875, "grad_norm_var": 0.8086100260416667, "learning_rate": 0.0001, "loss": 8.8506, "loss/crossentropy": 2.3982959985733032, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3187362104654312, "step": 1464 }, { "epoch": 0.091625, "grad_norm": 3.96875, "grad_norm_var": 0.8220377604166667, "learning_rate": 0.0001, "loss": 8.9897, "loss/crossentropy": 2.554791212081909, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3084287494421005, "step": 1466 }, { "epoch": 0.09175, "grad_norm": 3.40625, "grad_norm_var": 0.8275065104166667, "learning_rate": 0.0001, "loss": 8.8077, "loss/crossentropy": 2.230105757713318, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.30672188103199005, "step": 1468 }, { "epoch": 0.091875, "grad_norm": 3.84375, "grad_norm_var": 0.81285400390625, "learning_rate": 0.0001, "loss": 8.7891, "loss/crossentropy": 2.3420634269714355, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.28118696808815, "step": 1470 }, { "epoch": 0.092, "grad_norm": 3.609375, "grad_norm_var": 0.8321116129557292, "learning_rate": 0.0001, "loss": 8.9525, "loss/crossentropy": 2.473548173904419, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.30480530858039856, "step": 1472 }, { "epoch": 0.092125, "grad_norm": 3.5625, "grad_norm_var": 0.08126627604166667, "learning_rate": 0.0001, "loss": 8.6078, "loss/crossentropy": 2.331430673599243, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28144779801368713, "step": 1474 }, { "epoch": 0.09225, "grad_norm": 3.640625, "grad_norm_var": 0.07432352701822917, "learning_rate": 0.0001, "loss": 8.9628, "loss/crossentropy": 2.106496810913086, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.29021361470222473, "step": 1476 }, { "epoch": 0.092375, "grad_norm": 3.484375, "grad_norm_var": 0.05628153483072917, "learning_rate": 0.0001, "loss": 8.9618, "loss/crossentropy": 2.41057550907135, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3081541657447815, "step": 1478 }, { "epoch": 0.0925, "grad_norm": 3.703125, "grad_norm_var": 0.05603739420572917, "learning_rate": 0.0001, "loss": 8.9632, "loss/crossentropy": 2.481987476348877, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2695133462548256, "step": 1480 }, { "epoch": 0.092625, "grad_norm": 4.6875, "grad_norm_var": 0.16575520833333332, "learning_rate": 0.0001, "loss": 8.9132, "loss/crossentropy": 2.3429198265075684, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30013199150562286, "step": 1482 }, { "epoch": 0.09275, "grad_norm": 3.3125, "grad_norm_var": 0.1684722900390625, "learning_rate": 0.0001, "loss": 8.8029, "loss/crossentropy": 2.6235066652297974, "loss/hidden": 3.6015625, "loss/jsd": 0.0, "loss/logits": 0.32275477051734924, "step": 1484 }, { "epoch": 0.092875, "grad_norm": 3.984375, "grad_norm_var": 0.1758453369140625, "learning_rate": 0.0001, "loss": 9.082, "loss/crossentropy": 2.550568461418152, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.316007137298584, "step": 1486 }, { "epoch": 0.093, "grad_norm": 3.546875, "grad_norm_var": 0.16510009765625, "learning_rate": 0.0001, "loss": 8.8733, "loss/crossentropy": 2.4568541049957275, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3310081660747528, "step": 1488 }, { "epoch": 0.093125, "grad_norm": 3.578125, "grad_norm_var": 0.1637115478515625, "learning_rate": 0.0001, "loss": 8.8808, "loss/crossentropy": 2.5027180910110474, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.32694798707962036, "step": 1490 }, { "epoch": 0.09325, "grad_norm": 3.53125, "grad_norm_var": 0.16510009765625, "learning_rate": 0.0001, "loss": 8.7269, "loss/crossentropy": 2.1558250188827515, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3121803104877472, "step": 1492 }, { "epoch": 0.093375, "grad_norm": 4.53125, "grad_norm_var": 0.21354166666666666, "learning_rate": 0.0001, "loss": 8.996, "loss/crossentropy": 2.4038426876068115, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.31739044189453125, "step": 1494 }, { "epoch": 0.0935, "grad_norm": 3.15625, "grad_norm_var": 0.23286844889322916, "learning_rate": 0.0001, "loss": 8.8094, "loss/crossentropy": 2.419437289237976, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.3122180104255676, "step": 1496 }, { "epoch": 0.093625, "grad_norm": 3.3125, "grad_norm_var": 0.115283203125, "learning_rate": 0.0001, "loss": 8.7015, "loss/crossentropy": 2.3764225244522095, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.32906584441661835, "step": 1498 }, { "epoch": 0.09375, "grad_norm": 3.765625, "grad_norm_var": 0.10917867024739583, "learning_rate": 0.0001, "loss": 9.0724, "loss/crossentropy": 2.421535849571228, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2912449240684509, "step": 1500 }, { "epoch": 0.093875, "grad_norm": 3.375, "grad_norm_var": 0.09527587890625, "learning_rate": 0.0001, "loss": 8.7729, "loss/crossentropy": 2.1934632062911987, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.31051623821258545, "step": 1502 }, { "epoch": 0.094, "grad_norm": 3.421875, "grad_norm_var": 0.09427083333333333, "learning_rate": 0.0001, "loss": 8.7316, "loss/crossentropy": 2.3402860164642334, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.290997713804245, "step": 1504 }, { "epoch": 0.094125, "grad_norm": 3.6875, "grad_norm_var": 0.09648335774739583, "learning_rate": 0.0001, "loss": 8.9565, "loss/crossentropy": 2.440617799758911, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.29472315311431885, "step": 1506 }, { "epoch": 0.09425, "grad_norm": 4.46875, "grad_norm_var": 0.15945638020833333, "learning_rate": 0.0001, "loss": 9.1673, "loss/crossentropy": 2.4849953651428223, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.3361028879880905, "step": 1508 }, { "epoch": 0.094375, "grad_norm": 3.59375, "grad_norm_var": 0.10628255208333333, "learning_rate": 0.0001, "loss": 8.907, "loss/crossentropy": 2.424979090690613, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2993798553943634, "step": 1510 }, { "epoch": 0.0945, "grad_norm": 3.5, "grad_norm_var": 0.09318033854166667, "learning_rate": 0.0001, "loss": 9.0442, "loss/crossentropy": 2.418489933013916, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.32049693167209625, "step": 1512 }, { "epoch": 0.094625, "grad_norm": 3.53125, "grad_norm_var": 0.08361002604166666, "learning_rate": 0.0001, "loss": 8.9006, "loss/crossentropy": 2.2703150510787964, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.28213508427143097, "step": 1514 }, { "epoch": 0.09475, "grad_norm": 5.28125, "grad_norm_var": 0.2518717447916667, "learning_rate": 0.0001, "loss": 9.2422, "loss/crossentropy": 2.528243660926819, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3030099719762802, "step": 1516 }, { "epoch": 0.094875, "grad_norm": 3.6875, "grad_norm_var": 0.23137613932291667, "learning_rate": 0.0001, "loss": 9.0565, "loss/crossentropy": 2.2802836894989014, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.30337056517601013, "step": 1518 }, { "epoch": 0.095, "grad_norm": 3.40625, "grad_norm_var": 0.22304585774739583, "learning_rate": 0.0001, "loss": 9.0657, "loss/crossentropy": 2.3259323835372925, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.29459048807621, "step": 1520 }, { "epoch": 0.095125, "grad_norm": 3.96875, "grad_norm_var": 0.22170817057291667, "learning_rate": 0.0001, "loss": 8.9044, "loss/crossentropy": 2.4524760246276855, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2917432487010956, "step": 1522 }, { "epoch": 0.09525, "grad_norm": 3.703125, "grad_norm_var": 0.20701395670572917, "learning_rate": 0.0001, "loss": 8.6662, "loss/crossentropy": 2.002210795879364, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.28943265974521637, "step": 1524 }, { "epoch": 0.095375, "grad_norm": 4.03125, "grad_norm_var": 0.21230061848958334, "learning_rate": 0.0001, "loss": 9.0207, "loss/crossentropy": 2.6163631677627563, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.30554938316345215, "step": 1526 }, { "epoch": 0.0955, "grad_norm": 3.5, "grad_norm_var": 0.21051025390625, "learning_rate": 0.0001, "loss": 9.0829, "loss/crossentropy": 2.327064037322998, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3019147366285324, "step": 1528 }, { "epoch": 0.095625, "grad_norm": 3.84375, "grad_norm_var": 0.21812744140625, "learning_rate": 0.0001, "loss": 8.9037, "loss/crossentropy": 2.4404029846191406, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.3117605447769165, "step": 1530 }, { "epoch": 0.09575, "grad_norm": 3.453125, "grad_norm_var": 0.06982421875, "learning_rate": 0.0001, "loss": 8.6937, "loss/crossentropy": 2.1841256618499756, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2961889058351517, "step": 1532 }, { "epoch": 0.095875, "grad_norm": 4.1875, "grad_norm_var": 0.08876546223958333, "learning_rate": 0.0001, "loss": 8.9432, "loss/crossentropy": 2.5015478134155273, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31438133120536804, "step": 1534 }, { "epoch": 0.096, "grad_norm": 3.515625, "grad_norm_var": 0.08181050618489584, "learning_rate": 0.0001, "loss": 9.0777, "loss/crossentropy": 2.6356844902038574, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.3276464343070984, "step": 1536 }, { "epoch": 0.096125, "grad_norm": 3.25, "grad_norm_var": 0.09553629557291667, "learning_rate": 0.0001, "loss": 8.8278, "loss/crossentropy": 2.6288344860076904, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3106349855661392, "step": 1538 }, { "epoch": 0.09625, "grad_norm": 3.65625, "grad_norm_var": 0.08912760416666667, "learning_rate": 0.0001, "loss": 8.8824, "loss/crossentropy": 2.3544297218322754, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29945333302021027, "step": 1540 }, { "epoch": 0.096375, "grad_norm": 3.671875, "grad_norm_var": 0.0782623291015625, "learning_rate": 0.0001, "loss": 8.8214, "loss/crossentropy": 2.525648593902588, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3259720504283905, "step": 1542 }, { "epoch": 0.0965, "grad_norm": 3.703125, "grad_norm_var": 0.0704498291015625, "learning_rate": 0.0001, "loss": 9.0293, "loss/crossentropy": 2.557410717010498, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.3404108136892319, "step": 1544 }, { "epoch": 0.096625, "grad_norm": 3.578125, "grad_norm_var": 0.06096598307291667, "learning_rate": 0.0001, "loss": 8.9852, "loss/crossentropy": 2.4181408882141113, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.3040696382522583, "step": 1546 }, { "epoch": 0.09675, "grad_norm": 3.921875, "grad_norm_var": 0.06634114583333334, "learning_rate": 0.0001, "loss": 8.6302, "loss/crossentropy": 2.328543782234192, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3038486987352371, "step": 1548 }, { "epoch": 0.096875, "grad_norm": 3.75, "grad_norm_var": 0.0479400634765625, "learning_rate": 0.0001, "loss": 8.8259, "loss/crossentropy": 2.2513211965560913, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3101038932800293, "step": 1550 }, { "epoch": 0.097, "grad_norm": 3.609375, "grad_norm_var": 0.0394195556640625, "learning_rate": 0.0001, "loss": 8.7993, "loss/crossentropy": 2.2204794883728027, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.28817546367645264, "step": 1552 }, { "epoch": 0.097125, "grad_norm": 3.4375, "grad_norm_var": 0.0351715087890625, "learning_rate": 0.0001, "loss": 8.9577, "loss/crossentropy": 2.306167244911194, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.31589527428150177, "step": 1554 }, { "epoch": 0.09725, "grad_norm": 3.4375, "grad_norm_var": 0.026102701822916668, "learning_rate": 0.0001, "loss": 8.7825, "loss/crossentropy": 2.485254645347595, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3053087592124939, "step": 1556 }, { "epoch": 0.097375, "grad_norm": 3.484375, "grad_norm_var": 0.037230428059895834, "learning_rate": 0.0001, "loss": 9.0267, "loss/crossentropy": 2.21419095993042, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3267703801393509, "step": 1558 }, { "epoch": 0.0975, "grad_norm": 3.625, "grad_norm_var": 0.036375935872395834, "learning_rate": 0.0001, "loss": 8.9805, "loss/crossentropy": 2.8129303455352783, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.31060926616191864, "step": 1560 }, { "epoch": 0.097625, "grad_norm": 3.390625, "grad_norm_var": 0.03967997233072917, "learning_rate": 0.0001, "loss": 8.8682, "loss/crossentropy": 2.3201346397399902, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3333878219127655, "step": 1562 }, { "epoch": 0.09775, "grad_norm": 3.71875, "grad_norm_var": 0.06048177083333333, "learning_rate": 0.0001, "loss": 8.6913, "loss/crossentropy": 2.4905728101730347, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.3240518271923065, "step": 1564 }, { "epoch": 0.097875, "grad_norm": 3.40625, "grad_norm_var": 0.05533447265625, "learning_rate": 0.0001, "loss": 8.625, "loss/crossentropy": 2.0138303637504578, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2922722101211548, "step": 1566 }, { "epoch": 0.098, "grad_norm": 3.3125, "grad_norm_var": 0.05804036458333333, "learning_rate": 0.0001, "loss": 8.6031, "loss/crossentropy": 2.336915612220764, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.3007281422615051, "step": 1568 }, { "epoch": 0.098125, "grad_norm": 3.296875, "grad_norm_var": 0.06008199055989583, "learning_rate": 0.0001, "loss": 8.7958, "loss/crossentropy": 2.304540991783142, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.27709995210170746, "step": 1570 }, { "epoch": 0.09825, "grad_norm": 3.28125, "grad_norm_var": 0.060887654622395836, "learning_rate": 0.0001, "loss": 8.4869, "loss/crossentropy": 2.291286587715149, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29013076424598694, "step": 1572 }, { "epoch": 0.098375, "grad_norm": 3.140625, "grad_norm_var": 0.044798787434895834, "learning_rate": 0.0001, "loss": 8.4662, "loss/crossentropy": 2.4036799669265747, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.290183424949646, "step": 1574 }, { "epoch": 0.0985, "grad_norm": 3.453125, "grad_norm_var": 0.0473052978515625, "learning_rate": 0.0001, "loss": 9.0173, "loss/crossentropy": 2.3124269247055054, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.3117297291755676, "step": 1576 }, { "epoch": 0.098625, "grad_norm": 3.453125, "grad_norm_var": 0.0395416259765625, "learning_rate": 0.0001, "loss": 8.6615, "loss/crossentropy": 2.0177338123321533, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27028243243694305, "step": 1578 }, { "epoch": 0.09875, "grad_norm": 3.46875, "grad_norm_var": 0.018159993489583335, "learning_rate": 0.0001, "loss": 8.9067, "loss/crossentropy": 2.3349932432174683, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.31509193778038025, "step": 1580 }, { "epoch": 0.098875, "grad_norm": 3.515625, "grad_norm_var": 0.0189849853515625, "learning_rate": 0.0001, "loss": 8.7176, "loss/crossentropy": 2.599055767059326, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.3019975572824478, "step": 1582 }, { "epoch": 0.099, "grad_norm": 3.515625, "grad_norm_var": 0.026537068684895835, "learning_rate": 0.0001, "loss": 8.8398, "loss/crossentropy": 2.443954348564148, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29715240001678467, "step": 1584 }, { "epoch": 0.099125, "grad_norm": 3.484375, "grad_norm_var": 0.026537068684895835, "learning_rate": 0.0001, "loss": 8.8296, "loss/crossentropy": 2.5043543577194214, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28090426325798035, "step": 1586 }, { "epoch": 0.09925, "grad_norm": 3.0, "grad_norm_var": 0.03758036295572917, "learning_rate": 0.0001, "loss": 8.5364, "loss/crossentropy": 2.267952561378479, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.30116575956344604, "step": 1588 }, { "epoch": 0.099375, "grad_norm": 3.484375, "grad_norm_var": 0.0370025634765625, "learning_rate": 0.0001, "loss": 8.5028, "loss/crossentropy": 2.0941153168678284, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2714923173189163, "step": 1590 }, { "epoch": 0.0995, "grad_norm": 3.484375, "grad_norm_var": 0.031061808268229168, "learning_rate": 0.0001, "loss": 8.9053, "loss/crossentropy": 2.422881245613098, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.31440040469169617, "step": 1592 }, { "epoch": 0.099625, "grad_norm": 3.265625, "grad_norm_var": 0.032404581705729164, "learning_rate": 0.0001, "loss": 8.6802, "loss/crossentropy": 2.4505289793014526, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2994416207075119, "step": 1594 }, { "epoch": 0.09975, "grad_norm": 3.578125, "grad_norm_var": 0.03308817545572917, "learning_rate": 0.0001, "loss": 8.5845, "loss/crossentropy": 2.3204309940338135, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.290659636259079, "step": 1596 }, { "epoch": 0.099875, "grad_norm": 3.5, "grad_norm_var": 0.045751953125, "learning_rate": 0.0001, "loss": 8.8864, "loss/crossentropy": 2.14364230632782, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.302025705575943, "step": 1598 }, { "epoch": 0.1, "grad_norm": 4.53125, "grad_norm_var": 0.11702067057291667, "learning_rate": 0.0001, "loss": 8.9408, "loss/crossentropy": 2.3856927156448364, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2939811646938324, "step": 1600 }, { "epoch": 0.100125, "grad_norm": 4.65625, "grad_norm_var": 0.19981180826822917, "learning_rate": 0.0001, "loss": 8.9318, "loss/crossentropy": 2.393304467201233, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31673501431941986, "step": 1602 }, { "epoch": 0.10025, "grad_norm": 3.609375, "grad_norm_var": 0.17449442545572916, "learning_rate": 0.0001, "loss": 8.8995, "loss/crossentropy": 2.265239119529724, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2922002673149109, "step": 1604 }, { "epoch": 0.100375, "grad_norm": 3.4375, "grad_norm_var": 0.15900065104166666, "learning_rate": 0.0001, "loss": 8.928, "loss/crossentropy": 2.490726351737976, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.320227712392807, "step": 1606 }, { "epoch": 0.1005, "grad_norm": 3.921875, "grad_norm_var": 0.16309305826822917, "learning_rate": 0.0001, "loss": 9.0626, "loss/crossentropy": 2.3851197957992554, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.33952587842941284, "step": 1608 }, { "epoch": 0.100625, "grad_norm": 3.6875, "grad_norm_var": 0.1425445556640625, "learning_rate": 0.0001, "loss": 8.5284, "loss/crossentropy": 2.4831987619400024, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.28591710329055786, "step": 1610 }, { "epoch": 0.10075, "grad_norm": 3.21875, "grad_norm_var": 0.15349833170572916, "learning_rate": 0.0001, "loss": 8.5034, "loss/crossentropy": 2.3684011697769165, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3100173771381378, "step": 1612 }, { "epoch": 0.100875, "grad_norm": 3.703125, "grad_norm_var": 0.15131734212239584, "learning_rate": 0.0001, "loss": 8.6768, "loss/crossentropy": 2.2726497650146484, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2776256799697876, "step": 1614 }, { "epoch": 0.101, "grad_norm": 3.796875, "grad_norm_var": 0.11056315104166667, "learning_rate": 0.0001, "loss": 8.7838, "loss/crossentropy": 2.2839853763580322, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28534410893917084, "step": 1616 }, { "epoch": 0.101125, "grad_norm": 3.171875, "grad_norm_var": 0.05589090983072917, "learning_rate": 0.0001, "loss": 8.768, "loss/crossentropy": 2.6459089517593384, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.31302425265312195, "step": 1618 }, { "epoch": 0.10125, "grad_norm": 3.625, "grad_norm_var": 0.05469462076822917, "learning_rate": 0.0001, "loss": 8.8765, "loss/crossentropy": 2.3666889667510986, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.31573787331581116, "step": 1620 }, { "epoch": 0.101375, "grad_norm": 3.65625, "grad_norm_var": 0.05213114420572917, "learning_rate": 0.0001, "loss": 8.7946, "loss/crossentropy": 2.294891834259033, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.29489198327064514, "step": 1622 }, { "epoch": 0.1015, "grad_norm": 3.359375, "grad_norm_var": 0.050927734375, "learning_rate": 0.0001, "loss": 8.7622, "loss/crossentropy": 2.186626434326172, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.3071312755346298, "step": 1624 }, { "epoch": 0.101625, "grad_norm": 3.0625, "grad_norm_var": 0.06525065104166666, "learning_rate": 0.0001, "loss": 8.8552, "loss/crossentropy": 2.3912779092788696, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30470481514930725, "step": 1626 }, { "epoch": 0.10175, "grad_norm": 3.375, "grad_norm_var": 0.06083984375, "learning_rate": 0.0001, "loss": 8.7036, "loss/crossentropy": 2.236059784889221, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.29887452721595764, "step": 1628 }, { "epoch": 0.101875, "grad_norm": 3.390625, "grad_norm_var": 0.04715169270833333, "learning_rate": 0.0001, "loss": 8.7288, "loss/crossentropy": 2.213658332824707, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2943817526102066, "step": 1630 }, { "epoch": 0.102, "grad_norm": 3.703125, "grad_norm_var": 0.05207417805989583, "learning_rate": 0.0001, "loss": 8.8536, "loss/crossentropy": 2.0792142748832703, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.3170311152935028, "step": 1632 }, { "epoch": 0.102125, "grad_norm": 3.3125, "grad_norm_var": 0.05028889973958333, "learning_rate": 0.0001, "loss": 8.6659, "loss/crossentropy": 2.2078484296798706, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.3040362149477005, "step": 1634 }, { "epoch": 0.10225, "grad_norm": 3.453125, "grad_norm_var": 0.0458984375, "learning_rate": 0.0001, "loss": 8.7704, "loss/crossentropy": 2.5149965286254883, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2992193400859833, "step": 1636 }, { "epoch": 0.102375, "grad_norm": 3.625, "grad_norm_var": 0.05103759765625, "learning_rate": 0.0001, "loss": 9.0054, "loss/crossentropy": 2.3132543563842773, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28851835429668427, "step": 1638 }, { "epoch": 0.1025, "grad_norm": 3.359375, "grad_norm_var": 0.049702962239583336, "learning_rate": 0.0001, "loss": 8.6654, "loss/crossentropy": 2.3990262746810913, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.286620169878006, "step": 1640 }, { "epoch": 0.102625, "grad_norm": 3.390625, "grad_norm_var": 0.03817952473958333, "learning_rate": 0.0001, "loss": 8.8415, "loss/crossentropy": 2.5137301683425903, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3091888725757599, "step": 1642 }, { "epoch": 0.10275, "grad_norm": 3.40625, "grad_norm_var": 0.0311676025390625, "learning_rate": 0.0001, "loss": 8.641, "loss/crossentropy": 2.312187671661377, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.31714877486228943, "step": 1644 }, { "epoch": 0.102875, "grad_norm": 3.546875, "grad_norm_var": 0.0481597900390625, "learning_rate": 0.0001, "loss": 8.8978, "loss/crossentropy": 2.3664920330047607, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.31635183095932007, "step": 1646 }, { "epoch": 0.103, "grad_norm": 3.359375, "grad_norm_var": 0.04122721354166667, "learning_rate": 0.0001, "loss": 8.6667, "loss/crossentropy": 2.5847058296203613, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2953044921159744, "step": 1648 }, { "epoch": 0.103125, "grad_norm": 3.953125, "grad_norm_var": 0.08193359375, "learning_rate": 0.0001, "loss": 8.9228, "loss/crossentropy": 2.601444363594055, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3334079086780548, "step": 1650 }, { "epoch": 0.10325, "grad_norm": 3.09375, "grad_norm_var": 0.09729715983072916, "learning_rate": 0.0001, "loss": 8.6107, "loss/crossentropy": 2.3851035833358765, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29805073142051697, "step": 1652 }, { "epoch": 0.103375, "grad_norm": 3.390625, "grad_norm_var": 0.10099995930989583, "learning_rate": 0.0001, "loss": 8.7792, "loss/crossentropy": 2.383594036102295, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3409101665019989, "step": 1654 }, { "epoch": 0.1035, "grad_norm": 3.859375, "grad_norm_var": 0.1023834228515625, "learning_rate": 0.0001, "loss": 8.8836, "loss/crossentropy": 2.386256456375122, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.31013236939907074, "step": 1656 }, { "epoch": 0.103625, "grad_norm": 3.609375, "grad_norm_var": 0.09819234212239583, "learning_rate": 0.0001, "loss": 8.7058, "loss/crossentropy": 2.4861687421798706, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2857501953840256, "step": 1658 }, { "epoch": 0.10375, "grad_norm": 3.265625, "grad_norm_var": 0.10321858723958334, "learning_rate": 0.0001, "loss": 8.7162, "loss/crossentropy": 2.165019392967224, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2871998995542526, "step": 1660 }, { "epoch": 0.103875, "grad_norm": 3.1875, "grad_norm_var": 0.10045572916666666, "learning_rate": 0.0001, "loss": 8.6825, "loss/crossentropy": 2.3428409099578857, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.30583275854587555, "step": 1662 }, { "epoch": 0.104, "grad_norm": 3.296875, "grad_norm_var": 0.1024810791015625, "learning_rate": 0.0001, "loss": 8.5464, "loss/crossentropy": 2.1279059648513794, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28338921070098877, "step": 1664 }, { "epoch": 0.104125, "grad_norm": 3.296875, "grad_norm_var": 0.05191650390625, "learning_rate": 0.0001, "loss": 8.6361, "loss/crossentropy": 2.109134554862976, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.28193847835063934, "step": 1666 }, { "epoch": 0.10425, "grad_norm": 3.296875, "grad_norm_var": 0.04047749837239583, "learning_rate": 0.0001, "loss": 8.601, "loss/crossentropy": 2.1180429458618164, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2961196005344391, "step": 1668 }, { "epoch": 0.104375, "grad_norm": 3.421875, "grad_norm_var": 0.0576080322265625, "learning_rate": 0.0001, "loss": 8.6396, "loss/crossentropy": 2.313425898551941, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.28072628378868103, "step": 1670 }, { "epoch": 0.1045, "grad_norm": 3.390625, "grad_norm_var": 0.04879150390625, "learning_rate": 0.0001, "loss": 8.8899, "loss/crossentropy": 2.3383524417877197, "loss/hidden": 3.578125, "loss/jsd": 0.0, "loss/logits": 0.3101891279220581, "step": 1672 }, { "epoch": 0.104625, "grad_norm": 3.6875, "grad_norm_var": 0.05182291666666667, "learning_rate": 0.0001, "loss": 8.6569, "loss/crossentropy": 2.2282902002334595, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.27700501680374146, "step": 1674 }, { "epoch": 0.10475, "grad_norm": 3.546875, "grad_norm_var": 0.05043843587239583, "learning_rate": 0.0001, "loss": 8.8456, "loss/crossentropy": 2.1871705651283264, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.32123811542987823, "step": 1676 }, { "epoch": 0.104875, "grad_norm": 3.515625, "grad_norm_var": 0.04081929524739583, "learning_rate": 0.0001, "loss": 8.7159, "loss/crossentropy": 2.4735978841781616, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.3010915666818619, "step": 1678 }, { "epoch": 0.105, "grad_norm": 3.3125, "grad_norm_var": 0.04506734212239583, "learning_rate": 0.0001, "loss": 8.7137, "loss/crossentropy": 2.4544039964675903, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2948637306690216, "step": 1680 }, { "epoch": 0.105125, "grad_norm": 3.265625, "grad_norm_var": 0.044661458333333334, "learning_rate": 0.0001, "loss": 8.8799, "loss/crossentropy": 2.5841368436813354, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.3055508881807327, "step": 1682 }, { "epoch": 0.10525, "grad_norm": 3.5625, "grad_norm_var": 0.048859659830729166, "learning_rate": 0.0001, "loss": 8.6805, "loss/crossentropy": 2.195699095726013, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.3213060796260834, "step": 1684 }, { "epoch": 0.105375, "grad_norm": 3.953125, "grad_norm_var": 0.048193359375, "learning_rate": 0.0001, "loss": 8.6492, "loss/crossentropy": 2.3286162614822388, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.30889779329299927, "step": 1686 }, { "epoch": 0.1055, "grad_norm": 3.546875, "grad_norm_var": 0.046956380208333336, "learning_rate": 0.0001, "loss": 8.7456, "loss/crossentropy": 2.3685059547424316, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.3166068494319916, "step": 1688 }, { "epoch": 0.105625, "grad_norm": 3.3125, "grad_norm_var": 0.046305338541666664, "learning_rate": 0.0001, "loss": 8.5339, "loss/crossentropy": 2.444363236427307, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.30366671085357666, "step": 1690 }, { "epoch": 0.10575, "grad_norm": 3.4375, "grad_norm_var": 0.042601521809895834, "learning_rate": 0.0001, "loss": 8.7264, "loss/crossentropy": 2.3560056686401367, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.33288004994392395, "step": 1692 }, { "epoch": 0.105875, "grad_norm": 3.28125, "grad_norm_var": 0.044554646809895834, "learning_rate": 0.0001, "loss": 8.5372, "loss/crossentropy": 2.38726007938385, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28503432869911194, "step": 1694 }, { "epoch": 0.106, "grad_norm": 3.3125, "grad_norm_var": 0.040648396809895834, "learning_rate": 0.0001, "loss": 8.5069, "loss/crossentropy": 2.2538931369781494, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2696816474199295, "step": 1696 }, { "epoch": 0.106125, "grad_norm": 3.390625, "grad_norm_var": 0.04129231770833333, "learning_rate": 0.0001, "loss": 8.4485, "loss/crossentropy": 2.3338488340377808, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2753836512565613, "step": 1698 }, { "epoch": 0.10625, "grad_norm": 3.5, "grad_norm_var": 0.03697509765625, "learning_rate": 0.0001, "loss": 8.6194, "loss/crossentropy": 2.346317410469055, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2954401522874832, "step": 1700 }, { "epoch": 0.106375, "grad_norm": 3.265625, "grad_norm_var": 0.020914713541666668, "learning_rate": 0.0001, "loss": 8.684, "loss/crossentropy": 2.4051437377929688, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.30432261526584625, "step": 1702 }, { "epoch": 0.1065, "grad_norm": 3.203125, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 8.7033, "loss/crossentropy": 2.4638129472732544, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.27916355431079865, "step": 1704 }, { "epoch": 0.106625, "grad_norm": 3.296875, "grad_norm_var": 0.0199127197265625, "learning_rate": 0.0001, "loss": 8.4718, "loss/crossentropy": 2.2523876428604126, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2789776176214218, "step": 1706 }, { "epoch": 0.10675, "grad_norm": 3.34375, "grad_norm_var": 0.024095662434895835, "learning_rate": 0.0001, "loss": 9.0841, "loss/crossentropy": 2.576385736465454, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3490666449069977, "step": 1708 }, { "epoch": 0.106875, "grad_norm": 3.890625, "grad_norm_var": 0.041520182291666666, "learning_rate": 0.0001, "loss": 9.0395, "loss/crossentropy": 2.375891089439392, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.36591947078704834, "step": 1710 }, { "epoch": 0.107, "grad_norm": 3.25, "grad_norm_var": 0.04540608723958333, "learning_rate": 0.0001, "loss": 8.8532, "loss/crossentropy": 2.3545055389404297, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3123166114091873, "step": 1712 }, { "epoch": 0.107125, "grad_norm": 3.40625, "grad_norm_var": 0.04453837076822917, "learning_rate": 0.0001, "loss": 8.5417, "loss/crossentropy": 2.412468910217285, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29608577489852905, "step": 1714 }, { "epoch": 0.10725, "grad_norm": 3.640625, "grad_norm_var": 0.04431864420572917, "learning_rate": 0.0001, "loss": 8.7605, "loss/crossentropy": 2.3699898719787598, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2785623371601105, "step": 1716 }, { "epoch": 0.107375, "grad_norm": 3.765625, "grad_norm_var": 0.04664306640625, "learning_rate": 0.0001, "loss": 9.0555, "loss/crossentropy": 2.4347622394561768, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.32773303985595703, "step": 1718 }, { "epoch": 0.1075, "grad_norm": 3.765625, "grad_norm_var": 0.2630208333333333, "learning_rate": 0.0001, "loss": 8.6447, "loss/crossentropy": 2.3668179512023926, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.31063494086265564, "step": 1720 }, { "epoch": 0.107625, "grad_norm": 3.59375, "grad_norm_var": 0.24821675618489583, "learning_rate": 0.0001, "loss": 8.963, "loss/crossentropy": 2.6537840366363525, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.31972379982471466, "step": 1722 }, { "epoch": 0.10775, "grad_norm": 3.125, "grad_norm_var": 0.26516520182291664, "learning_rate": 0.0001, "loss": 8.2949, "loss/crossentropy": 2.07137930393219, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.27931931614875793, "step": 1724 }, { "epoch": 0.107875, "grad_norm": 3.578125, "grad_norm_var": 0.26011962890625, "learning_rate": 0.0001, "loss": 8.4489, "loss/crossentropy": 2.272459626197815, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.28474678099155426, "step": 1726 }, { "epoch": 0.108, "grad_norm": 3.421875, "grad_norm_var": 0.26783447265625, "learning_rate": 0.0001, "loss": 8.6669, "loss/crossentropy": 2.259164571762085, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2813860774040222, "step": 1728 }, { "epoch": 0.108125, "grad_norm": 3.375, "grad_norm_var": 0.26335347493489586, "learning_rate": 0.0001, "loss": 8.7254, "loss/crossentropy": 2.3241848945617676, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2932712584733963, "step": 1730 }, { "epoch": 0.10825, "grad_norm": 3.46875, "grad_norm_var": 0.26031494140625, "learning_rate": 0.0001, "loss": 8.6529, "loss/crossentropy": 2.1817615032196045, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.28635722398757935, "step": 1732 }, { "epoch": 0.108375, "grad_norm": 3.75, "grad_norm_var": 0.2612864176432292, "learning_rate": 0.0001, "loss": 8.9668, "loss/crossentropy": 2.2815951108932495, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.32627154886722565, "step": 1734 }, { "epoch": 0.1085, "grad_norm": 3.65625, "grad_norm_var": 0.032835896809895834, "learning_rate": 0.0001, "loss": 8.9081, "loss/crossentropy": 2.4856772422790527, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.3120778799057007, "step": 1736 }, { "epoch": 0.108625, "grad_norm": 3.125, "grad_norm_var": 0.039061482747395834, "learning_rate": 0.0001, "loss": 8.6702, "loss/crossentropy": 1.900740385055542, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2662765383720398, "step": 1738 }, { "epoch": 0.10875, "grad_norm": 3.515625, "grad_norm_var": 0.031966145833333334, "learning_rate": 0.0001, "loss": 8.5816, "loss/crossentropy": 2.467803955078125, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2834676057100296, "step": 1740 }, { "epoch": 0.108875, "grad_norm": 3.4375, "grad_norm_var": 0.03134358723958333, "learning_rate": 0.0001, "loss": 8.9114, "loss/crossentropy": 2.294146180152893, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.27088499069213867, "step": 1742 }, { "epoch": 0.109, "grad_norm": 3.5, "grad_norm_var": 0.026292928059895835, "learning_rate": 0.0001, "loss": 8.8052, "loss/crossentropy": 2.2657724618911743, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3134308159351349, "step": 1744 }, { "epoch": 0.109125, "grad_norm": 3.78125, "grad_norm_var": 0.03062744140625, "learning_rate": 0.0001, "loss": 8.694, "loss/crossentropy": 2.2271536588668823, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2863340824842453, "step": 1746 }, { "epoch": 0.10925, "grad_norm": 3.359375, "grad_norm_var": 0.030562337239583334, "learning_rate": 0.0001, "loss": 8.711, "loss/crossentropy": 2.3169878721237183, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30684567987918854, "step": 1748 }, { "epoch": 0.109375, "grad_norm": 3.328125, "grad_norm_var": 0.08248697916666667, "learning_rate": 0.0001, "loss": 8.7637, "loss/crossentropy": 2.4109179973602295, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3122059851884842, "step": 1750 }, { "epoch": 0.1095, "grad_norm": 3.5, "grad_norm_var": 0.08864644368489584, "learning_rate": 0.0001, "loss": 8.9595, "loss/crossentropy": 2.1605933904647827, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3139963746070862, "step": 1752 }, { "epoch": 0.109625, "grad_norm": 3.296875, "grad_norm_var": 0.0794097900390625, "learning_rate": 0.0001, "loss": 8.7469, "loss/crossentropy": 2.427368640899658, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.3167697489261627, "step": 1754 }, { "epoch": 0.10975, "grad_norm": 3.359375, "grad_norm_var": 0.0843902587890625, "learning_rate": 0.0001, "loss": 8.6596, "loss/crossentropy": 2.3277207612991333, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.29165390133857727, "step": 1756 }, { "epoch": 0.109875, "grad_norm": 3.515625, "grad_norm_var": 0.07976786295572917, "learning_rate": 0.0001, "loss": 8.8038, "loss/crossentropy": 2.610532522201538, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29831643402576447, "step": 1758 }, { "epoch": 0.11, "grad_norm": 3.859375, "grad_norm_var": 0.0923828125, "learning_rate": 0.0001, "loss": 8.9237, "loss/crossentropy": 2.4981523752212524, "loss/hidden": 3.5546875, "loss/jsd": 0.0, "loss/logits": 0.32519689202308655, "step": 1760 }, { "epoch": 0.110125, "grad_norm": 3.453125, "grad_norm_var": 0.08820699055989584, "learning_rate": 0.0001, "loss": 8.5204, "loss/crossentropy": 2.21122944355011, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2947424352169037, "step": 1762 }, { "epoch": 0.11025, "grad_norm": 3.390625, "grad_norm_var": 0.09097391764322917, "learning_rate": 0.0001, "loss": 8.5618, "loss/crossentropy": 2.4394689798355103, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2784826308488846, "step": 1764 }, { "epoch": 0.110375, "grad_norm": 3.203125, "grad_norm_var": 0.0383941650390625, "learning_rate": 0.0001, "loss": 8.5953, "loss/crossentropy": 2.1292494535446167, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2804698646068573, "step": 1766 }, { "epoch": 0.1105, "grad_norm": 3.609375, "grad_norm_var": 0.02783203125, "learning_rate": 0.0001, "loss": 8.6388, "loss/crossentropy": 2.6741600036621094, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3054506927728653, "step": 1768 }, { "epoch": 0.110625, "grad_norm": 3.703125, "grad_norm_var": 0.031266276041666666, "learning_rate": 0.0001, "loss": 8.6962, "loss/crossentropy": 2.2202726006507874, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.26149123907089233, "step": 1770 }, { "epoch": 0.11075, "grad_norm": 3.703125, "grad_norm_var": 0.033349609375, "learning_rate": 0.0001, "loss": 8.7391, "loss/crossentropy": 2.2986572980880737, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.3005179762840271, "step": 1772 }, { "epoch": 0.110875, "grad_norm": 3.671875, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 8.814, "loss/crossentropy": 2.4717568159103394, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2965603917837143, "step": 1774 }, { "epoch": 0.111, "grad_norm": 3.34375, "grad_norm_var": 0.025699869791666666, "learning_rate": 0.0001, "loss": 8.6132, "loss/crossentropy": 2.4182674884796143, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29090292751789093, "step": 1776 }, { "epoch": 0.111125, "grad_norm": 3.484375, "grad_norm_var": 0.028229777018229166, "learning_rate": 0.0001, "loss": 8.7369, "loss/crossentropy": 2.270543932914734, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2845657169818878, "step": 1778 }, { "epoch": 0.11125, "grad_norm": 3.34375, "grad_norm_var": 0.02769775390625, "learning_rate": 0.0001, "loss": 8.6883, "loss/crossentropy": 2.4965745210647583, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.30929645895957947, "step": 1780 }, { "epoch": 0.111375, "grad_norm": 3.3125, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 8.2737, "loss/crossentropy": 1.9217499494552612, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.26199381053447723, "step": 1782 }, { "epoch": 0.1115, "grad_norm": 3.203125, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 8.9253, "loss/crossentropy": 2.4939075708389282, "loss/hidden": 3.59375, "loss/jsd": 0.0, "loss/logits": 0.3338964730501175, "step": 1784 }, { "epoch": 0.111625, "grad_norm": 3.171875, "grad_norm_var": 0.03540751139322917, "learning_rate": 0.0001, "loss": 8.4357, "loss/crossentropy": 2.3698354959487915, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28977689146995544, "step": 1786 }, { "epoch": 0.11175, "grad_norm": 4.03125, "grad_norm_var": 0.05469462076822917, "learning_rate": 0.0001, "loss": 8.7838, "loss/crossentropy": 2.4376271963119507, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2988607585430145, "step": 1788 }, { "epoch": 0.111875, "grad_norm": 3.59375, "grad_norm_var": 0.1191070556640625, "learning_rate": 0.0001, "loss": 8.9321, "loss/crossentropy": 2.229245901107788, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2898672968149185, "step": 1790 }, { "epoch": 0.112, "grad_norm": 3.546875, "grad_norm_var": 0.12310791015625, "learning_rate": 0.0001, "loss": 8.6884, "loss/crossentropy": 2.5585025548934937, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.30431005358695984, "step": 1792 }, { "epoch": 0.112125, "grad_norm": 3.15625, "grad_norm_var": 0.12705078125, "learning_rate": 0.0001, "loss": 8.7439, "loss/crossentropy": 2.237270951271057, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29220762848854065, "step": 1794 }, { "epoch": 0.11225, "grad_norm": 3.3125, "grad_norm_var": 0.12760009765625, "learning_rate": 0.0001, "loss": 8.5924, "loss/crossentropy": 2.4742521047592163, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.2950773537158966, "step": 1796 }, { "epoch": 0.112375, "grad_norm": 3.40625, "grad_norm_var": 0.12604878743489584, "learning_rate": 0.0001, "loss": 8.5864, "loss/crossentropy": 2.39210307598114, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.30888137221336365, "step": 1798 }, { "epoch": 0.1125, "grad_norm": 3.625, "grad_norm_var": 0.12141927083333333, "learning_rate": 0.0001, "loss": 8.7044, "loss/crossentropy": 2.3993630409240723, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2913102060556412, "step": 1800 }, { "epoch": 0.112625, "grad_norm": 3.484375, "grad_norm_var": 0.11155192057291667, "learning_rate": 0.0001, "loss": 8.8666, "loss/crossentropy": 2.682582974433899, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.31557750701904297, "step": 1802 }, { "epoch": 0.11275, "grad_norm": 3.328125, "grad_norm_var": 0.0958404541015625, "learning_rate": 0.0001, "loss": 8.4966, "loss/crossentropy": 2.0429012775421143, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26017338037490845, "step": 1804 }, { "epoch": 0.112875, "grad_norm": 3.390625, "grad_norm_var": 0.017455037434895834, "learning_rate": 0.0001, "loss": 8.6763, "loss/crossentropy": 2.3047099113464355, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.31745266914367676, "step": 1806 }, { "epoch": 0.113, "grad_norm": 3.3125, "grad_norm_var": 0.0132965087890625, "learning_rate": 0.0001, "loss": 8.4867, "loss/crossentropy": 2.1992413997650146, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.32566145062446594, "step": 1808 }, { "epoch": 0.113125, "grad_norm": 3.75, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 8.8092, "loss/crossentropy": 2.771738290786743, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.304366871714592, "step": 1810 }, { "epoch": 0.11325, "grad_norm": 3.28125, "grad_norm_var": 0.019245402018229166, "learning_rate": 0.0001, "loss": 8.6466, "loss/crossentropy": 2.1588589549064636, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2743106037378311, "step": 1812 }, { "epoch": 0.113375, "grad_norm": 3.53125, "grad_norm_var": 0.022021484375, "learning_rate": 0.0001, "loss": 8.7871, "loss/crossentropy": 2.143616557121277, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.33632121980190277, "step": 1814 }, { "epoch": 0.1135, "grad_norm": 3.421875, "grad_norm_var": 0.019059244791666666, "learning_rate": 0.0001, "loss": 8.8236, "loss/crossentropy": 2.4321892261505127, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3019208610057831, "step": 1816 }, { "epoch": 0.113625, "grad_norm": 3.15625, "grad_norm_var": 0.022150675455729168, "learning_rate": 0.0001, "loss": 8.6216, "loss/crossentropy": 2.4934128522872925, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30101920664310455, "step": 1818 }, { "epoch": 0.11375, "grad_norm": 3.53125, "grad_norm_var": 0.0219390869140625, "learning_rate": 0.0001, "loss": 8.7396, "loss/crossentropy": 2.309072256088257, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.27716949582099915, "step": 1820 }, { "epoch": 0.113875, "grad_norm": 3.15625, "grad_norm_var": 0.0274566650390625, "learning_rate": 0.0001, "loss": 8.7831, "loss/crossentropy": 2.37872850894928, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2861686944961548, "step": 1822 }, { "epoch": 0.114, "grad_norm": 3.234375, "grad_norm_var": 0.04157613118489583, "learning_rate": 0.0001, "loss": 8.673, "loss/crossentropy": 2.3230080604553223, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2735777944326401, "step": 1824 }, { "epoch": 0.114125, "grad_norm": 3.46875, "grad_norm_var": 0.0411773681640625, "learning_rate": 0.0001, "loss": 8.5909, "loss/crossentropy": 2.421627402305603, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3164493143558502, "step": 1826 }, { "epoch": 0.11425, "grad_norm": 3.53125, "grad_norm_var": 0.041304524739583334, "learning_rate": 0.0001, "loss": 8.4245, "loss/crossentropy": 2.378306269645691, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28885410726070404, "step": 1828 }, { "epoch": 0.114375, "grad_norm": 3.328125, "grad_norm_var": 0.04429931640625, "learning_rate": 0.0001, "loss": 8.586, "loss/crossentropy": 2.2422314882278442, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28581804037094116, "step": 1830 }, { "epoch": 0.1145, "grad_norm": 3.40625, "grad_norm_var": 0.04399312337239583, "learning_rate": 0.0001, "loss": 8.893, "loss/crossentropy": 2.5272161960601807, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.29680605232715607, "step": 1832 }, { "epoch": 0.114625, "grad_norm": 3.328125, "grad_norm_var": 0.03966471354166667, "learning_rate": 0.0001, "loss": 8.6886, "loss/crossentropy": 2.5110833644866943, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30536364018917084, "step": 1834 }, { "epoch": 0.11475, "grad_norm": 3.359375, "grad_norm_var": 0.039794921875, "learning_rate": 0.0001, "loss": 8.6081, "loss/crossentropy": 2.2756314277648926, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.329521119594574, "step": 1836 }, { "epoch": 0.114875, "grad_norm": 3.296875, "grad_norm_var": 0.03609619140625, "learning_rate": 0.0001, "loss": 8.7375, "loss/crossentropy": 2.3063724040985107, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28737129271030426, "step": 1838 }, { "epoch": 0.115, "grad_norm": 3.4375, "grad_norm_var": 0.018863932291666666, "learning_rate": 0.0001, "loss": 8.8197, "loss/crossentropy": 2.3004164695739746, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28277695178985596, "step": 1840 }, { "epoch": 0.115125, "grad_norm": 3.140625, "grad_norm_var": 0.023563639322916666, "learning_rate": 0.0001, "loss": 8.7529, "loss/crossentropy": 2.41781747341156, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2941035330295563, "step": 1842 }, { "epoch": 0.11525, "grad_norm": 3.234375, "grad_norm_var": 0.025593058268229166, "learning_rate": 0.0001, "loss": 8.659, "loss/crossentropy": 2.215361475944519, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2741749882698059, "step": 1844 }, { "epoch": 0.115375, "grad_norm": 3.234375, "grad_norm_var": 0.023053995768229165, "learning_rate": 0.0001, "loss": 8.5763, "loss/crossentropy": 2.263062834739685, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2967001497745514, "step": 1846 }, { "epoch": 0.1155, "grad_norm": 3.390625, "grad_norm_var": 0.023160807291666665, "learning_rate": 0.0001, "loss": 8.5134, "loss/crossentropy": 2.3583621978759766, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2846299409866333, "step": 1848 }, { "epoch": 0.115625, "grad_norm": 3.421875, "grad_norm_var": 0.022883097330729168, "learning_rate": 0.0001, "loss": 8.7018, "loss/crossentropy": 2.3313838243484497, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.27793991565704346, "step": 1850 }, { "epoch": 0.11575, "grad_norm": 3.140625, "grad_norm_var": 0.0206695556640625, "learning_rate": 0.0001, "loss": 8.6257, "loss/crossentropy": 2.5213606357574463, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2858508974313736, "step": 1852 }, { "epoch": 0.115875, "grad_norm": 5.125, "grad_norm_var": 0.22066650390625, "learning_rate": 0.0001, "loss": 8.5687, "loss/crossentropy": 2.138124704360962, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2705334722995758, "step": 1854 }, { "epoch": 0.116, "grad_norm": 3.5, "grad_norm_var": 0.22082926432291666, "learning_rate": 0.0001, "loss": 8.692, "loss/crossentropy": 2.190787434577942, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2934059798717499, "step": 1856 }, { "epoch": 0.116125, "grad_norm": 3.8125, "grad_norm_var": 0.21988016764322918, "learning_rate": 0.0001, "loss": 8.9455, "loss/crossentropy": 2.4185396432876587, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2919394373893738, "step": 1858 }, { "epoch": 0.11625, "grad_norm": 3.359375, "grad_norm_var": 0.2108306884765625, "learning_rate": 0.0001, "loss": 8.7029, "loss/crossentropy": 2.4572391510009766, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.28716185688972473, "step": 1860 }, { "epoch": 0.116375, "grad_norm": 3.203125, "grad_norm_var": 0.21071675618489583, "learning_rate": 0.0001, "loss": 8.5979, "loss/crossentropy": 2.280961036682129, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3058573454618454, "step": 1862 }, { "epoch": 0.1165, "grad_norm": 3.15625, "grad_norm_var": 0.22084859212239583, "learning_rate": 0.0001, "loss": 8.4776, "loss/crossentropy": 2.2182366847991943, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2703913003206253, "step": 1864 }, { "epoch": 0.116625, "grad_norm": 3.953125, "grad_norm_var": 0.23896382649739584, "learning_rate": 0.0001, "loss": 8.6861, "loss/crossentropy": 2.3092691898345947, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2828802317380905, "step": 1866 }, { "epoch": 0.11675, "grad_norm": 3.625, "grad_norm_var": 0.2319244384765625, "learning_rate": 0.0001, "loss": 8.7237, "loss/crossentropy": 2.465211868286133, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28346388041973114, "step": 1868 }, { "epoch": 0.116875, "grad_norm": 3.390625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 8.5473, "loss/crossentropy": 2.289981722831726, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27946069836616516, "step": 1870 }, { "epoch": 0.117, "grad_norm": 3.765625, "grad_norm_var": 0.0533599853515625, "learning_rate": 0.0001, "loss": 8.7206, "loss/crossentropy": 2.3901021480560303, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.27738629281520844, "step": 1872 }, { "epoch": 0.117125, "grad_norm": 3.71875, "grad_norm_var": 0.06018778483072917, "learning_rate": 0.0001, "loss": 8.8517, "loss/crossentropy": 2.143779933452606, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2891843765974045, "step": 1874 }, { "epoch": 0.11725, "grad_norm": 3.390625, "grad_norm_var": 0.0638092041015625, "learning_rate": 0.0001, "loss": 8.7417, "loss/crossentropy": 2.1701208353042603, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.32332536578178406, "step": 1876 }, { "epoch": 0.117375, "grad_norm": 4.0, "grad_norm_var": 0.07613932291666667, "learning_rate": 0.0001, "loss": 8.5934, "loss/crossentropy": 2.075779378414154, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.2908122092485428, "step": 1878 }, { "epoch": 0.1175, "grad_norm": 3.296875, "grad_norm_var": 0.06750895182291666, "learning_rate": 0.0001, "loss": 8.509, "loss/crossentropy": 2.402305006980896, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.30343984067440033, "step": 1880 }, { "epoch": 0.117625, "grad_norm": 3.109375, "grad_norm_var": 0.06417643229166667, "learning_rate": 0.0001, "loss": 8.712, "loss/crossentropy": 2.4394556283950806, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.30280593037605286, "step": 1882 }, { "epoch": 0.11775, "grad_norm": 3.046875, "grad_norm_var": 0.07830301920572917, "learning_rate": 0.0001, "loss": 8.3853, "loss/crossentropy": 2.2950029373168945, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.25343556702136993, "step": 1884 }, { "epoch": 0.117875, "grad_norm": 3.140625, "grad_norm_var": 0.08426005045572917, "learning_rate": 0.0001, "loss": 8.7969, "loss/crossentropy": 2.3692102432250977, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2964879274368286, "step": 1886 }, { "epoch": 0.118, "grad_norm": 3.484375, "grad_norm_var": 0.0810546875, "learning_rate": 0.0001, "loss": 8.4742, "loss/crossentropy": 2.475126624107361, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3013022840023041, "step": 1888 }, { "epoch": 0.118125, "grad_norm": 3.4375, "grad_norm_var": 0.0604156494140625, "learning_rate": 0.0001, "loss": 8.5953, "loss/crossentropy": 2.4823808670043945, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27870962023735046, "step": 1890 }, { "epoch": 0.11825, "grad_norm": 3.125, "grad_norm_var": 0.062474568684895836, "learning_rate": 0.0001, "loss": 8.3978, "loss/crossentropy": 2.3761686086654663, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3016834706068039, "step": 1892 }, { "epoch": 0.118375, "grad_norm": 3.28125, "grad_norm_var": 0.13007710774739584, "learning_rate": 0.0001, "loss": 8.7529, "loss/crossentropy": 2.1992992162704468, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3106433153152466, "step": 1894 }, { "epoch": 0.1185, "grad_norm": 3.890625, "grad_norm_var": 1.60845947265625, "learning_rate": 0.0001, "loss": 9.032, "loss/crossentropy": 2.3841880559921265, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.32507331669330597, "step": 1896 }, { "epoch": 0.118625, "grad_norm": 3.296875, "grad_norm_var": 1.5833943684895833, "learning_rate": 0.0001, "loss": 8.7286, "loss/crossentropy": 2.397653102874756, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.30125096440315247, "step": 1898 }, { "epoch": 0.11875, "grad_norm": 3.453125, "grad_norm_var": 1.5594309488932292, "learning_rate": 0.0001, "loss": 8.6239, "loss/crossentropy": 2.342803478240967, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2870272248983383, "step": 1900 }, { "epoch": 0.118875, "grad_norm": 3.375, "grad_norm_var": 1.5496815999348958, "learning_rate": 0.0001, "loss": 8.755, "loss/crossentropy": 2.4685518741607666, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.30882811546325684, "step": 1902 }, { "epoch": 0.119, "grad_norm": 3.65625, "grad_norm_var": 1.5260050455729166, "learning_rate": 0.0001, "loss": 8.7347, "loss/crossentropy": 2.0706852674484253, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.29542240500450134, "step": 1904 }, { "epoch": 0.119125, "grad_norm": 4.5, "grad_norm_var": 1.541087849934896, "learning_rate": 0.0001, "loss": 8.7724, "loss/crossentropy": 2.156776189804077, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2645677626132965, "step": 1906 }, { "epoch": 0.11925, "grad_norm": 3.4375, "grad_norm_var": 1.529638671875, "learning_rate": 0.0001, "loss": 8.5694, "loss/crossentropy": 2.1738698482513428, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.28860998153686523, "step": 1908 }, { "epoch": 0.119375, "grad_norm": 3.421875, "grad_norm_var": 1.4942779541015625, "learning_rate": 0.0001, "loss": 8.8526, "loss/crossentropy": 2.407730460166931, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.32247424125671387, "step": 1910 }, { "epoch": 0.1195, "grad_norm": 3.484375, "grad_norm_var": 0.09680074055989583, "learning_rate": 0.0001, "loss": 8.6226, "loss/crossentropy": 2.0718756914138794, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2795727103948593, "step": 1912 }, { "epoch": 0.119625, "grad_norm": 3.109375, "grad_norm_var": 0.0980377197265625, "learning_rate": 0.0001, "loss": 8.6584, "loss/crossentropy": 2.28408420085907, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.30408230423927307, "step": 1914 }, { "epoch": 0.11975, "grad_norm": 3.015625, "grad_norm_var": 0.1058990478515625, "learning_rate": 0.0001, "loss": 8.6341, "loss/crossentropy": 2.3821157217025757, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.30592113733291626, "step": 1916 }, { "epoch": 0.119875, "grad_norm": 3.484375, "grad_norm_var": 0.1066558837890625, "learning_rate": 0.0001, "loss": 8.5097, "loss/crossentropy": 2.4786245822906494, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29681436717510223, "step": 1918 }, { "epoch": 0.12, "grad_norm": 3.296875, "grad_norm_var": 0.105810546875, "learning_rate": 0.0001, "loss": 8.8635, "loss/crossentropy": 2.56216299533844, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.29444751143455505, "step": 1920 }, { "epoch": 0.120125, "grad_norm": 3.1875, "grad_norm_var": 0.025609334309895832, "learning_rate": 0.0001, "loss": 8.7351, "loss/crossentropy": 2.4210604429244995, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27522487938404083, "step": 1922 }, { "epoch": 0.12025, "grad_norm": 3.3125, "grad_norm_var": 0.024323527018229166, "learning_rate": 0.0001, "loss": 8.4784, "loss/crossentropy": 2.239919900894165, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28240087628364563, "step": 1924 }, { "epoch": 0.120375, "grad_norm": 3.53125, "grad_norm_var": 0.024608357747395834, "learning_rate": 0.0001, "loss": 8.756, "loss/crossentropy": 2.387734532356262, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.32171347737312317, "step": 1926 }, { "epoch": 0.1205, "grad_norm": 3.34375, "grad_norm_var": 0.026854451497395834, "learning_rate": 0.0001, "loss": 8.3875, "loss/crossentropy": 2.3138844966888428, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2761044204235077, "step": 1928 }, { "epoch": 0.120625, "grad_norm": 3.171875, "grad_norm_var": 0.024250284830729166, "learning_rate": 0.0001, "loss": 8.8666, "loss/crossentropy": 2.3094791173934937, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.308842197060585, "step": 1930 }, { "epoch": 0.12075, "grad_norm": 3.453125, "grad_norm_var": 0.028807576497395834, "learning_rate": 0.0001, "loss": 8.5039, "loss/crossentropy": 2.2231727838516235, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28627289831638336, "step": 1932 }, { "epoch": 0.120875, "grad_norm": 3.390625, "grad_norm_var": 0.0268218994140625, "learning_rate": 0.0001, "loss": 8.6522, "loss/crossentropy": 2.6999902725219727, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.287996843457222, "step": 1934 }, { "epoch": 0.121, "grad_norm": 3.328125, "grad_norm_var": 0.0267486572265625, "learning_rate": 0.0001, "loss": 8.4122, "loss/crossentropy": 2.383345127105713, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2961116135120392, "step": 1936 }, { "epoch": 0.121125, "grad_norm": 3.3125, "grad_norm_var": 0.0250152587890625, "learning_rate": 0.0001, "loss": 8.6214, "loss/crossentropy": 2.291685461997986, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27680595219135284, "step": 1938 }, { "epoch": 0.12125, "grad_norm": 3.34375, "grad_norm_var": 0.024828084309895835, "learning_rate": 0.0001, "loss": 8.8914, "loss/crossentropy": 2.4618613719940186, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.3022569268941879, "step": 1940 }, { "epoch": 0.121375, "grad_norm": 3.234375, "grad_norm_var": 0.022606404622395833, "learning_rate": 0.0001, "loss": 8.5706, "loss/crossentropy": 2.3783398866653442, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3050181269645691, "step": 1942 }, { "epoch": 0.1215, "grad_norm": 3.53125, "grad_norm_var": 0.0193511962890625, "learning_rate": 0.0001, "loss": 8.6295, "loss/crossentropy": 2.1171228885650635, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26293374598026276, "step": 1944 }, { "epoch": 0.121625, "grad_norm": 4.6875, "grad_norm_var": 1.4903310139973958, "learning_rate": 0.0001, "loss": 8.7994, "loss/crossentropy": 2.255262017250061, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3071945011615753, "step": 1946 }, { "epoch": 0.12175, "grad_norm": 3.53125, "grad_norm_var": 1.4472819010416667, "learning_rate": 0.0001, "loss": 8.4087, "loss/crossentropy": 2.3733495473861694, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28868359327316284, "step": 1948 }, { "epoch": 0.121875, "grad_norm": 3.4375, "grad_norm_var": 1.4477701822916667, "learning_rate": 0.0001, "loss": 8.6656, "loss/crossentropy": 2.264806866645813, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2763102054595947, "step": 1950 }, { "epoch": 0.122, "grad_norm": 3.1875, "grad_norm_var": 1.4537394205729166, "learning_rate": 0.0001, "loss": 8.4854, "loss/crossentropy": 2.217471718788147, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2726486176252365, "step": 1952 }, { "epoch": 0.122125, "grad_norm": 3.1875, "grad_norm_var": 1.4633951822916667, "learning_rate": 0.0001, "loss": 8.8987, "loss/crossentropy": 2.348217725753784, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.3073858618736267, "step": 1954 }, { "epoch": 0.12225, "grad_norm": 3.15625, "grad_norm_var": 1.4752604166666667, "learning_rate": 0.0001, "loss": 8.5881, "loss/crossentropy": 2.3927940130233765, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2712845206260681, "step": 1956 }, { "epoch": 0.122375, "grad_norm": 3.53125, "grad_norm_var": 1.4529693603515625, "learning_rate": 0.0001, "loss": 8.5416, "loss/crossentropy": 2.251898407936096, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.269486665725708, "step": 1958 }, { "epoch": 0.1225, "grad_norm": 3.21875, "grad_norm_var": 1.4741363525390625, "learning_rate": 0.0001, "loss": 8.558, "loss/crossentropy": 2.247495174407959, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2935321629047394, "step": 1960 }, { "epoch": 0.122625, "grad_norm": 5.0, "grad_norm_var": 0.20015869140625, "learning_rate": 0.0001, "loss": 8.6442, "loss/crossentropy": 2.0654982328414917, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2634906470775604, "step": 1962 }, { "epoch": 0.12275, "grad_norm": 3.375, "grad_norm_var": 0.21668192545572917, "learning_rate": 0.0001, "loss": 8.4305, "loss/crossentropy": 2.16032737493515, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.28428590297698975, "step": 1964 }, { "epoch": 0.122875, "grad_norm": 3.1875, "grad_norm_var": 0.22750244140625, "learning_rate": 0.0001, "loss": 8.3857, "loss/crossentropy": 2.5656957626342773, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.3001856654882431, "step": 1966 }, { "epoch": 0.123, "grad_norm": 3.46875, "grad_norm_var": 0.22258199055989583, "learning_rate": 0.0001, "loss": 8.5005, "loss/crossentropy": 2.350658416748047, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.29954949021339417, "step": 1968 }, { "epoch": 0.123125, "grad_norm": 3.390625, "grad_norm_var": 0.217919921875, "learning_rate": 0.0001, "loss": 8.7335, "loss/crossentropy": 2.528357982635498, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.3156994730234146, "step": 1970 }, { "epoch": 0.12325, "grad_norm": 3.515625, "grad_norm_var": 0.20797119140625, "learning_rate": 0.0001, "loss": 8.5015, "loss/crossentropy": 2.491376519203186, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.29096619784832, "step": 1972 }, { "epoch": 0.123375, "grad_norm": 3.171875, "grad_norm_var": 0.21378580729166666, "learning_rate": 0.0001, "loss": 8.624, "loss/crossentropy": 2.5760600566864014, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.30827219784259796, "step": 1974 }, { "epoch": 0.1235, "grad_norm": 3.546875, "grad_norm_var": 0.21451822916666666, "learning_rate": 0.0001, "loss": 8.588, "loss/crossentropy": 2.3280161023139954, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.29429638385772705, "step": 1976 }, { "epoch": 0.123625, "grad_norm": 3.3125, "grad_norm_var": 0.044367472330729164, "learning_rate": 0.0001, "loss": 8.578, "loss/crossentropy": 2.5439298152923584, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.30466994643211365, "step": 1978 }, { "epoch": 0.12375, "grad_norm": 3.140625, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 8.3857, "loss/crossentropy": 2.264935255050659, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28621000051498413, "step": 1980 }, { "epoch": 0.123875, "grad_norm": 3.375, "grad_norm_var": 0.015653483072916665, "learning_rate": 0.0001, "loss": 8.5949, "loss/crossentropy": 2.4394866228103638, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28367021679878235, "step": 1982 }, { "epoch": 0.124, "grad_norm": 3.09375, "grad_norm_var": 0.03601786295572917, "learning_rate": 0.0001, "loss": 8.6264, "loss/crossentropy": 2.1645785570144653, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.32053497433662415, "step": 1984 }, { "epoch": 0.124125, "grad_norm": 3.125, "grad_norm_var": 0.04356180826822917, "learning_rate": 0.0001, "loss": 8.283, "loss/crossentropy": 2.332149863243103, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2783034145832062, "step": 1986 }, { "epoch": 0.12425, "grad_norm": 3.125, "grad_norm_var": 0.0423492431640625, "learning_rate": 0.0001, "loss": 8.419, "loss/crossentropy": 2.2908122539520264, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.26500867307186127, "step": 1988 }, { "epoch": 0.124375, "grad_norm": 3.046875, "grad_norm_var": 0.0461334228515625, "learning_rate": 0.0001, "loss": 8.4965, "loss/crossentropy": 2.2126917839050293, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2839796543121338, "step": 1990 }, { "epoch": 0.1245, "grad_norm": 4.09375, "grad_norm_var": 0.47056884765625, "learning_rate": 0.0001, "loss": 8.9506, "loss/crossentropy": 2.355017066001892, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.28324250876903534, "step": 1992 }, { "epoch": 0.124625, "grad_norm": 3.15625, "grad_norm_var": 0.47298075358072916, "learning_rate": 0.0001, "loss": 8.5804, "loss/crossentropy": 2.408555507659912, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.28895722329616547, "step": 1994 }, { "epoch": 0.12475, "grad_norm": 3.046875, "grad_norm_var": 0.4876129150390625, "learning_rate": 0.0001, "loss": 8.2851, "loss/crossentropy": 2.251736283302307, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2749274671077728, "step": 1996 }, { "epoch": 0.124875, "grad_norm": 3.578125, "grad_norm_var": 0.4910634358723958, "learning_rate": 0.0001, "loss": 8.6195, "loss/crossentropy": 2.457157015800476, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.31056058406829834, "step": 1998 }, { "epoch": 0.125, "grad_norm": 3.4375, "grad_norm_var": 0.47468973795572916, "learning_rate": 0.0001, "loss": 8.3637, "loss/crossentropy": 2.2277281284332275, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2744043469429016, "step": 2000 }, { "epoch": 0.125125, "grad_norm": 3.28125, "grad_norm_var": 0.45701497395833335, "learning_rate": 0.0001, "loss": 8.6506, "loss/crossentropy": 2.4241243600845337, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2997867166996002, "step": 2002 }, { "epoch": 0.12525, "grad_norm": 3.296875, "grad_norm_var": 0.45701497395833335, "learning_rate": 0.0001, "loss": 8.5374, "loss/crossentropy": 2.3989826440811157, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2899128496646881, "step": 2004 }, { "epoch": 0.125375, "grad_norm": 3.65625, "grad_norm_var": 0.44111226399739584, "learning_rate": 0.0001, "loss": 8.5511, "loss/crossentropy": 2.326428711414337, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.30725909769535065, "step": 2006 }, { "epoch": 0.1255, "grad_norm": 3.015625, "grad_norm_var": 0.0493560791015625, "learning_rate": 0.0001, "loss": 8.4754, "loss/crossentropy": 2.232123017311096, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26897597312927246, "step": 2008 }, { "epoch": 0.125625, "grad_norm": 3.171875, "grad_norm_var": 0.046240234375, "learning_rate": 0.0001, "loss": 8.5567, "loss/crossentropy": 2.353936553001404, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27618004381656647, "step": 2010 }, { "epoch": 0.12575, "grad_norm": 3.203125, "grad_norm_var": 0.039469401041666664, "learning_rate": 0.0001, "loss": 8.5775, "loss/crossentropy": 1.9717338681221008, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26382866501808167, "step": 2012 }, { "epoch": 0.125875, "grad_norm": 3.203125, "grad_norm_var": 0.023908487955729165, "learning_rate": 0.0001, "loss": 8.5801, "loss/crossentropy": 2.315757632255554, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29249346256256104, "step": 2014 }, { "epoch": 0.126, "grad_norm": 4.71875, "grad_norm_var": 0.7569986979166666, "learning_rate": 0.0001, "loss": 8.9827, "loss/crossentropy": 2.2849130630493164, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.36747997999191284, "step": 2016 }, { "epoch": 0.126125, "grad_norm": 3.3125, "grad_norm_var": 0.8076456705729167, "learning_rate": 0.0001, "loss": 8.4695, "loss/crossentropy": 2.2076889276504517, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.33494821190834045, "step": 2018 }, { "epoch": 0.12625, "grad_norm": 3.28125, "grad_norm_var": 0.7920888264973959, "learning_rate": 0.0001, "loss": 8.7315, "loss/crossentropy": 2.2882113456726074, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2940904349088669, "step": 2020 }, { "epoch": 0.126375, "grad_norm": 3.78125, "grad_norm_var": 0.7946451822916667, "learning_rate": 0.0001, "loss": 8.8536, "loss/crossentropy": 2.5705249309539795, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.33114902675151825, "step": 2022 }, { "epoch": 0.1265, "grad_norm": 3.125, "grad_norm_var": 0.78818359375, "learning_rate": 0.0001, "loss": 8.5144, "loss/crossentropy": 2.367907762527466, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.29049575328826904, "step": 2024 }, { "epoch": 0.126625, "grad_norm": 4.34375, "grad_norm_var": 2.528955078125, "learning_rate": 0.0001, "loss": 8.9281, "loss/crossentropy": 2.29294753074646, "loss/hidden": 3.6171875, "loss/jsd": 0.0, "loss/logits": 0.38897041976451874, "step": 2026 }, { "epoch": 0.12675, "grad_norm": 3.703125, "grad_norm_var": 2.452977498372396, "learning_rate": 0.0001, "loss": 8.7142, "loss/crossentropy": 2.286558747291565, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.28155122697353363, "step": 2028 }, { "epoch": 0.126875, "grad_norm": 3.46875, "grad_norm_var": 2.3957753499348957, "learning_rate": 0.0001, "loss": 8.5948, "loss/crossentropy": 2.4738941192626953, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2834329903125763, "step": 2030 }, { "epoch": 0.127, "grad_norm": 3.296875, "grad_norm_var": 2.019173177083333, "learning_rate": 0.0001, "loss": 8.7443, "loss/crossentropy": 2.4368330240249634, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2754772901535034, "step": 2032 }, { "epoch": 0.127125, "grad_norm": 3.484375, "grad_norm_var": 1.99468994140625, "learning_rate": 0.0001, "loss": 8.4763, "loss/crossentropy": 2.287817358970642, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.25924333184957504, "step": 2034 }, { "epoch": 0.12725, "grad_norm": 3.28125, "grad_norm_var": 2.008112589518229, "learning_rate": 0.0001, "loss": 8.6018, "loss/crossentropy": 2.2494828701019287, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.29782405495643616, "step": 2036 }, { "epoch": 0.127375, "grad_norm": 3.5625, "grad_norm_var": 2.017438761393229, "learning_rate": 0.0001, "loss": 8.6547, "loss/crossentropy": 2.3243712186813354, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.28735925257205963, "step": 2038 }, { "epoch": 0.1275, "grad_norm": 3.875, "grad_norm_var": 1.9649648030598958, "learning_rate": 0.0001, "loss": 8.5979, "loss/crossentropy": 2.205388069152832, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2879514992237091, "step": 2040 }, { "epoch": 0.127625, "grad_norm": 3.546875, "grad_norm_var": 0.0300201416015625, "learning_rate": 0.0001, "loss": 8.7234, "loss/crossentropy": 2.3310853242874146, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27090397477149963, "step": 2042 }, { "epoch": 0.12775, "grad_norm": 3.609375, "grad_norm_var": 0.0336822509765625, "learning_rate": 0.0001, "loss": 8.6942, "loss/crossentropy": 2.411409616470337, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.3128150999546051, "step": 2044 }, { "epoch": 0.127875, "grad_norm": 3.71875, "grad_norm_var": 0.0386871337890625, "learning_rate": 0.0001, "loss": 8.7962, "loss/crossentropy": 2.531530022621155, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2983619421720505, "step": 2046 }, { "epoch": 0.128, "grad_norm": 3.28125, "grad_norm_var": 0.03849995930989583, "learning_rate": 0.0001, "loss": 8.6854, "loss/crossentropy": 2.4569283723831177, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2882058769464493, "step": 2048 }, { "epoch": 0.128125, "grad_norm": 3.15625, "grad_norm_var": 0.051634724934895834, "learning_rate": 0.0001, "loss": 8.5848, "loss/crossentropy": 2.2018297910690308, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27827227115631104, "step": 2050 }, { "epoch": 0.12825, "grad_norm": 3.65625, "grad_norm_var": 0.08240559895833334, "learning_rate": 0.0001, "loss": 9.0058, "loss/crossentropy": 2.3052055835723877, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.3264636695384979, "step": 2052 }, { "epoch": 0.128375, "grad_norm": 3.265625, "grad_norm_var": 0.08439839680989583, "learning_rate": 0.0001, "loss": 8.3336, "loss/crossentropy": 2.3923838138580322, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28575557470321655, "step": 2054 }, { "epoch": 0.1285, "grad_norm": 3.296875, "grad_norm_var": 0.0878326416015625, "learning_rate": 0.0001, "loss": 8.513, "loss/crossentropy": 2.1959877014160156, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2940082252025604, "step": 2056 }, { "epoch": 0.128625, "grad_norm": 3.59375, "grad_norm_var": 0.08575846354166666, "learning_rate": 0.0001, "loss": 8.812, "loss/crossentropy": 2.4442625045776367, "loss/hidden": 3.5390625, "loss/jsd": 0.0, "loss/logits": 0.30026645958423615, "step": 2058 }, { "epoch": 0.12875, "grad_norm": 3.375, "grad_norm_var": 0.08087565104166666, "learning_rate": 0.0001, "loss": 8.5372, "loss/crossentropy": 2.319927215576172, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.33365097641944885, "step": 2060 }, { "epoch": 0.128875, "grad_norm": 3.046875, "grad_norm_var": 0.079296875, "learning_rate": 0.0001, "loss": 8.6948, "loss/crossentropy": 2.263151526451111, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3065057694911957, "step": 2062 }, { "epoch": 0.129, "grad_norm": 3.109375, "grad_norm_var": 0.08463541666666667, "learning_rate": 0.0001, "loss": 8.4387, "loss/crossentropy": 2.470115303993225, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29933932423591614, "step": 2064 }, { "epoch": 0.129125, "grad_norm": 3.0625, "grad_norm_var": 0.08762613932291667, "learning_rate": 0.0001, "loss": 8.6413, "loss/crossentropy": 2.356273889541626, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2907129377126694, "step": 2066 }, { "epoch": 0.12925, "grad_norm": 3.625, "grad_norm_var": 0.034505208333333336, "learning_rate": 0.0001, "loss": 8.7037, "loss/crossentropy": 2.3621848821640015, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2840597927570343, "step": 2068 }, { "epoch": 0.129375, "grad_norm": 3.5, "grad_norm_var": 0.03486226399739583, "learning_rate": 0.0001, "loss": 8.5151, "loss/crossentropy": 2.034373462200165, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2726925313472748, "step": 2070 }, { "epoch": 0.1295, "grad_norm": 3.515625, "grad_norm_var": 0.034016927083333336, "learning_rate": 0.0001, "loss": 8.6894, "loss/crossentropy": 2.224942922592163, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2749377191066742, "step": 2072 }, { "epoch": 0.129625, "grad_norm": 3.0, "grad_norm_var": 0.04013671875, "learning_rate": 0.0001, "loss": 8.7369, "loss/crossentropy": 2.385498046875, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28188909590244293, "step": 2074 }, { "epoch": 0.12975, "grad_norm": 3.390625, "grad_norm_var": 0.04454752604166667, "learning_rate": 0.0001, "loss": 8.79, "loss/crossentropy": 2.5538192987442017, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.301289901137352, "step": 2076 }, { "epoch": 0.129875, "grad_norm": 3.203125, "grad_norm_var": 0.04350484212239583, "learning_rate": 0.0001, "loss": 8.5047, "loss/crossentropy": 2.343226909637451, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.30614304542541504, "step": 2078 }, { "epoch": 0.13, "grad_norm": 3.5, "grad_norm_var": 0.04088541666666667, "learning_rate": 0.0001, "loss": 8.5874, "loss/crossentropy": 2.5897929668426514, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.29140761494636536, "step": 2080 }, { "epoch": 0.130125, "grad_norm": 3.1875, "grad_norm_var": 0.0357330322265625, "learning_rate": 0.0001, "loss": 8.4543, "loss/crossentropy": 2.3254483938217163, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.27507585287094116, "step": 2082 }, { "epoch": 0.13025, "grad_norm": 3.359375, "grad_norm_var": 0.03479817708333333, "learning_rate": 0.0001, "loss": 8.5021, "loss/crossentropy": 2.230491876602173, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2707635313272476, "step": 2084 }, { "epoch": 0.130375, "grad_norm": 3.328125, "grad_norm_var": 0.0332183837890625, "learning_rate": 0.0001, "loss": 8.355, "loss/crossentropy": 2.4114824533462524, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28432922065258026, "step": 2086 }, { "epoch": 0.1305, "grad_norm": 3.15625, "grad_norm_var": 0.0313873291015625, "learning_rate": 0.0001, "loss": 8.5269, "loss/crossentropy": 2.230819344520569, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26765523850917816, "step": 2088 }, { "epoch": 0.130625, "grad_norm": 3.203125, "grad_norm_var": 0.0262115478515625, "learning_rate": 0.0001, "loss": 8.4321, "loss/crossentropy": 2.4839935302734375, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2982227951288223, "step": 2090 }, { "epoch": 0.13075, "grad_norm": 3.5, "grad_norm_var": 0.020796712239583334, "learning_rate": 0.0001, "loss": 8.5413, "loss/crossentropy": 2.4406583309173584, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.27541063725948334, "step": 2092 }, { "epoch": 0.130875, "grad_norm": 3.625, "grad_norm_var": 0.0326080322265625, "learning_rate": 0.0001, "loss": 8.3994, "loss/crossentropy": 2.193255662918091, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2544540911912918, "step": 2094 }, { "epoch": 0.131, "grad_norm": 3.375, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 8.6152, "loss/crossentropy": 2.3712345361709595, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.27534911036491394, "step": 2096 }, { "epoch": 0.131125, "grad_norm": 3.21875, "grad_norm_var": 0.036454264322916666, "learning_rate": 0.0001, "loss": 8.4218, "loss/crossentropy": 2.301249861717224, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2652607709169388, "step": 2098 }, { "epoch": 0.13125, "grad_norm": 3.40625, "grad_norm_var": 0.059056599934895836, "learning_rate": 0.0001, "loss": 8.8422, "loss/crossentropy": 2.4641329050064087, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2975671887397766, "step": 2100 }, { "epoch": 0.131375, "grad_norm": 3.328125, "grad_norm_var": 0.05963134765625, "learning_rate": 0.0001, "loss": 8.5845, "loss/crossentropy": 2.577459692955017, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.2966485619544983, "step": 2102 }, { "epoch": 0.1315, "grad_norm": 3.0, "grad_norm_var": 0.0670562744140625, "learning_rate": 0.0001, "loss": 8.4679, "loss/crossentropy": 2.3673853874206543, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2907790243625641, "step": 2104 }, { "epoch": 0.131625, "grad_norm": 3.421875, "grad_norm_var": 0.06606343587239584, "learning_rate": 0.0001, "loss": 8.7638, "loss/crossentropy": 2.519857883453369, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.30287206172943115, "step": 2106 }, { "epoch": 0.13175, "grad_norm": 3.28125, "grad_norm_var": 0.06646728515625, "learning_rate": 0.0001, "loss": 8.4918, "loss/crossentropy": 2.1893075108528137, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2853415459394455, "step": 2108 }, { "epoch": 0.131875, "grad_norm": 3.421875, "grad_norm_var": 0.054442342122395834, "learning_rate": 0.0001, "loss": 8.3529, "loss/crossentropy": 2.02129727602005, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2583470046520233, "step": 2110 }, { "epoch": 0.132, "grad_norm": 3.0625, "grad_norm_var": 0.06260477701822917, "learning_rate": 0.0001, "loss": 8.2412, "loss/crossentropy": 2.3207201957702637, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2662471830844879, "step": 2112 }, { "epoch": 0.132125, "grad_norm": 3.09375, "grad_norm_var": 0.05496419270833333, "learning_rate": 0.0001, "loss": 8.6733, "loss/crossentropy": 2.3841429948806763, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2923773378133774, "step": 2114 }, { "epoch": 0.13225, "grad_norm": 3.359375, "grad_norm_var": 0.026488240559895834, "learning_rate": 0.0001, "loss": 8.6388, "loss/crossentropy": 2.178789973258972, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27935703098773956, "step": 2116 }, { "epoch": 0.132375, "grad_norm": 3.390625, "grad_norm_var": 0.04003499348958333, "learning_rate": 0.0001, "loss": 8.5069, "loss/crossentropy": 2.1232659816741943, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2767260819673538, "step": 2118 }, { "epoch": 0.1325, "grad_norm": 3.859375, "grad_norm_var": 0.054906209309895836, "learning_rate": 0.0001, "loss": 8.6523, "loss/crossentropy": 2.349829316139221, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2905244827270508, "step": 2120 }, { "epoch": 0.132625, "grad_norm": 3.453125, "grad_norm_var": 0.08715718587239583, "learning_rate": 0.0001, "loss": 8.7333, "loss/crossentropy": 2.5470376014709473, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.3190218657255173, "step": 2122 }, { "epoch": 0.13275, "grad_norm": 3.28125, "grad_norm_var": 0.08217671712239584, "learning_rate": 0.0001, "loss": 8.6398, "loss/crossentropy": 2.436127185821533, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.29636839032173157, "step": 2124 }, { "epoch": 0.132875, "grad_norm": 3.28125, "grad_norm_var": 0.07965494791666666, "learning_rate": 0.0001, "loss": 8.2415, "loss/crossentropy": 2.409112572669983, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2878931760787964, "step": 2126 }, { "epoch": 0.133, "grad_norm": 3.09375, "grad_norm_var": 0.06551106770833333, "learning_rate": 0.0001, "loss": 8.579, "loss/crossentropy": 2.3906946182250977, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27670612931251526, "step": 2128 }, { "epoch": 0.133125, "grad_norm": 3.15625, "grad_norm_var": 0.064599609375, "learning_rate": 0.0001, "loss": 8.4984, "loss/crossentropy": 2.351179838180542, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29204052686691284, "step": 2130 }, { "epoch": 0.13325, "grad_norm": 3.5625, "grad_norm_var": 0.06601460774739583, "learning_rate": 0.0001, "loss": 8.4208, "loss/crossentropy": 2.373140573501587, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27188287675380707, "step": 2132 }, { "epoch": 0.133375, "grad_norm": 3.40625, "grad_norm_var": 0.062353515625, "learning_rate": 0.0001, "loss": 8.5288, "loss/crossentropy": 2.3178237676620483, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2606750875711441, "step": 2134 }, { "epoch": 0.1335, "grad_norm": 3.640625, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 8.6286, "loss/crossentropy": 2.5622605085372925, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2858807444572449, "step": 2136 }, { "epoch": 0.133625, "grad_norm": 3.21875, "grad_norm_var": 0.025934855143229168, "learning_rate": 0.0001, "loss": 8.6969, "loss/crossentropy": 2.461153268814087, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2880861461162567, "step": 2138 }, { "epoch": 0.13375, "grad_norm": 3.3125, "grad_norm_var": 0.030370076497395832, "learning_rate": 0.0001, "loss": 8.2534, "loss/crossentropy": 2.104259490966797, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.2679433301091194, "step": 2140 }, { "epoch": 0.133875, "grad_norm": 3.21875, "grad_norm_var": 0.0362457275390625, "learning_rate": 0.0001, "loss": 8.6085, "loss/crossentropy": 2.4813841581344604, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29470039904117584, "step": 2142 }, { "epoch": 0.134, "grad_norm": 3.09375, "grad_norm_var": 0.03515625, "learning_rate": 0.0001, "loss": 8.4156, "loss/crossentropy": 2.3401472568511963, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2929167151451111, "step": 2144 }, { "epoch": 0.134125, "grad_norm": 3.09375, "grad_norm_var": 0.037840779622395834, "learning_rate": 0.0001, "loss": 8.5012, "loss/crossentropy": 2.3249677419662476, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2590218484401703, "step": 2146 }, { "epoch": 0.13425, "grad_norm": 3.40625, "grad_norm_var": 0.03340555826822917, "learning_rate": 0.0001, "loss": 8.2595, "loss/crossentropy": 2.1680409908294678, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.258025586605072, "step": 2148 }, { "epoch": 0.134375, "grad_norm": 3.578125, "grad_norm_var": 0.0428131103515625, "learning_rate": 0.0001, "loss": 8.4906, "loss/crossentropy": 2.3455265760421753, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2917974293231964, "step": 2150 }, { "epoch": 0.1345, "grad_norm": 3.359375, "grad_norm_var": 0.0469146728515625, "learning_rate": 0.0001, "loss": 8.7886, "loss/crossentropy": 2.513558268547058, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3155888319015503, "step": 2152 }, { "epoch": 0.134625, "grad_norm": 3.078125, "grad_norm_var": 0.04812825520833333, "learning_rate": 0.0001, "loss": 8.5435, "loss/crossentropy": 2.6064085960388184, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2825528085231781, "step": 2154 }, { "epoch": 0.13475, "grad_norm": 3.234375, "grad_norm_var": 0.06126200358072917, "learning_rate": 0.0001, "loss": 8.3432, "loss/crossentropy": 2.2562918663024902, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28957146406173706, "step": 2156 }, { "epoch": 0.134875, "grad_norm": 3.265625, "grad_norm_var": 0.05621337890625, "learning_rate": 0.0001, "loss": 8.5618, "loss/crossentropy": 2.3136632442474365, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2644960880279541, "step": 2158 }, { "epoch": 0.135, "grad_norm": 3.125, "grad_norm_var": 0.055394490559895836, "learning_rate": 0.0001, "loss": 8.2383, "loss/crossentropy": 2.1522982120513916, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2998431622982025, "step": 2160 }, { "epoch": 0.135125, "grad_norm": 3.28125, "grad_norm_var": 0.05455322265625, "learning_rate": 0.0001, "loss": 8.5748, "loss/crossentropy": 2.4402201175689697, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2670477479696274, "step": 2162 }, { "epoch": 0.13525, "grad_norm": 3.125, "grad_norm_var": 0.05601806640625, "learning_rate": 0.0001, "loss": 8.3886, "loss/crossentropy": 2.46258282661438, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.293775275349617, "step": 2164 }, { "epoch": 0.135375, "grad_norm": 2.875, "grad_norm_var": 0.0496734619140625, "learning_rate": 0.0001, "loss": 8.3114, "loss/crossentropy": 2.2047289609909058, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2652291804552078, "step": 2166 }, { "epoch": 0.1355, "grad_norm": 3.140625, "grad_norm_var": 0.028218587239583332, "learning_rate": 0.0001, "loss": 8.4665, "loss/crossentropy": 2.1923593282699585, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2693886160850525, "step": 2168 }, { "epoch": 0.135625, "grad_norm": 3.359375, "grad_norm_var": 0.026949055989583335, "learning_rate": 0.0001, "loss": 8.4295, "loss/crossentropy": 2.529042959213257, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2995428442955017, "step": 2170 }, { "epoch": 0.13575, "grad_norm": 3.234375, "grad_norm_var": 0.017671712239583335, "learning_rate": 0.0001, "loss": 8.4678, "loss/crossentropy": 2.3110212087631226, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.2710278108716011, "step": 2172 }, { "epoch": 0.135875, "grad_norm": 3.53125, "grad_norm_var": 0.023298136393229165, "learning_rate": 0.0001, "loss": 8.3845, "loss/crossentropy": 2.3812527656555176, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.28374361991882324, "step": 2174 }, { "epoch": 0.136, "grad_norm": 2.9375, "grad_norm_var": 0.030338541666666666, "learning_rate": 0.0001, "loss": 8.4029, "loss/crossentropy": 2.444836735725403, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.2983601689338684, "step": 2176 }, { "epoch": 0.136125, "grad_norm": 3.1875, "grad_norm_var": 0.028856404622395835, "learning_rate": 0.0001, "loss": 8.4012, "loss/crossentropy": 2.2179330587387085, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27099834382534027, "step": 2178 }, { "epoch": 0.13625, "grad_norm": 3.203125, "grad_norm_var": 0.0270660400390625, "learning_rate": 0.0001, "loss": 8.1745, "loss/crossentropy": 2.220338225364685, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26341523230075836, "step": 2180 }, { "epoch": 0.136375, "grad_norm": 3.375, "grad_norm_var": 0.0201324462890625, "learning_rate": 0.0001, "loss": 8.5619, "loss/crossentropy": 2.3334707021713257, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2882850170135498, "step": 2182 }, { "epoch": 0.1365, "grad_norm": 3.078125, "grad_norm_var": 0.022639973958333334, "learning_rate": 0.0001, "loss": 8.1465, "loss/crossentropy": 1.9602341055870056, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2532319873571396, "step": 2184 }, { "epoch": 0.136625, "grad_norm": 3.25, "grad_norm_var": 0.0210113525390625, "learning_rate": 0.0001, "loss": 8.4768, "loss/crossentropy": 2.127853035926819, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.3045553117990494, "step": 2186 }, { "epoch": 0.13675, "grad_norm": 3.4375, "grad_norm_var": 0.0257720947265625, "learning_rate": 0.0001, "loss": 8.5223, "loss/crossentropy": 2.3501694202423096, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.274165078997612, "step": 2188 }, { "epoch": 0.136875, "grad_norm": 3.40625, "grad_norm_var": 0.022223917643229167, "learning_rate": 0.0001, "loss": 8.4563, "loss/crossentropy": 2.1404179334640503, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2844165414571762, "step": 2190 }, { "epoch": 0.137, "grad_norm": 3.71875, "grad_norm_var": 0.03388264973958333, "learning_rate": 0.0001, "loss": 8.4094, "loss/crossentropy": 2.245135545730591, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2556656002998352, "step": 2192 }, { "epoch": 0.137125, "grad_norm": 3.078125, "grad_norm_var": 0.03547261555989583, "learning_rate": 0.0001, "loss": 8.7157, "loss/crossentropy": 2.41064453125, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.267337366938591, "step": 2194 }, { "epoch": 0.13725, "grad_norm": 3.234375, "grad_norm_var": 0.03752848307291667, "learning_rate": 0.0001, "loss": 8.5773, "loss/crossentropy": 2.5315933227539062, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.286949098110199, "step": 2196 }, { "epoch": 0.137375, "grad_norm": 2.84375, "grad_norm_var": 0.048111979166666666, "learning_rate": 0.0001, "loss": 8.3222, "loss/crossentropy": 2.3176004886627197, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2734638601541519, "step": 2198 }, { "epoch": 0.1375, "grad_norm": 3.515625, "grad_norm_var": 0.05325113932291667, "learning_rate": 0.0001, "loss": 8.5529, "loss/crossentropy": 2.339990735054016, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27560897171497345, "step": 2200 }, { "epoch": 0.137625, "grad_norm": 3.078125, "grad_norm_var": 0.06060791015625, "learning_rate": 0.0001, "loss": 8.4805, "loss/crossentropy": 2.284912347793579, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2868229001760483, "step": 2202 }, { "epoch": 0.13775, "grad_norm": 3.09375, "grad_norm_var": 0.056452433268229164, "learning_rate": 0.0001, "loss": 8.4745, "loss/crossentropy": 2.400985598564148, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.3073076903820038, "step": 2204 }, { "epoch": 0.137875, "grad_norm": 3.21875, "grad_norm_var": 0.05416666666666667, "learning_rate": 0.0001, "loss": 8.3061, "loss/crossentropy": 2.6853994131088257, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.29532191157341003, "step": 2206 }, { "epoch": 0.138, "grad_norm": 3.203125, "grad_norm_var": 0.025739542643229165, "learning_rate": 0.0001, "loss": 8.3561, "loss/crossentropy": 2.3069804906845093, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27918627858161926, "step": 2208 }, { "epoch": 0.138125, "grad_norm": 3.15625, "grad_norm_var": 0.04568684895833333, "learning_rate": 0.0001, "loss": 8.303, "loss/crossentropy": 2.101306200027466, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27222634851932526, "step": 2210 }, { "epoch": 0.13825, "grad_norm": 3.328125, "grad_norm_var": 0.0466461181640625, "learning_rate": 0.0001, "loss": 8.4047, "loss/crossentropy": 2.2453717589378357, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2868994474411011, "step": 2212 }, { "epoch": 0.138375, "grad_norm": 3.1875, "grad_norm_var": 0.03795166015625, "learning_rate": 0.0001, "loss": 8.371, "loss/crossentropy": 2.373469829559326, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.26734408736228943, "step": 2214 }, { "epoch": 0.1385, "grad_norm": 3.3125, "grad_norm_var": 0.0298492431640625, "learning_rate": 0.0001, "loss": 8.5017, "loss/crossentropy": 2.28477144241333, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2690521627664566, "step": 2216 }, { "epoch": 0.138625, "grad_norm": 3.34375, "grad_norm_var": 0.025419108072916665, "learning_rate": 0.0001, "loss": 8.4961, "loss/crossentropy": 2.404952049255371, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28461331129074097, "step": 2218 }, { "epoch": 0.13875, "grad_norm": 3.15625, "grad_norm_var": 0.0295806884765625, "learning_rate": 0.0001, "loss": 8.5059, "loss/crossentropy": 2.3059340715408325, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27501142024993896, "step": 2220 }, { "epoch": 0.138875, "grad_norm": 3.03125, "grad_norm_var": 0.032877604166666664, "learning_rate": 0.0001, "loss": 8.5706, "loss/crossentropy": 2.2778221368789673, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2830122113227844, "step": 2222 }, { "epoch": 0.139, "grad_norm": 3.421875, "grad_norm_var": 0.0301666259765625, "learning_rate": 0.0001, "loss": 8.4701, "loss/crossentropy": 2.1778889894485474, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2717326432466507, "step": 2224 }, { "epoch": 0.139125, "grad_norm": 3.25, "grad_norm_var": 0.02222900390625, "learning_rate": 0.0001, "loss": 8.5608, "loss/crossentropy": 2.4517624378204346, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2836722731590271, "step": 2226 }, { "epoch": 0.13925, "grad_norm": 3.453125, "grad_norm_var": 0.02724609375, "learning_rate": 0.0001, "loss": 8.3748, "loss/crossentropy": 2.1878401041030884, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2892114222049713, "step": 2228 }, { "epoch": 0.139375, "grad_norm": 3.09375, "grad_norm_var": 0.02783203125, "learning_rate": 0.0001, "loss": 8.4792, "loss/crossentropy": 2.355382800102234, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.25766345858573914, "step": 2230 }, { "epoch": 0.1395, "grad_norm": 3.59375, "grad_norm_var": 0.035130818684895836, "learning_rate": 0.0001, "loss": 8.5125, "loss/crossentropy": 2.2248635292053223, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2683367282152176, "step": 2232 }, { "epoch": 0.139625, "grad_norm": 3.625, "grad_norm_var": 0.0451080322265625, "learning_rate": 0.0001, "loss": 8.5788, "loss/crossentropy": 2.253064751625061, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2640530467033386, "step": 2234 }, { "epoch": 0.13975, "grad_norm": 3.140625, "grad_norm_var": 0.043797810872395836, "learning_rate": 0.0001, "loss": 8.3916, "loss/crossentropy": 2.373534321784973, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27554066479206085, "step": 2236 }, { "epoch": 0.139875, "grad_norm": 3.140625, "grad_norm_var": 0.04543355305989583, "learning_rate": 0.0001, "loss": 8.5169, "loss/crossentropy": 2.360267996788025, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2985559552907944, "step": 2238 }, { "epoch": 0.14, "grad_norm": 3.0, "grad_norm_var": 0.05829671223958333, "learning_rate": 0.0001, "loss": 8.2996, "loss/crossentropy": 2.340610146522522, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26291558146476746, "step": 2240 }, { "epoch": 0.140125, "grad_norm": 3.296875, "grad_norm_var": 0.052144368489583336, "learning_rate": 0.0001, "loss": 8.592, "loss/crossentropy": 2.3476451635360718, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.30578161776065826, "step": 2242 }, { "epoch": 0.14025, "grad_norm": 3.265625, "grad_norm_var": 0.05040690104166667, "learning_rate": 0.0001, "loss": 8.2486, "loss/crossentropy": 2.257493257522583, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2669792026281357, "step": 2244 }, { "epoch": 0.140375, "grad_norm": 3.03125, "grad_norm_var": 0.05563151041666667, "learning_rate": 0.0001, "loss": 8.2584, "loss/crossentropy": 2.1079065799713135, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2586442455649376, "step": 2246 }, { "epoch": 0.1405, "grad_norm": 3.15625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 8.3155, "loss/crossentropy": 2.3655279874801636, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27686166763305664, "step": 2248 }, { "epoch": 0.140625, "grad_norm": 3.328125, "grad_norm_var": 0.023860677083333334, "learning_rate": 0.0001, "loss": 8.5854, "loss/crossentropy": 2.2470057010650635, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2807578891515732, "step": 2250 }, { "epoch": 0.14075, "grad_norm": 2.984375, "grad_norm_var": 0.020894368489583332, "learning_rate": 0.0001, "loss": 8.328, "loss/crossentropy": 2.281611919403076, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26320287585258484, "step": 2252 }, { "epoch": 0.140875, "grad_norm": 3.71875, "grad_norm_var": 0.042708333333333334, "learning_rate": 0.0001, "loss": 8.5691, "loss/crossentropy": 2.281616449356079, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2866251766681671, "step": 2254 }, { "epoch": 0.141, "grad_norm": 3.25, "grad_norm_var": 0.039872233072916666, "learning_rate": 0.0001, "loss": 8.6355, "loss/crossentropy": 2.4851930141448975, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2863723188638687, "step": 2256 }, { "epoch": 0.141125, "grad_norm": 3.0625, "grad_norm_var": 0.04378153483072917, "learning_rate": 0.0001, "loss": 8.3547, "loss/crossentropy": 2.2120649814605713, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26930585503578186, "step": 2258 }, { "epoch": 0.14125, "grad_norm": 3.234375, "grad_norm_var": 0.042008463541666666, "learning_rate": 0.0001, "loss": 8.5007, "loss/crossentropy": 2.3971279859542847, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29983802139759064, "step": 2260 }, { "epoch": 0.141375, "grad_norm": 3.328125, "grad_norm_var": 0.03905843098958333, "learning_rate": 0.0001, "loss": 8.5227, "loss/crossentropy": 2.2901759147644043, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2883574813604355, "step": 2262 }, { "epoch": 0.1415, "grad_norm": 3.0625, "grad_norm_var": 0.041478474934895836, "learning_rate": 0.0001, "loss": 8.402, "loss/crossentropy": 2.1983225345611572, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2799525707960129, "step": 2264 }, { "epoch": 0.141625, "grad_norm": 3.296875, "grad_norm_var": 0.038996378580729164, "learning_rate": 0.0001, "loss": 8.7039, "loss/crossentropy": 2.2069029808044434, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2689012736082077, "step": 2266 }, { "epoch": 0.14175, "grad_norm": 3.546875, "grad_norm_var": 0.03759358723958333, "learning_rate": 0.0001, "loss": 8.5396, "loss/crossentropy": 2.430467963218689, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2826516032218933, "step": 2268 }, { "epoch": 0.141875, "grad_norm": 3.0625, "grad_norm_var": 0.022489420572916665, "learning_rate": 0.0001, "loss": 8.2918, "loss/crossentropy": 2.29804265499115, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2685416340827942, "step": 2270 }, { "epoch": 0.142, "grad_norm": 3.1875, "grad_norm_var": 0.020930989583333334, "learning_rate": 0.0001, "loss": 8.4861, "loss/crossentropy": 2.4030654430389404, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26462114602327347, "step": 2272 }, { "epoch": 0.142125, "grad_norm": 3.453125, "grad_norm_var": 0.12026265462239584, "learning_rate": 0.0001, "loss": 8.3658, "loss/crossentropy": 2.0990917682647705, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.29184460639953613, "step": 2274 }, { "epoch": 0.14225, "grad_norm": 3.1875, "grad_norm_var": 0.12245992024739584, "learning_rate": 0.0001, "loss": 8.4349, "loss/crossentropy": 2.258981704711914, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2933773547410965, "step": 2276 }, { "epoch": 0.142375, "grad_norm": 3.53125, "grad_norm_var": 0.1346099853515625, "learning_rate": 0.0001, "loss": 8.5544, "loss/crossentropy": 2.414697289466858, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.28715577721595764, "step": 2278 }, { "epoch": 0.1425, "grad_norm": 3.21875, "grad_norm_var": 0.12939453125, "learning_rate": 0.0001, "loss": 8.4856, "loss/crossentropy": 2.343166470527649, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2715238779783249, "step": 2280 }, { "epoch": 0.142625, "grad_norm": 3.09375, "grad_norm_var": 0.13337300618489584, "learning_rate": 0.0001, "loss": 8.4238, "loss/crossentropy": 2.1820271015167236, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26598094403743744, "step": 2282 }, { "epoch": 0.14275, "grad_norm": 2.96875, "grad_norm_var": 0.14114583333333333, "learning_rate": 0.0001, "loss": 8.2716, "loss/crossentropy": 2.481716513633728, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2877477705478668, "step": 2284 }, { "epoch": 0.142875, "grad_norm": 3.078125, "grad_norm_var": 0.14442952473958334, "learning_rate": 0.0001, "loss": 8.2991, "loss/crossentropy": 2.216909646987915, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2711998224258423, "step": 2286 }, { "epoch": 0.143, "grad_norm": 3.34375, "grad_norm_var": 0.14483133951822916, "learning_rate": 0.0001, "loss": 8.2365, "loss/crossentropy": 2.162856936454773, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28143976628780365, "step": 2288 }, { "epoch": 0.143125, "grad_norm": 3.203125, "grad_norm_var": 0.0439605712890625, "learning_rate": 0.0001, "loss": 8.5854, "loss/crossentropy": 2.5739957094192505, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2899349331855774, "step": 2290 }, { "epoch": 0.14325, "grad_norm": 3.46875, "grad_norm_var": 0.0454742431640625, "learning_rate": 0.0001, "loss": 8.6261, "loss/crossentropy": 2.441213607788086, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28684163093566895, "step": 2292 }, { "epoch": 0.143375, "grad_norm": 3.359375, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 8.4656, "loss/crossentropy": 2.3305513858795166, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.30575811862945557, "step": 2294 }, { "epoch": 0.1435, "grad_norm": 3.640625, "grad_norm_var": 0.04129231770833333, "learning_rate": 0.0001, "loss": 8.5207, "loss/crossentropy": 2.4758119583129883, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2866526395082474, "step": 2296 }, { "epoch": 0.143625, "grad_norm": 3.046875, "grad_norm_var": 0.043822224934895834, "learning_rate": 0.0001, "loss": 8.308, "loss/crossentropy": 2.5252416133880615, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.29467783868312836, "step": 2298 }, { "epoch": 0.14375, "grad_norm": 2.90625, "grad_norm_var": 0.0426910400390625, "learning_rate": 0.0001, "loss": 8.0802, "loss/crossentropy": 2.417738676071167, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.29233458638191223, "step": 2300 }, { "epoch": 0.143875, "grad_norm": 3.09375, "grad_norm_var": 0.04114176432291667, "learning_rate": 0.0001, "loss": 8.2872, "loss/crossentropy": 2.298841118812561, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25550100207328796, "step": 2302 }, { "epoch": 0.144, "grad_norm": 3.28125, "grad_norm_var": 0.03951416015625, "learning_rate": 0.0001, "loss": 8.593, "loss/crossentropy": 2.283052444458008, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.29309652745723724, "step": 2304 }, { "epoch": 0.144125, "grad_norm": 3.15625, "grad_norm_var": 0.0362457275390625, "learning_rate": 0.0001, "loss": 8.1915, "loss/crossentropy": 2.3673131465911865, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2645817846059799, "step": 2306 }, { "epoch": 0.14425, "grad_norm": 3.125, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 8.4168, "loss/crossentropy": 2.3149588108062744, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26110585033893585, "step": 2308 }, { "epoch": 0.144375, "grad_norm": 3.203125, "grad_norm_var": 0.0287506103515625, "learning_rate": 0.0001, "loss": 8.566, "loss/crossentropy": 2.2937344312667847, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2994783967733383, "step": 2310 }, { "epoch": 0.1445, "grad_norm": 3.203125, "grad_norm_var": 0.013459269205729167, "learning_rate": 0.0001, "loss": 8.3584, "loss/crossentropy": 2.4348164796829224, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28557969629764557, "step": 2312 }, { "epoch": 0.144625, "grad_norm": 4.375, "grad_norm_var": 0.10247294108072917, "learning_rate": 0.0001, "loss": 8.1973, "loss/crossentropy": 2.133277177810669, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2594181001186371, "step": 2314 }, { "epoch": 0.14475, "grad_norm": 3.09375, "grad_norm_var": 0.0970123291015625, "learning_rate": 0.0001, "loss": 8.3714, "loss/crossentropy": 2.557259678840637, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.3214830011129379, "step": 2316 }, { "epoch": 0.144875, "grad_norm": 3.09375, "grad_norm_var": 0.0958404541015625, "learning_rate": 0.0001, "loss": 8.2437, "loss/crossentropy": 2.323344588279724, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2699511796236038, "step": 2318 }, { "epoch": 0.145, "grad_norm": 3.046875, "grad_norm_var": 0.10247294108072917, "learning_rate": 0.0001, "loss": 8.3199, "loss/crossentropy": 2.1560362577438354, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2754161208868027, "step": 2320 }, { "epoch": 0.145125, "grad_norm": 3.21875, "grad_norm_var": 0.10204671223958334, "learning_rate": 0.0001, "loss": 8.3976, "loss/crossentropy": 2.1962740421295166, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2777334451675415, "step": 2322 }, { "epoch": 0.14525, "grad_norm": 3.078125, "grad_norm_var": 0.10629781087239583, "learning_rate": 0.0001, "loss": 8.3426, "loss/crossentropy": 2.225161910057068, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2512729838490486, "step": 2324 }, { "epoch": 0.145375, "grad_norm": 3.328125, "grad_norm_var": 0.11122945149739584, "learning_rate": 0.0001, "loss": 8.5134, "loss/crossentropy": 2.286335587501526, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2837376594543457, "step": 2326 }, { "epoch": 0.1455, "grad_norm": 3.3125, "grad_norm_var": 0.11042378743489584, "learning_rate": 0.0001, "loss": 8.7059, "loss/crossentropy": 2.3663827180862427, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.3303401321172714, "step": 2328 }, { "epoch": 0.145625, "grad_norm": 3.140625, "grad_norm_var": 0.029313151041666666, "learning_rate": 0.0001, "loss": 8.3792, "loss/crossentropy": 2.273059129714966, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27710796892642975, "step": 2330 }, { "epoch": 0.14575, "grad_norm": 3.078125, "grad_norm_var": 0.027958170572916666, "learning_rate": 0.0001, "loss": 8.4054, "loss/crossentropy": 2.319531798362732, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2798737734556198, "step": 2332 }, { "epoch": 0.145875, "grad_norm": 4.75, "grad_norm_var": 0.19403889973958333, "learning_rate": 0.0001, "loss": 8.7102, "loss/crossentropy": 2.3764997720718384, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.29309070110321045, "step": 2334 }, { "epoch": 0.146, "grad_norm": 3.046875, "grad_norm_var": 0.2710601806640625, "learning_rate": 0.0001, "loss": 8.6281, "loss/crossentropy": 2.339399576187134, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2809606343507767, "step": 2336 }, { "epoch": 0.146125, "grad_norm": 3.203125, "grad_norm_var": 0.27922261555989586, "learning_rate": 0.0001, "loss": 8.4806, "loss/crossentropy": 2.3016566038131714, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25977956503629684, "step": 2338 }, { "epoch": 0.14625, "grad_norm": 3.21875, "grad_norm_var": 0.26383056640625, "learning_rate": 0.0001, "loss": 8.2754, "loss/crossentropy": 2.3781991004943848, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2771807760000229, "step": 2340 }, { "epoch": 0.146375, "grad_norm": 3.453125, "grad_norm_var": 0.26324462890625, "learning_rate": 0.0001, "loss": 8.6139, "loss/crossentropy": 2.496955394744873, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27403801679611206, "step": 2342 }, { "epoch": 0.1465, "grad_norm": 3.0, "grad_norm_var": 0.2773590087890625, "learning_rate": 0.0001, "loss": 8.1782, "loss/crossentropy": 2.2161307334899902, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.26936174929142, "step": 2344 }, { "epoch": 0.146625, "grad_norm": 3.125, "grad_norm_var": 0.29517822265625, "learning_rate": 0.0001, "loss": 8.2396, "loss/crossentropy": 2.3318371772766113, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26794174313545227, "step": 2346 }, { "epoch": 0.14675, "grad_norm": 3.390625, "grad_norm_var": 0.30025126139322916, "learning_rate": 0.0001, "loss": 8.3753, "loss/crossentropy": 2.5500062704086304, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2776283770799637, "step": 2348 }, { "epoch": 0.146875, "grad_norm": 2.96875, "grad_norm_var": 0.14777730305989584, "learning_rate": 0.0001, "loss": 8.1207, "loss/crossentropy": 2.118456542491913, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26154835522174835, "step": 2350 }, { "epoch": 0.147, "grad_norm": 3.265625, "grad_norm_var": 0.03133036295572917, "learning_rate": 0.0001, "loss": 8.3371, "loss/crossentropy": 2.3705456256866455, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.28914862871170044, "step": 2352 }, { "epoch": 0.147125, "grad_norm": 3.234375, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 8.4854, "loss/crossentropy": 2.214228391647339, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.28034496307373047, "step": 2354 }, { "epoch": 0.14725, "grad_norm": 2.84375, "grad_norm_var": 0.03732096354166667, "learning_rate": 0.0001, "loss": 8.1409, "loss/crossentropy": 2.265252947807312, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2799190580844879, "step": 2356 }, { "epoch": 0.147375, "grad_norm": 3.140625, "grad_norm_var": 0.030159505208333333, "learning_rate": 0.0001, "loss": 8.2196, "loss/crossentropy": 2.523659586906433, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26604168117046356, "step": 2358 }, { "epoch": 0.1475, "grad_norm": 2.953125, "grad_norm_var": 0.031135050455729167, "learning_rate": 0.0001, "loss": 8.2265, "loss/crossentropy": 2.3877971172332764, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2672644555568695, "step": 2360 }, { "epoch": 0.147625, "grad_norm": 2.84375, "grad_norm_var": 0.029564412434895833, "learning_rate": 0.0001, "loss": 8.3976, "loss/crossentropy": 2.3036141395568848, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27549657225608826, "step": 2362 }, { "epoch": 0.14775, "grad_norm": 3.296875, "grad_norm_var": 0.025325520833333334, "learning_rate": 0.0001, "loss": 8.4514, "loss/crossentropy": 2.2630138397216797, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2665908634662628, "step": 2364 }, { "epoch": 0.147875, "grad_norm": 3.421875, "grad_norm_var": 0.03718973795572917, "learning_rate": 0.0001, "loss": 8.6282, "loss/crossentropy": 2.3436743021011353, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27386774867773056, "step": 2366 }, { "epoch": 0.148, "grad_norm": 3.328125, "grad_norm_var": 0.0340728759765625, "learning_rate": 0.0001, "loss": 8.421, "loss/crossentropy": 2.4493263959884644, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.3048429489135742, "step": 2368 }, { "epoch": 0.148125, "grad_norm": 3.0625, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 8.5809, "loss/crossentropy": 2.4388129711151123, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2775990962982178, "step": 2370 }, { "epoch": 0.14825, "grad_norm": 3.0625, "grad_norm_var": 0.030052693684895833, "learning_rate": 0.0001, "loss": 8.3091, "loss/crossentropy": 2.3041138648986816, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2724207639694214, "step": 2372 }, { "epoch": 0.148375, "grad_norm": 3.125, "grad_norm_var": 0.03465067545572917, "learning_rate": 0.0001, "loss": 8.2688, "loss/crossentropy": 1.9960012435913086, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2777027040719986, "step": 2374 }, { "epoch": 0.1485, "grad_norm": 3.140625, "grad_norm_var": 0.050348917643229164, "learning_rate": 0.0001, "loss": 8.513, "loss/crossentropy": 2.154646396636963, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.318150132894516, "step": 2376 }, { "epoch": 0.148625, "grad_norm": 3.03125, "grad_norm_var": 0.04312744140625, "learning_rate": 0.0001, "loss": 8.5843, "loss/crossentropy": 2.460582137107849, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2954748272895813, "step": 2378 }, { "epoch": 0.14875, "grad_norm": 3.34375, "grad_norm_var": 0.051493326822916664, "learning_rate": 0.0001, "loss": 8.3993, "loss/crossentropy": 2.1957290172576904, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2675289958715439, "step": 2380 }, { "epoch": 0.148875, "grad_norm": 3.0625, "grad_norm_var": 0.04504801432291667, "learning_rate": 0.0001, "loss": 8.4317, "loss/crossentropy": 2.2120442390441895, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.30290304124355316, "step": 2382 }, { "epoch": 0.149, "grad_norm": 3.28125, "grad_norm_var": 0.0464752197265625, "learning_rate": 0.0001, "loss": 8.4022, "loss/crossentropy": 2.42188036441803, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2719276398420334, "step": 2384 }, { "epoch": 0.149125, "grad_norm": 3.171875, "grad_norm_var": 0.045947265625, "learning_rate": 0.0001, "loss": 8.5146, "loss/crossentropy": 2.394136667251587, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2908514738082886, "step": 2386 }, { "epoch": 0.14925, "grad_norm": 3.125, "grad_norm_var": 0.043355305989583336, "learning_rate": 0.0001, "loss": 8.2204, "loss/crossentropy": 2.0119369626045227, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2561585605144501, "step": 2388 }, { "epoch": 0.149375, "grad_norm": 3.046875, "grad_norm_var": 0.03824462890625, "learning_rate": 0.0001, "loss": 8.4495, "loss/crossentropy": 2.52905535697937, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.29769638180732727, "step": 2390 }, { "epoch": 0.1495, "grad_norm": 3.109375, "grad_norm_var": 0.029715983072916667, "learning_rate": 0.0001, "loss": 8.4764, "loss/crossentropy": 2.2129627466201782, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.3012372702360153, "step": 2392 }, { "epoch": 0.149625, "grad_norm": 3.140625, "grad_norm_var": 0.028270467122395834, "learning_rate": 0.0001, "loss": 8.3592, "loss/crossentropy": 2.3810667991638184, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2853284478187561, "step": 2394 }, { "epoch": 0.14975, "grad_norm": 3.609375, "grad_norm_var": 0.029390462239583335, "learning_rate": 0.0001, "loss": 8.4495, "loss/crossentropy": 2.4027936458587646, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.290962889790535, "step": 2396 }, { "epoch": 0.149875, "grad_norm": 3.046875, "grad_norm_var": 0.029002888997395834, "learning_rate": 0.0001, "loss": 8.3231, "loss/crossentropy": 2.3980143070220947, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2743261456489563, "step": 2398 }, { "epoch": 0.15, "grad_norm": 3.171875, "grad_norm_var": 0.026155598958333335, "learning_rate": 0.0001, "loss": 8.3352, "loss/crossentropy": 2.1707264184951782, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.26785067468881607, "step": 2400 }, { "epoch": 0.150125, "grad_norm": 2.953125, "grad_norm_var": 0.030257161458333334, "learning_rate": 0.0001, "loss": 8.5202, "loss/crossentropy": 2.4350671768188477, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3034633994102478, "step": 2402 }, { "epoch": 0.15025, "grad_norm": 9.5, "grad_norm_var": 2.522652180989583, "learning_rate": 0.0001, "loss": 8.6723, "loss/crossentropy": 2.305592894554138, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27664943039417267, "step": 2404 }, { "epoch": 0.150375, "grad_norm": 3.296875, "grad_norm_var": 2.494429524739583, "learning_rate": 0.0001, "loss": 8.7736, "loss/crossentropy": 2.1731058955192566, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.2912386506795883, "step": 2406 }, { "epoch": 0.1505, "grad_norm": 3.421875, "grad_norm_var": 2.4850260416666665, "learning_rate": 0.0001, "loss": 8.4638, "loss/crossentropy": 2.230413794517517, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.29132725298404694, "step": 2408 }, { "epoch": 0.150625, "grad_norm": 3.1875, "grad_norm_var": 2.477733357747396, "learning_rate": 0.0001, "loss": 8.4085, "loss/crossentropy": 2.353387713432312, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27667418122291565, "step": 2410 }, { "epoch": 0.15075, "grad_norm": 3.171875, "grad_norm_var": 2.5046702067057294, "learning_rate": 0.0001, "loss": 8.3084, "loss/crossentropy": 2.2864267826080322, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.29395492374897003, "step": 2412 }, { "epoch": 0.150875, "grad_norm": 3.03125, "grad_norm_var": 2.518973795572917, "learning_rate": 0.0001, "loss": 8.233, "loss/crossentropy": 2.203416109085083, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2798038125038147, "step": 2414 }, { "epoch": 0.151, "grad_norm": 3.21875, "grad_norm_var": 2.5182037353515625, "learning_rate": 0.0001, "loss": 8.5614, "loss/crossentropy": 2.3979218006134033, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2655583620071411, "step": 2416 }, { "epoch": 0.151125, "grad_norm": 2.921875, "grad_norm_var": 2.5204060872395835, "learning_rate": 0.0001, "loss": 8.2764, "loss/crossentropy": 2.2898428440093994, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28126421570777893, "step": 2418 }, { "epoch": 0.15125, "grad_norm": 3.46875, "grad_norm_var": 0.058430989583333336, "learning_rate": 0.0001, "loss": 8.3922, "loss/crossentropy": 2.491398334503174, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.3128269612789154, "step": 2420 }, { "epoch": 0.151375, "grad_norm": 3.265625, "grad_norm_var": 0.022880045572916667, "learning_rate": 0.0001, "loss": 8.3701, "loss/crossentropy": 2.179121255874634, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26591023802757263, "step": 2422 }, { "epoch": 0.1515, "grad_norm": 3.046875, "grad_norm_var": 0.01845703125, "learning_rate": 0.0001, "loss": 8.3354, "loss/crossentropy": 2.288297653198242, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2655438780784607, "step": 2424 }, { "epoch": 0.151625, "grad_norm": 2.96875, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 8.362, "loss/crossentropy": 2.2921411991119385, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26359351724386215, "step": 2426 }, { "epoch": 0.15175, "grad_norm": 3.203125, "grad_norm_var": 0.020113118489583335, "learning_rate": 0.0001, "loss": 8.3027, "loss/crossentropy": 2.018544852733612, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27101635932922363, "step": 2428 }, { "epoch": 0.151875, "grad_norm": 3.125, "grad_norm_var": 0.017829386393229167, "learning_rate": 0.0001, "loss": 8.3429, "loss/crossentropy": 2.245858669281006, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2750082165002823, "step": 2430 }, { "epoch": 0.152, "grad_norm": 3.171875, "grad_norm_var": 0.019852701822916666, "learning_rate": 0.0001, "loss": 8.2621, "loss/crossentropy": 2.5454466342926025, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26717574894428253, "step": 2432 }, { "epoch": 0.152125, "grad_norm": 3.140625, "grad_norm_var": 0.01705322265625, "learning_rate": 0.0001, "loss": 8.3405, "loss/crossentropy": 2.361135959625244, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.273520365357399, "step": 2434 }, { "epoch": 0.15225, "grad_norm": 3.0, "grad_norm_var": 0.012581380208333333, "learning_rate": 0.0001, "loss": 8.186, "loss/crossentropy": 2.2724266052246094, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2594418376684189, "step": 2436 }, { "epoch": 0.152375, "grad_norm": 3.171875, "grad_norm_var": 0.011421712239583333, "learning_rate": 0.0001, "loss": 8.4541, "loss/crossentropy": 2.2280519008636475, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26080697774887085, "step": 2438 }, { "epoch": 0.1525, "grad_norm": 3.296875, "grad_norm_var": 0.013016764322916667, "learning_rate": 0.0001, "loss": 8.288, "loss/crossentropy": 2.288727283477783, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2835286557674408, "step": 2440 }, { "epoch": 0.152625, "grad_norm": 3.0, "grad_norm_var": 0.013133748372395834, "learning_rate": 0.0001, "loss": 8.3711, "loss/crossentropy": 2.5871081352233887, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2803485840559006, "step": 2442 }, { "epoch": 0.15275, "grad_norm": 3.015625, "grad_norm_var": 0.015453084309895834, "learning_rate": 0.0001, "loss": 8.2484, "loss/crossentropy": 2.170526623725891, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2835932523012161, "step": 2444 }, { "epoch": 0.152875, "grad_norm": 3.328125, "grad_norm_var": 0.0181793212890625, "learning_rate": 0.0001, "loss": 8.1166, "loss/crossentropy": 2.2848747968673706, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2959897220134735, "step": 2446 }, { "epoch": 0.153, "grad_norm": 3.0625, "grad_norm_var": 0.0361328125, "learning_rate": 0.0001, "loss": 8.4482, "loss/crossentropy": 2.31646192073822, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.3019329309463501, "step": 2448 }, { "epoch": 0.153125, "grad_norm": 2.953125, "grad_norm_var": 0.03801676432291667, "learning_rate": 0.0001, "loss": 8.4538, "loss/crossentropy": 2.2353241443634033, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27827736735343933, "step": 2450 }, { "epoch": 0.15325, "grad_norm": 3.1875, "grad_norm_var": 0.03535054524739583, "learning_rate": 0.0001, "loss": 8.6794, "loss/crossentropy": 2.2519538402557373, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.3021092116832733, "step": 2452 }, { "epoch": 0.153375, "grad_norm": 3.265625, "grad_norm_var": 0.03618062337239583, "learning_rate": 0.0001, "loss": 8.3217, "loss/crossentropy": 2.2183037996292114, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26068025827407837, "step": 2454 }, { "epoch": 0.1535, "grad_norm": 3.296875, "grad_norm_var": 0.04420166015625, "learning_rate": 0.0001, "loss": 8.4633, "loss/crossentropy": 2.500266432762146, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.2931359112262726, "step": 2456 }, { "epoch": 0.153625, "grad_norm": 2.984375, "grad_norm_var": 0.04478251139322917, "learning_rate": 0.0001, "loss": 8.436, "loss/crossentropy": 2.303490161895752, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2771891579031944, "step": 2458 }, { "epoch": 0.15375, "grad_norm": 3.265625, "grad_norm_var": 0.03901265462239583, "learning_rate": 0.0001, "loss": 8.5569, "loss/crossentropy": 2.461255431175232, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2784468084573746, "step": 2460 }, { "epoch": 0.153875, "grad_norm": 3.28125, "grad_norm_var": 0.038407389322916666, "learning_rate": 0.0001, "loss": 8.4363, "loss/crossentropy": 2.330198287963867, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.32574698328971863, "step": 2462 }, { "epoch": 0.154, "grad_norm": 3.203125, "grad_norm_var": 0.0226715087890625, "learning_rate": 0.0001, "loss": 8.2665, "loss/crossentropy": 2.2690482139587402, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.26994916796684265, "step": 2464 }, { "epoch": 0.154125, "grad_norm": 3.3125, "grad_norm_var": 0.019041951497395834, "learning_rate": 0.0001, "loss": 8.1331, "loss/crossentropy": 2.227648973464966, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2809407413005829, "step": 2466 }, { "epoch": 0.15425, "grad_norm": 3.15625, "grad_norm_var": 0.019038899739583334, "learning_rate": 0.0001, "loss": 8.4534, "loss/crossentropy": 2.4678770303726196, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2576301246881485, "step": 2468 }, { "epoch": 0.154375, "grad_norm": 2.96875, "grad_norm_var": 0.031590779622395836, "learning_rate": 0.0001, "loss": 8.2793, "loss/crossentropy": 2.3544777631759644, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2835593670606613, "step": 2470 }, { "epoch": 0.1545, "grad_norm": 3.0, "grad_norm_var": 0.024641927083333334, "learning_rate": 0.0001, "loss": 8.3369, "loss/crossentropy": 2.3852498531341553, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2661430686712265, "step": 2472 }, { "epoch": 0.154625, "grad_norm": 3.109375, "grad_norm_var": 0.020067342122395835, "learning_rate": 0.0001, "loss": 8.4255, "loss/crossentropy": 2.417587161064148, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26272399723529816, "step": 2474 }, { "epoch": 0.15475, "grad_norm": 2.96875, "grad_norm_var": 0.025633748372395834, "learning_rate": 0.0001, "loss": 8.511, "loss/crossentropy": 2.2773125171661377, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2794167101383209, "step": 2476 }, { "epoch": 0.154875, "grad_norm": 3.0, "grad_norm_var": 0.022956339518229167, "learning_rate": 0.0001, "loss": 8.3734, "loss/crossentropy": 2.395659923553467, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28663066029548645, "step": 2478 }, { "epoch": 0.155, "grad_norm": 3.3125, "grad_norm_var": 0.0287506103515625, "learning_rate": 0.0001, "loss": 8.0888, "loss/crossentropy": 2.0191069841384888, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24243928492069244, "step": 2480 }, { "epoch": 0.155125, "grad_norm": 3.296875, "grad_norm_var": 0.031787109375, "learning_rate": 0.0001, "loss": 8.2903, "loss/crossentropy": 2.2635436058044434, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.26046207547187805, "step": 2482 }, { "epoch": 0.15525, "grad_norm": 3.109375, "grad_norm_var": 0.032177734375, "learning_rate": 0.0001, "loss": 8.3064, "loss/crossentropy": 2.3955200910568237, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2639884054660797, "step": 2484 }, { "epoch": 0.155375, "grad_norm": 3.671875, "grad_norm_var": 0.1326812744140625, "learning_rate": 0.0001, "loss": 8.5258, "loss/crossentropy": 2.4589436054229736, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2719157636165619, "step": 2486 }, { "epoch": 0.1555, "grad_norm": 3.078125, "grad_norm_var": 0.13032124837239584, "learning_rate": 0.0001, "loss": 8.1377, "loss/crossentropy": 2.343076705932617, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27290938794612885, "step": 2488 }, { "epoch": 0.155625, "grad_norm": 3.25, "grad_norm_var": 0.131005859375, "learning_rate": 0.0001, "loss": 8.4763, "loss/crossentropy": 2.447250247001648, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2733266055583954, "step": 2490 }, { "epoch": 0.15575, "grad_norm": 3.125, "grad_norm_var": 0.12581278483072916, "learning_rate": 0.0001, "loss": 8.3088, "loss/crossentropy": 2.1297186613082886, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2590889632701874, "step": 2492 }, { "epoch": 0.155875, "grad_norm": 3.09375, "grad_norm_var": 0.12429097493489584, "learning_rate": 0.0001, "loss": 8.5055, "loss/crossentropy": 2.6144137382507324, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27285242080688477, "step": 2494 }, { "epoch": 0.156, "grad_norm": 2.9375, "grad_norm_var": 0.12184956868489584, "learning_rate": 0.0001, "loss": 8.1753, "loss/crossentropy": 2.18227219581604, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2713698595762253, "step": 2496 }, { "epoch": 0.156125, "grad_norm": 3.328125, "grad_norm_var": 0.12237955729166666, "learning_rate": 0.0001, "loss": 8.3438, "loss/crossentropy": 2.5376009941101074, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.29997144639492035, "step": 2498 }, { "epoch": 0.15625, "grad_norm": 3.0625, "grad_norm_var": 0.11913960774739583, "learning_rate": 0.0001, "loss": 8.3455, "loss/crossentropy": 2.3957122564315796, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2771972119808197, "step": 2500 }, { "epoch": 0.156375, "grad_norm": 3.15625, "grad_norm_var": 0.013700358072916667, "learning_rate": 0.0001, "loss": 8.2594, "loss/crossentropy": 2.3807398080825806, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27448034286499023, "step": 2502 }, { "epoch": 0.1565, "grad_norm": 3.1875, "grad_norm_var": 0.013557942708333333, "learning_rate": 0.0001, "loss": 8.3676, "loss/crossentropy": 2.4909543991088867, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2869381755590439, "step": 2504 }, { "epoch": 0.156625, "grad_norm": 2.984375, "grad_norm_var": 0.013248697916666666, "learning_rate": 0.0001, "loss": 8.2354, "loss/crossentropy": 2.4148250818252563, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2610015794634819, "step": 2506 }, { "epoch": 0.15675, "grad_norm": 3.375, "grad_norm_var": 0.014728800455729166, "learning_rate": 0.0001, "loss": 8.3965, "loss/crossentropy": 2.4353911876678467, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26869361102581024, "step": 2508 }, { "epoch": 0.156875, "grad_norm": 3.25, "grad_norm_var": 0.019286092122395834, "learning_rate": 0.0001, "loss": 8.3729, "loss/crossentropy": 2.4266940355300903, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27790483832359314, "step": 2510 }, { "epoch": 0.157, "grad_norm": 3.1875, "grad_norm_var": 0.016630045572916665, "learning_rate": 0.0001, "loss": 8.4859, "loss/crossentropy": 2.544732928276062, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.315563440322876, "step": 2512 }, { "epoch": 0.157125, "grad_norm": 3.34375, "grad_norm_var": 0.016706339518229165, "learning_rate": 0.0001, "loss": 8.3694, "loss/crossentropy": 2.2503061294555664, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.25774821639060974, "step": 2514 }, { "epoch": 0.15725, "grad_norm": 3.453125, "grad_norm_var": 0.020734659830729165, "learning_rate": 0.0001, "loss": 8.1868, "loss/crossentropy": 2.108790874481201, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2711709439754486, "step": 2516 }, { "epoch": 0.157375, "grad_norm": 2.890625, "grad_norm_var": 0.026786295572916667, "learning_rate": 0.0001, "loss": 8.1875, "loss/crossentropy": 2.4312883615493774, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26691682636737823, "step": 2518 }, { "epoch": 0.1575, "grad_norm": 3.375, "grad_norm_var": 0.03139546712239583, "learning_rate": 0.0001, "loss": 8.6367, "loss/crossentropy": 2.4578086137771606, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.29156582057476044, "step": 2520 }, { "epoch": 0.157625, "grad_norm": 3.21875, "grad_norm_var": 0.028693644205729167, "learning_rate": 0.0001, "loss": 8.4396, "loss/crossentropy": 1.9031986594200134, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2655710130929947, "step": 2522 }, { "epoch": 0.15775, "grad_norm": 3.3125, "grad_norm_var": 0.029352823893229168, "learning_rate": 0.0001, "loss": 8.2257, "loss/crossentropy": 2.2168606519699097, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.261834055185318, "step": 2524 }, { "epoch": 0.157875, "grad_norm": 3.046875, "grad_norm_var": 0.02506103515625, "learning_rate": 0.0001, "loss": 8.4744, "loss/crossentropy": 2.310633659362793, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27251285314559937, "step": 2526 }, { "epoch": 0.158, "grad_norm": 3.0625, "grad_norm_var": 0.02662353515625, "learning_rate": 0.0001, "loss": 8.4234, "loss/crossentropy": 2.2944494485855103, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2710151672363281, "step": 2528 }, { "epoch": 0.158125, "grad_norm": 3.015625, "grad_norm_var": 0.029683430989583332, "learning_rate": 0.0001, "loss": 8.0669, "loss/crossentropy": 2.162436366081238, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2699380964040756, "step": 2530 }, { "epoch": 0.15825, "grad_norm": 3.0625, "grad_norm_var": 0.024702962239583334, "learning_rate": 0.0001, "loss": 8.1866, "loss/crossentropy": 2.358327627182007, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28228873014450073, "step": 2532 }, { "epoch": 0.158375, "grad_norm": 3.25, "grad_norm_var": 0.020873006184895834, "learning_rate": 0.0001, "loss": 8.3448, "loss/crossentropy": 2.4418132305145264, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2814173698425293, "step": 2534 }, { "epoch": 0.1585, "grad_norm": 3.0, "grad_norm_var": 0.0148834228515625, "learning_rate": 0.0001, "loss": 8.1711, "loss/crossentropy": 2.194345235824585, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2765004634857178, "step": 2536 }, { "epoch": 0.158625, "grad_norm": 3.03125, "grad_norm_var": 0.01500244140625, "learning_rate": 0.0001, "loss": 8.4133, "loss/crossentropy": 2.492135763168335, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2749137431383133, "step": 2538 }, { "epoch": 0.15875, "grad_norm": 3.109375, "grad_norm_var": 0.011393229166666666, "learning_rate": 0.0001, "loss": 8.2062, "loss/crossentropy": 2.087536931037903, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2661558836698532, "step": 2540 }, { "epoch": 0.158875, "grad_norm": 2.984375, "grad_norm_var": 0.008968098958333334, "learning_rate": 0.0001, "loss": 8.5442, "loss/crossentropy": 2.5190815925598145, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2785092294216156, "step": 2542 }, { "epoch": 0.159, "grad_norm": 3.5, "grad_norm_var": 0.0190093994140625, "learning_rate": 0.0001, "loss": 8.5158, "loss/crossentropy": 2.410357117652893, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2825370132923126, "step": 2544 }, { "epoch": 0.159125, "grad_norm": 3.03125, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 8.0455, "loss/crossentropy": 2.469596028327942, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2713569104671478, "step": 2546 }, { "epoch": 0.15925, "grad_norm": 3.15625, "grad_norm_var": 0.0189605712890625, "learning_rate": 0.0001, "loss": 8.1763, "loss/crossentropy": 1.902215301990509, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.23949339985847473, "step": 2548 }, { "epoch": 0.159375, "grad_norm": 3.0625, "grad_norm_var": 0.018648274739583335, "learning_rate": 0.0001, "loss": 8.2739, "loss/crossentropy": 2.2176631689071655, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2655271142721176, "step": 2550 }, { "epoch": 0.1595, "grad_norm": 3.109375, "grad_norm_var": 0.017853800455729166, "learning_rate": 0.0001, "loss": 8.253, "loss/crossentropy": 2.4577187299728394, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.25844304263591766, "step": 2552 }, { "epoch": 0.159625, "grad_norm": 3.4375, "grad_norm_var": 0.0239898681640625, "learning_rate": 0.0001, "loss": 8.3834, "loss/crossentropy": 2.0720095038414, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25736086815595627, "step": 2554 }, { "epoch": 0.15975, "grad_norm": 3.078125, "grad_norm_var": 0.024149576822916668, "learning_rate": 0.0001, "loss": 8.3121, "loss/crossentropy": 2.38494074344635, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.3003828078508377, "step": 2556 }, { "epoch": 0.159875, "grad_norm": 2.921875, "grad_norm_var": 0.0262603759765625, "learning_rate": 0.0001, "loss": 8.1596, "loss/crossentropy": 2.4485820531845093, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26067347824573517, "step": 2558 }, { "epoch": 0.16, "grad_norm": 3.484375, "grad_norm_var": 0.024283854166666667, "learning_rate": 0.0001, "loss": 8.4399, "loss/crossentropy": 2.1876609325408936, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25350525975227356, "step": 2560 }, { "epoch": 0.160125, "grad_norm": 5.0625, "grad_norm_var": 0.25761311848958335, "learning_rate": 0.0001, "loss": 8.381, "loss/crossentropy": 2.265621542930603, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.28078918159008026, "step": 2562 }, { "epoch": 0.16025, "grad_norm": 3.5625, "grad_norm_var": 0.30728759765625, "learning_rate": 0.0001, "loss": 8.5677, "loss/crossentropy": 2.5903844833374023, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2908772826194763, "step": 2564 }, { "epoch": 0.160375, "grad_norm": 3.09375, "grad_norm_var": 0.3009104410807292, "learning_rate": 0.0001, "loss": 8.3239, "loss/crossentropy": 2.2390648126602173, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2874547988176346, "step": 2566 }, { "epoch": 0.1605, "grad_norm": 3.203125, "grad_norm_var": 0.3062408447265625, "learning_rate": 0.0001, "loss": 8.1255, "loss/crossentropy": 2.012996554374695, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.25241582095623016, "step": 2568 }, { "epoch": 0.160625, "grad_norm": 3.15625, "grad_norm_var": 0.3130523681640625, "learning_rate": 0.0001, "loss": 8.3869, "loss/crossentropy": 2.2517330646514893, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.27007003128528595, "step": 2570 }, { "epoch": 0.16075, "grad_norm": 3.09375, "grad_norm_var": 0.3095448811848958, "learning_rate": 0.0001, "loss": 8.5403, "loss/crossentropy": 2.271014094352722, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2996182441711426, "step": 2572 }, { "epoch": 0.160875, "grad_norm": 3.0, "grad_norm_var": 0.3062164306640625, "learning_rate": 0.0001, "loss": 8.1903, "loss/crossentropy": 1.994364619255066, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.26399780064821243, "step": 2574 }, { "epoch": 0.161, "grad_norm": 2.9375, "grad_norm_var": 0.3115386962890625, "learning_rate": 0.0001, "loss": 8.2641, "loss/crossentropy": 2.1744818687438965, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.3009081333875656, "step": 2576 }, { "epoch": 0.161125, "grad_norm": 3.25, "grad_norm_var": 0.09748433430989584, "learning_rate": 0.0001, "loss": 8.265, "loss/crossentropy": 2.465830087661743, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2836051285266876, "step": 2578 }, { "epoch": 0.16125, "grad_norm": 3.34375, "grad_norm_var": 0.016535441080729168, "learning_rate": 0.0001, "loss": 8.5721, "loss/crossentropy": 2.098099946975708, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2927285134792328, "step": 2580 }, { "epoch": 0.161375, "grad_norm": 3.0, "grad_norm_var": 0.01763916015625, "learning_rate": 0.0001, "loss": 8.3492, "loss/crossentropy": 2.6066181659698486, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.3026330918073654, "step": 2582 }, { "epoch": 0.1615, "grad_norm": 2.984375, "grad_norm_var": 0.016927083333333332, "learning_rate": 0.0001, "loss": 8.2841, "loss/crossentropy": 2.3234163522720337, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26173534989356995, "step": 2584 }, { "epoch": 0.161625, "grad_norm": 3.59375, "grad_norm_var": 0.030692545572916667, "learning_rate": 0.0001, "loss": 8.3342, "loss/crossentropy": 2.3151192665100098, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2646416872739792, "step": 2586 }, { "epoch": 0.16175, "grad_norm": 3.0, "grad_norm_var": 0.03170572916666667, "learning_rate": 0.0001, "loss": 8.1608, "loss/crossentropy": 2.3148727416992188, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26169875264167786, "step": 2588 }, { "epoch": 0.161875, "grad_norm": 3.09375, "grad_norm_var": 0.0297271728515625, "learning_rate": 0.0001, "loss": 8.2822, "loss/crossentropy": 2.418124556541443, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2865563780069351, "step": 2590 }, { "epoch": 0.162, "grad_norm": 2.90625, "grad_norm_var": 0.035090128580729164, "learning_rate": 0.0001, "loss": 8.2814, "loss/crossentropy": 2.2745689153671265, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27167628705501556, "step": 2592 }, { "epoch": 0.162125, "grad_norm": 3.296875, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 8.1517, "loss/crossentropy": 2.1091307401657104, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26850171387195587, "step": 2594 }, { "epoch": 0.16225, "grad_norm": 3.015625, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 8.2353, "loss/crossentropy": 2.3798282146453857, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2726765275001526, "step": 2596 }, { "epoch": 0.162375, "grad_norm": 2.890625, "grad_norm_var": 0.036473592122395836, "learning_rate": 0.0001, "loss": 8.1632, "loss/crossentropy": 2.2773178815841675, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25379926711320877, "step": 2598 }, { "epoch": 0.1625, "grad_norm": 3.265625, "grad_norm_var": 0.041624959309895834, "learning_rate": 0.0001, "loss": 8.6347, "loss/crossentropy": 2.6614561080932617, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28401175141334534, "step": 2600 }, { "epoch": 0.162625, "grad_norm": 3.0, "grad_norm_var": 0.024495442708333332, "learning_rate": 0.0001, "loss": 8.1829, "loss/crossentropy": 2.384236216545105, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2784354239702225, "step": 2602 }, { "epoch": 0.16275, "grad_norm": 3.0625, "grad_norm_var": 0.024714152018229168, "learning_rate": 0.0001, "loss": 8.2394, "loss/crossentropy": 2.282965302467346, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.2925720512866974, "step": 2604 }, { "epoch": 0.162875, "grad_norm": 2.859375, "grad_norm_var": 0.026167805989583334, "learning_rate": 0.0001, "loss": 8.2496, "loss/crossentropy": 2.2551791667938232, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.25895287841558456, "step": 2606 }, { "epoch": 0.163, "grad_norm": 3.046875, "grad_norm_var": 0.022004191080729166, "learning_rate": 0.0001, "loss": 8.3511, "loss/crossentropy": 2.298862099647522, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25684790313243866, "step": 2608 }, { "epoch": 0.163125, "grad_norm": 3.25, "grad_norm_var": 0.02080078125, "learning_rate": 0.0001, "loss": 8.3087, "loss/crossentropy": 2.253198981285095, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2626389414072037, "step": 2610 }, { "epoch": 0.16325, "grad_norm": 3.015625, "grad_norm_var": 0.0210601806640625, "learning_rate": 0.0001, "loss": 8.2759, "loss/crossentropy": 2.5446906089782715, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2750287652015686, "step": 2612 }, { "epoch": 0.163375, "grad_norm": 3.203125, "grad_norm_var": 0.019319661458333335, "learning_rate": 0.0001, "loss": 8.1388, "loss/crossentropy": 2.444550633430481, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2657383382320404, "step": 2614 }, { "epoch": 0.1635, "grad_norm": 3.15625, "grad_norm_var": 0.013505045572916667, "learning_rate": 0.0001, "loss": 8.3399, "loss/crossentropy": 2.2695289850234985, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28452712297439575, "step": 2616 }, { "epoch": 0.163625, "grad_norm": 2.890625, "grad_norm_var": 0.015477498372395834, "learning_rate": 0.0001, "loss": 8.1938, "loss/crossentropy": 2.3448396921157837, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26763126254081726, "step": 2618 }, { "epoch": 0.16375, "grad_norm": 2.9375, "grad_norm_var": 0.01607666015625, "learning_rate": 0.0001, "loss": 8.2201, "loss/crossentropy": 2.2416622638702393, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.24487657845020294, "step": 2620 }, { "epoch": 0.163875, "grad_norm": 3.328125, "grad_norm_var": 0.018114217122395835, "learning_rate": 0.0001, "loss": 8.2554, "loss/crossentropy": 2.0370543003082275, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.30543872714042664, "step": 2622 }, { "epoch": 0.164, "grad_norm": 2.8125, "grad_norm_var": 0.0245025634765625, "learning_rate": 0.0001, "loss": 8.0796, "loss/crossentropy": 2.2707594633102417, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26010069251060486, "step": 2624 }, { "epoch": 0.164125, "grad_norm": 3.171875, "grad_norm_var": 0.026399739583333335, "learning_rate": 0.0001, "loss": 8.2565, "loss/crossentropy": 2.085159659385681, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2566560357809067, "step": 2626 }, { "epoch": 0.16425, "grad_norm": 3.171875, "grad_norm_var": 0.025389607747395834, "learning_rate": 0.0001, "loss": 8.163, "loss/crossentropy": 2.165037214756012, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.255811408162117, "step": 2628 }, { "epoch": 0.164375, "grad_norm": 2.9375, "grad_norm_var": 0.023954264322916665, "learning_rate": 0.0001, "loss": 8.322, "loss/crossentropy": 2.383608818054199, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.29559099674224854, "step": 2630 }, { "epoch": 0.1645, "grad_norm": 3.140625, "grad_norm_var": 0.023688761393229167, "learning_rate": 0.0001, "loss": 8.4369, "loss/crossentropy": 2.5001370906829834, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2756764143705368, "step": 2632 }, { "epoch": 0.164625, "grad_norm": 3.0625, "grad_norm_var": 0.022684733072916668, "learning_rate": 0.0001, "loss": 8.2474, "loss/crossentropy": 2.153060555458069, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2610151022672653, "step": 2634 }, { "epoch": 0.16475, "grad_norm": 3.234375, "grad_norm_var": 0.02958984375, "learning_rate": 0.0001, "loss": 8.3281, "loss/crossentropy": 2.4711453914642334, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2531973719596863, "step": 2636 }, { "epoch": 0.164875, "grad_norm": 3.171875, "grad_norm_var": 0.028837076822916665, "learning_rate": 0.0001, "loss": 8.4733, "loss/crossentropy": 2.4670186042785645, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.30856695771217346, "step": 2638 }, { "epoch": 0.165, "grad_norm": 3.171875, "grad_norm_var": 0.022945149739583334, "learning_rate": 0.0001, "loss": 8.2444, "loss/crossentropy": 2.373382568359375, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2556057423353195, "step": 2640 }, { "epoch": 0.165125, "grad_norm": 3.25, "grad_norm_var": 0.016502888997395833, "learning_rate": 0.0001, "loss": 8.2022, "loss/crossentropy": 2.3559107780456543, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2673548609018326, "step": 2642 }, { "epoch": 0.16525, "grad_norm": 2.953125, "grad_norm_var": 0.01890869140625, "learning_rate": 0.0001, "loss": 8.149, "loss/crossentropy": 2.312580704689026, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2649365961551666, "step": 2644 }, { "epoch": 0.165375, "grad_norm": 3.34375, "grad_norm_var": 0.018871053059895834, "learning_rate": 0.0001, "loss": 8.1288, "loss/crossentropy": 2.0767895579338074, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2341424822807312, "step": 2646 }, { "epoch": 0.1655, "grad_norm": 2.984375, "grad_norm_var": 0.021610514322916666, "learning_rate": 0.0001, "loss": 8.0931, "loss/crossentropy": 2.3518717288970947, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28567561507225037, "step": 2648 }, { "epoch": 0.165625, "grad_norm": 3.21875, "grad_norm_var": 0.021076456705729166, "learning_rate": 0.0001, "loss": 8.4698, "loss/crossentropy": 2.3555731773376465, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2730225473642349, "step": 2650 }, { "epoch": 0.16575, "grad_norm": 3.40625, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 8.3373, "loss/crossentropy": 2.0926910042762756, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2553661912679672, "step": 2652 }, { "epoch": 0.165875, "grad_norm": 2.859375, "grad_norm_var": 0.027766927083333334, "learning_rate": 0.0001, "loss": 8.2932, "loss/crossentropy": 2.325496196746826, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27998365461826324, "step": 2654 }, { "epoch": 0.166, "grad_norm": 2.9375, "grad_norm_var": 0.029524739583333334, "learning_rate": 0.0001, "loss": 8.2462, "loss/crossentropy": 2.3335851430892944, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.272167906165123, "step": 2656 }, { "epoch": 0.166125, "grad_norm": 3.0625, "grad_norm_var": 0.028206380208333333, "learning_rate": 0.0001, "loss": 8.1733, "loss/crossentropy": 2.1524252891540527, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27060362696647644, "step": 2658 }, { "epoch": 0.16625, "grad_norm": 3.265625, "grad_norm_var": 0.0273101806640625, "learning_rate": 0.0001, "loss": 8.3935, "loss/crossentropy": 2.3421573638916016, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2588433101773262, "step": 2660 }, { "epoch": 0.166375, "grad_norm": 2.921875, "grad_norm_var": 0.027067057291666665, "learning_rate": 0.0001, "loss": 8.1186, "loss/crossentropy": 2.073794722557068, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24907249212265015, "step": 2662 }, { "epoch": 0.1665, "grad_norm": 3.140625, "grad_norm_var": 0.024605305989583333, "learning_rate": 0.0001, "loss": 8.2707, "loss/crossentropy": 2.2278562784194946, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2469905987381935, "step": 2664 }, { "epoch": 0.166625, "grad_norm": 2.984375, "grad_norm_var": 0.025472005208333332, "learning_rate": 0.0001, "loss": 8.0191, "loss/crossentropy": 2.48944628238678, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26759523153305054, "step": 2666 }, { "epoch": 0.16675, "grad_norm": 3.0, "grad_norm_var": 0.0146881103515625, "learning_rate": 0.0001, "loss": 8.1883, "loss/crossentropy": 2.166255533695221, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2581564337015152, "step": 2668 }, { "epoch": 0.166875, "grad_norm": 3.125, "grad_norm_var": 0.011751302083333333, "learning_rate": 0.0001, "loss": 8.2138, "loss/crossentropy": 2.3537596464157104, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.28053438663482666, "step": 2670 }, { "epoch": 0.167, "grad_norm": 2.984375, "grad_norm_var": 0.0102935791015625, "learning_rate": 0.0001, "loss": 8.1638, "loss/crossentropy": 2.3822021484375, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25109483301639557, "step": 2672 }, { "epoch": 0.167125, "grad_norm": 3.328125, "grad_norm_var": 0.014435831705729167, "learning_rate": 0.0001, "loss": 8.3125, "loss/crossentropy": 2.210346817970276, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2710355073213577, "step": 2674 }, { "epoch": 0.16725, "grad_norm": 3.203125, "grad_norm_var": 0.0143951416015625, "learning_rate": 0.0001, "loss": 8.0399, "loss/crossentropy": 2.1359152793884277, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2473316341638565, "step": 2676 }, { "epoch": 0.167375, "grad_norm": 2.921875, "grad_norm_var": 0.0133209228515625, "learning_rate": 0.0001, "loss": 8.2351, "loss/crossentropy": 2.4049233198165894, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2830816060304642, "step": 2678 }, { "epoch": 0.1675, "grad_norm": 2.96875, "grad_norm_var": 0.013166300455729167, "learning_rate": 0.0001, "loss": 8.1941, "loss/crossentropy": 2.3928329944610596, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.266268789768219, "step": 2680 }, { "epoch": 0.167625, "grad_norm": 3.421875, "grad_norm_var": 0.025748697916666667, "learning_rate": 0.0001, "loss": 8.0556, "loss/crossentropy": 2.024741470813751, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26838643848896027, "step": 2682 }, { "epoch": 0.16775, "grad_norm": 2.96875, "grad_norm_var": 0.049958292643229166, "learning_rate": 0.0001, "loss": 8.3848, "loss/crossentropy": 2.528537154197693, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2850952595472336, "step": 2684 }, { "epoch": 0.167875, "grad_norm": 3.046875, "grad_norm_var": 0.04983723958333333, "learning_rate": 0.0001, "loss": 8.3009, "loss/crossentropy": 2.2957894802093506, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26776544749736786, "step": 2686 }, { "epoch": 0.168, "grad_norm": 3.21875, "grad_norm_var": 0.05024312337239583, "learning_rate": 0.0001, "loss": 8.3334, "loss/crossentropy": 2.272274613380432, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2532171159982681, "step": 2688 }, { "epoch": 0.168125, "grad_norm": 2.96875, "grad_norm_var": 0.048981730143229166, "learning_rate": 0.0001, "loss": 8.337, "loss/crossentropy": 2.520447254180908, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2912859171628952, "step": 2690 }, { "epoch": 0.16825, "grad_norm": 2.953125, "grad_norm_var": 0.0484771728515625, "learning_rate": 0.0001, "loss": 8.0313, "loss/crossentropy": 2.2402195930480957, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2597721219062805, "step": 2692 }, { "epoch": 0.168375, "grad_norm": 3.03125, "grad_norm_var": 0.045796712239583336, "learning_rate": 0.0001, "loss": 8.16, "loss/crossentropy": 2.263728380203247, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25052615255117416, "step": 2694 }, { "epoch": 0.1685, "grad_norm": 3.0, "grad_norm_var": 0.04702046712239583, "learning_rate": 0.0001, "loss": 8.2191, "loss/crossentropy": 2.2724695205688477, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27686507999897003, "step": 2696 }, { "epoch": 0.168625, "grad_norm": 3.15625, "grad_norm_var": 0.038557942708333334, "learning_rate": 0.0001, "loss": 8.292, "loss/crossentropy": 2.169528841972351, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.252603217959404, "step": 2698 }, { "epoch": 0.16875, "grad_norm": 4.5, "grad_norm_var": 0.14544169108072916, "learning_rate": 0.0001, "loss": 8.0871, "loss/crossentropy": 2.375905990600586, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2765290290117264, "step": 2700 }, { "epoch": 0.168875, "grad_norm": 3.015625, "grad_norm_var": 0.15084228515625, "learning_rate": 0.0001, "loss": 8.3753, "loss/crossentropy": 2.3945010900497437, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.27985604107379913, "step": 2702 }, { "epoch": 0.169, "grad_norm": 3.140625, "grad_norm_var": 0.14921468098958332, "learning_rate": 0.0001, "loss": 8.0489, "loss/crossentropy": 2.1550697684288025, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2562442794442177, "step": 2704 }, { "epoch": 0.169125, "grad_norm": 2.875, "grad_norm_var": 0.15213216145833333, "learning_rate": 0.0001, "loss": 8.0596, "loss/crossentropy": 2.1525893211364746, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2613821029663086, "step": 2706 }, { "epoch": 0.16925, "grad_norm": 2.953125, "grad_norm_var": 0.1527008056640625, "learning_rate": 0.0001, "loss": 8.2609, "loss/crossentropy": 2.194769859313965, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2507399097084999, "step": 2708 }, { "epoch": 0.169375, "grad_norm": 3.3125, "grad_norm_var": 0.1499420166015625, "learning_rate": 0.0001, "loss": 8.3803, "loss/crossentropy": 2.373349666595459, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.2778366059064865, "step": 2710 }, { "epoch": 0.1695, "grad_norm": 3.0625, "grad_norm_var": 0.14553629557291667, "learning_rate": 0.0001, "loss": 8.2339, "loss/crossentropy": 2.3904805183410645, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2910989373922348, "step": 2712 }, { "epoch": 0.169625, "grad_norm": 2.796875, "grad_norm_var": 0.15755106608072916, "learning_rate": 0.0001, "loss": 8.0392, "loss/crossentropy": 2.21162748336792, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24572212994098663, "step": 2714 }, { "epoch": 0.16975, "grad_norm": 3.09375, "grad_norm_var": 0.034375, "learning_rate": 0.0001, "loss": 8.1972, "loss/crossentropy": 2.2650893926620483, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27223196625709534, "step": 2716 }, { "epoch": 0.169875, "grad_norm": 3.078125, "grad_norm_var": 0.030208333333333334, "learning_rate": 0.0001, "loss": 8.2521, "loss/crossentropy": 2.410581946372986, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27209579944610596, "step": 2718 }, { "epoch": 0.17, "grad_norm": 3.0, "grad_norm_var": 0.03357747395833333, "learning_rate": 0.0001, "loss": 8.149, "loss/crossentropy": 2.4662805795669556, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25760146975517273, "step": 2720 }, { "epoch": 0.170125, "grad_norm": 3.09375, "grad_norm_var": 0.030296834309895833, "learning_rate": 0.0001, "loss": 8.2252, "loss/crossentropy": 2.275286078453064, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.263226181268692, "step": 2722 }, { "epoch": 0.17025, "grad_norm": 3.0625, "grad_norm_var": 0.026862589518229167, "learning_rate": 0.0001, "loss": 8.2922, "loss/crossentropy": 2.4460601806640625, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28739283978939056, "step": 2724 }, { "epoch": 0.170375, "grad_norm": 3.0, "grad_norm_var": 0.0196197509765625, "learning_rate": 0.0001, "loss": 8.0972, "loss/crossentropy": 2.414928674697876, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28981567919254303, "step": 2726 }, { "epoch": 0.1705, "grad_norm": 3.09375, "grad_norm_var": 0.0107086181640625, "learning_rate": 0.0001, "loss": 8.4286, "loss/crossentropy": 2.3859044313430786, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2801547795534134, "step": 2728 }, { "epoch": 0.170625, "grad_norm": 3.3125, "grad_norm_var": 0.010384114583333333, "learning_rate": 0.0001, "loss": 8.3648, "loss/crossentropy": 2.2060307264328003, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26368267089128494, "step": 2730 }, { "epoch": 0.17075, "grad_norm": 3.359375, "grad_norm_var": 0.015478515625, "learning_rate": 0.0001, "loss": 8.5339, "loss/crossentropy": 2.4007495641708374, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27564837038517, "step": 2732 }, { "epoch": 0.170875, "grad_norm": 3.3125, "grad_norm_var": 0.01627197265625, "learning_rate": 0.0001, "loss": 8.176, "loss/crossentropy": 2.2184962034225464, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2526983246207237, "step": 2734 }, { "epoch": 0.171, "grad_norm": 2.78125, "grad_norm_var": 0.019562784830729166, "learning_rate": 0.0001, "loss": 8.2518, "loss/crossentropy": 2.403747081756592, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.28357672691345215, "step": 2736 }, { "epoch": 0.171125, "grad_norm": 4.25, "grad_norm_var": 0.11033426920572917, "learning_rate": 0.0001, "loss": 8.3945, "loss/crossentropy": 2.2106932997703552, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.27614726126194, "step": 2738 }, { "epoch": 0.17125, "grad_norm": 2.96875, "grad_norm_var": 0.11243082682291666, "learning_rate": 0.0001, "loss": 8.1762, "loss/crossentropy": 2.366937756538391, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2758289873600006, "step": 2740 }, { "epoch": 0.171375, "grad_norm": 3.15625, "grad_norm_var": 0.10838216145833333, "learning_rate": 0.0001, "loss": 8.1801, "loss/crossentropy": 2.6269803047180176, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2783343344926834, "step": 2742 }, { "epoch": 0.1715, "grad_norm": 2.78125, "grad_norm_var": 0.12049153645833334, "learning_rate": 0.0001, "loss": 8.087, "loss/crossentropy": 2.2394570112228394, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26301658153533936, "step": 2744 }, { "epoch": 0.171625, "grad_norm": 3.09375, "grad_norm_var": 0.11782938639322917, "learning_rate": 0.0001, "loss": 8.4104, "loss/crossentropy": 2.2185282707214355, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28459644317626953, "step": 2746 }, { "epoch": 0.17175, "grad_norm": 3.5, "grad_norm_var": 0.12354227701822916, "learning_rate": 0.0001, "loss": 8.3685, "loss/crossentropy": 2.4350966215133667, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2675904631614685, "step": 2748 }, { "epoch": 0.171875, "grad_norm": 2.953125, "grad_norm_var": 0.12983296712239584, "learning_rate": 0.0001, "loss": 8.1103, "loss/crossentropy": 2.10713529586792, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26895423233509064, "step": 2750 }, { "epoch": 0.172, "grad_norm": 2.953125, "grad_norm_var": 0.12000223795572916, "learning_rate": 0.0001, "loss": 8.2241, "loss/crossentropy": 2.1709011793136597, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24941477179527283, "step": 2752 }, { "epoch": 0.172125, "grad_norm": 2.953125, "grad_norm_var": 0.039094034830729166, "learning_rate": 0.0001, "loss": 8.1094, "loss/crossentropy": 2.4355965852737427, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27485549449920654, "step": 2754 }, { "epoch": 0.17225, "grad_norm": 3.25, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 8.176, "loss/crossentropy": 2.2021514177322388, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24435579776763916, "step": 2756 }, { "epoch": 0.172375, "grad_norm": 2.9375, "grad_norm_var": 0.041193644205729164, "learning_rate": 0.0001, "loss": 8.0871, "loss/crossentropy": 2.2676401138305664, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24549178779125214, "step": 2758 }, { "epoch": 0.1725, "grad_norm": 2.875, "grad_norm_var": 0.037333170572916664, "learning_rate": 0.0001, "loss": 8.3252, "loss/crossentropy": 2.266029477119446, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26920171082019806, "step": 2760 }, { "epoch": 0.172625, "grad_norm": 3.0, "grad_norm_var": 0.04195556640625, "learning_rate": 0.0001, "loss": 8.3167, "loss/crossentropy": 2.2225186824798584, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2658109962940216, "step": 2762 }, { "epoch": 0.17275, "grad_norm": 3.015625, "grad_norm_var": 0.021712239583333334, "learning_rate": 0.0001, "loss": 8.3062, "loss/crossentropy": 2.5298062562942505, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.2963249981403351, "step": 2764 }, { "epoch": 0.172875, "grad_norm": 3.109375, "grad_norm_var": 0.011735026041666667, "learning_rate": 0.0001, "loss": 8.1575, "loss/crossentropy": 2.093632698059082, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2585790827870369, "step": 2766 }, { "epoch": 0.173, "grad_norm": 2.9375, "grad_norm_var": 0.010530598958333333, "learning_rate": 0.0001, "loss": 8.2698, "loss/crossentropy": 2.2221652269363403, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2734009772539139, "step": 2768 }, { "epoch": 0.173125, "grad_norm": 2.984375, "grad_norm_var": 0.011909993489583333, "learning_rate": 0.0001, "loss": 8.0252, "loss/crossentropy": 2.214139223098755, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2616540938615799, "step": 2770 }, { "epoch": 0.17325, "grad_norm": 3.078125, "grad_norm_var": 0.012109375, "learning_rate": 0.0001, "loss": 8.18, "loss/crossentropy": 2.2513784170150757, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.25692541897296906, "step": 2772 }, { "epoch": 0.173375, "grad_norm": 2.8125, "grad_norm_var": 0.015184529622395833, "learning_rate": 0.0001, "loss": 8.1285, "loss/crossentropy": 2.4102020263671875, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2653050720691681, "step": 2774 }, { "epoch": 0.1735, "grad_norm": 3.140625, "grad_norm_var": 0.014876302083333333, "learning_rate": 0.0001, "loss": 8.1504, "loss/crossentropy": 2.2256804704666138, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26175589859485626, "step": 2776 }, { "epoch": 0.173625, "grad_norm": 2.921875, "grad_norm_var": 0.013297526041666667, "learning_rate": 0.0001, "loss": 8.2647, "loss/crossentropy": 2.562616467475891, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.28543052077293396, "step": 2778 }, { "epoch": 0.17375, "grad_norm": 2.859375, "grad_norm_var": 0.013963826497395833, "learning_rate": 0.0001, "loss": 8.0907, "loss/crossentropy": 2.1320207118988037, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2546294927597046, "step": 2780 }, { "epoch": 0.173875, "grad_norm": 3.21875, "grad_norm_var": 0.020556640625, "learning_rate": 0.0001, "loss": 8.3402, "loss/crossentropy": 2.2618422508239746, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.27197229862213135, "step": 2782 }, { "epoch": 0.174, "grad_norm": 3.28125, "grad_norm_var": 0.024723307291666666, "learning_rate": 0.0001, "loss": 8.4636, "loss/crossentropy": 2.625874876976013, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.27097639441490173, "step": 2784 }, { "epoch": 0.174125, "grad_norm": 2.90625, "grad_norm_var": 0.023558553059895834, "learning_rate": 0.0001, "loss": 8.1304, "loss/crossentropy": 2.170462965965271, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.28276196122169495, "step": 2786 }, { "epoch": 0.17425, "grad_norm": 3.078125, "grad_norm_var": 0.024290974934895834, "learning_rate": 0.0001, "loss": 8.0283, "loss/crossentropy": 2.2857601642608643, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2574136555194855, "step": 2788 }, { "epoch": 0.174375, "grad_norm": 3.09375, "grad_norm_var": 0.020978800455729165, "learning_rate": 0.0001, "loss": 8.1718, "loss/crossentropy": 2.1768821477890015, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26678016781806946, "step": 2790 }, { "epoch": 0.1745, "grad_norm": 3.140625, "grad_norm_var": 0.027762858072916667, "learning_rate": 0.0001, "loss": 8.3611, "loss/crossentropy": 2.337521195411682, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26058705151081085, "step": 2792 }, { "epoch": 0.174625, "grad_norm": 3.0, "grad_norm_var": 0.0259185791015625, "learning_rate": 0.0001, "loss": 8.2887, "loss/crossentropy": 2.417194366455078, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2650887668132782, "step": 2794 }, { "epoch": 0.17475, "grad_norm": 3.125, "grad_norm_var": 0.021305338541666666, "learning_rate": 0.0001, "loss": 8.2086, "loss/crossentropy": 2.3959646224975586, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25554417073726654, "step": 2796 }, { "epoch": 0.174875, "grad_norm": 3.140625, "grad_norm_var": 0.02144775390625, "learning_rate": 0.0001, "loss": 7.9649, "loss/crossentropy": 2.2579997777938843, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24610213935375214, "step": 2798 }, { "epoch": 0.175, "grad_norm": 2.984375, "grad_norm_var": 0.01875, "learning_rate": 0.0001, "loss": 8.1861, "loss/crossentropy": 2.402603507041931, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27010577917099, "step": 2800 }, { "epoch": 0.175125, "grad_norm": 2.984375, "grad_norm_var": 0.018773396809895832, "learning_rate": 0.0001, "loss": 8.0684, "loss/crossentropy": 2.487064242362976, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26188019663095474, "step": 2802 }, { "epoch": 0.17525, "grad_norm": 2.921875, "grad_norm_var": 0.016380818684895833, "learning_rate": 0.0001, "loss": 8.0874, "loss/crossentropy": 2.27841317653656, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25010599195957184, "step": 2804 }, { "epoch": 0.175375, "grad_norm": 3.3125, "grad_norm_var": 0.027131144205729166, "learning_rate": 0.0001, "loss": 8.2989, "loss/crossentropy": 2.4073312282562256, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.2679283916950226, "step": 2806 }, { "epoch": 0.1755, "grad_norm": 3.40625, "grad_norm_var": 0.0291015625, "learning_rate": 0.0001, "loss": 8.4547, "loss/crossentropy": 2.3957384824752808, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.27833716571331024, "step": 2808 }, { "epoch": 0.175625, "grad_norm": 3.03125, "grad_norm_var": 0.0291015625, "learning_rate": 0.0001, "loss": 8.2216, "loss/crossentropy": 2.447432279586792, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.283155158162117, "step": 2810 }, { "epoch": 0.17575, "grad_norm": 3.171875, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 8.1123, "loss/crossentropy": 2.1782784461975098, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24672221392393112, "step": 2812 }, { "epoch": 0.175875, "grad_norm": 3.03125, "grad_norm_var": 0.0337890625, "learning_rate": 0.0001, "loss": 8.2015, "loss/crossentropy": 2.194298505783081, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.25301510840654373, "step": 2814 }, { "epoch": 0.176, "grad_norm": 3.046875, "grad_norm_var": 0.035309855143229166, "learning_rate": 0.0001, "loss": 8.258, "loss/crossentropy": 2.2716476917266846, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28211984038352966, "step": 2816 }, { "epoch": 0.176125, "grad_norm": 3.078125, "grad_norm_var": 0.032548014322916666, "learning_rate": 0.0001, "loss": 8.1436, "loss/crossentropy": 2.4797651767730713, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27206259965896606, "step": 2818 }, { "epoch": 0.17625, "grad_norm": 3.171875, "grad_norm_var": 0.033610026041666664, "learning_rate": 0.0001, "loss": 8.1555, "loss/crossentropy": 2.3100684881210327, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25597915798425674, "step": 2820 }, { "epoch": 0.176375, "grad_norm": 3.0, "grad_norm_var": 0.025153605143229167, "learning_rate": 0.0001, "loss": 8.19, "loss/crossentropy": 2.343941330909729, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26012492179870605, "step": 2822 }, { "epoch": 0.1765, "grad_norm": 2.953125, "grad_norm_var": 0.01533203125, "learning_rate": 0.0001, "loss": 8.1131, "loss/crossentropy": 2.218177556991577, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2628052681684494, "step": 2824 }, { "epoch": 0.176625, "grad_norm": 3.0, "grad_norm_var": 0.0157135009765625, "learning_rate": 0.0001, "loss": 8.3048, "loss/crossentropy": 2.3974251747131348, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.28162911534309387, "step": 2826 }, { "epoch": 0.17675, "grad_norm": 3.109375, "grad_norm_var": 0.012743123372395833, "learning_rate": 0.0001, "loss": 8.1691, "loss/crossentropy": 2.1724337339401245, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26139407604932785, "step": 2828 }, { "epoch": 0.176875, "grad_norm": 3.015625, "grad_norm_var": 0.007258097330729167, "learning_rate": 0.0001, "loss": 8.2618, "loss/crossentropy": 2.3914555311203003, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.28328216075897217, "step": 2830 }, { "epoch": 0.177, "grad_norm": 2.828125, "grad_norm_var": 0.00855712890625, "learning_rate": 0.0001, "loss": 7.9935, "loss/crossentropy": 2.235588788986206, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2594834715127945, "step": 2832 }, { "epoch": 0.177125, "grad_norm": 3.046875, "grad_norm_var": 0.0100006103515625, "learning_rate": 0.0001, "loss": 8.3621, "loss/crossentropy": 2.38160240650177, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.28699810802936554, "step": 2834 }, { "epoch": 0.17725, "grad_norm": 2.96875, "grad_norm_var": 0.0061187744140625, "learning_rate": 0.0001, "loss": 8.2658, "loss/crossentropy": 2.5288161039352417, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2768819034099579, "step": 2836 }, { "epoch": 0.177375, "grad_norm": 2.859375, "grad_norm_var": 0.0096099853515625, "learning_rate": 0.0001, "loss": 7.8831, "loss/crossentropy": 2.262709617614746, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2727932184934616, "step": 2838 }, { "epoch": 0.1775, "grad_norm": 3.03125, "grad_norm_var": 0.010016886393229167, "learning_rate": 0.0001, "loss": 8.3451, "loss/crossentropy": 2.432405710220337, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.27632567286491394, "step": 2840 }, { "epoch": 0.177625, "grad_norm": 2.859375, "grad_norm_var": 0.011197916666666667, "learning_rate": 0.0001, "loss": 8.1573, "loss/crossentropy": 2.217153549194336, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2539759650826454, "step": 2842 }, { "epoch": 0.17775, "grad_norm": 3.234375, "grad_norm_var": 0.05241597493489583, "learning_rate": 0.0001, "loss": 8.5061, "loss/crossentropy": 2.3605984449386597, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2897241413593292, "step": 2844 }, { "epoch": 0.177875, "grad_norm": 3.125, "grad_norm_var": 0.0523590087890625, "learning_rate": 0.0001, "loss": 8.0759, "loss/crossentropy": 2.083876132965088, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2714761793613434, "step": 2846 }, { "epoch": 0.178, "grad_norm": 3.375, "grad_norm_var": 0.055680338541666666, "learning_rate": 0.0001, "loss": 8.4974, "loss/crossentropy": 2.539989471435547, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.28579047322273254, "step": 2848 }, { "epoch": 0.178125, "grad_norm": 3.265625, "grad_norm_var": 0.05750325520833333, "learning_rate": 0.0001, "loss": 8.2717, "loss/crossentropy": 2.558953881263733, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2836534380912781, "step": 2850 }, { "epoch": 0.17825, "grad_norm": 3.0, "grad_norm_var": 0.05771484375, "learning_rate": 0.0001, "loss": 8.2571, "loss/crossentropy": 2.3250139951705933, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25975073873996735, "step": 2852 }, { "epoch": 0.178375, "grad_norm": 3.015625, "grad_norm_var": 0.050032552083333334, "learning_rate": 0.0001, "loss": 8.24, "loss/crossentropy": 2.257322072982788, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26378054916858673, "step": 2854 }, { "epoch": 0.1785, "grad_norm": 3.015625, "grad_norm_var": 0.04654541015625, "learning_rate": 0.0001, "loss": 8.5173, "loss/crossentropy": 2.4077916145324707, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.30688565969467163, "step": 2856 }, { "epoch": 0.178625, "grad_norm": 3.25, "grad_norm_var": 0.0427642822265625, "learning_rate": 0.0001, "loss": 8.2647, "loss/crossentropy": 2.3220880031585693, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2572537362575531, "step": 2858 }, { "epoch": 0.17875, "grad_norm": 2.96875, "grad_norm_var": 0.019269816080729165, "learning_rate": 0.0001, "loss": 8.1931, "loss/crossentropy": 2.392123222351074, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2863081991672516, "step": 2860 }, { "epoch": 0.178875, "grad_norm": 3.03125, "grad_norm_var": 0.021647135416666668, "learning_rate": 0.0001, "loss": 8.284, "loss/crossentropy": 2.2904245853424072, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2629016935825348, "step": 2862 }, { "epoch": 0.179, "grad_norm": 2.9375, "grad_norm_var": 0.017724609375, "learning_rate": 0.0001, "loss": 7.9001, "loss/crossentropy": 2.10800838470459, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25615420937538147, "step": 2864 }, { "epoch": 0.179125, "grad_norm": 3.015625, "grad_norm_var": 0.015380859375, "learning_rate": 0.0001, "loss": 8.1166, "loss/crossentropy": 2.113399863243103, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2571730315685272, "step": 2866 }, { "epoch": 0.17925, "grad_norm": 3.078125, "grad_norm_var": 0.01324462890625, "learning_rate": 0.0001, "loss": 8.3192, "loss/crossentropy": 2.5056850910186768, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27596791088581085, "step": 2868 }, { "epoch": 0.179375, "grad_norm": 2.96875, "grad_norm_var": 0.0142974853515625, "learning_rate": 0.0001, "loss": 8.0736, "loss/crossentropy": 2.1984351873397827, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.22599153965711594, "step": 2870 }, { "epoch": 0.1795, "grad_norm": 2.921875, "grad_norm_var": 0.013288370768229167, "learning_rate": 0.0001, "loss": 8.1654, "loss/crossentropy": 2.398763060569763, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2709758132696152, "step": 2872 }, { "epoch": 0.179625, "grad_norm": 3.578125, "grad_norm_var": 0.031859334309895834, "learning_rate": 0.0001, "loss": 8.246, "loss/crossentropy": 2.279433012008667, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26721224188804626, "step": 2874 }, { "epoch": 0.17975, "grad_norm": 3.578125, "grad_norm_var": 0.09832356770833334, "learning_rate": 0.0001, "loss": 8.6018, "loss/crossentropy": 2.429903745651245, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.3210333585739136, "step": 2876 }, { "epoch": 0.179875, "grad_norm": 2.90625, "grad_norm_var": 0.10014546712239583, "learning_rate": 0.0001, "loss": 8.3066, "loss/crossentropy": 2.235803484916687, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.265362948179245, "step": 2878 }, { "epoch": 0.18, "grad_norm": 2.9375, "grad_norm_var": 0.10035400390625, "learning_rate": 0.0001, "loss": 8.2791, "loss/crossentropy": 2.2972434759140015, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25388309359550476, "step": 2880 }, { "epoch": 0.180125, "grad_norm": 3.25, "grad_norm_var": 0.10048828125, "learning_rate": 0.0001, "loss": 8.3889, "loss/crossentropy": 2.3344578742980957, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.25023628771305084, "step": 2882 }, { "epoch": 0.18025, "grad_norm": 3.046875, "grad_norm_var": 0.10738525390625, "learning_rate": 0.0001, "loss": 8.2306, "loss/crossentropy": 2.3195676803588867, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27837128937244415, "step": 2884 }, { "epoch": 0.180375, "grad_norm": 2.921875, "grad_norm_var": 0.1024810791015625, "learning_rate": 0.0001, "loss": 8.0015, "loss/crossentropy": 2.394813656806946, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.25014276802539825, "step": 2886 }, { "epoch": 0.1805, "grad_norm": 2.90625, "grad_norm_var": 0.11116129557291667, "learning_rate": 0.0001, "loss": 7.8888, "loss/crossentropy": 2.122212529182434, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2572604715824127, "step": 2888 }, { "epoch": 0.180625, "grad_norm": 3.25, "grad_norm_var": 0.09462890625, "learning_rate": 0.0001, "loss": 8.2487, "loss/crossentropy": 2.3987185955047607, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.28120650351047516, "step": 2890 }, { "epoch": 0.18075, "grad_norm": 3.109375, "grad_norm_var": 0.03023681640625, "learning_rate": 0.0001, "loss": 8.2168, "loss/crossentropy": 2.4273810386657715, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2616555094718933, "step": 2892 }, { "epoch": 0.180875, "grad_norm": 2.8125, "grad_norm_var": 0.035065714518229166, "learning_rate": 0.0001, "loss": 8.1309, "loss/crossentropy": 2.4763081073760986, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26364801824092865, "step": 2894 }, { "epoch": 0.181, "grad_norm": 3.015625, "grad_norm_var": 0.03367411295572917, "learning_rate": 0.0001, "loss": 8.1263, "loss/crossentropy": 2.194410800933838, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.260297030210495, "step": 2896 }, { "epoch": 0.181125, "grad_norm": 2.78125, "grad_norm_var": 0.029508463541666665, "learning_rate": 0.0001, "loss": 8.0331, "loss/crossentropy": 2.2540624141693115, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26236794888973236, "step": 2898 }, { "epoch": 0.18125, "grad_norm": 2.890625, "grad_norm_var": 0.0299957275390625, "learning_rate": 0.0001, "loss": 8.041, "loss/crossentropy": 2.181529998779297, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2571089118719101, "step": 2900 }, { "epoch": 0.181375, "grad_norm": 3.28125, "grad_norm_var": 0.03355204264322917, "learning_rate": 0.0001, "loss": 8.4286, "loss/crossentropy": 2.3706891536712646, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.29068616032600403, "step": 2902 }, { "epoch": 0.1815, "grad_norm": 3.046875, "grad_norm_var": 0.032063802083333336, "learning_rate": 0.0001, "loss": 8.322, "loss/crossentropy": 2.4488571882247925, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25543512403964996, "step": 2904 }, { "epoch": 0.181625, "grad_norm": 2.875, "grad_norm_var": 0.027962239583333333, "learning_rate": 0.0001, "loss": 8.1152, "loss/crossentropy": 2.252503991127014, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25314611941576004, "step": 2906 }, { "epoch": 0.18175, "grad_norm": 2.90625, "grad_norm_var": 0.025829060872395834, "learning_rate": 0.0001, "loss": 8.0455, "loss/crossentropy": 2.0501604080200195, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23801933228969574, "step": 2908 }, { "epoch": 0.181875, "grad_norm": 3.328125, "grad_norm_var": 0.030696614583333334, "learning_rate": 0.0001, "loss": 8.2281, "loss/crossentropy": 2.1372073888778687, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2622874677181244, "step": 2910 }, { "epoch": 0.182, "grad_norm": 3.453125, "grad_norm_var": 0.7165191650390625, "learning_rate": 0.0001, "loss": 8.3927, "loss/crossentropy": 2.423773169517517, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2817266881465912, "step": 2912 }, { "epoch": 0.182125, "grad_norm": 3.234375, "grad_norm_var": 0.6845011393229167, "learning_rate": 0.0001, "loss": 8.3796, "loss/crossentropy": 2.2435269355773926, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.2818005681037903, "step": 2914 }, { "epoch": 0.18225, "grad_norm": 4.53125, "grad_norm_var": 0.7542154947916667, "learning_rate": 0.0001, "loss": 8.1857, "loss/crossentropy": 2.295470118522644, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28818129003047943, "step": 2916 }, { "epoch": 0.182375, "grad_norm": 3.640625, "grad_norm_var": 0.7405588785807292, "learning_rate": 0.0001, "loss": 8.3772, "loss/crossentropy": 2.2698758840560913, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2839386910200119, "step": 2918 }, { "epoch": 0.1825, "grad_norm": 2.984375, "grad_norm_var": 0.7203084309895833, "learning_rate": 0.0001, "loss": 8.25, "loss/crossentropy": 2.4441301822662354, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26232658326625824, "step": 2920 }, { "epoch": 0.182625, "grad_norm": 3.078125, "grad_norm_var": 0.7089192708333333, "learning_rate": 0.0001, "loss": 8.3258, "loss/crossentropy": 2.4217323064804077, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.2775084972381592, "step": 2922 }, { "epoch": 0.18275, "grad_norm": 3.1875, "grad_norm_var": 0.6976399739583333, "learning_rate": 0.0001, "loss": 8.2529, "loss/crossentropy": 2.181835651397705, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2539723962545395, "step": 2924 }, { "epoch": 0.182875, "grad_norm": 2.953125, "grad_norm_var": 0.7229400634765625, "learning_rate": 0.0001, "loss": 8.3196, "loss/crossentropy": 2.4659998416900635, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.27879883348941803, "step": 2926 }, { "epoch": 0.183, "grad_norm": 2.796875, "grad_norm_var": 0.18976236979166666, "learning_rate": 0.0001, "loss": 8.1181, "loss/crossentropy": 2.387602686882019, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.27245059609413147, "step": 2928 }, { "epoch": 0.183125, "grad_norm": 3.0625, "grad_norm_var": 0.18590087890625, "learning_rate": 0.0001, "loss": 8.2374, "loss/crossentropy": 2.31194806098938, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26923683285713196, "step": 2930 }, { "epoch": 0.18325, "grad_norm": 3.078125, "grad_norm_var": 0.18599344889322916, "learning_rate": 0.0001, "loss": 8.1433, "loss/crossentropy": 2.1902053356170654, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2595588266849518, "step": 2932 }, { "epoch": 0.183375, "grad_norm": 2.953125, "grad_norm_var": 0.1603668212890625, "learning_rate": 0.0001, "loss": 8.1451, "loss/crossentropy": 2.359446108341217, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26731616258621216, "step": 2934 }, { "epoch": 0.1835, "grad_norm": 3.0625, "grad_norm_var": 0.1576812744140625, "learning_rate": 0.0001, "loss": 8.2641, "loss/crossentropy": 2.2995764017105103, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26409196853637695, "step": 2936 }, { "epoch": 0.183625, "grad_norm": 3.625, "grad_norm_var": 0.1730621337890625, "learning_rate": 0.0001, "loss": 8.3355, "loss/crossentropy": 2.4295204877853394, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.28618867695331573, "step": 2938 }, { "epoch": 0.18375, "grad_norm": 2.90625, "grad_norm_var": 0.1745025634765625, "learning_rate": 0.0001, "loss": 8.3801, "loss/crossentropy": 2.6878920793533325, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.28163112699985504, "step": 2940 }, { "epoch": 0.183875, "grad_norm": 3.078125, "grad_norm_var": 0.17681884765625, "learning_rate": 0.0001, "loss": 8.2906, "loss/crossentropy": 2.4048426151275635, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2580362558364868, "step": 2942 }, { "epoch": 0.184, "grad_norm": 2.953125, "grad_norm_var": 0.17281494140625, "learning_rate": 0.0001, "loss": 8.1785, "loss/crossentropy": 2.345201253890991, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2763300687074661, "step": 2944 }, { "epoch": 0.184125, "grad_norm": 3.078125, "grad_norm_var": 0.16936848958333334, "learning_rate": 0.0001, "loss": 8.2876, "loss/crossentropy": 2.6029101610183716, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2681310623884201, "step": 2946 }, { "epoch": 0.18425, "grad_norm": 2.921875, "grad_norm_var": 0.036799112955729164, "learning_rate": 0.0001, "loss": 8.2017, "loss/crossentropy": 2.372221827507019, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2688012570142746, "step": 2948 }, { "epoch": 0.184375, "grad_norm": 3.15625, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 8.158, "loss/crossentropy": 2.4124748706817627, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2789239138364792, "step": 2950 }, { "epoch": 0.1845, "grad_norm": 2.96875, "grad_norm_var": 0.038248697916666664, "learning_rate": 0.0001, "loss": 8.1871, "loss/crossentropy": 2.4088337421417236, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2847772538661957, "step": 2952 }, { "epoch": 0.184625, "grad_norm": 3.0, "grad_norm_var": 0.01422119140625, "learning_rate": 0.0001, "loss": 8.1798, "loss/crossentropy": 2.482720732688904, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26483161747455597, "step": 2954 }, { "epoch": 0.18475, "grad_norm": 2.84375, "grad_norm_var": 0.015876261393229167, "learning_rate": 0.0001, "loss": 7.9673, "loss/crossentropy": 2.3646737337112427, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.24630165100097656, "step": 2956 }, { "epoch": 0.184875, "grad_norm": 2.953125, "grad_norm_var": 0.0157379150390625, "learning_rate": 0.0001, "loss": 8.1089, "loss/crossentropy": 2.3218533992767334, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26656070351600647, "step": 2958 }, { "epoch": 0.185, "grad_norm": 2.984375, "grad_norm_var": 0.015607706705729167, "learning_rate": 0.0001, "loss": 8.2027, "loss/crossentropy": 2.2734180688858032, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25169242173433304, "step": 2960 }, { "epoch": 0.185125, "grad_norm": 3.0, "grad_norm_var": 0.013084920247395833, "learning_rate": 0.0001, "loss": 8.2544, "loss/crossentropy": 2.438475728034973, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2564696967601776, "step": 2962 }, { "epoch": 0.18525, "grad_norm": 2.90625, "grad_norm_var": 0.0137847900390625, "learning_rate": 0.0001, "loss": 8.1306, "loss/crossentropy": 2.2471766471862793, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.25111958384513855, "step": 2964 }, { "epoch": 0.185375, "grad_norm": 2.984375, "grad_norm_var": 0.011921183268229166, "learning_rate": 0.0001, "loss": 8.2375, "loss/crossentropy": 2.4147186279296875, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.2586153745651245, "step": 2966 }, { "epoch": 0.1855, "grad_norm": 2.75, "grad_norm_var": 0.0188873291015625, "learning_rate": 0.0001, "loss": 8.3192, "loss/crossentropy": 2.4494996070861816, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.2895759344100952, "step": 2968 }, { "epoch": 0.185625, "grad_norm": 3.109375, "grad_norm_var": 0.0251129150390625, "learning_rate": 0.0001, "loss": 8.339, "loss/crossentropy": 2.3236255645751953, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.30765844881534576, "step": 2970 }, { "epoch": 0.18575, "grad_norm": 3.203125, "grad_norm_var": 0.023909505208333334, "learning_rate": 0.0001, "loss": 8.0738, "loss/crossentropy": 2.198439598083496, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26555734872817993, "step": 2972 }, { "epoch": 0.185875, "grad_norm": 3.0, "grad_norm_var": 0.0239410400390625, "learning_rate": 0.0001, "loss": 8.2006, "loss/crossentropy": 2.0953043699264526, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26947975903749466, "step": 2974 }, { "epoch": 0.186, "grad_norm": 3.140625, "grad_norm_var": 0.019661458333333333, "learning_rate": 0.0001, "loss": 8.4606, "loss/crossentropy": 2.4526013135910034, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.28150199353694916, "step": 2976 }, { "epoch": 0.186125, "grad_norm": 2.9375, "grad_norm_var": 0.020637003580729167, "learning_rate": 0.0001, "loss": 8.2076, "loss/crossentropy": 2.2855358123779297, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25133074820041656, "step": 2978 }, { "epoch": 0.18625, "grad_norm": 2.828125, "grad_norm_var": 0.022802734375, "learning_rate": 0.0001, "loss": 8.3427, "loss/crossentropy": 2.376999020576477, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26656532287597656, "step": 2980 }, { "epoch": 0.186375, "grad_norm": 3.09375, "grad_norm_var": 0.027046712239583333, "learning_rate": 0.0001, "loss": 8.1237, "loss/crossentropy": 2.2938228845596313, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24355412274599075, "step": 2982 }, { "epoch": 0.1865, "grad_norm": 3.125, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 8.2351, "loss/crossentropy": 2.2552013397216797, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.2814163714647293, "step": 2984 }, { "epoch": 0.186625, "grad_norm": 3.140625, "grad_norm_var": 0.017366536458333335, "learning_rate": 0.0001, "loss": 8.0533, "loss/crossentropy": 2.53173291683197, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2805483788251877, "step": 2986 }, { "epoch": 0.18675, "grad_norm": 3.0625, "grad_norm_var": 0.016178385416666666, "learning_rate": 0.0001, "loss": 8.0591, "loss/crossentropy": 2.2584705352783203, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2641800567507744, "step": 2988 }, { "epoch": 0.186875, "grad_norm": 2.921875, "grad_norm_var": 0.022163899739583333, "learning_rate": 0.0001, "loss": 7.9337, "loss/crossentropy": 1.8534721732139587, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2394864708185196, "step": 2990 }, { "epoch": 0.187, "grad_norm": 3.0625, "grad_norm_var": 0.023420206705729165, "learning_rate": 0.0001, "loss": 8.223, "loss/crossentropy": 2.293165445327759, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25678662955760956, "step": 2992 }, { "epoch": 0.187125, "grad_norm": 2.796875, "grad_norm_var": 0.030631510416666667, "learning_rate": 0.0001, "loss": 8.0176, "loss/crossentropy": 2.2423532009124756, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24015968292951584, "step": 2994 }, { "epoch": 0.18725, "grad_norm": 3.203125, "grad_norm_var": 0.032763671875, "learning_rate": 0.0001, "loss": 8.0765, "loss/crossentropy": 2.11912739276886, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27838442474603653, "step": 2996 }, { "epoch": 0.187375, "grad_norm": 3.109375, "grad_norm_var": 0.03322652180989583, "learning_rate": 0.0001, "loss": 8.332, "loss/crossentropy": 2.430270552635193, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2781721204519272, "step": 2998 }, { "epoch": 0.1875, "grad_norm": 2.984375, "grad_norm_var": 0.030078125, "learning_rate": 0.0001, "loss": 8.0788, "loss/crossentropy": 2.3152072429656982, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24442951381206512, "step": 3000 }, { "epoch": 0.187625, "grad_norm": 2.875, "grad_norm_var": 0.032080078125, "learning_rate": 0.0001, "loss": 7.8221, "loss/crossentropy": 2.044616162776947, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22671421617269516, "step": 3002 }, { "epoch": 0.18775, "grad_norm": 3.140625, "grad_norm_var": 0.0315826416015625, "learning_rate": 0.0001, "loss": 8.0428, "loss/crossentropy": 1.980787992477417, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.24500887095928192, "step": 3004 }, { "epoch": 0.187875, "grad_norm": 3.21875, "grad_norm_var": 0.04780171712239583, "learning_rate": 0.0001, "loss": 8.3364, "loss/crossentropy": 2.239225387573242, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2766413688659668, "step": 3006 }, { "epoch": 0.188, "grad_norm": 2.8125, "grad_norm_var": 0.049046834309895836, "learning_rate": 0.0001, "loss": 8.1033, "loss/crossentropy": 2.448020100593567, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26164938509464264, "step": 3008 }, { "epoch": 0.188125, "grad_norm": 3.046875, "grad_norm_var": 0.040673828125, "learning_rate": 0.0001, "loss": 8.3013, "loss/crossentropy": 2.4478694200515747, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25723396241664886, "step": 3010 }, { "epoch": 0.18825, "grad_norm": 2.90625, "grad_norm_var": 0.037083943684895836, "learning_rate": 0.0001, "loss": 8.0114, "loss/crossentropy": 2.2931246757507324, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25216321647167206, "step": 3012 }, { "epoch": 0.188375, "grad_norm": 2.875, "grad_norm_var": 0.036031087239583336, "learning_rate": 0.0001, "loss": 8.0789, "loss/crossentropy": 2.2818782329559326, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2645493298768997, "step": 3014 }, { "epoch": 0.1885, "grad_norm": 3.390625, "grad_norm_var": 0.043944295247395834, "learning_rate": 0.0001, "loss": 8.0871, "loss/crossentropy": 2.1904850006103516, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2534702569246292, "step": 3016 }, { "epoch": 0.188625, "grad_norm": 3.0625, "grad_norm_var": 0.040827433268229164, "learning_rate": 0.0001, "loss": 8.2858, "loss/crossentropy": 2.5582823753356934, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.28889524936676025, "step": 3018 }, { "epoch": 0.18875, "grad_norm": 3.140625, "grad_norm_var": 0.044896443684895836, "learning_rate": 0.0001, "loss": 8.3075, "loss/crossentropy": 2.334454655647278, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.25536222755908966, "step": 3020 }, { "epoch": 0.188875, "grad_norm": 3.09375, "grad_norm_var": 0.025641886393229167, "learning_rate": 0.0001, "loss": 8.0683, "loss/crossentropy": 2.330474376678467, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2533309534192085, "step": 3022 }, { "epoch": 0.189, "grad_norm": 3.125, "grad_norm_var": 0.025373331705729165, "learning_rate": 0.0001, "loss": 8.111, "loss/crossentropy": 2.410144090652466, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26289787888526917, "step": 3024 }, { "epoch": 0.189125, "grad_norm": 2.9375, "grad_norm_var": 0.024149576822916668, "learning_rate": 0.0001, "loss": 8.4391, "loss/crossentropy": 2.5078091621398926, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28905192017555237, "step": 3026 }, { "epoch": 0.18925, "grad_norm": 3.09375, "grad_norm_var": 0.024283854166666667, "learning_rate": 0.0001, "loss": 8.213, "loss/crossentropy": 2.3117668628692627, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.3114188015460968, "step": 3028 }, { "epoch": 0.189375, "grad_norm": 3.0625, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 8.3104, "loss/crossentropy": 2.4223134517669678, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2804117053747177, "step": 3030 }, { "epoch": 0.1895, "grad_norm": 3.109375, "grad_norm_var": 0.016145833333333335, "learning_rate": 0.0001, "loss": 8.1046, "loss/crossentropy": 2.44333016872406, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2601943165063858, "step": 3032 }, { "epoch": 0.189625, "grad_norm": 3.28125, "grad_norm_var": 0.0273590087890625, "learning_rate": 0.0001, "loss": 8.171, "loss/crossentropy": 2.2447644472122192, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25088611245155334, "step": 3034 }, { "epoch": 0.18975, "grad_norm": 3.234375, "grad_norm_var": 0.0401031494140625, "learning_rate": 0.0001, "loss": 8.1771, "loss/crossentropy": 2.2574959993362427, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.300536185503006, "step": 3036 }, { "epoch": 0.189875, "grad_norm": 2.984375, "grad_norm_var": 0.04713134765625, "learning_rate": 0.0001, "loss": 8.1283, "loss/crossentropy": 2.2428945302963257, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27497410774230957, "step": 3038 }, { "epoch": 0.19, "grad_norm": 2.78125, "grad_norm_var": 0.05065104166666667, "learning_rate": 0.0001, "loss": 8.0159, "loss/crossentropy": 2.2476083040237427, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2652427852153778, "step": 3040 }, { "epoch": 0.190125, "grad_norm": 2.921875, "grad_norm_var": 0.050862630208333336, "learning_rate": 0.0001, "loss": 8.0181, "loss/crossentropy": 2.2924695014953613, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2591032460331917, "step": 3042 }, { "epoch": 0.19025, "grad_norm": 2.9375, "grad_norm_var": 0.050902303059895834, "learning_rate": 0.0001, "loss": 7.8949, "loss/crossentropy": 2.1451542377471924, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25656062364578247, "step": 3044 }, { "epoch": 0.190375, "grad_norm": 2.9375, "grad_norm_var": 0.051806640625, "learning_rate": 0.0001, "loss": 8.0506, "loss/crossentropy": 2.396212339401245, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.291217640042305, "step": 3046 }, { "epoch": 0.1905, "grad_norm": 2.75, "grad_norm_var": 0.05533447265625, "learning_rate": 0.0001, "loss": 8.1359, "loss/crossentropy": 2.4078985452651978, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2804025709629059, "step": 3048 }, { "epoch": 0.190625, "grad_norm": 2.78125, "grad_norm_var": 0.04224344889322917, "learning_rate": 0.0001, "loss": 8.0119, "loss/crossentropy": 2.522601842880249, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.29702451825141907, "step": 3050 }, { "epoch": 0.19075, "grad_norm": 2.96875, "grad_norm_var": 0.010400390625, "learning_rate": 0.0001, "loss": 7.8972, "loss/crossentropy": 2.4399465322494507, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2680388540029526, "step": 3052 }, { "epoch": 0.190875, "grad_norm": 3.28125, "grad_norm_var": 0.016974894205729167, "learning_rate": 0.0001, "loss": 7.9247, "loss/crossentropy": 2.208235263824463, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2559407502412796, "step": 3054 }, { "epoch": 0.191, "grad_norm": 2.90625, "grad_norm_var": 0.015119425455729167, "learning_rate": 0.0001, "loss": 8.1328, "loss/crossentropy": 2.2488074898719788, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25658509135246277, "step": 3056 }, { "epoch": 0.191125, "grad_norm": 2.984375, "grad_norm_var": 0.014533487955729167, "learning_rate": 0.0001, "loss": 8.1646, "loss/crossentropy": 2.3440630435943604, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.294276162981987, "step": 3058 }, { "epoch": 0.19125, "grad_norm": 2.921875, "grad_norm_var": 0.0190582275390625, "learning_rate": 0.0001, "loss": 8.112, "loss/crossentropy": 2.434723734855652, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25768575817346573, "step": 3060 }, { "epoch": 0.191375, "grad_norm": 2.890625, "grad_norm_var": 0.019969685872395834, "learning_rate": 0.0001, "loss": 8.128, "loss/crossentropy": 2.2569202184677124, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2592715919017792, "step": 3062 }, { "epoch": 0.1915, "grad_norm": 2.875, "grad_norm_var": 0.019196573893229166, "learning_rate": 0.0001, "loss": 7.9687, "loss/crossentropy": 2.238947808742523, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26189571619033813, "step": 3064 }, { "epoch": 0.191625, "grad_norm": 3.015625, "grad_norm_var": 0.0163970947265625, "learning_rate": 0.0001, "loss": 8.1887, "loss/crossentropy": 2.1487661600112915, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26625922322273254, "step": 3066 }, { "epoch": 0.19175, "grad_norm": 2.8125, "grad_norm_var": 0.0185546875, "learning_rate": 0.0001, "loss": 8.2271, "loss/crossentropy": 2.0519703030586243, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24336016178131104, "step": 3068 }, { "epoch": 0.191875, "grad_norm": 2.78125, "grad_norm_var": 0.016650390625, "learning_rate": 0.0001, "loss": 7.8515, "loss/crossentropy": 2.22737193107605, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24473804980516434, "step": 3070 }, { "epoch": 0.192, "grad_norm": 3.03125, "grad_norm_var": 0.017308553059895832, "learning_rate": 0.0001, "loss": 8.0573, "loss/crossentropy": 2.188886821269989, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.23682686686515808, "step": 3072 }, { "epoch": 0.192125, "grad_norm": 3.15625, "grad_norm_var": 0.10769755045572917, "learning_rate": 0.0001, "loss": 8.3074, "loss/crossentropy": 2.458534598350525, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25693877041339874, "step": 3074 }, { "epoch": 0.19225, "grad_norm": 2.90625, "grad_norm_var": 0.10537821451822917, "learning_rate": 0.0001, "loss": 8.2244, "loss/crossentropy": 2.522627353668213, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27664047479629517, "step": 3076 }, { "epoch": 0.192375, "grad_norm": 2.953125, "grad_norm_var": 0.10487874348958333, "learning_rate": 0.0001, "loss": 8.3394, "loss/crossentropy": 2.576285243034363, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.27181732654571533, "step": 3078 }, { "epoch": 0.1925, "grad_norm": 3.09375, "grad_norm_var": 0.10526936848958333, "learning_rate": 0.0001, "loss": 8.0066, "loss/crossentropy": 2.0819945335388184, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23909074068069458, "step": 3080 }, { "epoch": 0.192625, "grad_norm": 3.265625, "grad_norm_var": 0.1159332275390625, "learning_rate": 0.0001, "loss": 8.1581, "loss/crossentropy": 2.2708539962768555, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2756985127925873, "step": 3082 }, { "epoch": 0.19275, "grad_norm": 2.859375, "grad_norm_var": 0.11389973958333334, "learning_rate": 0.0001, "loss": 8.2785, "loss/crossentropy": 2.39635694026947, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2778060585260391, "step": 3084 }, { "epoch": 0.192875, "grad_norm": 3.078125, "grad_norm_var": 0.10504557291666666, "learning_rate": 0.0001, "loss": 8.2355, "loss/crossentropy": 2.3772268295288086, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2498919665813446, "step": 3086 }, { "epoch": 0.193, "grad_norm": 3.3125, "grad_norm_var": 0.10641276041666667, "learning_rate": 0.0001, "loss": 8.1642, "loss/crossentropy": 2.166185975074768, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2886776030063629, "step": 3088 }, { "epoch": 0.193125, "grad_norm": 2.84375, "grad_norm_var": 0.04262593587239583, "learning_rate": 0.0001, "loss": 8.0373, "loss/crossentropy": 2.174901783466339, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2579093724489212, "step": 3090 }, { "epoch": 0.19325, "grad_norm": 3.078125, "grad_norm_var": 0.0406402587890625, "learning_rate": 0.0001, "loss": 8.2906, "loss/crossentropy": 2.4586178064346313, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.26945915818214417, "step": 3092 }, { "epoch": 0.193375, "grad_norm": 3.140625, "grad_norm_var": 0.04579976399739583, "learning_rate": 0.0001, "loss": 8.0433, "loss/crossentropy": 2.2675609588623047, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24192240834236145, "step": 3094 }, { "epoch": 0.1935, "grad_norm": 2.90625, "grad_norm_var": 0.04088134765625, "learning_rate": 0.0001, "loss": 8.1895, "loss/crossentropy": 2.378737211227417, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27523770928382874, "step": 3096 }, { "epoch": 0.193625, "grad_norm": 2.96875, "grad_norm_var": 0.038655598958333336, "learning_rate": 0.0001, "loss": 8.1775, "loss/crossentropy": 2.2450079917907715, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26037096977233887, "step": 3098 }, { "epoch": 0.19375, "grad_norm": 2.859375, "grad_norm_var": 0.04511617024739583, "learning_rate": 0.0001, "loss": 7.9961, "loss/crossentropy": 2.214326858520508, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24653871357440948, "step": 3100 }, { "epoch": 0.193875, "grad_norm": 3.046875, "grad_norm_var": 0.04381510416666667, "learning_rate": 0.0001, "loss": 8.1401, "loss/crossentropy": 2.153132200241089, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26044490188360214, "step": 3102 }, { "epoch": 0.194, "grad_norm": 2.84375, "grad_norm_var": 0.03082275390625, "learning_rate": 0.0001, "loss": 8.0079, "loss/crossentropy": 2.1559587717056274, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24349378049373627, "step": 3104 }, { "epoch": 0.194125, "grad_norm": 3.25, "grad_norm_var": 0.03339436848958333, "learning_rate": 0.0001, "loss": 8.2171, "loss/crossentropy": 2.144322395324707, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2607101500034332, "step": 3106 }, { "epoch": 0.19425, "grad_norm": 3.09375, "grad_norm_var": 0.17703348795572918, "learning_rate": 0.0001, "loss": 8.2271, "loss/crossentropy": 2.158595383167267, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25443463027477264, "step": 3108 }, { "epoch": 0.194375, "grad_norm": 3.265625, "grad_norm_var": 0.17183837890625, "learning_rate": 0.0001, "loss": 8.2891, "loss/crossentropy": 2.3929585218429565, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2684243321418762, "step": 3110 }, { "epoch": 0.1945, "grad_norm": 3.203125, "grad_norm_var": 0.2350250244140625, "learning_rate": 0.0001, "loss": 8.2582, "loss/crossentropy": 2.22498095035553, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2694687396287918, "step": 3112 }, { "epoch": 0.194625, "grad_norm": 2.984375, "grad_norm_var": 0.23277587890625, "learning_rate": 0.0001, "loss": 8.2521, "loss/crossentropy": 2.2842044830322266, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2741314470767975, "step": 3114 }, { "epoch": 0.19475, "grad_norm": 3.109375, "grad_norm_var": 0.208984375, "learning_rate": 0.0001, "loss": 8.4313, "loss/crossentropy": 2.719553828239441, "loss/hidden": 3.2734375, "loss/jsd": 0.0, "loss/logits": 0.28248198330402374, "step": 3116 }, { "epoch": 0.194875, "grad_norm": 2.90625, "grad_norm_var": 0.21138916015625, "learning_rate": 0.0001, "loss": 8.0922, "loss/crossentropy": 2.3392093181610107, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24478980898857117, "step": 3118 }, { "epoch": 0.195, "grad_norm": 2.859375, "grad_norm_var": 0.20575764973958333, "learning_rate": 0.0001, "loss": 8.1143, "loss/crossentropy": 2.2137898802757263, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25172895193099976, "step": 3120 }, { "epoch": 0.195125, "grad_norm": 3.015625, "grad_norm_var": 0.20693257649739583, "learning_rate": 0.0001, "loss": 7.8959, "loss/crossentropy": 2.22554612159729, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2681921124458313, "step": 3122 }, { "epoch": 0.19525, "grad_norm": 2.65625, "grad_norm_var": 0.1068359375, "learning_rate": 0.0001, "loss": 8.1436, "loss/crossentropy": 2.3785455226898193, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2584487646818161, "step": 3124 }, { "epoch": 0.195375, "grad_norm": 3.03125, "grad_norm_var": 0.1123687744140625, "learning_rate": 0.0001, "loss": 8.0, "loss/crossentropy": 2.149886727333069, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2602705806493759, "step": 3126 }, { "epoch": 0.1955, "grad_norm": 2.84375, "grad_norm_var": 0.038248697916666664, "learning_rate": 0.0001, "loss": 8.0842, "loss/crossentropy": 2.262367606163025, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26558272540569305, "step": 3128 }, { "epoch": 0.195625, "grad_norm": 2.90625, "grad_norm_var": 0.04039713541666667, "learning_rate": 0.0001, "loss": 7.9579, "loss/crossentropy": 2.3544100522994995, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25334376096725464, "step": 3130 }, { "epoch": 0.19575, "grad_norm": 3.09375, "grad_norm_var": 0.040135701497395836, "learning_rate": 0.0001, "loss": 8.1151, "loss/crossentropy": 2.2972534894943237, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26104989647865295, "step": 3132 }, { "epoch": 0.195875, "grad_norm": 3.046875, "grad_norm_var": 0.03992411295572917, "learning_rate": 0.0001, "loss": 8.1342, "loss/crossentropy": 2.1989673376083374, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.277900829911232, "step": 3134 }, { "epoch": 0.196, "grad_norm": 2.9375, "grad_norm_var": 0.039460245768229166, "learning_rate": 0.0001, "loss": 8.0714, "loss/crossentropy": 2.4464385509490967, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2850402891635895, "step": 3136 }, { "epoch": 0.196125, "grad_norm": 3.0625, "grad_norm_var": 0.030094401041666666, "learning_rate": 0.0001, "loss": 8.0838, "loss/crossentropy": 2.524294137954712, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2605943828821182, "step": 3138 }, { "epoch": 0.19625, "grad_norm": 2.859375, "grad_norm_var": 0.02447509765625, "learning_rate": 0.0001, "loss": 8.2011, "loss/crossentropy": 2.336963415145874, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2699812203645706, "step": 3140 }, { "epoch": 0.196375, "grad_norm": 2.8125, "grad_norm_var": 0.025927734375, "learning_rate": 0.0001, "loss": 8.0793, "loss/crossentropy": 2.0558972358703613, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23027782887220383, "step": 3142 }, { "epoch": 0.1965, "grad_norm": 3.0, "grad_norm_var": 0.011995442708333333, "learning_rate": 0.0001, "loss": 8.0777, "loss/crossentropy": 2.488237142562866, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2672172784805298, "step": 3144 }, { "epoch": 0.196625, "grad_norm": 2.59375, "grad_norm_var": 0.018488566080729168, "learning_rate": 0.0001, "loss": 7.9179, "loss/crossentropy": 2.1932790279388428, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24824626743793488, "step": 3146 }, { "epoch": 0.19675, "grad_norm": 3.546875, "grad_norm_var": 0.04078369140625, "learning_rate": 0.0001, "loss": 8.3246, "loss/crossentropy": 2.5695748329162598, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.29574574530124664, "step": 3148 }, { "epoch": 0.196875, "grad_norm": 2.828125, "grad_norm_var": 0.041112263997395836, "learning_rate": 0.0001, "loss": 7.97, "loss/crossentropy": 2.1290723085403442, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25089605897665024, "step": 3150 }, { "epoch": 0.197, "grad_norm": 3.15625, "grad_norm_var": 0.04358622233072917, "learning_rate": 0.0001, "loss": 8.2776, "loss/crossentropy": 2.170577347278595, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2691800892353058, "step": 3152 }, { "epoch": 0.197125, "grad_norm": 2.796875, "grad_norm_var": 0.05865478515625, "learning_rate": 0.0001, "loss": 8.2212, "loss/crossentropy": 2.328927516937256, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26250624656677246, "step": 3154 }, { "epoch": 0.19725, "grad_norm": 2.875, "grad_norm_var": 0.060498046875, "learning_rate": 0.0001, "loss": 7.9754, "loss/crossentropy": 1.8585203289985657, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24215184152126312, "step": 3156 }, { "epoch": 0.197375, "grad_norm": 3.015625, "grad_norm_var": 0.05597330729166667, "learning_rate": 0.0001, "loss": 7.9847, "loss/crossentropy": 2.3386768102645874, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24663898348808289, "step": 3158 }, { "epoch": 0.1975, "grad_norm": 2.828125, "grad_norm_var": 0.059956868489583336, "learning_rate": 0.0001, "loss": 8.0886, "loss/crossentropy": 2.3307093381881714, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2650502547621727, "step": 3160 }, { "epoch": 0.197625, "grad_norm": 3.484375, "grad_norm_var": 0.06946614583333334, "learning_rate": 0.0001, "loss": 8.2584, "loss/crossentropy": 2.4045934677124023, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.26501110196113586, "step": 3162 }, { "epoch": 0.19775, "grad_norm": 3.21875, "grad_norm_var": 0.0585357666015625, "learning_rate": 0.0001, "loss": 8.2483, "loss/crossentropy": 2.2774007320404053, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26311296224594116, "step": 3164 }, { "epoch": 0.197875, "grad_norm": 3.140625, "grad_norm_var": 0.060212198893229166, "learning_rate": 0.0001, "loss": 8.075, "loss/crossentropy": 2.447927236557007, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.28195714950561523, "step": 3166 }, { "epoch": 0.198, "grad_norm": 2.9375, "grad_norm_var": 0.05999348958333333, "learning_rate": 0.0001, "loss": 7.9802, "loss/crossentropy": 2.1790822744369507, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2555471360683441, "step": 3168 }, { "epoch": 0.198125, "grad_norm": 2.90625, "grad_norm_var": 0.04986979166666667, "learning_rate": 0.0001, "loss": 8.0028, "loss/crossentropy": 2.2405868768692017, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2627221643924713, "step": 3170 }, { "epoch": 0.19825, "grad_norm": 2.921875, "grad_norm_var": 0.048216756184895834, "learning_rate": 0.0001, "loss": 8.0685, "loss/crossentropy": 2.2628936767578125, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2600160911679268, "step": 3172 }, { "epoch": 0.198375, "grad_norm": 3.109375, "grad_norm_var": 0.050191243489583336, "learning_rate": 0.0001, "loss": 8.184, "loss/crossentropy": 2.193703293800354, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2586003839969635, "step": 3174 }, { "epoch": 0.1985, "grad_norm": 3.125, "grad_norm_var": 0.046483357747395836, "learning_rate": 0.0001, "loss": 8.3664, "loss/crossentropy": 2.391955852508545, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2850872576236725, "step": 3176 }, { "epoch": 0.198625, "grad_norm": 2.984375, "grad_norm_var": 0.019287109375, "learning_rate": 0.0001, "loss": 7.8907, "loss/crossentropy": 1.952785313129425, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24282608181238174, "step": 3178 }, { "epoch": 0.19875, "grad_norm": 3.015625, "grad_norm_var": 0.014460245768229166, "learning_rate": 0.0001, "loss": 8.1804, "loss/crossentropy": 2.3660178184509277, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2768784761428833, "step": 3180 }, { "epoch": 0.198875, "grad_norm": 3.0625, "grad_norm_var": 0.010904947916666666, "learning_rate": 0.0001, "loss": 8.1394, "loss/crossentropy": 2.3029706478118896, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.2709043174982071, "step": 3182 }, { "epoch": 0.199, "grad_norm": 2.890625, "grad_norm_var": 0.011458333333333333, "learning_rate": 0.0001, "loss": 8.1912, "loss/crossentropy": 2.247406482696533, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27382896840572357, "step": 3184 }, { "epoch": 0.199125, "grad_norm": 3.1875, "grad_norm_var": 0.013084920247395833, "learning_rate": 0.0001, "loss": 8.0932, "loss/crossentropy": 2.283021092414856, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2655387371778488, "step": 3186 }, { "epoch": 0.19925, "grad_norm": 2.84375, "grad_norm_var": 0.011986287434895833, "learning_rate": 0.0001, "loss": 8.1942, "loss/crossentropy": 2.3839281797409058, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2643355578184128, "step": 3188 }, { "epoch": 0.199375, "grad_norm": 2.953125, "grad_norm_var": 0.011979166666666667, "learning_rate": 0.0001, "loss": 8.1036, "loss/crossentropy": 2.3398349285125732, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2707003504037857, "step": 3190 }, { "epoch": 0.1995, "grad_norm": 2.90625, "grad_norm_var": 0.0106597900390625, "learning_rate": 0.0001, "loss": 7.9858, "loss/crossentropy": 2.2766858339309692, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26129937171936035, "step": 3192 }, { "epoch": 0.199625, "grad_norm": 5.59375, "grad_norm_var": 0.43753255208333336, "learning_rate": 0.0001, "loss": 8.4462, "loss/crossentropy": 2.1764395236968994, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2510588988661766, "step": 3194 }, { "epoch": 0.19975, "grad_norm": 3.484375, "grad_norm_var": 0.44677632649739585, "learning_rate": 0.0001, "loss": 8.3175, "loss/crossentropy": 2.365424394607544, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.29546721279621124, "step": 3196 }, { "epoch": 0.199875, "grad_norm": 2.859375, "grad_norm_var": 0.45133056640625, "learning_rate": 0.0001, "loss": 8.0542, "loss/crossentropy": 2.315553069114685, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2633578032255173, "step": 3198 }, { "epoch": 0.2, "grad_norm": 2.84375, "grad_norm_var": 0.45888671875, "learning_rate": 0.0001, "loss": 8.0282, "loss/crossentropy": 2.266456127166748, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24378645420074463, "step": 3200 }, { "epoch": 0.200125, "grad_norm": 3.125, "grad_norm_var": 0.45402730305989586, "learning_rate": 0.0001, "loss": 8.2908, "loss/crossentropy": 2.457024335861206, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.27127622067928314, "step": 3202 }, { "epoch": 0.20025, "grad_norm": 3.03125, "grad_norm_var": 0.44961649576822915, "learning_rate": 0.0001, "loss": 8.0589, "loss/crossentropy": 2.238759994506836, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24700668454170227, "step": 3204 }, { "epoch": 0.200375, "grad_norm": 3.21875, "grad_norm_var": 0.45494384765625, "learning_rate": 0.0001, "loss": 8.2079, "loss/crossentropy": 2.1566935777664185, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2526049539446831, "step": 3206 }, { "epoch": 0.2005, "grad_norm": 2.796875, "grad_norm_var": 0.45370686848958336, "learning_rate": 0.0001, "loss": 8.2606, "loss/crossentropy": 2.3832950592041016, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26506198942661285, "step": 3208 }, { "epoch": 0.200625, "grad_norm": 3.5625, "grad_norm_var": 0.07243550618489583, "learning_rate": 0.0001, "loss": 8.0725, "loss/crossentropy": 2.2661877870559692, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2683081030845642, "step": 3210 }, { "epoch": 0.20075, "grad_norm": 2.8125, "grad_norm_var": 0.05592041015625, "learning_rate": 0.0001, "loss": 8.2014, "loss/crossentropy": 2.455717086791992, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2620306462049484, "step": 3212 }, { "epoch": 0.200875, "grad_norm": 3.15625, "grad_norm_var": 0.045670572916666666, "learning_rate": 0.0001, "loss": 8.2718, "loss/crossentropy": 2.3754059076309204, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.27477268874645233, "step": 3214 }, { "epoch": 0.201, "grad_norm": 2.8125, "grad_norm_var": 0.042529296875, "learning_rate": 0.0001, "loss": 8.0394, "loss/crossentropy": 2.190205931663513, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2606823891401291, "step": 3216 }, { "epoch": 0.201125, "grad_norm": 2.9375, "grad_norm_var": 0.0420318603515625, "learning_rate": 0.0001, "loss": 8.2165, "loss/crossentropy": 2.421746611595154, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2913060784339905, "step": 3218 }, { "epoch": 0.20125, "grad_norm": 3.140625, "grad_norm_var": 0.0412109375, "learning_rate": 0.0001, "loss": 8.1104, "loss/crossentropy": 2.221086263656616, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25914373993873596, "step": 3220 }, { "epoch": 0.201375, "grad_norm": 2.84375, "grad_norm_var": 0.038863118489583334, "learning_rate": 0.0001, "loss": 8.1592, "loss/crossentropy": 2.4511712789535522, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2737935334444046, "step": 3222 }, { "epoch": 0.2015, "grad_norm": 3.0625, "grad_norm_var": 0.035628255208333334, "learning_rate": 0.0001, "loss": 8.0365, "loss/crossentropy": 2.123769164085388, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.23229189217090607, "step": 3224 }, { "epoch": 0.201625, "grad_norm": 2.796875, "grad_norm_var": 0.017650349934895834, "learning_rate": 0.0001, "loss": 8.0493, "loss/crossentropy": 2.178188681602478, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2590486854314804, "step": 3226 }, { "epoch": 0.20175, "grad_norm": 3.171875, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 7.9211, "loss/crossentropy": 2.4011971950531006, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25986021757125854, "step": 3228 }, { "epoch": 0.201875, "grad_norm": 2.8125, "grad_norm_var": 0.03179931640625, "learning_rate": 0.0001, "loss": 8.2361, "loss/crossentropy": 2.5181671380996704, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2665943503379822, "step": 3230 }, { "epoch": 0.202, "grad_norm": 3.046875, "grad_norm_var": 0.028483072916666668, "learning_rate": 0.0001, "loss": 8.1181, "loss/crossentropy": 2.1627626419067383, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2532324343919754, "step": 3232 }, { "epoch": 0.202125, "grad_norm": 2.90625, "grad_norm_var": 0.0302886962890625, "learning_rate": 0.0001, "loss": 8.0974, "loss/crossentropy": 2.4595226049423218, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2654995620250702, "step": 3234 }, { "epoch": 0.20225, "grad_norm": 2.953125, "grad_norm_var": 0.03790690104166667, "learning_rate": 0.0001, "loss": 7.87, "loss/crossentropy": 2.2840529680252075, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25225698947906494, "step": 3236 }, { "epoch": 0.202375, "grad_norm": 3.0625, "grad_norm_var": 0.03756103515625, "learning_rate": 0.0001, "loss": 8.0822, "loss/crossentropy": 2.3052114248275757, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.254858136177063, "step": 3238 }, { "epoch": 0.2025, "grad_norm": 2.921875, "grad_norm_var": 0.0365234375, "learning_rate": 0.0001, "loss": 8.0767, "loss/crossentropy": 2.3538131713867188, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2715635895729065, "step": 3240 }, { "epoch": 0.202625, "grad_norm": 3.09375, "grad_norm_var": 0.034989420572916666, "learning_rate": 0.0001, "loss": 8.1945, "loss/crossentropy": 2.387078642845154, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.26866745948791504, "step": 3242 }, { "epoch": 0.20275, "grad_norm": 3.109375, "grad_norm_var": 0.035302734375, "learning_rate": 0.0001, "loss": 8.2018, "loss/crossentropy": 2.474520683288574, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26195022463798523, "step": 3244 }, { "epoch": 0.202875, "grad_norm": 2.953125, "grad_norm_var": 0.016569010416666665, "learning_rate": 0.0001, "loss": 8.1921, "loss/crossentropy": 2.494389772415161, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.27728429436683655, "step": 3246 }, { "epoch": 0.203, "grad_norm": 2.96875, "grad_norm_var": 0.0160552978515625, "learning_rate": 0.0001, "loss": 8.2538, "loss/crossentropy": 2.2659478187561035, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2834864854812622, "step": 3248 }, { "epoch": 0.203125, "grad_norm": 2.96875, "grad_norm_var": 0.015067545572916667, "learning_rate": 0.0001, "loss": 8.3668, "loss/crossentropy": 2.5000627040863037, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27600064873695374, "step": 3250 }, { "epoch": 0.20325, "grad_norm": 2.984375, "grad_norm_var": 0.011295572916666666, "learning_rate": 0.0001, "loss": 8.2863, "loss/crossentropy": 2.4377931356430054, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2744172215461731, "step": 3252 }, { "epoch": 0.203375, "grad_norm": 3.140625, "grad_norm_var": 0.011474609375, "learning_rate": 0.0001, "loss": 8.0717, "loss/crossentropy": 2.045997738838196, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2477242648601532, "step": 3254 }, { "epoch": 0.2035, "grad_norm": 3.1875, "grad_norm_var": 0.017552693684895832, "learning_rate": 0.0001, "loss": 8.2171, "loss/crossentropy": 2.2741061449050903, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2541493773460388, "step": 3256 }, { "epoch": 0.203625, "grad_norm": 2.796875, "grad_norm_var": 0.0221588134765625, "learning_rate": 0.0001, "loss": 8.0597, "loss/crossentropy": 2.317218542098999, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26807165145874023, "step": 3258 }, { "epoch": 0.20375, "grad_norm": 2.828125, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 8.1394, "loss/crossentropy": 2.196931838989258, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2481948360800743, "step": 3260 }, { "epoch": 0.203875, "grad_norm": 2.984375, "grad_norm_var": 0.0280670166015625, "learning_rate": 0.0001, "loss": 8.2271, "loss/crossentropy": 2.6477065086364746, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.2857190817594528, "step": 3262 }, { "epoch": 0.204, "grad_norm": 3.203125, "grad_norm_var": 0.032892862955729164, "learning_rate": 0.0001, "loss": 8.1574, "loss/crossentropy": 2.189841389656067, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2925741523504257, "step": 3264 }, { "epoch": 0.204125, "grad_norm": 3.0, "grad_norm_var": 0.03297119140625, "learning_rate": 0.0001, "loss": 7.9733, "loss/crossentropy": 2.350952982902527, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2603686600923538, "step": 3266 }, { "epoch": 0.20425, "grad_norm": 2.890625, "grad_norm_var": 0.0315338134765625, "learning_rate": 0.0001, "loss": 8.1512, "loss/crossentropy": 2.661333441734314, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.26404714584350586, "step": 3268 }, { "epoch": 0.204375, "grad_norm": 3.234375, "grad_norm_var": 0.0362701416015625, "learning_rate": 0.0001, "loss": 8.0497, "loss/crossentropy": 2.4684576988220215, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25150536000728607, "step": 3270 }, { "epoch": 0.2045, "grad_norm": 3.015625, "grad_norm_var": 0.0236236572265625, "learning_rate": 0.0001, "loss": 8.0187, "loss/crossentropy": 2.1839378476142883, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.26790380477905273, "step": 3272 }, { "epoch": 0.204625, "grad_norm": 2.859375, "grad_norm_var": 0.024421183268229167, "learning_rate": 0.0001, "loss": 7.948, "loss/crossentropy": 2.191369652748108, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25659455358982086, "step": 3274 }, { "epoch": 0.20475, "grad_norm": 3.375, "grad_norm_var": 0.034749348958333336, "learning_rate": 0.0001, "loss": 7.9327, "loss/crossentropy": 2.1985403299331665, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27150973677635193, "step": 3276 }, { "epoch": 0.204875, "grad_norm": 2.96875, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 8.1169, "loss/crossentropy": 2.2231366634368896, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26266512274742126, "step": 3278 }, { "epoch": 0.205, "grad_norm": 2.96875, "grad_norm_var": 0.0292633056640625, "learning_rate": 0.0001, "loss": 8.1368, "loss/crossentropy": 2.1812610626220703, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25296615064144135, "step": 3280 }, { "epoch": 0.205125, "grad_norm": 3.125, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 8.0905, "loss/crossentropy": 2.2935560941696167, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.24997325241565704, "step": 3282 }, { "epoch": 0.20525, "grad_norm": 2.96875, "grad_norm_var": 0.035791015625, "learning_rate": 0.0001, "loss": 7.9969, "loss/crossentropy": 2.0980414152145386, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2588716074824333, "step": 3284 }, { "epoch": 0.205375, "grad_norm": 2.828125, "grad_norm_var": 0.037109375, "learning_rate": 0.0001, "loss": 7.9273, "loss/crossentropy": 2.116236090660095, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25013136863708496, "step": 3286 }, { "epoch": 0.2055, "grad_norm": 2.96875, "grad_norm_var": 0.03726806640625, "learning_rate": 0.0001, "loss": 8.0264, "loss/crossentropy": 2.4083213806152344, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27061036229133606, "step": 3288 }, { "epoch": 0.205625, "grad_norm": 3.234375, "grad_norm_var": 0.03951822916666667, "learning_rate": 0.0001, "loss": 8.0898, "loss/crossentropy": 2.4575644731521606, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2591339647769928, "step": 3290 }, { "epoch": 0.20575, "grad_norm": 2.953125, "grad_norm_var": 0.028385416666666666, "learning_rate": 0.0001, "loss": 8.0784, "loss/crossentropy": 2.386906147003174, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2638823390007019, "step": 3292 }, { "epoch": 0.205875, "grad_norm": 3.03125, "grad_norm_var": 0.02633056640625, "learning_rate": 0.0001, "loss": 8.0757, "loss/crossentropy": 2.2036038637161255, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24547383934259415, "step": 3294 }, { "epoch": 0.206, "grad_norm": 2.78125, "grad_norm_var": 0.0339508056640625, "learning_rate": 0.0001, "loss": 7.9217, "loss/crossentropy": 2.3093923330307007, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28410804271698, "step": 3296 }, { "epoch": 0.206125, "grad_norm": 2.796875, "grad_norm_var": 0.0346832275390625, "learning_rate": 0.0001, "loss": 8.0937, "loss/crossentropy": 2.3748456239700317, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26681068539619446, "step": 3298 }, { "epoch": 0.20625, "grad_norm": 2.875, "grad_norm_var": 0.030712890625, "learning_rate": 0.0001, "loss": 8.2334, "loss/crossentropy": 2.5478373765945435, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.2881655991077423, "step": 3300 }, { "epoch": 0.206375, "grad_norm": 2.96875, "grad_norm_var": 0.026984659830729167, "learning_rate": 0.0001, "loss": 8.1905, "loss/crossentropy": 2.2184265851974487, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26013314723968506, "step": 3302 }, { "epoch": 0.2065, "grad_norm": 2.96875, "grad_norm_var": 0.028597005208333335, "learning_rate": 0.0001, "loss": 8.165, "loss/crossentropy": 2.2159677743911743, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2637548893690109, "step": 3304 }, { "epoch": 0.206625, "grad_norm": 3.078125, "grad_norm_var": 0.0255035400390625, "learning_rate": 0.0001, "loss": 8.2577, "loss/crossentropy": 2.555700659751892, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28978103399276733, "step": 3306 }, { "epoch": 0.20675, "grad_norm": 2.9375, "grad_norm_var": 0.024657185872395834, "learning_rate": 0.0001, "loss": 8.0698, "loss/crossentropy": 2.1323585510253906, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.27381235361099243, "step": 3308 }, { "epoch": 0.206875, "grad_norm": 2.953125, "grad_norm_var": 0.02353515625, "learning_rate": 0.0001, "loss": 8.2182, "loss/crossentropy": 2.354387640953064, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27996116876602173, "step": 3310 }, { "epoch": 0.207, "grad_norm": 2.875, "grad_norm_var": 0.015729777018229165, "learning_rate": 0.0001, "loss": 8.2811, "loss/crossentropy": 2.5515745878219604, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2727746069431305, "step": 3312 }, { "epoch": 0.207125, "grad_norm": 2.90625, "grad_norm_var": 0.01539306640625, "learning_rate": 0.0001, "loss": 8.0698, "loss/crossentropy": 2.4017679691314697, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24894237518310547, "step": 3314 }, { "epoch": 0.20725, "grad_norm": 3.078125, "grad_norm_var": 0.016597493489583334, "learning_rate": 0.0001, "loss": 8.2728, "loss/crossentropy": 2.1459991931915283, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2560963034629822, "step": 3316 }, { "epoch": 0.207375, "grad_norm": 2.78125, "grad_norm_var": 0.015397135416666667, "learning_rate": 0.0001, "loss": 7.9846, "loss/crossentropy": 2.299985408782959, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26821835339069366, "step": 3318 }, { "epoch": 0.2075, "grad_norm": 2.734375, "grad_norm_var": 0.020173136393229166, "learning_rate": 0.0001, "loss": 7.7324, "loss/crossentropy": 2.0270864367485046, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24157769232988358, "step": 3320 }, { "epoch": 0.207625, "grad_norm": 2.96875, "grad_norm_var": 0.014676920572916667, "learning_rate": 0.0001, "loss": 8.0535, "loss/crossentropy": 2.2521005868911743, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24373552203178406, "step": 3322 }, { "epoch": 0.20775, "grad_norm": 2.921875, "grad_norm_var": 0.014925130208333333, "learning_rate": 0.0001, "loss": 8.0202, "loss/crossentropy": 2.2878029346466064, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25381772220134735, "step": 3324 }, { "epoch": 0.207875, "grad_norm": 2.9375, "grad_norm_var": 0.014826456705729166, "learning_rate": 0.0001, "loss": 7.96, "loss/crossentropy": 2.2602121829986572, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.26364803314208984, "step": 3326 }, { "epoch": 0.208, "grad_norm": 2.890625, "grad_norm_var": 0.01470947265625, "learning_rate": 0.0001, "loss": 8.0367, "loss/crossentropy": 2.48467481136322, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24693025648593903, "step": 3328 }, { "epoch": 0.208125, "grad_norm": 3.421875, "grad_norm_var": 0.0357421875, "learning_rate": 0.0001, "loss": 8.2808, "loss/crossentropy": 2.243489623069763, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28045617043972015, "step": 3330 }, { "epoch": 0.20825, "grad_norm": 3.015625, "grad_norm_var": 0.0317779541015625, "learning_rate": 0.0001, "loss": 8.2617, "loss/crossentropy": 2.4428763389587402, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.27135811746120453, "step": 3332 }, { "epoch": 0.208375, "grad_norm": 2.765625, "grad_norm_var": 0.0314453125, "learning_rate": 0.0001, "loss": 8.1175, "loss/crossentropy": 2.453363060951233, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2776189148426056, "step": 3334 }, { "epoch": 0.2085, "grad_norm": 2.96875, "grad_norm_var": 0.0255859375, "learning_rate": 0.0001, "loss": 8.1894, "loss/crossentropy": 2.3482531309127808, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2623911052942276, "step": 3336 }, { "epoch": 0.208625, "grad_norm": 3.546875, "grad_norm_var": 0.047638956705729166, "learning_rate": 0.0001, "loss": 8.2654, "loss/crossentropy": 2.3092031478881836, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28854168951511383, "step": 3338 }, { "epoch": 0.20875, "grad_norm": 2.9375, "grad_norm_var": 0.0513824462890625, "learning_rate": 0.0001, "loss": 8.0428, "loss/crossentropy": 2.1508899927139282, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2546079680323601, "step": 3340 }, { "epoch": 0.208875, "grad_norm": 2.890625, "grad_norm_var": 0.050732421875, "learning_rate": 0.0001, "loss": 8.2438, "loss/crossentropy": 2.3433796167373657, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.26718689501285553, "step": 3342 }, { "epoch": 0.209, "grad_norm": 3.015625, "grad_norm_var": 0.048127237955729166, "learning_rate": 0.0001, "loss": 7.8111, "loss/crossentropy": 2.229587435722351, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25351928174495697, "step": 3344 }, { "epoch": 0.209125, "grad_norm": 3.03125, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 8.0189, "loss/crossentropy": 2.3170838356018066, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2664952799677849, "step": 3346 }, { "epoch": 0.20925, "grad_norm": 3.125, "grad_norm_var": 0.038899739583333336, "learning_rate": 0.0001, "loss": 8.2421, "loss/crossentropy": 2.2571229934692383, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.270868718624115, "step": 3348 }, { "epoch": 0.209375, "grad_norm": 2.9375, "grad_norm_var": 0.0376129150390625, "learning_rate": 0.0001, "loss": 8.1379, "loss/crossentropy": 2.4776333570480347, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25770139694213867, "step": 3350 }, { "epoch": 0.2095, "grad_norm": 2.875, "grad_norm_var": 0.0370269775390625, "learning_rate": 0.0001, "loss": 8.1088, "loss/crossentropy": 2.157105803489685, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26475150883197784, "step": 3352 }, { "epoch": 0.209625, "grad_norm": 2.953125, "grad_norm_var": 0.037495930989583336, "learning_rate": 0.0001, "loss": 8.2263, "loss/crossentropy": 2.4708096981048584, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27709396183490753, "step": 3354 }, { "epoch": 0.20975, "grad_norm": 2.84375, "grad_norm_var": 0.035863240559895836, "learning_rate": 0.0001, "loss": 8.0677, "loss/crossentropy": 2.376826286315918, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2647465467453003, "step": 3356 }, { "epoch": 0.209875, "grad_norm": 2.9375, "grad_norm_var": 0.03557535807291667, "learning_rate": 0.0001, "loss": 8.1983, "loss/crossentropy": 2.3289496898651123, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2608029991388321, "step": 3358 }, { "epoch": 0.21, "grad_norm": 2.75, "grad_norm_var": 0.04039306640625, "learning_rate": 0.0001, "loss": 8.0714, "loss/crossentropy": 2.345235228538513, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26208456605672836, "step": 3360 }, { "epoch": 0.210125, "grad_norm": 2.875, "grad_norm_var": 0.038483683268229166, "learning_rate": 0.0001, "loss": 8.1277, "loss/crossentropy": 2.259727954864502, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2612561881542206, "step": 3362 }, { "epoch": 0.21025, "grad_norm": 2.828125, "grad_norm_var": 0.04000244140625, "learning_rate": 0.0001, "loss": 8.0557, "loss/crossentropy": 2.2856264114379883, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2503708600997925, "step": 3364 }, { "epoch": 0.210375, "grad_norm": 2.84375, "grad_norm_var": 0.03703511555989583, "learning_rate": 0.0001, "loss": 7.9275, "loss/crossentropy": 2.3697084188461304, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24406228959560394, "step": 3366 }, { "epoch": 0.2105, "grad_norm": 2.890625, "grad_norm_var": 0.0388092041015625, "learning_rate": 0.0001, "loss": 7.9977, "loss/crossentropy": 2.3035097122192383, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25051476061344147, "step": 3368 }, { "epoch": 0.210625, "grad_norm": 2.96875, "grad_norm_var": 0.01090087890625, "learning_rate": 0.0001, "loss": 8.2405, "loss/crossentropy": 2.349913001060486, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.3001957833766937, "step": 3370 }, { "epoch": 0.21075, "grad_norm": 3.09375, "grad_norm_var": 0.0122955322265625, "learning_rate": 0.0001, "loss": 8.223, "loss/crossentropy": 2.424271821975708, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2743752747774124, "step": 3372 }, { "epoch": 0.210875, "grad_norm": 2.765625, "grad_norm_var": 0.013798014322916666, "learning_rate": 0.0001, "loss": 8.0953, "loss/crossentropy": 2.3287577629089355, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.27096617221832275, "step": 3374 }, { "epoch": 0.211, "grad_norm": 3.171875, "grad_norm_var": 0.016901652018229168, "learning_rate": 0.0001, "loss": 8.1759, "loss/crossentropy": 2.4276020526885986, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25759419798851013, "step": 3376 }, { "epoch": 0.211125, "grad_norm": 2.90625, "grad_norm_var": 0.015265909830729167, "learning_rate": 0.0001, "loss": 7.8948, "loss/crossentropy": 2.512119174003601, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24548564851284027, "step": 3378 }, { "epoch": 0.21125, "grad_norm": 2.734375, "grad_norm_var": 0.014411417643229167, "learning_rate": 0.0001, "loss": 8.1024, "loss/crossentropy": 2.285138726234436, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2538295388221741, "step": 3380 }, { "epoch": 0.211375, "grad_norm": 2.765625, "grad_norm_var": 0.020067342122395835, "learning_rate": 0.0001, "loss": 8.3056, "loss/crossentropy": 2.543992042541504, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26885660737752914, "step": 3382 }, { "epoch": 0.2115, "grad_norm": 3.328125, "grad_norm_var": 0.02701416015625, "learning_rate": 0.0001, "loss": 7.9994, "loss/crossentropy": 2.5173155069351196, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.26409924030303955, "step": 3384 }, { "epoch": 0.211625, "grad_norm": 2.828125, "grad_norm_var": 0.029157511393229165, "learning_rate": 0.0001, "loss": 8.0691, "loss/crossentropy": 2.1601486206054688, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24972151964902878, "step": 3386 }, { "epoch": 0.21175, "grad_norm": 3.109375, "grad_norm_var": 0.031083170572916666, "learning_rate": 0.0001, "loss": 7.9928, "loss/crossentropy": 2.074127435684204, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2501622885465622, "step": 3388 }, { "epoch": 0.211875, "grad_norm": 3.09375, "grad_norm_var": 0.02877197265625, "learning_rate": 0.0001, "loss": 8.0324, "loss/crossentropy": 2.1279070377349854, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24174269288778305, "step": 3390 }, { "epoch": 0.212, "grad_norm": 2.765625, "grad_norm_var": 0.0286529541015625, "learning_rate": 0.0001, "loss": 8.2032, "loss/crossentropy": 2.4869847297668457, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26013004779815674, "step": 3392 }, { "epoch": 0.212125, "grad_norm": 2.84375, "grad_norm_var": 0.04604390462239583, "learning_rate": 0.0001, "loss": 8.2452, "loss/crossentropy": 2.279478073120117, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25594570487737656, "step": 3394 }, { "epoch": 0.21225, "grad_norm": 3.09375, "grad_norm_var": 0.044408162434895836, "learning_rate": 0.0001, "loss": 8.1751, "loss/crossentropy": 2.32720410823822, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.2753504514694214, "step": 3396 }, { "epoch": 0.212375, "grad_norm": 2.890625, "grad_norm_var": 0.039159138997395836, "learning_rate": 0.0001, "loss": 8.0944, "loss/crossentropy": 2.517144560813904, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2721696197986603, "step": 3398 }, { "epoch": 0.2125, "grad_norm": 3.09375, "grad_norm_var": 0.0320465087890625, "learning_rate": 0.0001, "loss": 8.204, "loss/crossentropy": 2.1578654050827026, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2626515179872513, "step": 3400 }, { "epoch": 0.212625, "grad_norm": 2.84375, "grad_norm_var": 0.030663045247395833, "learning_rate": 0.0001, "loss": 8.0256, "loss/crossentropy": 2.3374911546707153, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2765570878982544, "step": 3402 }, { "epoch": 0.21275, "grad_norm": 2.796875, "grad_norm_var": 0.03200581868489583, "learning_rate": 0.0001, "loss": 7.8734, "loss/crossentropy": 2.1568061113357544, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23823237419128418, "step": 3404 }, { "epoch": 0.212875, "grad_norm": 2.828125, "grad_norm_var": 0.032548014322916666, "learning_rate": 0.0001, "loss": 8.0153, "loss/crossentropy": 2.302879214286804, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2629336416721344, "step": 3406 }, { "epoch": 0.213, "grad_norm": 3.0, "grad_norm_var": 0.03235270182291667, "learning_rate": 0.0001, "loss": 8.0735, "loss/crossentropy": 2.365081310272217, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2782520353794098, "step": 3408 }, { "epoch": 0.213125, "grad_norm": 2.9375, "grad_norm_var": 0.013667805989583334, "learning_rate": 0.0001, "loss": 8.1104, "loss/crossentropy": 2.244347095489502, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24217111617326736, "step": 3410 }, { "epoch": 0.21325, "grad_norm": 2.984375, "grad_norm_var": 0.011937459309895834, "learning_rate": 0.0001, "loss": 8.1616, "loss/crossentropy": 2.3036710023880005, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.24564654380083084, "step": 3412 }, { "epoch": 0.213375, "grad_norm": 3.015625, "grad_norm_var": 0.012043253580729166, "learning_rate": 0.0001, "loss": 7.9978, "loss/crossentropy": 2.301898717880249, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.23626847565174103, "step": 3414 }, { "epoch": 0.2135, "grad_norm": 2.84375, "grad_norm_var": 0.0104156494140625, "learning_rate": 0.0001, "loss": 7.7726, "loss/crossentropy": 1.9760905504226685, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2343737632036209, "step": 3416 }, { "epoch": 0.213625, "grad_norm": 2.734375, "grad_norm_var": 0.012333170572916666, "learning_rate": 0.0001, "loss": 7.9653, "loss/crossentropy": 2.24316668510437, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26175426691770554, "step": 3418 }, { "epoch": 0.21375, "grad_norm": 2.890625, "grad_norm_var": 0.0106842041015625, "learning_rate": 0.0001, "loss": 7.8672, "loss/crossentropy": 2.237221360206604, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.26167693734169006, "step": 3420 }, { "epoch": 0.213875, "grad_norm": 3.1875, "grad_norm_var": 0.014518229166666667, "learning_rate": 0.0001, "loss": 8.1044, "loss/crossentropy": 2.649104952812195, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28041093051433563, "step": 3422 }, { "epoch": 0.214, "grad_norm": 2.703125, "grad_norm_var": 0.017552693684895832, "learning_rate": 0.0001, "loss": 7.7207, "loss/crossentropy": 2.257661819458008, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25010478496551514, "step": 3424 }, { "epoch": 0.214125, "grad_norm": 3.125, "grad_norm_var": 0.020213826497395834, "learning_rate": 0.0001, "loss": 8.1403, "loss/crossentropy": 2.2480964064598083, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.25986164808273315, "step": 3426 }, { "epoch": 0.21425, "grad_norm": 2.6875, "grad_norm_var": 0.022391764322916667, "learning_rate": 0.0001, "loss": 7.915, "loss/crossentropy": 2.3254162073135376, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25972771644592285, "step": 3428 }, { "epoch": 0.214375, "grad_norm": 3.0, "grad_norm_var": 0.023111979166666668, "learning_rate": 0.0001, "loss": 7.9769, "loss/crossentropy": 2.129208207130432, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24452001601457596, "step": 3430 }, { "epoch": 0.2145, "grad_norm": 2.71875, "grad_norm_var": 0.024779256184895834, "learning_rate": 0.0001, "loss": 7.8228, "loss/crossentropy": 2.2781134843826294, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24319136142730713, "step": 3432 }, { "epoch": 0.214625, "grad_norm": 2.890625, "grad_norm_var": 0.0223785400390625, "learning_rate": 0.0001, "loss": 8.1741, "loss/crossentropy": 2.2903696298599243, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2773251011967659, "step": 3434 }, { "epoch": 0.21475, "grad_norm": 2.84375, "grad_norm_var": 0.022509765625, "learning_rate": 0.0001, "loss": 8.1499, "loss/crossentropy": 2.368951678276062, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26731523871421814, "step": 3436 }, { "epoch": 0.214875, "grad_norm": 3.25, "grad_norm_var": 0.026634724934895833, "learning_rate": 0.0001, "loss": 8.2168, "loss/crossentropy": 2.4735910892486572, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2662598043680191, "step": 3438 }, { "epoch": 0.215, "grad_norm": 2.875, "grad_norm_var": 0.022770182291666666, "learning_rate": 0.0001, "loss": 8.229, "loss/crossentropy": 2.4794031381607056, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2601815089583397, "step": 3440 }, { "epoch": 0.215125, "grad_norm": 2.9375, "grad_norm_var": 0.019124348958333332, "learning_rate": 0.0001, "loss": 8.1565, "loss/crossentropy": 2.5665329694747925, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2701922208070755, "step": 3442 }, { "epoch": 0.21525, "grad_norm": 3.03125, "grad_norm_var": 0.01568603515625, "learning_rate": 0.0001, "loss": 8.2036, "loss/crossentropy": 2.5179523229599, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25429390370845795, "step": 3444 }, { "epoch": 0.215375, "grad_norm": 2.921875, "grad_norm_var": 0.015217081705729166, "learning_rate": 0.0001, "loss": 8.1567, "loss/crossentropy": 2.416312336921692, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.28215354681015015, "step": 3446 }, { "epoch": 0.2155, "grad_norm": 2.8125, "grad_norm_var": 0.015070597330729166, "learning_rate": 0.0001, "loss": 8.1865, "loss/crossentropy": 2.6812586784362793, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2766028195619583, "step": 3448 }, { "epoch": 0.215625, "grad_norm": 2.859375, "grad_norm_var": 0.018485514322916667, "learning_rate": 0.0001, "loss": 8.1219, "loss/crossentropy": 2.3808157444000244, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.28973422944545746, "step": 3450 }, { "epoch": 0.21575, "grad_norm": 3.03125, "grad_norm_var": 0.019331868489583334, "learning_rate": 0.0001, "loss": 8.2494, "loss/crossentropy": 2.3658465147018433, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.23564206808805466, "step": 3452 }, { "epoch": 0.215875, "grad_norm": 3.03125, "grad_norm_var": 0.016292317708333334, "learning_rate": 0.0001, "loss": 8.0049, "loss/crossentropy": 2.420639157295227, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2846282720565796, "step": 3454 }, { "epoch": 0.216, "grad_norm": 2.765625, "grad_norm_var": 0.024592081705729168, "learning_rate": 0.0001, "loss": 8.2144, "loss/crossentropy": 2.3309932947158813, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25562766194343567, "step": 3456 }, { "epoch": 0.216125, "grad_norm": 3.046875, "grad_norm_var": 0.0251617431640625, "learning_rate": 0.0001, "loss": 7.777, "loss/crossentropy": 2.1230533719062805, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2454065978527069, "step": 3458 }, { "epoch": 0.21625, "grad_norm": 2.890625, "grad_norm_var": 0.025129191080729165, "learning_rate": 0.0001, "loss": 7.8995, "loss/crossentropy": 2.214825987815857, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2496349811553955, "step": 3460 }, { "epoch": 0.216375, "grad_norm": 3.09375, "grad_norm_var": 0.025861612955729165, "learning_rate": 0.0001, "loss": 8.1565, "loss/crossentropy": 2.5217314958572388, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2674577683210373, "step": 3462 }, { "epoch": 0.2165, "grad_norm": 2.734375, "grad_norm_var": 0.028343709309895833, "learning_rate": 0.0001, "loss": 8.1601, "loss/crossentropy": 2.164268136024475, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2537472993135452, "step": 3464 }, { "epoch": 0.216625, "grad_norm": 2.875, "grad_norm_var": 0.0252349853515625, "learning_rate": 0.0001, "loss": 8.0046, "loss/crossentropy": 2.4043670892715454, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25185615569353104, "step": 3466 }, { "epoch": 0.21675, "grad_norm": 2.84375, "grad_norm_var": 0.0302642822265625, "learning_rate": 0.0001, "loss": 7.6261, "loss/crossentropy": 2.0687937140464783, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23499736189842224, "step": 3468 }, { "epoch": 0.216875, "grad_norm": 3.015625, "grad_norm_var": 0.028205362955729167, "learning_rate": 0.0001, "loss": 7.869, "loss/crossentropy": 2.3043712377548218, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24541200697422028, "step": 3470 }, { "epoch": 0.217, "grad_norm": 3.0625, "grad_norm_var": 0.021637980143229166, "learning_rate": 0.0001, "loss": 8.1061, "loss/crossentropy": 2.347123861312866, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26428553462028503, "step": 3472 }, { "epoch": 0.217125, "grad_norm": 3.046875, "grad_norm_var": 0.021483357747395834, "learning_rate": 0.0001, "loss": 8.0869, "loss/crossentropy": 2.2847089767456055, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2628851383924484, "step": 3474 }, { "epoch": 0.21725, "grad_norm": 2.609375, "grad_norm_var": 0.027179972330729166, "learning_rate": 0.0001, "loss": 7.8983, "loss/crossentropy": 2.110204815864563, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2360912710428238, "step": 3476 }, { "epoch": 0.217375, "grad_norm": 3.15625, "grad_norm_var": 0.027213541666666667, "learning_rate": 0.0001, "loss": 8.1178, "loss/crossentropy": 2.3014683723449707, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25195126235485077, "step": 3478 }, { "epoch": 0.2175, "grad_norm": 3.09375, "grad_norm_var": 0.0236328125, "learning_rate": 0.0001, "loss": 8.0819, "loss/crossentropy": 2.4000685811042786, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25215842574834824, "step": 3480 }, { "epoch": 0.217625, "grad_norm": 2.75, "grad_norm_var": 0.028385416666666666, "learning_rate": 0.0001, "loss": 7.8973, "loss/crossentropy": 2.091042637825012, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.24015626311302185, "step": 3482 }, { "epoch": 0.21775, "grad_norm": 2.890625, "grad_norm_var": 0.025553385416666668, "learning_rate": 0.0001, "loss": 8.0738, "loss/crossentropy": 2.3160455226898193, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26687709987163544, "step": 3484 }, { "epoch": 0.217875, "grad_norm": 2.953125, "grad_norm_var": 0.0260406494140625, "learning_rate": 0.0001, "loss": 8.2908, "loss/crossentropy": 2.410978317260742, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27568891644477844, "step": 3486 }, { "epoch": 0.218, "grad_norm": 3.046875, "grad_norm_var": 0.02578125, "learning_rate": 0.0001, "loss": 8.3535, "loss/crossentropy": 2.2229000329971313, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2667589634656906, "step": 3488 }, { "epoch": 0.218125, "grad_norm": 3.03125, "grad_norm_var": 0.02955322265625, "learning_rate": 0.0001, "loss": 8.1352, "loss/crossentropy": 2.2095683217048645, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26982176303863525, "step": 3490 }, { "epoch": 0.21825, "grad_norm": 2.84375, "grad_norm_var": 0.0224273681640625, "learning_rate": 0.0001, "loss": 7.9164, "loss/crossentropy": 2.281849980354309, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.25118981301784515, "step": 3492 }, { "epoch": 0.218375, "grad_norm": 3.140625, "grad_norm_var": 0.021708170572916668, "learning_rate": 0.0001, "loss": 7.9892, "loss/crossentropy": 2.229013442993164, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2572949305176735, "step": 3494 }, { "epoch": 0.2185, "grad_norm": 2.859375, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 8.2191, "loss/crossentropy": 2.1624966859817505, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2592136114835739, "step": 3496 }, { "epoch": 0.218625, "grad_norm": 2.8125, "grad_norm_var": 0.019343058268229168, "learning_rate": 0.0001, "loss": 8.1435, "loss/crossentropy": 2.446608304977417, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25450390577316284, "step": 3498 }, { "epoch": 0.21875, "grad_norm": 2.859375, "grad_norm_var": 0.0186920166015625, "learning_rate": 0.0001, "loss": 7.7994, "loss/crossentropy": 2.082264542579651, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23312175273895264, "step": 3500 }, { "epoch": 0.218875, "grad_norm": 3.171875, "grad_norm_var": 0.024347941080729168, "learning_rate": 0.0001, "loss": 7.9793, "loss/crossentropy": 2.3270751237869263, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24587048590183258, "step": 3502 }, { "epoch": 0.219, "grad_norm": 2.9375, "grad_norm_var": 0.023270670572916666, "learning_rate": 0.0001, "loss": 8.2429, "loss/crossentropy": 2.5829883813858032, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.2657495513558388, "step": 3504 }, { "epoch": 0.219125, "grad_norm": 2.859375, "grad_norm_var": 0.023786417643229165, "learning_rate": 0.0001, "loss": 7.9588, "loss/crossentropy": 2.324189066886902, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2597867324948311, "step": 3506 }, { "epoch": 0.21925, "grad_norm": 2.96875, "grad_norm_var": 0.02359619140625, "learning_rate": 0.0001, "loss": 8.0548, "loss/crossentropy": 2.2501312494277954, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2503369599580765, "step": 3508 }, { "epoch": 0.219375, "grad_norm": 2.9375, "grad_norm_var": 0.0199371337890625, "learning_rate": 0.0001, "loss": 8.0606, "loss/crossentropy": 2.3427246809005737, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25106480717658997, "step": 3510 }, { "epoch": 0.2195, "grad_norm": 3.078125, "grad_norm_var": 0.019391886393229165, "learning_rate": 0.0001, "loss": 8.0356, "loss/crossentropy": 2.483175754547119, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27612486481666565, "step": 3512 }, { "epoch": 0.219625, "grad_norm": 2.75, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 8.1657, "loss/crossentropy": 2.44490385055542, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27590498328208923, "step": 3514 }, { "epoch": 0.21975, "grad_norm": 2.921875, "grad_norm_var": 0.022005208333333335, "learning_rate": 0.0001, "loss": 8.068, "loss/crossentropy": 2.4559766054153442, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24495191872119904, "step": 3516 }, { "epoch": 0.219875, "grad_norm": 2.859375, "grad_norm_var": 0.014957682291666666, "learning_rate": 0.0001, "loss": 7.8848, "loss/crossentropy": 2.1902081966400146, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2486228197813034, "step": 3518 }, { "epoch": 0.22, "grad_norm": 2.8125, "grad_norm_var": 0.014253743489583333, "learning_rate": 0.0001, "loss": 8.0384, "loss/crossentropy": 2.4357227087020874, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.23928922414779663, "step": 3520 }, { "epoch": 0.220125, "grad_norm": 2.796875, "grad_norm_var": 0.010986328125, "learning_rate": 0.0001, "loss": 8.1229, "loss/crossentropy": 2.2222291231155396, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.27725693583488464, "step": 3522 }, { "epoch": 0.22025, "grad_norm": 3.046875, "grad_norm_var": 0.012116495768229167, "learning_rate": 0.0001, "loss": 7.9818, "loss/crossentropy": 2.5780692100524902, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2518724948167801, "step": 3524 }, { "epoch": 0.220375, "grad_norm": 3.03125, "grad_norm_var": 0.014045206705729167, "learning_rate": 0.0001, "loss": 8.0047, "loss/crossentropy": 2.57025945186615, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.262205608189106, "step": 3526 }, { "epoch": 0.2205, "grad_norm": 2.765625, "grad_norm_var": 0.013719685872395833, "learning_rate": 0.0001, "loss": 7.8127, "loss/crossentropy": 2.07043993473053, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23150594532489777, "step": 3528 }, { "epoch": 0.220625, "grad_norm": 3.046875, "grad_norm_var": 0.01549072265625, "learning_rate": 0.0001, "loss": 8.2683, "loss/crossentropy": 2.4276788234710693, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2603815943002701, "step": 3530 }, { "epoch": 0.22075, "grad_norm": 2.78125, "grad_norm_var": 0.0149078369140625, "learning_rate": 0.0001, "loss": 8.0993, "loss/crossentropy": 2.1250449419021606, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2265177145600319, "step": 3532 }, { "epoch": 0.220875, "grad_norm": 3.046875, "grad_norm_var": 0.01607666015625, "learning_rate": 0.0001, "loss": 8.0249, "loss/crossentropy": 2.3386144638061523, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.274929404258728, "step": 3534 }, { "epoch": 0.221, "grad_norm": 3.078125, "grad_norm_var": 0.018488566080729168, "learning_rate": 0.0001, "loss": 8.2108, "loss/crossentropy": 2.406868577003479, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2587934732437134, "step": 3536 }, { "epoch": 0.221125, "grad_norm": 2.859375, "grad_norm_var": 0.0168121337890625, "learning_rate": 0.0001, "loss": 8.0413, "loss/crossentropy": 2.4491130113601685, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23888002336025238, "step": 3538 }, { "epoch": 0.22125, "grad_norm": 2.6875, "grad_norm_var": 0.023893229166666665, "learning_rate": 0.0001, "loss": 8.123, "loss/crossentropy": 2.2878233194351196, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.265165239572525, "step": 3540 }, { "epoch": 0.221375, "grad_norm": 3.171875, "grad_norm_var": 0.028902180989583335, "learning_rate": 0.0001, "loss": 8.0388, "loss/crossentropy": 2.3898258209228516, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2545909136533737, "step": 3542 }, { "epoch": 0.2215, "grad_norm": 2.765625, "grad_norm_var": 0.027213541666666667, "learning_rate": 0.0001, "loss": 8.0257, "loss/crossentropy": 2.4284157752990723, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27379511296749115, "step": 3544 }, { "epoch": 0.221625, "grad_norm": 2.875, "grad_norm_var": 0.025031534830729167, "learning_rate": 0.0001, "loss": 7.9438, "loss/crossentropy": 2.2095457315444946, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.248259037733078, "step": 3546 }, { "epoch": 0.22175, "grad_norm": 2.953125, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 7.9915, "loss/crossentropy": 2.4044177532196045, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25886303186416626, "step": 3548 }, { "epoch": 0.221875, "grad_norm": 2.625, "grad_norm_var": 0.0283203125, "learning_rate": 0.0001, "loss": 7.9139, "loss/crossentropy": 2.2689582109451294, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24590185284614563, "step": 3550 }, { "epoch": 0.222, "grad_norm": 2.765625, "grad_norm_var": 0.0251373291015625, "learning_rate": 0.0001, "loss": 7.8249, "loss/crossentropy": 2.3793697357177734, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2494877427816391, "step": 3552 }, { "epoch": 0.222125, "grad_norm": 2.921875, "grad_norm_var": 0.0253570556640625, "learning_rate": 0.0001, "loss": 8.0842, "loss/crossentropy": 2.204411268234253, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.240115225315094, "step": 3554 }, { "epoch": 0.22225, "grad_norm": 2.953125, "grad_norm_var": 0.020970662434895832, "learning_rate": 0.0001, "loss": 8.0583, "loss/crossentropy": 2.307512044906616, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.23759452998638153, "step": 3556 }, { "epoch": 0.222375, "grad_norm": 2.828125, "grad_norm_var": 0.044611612955729164, "learning_rate": 0.0001, "loss": 7.8452, "loss/crossentropy": 2.4004819989204407, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.28139108419418335, "step": 3558 }, { "epoch": 0.2225, "grad_norm": 2.984375, "grad_norm_var": 0.0453125, "learning_rate": 0.0001, "loss": 7.9175, "loss/crossentropy": 2.321327328681946, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2644403874874115, "step": 3560 }, { "epoch": 0.222625, "grad_norm": 2.8125, "grad_norm_var": 0.0475738525390625, "learning_rate": 0.0001, "loss": 7.9732, "loss/crossentropy": 2.495920181274414, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2554667443037033, "step": 3562 }, { "epoch": 0.22275, "grad_norm": 3.3125, "grad_norm_var": 0.057291666666666664, "learning_rate": 0.0001, "loss": 8.1036, "loss/crossentropy": 2.382394790649414, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26405252516269684, "step": 3564 }, { "epoch": 0.222875, "grad_norm": 3.859375, "grad_norm_var": 0.11291910807291666, "learning_rate": 0.0001, "loss": 8.3047, "loss/crossentropy": 2.2396143674850464, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2523963898420334, "step": 3566 }, { "epoch": 0.223, "grad_norm": 2.875, "grad_norm_var": 0.09706624348958333, "learning_rate": 0.0001, "loss": 8.1758, "loss/crossentropy": 2.323571801185608, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24818182736635208, "step": 3568 }, { "epoch": 0.223125, "grad_norm": 2.96875, "grad_norm_var": 0.09746805826822917, "learning_rate": 0.0001, "loss": 8.2098, "loss/crossentropy": 2.2471351623535156, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2540801614522934, "step": 3570 }, { "epoch": 0.22325, "grad_norm": 2.84375, "grad_norm_var": 0.1020904541015625, "learning_rate": 0.0001, "loss": 8.1846, "loss/crossentropy": 2.622172474861145, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2584248036146164, "step": 3572 }, { "epoch": 0.223375, "grad_norm": 2.75, "grad_norm_var": 0.09426167805989584, "learning_rate": 0.0001, "loss": 8.0865, "loss/crossentropy": 2.5120197534561157, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25580984354019165, "step": 3574 }, { "epoch": 0.2235, "grad_norm": 2.75, "grad_norm_var": 0.09970296223958333, "learning_rate": 0.0001, "loss": 8.0353, "loss/crossentropy": 2.3647409677505493, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2629951015114784, "step": 3576 }, { "epoch": 0.223625, "grad_norm": 2.9375, "grad_norm_var": 0.0968170166015625, "learning_rate": 0.0001, "loss": 8.0964, "loss/crossentropy": 2.225765824317932, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27005481719970703, "step": 3578 }, { "epoch": 0.22375, "grad_norm": 2.78125, "grad_norm_var": 0.0942047119140625, "learning_rate": 0.0001, "loss": 7.9496, "loss/crossentropy": 2.172963261604309, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2738679349422455, "step": 3580 }, { "epoch": 0.223875, "grad_norm": 3.0, "grad_norm_var": 0.012776692708333334, "learning_rate": 0.0001, "loss": 8.1431, "loss/crossentropy": 2.360185146331787, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2576560080051422, "step": 3582 }, { "epoch": 0.224, "grad_norm": 3.03125, "grad_norm_var": 0.010477701822916666, "learning_rate": 0.0001, "loss": 7.9726, "loss/crossentropy": 2.034193217754364, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24234963953495026, "step": 3584 }, { "epoch": 0.224125, "grad_norm": 2.890625, "grad_norm_var": 0.010640462239583334, "learning_rate": 0.0001, "loss": 7.6952, "loss/crossentropy": 2.266782283782959, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24584876000881195, "step": 3586 }, { "epoch": 0.22425, "grad_norm": 3.515625, "grad_norm_var": 1.381476847330729, "learning_rate": 0.0001, "loss": 8.4263, "loss/crossentropy": 2.3165758848190308, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23391347378492355, "step": 3588 }, { "epoch": 0.224375, "grad_norm": 3.4375, "grad_norm_var": 1.3608876546223958, "learning_rate": 0.0001, "loss": 8.1828, "loss/crossentropy": 2.5059953927993774, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.28126345574855804, "step": 3590 }, { "epoch": 0.2245, "grad_norm": 3.03125, "grad_norm_var": 1.338703409830729, "learning_rate": 0.0001, "loss": 8.1214, "loss/crossentropy": 2.365064263343811, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27421511709690094, "step": 3592 }, { "epoch": 0.224625, "grad_norm": 2.90625, "grad_norm_var": 1.3280019124348958, "learning_rate": 0.0001, "loss": 8.0988, "loss/crossentropy": 2.4384506940841675, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2522090822458267, "step": 3594 }, { "epoch": 0.22475, "grad_norm": 2.96875, "grad_norm_var": 1.304955037434896, "learning_rate": 0.0001, "loss": 8.1116, "loss/crossentropy": 2.1527076959609985, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24413420259952545, "step": 3596 }, { "epoch": 0.224875, "grad_norm": 3.140625, "grad_norm_var": 1.301513671875, "learning_rate": 0.0001, "loss": 8.1075, "loss/crossentropy": 2.2478511333465576, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2579783499240875, "step": 3598 }, { "epoch": 0.225, "grad_norm": 3.046875, "grad_norm_var": 1.307331339518229, "learning_rate": 0.0001, "loss": 7.9146, "loss/crossentropy": 2.396006226539612, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.24864860624074936, "step": 3600 }, { "epoch": 0.225125, "grad_norm": 2.875, "grad_norm_var": 1.31002197265625, "learning_rate": 0.0001, "loss": 8.2544, "loss/crossentropy": 2.6450746059417725, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2689923644065857, "step": 3602 }, { "epoch": 0.22525, "grad_norm": 2.75, "grad_norm_var": 0.05621744791666667, "learning_rate": 0.0001, "loss": 7.7334, "loss/crossentropy": 2.1939942836761475, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.23875074833631516, "step": 3604 }, { "epoch": 0.225375, "grad_norm": 2.78125, "grad_norm_var": 0.013114420572916667, "learning_rate": 0.0001, "loss": 7.9962, "loss/crossentropy": 2.20789635181427, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.25146640837192535, "step": 3606 }, { "epoch": 0.2255, "grad_norm": 2.953125, "grad_norm_var": 0.011832682291666667, "learning_rate": 0.0001, "loss": 8.0575, "loss/crossentropy": 2.2489298582077026, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24663664400577545, "step": 3608 }, { "epoch": 0.225625, "grad_norm": 3.078125, "grad_norm_var": 0.0134674072265625, "learning_rate": 0.0001, "loss": 8.0739, "loss/crossentropy": 2.153047263622284, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.25803135335445404, "step": 3610 }, { "epoch": 0.22575, "grad_norm": 3.0625, "grad_norm_var": 0.0144683837890625, "learning_rate": 0.0001, "loss": 8.231, "loss/crossentropy": 2.17412793636322, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2347177267074585, "step": 3612 }, { "epoch": 0.225875, "grad_norm": 2.96875, "grad_norm_var": 0.011263020833333333, "learning_rate": 0.0001, "loss": 8.0072, "loss/crossentropy": 2.55859112739563, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.25972864031791687, "step": 3614 }, { "epoch": 0.226, "grad_norm": 2.796875, "grad_norm_var": 0.011714680989583334, "learning_rate": 0.0001, "loss": 8.0068, "loss/crossentropy": 2.537785530090332, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2541259229183197, "step": 3616 }, { "epoch": 0.226125, "grad_norm": 2.953125, "grad_norm_var": 0.0119140625, "learning_rate": 0.0001, "loss": 7.9701, "loss/crossentropy": 2.1381853818893433, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2583460807800293, "step": 3618 }, { "epoch": 0.22625, "grad_norm": 2.859375, "grad_norm_var": 0.010383097330729167, "learning_rate": 0.0001, "loss": 8.0889, "loss/crossentropy": 2.267898917198181, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.26800353825092316, "step": 3620 }, { "epoch": 0.226375, "grad_norm": 2.921875, "grad_norm_var": 0.009147135416666667, "learning_rate": 0.0001, "loss": 8.0278, "loss/crossentropy": 2.291195571422577, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.26078518480062485, "step": 3622 }, { "epoch": 0.2265, "grad_norm": 2.890625, "grad_norm_var": 0.0090484619140625, "learning_rate": 0.0001, "loss": 7.7674, "loss/crossentropy": 2.1368112564086914, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23087257146835327, "step": 3624 }, { "epoch": 0.226625, "grad_norm": 2.9375, "grad_norm_var": 0.0071685791015625, "learning_rate": 0.0001, "loss": 8.2532, "loss/crossentropy": 2.495459794998169, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2500630244612694, "step": 3626 }, { "epoch": 0.22675, "grad_norm": 2.875, "grad_norm_var": 0.0052154541015625, "learning_rate": 0.0001, "loss": 7.9174, "loss/crossentropy": 2.3807636499404907, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2521054297685623, "step": 3628 }, { "epoch": 0.226875, "grad_norm": 2.984375, "grad_norm_var": 0.0088043212890625, "learning_rate": 0.0001, "loss": 8.0956, "loss/crossentropy": 2.2587956190109253, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2414519563317299, "step": 3630 }, { "epoch": 0.227, "grad_norm": 2.859375, "grad_norm_var": 0.009357706705729166, "learning_rate": 0.0001, "loss": 8.0097, "loss/crossentropy": 2.202399969100952, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2805350720882416, "step": 3632 }, { "epoch": 0.227125, "grad_norm": 2.90625, "grad_norm_var": 0.00894775390625, "learning_rate": 0.0001, "loss": 7.9928, "loss/crossentropy": 2.351557970046997, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2543243542313576, "step": 3634 }, { "epoch": 0.22725, "grad_norm": 2.96875, "grad_norm_var": 0.008226521809895833, "learning_rate": 0.0001, "loss": 8.1574, "loss/crossentropy": 2.3034415245056152, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.28570613265037537, "step": 3636 }, { "epoch": 0.227375, "grad_norm": 3.0625, "grad_norm_var": 0.009593709309895834, "learning_rate": 0.0001, "loss": 7.9509, "loss/crossentropy": 2.2518638372421265, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2769414782524109, "step": 3638 }, { "epoch": 0.2275, "grad_norm": 3.0625, "grad_norm_var": 0.0104156494140625, "learning_rate": 0.0001, "loss": 8.2742, "loss/crossentropy": 2.313628911972046, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2677566707134247, "step": 3640 }, { "epoch": 0.227625, "grad_norm": 2.8125, "grad_norm_var": 0.022614542643229166, "learning_rate": 0.0001, "loss": 8.0699, "loss/crossentropy": 2.4361445903778076, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2696637511253357, "step": 3642 }, { "epoch": 0.22775, "grad_norm": 2.65625, "grad_norm_var": 0.0240234375, "learning_rate": 0.0001, "loss": 7.9745, "loss/crossentropy": 2.3829265832901, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24858426302671432, "step": 3644 }, { "epoch": 0.227875, "grad_norm": 2.734375, "grad_norm_var": 0.0252838134765625, "learning_rate": 0.0001, "loss": 8.0501, "loss/crossentropy": 2.344617486000061, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2515474408864975, "step": 3646 }, { "epoch": 0.228, "grad_norm": 3.140625, "grad_norm_var": 0.02886962890625, "learning_rate": 0.0001, "loss": 8.1793, "loss/crossentropy": 2.376260995864868, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24280902743339539, "step": 3648 }, { "epoch": 0.228125, "grad_norm": 3.03125, "grad_norm_var": 0.03185221354166667, "learning_rate": 0.0001, "loss": 8.2511, "loss/crossentropy": 2.427824020385742, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.27128610014915466, "step": 3650 }, { "epoch": 0.22825, "grad_norm": 3.109375, "grad_norm_var": 0.04291890462239583, "learning_rate": 0.0001, "loss": 8.229, "loss/crossentropy": 2.465716004371643, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2654089778661728, "step": 3652 }, { "epoch": 0.228375, "grad_norm": 2.609375, "grad_norm_var": 0.055231730143229164, "learning_rate": 0.0001, "loss": 7.7795, "loss/crossentropy": 2.081652522087097, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25261829793453217, "step": 3654 }, { "epoch": 0.2285, "grad_norm": 3.09375, "grad_norm_var": 0.05577799479166667, "learning_rate": 0.0001, "loss": 7.9502, "loss/crossentropy": 2.1460715532302856, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24947232007980347, "step": 3656 }, { "epoch": 0.228625, "grad_norm": 3.046875, "grad_norm_var": 0.047053019205729164, "learning_rate": 0.0001, "loss": 8.3344, "loss/crossentropy": 2.341508150100708, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2834539860486984, "step": 3658 }, { "epoch": 0.22875, "grad_norm": 3.234375, "grad_norm_var": 0.0469635009765625, "learning_rate": 0.0001, "loss": 8.0013, "loss/crossentropy": 2.135187327861786, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2561069577932358, "step": 3660 }, { "epoch": 0.228875, "grad_norm": 3.34375, "grad_norm_var": 0.30520426432291664, "learning_rate": 0.0001, "loss": 8.2324, "loss/crossentropy": 2.383564591407776, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.28082774579524994, "step": 3662 }, { "epoch": 0.229, "grad_norm": 2.71875, "grad_norm_var": 0.3198313395182292, "learning_rate": 0.0001, "loss": 8.081, "loss/crossentropy": 2.199310064315796, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2764912396669388, "step": 3664 }, { "epoch": 0.229125, "grad_norm": 3.078125, "grad_norm_var": 0.3196614583333333, "learning_rate": 0.0001, "loss": 8.0434, "loss/crossentropy": 2.3052173852920532, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.28200992941856384, "step": 3666 }, { "epoch": 0.22925, "grad_norm": 2.953125, "grad_norm_var": 0.32179361979166665, "learning_rate": 0.0001, "loss": 8.0128, "loss/crossentropy": 2.3882672786712646, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2689662426710129, "step": 3668 }, { "epoch": 0.229375, "grad_norm": 3.234375, "grad_norm_var": 0.3091949462890625, "learning_rate": 0.0001, "loss": 7.9856, "loss/crossentropy": 2.4546321630477905, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.25415413081645966, "step": 3670 }, { "epoch": 0.2295, "grad_norm": 2.84375, "grad_norm_var": 0.3122029622395833, "learning_rate": 0.0001, "loss": 8.0347, "loss/crossentropy": 2.3109859228134155, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2632336914539337, "step": 3672 }, { "epoch": 0.229625, "grad_norm": 2.921875, "grad_norm_var": 0.31685791015625, "learning_rate": 0.0001, "loss": 7.9744, "loss/crossentropy": 2.24290668964386, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24445384740829468, "step": 3674 }, { "epoch": 0.22975, "grad_norm": 2.953125, "grad_norm_var": 0.3194976806640625, "learning_rate": 0.0001, "loss": 8.0582, "loss/crossentropy": 2.6178494691848755, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2564311772584915, "step": 3676 }, { "epoch": 0.229875, "grad_norm": 3.109375, "grad_norm_var": 0.02584228515625, "learning_rate": 0.0001, "loss": 8.1775, "loss/crossentropy": 2.266401767730713, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.26438479125499725, "step": 3678 }, { "epoch": 0.23, "grad_norm": 3.21875, "grad_norm_var": 0.0263580322265625, "learning_rate": 0.0001, "loss": 7.9564, "loss/crossentropy": 2.2778828144073486, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.249868243932724, "step": 3680 }, { "epoch": 0.230125, "grad_norm": 2.96875, "grad_norm_var": 0.03190816243489583, "learning_rate": 0.0001, "loss": 8.2025, "loss/crossentropy": 2.5149052143096924, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2597532272338867, "step": 3682 }, { "epoch": 0.23025, "grad_norm": 2.796875, "grad_norm_var": 0.034407552083333334, "learning_rate": 0.0001, "loss": 8.1936, "loss/crossentropy": 2.4017945528030396, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2589564323425293, "step": 3684 }, { "epoch": 0.230375, "grad_norm": 2.90625, "grad_norm_var": 0.024933878580729166, "learning_rate": 0.0001, "loss": 8.0723, "loss/crossentropy": 2.3127940893173218, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2643566429615021, "step": 3686 }, { "epoch": 0.2305, "grad_norm": 2.640625, "grad_norm_var": 0.032063802083333336, "learning_rate": 0.0001, "loss": 7.6895, "loss/crossentropy": 2.009396731853485, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.2169792503118515, "step": 3688 }, { "epoch": 0.230625, "grad_norm": 2.90625, "grad_norm_var": 0.03504130045572917, "learning_rate": 0.0001, "loss": 8.242, "loss/crossentropy": 2.3510366678237915, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2538380026817322, "step": 3690 }, { "epoch": 0.23075, "grad_norm": 2.84375, "grad_norm_var": 0.030973307291666665, "learning_rate": 0.0001, "loss": 7.9667, "loss/crossentropy": 2.3450275659561157, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.26012004911899567, "step": 3692 }, { "epoch": 0.230875, "grad_norm": 2.75, "grad_norm_var": 0.03232014973958333, "learning_rate": 0.0001, "loss": 8.0063, "loss/crossentropy": 2.0265676975250244, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2440163493156433, "step": 3694 }, { "epoch": 0.231, "grad_norm": 3.25, "grad_norm_var": 0.041259765625, "learning_rate": 0.0001, "loss": 8.152, "loss/crossentropy": 2.259366512298584, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2548896074295044, "step": 3696 }, { "epoch": 0.231125, "grad_norm": 3.21875, "grad_norm_var": 0.037984212239583336, "learning_rate": 0.0001, "loss": 8.177, "loss/crossentropy": 2.4230661392211914, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25759340822696686, "step": 3698 }, { "epoch": 0.23125, "grad_norm": 2.828125, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 8.1042, "loss/crossentropy": 2.4304851293563843, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25671106576919556, "step": 3700 }, { "epoch": 0.231375, "grad_norm": 3.109375, "grad_norm_var": 0.042756144205729166, "learning_rate": 0.0001, "loss": 8.1151, "loss/crossentropy": 2.601144790649414, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25495633482933044, "step": 3702 }, { "epoch": 0.2315, "grad_norm": 2.84375, "grad_norm_var": 0.03870035807291667, "learning_rate": 0.0001, "loss": 7.8518, "loss/crossentropy": 2.0991747975349426, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23651761561632156, "step": 3704 }, { "epoch": 0.231625, "grad_norm": 3.0, "grad_norm_var": 0.03697916666666667, "learning_rate": 0.0001, "loss": 8.0609, "loss/crossentropy": 2.3886858224868774, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.249774731695652, "step": 3706 }, { "epoch": 0.23175, "grad_norm": 2.875, "grad_norm_var": 0.03943684895833333, "learning_rate": 0.0001, "loss": 8.0872, "loss/crossentropy": 2.275684356689453, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2471064031124115, "step": 3708 }, { "epoch": 0.231875, "grad_norm": 2.84375, "grad_norm_var": 0.03299153645833333, "learning_rate": 0.0001, "loss": 7.9469, "loss/crossentropy": 2.490668535232544, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.26126355677843094, "step": 3710 }, { "epoch": 0.232, "grad_norm": 2.734375, "grad_norm_var": 0.029857381184895834, "learning_rate": 0.0001, "loss": 7.9749, "loss/crossentropy": 2.5393855571746826, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24972229450941086, "step": 3712 }, { "epoch": 0.232125, "grad_norm": 2.828125, "grad_norm_var": 0.028229777018229166, "learning_rate": 0.0001, "loss": 8.0115, "loss/crossentropy": 2.453523635864258, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26304805278778076, "step": 3714 }, { "epoch": 0.23225, "grad_norm": 2.84375, "grad_norm_var": 0.028075154622395834, "learning_rate": 0.0001, "loss": 8.1984, "loss/crossentropy": 2.2866803407669067, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.26558496057987213, "step": 3716 }, { "epoch": 0.232375, "grad_norm": 2.9375, "grad_norm_var": 0.022907511393229166, "learning_rate": 0.0001, "loss": 8.1454, "loss/crossentropy": 2.5315120220184326, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2710745185613632, "step": 3718 }, { "epoch": 0.2325, "grad_norm": 2.796875, "grad_norm_var": 0.020563761393229168, "learning_rate": 0.0001, "loss": 7.7971, "loss/crossentropy": 2.1747653484344482, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2295996993780136, "step": 3720 }, { "epoch": 0.232625, "grad_norm": 3.28125, "grad_norm_var": 0.026634724934895833, "learning_rate": 0.0001, "loss": 7.9727, "loss/crossentropy": 2.348886013031006, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2646675556898117, "step": 3722 }, { "epoch": 0.23275, "grad_norm": 2.5625, "grad_norm_var": 0.026871744791666666, "learning_rate": 0.0001, "loss": 7.5556, "loss/crossentropy": 1.9647228121757507, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22373639792203903, "step": 3724 }, { "epoch": 0.232875, "grad_norm": 2.78125, "grad_norm_var": 0.024762980143229165, "learning_rate": 0.0001, "loss": 7.9863, "loss/crossentropy": 2.328511357307434, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.23512594401836395, "step": 3726 }, { "epoch": 0.233, "grad_norm": 2.859375, "grad_norm_var": 0.02340087890625, "learning_rate": 0.0001, "loss": 8.0379, "loss/crossentropy": 2.4145067930221558, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2505330890417099, "step": 3728 }, { "epoch": 0.233125, "grad_norm": 3.015625, "grad_norm_var": 0.024365234375, "learning_rate": 0.0001, "loss": 8.0389, "loss/crossentropy": 2.326350212097168, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.282063826918602, "step": 3730 }, { "epoch": 0.23325, "grad_norm": 3.046875, "grad_norm_var": 0.025047810872395833, "learning_rate": 0.0001, "loss": 8.0287, "loss/crossentropy": 2.3160377740859985, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25916415452957153, "step": 3732 }, { "epoch": 0.233375, "grad_norm": 2.78125, "grad_norm_var": 0.025666300455729166, "learning_rate": 0.0001, "loss": 8.0978, "loss/crossentropy": 2.5563199520111084, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.2803049683570862, "step": 3734 }, { "epoch": 0.2335, "grad_norm": 3.0625, "grad_norm_var": 0.02427978515625, "learning_rate": 0.0001, "loss": 7.9286, "loss/crossentropy": 2.1742677688598633, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25293681770563126, "step": 3736 }, { "epoch": 0.233625, "grad_norm": 2.890625, "grad_norm_var": 0.013776652018229167, "learning_rate": 0.0001, "loss": 8.1056, "loss/crossentropy": 2.3429336547851562, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.2538968622684479, "step": 3738 }, { "epoch": 0.23375, "grad_norm": 2.84375, "grad_norm_var": 0.007111612955729167, "learning_rate": 0.0001, "loss": 8.1074, "loss/crossentropy": 2.31491482257843, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24610111862421036, "step": 3740 }, { "epoch": 0.233875, "grad_norm": 2.84375, "grad_norm_var": 0.007233683268229167, "learning_rate": 0.0001, "loss": 7.8913, "loss/crossentropy": 2.399028778076172, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.24484336376190186, "step": 3742 }, { "epoch": 0.234, "grad_norm": 2.8125, "grad_norm_var": 0.009208170572916667, "learning_rate": 0.0001, "loss": 7.8039, "loss/crossentropy": 2.3483502864837646, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24819336086511612, "step": 3744 }, { "epoch": 0.234125, "grad_norm": 2.796875, "grad_norm_var": 0.0086578369140625, "learning_rate": 0.0001, "loss": 7.8538, "loss/crossentropy": 2.27790105342865, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24142986536026, "step": 3746 }, { "epoch": 0.23425, "grad_norm": 2.671875, "grad_norm_var": 0.008722941080729166, "learning_rate": 0.0001, "loss": 7.8606, "loss/crossentropy": 2.0212570428848267, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23354055732488632, "step": 3748 }, { "epoch": 0.234375, "grad_norm": 2.921875, "grad_norm_var": 0.008771769205729167, "learning_rate": 0.0001, "loss": 7.9268, "loss/crossentropy": 2.212460517883301, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.2667820304632187, "step": 3750 }, { "epoch": 0.2345, "grad_norm": 2.796875, "grad_norm_var": 0.0054107666015625, "learning_rate": 0.0001, "loss": 7.8209, "loss/crossentropy": 2.054919123649597, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2387581467628479, "step": 3752 }, { "epoch": 0.234625, "grad_norm": 2.90625, "grad_norm_var": 0.0064117431640625, "learning_rate": 0.0001, "loss": 8.0238, "loss/crossentropy": 2.1775506734848022, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2639275938272476, "step": 3754 }, { "epoch": 0.23475, "grad_norm": 2.8125, "grad_norm_var": 0.007013956705729167, "learning_rate": 0.0001, "loss": 7.9167, "loss/crossentropy": 2.1846723556518555, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.23828266561031342, "step": 3756 }, { "epoch": 0.234875, "grad_norm": 3.046875, "grad_norm_var": 0.010286458333333333, "learning_rate": 0.0001, "loss": 7.8777, "loss/crossentropy": 2.134470820426941, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.22749686986207962, "step": 3758 }, { "epoch": 0.235, "grad_norm": 2.953125, "grad_norm_var": 0.01025390625, "learning_rate": 0.0001, "loss": 8.009, "loss/crossentropy": 2.284530997276306, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25272058695554733, "step": 3760 }, { "epoch": 0.235125, "grad_norm": 2.671875, "grad_norm_var": 0.01494140625, "learning_rate": 0.0001, "loss": 7.8373, "loss/crossentropy": 2.475208878517151, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2501443922519684, "step": 3762 }, { "epoch": 0.23525, "grad_norm": 2.953125, "grad_norm_var": 0.013102213541666666, "learning_rate": 0.0001, "loss": 7.9143, "loss/crossentropy": 2.037322998046875, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23644433170557022, "step": 3764 }, { "epoch": 0.235375, "grad_norm": 2.953125, "grad_norm_var": 0.013841756184895833, "learning_rate": 0.0001, "loss": 8.0747, "loss/crossentropy": 2.3539260625839233, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24543513357639313, "step": 3766 }, { "epoch": 0.2355, "grad_norm": 3.078125, "grad_norm_var": 0.014281209309895833, "learning_rate": 0.0001, "loss": 8.101, "loss/crossentropy": 2.3604376316070557, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2443503588438034, "step": 3768 }, { "epoch": 0.235625, "grad_norm": 2.71875, "grad_norm_var": 0.023758951822916666, "learning_rate": 0.0001, "loss": 8.0751, "loss/crossentropy": 2.261757254600525, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2623937949538231, "step": 3770 }, { "epoch": 0.23575, "grad_norm": 2.703125, "grad_norm_var": 0.023714192708333335, "learning_rate": 0.0001, "loss": 7.7161, "loss/crossentropy": 2.4158157110214233, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24528972804546356, "step": 3772 }, { "epoch": 0.235875, "grad_norm": 2.75, "grad_norm_var": 0.023844401041666668, "learning_rate": 0.0001, "loss": 7.9815, "loss/crossentropy": 2.449522614479065, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25976794958114624, "step": 3774 }, { "epoch": 0.236, "grad_norm": 2.765625, "grad_norm_var": 0.025093587239583333, "learning_rate": 0.0001, "loss": 8.0425, "loss/crossentropy": 2.3642194271087646, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.27245795726776123, "step": 3776 }, { "epoch": 0.236125, "grad_norm": 2.765625, "grad_norm_var": 0.023729451497395835, "learning_rate": 0.0001, "loss": 7.9755, "loss/crossentropy": 2.4502243995666504, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2875143885612488, "step": 3778 }, { "epoch": 0.23625, "grad_norm": 2.9375, "grad_norm_var": 0.026558430989583333, "learning_rate": 0.0001, "loss": 7.8647, "loss/crossentropy": 2.023205041885376, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2414599061012268, "step": 3780 }, { "epoch": 0.236375, "grad_norm": 2.84375, "grad_norm_var": 0.025406901041666666, "learning_rate": 0.0001, "loss": 7.9166, "loss/crossentropy": 2.1861640214920044, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24028976261615753, "step": 3782 }, { "epoch": 0.2365, "grad_norm": 2.625, "grad_norm_var": 0.023628743489583333, "learning_rate": 0.0001, "loss": 7.7592, "loss/crossentropy": 2.204551935195923, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24570683389902115, "step": 3784 }, { "epoch": 0.236625, "grad_norm": 2.90625, "grad_norm_var": 0.011637369791666666, "learning_rate": 0.0001, "loss": 7.8933, "loss/crossentropy": 2.354386806488037, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25208230316638947, "step": 3786 }, { "epoch": 0.23675, "grad_norm": 3.3125, "grad_norm_var": 0.04938151041666667, "learning_rate": 0.0001, "loss": 8.1279, "loss/crossentropy": 2.3427704572677612, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2587417662143707, "step": 3788 }, { "epoch": 0.236875, "grad_norm": 2.84375, "grad_norm_var": 0.047606404622395834, "learning_rate": 0.0001, "loss": 8.0057, "loss/crossentropy": 2.320341467857361, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2580983489751816, "step": 3790 }, { "epoch": 0.237, "grad_norm": 2.71875, "grad_norm_var": 0.04879150390625, "learning_rate": 0.0001, "loss": 8.0361, "loss/crossentropy": 2.3142045736312866, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25220172107219696, "step": 3792 }, { "epoch": 0.237125, "grad_norm": 3.1875, "grad_norm_var": 0.05084228515625, "learning_rate": 0.0001, "loss": 8.0118, "loss/crossentropy": 2.242996096611023, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.25726889073848724, "step": 3794 }, { "epoch": 0.23725, "grad_norm": 2.671875, "grad_norm_var": 0.050511678059895836, "learning_rate": 0.0001, "loss": 7.8234, "loss/crossentropy": 2.1468963623046875, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2335699424147606, "step": 3796 }, { "epoch": 0.237375, "grad_norm": 2.984375, "grad_norm_var": 0.05858968098958333, "learning_rate": 0.0001, "loss": 7.8601, "loss/crossentropy": 2.1172631978988647, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.22815796732902527, "step": 3798 }, { "epoch": 0.2375, "grad_norm": 2.984375, "grad_norm_var": 0.054931640625, "learning_rate": 0.0001, "loss": 7.92, "loss/crossentropy": 2.322197914123535, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.23355276137590408, "step": 3800 }, { "epoch": 0.237625, "grad_norm": 2.75, "grad_norm_var": 0.05657552083333333, "learning_rate": 0.0001, "loss": 7.7418, "loss/crossentropy": 2.3256465196609497, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2376643791794777, "step": 3802 }, { "epoch": 0.23775, "grad_norm": 2.75, "grad_norm_var": 0.030394490559895834, "learning_rate": 0.0001, "loss": 8.0796, "loss/crossentropy": 2.3443360328674316, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2643965929746628, "step": 3804 }, { "epoch": 0.237875, "grad_norm": 3.0625, "grad_norm_var": 0.0342437744140625, "learning_rate": 0.0001, "loss": 7.8792, "loss/crossentropy": 2.1913982629776, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25000348687171936, "step": 3806 }, { "epoch": 0.238, "grad_norm": 2.90625, "grad_norm_var": 0.03147684733072917, "learning_rate": 0.0001, "loss": 7.8622, "loss/crossentropy": 2.425496220588684, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24364301562309265, "step": 3808 }, { "epoch": 0.238125, "grad_norm": 2.90625, "grad_norm_var": 0.025690714518229168, "learning_rate": 0.0001, "loss": 8.072, "loss/crossentropy": 2.3379809856414795, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2686043605208397, "step": 3810 }, { "epoch": 0.23825, "grad_norm": 3.234375, "grad_norm_var": 0.030171712239583332, "learning_rate": 0.0001, "loss": 7.7219, "loss/crossentropy": 1.885031819343567, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.22808341681957245, "step": 3812 }, { "epoch": 0.238375, "grad_norm": 2.765625, "grad_norm_var": 0.025651041666666666, "learning_rate": 0.0001, "loss": 8.0085, "loss/crossentropy": 2.3886247873306274, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.24375489354133606, "step": 3814 }, { "epoch": 0.2385, "grad_norm": 2.9375, "grad_norm_var": 0.025202433268229168, "learning_rate": 0.0001, "loss": 7.8644, "loss/crossentropy": 2.130933403968811, "loss/hidden": 3.0234375, "loss/jsd": 0.0, "loss/logits": 0.23475061357021332, "step": 3816 }, { "epoch": 0.238625, "grad_norm": 2.78125, "grad_norm_var": 0.024885050455729165, "learning_rate": 0.0001, "loss": 7.9254, "loss/crossentropy": 2.242841362953186, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2557855248451233, "step": 3818 }, { "epoch": 0.23875, "grad_norm": 2.8125, "grad_norm_var": 0.024104817708333334, "learning_rate": 0.0001, "loss": 8.0524, "loss/crossentropy": 2.1857462525367737, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.22692881524562836, "step": 3820 }, { "epoch": 0.238875, "grad_norm": 2.9375, "grad_norm_var": 0.020279947916666666, "learning_rate": 0.0001, "loss": 8.0351, "loss/crossentropy": 2.412648320198059, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24989540874958038, "step": 3822 }, { "epoch": 0.239, "grad_norm": 2.984375, "grad_norm_var": 0.10816650390625, "learning_rate": 0.0001, "loss": 7.9189, "loss/crossentropy": 2.2715072631835938, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.24672597646713257, "step": 3824 }, { "epoch": 0.239125, "grad_norm": 3.046875, "grad_norm_var": 0.10906473795572917, "learning_rate": 0.0001, "loss": 7.9206, "loss/crossentropy": 2.26256263256073, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26400326192379, "step": 3826 }, { "epoch": 0.23925, "grad_norm": 2.8125, "grad_norm_var": 0.10342508951822917, "learning_rate": 0.0001, "loss": 7.7574, "loss/crossentropy": 2.229863405227661, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2663080394268036, "step": 3828 }, { "epoch": 0.239375, "grad_norm": 2.859375, "grad_norm_var": 0.09905192057291666, "learning_rate": 0.0001, "loss": 8.0812, "loss/crossentropy": 2.4192259311676025, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26135944575071335, "step": 3830 }, { "epoch": 0.2395, "grad_norm": 2.75, "grad_norm_var": 0.09844462076822917, "learning_rate": 0.0001, "loss": 7.8173, "loss/crossentropy": 2.3621386289596558, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23955464363098145, "step": 3832 }, { "epoch": 0.239625, "grad_norm": 2.953125, "grad_norm_var": 0.09664306640625, "learning_rate": 0.0001, "loss": 7.936, "loss/crossentropy": 2.368203043937683, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.26091665029525757, "step": 3834 }, { "epoch": 0.23975, "grad_norm": 2.953125, "grad_norm_var": 0.09709879557291666, "learning_rate": 0.0001, "loss": 7.8929, "loss/crossentropy": 2.2252918481826782, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.24965070188045502, "step": 3836 }, { "epoch": 0.239875, "grad_norm": 2.765625, "grad_norm_var": 0.10928446451822917, "learning_rate": 0.0001, "loss": 7.8, "loss/crossentropy": 2.2704564332962036, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.25084085017442703, "step": 3838 }, { "epoch": 0.24, "grad_norm": 2.515625, "grad_norm_var": 0.019462076822916667, "learning_rate": 0.0001, "loss": 8.0258, "loss/crossentropy": 2.4409396648406982, "loss/hidden": 3.2109375, "loss/jsd": 0.0, "loss/logits": 0.27064767479896545, "step": 3840 }, { "epoch": 0.240125, "grad_norm": 2.734375, "grad_norm_var": 0.01988525390625, "learning_rate": 0.0001, "loss": 8.0067, "loss/crossentropy": 2.4022551774978638, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.26499345898628235, "step": 3842 }, { "epoch": 0.24025, "grad_norm": 2.75, "grad_norm_var": 0.0198150634765625, "learning_rate": 0.0001, "loss": 7.9328, "loss/crossentropy": 2.2620415687561035, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2681596428155899, "step": 3844 }, { "epoch": 0.240375, "grad_norm": 2.890625, "grad_norm_var": 0.019807942708333335, "learning_rate": 0.0001, "loss": 8.0649, "loss/crossentropy": 2.205040454864502, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2555413693189621, "step": 3846 }, { "epoch": 0.2405, "grad_norm": 3.03125, "grad_norm_var": 0.022801717122395832, "learning_rate": 0.0001, "loss": 7.8505, "loss/crossentropy": 2.5771888494491577, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2554461359977722, "step": 3848 }, { "epoch": 0.240625, "grad_norm": 2.65625, "grad_norm_var": 0.022980753580729166, "learning_rate": 0.0001, "loss": 7.7335, "loss/crossentropy": 2.1476651430130005, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.24120255559682846, "step": 3850 }, { "epoch": 0.24075, "grad_norm": 3.21875, "grad_norm_var": 0.03230692545572917, "learning_rate": 0.0001, "loss": 8.0287, "loss/crossentropy": 2.435906410217285, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2562536597251892, "step": 3852 }, { "epoch": 0.240875, "grad_norm": 2.90625, "grad_norm_var": 0.028547159830729165, "learning_rate": 0.0001, "loss": 8.0916, "loss/crossentropy": 2.2504727840423584, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24065575003623962, "step": 3854 }, { "epoch": 0.241, "grad_norm": 2.796875, "grad_norm_var": 0.020475260416666665, "learning_rate": 0.0001, "loss": 7.8006, "loss/crossentropy": 2.1379220485687256, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.2533913552761078, "step": 3856 }, { "epoch": 0.241125, "grad_norm": 2.71875, "grad_norm_var": 0.019364420572916666, "learning_rate": 0.0001, "loss": 7.8473, "loss/crossentropy": 2.5523799657821655, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.24728696048259735, "step": 3858 }, { "epoch": 0.24125, "grad_norm": 2.65625, "grad_norm_var": 0.021354166666666667, "learning_rate": 0.0001, "loss": 8.0523, "loss/crossentropy": 2.3402522802352905, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.23582365363836288, "step": 3860 }, { "epoch": 0.241375, "grad_norm": 3.0625, "grad_norm_var": 0.0236968994140625, "learning_rate": 0.0001, "loss": 7.8628, "loss/crossentropy": 2.312021493911743, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.25287964940071106, "step": 3862 }, { "epoch": 0.2415, "grad_norm": 3.21875, "grad_norm_var": 0.18788655598958334, "learning_rate": 0.0001, "loss": 7.9496, "loss/crossentropy": 2.5552531480789185, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.26104636490345, "step": 3864 }, { "epoch": 0.241625, "grad_norm": 2.78125, "grad_norm_var": 0.18173726399739584, "learning_rate": 0.0001, "loss": 7.9764, "loss/crossentropy": 2.3297171592712402, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.250930480659008, "step": 3866 }, { "epoch": 0.24175, "grad_norm": 2.984375, "grad_norm_var": 0.17805989583333334, "learning_rate": 0.0001, "loss": 7.9646, "loss/crossentropy": 2.406034827232361, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.27073846757411957, "step": 3868 }, { "epoch": 0.241875, "grad_norm": 2.9375, "grad_norm_var": 0.18010152180989583, "learning_rate": 0.0001, "loss": 8.0221, "loss/crossentropy": 2.354939103126526, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.2731742113828659, "step": 3870 }, { "epoch": 0.242, "grad_norm": 2.828125, "grad_norm_var": 0.18245035807291668, "learning_rate": 0.0001, "loss": 7.8674, "loss/crossentropy": 2.493025302886963, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.23660879582166672, "step": 3872 }, { "epoch": 0.242125, "grad_norm": 2.78125, "grad_norm_var": 0.17879130045572916, "learning_rate": 0.0001, "loss": 8.1985, "loss/crossentropy": 2.4070109128952026, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2546348571777344, "step": 3874 }, { "epoch": 0.24225, "grad_norm": 2.9375, "grad_norm_var": 0.17639058430989582, "learning_rate": 0.0001, "loss": 7.8092, "loss/crossentropy": 2.3700718879699707, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2518589720129967, "step": 3876 }, { "epoch": 0.242375, "grad_norm": 3.078125, "grad_norm_var": 0.17805582682291668, "learning_rate": 0.0001, "loss": 7.9985, "loss/crossentropy": 2.338467597961426, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.25343357026576996, "step": 3878 }, { "epoch": 0.2425, "grad_norm": 2.921875, "grad_norm_var": 0.011522420247395833, "learning_rate": 0.0001, "loss": 8.071, "loss/crossentropy": 2.1937835216522217, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25322993099689484, "step": 3880 }, { "epoch": 0.242625, "grad_norm": 3.046875, "grad_norm_var": 0.0135650634765625, "learning_rate": 0.0001, "loss": 7.8869, "loss/crossentropy": 2.3660850524902344, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.252712182700634, "step": 3882 }, { "epoch": 0.24275, "grad_norm": 2.71875, "grad_norm_var": 0.014533487955729167, "learning_rate": 0.0001, "loss": 7.7893, "loss/crossentropy": 2.2534600496292114, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.24849370121955872, "step": 3884 }, { "epoch": 0.242875, "grad_norm": 2.75, "grad_norm_var": 0.0155426025390625, "learning_rate": 0.0001, "loss": 7.7321, "loss/crossentropy": 1.9124953746795654, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2445899397134781, "step": 3886 }, { "epoch": 0.243, "grad_norm": 3.03125, "grad_norm_var": 0.0174224853515625, "learning_rate": 0.0001, "loss": 8.0921, "loss/crossentropy": 2.450052261352539, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2614727318286896, "step": 3888 }, { "epoch": 0.243125, "grad_norm": 2.703125, "grad_norm_var": 0.0187652587890625, "learning_rate": 0.0001, "loss": 7.8815, "loss/crossentropy": 2.2794657945632935, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2508957237005234, "step": 3890 }, { "epoch": 0.24325, "grad_norm": 2.96875, "grad_norm_var": 0.016649373372395835, "learning_rate": 0.0001, "loss": 7.8555, "loss/crossentropy": 2.282740831375122, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2370600551366806, "step": 3892 }, { "epoch": 0.243375, "grad_norm": 3.09375, "grad_norm_var": 0.017317708333333334, "learning_rate": 0.0001, "loss": 8.0816, "loss/crossentropy": 2.106474459171295, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.26941801607608795, "step": 3894 }, { "epoch": 0.2435, "grad_norm": 2.96875, "grad_norm_var": 0.019456990559895835, "learning_rate": 0.0001, "loss": 8.1698, "loss/crossentropy": 2.224056601524353, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.2755711078643799, "step": 3896 }, { "epoch": 0.243625, "grad_norm": 2.828125, "grad_norm_var": 0.017414347330729166, "learning_rate": 0.0001, "loss": 7.9395, "loss/crossentropy": 2.1422271728515625, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2343953251838684, "step": 3898 }, { "epoch": 0.24375, "grad_norm": 3.0625, "grad_norm_var": 0.0193511962890625, "learning_rate": 0.0001, "loss": 8.1452, "loss/crossentropy": 2.2681803703308105, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2323412001132965, "step": 3900 }, { "epoch": 0.243875, "grad_norm": 2.78125, "grad_norm_var": 0.021793619791666666, "learning_rate": 0.0001, "loss": 7.7695, "loss/crossentropy": 2.108498513698578, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2429059073328972, "step": 3902 }, { "epoch": 0.244, "grad_norm": 2.78125, "grad_norm_var": 0.019710286458333334, "learning_rate": 0.0001, "loss": 7.9059, "loss/crossentropy": 2.3447383642196655, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2611192911863327, "step": 3904 }, { "epoch": 0.244125, "grad_norm": 3.03125, "grad_norm_var": 0.020807902018229168, "learning_rate": 0.0001, "loss": 7.9542, "loss/crossentropy": 1.840607225894928, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.23171918094158173, "step": 3906 }, { "epoch": 0.24425, "grad_norm": 2.765625, "grad_norm_var": 0.020882161458333333, "learning_rate": 0.0001, "loss": 7.7736, "loss/crossentropy": 2.113314151763916, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.23394189774990082, "step": 3908 }, { "epoch": 0.244375, "grad_norm": 3.125, "grad_norm_var": 0.021549479166666666, "learning_rate": 0.0001, "loss": 7.9819, "loss/crossentropy": 2.252769947052002, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25148941576480865, "step": 3910 }, { "epoch": 0.2445, "grad_norm": 2.953125, "grad_norm_var": 0.027567545572916668, "learning_rate": 0.0001, "loss": 8.0101, "loss/crossentropy": 2.1127032041549683, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.2942749857902527, "step": 3912 }, { "epoch": 0.244625, "grad_norm": 2.953125, "grad_norm_var": 0.02822265625, "learning_rate": 0.0001, "loss": 8.1576, "loss/crossentropy": 2.096681237220764, "loss/hidden": 3.0625, "loss/jsd": 0.0, "loss/logits": 0.2617676854133606, "step": 3914 }, { "epoch": 0.24475, "grad_norm": 2.9375, "grad_norm_var": 0.024837239583333334, "learning_rate": 0.0001, "loss": 7.9193, "loss/crossentropy": 2.312508463859558, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.27543358504772186, "step": 3916 }, { "epoch": 0.244875, "grad_norm": 2.578125, "grad_norm_var": 0.0254547119140625, "learning_rate": 0.0001, "loss": 8.0867, "loss/crossentropy": 2.5416085720062256, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2532195746898651, "step": 3918 }, { "epoch": 0.245, "grad_norm": 2.8125, "grad_norm_var": 0.028319295247395834, "learning_rate": 0.0001, "loss": 7.8437, "loss/crossentropy": 1.9256672859191895, "loss/hidden": 3.0859375, "loss/jsd": 0.0, "loss/logits": 0.23992876708507538, "step": 3920 }, { "epoch": 0.245125, "grad_norm": 2.71875, "grad_norm_var": 0.027083333333333334, "learning_rate": 0.0001, "loss": 7.7237, "loss/crossentropy": 2.0629305839538574, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2529604434967041, "step": 3922 }, { "epoch": 0.24525, "grad_norm": 2.9375, "grad_norm_var": 0.0267974853515625, "learning_rate": 0.0001, "loss": 8.2066, "loss/crossentropy": 2.533225417137146, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2775707393884659, "step": 3924 }, { "epoch": 0.245375, "grad_norm": 3.125, "grad_norm_var": 0.028246053059895835, "learning_rate": 0.0001, "loss": 7.8036, "loss/crossentropy": 2.3668118715286255, "loss/hidden": 3.09375, "loss/jsd": 0.0, "loss/logits": 0.24234110862016678, "step": 3926 }, { "epoch": 0.2455, "grad_norm": 2.71875, "grad_norm_var": 0.018382771809895834, "learning_rate": 0.0001, "loss": 7.7284, "loss/crossentropy": 2.1942179203033447, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25301945209503174, "step": 3928 }, { "epoch": 0.245625, "grad_norm": 3.3125, "grad_norm_var": 0.0335113525390625, "learning_rate": 0.0001, "loss": 7.9152, "loss/crossentropy": 2.3835543394088745, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24918851256370544, "step": 3930 }, { "epoch": 0.24575, "grad_norm": 2.828125, "grad_norm_var": 0.040848795572916666, "learning_rate": 0.0001, "loss": 8.0722, "loss/crossentropy": 2.275932550430298, "loss/hidden": 3.1640625, "loss/jsd": 0.0, "loss/logits": 0.27397048473358154, "step": 3932 }, { "epoch": 0.245875, "grad_norm": 2.84375, "grad_norm_var": 0.03684794108072917, "learning_rate": 0.0001, "loss": 7.7922, "loss/crossentropy": 2.214387059211731, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.25321590155363083, "step": 3934 }, { "epoch": 0.246, "grad_norm": 2.78125, "grad_norm_var": 0.034821573893229166, "learning_rate": 0.0001, "loss": 7.7217, "loss/crossentropy": 2.2051841020584106, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.2417343333363533, "step": 3936 }, { "epoch": 0.246125, "grad_norm": 2.859375, "grad_norm_var": 0.0321441650390625, "learning_rate": 0.0001, "loss": 7.7859, "loss/crossentropy": 2.1569536924362183, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.24789589643478394, "step": 3938 }, { "epoch": 0.24625, "grad_norm": 2.71875, "grad_norm_var": 0.0367828369140625, "learning_rate": 0.0001, "loss": 7.9834, "loss/crossentropy": 2.171297550201416, "loss/hidden": 3.1484375, "loss/jsd": 0.0, "loss/logits": 0.2569433003664017, "step": 3940 }, { "epoch": 0.246375, "grad_norm": 3.1875, "grad_norm_var": 0.03746337890625, "learning_rate": 0.0001, "loss": 8.0254, "loss/crossentropy": 2.353532552719116, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.26509344577789307, "step": 3942 }, { "epoch": 0.2465, "grad_norm": 2.875, "grad_norm_var": 0.03327534993489583, "learning_rate": 0.0001, "loss": 8.0222, "loss/crossentropy": 2.264642834663391, "loss/hidden": 3.109375, "loss/jsd": 0.0, "loss/logits": 0.26274262368679047, "step": 3944 }, { "epoch": 0.246625, "grad_norm": 2.875, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 8.0714, "loss/crossentropy": 2.372550845146179, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25714441388845444, "step": 3946 }, { "epoch": 0.24675, "grad_norm": 2.65625, "grad_norm_var": 0.022980753580729166, "learning_rate": 0.0001, "loss": 7.9289, "loss/crossentropy": 2.4531720876693726, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2564563602209091, "step": 3948 }, { "epoch": 0.246875, "grad_norm": 3.0, "grad_norm_var": 0.02154541015625, "learning_rate": 0.0001, "loss": 8.1788, "loss/crossentropy": 2.4645724296569824, "loss/hidden": 3.1328125, "loss/jsd": 0.0, "loss/logits": 0.2531467527151108, "step": 3950 }, { "epoch": 0.247, "grad_norm": 2.84375, "grad_norm_var": 0.020335896809895834, "learning_rate": 0.0001, "loss": 8.087, "loss/crossentropy": 2.2224488258361816, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.27072136104106903, "step": 3952 }, { "epoch": 0.247125, "grad_norm": 2.75, "grad_norm_var": 0.021564737955729166, "learning_rate": 0.0001, "loss": 7.7025, "loss/crossentropy": 2.1155717372894287, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.2502880319952965, "step": 3954 }, { "epoch": 0.24725, "grad_norm": 2.859375, "grad_norm_var": 0.015901692708333335, "learning_rate": 0.0001, "loss": 7.9457, "loss/crossentropy": 2.156541883945465, "loss/hidden": 3.171875, "loss/jsd": 0.0, "loss/logits": 0.25808054208755493, "step": 3956 }, { "epoch": 0.247375, "grad_norm": 2.71875, "grad_norm_var": 0.009040323893229167, "learning_rate": 0.0001, "loss": 7.6475, "loss/crossentropy": 2.043439030647278, "loss/hidden": 3.078125, "loss/jsd": 0.0, "loss/logits": 0.22348909080028534, "step": 3958 }, { "epoch": 0.2475, "grad_norm": 2.953125, "grad_norm_var": 0.009227498372395834, "learning_rate": 0.0001, "loss": 7.7652, "loss/crossentropy": 2.283087968826294, "loss/hidden": 3.1953125, "loss/jsd": 0.0, "loss/logits": 0.2664376199245453, "step": 3960 }, { "epoch": 0.247625, "grad_norm": 2.953125, "grad_norm_var": 0.0106109619140625, "learning_rate": 0.0001, "loss": 8.1861, "loss/crossentropy": 2.446289896965027, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.24126730859279633, "step": 3962 }, { "epoch": 0.24775, "grad_norm": 2.984375, "grad_norm_var": 0.021024576822916665, "learning_rate": 0.0001, "loss": 8.0295, "loss/crossentropy": 2.106830358505249, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2426551803946495, "step": 3964 }, { "epoch": 0.247875, "grad_norm": 3.03125, "grad_norm_var": 0.022541300455729166, "learning_rate": 0.0001, "loss": 8.0848, "loss/crossentropy": 2.434194803237915, "loss/hidden": 3.203125, "loss/jsd": 0.0, "loss/logits": 0.24056891351938248, "step": 3966 }, { "epoch": 0.248, "grad_norm": 2.671875, "grad_norm_var": 0.03730061848958333, "learning_rate": 0.0001, "loss": 7.9573, "loss/crossentropy": 2.1523804664611816, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.24403362721204758, "step": 3968 }, { "epoch": 0.248125, "grad_norm": 2.765625, "grad_norm_var": 0.03892822265625, "learning_rate": 0.0001, "loss": 7.8268, "loss/crossentropy": 2.2469995617866516, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2514503300189972, "step": 3970 }, { "epoch": 0.24825, "grad_norm": 2.703125, "grad_norm_var": 0.0411285400390625, "learning_rate": 0.0001, "loss": 7.9338, "loss/crossentropy": 2.1976422667503357, "loss/hidden": 3.0546875, "loss/jsd": 0.0, "loss/logits": 0.24978452920913696, "step": 3972 }, { "epoch": 0.248375, "grad_norm": 2.84375, "grad_norm_var": 0.03876851399739583, "learning_rate": 0.0001, "loss": 7.6979, "loss/crossentropy": 2.2436105012893677, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.2375164031982422, "step": 3974 }, { "epoch": 0.2485, "grad_norm": 2.796875, "grad_norm_var": 0.037718709309895834, "learning_rate": 0.0001, "loss": 7.8885, "loss/crossentropy": 2.1020787954330444, "loss/hidden": 3.0703125, "loss/jsd": 0.0, "loss/logits": 0.22866419702768326, "step": 3976 }, { "epoch": 0.248625, "grad_norm": 3.296875, "grad_norm_var": 0.047362263997395834, "learning_rate": 0.0001, "loss": 8.104, "loss/crossentropy": 2.442590594291687, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.2711693271994591, "step": 3978 }, { "epoch": 0.24875, "grad_norm": 2.890625, "grad_norm_var": 0.03736063639322917, "learning_rate": 0.0001, "loss": 7.8339, "loss/crossentropy": 2.3493313789367676, "loss/hidden": 3.1171875, "loss/jsd": 0.0, "loss/logits": 0.24219609797000885, "step": 3980 }, { "epoch": 0.248875, "grad_norm": 2.625, "grad_norm_var": 0.0386383056640625, "learning_rate": 0.0001, "loss": 7.7534, "loss/crossentropy": 2.0476667881011963, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.239098958671093, "step": 3982 }, { "epoch": 0.249, "grad_norm": 2.703125, "grad_norm_var": 0.025129191080729165, "learning_rate": 0.0001, "loss": 8.087, "loss/crossentropy": 2.547991156578064, "loss/hidden": 3.1796875, "loss/jsd": 0.0, "loss/logits": 0.2603266090154648, "step": 3984 }, { "epoch": 0.249125, "grad_norm": 2.828125, "grad_norm_var": 0.0277984619140625, "learning_rate": 0.0001, "loss": 7.9039, "loss/crossentropy": 2.645310878753662, "loss/hidden": 3.140625, "loss/jsd": 0.0, "loss/logits": 0.2637501657009125, "step": 3986 }, { "epoch": 0.24925, "grad_norm": 2.90625, "grad_norm_var": 0.031078084309895834, "learning_rate": 0.0001, "loss": 7.9646, "loss/crossentropy": 2.2159535884857178, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.25144505500793457, "step": 3988 }, { "epoch": 0.249375, "grad_norm": 2.890625, "grad_norm_var": 0.03127339680989583, "learning_rate": 0.0001, "loss": 7.9682, "loss/crossentropy": 2.3531523942947388, "loss/hidden": 3.15625, "loss/jsd": 0.0, "loss/logits": 0.2576625347137451, "step": 3990 }, { "epoch": 0.2495, "grad_norm": 2.71875, "grad_norm_var": 0.032515462239583334, "learning_rate": 0.0001, "loss": 7.9189, "loss/crossentropy": 2.2137093544006348, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2575615271925926, "step": 3992 }, { "epoch": 0.249625, "grad_norm": 2.671875, "grad_norm_var": 0.021842447916666667, "learning_rate": 0.0001, "loss": 7.8345, "loss/crossentropy": 2.3585588932037354, "loss/hidden": 3.1875, "loss/jsd": 0.0, "loss/logits": 0.2596537619829178, "step": 3994 }, { "epoch": 0.24975, "grad_norm": 3.015625, "grad_norm_var": 0.027074178059895832, "learning_rate": 0.0001, "loss": 7.9586, "loss/crossentropy": 2.2737538814544678, "loss/hidden": 3.1015625, "loss/jsd": 0.0, "loss/logits": 0.25356176495552063, "step": 3996 }, { "epoch": 0.249875, "grad_norm": 2.921875, "grad_norm_var": 0.0232330322265625, "learning_rate": 0.0001, "loss": 7.9267, "loss/crossentropy": 2.1807756423950195, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2504650503396988, "step": 3998 }, { "epoch": 0.25, "grad_norm": 2.9375, "grad_norm_var": 0.01959228515625, "learning_rate": 0.0001, "loss": 7.9314, "loss/crossentropy": 2.3070114850997925, "loss/hidden": 3.125, "loss/jsd": 0.0, "loss/logits": 0.2451673448085785, "step": 4000 } ], "logging_steps": 2, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.16590621310976e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }