{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.025, "eval_steps": 1000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.5e-05, "grad_norm": 11.25, "learning_rate": 1.0000000000000002e-06, "loss": 1607.1185, "loss/crossentropy": 0.4998045265674591, "loss/hidden": 0.2041015625, "loss/logits": 0.007406285032629967, "loss/reg": 1606.4072265625, "loss/twn": 0.0, "step": 1 }, { "epoch": 5e-05, "grad_norm": 25.625, "learning_rate": 2.0000000000000003e-06, "loss": 1604.4435, "loss/crossentropy": 1.6524670124053955, "loss/hidden": 0.1357421875, "loss/logits": 0.006067799869924784, "loss/reg": 1602.649169921875, "loss/twn": 0.0, "step": 2 }, { "epoch": 7.5e-05, "grad_norm": 9.9375, "learning_rate": 3e-06, "loss": 1547.7074, "loss/crossentropy": 1.9341739416122437, "loss/hidden": 0.15234375, "loss/logits": 0.014470485970377922, "loss/reg": 1545.6063232421875, "loss/twn": 0.0, "step": 3 }, { "epoch": 0.0001, "grad_norm": 14.375, "learning_rate": 4.000000000000001e-06, "loss": 1500.7928, "loss/crossentropy": 2.407871723175049, "loss/hidden": 0.1875, "loss/logits": 0.01105956919491291, "loss/reg": 1498.1864013671875, "loss/twn": 0.0, "step": 4 }, { "epoch": 0.000125, "grad_norm": 9.875, "learning_rate": 5e-06, "loss": 1421.8827, "loss/crossentropy": 1.7022260427474976, "loss/hidden": 0.10546875, "loss/logits": 0.00693091843277216, "loss/reg": 1420.0679931640625, "loss/twn": 0.0, "step": 5 }, { "epoch": 0.00015, "grad_norm": 470.0, "learning_rate": 6e-06, "loss": 1315.2301, "loss/crossentropy": 1.3705801963806152, "loss/hidden": 0.181640625, "loss/logits": 0.002700040116906166, "loss/reg": 1313.6751708984375, "loss/twn": 0.0, "step": 6 }, { "epoch": 0.000175, "grad_norm": 12.4375, "learning_rate": 7.000000000000001e-06, "loss": 1187.7322, "loss/crossentropy": 1.8566981554031372, "loss/hidden": 0.083984375, "loss/logits": 0.004405488260090351, "loss/reg": 1185.787109375, "loss/twn": 0.0, "step": 7 }, { "epoch": 0.0002, "grad_norm": 20.75, "learning_rate": 8.000000000000001e-06, "loss": 1041.5831, "loss/crossentropy": 2.76304030418396, "loss/hidden": 0.14453125, "loss/logits": 0.009587295353412628, "loss/reg": 1038.6658935546875, "loss/twn": 0.0, "step": 8 }, { "epoch": 0.000225, "grad_norm": 280.0, "learning_rate": 9e-06, "loss": 889.5813, "loss/crossentropy": 2.0730843544006348, "loss/hidden": 0.1767578125, "loss/logits": 0.009802292101085186, "loss/reg": 887.3216552734375, "loss/twn": 0.0, "step": 9 }, { "epoch": 0.00025, "grad_norm": 13.8125, "learning_rate": 1e-05, "loss": 739.1588, "loss/crossentropy": 2.678976535797119, "loss/hidden": 0.1357421875, "loss/logits": 0.01222484465688467, "loss/reg": 736.3318481445312, "loss/twn": 0.0, "step": 10 }, { "epoch": 0.000275, "grad_norm": 19.5, "learning_rate": 1.1000000000000001e-05, "loss": 602.8989, "loss/crossentropy": 1.9476336240768433, "loss/hidden": 0.1865234375, "loss/logits": 0.007580972742289305, "loss/reg": 600.7571411132812, "loss/twn": 0.0, "step": 11 }, { "epoch": 0.0003, "grad_norm": 9.875, "learning_rate": 1.2e-05, "loss": 486.0685, "loss/crossentropy": 2.1175615787506104, "loss/hidden": 0.1767578125, "loss/logits": 0.008497287519276142, "loss/reg": 483.76568603515625, "loss/twn": 0.0, "step": 12 }, { "epoch": 0.000325, "grad_norm": 14.0625, "learning_rate": 1.3000000000000001e-05, "loss": 389.9533, "loss/crossentropy": 1.2835053205490112, "loss/hidden": 0.10498046875, "loss/logits": 0.008934162557125092, "loss/reg": 388.5558776855469, "loss/twn": 0.0, "step": 13 }, { "epoch": 0.00035, "grad_norm": 19.5, "learning_rate": 1.4000000000000001e-05, "loss": 313.737, "loss/crossentropy": 1.8903541564941406, "loss/hidden": 0.1103515625, "loss/logits": 0.008576408959925175, "loss/reg": 311.7276916503906, "loss/twn": 0.0, "step": 14 }, { "epoch": 0.000375, "grad_norm": 32.25, "learning_rate": 1.5e-05, "loss": 253.3787, "loss/crossentropy": 1.3272770643234253, "loss/hidden": 0.2275390625, "loss/logits": 0.009392762556672096, "loss/reg": 251.8144989013672, "loss/twn": 0.0, "step": 15 }, { "epoch": 0.0004, "grad_norm": 10.6875, "grad_norm_var": 16279.171077473959, "learning_rate": 1.6000000000000003e-05, "loss": 207.3934, "loss/crossentropy": 1.8237905502319336, "loss/hidden": 0.0986328125, "loss/logits": 0.0111556276679039, "loss/reg": 205.45985412597656, "loss/twn": 0.0, "step": 16 }, { "epoch": 0.000425, "grad_norm": 115.5, "grad_norm_var": 16268.691780598958, "learning_rate": 1.7000000000000003e-05, "loss": 171.5853, "loss/crossentropy": 1.6593583822250366, "loss/hidden": 0.140625, "loss/logits": 0.004231919534504414, "loss/reg": 169.7810516357422, "loss/twn": 0.0, "step": 17 }, { "epoch": 0.00045, "grad_norm": 16.375, "grad_norm_var": 16325.545556640625, "learning_rate": 1.8e-05, "loss": 143.6776, "loss/crossentropy": 0.922300398349762, "loss/hidden": 0.22265625, "loss/logits": 0.012654997408390045, "loss/reg": 142.52001953125, "loss/twn": 0.0, "step": 18 }, { "epoch": 0.000475, "grad_norm": 228.0, "grad_norm_var": 17643.971875, "learning_rate": 1.9e-05, "loss": 123.6267, "loss/crossentropy": 1.7576591968536377, "loss/hidden": 0.12255859375, "loss/logits": 0.0077894763089716434, "loss/reg": 121.73872375488281, "loss/twn": 0.0, "step": 19 }, { "epoch": 0.0005, "grad_norm": 15.3125, "grad_norm_var": 17635.768994140624, "learning_rate": 2e-05, "loss": 107.5298, "loss/crossentropy": 1.702596664428711, "loss/hidden": 0.193359375, "loss/logits": 0.012834219262003899, "loss/reg": 105.62105560302734, "loss/twn": 0.0, "step": 20 }, { "epoch": 0.000525, "grad_norm": 21.125, "grad_norm_var": 17537.747509765624, "learning_rate": 2.1e-05, "loss": 96.0316, "loss/crossentropy": 2.7474312782287598, "loss/hidden": 0.16796875, "loss/logits": 0.018948907032608986, "loss/reg": 93.09722900390625, "loss/twn": 0.0, "step": 21 }, { "epoch": 0.00055, "grad_norm": 12.8125, "grad_norm_var": 6900.875520833333, "learning_rate": 2.2000000000000003e-05, "loss": 85.865, "loss/crossentropy": 2.7010557651519775, "loss/hidden": 0.1005859375, "loss/logits": 0.004374333191663027, "loss/reg": 83.05902862548828, "loss/twn": 0.0, "step": 22 }, { "epoch": 0.000575, "grad_norm": 13.875, "grad_norm_var": 6893.302067057291, "learning_rate": 2.3000000000000003e-05, "loss": 78.0325, "loss/crossentropy": 2.801321029663086, "loss/hidden": 0.146484375, "loss/logits": 0.019743533805012703, "loss/reg": 75.06490325927734, "loss/twn": 0.0, "step": 23 }, { "epoch": 0.0006, "grad_norm": 11.625, "grad_norm_var": 6937.396728515625, "learning_rate": 2.4e-05, "loss": 70.1903, "loss/crossentropy": 1.5617326498031616, "loss/hidden": 0.134765625, "loss/logits": 0.006116375792771578, "loss/reg": 68.48768615722656, "loss/twn": 0.0, "step": 24 }, { "epoch": 0.000625, "grad_norm": 32.0, "grad_norm_var": 3246.975895182292, "learning_rate": 2.5e-05, "loss": 64.4648, "loss/crossentropy": 1.4777029752731323, "loss/hidden": 0.158203125, "loss/logits": 0.007067938335239887, "loss/reg": 62.821861267089844, "loss/twn": 0.0, "step": 25 }, { "epoch": 0.00065, "grad_norm": 11.3125, "grad_norm_var": 3254.977197265625, "learning_rate": 2.6000000000000002e-05, "loss": 60.7985, "loss/crossentropy": 2.695087432861328, "loss/hidden": 0.154296875, "loss/logits": 0.009972814470529556, "loss/reg": 57.93910217285156, "loss/twn": 0.0, "step": 26 }, { "epoch": 0.000675, "grad_norm": 9.9375, "grad_norm_var": 3282.35234375, "learning_rate": 2.7000000000000002e-05, "loss": 54.9976, "loss/crossentropy": 1.1553270816802979, "loss/hidden": 0.13671875, "loss/logits": 0.008951587602496147, "loss/reg": 53.69655990600586, "loss/twn": 0.0, "step": 27 }, { "epoch": 0.0007, "grad_norm": 20.0, "grad_norm_var": 3253.6384765625, "learning_rate": 2.8000000000000003e-05, "loss": 54.2425, "loss/crossentropy": 4.120519638061523, "loss/hidden": 0.1171875, "loss/logits": 0.012582110241055489, "loss/reg": 49.99223709106445, "loss/twn": 0.0, "step": 28 }, { "epoch": 0.000725, "grad_norm": 12.3125, "grad_norm_var": 3259.070768229167, "learning_rate": 2.9e-05, "loss": 49.3116, "loss/crossentropy": 2.52665638923645, "loss/hidden": 0.11474609375, "loss/logits": 0.009403377771377563, "loss/reg": 46.660804748535156, "loss/twn": 0.0, "step": 29 }, { "epoch": 0.00075, "grad_norm": 10.375, "grad_norm_var": 3284.8536458333333, "learning_rate": 3e-05, "loss": 44.966, "loss/crossentropy": 1.1925100088119507, "loss/hidden": 0.08837890625, "loss/logits": 0.002639985177665949, "loss/reg": 43.682464599609375, "loss/twn": 0.0, "step": 30 }, { "epoch": 0.000775, "grad_norm": 67.5, "grad_norm_var": 3345.6231770833333, "learning_rate": 3.1e-05, "loss": 42.8695, "loss/crossentropy": 1.6953678131103516, "loss/hidden": 0.2060546875, "loss/logits": 0.011261125095188618, "loss/reg": 40.95684814453125, "loss/twn": 0.0, "step": 31 }, { "epoch": 0.0008, "grad_norm": 22.5, "grad_norm_var": 3311.253108723958, "learning_rate": 3.2000000000000005e-05, "loss": 41.8117, "loss/crossentropy": 3.090846538543701, "loss/hidden": 0.2451171875, "loss/logits": 0.015867076814174652, "loss/reg": 38.459877014160156, "loss/twn": 0.0, "step": 32 }, { "epoch": 0.000825, "grad_norm": 14.8125, "grad_norm_var": 2914.9796223958333, "learning_rate": 3.3e-05, "loss": 38.0881, "loss/crossentropy": 1.7205989360809326, "loss/hidden": 0.1650390625, "loss/logits": 0.013207211159169674, "loss/reg": 36.18929672241211, "loss/twn": 0.0, "step": 33 }, { "epoch": 0.00085, "grad_norm": 17.75, "grad_norm_var": 2912.14296875, "learning_rate": 3.4000000000000007e-05, "loss": 35.2028, "loss/crossentropy": 0.8660376667976379, "loss/hidden": 0.1865234375, "loss/logits": 0.012940528802573681, "loss/reg": 34.13732147216797, "loss/twn": 0.0, "step": 34 }, { "epoch": 0.000875, "grad_norm": 12.5, "grad_norm_var": 199.53671875, "learning_rate": 3.5e-05, "loss": 34.9671, "loss/crossentropy": 2.679326057434082, "loss/hidden": 0.044921875, "loss/logits": 0.005477376747876406, "loss/reg": 32.237335205078125, "loss/twn": 0.0, "step": 35 }, { "epoch": 0.0009, "grad_norm": 13.1875, "grad_norm_var": 200.8947265625, "learning_rate": 3.6e-05, "loss": 32.3725, "loss/crossentropy": 1.6193571090698242, "loss/hidden": 0.2314453125, "loss/logits": 0.016166094690561295, "loss/reg": 30.505502700805664, "loss/twn": 0.0, "step": 36 }, { "epoch": 0.000925, "grad_norm": 13.5625, "grad_norm_var": 202.30284830729167, "learning_rate": 3.7e-05, "loss": 31.5358, "loss/crossentropy": 2.4860172271728516, "loss/hidden": 0.1572265625, "loss/logits": 0.010934200137853622, "loss/reg": 28.881595611572266, "loss/twn": 0.0, "step": 37 }, { "epoch": 0.00095, "grad_norm": 41.5, "grad_norm_var": 231.96901041666666, "learning_rate": 3.8e-05, "loss": 30.499, "loss/crossentropy": 2.840606689453125, "loss/hidden": 0.2138671875, "loss/logits": 0.017023704946041107, "loss/reg": 27.427488327026367, "loss/twn": 0.0, "step": 38 }, { "epoch": 0.000975, "grad_norm": 39.75, "grad_norm_var": 251.65826822916668, "learning_rate": 3.9000000000000006e-05, "loss": 27.702, "loss/crossentropy": 1.4538397789001465, "loss/hidden": 0.19921875, "loss/logits": 0.01595349609851837, "loss/reg": 26.032983779907227, "loss/twn": 0.0, "step": 39 }, { "epoch": 0.001, "grad_norm": 9.5, "grad_norm_var": 254.85572916666666, "learning_rate": 4e-05, "loss": 27.0967, "loss/crossentropy": 2.216383695602417, "loss/hidden": 0.10595703125, "loss/logits": 0.006870034150779247, "loss/reg": 24.76752471923828, "loss/twn": 0.0, "step": 40 }, { "epoch": 0.001025, "grad_norm": 11.625, "grad_norm_var": 253.04108072916668, "learning_rate": 4.1e-05, "loss": 25.7473, "loss/crossentropy": 2.1163833141326904, "loss/hidden": 0.06689453125, "loss/logits": 0.004073521587997675, "loss/reg": 23.559967041015625, "loss/twn": 0.0, "step": 41 }, { "epoch": 0.00105, "grad_norm": 14.0, "grad_norm_var": 250.197509765625, "learning_rate": 4.2e-05, "loss": 23.4299, "loss/crossentropy": 0.7737110257148743, "loss/hidden": 0.1748046875, "loss/logits": 0.009298819117248058, "loss/reg": 22.47205352783203, "loss/twn": 0.0, "step": 42 }, { "epoch": 0.001075, "grad_norm": 12.75, "grad_norm_var": 246.6650390625, "learning_rate": 4.3e-05, "loss": 23.3947, "loss/crossentropy": 1.8225781917572021, "loss/hidden": 0.1123046875, "loss/logits": 0.006940089166164398, "loss/reg": 21.452856063842773, "loss/twn": 0.0, "step": 43 }, { "epoch": 0.0011, "grad_norm": 82.0, "grad_norm_var": 479.8754557291667, "learning_rate": 4.4000000000000006e-05, "loss": 23.171, "loss/crossentropy": 2.5216610431671143, "loss/hidden": 0.1396484375, "loss/logits": 0.01111547276377678, "loss/reg": 20.498552322387695, "loss/twn": 0.0, "step": 44 }, { "epoch": 0.001125, "grad_norm": 10.75, "grad_norm_var": 482.614306640625, "learning_rate": 4.5e-05, "loss": 22.58, "loss/crossentropy": 2.7781012058258057, "loss/hidden": 0.158203125, "loss/logits": 0.008185407146811485, "loss/reg": 19.63548469543457, "loss/twn": 0.0, "step": 45 }, { "epoch": 0.00115, "grad_norm": 27.125, "grad_norm_var": 468.31573893229165, "learning_rate": 4.600000000000001e-05, "loss": 21.6325, "loss/crossentropy": 2.6625006198883057, "loss/hidden": 0.1494140625, "loss/logits": 0.009840598329901695, "loss/reg": 18.810749053955078, "loss/twn": 0.0, "step": 46 }, { "epoch": 0.001175, "grad_norm": 20.5, "grad_norm_var": 344.27980143229166, "learning_rate": 4.7e-05, "loss": 20.5522, "loss/crossentropy": 2.3305137157440186, "loss/hidden": 0.1484375, "loss/logits": 0.016644544899463654, "loss/reg": 18.056581497192383, "loss/twn": 0.0, "step": 47 }, { "epoch": 0.0012, "grad_norm": 11.25, "grad_norm_var": 352.54737955729166, "learning_rate": 4.8e-05, "loss": 18.7184, "loss/crossentropy": 1.2184098958969116, "loss/hidden": 0.1455078125, "loss/logits": 0.0094651710242033, "loss/reg": 17.344999313354492, "loss/twn": 0.0, "step": 48 }, { "epoch": 0.001225, "grad_norm": 9.5, "grad_norm_var": 359.42734375, "learning_rate": 4.9e-05, "loss": 19.2773, "loss/crossentropy": 2.487840414047241, "loss/hidden": 0.0986328125, "loss/logits": 0.004987399093806744, "loss/reg": 16.685823440551758, "loss/twn": 0.0, "step": 49 }, { "epoch": 0.00125, "grad_norm": 15.1875, "grad_norm_var": 361.1883951822917, "learning_rate": 5e-05, "loss": 18.9783, "loss/crossentropy": 2.735170602798462, "loss/hidden": 0.166015625, "loss/logits": 0.011278904974460602, "loss/reg": 16.065805435180664, "loss/twn": 0.0, "step": 50 }, { "epoch": 0.001275, "grad_norm": 15.75, "grad_norm_var": 357.929931640625, "learning_rate": 5.1000000000000006e-05, "loss": 17.1984, "loss/crossentropy": 1.4663747549057007, "loss/hidden": 0.2353515625, "loss/logits": 0.003095359541475773, "loss/reg": 15.49356746673584, "loss/twn": 0.0, "step": 51 }, { "epoch": 0.0013, "grad_norm": 9.9375, "grad_norm_var": 362.29881184895834, "learning_rate": 5.2000000000000004e-05, "loss": 16.953, "loss/crossentropy": 1.918389916419983, "loss/hidden": 0.0791015625, "loss/logits": 0.0031922967173159122, "loss/reg": 14.95230484008789, "loss/twn": 0.0, "step": 52 }, { "epoch": 0.001325, "grad_norm": 10.6875, "grad_norm_var": 365.8745930989583, "learning_rate": 5.300000000000001e-05, "loss": 16.8909, "loss/crossentropy": 2.3705666065216064, "loss/hidden": 0.07275390625, "loss/logits": 0.0030757079366594553, "loss/reg": 14.444525718688965, "loss/twn": 0.0, "step": 53 }, { "epoch": 0.00135, "grad_norm": 16.75, "grad_norm_var": 337.7085774739583, "learning_rate": 5.4000000000000005e-05, "loss": 16.1047, "loss/crossentropy": 1.9884377717971802, "loss/hidden": 0.12890625, "loss/logits": 0.010179271921515465, "loss/reg": 13.977179527282715, "loss/twn": 0.0, "step": 54 }, { "epoch": 0.001375, "grad_norm": 13.375, "grad_norm_var": 311.08631184895836, "learning_rate": 5.500000000000001e-05, "loss": 16.2292, "loss/crossentropy": 2.642868995666504, "loss/hidden": 0.04736328125, "loss/logits": 0.004405863583087921, "loss/reg": 13.5346097946167, "loss/twn": 0.0, "step": 55 }, { "epoch": 0.0014, "grad_norm": 20.875, "grad_norm_var": 306.026806640625, "learning_rate": 5.6000000000000006e-05, "loss": 14.8051, "loss/crossentropy": 1.465383529663086, "loss/hidden": 0.2080078125, "loss/logits": 0.00864885188639164, "loss/reg": 13.123102188110352, "loss/twn": 0.0, "step": 56 }, { "epoch": 0.001425, "grad_norm": 15.875, "grad_norm_var": 303.045166015625, "learning_rate": 5.6999999999999996e-05, "loss": 14.4812, "loss/crossentropy": 1.640454649925232, "loss/hidden": 0.09765625, "loss/logits": 0.007794347126036882, "loss/reg": 12.735280990600586, "loss/twn": 0.0, "step": 57 }, { "epoch": 0.00145, "grad_norm": 16.5, "grad_norm_var": 301.720947265625, "learning_rate": 5.8e-05, "loss": 14.5378, "loss/crossentropy": 2.0751516819000244, "loss/hidden": 0.07861328125, "loss/logits": 0.006516133435070515, "loss/reg": 12.377544403076172, "loss/twn": 0.0, "step": 58 }, { "epoch": 0.001475, "grad_norm": 20.875, "grad_norm_var": 298.750244140625, "learning_rate": 5.9e-05, "loss": 14.1279, "loss/crossentropy": 1.9119625091552734, "loss/hidden": 0.1552734375, "loss/logits": 0.021668870002031326, "loss/reg": 12.038968086242676, "loss/twn": 0.0, "step": 59 }, { "epoch": 0.0015, "grad_norm": 13.5625, "grad_norm_var": 23.984375, "learning_rate": 6e-05, "loss": 12.6629, "loss/crossentropy": 0.7274801731109619, "loss/hidden": 0.203125, "loss/logits": 0.009123459458351135, "loss/reg": 11.72317123413086, "loss/twn": 0.0, "step": 60 }, { "epoch": 0.001525, "grad_norm": 11.875, "grad_norm_var": 23.3462890625, "learning_rate": 6.1e-05, "loss": 14.3076, "loss/crossentropy": 2.738680601119995, "loss/hidden": 0.1396484375, "loss/logits": 0.010733511298894882, "loss/reg": 11.418492317199707, "loss/twn": 0.0, "step": 61 }, { "epoch": 0.00155, "grad_norm": 41.25, "grad_norm_var": 57.518489583333334, "learning_rate": 6.2e-05, "loss": 13.614, "loss/crossentropy": 2.1940059661865234, "loss/hidden": 0.26171875, "loss/logits": 0.016079768538475037, "loss/reg": 11.142221450805664, "loss/twn": 0.0, "step": 62 }, { "epoch": 0.001575, "grad_norm": 16.25, "grad_norm_var": 56.371875, "learning_rate": 6.3e-05, "loss": 12.6424, "loss/crossentropy": 1.5363647937774658, "loss/hidden": 0.220703125, "loss/logits": 0.009181533940136433, "loss/reg": 10.876102447509766, "loss/twn": 0.0, "step": 63 }, { "epoch": 0.0016, "grad_norm": 8.25, "grad_norm_var": 58.921875, "learning_rate": 6.400000000000001e-05, "loss": 12.6573, "loss/crossentropy": 1.9360976219177246, "loss/hidden": 0.08642578125, "loss/logits": 0.00900467112660408, "loss/reg": 10.625749588012695, "loss/twn": 0.0, "step": 64 }, { "epoch": 0.001625, "grad_norm": 25.0, "grad_norm_var": 60.43958333333333, "learning_rate": 6.500000000000001e-05, "loss": 12.8598, "loss/crossentropy": 2.2861106395721436, "loss/hidden": 0.169921875, "loss/logits": 0.008098036982119083, "loss/reg": 10.395671844482422, "loss/twn": 0.0, "step": 65 }, { "epoch": 0.00165, "grad_norm": 10.5, "grad_norm_var": 62.94568684895833, "learning_rate": 6.6e-05, "loss": 10.8882, "loss/crossentropy": 0.5159875154495239, "loss/hidden": 0.1884765625, "loss/logits": 0.007731384597718716, "loss/reg": 10.176012992858887, "loss/twn": 0.0, "step": 66 }, { "epoch": 0.001675, "grad_norm": 15.4375, "grad_norm_var": 62.99166666666667, "learning_rate": 6.7e-05, "loss": 11.7439, "loss/crossentropy": 1.6010075807571411, "loss/hidden": 0.16796875, "loss/logits": 0.0077722882851958275, "loss/reg": 9.96713924407959, "loss/twn": 0.0, "step": 67 }, { "epoch": 0.0017, "grad_norm": 8.125, "grad_norm_var": 64.82823893229167, "learning_rate": 6.800000000000001e-05, "loss": 11.7136, "loss/crossentropy": 1.8642301559448242, "loss/hidden": 0.07177734375, "loss/logits": 0.003159617306664586, "loss/reg": 9.774468421936035, "loss/twn": 0.0, "step": 68 }, { "epoch": 0.001725, "grad_norm": 12.75, "grad_norm_var": 63.475260416666664, "learning_rate": 6.9e-05, "loss": 11.2146, "loss/crossentropy": 1.5259939432144165, "loss/hidden": 0.0927734375, "loss/logits": 0.0040178182534873486, "loss/reg": 9.591811180114746, "loss/twn": 0.0, "step": 69 }, { "epoch": 0.00175, "grad_norm": 13.25, "grad_norm_var": 64.21901041666666, "learning_rate": 7e-05, "loss": 10.007, "loss/crossentropy": 0.4211646616458893, "loss/hidden": 0.1630859375, "loss/logits": 0.006910983473062515, "loss/reg": 9.415842056274414, "loss/twn": 0.0, "step": 70 }, { "epoch": 0.001775, "grad_norm": 15.1875, "grad_norm_var": 63.672900390625, "learning_rate": 7.1e-05, "loss": 11.3191, "loss/crossentropy": 1.91142737865448, "loss/hidden": 0.1416015625, "loss/logits": 0.012339383363723755, "loss/reg": 9.253693580627441, "loss/twn": 0.0, "step": 71 }, { "epoch": 0.0018, "grad_norm": 25.125, "grad_norm_var": 67.225634765625, "learning_rate": 7.2e-05, "loss": 9.6954, "loss/crossentropy": 0.3831652104854584, "loss/hidden": 0.2060546875, "loss/logits": 0.007283635437488556, "loss/reg": 9.098925590515137, "loss/twn": 0.0, "step": 72 }, { "epoch": 0.001825, "grad_norm": 12.375, "grad_norm_var": 68.45245768229167, "learning_rate": 7.3e-05, "loss": 11.8289, "loss/crossentropy": 2.7114861011505127, "loss/hidden": 0.1572265625, "loss/logits": 0.00806540995836258, "loss/reg": 8.952132225036621, "loss/twn": 0.0, "step": 73 }, { "epoch": 0.00185, "grad_norm": 18.0, "grad_norm_var": 68.56417643229166, "learning_rate": 7.4e-05, "loss": 9.9016, "loss/crossentropy": 0.9610092043876648, "loss/hidden": 0.12255859375, "loss/logits": 0.005808320362120867, "loss/reg": 8.812213897705078, "loss/twn": 0.0, "step": 74 }, { "epoch": 0.001875, "grad_norm": 11.75, "grad_norm_var": 68.73527018229167, "learning_rate": 7.500000000000001e-05, "loss": 11.33, "loss/crossentropy": 2.4811081886291504, "loss/hidden": 0.154296875, "loss/logits": 0.012214528396725655, "loss/reg": 8.682340621948242, "loss/twn": 0.0, "step": 75 }, { "epoch": 0.0019, "grad_norm": 16.75, "grad_norm_var": 68.26295572916666, "learning_rate": 7.6e-05, "loss": 10.7107, "loss/crossentropy": 1.9060146808624268, "loss/hidden": 0.2294921875, "loss/logits": 0.016750231385231018, "loss/reg": 8.558440208435059, "loss/twn": 0.0, "step": 76 }, { "epoch": 0.001925, "grad_norm": 11.5, "grad_norm_var": 68.49635416666666, "learning_rate": 7.7e-05, "loss": 10.9686, "loss/crossentropy": 2.370375394821167, "loss/hidden": 0.1435546875, "loss/logits": 0.014391288161277771, "loss/reg": 8.440238952636719, "loss/twn": 0.0, "step": 77 }, { "epoch": 0.00195, "grad_norm": 30.0, "grad_norm_var": 39.04713541666667, "learning_rate": 7.800000000000001e-05, "loss": 11.235, "loss/crossentropy": 2.7426469326019287, "loss/hidden": 0.1552734375, "loss/logits": 0.012605298310518265, "loss/reg": 8.32447624206543, "loss/twn": 0.0, "step": 78 }, { "epoch": 0.001975, "grad_norm": 12.8125, "grad_norm_var": 39.50636393229167, "learning_rate": 7.900000000000001e-05, "loss": 10.4604, "loss/crossentropy": 2.1269540786743164, "loss/hidden": 0.10986328125, "loss/logits": 0.0066815330646932125, "loss/reg": 8.216917991638184, "loss/twn": 0.0, "step": 79 }, { "epoch": 0.002, "grad_norm": 13.625, "grad_norm_var": 36.169384765625, "learning_rate": 8e-05, "loss": 10.9456, "loss/crossentropy": 2.6664817333221436, "loss/hidden": 0.15234375, "loss/logits": 0.011289350688457489, "loss/reg": 8.115513801574707, "loss/twn": 0.0, "step": 80 }, { "epoch": 0.002025, "grad_norm": 32.25, "grad_norm_var": 48.38487955729167, "learning_rate": 8.1e-05, "loss": 10.8641, "loss/crossentropy": 2.6699304580688477, "loss/hidden": 0.1640625, "loss/logits": 0.010942000895738602, "loss/reg": 8.019161224365234, "loss/twn": 0.0, "step": 81 }, { "epoch": 0.00205, "grad_norm": 21.875, "grad_norm_var": 47.804280598958336, "learning_rate": 8.2e-05, "loss": 10.4686, "loss/crossentropy": 2.2571003437042236, "loss/hidden": 0.267578125, "loss/logits": 0.021330825984477997, "loss/reg": 7.922557353973389, "loss/twn": 0.0, "step": 82 }, { "epoch": 0.002075, "grad_norm": 11.1875, "grad_norm_var": 49.776546223958334, "learning_rate": 8.3e-05, "loss": 10.8712, "loss/crossentropy": 2.8793890476226807, "loss/hidden": 0.1435546875, "loss/logits": 0.0122376699000597, "loss/reg": 7.835977077484131, "loss/twn": 0.0, "step": 83 }, { "epoch": 0.0021, "grad_norm": 12.125, "grad_norm_var": 46.224462890625, "learning_rate": 8.4e-05, "loss": 10.6407, "loss/crossentropy": 2.8739991188049316, "loss/hidden": 0.0140380859375, "loss/logits": 0.0031395466066896915, "loss/reg": 7.749497413635254, "loss/twn": 0.0, "step": 84 }, { "epoch": 0.002125, "grad_norm": 224.0, "grad_norm_var": 2718.206884765625, "learning_rate": 8.5e-05, "loss": 9.9904, "loss/crossentropy": 2.1338188648223877, "loss/hidden": 0.1748046875, "loss/logits": 0.013754406943917274, "loss/reg": 7.668061256408691, "loss/twn": 0.0, "step": 85 }, { "epoch": 0.00215, "grad_norm": 9.0625, "grad_norm_var": 2728.7181640625, "learning_rate": 8.6e-05, "loss": 10.4151, "loss/crossentropy": 2.6691489219665527, "loss/hidden": 0.1455078125, "loss/logits": 0.011645539663732052, "loss/reg": 7.588749885559082, "loss/twn": 0.0, "step": 86 }, { "epoch": 0.002175, "grad_norm": 23.375, "grad_norm_var": 2716.899593098958, "learning_rate": 8.7e-05, "loss": 9.2735, "loss/crossentropy": 1.5923405885696411, "loss/hidden": 0.1591796875, "loss/logits": 0.006691344082355499, "loss/reg": 7.515244007110596, "loss/twn": 0.0, "step": 87 }, { "epoch": 0.0022, "grad_norm": 11.3125, "grad_norm_var": 2738.4708333333333, "learning_rate": 8.800000000000001e-05, "loss": 10.0973, "loss/crossentropy": 2.563422679901123, "loss/hidden": 0.08642578125, "loss/logits": 0.006290389224886894, "loss/reg": 7.441190242767334, "loss/twn": 0.0, "step": 88 }, { "epoch": 0.002225, "grad_norm": 16.75, "grad_norm_var": 2729.6775390625, "learning_rate": 8.900000000000001e-05, "loss": 10.1917, "loss/crossentropy": 2.605319023132324, "loss/hidden": 0.1845703125, "loss/logits": 0.029636088758707047, "loss/reg": 7.372167587280273, "loss/twn": 0.0, "step": 89 }, { "epoch": 0.00225, "grad_norm": 17.0, "grad_norm_var": 2731.3098307291666, "learning_rate": 9e-05, "loss": 9.2722, "loss/crossentropy": 1.7703652381896973, "loss/hidden": 0.1845703125, "loss/logits": 0.011869278736412525, "loss/reg": 7.30535364151001, "loss/twn": 0.0, "step": 90 }, { "epoch": 0.002275, "grad_norm": 26.625, "grad_norm_var": 2709.51640625, "learning_rate": 9.1e-05, "loss": 10.0987, "loss/crossentropy": 2.770080327987671, "loss/hidden": 0.08154296875, "loss/logits": 0.006538551300764084, "loss/reg": 7.240530967712402, "loss/twn": 0.0, "step": 91 }, { "epoch": 0.0023, "grad_norm": 12.3125, "grad_norm_var": 2718.9657389322915, "learning_rate": 9.200000000000001e-05, "loss": 9.1897, "loss/crossentropy": 1.7330008745193481, "loss/hidden": 0.265625, "loss/logits": 0.012156343087553978, "loss/reg": 7.1789398193359375, "loss/twn": 0.0, "step": 92 }, { "epoch": 0.002325, "grad_norm": 113.5, "grad_norm_var": 3112.675113932292, "learning_rate": 9.300000000000001e-05, "loss": 8.4886, "loss/crossentropy": 1.163967490196228, "loss/hidden": 0.197265625, "loss/logits": 0.009277150966227055, "loss/reg": 7.11806058883667, "loss/twn": 0.0, "step": 93 }, { "epoch": 0.00235, "grad_norm": 37.25, "grad_norm_var": 3109.446598307292, "learning_rate": 9.4e-05, "loss": 9.3185, "loss/crossentropy": 2.135645627975464, "loss/hidden": 0.11767578125, "loss/logits": 0.0029190080240368843, "loss/reg": 7.06224250793457, "loss/twn": 0.0, "step": 94 }, { "epoch": 0.002375, "grad_norm": 8.3125, "grad_norm_var": 3125.339567057292, "learning_rate": 9.5e-05, "loss": 9.3855, "loss/crossentropy": 2.309610605239868, "loss/hidden": 0.06689453125, "loss/logits": 0.0035296978894621134, "loss/reg": 7.005456447601318, "loss/twn": 0.0, "step": 95 }, { "epoch": 0.0024, "grad_norm": 91.5, "grad_norm_var": 3262.5942545572916, "learning_rate": 9.6e-05, "loss": 9.9531, "loss/crossentropy": 2.7651376724243164, "loss/hidden": 0.2197265625, "loss/logits": 0.01677127555012703, "loss/reg": 6.951422214508057, "loss/twn": 0.0, "step": 96 }, { "epoch": 0.002425, "grad_norm": 35.25, "grad_norm_var": 3259.3458170572917, "learning_rate": 9.7e-05, "loss": 8.7807, "loss/crossentropy": 1.7207653522491455, "loss/hidden": 0.1494140625, "loss/logits": 0.011507261544466019, "loss/reg": 6.8990349769592285, "loss/twn": 0.0, "step": 97 }, { "epoch": 0.00245, "grad_norm": 13.75, "grad_norm_var": 3285.235791015625, "learning_rate": 9.8e-05, "loss": 8.475, "loss/crossentropy": 1.481154441833496, "loss/hidden": 0.140625, "loss/logits": 0.005128794349730015, "loss/reg": 6.848050594329834, "loss/twn": 0.0, "step": 98 }, { "epoch": 0.002475, "grad_norm": 10.1875, "grad_norm_var": 3289.334228515625, "learning_rate": 9.900000000000001e-05, "loss": 9.021, "loss/crossentropy": 2.196463108062744, "loss/hidden": 0.0234375, "loss/logits": 0.0013116542249917984, "loss/reg": 6.79979133605957, "loss/twn": 0.0, "step": 99 }, { "epoch": 0.0025, "grad_norm": 13.6875, "grad_norm_var": 3283.3889973958335, "learning_rate": 0.0001, "loss": 9.1231, "loss/crossentropy": 2.0860254764556885, "loss/hidden": 0.265625, "loss/logits": 0.0192781500518322, "loss/reg": 6.75217342376709, "loss/twn": 0.0, "step": 100 }, { "epoch": 0.002525, "grad_norm": 65.5, "grad_norm_var": 996.5311848958333, "learning_rate": 0.0001, "loss": 8.6978, "loss/crossentropy": 1.8436778783798218, "loss/hidden": 0.140625, "loss/logits": 0.006662796251475811, "loss/reg": 6.706822395324707, "loss/twn": 0.0, "step": 101 }, { "epoch": 0.00255, "grad_norm": 9.0625, "grad_norm_var": 996.5311848958333, "learning_rate": 0.0001, "loss": 9.2667, "loss/crossentropy": 2.4968836307525635, "loss/hidden": 0.0986328125, "loss/logits": 0.007922045886516571, "loss/reg": 6.663230895996094, "loss/twn": 0.0, "step": 102 }, { "epoch": 0.002575, "grad_norm": 11.1875, "grad_norm_var": 1019.1574055989583, "learning_rate": 0.0001, "loss": 8.2507, "loss/crossentropy": 1.475099802017212, "loss/hidden": 0.1484375, "loss/logits": 0.007549532223492861, "loss/reg": 6.619617938995361, "loss/twn": 0.0, "step": 103 }, { "epoch": 0.0026, "grad_norm": 13.5625, "grad_norm_var": 1013.6202962239583, "learning_rate": 0.0001, "loss": 9.2719, "loss/crossentropy": 2.5519533157348633, "loss/hidden": 0.1328125, "loss/logits": 0.00820184126496315, "loss/reg": 6.578925132751465, "loss/twn": 0.0, "step": 104 }, { "epoch": 0.002625, "grad_norm": 288.0, "grad_norm_var": 5098.051936848959, "learning_rate": 0.0001, "loss": 7.9889, "loss/crossentropy": 1.3079354763031006, "loss/hidden": 0.1328125, "loss/logits": 0.01036953553557396, "loss/reg": 6.537764072418213, "loss/twn": 0.0, "step": 105 }, { "epoch": 0.00265, "grad_norm": 17.5, "grad_norm_var": 5096.006363932292, "learning_rate": 0.0001, "loss": 9.3801, "loss/crossentropy": 2.7196500301361084, "loss/hidden": 0.1474609375, "loss/logits": 0.013073693960905075, "loss/reg": 6.49993371963501, "loss/twn": 0.0, "step": 106 }, { "epoch": 0.002675, "grad_norm": 772.0, "grad_norm_var": 37700.72758789062, "learning_rate": 0.0001, "loss": 7.3614, "loss/crossentropy": 0.6930418014526367, "loss/hidden": 0.1982421875, "loss/logits": 0.0074032871052622795, "loss/reg": 6.462671279907227, "loss/twn": 0.0, "step": 107 }, { "epoch": 0.0027, "grad_norm": 10.125, "grad_norm_var": 37725.00826822917, "learning_rate": 0.0001, "loss": 8.4394, "loss/crossentropy": 1.9201096296310425, "loss/hidden": 0.08642578125, "loss/logits": 0.006301195826381445, "loss/reg": 6.426520347595215, "loss/twn": 0.0, "step": 108 }, { "epoch": 0.002725, "grad_norm": 11.1875, "grad_norm_var": 38118.67159830729, "learning_rate": 0.0001, "loss": 9.0847, "loss/crossentropy": 2.634326219558716, "loss/hidden": 0.056640625, "loss/logits": 0.003579255659133196, "loss/reg": 6.3901753425598145, "loss/twn": 0.0, "step": 109 }, { "epoch": 0.00275, "grad_norm": 13.0, "grad_norm_var": 38319.52980143229, "learning_rate": 0.0001, "loss": 9.0789, "loss/crossentropy": 2.5669283866882324, "loss/hidden": 0.1455078125, "loss/logits": 0.010270677506923676, "loss/reg": 6.356166839599609, "loss/twn": 0.0, "step": 110 }, { "epoch": 0.002775, "grad_norm": 14.4375, "grad_norm_var": 38258.03097330729, "learning_rate": 0.0001, "loss": 8.9621, "loss/crossentropy": 2.5042357444763184, "loss/hidden": 0.125, "loss/logits": 0.00943165272474289, "loss/reg": 6.323448657989502, "loss/twn": 0.0, "step": 111 }, { "epoch": 0.0028, "grad_norm": 10.75, "grad_norm_var": 38615.72823893229, "learning_rate": 0.0001, "loss": 8.0629, "loss/crossentropy": 1.681036353111267, "loss/hidden": 0.08642578125, "loss/logits": 0.003971286583691835, "loss/reg": 6.291506767272949, "loss/twn": 0.0, "step": 112 }, { "epoch": 0.002825, "grad_norm": 23.875, "grad_norm_var": 38694.45271809896, "learning_rate": 0.0001, "loss": 7.101, "loss/crossentropy": 0.6117576956748962, "loss/hidden": 0.2158203125, "loss/logits": 0.012755107134580612, "loss/reg": 6.260617256164551, "loss/twn": 0.0, "step": 113 }, { "epoch": 0.00285, "grad_norm": 12.1875, "grad_norm_var": 38708.639322916664, "learning_rate": 0.0001, "loss": 8.0025, "loss/crossentropy": 1.5227508544921875, "loss/hidden": 0.234375, "loss/logits": 0.015468025580048561, "loss/reg": 6.229867935180664, "loss/twn": 0.0, "step": 114 }, { "epoch": 0.002875, "grad_norm": 16.375, "grad_norm_var": 38652.598942057295, "learning_rate": 0.0001, "loss": 9.088, "loss/crossentropy": 2.7616689205169678, "loss/hidden": 0.115234375, "loss/logits": 0.009811250492930412, "loss/reg": 6.201269626617432, "loss/twn": 0.0, "step": 115 }, { "epoch": 0.0029, "grad_norm": 12.375, "grad_norm_var": 38664.55670572917, "learning_rate": 0.0001, "loss": 8.9623, "loss/crossentropy": 2.647496461868286, "loss/hidden": 0.1328125, "loss/logits": 0.009309421293437481, "loss/reg": 6.172722339630127, "loss/twn": 0.0, "step": 116 }, { "epoch": 0.002925, "grad_norm": 20.5, "grad_norm_var": 38886.04108072917, "learning_rate": 0.0001, "loss": 9.1565, "loss/crossentropy": 2.8847148418426514, "loss/hidden": 0.11767578125, "loss/logits": 0.008627700619399548, "loss/reg": 6.145481586456299, "loss/twn": 0.0, "step": 117 }, { "epoch": 0.00295, "grad_norm": 16.375, "grad_norm_var": 38821.67394205729, "learning_rate": 0.0001, "loss": 9.0896, "loss/crossentropy": 2.7421655654907227, "loss/hidden": 0.212890625, "loss/logits": 0.016587935388088226, "loss/reg": 6.117995262145996, "loss/twn": 0.0, "step": 118 }, { "epoch": 0.002975, "grad_norm": 8.5625, "grad_norm_var": 38845.82667643229, "learning_rate": 0.0001, "loss": 8.091, "loss/crossentropy": 1.8508156538009644, "loss/hidden": 0.138671875, "loss/logits": 0.008302265778183937, "loss/reg": 6.093196868896484, "loss/twn": 0.0, "step": 119 }, { "epoch": 0.003, "grad_norm": 11.625, "grad_norm_var": 38862.91451822917, "learning_rate": 0.0001, "loss": 8.6832, "loss/crossentropy": 2.444472312927246, "loss/hidden": 0.1552734375, "loss/logits": 0.016056066378951073, "loss/reg": 6.067349433898926, "loss/twn": 0.0, "step": 120 }, { "epoch": 0.003025, "grad_norm": 15.0625, "grad_norm_var": 35901.329410807295, "learning_rate": 0.0001, "loss": 7.6991, "loss/crossentropy": 1.443003535270691, "loss/hidden": 0.203125, "loss/logits": 0.009365499019622803, "loss/reg": 6.043575763702393, "loss/twn": 0.0, "step": 121 }, { "epoch": 0.00305, "grad_norm": 10.125, "grad_norm_var": 35948.114567057295, "learning_rate": 0.0001, "loss": 8.3654, "loss/crossentropy": 2.175076961517334, "loss/hidden": 0.1572265625, "loss/logits": 0.012922637164592743, "loss/reg": 6.0201544761657715, "loss/twn": 0.0, "step": 122 }, { "epoch": 0.003075, "grad_norm": 12.9375, "grad_norm_var": 16.191145833333334, "learning_rate": 0.0001, "loss": 8.7509, "loss/crossentropy": 2.659536123275757, "loss/hidden": 0.08642578125, "loss/logits": 0.007753277197480202, "loss/reg": 5.997157096862793, "loss/twn": 0.0, "step": 123 }, { "epoch": 0.0031, "grad_norm": 11.9375, "grad_norm_var": 15.527978515625, "learning_rate": 0.0001, "loss": 8.4483, "loss/crossentropy": 2.3908164501190186, "loss/hidden": 0.07666015625, "loss/logits": 0.005580560304224491, "loss/reg": 5.975290298461914, "loss/twn": 0.0, "step": 124 }, { "epoch": 0.003125, "grad_norm": 12.8125, "grad_norm_var": 15.120035807291666, "learning_rate": 0.0001, "loss": 8.1928, "loss/crossentropy": 2.0353291034698486, "loss/hidden": 0.1923828125, "loss/logits": 0.011610760353505611, "loss/reg": 5.953509330749512, "loss/twn": 0.0, "step": 125 }, { "epoch": 0.00315, "grad_norm": 11.4375, "grad_norm_var": 15.467122395833334, "learning_rate": 0.0001, "loss": 6.9926, "loss/crossentropy": 0.923692524433136, "loss/hidden": 0.12890625, "loss/logits": 0.006808650679886341, "loss/reg": 5.933147430419922, "loss/twn": 0.0, "step": 126 }, { "epoch": 0.003175, "grad_norm": 18.875, "grad_norm_var": 17.053759765625, "learning_rate": 0.0001, "loss": 8.6838, "loss/crossentropy": 2.7514772415161133, "loss/hidden": 0.016357421875, "loss/logits": 0.00333950063213706, "loss/reg": 5.91263484954834, "loss/twn": 0.0, "step": 127 }, { "epoch": 0.0032, "grad_norm": 13.375, "grad_norm_var": 16.307275390625, "learning_rate": 0.0001, "loss": 7.5016, "loss/crossentropy": 1.4413155317306519, "loss/hidden": 0.1552734375, "loss/logits": 0.011735007166862488, "loss/reg": 5.893232822418213, "loss/twn": 0.0, "step": 128 }, { "epoch": 0.003225, "grad_norm": 52.25, "grad_norm_var": 102.939697265625, "learning_rate": 0.0001, "loss": 7.4808, "loss/crossentropy": 1.5077205896377563, "loss/hidden": 0.0947265625, "loss/logits": 0.004158593248575926, "loss/reg": 5.874199867248535, "loss/twn": 0.0, "step": 129 }, { "epoch": 0.00325, "grad_norm": 12.75, "grad_norm_var": 102.6697265625, "learning_rate": 0.0001, "loss": 8.7541, "loss/crossentropy": 2.7712345123291016, "loss/hidden": 0.1201171875, "loss/logits": 0.006270177662372589, "loss/reg": 5.856495380401611, "loss/twn": 0.0, "step": 130 }, { "epoch": 0.003275, "grad_norm": 9.5625, "grad_norm_var": 105.30779622395833, "learning_rate": 0.0001, "loss": 7.2955, "loss/crossentropy": 1.3631829023361206, "loss/hidden": 0.08837890625, "loss/logits": 0.005783870816230774, "loss/reg": 5.838170528411865, "loss/twn": 0.0, "step": 131 }, { "epoch": 0.0033, "grad_norm": 22.625, "grad_norm_var": 107.38448893229166, "learning_rate": 0.0001, "loss": 8.8285, "loss/crossentropy": 2.8225176334381104, "loss/hidden": 0.1767578125, "loss/logits": 0.007785791996866465, "loss/reg": 5.82139778137207, "loss/twn": 0.0, "step": 132 }, { "epoch": 0.003325, "grad_norm": 23.5, "grad_norm_var": 109.62667643229166, "learning_rate": 0.0001, "loss": 7.4036, "loss/crossentropy": 1.5806615352630615, "loss/hidden": 0.016357421875, "loss/logits": 0.001918629975989461, "loss/reg": 5.804649829864502, "loss/twn": 0.0, "step": 133 }, { "epoch": 0.00335, "grad_norm": 11.1875, "grad_norm_var": 111.3869140625, "learning_rate": 0.0001, "loss": 8.0831, "loss/crossentropy": 2.268815755844116, "loss/hidden": 0.0233154296875, "loss/logits": 0.0031364229507744312, "loss/reg": 5.787786483764648, "loss/twn": 0.0, "step": 134 }, { "epoch": 0.003375, "grad_norm": 17.875, "grad_norm_var": 107.36847330729167, "learning_rate": 0.0001, "loss": 8.4399, "loss/crossentropy": 2.5208940505981445, "loss/hidden": 0.1376953125, "loss/logits": 0.009613238275051117, "loss/reg": 5.771730422973633, "loss/twn": 0.0, "step": 135 }, { "epoch": 0.0034, "grad_norm": 12.1875, "grad_norm_var": 107.00416666666666, "learning_rate": 0.0001, "loss": 7.3628, "loss/crossentropy": 1.5146337747573853, "loss/hidden": 0.08642578125, "loss/logits": 0.004632354713976383, "loss/reg": 5.757077693939209, "loss/twn": 0.0, "step": 136 }, { "epoch": 0.003425, "grad_norm": 74.5, "grad_norm_var": 314.18409830729166, "learning_rate": 0.0001, "loss": 8.7141, "loss/crossentropy": 2.663015127182007, "loss/hidden": 0.298828125, "loss/logits": 0.010531080886721611, "loss/reg": 5.741701126098633, "loss/twn": 0.0, "step": 137 }, { "epoch": 0.00345, "grad_norm": 11.5625, "grad_norm_var": 312.32545572916666, "learning_rate": 0.0001, "loss": 8.2802, "loss/crossentropy": 2.3824350833892822, "loss/hidden": 0.1591796875, "loss/logits": 0.011414668522775173, "loss/reg": 5.727158546447754, "loss/twn": 0.0, "step": 138 }, { "epoch": 0.003475, "grad_norm": 11.1875, "grad_norm_var": 314.30149739583334, "learning_rate": 0.0001, "loss": 8.1258, "loss/crossentropy": 2.285022497177124, "loss/hidden": 0.11962890625, "loss/logits": 0.008713757619261742, "loss/reg": 5.712470054626465, "loss/twn": 0.0, "step": 139 }, { "epoch": 0.0035, "grad_norm": 16.375, "grad_norm_var": 310.479931640625, "learning_rate": 0.0001, "loss": 8.1639, "loss/crossentropy": 2.350821018218994, "loss/hidden": 0.10107421875, "loss/logits": 0.012461278587579727, "loss/reg": 5.699510097503662, "loss/twn": 0.0, "step": 140 }, { "epoch": 0.003525, "grad_norm": 9.4375, "grad_norm_var": 314.765478515625, "learning_rate": 0.0001, "loss": 7.8463, "loss/crossentropy": 2.103158473968506, "loss/hidden": 0.05224609375, "loss/logits": 0.005224157590419054, "loss/reg": 5.685665130615234, "loss/twn": 0.0, "step": 141 }, { "epoch": 0.00355, "grad_norm": 9.0625, "grad_norm_var": 318.001416015625, "learning_rate": 0.0001, "loss": 8.1747, "loss/crossentropy": 2.418196678161621, "loss/hidden": 0.07666015625, "loss/logits": 0.006400687620043755, "loss/reg": 5.6734795570373535, "loss/twn": 0.0, "step": 142 }, { "epoch": 0.003575, "grad_norm": 15.4375, "grad_norm_var": 319.43639322916664, "learning_rate": 0.0001, "loss": 6.5554, "loss/crossentropy": 0.6986656785011292, "loss/hidden": 0.1845703125, "loss/logits": 0.012149279937148094, "loss/reg": 5.660000324249268, "loss/twn": 0.0, "step": 143 }, { "epoch": 0.0036, "grad_norm": 15.875, "grad_norm_var": 317.5587890625, "learning_rate": 0.0001, "loss": 8.5371, "loss/crossentropy": 2.7418227195739746, "loss/hidden": 0.13671875, "loss/logits": 0.0111403688788414, "loss/reg": 5.647412300109863, "loss/twn": 0.0, "step": 144 }, { "epoch": 0.003625, "grad_norm": 13.875, "grad_norm_var": 246.30520833333333, "learning_rate": 0.0001, "loss": 6.2652, "loss/crossentropy": 0.4583094120025635, "loss/hidden": 0.158203125, "loss/logits": 0.012293124571442604, "loss/reg": 5.6363725662231445, "loss/twn": 0.0, "step": 145 }, { "epoch": 0.00365, "grad_norm": 22.75, "grad_norm_var": 245.63854166666667, "learning_rate": 0.0001, "loss": 7.3134, "loss/crossentropy": 1.468201994895935, "loss/hidden": 0.2109375, "loss/logits": 0.009673453867435455, "loss/reg": 5.624554634094238, "loss/twn": 0.0, "step": 146 }, { "epoch": 0.003675, "grad_norm": 310.0, "grad_norm_var": 5526.531754557292, "learning_rate": 0.0001, "loss": 7.0649, "loss/crossentropy": 1.2726686000823975, "loss/hidden": 0.1728515625, "loss/logits": 0.005334332585334778, "loss/reg": 5.614006042480469, "loss/twn": 0.0, "step": 147 }, { "epoch": 0.0037, "grad_norm": 9.25, "grad_norm_var": 5563.953889973958, "learning_rate": 0.0001, "loss": 6.774, "loss/crossentropy": 1.0251015424728394, "loss/hidden": 0.138671875, "loss/logits": 0.007210130337625742, "loss/reg": 5.603022575378418, "loss/twn": 0.0, "step": 148 }, { "epoch": 0.003725, "grad_norm": 17.875, "grad_norm_var": 5575.684358723958, "learning_rate": 0.0001, "loss": 8.7246, "loss/crossentropy": 2.911123275756836, "loss/hidden": 0.19921875, "loss/logits": 0.02136034518480301, "loss/reg": 5.592944145202637, "loss/twn": 0.0, "step": 149 }, { "epoch": 0.00375, "grad_norm": 9.875, "grad_norm_var": 5580.160872395833, "learning_rate": 0.0001, "loss": 8.39, "loss/crossentropy": 2.711456537246704, "loss/hidden": 0.09033203125, "loss/logits": 0.005852097645401955, "loss/reg": 5.582311153411865, "loss/twn": 0.0, "step": 150 }, { "epoch": 0.003775, "grad_norm": 14.625, "grad_norm_var": 5588.7056640625, "learning_rate": 0.0001, "loss": 7.2155, "loss/crossentropy": 1.4148988723754883, "loss/hidden": 0.2158203125, "loss/logits": 0.012599754147231579, "loss/reg": 5.5721516609191895, "loss/twn": 0.0, "step": 151 }, { "epoch": 0.0038, "grad_norm": 12.375, "grad_norm_var": 5588.115869140625, "learning_rate": 0.0001, "loss": 8.013, "loss/crossentropy": 2.3517696857452393, "loss/hidden": 0.09130859375, "loss/logits": 0.007323693484067917, "loss/reg": 5.562623023986816, "loss/twn": 0.0, "step": 152 }, { "epoch": 0.003825, "grad_norm": 30.5, "grad_norm_var": 5482.538785807292, "learning_rate": 0.0001, "loss": 7.8607, "loss/crossentropy": 2.2278008460998535, "loss/hidden": 0.07421875, "loss/logits": 0.0050660185515880585, "loss/reg": 5.553621292114258, "loss/twn": 0.0, "step": 153 }, { "epoch": 0.00385, "grad_norm": 13.0625, "grad_norm_var": 5478.366129557292, "learning_rate": 0.0001, "loss": 8.277, "loss/crossentropy": 2.206120252609253, "loss/hidden": 0.5078125, "loss/logits": 0.018887437880039215, "loss/reg": 5.544199466705322, "loss/twn": 0.0, "step": 154 }, { "epoch": 0.003875, "grad_norm": 93.0, "grad_norm_var": 5656.329622395833, "learning_rate": 0.0001, "loss": 8.3416, "loss/crossentropy": 2.643498420715332, "loss/hidden": 0.1474609375, "loss/logits": 0.015275152400135994, "loss/reg": 5.535386562347412, "loss/twn": 0.0, "step": 155 }, { "epoch": 0.0039, "grad_norm": 15.8125, "grad_norm_var": 5657.996468098959, "learning_rate": 0.0001, "loss": 8.66, "loss/crossentropy": 3.025573492050171, "loss/hidden": 0.0986328125, "loss/logits": 0.008579680696129799, "loss/reg": 5.52721643447876, "loss/twn": 0.0, "step": 156 }, { "epoch": 0.003925, "grad_norm": 15.3125, "grad_norm_var": 5637.544124348959, "learning_rate": 0.0001, "loss": 8.3922, "loss/crossentropy": 2.7117786407470703, "loss/hidden": 0.15234375, "loss/logits": 0.008373694494366646, "loss/reg": 5.519668102264404, "loss/twn": 0.0, "step": 157 }, { "epoch": 0.00395, "grad_norm": 24.75, "grad_norm_var": 5591.000455729167, "learning_rate": 0.0001, "loss": 8.4122, "loss/crossentropy": 2.7280266284942627, "loss/hidden": 0.1611328125, "loss/logits": 0.01191728375852108, "loss/reg": 5.511092662811279, "loss/twn": 0.0, "step": 158 }, { "epoch": 0.003975, "grad_norm": 12.9375, "grad_norm_var": 5599.461393229167, "learning_rate": 0.0001, "loss": 8.2413, "loss/crossentropy": 2.5042476654052734, "loss/hidden": 0.21484375, "loss/logits": 0.018616054207086563, "loss/reg": 5.503547668457031, "loss/twn": 0.0, "step": 159 }, { "epoch": 0.004, "grad_norm": 12.875, "grad_norm_var": 5609.470768229166, "learning_rate": 0.0001, "loss": 8.4032, "loss/crossentropy": 2.762385606765747, "loss/hidden": 0.1337890625, "loss/logits": 0.010888181626796722, "loss/reg": 5.496166706085205, "loss/twn": 0.0, "step": 160 }, { "epoch": 0.004025, "grad_norm": 10.8125, "grad_norm_var": 5620.440738932291, "learning_rate": 0.0001, "loss": 8.0804, "loss/crossentropy": 2.5779385566711426, "loss/hidden": 0.0093994140625, "loss/logits": 0.004039571154862642, "loss/reg": 5.489066123962402, "loss/twn": 0.0, "step": 161 }, { "epoch": 0.00405, "grad_norm": 334.0, "grad_norm_var": 10996.149723307291, "learning_rate": 0.0001, "loss": 6.2302, "loss/crossentropy": 0.5805911421775818, "loss/hidden": 0.158203125, "loss/logits": 0.008694609627127647, "loss/reg": 5.482710361480713, "loss/twn": 0.0, "step": 162 }, { "epoch": 0.004075, "grad_norm": 131.0, "grad_norm_var": 6997.830452473959, "learning_rate": 0.0001, "loss": 7.0482, "loss/crossentropy": 1.3563833236694336, "loss/hidden": 0.2060546875, "loss/logits": 0.010020879097282887, "loss/reg": 5.475753307342529, "loss/twn": 0.0, "step": 163 }, { "epoch": 0.0041, "grad_norm": 13.125, "grad_norm_var": 6979.068994140625, "learning_rate": 0.0001, "loss": 8.1687, "loss/crossentropy": 2.569106101989746, "loss/hidden": 0.1171875, "loss/logits": 0.012607071548700333, "loss/reg": 5.469805717468262, "loss/twn": 0.0, "step": 164 }, { "epoch": 0.004125, "grad_norm": 14.1875, "grad_norm_var": 6994.544010416666, "learning_rate": 0.0001, "loss": 7.2464, "loss/crossentropy": 1.6330703496932983, "loss/hidden": 0.142578125, "loss/logits": 0.007347936742007732, "loss/reg": 5.463380336761475, "loss/twn": 0.0, "step": 165 }, { "epoch": 0.00415, "grad_norm": 11.125, "grad_norm_var": 6988.3890625, "learning_rate": 0.0001, "loss": 8.1928, "loss/crossentropy": 2.5154733657836914, "loss/hidden": 0.20703125, "loss/logits": 0.013312840834259987, "loss/reg": 5.45693826675415, "loss/twn": 0.0, "step": 166 }, { "epoch": 0.004175, "grad_norm": 13.875, "grad_norm_var": 6991.70859375, "learning_rate": 0.0001, "loss": 7.1058, "loss/crossentropy": 1.5941680669784546, "loss/hidden": 0.05712890625, "loss/logits": 0.003117609303444624, "loss/reg": 5.451424598693848, "loss/twn": 0.0, "step": 167 }, { "epoch": 0.0042, "grad_norm": 25.5, "grad_norm_var": 6941.1431640625, "learning_rate": 0.0001, "loss": 5.8236, "loss/crossentropy": 0.22541135549545288, "loss/hidden": 0.1494140625, "loss/logits": 0.002781955059617758, "loss/reg": 5.445990562438965, "loss/twn": 0.0, "step": 168 }, { "epoch": 0.004225, "grad_norm": 9.9375, "grad_norm_var": 7016.212353515625, "learning_rate": 0.0001, "loss": 8.0762, "loss/crossentropy": 2.55377197265625, "loss/hidden": 0.07666015625, "loss/logits": 0.005485657136887312, "loss/reg": 5.4402995109558105, "loss/twn": 0.0, "step": 169 }, { "epoch": 0.00425, "grad_norm": 16.0, "grad_norm_var": 7003.476302083333, "learning_rate": 0.0001, "loss": 8.043, "loss/crossentropy": 2.316843032836914, "loss/hidden": 0.26953125, "loss/logits": 0.021316442638635635, "loss/reg": 5.435269832611084, "loss/twn": 0.0, "step": 170 }, { "epoch": 0.004275, "grad_norm": 11.125, "grad_norm_var": 6921.814518229166, "learning_rate": 0.0001, "loss": 8.12, "loss/crossentropy": 2.539064645767212, "loss/hidden": 0.134765625, "loss/logits": 0.01590101048350334, "loss/reg": 5.430272579193115, "loss/twn": 0.0, "step": 171 }, { "epoch": 0.0043, "grad_norm": 12.5625, "grad_norm_var": 6933.832747395833, "learning_rate": 0.0001, "loss": 7.4549, "loss/crossentropy": 1.8348197937011719, "loss/hidden": 0.1826171875, "loss/logits": 0.012229220010340214, "loss/reg": 5.425241470336914, "loss/twn": 0.0, "step": 172 }, { "epoch": 0.004325, "grad_norm": 14.1875, "grad_norm_var": 6937.888020833333, "learning_rate": 0.0001, "loss": 6.9049, "loss/crossentropy": 1.270473599433899, "loss/hidden": 0.205078125, "loss/logits": 0.009345902130007744, "loss/reg": 5.4199981689453125, "loss/twn": 0.0, "step": 173 }, { "epoch": 0.00435, "grad_norm": 10.5625, "grad_norm_var": 6982.626676432292, "learning_rate": 0.0001, "loss": 7.0624, "loss/crossentropy": 1.425657033920288, "loss/hidden": 0.2099609375, "loss/logits": 0.011879321187734604, "loss/reg": 5.4148969650268555, "loss/twn": 0.0, "step": 174 }, { "epoch": 0.004375, "grad_norm": 8.875, "grad_norm_var": 6998.784635416667, "learning_rate": 0.0001, "loss": 7.1708, "loss/crossentropy": 1.4886845350265503, "loss/hidden": 0.265625, "loss/logits": 0.005756002385169268, "loss/reg": 5.410771369934082, "loss/twn": 0.0, "step": 175 }, { "epoch": 0.0044, "grad_norm": 28.625, "grad_norm_var": 6956.046354166667, "learning_rate": 0.0001, "loss": 6.1956, "loss/crossentropy": 0.5712894201278687, "loss/hidden": 0.2119140625, "loss/logits": 0.006170031148940325, "loss/reg": 5.406195163726807, "loss/twn": 0.0, "step": 176 }, { "epoch": 0.004425, "grad_norm": 8.75, "grad_norm_var": 6964.777067057292, "learning_rate": 0.0001, "loss": 6.737, "loss/crossentropy": 1.1914383172988892, "loss/hidden": 0.13671875, "loss/logits": 0.006925875786691904, "loss/reg": 5.401881694793701, "loss/twn": 0.0, "step": 177 }, { "epoch": 0.00445, "grad_norm": 44.5, "grad_norm_var": 911.0606608072917, "learning_rate": 0.0001, "loss": 8.1517, "loss/crossentropy": 2.5778274536132812, "loss/hidden": 0.1669921875, "loss/logits": 0.009458218701183796, "loss/reg": 5.397425174713135, "loss/twn": 0.0, "step": 178 }, { "epoch": 0.004475, "grad_norm": 16.625, "grad_norm_var": 87.32237955729167, "learning_rate": 0.0001, "loss": 7.1888, "loss/crossentropy": 1.5994395017623901, "loss/hidden": 0.185546875, "loss/logits": 0.01000029407441616, "loss/reg": 5.393801212310791, "loss/twn": 0.0, "step": 179 }, { "epoch": 0.0045, "grad_norm": 12.1875, "grad_norm_var": 87.76451822916667, "learning_rate": 0.0001, "loss": 8.0046, "loss/crossentropy": 2.455324172973633, "loss/hidden": 0.1474609375, "loss/logits": 0.012162324041128159, "loss/reg": 5.389675617218018, "loss/twn": 0.0, "step": 180 }, { "epoch": 0.004525, "grad_norm": 124.0, "grad_norm_var": 812.4984212239583, "learning_rate": 0.0001, "loss": 6.3516, "loss/crossentropy": 0.8374608755111694, "loss/hidden": 0.125, "loss/logits": 0.003441192675381899, "loss/reg": 5.3856940269470215, "loss/twn": 0.0, "step": 181 }, { "epoch": 0.00455, "grad_norm": 10.9375, "grad_norm_var": 812.7981770833334, "learning_rate": 0.0001, "loss": 8.1559, "loss/crossentropy": 2.670118570327759, "loss/hidden": 0.0986328125, "loss/logits": 0.005220318678766489, "loss/reg": 5.381902694702148, "loss/twn": 0.0, "step": 182 }, { "epoch": 0.004575, "grad_norm": 10.0, "grad_norm_var": 818.4593098958334, "learning_rate": 0.0001, "loss": 7.6118, "loss/crossentropy": 2.0260519981384277, "loss/hidden": 0.1982421875, "loss/logits": 0.009290603920817375, "loss/reg": 5.378239631652832, "loss/twn": 0.0, "step": 183 }, { "epoch": 0.0046, "grad_norm": 11.3125, "grad_norm_var": 825.881884765625, "learning_rate": 0.0001, "loss": 8.0393, "loss/crossentropy": 2.524176597595215, "loss/hidden": 0.130859375, "loss/logits": 0.009289154782891273, "loss/reg": 5.3750152587890625, "loss/twn": 0.0, "step": 184 }, { "epoch": 0.004625, "grad_norm": 17.375, "grad_norm_var": 817.4895182291667, "learning_rate": 0.0001, "loss": 8.2323, "loss/crossentropy": 2.7599668502807617, "loss/hidden": 0.09130859375, "loss/logits": 0.009666713885962963, "loss/reg": 5.371392250061035, "loss/twn": 0.0, "step": 185 }, { "epoch": 0.00465, "grad_norm": 20.625, "grad_norm_var": 814.9096354166667, "learning_rate": 0.0001, "loss": 7.9849, "loss/crossentropy": 2.4787378311157227, "loss/hidden": 0.126953125, "loss/logits": 0.011332664638757706, "loss/reg": 5.367901802062988, "loss/twn": 0.0, "step": 186 }, { "epoch": 0.004675, "grad_norm": 20.625, "grad_norm_var": 805.9638020833333, "learning_rate": 0.0001, "loss": 8.1101, "loss/crossentropy": 2.6036434173583984, "loss/hidden": 0.1318359375, "loss/logits": 0.009613338857889175, "loss/reg": 5.364970684051514, "loss/twn": 0.0, "step": 187 }, { "epoch": 0.0047, "grad_norm": 14.5, "grad_norm_var": 803.4415201822917, "learning_rate": 0.0001, "loss": 8.2869, "loss/crossentropy": 2.7700679302215576, "loss/hidden": 0.1435546875, "loss/logits": 0.011366615071892738, "loss/reg": 5.361906051635742, "loss/twn": 0.0, "step": 188 }, { "epoch": 0.004725, "grad_norm": 30.75, "grad_norm_var": 800.3403645833333, "learning_rate": 0.0001, "loss": 8.1568, "loss/crossentropy": 2.633868455886841, "loss/hidden": 0.158203125, "loss/logits": 0.006533905863761902, "loss/reg": 5.358221054077148, "loss/twn": 0.0, "step": 189 }, { "epoch": 0.00475, "grad_norm": 33.0, "grad_norm_var": 790.4363118489583, "learning_rate": 0.0001, "loss": 7.5575, "loss/crossentropy": 1.9494565725326538, "loss/hidden": 0.2412109375, "loss/logits": 0.011269403621554375, "loss/reg": 5.3555755615234375, "loss/twn": 0.0, "step": 190 }, { "epoch": 0.004775, "grad_norm": 9.625, "grad_norm_var": 788.7796712239583, "learning_rate": 0.0001, "loss": 7.0502, "loss/crossentropy": 1.607956051826477, "loss/hidden": 0.08642578125, "loss/logits": 0.0032915128394961357, "loss/reg": 5.35251522064209, "loss/twn": 0.0, "step": 191 }, { "epoch": 0.0048, "grad_norm": 100.5, "grad_norm_var": 1138.346728515625, "learning_rate": 0.0001, "loss": 8.4206, "loss/crossentropy": 2.9291961193084717, "loss/hidden": 0.1337890625, "loss/logits": 0.007461494766175747, "loss/reg": 5.3501105308532715, "loss/twn": 0.0, "step": 192 }, { "epoch": 0.004825, "grad_norm": 14.875, "grad_norm_var": 1123.0661295572916, "learning_rate": 0.0001, "loss": 7.4399, "loss/crossentropy": 1.9533724784851074, "loss/hidden": 0.134765625, "loss/logits": 0.004339105449616909, "loss/reg": 5.3473944664001465, "loss/twn": 0.0, "step": 193 }, { "epoch": 0.00485, "grad_norm": 64.5, "grad_norm_var": 1184.8265462239583, "learning_rate": 0.0001, "loss": 7.542, "loss/crossentropy": 2.0671801567077637, "loss/hidden": 0.1171875, "loss/logits": 0.012455419637262821, "loss/reg": 5.345158100128174, "loss/twn": 0.0, "step": 194 }, { "epoch": 0.004875, "grad_norm": 16.125, "grad_norm_var": 1185.8648274739583, "learning_rate": 0.0001, "loss": 7.938, "loss/crossentropy": 2.456360101699829, "loss/hidden": 0.130859375, "loss/logits": 0.008208954706788063, "loss/reg": 5.342526435852051, "loss/twn": 0.0, "step": 195 }, { "epoch": 0.0049, "grad_norm": 80.0, "grad_norm_var": 1294.7356770833333, "learning_rate": 0.0001, "loss": 7.8581, "loss/crossentropy": 2.487790107727051, "loss/hidden": 0.025634765625, "loss/logits": 0.0046631209552288055, "loss/reg": 5.340009689331055, "loss/twn": 0.0, "step": 196 }, { "epoch": 0.004925, "grad_norm": 14.75, "grad_norm_var": 761.3453125, "learning_rate": 0.0001, "loss": 6.8181, "loss/crossentropy": 1.304626703262329, "loss/hidden": 0.171875, "loss/logits": 0.004350706003606319, "loss/reg": 5.337262153625488, "loss/twn": 0.0, "step": 197 }, { "epoch": 0.00495, "grad_norm": 11.125, "grad_norm_var": 760.887353515625, "learning_rate": 0.0001, "loss": 7.9797, "loss/crossentropy": 2.4714841842651367, "loss/hidden": 0.1611328125, "loss/logits": 0.011602582409977913, "loss/reg": 5.335472106933594, "loss/twn": 0.0, "step": 198 }, { "epoch": 0.004975, "grad_norm": 95.0, "grad_norm_var": 993.0878743489583, "learning_rate": 0.0001, "loss": 7.04, "loss/crossentropy": 1.4688469171524048, "loss/hidden": 0.2275390625, "loss/logits": 0.0106576569378376, "loss/reg": 5.332970142364502, "loss/twn": 0.0, "step": 199 }, { "epoch": 0.005, "grad_norm": 9.9375, "grad_norm_var": 997.4878743489584, "learning_rate": 0.0001, "loss": 6.7972, "loss/crossentropy": 1.3856897354125977, "loss/hidden": 0.080078125, "loss/logits": 0.001242777332663536, "loss/reg": 5.330203056335449, "loss/twn": 0.0, "step": 200 }, { "epoch": 0.005025, "grad_norm": 11.875, "grad_norm_var": 1011.9969889322916, "learning_rate": 0.0001, "loss": 7.5639, "loss/crossentropy": 2.073434591293335, "loss/hidden": 0.1484375, "loss/logits": 0.013964459300041199, "loss/reg": 5.328036308288574, "loss/twn": 0.0, "step": 201 }, { "epoch": 0.00505, "grad_norm": 18.25, "grad_norm_var": 1016.660400390625, "learning_rate": 0.0001, "loss": 8.68, "loss/crossentropy": 3.2589170932769775, "loss/hidden": 0.08642578125, "loss/logits": 0.008732986636459827, "loss/reg": 5.325946807861328, "loss/twn": 0.0, "step": 202 }, { "epoch": 0.005075, "grad_norm": 16.125, "grad_norm_var": 1026.004931640625, "learning_rate": 0.0001, "loss": 8.6078, "loss/crossentropy": 3.1099424362182617, "loss/hidden": 0.1552734375, "loss/logits": 0.018260516226291656, "loss/reg": 5.324294090270996, "loss/twn": 0.0, "step": 203 }, { "epoch": 0.0051, "grad_norm": 46.75, "grad_norm_var": 1007.981884765625, "learning_rate": 0.0001, "loss": 7.5539, "loss/crossentropy": 2.091862678527832, "loss/hidden": 0.130859375, "loss/logits": 0.009298876859247684, "loss/reg": 5.321921348571777, "loss/twn": 0.0, "step": 204 }, { "epoch": 0.005125, "grad_norm": 10.3125, "grad_norm_var": 1047.91484375, "learning_rate": 0.0001, "loss": 8.124, "loss/crossentropy": 2.732879400253296, "loss/hidden": 0.064453125, "loss/logits": 0.0066910069435834885, "loss/reg": 5.320003986358643, "loss/twn": 0.0, "step": 205 }, { "epoch": 0.00515, "grad_norm": 11.0625, "grad_norm_var": 1082.517822265625, "learning_rate": 0.0001, "loss": 6.9944, "loss/crossentropy": 1.5260496139526367, "loss/hidden": 0.142578125, "loss/logits": 0.007849331945180893, "loss/reg": 5.317881107330322, "loss/twn": 0.0, "step": 206 }, { "epoch": 0.005175, "grad_norm": 8.6875, "grad_norm_var": 1085.5166015625, "learning_rate": 0.0001, "loss": 6.7833, "loss/crossentropy": 1.3956589698791504, "loss/hidden": 0.06689453125, "loss/logits": 0.004923268221318722, "loss/reg": 5.315812587738037, "loss/twn": 0.0, "step": 207 }, { "epoch": 0.0052, "grad_norm": 10.0625, "grad_norm_var": 784.176025390625, "learning_rate": 0.0001, "loss": 6.6116, "loss/crossentropy": 1.040010690689087, "loss/hidden": 0.2451171875, "loss/logits": 0.011602293699979782, "loss/reg": 5.3148298263549805, "loss/twn": 0.0, "step": 208 }, { "epoch": 0.005225, "grad_norm": 21.875, "grad_norm_var": 775.4880045572917, "learning_rate": 0.0001, "loss": 8.5451, "loss/crossentropy": 3.072871685028076, "loss/hidden": 0.1474609375, "loss/logits": 0.011389853432774544, "loss/reg": 5.313349723815918, "loss/twn": 0.0, "step": 209 }, { "epoch": 0.00525, "grad_norm": 16.25, "grad_norm_var": 685.5469889322917, "learning_rate": 0.0001, "loss": 6.872, "loss/crossentropy": 1.4240162372589111, "loss/hidden": 0.130859375, "loss/logits": 0.0054016802459955215, "loss/reg": 5.311694145202637, "loss/twn": 0.0, "step": 210 }, { "epoch": 0.005275, "grad_norm": 10.8125, "grad_norm_var": 693.5171223958333, "learning_rate": 0.0001, "loss": 6.665, "loss/crossentropy": 1.2265129089355469, "loss/hidden": 0.12451171875, "loss/logits": 0.0037709574680775404, "loss/reg": 5.31024169921875, "loss/twn": 0.0, "step": 211 }, { "epoch": 0.0053, "grad_norm": 11.4375, "grad_norm_var": 480.45558268229166, "learning_rate": 0.0001, "loss": 8.1829, "loss/crossentropy": 2.7488791942596436, "loss/hidden": 0.11767578125, "loss/logits": 0.007810299750417471, "loss/reg": 5.3085198402404785, "loss/twn": 0.0, "step": 212 }, { "epoch": 0.005325, "grad_norm": 13.5, "grad_norm_var": 481.47316080729166, "learning_rate": 0.0001, "loss": 8.1725, "loss/crossentropy": 2.769019603729248, "loss/hidden": 0.08642578125, "loss/logits": 0.010608029551804066, "loss/reg": 5.306417942047119, "loss/twn": 0.0, "step": 213 }, { "epoch": 0.00535, "grad_norm": 10.875, "grad_norm_var": 481.77928059895834, "learning_rate": 0.0001, "loss": 7.1392, "loss/crossentropy": 1.7968316078186035, "loss/hidden": 0.03271484375, "loss/logits": 0.004501561634242535, "loss/reg": 5.305141925811768, "loss/twn": 0.0, "step": 214 }, { "epoch": 0.005375, "grad_norm": 11.1875, "grad_norm_var": 84.65208333333334, "learning_rate": 0.0001, "loss": 7.577, "loss/crossentropy": 2.1686487197875977, "loss/hidden": 0.0986328125, "loss/logits": 0.006175590679049492, "loss/reg": 5.303523540496826, "loss/twn": 0.0, "step": 215 }, { "epoch": 0.0054, "grad_norm": 12.0625, "grad_norm_var": 83.51764322916667, "learning_rate": 0.0001, "loss": 8.042, "loss/crossentropy": 2.737717628479004, "loss/hidden": 6.16908073425293e-06, "loss/logits": 0.0018352700863033533, "loss/reg": 5.302443027496338, "loss/twn": 0.0, "step": 216 }, { "epoch": 0.005425, "grad_norm": 12.8125, "grad_norm_var": 83.17316080729167, "learning_rate": 0.0001, "loss": 8.3658, "loss/crossentropy": 2.9845688343048096, "loss/hidden": 0.07421875, "loss/logits": 0.005686669610440731, "loss/reg": 5.301285743713379, "loss/twn": 0.0, "step": 217 }, { "epoch": 0.00545, "grad_norm": 13.25, "grad_norm_var": 82.654931640625, "learning_rate": 0.0001, "loss": 8.1609, "loss/crossentropy": 2.7524123191833496, "loss/hidden": 0.10107421875, "loss/logits": 0.007715051528066397, "loss/reg": 5.299709320068359, "loss/twn": 0.0, "step": 218 }, { "epoch": 0.005475, "grad_norm": 23.375, "grad_norm_var": 87.20506184895834, "learning_rate": 0.0001, "loss": 7.156, "loss/crossentropy": 1.624443531036377, "loss/hidden": 0.2216796875, "loss/logits": 0.011536870151758194, "loss/reg": 5.298386573791504, "loss/twn": 0.0, "step": 219 }, { "epoch": 0.0055, "grad_norm": 18.625, "grad_norm_var": 18.591780598958334, "learning_rate": 0.0001, "loss": 8.0654, "loss/crossentropy": 2.5879249572753906, "loss/hidden": 0.1630859375, "loss/logits": 0.017003701999783516, "loss/reg": 5.297426223754883, "loss/twn": 0.0, "step": 220 }, { "epoch": 0.005525, "grad_norm": 43.25, "grad_norm_var": 72.34680989583333, "learning_rate": 0.0001, "loss": 7.8801, "loss/crossentropy": 2.4491031169891357, "loss/hidden": 0.125, "loss/logits": 0.010065239854156971, "loss/reg": 5.295965194702148, "loss/twn": 0.0, "step": 221 }, { "epoch": 0.00555, "grad_norm": 9.5, "grad_norm_var": 73.438525390625, "learning_rate": 0.0001, "loss": 7.2681, "loss/crossentropy": 1.8589441776275635, "loss/hidden": 0.10595703125, "loss/logits": 0.0087303277105093, "loss/reg": 5.2944464683532715, "loss/twn": 0.0, "step": 222 }, { "epoch": 0.005575, "grad_norm": 10.3125, "grad_norm_var": 72.133447265625, "learning_rate": 0.0001, "loss": 6.9474, "loss/crossentropy": 1.452248215675354, "loss/hidden": 0.1962890625, "loss/logits": 0.005369896534830332, "loss/reg": 5.293449401855469, "loss/twn": 0.0, "step": 223 }, { "epoch": 0.0056, "grad_norm": 15.25, "grad_norm_var": 70.00305989583333, "learning_rate": 0.0001, "loss": 8.1935, "loss/crossentropy": 2.740863800048828, "loss/hidden": 0.14453125, "loss/logits": 0.01583397574722767, "loss/reg": 5.292267799377441, "loss/twn": 0.0, "step": 224 }, { "epoch": 0.005625, "grad_norm": 1056.0, "grad_norm_var": 67732.47864583334, "learning_rate": 0.0001, "loss": 7.8539, "loss/crossentropy": 2.403062105178833, "loss/hidden": 0.1474609375, "loss/logits": 0.012353872880339622, "loss/reg": 5.29097318649292, "loss/twn": 0.0, "step": 225 }, { "epoch": 0.00565, "grad_norm": 10.3125, "grad_norm_var": 67785.57133789062, "learning_rate": 0.0001, "loss": 7.194, "loss/crossentropy": 1.7008212804794312, "loss/hidden": 0.189453125, "loss/logits": 0.013665840029716492, "loss/reg": 5.290075778961182, "loss/twn": 0.0, "step": 226 }, { "epoch": 0.005675, "grad_norm": 12.5, "grad_norm_var": 67770.14609375, "learning_rate": 0.0001, "loss": 6.5587, "loss/crossentropy": 1.045196771621704, "loss/hidden": 0.2080078125, "loss/logits": 0.016789620742201805, "loss/reg": 5.288687705993652, "loss/twn": 0.0, "step": 227 }, { "epoch": 0.0057, "grad_norm": 27.625, "grad_norm_var": 67637.96925455729, "learning_rate": 0.0001, "loss": 7.1449, "loss/crossentropy": 1.6837458610534668, "loss/hidden": 0.158203125, "loss/logits": 0.015014993026852608, "loss/reg": 5.2879252433776855, "loss/twn": 0.0, "step": 228 }, { "epoch": 0.005725, "grad_norm": 10.6875, "grad_norm_var": 67663.88014322917, "learning_rate": 0.0001, "loss": 7.0729, "loss/crossentropy": 1.7119382619857788, "loss/hidden": 0.06689453125, "loss/logits": 0.007287868298590183, "loss/reg": 5.286799907684326, "loss/twn": 0.0, "step": 229 }, { "epoch": 0.00575, "grad_norm": 53.0, "grad_norm_var": 67380.34817708333, "learning_rate": 0.0001, "loss": 7.5607, "loss/crossentropy": 2.0949935913085938, "loss/hidden": 0.169921875, "loss/logits": 0.009483925998210907, "loss/reg": 5.286267280578613, "loss/twn": 0.0, "step": 230 }, { "epoch": 0.005775, "grad_norm": 19.375, "grad_norm_var": 67305.34086914062, "learning_rate": 0.0001, "loss": 7.9328, "loss/crossentropy": 2.538356304168701, "loss/hidden": 0.10498046875, "loss/logits": 0.005003707949072123, "loss/reg": 5.284492492675781, "loss/twn": 0.0, "step": 231 }, { "epoch": 0.0058, "grad_norm": 24.5, "grad_norm_var": 67195.30462239584, "learning_rate": 0.0001, "loss": 8.2162, "loss/crossentropy": 2.749314308166504, "loss/hidden": 0.169921875, "loss/logits": 0.012603437528014183, "loss/reg": 5.2843194007873535, "loss/twn": 0.0, "step": 232 }, { "epoch": 0.005825, "grad_norm": 14.6875, "grad_norm_var": 67177.47161458334, "learning_rate": 0.0001, "loss": 7.1017, "loss/crossentropy": 1.6476311683654785, "loss/hidden": 0.1572265625, "loss/logits": 0.01403855625540018, "loss/reg": 5.282772064208984, "loss/twn": 0.0, "step": 233 }, { "epoch": 0.00585, "grad_norm": 10.875, "grad_norm_var": 67200.58951822917, "learning_rate": 0.0001, "loss": 7.2783, "loss/crossentropy": 1.8889567852020264, "loss/hidden": 0.09814453125, "loss/logits": 0.009974194690585136, "loss/reg": 5.281259059906006, "loss/twn": 0.0, "step": 234 }, { "epoch": 0.005875, "grad_norm": 8.4375, "grad_norm_var": 67337.25597330728, "learning_rate": 0.0001, "loss": 7.2088, "loss/crossentropy": 1.7615716457366943, "loss/hidden": 0.16015625, "loss/logits": 0.0061751967296004295, "loss/reg": 5.280921459197998, "loss/twn": 0.0, "step": 235 }, { "epoch": 0.0059, "grad_norm": 9.4375, "grad_norm_var": 67422.68776041667, "learning_rate": 0.0001, "loss": 7.3424, "loss/crossentropy": 1.9424008131027222, "loss/hidden": 0.115234375, "loss/logits": 0.004247123841196299, "loss/reg": 5.280468940734863, "loss/twn": 0.0, "step": 236 }, { "epoch": 0.005925, "grad_norm": 10.3125, "grad_norm_var": 67667.18865559896, "learning_rate": 0.0001, "loss": 8.127, "loss/crossentropy": 2.7163503170013428, "loss/hidden": 0.12255859375, "loss/logits": 0.008166075684130192, "loss/reg": 5.279946804046631, "loss/twn": 0.0, "step": 237 }, { "epoch": 0.00595, "grad_norm": 12.5, "grad_norm_var": 67638.98084309897, "learning_rate": 0.0001, "loss": 7.0844, "loss/crossentropy": 1.5295031070709229, "loss/hidden": 0.26953125, "loss/logits": 0.006746275350451469, "loss/reg": 5.278590679168701, "loss/twn": 0.0, "step": 238 }, { "epoch": 0.005975, "grad_norm": 92.5, "grad_norm_var": 67279.8171875, "learning_rate": 0.0001, "loss": 5.8036, "loss/crossentropy": 0.4050528109073639, "loss/hidden": 0.1142578125, "loss/logits": 0.005355454981327057, "loss/reg": 5.278897762298584, "loss/twn": 0.0, "step": 239 }, { "epoch": 0.006, "grad_norm": 28.875, "grad_norm_var": 67161.52805989583, "learning_rate": 0.0001, "loss": 8.1647, "loss/crossentropy": 2.643662929534912, "loss/hidden": 0.2294921875, "loss/logits": 0.013766671530902386, "loss/reg": 5.277756214141846, "loss/twn": 0.0, "step": 240 }, { "epoch": 0.006025, "grad_norm": 13.1875, "grad_norm_var": 479.914697265625, "learning_rate": 0.0001, "loss": 6.9306, "loss/crossentropy": 1.5002285242080688, "loss/hidden": 0.1435546875, "loss/logits": 0.009923199191689491, "loss/reg": 5.276930809020996, "loss/twn": 0.0, "step": 241 }, { "epoch": 0.00605, "grad_norm": 11.9375, "grad_norm_var": 477.45519205729164, "learning_rate": 0.0001, "loss": 8.1267, "loss/crossentropy": 2.7130773067474365, "loss/hidden": 0.12890625, "loss/logits": 0.00791969709098339, "loss/reg": 5.276750087738037, "loss/twn": 0.0, "step": 242 }, { "epoch": 0.006075, "grad_norm": 10.75, "grad_norm_var": 479.98631184895834, "learning_rate": 0.0001, "loss": 7.1488, "loss/crossentropy": 1.727049469947815, "loss/hidden": 0.1376953125, "loss/logits": 0.00825846754014492, "loss/reg": 5.2758002281188965, "loss/twn": 0.0, "step": 243 }, { "epoch": 0.0061, "grad_norm": 15.4375, "grad_norm_var": 480.80833333333334, "learning_rate": 0.0001, "loss": 7.921, "loss/crossentropy": 2.5109403133392334, "loss/hidden": 0.12255859375, "loss/logits": 0.012591829523444176, "loss/reg": 5.274876594543457, "loss/twn": 0.0, "step": 244 }, { "epoch": 0.006125, "grad_norm": 14.6875, "grad_norm_var": 475.9583333333333, "learning_rate": 0.0001, "loss": 7.9285, "loss/crossentropy": 2.4956674575805664, "loss/hidden": 0.146484375, "loss/logits": 0.012429025955498219, "loss/reg": 5.2739410400390625, "loss/twn": 0.0, "step": 245 }, { "epoch": 0.00615, "grad_norm": 11.5625, "grad_norm_var": 411.4820149739583, "learning_rate": 0.0001, "loss": 7.9078, "loss/crossentropy": 2.5263020992279053, "loss/hidden": 0.10107421875, "loss/logits": 0.007129446603357792, "loss/reg": 5.273260116577148, "loss/twn": 0.0, "step": 246 }, { "epoch": 0.006175, "grad_norm": 17.5, "grad_norm_var": 411.6870930989583, "learning_rate": 0.0001, "loss": 7.0028, "loss/crossentropy": 1.6373541355133057, "loss/hidden": 0.08642578125, "loss/logits": 0.0059446613304317, "loss/reg": 5.273036479949951, "loss/twn": 0.0, "step": 247 }, { "epoch": 0.0062, "grad_norm": 11.375, "grad_norm_var": 413.1773274739583, "learning_rate": 0.0001, "loss": 7.5298, "loss/crossentropy": 2.1802141666412354, "loss/hidden": 0.07568359375, "loss/logits": 0.0017865689005702734, "loss/reg": 5.272162914276123, "loss/twn": 0.0, "step": 248 }, { "epoch": 0.006225, "grad_norm": 8.625, "grad_norm_var": 418.4583333333333, "learning_rate": 0.0001, "loss": 6.7983, "loss/crossentropy": 1.4629161357879639, "loss/hidden": 0.06201171875, "loss/logits": 0.0020404397509992123, "loss/reg": 5.271305084228516, "loss/twn": 0.0, "step": 249 }, { "epoch": 0.00625, "grad_norm": 10.625, "grad_norm_var": 418.6997395833333, "learning_rate": 0.0001, "loss": 6.9642, "loss/crossentropy": 1.5569666624069214, "loss/hidden": 0.12451171875, "loss/logits": 0.011692131869494915, "loss/reg": 5.27101469039917, "loss/twn": 0.0, "step": 250 }, { "epoch": 0.006275, "grad_norm": 174.0, "grad_norm_var": 1921.1363118489583, "learning_rate": 0.0001, "loss": 6.8711, "loss/crossentropy": 1.442520260810852, "loss/hidden": 0.1484375, "loss/logits": 0.009676285088062286, "loss/reg": 5.270504951477051, "loss/twn": 0.0, "step": 251 }, { "epoch": 0.0063, "grad_norm": 7.75, "grad_norm_var": 1925.5655598958333, "learning_rate": 0.0001, "loss": 6.6066, "loss/crossentropy": 1.229660153388977, "loss/hidden": 0.10107421875, "loss/logits": 0.006052733864635229, "loss/reg": 5.269782543182373, "loss/twn": 0.0, "step": 252 }, { "epoch": 0.006325, "grad_norm": 10.75, "grad_norm_var": 1924.5325358072917, "learning_rate": 0.0001, "loss": 7.977, "loss/crossentropy": 2.645150899887085, "loss/hidden": 0.05712890625, "loss/logits": 0.005253541748970747, "loss/reg": 5.269515514373779, "loss/twn": 0.0, "step": 253 }, { "epoch": 0.00635, "grad_norm": 15.3125, "grad_norm_var": 1919.1192057291667, "learning_rate": 0.0001, "loss": 8.4118, "loss/crossentropy": 2.997927188873291, "loss/hidden": 0.1337890625, "loss/logits": 0.010878477245569229, "loss/reg": 5.269217014312744, "loss/twn": 0.0, "step": 254 }, { "epoch": 0.006375, "grad_norm": 10.5625, "grad_norm_var": 1638.7606608072917, "learning_rate": 0.0001, "loss": 7.96, "loss/crossentropy": 2.5343592166900635, "loss/hidden": 0.1494140625, "loss/logits": 0.00772454310208559, "loss/reg": 5.268545627593994, "loss/twn": 0.0, "step": 255 }, { "epoch": 0.0064, "grad_norm": 11.125, "grad_norm_var": 1645.2782389322917, "learning_rate": 0.0001, "loss": 8.081, "loss/crossentropy": 2.7416863441467285, "loss/hidden": 0.06689453125, "loss/logits": 0.004296740982681513, "loss/reg": 5.268085479736328, "loss/twn": 0.0, "step": 256 }, { "epoch": 0.006425, "grad_norm": 83.5, "grad_norm_var": 1869.7838541666667, "learning_rate": 0.0001, "loss": 7.679, "loss/crossentropy": 2.278621196746826, "loss/hidden": 0.12255859375, "loss/logits": 0.01035086065530777, "loss/reg": 5.267502307891846, "loss/twn": 0.0, "step": 257 }, { "epoch": 0.00645, "grad_norm": 11.3125, "grad_norm_var": 1871.0296223958333, "learning_rate": 0.0001, "loss": 7.0164, "loss/crossentropy": 1.6138280630111694, "loss/hidden": 0.125, "loss/logits": 0.01041114330291748, "loss/reg": 5.267125606536865, "loss/twn": 0.0, "step": 258 }, { "epoch": 0.006475, "grad_norm": 9.375, "grad_norm_var": 1874.0453125, "learning_rate": 0.0001, "loss": 8.1242, "loss/crossentropy": 2.724980354309082, "loss/hidden": 0.1220703125, "loss/logits": 0.010582932271063328, "loss/reg": 5.266530990600586, "loss/twn": 0.0, "step": 259 }, { "epoch": 0.0065, "grad_norm": 7.09375, "grad_norm_var": 1890.6687133789062, "learning_rate": 0.0001, "loss": 7.1719, "loss/crossentropy": 1.8638581037521362, "loss/hidden": 0.0400390625, "loss/logits": 0.002111276611685753, "loss/reg": 5.265857696533203, "loss/twn": 0.0, "step": 260 }, { "epoch": 0.006525, "grad_norm": 12.9375, "grad_norm_var": 1893.4873982747397, "learning_rate": 0.0001, "loss": 7.2962, "loss/crossentropy": 1.825720191001892, "loss/hidden": 0.189453125, "loss/logits": 0.015128025785088539, "loss/reg": 5.26585054397583, "loss/twn": 0.0, "step": 261 }, { "epoch": 0.00655, "grad_norm": 116.5, "grad_norm_var": 2381.9933227539063, "learning_rate": 0.0001, "loss": 5.8142, "loss/crossentropy": 0.3675730228424072, "loss/hidden": 0.173828125, "loss/logits": 0.0077675022184848785, "loss/reg": 5.265065670013428, "loss/twn": 0.0, "step": 262 }, { "epoch": 0.006575, "grad_norm": 10.875, "grad_norm_var": 2397.895048014323, "learning_rate": 0.0001, "loss": 7.9558, "loss/crossentropy": 2.6135733127593994, "loss/hidden": 0.07177734375, "loss/logits": 0.005567469634115696, "loss/reg": 5.264838695526123, "loss/twn": 0.0, "step": 263 }, { "epoch": 0.0066, "grad_norm": 26.75, "grad_norm_var": 2370.424247233073, "learning_rate": 0.0001, "loss": 7.1463, "loss/crossentropy": 1.744168996810913, "loss/hidden": 0.130859375, "loss/logits": 0.006982623599469662, "loss/reg": 5.264274597167969, "loss/twn": 0.0, "step": 264 }, { "epoch": 0.006625, "grad_norm": 12.9375, "grad_norm_var": 2357.603544108073, "learning_rate": 0.0001, "loss": 6.835, "loss/crossentropy": 1.3916579484939575, "loss/hidden": 0.16796875, "loss/logits": 0.011533312499523163, "loss/reg": 5.263826847076416, "loss/twn": 0.0, "step": 265 }, { "epoch": 0.00665, "grad_norm": 86.0, "grad_norm_var": 2485.682157389323, "learning_rate": 0.0001, "loss": 8.1388, "loss/crossentropy": 2.743070363998413, "loss/hidden": 0.11962890625, "loss/logits": 0.012774601578712463, "loss/reg": 5.263358116149902, "loss/twn": 0.0, "step": 266 }, { "epoch": 0.006675, "grad_norm": 20.125, "grad_norm_var": 1173.6974243164063, "learning_rate": 0.0001, "loss": 7.547, "loss/crossentropy": 2.15224289894104, "loss/hidden": 0.126953125, "loss/logits": 0.004906866233795881, "loss/reg": 5.262901782989502, "loss/twn": 0.0, "step": 267 }, { "epoch": 0.0067, "grad_norm": 8.25, "grad_norm_var": 1172.3426066080729, "learning_rate": 0.0001, "loss": 8.0178, "loss/crossentropy": 2.7544662952423096, "loss/hidden": 2.9206275939941406e-06, "loss/logits": 0.0010420402977615595, "loss/reg": 5.262295246124268, "loss/twn": 0.0, "step": 268 }, { "epoch": 0.006725, "grad_norm": 10.5625, "grad_norm_var": 1172.7845011393229, "learning_rate": 0.0001, "loss": 7.2573, "loss/crossentropy": 1.708762288093567, "loss/hidden": 0.27734375, "loss/logits": 0.008786465972661972, "loss/reg": 5.262362480163574, "loss/twn": 0.0, "step": 269 }, { "epoch": 0.00675, "grad_norm": 8.6875, "grad_norm_var": 1187.023075358073, "learning_rate": 0.0001, "loss": 7.3198, "loss/crossentropy": 1.9545139074325562, "loss/hidden": 0.09619140625, "loss/logits": 0.0075501929968595505, "loss/reg": 5.261580467224121, "loss/twn": 0.0, "step": 270 }, { "epoch": 0.006775, "grad_norm": 27.75, "grad_norm_var": 1165.726688639323, "learning_rate": 0.0001, "loss": 7.9432, "loss/crossentropy": 2.5177128314971924, "loss/hidden": 0.1552734375, "loss/logits": 0.008861662819981575, "loss/reg": 5.261343955993652, "loss/twn": 0.0, "step": 271 }, { "epoch": 0.0068, "grad_norm": 12.75, "grad_norm_var": 1162.0217732747396, "learning_rate": 0.0001, "loss": 7.2029, "loss/crossentropy": 1.8247922658920288, "loss/hidden": 0.1103515625, "loss/logits": 0.00664330180734396, "loss/reg": 5.261136054992676, "loss/twn": 0.0, "step": 272 }, { "epoch": 0.006825, "grad_norm": 24.625, "grad_norm_var": 951.5283162434896, "learning_rate": 0.0001, "loss": 6.3479, "loss/crossentropy": 0.8956549167633057, "loss/hidden": 0.1845703125, "loss/logits": 0.00696325721219182, "loss/reg": 5.2607598304748535, "loss/twn": 0.0, "step": 273 }, { "epoch": 0.00685, "grad_norm": 15.3125, "grad_norm_var": 945.0106079101563, "learning_rate": 0.0001, "loss": 7.1854, "loss/crossentropy": 1.796543002128601, "loss/hidden": 0.12255859375, "loss/logits": 0.005927722901105881, "loss/reg": 5.26037073135376, "loss/twn": 0.0, "step": 274 }, { "epoch": 0.006875, "grad_norm": 18.625, "grad_norm_var": 930.2756469726562, "learning_rate": 0.0001, "loss": 8.1342, "loss/crossentropy": 2.680659770965576, "loss/hidden": 0.181640625, "loss/logits": 0.011690370738506317, "loss/reg": 5.260243892669678, "loss/twn": 0.0, "step": 275 }, { "epoch": 0.0069, "grad_norm": 14.4375, "grad_norm_var": 914.9025390625, "learning_rate": 0.0001, "loss": 8.5123, "loss/crossentropy": 3.0964365005493164, "loss/hidden": 0.1455078125, "loss/logits": 0.010575573891401291, "loss/reg": 5.259780406951904, "loss/twn": 0.0, "step": 276 }, { "epoch": 0.006925, "grad_norm": 12.6875, "grad_norm_var": 915.3650390625, "learning_rate": 0.0001, "loss": 8.2582, "loss/crossentropy": 2.825162172317505, "loss/hidden": 0.15625, "loss/logits": 0.01738206297159195, "loss/reg": 5.259433746337891, "loss/twn": 0.0, "step": 277 }, { "epoch": 0.00695, "grad_norm": 21.625, "grad_norm_var": 341.7171875, "learning_rate": 0.0001, "loss": 7.1032, "loss/crossentropy": 1.7048217058181763, "loss/hidden": 0.1279296875, "loss/logits": 0.011121492832899094, "loss/reg": 5.259332656860352, "loss/twn": 0.0, "step": 278 }, { "epoch": 0.006975, "grad_norm": 9.3125, "grad_norm_var": 343.92706705729165, "learning_rate": 0.0001, "loss": 7.4175, "loss/crossentropy": 1.9668402671813965, "loss/hidden": 0.1826171875, "loss/logits": 0.009085997007787228, "loss/reg": 5.258953094482422, "loss/twn": 0.0, "step": 279 }, { "epoch": 0.007, "grad_norm": 14.3125, "grad_norm_var": 343.48333333333335, "learning_rate": 0.0001, "loss": 7.5905, "loss/crossentropy": 2.2082672119140625, "loss/hidden": 0.1181640625, "loss/logits": 0.005382226780056953, "loss/reg": 5.258672714233398, "loss/twn": 0.0, "step": 280 }, { "epoch": 0.007025, "grad_norm": 17.75, "grad_norm_var": 340.4792805989583, "learning_rate": 0.0001, "loss": 6.2281, "loss/crossentropy": 0.711824893951416, "loss/hidden": 0.251953125, "loss/logits": 0.005929501727223396, "loss/reg": 5.258391380310059, "loss/twn": 0.0, "step": 281 }, { "epoch": 0.00705, "grad_norm": 48.5, "grad_norm_var": 99.24881184895834, "learning_rate": 0.0001, "loss": 6.6114, "loss/crossentropy": 1.0958250761032104, "loss/hidden": 0.25, "loss/logits": 0.007801922038197517, "loss/reg": 5.257816314697266, "loss/twn": 0.0, "step": 282 }, { "epoch": 0.007075, "grad_norm": 16.0, "grad_norm_var": 99.05115559895833, "learning_rate": 0.0001, "loss": 6.7629, "loss/crossentropy": 1.2992652654647827, "loss/hidden": 0.1953125, "loss/logits": 0.01093169767409563, "loss/reg": 5.257413864135742, "loss/twn": 0.0, "step": 283 }, { "epoch": 0.0071, "grad_norm": 9.5, "grad_norm_var": 97.594775390625, "learning_rate": 0.0001, "loss": 7.7528, "loss/crossentropy": 2.3623929023742676, "loss/hidden": 0.1279296875, "loss/logits": 0.00520264683291316, "loss/reg": 5.257322788238525, "loss/twn": 0.0, "step": 284 }, { "epoch": 0.007125, "grad_norm": 16.75, "grad_norm_var": 94.1384765625, "learning_rate": 0.0001, "loss": 7.282, "loss/crossentropy": 1.8802975416183472, "loss/hidden": 0.1357421875, "loss/logits": 0.008890845812857151, "loss/reg": 5.2571001052856445, "loss/twn": 0.0, "step": 285 }, { "epoch": 0.00715, "grad_norm": 9.875, "grad_norm_var": 92.745947265625, "learning_rate": 0.0001, "loss": 7.0654, "loss/crossentropy": 1.632087230682373, "loss/hidden": 0.1669921875, "loss/logits": 0.009606104344129562, "loss/reg": 5.256716728210449, "loss/twn": 0.0, "step": 286 }, { "epoch": 0.007175, "grad_norm": 29.25, "grad_norm_var": 94.813916015625, "learning_rate": 0.0001, "loss": 7.8745, "loss/crossentropy": 2.4687438011169434, "loss/hidden": 0.140625, "loss/logits": 0.00846975389868021, "loss/reg": 5.256651878356934, "loss/twn": 0.0, "step": 287 }, { "epoch": 0.0072, "grad_norm": 28.25, "grad_norm_var": 98.55167643229167, "learning_rate": 0.0001, "loss": 8.246, "loss/crossentropy": 2.8182952404022217, "loss/hidden": 0.158203125, "loss/logits": 0.013284040614962578, "loss/reg": 5.256263256072998, "loss/twn": 0.0, "step": 288 }, { "epoch": 0.007225, "grad_norm": 10.625, "grad_norm_var": 100.62980143229167, "learning_rate": 0.0001, "loss": 8.0743, "loss/crossentropy": 2.664623737335205, "loss/hidden": 0.1435546875, "loss/logits": 0.009994969703257084, "loss/reg": 5.256109237670898, "loss/twn": 0.0, "step": 289 }, { "epoch": 0.00725, "grad_norm": 134.0, "grad_norm_var": 933.7604166666666, "learning_rate": 0.0001, "loss": 8.0955, "loss/crossentropy": 2.6748669147491455, "loss/hidden": 0.158203125, "loss/logits": 0.006934846751391888, "loss/reg": 5.25545597076416, "loss/twn": 0.0, "step": 290 }, { "epoch": 0.007275, "grad_norm": 11.0625, "grad_norm_var": 944.487744140625, "learning_rate": 0.0001, "loss": 7.7484, "loss/crossentropy": 2.490112781524658, "loss/hidden": 9.655952453613281e-06, "loss/logits": 0.0029325929936021566, "loss/reg": 5.255389213562012, "loss/twn": 0.0, "step": 291 }, { "epoch": 0.0073, "grad_norm": 9.375, "grad_norm_var": 953.3853515625, "learning_rate": 0.0001, "loss": 7.3465, "loss/crossentropy": 1.976689338684082, "loss/hidden": 0.10791015625, "loss/logits": 0.006814016494899988, "loss/reg": 5.255037307739258, "loss/twn": 0.0, "step": 292 }, { "epoch": 0.007325, "grad_norm": 20.125, "grad_norm_var": 944.7024576822917, "learning_rate": 0.0001, "loss": 7.0865, "loss/crossentropy": 1.7022920846939087, "loss/hidden": 0.12451171875, "loss/logits": 0.004657519515603781, "loss/reg": 5.25502347946167, "loss/twn": 0.0, "step": 293 }, { "epoch": 0.00735, "grad_norm": 10.8125, "grad_norm_var": 957.44375, "learning_rate": 0.0001, "loss": 5.8776, "loss/crossentropy": 0.5808318853378296, "loss/hidden": 0.0400390625, "loss/logits": 0.001980610191822052, "loss/reg": 5.254761219024658, "loss/twn": 0.0, "step": 294 }, { "epoch": 0.007375, "grad_norm": 10.0, "grad_norm_var": 956.0610514322917, "learning_rate": 0.0001, "loss": 7.66, "loss/crossentropy": 2.271899700164795, "loss/hidden": 0.125, "loss/logits": 0.008504629135131836, "loss/reg": 5.254581451416016, "loss/twn": 0.0, "step": 295 }, { "epoch": 0.0074, "grad_norm": 8.6875, "grad_norm_var": 965.8755045572917, "learning_rate": 0.0001, "loss": 6.0481, "loss/crossentropy": 0.639009952545166, "loss/hidden": 0.1533203125, "loss/logits": 0.0019615632481873035, "loss/reg": 5.253849983215332, "loss/twn": 0.0, "step": 296 }, { "epoch": 0.007425, "grad_norm": 11.1875, "grad_norm_var": 974.3947916666667, "learning_rate": 0.0001, "loss": 7.7422, "loss/crossentropy": 2.3546876907348633, "loss/hidden": 0.12451171875, "loss/logits": 0.00921722687780857, "loss/reg": 5.253818035125732, "loss/twn": 0.0, "step": 297 }, { "epoch": 0.00745, "grad_norm": 17.625, "grad_norm_var": 933.1155598958334, "learning_rate": 0.0001, "loss": 8.3911, "loss/crossentropy": 3.0292787551879883, "loss/hidden": 0.10107421875, "loss/logits": 0.00718055572360754, "loss/reg": 5.253612518310547, "loss/twn": 0.0, "step": 298 }, { "epoch": 0.007475, "grad_norm": 440.0, "grad_norm_var": 11825.940559895833, "learning_rate": 0.0001, "loss": 7.0072, "loss/crossentropy": 1.5630390644073486, "loss/hidden": 0.18359375, "loss/logits": 0.007107208017259836, "loss/reg": 5.253422737121582, "loss/twn": 0.0, "step": 299 }, { "epoch": 0.0075, "grad_norm": 10.875, "grad_norm_var": 11818.895833333334, "learning_rate": 0.0001, "loss": 6.9788, "loss/crossentropy": 1.483949065208435, "loss/hidden": 0.2314453125, "loss/logits": 0.010475615039467812, "loss/reg": 5.25289249420166, "loss/twn": 0.0, "step": 300 }, { "epoch": 0.007525, "grad_norm": 9.8125, "grad_norm_var": 11851.417171223959, "learning_rate": 0.0001, "loss": 7.5546, "loss/crossentropy": 2.208834648132324, "loss/hidden": 0.08642578125, "loss/logits": 0.006687905173748732, "loss/reg": 5.2526960372924805, "loss/twn": 0.0, "step": 301 }, { "epoch": 0.00755, "grad_norm": 11.25, "grad_norm_var": 11844.504931640626, "learning_rate": 0.0001, "loss": 7.1337, "loss/crossentropy": 1.752866506576538, "loss/hidden": 0.1201171875, "loss/logits": 0.008473502472043037, "loss/reg": 5.2522807121276855, "loss/twn": 0.0, "step": 302 }, { "epoch": 0.007575, "grad_norm": 20.5, "grad_norm_var": 11871.525113932292, "learning_rate": 0.0001, "loss": 5.9332, "loss/crossentropy": 0.6589277386665344, "loss/hidden": 0.02099609375, "loss/logits": 0.0013422563206404448, "loss/reg": 5.251956462860107, "loss/twn": 0.0, "step": 303 }, { "epoch": 0.0076, "grad_norm": 11.5625, "grad_norm_var": 11932.343229166667, "learning_rate": 0.0001, "loss": 7.0088, "loss/crossentropy": 1.5739790201187134, "loss/hidden": 0.1748046875, "loss/logits": 0.007969235070049763, "loss/reg": 5.2520952224731445, "loss/twn": 0.0, "step": 304 }, { "epoch": 0.007625, "grad_norm": 8.9375, "grad_norm_var": 11940.642301432292, "learning_rate": 0.0001, "loss": 8.1038, "loss/crossentropy": 2.782174587249756, "loss/hidden": 0.064453125, "loss/logits": 0.005289027933031321, "loss/reg": 5.251874923706055, "loss/twn": 0.0, "step": 305 }, { "epoch": 0.00765, "grad_norm": 19.75, "grad_norm_var": 11425.267692057292, "learning_rate": 0.0001, "loss": 8.1897, "loss/crossentropy": 2.776104688644409, "loss/hidden": 0.150390625, "loss/logits": 0.011526349931955338, "loss/reg": 5.251704692840576, "loss/twn": 0.0, "step": 306 }, { "epoch": 0.007675, "grad_norm": 7.9375, "grad_norm_var": 11437.715608723958, "learning_rate": 0.0001, "loss": 8.9456, "loss/crossentropy": 3.692781686782837, "loss/hidden": 5.245208740234375e-06, "loss/logits": 0.0012750843307003379, "loss/reg": 5.251509189605713, "loss/twn": 0.0, "step": 307 }, { "epoch": 0.0077, "grad_norm": 69.0, "grad_norm_var": 11422.188264973958, "learning_rate": 0.0001, "loss": 8.1468, "loss/crossentropy": 2.5686264038085938, "loss/hidden": 0.30078125, "loss/logits": 0.025831755250692368, "loss/reg": 5.251588821411133, "loss/twn": 0.0, "step": 308 }, { "epoch": 0.007725, "grad_norm": 13.375, "grad_norm_var": 11445.626936848957, "learning_rate": 0.0001, "loss": 7.1097, "loss/crossentropy": 1.660717248916626, "loss/hidden": 0.185546875, "loss/logits": 0.01224461942911148, "loss/reg": 5.25119161605835, "loss/twn": 0.0, "step": 309 }, { "epoch": 0.00775, "grad_norm": 17.875, "grad_norm_var": 11418.828059895834, "learning_rate": 0.0001, "loss": 8.2796, "loss/crossentropy": 2.8778281211853027, "loss/hidden": 0.1376953125, "loss/logits": 0.013323968276381493, "loss/reg": 5.250760555267334, "loss/twn": 0.0, "step": 310 }, { "epoch": 0.007775, "grad_norm": 10.25, "grad_norm_var": 11417.731184895832, "learning_rate": 0.0001, "loss": 7.1638, "loss/crossentropy": 1.7495626211166382, "loss/hidden": 0.1533203125, "loss/logits": 0.010259518399834633, "loss/reg": 5.250608921051025, "loss/twn": 0.0, "step": 311 }, { "epoch": 0.0078, "grad_norm": 21.0, "grad_norm_var": 11370.812223307292, "learning_rate": 0.0001, "loss": 7.2684, "loss/crossentropy": 1.8736885786056519, "loss/hidden": 0.1357421875, "loss/logits": 0.008427501656115055, "loss/reg": 5.250565528869629, "loss/twn": 0.0, "step": 312 }, { "epoch": 0.007825, "grad_norm": 171.0, "grad_norm_var": 12271.96328125, "learning_rate": 0.0001, "loss": 6.804, "loss/crossentropy": 1.4117506742477417, "loss/hidden": 0.13671875, "loss/logits": 0.004921610467135906, "loss/reg": 5.250616550445557, "loss/twn": 0.0, "step": 313 }, { "epoch": 0.00785, "grad_norm": 21.75, "grad_norm_var": 12253.1322265625, "learning_rate": 0.0001, "loss": 7.013, "loss/crossentropy": 1.5148544311523438, "loss/hidden": 0.23828125, "loss/logits": 0.009858010336756706, "loss/reg": 5.24995756149292, "loss/twn": 0.0, "step": 314 }, { "epoch": 0.007875, "grad_norm": 127.0, "grad_norm_var": 2269.4103515625, "learning_rate": 0.0001, "loss": 8.0458, "loss/crossentropy": 2.6658105850219727, "loss/hidden": 0.12255859375, "loss/logits": 0.007101866416633129, "loss/reg": 5.250338077545166, "loss/twn": 0.0, "step": 315 }, { "epoch": 0.0079, "grad_norm": 49.25, "grad_norm_var": 2240.6091145833334, "learning_rate": 0.0001, "loss": 6.9261, "loss/crossentropy": 1.5474319458007812, "loss/hidden": 0.12451171875, "loss/logits": 0.004424188286066055, "loss/reg": 5.249726295471191, "loss/twn": 0.0, "step": 316 }, { "epoch": 0.007925, "grad_norm": 11.0625, "grad_norm_var": 2236.19375, "learning_rate": 0.0001, "loss": 7.9458, "loss/crossentropy": 2.6948788166046143, "loss/hidden": 3.7550926208496094e-06, "loss/logits": 0.0011138684349134564, "loss/reg": 5.249834060668945, "loss/twn": 0.0, "step": 317 }, { "epoch": 0.00795, "grad_norm": 14.625, "grad_norm_var": 2225.3322265625, "learning_rate": 0.0001, "loss": 7.097, "loss/crossentropy": 1.674680233001709, "loss/hidden": 0.1533203125, "loss/logits": 0.018878858536481857, "loss/reg": 5.250132083892822, "loss/twn": 0.0, "step": 318 }, { "epoch": 0.007975, "grad_norm": 280.0, "grad_norm_var": 5856.9806640625, "learning_rate": 0.0001, "loss": 6.8544, "loss/crossentropy": 1.4390206336975098, "loss/hidden": 0.158203125, "loss/logits": 0.007244464010000229, "loss/reg": 5.249932765960693, "loss/twn": 0.0, "step": 319 }, { "epoch": 0.008, "grad_norm": 17.75, "grad_norm_var": 5824.858837890625, "learning_rate": 0.0001, "loss": 8.0011, "loss/crossentropy": 2.620617389678955, "loss/hidden": 0.12255859375, "loss/logits": 0.008633976802229881, "loss/reg": 5.249265193939209, "loss/twn": 0.0, "step": 320 }, { "epoch": 0.008025, "grad_norm": 18.625, "grad_norm_var": 5772.79609375, "learning_rate": 0.0001, "loss": 6.3629, "loss/crossentropy": 0.973727822303772, "loss/hidden": 0.1337890625, "loss/logits": 0.005988460034132004, "loss/reg": 5.249377250671387, "loss/twn": 0.0, "step": 321 }, { "epoch": 0.00805, "grad_norm": 7.71875, "grad_norm_var": 5837.412365722656, "learning_rate": 0.0001, "loss": 6.802, "loss/crossentropy": 1.4281165599822998, "loss/hidden": 0.1171875, "loss/logits": 0.007195750251412392, "loss/reg": 5.249452114105225, "loss/twn": 0.0, "step": 322 }, { "epoch": 0.008075, "grad_norm": 10.375, "grad_norm_var": 5822.930822753906, "learning_rate": 0.0001, "loss": 8.0202, "loss/crossentropy": 2.667816162109375, "loss/hidden": 0.09619140625, "loss/logits": 0.00719710998237133, "loss/reg": 5.249003887176514, "loss/twn": 0.0, "step": 323 }, { "epoch": 0.0081, "grad_norm": 30.5, "grad_norm_var": 5837.498661295573, "learning_rate": 0.0001, "loss": 8.5258, "loss/crossentropy": 3.144439697265625, "loss/hidden": 0.1171875, "loss/logits": 0.01504556369036436, "loss/reg": 5.249162197113037, "loss/twn": 0.0, "step": 324 }, { "epoch": 0.008125, "grad_norm": 10.5625, "grad_norm_var": 5852.246708170573, "learning_rate": 0.0001, "loss": 7.8167, "loss/crossentropy": 2.3962008953094482, "loss/hidden": 0.1650390625, "loss/logits": 0.006836063228547573, "loss/reg": 5.248575210571289, "loss/twn": 0.0, "step": 325 }, { "epoch": 0.00815, "grad_norm": 486.0, "grad_norm_var": 17467.963993326823, "learning_rate": 0.0001, "loss": 5.9967, "loss/crossentropy": 0.583743691444397, "loss/hidden": 0.1572265625, "loss/logits": 0.007230043411254883, "loss/reg": 5.248484134674072, "loss/twn": 0.0, "step": 326 }, { "epoch": 0.008175, "grad_norm": 10.8125, "grad_norm_var": 17462.717508951824, "learning_rate": 0.0001, "loss": 7.1253, "loss/crossentropy": 1.6490944623947144, "loss/hidden": 0.2138671875, "loss/logits": 0.01414478849619627, "loss/reg": 5.248198509216309, "loss/twn": 0.0, "step": 327 }, { "epoch": 0.0082, "grad_norm": 10.125, "grad_norm_var": 17556.386942545574, "learning_rate": 0.0001, "loss": 7.8925, "loss/crossentropy": 2.5412166118621826, "loss/hidden": 0.09619140625, "loss/logits": 0.006635190453380346, "loss/reg": 5.248410701751709, "loss/twn": 0.0, "step": 328 }, { "epoch": 0.008225, "grad_norm": 12.6875, "grad_norm_var": 17198.204911295572, "learning_rate": 0.0001, "loss": 6.9945, "loss/crossentropy": 1.603257656097412, "loss/hidden": 0.1328125, "loss/logits": 0.010111295618116856, "loss/reg": 5.248310565948486, "loss/twn": 0.0, "step": 329 }, { "epoch": 0.00825, "grad_norm": 12.5, "grad_norm_var": 17262.97177327474, "learning_rate": 0.0001, "loss": 5.7919, "loss/crossentropy": 0.3630053400993347, "loss/hidden": 0.177734375, "loss/logits": 0.002837226027622819, "loss/reg": 5.248295783996582, "loss/twn": 0.0, "step": 330 }, { "epoch": 0.008275, "grad_norm": 22.0, "grad_norm_var": 17144.92880452474, "learning_rate": 0.0001, "loss": 8.1295, "loss/crossentropy": 2.8160948753356934, "loss/hidden": 0.0595703125, "loss/logits": 0.005728469230234623, "loss/reg": 5.24811315536499, "loss/twn": 0.0, "step": 331 }, { "epoch": 0.0083, "grad_norm": 17.125, "grad_norm_var": 17267.413732910158, "learning_rate": 0.0001, "loss": 6.9959, "loss/crossentropy": 1.5546507835388184, "loss/hidden": 0.18359375, "loss/logits": 0.010080805979669094, "loss/reg": 5.247556209564209, "loss/twn": 0.0, "step": 332 }, { "epoch": 0.008325, "grad_norm": 11.5625, "grad_norm_var": 17264.114904785158, "learning_rate": 0.0001, "loss": 7.8893, "loss/crossentropy": 2.5168728828430176, "loss/hidden": 0.115234375, "loss/logits": 0.009305896237492561, "loss/reg": 5.247858047485352, "loss/twn": 0.0, "step": 333 }, { "epoch": 0.00835, "grad_norm": 13.4375, "grad_norm_var": 17271.515751139323, "learning_rate": 0.0001, "loss": 8.1715, "loss/crossentropy": 2.723308563232422, "loss/hidden": 0.1884765625, "loss/logits": 0.01238995511084795, "loss/reg": 5.247326374053955, "loss/twn": 0.0, "step": 334 }, { "epoch": 0.008375, "grad_norm": 13.0, "grad_norm_var": 13921.291532389323, "learning_rate": 0.0001, "loss": 8.3945, "loss/crossentropy": 3.096419334411621, "loss/hidden": 0.04736328125, "loss/logits": 0.0033957725390791893, "loss/reg": 5.247368812561035, "loss/twn": 0.0, "step": 335 }, { "epoch": 0.0084, "grad_norm": 17.75, "grad_norm_var": 13921.291532389323, "learning_rate": 0.0001, "loss": 7.3812, "loss/crossentropy": 1.8540621995925903, "loss/hidden": 0.2734375, "loss/logits": 0.006610853597521782, "loss/reg": 5.247133731842041, "loss/twn": 0.0, "step": 336 }, { "epoch": 0.008425, "grad_norm": 13.3125, "grad_norm_var": 13941.063993326823, "learning_rate": 0.0001, "loss": 8.1344, "loss/crossentropy": 2.7475340366363525, "loss/hidden": 0.1328125, "loss/logits": 0.007097205147147179, "loss/reg": 5.246928691864014, "loss/twn": 0.0, "step": 337 }, { "epoch": 0.00845, "grad_norm": 68.5, "grad_norm_var": 13880.22734375, "learning_rate": 0.0001, "loss": 7.8648, "loss/crossentropy": 2.4544427394866943, "loss/hidden": 0.15234375, "loss/logits": 0.010997762903571129, "loss/reg": 5.246999263763428, "loss/twn": 0.0, "step": 338 }, { "epoch": 0.008475, "grad_norm": 18.75, "grad_norm_var": 13843.137434895832, "learning_rate": 0.0001, "loss": 7.9608, "loss/crossentropy": 2.603945255279541, "loss/hidden": 0.099609375, "loss/logits": 0.010800717398524284, "loss/reg": 5.246415615081787, "loss/twn": 0.0, "step": 339 }, { "epoch": 0.0085, "grad_norm": 29.5, "grad_norm_var": 13845.5384765625, "learning_rate": 0.0001, "loss": 6.9841, "loss/crossentropy": 1.614331603050232, "loss/hidden": 0.1171875, "loss/logits": 0.005967713892459869, "loss/reg": 5.246609687805176, "loss/twn": 0.0, "step": 340 }, { "epoch": 0.008525, "grad_norm": 11.0625, "grad_norm_var": 13843.059830729168, "learning_rate": 0.0001, "loss": 7.04, "loss/crossentropy": 1.6529104709625244, "loss/hidden": 0.130859375, "loss/logits": 0.009600062854588032, "loss/reg": 5.246581554412842, "loss/twn": 0.0, "step": 341 }, { "epoch": 0.00855, "grad_norm": 13.5, "grad_norm_var": 203.06764322916666, "learning_rate": 0.0001, "loss": 7.3224, "loss/crossentropy": 1.903311848640442, "loss/hidden": 0.162109375, "loss/logits": 0.010554994456470013, "loss/reg": 5.246466636657715, "loss/twn": 0.0, "step": 342 }, { "epoch": 0.008575, "grad_norm": 21.125, "grad_norm_var": 199.17628580729166, "learning_rate": 0.0001, "loss": 7.0821, "loss/crossentropy": 1.5746668577194214, "loss/hidden": 0.24609375, "loss/logits": 0.014962641522288322, "loss/reg": 5.246390342712402, "loss/twn": 0.0, "step": 343 }, { "epoch": 0.0086, "grad_norm": 14.4375, "grad_norm_var": 195.16588541666667, "learning_rate": 0.0001, "loss": 7.2347, "loss/crossentropy": 1.8215773105621338, "loss/hidden": 0.1552734375, "loss/logits": 0.01197369396686554, "loss/reg": 5.24590539932251, "loss/twn": 0.0, "step": 344 }, { "epoch": 0.008625, "grad_norm": 109.0, "grad_norm_var": 688.8426920572916, "learning_rate": 0.0001, "loss": 7.5554, "loss/crossentropy": 2.1289989948272705, "loss/hidden": 0.1630859375, "loss/logits": 0.017385877668857574, "loss/reg": 5.245935440063477, "loss/twn": 0.0, "step": 345 }, { "epoch": 0.00865, "grad_norm": 238.0, "grad_norm_var": 3478.8179524739585, "learning_rate": 0.0001, "loss": 7.9943, "loss/crossentropy": 2.593379020690918, "loss/hidden": 0.142578125, "loss/logits": 0.012457353994250298, "loss/reg": 5.245845317840576, "loss/twn": 0.0, "step": 346 }, { "epoch": 0.008675, "grad_norm": 43.5, "grad_norm_var": 3457.530712890625, "learning_rate": 0.0001, "loss": 7.1267, "loss/crossentropy": 1.6978092193603516, "loss/hidden": 0.1728515625, "loss/logits": 0.010361634194850922, "loss/reg": 5.245694637298584, "loss/twn": 0.0, "step": 347 }, { "epoch": 0.0087, "grad_norm": 24.875, "grad_norm_var": 3436.771207682292, "learning_rate": 0.0001, "loss": 6.7434, "loss/crossentropy": 1.4021539688110352, "loss/hidden": 0.08837890625, "loss/logits": 0.007280138321220875, "loss/reg": 5.245609283447266, "loss/twn": 0.0, "step": 348 }, { "epoch": 0.008725, "grad_norm": 26.25, "grad_norm_var": 3391.9552083333333, "learning_rate": 0.0001, "loss": 8.1297, "loss/crossentropy": 2.7253024578094482, "loss/hidden": 0.146484375, "loss/logits": 0.012663663364946842, "loss/reg": 5.245262145996094, "loss/twn": 0.0, "step": 349 }, { "epoch": 0.00875, "grad_norm": 10.875, "grad_norm_var": 3402.2098795572915, "learning_rate": 0.0001, "loss": 7.2976, "loss/crossentropy": 1.8926178216934204, "loss/hidden": 0.146484375, "loss/logits": 0.013426396995782852, "loss/reg": 5.245053291320801, "loss/twn": 0.0, "step": 350 }, { "epoch": 0.008775, "grad_norm": 13.0625, "grad_norm_var": 3401.967708333333, "learning_rate": 0.0001, "loss": 7.3178, "loss/crossentropy": 1.8622666597366333, "loss/hidden": 0.1953125, "loss/logits": 0.015104337595403194, "loss/reg": 5.245081424713135, "loss/twn": 0.0, "step": 351 }, { "epoch": 0.0088, "grad_norm": 29.875, "grad_norm_var": 3371.8004557291665, "learning_rate": 0.0001, "loss": 7.9433, "loss/crossentropy": 2.5869476795196533, "loss/hidden": 0.10546875, "loss/logits": 0.0058593666180968285, "loss/reg": 5.2449822425842285, "loss/twn": 0.0, "step": 352 }, { "epoch": 0.008825, "grad_norm": 28.5, "grad_norm_var": 3326.400113932292, "learning_rate": 0.0001, "loss": 7.0321, "loss/crossentropy": 1.740645408630371, "loss/hidden": 0.04248046875, "loss/logits": 0.004086637869477272, "loss/reg": 5.244920253753662, "loss/twn": 0.0, "step": 353 }, { "epoch": 0.00885, "grad_norm": 11.75, "grad_norm_var": 3340.7945149739585, "learning_rate": 0.0001, "loss": 7.3477, "loss/crossentropy": 1.9114320278167725, "loss/hidden": 0.1787109375, "loss/logits": 0.012824185192584991, "loss/reg": 5.244693279266357, "loss/twn": 0.0, "step": 354 }, { "epoch": 0.008875, "grad_norm": 9.6875, "grad_norm_var": 3371.9114583333335, "learning_rate": 0.0001, "loss": 6.4325, "loss/crossentropy": 1.1136820316314697, "loss/hidden": 0.0732421875, "loss/logits": 0.001058733556419611, "loss/reg": 5.244504451751709, "loss/twn": 0.0, "step": 355 }, { "epoch": 0.0089, "grad_norm": 9.625, "grad_norm_var": 3423.5968098958333, "learning_rate": 0.0001, "loss": 7.5243, "loss/crossentropy": 2.1711511611938477, "loss/hidden": 0.0986328125, "loss/logits": 0.009911064058542252, "loss/reg": 5.244630813598633, "loss/twn": 0.0, "step": 356 }, { "epoch": 0.008925, "grad_norm": 13.25, "grad_norm_var": 3415.909228515625, "learning_rate": 0.0001, "loss": 7.3907, "loss/crossentropy": 1.9722574949264526, "loss/hidden": 0.1611328125, "loss/logits": 0.012831033207476139, "loss/reg": 5.244527816772461, "loss/twn": 0.0, "step": 357 }, { "epoch": 0.00895, "grad_norm": 258.0, "grad_norm_var": 6334.500634765625, "learning_rate": 0.0001, "loss": 6.6838, "loss/crossentropy": 1.2551920413970947, "loss/hidden": 0.1796875, "loss/logits": 0.004792730323970318, "loss/reg": 5.244173526763916, "loss/twn": 0.0, "step": 358 }, { "epoch": 0.008975, "grad_norm": 9.375, "grad_norm_var": 6394.419514973958, "learning_rate": 0.0001, "loss": 7.731, "loss/crossentropy": 2.3687314987182617, "loss/hidden": 0.10791015625, "loss/logits": 0.010197984986007214, "loss/reg": 5.244191646575928, "loss/twn": 0.0, "step": 359 }, { "epoch": 0.009, "grad_norm": 10.75, "grad_norm_var": 6414.292643229167, "learning_rate": 0.0001, "loss": 6.3021, "loss/crossentropy": 0.8574244379997253, "loss/hidden": 0.19140625, "loss/logits": 0.009014951065182686, "loss/reg": 5.244270324707031, "loss/twn": 0.0, "step": 360 }, { "epoch": 0.009025, "grad_norm": 12.0, "grad_norm_var": 6276.774934895833, "learning_rate": 0.0001, "loss": 7.8089, "loss/crossentropy": 2.456528902053833, "loss/hidden": 0.09619140625, "loss/logits": 0.012111629359424114, "loss/reg": 5.244028091430664, "loss/twn": 0.0, "step": 361 }, { "epoch": 0.00905, "grad_norm": 12.6875, "grad_norm_var": 3706.745556640625, "learning_rate": 0.0001, "loss": 6.1318, "loss/crossentropy": 0.654383659362793, "loss/hidden": 0.2255859375, "loss/logits": 0.00800924189388752, "loss/reg": 5.243789196014404, "loss/twn": 0.0, "step": 362 }, { "epoch": 0.009075, "grad_norm": 60.0, "grad_norm_var": 3747.402587890625, "learning_rate": 0.0001, "loss": 5.8096, "loss/crossentropy": 0.38534435629844666, "loss/hidden": 0.17578125, "loss/logits": 0.004911348223686218, "loss/reg": 5.243527412414551, "loss/twn": 0.0, "step": 363 }, { "epoch": 0.0091, "grad_norm": 8.625, "grad_norm_var": 3783.2118326822915, "learning_rate": 0.0001, "loss": 7.0141, "loss/crossentropy": 1.7209011316299438, "loss/hidden": 0.04736328125, "loss/logits": 0.002276923507452011, "loss/reg": 5.243542671203613, "loss/twn": 0.0, "step": 364 }, { "epoch": 0.009125, "grad_norm": 700.0, "grad_norm_var": 31568.732014973957, "learning_rate": 0.0001, "loss": 6.5463, "loss/crossentropy": 1.1213988065719604, "loss/hidden": 0.171875, "loss/logits": 0.009275542572140694, "loss/reg": 5.243773937225342, "loss/twn": 0.0, "step": 365 }, { "epoch": 0.00915, "grad_norm": 9.5, "grad_norm_var": 31580.584228515625, "learning_rate": 0.0001, "loss": 7.2758, "loss/crossentropy": 1.888658046722412, "loss/hidden": 0.1328125, "loss/logits": 0.011050861328840256, "loss/reg": 5.243287086486816, "loss/twn": 0.0, "step": 366 }, { "epoch": 0.009175, "grad_norm": 7.6875, "grad_norm_var": 31626.63006184896, "learning_rate": 0.0001, "loss": 6.2879, "loss/crossentropy": 0.9124002456665039, "loss/hidden": 0.1220703125, "loss/logits": 0.00983446091413498, "loss/reg": 5.243640899658203, "loss/twn": 0.0, "step": 367 }, { "epoch": 0.0092, "grad_norm": 17.75, "grad_norm_var": 31707.892822265625, "learning_rate": 0.0001, "loss": 8.2629, "loss/crossentropy": 2.844151735305786, "loss/hidden": 0.16015625, "loss/logits": 0.015287065878510475, "loss/reg": 5.243282794952393, "loss/twn": 0.0, "step": 368 }, { "epoch": 0.009225, "grad_norm": 10.8125, "grad_norm_var": 31834.040625, "learning_rate": 0.0001, "loss": 7.021, "loss/crossentropy": 1.6278934478759766, "loss/hidden": 0.142578125, "loss/logits": 0.007217081263661385, "loss/reg": 5.243272304534912, "loss/twn": 0.0, "step": 369 }, { "epoch": 0.00925, "grad_norm": 8.875, "grad_norm_var": 31857.8806640625, "learning_rate": 0.0001, "loss": 7.2069, "loss/crossentropy": 1.8656002283096313, "loss/hidden": 0.09375, "loss/logits": 0.004388316534459591, "loss/reg": 5.243135452270508, "loss/twn": 0.0, "step": 370 }, { "epoch": 0.009275, "grad_norm": 14.0625, "grad_norm_var": 31822.486458333333, "learning_rate": 0.0001, "loss": 6.0393, "loss/crossentropy": 0.5794708132743835, "loss/hidden": 0.2080078125, "loss/logits": 0.008730066008865833, "loss/reg": 5.243083953857422, "loss/twn": 0.0, "step": 371 }, { "epoch": 0.0093, "grad_norm": 10.625, "grad_norm_var": 31814.140625, "learning_rate": 0.0001, "loss": 7.7146, "loss/crossentropy": 2.3672714233398438, "loss/hidden": 0.0986328125, "loss/logits": 0.005637788213789463, "loss/reg": 5.243073463439941, "loss/twn": 0.0, "step": 372 }, { "epoch": 0.009325, "grad_norm": 11.125, "grad_norm_var": 31831.281184895834, "learning_rate": 0.0001, "loss": 7.1862, "loss/crossentropy": 1.8489041328430176, "loss/hidden": 0.08837890625, "loss/logits": 0.00582461804151535, "loss/reg": 5.243066787719727, "loss/twn": 0.0, "step": 373 }, { "epoch": 0.00935, "grad_norm": 10.3125, "grad_norm_var": 29543.332405598958, "learning_rate": 0.0001, "loss": 8.1468, "loss/crossentropy": 2.779754877090454, "loss/hidden": 0.11279296875, "loss/logits": 0.011424477212131023, "loss/reg": 5.242800235748291, "loss/twn": 0.0, "step": 374 }, { "epoch": 0.009375, "grad_norm": 13.5625, "grad_norm_var": 29517.761393229168, "learning_rate": 0.0001, "loss": 5.9118, "loss/crossentropy": 0.5431471467018127, "loss/hidden": 0.1171875, "loss/logits": 0.008679039776325226, "loss/reg": 5.242814064025879, "loss/twn": 0.0, "step": 375 }, { "epoch": 0.0094, "grad_norm": 13.3125, "grad_norm_var": 29502.23357747396, "learning_rate": 0.0001, "loss": 7.1375, "loss/crossentropy": 1.6137068271636963, "loss/hidden": 0.267578125, "loss/logits": 0.013521241024136543, "loss/reg": 5.2426886558532715, "loss/twn": 0.0, "step": 376 }, { "epoch": 0.009425, "grad_norm": 20.25, "grad_norm_var": 29456.37303059896, "learning_rate": 0.0001, "loss": 8.1117, "loss/crossentropy": 2.5268547534942627, "loss/hidden": 0.328125, "loss/logits": 0.014234257861971855, "loss/reg": 5.242476463317871, "loss/twn": 0.0, "step": 377 }, { "epoch": 0.00945, "grad_norm": 12.625, "grad_norm_var": 29456.751497395835, "learning_rate": 0.0001, "loss": 7.0252, "loss/crossentropy": 1.6614928245544434, "loss/hidden": 0.1123046875, "loss/logits": 0.009039688855409622, "loss/reg": 5.242366313934326, "loss/twn": 0.0, "step": 378 }, { "epoch": 0.009475, "grad_norm": 12.375, "grad_norm_var": 29586.256770833334, "learning_rate": 0.0001, "loss": 7.3434, "loss/crossentropy": 1.9416403770446777, "loss/hidden": 0.1513671875, "loss/logits": 0.008125634863972664, "loss/reg": 5.242313385009766, "loss/twn": 0.0, "step": 379 }, { "epoch": 0.0095, "grad_norm": 9.3125, "grad_norm_var": 29582.02667643229, "learning_rate": 0.0001, "loss": 7.2748, "loss/crossentropy": 1.8965728282928467, "loss/hidden": 0.12890625, "loss/logits": 0.0069448379799723625, "loss/reg": 5.242362976074219, "loss/twn": 0.0, "step": 380 }, { "epoch": 0.009525, "grad_norm": 10.75, "grad_norm_var": 10.688785807291667, "learning_rate": 0.0001, "loss": 7.591, "loss/crossentropy": 2.19319224357605, "loss/hidden": 0.146484375, "loss/logits": 0.009041574783623219, "loss/reg": 5.242304801940918, "loss/twn": 0.0, "step": 381 }, { "epoch": 0.00955, "grad_norm": 17.875, "grad_norm_var": 12.215478515625, "learning_rate": 0.0001, "loss": 8.1746, "loss/crossentropy": 2.799837827682495, "loss/hidden": 0.1220703125, "loss/logits": 0.010410355404019356, "loss/reg": 5.242269992828369, "loss/twn": 0.0, "step": 382 }, { "epoch": 0.009575, "grad_norm": 142.0, "grad_norm_var": 1052.0530598958333, "learning_rate": 0.0001, "loss": 5.5973, "loss/crossentropy": 0.23081077635288239, "loss/hidden": 0.119140625, "loss/logits": 0.00518256239593029, "loss/reg": 5.242154121398926, "loss/twn": 0.0, "step": 383 }, { "epoch": 0.0096, "grad_norm": 178.0, "grad_norm_var": 2588.1160807291667, "learning_rate": 0.0001, "loss": 7.9228, "loss/crossentropy": 2.5279970169067383, "loss/hidden": 0.146484375, "loss/logits": 0.006114904303103685, "loss/reg": 5.242175102233887, "loss/twn": 0.0, "step": 384 }, { "epoch": 0.009625, "grad_norm": 9.6875, "grad_norm_var": 2591.222135416667, "learning_rate": 0.0001, "loss": 8.0683, "loss/crossentropy": 2.7547900676727295, "loss/hidden": 0.064453125, "loss/logits": 0.007344301789999008, "loss/reg": 5.241701602935791, "loss/twn": 0.0, "step": 385 }, { "epoch": 0.00965, "grad_norm": 15.4375, "grad_norm_var": 2574.622770182292, "learning_rate": 0.0001, "loss": 8.1417, "loss/crossentropy": 2.6952314376831055, "loss/hidden": 0.185546875, "loss/logits": 0.019006717950105667, "loss/reg": 5.241945743560791, "loss/twn": 0.0, "step": 386 }, { "epoch": 0.009675, "grad_norm": 10.5625, "grad_norm_var": 2583.447509765625, "learning_rate": 0.0001, "loss": 7.02, "loss/crossentropy": 1.5960569381713867, "loss/hidden": 0.16796875, "loss/logits": 0.014128390699625015, "loss/reg": 5.241806507110596, "loss/twn": 0.0, "step": 387 }, { "epoch": 0.0097, "grad_norm": 9.875, "grad_norm_var": 2585.531494140625, "learning_rate": 0.0001, "loss": 7.6158, "loss/crossentropy": 2.310436487197876, "loss/hidden": 0.05712890625, "loss/logits": 0.006508246064186096, "loss/reg": 5.241683006286621, "loss/twn": 0.0, "step": 388 }, { "epoch": 0.009725, "grad_norm": 13.625, "grad_norm_var": 2579.2749837239585, "learning_rate": 0.0001, "loss": 8.1291, "loss/crossentropy": 2.7542238235473633, "loss/hidden": 0.12255859375, "loss/logits": 0.010532179847359657, "loss/reg": 5.241789817810059, "loss/twn": 0.0, "step": 389 }, { "epoch": 0.00975, "grad_norm": 8.0625, "grad_norm_var": 2585.864436848958, "learning_rate": 0.0001, "loss": 7.6721, "loss/crossentropy": 2.319371223449707, "loss/hidden": 0.107421875, "loss/logits": 0.003740239655598998, "loss/reg": 5.241525650024414, "loss/twn": 0.0, "step": 390 }, { "epoch": 0.009775, "grad_norm": 10.4375, "grad_norm_var": 2593.774593098958, "learning_rate": 0.0001, "loss": 8.1857, "loss/crossentropy": 2.8464860916137695, "loss/hidden": 0.09130859375, "loss/logits": 0.006419371347874403, "loss/reg": 5.241455078125, "loss/twn": 0.0, "step": 391 }, { "epoch": 0.0098, "grad_norm": 17.125, "grad_norm_var": 2585.749479166667, "learning_rate": 0.0001, "loss": 8.0076, "loss/crossentropy": 2.555150032043457, "loss/hidden": 0.2001953125, "loss/logits": 0.01086280308663845, "loss/reg": 5.241420745849609, "loss/twn": 0.0, "step": 392 }, { "epoch": 0.009825, "grad_norm": 9.0, "grad_norm_var": 2609.972135416667, "learning_rate": 0.0001, "loss": 6.3665, "loss/crossentropy": 1.016921877861023, "loss/hidden": 0.10205078125, "loss/logits": 0.005952711217105389, "loss/reg": 5.241562843322754, "loss/twn": 0.0, "step": 393 }, { "epoch": 0.00985, "grad_norm": 10.875, "grad_norm_var": 2614.3161458333334, "learning_rate": 0.0001, "loss": 7.8429, "loss/crossentropy": 2.5393762588500977, "loss/hidden": 0.05712890625, "loss/logits": 0.005034131929278374, "loss/reg": 5.241337299346924, "loss/twn": 0.0, "step": 394 }, { "epoch": 0.009875, "grad_norm": 80.0, "grad_norm_var": 2738.4009765625, "learning_rate": 0.0001, "loss": 8.0734, "loss/crossentropy": 2.591184377670288, "loss/hidden": 0.2294921875, "loss/logits": 0.01196893397718668, "loss/reg": 5.24077033996582, "loss/twn": 0.0, "step": 395 }, { "epoch": 0.0099, "grad_norm": 20.375, "grad_norm_var": 2708.840478515625, "learning_rate": 0.0001, "loss": 8.3222, "loss/crossentropy": 2.96229887008667, "loss/hidden": 0.10791015625, "loss/logits": 0.010710952803492546, "loss/reg": 5.2412495613098145, "loss/twn": 0.0, "step": 396 }, { "epoch": 0.009925, "grad_norm": 14.0, "grad_norm_var": 2698.892431640625, "learning_rate": 0.0001, "loss": 7.2185, "loss/crossentropy": 1.843474268913269, "loss/hidden": 0.125, "loss/logits": 0.009277286008000374, "loss/reg": 5.240739345550537, "loss/twn": 0.0, "step": 397 }, { "epoch": 0.00995, "grad_norm": 11.0, "grad_norm_var": 2717.9419108072916, "learning_rate": 0.0001, "loss": 7.7714, "loss/crossentropy": 2.3742737770080566, "loss/hidden": 0.146484375, "loss/logits": 0.009680403396487236, "loss/reg": 5.240973472595215, "loss/twn": 0.0, "step": 398 }, { "epoch": 0.009975, "grad_norm": 12.0, "grad_norm_var": 1919.5929524739583, "learning_rate": 0.0001, "loss": 8.0069, "loss/crossentropy": 2.649110794067383, "loss/hidden": 0.10791015625, "loss/logits": 0.00901185255497694, "loss/reg": 5.24085807800293, "loss/twn": 0.0, "step": 399 }, { "epoch": 0.01, "grad_norm": 9.5, "grad_norm_var": 298.921337890625, "learning_rate": 0.0001, "loss": 7.3041, "loss/crossentropy": 1.9669857025146484, "loss/hidden": 0.0908203125, "loss/logits": 0.005569307133555412, "loss/reg": 5.240681171417236, "loss/twn": 0.0, "step": 400 }, { "epoch": 0.010025, "grad_norm": 169.0, "grad_norm_var": 1743.7280598958334, "learning_rate": 0.0001, "loss": 5.7116, "loss/crossentropy": 0.3063473105430603, "loss/hidden": 0.1611328125, "loss/logits": 0.003124656155705452, "loss/reg": 5.2410149574279785, "loss/twn": 0.0, "step": 401 }, { "epoch": 0.01005, "grad_norm": 61.75, "grad_norm_var": 1810.6761555989583, "learning_rate": 0.0001, "loss": 7.4546, "loss/crossentropy": 2.0817315578460693, "loss/hidden": 0.1279296875, "loss/logits": 0.004020760301500559, "loss/reg": 5.240888595581055, "loss/twn": 0.0, "step": 402 }, { "epoch": 0.010075, "grad_norm": 19.25, "grad_norm_var": 1793.8056640625, "learning_rate": 0.0001, "loss": 6.645, "loss/crossentropy": 1.261731743812561, "loss/hidden": 0.13671875, "loss/logits": 0.006218242458999157, "loss/reg": 5.240310192108154, "loss/twn": 0.0, "step": 403 }, { "epoch": 0.0101, "grad_norm": 8.875, "grad_norm_var": 1796.5171223958334, "learning_rate": 0.0001, "loss": 6.3703, "loss/crossentropy": 0.9765498638153076, "loss/hidden": 0.1455078125, "loss/logits": 0.007485189475119114, "loss/reg": 5.240753650665283, "loss/twn": 0.0, "step": 404 }, { "epoch": 0.010125, "grad_norm": 10.8125, "grad_norm_var": 1803.0320149739584, "learning_rate": 0.0001, "loss": 7.0126, "loss/crossentropy": 1.6325502395629883, "loss/hidden": 0.1318359375, "loss/logits": 0.007993818260729313, "loss/reg": 5.240240097045898, "loss/twn": 0.0, "step": 405 }, { "epoch": 0.01015, "grad_norm": 41.25, "grad_norm_var": 1776.9919270833334, "learning_rate": 0.0001, "loss": 6.1547, "loss/crossentropy": 0.801928699016571, "loss/hidden": 0.10986328125, "loss/logits": 0.002597600221633911, "loss/reg": 5.240261554718018, "loss/twn": 0.0, "step": 406 }, { "epoch": 0.010175, "grad_norm": 15.4375, "grad_norm_var": 1764.4606770833334, "learning_rate": 0.0001, "loss": 7.3067, "loss/crossentropy": 1.9361686706542969, "loss/hidden": 0.11962890625, "loss/logits": 0.01030397042632103, "loss/reg": 5.240562438964844, "loss/twn": 0.0, "step": 407 }, { "epoch": 0.0102, "grad_norm": 9.375, "grad_norm_var": 1783.4723958333334, "learning_rate": 0.0001, "loss": 8.0074, "loss/crossentropy": 2.654067039489746, "loss/hidden": 0.10546875, "loss/logits": 0.0074330884963274, "loss/reg": 5.240452766418457, "loss/twn": 0.0, "step": 408 }, { "epoch": 0.010225, "grad_norm": 8.0625, "grad_norm_var": 1786.3281087239584, "learning_rate": 0.0001, "loss": 7.3042, "loss/crossentropy": 1.928905963897705, "loss/hidden": 0.125, "loss/logits": 0.009886398911476135, "loss/reg": 5.240396499633789, "loss/twn": 0.0, "step": 409 }, { "epoch": 0.01025, "grad_norm": 11.8125, "grad_norm_var": 1783.8239583333334, "learning_rate": 0.0001, "loss": 7.1916, "loss/crossentropy": 1.7467641830444336, "loss/hidden": 0.1943359375, "loss/logits": 0.010304899886250496, "loss/reg": 5.2401838302612305, "loss/twn": 0.0, "step": 410 }, { "epoch": 0.010275, "grad_norm": 9.6875, "grad_norm_var": 1637.2480305989584, "learning_rate": 0.0001, "loss": 7.8017, "loss/crossentropy": 2.478278636932373, "loss/hidden": 0.07666015625, "loss/logits": 0.0064398422837257385, "loss/reg": 5.2403669357299805, "loss/twn": 0.0, "step": 411 }, { "epoch": 0.0103, "grad_norm": 13.75, "grad_norm_var": 1645.8536295572917, "learning_rate": 0.0001, "loss": 5.9725, "loss/crossentropy": 0.5244819521903992, "loss/hidden": 0.1982421875, "loss/logits": 0.009226880967617035, "loss/reg": 5.24050760269165, "loss/twn": 0.0, "step": 412 }, { "epoch": 0.010325, "grad_norm": 12.75, "grad_norm_var": 1648.0508951822917, "learning_rate": 0.0001, "loss": 6.1233, "loss/crossentropy": 0.6444658041000366, "loss/hidden": 0.228515625, "loss/logits": 0.009683252312242985, "loss/reg": 5.240647792816162, "loss/twn": 0.0, "step": 413 }, { "epoch": 0.01035, "grad_norm": 30.375, "grad_norm_var": 1631.4206868489584, "learning_rate": 0.0001, "loss": 7.3483, "loss/crossentropy": 1.942959189414978, "loss/hidden": 0.150390625, "loss/logits": 0.014488045126199722, "loss/reg": 5.2405009269714355, "loss/twn": 0.0, "step": 414 }, { "epoch": 0.010375, "grad_norm": 148.0, "grad_norm_var": 2502.174853515625, "learning_rate": 0.0001, "loss": 5.7032, "loss/crossentropy": 0.21244874596595764, "loss/hidden": 0.2451171875, "loss/logits": 0.005563709884881973, "loss/reg": 5.240046501159668, "loss/twn": 0.0, "step": 415 }, { "epoch": 0.0104, "grad_norm": 11.3125, "grad_norm_var": 2495.9203125, "learning_rate": 0.0001, "loss": 8.1335, "loss/crossentropy": 2.8577675819396973, "loss/hidden": 0.0302734375, "loss/logits": 0.005365458317101002, "loss/reg": 5.240131378173828, "loss/twn": 0.0, "step": 416 }, { "epoch": 0.010425, "grad_norm": 10.8125, "grad_norm_var": 1261.934228515625, "learning_rate": 0.0001, "loss": 6.9774, "loss/crossentropy": 1.5951570272445679, "loss/hidden": 0.134765625, "loss/logits": 0.007291505113244057, "loss/reg": 5.240211486816406, "loss/twn": 0.0, "step": 417 }, { "epoch": 0.01045, "grad_norm": 9.875, "grad_norm_var": 1186.0130045572917, "learning_rate": 0.0001, "loss": 7.6738, "loss/crossentropy": 2.354665756225586, "loss/hidden": 0.07421875, "loss/logits": 0.004681308753788471, "loss/reg": 5.240237236022949, "loss/twn": 0.0, "step": 418 }, { "epoch": 0.010475, "grad_norm": 18.0, "grad_norm_var": 1186.7714680989584, "learning_rate": 0.0001, "loss": 7.9042, "loss/crossentropy": 2.479489326477051, "loss/hidden": 0.1748046875, "loss/logits": 0.009896760806441307, "loss/reg": 5.239961624145508, "loss/twn": 0.0, "step": 419 }, { "epoch": 0.0105, "grad_norm": 78.0, "grad_norm_var": 1353.9675618489584, "learning_rate": 0.0001, "loss": 6.7, "loss/crossentropy": 1.1504077911376953, "loss/hidden": 0.306640625, "loss/logits": 0.0026911741588264704, "loss/reg": 5.240240097045898, "loss/twn": 0.0, "step": 420 }, { "epoch": 0.010525, "grad_norm": 14.5625, "grad_norm_var": 1346.5242024739584, "learning_rate": 0.0001, "loss": 8.0741, "loss/crossentropy": 2.7434136867523193, "loss/hidden": 0.083984375, "loss/logits": 0.006712112110108137, "loss/reg": 5.2399516105651855, "loss/twn": 0.0, "step": 421 }, { "epoch": 0.01055, "grad_norm": 14.375, "grad_norm_var": 1343.0808430989584, "learning_rate": 0.0001, "loss": 6.6885, "loss/crossentropy": 1.297805666923523, "loss/hidden": 0.1455078125, "loss/logits": 0.005279569886624813, "loss/reg": 5.2398576736450195, "loss/twn": 0.0, "step": 422 }, { "epoch": 0.010575, "grad_norm": 10.125, "grad_norm_var": 1352.3348307291667, "learning_rate": 0.0001, "loss": 5.9427, "loss/crossentropy": 0.5282614231109619, "loss/hidden": 0.1669921875, "loss/logits": 0.007687091361731291, "loss/reg": 5.239724159240723, "loss/twn": 0.0, "step": 423 }, { "epoch": 0.0106, "grad_norm": 13.6875, "grad_norm_var": 1344.1219889322917, "learning_rate": 0.0001, "loss": 7.3072, "loss/crossentropy": 1.7837430238723755, "loss/hidden": 0.271484375, "loss/logits": 0.012293729931116104, "loss/reg": 5.2396626472473145, "loss/twn": 0.0, "step": 424 }, { "epoch": 0.010625, "grad_norm": 53.75, "grad_norm_var": 1365.6212890625, "learning_rate": 0.0001, "loss": 8.513, "loss/crossentropy": 3.109501838684082, "loss/hidden": 0.1572265625, "loss/logits": 0.006876545026898384, "loss/reg": 5.239365100860596, "loss/twn": 0.0, "step": 425 }, { "epoch": 0.01065, "grad_norm": 8.5625, "grad_norm_var": 1373.6447265625, "learning_rate": 0.0001, "loss": 7.5404, "loss/crossentropy": 2.174077272415161, "loss/hidden": 0.115234375, "loss/logits": 0.011358590796589851, "loss/reg": 5.239738941192627, "loss/twn": 0.0, "step": 426 }, { "epoch": 0.010675, "grad_norm": 12.1875, "grad_norm_var": 1367.7306640625, "learning_rate": 0.0001, "loss": 8.1571, "loss/crossentropy": 2.827575922012329, "loss/hidden": 0.083984375, "loss/logits": 0.005951396189630032, "loss/reg": 5.239595890045166, "loss/twn": 0.0, "step": 427 }, { "epoch": 0.0107, "grad_norm": 14.875, "grad_norm_var": 1365.55859375, "learning_rate": 0.0001, "loss": 8.0294, "loss/crossentropy": 2.697416305541992, "loss/hidden": 0.087890625, "loss/logits": 0.004499722272157669, "loss/reg": 5.239617824554443, "loss/twn": 0.0, "step": 428 }, { "epoch": 0.010725, "grad_norm": 11.0, "grad_norm_var": 1369.5015625, "learning_rate": 0.0001, "loss": 6.9497, "loss/crossentropy": 1.5794799327850342, "loss/hidden": 0.12255859375, "loss/logits": 0.008041350170969963, "loss/reg": 5.239623069763184, "loss/twn": 0.0, "step": 429 }, { "epoch": 0.01075, "grad_norm": 12.0, "grad_norm_var": 1386.5462890625, "learning_rate": 0.0001, "loss": 6.8916, "loss/crossentropy": 1.5268070697784424, "loss/hidden": 0.11962890625, "loss/logits": 0.0057592848315835, "loss/reg": 5.23940372467041, "loss/twn": 0.0, "step": 430 }, { "epoch": 0.010775, "grad_norm": 11.5, "grad_norm_var": 359.2416015625, "learning_rate": 0.0001, "loss": 8.0251, "loss/crossentropy": 2.647442579269409, "loss/hidden": 0.12890625, "loss/logits": 0.009594411589205265, "loss/reg": 5.239197254180908, "loss/twn": 0.0, "step": 431 }, { "epoch": 0.0108, "grad_norm": 18.75, "grad_norm_var": 355.0367024739583, "learning_rate": 0.0001, "loss": 8.1387, "loss/crossentropy": 2.7440555095672607, "loss/hidden": 0.1376953125, "loss/logits": 0.01775319315493107, "loss/reg": 5.23914909362793, "loss/twn": 0.0, "step": 432 }, { "epoch": 0.010825, "grad_norm": 9.0, "grad_norm_var": 357.3424479166667, "learning_rate": 0.0001, "loss": 7.2622, "loss/crossentropy": 1.8811193704605103, "loss/hidden": 0.1337890625, "loss/logits": 0.007980940863490105, "loss/reg": 5.239285469055176, "loss/twn": 0.0, "step": 433 }, { "epoch": 0.01085, "grad_norm": 10.625, "grad_norm_var": 356.42604166666666, "learning_rate": 0.0001, "loss": 8.2447, "loss/crossentropy": 2.8804891109466553, "loss/hidden": 0.1123046875, "loss/logits": 0.01278759352862835, "loss/reg": 5.239134311676025, "loss/twn": 0.0, "step": 434 }, { "epoch": 0.010875, "grad_norm": 102.0, "grad_norm_var": 781.3260416666667, "learning_rate": 0.0001, "loss": 6.8421, "loss/crossentropy": 1.4239375591278076, "loss/hidden": 0.1669921875, "loss/logits": 0.011750075966119766, "loss/reg": 5.239468574523926, "loss/twn": 0.0, "step": 435 }, { "epoch": 0.0109, "grad_norm": 14.25, "grad_norm_var": 582.1736979166667, "learning_rate": 0.0001, "loss": 7.9746, "loss/crossentropy": 2.5394973754882812, "loss/hidden": 0.1875, "loss/logits": 0.008459478616714478, "loss/reg": 5.239116191864014, "loss/twn": 0.0, "step": 436 }, { "epoch": 0.010925, "grad_norm": 7.8125, "grad_norm_var": 590.5479166666667, "learning_rate": 0.0001, "loss": 7.3172, "loss/crossentropy": 1.975609540939331, "loss/hidden": 0.0986328125, "loss/logits": 0.0038147151935845613, "loss/reg": 5.239123344421387, "loss/twn": 0.0, "step": 437 }, { "epoch": 0.01095, "grad_norm": 10.8125, "grad_norm_var": 594.1465983072917, "learning_rate": 0.0001, "loss": 7.1303, "loss/crossentropy": 1.7625492811203003, "loss/hidden": 0.12255859375, "loss/logits": 0.005926240235567093, "loss/reg": 5.239302635192871, "loss/twn": 0.0, "step": 438 }, { "epoch": 0.010975, "grad_norm": 11.75, "grad_norm_var": 592.1593587239583, "learning_rate": 0.0001, "loss": 8.0788, "loss/crossentropy": 2.697333812713623, "loss/hidden": 0.12890625, "loss/logits": 0.013756821863353252, "loss/reg": 5.2388529777526855, "loss/twn": 0.0, "step": 439 }, { "epoch": 0.011, "grad_norm": 57.0, "grad_norm_var": 672.0280598958333, "learning_rate": 0.0001, "loss": 7.046, "loss/crossentropy": 1.7012284994125366, "loss/hidden": 0.10009765625, "loss/logits": 0.005743634421378374, "loss/reg": 5.238898754119873, "loss/twn": 0.0, "step": 440 }, { "epoch": 0.011025, "grad_norm": 16.375, "grad_norm_var": 605.434375, "learning_rate": 0.0001, "loss": 7.7288, "loss/crossentropy": 2.397491216659546, "loss/hidden": 0.08642578125, "loss/logits": 0.005718431435525417, "loss/reg": 5.239189624786377, "loss/twn": 0.0, "step": 441 }, { "epoch": 0.01105, "grad_norm": 30.75, "grad_norm_var": 600.7946451822917, "learning_rate": 0.0001, "loss": 6.7732, "loss/crossentropy": 1.3775757551193237, "loss/hidden": 0.150390625, "loss/logits": 0.006438620388507843, "loss/reg": 5.23883056640625, "loss/twn": 0.0, "step": 442 }, { "epoch": 0.011075, "grad_norm": 326.0, "grad_norm_var": 6348.5484375, "learning_rate": 0.0001, "loss": 6.8298, "loss/crossentropy": 1.457594394683838, "loss/hidden": 0.12890625, "loss/logits": 0.00396731635555625, "loss/reg": 5.239315509796143, "loss/twn": 0.0, "step": 443 }, { "epoch": 0.0111, "grad_norm": 12.9375, "grad_norm_var": 6355.669254557291, "learning_rate": 0.0001, "loss": 8.2584, "loss/crossentropy": 2.878317356109619, "loss/hidden": 0.130859375, "loss/logits": 0.010759024880826473, "loss/reg": 5.238509178161621, "loss/twn": 0.0, "step": 444 }, { "epoch": 0.011125, "grad_norm": 8.4375, "grad_norm_var": 6366.469791666666, "learning_rate": 0.0001, "loss": 7.6916, "loss/crossentropy": 2.346791982650757, "loss/hidden": 0.10107421875, "loss/logits": 0.004902126267552376, "loss/reg": 5.2388715744018555, "loss/twn": 0.0, "step": 445 }, { "epoch": 0.01115, "grad_norm": 35.75, "grad_norm_var": 6309.098697916666, "learning_rate": 0.0001, "loss": 6.5195, "loss/crossentropy": 1.047242283821106, "loss/hidden": 0.2265625, "loss/logits": 0.006808393634855747, "loss/reg": 5.2388739585876465, "loss/twn": 0.0, "step": 446 }, { "epoch": 0.011175, "grad_norm": 51.75, "grad_norm_var": 6242.728125, "learning_rate": 0.0001, "loss": 8.1962, "loss/crossentropy": 2.7516071796417236, "loss/hidden": 0.1865234375, "loss/logits": 0.0192459337413311, "loss/reg": 5.238797187805176, "loss/twn": 0.0, "step": 447 }, { "epoch": 0.0112, "grad_norm": 12.6875, "grad_norm_var": 6266.446077473958, "learning_rate": 0.0001, "loss": 8.1265, "loss/crossentropy": 2.730348587036133, "loss/hidden": 0.142578125, "loss/logits": 0.015144633129239082, "loss/reg": 5.238423824310303, "loss/twn": 0.0, "step": 448 }, { "epoch": 0.011225, "grad_norm": 48.25, "grad_norm_var": 6175.005843098958, "learning_rate": 0.0001, "loss": 6.9994, "loss/crossentropy": 1.4689970016479492, "loss/hidden": 0.283203125, "loss/logits": 0.008740945719182491, "loss/reg": 5.238440036773682, "loss/twn": 0.0, "step": 449 }, { "epoch": 0.01125, "grad_norm": 7.8125, "grad_norm_var": 6189.262434895833, "learning_rate": 0.0001, "loss": 7.8836, "loss/crossentropy": 2.586432695388794, "loss/hidden": 0.0546875, "loss/logits": 0.004194296896457672, "loss/reg": 5.238241672515869, "loss/twn": 0.0, "step": 450 }, { "epoch": 0.011275, "grad_norm": 12.5625, "grad_norm_var": 6035.099202473958, "learning_rate": 0.0001, "loss": 7.5775, "loss/crossentropy": 2.230092763900757, "loss/hidden": 0.10107421875, "loss/logits": 0.007766470313072205, "loss/reg": 5.238610744476318, "loss/twn": 0.0, "step": 451 }, { "epoch": 0.0113, "grad_norm": 10.9375, "grad_norm_var": 6047.8462890625, "learning_rate": 0.0001, "loss": 7.9835, "loss/crossentropy": 2.615774154663086, "loss/hidden": 0.1201171875, "loss/logits": 0.009664995595812798, "loss/reg": 5.237979412078857, "loss/twn": 0.0, "step": 452 }, { "epoch": 0.011325, "grad_norm": 76.0, "grad_norm_var": 6033.516259765625, "learning_rate": 0.0001, "loss": 6.81, "loss/crossentropy": 1.415939450263977, "loss/hidden": 0.1435546875, "loss/logits": 0.01219608448445797, "loss/reg": 5.238288402557373, "loss/twn": 0.0, "step": 453 }, { "epoch": 0.01135, "grad_norm": 9.125, "grad_norm_var": 6041.5244140625, "learning_rate": 0.0001, "loss": 7.0682, "loss/crossentropy": 1.6937216520309448, "loss/hidden": 0.1298828125, "loss/logits": 0.006700664758682251, "loss/reg": 5.237900733947754, "loss/twn": 0.0, "step": 454 }, { "epoch": 0.011375, "grad_norm": 19.125, "grad_norm_var": 6011.728645833334, "learning_rate": 0.0001, "loss": 8.4153, "loss/crossentropy": 2.93511700630188, "loss/hidden": 0.2197265625, "loss/logits": 0.022560518234968185, "loss/reg": 5.237886905670166, "loss/twn": 0.0, "step": 455 }, { "epoch": 0.0114, "grad_norm": 15.3125, "grad_norm_var": 6059.028759765625, "learning_rate": 0.0001, "loss": 7.0063, "loss/crossentropy": 1.5995585918426514, "loss/hidden": 0.1572265625, "loss/logits": 0.011760546825826168, "loss/reg": 5.237764835357666, "loss/twn": 0.0, "step": 456 }, { "epoch": 0.011425, "grad_norm": 10.875, "grad_norm_var": 6080.710791015625, "learning_rate": 0.0001, "loss": 7.0912, "loss/crossentropy": 1.6729381084442139, "loss/hidden": 0.1669921875, "loss/logits": 0.013158103451132774, "loss/reg": 5.238087177276611, "loss/twn": 0.0, "step": 457 }, { "epoch": 0.01145, "grad_norm": 11.3125, "grad_norm_var": 6136.1228515625, "learning_rate": 0.0001, "loss": 7.8938, "loss/crossentropy": 2.558936834335327, "loss/hidden": 0.08642578125, "loss/logits": 0.010592980310320854, "loss/reg": 5.23784065246582, "loss/twn": 0.0, "step": 458 }, { "epoch": 0.011475, "grad_norm": 14.3125, "grad_norm_var": 397.25792643229164, "learning_rate": 0.0001, "loss": 8.1499, "loss/crossentropy": 2.677492141723633, "loss/hidden": 0.21875, "loss/logits": 0.015965130180120468, "loss/reg": 5.237676620483398, "loss/twn": 0.0, "step": 459 }, { "epoch": 0.0115, "grad_norm": 15.4375, "grad_norm_var": 394.51964518229164, "learning_rate": 0.0001, "loss": 8.1308, "loss/crossentropy": 2.706998348236084, "loss/hidden": 0.1708984375, "loss/logits": 0.015249890275299549, "loss/reg": 5.237621784210205, "loss/twn": 0.0, "step": 460 }, { "epoch": 0.011525, "grad_norm": 10.6875, "grad_norm_var": 390.62316080729164, "learning_rate": 0.0001, "loss": 7.1208, "loss/crossentropy": 1.7324503660202026, "loss/hidden": 0.1416015625, "loss/logits": 0.008826036937534809, "loss/reg": 5.237947940826416, "loss/twn": 0.0, "step": 461 }, { "epoch": 0.01155, "grad_norm": 88.0, "grad_norm_var": 652.7167805989583, "learning_rate": 0.0001, "loss": 7.8363, "loss/crossentropy": 2.4369447231292725, "loss/hidden": 0.15234375, "loss/logits": 0.00924272183328867, "loss/reg": 5.237813949584961, "loss/twn": 0.0, "step": 462 }, { "epoch": 0.011575, "grad_norm": 8.875, "grad_norm_var": 619.7566243489583, "learning_rate": 0.0001, "loss": 7.7379, "loss/crossentropy": 2.4647915363311768, "loss/hidden": 0.0302734375, "loss/logits": 0.005017576273530722, "loss/reg": 5.237803936004639, "loss/twn": 0.0, "step": 463 }, { "epoch": 0.0116, "grad_norm": 8.3125, "grad_norm_var": 627.089306640625, "learning_rate": 0.0001, "loss": 7.7172, "loss/crossentropy": 2.3781650066375732, "loss/hidden": 0.09619140625, "loss/logits": 0.0055891769006848335, "loss/reg": 5.237229824066162, "loss/twn": 0.0, "step": 464 }, { "epoch": 0.011625, "grad_norm": 65.5, "grad_norm_var": 703.914697265625, "learning_rate": 0.0001, "loss": 8.1072, "loss/crossentropy": 2.73203182220459, "loss/hidden": 0.126953125, "loss/logits": 0.010718154720962048, "loss/reg": 5.237488746643066, "loss/twn": 0.0, "step": 465 }, { "epoch": 0.01165, "grad_norm": 21.875, "grad_norm_var": 685.90078125, "learning_rate": 0.0001, "loss": 8.3027, "loss/crossentropy": 2.8547723293304443, "loss/hidden": 0.1865234375, "loss/logits": 0.023786598816514015, "loss/reg": 5.2376484870910645, "loss/twn": 0.0, "step": 466 }, { "epoch": 0.011675, "grad_norm": 101.5, "grad_norm_var": 1034.077197265625, "learning_rate": 0.0001, "loss": 7.6863, "loss/crossentropy": 2.4159936904907227, "loss/hidden": 0.0302734375, "loss/logits": 0.0024244533851742744, "loss/reg": 5.23759126663208, "loss/twn": 0.0, "step": 467 }, { "epoch": 0.0117, "grad_norm": 13.9375, "grad_norm_var": 1026.835009765625, "learning_rate": 0.0001, "loss": 8.0093, "loss/crossentropy": 2.580734968185425, "loss/hidden": 0.17578125, "loss/logits": 0.015159064903855324, "loss/reg": 5.2376275062561035, "loss/twn": 0.0, "step": 468 }, { "epoch": 0.011725, "grad_norm": 9.8125, "grad_norm_var": 900.303125, "learning_rate": 0.0001, "loss": 7.7953, "loss/crossentropy": 2.4660680294036865, "loss/hidden": 0.08642578125, "loss/logits": 0.005589427426457405, "loss/reg": 5.237224578857422, "loss/twn": 0.0, "step": 469 }, { "epoch": 0.01175, "grad_norm": 384.0, "grad_norm_var": 8815.046809895834, "learning_rate": 0.0001, "loss": 6.1676, "loss/crossentropy": 0.7747684121131897, "loss/hidden": 0.1494140625, "loss/logits": 0.005672769621014595, "loss/reg": 5.23769998550415, "loss/twn": 0.0, "step": 470 }, { "epoch": 0.011775, "grad_norm": 14.75, "grad_norm_var": 8834.2125, "learning_rate": 0.0001, "loss": 6.978, "loss/crossentropy": 1.5184272527694702, "loss/hidden": 0.2109375, "loss/logits": 0.011126836761832237, "loss/reg": 5.2375288009643555, "loss/twn": 0.0, "step": 471 }, { "epoch": 0.0118, "grad_norm": 10.4375, "grad_norm_var": 8858.0212890625, "learning_rate": 0.0001, "loss": 7.7419, "loss/crossentropy": 2.359570264816284, "loss/hidden": 0.1318359375, "loss/logits": 0.013412706553936005, "loss/reg": 5.2371039390563965, "loss/twn": 0.0, "step": 472 }, { "epoch": 0.011825, "grad_norm": 13.9375, "grad_norm_var": 8842.896207682292, "learning_rate": 0.0001, "loss": 8.1685, "loss/crossentropy": 2.766843557357788, "loss/hidden": 0.150390625, "loss/logits": 0.01371270976960659, "loss/reg": 5.237538814544678, "loss/twn": 0.0, "step": 473 }, { "epoch": 0.01185, "grad_norm": 15.3125, "grad_norm_var": 8823.506624348958, "learning_rate": 0.0001, "loss": 7.5292, "loss/crossentropy": 2.151641607284546, "loss/hidden": 0.1318359375, "loss/logits": 0.008503757417201996, "loss/reg": 5.237189769744873, "loss/twn": 0.0, "step": 474 }, { "epoch": 0.011875, "grad_norm": 11.25, "grad_norm_var": 8838.5806640625, "learning_rate": 0.0001, "loss": 8.0395, "loss/crossentropy": 2.686025619506836, "loss/hidden": 0.1103515625, "loss/logits": 0.005928123835474253, "loss/reg": 5.237187385559082, "loss/twn": 0.0, "step": 475 }, { "epoch": 0.0119, "grad_norm": 10.1875, "grad_norm_var": 8864.2181640625, "learning_rate": 0.0001, "loss": 6.7249, "loss/crossentropy": 1.3968653678894043, "loss/hidden": 0.08642578125, "loss/logits": 0.004261254798620939, "loss/reg": 5.23736047744751, "loss/twn": 0.0, "step": 476 }, { "epoch": 0.011925, "grad_norm": 17.375, "grad_norm_var": 8832.607535807292, "learning_rate": 0.0001, "loss": 7.1631, "loss/crossentropy": 1.7119083404541016, "loss/hidden": 0.203125, "loss/logits": 0.010743262246251106, "loss/reg": 5.237338542938232, "loss/twn": 0.0, "step": 477 }, { "epoch": 0.01195, "grad_norm": 584.0, "grad_norm_var": 26742.08253580729, "learning_rate": 0.0001, "loss": 6.4806, "loss/crossentropy": 1.1264278888702393, "loss/hidden": 0.11376953125, "loss/logits": 0.003235449083149433, "loss/reg": 5.237181663513184, "loss/twn": 0.0, "step": 478 }, { "epoch": 0.011975, "grad_norm": 11.375, "grad_norm_var": 26718.53435872396, "learning_rate": 0.0001, "loss": 5.9934, "loss/crossentropy": 0.5191141963005066, "loss/hidden": 0.2275390625, "loss/logits": 0.009647047147154808, "loss/reg": 5.237125396728516, "loss/twn": 0.0, "step": 479 }, { "epoch": 0.012, "grad_norm": 10.375, "grad_norm_var": 26698.853059895835, "learning_rate": 0.0001, "loss": 8.1124, "loss/crossentropy": 2.780978202819824, "loss/hidden": 0.08642578125, "loss/logits": 0.007745138369500637, "loss/reg": 5.23725700378418, "loss/twn": 0.0, "step": 480 }, { "epoch": 0.012025, "grad_norm": 34.5, "grad_norm_var": 26822.8853515625, "learning_rate": 0.0001, "loss": 7.868, "loss/crossentropy": 2.5087831020355225, "loss/hidden": 0.11474609375, "loss/logits": 0.007656463421881199, "loss/reg": 5.236792087554932, "loss/twn": 0.0, "step": 481 }, { "epoch": 0.01205, "grad_norm": 8.6875, "grad_norm_var": 26934.268212890624, "learning_rate": 0.0001, "loss": 7.2881, "loss/crossentropy": 1.942946434020996, "loss/hidden": 0.10107421875, "loss/logits": 0.007265533320605755, "loss/reg": 5.236773490905762, "loss/twn": 0.0, "step": 482 }, { "epoch": 0.012075, "grad_norm": 10.375, "grad_norm_var": 27170.338916015626, "learning_rate": 0.0001, "loss": 7.3296, "loss/crossentropy": 1.9258122444152832, "loss/hidden": 0.1552734375, "loss/logits": 0.011823762208223343, "loss/reg": 5.2366719245910645, "loss/twn": 0.0, "step": 483 }, { "epoch": 0.0121, "grad_norm": 9.4375, "grad_norm_var": 27206.753759765626, "learning_rate": 0.0001, "loss": 7.756, "loss/crossentropy": 2.4538094997406006, "loss/hidden": 0.0595703125, "loss/logits": 0.005918778479099274, "loss/reg": 5.2366943359375, "loss/twn": 0.0, "step": 484 }, { "epoch": 0.012125, "grad_norm": 13.0625, "grad_norm_var": 27180.362744140624, "learning_rate": 0.0001, "loss": 8.1167, "loss/crossentropy": 2.796402931213379, "loss/hidden": 0.0791015625, "loss/logits": 0.004426885861903429, "loss/reg": 5.236767768859863, "loss/twn": 0.0, "step": 485 }, { "epoch": 0.01215, "grad_norm": 10.0625, "grad_norm_var": 20385.898893229165, "learning_rate": 0.0001, "loss": 7.9673, "loss/crossentropy": 2.5698435306549072, "loss/hidden": 0.1494140625, "loss/logits": 0.011329087428748608, "loss/reg": 5.236757278442383, "loss/twn": 0.0, "step": 486 }, { "epoch": 0.012175, "grad_norm": 10.625, "grad_norm_var": 20405.838541666668, "learning_rate": 0.0001, "loss": 7.2025, "loss/crossentropy": 1.8346238136291504, "loss/hidden": 0.126953125, "loss/logits": 0.004264689050614834, "loss/reg": 5.236656188964844, "loss/twn": 0.0, "step": 487 }, { "epoch": 0.0122, "grad_norm": 14.9375, "grad_norm_var": 20384.079166666666, "learning_rate": 0.0001, "loss": 7.7223, "loss/crossentropy": 2.3055596351623535, "loss/hidden": 0.1650390625, "loss/logits": 0.015009969472885132, "loss/reg": 5.236688137054443, "loss/twn": 0.0, "step": 488 }, { "epoch": 0.012225, "grad_norm": 16.25, "grad_norm_var": 20373.57355143229, "learning_rate": 0.0001, "loss": 7.9159, "loss/crossentropy": 2.573246955871582, "loss/hidden": 0.0986328125, "loss/logits": 0.007458841428160667, "loss/reg": 5.236563682556152, "loss/twn": 0.0, "step": 489 }, { "epoch": 0.01225, "grad_norm": 12.3125, "grad_norm_var": 20387.70636393229, "learning_rate": 0.0001, "loss": 7.9834, "loss/crossentropy": 2.5690972805023193, "loss/hidden": 0.1650390625, "loss/logits": 0.012899991124868393, "loss/reg": 5.236404895782471, "loss/twn": 0.0, "step": 490 }, { "epoch": 0.012275, "grad_norm": 10.0625, "grad_norm_var": 20393.779622395832, "learning_rate": 0.0001, "loss": 8.1415, "loss/crossentropy": 2.8210272789001465, "loss/hidden": 0.07666015625, "loss/logits": 0.006865846458822489, "loss/reg": 5.23691463470459, "loss/twn": 0.0, "step": 491 }, { "epoch": 0.0123, "grad_norm": 9.8125, "grad_norm_var": 20395.727864583332, "learning_rate": 0.0001, "loss": 7.8399, "loss/crossentropy": 2.5191946029663086, "loss/hidden": 0.0791015625, "loss/logits": 0.00524523202329874, "loss/reg": 5.2364020347595215, "loss/twn": 0.0, "step": 492 }, { "epoch": 0.012325, "grad_norm": 19.625, "grad_norm_var": 20386.570833333335, "learning_rate": 0.0001, "loss": 8.46, "loss/crossentropy": 3.0929410457611084, "loss/hidden": 0.12060546875, "loss/logits": 0.010085565969347954, "loss/reg": 5.236414909362793, "loss/twn": 0.0, "step": 493 }, { "epoch": 0.01235, "grad_norm": 13.375, "grad_norm_var": 39.9259765625, "learning_rate": 0.0001, "loss": 8.1608, "loss/crossentropy": 2.7697112560272217, "loss/hidden": 0.1396484375, "loss/logits": 0.015202455222606659, "loss/reg": 5.236268043518066, "loss/twn": 0.0, "step": 494 }, { "epoch": 0.012375, "grad_norm": 24.125, "grad_norm_var": 46.5931640625, "learning_rate": 0.0001, "loss": 7.9654, "loss/crossentropy": 2.5814311504364014, "loss/hidden": 0.140625, "loss/logits": 0.006853965111076832, "loss/reg": 5.236512184143066, "loss/twn": 0.0, "step": 495 }, { "epoch": 0.0124, "grad_norm": 10.3125, "grad_norm_var": 46.62550455729167, "learning_rate": 0.0001, "loss": 6.2441, "loss/crossentropy": 0.8653862476348877, "loss/hidden": 0.13671875, "loss/logits": 0.005771493539214134, "loss/reg": 5.236272811889648, "loss/twn": 0.0, "step": 496 }, { "epoch": 0.012425, "grad_norm": 19.0, "grad_norm_var": 19.734619140625, "learning_rate": 0.0001, "loss": 7.8094, "loss/crossentropy": 2.4122979640960693, "loss/hidden": 0.1533203125, "loss/logits": 0.007559158839285374, "loss/reg": 5.236222743988037, "loss/twn": 0.0, "step": 497 }, { "epoch": 0.01245, "grad_norm": 23.375, "grad_norm_var": 24.274739583333332, "learning_rate": 0.0001, "loss": 7.2016, "loss/crossentropy": 1.773500680923462, "loss/hidden": 0.177734375, "loss/logits": 0.01412028819322586, "loss/reg": 5.236289024353027, "loss/twn": 0.0, "step": 498 }, { "epoch": 0.012475, "grad_norm": 12.25, "grad_norm_var": 23.545247395833332, "learning_rate": 0.0001, "loss": 7.306, "loss/crossentropy": 1.899260401725769, "loss/hidden": 0.16015625, "loss/logits": 0.010113149881362915, "loss/reg": 5.236475467681885, "loss/twn": 0.0, "step": 499 }, { "epoch": 0.0125, "grad_norm": 10.0625, "grad_norm_var": 23.165364583333332, "learning_rate": 0.0001, "loss": 7.6044, "loss/crossentropy": 2.234079360961914, "loss/hidden": 0.126953125, "loss/logits": 0.0073781562969088554, "loss/reg": 5.235958576202393, "loss/twn": 0.0, "step": 500 }, { "epoch": 0.012525, "grad_norm": 11.8125, "grad_norm_var": 23.473958333333332, "learning_rate": 0.0001, "loss": 7.5774, "loss/crossentropy": 2.250281572341919, "loss/hidden": 0.08642578125, "loss/logits": 0.004540358670055866, "loss/reg": 5.236131191253662, "loss/twn": 0.0, "step": 501 }, { "epoch": 0.01255, "grad_norm": 10.1875, "grad_norm_var": 23.405143229166665, "learning_rate": 0.0001, "loss": 6.5639, "loss/crossentropy": 1.1574146747589111, "loss/hidden": 0.1591796875, "loss/logits": 0.011335412040352821, "loss/reg": 5.236012935638428, "loss/twn": 0.0, "step": 502 }, { "epoch": 0.012575, "grad_norm": 12.0, "grad_norm_var": 22.857291666666665, "learning_rate": 0.0001, "loss": 7.8073, "loss/crossentropy": 2.4466099739074707, "loss/hidden": 0.115234375, "loss/logits": 0.009260098449885845, "loss/reg": 5.236217021942139, "loss/twn": 0.0, "step": 503 }, { "epoch": 0.0126, "grad_norm": 15.0, "grad_norm_var": 22.862483723958334, "learning_rate": 0.0001, "loss": 6.7732, "loss/crossentropy": 1.3012182712554932, "loss/hidden": 0.2265625, "loss/logits": 0.00941612757742405, "loss/reg": 5.235978126525879, "loss/twn": 0.0, "step": 504 }, { "epoch": 0.012625, "grad_norm": 141.0, "grad_norm_var": 1027.1649576822917, "learning_rate": 0.0001, "loss": 7.359, "loss/crossentropy": 1.941611886024475, "loss/hidden": 0.173828125, "loss/logits": 0.007238644640892744, "loss/reg": 5.236276626586914, "loss/twn": 0.0, "step": 505 }, { "epoch": 0.01265, "grad_norm": 11.875, "grad_norm_var": 1027.7504557291666, "learning_rate": 0.0001, "loss": 8.2093, "loss/crossentropy": 2.877250909805298, "loss/hidden": 0.08642578125, "loss/logits": 0.009630267508327961, "loss/reg": 5.235978603363037, "loss/twn": 0.0, "step": 506 }, { "epoch": 0.012675, "grad_norm": 136.0, "grad_norm_var": 1816.5980305989583, "learning_rate": 0.0001, "loss": 8.2749, "loss/crossentropy": 2.9017584323883057, "loss/hidden": 0.1279296875, "loss/logits": 0.00875360518693924, "loss/reg": 5.236504077911377, "loss/twn": 0.0, "step": 507 }, { "epoch": 0.0127, "grad_norm": 124.5, "grad_norm_var": 2330.153125, "learning_rate": 0.0001, "loss": 5.9319, "loss/crossentropy": 0.569814920425415, "loss/hidden": 0.11962890625, "loss/logits": 0.006820861250162125, "loss/reg": 5.235653400421143, "loss/twn": 0.0, "step": 508 }, { "epoch": 0.012725, "grad_norm": 12.6875, "grad_norm_var": 2349.377587890625, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 2.056201934814453, "loss/hidden": 0.0986328125, "loss/logits": 0.010679306462407112, "loss/reg": 5.236123085021973, "loss/twn": 0.0, "step": 509 }, { "epoch": 0.01275, "grad_norm": 10.9375, "grad_norm_var": 2357.3369140625, "learning_rate": 0.0001, "loss": 7.799, "loss/crossentropy": 2.4681053161621094, "loss/hidden": 0.08642578125, "loss/logits": 0.008713052608072758, "loss/reg": 5.235713481903076, "loss/twn": 0.0, "step": 510 }, { "epoch": 0.012775, "grad_norm": 11.8125, "grad_norm_var": 2387.242822265625, "learning_rate": 0.0001, "loss": 7.0821, "loss/crossentropy": 1.63010573387146, "loss/hidden": 0.2041015625, "loss/logits": 0.011883174069225788, "loss/reg": 5.235997200012207, "loss/twn": 0.0, "step": 511 }, { "epoch": 0.0128, "grad_norm": 9.3125, "grad_norm_var": 2390.703759765625, "learning_rate": 0.0001, "loss": 7.1311, "loss/crossentropy": 1.7556850910186768, "loss/hidden": 0.1328125, "loss/logits": 0.0065896175801754, "loss/reg": 5.236062049865723, "loss/twn": 0.0, "step": 512 }, { "epoch": 0.012825, "grad_norm": 7.15625, "grad_norm_var": 2425.903446451823, "learning_rate": 0.0001, "loss": 6.1515, "loss/crossentropy": 0.7963519096374512, "loss/hidden": 0.11279296875, "loss/logits": 0.006251027341932058, "loss/reg": 5.236119270324707, "loss/twn": 0.0, "step": 513 }, { "epoch": 0.01285, "grad_norm": 14.3125, "grad_norm_var": 2445.081018066406, "learning_rate": 0.0001, "loss": 8.0629, "loss/crossentropy": 2.6161036491394043, "loss/hidden": 0.197265625, "loss/logits": 0.013607255183160305, "loss/reg": 5.235938549041748, "loss/twn": 0.0, "step": 514 }, { "epoch": 0.012875, "grad_norm": 9.25, "grad_norm_var": 2454.5161743164062, "learning_rate": 0.0001, "loss": 8.0433, "loss/crossentropy": 2.7305965423583984, "loss/hidden": 0.0693359375, "loss/logits": 0.007636295165866613, "loss/reg": 5.235754489898682, "loss/twn": 0.0, "step": 515 }, { "epoch": 0.0129, "grad_norm": 17.125, "grad_norm_var": 2434.8625610351564, "learning_rate": 0.0001, "loss": 7.871, "loss/crossentropy": 2.3984246253967285, "loss/hidden": 0.220703125, "loss/logits": 0.01586098223924637, "loss/reg": 5.2359771728515625, "loss/twn": 0.0, "step": 516 }, { "epoch": 0.012925, "grad_norm": 16.75, "grad_norm_var": 2421.328153483073, "learning_rate": 0.0001, "loss": 8.2615, "loss/crossentropy": 2.9614031314849854, "loss/hidden": 0.0595703125, "loss/logits": 0.0044908830896019936, "loss/reg": 5.23606014251709, "loss/twn": 0.0, "step": 517 }, { "epoch": 0.01295, "grad_norm": 13.375, "grad_norm_var": 2411.420340983073, "learning_rate": 0.0001, "loss": 7.3563, "loss/crossentropy": 2.021721363067627, "loss/hidden": 0.09375, "loss/logits": 0.004891795106232166, "loss/reg": 5.235958099365234, "loss/twn": 0.0, "step": 518 }, { "epoch": 0.012975, "grad_norm": 14.6875, "grad_norm_var": 2403.5608032226564, "learning_rate": 0.0001, "loss": 7.2795, "loss/crossentropy": 1.8727322816848755, "loss/hidden": 0.16015625, "loss/logits": 0.010636523365974426, "loss/reg": 5.235999584197998, "loss/twn": 0.0, "step": 519 }, { "epoch": 0.013, "grad_norm": 10.1875, "grad_norm_var": 2418.0734985351564, "learning_rate": 0.0001, "loss": 7.2173, "loss/crossentropy": 1.8416680097579956, "loss/hidden": 0.1318359375, "loss/logits": 0.00781365018337965, "loss/reg": 5.235951900482178, "loss/twn": 0.0, "step": 520 }, { "epoch": 0.013025, "grad_norm": 9.8125, "grad_norm_var": 1640.6509073893228, "learning_rate": 0.0001, "loss": 7.9581, "loss/crossentropy": 2.697462797164917, "loss/hidden": 0.02099609375, "loss/logits": 0.004056986421346664, "loss/reg": 5.235566139221191, "loss/twn": 0.0, "step": 521 }, { "epoch": 0.01305, "grad_norm": 444.0, "grad_norm_var": 12447.939611816406, "learning_rate": 0.0001, "loss": 6.4374, "loss/crossentropy": 1.0375036001205444, "loss/hidden": 0.1611328125, "loss/logits": 0.003013317007571459, "loss/reg": 5.235776901245117, "loss/twn": 0.0, "step": 522 }, { "epoch": 0.013075, "grad_norm": 14.5625, "grad_norm_var": 12039.795764160157, "learning_rate": 0.0001, "loss": 7.6035, "loss/crossentropy": 2.2387633323669434, "loss/hidden": 0.11767578125, "loss/logits": 0.011812473647296429, "loss/reg": 5.235291957855225, "loss/twn": 0.0, "step": 523 }, { "epoch": 0.0131, "grad_norm": 11.75, "grad_norm_var": 11658.413016764323, "learning_rate": 0.0001, "loss": 8.2169, "loss/crossentropy": 2.89178729057312, "loss/hidden": 0.083984375, "loss/logits": 0.005315279122442007, "loss/reg": 5.235769748687744, "loss/twn": 0.0, "step": 524 }, { "epoch": 0.013125, "grad_norm": 25.0, "grad_norm_var": 11624.30995686849, "learning_rate": 0.0001, "loss": 6.9365, "loss/crossentropy": 1.5732824802398682, "loss/hidden": 0.1201171875, "loss/logits": 0.007601576391607523, "loss/reg": 5.235477924346924, "loss/twn": 0.0, "step": 525 }, { "epoch": 0.01315, "grad_norm": 10.875, "grad_norm_var": 11624.552404785156, "learning_rate": 0.0001, "loss": 8.1942, "loss/crossentropy": 2.9256229400634766, "loss/hidden": 0.0302734375, "loss/logits": 0.002373297931626439, "loss/reg": 5.235915184020996, "loss/twn": 0.0, "step": 526 }, { "epoch": 0.013175, "grad_norm": 8.1875, "grad_norm_var": 11638.996708170573, "learning_rate": 0.0001, "loss": 6.944, "loss/crossentropy": 1.6115573644638062, "loss/hidden": 0.09375, "loss/logits": 0.002878053579479456, "loss/reg": 5.235781669616699, "loss/twn": 0.0, "step": 527 }, { "epoch": 0.0132, "grad_norm": 33.75, "grad_norm_var": 11577.075646972657, "learning_rate": 0.0001, "loss": 5.7046, "loss/crossentropy": 0.35535818338394165, "loss/hidden": 0.11328125, "loss/logits": 0.0003594207810238004, "loss/reg": 5.235568046569824, "loss/twn": 0.0, "step": 528 }, { "epoch": 0.013225, "grad_norm": 17.0, "grad_norm_var": 11538.3197265625, "learning_rate": 0.0001, "loss": 7.8718, "loss/crossentropy": 2.430708646774292, "loss/hidden": 0.197265625, "loss/logits": 0.008184842765331268, "loss/reg": 5.235634803771973, "loss/twn": 0.0, "step": 529 }, { "epoch": 0.01325, "grad_norm": 12.6875, "grad_norm_var": 11544.465104166668, "learning_rate": 0.0001, "loss": 7.1415, "loss/crossentropy": 1.7498486042022705, "loss/hidden": 0.146484375, "loss/logits": 0.009613174945116043, "loss/reg": 5.235511302947998, "loss/twn": 0.0, "step": 530 }, { "epoch": 0.013275, "grad_norm": 10.5, "grad_norm_var": 11539.135677083334, "learning_rate": 0.0001, "loss": 7.9653, "loss/crossentropy": 2.616609573364258, "loss/hidden": 0.10107421875, "loss/logits": 0.011807188391685486, "loss/reg": 5.235820770263672, "loss/twn": 0.0, "step": 531 }, { "epoch": 0.0133, "grad_norm": 15.5625, "grad_norm_var": 11544.447770182293, "learning_rate": 0.0001, "loss": 7.821, "loss/crossentropy": 2.5074052810668945, "loss/hidden": 0.07177734375, "loss/logits": 0.006313705816864967, "loss/reg": 5.235459804534912, "loss/twn": 0.0, "step": 532 }, { "epoch": 0.013325, "grad_norm": 55.0, "grad_norm_var": 11508.170035807292, "learning_rate": 0.0001, "loss": 6.9534, "loss/crossentropy": 1.6006724834442139, "loss/hidden": 0.10986328125, "loss/logits": 0.007289988920092583, "loss/reg": 5.235603332519531, "loss/twn": 0.0, "step": 533 }, { "epoch": 0.01335, "grad_norm": 12.625, "grad_norm_var": 11511.286051432291, "learning_rate": 0.0001, "loss": 8.027, "loss/crossentropy": 2.671614646911621, "loss/hidden": 0.11279296875, "loss/logits": 0.0071898894384503365, "loss/reg": 5.235414028167725, "loss/twn": 0.0, "step": 534 }, { "epoch": 0.013375, "grad_norm": 372.0, "grad_norm_var": 18087.790104166666, "learning_rate": 0.0001, "loss": 8.0188, "loss/crossentropy": 2.639939308166504, "loss/hidden": 0.134765625, "loss/logits": 0.008616717532277107, "loss/reg": 5.235448360443115, "loss/twn": 0.0, "step": 535 }, { "epoch": 0.0134, "grad_norm": 9.0625, "grad_norm_var": 18096.311393229167, "learning_rate": 0.0001, "loss": 7.087, "loss/crossentropy": 1.6764798164367676, "loss/hidden": 0.1630859375, "loss/logits": 0.012176426127552986, "loss/reg": 5.235233783721924, "loss/twn": 0.0, "step": 536 }, { "epoch": 0.013425, "grad_norm": 9.3125, "grad_norm_var": 18100.0994140625, "learning_rate": 0.0001, "loss": 6.986, "loss/crossentropy": 1.620530605316162, "loss/hidden": 0.12255859375, "loss/logits": 0.007392015308141708, "loss/reg": 5.235503673553467, "loss/twn": 0.0, "step": 537 }, { "epoch": 0.01345, "grad_norm": 16.125, "grad_norm_var": 7998.4609375, "learning_rate": 0.0001, "loss": 8.4295, "loss/crossentropy": 3.019792079925537, "loss/hidden": 0.1591796875, "loss/logits": 0.015434058383107185, "loss/reg": 5.235138893127441, "loss/twn": 0.0, "step": 538 }, { "epoch": 0.013475, "grad_norm": 18.5, "grad_norm_var": 7986.272119140625, "learning_rate": 0.0001, "loss": 6.5863, "loss/crossentropy": 0.9472768902778625, "loss/hidden": 0.3984375, "loss/logits": 0.005067166872322559, "loss/reg": 5.235495090484619, "loss/twn": 0.0, "step": 539 }, { "epoch": 0.0135, "grad_norm": 68.0, "grad_norm_var": 7973.117822265625, "learning_rate": 0.0001, "loss": 7.0399, "loss/crossentropy": 1.5947017669677734, "loss/hidden": 0.201171875, "loss/logits": 0.008832491934299469, "loss/reg": 5.235156059265137, "loss/twn": 0.0, "step": 540 }, { "epoch": 0.013525, "grad_norm": 696.0, "grad_norm_var": 34468.181884765625, "learning_rate": 0.0001, "loss": 8.1707, "loss/crossentropy": 2.8003129959106445, "loss/hidden": 0.123046875, "loss/logits": 0.012298551388084888, "loss/reg": 5.235071659088135, "loss/twn": 0.0, "step": 541 }, { "epoch": 0.01355, "grad_norm": 11.5, "grad_norm_var": 34462.002197265625, "learning_rate": 0.0001, "loss": 7.5471, "loss/crossentropy": 2.217853546142578, "loss/hidden": 0.08642578125, "loss/logits": 0.007477154955267906, "loss/reg": 5.235381126403809, "loss/twn": 0.0, "step": 542 }, { "epoch": 0.013575, "grad_norm": 12.6875, "grad_norm_var": 34416.96235351563, "learning_rate": 0.0001, "loss": 8.3214, "loss/crossentropy": 2.926238775253296, "loss/hidden": 0.14453125, "loss/logits": 0.015387848019599915, "loss/reg": 5.235249042510986, "loss/twn": 0.0, "step": 543 }, { "epoch": 0.0136, "grad_norm": 78.0, "grad_norm_var": 34233.16352539063, "learning_rate": 0.0001, "loss": 8.227, "loss/crossentropy": 2.8347182273864746, "loss/hidden": 0.1416015625, "loss/logits": 0.015542502515017986, "loss/reg": 5.235121250152588, "loss/twn": 0.0, "step": 544 }, { "epoch": 0.013625, "grad_norm": 9.625, "grad_norm_var": 34306.78292643229, "learning_rate": 0.0001, "loss": 7.7463, "loss/crossentropy": 2.451880693435669, "loss/hidden": 0.0546875, "loss/logits": 0.004437028430402279, "loss/reg": 5.2353057861328125, "loss/twn": 0.0, "step": 545 }, { "epoch": 0.01365, "grad_norm": 11.3125, "grad_norm_var": 34320.69907226563, "learning_rate": 0.0001, "loss": 6.514, "loss/crossentropy": 1.0950896739959717, "loss/hidden": 0.1787109375, "loss/logits": 0.004933930933475494, "loss/reg": 5.235309600830078, "loss/twn": 0.0, "step": 546 }, { "epoch": 0.013675, "grad_norm": 19.375, "grad_norm_var": 34234.075374348955, "learning_rate": 0.0001, "loss": 7.917, "loss/crossentropy": 2.5059616565704346, "loss/hidden": 0.1650390625, "loss/logits": 0.010931363329291344, "loss/reg": 5.23504114151001, "loss/twn": 0.0, "step": 547 }, { "epoch": 0.0137, "grad_norm": 13.0625, "grad_norm_var": 34258.751155598955, "learning_rate": 0.0001, "loss": 8.0647, "loss/crossentropy": 2.657578706741333, "loss/hidden": 0.1640625, "loss/logits": 0.007689584046602249, "loss/reg": 5.235403060913086, "loss/twn": 0.0, "step": 548 }, { "epoch": 0.013725, "grad_norm": 8.6875, "grad_norm_var": 34598.19524739583, "learning_rate": 0.0001, "loss": 7.7748, "loss/crossentropy": 2.4471874237060547, "loss/hidden": 0.08642578125, "loss/logits": 0.006248220801353455, "loss/reg": 5.234949588775635, "loss/twn": 0.0, "step": 549 }, { "epoch": 0.01375, "grad_norm": 20.125, "grad_norm_var": 34528.96868489583, "learning_rate": 0.0001, "loss": 8.0594, "loss/crossentropy": 2.625808000564575, "loss/hidden": 0.1826171875, "loss/logits": 0.015631355345249176, "loss/reg": 5.235373020172119, "loss/twn": 0.0, "step": 550 }, { "epoch": 0.013775, "grad_norm": 13.875, "grad_norm_var": 28880.479427083334, "learning_rate": 0.0001, "loss": 5.896, "loss/crossentropy": 0.3797203600406647, "loss/hidden": 0.27734375, "loss/logits": 0.003970766440033913, "loss/reg": 5.234944820404053, "loss/twn": 0.0, "step": 551 }, { "epoch": 0.0138, "grad_norm": 37.0, "grad_norm_var": 28726.655843098957, "learning_rate": 0.0001, "loss": 8.1785, "loss/crossentropy": 2.7710931301116943, "loss/hidden": 0.1650390625, "loss/logits": 0.007251654751598835, "loss/reg": 5.235079765319824, "loss/twn": 0.0, "step": 552 }, { "epoch": 0.013825, "grad_norm": 10.4375, "grad_norm_var": 28718.351936848958, "learning_rate": 0.0001, "loss": 7.4216, "loss/crossentropy": 2.1164746284484863, "loss/hidden": 0.06689453125, "loss/logits": 0.003332372521981597, "loss/reg": 5.234862327575684, "loss/twn": 0.0, "step": 553 }, { "epoch": 0.01385, "grad_norm": 8.625, "grad_norm_var": 28771.012093098958, "learning_rate": 0.0001, "loss": 6.9919, "loss/crossentropy": 1.6055660247802734, "loss/hidden": 0.140625, "loss/logits": 0.010463319718837738, "loss/reg": 5.235208034515381, "loss/twn": 0.0, "step": 554 }, { "epoch": 0.013875, "grad_norm": 19.375, "grad_norm_var": 28765.65818684896, "learning_rate": 0.0001, "loss": 7.6993, "loss/crossentropy": 2.3250389099121094, "loss/hidden": 0.1328125, "loss/logits": 0.006799938622862101, "loss/reg": 5.234645843505859, "loss/twn": 0.0, "step": 555 }, { "epoch": 0.0139, "grad_norm": 28.0, "grad_norm_var": 28848.887353515624, "learning_rate": 0.0001, "loss": 6.6932, "loss/crossentropy": 1.2838351726531982, "loss/hidden": 0.16796875, "loss/logits": 0.00634372141212225, "loss/reg": 5.235021591186523, "loss/twn": 0.0, "step": 556 }, { "epoch": 0.013925, "grad_norm": 209.0, "grad_norm_var": 2527.298291015625, "learning_rate": 0.0001, "loss": 8.2303, "loss/crossentropy": 2.8924214839935303, "loss/hidden": 0.095703125, "loss/logits": 0.007457260973751545, "loss/reg": 5.234717845916748, "loss/twn": 0.0, "step": 557 }, { "epoch": 0.01395, "grad_norm": 14.25, "grad_norm_var": 2520.284358723958, "learning_rate": 0.0001, "loss": 7.1526, "loss/crossentropy": 1.7737421989440918, "loss/hidden": 0.134765625, "loss/logits": 0.009325024671852589, "loss/reg": 5.2347235679626465, "loss/twn": 0.0, "step": 558 }, { "epoch": 0.013975, "grad_norm": 31.75, "grad_norm_var": 2493.68125, "learning_rate": 0.0001, "loss": 7.8427, "loss/crossentropy": 2.5061957836151123, "loss/hidden": 0.09326171875, "loss/logits": 0.008374359458684921, "loss/reg": 5.234871864318848, "loss/twn": 0.0, "step": 559 }, { "epoch": 0.014, "grad_norm": 9.5625, "grad_norm_var": 2378.353369140625, "learning_rate": 0.0001, "loss": 8.2168, "loss/crossentropy": 2.899165391921997, "loss/hidden": 0.0791015625, "loss/logits": 0.0035689827054739, "loss/reg": 5.234927654266357, "loss/twn": 0.0, "step": 560 }, { "epoch": 0.014025, "grad_norm": 53.0, "grad_norm_var": 2383.8656087239583, "learning_rate": 0.0001, "loss": 5.8024, "loss/crossentropy": 0.35774412751197815, "loss/hidden": 0.2021484375, "loss/logits": 0.00793472956866026, "loss/reg": 5.234549045562744, "loss/twn": 0.0, "step": 561 }, { "epoch": 0.01405, "grad_norm": 64.5, "grad_norm_var": 2415.985872395833, "learning_rate": 0.0001, "loss": 7.1944, "loss/crossentropy": 1.7934857606887817, "loss/hidden": 0.15625, "loss/logits": 0.01002482883632183, "loss/reg": 5.234671115875244, "loss/twn": 0.0, "step": 562 }, { "epoch": 0.014075, "grad_norm": 7.78125, "grad_norm_var": 2448.6008422851564, "learning_rate": 0.0001, "loss": 7.6461, "loss/crossentropy": 2.308584451675415, "loss/hidden": 0.09619140625, "loss/logits": 0.006569989956915379, "loss/reg": 5.23472261428833, "loss/twn": 0.0, "step": 563 }, { "epoch": 0.0141, "grad_norm": 55.25, "grad_norm_var": 2440.2951782226564, "learning_rate": 0.0001, "loss": 7.8559, "loss/crossentropy": 2.439448118209839, "loss/hidden": 0.16796875, "loss/logits": 0.01364111714065075, "loss/reg": 5.234842300415039, "loss/twn": 0.0, "step": 564 }, { "epoch": 0.014125, "grad_norm": 16.25, "grad_norm_var": 2415.370438639323, "learning_rate": 0.0001, "loss": 8.3004, "loss/crossentropy": 2.8731632232666016, "loss/hidden": 0.1787109375, "loss/logits": 0.014065857976675034, "loss/reg": 5.23447847366333, "loss/twn": 0.0, "step": 565 }, { "epoch": 0.01415, "grad_norm": 11.8125, "grad_norm_var": 2438.8619099934895, "learning_rate": 0.0001, "loss": 7.1178, "loss/crossentropy": 1.780933141708374, "loss/hidden": 0.09375, "loss/logits": 0.008497532457113266, "loss/reg": 5.234607696533203, "loss/twn": 0.0, "step": 566 }, { "epoch": 0.014175, "grad_norm": 14.375, "grad_norm_var": 2437.342248535156, "learning_rate": 0.0001, "loss": 8.2505, "loss/crossentropy": 2.875947952270508, "loss/hidden": 0.125, "loss/logits": 0.0144406259059906, "loss/reg": 5.235077857971191, "loss/twn": 0.0, "step": 567 }, { "epoch": 0.0142, "grad_norm": 22.75, "grad_norm_var": 2449.9111938476562, "learning_rate": 0.0001, "loss": 7.0607, "loss/crossentropy": 1.6746110916137695, "loss/hidden": 0.146484375, "loss/logits": 0.0051438165828585625, "loss/reg": 5.234502792358398, "loss/twn": 0.0, "step": 568 }, { "epoch": 0.014225, "grad_norm": 19.625, "grad_norm_var": 2423.817736816406, "learning_rate": 0.0001, "loss": 7.989, "loss/crossentropy": 2.5603058338165283, "loss/hidden": 0.1826171875, "loss/logits": 0.011639740318059921, "loss/reg": 5.234450340270996, "loss/twn": 0.0, "step": 569 }, { "epoch": 0.01425, "grad_norm": 12.5625, "grad_norm_var": 2410.089807128906, "learning_rate": 0.0001, "loss": 6.1451, "loss/crossentropy": 0.8017870187759399, "loss/hidden": 0.10498046875, "loss/logits": 0.004113970324397087, "loss/reg": 5.2342610359191895, "loss/twn": 0.0, "step": 570 }, { "epoch": 0.014275, "grad_norm": 7.96875, "grad_norm_var": 2444.820947265625, "learning_rate": 0.0001, "loss": 6.3145, "loss/crossentropy": 0.9622921943664551, "loss/hidden": 0.10791015625, "loss/logits": 0.00957135483622551, "loss/reg": 5.234708786010742, "loss/twn": 0.0, "step": 571 }, { "epoch": 0.0143, "grad_norm": 9.5, "grad_norm_var": 2486.3206868489583, "learning_rate": 0.0001, "loss": 7.6128, "loss/crossentropy": 2.2709686756134033, "loss/hidden": 0.0986328125, "loss/logits": 0.008640400134027004, "loss/reg": 5.2345733642578125, "loss/twn": 0.0, "step": 572 }, { "epoch": 0.014325, "grad_norm": 7.6875, "grad_norm_var": 348.6860026041667, "learning_rate": 0.0001, "loss": 7.5522, "loss/crossentropy": 2.1828360557556152, "loss/hidden": 0.126953125, "loss/logits": 0.008249370381236076, "loss/reg": 5.234208583831787, "loss/twn": 0.0, "step": 573 }, { "epoch": 0.01435, "grad_norm": 12.4375, "grad_norm_var": 350.864306640625, "learning_rate": 0.0001, "loss": 8.2355, "loss/crossentropy": 2.8409531116485596, "loss/hidden": 0.1494140625, "loss/logits": 0.01071943435817957, "loss/reg": 5.234445095062256, "loss/twn": 0.0, "step": 574 }, { "epoch": 0.014375, "grad_norm": 9.8125, "grad_norm_var": 353.3037109375, "learning_rate": 0.0001, "loss": 6.8632, "loss/crossentropy": 1.4696354866027832, "loss/hidden": 0.1513671875, "loss/logits": 0.007870590314269066, "loss/reg": 5.234313011169434, "loss/twn": 0.0, "step": 575 }, { "epoch": 0.0144, "grad_norm": 54.5, "grad_norm_var": 411.406494140625, "learning_rate": 0.0001, "loss": 7.1332, "loss/crossentropy": 1.8072994947433472, "loss/hidden": 0.08642578125, "loss/logits": 0.004960792139172554, "loss/reg": 5.234533309936523, "loss/twn": 0.0, "step": 576 }, { "epoch": 0.014425, "grad_norm": 14.9375, "grad_norm_var": 353.450390625, "learning_rate": 0.0001, "loss": 8.0425, "loss/crossentropy": 2.6438798904418945, "loss/hidden": 0.1494140625, "loss/logits": 0.014983810484409332, "loss/reg": 5.234223365783691, "loss/twn": 0.0, "step": 577 }, { "epoch": 0.01445, "grad_norm": 8.6875, "grad_norm_var": 227.10193684895833, "learning_rate": 0.0001, "loss": 6.7681, "loss/crossentropy": 1.3993828296661377, "loss/hidden": 0.12890625, "loss/logits": 0.005370709113776684, "loss/reg": 5.234410285949707, "loss/twn": 0.0, "step": 578 }, { "epoch": 0.014475, "grad_norm": 30.875, "grad_norm_var": 229.36620686848957, "learning_rate": 0.0001, "loss": 8.3016, "loss/crossentropy": 2.88641095161438, "loss/hidden": 0.166015625, "loss/logits": 0.014844672754406929, "loss/reg": 5.234356880187988, "loss/twn": 0.0, "step": 579 }, { "epoch": 0.0145, "grad_norm": 10.625, "grad_norm_var": 140.0116170247396, "learning_rate": 0.0001, "loss": 6.7513, "loss/crossentropy": 1.4178798198699951, "loss/hidden": 0.09375, "loss/logits": 0.005413350649178028, "loss/reg": 5.234261989593506, "loss/twn": 0.0, "step": 580 }, { "epoch": 0.014525, "grad_norm": 17.875, "grad_norm_var": 140.1169881184896, "learning_rate": 0.0001, "loss": 7.069, "loss/crossentropy": 1.6242541074752808, "loss/hidden": 0.19921875, "loss/logits": 0.011282745748758316, "loss/reg": 5.234253883361816, "loss/twn": 0.0, "step": 581 }, { "epoch": 0.01455, "grad_norm": 13.5, "grad_norm_var": 139.2117146809896, "learning_rate": 0.0001, "loss": 6.743, "loss/crossentropy": 1.388974666595459, "loss/hidden": 0.1103515625, "loss/logits": 0.009235072880983353, "loss/reg": 5.234424591064453, "loss/twn": 0.0, "step": 582 }, { "epoch": 0.014575, "grad_norm": 13.125, "grad_norm_var": 139.70227457682293, "learning_rate": 0.0001, "loss": 7.0183, "loss/crossentropy": 1.626574993133545, "loss/hidden": 0.15234375, "loss/logits": 0.005102044437080622, "loss/reg": 5.234281539916992, "loss/twn": 0.0, "step": 583 }, { "epoch": 0.0146, "grad_norm": 11.4375, "grad_norm_var": 138.50621337890624, "learning_rate": 0.0001, "loss": 8.1816, "loss/crossentropy": 2.712043285369873, "loss/hidden": 0.21484375, "loss/logits": 0.020563386380672455, "loss/reg": 5.234140396118164, "loss/twn": 0.0, "step": 584 }, { "epoch": 0.014625, "grad_norm": 135.0, "grad_norm_var": 1027.0439412434896, "learning_rate": 0.0001, "loss": 7.6221, "loss/crossentropy": 2.309222936630249, "loss/hidden": 0.07177734375, "loss/logits": 0.00689616659656167, "loss/reg": 5.234216213226318, "loss/twn": 0.0, "step": 585 }, { "epoch": 0.01465, "grad_norm": 12.3125, "grad_norm_var": 1027.4010375976563, "learning_rate": 0.0001, "loss": 8.0345, "loss/crossentropy": 2.648197650909424, "loss/hidden": 0.14453125, "loss/logits": 0.0074767498299479485, "loss/reg": 5.2342705726623535, "loss/twn": 0.0, "step": 586 }, { "epoch": 0.014675, "grad_norm": 11.3125, "grad_norm_var": 1021.3348307291667, "learning_rate": 0.0001, "loss": 6.9087, "loss/crossentropy": 1.557716965675354, "loss/hidden": 0.1103515625, "loss/logits": 0.0062755015678703785, "loss/reg": 5.2343974113464355, "loss/twn": 0.0, "step": 587 }, { "epoch": 0.0147, "grad_norm": 14.0, "grad_norm_var": 1014.2895182291667, "learning_rate": 0.0001, "loss": 7.9829, "loss/crossentropy": 2.599196195602417, "loss/hidden": 0.1416015625, "loss/logits": 0.008030779659748077, "loss/reg": 5.23403787612915, "loss/twn": 0.0, "step": 588 }, { "epoch": 0.014725, "grad_norm": 10.75, "grad_norm_var": 1008.364697265625, "learning_rate": 0.0001, "loss": 6.2994, "loss/crossentropy": 0.9181722402572632, "loss/hidden": 0.1455078125, "loss/logits": 0.0018882363801822066, "loss/reg": 5.2338151931762695, "loss/twn": 0.0, "step": 589 }, { "epoch": 0.01475, "grad_norm": 10.75, "grad_norm_var": 1011.1046875, "learning_rate": 0.0001, "loss": 7.8917, "loss/crossentropy": 2.6068289279937744, "loss/hidden": 0.04736328125, "loss/logits": 0.003562201978638768, "loss/reg": 5.233980178833008, "loss/twn": 0.0, "step": 590 }, { "epoch": 0.014775, "grad_norm": 10.4375, "grad_norm_var": 1009.9702473958333, "learning_rate": 0.0001, "loss": 6.9904, "loss/crossentropy": 1.5700491666793823, "loss/hidden": 0.1787109375, "loss/logits": 0.0076020704582333565, "loss/reg": 5.234048366546631, "loss/twn": 0.0, "step": 591 }, { "epoch": 0.0148, "grad_norm": 13.75, "grad_norm_var": 946.7228515625, "learning_rate": 0.0001, "loss": 8.3853, "loss/crossentropy": 2.9748241901397705, "loss/hidden": 0.162109375, "loss/logits": 0.014261037111282349, "loss/reg": 5.234062194824219, "loss/twn": 0.0, "step": 592 }, { "epoch": 0.014825, "grad_norm": 20.625, "grad_norm_var": 943.9872233072916, "learning_rate": 0.0001, "loss": 7.1549, "loss/crossentropy": 1.6911969184875488, "loss/hidden": 0.2158203125, "loss/logits": 0.013989459723234177, "loss/reg": 5.233931541442871, "loss/twn": 0.0, "step": 593 }, { "epoch": 0.01485, "grad_norm": 12.1875, "grad_norm_var": 938.7426920572917, "learning_rate": 0.0001, "loss": 6.7309, "loss/crossentropy": 1.2908926010131836, "loss/hidden": 0.1943359375, "loss/logits": 0.011645066551864147, "loss/reg": 5.234016418457031, "loss/twn": 0.0, "step": 594 }, { "epoch": 0.014875, "grad_norm": 11.1875, "grad_norm_var": 939.1067057291667, "learning_rate": 0.0001, "loss": 8.2017, "loss/crossentropy": 2.890516757965088, "loss/hidden": 0.07177734375, "loss/logits": 0.005793450400233269, "loss/reg": 5.233628273010254, "loss/twn": 0.0, "step": 595 }, { "epoch": 0.0149, "grad_norm": 9.75, "grad_norm_var": 940.3130208333333, "learning_rate": 0.0001, "loss": 6.5188, "loss/crossentropy": 1.189386010169983, "loss/hidden": 0.09130859375, "loss/logits": 0.004200034309178591, "loss/reg": 5.233954906463623, "loss/twn": 0.0, "step": 596 }, { "epoch": 0.014925, "grad_norm": 7.09375, "grad_norm_var": 951.3511678059896, "learning_rate": 0.0001, "loss": 6.7014, "loss/crossentropy": 1.3759723901748657, "loss/hidden": 0.08642578125, "loss/logits": 0.0052226390689611435, "loss/reg": 5.233729362487793, "loss/twn": 0.0, "step": 597 }, { "epoch": 0.01495, "grad_norm": 8.875, "grad_norm_var": 956.5892211914063, "learning_rate": 0.0001, "loss": 7.4921, "loss/crossentropy": 2.140756368637085, "loss/hidden": 0.10986328125, "loss/logits": 0.007792431861162186, "loss/reg": 5.233695983886719, "loss/twn": 0.0, "step": 598 }, { "epoch": 0.014975, "grad_norm": 12.5, "grad_norm_var": 957.1479777018229, "learning_rate": 0.0001, "loss": 8.0241, "loss/crossentropy": 2.6345574855804443, "loss/hidden": 0.1435546875, "loss/logits": 0.012438047677278519, "loss/reg": 5.233548164367676, "loss/twn": 0.0, "step": 599 }, { "epoch": 0.015, "grad_norm": 10.0, "grad_norm_var": 958.8220662434895, "learning_rate": 0.0001, "loss": 6.329, "loss/crossentropy": 0.9624335765838623, "loss/hidden": 0.125, "loss/logits": 0.007683398202061653, "loss/reg": 5.233921051025391, "loss/twn": 0.0, "step": 600 }, { "epoch": 0.015025, "grad_norm": 78.5, "grad_norm_var": 287.54615478515626, "learning_rate": 0.0001, "loss": 8.289, "loss/crossentropy": 2.931908369064331, "loss/hidden": 0.11279296875, "loss/logits": 0.010676998645067215, "loss/reg": 5.233586311340332, "loss/twn": 0.0, "step": 601 }, { "epoch": 0.01505, "grad_norm": 11.125, "grad_norm_var": 288.1986612955729, "learning_rate": 0.0001, "loss": 7.1402, "loss/crossentropy": 1.7836761474609375, "loss/hidden": 0.115234375, "loss/logits": 0.007275612559169531, "loss/reg": 5.234038352966309, "loss/twn": 0.0, "step": 602 }, { "epoch": 0.015075, "grad_norm": 214.0, "grad_norm_var": 2734.4889933268228, "learning_rate": 0.0001, "loss": 8.0975, "loss/crossentropy": 2.6364102363586426, "loss/hidden": 0.220703125, "loss/logits": 0.006685142405331135, "loss/reg": 5.233693599700928, "loss/twn": 0.0, "step": 603 }, { "epoch": 0.0151, "grad_norm": 14.125, "grad_norm_var": 2734.2487915039064, "learning_rate": 0.0001, "loss": 7.7466, "loss/crossentropy": 2.511186122894287, "loss/hidden": 6.556510925292969e-06, "loss/logits": 0.0016271582571789622, "loss/reg": 5.233736515045166, "loss/twn": 0.0, "step": 604 }, { "epoch": 0.015125, "grad_norm": 11.3125, "grad_norm_var": 2732.938928222656, "learning_rate": 0.0001, "loss": 7.2908, "loss/crossentropy": 1.9334180355072021, "loss/hidden": 0.11279296875, "loss/logits": 0.011062689125537872, "loss/reg": 5.233515739440918, "loss/twn": 0.0, "step": 605 }, { "epoch": 0.01515, "grad_norm": 38.25, "grad_norm_var": 2715.0710896809896, "learning_rate": 0.0001, "loss": 7.1654, "loss/crossentropy": 1.7573686838150024, "loss/hidden": 0.1630859375, "loss/logits": 0.01111831609159708, "loss/reg": 5.233856201171875, "loss/twn": 0.0, "step": 606 }, { "epoch": 0.015175, "grad_norm": 10.9375, "grad_norm_var": 2713.767053222656, "learning_rate": 0.0001, "loss": 8.0656, "loss/crossentropy": 2.725510835647583, "loss/hidden": 0.09619140625, "loss/logits": 0.010500291362404823, "loss/reg": 5.233447074890137, "loss/twn": 0.0, "step": 607 }, { "epoch": 0.0152, "grad_norm": 15.4375, "grad_norm_var": 2710.229455566406, "learning_rate": 0.0001, "loss": 8.219, "loss/crossentropy": 2.8715081214904785, "loss/hidden": 0.103515625, "loss/logits": 0.010522611439228058, "loss/reg": 5.233500003814697, "loss/twn": 0.0, "step": 608 }, { "epoch": 0.015225, "grad_norm": 12.75, "grad_norm_var": 2724.336779785156, "learning_rate": 0.0001, "loss": 7.4159, "loss/crossentropy": 2.050481081008911, "loss/hidden": 0.1201171875, "loss/logits": 0.012026194483041763, "loss/reg": 5.233264446258545, "loss/twn": 0.0, "step": 609 }, { "epoch": 0.01525, "grad_norm": 102.5, "grad_norm_var": 3021.098010253906, "learning_rate": 0.0001, "loss": 6.1157, "loss/crossentropy": 0.7093434929847717, "loss/hidden": 0.162109375, "loss/logits": 0.010594572871923447, "loss/reg": 5.233696460723877, "loss/twn": 0.0, "step": 610 }, { "epoch": 0.015275, "grad_norm": 11.0, "grad_norm_var": 3021.7085571289062, "learning_rate": 0.0001, "loss": 7.9153, "loss/crossentropy": 2.5407986640930176, "loss/hidden": 0.130859375, "loss/logits": 0.010352734476327896, "loss/reg": 5.233257293701172, "loss/twn": 0.0, "step": 611 }, { "epoch": 0.0153, "grad_norm": 9.3125, "grad_norm_var": 3023.223173014323, "learning_rate": 0.0001, "loss": 7.4287, "loss/crossentropy": 2.056551694869995, "loss/hidden": 0.1298828125, "loss/logits": 0.008765427395701408, "loss/reg": 5.233468055725098, "loss/twn": 0.0, "step": 612 }, { "epoch": 0.015325, "grad_norm": 39.25, "grad_norm_var": 2966.133268229167, "learning_rate": 0.0001, "loss": 7.0537, "loss/crossentropy": 1.7130509614944458, "loss/hidden": 0.10205078125, "loss/logits": 0.005178738851100206, "loss/reg": 5.233448028564453, "loss/twn": 0.0, "step": 613 }, { "epoch": 0.01535, "grad_norm": 15.6875, "grad_norm_var": 2943.0399576822915, "learning_rate": 0.0001, "loss": 7.4671, "loss/crossentropy": 2.072944402694702, "loss/hidden": 0.138671875, "loss/logits": 0.021782729774713516, "loss/reg": 5.233670234680176, "loss/twn": 0.0, "step": 614 }, { "epoch": 0.015375, "grad_norm": 40.5, "grad_norm_var": 2897.146207682292, "learning_rate": 0.0001, "loss": 7.2931, "loss/crossentropy": 1.9488314390182495, "loss/hidden": 0.10693359375, "loss/logits": 0.004016375169157982, "loss/reg": 5.233325481414795, "loss/twn": 0.0, "step": 615 }, { "epoch": 0.0154, "grad_norm": 17.875, "grad_norm_var": 2869.870817057292, "learning_rate": 0.0001, "loss": 7.5828, "loss/crossentropy": 2.250316858291626, "loss/hidden": 0.09375, "loss/logits": 0.005294739734381437, "loss/reg": 5.233473300933838, "loss/twn": 0.0, "step": 616 }, { "epoch": 0.015425, "grad_norm": 13.6875, "grad_norm_var": 2801.0919270833333, "learning_rate": 0.0001, "loss": 7.946, "loss/crossentropy": 2.605081796646118, "loss/hidden": 0.09619140625, "loss/logits": 0.01103608775883913, "loss/reg": 5.233642578125, "loss/twn": 0.0, "step": 617 }, { "epoch": 0.01545, "grad_norm": 10.8125, "grad_norm_var": 2802.1390462239583, "learning_rate": 0.0001, "loss": 7.9096, "loss/crossentropy": 2.6560287475585938, "loss/hidden": 0.016357421875, "loss/logits": 0.0036827209405601025, "loss/reg": 5.233491897583008, "loss/twn": 0.0, "step": 618 }, { "epoch": 0.015475, "grad_norm": 10.5625, "grad_norm_var": 563.0020833333333, "learning_rate": 0.0001, "loss": 7.9054, "loss/crossentropy": 2.542119026184082, "loss/hidden": 0.12255859375, "loss/logits": 0.007251254748553038, "loss/reg": 5.2334303855896, "loss/twn": 0.0, "step": 619 }, { "epoch": 0.0155, "grad_norm": 17.875, "grad_norm_var": 559.2559895833333, "learning_rate": 0.0001, "loss": 8.014, "loss/crossentropy": 2.6021788120269775, "loss/hidden": 0.1708984375, "loss/logits": 0.007565245497971773, "loss/reg": 5.23338508605957, "loss/twn": 0.0, "step": 620 }, { "epoch": 0.015525, "grad_norm": 21.625, "grad_norm_var": 548.9945149739583, "learning_rate": 0.0001, "loss": 8.1369, "loss/crossentropy": 2.7238121032714844, "loss/hidden": 0.1650390625, "loss/logits": 0.0146188298240304, "loss/reg": 5.233391284942627, "loss/twn": 0.0, "step": 621 }, { "epoch": 0.01555, "grad_norm": 26.875, "grad_norm_var": 535.8540201822917, "learning_rate": 0.0001, "loss": 6.8563, "loss/crossentropy": 1.4800411462783813, "loss/hidden": 0.134765625, "loss/logits": 0.008334355428814888, "loss/reg": 5.233196258544922, "loss/twn": 0.0, "step": 622 }, { "epoch": 0.015575, "grad_norm": 53.0, "grad_norm_var": 575.73671875, "learning_rate": 0.0001, "loss": 8.042, "loss/crossentropy": 2.7313811779022217, "loss/hidden": 0.07275390625, "loss/logits": 0.004363874904811382, "loss/reg": 5.23349666595459, "loss/twn": 0.0, "step": 623 }, { "epoch": 0.0156, "grad_norm": 11.9375, "grad_norm_var": 581.51171875, "learning_rate": 0.0001, "loss": 7.9383, "loss/crossentropy": 2.561565637588501, "loss/hidden": 0.12890625, "loss/logits": 0.014568326994776726, "loss/reg": 5.233224868774414, "loss/twn": 0.0, "step": 624 }, { "epoch": 0.015625, "grad_norm": 17.875, "grad_norm_var": 574.1311848958334, "learning_rate": 0.0001, "loss": 7.8326, "loss/crossentropy": 2.5030391216278076, "loss/hidden": 0.08642578125, "loss/logits": 0.009584227576851845, "loss/reg": 5.233513355255127, "loss/twn": 0.0, "step": 625 }, { "epoch": 0.01565, "grad_norm": 14.8125, "grad_norm_var": 163.484228515625, "learning_rate": 0.0001, "loss": 6.0231, "loss/crossentropy": 0.6415687203407288, "loss/hidden": 0.1376953125, "loss/logits": 0.010496280156075954, "loss/reg": 5.233306884765625, "loss/twn": 0.0, "step": 626 }, { "epoch": 0.015675, "grad_norm": 20.125, "grad_norm_var": 156.77355143229167, "learning_rate": 0.0001, "loss": 6.8528, "loss/crossentropy": 1.486595630645752, "loss/hidden": 0.126953125, "loss/logits": 0.005804477259516716, "loss/reg": 5.2334794998168945, "loss/twn": 0.0, "step": 627 }, { "epoch": 0.0157, "grad_norm": 18.0, "grad_norm_var": 147.53177083333333, "learning_rate": 0.0001, "loss": 6.2299, "loss/crossentropy": 0.6993483304977417, "loss/hidden": 0.287109375, "loss/logits": 0.010279776528477669, "loss/reg": 5.233189582824707, "loss/twn": 0.0, "step": 628 }, { "epoch": 0.015725, "grad_norm": 9.0625, "grad_norm_var": 134.67849934895833, "learning_rate": 0.0001, "loss": 7.3681, "loss/crossentropy": 2.038987874984741, "loss/hidden": 0.08642578125, "loss/logits": 0.009393742308020592, "loss/reg": 5.2332634925842285, "loss/twn": 0.0, "step": 629 }, { "epoch": 0.01575, "grad_norm": 13.625, "grad_norm_var": 136.13567708333332, "learning_rate": 0.0001, "loss": 7.7675, "loss/crossentropy": 2.3605449199676514, "loss/hidden": 0.16796875, "loss/logits": 0.006129886955022812, "loss/reg": 5.232873916625977, "loss/twn": 0.0, "step": 630 }, { "epoch": 0.015775, "grad_norm": 14.6875, "grad_norm_var": 106.847900390625, "learning_rate": 0.0001, "loss": 8.0393, "loss/crossentropy": 2.6303060054779053, "loss/hidden": 0.1689453125, "loss/logits": 0.007038387469947338, "loss/reg": 5.23299503326416, "loss/twn": 0.0, "step": 631 }, { "epoch": 0.0158, "grad_norm": 72.0, "grad_norm_var": 287.03904622395834, "learning_rate": 0.0001, "loss": 8.0743, "loss/crossentropy": 2.7000534534454346, "loss/hidden": 0.130859375, "loss/logits": 0.010081219486892223, "loss/reg": 5.233316421508789, "loss/twn": 0.0, "step": 632 }, { "epoch": 0.015825, "grad_norm": 39.0, "grad_norm_var": 300.17649739583334, "learning_rate": 0.0001, "loss": 7.4968, "loss/crossentropy": 2.066685676574707, "loss/hidden": 0.1875, "loss/logits": 0.009645121172070503, "loss/reg": 5.233018398284912, "loss/twn": 0.0, "step": 633 }, { "epoch": 0.01585, "grad_norm": 11.125, "grad_norm_var": 299.664697265625, "learning_rate": 0.0001, "loss": 7.4915, "loss/crossentropy": 2.1979663372039795, "loss/hidden": 0.0546875, "loss/logits": 0.0055001177825033665, "loss/reg": 5.23332405090332, "loss/twn": 0.0, "step": 634 }, { "epoch": 0.015875, "grad_norm": 13.5625, "grad_norm_var": 295.147509765625, "learning_rate": 0.0001, "loss": 8.1414, "loss/crossentropy": 2.8441083431243896, "loss/hidden": 0.05712890625, "loss/logits": 0.007112159393727779, "loss/reg": 5.233099460601807, "loss/twn": 0.0, "step": 635 }, { "epoch": 0.0159, "grad_norm": 14.4375, "grad_norm_var": 298.4408854166667, "learning_rate": 0.0001, "loss": 7.8434, "loss/crossentropy": 2.419523239135742, "loss/hidden": 0.1787109375, "loss/logits": 0.011547038331627846, "loss/reg": 5.233601093292236, "loss/twn": 0.0, "step": 636 }, { "epoch": 0.015925, "grad_norm": 135.0, "grad_norm_var": 1077.4806640625, "learning_rate": 0.0001, "loss": 7.9874, "loss/crossentropy": 2.4825010299682617, "loss/hidden": 0.25390625, "loss/logits": 0.017681429162621498, "loss/reg": 5.233287811279297, "loss/twn": 0.0, "step": 637 }, { "epoch": 0.01595, "grad_norm": 79.5, "grad_norm_var": 1226.39296875, "learning_rate": 0.0001, "loss": 5.6235, "loss/crossentropy": 0.20704708993434906, "loss/hidden": 0.1806640625, "loss/logits": 0.0022514096926897764, "loss/reg": 5.233528137207031, "loss/twn": 0.0, "step": 638 }, { "epoch": 0.015975, "grad_norm": 12.3125, "grad_norm_var": 1224.665869140625, "learning_rate": 0.0001, "loss": 7.9267, "loss/crossentropy": 2.5261454582214355, "loss/hidden": 0.1572265625, "loss/logits": 0.010081654414534569, "loss/reg": 5.233248233795166, "loss/twn": 0.0, "step": 639 }, { "epoch": 0.016, "grad_norm": 384.0, "grad_norm_var": 8927.619205729166, "learning_rate": 0.0001, "loss": 7.0656, "loss/crossentropy": 1.7351843118667603, "loss/hidden": 0.0927734375, "loss/logits": 0.004287827759981155, "loss/reg": 5.233373641967773, "loss/twn": 0.0, "step": 640 }, { "epoch": 0.016025, "grad_norm": 18.125, "grad_norm_var": 8926.408268229166, "learning_rate": 0.0001, "loss": 7.0317, "loss/crossentropy": 1.592779278755188, "loss/hidden": 0.193359375, "loss/logits": 0.012589013203978539, "loss/reg": 5.232937335968018, "loss/twn": 0.0, "step": 641 }, { "epoch": 0.01605, "grad_norm": 9.125, "grad_norm_var": 8958.401936848959, "learning_rate": 0.0001, "loss": 8.2427, "loss/crossentropy": 2.891831874847412, "loss/hidden": 0.1123046875, "loss/logits": 0.005527087952941656, "loss/reg": 5.233004093170166, "loss/twn": 0.0, "step": 642 }, { "epoch": 0.016075, "grad_norm": 93.0, "grad_norm_var": 8961.362483723959, "learning_rate": 0.0001, "loss": 6.9493, "loss/crossentropy": 1.5797322988510132, "loss/hidden": 0.1259765625, "loss/logits": 0.010628938674926758, "loss/reg": 5.232941627502441, "loss/twn": 0.0, "step": 643 }, { "epoch": 0.0161, "grad_norm": 9.875, "grad_norm_var": 9009.401546223959, "learning_rate": 0.0001, "loss": 8.1281, "loss/crossentropy": 2.7545292377471924, "loss/hidden": 0.1328125, "loss/logits": 0.007755103521049023, "loss/reg": 5.232993125915527, "loss/twn": 0.0, "step": 644 }, { "epoch": 0.016125, "grad_norm": 28.75, "grad_norm_var": 8905.093684895834, "learning_rate": 0.0001, "loss": 7.9328, "loss/crossentropy": 2.643615245819092, "loss/hidden": 0.054443359375, "loss/logits": 0.0016623031115159392, "loss/reg": 5.2330780029296875, "loss/twn": 0.0, "step": 645 }, { "epoch": 0.01615, "grad_norm": 20.625, "grad_norm_var": 8865.565559895833, "learning_rate": 0.0001, "loss": 6.9237, "loss/crossentropy": 1.4672698974609375, "loss/hidden": 0.2138671875, "loss/logits": 0.009410521015524864, "loss/reg": 5.233164310455322, "loss/twn": 0.0, "step": 646 }, { "epoch": 0.016175, "grad_norm": 15.25, "grad_norm_var": 8862.209749348958, "learning_rate": 0.0001, "loss": 7.9973, "loss/crossentropy": 2.6482272148132324, "loss/hidden": 0.10791015625, "loss/logits": 0.00825223047286272, "loss/reg": 5.232880592346191, "loss/twn": 0.0, "step": 647 }, { "epoch": 0.0162, "grad_norm": 13.6875, "grad_norm_var": 8979.335872395834, "learning_rate": 0.0001, "loss": 6.1538, "loss/crossentropy": 0.7194666862487793, "loss/hidden": 0.1884765625, "loss/logits": 0.012791863642632961, "loss/reg": 5.233090877532959, "loss/twn": 0.0, "step": 648 }, { "epoch": 0.016225, "grad_norm": 24.875, "grad_norm_var": 9023.984114583332, "learning_rate": 0.0001, "loss": 7.1305, "loss/crossentropy": 1.7185572385787964, "loss/hidden": 0.1669921875, "loss/logits": 0.011910820379853249, "loss/reg": 5.233007431030273, "loss/twn": 0.0, "step": 649 }, { "epoch": 0.01625, "grad_norm": 12.4375, "grad_norm_var": 9016.378108723959, "learning_rate": 0.0001, "loss": 7.4389, "loss/crossentropy": 2.082362174987793, "loss/hidden": 0.11279296875, "loss/logits": 0.010908122174441814, "loss/reg": 5.232817649841309, "loss/twn": 0.0, "step": 650 }, { "epoch": 0.016275, "grad_norm": 8.9375, "grad_norm_var": 9043.443994140625, "learning_rate": 0.0001, "loss": 7.4177, "loss/crossentropy": 2.062870502471924, "loss/hidden": 0.11328125, "loss/logits": 0.008752668276429176, "loss/reg": 5.232777118682861, "loss/twn": 0.0, "step": 651 }, { "epoch": 0.0163, "grad_norm": 12.0625, "grad_norm_var": 9056.640087890624, "learning_rate": 0.0001, "loss": 7.9823, "loss/crossentropy": 2.6078176498413086, "loss/hidden": 0.1328125, "loss/logits": 0.008596043102443218, "loss/reg": 5.23306941986084, "loss/twn": 0.0, "step": 652 }, { "epoch": 0.016325, "grad_norm": 122.5, "grad_norm_var": 8932.818473307292, "learning_rate": 0.0001, "loss": 7.9264, "loss/crossentropy": 2.5046682357788086, "loss/hidden": 0.1748046875, "loss/logits": 0.014288893900811672, "loss/reg": 5.232659339904785, "loss/twn": 0.0, "step": 653 }, { "epoch": 0.01635, "grad_norm": 21.5, "grad_norm_var": 8946.382014973959, "learning_rate": 0.0001, "loss": 7.2315, "loss/crossentropy": 1.8828779458999634, "loss/hidden": 0.10986328125, "loss/logits": 0.005740518681704998, "loss/reg": 5.23300838470459, "loss/twn": 0.0, "step": 654 }, { "epoch": 0.016375, "grad_norm": 352.0, "grad_norm_var": 14431.18515625, "learning_rate": 0.0001, "loss": 6.8525, "loss/crossentropy": 1.471374273300171, "loss/hidden": 0.138671875, "loss/logits": 0.009916655719280243, "loss/reg": 5.232522010803223, "loss/twn": 0.0, "step": 655 }, { "epoch": 0.0164, "grad_norm": 33.75, "grad_norm_var": 7512.653125, "learning_rate": 0.0001, "loss": 6.2595, "loss/crossentropy": 0.7713863253593445, "loss/hidden": 0.244140625, "loss/logits": 0.01109264511615038, "loss/reg": 5.232911109924316, "loss/twn": 0.0, "step": 656 }, { "epoch": 0.016425, "grad_norm": 15.0625, "grad_norm_var": 7526.165608723958, "learning_rate": 0.0001, "loss": 8.093, "loss/crossentropy": 2.709456205368042, "loss/hidden": 0.1337890625, "loss/logits": 0.01727338880300522, "loss/reg": 5.23248815536499, "loss/twn": 0.0, "step": 657 }, { "epoch": 0.01645, "grad_norm": 9.5625, "grad_norm_var": 7523.817122395833, "learning_rate": 0.0001, "loss": 6.334, "loss/crossentropy": 0.9124006032943726, "loss/hidden": 0.1806640625, "loss/logits": 0.007904157042503357, "loss/reg": 5.2329912185668945, "loss/twn": 0.0, "step": 658 }, { "epoch": 0.016475, "grad_norm": 9.625, "grad_norm_var": 7476.006770833334, "learning_rate": 0.0001, "loss": 7.0775, "loss/crossentropy": 1.669623613357544, "loss/hidden": 0.1640625, "loss/logits": 0.011253604665398598, "loss/reg": 5.232557773590088, "loss/twn": 0.0, "step": 659 }, { "epoch": 0.0165, "grad_norm": 9.8125, "grad_norm_var": 7476.294775390625, "learning_rate": 0.0001, "loss": 8.0574, "loss/crossentropy": 2.7882609367370605, "loss/hidden": 0.03515625, "loss/logits": 0.0012600821210071445, "loss/reg": 5.23272705078125, "loss/twn": 0.0, "step": 660 }, { "epoch": 0.016525, "grad_norm": 64.5, "grad_norm_var": 7481.564176432292, "learning_rate": 0.0001, "loss": 7.1712, "loss/crossentropy": 1.5283738374710083, "loss/hidden": 0.40234375, "loss/logits": 0.008047623559832573, "loss/reg": 5.2324652671813965, "loss/twn": 0.0, "step": 661 }, { "epoch": 0.01655, "grad_norm": 14.0625, "grad_norm_var": 7507.016080729167, "learning_rate": 0.0001, "loss": 6.9527, "loss/crossentropy": 1.5561813116073608, "loss/hidden": 0.150390625, "loss/logits": 0.013308672234416008, "loss/reg": 5.232776641845703, "loss/twn": 0.0, "step": 662 }, { "epoch": 0.016575, "grad_norm": 8.5625, "grad_norm_var": 7537.432014973959, "learning_rate": 0.0001, "loss": 6.883, "loss/crossentropy": 1.4900498390197754, "loss/hidden": 0.15234375, "loss/logits": 0.008140160702168941, "loss/reg": 5.232505798339844, "loss/twn": 0.0, "step": 663 }, { "epoch": 0.0166, "grad_norm": 10.4375, "grad_norm_var": 7552.011311848958, "learning_rate": 0.0001, "loss": 7.9879, "loss/crossentropy": 2.649501085281372, "loss/hidden": 0.0986328125, "loss/logits": 0.006926264148205519, "loss/reg": 5.2328362464904785, "loss/twn": 0.0, "step": 664 }, { "epoch": 0.016625, "grad_norm": 20.875, "grad_norm_var": 7564.067561848959, "learning_rate": 0.0001, "loss": 7.9074, "loss/crossentropy": 2.5822925567626953, "loss/hidden": 0.08642578125, "loss/logits": 0.006065651774406433, "loss/reg": 5.232655048370361, "loss/twn": 0.0, "step": 665 }, { "epoch": 0.01665, "grad_norm": 10.125, "grad_norm_var": 7574.551497395833, "learning_rate": 0.0001, "loss": 8.0297, "loss/crossentropy": 2.7020585536956787, "loss/hidden": 0.08642578125, "loss/logits": 0.008298511616885662, "loss/reg": 5.232966899871826, "loss/twn": 0.0, "step": 666 }, { "epoch": 0.016675, "grad_norm": 11.3125, "grad_norm_var": 7563.417447916667, "learning_rate": 0.0001, "loss": 6.2821, "loss/crossentropy": 0.8871417045593262, "loss/hidden": 0.1572265625, "loss/logits": 0.005114687606692314, "loss/reg": 5.232606410980225, "loss/twn": 0.0, "step": 667 }, { "epoch": 0.0167, "grad_norm": 16.75, "grad_norm_var": 7543.980192057292, "learning_rate": 0.0001, "loss": 7.9399, "loss/crossentropy": 2.610417604446411, "loss/hidden": 0.08642578125, "loss/logits": 0.010183998383581638, "loss/reg": 5.2328290939331055, "loss/twn": 0.0, "step": 668 }, { "epoch": 0.016725, "grad_norm": 7.71875, "grad_norm_var": 7191.31181233724, "learning_rate": 0.0001, "loss": 7.23, "loss/crossentropy": 1.891376256942749, "loss/hidden": 0.09765625, "loss/logits": 0.008510958403348923, "loss/reg": 5.232450485229492, "loss/twn": 0.0, "step": 669 }, { "epoch": 0.01675, "grad_norm": 16.5, "grad_norm_var": 7204.193322753907, "learning_rate": 0.0001, "loss": 6.8222, "loss/crossentropy": 1.4736872911453247, "loss/hidden": 0.10546875, "loss/logits": 0.010109667666256428, "loss/reg": 5.232895374298096, "loss/twn": 0.0, "step": 670 }, { "epoch": 0.016775, "grad_norm": 17.0, "grad_norm_var": 200.33785400390624, "learning_rate": 0.0001, "loss": 8.133, "loss/crossentropy": 2.7790119647979736, "loss/hidden": 0.11279296875, "loss/logits": 0.008631115779280663, "loss/reg": 5.232613563537598, "loss/twn": 0.0, "step": 671 }, { "epoch": 0.0168, "grad_norm": 21.875, "grad_norm_var": 182.99231363932293, "learning_rate": 0.0001, "loss": 8.1379, "loss/crossentropy": 2.766389846801758, "loss/hidden": 0.12890625, "loss/logits": 0.009614645503461361, "loss/reg": 5.232971668243408, "loss/twn": 0.0, "step": 672 }, { "epoch": 0.016825, "grad_norm": 10.1875, "grad_norm_var": 185.4031534830729, "learning_rate": 0.0001, "loss": 7.7498, "loss/crossentropy": 2.4996728897094727, "loss/hidden": 0.0140380859375, "loss/logits": 0.0034008692018687725, "loss/reg": 5.2326860427856445, "loss/twn": 0.0, "step": 673 }, { "epoch": 0.01685, "grad_norm": 11.6875, "grad_norm_var": 183.8099568684896, "learning_rate": 0.0001, "loss": 6.7223, "loss/crossentropy": 1.3949775695800781, "loss/hidden": 0.0927734375, "loss/logits": 0.0020429021678864956, "loss/reg": 5.23252010345459, "loss/twn": 0.0, "step": 674 }, { "epoch": 0.016875, "grad_norm": 11.0, "grad_norm_var": 182.70172119140625, "learning_rate": 0.0001, "loss": 7.9354, "loss/crossentropy": 2.6207685470581055, "loss/hidden": 0.07666015625, "loss/logits": 0.005197848193347454, "loss/reg": 5.232755184173584, "loss/twn": 0.0, "step": 675 }, { "epoch": 0.0169, "grad_norm": 20.5, "grad_norm_var": 180.45289306640626, "learning_rate": 0.0001, "loss": 8.3505, "loss/crossentropy": 2.9047460556030273, "loss/hidden": 0.193359375, "loss/logits": 0.019472159445285797, "loss/reg": 5.232929706573486, "loss/twn": 0.0, "step": 676 }, { "epoch": 0.016925, "grad_norm": 8.5, "grad_norm_var": 22.29664306640625, "learning_rate": 0.0001, "loss": 7.8368, "loss/crossentropy": 2.4469549655914307, "loss/hidden": 0.1484375, "loss/logits": 0.008610617369413376, "loss/reg": 5.232751369476318, "loss/twn": 0.0, "step": 677 }, { "epoch": 0.01695, "grad_norm": 14.9375, "grad_norm_var": 22.402144368489584, "learning_rate": 0.0001, "loss": 7.847, "loss/crossentropy": 2.5082645416259766, "loss/hidden": 0.0986328125, "loss/logits": 0.007502686232328415, "loss/reg": 5.2326483726501465, "loss/twn": 0.0, "step": 678 }, { "epoch": 0.016975, "grad_norm": 179.0, "grad_norm_var": 1722.9600545247397, "learning_rate": 0.0001, "loss": 6.7476, "loss/crossentropy": 1.1506078243255615, "loss/hidden": 0.35546875, "loss/logits": 0.008823427371680737, "loss/reg": 5.2327399253845215, "loss/twn": 0.0, "step": 679 }, { "epoch": 0.017, "grad_norm": 10.0625, "grad_norm_var": 1723.6607381184897, "learning_rate": 0.0001, "loss": 7.9045, "loss/crossentropy": 2.541614055633545, "loss/hidden": 0.12109375, "loss/logits": 0.008851654827594757, "loss/reg": 5.232950687408447, "loss/twn": 0.0, "step": 680 }, { "epoch": 0.017025, "grad_norm": 38.5, "grad_norm_var": 1735.1399373372396, "learning_rate": 0.0001, "loss": 6.9999, "loss/crossentropy": 1.6033979654312134, "loss/hidden": 0.1572265625, "loss/logits": 0.006385164335370064, "loss/reg": 5.232907772064209, "loss/twn": 0.0, "step": 681 }, { "epoch": 0.01705, "grad_norm": 17.625, "grad_norm_var": 1723.4270467122396, "learning_rate": 0.0001, "loss": 7.998, "loss/crossentropy": 2.6382601261138916, "loss/hidden": 0.11279296875, "loss/logits": 0.014375717379152775, "loss/reg": 5.232568264007568, "loss/twn": 0.0, "step": 682 }, { "epoch": 0.017075, "grad_norm": 10.6875, "grad_norm_var": 1724.6606079101562, "learning_rate": 0.0001, "loss": 6.5706, "loss/crossentropy": 1.2485917806625366, "loss/hidden": 0.08642578125, "loss/logits": 0.002942750696092844, "loss/reg": 5.232631206512451, "loss/twn": 0.0, "step": 683 }, { "epoch": 0.0171, "grad_norm": 55.25, "grad_norm_var": 1770.930790201823, "learning_rate": 0.0001, "loss": 7.0946, "loss/crossentropy": 1.641523003578186, "loss/hidden": 0.212890625, "loss/logits": 0.007442857138812542, "loss/reg": 5.2327880859375, "loss/twn": 0.0, "step": 684 }, { "epoch": 0.017125, "grad_norm": 12.0, "grad_norm_var": 1760.3909993489583, "learning_rate": 0.0001, "loss": 6.5653, "loss/crossentropy": 1.1899452209472656, "loss/hidden": 0.130859375, "loss/logits": 0.0118794534355402, "loss/reg": 5.232624530792236, "loss/twn": 0.0, "step": 685 }, { "epoch": 0.01715, "grad_norm": 11.0625, "grad_norm_var": 1770.9077473958334, "learning_rate": 0.0001, "loss": 7.3521, "loss/crossentropy": 2.0265913009643555, "loss/hidden": 0.08642578125, "loss/logits": 0.006208708509802818, "loss/reg": 5.2328410148620605, "loss/twn": 0.0, "step": 686 }, { "epoch": 0.017175, "grad_norm": 17.75, "grad_norm_var": 1769.8311848958333, "learning_rate": 0.0001, "loss": 6.2951, "loss/crossentropy": 0.9038959741592407, "loss/hidden": 0.1513671875, "loss/logits": 0.007222220301628113, "loss/reg": 5.2325825691223145, "loss/twn": 0.0, "step": 687 }, { "epoch": 0.0172, "grad_norm": 137.0, "grad_norm_var": 2501.6544270833333, "learning_rate": 0.0001, "loss": 7.9399, "loss/crossentropy": 2.5502278804779053, "loss/hidden": 0.1455078125, "loss/logits": 0.011638427153229713, "loss/reg": 5.232552528381348, "loss/twn": 0.0, "step": 688 }, { "epoch": 0.017225, "grad_norm": 16.25, "grad_norm_var": 2483.604280598958, "learning_rate": 0.0001, "loss": 8.1215, "loss/crossentropy": 2.7514421939849854, "loss/hidden": 0.126953125, "loss/logits": 0.010679876431822777, "loss/reg": 5.232418537139893, "loss/twn": 0.0, "step": 689 }, { "epoch": 0.01725, "grad_norm": 9.75, "grad_norm_var": 2490.0520182291666, "learning_rate": 0.0001, "loss": 7.1755, "loss/crossentropy": 1.8485499620437622, "loss/hidden": 0.08837890625, "loss/logits": 0.005838857963681221, "loss/reg": 5.2327117919921875, "loss/twn": 0.0, "step": 690 }, { "epoch": 0.017275, "grad_norm": 12.5, "grad_norm_var": 2485.2692057291665, "learning_rate": 0.0001, "loss": 7.1405, "loss/crossentropy": 1.8488080501556396, "loss/hidden": 0.0546875, "loss/logits": 0.0044571696780622005, "loss/reg": 5.232502460479736, "loss/twn": 0.0, "step": 691 }, { "epoch": 0.0173, "grad_norm": 1448.0, "grad_norm_var": 126949.88639322917, "learning_rate": 0.0001, "loss": 6.8951, "loss/crossentropy": 1.4486842155456543, "loss/hidden": 0.2001953125, "loss/logits": 0.013465155847370625, "loss/reg": 5.23276424407959, "loss/twn": 0.0, "step": 692 }, { "epoch": 0.017325, "grad_norm": 192.0, "grad_norm_var": 126205.7556640625, "learning_rate": 0.0001, "loss": 6.7837, "loss/crossentropy": 1.3860431909561157, "loss/hidden": 0.1591796875, "loss/logits": 0.006227361969649792, "loss/reg": 5.232283592224121, "loss/twn": 0.0, "step": 693 }, { "epoch": 0.01735, "grad_norm": 8.8125, "grad_norm_var": 126307.29348958333, "learning_rate": 0.0001, "loss": 7.7481, "loss/crossentropy": 2.4704062938690186, "loss/hidden": 0.0400390625, "loss/logits": 0.004876245744526386, "loss/reg": 5.232789993286133, "loss/twn": 0.0, "step": 694 }, { "epoch": 0.017375, "grad_norm": 13.9375, "grad_norm_var": 127064.13084309896, "learning_rate": 0.0001, "loss": 7.4, "loss/crossentropy": 2.0642364025115967, "loss/hidden": 0.09375, "loss/logits": 0.009522231295704842, "loss/reg": 5.2324628829956055, "loss/twn": 0.0, "step": 695 }, { "epoch": 0.0174, "grad_norm": 9.3125, "grad_norm_var": 127075.72967122396, "learning_rate": 0.0001, "loss": 7.1133, "loss/crossentropy": 1.8082016706466675, "loss/hidden": 0.0693359375, "loss/logits": 0.0033433041535317898, "loss/reg": 5.232382297515869, "loss/twn": 0.0, "step": 696 }, { "epoch": 0.017425, "grad_norm": 15.5, "grad_norm_var": 127376.05935872396, "learning_rate": 0.0001, "loss": 8.0679, "loss/crossentropy": 2.6998050212860107, "loss/hidden": 0.125, "loss/logits": 0.011031926609575748, "loss/reg": 5.232105731964111, "loss/twn": 0.0, "step": 697 }, { "epoch": 0.01745, "grad_norm": 16.5, "grad_norm_var": 127392.12693684896, "learning_rate": 0.0001, "loss": 7.1902, "loss/crossentropy": 1.788904070854187, "loss/hidden": 0.158203125, "loss/logits": 0.010435621254146099, "loss/reg": 5.232694149017334, "loss/twn": 0.0, "step": 698 }, { "epoch": 0.017475, "grad_norm": 26.25, "grad_norm_var": 127171.84055989583, "learning_rate": 0.0001, "loss": 8.0089, "loss/crossentropy": 2.5942537784576416, "loss/hidden": 0.1728515625, "loss/logits": 0.009584179148077965, "loss/reg": 5.232229232788086, "loss/twn": 0.0, "step": 699 }, { "epoch": 0.0175, "grad_norm": 7.375, "grad_norm_var": 127761.07708333334, "learning_rate": 0.0001, "loss": 7.1149, "loss/crossentropy": 1.7783420085906982, "loss/hidden": 0.0986328125, "loss/logits": 0.0052945781499147415, "loss/reg": 5.232630252838135, "loss/twn": 0.0, "step": 700 }, { "epoch": 0.017525, "grad_norm": 14.875, "grad_norm_var": 127719.3791015625, "learning_rate": 0.0001, "loss": 8.0908, "loss/crossentropy": 2.7603297233581543, "loss/hidden": 0.08642578125, "loss/logits": 0.011579321697354317, "loss/reg": 5.232461929321289, "loss/twn": 0.0, "step": 701 }, { "epoch": 0.01755, "grad_norm": 25.75, "grad_norm_var": 127515.01248372396, "learning_rate": 0.0001, "loss": 7.8012, "loss/crossentropy": 2.4607961177825928, "loss/hidden": 0.0986328125, "loss/logits": 0.009348167106509209, "loss/reg": 5.232429504394531, "loss/twn": 0.0, "step": 702 }, { "epoch": 0.017575, "grad_norm": 36.25, "grad_norm_var": 127276.2372233073, "learning_rate": 0.0001, "loss": 7.7355, "loss/crossentropy": 2.2735610008239746, "loss/hidden": 0.216796875, "loss/logits": 0.012781517580151558, "loss/reg": 5.232335090637207, "loss/twn": 0.0, "step": 703 }, { "epoch": 0.0176, "grad_norm": 12.8125, "grad_norm_var": 128031.16139322917, "learning_rate": 0.0001, "loss": 7.7608, "loss/crossentropy": 2.4146182537078857, "loss/hidden": 0.10595703125, "loss/logits": 0.007775201462209225, "loss/reg": 5.232488632202148, "loss/twn": 0.0, "step": 704 }, { "epoch": 0.017625, "grad_norm": 12.875, "grad_norm_var": 128077.03854166667, "learning_rate": 0.0001, "loss": 7.6663, "loss/crossentropy": 2.3123011589050293, "loss/hidden": 0.11279296875, "loss/logits": 0.008896533399820328, "loss/reg": 5.232283115386963, "loss/twn": 0.0, "step": 705 }, { "epoch": 0.01765, "grad_norm": 9.8125, "grad_norm_var": 128076.14998372395, "learning_rate": 0.0001, "loss": 6.7063, "loss/crossentropy": 1.328172206878662, "loss/hidden": 0.138671875, "loss/logits": 0.006915399804711342, "loss/reg": 5.232503414154053, "loss/twn": 0.0, "step": 706 }, { "epoch": 0.017675, "grad_norm": 10.25, "grad_norm_var": 128107.63943684896, "learning_rate": 0.0001, "loss": 5.6211, "loss/crossentropy": 0.28784415125846863, "loss/hidden": 0.09521484375, "loss/logits": 0.005815575830638409, "loss/reg": 5.232184410095215, "loss/twn": 0.0, "step": 707 }, { "epoch": 0.0177, "grad_norm": 9.875, "grad_norm_var": 2011.0417805989584, "learning_rate": 0.0001, "loss": 7.6542, "loss/crossentropy": 2.2898013591766357, "loss/hidden": 0.126953125, "loss/logits": 0.005081703420728445, "loss/reg": 5.232325553894043, "loss/twn": 0.0, "step": 708 }, { "epoch": 0.017725, "grad_norm": 9.6875, "grad_norm_var": 62.6244140625, "learning_rate": 0.0001, "loss": 6.8514, "loss/crossentropy": 1.4818062782287598, "loss/hidden": 0.1328125, "loss/logits": 0.0045208255760371685, "loss/reg": 5.232298851013184, "loss/twn": 0.0, "step": 709 }, { "epoch": 0.01775, "grad_norm": 14.9375, "grad_norm_var": 59.92239583333333, "learning_rate": 0.0001, "loss": 7.9023, "loss/crossentropy": 2.559528112411499, "loss/hidden": 0.103515625, "loss/logits": 0.006969613488763571, "loss/reg": 5.232254505157471, "loss/twn": 0.0, "step": 710 }, { "epoch": 0.017775, "grad_norm": 11.375, "grad_norm_var": 60.82394205729167, "learning_rate": 0.0001, "loss": 6.8892, "loss/crossentropy": 1.5907115936279297, "loss/hidden": 0.064453125, "loss/logits": 0.0018022289732471108, "loss/reg": 5.232196807861328, "loss/twn": 0.0, "step": 711 }, { "epoch": 0.0178, "grad_norm": 12.5, "grad_norm_var": 58.95045572916667, "learning_rate": 0.0001, "loss": 8.0051, "loss/crossentropy": 2.6705007553100586, "loss/hidden": 0.09619140625, "loss/logits": 0.005942562595009804, "loss/reg": 5.232419967651367, "loss/twn": 0.0, "step": 712 }, { "epoch": 0.017825, "grad_norm": 7.6875, "grad_norm_var": 62.675634765625, "learning_rate": 0.0001, "loss": 7.4538, "loss/crossentropy": 2.1238763332366943, "loss/hidden": 0.09130859375, "loss/logits": 0.006380223203450441, "loss/reg": 5.232213497161865, "loss/twn": 0.0, "step": 713 }, { "epoch": 0.01785, "grad_norm": 11.5, "grad_norm_var": 63.188655598958334, "learning_rate": 0.0001, "loss": 7.0007, "loss/crossentropy": 1.551921010017395, "loss/hidden": 0.2099609375, "loss/logits": 0.006598391104489565, "loss/reg": 5.2322468757629395, "loss/twn": 0.0, "step": 714 }, { "epoch": 0.017875, "grad_norm": 12.5625, "grad_norm_var": 53.66087239583333, "learning_rate": 0.0001, "loss": 8.5447, "loss/crossentropy": 3.1880903244018555, "loss/hidden": 0.11474609375, "loss/logits": 0.009867793880403042, "loss/reg": 5.2319817543029785, "loss/twn": 0.0, "step": 715 }, { "epoch": 0.0179, "grad_norm": 8.8125, "grad_norm_var": 52.566650390625, "learning_rate": 0.0001, "loss": 7.3708, "loss/crossentropy": 2.0305135250091553, "loss/hidden": 0.10107421875, "loss/logits": 0.00682512391358614, "loss/reg": 5.232370853424072, "loss/twn": 0.0, "step": 716 }, { "epoch": 0.017925, "grad_norm": 9.5, "grad_norm_var": 53.636051432291666, "learning_rate": 0.0001, "loss": 6.9861, "loss/crossentropy": 1.6283918619155884, "loss/hidden": 0.1201171875, "loss/logits": 0.0056338622234761715, "loss/reg": 5.231950759887695, "loss/twn": 0.0, "step": 717 }, { "epoch": 0.01795, "grad_norm": 16.375, "grad_norm_var": 43.831363932291666, "learning_rate": 0.0001, "loss": 7.5541, "loss/crossentropy": 2.1077566146850586, "loss/hidden": 0.2041015625, "loss/logits": 0.009965687990188599, "loss/reg": 5.232308387756348, "loss/twn": 0.0, "step": 718 }, { "epoch": 0.017975, "grad_norm": 12.8125, "grad_norm_var": 5.275455729166667, "learning_rate": 0.0001, "loss": 8.3363, "loss/crossentropy": 2.959925413131714, "loss/hidden": 0.1298828125, "loss/logits": 0.014515706337988377, "loss/reg": 5.231976509094238, "loss/twn": 0.0, "step": 719 }, { "epoch": 0.018, "grad_norm": 13.1875, "grad_norm_var": 5.351822916666666, "learning_rate": 0.0001, "loss": 8.3041, "loss/crossentropy": 2.966278553009033, "loss/hidden": 0.0986328125, "loss/logits": 0.007250492461025715, "loss/reg": 5.2319159507751465, "loss/twn": 0.0, "step": 720 }, { "epoch": 0.018025, "grad_norm": 16.25, "grad_norm_var": 6.689518229166667, "learning_rate": 0.0001, "loss": 8.1075, "loss/crossentropy": 2.7332916259765625, "loss/hidden": 0.12890625, "loss/logits": 0.0132482023909688, "loss/reg": 5.232076644897461, "loss/twn": 0.0, "step": 721 }, { "epoch": 0.01805, "grad_norm": 12.0, "grad_norm_var": 6.439436848958334, "learning_rate": 0.0001, "loss": 6.6983, "loss/crossentropy": 1.3738974332809448, "loss/hidden": 0.08642578125, "loss/logits": 0.006151068024337292, "loss/reg": 5.231801986694336, "loss/twn": 0.0, "step": 722 }, { "epoch": 0.018075, "grad_norm": 8.8125, "grad_norm_var": 6.871809895833334, "learning_rate": 0.0001, "loss": 7.4411, "loss/crossentropy": 2.184509515762329, "loss/hidden": 0.02099609375, "loss/logits": 0.00361478328704834, "loss/reg": 5.23202657699585, "loss/twn": 0.0, "step": 723 }, { "epoch": 0.0181, "grad_norm": 13.75, "grad_norm_var": 6.845572916666667, "learning_rate": 0.0001, "loss": 7.2513, "loss/crossentropy": 1.84177827835083, "loss/hidden": 0.16796875, "loss/logits": 0.009550071321427822, "loss/reg": 5.231957912445068, "loss/twn": 0.0, "step": 724 }, { "epoch": 0.018125, "grad_norm": 11.625, "grad_norm_var": 6.486832682291666, "learning_rate": 0.0001, "loss": 8.1392, "loss/crossentropy": 2.7632906436920166, "loss/hidden": 0.1318359375, "loss/logits": 0.012371431104838848, "loss/reg": 5.231691360473633, "loss/twn": 0.0, "step": 725 }, { "epoch": 0.01815, "grad_norm": 13.625, "grad_norm_var": 6.098893229166666, "learning_rate": 0.0001, "loss": 8.1336, "loss/crossentropy": 2.7336745262145996, "loss/hidden": 0.158203125, "loss/logits": 0.00991059560328722, "loss/reg": 5.231861114501953, "loss/twn": 0.0, "step": 726 }, { "epoch": 0.018175, "grad_norm": 9.3125, "grad_norm_var": 6.543082682291667, "learning_rate": 0.0001, "loss": 8.1011, "loss/crossentropy": 2.7824230194091797, "loss/hidden": 0.08154296875, "loss/logits": 0.005494968965649605, "loss/reg": 5.231605052947998, "loss/twn": 0.0, "step": 727 }, { "epoch": 0.0182, "grad_norm": 214.0, "grad_norm_var": 2560.450634765625, "learning_rate": 0.0001, "loss": 7.8675, "loss/crossentropy": 2.6006758213043213, "loss/hidden": 0.03271484375, "loss/logits": 0.0022825347259640694, "loss/reg": 5.231857776641846, "loss/twn": 0.0, "step": 728 }, { "epoch": 0.018225, "grad_norm": 10.25, "grad_norm_var": 2555.1207682291665, "learning_rate": 0.0001, "loss": 7.9332, "loss/crossentropy": 2.5436322689056396, "loss/hidden": 0.150390625, "loss/logits": 0.007534053176641464, "loss/reg": 5.231611251831055, "loss/twn": 0.0, "step": 729 }, { "epoch": 0.01825, "grad_norm": 7.46875, "grad_norm_var": 2563.203739420573, "learning_rate": 0.0001, "loss": 6.4295, "loss/crossentropy": 1.0562663078308105, "loss/hidden": 0.1357421875, "loss/logits": 0.005792177282273769, "loss/reg": 5.231747150421143, "loss/twn": 0.0, "step": 730 }, { "epoch": 0.018275, "grad_norm": 9.5, "grad_norm_var": 2568.6221313476562, "learning_rate": 0.0001, "loss": 6.1, "loss/crossentropy": 0.7717524766921997, "loss/hidden": 0.0908203125, "loss/logits": 0.005497816018760204, "loss/reg": 5.231908321380615, "loss/twn": 0.0, "step": 731 }, { "epoch": 0.0183, "grad_norm": 15.25, "grad_norm_var": 2558.0002563476564, "learning_rate": 0.0001, "loss": 8.2472, "loss/crossentropy": 2.8519446849823, "loss/hidden": 0.1484375, "loss/logits": 0.015055367723107338, "loss/reg": 5.231762886047363, "loss/twn": 0.0, "step": 732 }, { "epoch": 0.018325, "grad_norm": 18.625, "grad_norm_var": 2544.823661295573, "learning_rate": 0.0001, "loss": 8.0862, "loss/crossentropy": 2.667858600616455, "loss/hidden": 0.1708984375, "loss/logits": 0.015612177550792694, "loss/reg": 5.231838703155518, "loss/twn": 0.0, "step": 733 }, { "epoch": 0.01835, "grad_norm": 11.0, "grad_norm_var": 2552.937951660156, "learning_rate": 0.0001, "loss": 8.1949, "loss/crossentropy": 2.86808705329895, "loss/hidden": 0.08642578125, "loss/logits": 0.00871000811457634, "loss/reg": 5.231663703918457, "loss/twn": 0.0, "step": 734 }, { "epoch": 0.018375, "grad_norm": 10.375, "grad_norm_var": 2557.2188110351562, "learning_rate": 0.0001, "loss": 5.7664, "loss/crossentropy": 0.43561050295829773, "loss/hidden": 0.0947265625, "loss/logits": 0.004361970815807581, "loss/reg": 5.231712818145752, "loss/twn": 0.0, "step": 735 }, { "epoch": 0.0184, "grad_norm": 10.375, "grad_norm_var": 2562.0264282226562, "learning_rate": 0.0001, "loss": 5.8451, "loss/crossentropy": 0.3734654188156128, "loss/hidden": 0.232421875, "loss/logits": 0.0075777387246489525, "loss/reg": 5.231621265411377, "loss/twn": 0.0, "step": 736 }, { "epoch": 0.018425, "grad_norm": 14.625, "grad_norm_var": 2563.9819295247394, "learning_rate": 0.0001, "loss": 6.356, "loss/crossentropy": 0.9930484294891357, "loss/hidden": 0.12451171875, "loss/logits": 0.006776281166821718, "loss/reg": 5.231681823730469, "loss/twn": 0.0, "step": 737 }, { "epoch": 0.01845, "grad_norm": 7.75, "grad_norm_var": 2572.144364420573, "learning_rate": 0.0001, "loss": 6.7646, "loss/crossentropy": 1.4421303272247314, "loss/hidden": 0.08642578125, "loss/logits": 0.0040518054738640785, "loss/reg": 5.231963157653809, "loss/twn": 0.0, "step": 738 }, { "epoch": 0.018475, "grad_norm": 13.5, "grad_norm_var": 2563.933915201823, "learning_rate": 0.0001, "loss": 8.2343, "loss/crossentropy": 2.907639741897583, "loss/hidden": 0.08642578125, "loss/logits": 0.008740050718188286, "loss/reg": 5.23153829574585, "loss/twn": 0.0, "step": 739 }, { "epoch": 0.0185, "grad_norm": 13.875, "grad_norm_var": 2563.7567342122397, "learning_rate": 0.0001, "loss": 8.0317, "loss/crossentropy": 2.705251693725586, "loss/hidden": 0.08642578125, "loss/logits": 0.008279213681817055, "loss/reg": 5.231719493865967, "loss/twn": 0.0, "step": 740 }, { "epoch": 0.018525, "grad_norm": 13.6875, "grad_norm_var": 2560.4964803059897, "learning_rate": 0.0001, "loss": 6.2045, "loss/crossentropy": 0.7274767160415649, "loss/hidden": 0.236328125, "loss/logits": 0.009199721738696098, "loss/reg": 5.231486797332764, "loss/twn": 0.0, "step": 741 }, { "epoch": 0.01855, "grad_norm": 17.5, "grad_norm_var": 2555.7768513997394, "learning_rate": 0.0001, "loss": 7.1638, "loss/crossentropy": 1.6126173734664917, "loss/hidden": 0.30078125, "loss/logits": 0.018833626061677933, "loss/reg": 5.231540203094482, "loss/twn": 0.0, "step": 742 }, { "epoch": 0.018575, "grad_norm": 63.25, "grad_norm_var": 2626.092248535156, "learning_rate": 0.0001, "loss": 7.4974, "loss/crossentropy": 1.9825630187988281, "loss/hidden": 0.263671875, "loss/logits": 0.019452253356575966, "loss/reg": 5.231677055358887, "loss/twn": 0.0, "step": 743 }, { "epoch": 0.0186, "grad_norm": 11.5625, "grad_norm_var": 172.0647420247396, "learning_rate": 0.0001, "loss": 7.2546, "loss/crossentropy": 1.7497669458389282, "loss/hidden": 0.259765625, "loss/logits": 0.013276930898427963, "loss/reg": 5.231839179992676, "loss/twn": 0.0, "step": 744 }, { "epoch": 0.018625, "grad_norm": 9.5, "grad_norm_var": 172.62860921223958, "learning_rate": 0.0001, "loss": 7.8067, "loss/crossentropy": 2.420611619949341, "loss/hidden": 0.1455078125, "loss/logits": 0.009184225462377071, "loss/reg": 5.231430530548096, "loss/twn": 0.0, "step": 745 }, { "epoch": 0.01865, "grad_norm": 13.875, "grad_norm_var": 168.34192708333333, "learning_rate": 0.0001, "loss": 6.583, "loss/crossentropy": 1.1484166383743286, "loss/hidden": 0.1943359375, "loss/logits": 0.008578259497880936, "loss/reg": 5.231657028198242, "loss/twn": 0.0, "step": 746 }, { "epoch": 0.018675, "grad_norm": 118.0, "grad_norm_var": 811.6565104166667, "learning_rate": 0.0001, "loss": 6.9941, "loss/crossentropy": 1.6018927097320557, "loss/hidden": 0.154296875, "loss/logits": 0.0065146745182573795, "loss/reg": 5.231416702270508, "loss/twn": 0.0, "step": 747 }, { "epoch": 0.0187, "grad_norm": 17.625, "grad_norm_var": 809.6587890625, "learning_rate": 0.0001, "loss": 7.9705, "loss/crossentropy": 2.5781874656677246, "loss/hidden": 0.150390625, "loss/logits": 0.010151069611310959, "loss/reg": 5.2317585945129395, "loss/twn": 0.0, "step": 748 }, { "epoch": 0.018725, "grad_norm": 15.75, "grad_norm_var": 811.78359375, "learning_rate": 0.0001, "loss": 7.9057, "loss/crossentropy": 2.5473275184631348, "loss/hidden": 0.11767578125, "loss/logits": 0.009290559217333794, "loss/reg": 5.231451034545898, "loss/twn": 0.0, "step": 749 }, { "epoch": 0.01875, "grad_norm": 57.0, "grad_norm_var": 872.6377604166667, "learning_rate": 0.0001, "loss": 5.8577, "loss/crossentropy": 0.43267643451690674, "loss/hidden": 0.1845703125, "loss/logits": 0.008671510964632034, "loss/reg": 5.231771469116211, "loss/twn": 0.0, "step": 750 }, { "epoch": 0.018775, "grad_norm": 12.875, "grad_norm_var": 867.9815104166667, "learning_rate": 0.0001, "loss": 8.1731, "loss/crossentropy": 2.8458542823791504, "loss/hidden": 0.0888671875, "loss/logits": 0.00722483079880476, "loss/reg": 5.231186389923096, "loss/twn": 0.0, "step": 751 }, { "epoch": 0.0188, "grad_norm": 31.25, "grad_norm_var": 852.6405598958333, "learning_rate": 0.0001, "loss": 6.5314, "loss/crossentropy": 1.1035902500152588, "loss/hidden": 0.19140625, "loss/logits": 0.0045470790937542915, "loss/reg": 5.231815814971924, "loss/twn": 0.0, "step": 752 }, { "epoch": 0.018825, "grad_norm": 9.6875, "grad_norm_var": 862.2956868489583, "learning_rate": 0.0001, "loss": 5.7019, "loss/crossentropy": 0.3551396429538727, "loss/hidden": 0.1123046875, "loss/logits": 0.003157853614538908, "loss/reg": 5.2312774658203125, "loss/twn": 0.0, "step": 753 }, { "epoch": 0.01885, "grad_norm": 9.6875, "grad_norm_var": 857.6431640625, "learning_rate": 0.0001, "loss": 7.8963, "loss/crossentropy": 2.603400945663452, "loss/hidden": 0.05712890625, "loss/logits": 0.004422674421221018, "loss/reg": 5.231386661529541, "loss/twn": 0.0, "step": 754 }, { "epoch": 0.018875, "grad_norm": 14.125, "grad_norm_var": 856.56015625, "learning_rate": 0.0001, "loss": 7.4962, "loss/crossentropy": 2.134359359741211, "loss/hidden": 0.1201171875, "loss/logits": 0.010161285288631916, "loss/reg": 5.231522083282471, "loss/twn": 0.0, "step": 755 }, { "epoch": 0.0189, "grad_norm": 16.125, "grad_norm_var": 852.990625, "learning_rate": 0.0001, "loss": 8.2635, "loss/crossentropy": 2.91304874420166, "loss/hidden": 0.10595703125, "loss/logits": 0.01325392909348011, "loss/reg": 5.231236934661865, "loss/twn": 0.0, "step": 756 }, { "epoch": 0.018925, "grad_norm": 8.75, "grad_norm_var": 863.2577962239583, "learning_rate": 0.0001, "loss": 6.6673, "loss/crossentropy": 1.2952691316604614, "loss/hidden": 0.1337890625, "loss/logits": 0.006863090209662914, "loss/reg": 5.231356143951416, "loss/twn": 0.0, "step": 757 }, { "epoch": 0.01895, "grad_norm": 9.4375, "grad_norm_var": 877.1677083333333, "learning_rate": 0.0001, "loss": 6.9557, "loss/crossentropy": 1.5827580690383911, "loss/hidden": 0.1328125, "loss/logits": 0.008454329334199429, "loss/reg": 5.231715202331543, "loss/twn": 0.0, "step": 758 }, { "epoch": 0.018975, "grad_norm": 16.0, "grad_norm_var": 783.0122395833333, "learning_rate": 0.0001, "loss": 7.1366, "loss/crossentropy": 1.7788175344467163, "loss/hidden": 0.1201171875, "loss/logits": 0.006595224142074585, "loss/reg": 5.231091499328613, "loss/twn": 0.0, "step": 759 }, { "epoch": 0.019, "grad_norm": 9.75, "grad_norm_var": 786.030712890625, "learning_rate": 0.0001, "loss": 8.1245, "loss/crossentropy": 2.800469160079956, "loss/hidden": 0.08642578125, "loss/logits": 0.006068210117518902, "loss/reg": 5.231574535369873, "loss/twn": 0.0, "step": 760 }, { "epoch": 0.019025, "grad_norm": 19.25, "grad_norm_var": 774.305322265625, "learning_rate": 0.0001, "loss": 6.9579, "loss/crossentropy": 1.6193712949752808, "loss/hidden": 0.10107421875, "loss/logits": 0.006502064410597086, "loss/reg": 5.2309889793396, "loss/twn": 0.0, "step": 761 }, { "epoch": 0.01905, "grad_norm": 20.625, "grad_norm_var": 768.311181640625, "learning_rate": 0.0001, "loss": 8.3539, "loss/crossentropy": 2.9865291118621826, "loss/hidden": 0.11962890625, "loss/logits": 0.016289401799440384, "loss/reg": 5.231488227844238, "loss/twn": 0.0, "step": 762 }, { "epoch": 0.019075, "grad_norm": 31.625, "grad_norm_var": 153.429150390625, "learning_rate": 0.0001, "loss": 7.0021, "loss/crossentropy": 1.6152843236923218, "loss/hidden": 0.1494140625, "loss/logits": 0.006050444208085537, "loss/reg": 5.2313923835754395, "loss/twn": 0.0, "step": 763 }, { "epoch": 0.0191, "grad_norm": 39.25, "grad_norm_var": 179.49178059895834, "learning_rate": 0.0001, "loss": 6.1086, "loss/crossentropy": 0.6142204403877258, "loss/hidden": 0.2578125, "loss/logits": 0.005147262010723352, "loss/reg": 5.231447219848633, "loss/twn": 0.0, "step": 764 }, { "epoch": 0.019125, "grad_norm": 28.0, "grad_norm_var": 181.80779622395832, "learning_rate": 0.0001, "loss": 7.0546, "loss/crossentropy": 1.6059232950210571, "loss/hidden": 0.212890625, "loss/logits": 0.00476275198161602, "loss/reg": 5.231032371520996, "loss/twn": 0.0, "step": 765 }, { "epoch": 0.01915, "grad_norm": 11.875, "grad_norm_var": 91.510791015625, "learning_rate": 0.0001, "loss": 7.9657, "loss/crossentropy": 2.631168842315674, "loss/hidden": 0.09619140625, "loss/logits": 0.007133619859814644, "loss/reg": 5.231229305267334, "loss/twn": 0.0, "step": 766 }, { "epoch": 0.019175, "grad_norm": 43.75, "grad_norm_var": 129.911572265625, "learning_rate": 0.0001, "loss": 6.3914, "loss/crossentropy": 1.015294075012207, "loss/hidden": 0.13671875, "loss/logits": 0.00839436985552311, "loss/reg": 5.231020450592041, "loss/twn": 0.0, "step": 767 }, { "epoch": 0.0192, "grad_norm": 13.625, "grad_norm_var": 122.769775390625, "learning_rate": 0.0001, "loss": 6.8334, "loss/crossentropy": 1.33900785446167, "loss/hidden": 0.251953125, "loss/logits": 0.011178944259881973, "loss/reg": 5.2312493324279785, "loss/twn": 0.0, "step": 768 }, { "epoch": 0.019225, "grad_norm": 11.5625, "grad_norm_var": 120.699462890625, "learning_rate": 0.0001, "loss": 6.9225, "loss/crossentropy": 1.5955766439437866, "loss/hidden": 0.0908203125, "loss/logits": 0.005241988226771355, "loss/reg": 5.230888366699219, "loss/twn": 0.0, "step": 769 }, { "epoch": 0.01925, "grad_norm": 17.125, "grad_norm_var": 114.95670572916667, "learning_rate": 0.0001, "loss": 8.1154, "loss/crossentropy": 2.7243542671203613, "loss/hidden": 0.142578125, "loss/logits": 0.017005622386932373, "loss/reg": 5.231443881988525, "loss/twn": 0.0, "step": 770 }, { "epoch": 0.019275, "grad_norm": 306.0, "grad_norm_var": 5232.954427083333, "learning_rate": 0.0001, "loss": 7.935, "loss/crossentropy": 2.5942704677581787, "loss/hidden": 0.10205078125, "loss/logits": 0.007505115121603012, "loss/reg": 5.231191635131836, "loss/twn": 0.0, "step": 771 }, { "epoch": 0.0193, "grad_norm": 13.0, "grad_norm_var": 5242.542643229167, "learning_rate": 0.0001, "loss": 7.9428, "loss/crossentropy": 2.644366979598999, "loss/hidden": 0.0595703125, "loss/logits": 0.007467132993042469, "loss/reg": 5.231389045715332, "loss/twn": 0.0, "step": 772 }, { "epoch": 0.019325, "grad_norm": 49.5, "grad_norm_var": 5190.246809895833, "learning_rate": 0.0001, "loss": 6.8313, "loss/crossentropy": 1.4516693353652954, "loss/hidden": 0.1416015625, "loss/logits": 0.006859183311462402, "loss/reg": 5.231204032897949, "loss/twn": 0.0, "step": 773 }, { "epoch": 0.01935, "grad_norm": 10.25, "grad_norm_var": 5186.974593098958, "learning_rate": 0.0001, "loss": 8.2807, "loss/crossentropy": 3.0418143272399902, "loss/hidden": 0.00469970703125, "loss/logits": 0.002671225229278207, "loss/reg": 5.23149299621582, "loss/twn": 0.0, "step": 774 }, { "epoch": 0.019375, "grad_norm": 12.8125, "grad_norm_var": 5197.841145833333, "learning_rate": 0.0001, "loss": 7.9916, "loss/crossentropy": 2.7271320819854736, "loss/hidden": 0.0302734375, "loss/logits": 0.00295096542686224, "loss/reg": 5.231270790100098, "loss/twn": 0.0, "step": 775 }, { "epoch": 0.0194, "grad_norm": 23.0, "grad_norm_var": 5155.59296875, "learning_rate": 0.0001, "loss": 8.1296, "loss/crossentropy": 2.7584662437438965, "loss/hidden": 0.126953125, "loss/logits": 0.012552576139569283, "loss/reg": 5.231604099273682, "loss/twn": 0.0, "step": 776 }, { "epoch": 0.019425, "grad_norm": 14.1875, "grad_norm_var": 5171.675634765625, "learning_rate": 0.0001, "loss": 5.5239, "loss/crossentropy": 0.21823735535144806, "loss/hidden": 0.0693359375, "loss/logits": 0.005251707974821329, "loss/reg": 5.231090545654297, "loss/twn": 0.0, "step": 777 }, { "epoch": 0.01945, "grad_norm": 11.875, "grad_norm_var": 5199.516129557292, "learning_rate": 0.0001, "loss": 7.0206, "loss/crossentropy": 1.654329538345337, "loss/hidden": 0.12451171875, "loss/logits": 0.01058058813214302, "loss/reg": 5.231167316436768, "loss/twn": 0.0, "step": 778 }, { "epoch": 0.019475, "grad_norm": 31.75, "grad_norm_var": 5199.380192057291, "learning_rate": 0.0001, "loss": 7.4021, "loss/crossentropy": 2.093484878540039, "loss/hidden": 0.07568359375, "loss/logits": 0.0018800008110702038, "loss/reg": 5.231100082397461, "loss/twn": 0.0, "step": 779 }, { "epoch": 0.0195, "grad_norm": 230.0, "grad_norm_var": 7458.277457682291, "learning_rate": 0.0001, "loss": 7.047, "loss/crossentropy": 1.6409082412719727, "loss/hidden": 0.1650390625, "loss/logits": 0.00975135900080204, "loss/reg": 5.231270790100098, "loss/twn": 0.0, "step": 780 }, { "epoch": 0.019525, "grad_norm": 17.875, "grad_norm_var": 7496.773551432291, "learning_rate": 0.0001, "loss": 8.1639, "loss/crossentropy": 2.775987148284912, "loss/hidden": 0.150390625, "loss/logits": 0.006807660683989525, "loss/reg": 5.230672836303711, "loss/twn": 0.0, "step": 781 }, { "epoch": 0.01955, "grad_norm": 13.6875, "grad_norm_var": 7487.490625, "learning_rate": 0.0001, "loss": 6.7803, "loss/crossentropy": 1.4760520458221436, "loss/hidden": 0.0693359375, "loss/logits": 0.003954825457185507, "loss/reg": 5.230950355529785, "loss/twn": 0.0, "step": 782 }, { "epoch": 0.019575, "grad_norm": 15.0625, "grad_norm_var": 7567.613916015625, "learning_rate": 0.0001, "loss": 8.3797, "loss/crossentropy": 3.015629291534424, "loss/hidden": 0.11962890625, "loss/logits": 0.013525455258786678, "loss/reg": 5.230944633483887, "loss/twn": 0.0, "step": 783 }, { "epoch": 0.0196, "grad_norm": 13.125, "grad_norm_var": 7570.018343098958, "learning_rate": 0.0001, "loss": 8.0947, "loss/crossentropy": 2.698845863342285, "loss/hidden": 0.1474609375, "loss/logits": 0.017367932945489883, "loss/reg": 5.231037616729736, "loss/twn": 0.0, "step": 784 }, { "epoch": 0.019625, "grad_norm": 98.0, "grad_norm_var": 7600.609114583333, "learning_rate": 0.0001, "loss": 8.2002, "loss/crossentropy": 2.826629161834717, "loss/hidden": 0.1318359375, "loss/logits": 0.0108743105083704, "loss/reg": 5.230876445770264, "loss/twn": 0.0, "step": 785 }, { "epoch": 0.01965, "grad_norm": 10.125, "grad_norm_var": 7638.861197916666, "learning_rate": 0.0001, "loss": 7.1464, "loss/crossentropy": 1.752783179283142, "loss/hidden": 0.15625, "loss/logits": 0.006541845388710499, "loss/reg": 5.230816841125488, "loss/twn": 0.0, "step": 786 }, { "epoch": 0.019675, "grad_norm": 12.75, "grad_norm_var": 3175.657291666667, "learning_rate": 0.0001, "loss": 7.4269, "loss/crossentropy": 2.056227207183838, "loss/hidden": 0.125, "loss/logits": 0.014510264620184898, "loss/reg": 5.231206893920898, "loss/twn": 0.0, "step": 787 }, { "epoch": 0.0197, "grad_norm": 10.3125, "grad_norm_var": 3184.372770182292, "learning_rate": 0.0001, "loss": 6.9013, "loss/crossentropy": 1.5427310466766357, "loss/hidden": 0.1201171875, "loss/logits": 0.007622131146490574, "loss/reg": 5.230830192565918, "loss/twn": 0.0, "step": 788 }, { "epoch": 0.019725, "grad_norm": 9.375, "grad_norm_var": 3212.2094889322916, "learning_rate": 0.0001, "loss": 8.0128, "loss/crossentropy": 2.7377614974975586, "loss/hidden": 0.0400390625, "loss/logits": 0.004178863950073719, "loss/reg": 5.230835914611816, "loss/twn": 0.0, "step": 789 }, { "epoch": 0.01975, "grad_norm": 9.625, "grad_norm_var": 3214.161962890625, "learning_rate": 0.0001, "loss": 6.8224, "loss/crossentropy": 1.4642736911773682, "loss/hidden": 0.11767578125, "loss/logits": 0.009586405009031296, "loss/reg": 5.230891704559326, "loss/twn": 0.0, "step": 790 }, { "epoch": 0.019775, "grad_norm": 9.125, "grad_norm_var": 3225.1082682291667, "learning_rate": 0.0001, "loss": 7.9752, "loss/crossentropy": 2.6582653522491455, "loss/hidden": 0.0791015625, "loss/logits": 0.007132208906114101, "loss/reg": 5.230742931365967, "loss/twn": 0.0, "step": 791 }, { "epoch": 0.0198, "grad_norm": 14.0625, "grad_norm_var": 3242.157014973958, "learning_rate": 0.0001, "loss": 7.6474, "loss/crossentropy": 2.298769235610962, "loss/hidden": 0.11279296875, "loss/logits": 0.004840749781578779, "loss/reg": 5.2309794425964355, "loss/twn": 0.0, "step": 792 }, { "epoch": 0.019825, "grad_norm": 17.375, "grad_norm_var": 3234.984309895833, "learning_rate": 0.0001, "loss": 7.5051, "loss/crossentropy": 2.1144237518310547, "loss/hidden": 0.1484375, "loss/logits": 0.011511989869177341, "loss/reg": 5.230709552764893, "loss/twn": 0.0, "step": 793 }, { "epoch": 0.01985, "grad_norm": 12.6875, "grad_norm_var": 3232.7632649739585, "learning_rate": 0.0001, "loss": 6.3903, "loss/crossentropy": 0.9461896419525146, "loss/hidden": 0.2080078125, "loss/logits": 0.005450299009680748, "loss/reg": 5.230684757232666, "loss/twn": 0.0, "step": 794 }, { "epoch": 0.019875, "grad_norm": 13.9375, "grad_norm_var": 3255.1077473958335, "learning_rate": 0.0001, "loss": 7.4702, "loss/crossentropy": 2.1320881843566895, "loss/hidden": 0.0986328125, "loss/logits": 0.008736222982406616, "loss/reg": 5.230694770812988, "loss/twn": 0.0, "step": 795 }, { "epoch": 0.0199, "grad_norm": 9.8125, "grad_norm_var": 463.37107747395834, "learning_rate": 0.0001, "loss": 5.7159, "loss/crossentropy": 0.34444770216941833, "loss/hidden": 0.1357421875, "loss/logits": 0.005040573887526989, "loss/reg": 5.230666160583496, "loss/twn": 0.0, "step": 796 }, { "epoch": 0.019925, "grad_norm": 7.125, "grad_norm_var": 470.67771809895834, "learning_rate": 0.0001, "loss": 6.3741, "loss/crossentropy": 0.9926528334617615, "loss/hidden": 0.142578125, "loss/logits": 0.007962658070027828, "loss/reg": 5.23092794418335, "loss/twn": 0.0, "step": 797 }, { "epoch": 0.01995, "grad_norm": 9.0, "grad_norm_var": 474.28489583333334, "learning_rate": 0.0001, "loss": 7.4732, "loss/crossentropy": 2.123459815979004, "loss/hidden": 0.1123046875, "loss/logits": 0.0064601292833685875, "loss/reg": 5.2309346199035645, "loss/twn": 0.0, "step": 798 }, { "epoch": 0.019975, "grad_norm": 17.25, "grad_norm_var": 474.027978515625, "learning_rate": 0.0001, "loss": 7.4841, "loss/crossentropy": 2.1429977416992188, "loss/hidden": 0.10107421875, "loss/logits": 0.009501198306679726, "loss/reg": 5.230530738830566, "loss/twn": 0.0, "step": 799 }, { "epoch": 0.02, "grad_norm": 13.3125, "grad_norm_var": 473.9306640625, "learning_rate": 0.0001, "loss": 7.9787, "loss/crossentropy": 2.5355384349823, "loss/hidden": 0.203125, "loss/logits": 0.009105566889047623, "loss/reg": 5.230905055999756, "loss/twn": 0.0, "step": 800 }, { "epoch": 0.020025, "grad_norm": 15.5625, "grad_norm_var": 9.640087890625, "learning_rate": 0.0001, "loss": 7.6371, "loss/crossentropy": 2.3353328704833984, "loss/hidden": 0.064453125, "loss/logits": 0.00665889261290431, "loss/reg": 5.230616569519043, "loss/twn": 0.0, "step": 801 }, { "epoch": 0.02005, "grad_norm": 64.0, "grad_norm_var": 177.831103515625, "learning_rate": 0.0001, "loss": 8.2014, "loss/crossentropy": 2.8111629486083984, "loss/hidden": 0.1513671875, "loss/logits": 0.008202088996767998, "loss/reg": 5.230618476867676, "loss/twn": 0.0, "step": 802 }, { "epoch": 0.020075, "grad_norm": 12.0625, "grad_norm_var": 178.09733072916666, "learning_rate": 0.0001, "loss": 6.0497, "loss/crossentropy": 0.6109923124313354, "loss/hidden": 0.2021484375, "loss/logits": 0.005807585082948208, "loss/reg": 5.230772495269775, "loss/twn": 0.0, "step": 803 }, { "epoch": 0.0201, "grad_norm": 8.8125, "grad_norm_var": 179.23326822916667, "learning_rate": 0.0001, "loss": 7.8262, "loss/crossentropy": 2.455592393875122, "loss/hidden": 0.1328125, "loss/logits": 0.0069784787483513355, "loss/reg": 5.230773448944092, "loss/twn": 0.0, "step": 804 }, { "epoch": 0.020125, "grad_norm": 73.5, "grad_norm_var": 386.47057291666664, "learning_rate": 0.0001, "loss": 7.1917, "loss/crossentropy": 1.826701283454895, "loss/hidden": 0.125, "loss/logits": 0.0094426479190588, "loss/reg": 5.230578422546387, "loss/twn": 0.0, "step": 805 }, { "epoch": 0.02015, "grad_norm": 10.25, "grad_norm_var": 385.6968098958333, "learning_rate": 0.0001, "loss": 7.9012, "loss/crossentropy": 2.539992094039917, "loss/hidden": 0.12158203125, "loss/logits": 0.008753599599003792, "loss/reg": 5.230825424194336, "loss/twn": 0.0, "step": 806 }, { "epoch": 0.020175, "grad_norm": 19.875, "grad_norm_var": 378.4181640625, "learning_rate": 0.0001, "loss": 6.9229, "loss/crossentropy": 1.5474615097045898, "loss/hidden": 0.1376953125, "loss/logits": 0.00709810946136713, "loss/reg": 5.230693817138672, "loss/twn": 0.0, "step": 807 }, { "epoch": 0.0202, "grad_norm": 10.25, "grad_norm_var": 382.30115559895836, "learning_rate": 0.0001, "loss": 7.2085, "loss/crossentropy": 1.7812546491622925, "loss/hidden": 0.189453125, "loss/logits": 0.007315409369766712, "loss/reg": 5.230427265167236, "loss/twn": 0.0, "step": 808 }, { "epoch": 0.020225, "grad_norm": 9.5, "grad_norm_var": 388.59295247395835, "learning_rate": 0.0001, "loss": 7.8477, "loss/crossentropy": 2.541975498199463, "loss/hidden": 0.07177734375, "loss/logits": 0.003062914125621319, "loss/reg": 5.230909824371338, "loss/twn": 0.0, "step": 809 }, { "epoch": 0.02025, "grad_norm": 15.5625, "grad_norm_var": 386.619384765625, "learning_rate": 0.0001, "loss": 7.3771, "loss/crossentropy": 2.025947332382202, "loss/hidden": 0.11083984375, "loss/logits": 0.009859994053840637, "loss/reg": 5.230443954467773, "loss/twn": 0.0, "step": 810 }, { "epoch": 0.020275, "grad_norm": 8.6875, "grad_norm_var": 392.140087890625, "learning_rate": 0.0001, "loss": 7.2273, "loss/crossentropy": 1.893878698348999, "loss/hidden": 0.09375, "loss/logits": 0.009128249250352383, "loss/reg": 5.2305426597595215, "loss/twn": 0.0, "step": 811 }, { "epoch": 0.0203, "grad_norm": 33.75, "grad_norm_var": 398.5171875, "learning_rate": 0.0001, "loss": 8.1449, "loss/crossentropy": 2.717454195022583, "loss/hidden": 0.1796875, "loss/logits": 0.016887273639440536, "loss/reg": 5.230856895446777, "loss/twn": 0.0, "step": 812 }, { "epoch": 0.020325, "grad_norm": 12.875, "grad_norm_var": 390.30546875, "learning_rate": 0.0001, "loss": 8.3305, "loss/crossentropy": 3.0012078285217285, "loss/hidden": 0.09326171875, "loss/logits": 0.005363960284739733, "loss/reg": 5.230653285980225, "loss/twn": 0.0, "step": 813 }, { "epoch": 0.02035, "grad_norm": 34.0, "grad_norm_var": 389.73255208333336, "learning_rate": 0.0001, "loss": 6.834, "loss/crossentropy": 1.4066799879074097, "loss/hidden": 0.193359375, "loss/logits": 0.003266718937084079, "loss/reg": 5.230650901794434, "loss/twn": 0.0, "step": 814 }, { "epoch": 0.020375, "grad_norm": 326.0, "grad_norm_var": 6133.447395833334, "learning_rate": 0.0001, "loss": 8.109, "loss/crossentropy": 2.785377025604248, "loss/hidden": 0.087890625, "loss/logits": 0.005352129694074392, "loss/reg": 5.230373859405518, "loss/twn": 0.0, "step": 815 }, { "epoch": 0.0204, "grad_norm": 61.75, "grad_norm_var": 6096.425504557292, "learning_rate": 0.0001, "loss": 6.3327, "loss/crossentropy": 0.9138974547386169, "loss/hidden": 0.17578125, "loss/logits": 0.012529873289167881, "loss/reg": 5.230535984039307, "loss/twn": 0.0, "step": 816 }, { "epoch": 0.020425, "grad_norm": 15.8125, "grad_norm_var": 6095.455582682292, "learning_rate": 0.0001, "loss": 8.4489, "loss/crossentropy": 3.057579517364502, "loss/hidden": 0.142578125, "loss/logits": 0.018565690144896507, "loss/reg": 5.230161666870117, "loss/twn": 0.0, "step": 817 }, { "epoch": 0.02045, "grad_norm": 18.125, "grad_norm_var": 6109.504801432292, "learning_rate": 0.0001, "loss": 7.9928, "loss/crossentropy": 2.607743740081787, "loss/hidden": 0.1494140625, "loss/logits": 0.004932316951453686, "loss/reg": 5.230733394622803, "loss/twn": 0.0, "step": 818 }, { "epoch": 0.020475, "grad_norm": 21.25, "grad_norm_var": 6078.197916666667, "learning_rate": 0.0001, "loss": 8.2796, "loss/crossentropy": 2.8803536891937256, "loss/hidden": 0.15234375, "loss/logits": 0.016475437209010124, "loss/reg": 5.230462551116943, "loss/twn": 0.0, "step": 819 }, { "epoch": 0.0205, "grad_norm": 9.6875, "grad_norm_var": 6074.315559895834, "learning_rate": 0.0001, "loss": 7.3712, "loss/crossentropy": 1.9426707029342651, "loss/hidden": 0.1923828125, "loss/logits": 0.005748513620346785, "loss/reg": 5.230403900146484, "loss/twn": 0.0, "step": 820 }, { "epoch": 0.020525, "grad_norm": 10.0, "grad_norm_var": 6064.3275390625, "learning_rate": 0.0001, "loss": 8.233, "loss/crossentropy": 2.895735025405884, "loss/hidden": 0.0986328125, "loss/logits": 0.008359922096133232, "loss/reg": 5.230307102203369, "loss/twn": 0.0, "step": 821 }, { "epoch": 0.02055, "grad_norm": 11.8125, "grad_norm_var": 6058.576806640625, "learning_rate": 0.0001, "loss": 7.0902, "loss/crossentropy": 1.73037588596344, "loss/hidden": 0.11962890625, "loss/logits": 0.009449999779462814, "loss/reg": 5.230698585510254, "loss/twn": 0.0, "step": 822 }, { "epoch": 0.020575, "grad_norm": 11.125, "grad_norm_var": 6085.305322265625, "learning_rate": 0.0001, "loss": 7.9687, "loss/crossentropy": 2.613971471786499, "loss/hidden": 0.11767578125, "loss/logits": 0.0068368250504136086, "loss/reg": 5.230223178863525, "loss/twn": 0.0, "step": 823 }, { "epoch": 0.0206, "grad_norm": 13.625, "grad_norm_var": 6073.468212890625, "learning_rate": 0.0001, "loss": 7.6646, "loss/crossentropy": 2.2981951236724854, "loss/hidden": 0.12451171875, "loss/logits": 0.01143670454621315, "loss/reg": 5.230466842651367, "loss/twn": 0.0, "step": 824 }, { "epoch": 0.020625, "grad_norm": 9.1875, "grad_norm_var": 6074.676302083333, "learning_rate": 0.0001, "loss": 7.9187, "loss/crossentropy": 2.578167676925659, "loss/hidden": 0.10546875, "loss/logits": 0.005032903980463743, "loss/reg": 5.2300333976745605, "loss/twn": 0.0, "step": 825 }, { "epoch": 0.02065, "grad_norm": 12.3125, "grad_norm_var": 6085.2015625, "learning_rate": 0.0001, "loss": 7.4645, "loss/crossentropy": 2.171114444732666, "loss/hidden": 0.0595703125, "loss/logits": 0.003213082440197468, "loss/reg": 5.23060417175293, "loss/twn": 0.0, "step": 826 }, { "epoch": 0.020675, "grad_norm": 7.75, "grad_norm_var": 6088.936181640625, "learning_rate": 0.0001, "loss": 7.2651, "loss/crossentropy": 1.9576009511947632, "loss/hidden": 0.0693359375, "loss/logits": 0.008241134695708752, "loss/reg": 5.229961395263672, "loss/twn": 0.0, "step": 827 }, { "epoch": 0.0207, "grad_norm": 9.1875, "grad_norm_var": 6140.7796875, "learning_rate": 0.0001, "loss": 6.9102, "loss/crossentropy": 1.5859615802764893, "loss/hidden": 0.09130859375, "loss/logits": 0.0022871571127325296, "loss/reg": 5.230637550354004, "loss/twn": 0.0, "step": 828 }, { "epoch": 0.020725, "grad_norm": 8.875, "grad_norm_var": 6154.396354166666, "learning_rate": 0.0001, "loss": 6.8867, "loss/crossentropy": 1.5643407106399536, "loss/hidden": 0.08642578125, "loss/logits": 0.005755975842475891, "loss/reg": 5.230214595794678, "loss/twn": 0.0, "step": 829 }, { "epoch": 0.02075, "grad_norm": 79.5, "grad_norm_var": 6269.947395833334, "learning_rate": 0.0001, "loss": 6.9087, "loss/crossentropy": 1.5483061075210571, "loss/hidden": 0.12451171875, "loss/logits": 0.005769835785031319, "loss/reg": 5.230134963989258, "loss/twn": 0.0, "step": 830 }, { "epoch": 0.020775, "grad_norm": 10.25, "grad_norm_var": 423.63880208333336, "learning_rate": 0.0001, "loss": 8.0232, "loss/crossentropy": 2.6613450050354004, "loss/hidden": 0.126953125, "loss/logits": 0.004578165709972382, "loss/reg": 5.2303147315979, "loss/twn": 0.0, "step": 831 }, { "epoch": 0.0208, "grad_norm": 11.9375, "grad_norm_var": 297.382275390625, "learning_rate": 0.0001, "loss": 7.1745, "loss/crossentropy": 1.7624868154525757, "loss/hidden": 0.1708984375, "loss/logits": 0.0105954110622406, "loss/reg": 5.2305684089660645, "loss/twn": 0.0, "step": 832 }, { "epoch": 0.020825, "grad_norm": 11.8125, "grad_norm_var": 298.63019205729165, "learning_rate": 0.0001, "loss": 8.0084, "loss/crossentropy": 2.670598030090332, "loss/hidden": 0.0986328125, "loss/logits": 0.009010246023535728, "loss/reg": 5.230124473571777, "loss/twn": 0.0, "step": 833 }, { "epoch": 0.02085, "grad_norm": 11.6875, "grad_norm_var": 299.41979166666664, "learning_rate": 0.0001, "loss": 7.9989, "loss/crossentropy": 2.6066150665283203, "loss/hidden": 0.1533203125, "loss/logits": 0.008532057516276836, "loss/reg": 5.230454444885254, "loss/twn": 0.0, "step": 834 }, { "epoch": 0.020875, "grad_norm": 21.375, "grad_norm_var": 299.51451822916664, "learning_rate": 0.0001, "loss": 8.5087, "loss/crossentropy": 3.1679840087890625, "loss/hidden": 0.10107421875, "loss/logits": 0.009620252065360546, "loss/reg": 5.230065822601318, "loss/twn": 0.0, "step": 835 }, { "epoch": 0.0209, "grad_norm": 44.0, "grad_norm_var": 345.89894205729166, "learning_rate": 0.0001, "loss": 7.9201, "loss/crossentropy": 2.6046769618988037, "loss/hidden": 0.080078125, "loss/logits": 0.004920288920402527, "loss/reg": 5.230381488800049, "loss/twn": 0.0, "step": 836 }, { "epoch": 0.020925, "grad_norm": 8.875, "grad_norm_var": 347.14464518229164, "learning_rate": 0.0001, "loss": 7.3371, "loss/crossentropy": 1.9817231893539429, "loss/hidden": 0.11767578125, "loss/logits": 0.007638626731932163, "loss/reg": 5.230074405670166, "loss/twn": 0.0, "step": 837 }, { "epoch": 0.02095, "grad_norm": 42.25, "grad_norm_var": 381.12526041666666, "learning_rate": 0.0001, "loss": 8.104, "loss/crossentropy": 2.708381414413452, "loss/hidden": 0.1513671875, "loss/logits": 0.013956461101770401, "loss/reg": 5.230310440063477, "loss/twn": 0.0, "step": 838 }, { "epoch": 0.020975, "grad_norm": 15.75, "grad_norm_var": 377.2301432291667, "learning_rate": 0.0001, "loss": 7.4726, "loss/crossentropy": 2.0920002460479736, "loss/hidden": 0.142578125, "loss/logits": 0.00805431604385376, "loss/reg": 5.230012893676758, "loss/twn": 0.0, "step": 839 }, { "epoch": 0.021, "grad_norm": 15.5, "grad_norm_var": 375.8815104166667, "learning_rate": 0.0001, "loss": 8.1933, "loss/crossentropy": 2.8643245697021484, "loss/hidden": 0.08642578125, "loss/logits": 0.012173913419246674, "loss/reg": 5.230340480804443, "loss/twn": 0.0, "step": 840 }, { "epoch": 0.021025, "grad_norm": 15.8125, "grad_norm_var": 369.05983072916666, "learning_rate": 0.0001, "loss": 7.1166, "loss/crossentropy": 1.7462717294692993, "loss/hidden": 0.126953125, "loss/logits": 0.0133826844394207, "loss/reg": 5.2299885749816895, "loss/twn": 0.0, "step": 841 }, { "epoch": 0.02105, "grad_norm": 11.25, "grad_norm_var": 370.280322265625, "learning_rate": 0.0001, "loss": 8.2839, "loss/crossentropy": 2.8843467235565186, "loss/hidden": 0.1591796875, "loss/logits": 0.01038344856351614, "loss/reg": 5.229991912841797, "loss/twn": 0.0, "step": 842 }, { "epoch": 0.021075, "grad_norm": 10.5, "grad_norm_var": 366.12810872395835, "learning_rate": 0.0001, "loss": 7.9547, "loss/crossentropy": 2.5944228172302246, "loss/hidden": 0.1201171875, "loss/logits": 0.009886080399155617, "loss/reg": 5.230307579040527, "loss/twn": 0.0, "step": 843 }, { "epoch": 0.0211, "grad_norm": 9.125, "grad_norm_var": 366.22291666666666, "learning_rate": 0.0001, "loss": 7.8067, "loss/crossentropy": 2.4823381900787354, "loss/hidden": 0.08642578125, "loss/logits": 0.007714688777923584, "loss/reg": 5.2301812171936035, "loss/twn": 0.0, "step": 844 }, { "epoch": 0.021125, "grad_norm": 10.125, "grad_norm_var": 364.37786458333335, "learning_rate": 0.0001, "loss": 7.77, "loss/crossentropy": 2.477754592895508, "loss/hidden": 0.058837890625, "loss/logits": 0.0033183712512254715, "loss/reg": 5.2300519943237305, "loss/twn": 0.0, "step": 845 }, { "epoch": 0.02115, "grad_norm": 8.125, "grad_norm_var": 122.33483072916667, "learning_rate": 0.0001, "loss": 6.1579, "loss/crossentropy": 0.859074056148529, "loss/hidden": 0.06689453125, "loss/logits": 0.0018144365167245269, "loss/reg": 5.230114936828613, "loss/twn": 0.0, "step": 846 }, { "epoch": 0.021175, "grad_norm": 11.375, "grad_norm_var": 121.52916666666667, "learning_rate": 0.0001, "loss": 7.8874, "loss/crossentropy": 2.5228824615478516, "loss/hidden": 0.1279296875, "loss/logits": 0.006642586551606655, "loss/reg": 5.229929447174072, "loss/twn": 0.0, "step": 847 }, { "epoch": 0.0212, "grad_norm": 7.84375, "grad_norm_var": 124.91343994140625, "learning_rate": 0.0001, "loss": 7.0446, "loss/crossentropy": 1.6815242767333984, "loss/hidden": 0.123046875, "loss/logits": 0.009667545557022095, "loss/reg": 5.230370044708252, "loss/twn": 0.0, "step": 848 }, { "epoch": 0.021225, "grad_norm": 10.875, "grad_norm_var": 125.48717041015625, "learning_rate": 0.0001, "loss": 7.8503, "loss/crossentropy": 2.5361642837524414, "loss/hidden": 0.0791015625, "loss/logits": 0.004664687905460596, "loss/reg": 5.230417728424072, "loss/twn": 0.0, "step": 849 }, { "epoch": 0.02125, "grad_norm": 8.5625, "grad_norm_var": 127.85452067057291, "learning_rate": 0.0001, "loss": 7.3983, "loss/crossentropy": 2.035668134689331, "loss/hidden": 0.125, "loss/logits": 0.00722795445472002, "loss/reg": 5.230389595031738, "loss/twn": 0.0, "step": 850 }, { "epoch": 0.021275, "grad_norm": 12.5, "grad_norm_var": 126.07258707682291, "learning_rate": 0.0001, "loss": 7.8854, "loss/crossentropy": 2.4939327239990234, "loss/hidden": 0.1494140625, "loss/logits": 0.012096907943487167, "loss/reg": 5.229991912841797, "loss/twn": 0.0, "step": 851 }, { "epoch": 0.0213, "grad_norm": 11.5, "grad_norm_var": 67.09016520182291, "learning_rate": 0.0001, "loss": 6.8451, "loss/crossentropy": 1.4572025537490845, "loss/hidden": 0.14453125, "loss/logits": 0.013090159744024277, "loss/reg": 5.230262756347656, "loss/twn": 0.0, "step": 852 }, { "epoch": 0.021325, "grad_norm": 12.0625, "grad_norm_var": 65.91975504557291, "learning_rate": 0.0001, "loss": 7.7312, "loss/crossentropy": 2.376828193664551, "loss/hidden": 0.115234375, "loss/logits": 0.009080484509468079, "loss/reg": 5.230012893676758, "loss/twn": 0.0, "step": 853 }, { "epoch": 0.02135, "grad_norm": 11.75, "grad_norm_var": 6.420926920572916, "learning_rate": 0.0001, "loss": 7.0291, "loss/crossentropy": 1.6736173629760742, "loss/hidden": 0.12060546875, "loss/logits": 0.004477534908801317, "loss/reg": 5.230405807495117, "loss/twn": 0.0, "step": 854 }, { "epoch": 0.021375, "grad_norm": 12.0, "grad_norm_var": 5.132840983072916, "learning_rate": 0.0001, "loss": 6.8586, "loss/crossentropy": 1.5102664232254028, "loss/hidden": 0.1103515625, "loss/logits": 0.008198726922273636, "loss/reg": 5.229762077331543, "loss/twn": 0.0, "step": 855 }, { "epoch": 0.0214, "grad_norm": 38.0, "grad_norm_var": 49.72854410807292, "learning_rate": 0.0001, "loss": 7.6098, "loss/crossentropy": 2.24526047706604, "loss/hidden": 0.12255859375, "loss/logits": 0.011424753814935684, "loss/reg": 5.23051118850708, "loss/twn": 0.0, "step": 856 }, { "epoch": 0.021425, "grad_norm": 164.0, "grad_norm_var": 1485.9123982747396, "learning_rate": 0.0001, "loss": 7.3561, "loss/crossentropy": 1.972996711730957, "loss/hidden": 0.14453125, "loss/logits": 0.008586418814957142, "loss/reg": 5.230007648468018, "loss/twn": 0.0, "step": 857 }, { "epoch": 0.02145, "grad_norm": 24.25, "grad_norm_var": 1478.1022420247396, "learning_rate": 0.0001, "loss": 7.1992, "loss/crossentropy": 1.7982319593429565, "loss/hidden": 0.1611328125, "loss/logits": 0.009712583385407925, "loss/reg": 5.230077743530273, "loss/twn": 0.0, "step": 858 }, { "epoch": 0.021475, "grad_norm": 55.0, "grad_norm_var": 1529.7060180664062, "learning_rate": 0.0001, "loss": 6.8691, "loss/crossentropy": 1.487571358680725, "loss/hidden": 0.1416015625, "loss/logits": 0.009719829075038433, "loss/reg": 5.230188369750977, "loss/twn": 0.0, "step": 859 }, { "epoch": 0.0215, "grad_norm": 37.75, "grad_norm_var": 1518.6361938476562, "learning_rate": 0.0001, "loss": 6.2527, "loss/crossentropy": 0.7488301396369934, "loss/hidden": 0.263671875, "loss/logits": 0.010243739932775497, "loss/reg": 5.229933261871338, "loss/twn": 0.0, "step": 860 }, { "epoch": 0.021525, "grad_norm": 22.75, "grad_norm_var": 1499.8006144205729, "learning_rate": 0.0001, "loss": 6.8342, "loss/crossentropy": 1.4506139755249023, "loss/hidden": 0.150390625, "loss/logits": 0.002946457825601101, "loss/reg": 5.23027229309082, "loss/twn": 0.0, "step": 861 }, { "epoch": 0.02155, "grad_norm": 26.125, "grad_norm_var": 1472.299051920573, "learning_rate": 0.0001, "loss": 8.298, "loss/crossentropy": 2.793138265609741, "loss/hidden": 0.2578125, "loss/logits": 0.016960376873612404, "loss/reg": 5.230113506317139, "loss/twn": 0.0, "step": 862 }, { "epoch": 0.021575, "grad_norm": 12.0, "grad_norm_var": 1470.842508951823, "learning_rate": 0.0001, "loss": 8.426, "loss/crossentropy": 3.0826714038848877, "loss/hidden": 0.10302734375, "loss/logits": 0.009999147616326809, "loss/reg": 5.230251789093018, "loss/twn": 0.0, "step": 863 }, { "epoch": 0.0216, "grad_norm": 170.0, "grad_norm_var": 2652.8306640625, "learning_rate": 0.0001, "loss": 8.011, "loss/crossentropy": 2.557135820388794, "loss/hidden": 0.205078125, "loss/logits": 0.018819302320480347, "loss/reg": 5.2300004959106445, "loss/twn": 0.0, "step": 864 }, { "epoch": 0.021625, "grad_norm": 9.8125, "grad_norm_var": 2656.930973307292, "learning_rate": 0.0001, "loss": 6.4711, "loss/crossentropy": 1.1575171947479248, "loss/hidden": 0.0791015625, "loss/logits": 0.004193156957626343, "loss/reg": 5.230248928070068, "loss/twn": 0.0, "step": 865 }, { "epoch": 0.02165, "grad_norm": 10.125, "grad_norm_var": 2650.689518229167, "learning_rate": 0.0001, "loss": 7.0724, "loss/crossentropy": 1.6960041522979736, "loss/hidden": 0.13671875, "loss/logits": 0.009182040579617023, "loss/reg": 5.230460166931152, "loss/twn": 0.0, "step": 866 }, { "epoch": 0.021675, "grad_norm": 7.875, "grad_norm_var": 2668.584895833333, "learning_rate": 0.0001, "loss": 7.1586, "loss/crossentropy": 1.821337103843689, "loss/hidden": 0.10107421875, "loss/logits": 0.00650972593575716, "loss/reg": 5.229717254638672, "loss/twn": 0.0, "step": 867 }, { "epoch": 0.0217, "grad_norm": 14.6875, "grad_norm_var": 2657.5058430989584, "learning_rate": 0.0001, "loss": 8.054, "loss/crossentropy": 2.651723623275757, "loss/hidden": 0.1552734375, "loss/logits": 0.01694151759147644, "loss/reg": 5.230074405670166, "loss/twn": 0.0, "step": 868 }, { "epoch": 0.021725, "grad_norm": 17.5, "grad_norm_var": 2639.634309895833, "learning_rate": 0.0001, "loss": 7.8281, "loss/crossentropy": 2.49051570892334, "loss/hidden": 0.0986328125, "loss/logits": 0.009045520797371864, "loss/reg": 5.229867935180664, "loss/twn": 0.0, "step": 869 }, { "epoch": 0.02175, "grad_norm": 13.4375, "grad_norm_var": 2633.545686848958, "learning_rate": 0.0001, "loss": 7.0012, "loss/crossentropy": 1.546895146369934, "loss/hidden": 0.21484375, "loss/logits": 0.009267905727028847, "loss/reg": 5.230212211608887, "loss/twn": 0.0, "step": 870 }, { "epoch": 0.021775, "grad_norm": 14.9375, "grad_norm_var": 2623.2330729166665, "learning_rate": 0.0001, "loss": 7.582, "loss/crossentropy": 2.2645156383514404, "loss/hidden": 0.08154296875, "loss/logits": 0.006237420719116926, "loss/reg": 5.229740619659424, "loss/twn": 0.0, "step": 871 }, { "epoch": 0.0218, "grad_norm": 12.6875, "grad_norm_var": 2669.6590983072915, "learning_rate": 0.0001, "loss": 6.8665, "loss/crossentropy": 1.5306649208068848, "loss/hidden": 0.1005859375, "loss/logits": 0.004954389296472073, "loss/reg": 5.230282306671143, "loss/twn": 0.0, "step": 872 }, { "epoch": 0.021825, "grad_norm": 35.0, "grad_norm_var": 1547.8294108072917, "learning_rate": 0.0001, "loss": 8.1648, "loss/crossentropy": 2.7598485946655273, "loss/hidden": 0.1611328125, "loss/logits": 0.014029700309038162, "loss/reg": 5.229771137237549, "loss/twn": 0.0, "step": 873 }, { "epoch": 0.02185, "grad_norm": 9.375, "grad_norm_var": 1573.5507649739584, "learning_rate": 0.0001, "loss": 6.9286, "loss/crossentropy": 1.5998884439468384, "loss/hidden": 0.09521484375, "loss/logits": 0.003966475836932659, "loss/reg": 5.2295050621032715, "loss/twn": 0.0, "step": 874 }, { "epoch": 0.021875, "grad_norm": 13.375, "grad_norm_var": 1539.2968587239584, "learning_rate": 0.0001, "loss": 8.1337, "loss/crossentropy": 2.756443977355957, "loss/hidden": 0.1357421875, "loss/logits": 0.011736356653273106, "loss/reg": 5.229776859283447, "loss/twn": 0.0, "step": 875 }, { "epoch": 0.0219, "grad_norm": 14.1875, "grad_norm_var": 1539.3275390625, "learning_rate": 0.0001, "loss": 8.4196, "loss/crossentropy": 3.001187324523926, "loss/hidden": 0.171875, "loss/logits": 0.01653527095913887, "loss/reg": 5.230004787445068, "loss/twn": 0.0, "step": 876 }, { "epoch": 0.021925, "grad_norm": 10.0625, "grad_norm_var": 1553.6042805989584, "learning_rate": 0.0001, "loss": 7.6758, "loss/crossentropy": 2.371596097946167, "loss/hidden": 0.0693359375, "loss/logits": 0.004934161901473999, "loss/reg": 5.229954242706299, "loss/twn": 0.0, "step": 877 }, { "epoch": 0.02195, "grad_norm": 15.25, "grad_norm_var": 1558.5659993489583, "learning_rate": 0.0001, "loss": 7.0978, "loss/crossentropy": 1.6408387422561646, "loss/hidden": 0.208984375, "loss/logits": 0.017855621874332428, "loss/reg": 5.230149745941162, "loss/twn": 0.0, "step": 878 }, { "epoch": 0.021975, "grad_norm": 11.6875, "grad_norm_var": 1559.0625, "learning_rate": 0.0001, "loss": 7.826, "loss/crossentropy": 2.489450454711914, "loss/hidden": 0.10107421875, "loss/logits": 0.005990723147988319, "loss/reg": 5.22952938079834, "loss/twn": 0.0, "step": 879 }, { "epoch": 0.022, "grad_norm": 142.0, "grad_norm_var": 1062.0625, "learning_rate": 0.0001, "loss": 6.2769, "loss/crossentropy": 0.8815757632255554, "loss/hidden": 0.1513671875, "loss/logits": 0.01392771303653717, "loss/reg": 5.230021953582764, "loss/twn": 0.0, "step": 880 }, { "epoch": 0.022025, "grad_norm": 9.75, "grad_norm_var": 1062.164306640625, "learning_rate": 0.0001, "loss": 7.2034, "loss/crossentropy": 1.8355108499526978, "loss/hidden": 0.1318359375, "loss/logits": 0.00601241085678339, "loss/reg": 5.230048656463623, "loss/twn": 0.0, "step": 881 }, { "epoch": 0.02205, "grad_norm": 10.5625, "grad_norm_var": 1061.4837890625, "learning_rate": 0.0001, "loss": 6.8518, "loss/crossentropy": 1.397723913192749, "loss/hidden": 0.2158203125, "loss/logits": 0.00827928725630045, "loss/reg": 5.229991912841797, "loss/twn": 0.0, "step": 882 }, { "epoch": 0.022075, "grad_norm": 10.6875, "grad_norm_var": 1056.672509765625, "learning_rate": 0.0001, "loss": 8.2004, "loss/crossentropy": 2.9133853912353516, "loss/hidden": 0.05224609375, "loss/logits": 0.005087848752737045, "loss/reg": 5.2297210693359375, "loss/twn": 0.0, "step": 883 }, { "epoch": 0.0221, "grad_norm": 37.75, "grad_norm_var": 1066.81640625, "learning_rate": 0.0001, "loss": 6.094, "loss/crossentropy": 0.7456091046333313, "loss/hidden": 0.1162109375, "loss/logits": 0.002344908192753792, "loss/reg": 5.229843616485596, "loss/twn": 0.0, "step": 884 }, { "epoch": 0.022125, "grad_norm": 14.3125, "grad_norm_var": 1070.061181640625, "learning_rate": 0.0001, "loss": 7.1178, "loss/crossentropy": 1.7599776983261108, "loss/hidden": 0.12109375, "loss/logits": 0.0070150988176465034, "loss/reg": 5.229736804962158, "loss/twn": 0.0, "step": 885 }, { "epoch": 0.02215, "grad_norm": 18.0, "grad_norm_var": 1065.2764973958333, "learning_rate": 0.0001, "loss": 6.9191, "loss/crossentropy": 1.5698696374893188, "loss/hidden": 0.10986328125, "loss/logits": 0.009311170317232609, "loss/reg": 5.230006217956543, "loss/twn": 0.0, "step": 886 }, { "epoch": 0.022175, "grad_norm": 12.875, "grad_norm_var": 1067.9593587239583, "learning_rate": 0.0001, "loss": 7.0143, "loss/crossentropy": 1.6162168979644775, "loss/hidden": 0.1552734375, "loss/logits": 0.013361955992877483, "loss/reg": 5.229493618011475, "loss/twn": 0.0, "step": 887 }, { "epoch": 0.0222, "grad_norm": 13.4375, "grad_norm_var": 1066.9034993489583, "learning_rate": 0.0001, "loss": 8.1799, "loss/crossentropy": 2.848146915435791, "loss/hidden": 0.09375, "loss/logits": 0.008583602495491505, "loss/reg": 5.229381561279297, "loss/twn": 0.0, "step": 888 }, { "epoch": 0.022225, "grad_norm": 18.75, "grad_norm_var": 1058.8038899739583, "learning_rate": 0.0001, "loss": 8.1476, "loss/crossentropy": 2.7504210472106934, "loss/hidden": 0.158203125, "loss/logits": 0.00982433557510376, "loss/reg": 5.229184627532959, "loss/twn": 0.0, "step": 889 }, { "epoch": 0.02225, "grad_norm": 57.25, "grad_norm_var": 1117.4507649739583, "learning_rate": 0.0001, "loss": 6.9922, "loss/crossentropy": 1.6320585012435913, "loss/hidden": 0.1171875, "loss/logits": 0.01302691176533699, "loss/reg": 5.229902267456055, "loss/twn": 0.0, "step": 890 }, { "epoch": 0.022275, "grad_norm": 11.125, "grad_norm_var": 1121.4409993489583, "learning_rate": 0.0001, "loss": 7.9641, "loss/crossentropy": 2.6222574710845947, "loss/hidden": 0.10107421875, "loss/logits": 0.011231745593249798, "loss/reg": 5.229504108428955, "loss/twn": 0.0, "step": 891 }, { "epoch": 0.0223, "grad_norm": 7.65625, "grad_norm_var": 1133.9413696289062, "learning_rate": 0.0001, "loss": 7.5977, "loss/crossentropy": 2.262641191482544, "loss/hidden": 0.09619140625, "loss/logits": 0.008943114429712296, "loss/reg": 5.229957103729248, "loss/twn": 0.0, "step": 892 }, { "epoch": 0.022325, "grad_norm": 11.3125, "grad_norm_var": 1131.5373982747396, "learning_rate": 0.0001, "loss": 7.885, "loss/crossentropy": 2.514071226119995, "loss/hidden": 0.134765625, "loss/logits": 0.006755891256034374, "loss/reg": 5.229411602020264, "loss/twn": 0.0, "step": 893 }, { "epoch": 0.02235, "grad_norm": 31.875, "grad_norm_var": 1126.865946451823, "learning_rate": 0.0001, "loss": 6.8167, "loss/crossentropy": 1.4057066440582275, "loss/hidden": 0.17578125, "loss/logits": 0.005477352067828178, "loss/reg": 5.229723930358887, "loss/twn": 0.0, "step": 894 }, { "epoch": 0.022375, "grad_norm": 10.375, "grad_norm_var": 1129.511454264323, "learning_rate": 0.0001, "loss": 7.7274, "loss/crossentropy": 2.414677381515503, "loss/hidden": 0.0791015625, "loss/logits": 0.004009230528026819, "loss/reg": 5.229562282562256, "loss/twn": 0.0, "step": 895 }, { "epoch": 0.0224, "grad_norm": 11.125, "grad_norm_var": 177.70256754557292, "learning_rate": 0.0001, "loss": 6.6999, "loss/crossentropy": 1.3421200513839722, "loss/hidden": 0.1220703125, "loss/logits": 0.006061128340661526, "loss/reg": 5.229669094085693, "loss/twn": 0.0, "step": 896 }, { "epoch": 0.022425, "grad_norm": 20.125, "grad_norm_var": 173.1175740559896, "learning_rate": 0.0001, "loss": 7.0618, "loss/crossentropy": 1.6401444673538208, "loss/hidden": 0.1806640625, "loss/logits": 0.011477080173790455, "loss/reg": 5.229546546936035, "loss/twn": 0.0, "step": 897 }, { "epoch": 0.02245, "grad_norm": 9.5, "grad_norm_var": 174.32340087890626, "learning_rate": 0.0001, "loss": 7.0679, "loss/crossentropy": 1.6865355968475342, "loss/hidden": 0.1435546875, "loss/logits": 0.007968233898282051, "loss/reg": 5.229793071746826, "loss/twn": 0.0, "step": 898 }, { "epoch": 0.022475, "grad_norm": 9.5625, "grad_norm_var": 175.57584228515626, "learning_rate": 0.0001, "loss": 6.9239, "loss/crossentropy": 1.5547279119491577, "loss/hidden": 0.1328125, "loss/logits": 0.006813929416239262, "loss/reg": 5.229519844055176, "loss/twn": 0.0, "step": 899 }, { "epoch": 0.0225, "grad_norm": 15.1875, "grad_norm_var": 149.29993082682293, "learning_rate": 0.0001, "loss": 7.6572, "loss/crossentropy": 2.416555166244507, "loss/hidden": 0.0093994140625, "loss/logits": 0.001800237107090652, "loss/reg": 5.229480266571045, "loss/twn": 0.0, "step": 900 }, { "epoch": 0.022525, "grad_norm": 21.5, "grad_norm_var": 149.9250935872396, "learning_rate": 0.0001, "loss": 8.2505, "loss/crossentropy": 2.8510868549346924, "loss/hidden": 0.1591796875, "loss/logits": 0.01060121227055788, "loss/reg": 5.229599952697754, "loss/twn": 0.0, "step": 901 }, { "epoch": 0.02255, "grad_norm": 9.6875, "grad_norm_var": 153.66571858723958, "learning_rate": 0.0001, "loss": 7.8264, "loss/crossentropy": 2.532890796661377, "loss/hidden": 0.0595703125, "loss/logits": 0.004101074766367674, "loss/reg": 5.229794979095459, "loss/twn": 0.0, "step": 902 }, { "epoch": 0.022575, "grad_norm": 41.75, "grad_norm_var": 190.05273030598957, "learning_rate": 0.0001, "loss": 7.0146, "loss/crossentropy": 1.6412537097930908, "loss/hidden": 0.134765625, "loss/logits": 0.008777379989624023, "loss/reg": 5.229798793792725, "loss/twn": 0.0, "step": 903 }, { "epoch": 0.0226, "grad_norm": 15.625, "grad_norm_var": 188.79833577473957, "learning_rate": 0.0001, "loss": 8.2633, "loss/crossentropy": 2.9667930603027344, "loss/hidden": 0.06201171875, "loss/logits": 0.005043432116508484, "loss/reg": 5.229448318481445, "loss/twn": 0.0, "step": 904 }, { "epoch": 0.022625, "grad_norm": 12.4375, "grad_norm_var": 191.41539306640624, "learning_rate": 0.0001, "loss": 8.0256, "loss/crossentropy": 2.680783987045288, "loss/hidden": 0.10986328125, "loss/logits": 0.005312731955200434, "loss/reg": 5.229615211486816, "loss/twn": 0.0, "step": 905 }, { "epoch": 0.02265, "grad_norm": 16.625, "grad_norm_var": 84.70071207682291, "learning_rate": 0.0001, "loss": 7.5648, "loss/crossentropy": 2.088520050048828, "loss/hidden": 0.234375, "loss/logits": 0.012642334215342999, "loss/reg": 5.229223251342773, "loss/twn": 0.0, "step": 906 }, { "epoch": 0.022675, "grad_norm": 24.125, "grad_norm_var": 86.87076416015626, "learning_rate": 0.0001, "loss": 5.9125, "loss/crossentropy": 0.48404479026794434, "loss/hidden": 0.1904296875, "loss/logits": 0.008365976624190807, "loss/reg": 5.2296600341796875, "loss/twn": 0.0, "step": 907 }, { "epoch": 0.0227, "grad_norm": 11.0, "grad_norm_var": 83.502197265625, "learning_rate": 0.0001, "loss": 6.8349, "loss/crossentropy": 1.4288034439086914, "loss/hidden": 0.1669921875, "loss/logits": 0.009537655860185623, "loss/reg": 5.229605674743652, "loss/twn": 0.0, "step": 908 }, { "epoch": 0.022725, "grad_norm": 16.75, "grad_norm_var": 81.23515625, "learning_rate": 0.0001, "loss": 7.9611, "loss/crossentropy": 2.5989267826080322, "loss/hidden": 0.1201171875, "loss/logits": 0.012637370266020298, "loss/reg": 5.22941255569458, "loss/twn": 0.0, "step": 909 }, { "epoch": 0.02275, "grad_norm": 15.8125, "grad_norm_var": 66.20584309895834, "learning_rate": 0.0001, "loss": 7.4412, "loss/crossentropy": 2.0751564502716064, "loss/hidden": 0.125, "loss/logits": 0.011572781018912792, "loss/reg": 5.229443550109863, "loss/twn": 0.0, "step": 910 }, { "epoch": 0.022775, "grad_norm": 13.0, "grad_norm_var": 64.55428059895833, "learning_rate": 0.0001, "loss": 6.7192, "loss/crossentropy": 1.3851910829544067, "loss/hidden": 0.09716796875, "loss/logits": 0.007330389227718115, "loss/reg": 5.229530334472656, "loss/twn": 0.0, "step": 911 }, { "epoch": 0.0228, "grad_norm": 12.125, "grad_norm_var": 63.901676432291666, "learning_rate": 0.0001, "loss": 7.152, "loss/crossentropy": 1.8044177293777466, "loss/hidden": 0.1123046875, "loss/logits": 0.005986911244690418, "loss/reg": 5.22929048538208, "loss/twn": 0.0, "step": 912 }, { "epoch": 0.022825, "grad_norm": 8.8125, "grad_norm_var": 66.50885416666667, "learning_rate": 0.0001, "loss": 8.4326, "loss/crossentropy": 3.201655864715576, "loss/hidden": 4.380941390991211e-06, "loss/logits": 0.0015021440340206027, "loss/reg": 5.229450702667236, "loss/twn": 0.0, "step": 913 }, { "epoch": 0.02285, "grad_norm": 10.3125, "grad_norm_var": 65.86287434895833, "learning_rate": 0.0001, "loss": 6.5774, "loss/crossentropy": 1.2178183794021606, "loss/hidden": 0.12255859375, "loss/logits": 0.007649564184248447, "loss/reg": 5.22938346862793, "loss/twn": 0.0, "step": 914 }, { "epoch": 0.022875, "grad_norm": 13.0, "grad_norm_var": 63.69921875, "learning_rate": 0.0001, "loss": 7.8085, "loss/crossentropy": 2.4297561645507812, "loss/hidden": 0.1376953125, "loss/logits": 0.011574456468224525, "loss/reg": 5.2294511795043945, "loss/twn": 0.0, "step": 915 }, { "epoch": 0.0229, "grad_norm": 13.25, "grad_norm_var": 64.17198893229167, "learning_rate": 0.0001, "loss": 7.9386, "loss/crossentropy": 2.616173028945923, "loss/hidden": 0.08642578125, "loss/logits": 0.0062749385833740234, "loss/reg": 5.229771137237549, "loss/twn": 0.0, "step": 916 }, { "epoch": 0.022925, "grad_norm": 10.6875, "grad_norm_var": 63.5328125, "learning_rate": 0.0001, "loss": 8.1081, "loss/crossentropy": 2.7487690448760986, "loss/hidden": 0.1201171875, "loss/logits": 0.009604476392269135, "loss/reg": 5.2296576499938965, "loss/twn": 0.0, "step": 917 }, { "epoch": 0.02295, "grad_norm": 21.625, "grad_norm_var": 63.486181640625, "learning_rate": 0.0001, "loss": 7.0533, "loss/crossentropy": 1.7067608833312988, "loss/hidden": 0.1123046875, "loss/logits": 0.004787761718034744, "loss/reg": 5.2294087409973145, "loss/twn": 0.0, "step": 918 }, { "epoch": 0.022975, "grad_norm": 14.6875, "grad_norm_var": 16.556705729166666, "learning_rate": 0.0001, "loss": 6.862, "loss/crossentropy": 1.5273187160491943, "loss/hidden": 0.0986328125, "loss/logits": 0.006548475474119186, "loss/reg": 5.229493618011475, "loss/twn": 0.0, "step": 919 }, { "epoch": 0.023, "grad_norm": 10.25, "grad_norm_var": 17.4609375, "learning_rate": 0.0001, "loss": 8.096, "loss/crossentropy": 2.765378952026367, "loss/hidden": 0.09375, "loss/logits": 0.007458665873855352, "loss/reg": 5.229459285736084, "loss/twn": 0.0, "step": 920 }, { "epoch": 0.023025, "grad_norm": 15.875, "grad_norm_var": 17.468994140625, "learning_rate": 0.0001, "loss": 6.2566, "loss/crossentropy": 0.8696529865264893, "loss/hidden": 0.1484375, "loss/logits": 0.009295967407524586, "loss/reg": 5.229222297668457, "loss/twn": 0.0, "step": 921 }, { "epoch": 0.02305, "grad_norm": 14.5, "grad_norm_var": 17.077197265625, "learning_rate": 0.0001, "loss": 7.8853, "loss/crossentropy": 2.496795892715454, "loss/hidden": 0.1494140625, "loss/logits": 0.009545085951685905, "loss/reg": 5.229542255401611, "loss/twn": 0.0, "step": 922 }, { "epoch": 0.023075, "grad_norm": 23.25, "grad_norm_var": 15.957014973958334, "learning_rate": 0.0001, "loss": 7.049, "loss/crossentropy": 1.6003493070602417, "loss/hidden": 0.2060546875, "loss/logits": 0.013520617038011551, "loss/reg": 5.229060173034668, "loss/twn": 0.0, "step": 923 }, { "epoch": 0.0231, "grad_norm": 9.75, "grad_norm_var": 16.564436848958334, "learning_rate": 0.0001, "loss": 6.8671, "loss/crossentropy": 1.5159342288970947, "loss/hidden": 0.10986328125, "loss/logits": 0.011563955806195736, "loss/reg": 5.2297444343566895, "loss/twn": 0.0, "step": 924 }, { "epoch": 0.023125, "grad_norm": 9.125, "grad_norm_var": 17.382535807291667, "learning_rate": 0.0001, "loss": 7.8368, "loss/crossentropy": 2.537524938583374, "loss/hidden": 0.064453125, "loss/logits": 0.005966213531792164, "loss/reg": 5.228902339935303, "loss/twn": 0.0, "step": 925 }, { "epoch": 0.02315, "grad_norm": 10.6875, "grad_norm_var": 17.446598307291666, "learning_rate": 0.0001, "loss": 7.097, "loss/crossentropy": 1.695339560508728, "loss/hidden": 0.1630859375, "loss/logits": 0.009303221479058266, "loss/reg": 5.229240894317627, "loss/twn": 0.0, "step": 926 }, { "epoch": 0.023175, "grad_norm": 10.375, "grad_norm_var": 17.941520182291665, "learning_rate": 0.0001, "loss": 8.0381, "loss/crossentropy": 2.6705760955810547, "loss/hidden": 0.12890625, "loss/logits": 0.00920666940510273, "loss/reg": 5.229386329650879, "loss/twn": 0.0, "step": 927 }, { "epoch": 0.0232, "grad_norm": 8.75, "grad_norm_var": 19.055973307291666, "learning_rate": 0.0001, "loss": 7.8173, "loss/crossentropy": 2.529360055923462, "loss/hidden": 0.0546875, "loss/logits": 0.003768081543967128, "loss/reg": 5.229437351226807, "loss/twn": 0.0, "step": 928 }, { "epoch": 0.023225, "grad_norm": 74.5, "grad_norm_var": 253.73483072916667, "learning_rate": 0.0001, "loss": 6.3033, "loss/crossentropy": 0.9277183413505554, "loss/hidden": 0.138671875, "loss/logits": 0.007658226415514946, "loss/reg": 5.229298114776611, "loss/twn": 0.0, "step": 929 }, { "epoch": 0.02325, "grad_norm": 19.875, "grad_norm_var": 251.03292643229167, "learning_rate": 0.0001, "loss": 8.3524, "loss/crossentropy": 2.9925942420959473, "loss/hidden": 0.1201171875, "loss/logits": 0.010561013594269753, "loss/reg": 5.229130268096924, "loss/twn": 0.0, "step": 930 }, { "epoch": 0.023275, "grad_norm": 25.5, "grad_norm_var": 253.27902018229167, "learning_rate": 0.0001, "loss": 7.8689, "loss/crossentropy": 2.5310251712799072, "loss/hidden": 0.0986328125, "loss/logits": 0.010016044601798058, "loss/reg": 5.229218482971191, "loss/twn": 0.0, "step": 931 }, { "epoch": 0.0233, "grad_norm": 138.0, "grad_norm_var": 1142.0577962239583, "learning_rate": 0.0001, "loss": 6.7784, "loss/crossentropy": 1.2963286638259888, "loss/hidden": 0.2431640625, "loss/logits": 0.00944933295249939, "loss/reg": 5.229430198669434, "loss/twn": 0.0, "step": 932 }, { "epoch": 0.023325, "grad_norm": 12.0625, "grad_norm_var": 1139.352197265625, "learning_rate": 0.0001, "loss": 8.0218, "loss/crossentropy": 2.6391210556030273, "loss/hidden": 0.142578125, "loss/logits": 0.011118912138044834, "loss/reg": 5.229004383087158, "loss/twn": 0.0, "step": 933 }, { "epoch": 0.02335, "grad_norm": 10.75, "grad_norm_var": 1153.342431640625, "learning_rate": 0.0001, "loss": 7.3416, "loss/crossentropy": 2.0831658840179443, "loss/hidden": 0.0279541015625, "loss/logits": 0.0012758576776832342, "loss/reg": 5.229192733764648, "loss/twn": 0.0, "step": 934 }, { "epoch": 0.023375, "grad_norm": 29.125, "grad_norm_var": 1145.5634765625, "learning_rate": 0.0001, "loss": 6.8365, "loss/crossentropy": 1.4616435766220093, "loss/hidden": 0.138671875, "loss/logits": 0.007113803178071976, "loss/reg": 5.229030132293701, "loss/twn": 0.0, "step": 935 }, { "epoch": 0.0234, "grad_norm": 13.625, "grad_norm_var": 1139.00859375, "learning_rate": 0.0001, "loss": 6.1257, "loss/crossentropy": 0.7412286400794983, "loss/hidden": 0.1484375, "loss/logits": 0.006910163909196854, "loss/reg": 5.229117393493652, "loss/twn": 0.0, "step": 936 }, { "epoch": 0.023425, "grad_norm": 98.5, "grad_norm_var": 1447.4322265625, "learning_rate": 0.0001, "loss": 7.7219, "loss/crossentropy": 2.3482983112335205, "loss/hidden": 0.130859375, "loss/logits": 0.013444026932120323, "loss/reg": 5.229300498962402, "loss/twn": 0.0, "step": 937 }, { "epoch": 0.02345, "grad_norm": 6.3125, "grad_norm_var": 1470.478759765625, "learning_rate": 0.0001, "loss": 6.4927, "loss/crossentropy": 1.20167076587677, "loss/hidden": 0.0595703125, "loss/logits": 0.0022249873727560043, "loss/reg": 5.229248523712158, "loss/twn": 0.0, "step": 938 }, { "epoch": 0.023475, "grad_norm": 9.3125, "grad_norm_var": 1497.5080729166666, "learning_rate": 0.0001, "loss": 7.1103, "loss/crossentropy": 1.7726614475250244, "loss/hidden": 0.0986328125, "loss/logits": 0.009686892852187157, "loss/reg": 5.2292704582214355, "loss/twn": 0.0, "step": 939 }, { "epoch": 0.0235, "grad_norm": 62.75, "grad_norm_var": 1527.21015625, "learning_rate": 0.0001, "loss": 6.9551, "loss/crossentropy": 1.6360843181610107, "loss/hidden": 0.08642578125, "loss/logits": 0.0034621984232217073, "loss/reg": 5.229147434234619, "loss/twn": 0.0, "step": 940 }, { "epoch": 0.023525, "grad_norm": 25.0, "grad_norm_var": 1490.9374348958333, "learning_rate": 0.0001, "loss": 6.9106, "loss/crossentropy": 1.543732762336731, "loss/hidden": 0.12890625, "loss/logits": 0.008542709052562714, "loss/reg": 5.229380130767822, "loss/twn": 0.0, "step": 941 }, { "epoch": 0.02355, "grad_norm": 15.0, "grad_norm_var": 1478.2952962239583, "learning_rate": 0.0001, "loss": 8.1479, "loss/crossentropy": 2.7828969955444336, "loss/hidden": 0.12255859375, "loss/logits": 0.013092401437461376, "loss/reg": 5.229334831237793, "loss/twn": 0.0, "step": 942 }, { "epoch": 0.023575, "grad_norm": 10.375, "grad_norm_var": 1478.2952962239583, "learning_rate": 0.0001, "loss": 6.8721, "loss/crossentropy": 1.47151517868042, "loss/hidden": 0.1611328125, "loss/logits": 0.010217259638011456, "loss/reg": 5.229234218597412, "loss/twn": 0.0, "step": 943 }, { "epoch": 0.0236, "grad_norm": 10.5, "grad_norm_var": 1472.3699055989584, "learning_rate": 0.0001, "loss": 6.1565, "loss/crossentropy": 0.7564952373504639, "loss/hidden": 0.1630859375, "loss/logits": 0.007633813191205263, "loss/reg": 5.229316234588623, "loss/twn": 0.0, "step": 944 }, { "epoch": 0.023625, "grad_norm": 12.0, "grad_norm_var": 1387.962353515625, "learning_rate": 0.0001, "loss": 8.4192, "loss/crossentropy": 3.100782871246338, "loss/hidden": 0.08154296875, "loss/logits": 0.00750060984864831, "loss/reg": 5.22934627532959, "loss/twn": 0.0, "step": 945 }, { "epoch": 0.02365, "grad_norm": 10.5, "grad_norm_var": 1407.571728515625, "learning_rate": 0.0001, "loss": 7.9888, "loss/crossentropy": 2.6322426795959473, "loss/hidden": 0.11962890625, "loss/logits": 0.008000584319233894, "loss/reg": 5.2289581298828125, "loss/twn": 0.0, "step": 946 }, { "epoch": 0.023675, "grad_norm": 15.5625, "grad_norm_var": 1420.4775390625, "learning_rate": 0.0001, "loss": 8.039, "loss/crossentropy": 2.6621694564819336, "loss/hidden": 0.138671875, "loss/logits": 0.008840564638376236, "loss/reg": 5.229346752166748, "loss/twn": 0.0, "step": 947 }, { "epoch": 0.0237, "grad_norm": 9.1875, "grad_norm_var": 601.947900390625, "learning_rate": 0.0001, "loss": 6.9809, "loss/crossentropy": 1.6236486434936523, "loss/hidden": 0.1201171875, "loss/logits": 0.0081618158146739, "loss/reg": 5.229002475738525, "loss/twn": 0.0, "step": 948 }, { "epoch": 0.023725, "grad_norm": 52.0, "grad_norm_var": 649.196875, "learning_rate": 0.0001, "loss": 7.1623, "loss/crossentropy": 1.6345115900039673, "loss/hidden": 0.294921875, "loss/logits": 0.00377917499281466, "loss/reg": 5.2290778160095215, "loss/twn": 0.0, "step": 949 }, { "epoch": 0.02375, "grad_norm": 11.8125, "grad_norm_var": 647.3327962239583, "learning_rate": 0.0001, "loss": 6.9575, "loss/crossentropy": 1.564661979675293, "loss/hidden": 0.1572265625, "loss/logits": 0.006771073676645756, "loss/reg": 5.228812217712402, "loss/twn": 0.0, "step": 950 }, { "epoch": 0.023775, "grad_norm": 9.5, "grad_norm_var": 659.2304524739583, "learning_rate": 0.0001, "loss": 7.8587, "loss/crossentropy": 2.5682647228240967, "loss/hidden": 0.05712890625, "loss/logits": 0.003940091468393803, "loss/reg": 5.229336261749268, "loss/twn": 0.0, "step": 951 }, { "epoch": 0.0238, "grad_norm": 12.25, "grad_norm_var": 661.1124837239583, "learning_rate": 0.0001, "loss": 7.4498, "loss/crossentropy": 2.0796985626220703, "loss/hidden": 0.1328125, "loss/logits": 0.007796227466315031, "loss/reg": 5.229506015777588, "loss/twn": 0.0, "step": 952 }, { "epoch": 0.023825, "grad_norm": 12.9375, "grad_norm_var": 259.1692708333333, "learning_rate": 0.0001, "loss": 7.7385, "loss/crossentropy": 2.351168155670166, "loss/hidden": 0.1484375, "loss/logits": 0.010073304176330566, "loss/reg": 5.228799819946289, "loss/twn": 0.0, "step": 953 }, { "epoch": 0.02385, "grad_norm": 7.3125, "grad_norm_var": 257.6984375, "learning_rate": 0.0001, "loss": 6.6797, "loss/crossentropy": 1.3460360765457153, "loss/hidden": 0.0986328125, "loss/logits": 0.005743211135268211, "loss/reg": 5.229253768920898, "loss/twn": 0.0, "step": 954 }, { "epoch": 0.023875, "grad_norm": 50.5, "grad_norm_var": 316.70167643229166, "learning_rate": 0.0001, "loss": 7.4466, "loss/crossentropy": 2.0729310512542725, "loss/hidden": 0.134765625, "loss/logits": 0.009920709766447544, "loss/reg": 5.228950023651123, "loss/twn": 0.0, "step": 955 }, { "epoch": 0.0239, "grad_norm": 32.0, "grad_norm_var": 202.36612955729166, "learning_rate": 0.0001, "loss": 6.7081, "loss/crossentropy": 1.379569172859192, "loss/hidden": 0.09521484375, "loss/logits": 0.00438508577644825, "loss/reg": 5.228931903839111, "loss/twn": 0.0, "step": 956 }, { "epoch": 0.023925, "grad_norm": 8.875, "grad_norm_var": 204.70089518229167, "learning_rate": 0.0001, "loss": 6.1735, "loss/crossentropy": 0.8404383063316345, "loss/hidden": 0.0986328125, "loss/logits": 0.0052786958403885365, "loss/reg": 5.229166030883789, "loss/twn": 0.0, "step": 957 }, { "epoch": 0.02395, "grad_norm": 11.0625, "grad_norm_var": 206.99264322916667, "learning_rate": 0.0001, "loss": 8.0646, "loss/crossentropy": 2.8019535541534424, "loss/hidden": 0.0302734375, "loss/logits": 0.003258619224652648, "loss/reg": 5.229119777679443, "loss/twn": 0.0, "step": 958 }, { "epoch": 0.023975, "grad_norm": 62.25, "grad_norm_var": 327.46692708333336, "learning_rate": 0.0001, "loss": 6.2366, "loss/crossentropy": 0.899376630783081, "loss/hidden": 0.1044921875, "loss/logits": 0.0037081395275890827, "loss/reg": 5.229043960571289, "loss/twn": 0.0, "step": 959 }, { "epoch": 0.024, "grad_norm": 19.75, "grad_norm_var": 320.46197916666665, "learning_rate": 0.0001, "loss": 8.3454, "loss/crossentropy": 2.9434502124786377, "loss/hidden": 0.1572265625, "loss/logits": 0.015512878075242043, "loss/reg": 5.2291717529296875, "loss/twn": 0.0, "step": 960 }, { "epoch": 0.024025, "grad_norm": 12.8125, "grad_norm_var": 319.51808268229166, "learning_rate": 0.0001, "loss": 6.8387, "loss/crossentropy": 1.3557770252227783, "loss/hidden": 0.2470703125, "loss/logits": 0.006780410185456276, "loss/reg": 5.229072093963623, "loss/twn": 0.0, "step": 961 }, { "epoch": 0.02405, "grad_norm": 69.0, "grad_norm_var": 450.38136393229166, "learning_rate": 0.0001, "loss": 6.6024, "loss/crossentropy": 1.2391810417175293, "loss/hidden": 0.12255859375, "loss/logits": 0.011559647507965565, "loss/reg": 5.229081630706787, "loss/twn": 0.0, "step": 962 }, { "epoch": 0.024075, "grad_norm": 11.1875, "grad_norm_var": 456.966650390625, "learning_rate": 0.0001, "loss": 6.8096, "loss/crossentropy": 1.4291319847106934, "loss/hidden": 0.150390625, "loss/logits": 0.0013875500299036503, "loss/reg": 5.228731632232666, "loss/twn": 0.0, "step": 963 }, { "epoch": 0.0241, "grad_norm": 13.8125, "grad_norm_var": 448.843994140625, "learning_rate": 0.0001, "loss": 6.9963, "loss/crossentropy": 1.6549384593963623, "loss/hidden": 0.1025390625, "loss/logits": 0.009464550763368607, "loss/reg": 5.22934103012085, "loss/twn": 0.0, "step": 964 }, { "epoch": 0.024125, "grad_norm": 160.0, "grad_norm_var": 1569.287744140625, "learning_rate": 0.0001, "loss": 5.8326, "loss/crossentropy": 0.4071745276451111, "loss/hidden": 0.19140625, "loss/logits": 0.005319996736943722, "loss/reg": 5.228703498840332, "loss/twn": 0.0, "step": 965 }, { "epoch": 0.02415, "grad_norm": 9.625, "grad_norm_var": 1575.3483723958334, "learning_rate": 0.0001, "loss": 7.4917, "loss/crossentropy": 2.1543774604797363, "loss/hidden": 0.0986328125, "loss/logits": 0.00969572365283966, "loss/reg": 5.229022026062012, "loss/twn": 0.0, "step": 966 }, { "epoch": 0.024175, "grad_norm": 22.5, "grad_norm_var": 1547.8994140625, "learning_rate": 0.0001, "loss": 8.001, "loss/crossentropy": 2.677924156188965, "loss/hidden": 0.08642578125, "loss/logits": 0.007645599078387022, "loss/reg": 5.228973388671875, "loss/twn": 0.0, "step": 967 }, { "epoch": 0.0242, "grad_norm": 14.1875, "grad_norm_var": 1542.969384765625, "learning_rate": 0.0001, "loss": 7.5317, "loss/crossentropy": 2.1442387104034424, "loss/hidden": 0.146484375, "loss/logits": 0.012024961411952972, "loss/reg": 5.228928089141846, "loss/twn": 0.0, "step": 968 }, { "epoch": 0.024225, "grad_norm": 12.5, "grad_norm_var": 1544.1145182291666, "learning_rate": 0.0001, "loss": 6.3484, "loss/crossentropy": 0.9374382495880127, "loss/hidden": 0.177734375, "loss/logits": 0.004194112028926611, "loss/reg": 5.229069232940674, "loss/twn": 0.0, "step": 969 }, { "epoch": 0.02425, "grad_norm": 17.75, "grad_norm_var": 1516.099072265625, "learning_rate": 0.0001, "loss": 6.8346, "loss/crossentropy": 1.4595236778259277, "loss/hidden": 0.142578125, "loss/logits": 0.0036797509528696537, "loss/reg": 5.228834629058838, "loss/twn": 0.0, "step": 970 }, { "epoch": 0.024275, "grad_norm": 12.3125, "grad_norm_var": 1518.0780598958333, "learning_rate": 0.0001, "loss": 6.2077, "loss/crossentropy": 0.8395573496818542, "loss/hidden": 0.1328125, "loss/logits": 0.0061765448190271854, "loss/reg": 5.22913122177124, "loss/twn": 0.0, "step": 971 }, { "epoch": 0.0243, "grad_norm": 8.9375, "grad_norm_var": 1547.0202962239584, "learning_rate": 0.0001, "loss": 6.998, "loss/crossentropy": 1.55341374874115, "loss/hidden": 0.2080078125, "loss/logits": 0.008093073032796383, "loss/reg": 5.228493690490723, "loss/twn": 0.0, "step": 972 }, { "epoch": 0.024325, "grad_norm": 8.5625, "grad_norm_var": 1547.8716145833334, "learning_rate": 0.0001, "loss": 7.1173, "loss/crossentropy": 1.7166783809661865, "loss/hidden": 0.1640625, "loss/logits": 0.007523189298808575, "loss/reg": 5.229001045227051, "loss/twn": 0.0, "step": 973 }, { "epoch": 0.02435, "grad_norm": 224.0, "grad_norm_var": 3868.502197265625, "learning_rate": 0.0001, "loss": 6.3737, "loss/crossentropy": 0.9948546886444092, "loss/hidden": 0.1435546875, "loss/logits": 0.00641383184120059, "loss/reg": 5.228926658630371, "loss/twn": 0.0, "step": 974 }, { "epoch": 0.024375, "grad_norm": 15.375, "grad_norm_var": 3882.076416015625, "learning_rate": 0.0001, "loss": 8.2234, "loss/crossentropy": 2.891629219055176, "loss/hidden": 0.09375, "loss/logits": 0.00904359295964241, "loss/reg": 5.228950500488281, "loss/twn": 0.0, "step": 975 }, { "epoch": 0.0244, "grad_norm": 740.0, "grad_norm_var": 34406.04633789063, "learning_rate": 0.0001, "loss": 6.273, "loss/crossentropy": 0.9103600978851318, "loss/hidden": 0.1298828125, "loss/logits": 0.003978141117841005, "loss/reg": 5.228822231292725, "loss/twn": 0.0, "step": 976 }, { "epoch": 0.024425, "grad_norm": 12.75, "grad_norm_var": 34406.644270833334, "learning_rate": 0.0001, "loss": 7.4745, "loss/crossentropy": 2.0163183212280273, "loss/hidden": 0.2197265625, "loss/logits": 0.009444335475564003, "loss/reg": 5.229053497314453, "loss/twn": 0.0, "step": 977 }, { "epoch": 0.02445, "grad_norm": 12.4375, "grad_norm_var": 34723.73305664062, "learning_rate": 0.0001, "loss": 7.8159, "loss/crossentropy": 2.453317642211914, "loss/hidden": 0.125, "loss/logits": 0.008999479934573174, "loss/reg": 5.228621959686279, "loss/twn": 0.0, "step": 978 }, { "epoch": 0.024475, "grad_norm": 15.0, "grad_norm_var": 34689.15546875, "learning_rate": 0.0001, "loss": 8.3269, "loss/crossentropy": 2.9546549320220947, "loss/hidden": 0.130859375, "loss/logits": 0.012454254552721977, "loss/reg": 5.2289299964904785, "loss/twn": 0.0, "step": 979 }, { "epoch": 0.0245, "grad_norm": 23.625, "grad_norm_var": 34606.96300455729, "learning_rate": 0.0001, "loss": 5.9559, "loss/crossentropy": 0.5534784197807312, "loss/hidden": 0.166015625, "loss/logits": 0.007569343317300081, "loss/reg": 5.2288498878479, "loss/twn": 0.0, "step": 980 }, { "epoch": 0.024525, "grad_norm": 12.125, "grad_norm_var": 34432.74777018229, "learning_rate": 0.0001, "loss": 8.0158, "loss/crossentropy": 2.6883370876312256, "loss/hidden": 0.08642578125, "loss/logits": 0.012242003343999386, "loss/reg": 5.228771686553955, "loss/twn": 0.0, "step": 981 }, { "epoch": 0.02455, "grad_norm": 17.0, "grad_norm_var": 34374.21638997396, "learning_rate": 0.0001, "loss": 8.0208, "loss/crossentropy": 2.660618782043457, "loss/hidden": 0.11767578125, "loss/logits": 0.013429110869765282, "loss/reg": 5.229035377502441, "loss/twn": 0.0, "step": 982 }, { "epoch": 0.024575, "grad_norm": 26.875, "grad_norm_var": 34345.91560872396, "learning_rate": 0.0001, "loss": 7.0489, "loss/crossentropy": 1.6950386762619019, "loss/hidden": 0.119140625, "loss/logits": 0.006002393085509539, "loss/reg": 5.2287116050720215, "loss/twn": 0.0, "step": 983 }, { "epoch": 0.0246, "grad_norm": 44.0, "grad_norm_var": 34166.33411458333, "learning_rate": 0.0001, "loss": 7.4035, "loss/crossentropy": 2.058884859085083, "loss/hidden": 0.107421875, "loss/logits": 0.008141661062836647, "loss/reg": 5.22901725769043, "loss/twn": 0.0, "step": 984 }, { "epoch": 0.024625, "grad_norm": 196.0, "grad_norm_var": 34736.71328125, "learning_rate": 0.0001, "loss": 8.0381, "loss/crossentropy": 2.5864806175231934, "loss/hidden": 0.2119140625, "loss/logits": 0.01062602736055851, "loss/reg": 5.229076862335205, "loss/twn": 0.0, "step": 985 }, { "epoch": 0.02465, "grad_norm": 11.8125, "grad_norm_var": 34793.479801432295, "learning_rate": 0.0001, "loss": 7.9697, "loss/crossentropy": 2.6345629692077637, "loss/hidden": 0.0986328125, "loss/logits": 0.007676620967686176, "loss/reg": 5.228846073150635, "loss/twn": 0.0, "step": 986 }, { "epoch": 0.024675, "grad_norm": 11.8125, "grad_norm_var": 34798.427978515625, "learning_rate": 0.0001, "loss": 7.7632, "loss/crossentropy": 2.396425724029541, "loss/hidden": 0.1240234375, "loss/logits": 0.013703764416277409, "loss/reg": 5.229032516479492, "loss/twn": 0.0, "step": 987 }, { "epoch": 0.0247, "grad_norm": 10.125, "grad_norm_var": 34786.271875, "learning_rate": 0.0001, "loss": 8.1079, "loss/crossentropy": 2.747255325317383, "loss/hidden": 0.1240234375, "loss/logits": 0.007296864874660969, "loss/reg": 5.2292985916137695, "loss/twn": 0.0, "step": 988 }, { "epoch": 0.024725, "grad_norm": 15.0, "grad_norm_var": 34722.09972330729, "learning_rate": 0.0001, "loss": 6.9097, "loss/crossentropy": 1.5659387111663818, "loss/hidden": 0.10986328125, "loss/logits": 0.005029057152569294, "loss/reg": 5.228851795196533, "loss/twn": 0.0, "step": 989 }, { "epoch": 0.02475, "grad_norm": 15.75, "grad_norm_var": 33621.52016601562, "learning_rate": 0.0001, "loss": 7.1496, "loss/crossentropy": 1.7617563009262085, "loss/hidden": 0.1513671875, "loss/logits": 0.007417085114866495, "loss/reg": 5.229080677032471, "loss/twn": 0.0, "step": 990 }, { "epoch": 0.024775, "grad_norm": 14.75, "grad_norm_var": 33626.407535807295, "learning_rate": 0.0001, "loss": 7.4938, "loss/crossentropy": 2.093966245651245, "loss/hidden": 0.162109375, "loss/logits": 0.008837287314236164, "loss/reg": 5.228903770446777, "loss/twn": 0.0, "step": 991 }, { "epoch": 0.0248, "grad_norm": 21.375, "grad_norm_var": 2059.307275390625, "learning_rate": 0.0001, "loss": 5.6135, "loss/crossentropy": 0.2334800660610199, "loss/hidden": 0.14453125, "loss/logits": 0.00651364354416728, "loss/reg": 5.228927135467529, "loss/twn": 0.0, "step": 992 }, { "epoch": 0.024825, "grad_norm": 11.875, "grad_norm_var": 2061.2249837239583, "learning_rate": 0.0001, "loss": 7.9624, "loss/crossentropy": 2.5844943523406982, "loss/hidden": 0.1396484375, "loss/logits": 0.009160241112112999, "loss/reg": 5.229069709777832, "loss/twn": 0.0, "step": 993 }, { "epoch": 0.02485, "grad_norm": 16.75, "grad_norm_var": 2053.023372395833, "learning_rate": 0.0001, "loss": 7.9445, "loss/crossentropy": 2.5577454566955566, "loss/hidden": 0.146484375, "loss/logits": 0.011256640776991844, "loss/reg": 5.229057312011719, "loss/twn": 0.0, "step": 994 }, { "epoch": 0.024875, "grad_norm": 15.875, "grad_norm_var": 2051.4388020833335, "learning_rate": 0.0001, "loss": 8.5701, "loss/crossentropy": 3.218754529953003, "loss/hidden": 0.115234375, "loss/logits": 0.0072383033111691475, "loss/reg": 5.228893280029297, "loss/twn": 0.0, "step": 995 }, { "epoch": 0.0249, "grad_norm": 11.6875, "grad_norm_var": 2068.9751139322916, "learning_rate": 0.0001, "loss": 6.8167, "loss/crossentropy": 1.4376857280731201, "loss/hidden": 0.140625, "loss/logits": 0.009291324764490128, "loss/reg": 5.22910737991333, "loss/twn": 0.0, "step": 996 }, { "epoch": 0.024925, "grad_norm": 23.875, "grad_norm_var": 2052.261962890625, "learning_rate": 0.0001, "loss": 7.6128, "loss/crossentropy": 2.2152881622314453, "loss/hidden": 0.1630859375, "loss/logits": 0.0057820603251457214, "loss/reg": 5.228606700897217, "loss/twn": 0.0, "step": 997 }, { "epoch": 0.02495, "grad_norm": 84.5, "grad_norm_var": 2228.711181640625, "learning_rate": 0.0001, "loss": 8.1583, "loss/crossentropy": 2.797482490539551, "loss/hidden": 0.1220703125, "loss/logits": 0.009698813781142235, "loss/reg": 5.229077339172363, "loss/twn": 0.0, "step": 998 }, { "epoch": 0.024975, "grad_norm": 10.125, "grad_norm_var": 2260.4925618489583, "learning_rate": 0.0001, "loss": 7.9002, "loss/crossentropy": 2.5686769485473633, "loss/hidden": 0.09619140625, "loss/logits": 0.006712072994560003, "loss/reg": 5.228668212890625, "loss/twn": 0.0, "step": 999 }, { "epoch": 0.025, "grad_norm": 71.0, "grad_norm_var": 2348.5097493489584, "learning_rate": 0.0001, "loss": 7.9685, "loss/crossentropy": 2.6334779262542725, "loss/hidden": 0.099609375, "loss/logits": 0.0062838364392519, "loss/reg": 5.229094505310059, "loss/twn": 0.0, "step": 1000 } ], "logging_steps": 1, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0457034088448e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }