{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 2000, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025, "grad_norm": 32.5, "learning_rate": 0.0001, "loss": 7.9852, "loss/crossentropy": 2.2558943748474123, "loss/hidden": 3.53671875, "loss/jsd": 0.0, "loss/logits": 0.22032691352069378, "step": 10 }, { "epoch": 0.0005, "grad_norm": 39.0, "grad_norm_var": 6.1306640625, "learning_rate": 0.0001, "loss": 8.0827, "loss/crossentropy": 2.219619666039944, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.20763051956892015, "step": 20 }, { "epoch": 0.00075, "grad_norm": 30.375, "grad_norm_var": 7.383072916666666, "learning_rate": 0.0001, "loss": 7.8479, "loss/crossentropy": 2.185364603996277, "loss/hidden": 3.59296875, "loss/jsd": 0.0, "loss/logits": 0.23391152992844583, "step": 30 }, { "epoch": 0.001, "grad_norm": 32.75, "grad_norm_var": 187.0322265625, "learning_rate": 0.0001, "loss": 7.8341, "loss/crossentropy": 2.083733668923378, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.2040597340092063, "step": 40 }, { "epoch": 0.00125, "grad_norm": 32.5, "grad_norm_var": 5.111393229166667, "learning_rate": 0.0001, "loss": 7.6815, "loss/crossentropy": 2.182037726044655, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.20423058047890663, "step": 50 }, { "epoch": 0.0015, "grad_norm": 33.0, "grad_norm_var": 1.3853515625, "learning_rate": 0.0001, "loss": 7.6919, "loss/crossentropy": 2.1419573068618774, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19463330563157796, "step": 60 }, { "epoch": 0.00175, "grad_norm": 33.25, "grad_norm_var": 2.228125, "learning_rate": 0.0001, "loss": 7.838, "loss/crossentropy": 2.242894622683525, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.22062199115753173, "step": 70 }, { "epoch": 0.002, "grad_norm": 32.5, "grad_norm_var": 8.371809895833334, "learning_rate": 0.0001, "loss": 8.018, "loss/crossentropy": 2.0408543169498445, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.20080858804285526, "step": 80 }, { "epoch": 0.00225, "grad_norm": 32.75, "grad_norm_var": 7.322916666666667, "learning_rate": 0.0001, "loss": 7.8807, "loss/crossentropy": 2.0654082030057905, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.2109438929706812, "step": 90 }, { "epoch": 0.0025, "grad_norm": 34.5, "grad_norm_var": 2805.3603515625, "learning_rate": 0.0001, "loss": 8.0497, "loss/crossentropy": 2.0930048365145923, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.2103888330049813, "step": 100 }, { "epoch": 0.00275, "grad_norm": 33.0, "grad_norm_var": 2798.430143229167, "learning_rate": 0.0001, "loss": 7.8583, "loss/crossentropy": 2.3308374524116515, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.2045755073428154, "step": 110 }, { "epoch": 0.003, "grad_norm": 34.0, "grad_norm_var": 1.6999348958333333, "learning_rate": 0.0001, "loss": 7.7088, "loss/crossentropy": 2.120428466796875, "loss/hidden": 3.56796875, "loss/jsd": 0.0, "loss/logits": 0.21561774536967276, "step": 120 }, { "epoch": 0.00325, "grad_norm": 31.0, "grad_norm_var": 40.73899739583333, "learning_rate": 0.0001, "loss": 7.762, "loss/crossentropy": 2.1464689180254934, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.2084670951589942, "step": 130 }, { "epoch": 0.0035, "grad_norm": 33.0, "grad_norm_var": 42.67265625, "learning_rate": 0.0001, "loss": 7.711, "loss/crossentropy": 2.1301105961203577, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.21644905991852284, "step": 140 }, { "epoch": 0.00375, "grad_norm": 32.75, "grad_norm_var": 8.066080729166666, "learning_rate": 0.0001, "loss": 7.7295, "loss/crossentropy": 2.115240353345871, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.2064087452366948, "step": 150 }, { "epoch": 0.004, "grad_norm": 32.5, "grad_norm_var": 422.72057291666664, "learning_rate": 0.0001, "loss": 7.9367, "loss/crossentropy": 2.141779786348343, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.21747791785746812, "step": 160 }, { "epoch": 0.00425, "grad_norm": 32.0, "grad_norm_var": 6.705208333333333, "learning_rate": 0.0001, "loss": 7.756, "loss/crossentropy": 2.103468084335327, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.2027540819719434, "step": 170 }, { "epoch": 0.0045, "grad_norm": 32.25, "grad_norm_var": 17.8556640625, "learning_rate": 0.0001, "loss": 7.7008, "loss/crossentropy": 2.1950977832078933, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2186649737879634, "step": 180 }, { "epoch": 0.00475, "grad_norm": 30.5, "grad_norm_var": 66.61608072916667, "learning_rate": 0.0001, "loss": 7.6552, "loss/crossentropy": 2.029752139747143, "loss/hidden": 3.585546875, "loss/jsd": 0.0, "loss/logits": 0.22110425475984813, "step": 190 }, { "epoch": 0.005, "grad_norm": 36.0, "grad_norm_var": 13.160872395833334, "learning_rate": 0.0001, "loss": 7.6964, "loss/crossentropy": 2.169684535264969, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.22122678495943546, "step": 200 }, { "epoch": 0.00525, "grad_norm": 33.25, "grad_norm_var": 3.442643229166667, "learning_rate": 0.0001, "loss": 7.7135, "loss/crossentropy": 2.1366169154644012, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.19148335698992014, "step": 210 }, { "epoch": 0.0055, "grad_norm": 35.25, "grad_norm_var": 4.151822916666666, "learning_rate": 0.0001, "loss": 7.654, "loss/crossentropy": 2.196457767486572, "loss/hidden": 3.573828125, "loss/jsd": 0.0, "loss/logits": 0.22240160517394542, "step": 220 }, { "epoch": 0.00575, "grad_norm": 34.5, "grad_norm_var": 61.07057291666667, "learning_rate": 0.0001, "loss": 7.7286, "loss/crossentropy": 2.099469523131847, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19720815271139144, "step": 230 }, { "epoch": 0.006, "grad_norm": 33.75, "grad_norm_var": 5.479166666666667, "learning_rate": 0.0001, "loss": 7.6841, "loss/crossentropy": 2.0730464071035386, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.20290055498480797, "step": 240 }, { "epoch": 0.00625, "grad_norm": 29.5, "grad_norm_var": 4.9197265625, "learning_rate": 0.0001, "loss": 7.6999, "loss/crossentropy": 2.1514100462198256, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.2013280361890793, "step": 250 }, { "epoch": 0.0065, "grad_norm": 33.25, "grad_norm_var": 150.86712239583332, "learning_rate": 0.0001, "loss": 7.7617, "loss/crossentropy": 2.0071112543344496, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.1923872010782361, "step": 260 }, { "epoch": 0.00675, "grad_norm": 32.25, "grad_norm_var": 41.373958333333334, "learning_rate": 0.0001, "loss": 7.7968, "loss/crossentropy": 2.254330241680145, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.20844143405556678, "step": 270 }, { "epoch": 0.007, "grad_norm": 30.25, "grad_norm_var": 5.9625, "learning_rate": 0.0001, "loss": 7.7437, "loss/crossentropy": 2.2058747708797455, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.19603454861789943, "step": 280 }, { "epoch": 0.00725, "grad_norm": 32.0, "grad_norm_var": 3.4306640625, "learning_rate": 0.0001, "loss": 7.7308, "loss/crossentropy": 2.158563455939293, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.20696840062737465, "step": 290 }, { "epoch": 0.0075, "grad_norm": 35.25, "grad_norm_var": 4.163997395833333, "learning_rate": 0.0001, "loss": 7.7781, "loss/crossentropy": 2.1728759124875068, "loss/hidden": 3.6984375, "loss/jsd": 0.0, "loss/logits": 0.2232737574726343, "step": 300 }, { "epoch": 0.00775, "grad_norm": 32.25, "grad_norm_var": 9.7212890625, "learning_rate": 0.0001, "loss": 7.7039, "loss/crossentropy": 2.1304744452238085, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.21460633352398872, "step": 310 }, { "epoch": 0.008, "grad_norm": 34.25, "grad_norm_var": 102.21979166666667, "learning_rate": 0.0001, "loss": 7.7386, "loss/crossentropy": 2.2150494754314423, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.2204894032329321, "step": 320 }, { "epoch": 0.00825, "grad_norm": 34.5, "grad_norm_var": 103.0806640625, "learning_rate": 0.0001, "loss": 7.6843, "loss/crossentropy": 2.1483607694506643, "loss/hidden": 3.525, "loss/jsd": 0.0, "loss/logits": 0.2168185070157051, "step": 330 }, { "epoch": 0.0085, "grad_norm": 29.75, "grad_norm_var": 4.245768229166667, "learning_rate": 0.0001, "loss": 7.6824, "loss/crossentropy": 2.2668254554271696, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.1978676740080118, "step": 340 }, { "epoch": 0.00875, "grad_norm": 30.75, "grad_norm_var": 7.785416666666666, "learning_rate": 0.0001, "loss": 7.6476, "loss/crossentropy": 2.247351437807083, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.2044550308957696, "step": 350 }, { "epoch": 0.009, "grad_norm": 33.0, "grad_norm_var": 36.1150390625, "learning_rate": 0.0001, "loss": 7.6707, "loss/crossentropy": 2.2508403569459916, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.20586735829710961, "step": 360 }, { "epoch": 0.00925, "grad_norm": 27.875, "grad_norm_var": 42.02805989583333, "learning_rate": 0.0001, "loss": 7.787, "loss/crossentropy": 2.2524363905191422, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.20639074668288232, "step": 370 }, { "epoch": 0.0095, "grad_norm": 29.625, "grad_norm_var": 19.65625, "learning_rate": 0.0001, "loss": 7.7258, "loss/crossentropy": 2.2901588469743728, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.20220000632107257, "step": 380 }, { "epoch": 0.00975, "grad_norm": 29.875, "grad_norm_var": 15.653059895833334, "learning_rate": 0.0001, "loss": 7.6445, "loss/crossentropy": 2.1256834477186204, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.190180828794837, "step": 390 }, { "epoch": 0.01, "grad_norm": 33.5, "grad_norm_var": 4.7712890625, "learning_rate": 0.0001, "loss": 7.7719, "loss/crossentropy": 2.2237626880407335, "loss/hidden": 3.484765625, "loss/jsd": 0.0, "loss/logits": 0.2264560218900442, "step": 400 }, { "epoch": 0.01025, "grad_norm": 33.0, "grad_norm_var": 2.9247395833333334, "learning_rate": 0.0001, "loss": 7.7631, "loss/crossentropy": 2.060544753074646, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.2202487275004387, "step": 410 }, { "epoch": 0.0105, "grad_norm": 32.75, "grad_norm_var": 2.6712890625, "learning_rate": 0.0001, "loss": 7.8193, "loss/crossentropy": 2.270045906305313, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.22449300419539214, "step": 420 }, { "epoch": 0.01075, "grad_norm": 40.0, "grad_norm_var": 5.551822916666667, "learning_rate": 0.0001, "loss": 7.7502, "loss/crossentropy": 2.220390594005585, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.20708895958960055, "step": 430 }, { "epoch": 0.011, "grad_norm": 33.75, "grad_norm_var": 6.554622395833333, "learning_rate": 0.0001, "loss": 7.6386, "loss/crossentropy": 2.0412577211856844, "loss/hidden": 3.52109375, "loss/jsd": 0.0, "loss/logits": 0.20780573673546315, "step": 440 }, { "epoch": 0.01125, "grad_norm": 39.5, "grad_norm_var": 9.018684895833333, "learning_rate": 0.0001, "loss": 7.7522, "loss/crossentropy": 2.085190561413765, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.22059339918196202, "step": 450 }, { "epoch": 0.0115, "grad_norm": 31.75, "grad_norm_var": 5.703580729166666, "learning_rate": 0.0001, "loss": 7.604, "loss/crossentropy": 1.9451062515378, "loss/hidden": 3.527734375, "loss/jsd": 0.0, "loss/logits": 0.2082566052675247, "step": 460 }, { "epoch": 0.01175, "grad_norm": 34.25, "grad_norm_var": 1.4416666666666667, "learning_rate": 0.0001, "loss": 7.7286, "loss/crossentropy": 2.23127267062664, "loss/hidden": 3.560546875, "loss/jsd": 0.0, "loss/logits": 0.22645943984389305, "step": 470 }, { "epoch": 0.012, "grad_norm": 43.5, "grad_norm_var": 61.358333333333334, "learning_rate": 0.0001, "loss": 7.5857, "loss/crossentropy": 2.0275631666183473, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.19867698503658177, "step": 480 }, { "epoch": 0.01225, "grad_norm": 30.5, "grad_norm_var": 62.425455729166664, "learning_rate": 0.0001, "loss": 7.7023, "loss/crossentropy": 2.1564531326293945, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.21295207217335702, "step": 490 }, { "epoch": 0.0125, "grad_norm": 39.0, "grad_norm_var": 7.580208333333333, "learning_rate": 0.0001, "loss": 7.8035, "loss/crossentropy": 2.1716607972979545, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.21046865545213223, "step": 500 }, { "epoch": 0.01275, "grad_norm": 35.25, "grad_norm_var": 5.121809895833334, "learning_rate": 0.0001, "loss": 7.764, "loss/crossentropy": 2.181091034412384, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.20339491367340087, "step": 510 }, { "epoch": 0.013, "grad_norm": 31.5, "grad_norm_var": 3.820572916666667, "learning_rate": 0.0001, "loss": 7.6112, "loss/crossentropy": 2.012446442246437, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.20432959645986556, "step": 520 }, { "epoch": 0.01325, "grad_norm": 33.0, "grad_norm_var": 2.92890625, "learning_rate": 0.0001, "loss": 7.73, "loss/crossentropy": 2.0826023176312445, "loss/hidden": 3.51171875, "loss/jsd": 0.0, "loss/logits": 0.21190985422581435, "step": 530 }, { "epoch": 0.0135, "grad_norm": 31.375, "grad_norm_var": 2.0229166666666667, "learning_rate": 0.0001, "loss": 7.7527, "loss/crossentropy": 2.191486455500126, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.1943425141274929, "step": 540 }, { "epoch": 0.01375, "grad_norm": 29.75, "grad_norm_var": 4.983268229166667, "learning_rate": 0.0001, "loss": 7.7035, "loss/crossentropy": 2.0664332896471023, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.20404404532164336, "step": 550 }, { "epoch": 0.014, "grad_norm": 31.875, "grad_norm_var": 3.381705729166667, "learning_rate": 0.0001, "loss": 7.5679, "loss/crossentropy": 2.0470397621393204, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.21556914187967777, "step": 560 }, { "epoch": 0.01425, "grad_norm": 33.0, "grad_norm_var": 2.508072916666667, "learning_rate": 0.0001, "loss": 7.7909, "loss/crossentropy": 2.21784293949604, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.20632803943008185, "step": 570 }, { "epoch": 0.0145, "grad_norm": 32.25, "grad_norm_var": 2.6837890625, "learning_rate": 0.0001, "loss": 7.6449, "loss/crossentropy": 2.291385439038277, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.19818231668323277, "step": 580 }, { "epoch": 0.01475, "grad_norm": 30.875, "grad_norm_var": 106.909375, "learning_rate": 0.0001, "loss": 7.7009, "loss/crossentropy": 2.102944087982178, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.20003035496920346, "step": 590 }, { "epoch": 0.015, "grad_norm": 33.0, "grad_norm_var": 11.816080729166666, "learning_rate": 0.0001, "loss": 7.5907, "loss/crossentropy": 2.239013722538948, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.22811597101390363, "step": 600 }, { "epoch": 0.01525, "grad_norm": 34.75, "grad_norm_var": 3.2811848958333334, "learning_rate": 0.0001, "loss": 7.6578, "loss/crossentropy": 2.1826944231986998, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.19526719450950622, "step": 610 }, { "epoch": 0.0155, "grad_norm": 32.0, "grad_norm_var": 29.54140625, "learning_rate": 0.0001, "loss": 7.6592, "loss/crossentropy": 2.0491475805640222, "loss/hidden": 3.47109375, "loss/jsd": 0.0, "loss/logits": 0.19470291025936604, "step": 620 }, { "epoch": 0.01575, "grad_norm": 29.625, "grad_norm_var": 3.458268229166667, "learning_rate": 0.0001, "loss": 7.7237, "loss/crossentropy": 2.171899539232254, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19583625346422195, "step": 630 }, { "epoch": 0.016, "grad_norm": 41.5, "grad_norm_var": 835.4395182291667, "learning_rate": 0.0001, "loss": 7.6493, "loss/crossentropy": 2.019927790760994, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.1889604590833187, "step": 640 }, { "epoch": 0.01625, "grad_norm": 33.25, "grad_norm_var": 816.1582682291667, "learning_rate": 0.0001, "loss": 7.7628, "loss/crossentropy": 2.146556834876537, "loss/hidden": 3.53046875, "loss/jsd": 0.0, "loss/logits": 0.21917179077863694, "step": 650 }, { "epoch": 0.0165, "grad_norm": 34.0, "grad_norm_var": 13.574739583333333, "learning_rate": 0.0001, "loss": 7.6199, "loss/crossentropy": 2.2131199680268763, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.20602547163143753, "step": 660 }, { "epoch": 0.01675, "grad_norm": 34.75, "grad_norm_var": 28.6125, "learning_rate": 0.0001, "loss": 7.6113, "loss/crossentropy": 2.0343705236911775, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.18553176671266555, "step": 670 }, { "epoch": 0.017, "grad_norm": 31.625, "grad_norm_var": 19.8369140625, "learning_rate": 0.0001, "loss": 7.6443, "loss/crossentropy": 2.1528817296028135, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.20593271851539613, "step": 680 }, { "epoch": 0.01725, "grad_norm": 36.25, "grad_norm_var": 5.97265625, "learning_rate": 0.0001, "loss": 7.7476, "loss/crossentropy": 2.2202903479337692, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.2035485502332449, "step": 690 }, { "epoch": 0.0175, "grad_norm": 31.5, "grad_norm_var": 7.5431640625, "learning_rate": 0.0001, "loss": 7.621, "loss/crossentropy": 2.0744683638215067, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.2023961789906025, "step": 700 }, { "epoch": 0.01775, "grad_norm": 29.875, "grad_norm_var": 26.912239583333335, "learning_rate": 0.0001, "loss": 7.7058, "loss/crossentropy": 2.1098077327013014, "loss/hidden": 3.48359375, "loss/jsd": 0.0, "loss/logits": 0.20867060720920563, "step": 710 }, { "epoch": 0.018, "grad_norm": 34.0, "grad_norm_var": 5.291080729166667, "learning_rate": 0.0001, "loss": 7.5224, "loss/crossentropy": 2.0663078971207143, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.1999094202183187, "step": 720 }, { "epoch": 0.01825, "grad_norm": 36.75, "grad_norm_var": 34.112239583333334, "learning_rate": 0.0001, "loss": 7.5679, "loss/crossentropy": 2.264209559559822, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.213060562312603, "step": 730 }, { "epoch": 0.0185, "grad_norm": 31.875, "grad_norm_var": 31.756705729166665, "learning_rate": 0.0001, "loss": 7.6714, "loss/crossentropy": 2.128282290697098, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.21533375550061465, "step": 740 }, { "epoch": 0.01875, "grad_norm": 29.125, "grad_norm_var": 4.314518229166667, "learning_rate": 0.0001, "loss": 7.6825, "loss/crossentropy": 2.0806978911161425, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.19196727648377418, "step": 750 }, { "epoch": 0.019, "grad_norm": 31.625, "grad_norm_var": 9.520833333333334, "learning_rate": 0.0001, "loss": 7.6058, "loss/crossentropy": 2.2315777271986006, "loss/hidden": 3.484765625, "loss/jsd": 0.0, "loss/logits": 0.21225934717804193, "step": 760 }, { "epoch": 0.01925, "grad_norm": 33.25, "grad_norm_var": 29.381705729166665, "learning_rate": 0.0001, "loss": 7.8377, "loss/crossentropy": 2.200978134572506, "loss/hidden": 3.52890625, "loss/jsd": 0.0, "loss/logits": 0.22721426151692867, "step": 770 }, { "epoch": 0.0195, "grad_norm": 32.75, "grad_norm_var": 11.095247395833333, "learning_rate": 0.0001, "loss": 7.7305, "loss/crossentropy": 2.2799030035734176, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.2418980894610286, "step": 780 }, { "epoch": 0.01975, "grad_norm": 46.5, "grad_norm_var": 22.12265625, "learning_rate": 0.0001, "loss": 7.663, "loss/crossentropy": 2.0916543275117876, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20638740565627814, "step": 790 }, { "epoch": 0.02, "grad_norm": 53.75, "grad_norm_var": 2.3053503983968586e+18, "learning_rate": 0.0001, "loss": 7.6743, "loss/crossentropy": 2.2010276943445204, "loss/hidden": 3.33671875, "loss/jsd": 0.0, "loss/logits": 0.2011238183826208, "step": 800 }, { "epoch": 0.02025, "grad_norm": 34.75, "grad_norm_var": 2.3053503962269005e+18, "learning_rate": 0.0001, "loss": 7.7365, "loss/crossentropy": 2.2651585280895232, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.22051467839628458, "step": 810 }, { "epoch": 0.0205, "grad_norm": 32.25, "grad_norm_var": 69.03854166666666, "learning_rate": 0.0001, "loss": 7.5788, "loss/crossentropy": 2.1270318403840065, "loss/hidden": 3.47109375, "loss/jsd": 0.0, "loss/logits": 0.20940354652702808, "step": 820 }, { "epoch": 0.02075, "grad_norm": 33.0, "grad_norm_var": 8.0603515625, "learning_rate": 0.0001, "loss": 7.6237, "loss/crossentropy": 2.1726820170879364, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.20644128862768413, "step": 830 }, { "epoch": 0.021, "grad_norm": 33.5, "grad_norm_var": 9.085416666666667, "learning_rate": 0.0001, "loss": 7.455, "loss/crossentropy": 2.166350546479225, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.20701032150536774, "step": 840 }, { "epoch": 0.02125, "grad_norm": 37.75, "grad_norm_var": 6.362239583333333, "learning_rate": 0.0001, "loss": 7.5725, "loss/crossentropy": 2.095318245887756, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.20427223704755307, "step": 850 }, { "epoch": 0.0215, "grad_norm": 29.625, "grad_norm_var": 15.934830729166666, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.1999073296785356, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.2012161746621132, "step": 860 }, { "epoch": 0.02175, "grad_norm": 30.5, "grad_norm_var": 5.681184895833334, "learning_rate": 0.0001, "loss": 7.6168, "loss/crossentropy": 2.2957967817783356, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.20650502648204566, "step": 870 }, { "epoch": 0.022, "grad_norm": 30.0, "grad_norm_var": 10.9181640625, "learning_rate": 0.0001, "loss": 7.6929, "loss/crossentropy": 2.2093090921640397, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18699637930840254, "step": 880 }, { "epoch": 0.02225, "grad_norm": 35.0, "grad_norm_var": 11.192122395833334, "learning_rate": 0.0001, "loss": 7.5316, "loss/crossentropy": 2.0586251467466354, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.21464286223053933, "step": 890 }, { "epoch": 0.0225, "grad_norm": 48.0, "grad_norm_var": 22.92265625, "learning_rate": 0.0001, "loss": 7.5857, "loss/crossentropy": 2.1724458605051042, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19797445200383662, "step": 900 }, { "epoch": 0.02275, "grad_norm": 33.0, "grad_norm_var": 38.4837890625, "learning_rate": 0.0001, "loss": 7.6184, "loss/crossentropy": 2.15934486836195, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.19419073052704333, "step": 910 }, { "epoch": 0.023, "grad_norm": 31.0, "grad_norm_var": 29.8869140625, "learning_rate": 0.0001, "loss": 7.6209, "loss/crossentropy": 2.1601561695337295, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.20290262177586554, "step": 920 }, { "epoch": 0.02325, "grad_norm": 33.0, "grad_norm_var": 2.5143229166666665, "learning_rate": 0.0001, "loss": 7.5969, "loss/crossentropy": 2.184823766350746, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.2016730338335037, "step": 930 }, { "epoch": 0.0235, "grad_norm": 30.125, "grad_norm_var": 1.5764973958333333, "learning_rate": 0.0001, "loss": 7.5772, "loss/crossentropy": 2.2380657255649568, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.19638443663716315, "step": 940 }, { "epoch": 0.02375, "grad_norm": 28.875, "grad_norm_var": 5.199934895833334, "learning_rate": 0.0001, "loss": 7.6348, "loss/crossentropy": 2.183762513846159, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.19700763542205096, "step": 950 }, { "epoch": 0.024, "grad_norm": 33.0, "grad_norm_var": 20.1666015625, "learning_rate": 0.0001, "loss": 7.6351, "loss/crossentropy": 2.1142914414405825, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.2082229983061552, "step": 960 }, { "epoch": 0.02425, "grad_norm": 31.5, "grad_norm_var": 18.738541666666666, "learning_rate": 0.0001, "loss": 7.6302, "loss/crossentropy": 2.315229868888855, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.20684304945170878, "step": 970 }, { "epoch": 0.0245, "grad_norm": 33.75, "grad_norm_var": 2.501822916666667, "learning_rate": 0.0001, "loss": 7.6816, "loss/crossentropy": 2.094369947910309, "loss/hidden": 3.59921875, "loss/jsd": 0.0, "loss/logits": 0.2428071454167366, "step": 980 }, { "epoch": 0.02475, "grad_norm": 29.25, "grad_norm_var": 4.06015625, "learning_rate": 0.0001, "loss": 7.6289, "loss/crossentropy": 2.183109185099602, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.21173047311604024, "step": 990 }, { "epoch": 0.025, "grad_norm": 33.5, "grad_norm_var": 2.9593098958333335, "learning_rate": 0.0001, "loss": 7.6417, "loss/crossentropy": 2.1311034083366396, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.1998819222673774, "step": 1000 }, { "epoch": 0.02525, "grad_norm": 31.0, "grad_norm_var": 4.510872395833333, "learning_rate": 0.0001, "loss": 7.6299, "loss/crossentropy": 2.1297010451555254, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.20842864252626897, "step": 1010 }, { "epoch": 0.0255, "grad_norm": 30.375, "grad_norm_var": 3.9893229166666666, "learning_rate": 0.0001, "loss": 7.6593, "loss/crossentropy": 2.2224902719259263, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.21827217563986778, "step": 1020 }, { "epoch": 0.02575, "grad_norm": 38.25, "grad_norm_var": 6.010416666666667, "learning_rate": 0.0001, "loss": 7.6712, "loss/crossentropy": 2.1976612359285355, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.210753770545125, "step": 1030 }, { "epoch": 0.026, "grad_norm": 33.5, "grad_norm_var": 9.435872395833334, "learning_rate": 0.0001, "loss": 7.748, "loss/crossentropy": 2.1889317661523817, "loss/hidden": 3.398828125, "loss/jsd": 0.0, "loss/logits": 0.2081079863011837, "step": 1040 }, { "epoch": 0.02625, "grad_norm": 31.0, "grad_norm_var": 6.206184895833333, "learning_rate": 0.0001, "loss": 7.5803, "loss/crossentropy": 2.0802227064967154, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.19740718584507705, "step": 1050 }, { "epoch": 0.0265, "grad_norm": 31.625, "grad_norm_var": 1.3770182291666666, "learning_rate": 0.0001, "loss": 7.6098, "loss/crossentropy": 1.969551184773445, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.1954148853197694, "step": 1060 }, { "epoch": 0.02675, "grad_norm": 30.75, "grad_norm_var": 2.09375, "learning_rate": 0.0001, "loss": 7.7826, "loss/crossentropy": 2.160974936187267, "loss/hidden": 3.581640625, "loss/jsd": 0.0, "loss/logits": 0.2183740811422467, "step": 1070 }, { "epoch": 0.027, "grad_norm": 30.625, "grad_norm_var": 2.6520182291666665, "learning_rate": 0.0001, "loss": 7.6186, "loss/crossentropy": 2.179084411263466, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.19020346291363238, "step": 1080 }, { "epoch": 0.02725, "grad_norm": 56.5, "grad_norm_var": 48.91868489583333, "learning_rate": 0.0001, "loss": 7.7756, "loss/crossentropy": 2.1456103891134264, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2178689869120717, "step": 1090 }, { "epoch": 0.0275, "grad_norm": 32.0, "grad_norm_var": 42.19557291666667, "learning_rate": 0.0001, "loss": 7.6714, "loss/crossentropy": 2.2156156271696092, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.2013509316369891, "step": 1100 }, { "epoch": 0.02775, "grad_norm": 31.625, "grad_norm_var": 29.3775390625, "learning_rate": 0.0001, "loss": 7.6833, "loss/crossentropy": 2.0683740943670275, "loss/hidden": 3.5046875, "loss/jsd": 0.0, "loss/logits": 0.21320818569511174, "step": 1110 }, { "epoch": 0.028, "grad_norm": 27.5, "grad_norm_var": 35.483333333333334, "learning_rate": 0.0001, "loss": 7.7302, "loss/crossentropy": 2.098052313923836, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.19890500828623772, "step": 1120 }, { "epoch": 0.02825, "grad_norm": 29.0, "grad_norm_var": 10.8775390625, "learning_rate": 0.0001, "loss": 7.6716, "loss/crossentropy": 2.0999813921749593, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.18992104195058346, "step": 1130 }, { "epoch": 0.0285, "grad_norm": 30.875, "grad_norm_var": 3.67890625, "learning_rate": 0.0001, "loss": 7.5401, "loss/crossentropy": 2.07411085665226, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.2018830729648471, "step": 1140 }, { "epoch": 0.02875, "grad_norm": 31.125, "grad_norm_var": 18.053580729166665, "learning_rate": 0.0001, "loss": 7.645, "loss/crossentropy": 2.0945233553647995, "loss/hidden": 3.4859375, "loss/jsd": 0.0, "loss/logits": 0.21366582233458759, "step": 1150 }, { "epoch": 0.029, "grad_norm": 31.25, "grad_norm_var": 16.978125, "learning_rate": 0.0001, "loss": 7.6514, "loss/crossentropy": 2.0980678737163543, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.19850811325013637, "step": 1160 }, { "epoch": 0.02925, "grad_norm": 32.5, "grad_norm_var": 30.2556640625, "learning_rate": 0.0001, "loss": 7.6021, "loss/crossentropy": 2.155895306169987, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19983574748039246, "step": 1170 }, { "epoch": 0.0295, "grad_norm": 30.5, "grad_norm_var": 5.733072916666667, "learning_rate": 0.0001, "loss": 7.6343, "loss/crossentropy": 2.1906268298625946, "loss/hidden": 3.348046875, "loss/jsd": 0.0, "loss/logits": 0.19584416709840297, "step": 1180 }, { "epoch": 0.02975, "grad_norm": 32.5, "grad_norm_var": 3.89765625, "learning_rate": 0.0001, "loss": 7.7077, "loss/crossentropy": 2.163237012922764, "loss/hidden": 3.562109375, "loss/jsd": 0.0, "loss/logits": 0.21741114580072463, "step": 1190 }, { "epoch": 0.03, "grad_norm": 27.875, "grad_norm_var": 3.6639973958333334, "learning_rate": 0.0001, "loss": 7.6695, "loss/crossentropy": 2.1346954315900804, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.19178961254656315, "step": 1200 }, { "epoch": 0.03025, "grad_norm": 30.5, "grad_norm_var": 24.9125, "learning_rate": 0.0001, "loss": 7.6108, "loss/crossentropy": 2.2493597716093063, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.209975734166801, "step": 1210 }, { "epoch": 0.0305, "grad_norm": 30.75, "grad_norm_var": 5.566080729166667, "learning_rate": 0.0001, "loss": 7.5902, "loss/crossentropy": 2.046968361735344, "loss/hidden": 3.451171875, "loss/jsd": 0.0, "loss/logits": 0.1867401722818613, "step": 1220 }, { "epoch": 0.03075, "grad_norm": 29.75, "grad_norm_var": 5.396875, "learning_rate": 0.0001, "loss": 7.6659, "loss/crossentropy": 2.0429708033800127, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.18792454693466426, "step": 1230 }, { "epoch": 0.031, "grad_norm": 30.375, "grad_norm_var": 2.8827473958333334, "learning_rate": 0.0001, "loss": 7.5425, "loss/crossentropy": 2.124686148762703, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.23028194047510625, "step": 1240 }, { "epoch": 0.03125, "grad_norm": 33.0, "grad_norm_var": 2.220833333333333, "learning_rate": 0.0001, "loss": 7.7766, "loss/crossentropy": 2.1431034594774245, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.1852023523300886, "step": 1250 }, { "epoch": 0.0315, "grad_norm": 34.0, "grad_norm_var": 3.3080729166666667, "learning_rate": 0.0001, "loss": 7.6547, "loss/crossentropy": 2.208024913072586, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.19009452145546674, "step": 1260 }, { "epoch": 0.03175, "grad_norm": 31.375, "grad_norm_var": 3.7249348958333335, "learning_rate": 0.0001, "loss": 7.6331, "loss/crossentropy": 2.0774734795093535, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.19277823474258185, "step": 1270 }, { "epoch": 0.032, "grad_norm": 32.75, "grad_norm_var": 2.6393229166666665, "learning_rate": 0.0001, "loss": 7.6147, "loss/crossentropy": 2.244540962576866, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.20150573179125786, "step": 1280 }, { "epoch": 0.03225, "grad_norm": 31.75, "grad_norm_var": 3.2280598958333333, "learning_rate": 0.0001, "loss": 7.6724, "loss/crossentropy": 2.1218355029821394, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.19727950319647788, "step": 1290 }, { "epoch": 0.0325, "grad_norm": 33.25, "grad_norm_var": 3.6113932291666666, "learning_rate": 0.0001, "loss": 7.5881, "loss/crossentropy": 2.048227934539318, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.19401397118344904, "step": 1300 }, { "epoch": 0.03275, "grad_norm": 35.5, "grad_norm_var": 2.8384765625, "learning_rate": 0.0001, "loss": 7.6481, "loss/crossentropy": 2.0217724472284315, "loss/hidden": 3.398828125, "loss/jsd": 0.0, "loss/logits": 0.19338970091193913, "step": 1310 }, { "epoch": 0.033, "grad_norm": 32.75, "grad_norm_var": 2.3671223958333334, "learning_rate": 0.0001, "loss": 7.5734, "loss/crossentropy": 2.123840129375458, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.19704403057694436, "step": 1320 }, { "epoch": 0.03325, "grad_norm": 31.375, "grad_norm_var": 1.8207682291666667, "learning_rate": 0.0001, "loss": 7.5379, "loss/crossentropy": 2.1691371381282805, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.20379403475672006, "step": 1330 }, { "epoch": 0.0335, "grad_norm": 30.5, "grad_norm_var": 2.162239583333333, "learning_rate": 0.0001, "loss": 7.5824, "loss/crossentropy": 2.0320975854992867, "loss/hidden": 3.490625, "loss/jsd": 0.0, "loss/logits": 0.204788769595325, "step": 1340 }, { "epoch": 0.03375, "grad_norm": 29.875, "grad_norm_var": 29.69140625, "learning_rate": 0.0001, "loss": 7.6835, "loss/crossentropy": 2.1799038141965865, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.21532316971570253, "step": 1350 }, { "epoch": 0.034, "grad_norm": 30.625, "grad_norm_var": 2.4837890625, "learning_rate": 0.0001, "loss": 7.6461, "loss/crossentropy": 2.075017270445824, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.21185822309926153, "step": 1360 }, { "epoch": 0.03425, "grad_norm": 29.375, "grad_norm_var": 2.6806640625, "learning_rate": 0.0001, "loss": 7.6084, "loss/crossentropy": 2.2061389327049254, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.19864549599587916, "step": 1370 }, { "epoch": 0.0345, "grad_norm": 46.25, "grad_norm_var": 16.428125, "learning_rate": 0.0001, "loss": 7.5773, "loss/crossentropy": 2.1411263316869737, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.18519967906177043, "step": 1380 }, { "epoch": 0.03475, "grad_norm": 33.5, "grad_norm_var": 312.2525390625, "learning_rate": 0.0001, "loss": 7.6651, "loss/crossentropy": 2.1760765284299852, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.19441114887595176, "step": 1390 }, { "epoch": 0.035, "grad_norm": 30.375, "grad_norm_var": 7.5697265625, "learning_rate": 0.0001, "loss": 7.6384, "loss/crossentropy": 2.148920811712742, "loss/hidden": 3.503515625, "loss/jsd": 0.0, "loss/logits": 0.20845398511737584, "step": 1400 }, { "epoch": 0.03525, "grad_norm": 31.5, "grad_norm_var": 3.2301432291666665, "learning_rate": 0.0001, "loss": 7.6503, "loss/crossentropy": 2.2241507709026336, "loss/hidden": 3.478515625, "loss/jsd": 0.0, "loss/logits": 0.2013774536550045, "step": 1410 }, { "epoch": 0.0355, "grad_norm": 32.75, "grad_norm_var": 3.67265625, "learning_rate": 0.0001, "loss": 7.6107, "loss/crossentropy": 2.151585003733635, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.19790932536125183, "step": 1420 }, { "epoch": 0.03575, "grad_norm": 31.0, "grad_norm_var": 2.192643229166667, "learning_rate": 0.0001, "loss": 7.6089, "loss/crossentropy": 2.106749549508095, "loss/hidden": 3.555078125, "loss/jsd": 0.0, "loss/logits": 0.20702882390469313, "step": 1430 }, { "epoch": 0.036, "grad_norm": 34.0, "grad_norm_var": 2.8622395833333334, "learning_rate": 0.0001, "loss": 7.6081, "loss/crossentropy": 2.0727885022759436, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.21057205237448215, "step": 1440 }, { "epoch": 0.03625, "grad_norm": 31.25, "grad_norm_var": 3.32890625, "learning_rate": 0.0001, "loss": 7.7005, "loss/crossentropy": 2.3014074742794035, "loss/hidden": 3.35859375, "loss/jsd": 0.0, "loss/logits": 0.20723759960383176, "step": 1450 }, { "epoch": 0.0365, "grad_norm": 28.625, "grad_norm_var": 4.240625, "learning_rate": 0.0001, "loss": 7.5677, "loss/crossentropy": 2.12650800794363, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.20236929692327976, "step": 1460 }, { "epoch": 0.03675, "grad_norm": 31.0, "grad_norm_var": 8.195768229166667, "learning_rate": 0.0001, "loss": 7.7299, "loss/crossentropy": 2.187662351131439, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.20965678989887238, "step": 1470 }, { "epoch": 0.037, "grad_norm": 33.0, "grad_norm_var": 6.1666015625, "learning_rate": 0.0001, "loss": 7.5942, "loss/crossentropy": 2.174471014738083, "loss/hidden": 3.534765625, "loss/jsd": 0.0, "loss/logits": 0.21107099391520023, "step": 1480 }, { "epoch": 0.03725, "grad_norm": 40.5, "grad_norm_var": 7.112239583333333, "learning_rate": 0.0001, "loss": 7.5362, "loss/crossentropy": 2.069592148065567, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.19253854881972074, "step": 1490 }, { "epoch": 0.0375, "grad_norm": 33.25, "grad_norm_var": 6.4869140625, "learning_rate": 0.0001, "loss": 7.5941, "loss/crossentropy": 2.0679068714380264, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.20371587071567773, "step": 1500 }, { "epoch": 0.03775, "grad_norm": 30.875, "grad_norm_var": 2.5686848958333335, "learning_rate": 0.0001, "loss": 7.6557, "loss/crossentropy": 2.1364961892366408, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.20447015166282653, "step": 1510 }, { "epoch": 0.038, "grad_norm": 32.5, "grad_norm_var": 2.6738932291666666, "learning_rate": 0.0001, "loss": 7.622, "loss/crossentropy": 2.1275009989738463, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.18973923586308955, "step": 1520 }, { "epoch": 0.03825, "grad_norm": 32.5, "grad_norm_var": 1.3177083333333333, "learning_rate": 0.0001, "loss": 7.7366, "loss/crossentropy": 2.212426933646202, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.21735910680145026, "step": 1530 }, { "epoch": 0.0385, "grad_norm": 27.75, "grad_norm_var": 2.7348307291666667, "learning_rate": 0.0001, "loss": 7.6356, "loss/crossentropy": 2.1074824020266534, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.19886015299707652, "step": 1540 }, { "epoch": 0.03875, "grad_norm": 31.75, "grad_norm_var": 4.234830729166666, "learning_rate": 0.0001, "loss": 7.7567, "loss/crossentropy": 2.177932971715927, "loss/hidden": 3.5609375, "loss/jsd": 0.0, "loss/logits": 0.2221821215003729, "step": 1550 }, { "epoch": 0.039, "grad_norm": 36.25, "grad_norm_var": 2.8791015625, "learning_rate": 0.0001, "loss": 7.5766, "loss/crossentropy": 2.0759232968091963, "loss/hidden": 3.386328125, "loss/jsd": 0.0, "loss/logits": 0.19086231291294098, "step": 1560 }, { "epoch": 0.03925, "grad_norm": 32.0, "grad_norm_var": 3.7046223958333333, "learning_rate": 0.0001, "loss": 7.537, "loss/crossentropy": 2.2487298890948297, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.19235554365441204, "step": 1570 }, { "epoch": 0.0395, "grad_norm": 33.0, "grad_norm_var": 2.5927083333333334, "learning_rate": 0.0001, "loss": 7.5167, "loss/crossentropy": 2.2023983120918276, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.20318181458860635, "step": 1580 }, { "epoch": 0.03975, "grad_norm": 30.0, "grad_norm_var": 3.4436848958333335, "learning_rate": 0.0001, "loss": 7.5528, "loss/crossentropy": 1.9812082558870316, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.19151438660919667, "step": 1590 }, { "epoch": 0.04, "grad_norm": 30.0, "grad_norm_var": 2.6098307291666667, "learning_rate": 0.0001, "loss": 7.5232, "loss/crossentropy": 2.0910235196352005, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.20488944984972476, "step": 1600 }, { "epoch": 0.04025, "grad_norm": 30.0, "grad_norm_var": 1.5854166666666667, "learning_rate": 0.0001, "loss": 7.7078, "loss/crossentropy": 2.079045358300209, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.20172932054847478, "step": 1610 }, { "epoch": 0.0405, "grad_norm": 33.5, "grad_norm_var": 3.4368798010809257e+18, "learning_rate": 0.0001, "loss": 7.5441, "loss/crossentropy": 2.146761792898178, "loss/hidden": 3.562890625, "loss/jsd": 0.0, "loss/logits": 0.20237026009708642, "step": 1620 }, { "epoch": 0.04075, "grad_norm": 31.625, "grad_norm_var": 3.4368798024404393e+18, "learning_rate": 0.0001, "loss": 7.7162, "loss/crossentropy": 2.1575208425521852, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.1914537126198411, "step": 1630 }, { "epoch": 0.041, "grad_norm": 29.0, "grad_norm_var": 37.940625, "learning_rate": 0.0001, "loss": 7.5673, "loss/crossentropy": 2.2059059768915175, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.1920375470072031, "step": 1640 }, { "epoch": 0.04125, "grad_norm": 30.75, "grad_norm_var": 61.631184895833336, "learning_rate": 0.0001, "loss": 7.5788, "loss/crossentropy": 2.1531882882118225, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.192771671153605, "step": 1650 }, { "epoch": 0.0415, "grad_norm": 35.25, "grad_norm_var": 47.353125, "learning_rate": 0.0001, "loss": 7.5891, "loss/crossentropy": 2.1217811673879625, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.19420051630586385, "step": 1660 }, { "epoch": 0.04175, "grad_norm": 29.875, "grad_norm_var": 21.580989583333334, "learning_rate": 0.0001, "loss": 7.6361, "loss/crossentropy": 2.0970900297164916, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.21125762071460485, "step": 1670 }, { "epoch": 0.042, "grad_norm": 36.5, "grad_norm_var": 72.77916666666667, "learning_rate": 0.0001, "loss": 7.7045, "loss/crossentropy": 2.1662445843219755, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.21249269619584082, "step": 1680 }, { "epoch": 0.04225, "grad_norm": 32.75, "grad_norm_var": 72.42473958333333, "learning_rate": 0.0001, "loss": 7.5754, "loss/crossentropy": 2.1745404630899428, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.20168567337095739, "step": 1690 }, { "epoch": 0.0425, "grad_norm": 32.5, "grad_norm_var": 9.149739583333334, "learning_rate": 0.0001, "loss": 7.5555, "loss/crossentropy": 2.126581160724163, "loss/hidden": 3.34296875, "loss/jsd": 0.0, "loss/logits": 0.1933064443990588, "step": 1700 }, { "epoch": 0.04275, "grad_norm": 32.0, "grad_norm_var": 10.1369140625, "learning_rate": 0.0001, "loss": 7.5377, "loss/crossentropy": 2.1726800590753554, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.20911512654274703, "step": 1710 }, { "epoch": 0.043, "grad_norm": 31.0, "grad_norm_var": 3.314322916666667, "learning_rate": 0.0001, "loss": 7.6106, "loss/crossentropy": 1.9999253153800964, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.18493321686983108, "step": 1720 }, { "epoch": 0.04325, "grad_norm": 38.0, "grad_norm_var": 6.673958333333333, "learning_rate": 0.0001, "loss": 7.7146, "loss/crossentropy": 2.1180618047714233, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.2076649811118841, "step": 1730 }, { "epoch": 0.0435, "grad_norm": 34.0, "grad_norm_var": 7.339322916666666, "learning_rate": 0.0001, "loss": 7.5464, "loss/crossentropy": 2.0583778262138366, "loss/hidden": 3.322265625, "loss/jsd": 0.0, "loss/logits": 0.18418019600212573, "step": 1740 }, { "epoch": 0.04375, "grad_norm": 31.0, "grad_norm_var": 4.749739583333334, "learning_rate": 0.0001, "loss": 7.5528, "loss/crossentropy": 2.2259044647216797, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.2031107559800148, "step": 1750 }, { "epoch": 0.044, "grad_norm": 30.75, "grad_norm_var": 3.1389973958333335, "learning_rate": 0.0001, "loss": 7.5321, "loss/crossentropy": 2.124616578221321, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.2149519257247448, "step": 1760 }, { "epoch": 0.04425, "grad_norm": 28.625, "grad_norm_var": 6.442708333333333, "learning_rate": 0.0001, "loss": 7.63, "loss/crossentropy": 2.1418310686945916, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.19125681575387715, "step": 1770 }, { "epoch": 0.0445, "grad_norm": 32.25, "grad_norm_var": 3.011458333333333, "learning_rate": 0.0001, "loss": 7.5307, "loss/crossentropy": 2.144363935291767, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19334549438208343, "step": 1780 }, { "epoch": 0.04475, "grad_norm": 32.75, "grad_norm_var": 1.6559895833333333, "learning_rate": 0.0001, "loss": 7.627, "loss/crossentropy": 2.2538854971528055, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.2010333575308323, "step": 1790 }, { "epoch": 0.045, "grad_norm": 31.5, "grad_norm_var": 1.9416015625, "learning_rate": 0.0001, "loss": 7.6471, "loss/crossentropy": 2.091226762533188, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.2152847982943058, "step": 1800 }, { "epoch": 0.04525, "grad_norm": 28.875, "grad_norm_var": 18.03515625, "learning_rate": 0.0001, "loss": 7.5626, "loss/crossentropy": 2.295862782001495, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.20952691733837128, "step": 1810 }, { "epoch": 0.0455, "grad_norm": 30.75, "grad_norm_var": 18.4541015625, "learning_rate": 0.0001, "loss": 7.6872, "loss/crossentropy": 2.0833657890558244, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.2070673793554306, "step": 1820 }, { "epoch": 0.04575, "grad_norm": 30.25, "grad_norm_var": 2.812239583333333, "learning_rate": 0.0001, "loss": 7.647, "loss/crossentropy": 2.1021256439387797, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.18996517434716226, "step": 1830 }, { "epoch": 0.046, "grad_norm": 33.75, "grad_norm_var": 2.7009765625, "learning_rate": 0.0001, "loss": 7.5203, "loss/crossentropy": 2.2004275381565095, "loss/hidden": 3.314453125, "loss/jsd": 0.0, "loss/logits": 0.187726416811347, "step": 1840 }, { "epoch": 0.04625, "grad_norm": 30.0, "grad_norm_var": 4.067643229166666, "learning_rate": 0.0001, "loss": 7.6201, "loss/crossentropy": 2.1705893486738206, "loss/hidden": 3.522265625, "loss/jsd": 0.0, "loss/logits": 0.21116435080766677, "step": 1850 }, { "epoch": 0.0465, "grad_norm": 32.5, "grad_norm_var": 5.030989583333334, "learning_rate": 0.0001, "loss": 7.6235, "loss/crossentropy": 2.1204057022929192, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.19528388790786266, "step": 1860 }, { "epoch": 0.04675, "grad_norm": 45.0, "grad_norm_var": 15.6603515625, "learning_rate": 0.0001, "loss": 7.4562, "loss/crossentropy": 2.154220977425575, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.19214881088119, "step": 1870 }, { "epoch": 0.047, "grad_norm": 32.5, "grad_norm_var": 15.614518229166666, "learning_rate": 0.0001, "loss": 7.6258, "loss/crossentropy": 2.1000912792980673, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.20185065623372794, "step": 1880 }, { "epoch": 0.04725, "grad_norm": 33.25, "grad_norm_var": 5.931184895833334, "learning_rate": 0.0001, "loss": 7.4863, "loss/crossentropy": 2.2270338363945483, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.19286383930593728, "step": 1890 }, { "epoch": 0.0475, "grad_norm": 32.5, "grad_norm_var": 3.311393229166667, "learning_rate": 0.0001, "loss": 7.6001, "loss/crossentropy": 2.1998794853687285, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.19646506551653148, "step": 1900 }, { "epoch": 0.04775, "grad_norm": 31.375, "grad_norm_var": 3.577018229166667, "learning_rate": 0.0001, "loss": 7.653, "loss/crossentropy": 2.104053999483585, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.201510801166296, "step": 1910 }, { "epoch": 0.048, "grad_norm": 30.5, "grad_norm_var": 17.68125, "learning_rate": 0.0001, "loss": 7.5273, "loss/crossentropy": 2.1184800997376443, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.19248204957693815, "step": 1920 }, { "epoch": 0.04825, "grad_norm": 33.5, "grad_norm_var": 1.3, "learning_rate": 0.0001, "loss": 7.655, "loss/crossentropy": 2.1734499007463457, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.1961166137829423, "step": 1930 }, { "epoch": 0.0485, "grad_norm": 30.125, "grad_norm_var": 2.5205729166666666, "learning_rate": 0.0001, "loss": 7.6186, "loss/crossentropy": 2.237542712688446, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.20616979897022247, "step": 1940 }, { "epoch": 0.04875, "grad_norm": 30.75, "grad_norm_var": 2.810416666666667, "learning_rate": 0.0001, "loss": 7.5436, "loss/crossentropy": 2.1811093270778654, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.1928004425019026, "step": 1950 }, { "epoch": 0.049, "grad_norm": 34.25, "grad_norm_var": 1.5181640625, "learning_rate": 0.0001, "loss": 7.6175, "loss/crossentropy": 2.045266591012478, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.21615136358886958, "step": 1960 }, { "epoch": 0.04925, "grad_norm": 33.5, "grad_norm_var": 26.225455729166665, "learning_rate": 0.0001, "loss": 7.7083, "loss/crossentropy": 2.086462992429733, "loss/hidden": 3.469921875, "loss/jsd": 0.0, "loss/logits": 0.2192224683240056, "step": 1970 }, { "epoch": 0.0495, "grad_norm": 29.75, "grad_norm_var": 8.340625, "learning_rate": 0.0001, "loss": 7.5032, "loss/crossentropy": 2.0052025958895685, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.18473264537751674, "step": 1980 }, { "epoch": 0.04975, "grad_norm": 44.5, "grad_norm_var": 4635.33515625, "learning_rate": 0.0001, "loss": 7.5567, "loss/crossentropy": 2.04089385792613, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.19560968028381467, "step": 1990 }, { "epoch": 0.05, "grad_norm": 51.0, "grad_norm_var": 4555.746875, "learning_rate": 0.0001, "loss": 7.6029, "loss/crossentropy": 2.2241372987627983, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.2209097046405077, "step": 2000 }, { "epoch": 0.05025, "grad_norm": 32.5, "grad_norm_var": 49.11608072916667, "learning_rate": 0.0001, "loss": 7.5781, "loss/crossentropy": 2.2160742044448853, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.19539013858884574, "step": 2010 }, { "epoch": 0.0505, "grad_norm": 28.625, "grad_norm_var": 73.70201822916667, "learning_rate": 0.0001, "loss": 7.6375, "loss/crossentropy": 2.094863271713257, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.20763578601181507, "step": 2020 }, { "epoch": 0.05075, "grad_norm": 30.125, "grad_norm_var": 22.494791666666668, "learning_rate": 0.0001, "loss": 7.6164, "loss/crossentropy": 2.121490868926048, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19190637897700072, "step": 2030 }, { "epoch": 0.051, "grad_norm": 34.5, "grad_norm_var": 20.549739583333334, "learning_rate": 0.0001, "loss": 7.5717, "loss/crossentropy": 2.0418698236346247, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.18076165094971658, "step": 2040 }, { "epoch": 0.05125, "grad_norm": 29.125, "grad_norm_var": 14.407291666666667, "learning_rate": 0.0001, "loss": 7.5867, "loss/crossentropy": 2.1989556729793547, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.2011225748807192, "step": 2050 }, { "epoch": 0.0515, "grad_norm": 32.75, "grad_norm_var": 17.120572916666667, "learning_rate": 0.0001, "loss": 7.6813, "loss/crossentropy": 2.073545518517494, "loss/hidden": 3.546484375, "loss/jsd": 0.0, "loss/logits": 0.22323863469064237, "step": 2060 }, { "epoch": 0.05175, "grad_norm": 42.25, "grad_norm_var": 76.27291666666666, "learning_rate": 0.0001, "loss": 7.6323, "loss/crossentropy": 2.1424515694379807, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19750071745365858, "step": 2070 }, { "epoch": 0.052, "grad_norm": 30.625, "grad_norm_var": 74.925, "learning_rate": 0.0001, "loss": 7.6198, "loss/crossentropy": 2.006666135787964, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.19839615989476442, "step": 2080 }, { "epoch": 0.05225, "grad_norm": 41.5, "grad_norm_var": 30.869791666666668, "learning_rate": 0.0001, "loss": 7.5671, "loss/crossentropy": 2.2227450221776963, "loss/hidden": 3.451171875, "loss/jsd": 0.0, "loss/logits": 0.20443473970517517, "step": 2090 }, { "epoch": 0.0525, "grad_norm": 34.75, "grad_norm_var": 24.609309895833334, "learning_rate": 0.0001, "loss": 7.5928, "loss/crossentropy": 2.186553081870079, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.18928833175450563, "step": 2100 }, { "epoch": 0.05275, "grad_norm": 29.125, "grad_norm_var": 18.37265625, "learning_rate": 0.0001, "loss": 7.7389, "loss/crossentropy": 2.0680640071630476, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.201955908536911, "step": 2110 }, { "epoch": 0.053, "grad_norm": 33.5, "grad_norm_var": 9.276497395833333, "learning_rate": 0.0001, "loss": 7.6148, "loss/crossentropy": 2.1981600403785704, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.1901057105511427, "step": 2120 }, { "epoch": 0.05325, "grad_norm": 31.75, "grad_norm_var": 5.308072916666666, "learning_rate": 0.0001, "loss": 7.48, "loss/crossentropy": 2.2574460208415985, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.19361322987824678, "step": 2130 }, { "epoch": 0.0535, "grad_norm": 29.625, "grad_norm_var": 8.5900390625, "learning_rate": 0.0001, "loss": 7.6644, "loss/crossentropy": 2.154197073727846, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.2086074635386467, "step": 2140 }, { "epoch": 0.05375, "grad_norm": 30.5, "grad_norm_var": 8.333072916666667, "learning_rate": 0.0001, "loss": 7.5279, "loss/crossentropy": 2.1245498836040495, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.19000006280839443, "step": 2150 }, { "epoch": 0.054, "grad_norm": 31.375, "grad_norm_var": 10.856184895833334, "learning_rate": 0.0001, "loss": 7.7098, "loss/crossentropy": 2.1968549311161043, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.21285793352872134, "step": 2160 }, { "epoch": 0.05425, "grad_norm": 34.25, "grad_norm_var": 17.820572916666666, "learning_rate": 0.0001, "loss": 7.7095, "loss/crossentropy": 2.2533976465463637, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.20727334953844548, "step": 2170 }, { "epoch": 0.0545, "grad_norm": 36.0, "grad_norm_var": 22.128580729166668, "learning_rate": 0.0001, "loss": 7.5933, "loss/crossentropy": 2.227231651544571, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.19693543761968613, "step": 2180 }, { "epoch": 0.05475, "grad_norm": 29.125, "grad_norm_var": 11.466666666666667, "learning_rate": 0.0001, "loss": 7.5348, "loss/crossentropy": 2.1581582985818386, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.19432583590969443, "step": 2190 }, { "epoch": 0.055, "grad_norm": 37.25, "grad_norm_var": 10.514518229166667, "learning_rate": 0.0001, "loss": 7.6264, "loss/crossentropy": 2.0908095851540565, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.20568534098565577, "step": 2200 }, { "epoch": 0.05525, "grad_norm": 33.25, "grad_norm_var": 5.6494140625, "learning_rate": 0.0001, "loss": 7.671, "loss/crossentropy": 2.0827252097427844, "loss/hidden": 3.54921875, "loss/jsd": 0.0, "loss/logits": 0.20747530292719601, "step": 2210 }, { "epoch": 0.0555, "grad_norm": 31.875, "grad_norm_var": 47.234375, "learning_rate": 0.0001, "loss": 7.5823, "loss/crossentropy": 2.2815075665712357, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.20698099359869956, "step": 2220 }, { "epoch": 0.05575, "grad_norm": 37.5, "grad_norm_var": 30.277018229166668, "learning_rate": 0.0001, "loss": 7.5812, "loss/crossentropy": 2.2282338082790374, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.19244133178144693, "step": 2230 }, { "epoch": 0.056, "grad_norm": 31.0, "grad_norm_var": 27.68515625, "learning_rate": 0.0001, "loss": 7.6485, "loss/crossentropy": 2.184544026851654, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.1996760057285428, "step": 2240 }, { "epoch": 0.05625, "grad_norm": 30.125, "grad_norm_var": 2.223372395833333, "learning_rate": 0.0001, "loss": 7.6003, "loss/crossentropy": 2.1572179198265076, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.20082181300967933, "step": 2250 }, { "epoch": 0.0565, "grad_norm": 63.5, "grad_norm_var": 240.61295572916666, "learning_rate": 0.0001, "loss": 7.6129, "loss/crossentropy": 2.1616927281022074, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.21196384327486156, "step": 2260 }, { "epoch": 0.05675, "grad_norm": 29.875, "grad_norm_var": 276.98932291666665, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.1882350742816925, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.2052389807999134, "step": 2270 }, { "epoch": 0.057, "grad_norm": 33.5, "grad_norm_var": 11.6681640625, "learning_rate": 0.0001, "loss": 7.6076, "loss/crossentropy": 2.1808354407548904, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.21027475781738758, "step": 2280 }, { "epoch": 0.05725, "grad_norm": 30.5, "grad_norm_var": 1.6872395833333333, "learning_rate": 0.0001, "loss": 7.5295, "loss/crossentropy": 2.1488232225179673, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.2111368477344513, "step": 2290 }, { "epoch": 0.0575, "grad_norm": 30.125, "grad_norm_var": 2.0603515625, "learning_rate": 0.0001, "loss": 7.6875, "loss/crossentropy": 2.1594634115695954, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.20478933807462454, "step": 2300 }, { "epoch": 0.05775, "grad_norm": 31.625, "grad_norm_var": 3.408072916666667, "learning_rate": 0.0001, "loss": 7.6636, "loss/crossentropy": 2.0932443618774412, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.21733616031706332, "step": 2310 }, { "epoch": 0.058, "grad_norm": 30.5, "grad_norm_var": 3.5416015625, "learning_rate": 0.0001, "loss": 7.5737, "loss/crossentropy": 2.1831247925758364, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.1989585768431425, "step": 2320 }, { "epoch": 0.05825, "grad_norm": 30.25, "grad_norm_var": 2.2393229166666666, "learning_rate": 0.0001, "loss": 7.5009, "loss/crossentropy": 2.2847615987062455, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.20479252003133297, "step": 2330 }, { "epoch": 0.0585, "grad_norm": 33.0, "grad_norm_var": 6.374934895833333, "learning_rate": 0.0001, "loss": 7.7313, "loss/crossentropy": 2.1928456306457518, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.20380632691085337, "step": 2340 }, { "epoch": 0.05875, "grad_norm": 29.125, "grad_norm_var": 4.7697265625, "learning_rate": 0.0001, "loss": 7.5476, "loss/crossentropy": 2.1438629984855653, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.19849517289549112, "step": 2350 }, { "epoch": 0.059, "grad_norm": 33.5, "grad_norm_var": 3.8893229166666665, "learning_rate": 0.0001, "loss": 7.6545, "loss/crossentropy": 2.1132273241877555, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.21593583207577466, "step": 2360 }, { "epoch": 0.05925, "grad_norm": 38.0, "grad_norm_var": 3.9934895833333335, "learning_rate": 0.0001, "loss": 7.6438, "loss/crossentropy": 2.1729103833436967, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.204007020406425, "step": 2370 }, { "epoch": 0.0595, "grad_norm": 28.875, "grad_norm_var": 4.180989583333333, "learning_rate": 0.0001, "loss": 7.5261, "loss/crossentropy": 2.1972236961126326, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.21200564429163932, "step": 2380 }, { "epoch": 0.05975, "grad_norm": 30.375, "grad_norm_var": 2.9854166666666666, "learning_rate": 0.0001, "loss": 7.6233, "loss/crossentropy": 2.130857673287392, "loss/hidden": 3.398828125, "loss/jsd": 0.0, "loss/logits": 0.2043182048946619, "step": 2390 }, { "epoch": 0.06, "grad_norm": 33.0, "grad_norm_var": 2.552018229166667, "learning_rate": 0.0001, "loss": 7.5626, "loss/crossentropy": 2.1780240714550017, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.19501971434801818, "step": 2400 }, { "epoch": 0.06025, "grad_norm": 33.25, "grad_norm_var": 1.4978515625, "learning_rate": 0.0001, "loss": 7.5495, "loss/crossentropy": 2.075847564637661, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.19856880297884344, "step": 2410 }, { "epoch": 0.0605, "grad_norm": 30.375, "grad_norm_var": 8.763541666666667, "learning_rate": 0.0001, "loss": 7.5161, "loss/crossentropy": 1.960635770857334, "loss/hidden": 3.521484375, "loss/jsd": 0.0, "loss/logits": 0.2014876109547913, "step": 2420 }, { "epoch": 0.06075, "grad_norm": 30.375, "grad_norm_var": 5.85390625, "learning_rate": 0.0001, "loss": 7.6453, "loss/crossentropy": 2.235329329967499, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.22043778821825982, "step": 2430 }, { "epoch": 0.061, "grad_norm": 35.0, "grad_norm_var": 2.928465629875169e+18, "learning_rate": 0.0001, "loss": 7.5153, "loss/crossentropy": 2.0534446865320204, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.20287865065038205, "step": 2440 }, { "epoch": 0.06125, "grad_norm": 31.0, "grad_norm_var": 2.928465629810996e+18, "learning_rate": 0.0001, "loss": 7.5862, "loss/crossentropy": 2.1208725392818453, "loss/hidden": 3.534765625, "loss/jsd": 0.0, "loss/logits": 0.22640016246587039, "step": 2450 }, { "epoch": 0.0615, "grad_norm": 31.125, "grad_norm_var": 1.5614583333333334, "learning_rate": 0.0001, "loss": 7.5751, "loss/crossentropy": 2.206133508682251, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.19071156904101372, "step": 2460 }, { "epoch": 0.06175, "grad_norm": 28.625, "grad_norm_var": 7.667643229166667, "learning_rate": 0.0001, "loss": 7.5604, "loss/crossentropy": 2.227986590564251, "loss/hidden": 3.30859375, "loss/jsd": 0.0, "loss/logits": 0.19392532519996167, "step": 2470 }, { "epoch": 0.062, "grad_norm": 34.0, "grad_norm_var": 5.81015625, "learning_rate": 0.0001, "loss": 7.6077, "loss/crossentropy": 2.1948474526405333, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.2092638686299324, "step": 2480 }, { "epoch": 0.06225, "grad_norm": 34.5, "grad_norm_var": 7.656184895833333, "learning_rate": 0.0001, "loss": 7.4338, "loss/crossentropy": 1.935844713449478, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.184383431263268, "step": 2490 }, { "epoch": 0.0625, "grad_norm": 42.75, "grad_norm_var": 17.27890625, "learning_rate": 0.0001, "loss": 7.5645, "loss/crossentropy": 2.2957077413797378, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.2223764518275857, "step": 2500 }, { "epoch": 0.06275, "grad_norm": 30.125, "grad_norm_var": 31.021809895833332, "learning_rate": 0.0001, "loss": 7.5539, "loss/crossentropy": 2.2163919866085053, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.2129704337567091, "step": 2510 }, { "epoch": 0.063, "grad_norm": 33.25, "grad_norm_var": 40.36764322916667, "learning_rate": 0.0001, "loss": 7.5664, "loss/crossentropy": 2.08260739967227, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20597089193761348, "step": 2520 }, { "epoch": 0.06325, "grad_norm": 35.5, "grad_norm_var": 34.153125, "learning_rate": 0.0001, "loss": 7.559, "loss/crossentropy": 2.0747475802898405, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.1930603832937777, "step": 2530 }, { "epoch": 0.0635, "grad_norm": 31.25, "grad_norm_var": 12.302018229166666, "learning_rate": 0.0001, "loss": 7.609, "loss/crossentropy": 2.1542711734771727, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.19414089974015952, "step": 2540 }, { "epoch": 0.06375, "grad_norm": 29.125, "grad_norm_var": 34.333072916666666, "learning_rate": 0.0001, "loss": 7.5103, "loss/crossentropy": 2.1697633042931557, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.18199986461549997, "step": 2550 }, { "epoch": 0.064, "grad_norm": 30.75, "grad_norm_var": 21.8322265625, "learning_rate": 0.0001, "loss": 7.6335, "loss/crossentropy": 2.20294488966465, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.20988359525799752, "step": 2560 }, { "epoch": 0.06425, "grad_norm": 36.5, "grad_norm_var": 21.339322916666667, "learning_rate": 0.0001, "loss": 7.665, "loss/crossentropy": 2.1367219746112824, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.19768773801624775, "step": 2570 }, { "epoch": 0.0645, "grad_norm": 42.5, "grad_norm_var": 30.690625, "learning_rate": 0.0001, "loss": 7.6476, "loss/crossentropy": 2.2177338540554046, "loss/hidden": 3.605078125, "loss/jsd": 0.0, "loss/logits": 0.22041462864726782, "step": 2580 }, { "epoch": 0.06475, "grad_norm": 34.0, "grad_norm_var": 31.164583333333333, "learning_rate": 0.0001, "loss": 7.6158, "loss/crossentropy": 2.2423059731721877, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.21196699403226377, "step": 2590 }, { "epoch": 0.065, "grad_norm": 34.25, "grad_norm_var": 23.593684895833334, "learning_rate": 0.0001, "loss": 7.5107, "loss/crossentropy": 2.241020438075066, "loss/hidden": 3.25859375, "loss/jsd": 0.0, "loss/logits": 0.18191274981945754, "step": 2600 }, { "epoch": 0.06525, "grad_norm": 28.75, "grad_norm_var": 30.975455729166665, "learning_rate": 0.0001, "loss": 7.5492, "loss/crossentropy": 2.052956056594849, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.2021762602031231, "step": 2610 }, { "epoch": 0.0655, "grad_norm": 32.5, "grad_norm_var": 31.032291666666666, "learning_rate": 0.0001, "loss": 7.503, "loss/crossentropy": 2.17630957365036, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.19023955501616002, "step": 2620 }, { "epoch": 0.06575, "grad_norm": 34.5, "grad_norm_var": 6.083072916666667, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.178831994533539, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.1946109678596258, "step": 2630 }, { "epoch": 0.066, "grad_norm": 33.5, "grad_norm_var": 4.01640625, "learning_rate": 0.0001, "loss": 7.5928, "loss/crossentropy": 2.0746659457683565, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.19469854161143302, "step": 2640 }, { "epoch": 0.06625, "grad_norm": 31.0, "grad_norm_var": 21.718489583333334, "learning_rate": 0.0001, "loss": 7.6008, "loss/crossentropy": 2.202178010344505, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.2072868559509516, "step": 2650 }, { "epoch": 0.0665, "grad_norm": 31.375, "grad_norm_var": 24.743489583333332, "learning_rate": 0.0001, "loss": 7.5532, "loss/crossentropy": 2.0579031884670256, "loss/hidden": 3.33671875, "loss/jsd": 0.0, "loss/logits": 0.18248077742755414, "step": 2660 }, { "epoch": 0.06675, "grad_norm": 29.5, "grad_norm_var": 6.5181640625, "learning_rate": 0.0001, "loss": 7.6173, "loss/crossentropy": 2.0323975652456285, "loss/hidden": 3.47265625, "loss/jsd": 0.0, "loss/logits": 0.20279558952897786, "step": 2670 }, { "epoch": 0.067, "grad_norm": 30.125, "grad_norm_var": 9.7931640625, "learning_rate": 0.0001, "loss": 7.6202, "loss/crossentropy": 2.170803511887789, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.21887149531394243, "step": 2680 }, { "epoch": 0.06725, "grad_norm": 28.75, "grad_norm_var": 6.42890625, "learning_rate": 0.0001, "loss": 7.6042, "loss/crossentropy": 2.129136848449707, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.21714741103351115, "step": 2690 }, { "epoch": 0.0675, "grad_norm": 28.5, "grad_norm_var": 5.76640625, "learning_rate": 0.0001, "loss": 7.5307, "loss/crossentropy": 2.2333458453416823, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.19759367052465676, "step": 2700 }, { "epoch": 0.06775, "grad_norm": 31.25, "grad_norm_var": 4.925, "learning_rate": 0.0001, "loss": 7.5684, "loss/crossentropy": 2.1930347234010696, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.19194095116108656, "step": 2710 }, { "epoch": 0.068, "grad_norm": 29.875, "grad_norm_var": 1.0291666666666666, "learning_rate": 0.0001, "loss": 7.6177, "loss/crossentropy": 2.1858380883932114, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18830202352255582, "step": 2720 }, { "epoch": 0.06825, "grad_norm": 33.25, "grad_norm_var": 5.19765625, "learning_rate": 0.0001, "loss": 7.6231, "loss/crossentropy": 2.1380992412567137, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.19919480197131634, "step": 2730 }, { "epoch": 0.0685, "grad_norm": 34.75, "grad_norm_var": 2.5872395833333335, "learning_rate": 0.0001, "loss": 7.7166, "loss/crossentropy": 2.299161267280579, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.21277474984526634, "step": 2740 }, { "epoch": 0.06875, "grad_norm": 28.25, "grad_norm_var": 10.6525390625, "learning_rate": 0.0001, "loss": 7.6385, "loss/crossentropy": 2.0595856219530106, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.20196862574666738, "step": 2750 }, { "epoch": 0.069, "grad_norm": 30.0, "grad_norm_var": 3.1885416666666666, "learning_rate": 0.0001, "loss": 7.5306, "loss/crossentropy": 2.1047334015369414, "loss/hidden": 3.527734375, "loss/jsd": 0.0, "loss/logits": 0.20494798701256514, "step": 2760 }, { "epoch": 0.06925, "grad_norm": 35.5, "grad_norm_var": 2.5136418842603423e+18, "learning_rate": 0.0001, "loss": 7.5758, "loss/crossentropy": 1.9750292956829072, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.1999529790133238, "step": 2770 }, { "epoch": 0.0695, "grad_norm": 30.875, "grad_norm_var": 2.513641880435452e+18, "learning_rate": 0.0001, "loss": 7.5901, "loss/crossentropy": 2.1417267471551895, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.20058641098439695, "step": 2780 }, { "epoch": 0.06975, "grad_norm": 28.25, "grad_norm_var": 155.97024739583333, "learning_rate": 0.0001, "loss": 7.5485, "loss/crossentropy": 2.1048891723155974, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.19070767909288405, "step": 2790 }, { "epoch": 0.07, "grad_norm": 32.25, "grad_norm_var": 6.499739583333334, "learning_rate": 0.0001, "loss": 7.5996, "loss/crossentropy": 2.176823168247938, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.20276176873594523, "step": 2800 }, { "epoch": 0.07025, "grad_norm": 29.5, "grad_norm_var": 4.684309895833334, "learning_rate": 0.0001, "loss": 7.4426, "loss/crossentropy": 2.041793665289879, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.17560106106102466, "step": 2810 }, { "epoch": 0.0705, "grad_norm": 31.75, "grad_norm_var": 3.005989583333333, "learning_rate": 0.0001, "loss": 7.5811, "loss/crossentropy": 2.1094699330627917, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.19798500649631023, "step": 2820 }, { "epoch": 0.07075, "grad_norm": 33.75, "grad_norm_var": 4.320572916666666, "learning_rate": 0.0001, "loss": 7.5475, "loss/crossentropy": 2.1311514347791674, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.21825175136327743, "step": 2830 }, { "epoch": 0.071, "grad_norm": 34.75, "grad_norm_var": 3.7549465266724797e+18, "learning_rate": 0.0001, "loss": 7.5823, "loss/crossentropy": 2.120433983206749, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.20115374326705932, "step": 2840 }, { "epoch": 0.07125, "grad_norm": 29.25, "grad_norm_var": 3.7549465268743306e+18, "learning_rate": 0.0001, "loss": 7.5444, "loss/crossentropy": 2.1518867701292037, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.19730570819228888, "step": 2850 }, { "epoch": 0.0715, "grad_norm": 31.0, "grad_norm_var": 2.317122395833333, "learning_rate": 0.0001, "loss": 7.6362, "loss/crossentropy": 2.137280356884003, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.20695031639188527, "step": 2860 }, { "epoch": 0.07175, "grad_norm": 34.0, "grad_norm_var": 157.31015625, "learning_rate": 0.0001, "loss": 7.6643, "loss/crossentropy": 2.067914080619812, "loss/hidden": 3.5015625, "loss/jsd": 0.0, "loss/logits": 0.22938031535595654, "step": 2870 }, { "epoch": 0.072, "grad_norm": 32.5, "grad_norm_var": 151.77057291666668, "learning_rate": 0.0001, "loss": 7.5762, "loss/crossentropy": 2.0962916165590286, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.19928023852407933, "step": 2880 }, { "epoch": 0.07225, "grad_norm": 35.75, "grad_norm_var": 104.17962239583333, "learning_rate": 0.0001, "loss": 7.7066, "loss/crossentropy": 2.1184468276798727, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.1974171632900834, "step": 2890 }, { "epoch": 0.0725, "grad_norm": 32.25, "grad_norm_var": 116.02493489583334, "learning_rate": 0.0001, "loss": 7.6075, "loss/crossentropy": 2.1381339877843857, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.19655965138226747, "step": 2900 }, { "epoch": 0.07275, "grad_norm": 31.5, "grad_norm_var": 1.9551432291666666, "learning_rate": 0.0001, "loss": 7.5247, "loss/crossentropy": 2.141963595151901, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.19582534320652484, "step": 2910 }, { "epoch": 0.073, "grad_norm": 31.875, "grad_norm_var": 1.3014973958333333, "learning_rate": 0.0001, "loss": 7.5588, "loss/crossentropy": 2.077046422660351, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.20551967658102513, "step": 2920 }, { "epoch": 0.07325, "grad_norm": 32.25, "grad_norm_var": 8.734375, "learning_rate": 0.0001, "loss": 7.6195, "loss/crossentropy": 2.0110410653054713, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.1805876674130559, "step": 2930 }, { "epoch": 0.0735, "grad_norm": 32.25, "grad_norm_var": 4838.416666666667, "learning_rate": 0.0001, "loss": 7.6623, "loss/crossentropy": 2.1130379527807235, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.2008265011012554, "step": 2940 }, { "epoch": 0.07375, "grad_norm": 39.0, "grad_norm_var": 57.90807291666667, "learning_rate": 0.0001, "loss": 7.59, "loss/crossentropy": 2.1667733818292616, "loss/hidden": 3.3109375, "loss/jsd": 0.0, "loss/logits": 0.19806477334350348, "step": 2950 }, { "epoch": 0.074, "grad_norm": 31.875, "grad_norm_var": 31.228059895833333, "learning_rate": 0.0001, "loss": 7.5131, "loss/crossentropy": 2.2757086992263793, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.197940625064075, "step": 2960 }, { "epoch": 0.07425, "grad_norm": 34.25, "grad_norm_var": 2.996809895833333, "learning_rate": 0.0001, "loss": 7.5841, "loss/crossentropy": 2.143115535378456, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.20177703239023687, "step": 2970 }, { "epoch": 0.0745, "grad_norm": 31.0, "grad_norm_var": 3.1546223958333335, "learning_rate": 0.0001, "loss": 7.5363, "loss/crossentropy": 2.2850147604942324, "loss/hidden": 3.283984375, "loss/jsd": 0.0, "loss/logits": 0.19140432458370923, "step": 2980 }, { "epoch": 0.07475, "grad_norm": 32.25, "grad_norm_var": 7.310416666666667, "learning_rate": 0.0001, "loss": 7.5339, "loss/crossentropy": 2.104042625427246, "loss/hidden": 3.490625, "loss/jsd": 0.0, "loss/logits": 0.211691821180284, "step": 2990 }, { "epoch": 0.075, "grad_norm": 32.5, "grad_norm_var": 8.506705729166667, "learning_rate": 0.0001, "loss": 7.5448, "loss/crossentropy": 2.2270253866910936, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.20328052509576083, "step": 3000 }, { "epoch": 0.07525, "grad_norm": 46.75, "grad_norm_var": 17.10390625, "learning_rate": 0.0001, "loss": 7.6114, "loss/crossentropy": 2.2623429775238035, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.19479246698319913, "step": 3010 }, { "epoch": 0.0755, "grad_norm": 31.75, "grad_norm_var": 18.981184895833334, "learning_rate": 0.0001, "loss": 7.5739, "loss/crossentropy": 2.1012531995773314, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.20767469964921476, "step": 3020 }, { "epoch": 0.07575, "grad_norm": 45.25, "grad_norm_var": 15.55, "learning_rate": 0.0001, "loss": 7.6153, "loss/crossentropy": 2.090057593584061, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.1894347405061126, "step": 3030 }, { "epoch": 0.076, "grad_norm": 32.0, "grad_norm_var": 29.878125, "learning_rate": 0.0001, "loss": 7.6585, "loss/crossentropy": 2.200800988078117, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.19270651414990425, "step": 3040 }, { "epoch": 0.07625, "grad_norm": 31.0, "grad_norm_var": 10.167708333333334, "learning_rate": 0.0001, "loss": 7.5696, "loss/crossentropy": 2.0458697080612183, "loss/hidden": 3.53046875, "loss/jsd": 0.0, "loss/logits": 0.20314501021057368, "step": 3050 }, { "epoch": 0.0765, "grad_norm": 30.375, "grad_norm_var": 10.0181640625, "learning_rate": 0.0001, "loss": 7.5747, "loss/crossentropy": 2.2118399769067763, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.22158339023590087, "step": 3060 }, { "epoch": 0.07675, "grad_norm": 32.25, "grad_norm_var": 0.9650390625, "learning_rate": 0.0001, "loss": 7.5631, "loss/crossentropy": 2.1812553733587263, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.217013025470078, "step": 3070 }, { "epoch": 0.077, "grad_norm": 32.75, "grad_norm_var": 0.9171223958333333, "learning_rate": 0.0001, "loss": 7.578, "loss/crossentropy": 2.181536224484444, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.19952490609139203, "step": 3080 }, { "epoch": 0.07725, "grad_norm": 31.5, "grad_norm_var": 19.015559895833334, "learning_rate": 0.0001, "loss": 7.4878, "loss/crossentropy": 2.122265163064003, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.19191155321896075, "step": 3090 }, { "epoch": 0.0775, "grad_norm": 32.75, "grad_norm_var": 5.199739583333334, "learning_rate": 0.0001, "loss": 7.7471, "loss/crossentropy": 2.210584083199501, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.21269479542970657, "step": 3100 }, { "epoch": 0.07775, "grad_norm": 33.25, "grad_norm_var": 4.490625, "learning_rate": 0.0001, "loss": 7.6586, "loss/crossentropy": 2.090894425660372, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.19853799045085907, "step": 3110 }, { "epoch": 0.078, "grad_norm": 31.375, "grad_norm_var": 27.370833333333334, "learning_rate": 0.0001, "loss": 7.5818, "loss/crossentropy": 2.044330509006977, "loss/hidden": 3.50078125, "loss/jsd": 0.0, "loss/logits": 0.20974230151623488, "step": 3120 }, { "epoch": 0.07825, "grad_norm": 31.75, "grad_norm_var": 6.688997395833334, "learning_rate": 0.0001, "loss": 7.654, "loss/crossentropy": 2.1380004197359086, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.20455153118818997, "step": 3130 }, { "epoch": 0.0785, "grad_norm": 29.875, "grad_norm_var": 7.715625, "learning_rate": 0.0001, "loss": 7.6551, "loss/crossentropy": 2.1576705113053323, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.20728315506130457, "step": 3140 }, { "epoch": 0.07875, "grad_norm": 30.875, "grad_norm_var": 3.2864583333333335, "learning_rate": 0.0001, "loss": 7.5478, "loss/crossentropy": 2.233546493947506, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.20845879297703504, "step": 3150 }, { "epoch": 0.079, "grad_norm": 34.25, "grad_norm_var": 15.4728515625, "learning_rate": 0.0001, "loss": 7.6264, "loss/crossentropy": 2.0873878210783006, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.19576258175075054, "step": 3160 }, { "epoch": 0.07925, "grad_norm": 29.625, "grad_norm_var": 3.12265625, "learning_rate": 0.0001, "loss": 7.5192, "loss/crossentropy": 2.132347696274519, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.1958464809693396, "step": 3170 }, { "epoch": 0.0795, "grad_norm": 33.0, "grad_norm_var": 3.370247395833333, "learning_rate": 0.0001, "loss": 7.5169, "loss/crossentropy": 2.2280534476041796, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.19412651136517525, "step": 3180 }, { "epoch": 0.07975, "grad_norm": 28.25, "grad_norm_var": 4.989518229166666, "learning_rate": 0.0001, "loss": 7.6576, "loss/crossentropy": 2.138143754005432, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.23302078600972892, "step": 3190 }, { "epoch": 0.08, "grad_norm": 30.375, "grad_norm_var": 6.858072916666667, "learning_rate": 0.0001, "loss": 7.5707, "loss/crossentropy": 2.1512865126132965, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.1957532402127981, "step": 3200 }, { "epoch": 0.08025, "grad_norm": 39.0, "grad_norm_var": 34.83515625, "learning_rate": 0.0001, "loss": 7.6814, "loss/crossentropy": 2.113873428106308, "loss/hidden": 3.52890625, "loss/jsd": 0.0, "loss/logits": 0.22486987188458443, "step": 3210 }, { "epoch": 0.0805, "grad_norm": 30.625, "grad_norm_var": 33.244205729166666, "learning_rate": 0.0001, "loss": 7.6542, "loss/crossentropy": 2.153007471561432, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.20893471594899893, "step": 3220 }, { "epoch": 0.08075, "grad_norm": 31.25, "grad_norm_var": 619.4145182291667, "learning_rate": 0.0001, "loss": 7.5091, "loss/crossentropy": 2.169908273220062, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.20535510070621968, "step": 3230 }, { "epoch": 0.081, "grad_norm": 30.0, "grad_norm_var": 599.9247395833333, "learning_rate": 0.0001, "loss": 7.5568, "loss/crossentropy": 2.048283484578133, "loss/hidden": 3.593359375, "loss/jsd": 0.0, "loss/logits": 0.21250182073563337, "step": 3240 }, { "epoch": 0.08125, "grad_norm": 30.5, "grad_norm_var": 2.982291666666667, "learning_rate": 0.0001, "loss": 7.5569, "loss/crossentropy": 2.0828478574752807, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.20137296654284, "step": 3250 }, { "epoch": 0.0815, "grad_norm": 28.25, "grad_norm_var": 17.9900390625, "learning_rate": 0.0001, "loss": 7.5869, "loss/crossentropy": 2.28429861664772, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.21088685300201176, "step": 3260 }, { "epoch": 0.08175, "grad_norm": 32.25, "grad_norm_var": 2.5587890625, "learning_rate": 0.0001, "loss": 7.6037, "loss/crossentropy": 2.1336950808763504, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.20057348478585482, "step": 3270 }, { "epoch": 0.082, "grad_norm": 33.75, "grad_norm_var": 34.57076822916667, "learning_rate": 0.0001, "loss": 7.6129, "loss/crossentropy": 2.0662791609764097, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.22612145096063613, "step": 3280 }, { "epoch": 0.08225, "grad_norm": 32.75, "grad_norm_var": 18.8181640625, "learning_rate": 0.0001, "loss": 7.5469, "loss/crossentropy": 2.069541847705841, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.1844556663185358, "step": 3290 }, { "epoch": 0.0825, "grad_norm": 29.375, "grad_norm_var": 3.0712890625, "learning_rate": 0.0001, "loss": 7.6025, "loss/crossentropy": 2.2499852567911147, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.21941267363727093, "step": 3300 }, { "epoch": 0.08275, "grad_norm": 33.0, "grad_norm_var": 1.29140625, "learning_rate": 0.0001, "loss": 7.6334, "loss/crossentropy": 2.1277933359146117, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.19795072823762894, "step": 3310 }, { "epoch": 0.083, "grad_norm": 34.0, "grad_norm_var": 3.41015625, "learning_rate": 0.0001, "loss": 7.6326, "loss/crossentropy": 2.214772176742554, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.20066177062690257, "step": 3320 }, { "epoch": 0.08325, "grad_norm": 29.875, "grad_norm_var": 18.768489583333334, "learning_rate": 0.0001, "loss": 7.569, "loss/crossentropy": 2.093920087814331, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2021244278177619, "step": 3330 }, { "epoch": 0.0835, "grad_norm": 30.25, "grad_norm_var": 18.695833333333333, "learning_rate": 0.0001, "loss": 7.5041, "loss/crossentropy": 2.0556397035717966, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.17018966227769852, "step": 3340 }, { "epoch": 0.08375, "grad_norm": 28.875, "grad_norm_var": 2.854622395833333, "learning_rate": 0.0001, "loss": 7.5837, "loss/crossentropy": 2.0227382972836496, "loss/hidden": 3.4890625, "loss/jsd": 0.0, "loss/logits": 0.21123163159936667, "step": 3350 }, { "epoch": 0.084, "grad_norm": 33.5, "grad_norm_var": 2.894791666666667, "learning_rate": 0.0001, "loss": 7.5374, "loss/crossentropy": 2.146658593416214, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18999559991061687, "step": 3360 }, { "epoch": 0.08425, "grad_norm": 32.5, "grad_norm_var": 2.5759765625, "learning_rate": 0.0001, "loss": 7.6197, "loss/crossentropy": 1.9798088841140271, "loss/hidden": 3.569921875, "loss/jsd": 0.0, "loss/logits": 0.20031734639778734, "step": 3370 }, { "epoch": 0.0845, "grad_norm": 30.5, "grad_norm_var": 1.6436848958333334, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.1442878276109694, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.20664554908871652, "step": 3380 }, { "epoch": 0.08475, "grad_norm": 29.25, "grad_norm_var": 1.9875, "learning_rate": 0.0001, "loss": 7.4965, "loss/crossentropy": 2.150037130713463, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.19511928483843805, "step": 3390 }, { "epoch": 0.085, "grad_norm": 32.25, "grad_norm_var": 2.2122395833333335, "learning_rate": 0.0001, "loss": 7.5151, "loss/crossentropy": 2.0501646161079408, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.19615819547325372, "step": 3400 }, { "epoch": 0.08525, "grad_norm": 33.75, "grad_norm_var": 2.720833333333333, "learning_rate": 0.0001, "loss": 7.6432, "loss/crossentropy": 2.2328430742025374, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19816372059285642, "step": 3410 }, { "epoch": 0.0855, "grad_norm": 31.375, "grad_norm_var": 7.519791666666666, "learning_rate": 0.0001, "loss": 7.665, "loss/crossentropy": 2.1567111521959306, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.19839788805693387, "step": 3420 }, { "epoch": 0.08575, "grad_norm": 40.25, "grad_norm_var": 7.808072916666666, "learning_rate": 0.0001, "loss": 7.5919, "loss/crossentropy": 2.2465982705354692, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.21547308284789324, "step": 3430 }, { "epoch": 0.086, "grad_norm": 33.0, "grad_norm_var": 8.09765625, "learning_rate": 0.0001, "loss": 7.6239, "loss/crossentropy": 2.1680932968854902, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.21484404131770135, "step": 3440 }, { "epoch": 0.08625, "grad_norm": 28.625, "grad_norm_var": 3.070572916666667, "learning_rate": 0.0001, "loss": 7.4802, "loss/crossentropy": 2.050510385632515, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.18613745234906673, "step": 3450 }, { "epoch": 0.0865, "grad_norm": 33.5, "grad_norm_var": 2.2143229166666667, "learning_rate": 0.0001, "loss": 7.5311, "loss/crossentropy": 2.0699805982410906, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.1896037317812443, "step": 3460 }, { "epoch": 0.08675, "grad_norm": 31.875, "grad_norm_var": 5.651822916666666, "learning_rate": 0.0001, "loss": 7.596, "loss/crossentropy": 2.0576356425881386, "loss/hidden": 3.522265625, "loss/jsd": 0.0, "loss/logits": 0.20867941789329053, "step": 3470 }, { "epoch": 0.087, "grad_norm": 29.625, "grad_norm_var": 5.998372395833333, "learning_rate": 0.0001, "loss": 7.4738, "loss/crossentropy": 2.2489717990159988, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.18595699593424797, "step": 3480 }, { "epoch": 0.08725, "grad_norm": 28.5, "grad_norm_var": 5.2056640625, "learning_rate": 0.0001, "loss": 7.5246, "loss/crossentropy": 2.178911143541336, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.204429741948843, "step": 3490 }, { "epoch": 0.0875, "grad_norm": 28.25, "grad_norm_var": 1.8134765625, "learning_rate": 0.0001, "loss": 7.4901, "loss/crossentropy": 2.1816289871931076, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.20369651466608046, "step": 3500 }, { "epoch": 0.08775, "grad_norm": 32.25, "grad_norm_var": 59.895572916666666, "learning_rate": 0.0001, "loss": 7.6056, "loss/crossentropy": 2.124045217037201, "loss/hidden": 3.562890625, "loss/jsd": 0.0, "loss/logits": 0.2361205333843827, "step": 3510 }, { "epoch": 0.088, "grad_norm": 31.375, "grad_norm_var": 58.15358072916667, "learning_rate": 0.0001, "loss": 7.6185, "loss/crossentropy": 2.0576027542352677, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.19163726801052688, "step": 3520 }, { "epoch": 0.08825, "grad_norm": 30.875, "grad_norm_var": 19.622330729166666, "learning_rate": 0.0001, "loss": 7.608, "loss/crossentropy": 2.2166375398635862, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.20556784830987454, "step": 3530 }, { "epoch": 0.0885, "grad_norm": 33.25, "grad_norm_var": 3.76875, "learning_rate": 0.0001, "loss": 7.5655, "loss/crossentropy": 2.215288892388344, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.21560241151601076, "step": 3540 }, { "epoch": 0.08875, "grad_norm": 30.0, "grad_norm_var": 3.316666666666667, "learning_rate": 0.0001, "loss": 7.4909, "loss/crossentropy": 2.0867604553699493, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.1992955395951867, "step": 3550 }, { "epoch": 0.089, "grad_norm": 33.25, "grad_norm_var": 286.81555989583336, "learning_rate": 0.0001, "loss": 7.716, "loss/crossentropy": 2.2209325939416886, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.19783683270215988, "step": 3560 }, { "epoch": 0.08925, "grad_norm": 34.75, "grad_norm_var": 286.8197916666667, "learning_rate": 0.0001, "loss": 7.5905, "loss/crossentropy": 2.098256954550743, "loss/hidden": 3.309765625, "loss/jsd": 0.0, "loss/logits": 0.184653827175498, "step": 3570 }, { "epoch": 0.0895, "grad_norm": 31.125, "grad_norm_var": 5.868489583333333, "learning_rate": 0.0001, "loss": 7.625, "loss/crossentropy": 2.0555992782115937, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.19005871675908564, "step": 3580 }, { "epoch": 0.08975, "grad_norm": 37.0, "grad_norm_var": 7.585416666666666, "learning_rate": 0.0001, "loss": 7.542, "loss/crossentropy": 2.0350914053618907, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.192917075753212, "step": 3590 }, { "epoch": 0.09, "grad_norm": 34.25, "grad_norm_var": 26.8322265625, "learning_rate": 0.0001, "loss": 7.6028, "loss/crossentropy": 2.249291920661926, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.20490463990718127, "step": 3600 }, { "epoch": 0.09025, "grad_norm": 30.375, "grad_norm_var": 2.9457682291666667, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.201851597428322, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.20158183835446836, "step": 3610 }, { "epoch": 0.0905, "grad_norm": 30.125, "grad_norm_var": 14.446809895833333, "learning_rate": 0.0001, "loss": 7.5562, "loss/crossentropy": 2.188534340262413, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.22296689171344042, "step": 3620 }, { "epoch": 0.09075, "grad_norm": 32.0, "grad_norm_var": 15.792643229166666, "learning_rate": 0.0001, "loss": 7.5509, "loss/crossentropy": 2.1134337186813354, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.19732800796627997, "step": 3630 }, { "epoch": 0.091, "grad_norm": 29.875, "grad_norm_var": 14.366080729166667, "learning_rate": 0.0001, "loss": 7.6349, "loss/crossentropy": 2.0555363953113557, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.19681458938866853, "step": 3640 }, { "epoch": 0.09125, "grad_norm": 33.25, "grad_norm_var": 13.740559895833334, "learning_rate": 0.0001, "loss": 7.6513, "loss/crossentropy": 2.2402344048023224, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.20878477580845356, "step": 3650 }, { "epoch": 0.0915, "grad_norm": 32.5, "grad_norm_var": 30.0150390625, "learning_rate": 0.0001, "loss": 7.5723, "loss/crossentropy": 2.14640394449234, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.20020358953624964, "step": 3660 }, { "epoch": 0.09175, "grad_norm": 36.0, "grad_norm_var": 49.2619140625, "learning_rate": 0.0001, "loss": 7.5782, "loss/crossentropy": 2.2063133299350737, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19907979741692544, "step": 3670 }, { "epoch": 0.092, "grad_norm": 32.0, "grad_norm_var": 15.4400390625, "learning_rate": 0.0001, "loss": 7.5501, "loss/crossentropy": 2.1933626160025597, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.19394716806709766, "step": 3680 }, { "epoch": 0.09225, "grad_norm": 28.375, "grad_norm_var": 13.159375, "learning_rate": 0.0001, "loss": 7.5258, "loss/crossentropy": 2.0778674989938737, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.19343881569802762, "step": 3690 }, { "epoch": 0.0925, "grad_norm": 32.5, "grad_norm_var": 15.096809895833333, "learning_rate": 0.0001, "loss": 7.4597, "loss/crossentropy": 2.0565507017076015, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.19122497290372847, "step": 3700 }, { "epoch": 0.09275, "grad_norm": 29.625, "grad_norm_var": 2.729622395833333, "learning_rate": 0.0001, "loss": 7.5714, "loss/crossentropy": 2.082234078645706, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.2102882768958807, "step": 3710 }, { "epoch": 0.093, "grad_norm": 30.0, "grad_norm_var": 2.6254557291666667, "learning_rate": 0.0001, "loss": 7.6155, "loss/crossentropy": 2.2312920093536377, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.18575988691300155, "step": 3720 }, { "epoch": 0.09325, "grad_norm": 33.25, "grad_norm_var": 1.6181640625, "learning_rate": 0.0001, "loss": 7.5344, "loss/crossentropy": 2.2397233605384828, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.20670964010059834, "step": 3730 }, { "epoch": 0.0935, "grad_norm": 33.5, "grad_norm_var": 3.3124348958333334, "learning_rate": 0.0001, "loss": 7.5839, "loss/crossentropy": 2.2065307170152666, "loss/hidden": 3.348046875, "loss/jsd": 0.0, "loss/logits": 0.20138480551540852, "step": 3740 }, { "epoch": 0.09375, "grad_norm": 36.25, "grad_norm_var": 9.07265625, "learning_rate": 0.0001, "loss": 7.6468, "loss/crossentropy": 2.192644628882408, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.20751077253371478, "step": 3750 }, { "epoch": 0.094, "grad_norm": 32.25, "grad_norm_var": 18.11015625, "learning_rate": 0.0001, "loss": 7.731, "loss/crossentropy": 2.205154886841774, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.20712865255773066, "step": 3760 }, { "epoch": 0.09425, "grad_norm": 30.75, "grad_norm_var": 30.259309895833333, "learning_rate": 0.0001, "loss": 7.4267, "loss/crossentropy": 2.0174816213548183, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.1771852731704712, "step": 3770 }, { "epoch": 0.0945, "grad_norm": 32.75, "grad_norm_var": 2.687239583333333, "learning_rate": 0.0001, "loss": 7.5078, "loss/crossentropy": 2.142752841114998, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.2145843595266342, "step": 3780 }, { "epoch": 0.09475, "grad_norm": 41.5, "grad_norm_var": 28.512239583333333, "learning_rate": 0.0001, "loss": 7.5586, "loss/crossentropy": 2.2184203058481216, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.2169014524668455, "step": 3790 }, { "epoch": 0.095, "grad_norm": 32.5, "grad_norm_var": 29.637955729166666, "learning_rate": 0.0001, "loss": 7.5796, "loss/crossentropy": 2.0424424298107624, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.19471338465809823, "step": 3800 }, { "epoch": 0.09525, "grad_norm": 31.25, "grad_norm_var": 2.846809895833333, "learning_rate": 0.0001, "loss": 7.4697, "loss/crossentropy": 2.2178969264030455, "loss/hidden": 3.215625, "loss/jsd": 0.0, "loss/logits": 0.18599100317806005, "step": 3810 }, { "epoch": 0.0955, "grad_norm": 35.75, "grad_norm_var": 5.571809895833334, "learning_rate": 0.0001, "loss": 7.5899, "loss/crossentropy": 2.147325333952904, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.19797458127141, "step": 3820 }, { "epoch": 0.09575, "grad_norm": 31.0, "grad_norm_var": 19.242708333333333, "learning_rate": 0.0001, "loss": 7.4524, "loss/crossentropy": 2.222689136862755, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.20618323031812907, "step": 3830 }, { "epoch": 0.096, "grad_norm": 31.125, "grad_norm_var": 6.659309895833333, "learning_rate": 0.0001, "loss": 7.5371, "loss/crossentropy": 2.045217160880566, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.21105091590434313, "step": 3840 }, { "epoch": 0.09625, "grad_norm": 29.5, "grad_norm_var": 3.6624348958333335, "learning_rate": 0.0001, "loss": 7.4832, "loss/crossentropy": 2.1246922612190247, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.19520843997597695, "step": 3850 }, { "epoch": 0.0965, "grad_norm": 32.25, "grad_norm_var": 3.840625, "learning_rate": 0.0001, "loss": 7.5247, "loss/crossentropy": 2.1101455599069596, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.19655006285756826, "step": 3860 }, { "epoch": 0.09675, "grad_norm": 31.375, "grad_norm_var": 4.0759765625, "learning_rate": 0.0001, "loss": 7.7123, "loss/crossentropy": 2.1925098091363906, "loss/hidden": 3.57265625, "loss/jsd": 0.0, "loss/logits": 0.21303071565926074, "step": 3870 }, { "epoch": 0.097, "grad_norm": 32.25, "grad_norm_var": 2.940559895833333, "learning_rate": 0.0001, "loss": 7.6194, "loss/crossentropy": 2.0254321210086346, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.21549067068845035, "step": 3880 }, { "epoch": 0.09725, "grad_norm": 32.75, "grad_norm_var": 1.6895833333333334, "learning_rate": 0.0001, "loss": 7.5341, "loss/crossentropy": 2.18448192179203, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.18761022239923478, "step": 3890 }, { "epoch": 0.0975, "grad_norm": 32.5, "grad_norm_var": 14.326822916666666, "learning_rate": 0.0001, "loss": 7.6378, "loss/crossentropy": 2.0941652059555054, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.20138136427849532, "step": 3900 }, { "epoch": 0.09775, "grad_norm": 31.875, "grad_norm_var": 2.1186848958333333, "learning_rate": 0.0001, "loss": 7.6588, "loss/crossentropy": 2.09155390933156, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.2071656842716038, "step": 3910 }, { "epoch": 0.098, "grad_norm": 32.5, "grad_norm_var": 3.729622395833333, "learning_rate": 0.0001, "loss": 7.6663, "loss/crossentropy": 2.1858886659145353, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.20006632767617702, "step": 3920 }, { "epoch": 0.09825, "grad_norm": 40.25, "grad_norm_var": 15.908072916666667, "learning_rate": 0.0001, "loss": 7.5567, "loss/crossentropy": 2.2354005187749864, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.20157534964382648, "step": 3930 }, { "epoch": 0.0985, "grad_norm": 31.625, "grad_norm_var": 17.745768229166668, "learning_rate": 0.0001, "loss": 7.5189, "loss/crossentropy": 2.015377716720104, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.1868499366566539, "step": 3940 }, { "epoch": 0.09875, "grad_norm": 33.0, "grad_norm_var": 7.66640625, "learning_rate": 0.0001, "loss": 7.533, "loss/crossentropy": 2.0591939449310304, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.20258171651512386, "step": 3950 }, { "epoch": 0.099, "grad_norm": 41.25, "grad_norm_var": 3.3442041663272433e+18, "learning_rate": 0.0001, "loss": 7.4832, "loss/crossentropy": 2.1117863088846205, "loss/hidden": 3.559375, "loss/jsd": 0.0, "loss/logits": 0.2428264247253537, "step": 3960 }, { "epoch": 0.09925, "grad_norm": 30.875, "grad_norm_var": 3.3442041639803904e+18, "learning_rate": 0.0001, "loss": 7.6253, "loss/crossentropy": 2.1176558315753935, "loss/hidden": 3.55546875, "loss/jsd": 0.0, "loss/logits": 0.22726768516004087, "step": 3970 }, { "epoch": 0.0995, "grad_norm": 31.25, "grad_norm_var": 19.5384765625, "learning_rate": 0.0001, "loss": 7.4413, "loss/crossentropy": 2.088257111608982, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.19193989606574177, "step": 3980 }, { "epoch": 0.09975, "grad_norm": 32.75, "grad_norm_var": 2.28125, "learning_rate": 0.0001, "loss": 7.6017, "loss/crossentropy": 2.134918417036533, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.19602251183241606, "step": 3990 }, { "epoch": 0.1, "grad_norm": 32.25, "grad_norm_var": 6.172916666666667, "learning_rate": 0.0001, "loss": 7.5138, "loss/crossentropy": 2.1410045489668845, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.20588791109621524, "step": 4000 }, { "epoch": 0.10025, "grad_norm": 30.0, "grad_norm_var": 11.4306640625, "learning_rate": 0.0001, "loss": 7.5549, "loss/crossentropy": 2.241489386558533, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.20121449399739505, "step": 4010 }, { "epoch": 0.1005, "grad_norm": 33.75, "grad_norm_var": 5.220247395833334, "learning_rate": 0.0001, "loss": 7.6037, "loss/crossentropy": 2.177129751443863, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.195396139472723, "step": 4020 }, { "epoch": 0.10075, "grad_norm": 31.375, "grad_norm_var": 25.279622395833332, "learning_rate": 0.0001, "loss": 7.5495, "loss/crossentropy": 2.1074128076434135, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.18998505976051092, "step": 4030 }, { "epoch": 0.101, "grad_norm": 30.375, "grad_norm_var": 12.564518229166667, "learning_rate": 0.0001, "loss": 7.4628, "loss/crossentropy": 2.031256873905659, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.19107761420309544, "step": 4040 }, { "epoch": 0.10125, "grad_norm": 29.25, "grad_norm_var": 8.242643229166667, "learning_rate": 0.0001, "loss": 7.5906, "loss/crossentropy": 2.2593255966901777, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.20071447864174843, "step": 4050 }, { "epoch": 0.1015, "grad_norm": 28.875, "grad_norm_var": 7.72890625, "learning_rate": 0.0001, "loss": 7.5008, "loss/crossentropy": 2.1623566120862963, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.20114662442356349, "step": 4060 }, { "epoch": 0.10175, "grad_norm": 28.75, "grad_norm_var": 2.5291015625, "learning_rate": 0.0001, "loss": 7.4901, "loss/crossentropy": 2.1303680926561355, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.19380100946873427, "step": 4070 }, { "epoch": 0.102, "grad_norm": 33.25, "grad_norm_var": 1.9583333333333333, "learning_rate": 0.0001, "loss": 7.6067, "loss/crossentropy": 2.180470046401024, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.20080227889120578, "step": 4080 }, { "epoch": 0.10225, "grad_norm": 31.0, "grad_norm_var": 43.7791015625, "learning_rate": 0.0001, "loss": 7.3949, "loss/crossentropy": 2.04951853454113, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.1938589910045266, "step": 4090 }, { "epoch": 0.1025, "grad_norm": 31.5, "grad_norm_var": 40.233333333333334, "learning_rate": 0.0001, "loss": 7.5, "loss/crossentropy": 1.9436089858412742, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.1772780598141253, "step": 4100 }, { "epoch": 0.10275, "grad_norm": 30.75, "grad_norm_var": 6.14765625, "learning_rate": 0.0001, "loss": 7.5926, "loss/crossentropy": 2.1081591993570328, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.20672952029854058, "step": 4110 }, { "epoch": 0.103, "grad_norm": 31.875, "grad_norm_var": 11.225455729166667, "learning_rate": 0.0001, "loss": 7.6334, "loss/crossentropy": 2.0973087579011915, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.20208985283970832, "step": 4120 }, { "epoch": 0.10325, "grad_norm": 29.375, "grad_norm_var": 25.839583333333334, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.205477836728096, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.19515758529305458, "step": 4130 }, { "epoch": 0.1035, "grad_norm": 31.25, "grad_norm_var": 3.6567057291666667, "learning_rate": 0.0001, "loss": 7.4746, "loss/crossentropy": 2.1042226657271383, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.1850555408746004, "step": 4140 }, { "epoch": 0.10375, "grad_norm": 29.875, "grad_norm_var": 2.5952473958333333, "learning_rate": 0.0001, "loss": 7.5805, "loss/crossentropy": 2.1080187141895292, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.20642356667667627, "step": 4150 }, { "epoch": 0.104, "grad_norm": 34.75, "grad_norm_var": 12.11875, "learning_rate": 0.0001, "loss": 7.6864, "loss/crossentropy": 2.1868012815713884, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.19968394786119462, "step": 4160 }, { "epoch": 0.10425, "grad_norm": 32.25, "grad_norm_var": 11.406184895833333, "learning_rate": 0.0001, "loss": 7.6461, "loss/crossentropy": 2.0963368862867355, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.21419077794998884, "step": 4170 }, { "epoch": 0.1045, "grad_norm": 31.0, "grad_norm_var": 11.670833333333333, "learning_rate": 0.0001, "loss": 7.6029, "loss/crossentropy": 2.100097879767418, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.18958222791552543, "step": 4180 }, { "epoch": 0.10475, "grad_norm": 29.375, "grad_norm_var": 7.664518229166666, "learning_rate": 0.0001, "loss": 7.5954, "loss/crossentropy": 2.2243005722761153, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.19990855641663074, "step": 4190 }, { "epoch": 0.105, "grad_norm": 29.75, "grad_norm_var": 4.333072916666667, "learning_rate": 0.0001, "loss": 7.6671, "loss/crossentropy": 2.250749832391739, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.19082491770386695, "step": 4200 }, { "epoch": 0.10525, "grad_norm": 29.25, "grad_norm_var": 10.568489583333333, "learning_rate": 0.0001, "loss": 7.6118, "loss/crossentropy": 2.155411234498024, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.2127680890262127, "step": 4210 }, { "epoch": 0.1055, "grad_norm": 34.25, "grad_norm_var": 8.6556640625, "learning_rate": 0.0001, "loss": 7.4391, "loss/crossentropy": 2.037487879395485, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.17465929109603168, "step": 4220 }, { "epoch": 0.10575, "grad_norm": 30.625, "grad_norm_var": 8.183072916666667, "learning_rate": 0.0001, "loss": 7.6537, "loss/crossentropy": 2.067080709338188, "loss/hidden": 3.316796875, "loss/jsd": 0.0, "loss/logits": 0.1857584908604622, "step": 4230 }, { "epoch": 0.106, "grad_norm": 31.25, "grad_norm_var": 1.4171223958333334, "learning_rate": 0.0001, "loss": 7.608, "loss/crossentropy": 2.279679241776466, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.206354571133852, "step": 4240 }, { "epoch": 0.10625, "grad_norm": 31.75, "grad_norm_var": 31.7681640625, "learning_rate": 0.0001, "loss": 7.5259, "loss/crossentropy": 2.1007855504751207, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.1838985349982977, "step": 4250 }, { "epoch": 0.1065, "grad_norm": 28.625, "grad_norm_var": 2.5104166666666665, "learning_rate": 0.0001, "loss": 7.5786, "loss/crossentropy": 2.1117694169282912, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.2134232448413968, "step": 4260 }, { "epoch": 0.10675, "grad_norm": 33.25, "grad_norm_var": 5.585416666666666, "learning_rate": 0.0001, "loss": 7.5424, "loss/crossentropy": 2.1906253546476364, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.19627480674535036, "step": 4270 }, { "epoch": 0.107, "grad_norm": 33.75, "grad_norm_var": 16.780208333333334, "learning_rate": 0.0001, "loss": 7.7039, "loss/crossentropy": 2.167680537700653, "loss/hidden": 3.469921875, "loss/jsd": 0.0, "loss/logits": 0.21632006093859674, "step": 4280 }, { "epoch": 0.10725, "grad_norm": 32.75, "grad_norm_var": 14.226497395833333, "learning_rate": 0.0001, "loss": 7.6197, "loss/crossentropy": 2.2265933483839033, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.20777842812240124, "step": 4290 }, { "epoch": 0.1075, "grad_norm": 32.25, "grad_norm_var": 5.06640625, "learning_rate": 0.0001, "loss": 7.5019, "loss/crossentropy": 2.0171881064772608, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.1733700342476368, "step": 4300 }, { "epoch": 0.10775, "grad_norm": 32.75, "grad_norm_var": 39.37473958333333, "learning_rate": 0.0001, "loss": 7.6624, "loss/crossentropy": 2.052209459245205, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.2088288875296712, "step": 4310 }, { "epoch": 0.108, "grad_norm": 29.375, "grad_norm_var": 2.460724588873515e+18, "learning_rate": 0.0001, "loss": 7.5801, "loss/crossentropy": 2.096518099308014, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.2093063434585929, "step": 4320 }, { "epoch": 0.10825, "grad_norm": 28.25, "grad_norm_var": 2.4607245888865874e+18, "learning_rate": 0.0001, "loss": 7.4793, "loss/crossentropy": 2.1164163142442702, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19930963944643737, "step": 4330 }, { "epoch": 0.1085, "grad_norm": 33.5, "grad_norm_var": 46.587239583333336, "learning_rate": 0.0001, "loss": 7.5915, "loss/crossentropy": 2.1831407219171526, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.21474861968308687, "step": 4340 }, { "epoch": 0.10875, "grad_norm": 31.0, "grad_norm_var": 17.534375, "learning_rate": 0.0001, "loss": 7.4743, "loss/crossentropy": 2.1157304018735887, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.2028682116419077, "step": 4350 }, { "epoch": 0.109, "grad_norm": 28.625, "grad_norm_var": 4.282291666666667, "learning_rate": 0.0001, "loss": 7.6076, "loss/crossentropy": 1.980335572361946, "loss/hidden": 3.541796875, "loss/jsd": 0.0, "loss/logits": 0.20984734632074833, "step": 4360 }, { "epoch": 0.10925, "grad_norm": 31.25, "grad_norm_var": 30.812955729166667, "learning_rate": 0.0001, "loss": 7.5349, "loss/crossentropy": 2.107056123018265, "loss/hidden": 3.25859375, "loss/jsd": 0.0, "loss/logits": 0.18440892472863196, "step": 4370 }, { "epoch": 0.1095, "grad_norm": 30.375, "grad_norm_var": 21.462239583333332, "learning_rate": 0.0001, "loss": 7.5459, "loss/crossentropy": 2.2860846698284147, "loss/hidden": 3.297265625, "loss/jsd": 0.0, "loss/logits": 0.1911198776215315, "step": 4380 }, { "epoch": 0.10975, "grad_norm": 30.0, "grad_norm_var": 5.008333333333334, "learning_rate": 0.0001, "loss": 7.59, "loss/crossentropy": 2.150055022537708, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.20779079720377922, "step": 4390 }, { "epoch": 0.11, "grad_norm": 37.25, "grad_norm_var": 24.933268229166668, "learning_rate": 0.0001, "loss": 7.6256, "loss/crossentropy": 2.225931641459465, "loss/hidden": 3.569921875, "loss/jsd": 0.0, "loss/logits": 0.23082431070506573, "step": 4400 }, { "epoch": 0.11025, "grad_norm": 29.375, "grad_norm_var": 26.770572916666666, "learning_rate": 0.0001, "loss": 7.5585, "loss/crossentropy": 2.1318808451294897, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.205277425237, "step": 4410 }, { "epoch": 0.1105, "grad_norm": 28.25, "grad_norm_var": 6.009830729166667, "learning_rate": 0.0001, "loss": 7.5037, "loss/crossentropy": 2.108688759803772, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.19271691460162402, "step": 4420 }, { "epoch": 0.11075, "grad_norm": 30.875, "grad_norm_var": 2.85390625, "learning_rate": 0.0001, "loss": 7.5599, "loss/crossentropy": 2.1696368783712385, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.19752040579915048, "step": 4430 }, { "epoch": 0.111, "grad_norm": 30.25, "grad_norm_var": 18.476822916666666, "learning_rate": 0.0001, "loss": 7.5321, "loss/crossentropy": 2.167906680703163, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.2087454443797469, "step": 4440 }, { "epoch": 0.11125, "grad_norm": 30.75, "grad_norm_var": 4.6634765625, "learning_rate": 0.0001, "loss": 7.6168, "loss/crossentropy": 2.0555444791913033, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.17809300348162652, "step": 4450 }, { "epoch": 0.1115, "grad_norm": 31.25, "grad_norm_var": 5.618489583333333, "learning_rate": 0.0001, "loss": 7.669, "loss/crossentropy": 2.2362961381673814, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.21373681984841825, "step": 4460 }, { "epoch": 0.11175, "grad_norm": 31.625, "grad_norm_var": 1.8311848958333334, "learning_rate": 0.0001, "loss": 7.469, "loss/crossentropy": 2.207303923368454, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.21020539589226245, "step": 4470 }, { "epoch": 0.112, "grad_norm": 66.0, "grad_norm_var": 76.79973958333333, "learning_rate": 0.0001, "loss": 7.5099, "loss/crossentropy": 2.170773930847645, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.20324019938707352, "step": 4480 }, { "epoch": 0.11225, "grad_norm": 35.0, "grad_norm_var": 76.34895833333333, "learning_rate": 0.0001, "loss": 7.5149, "loss/crossentropy": 2.047274041175842, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.18544426914304496, "step": 4490 }, { "epoch": 0.1125, "grad_norm": 28.25, "grad_norm_var": 4.081705729166667, "learning_rate": 0.0001, "loss": 7.5745, "loss/crossentropy": 2.1270942091941833, "loss/hidden": 3.311328125, "loss/jsd": 0.0, "loss/logits": 0.19201683439314365, "step": 4500 }, { "epoch": 0.11275, "grad_norm": 28.25, "grad_norm_var": 2.6192057291666666, "learning_rate": 0.0001, "loss": 7.632, "loss/crossentropy": 2.0815075978636743, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.20613169986754656, "step": 4510 }, { "epoch": 0.113, "grad_norm": 33.25, "grad_norm_var": 166.34212239583334, "learning_rate": 0.0001, "loss": 7.5383, "loss/crossentropy": 2.144553080201149, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.19253196399658917, "step": 4520 }, { "epoch": 0.11325, "grad_norm": 35.75, "grad_norm_var": 1.734519148252968e+18, "learning_rate": 0.0001, "loss": 7.5876, "loss/crossentropy": 2.212298333644867, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.19719784446060656, "step": 4530 }, { "epoch": 0.1135, "grad_norm": 30.625, "grad_norm_var": 32.881184895833336, "learning_rate": 0.0001, "loss": 7.4563, "loss/crossentropy": 2.095240616798401, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19202221632003785, "step": 4540 }, { "epoch": 0.11375, "grad_norm": 31.625, "grad_norm_var": 87.4853515625, "learning_rate": 0.0001, "loss": 7.5184, "loss/crossentropy": 2.134147650748491, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.19104634067043663, "step": 4550 }, { "epoch": 0.114, "grad_norm": 32.25, "grad_norm_var": 78.95618489583333, "learning_rate": 0.0001, "loss": 7.4668, "loss/crossentropy": 2.1294006586074827, "loss/hidden": 3.326953125, "loss/jsd": 0.0, "loss/logits": 0.1853051505982876, "step": 4560 }, { "epoch": 0.11425, "grad_norm": 30.5, "grad_norm_var": 4.706705729166667, "learning_rate": 0.0001, "loss": 7.5114, "loss/crossentropy": 2.0886638939380644, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.20859680250287055, "step": 4570 }, { "epoch": 0.1145, "grad_norm": 35.5, "grad_norm_var": 5.934830729166666, "learning_rate": 0.0001, "loss": 7.56, "loss/crossentropy": 2.1498399868607523, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.18966037435457112, "step": 4580 }, { "epoch": 0.11475, "grad_norm": 32.75, "grad_norm_var": 3.358268229166667, "learning_rate": 0.0001, "loss": 7.6013, "loss/crossentropy": 2.199784816801548, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.18477672804147005, "step": 4590 }, { "epoch": 0.115, "grad_norm": 31.375, "grad_norm_var": 17.598372395833334, "learning_rate": 0.0001, "loss": 7.5806, "loss/crossentropy": 2.165570431947708, "loss/hidden": 3.505859375, "loss/jsd": 0.0, "loss/logits": 0.2113142903894186, "step": 4600 }, { "epoch": 0.11525, "grad_norm": 31.75, "grad_norm_var": 11.908268229166667, "learning_rate": 0.0001, "loss": 7.6265, "loss/crossentropy": 2.07996127307415, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.21436248067766428, "step": 4610 }, { "epoch": 0.1155, "grad_norm": 28.125, "grad_norm_var": 4.520833333333333, "learning_rate": 0.0001, "loss": 7.57, "loss/crossentropy": 2.1484976023435594, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.1913412597030401, "step": 4620 }, { "epoch": 0.11575, "grad_norm": 30.125, "grad_norm_var": 4.1875, "learning_rate": 0.0001, "loss": 7.4701, "loss/crossentropy": 2.0956287920475005, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.19558896012604238, "step": 4630 }, { "epoch": 0.116, "grad_norm": 29.875, "grad_norm_var": 2.106184895833333, "learning_rate": 0.0001, "loss": 7.4538, "loss/crossentropy": 2.1474092990159988, "loss/hidden": 3.45234375, "loss/jsd": 0.0, "loss/logits": 0.2039597311988473, "step": 4640 }, { "epoch": 0.11625, "grad_norm": 31.75, "grad_norm_var": 3.520768229166667, "learning_rate": 0.0001, "loss": 7.6657, "loss/crossentropy": 2.1128339886665346, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.21120875477790832, "step": 4650 }, { "epoch": 0.1165, "grad_norm": 31.5, "grad_norm_var": 2.133072916666667, "learning_rate": 0.0001, "loss": 7.5441, "loss/crossentropy": 2.1643586844205855, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20440610516816377, "step": 4660 }, { "epoch": 0.11675, "grad_norm": 33.0, "grad_norm_var": 3.504622395833333, "learning_rate": 0.0001, "loss": 7.6051, "loss/crossentropy": 2.195269528031349, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.20059011913836003, "step": 4670 }, { "epoch": 0.117, "grad_norm": 30.0, "grad_norm_var": 31.235416666666666, "learning_rate": 0.0001, "loss": 7.4526, "loss/crossentropy": 2.082709529995918, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.19617473538964986, "step": 4680 }, { "epoch": 0.11725, "grad_norm": 32.0, "grad_norm_var": 3.45625, "learning_rate": 0.0001, "loss": 7.5593, "loss/crossentropy": 2.1473275452852247, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.20148523468524218, "step": 4690 }, { "epoch": 0.1175, "grad_norm": 31.875, "grad_norm_var": 2.145768229166667, "learning_rate": 0.0001, "loss": 7.4609, "loss/crossentropy": 2.14631325006485, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.20976583026349543, "step": 4700 }, { "epoch": 0.11775, "grad_norm": 32.5, "grad_norm_var": 33.20826822916667, "learning_rate": 0.0001, "loss": 7.5877, "loss/crossentropy": 2.1093395471572878, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.20207433141767978, "step": 4710 }, { "epoch": 0.118, "grad_norm": 30.75, "grad_norm_var": 10.86875, "learning_rate": 0.0001, "loss": 7.5437, "loss/crossentropy": 2.1447531282901764, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.1963239062577486, "step": 4720 }, { "epoch": 0.11825, "grad_norm": 34.5, "grad_norm_var": 4.473372395833334, "learning_rate": 0.0001, "loss": 7.6248, "loss/crossentropy": 2.1142423778772352, "loss/hidden": 3.283203125, "loss/jsd": 0.0, "loss/logits": 0.19059138614684343, "step": 4730 }, { "epoch": 0.1185, "grad_norm": 27.875, "grad_norm_var": 3.098372395833333, "learning_rate": 0.0001, "loss": 7.5631, "loss/crossentropy": 2.07417613863945, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.19420854579657315, "step": 4740 }, { "epoch": 0.11875, "grad_norm": 29.875, "grad_norm_var": 2.734375, "learning_rate": 0.0001, "loss": 7.6499, "loss/crossentropy": 2.0995796024799347, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.21285411342978477, "step": 4750 }, { "epoch": 0.119, "grad_norm": 32.5, "grad_norm_var": 2.7228515625, "learning_rate": 0.0001, "loss": 7.5854, "loss/crossentropy": 2.169099047780037, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.22147531677037477, "step": 4760 }, { "epoch": 0.11925, "grad_norm": 32.5, "grad_norm_var": 34.799739583333334, "learning_rate": 0.0001, "loss": 7.435, "loss/crossentropy": 2.0821361050009726, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.17594938166439533, "step": 4770 }, { "epoch": 0.1195, "grad_norm": 33.75, "grad_norm_var": 23.00390625, "learning_rate": 0.0001, "loss": 7.6152, "loss/crossentropy": 2.183753404021263, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.19579742290079594, "step": 4780 }, { "epoch": 0.11975, "grad_norm": 29.25, "grad_norm_var": 23.6541015625, "learning_rate": 0.0001, "loss": 7.5295, "loss/crossentropy": 2.1669270396232605, "loss/hidden": 3.505859375, "loss/jsd": 0.0, "loss/logits": 0.21605710163712502, "step": 4790 }, { "epoch": 0.12, "grad_norm": 34.25, "grad_norm_var": 4.624739583333334, "learning_rate": 0.0001, "loss": 7.6351, "loss/crossentropy": 2.116096779704094, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.21523005720227956, "step": 4800 }, { "epoch": 0.12025, "grad_norm": 29.5, "grad_norm_var": 6.009375, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.051844981312752, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.17574754767119885, "step": 4810 }, { "epoch": 0.1205, "grad_norm": 30.5, "grad_norm_var": 2.0087890625, "learning_rate": 0.0001, "loss": 7.5892, "loss/crossentropy": 2.2223299980163573, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.19802077617496253, "step": 4820 }, { "epoch": 0.12075, "grad_norm": 30.125, "grad_norm_var": 3.1080729166666665, "learning_rate": 0.0001, "loss": 7.5951, "loss/crossentropy": 2.1982655793428423, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.2064062263816595, "step": 4830 }, { "epoch": 0.121, "grad_norm": 30.0, "grad_norm_var": 12.412434895833334, "learning_rate": 0.0001, "loss": 7.5254, "loss/crossentropy": 1.9753010511398315, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.19706314485520124, "step": 4840 }, { "epoch": 0.12125, "grad_norm": 30.125, "grad_norm_var": 13.1447265625, "learning_rate": 0.0001, "loss": 7.5351, "loss/crossentropy": 2.0562877766788006, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.17667807769030333, "step": 4850 }, { "epoch": 0.1215, "grad_norm": 33.0, "grad_norm_var": 3.465625, "learning_rate": 0.0001, "loss": 7.6024, "loss/crossentropy": 2.1578008987009527, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.18234786652028562, "step": 4860 }, { "epoch": 0.12175, "grad_norm": 30.375, "grad_norm_var": 2.343489583333333, "learning_rate": 0.0001, "loss": 7.4726, "loss/crossentropy": 2.1330083698034286, "loss/hidden": 3.333203125, "loss/jsd": 0.0, "loss/logits": 0.18738476932048798, "step": 4870 }, { "epoch": 0.122, "grad_norm": 31.625, "grad_norm_var": 3.2322265625, "learning_rate": 0.0001, "loss": 7.605, "loss/crossentropy": 2.1732089832425117, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.19949225690215827, "step": 4880 }, { "epoch": 0.12225, "grad_norm": 33.5, "grad_norm_var": 2.238997395833333, "learning_rate": 0.0001, "loss": 7.5382, "loss/crossentropy": 2.188927575945854, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19776681065559387, "step": 4890 }, { "epoch": 0.1225, "grad_norm": 29.0, "grad_norm_var": 15.819791666666667, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.202793037891388, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.1992744604125619, "step": 4900 }, { "epoch": 0.12275, "grad_norm": 30.25, "grad_norm_var": 16.1775390625, "learning_rate": 0.0001, "loss": 7.426, "loss/crossentropy": 2.177696394920349, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.19968770015984774, "step": 4910 }, { "epoch": 0.123, "grad_norm": 35.5, "grad_norm_var": 7.707291666666666, "learning_rate": 0.0001, "loss": 7.7485, "loss/crossentropy": 2.040875867009163, "loss/hidden": 3.45234375, "loss/jsd": 0.0, "loss/logits": 0.1929738214239478, "step": 4920 }, { "epoch": 0.12325, "grad_norm": 32.25, "grad_norm_var": 6.771809895833333, "learning_rate": 0.0001, "loss": 7.5517, "loss/crossentropy": 2.199721184372902, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.20861553251743317, "step": 4930 }, { "epoch": 0.1235, "grad_norm": 28.75, "grad_norm_var": 4.235416666666667, "learning_rate": 0.0001, "loss": 7.5576, "loss/crossentropy": 2.133353302627802, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.21631606128066777, "step": 4940 }, { "epoch": 0.12375, "grad_norm": 40.0, "grad_norm_var": 15.1650390625, "learning_rate": 0.0001, "loss": 7.5129, "loss/crossentropy": 2.2380867928266523, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.2078275766223669, "step": 4950 }, { "epoch": 0.124, "grad_norm": 32.5, "grad_norm_var": 9.342643229166667, "learning_rate": 0.0001, "loss": 7.4421, "loss/crossentropy": 2.1563473463058473, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.1852986102923751, "step": 4960 }, { "epoch": 0.12425, "grad_norm": 32.5, "grad_norm_var": 2.3916666666666666, "learning_rate": 0.0001, "loss": 7.5503, "loss/crossentropy": 2.124380439519882, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.20270956568419934, "step": 4970 }, { "epoch": 0.1245, "grad_norm": 30.875, "grad_norm_var": 5.837434895833334, "learning_rate": 0.0001, "loss": 7.4857, "loss/crossentropy": 2.0482941284775733, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18650466352701187, "step": 4980 }, { "epoch": 0.12475, "grad_norm": 31.0, "grad_norm_var": 3.0858723958333334, "learning_rate": 0.0001, "loss": 7.5446, "loss/crossentropy": 2.160063311457634, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18700905814766883, "step": 4990 }, { "epoch": 0.125, "grad_norm": 28.75, "grad_norm_var": 5.533333333333333, "learning_rate": 0.0001, "loss": 7.5333, "loss/crossentropy": 2.056824396550655, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.18965724110603333, "step": 5000 }, { "epoch": 0.12525, "grad_norm": 32.5, "grad_norm_var": 32.25807291666667, "learning_rate": 0.0001, "loss": 7.5864, "loss/crossentropy": 2.2407308876514436, "loss/hidden": 3.51328125, "loss/jsd": 0.0, "loss/logits": 0.22782632596790792, "step": 5010 }, { "epoch": 0.1255, "grad_norm": 31.375, "grad_norm_var": 32.7353515625, "learning_rate": 0.0001, "loss": 7.5671, "loss/crossentropy": 2.18543721139431, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.19135277662426234, "step": 5020 }, { "epoch": 0.12575, "grad_norm": 32.5, "grad_norm_var": 2.814322916666667, "learning_rate": 0.0001, "loss": 7.578, "loss/crossentropy": 2.1418938025832177, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.20001544915139674, "step": 5030 }, { "epoch": 0.126, "grad_norm": 33.0, "grad_norm_var": 1.3238932291666667, "learning_rate": 0.0001, "loss": 7.5594, "loss/crossentropy": 2.1838817209005357, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.2107716018334031, "step": 5040 }, { "epoch": 0.12625, "grad_norm": 31.25, "grad_norm_var": 3.47890625, "learning_rate": 0.0001, "loss": 7.5275, "loss/crossentropy": 2.2816754072904586, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.201267159730196, "step": 5050 }, { "epoch": 0.1265, "grad_norm": 33.5, "grad_norm_var": 4.2087890625, "learning_rate": 0.0001, "loss": 7.6079, "loss/crossentropy": 2.0686806365847588, "loss/hidden": 3.603125, "loss/jsd": 0.0, "loss/logits": 0.2172813605517149, "step": 5060 }, { "epoch": 0.12675, "grad_norm": 32.25, "grad_norm_var": 3.19765625, "learning_rate": 0.0001, "loss": 7.5561, "loss/crossentropy": 2.1800751775503158, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.20199469216167926, "step": 5070 }, { "epoch": 0.127, "grad_norm": 30.5, "grad_norm_var": 1.8643229166666666, "learning_rate": 0.0001, "loss": 7.4702, "loss/crossentropy": 2.2943209201097488, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.1958466824144125, "step": 5080 }, { "epoch": 0.12725, "grad_norm": 30.125, "grad_norm_var": 1.9030598958333333, "learning_rate": 0.0001, "loss": 7.4383, "loss/crossentropy": 2.291040873527527, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.18960105255246162, "step": 5090 }, { "epoch": 0.1275, "grad_norm": 32.25, "grad_norm_var": 1.4504557291666667, "learning_rate": 0.0001, "loss": 7.6139, "loss/crossentropy": 2.116188834607601, "loss/hidden": 3.5015625, "loss/jsd": 0.0, "loss/logits": 0.19662482757121325, "step": 5100 }, { "epoch": 0.12775, "grad_norm": 32.25, "grad_norm_var": 1.5395182291666667, "learning_rate": 0.0001, "loss": 7.5074, "loss/crossentropy": 2.200083887577057, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.19970641760155558, "step": 5110 }, { "epoch": 0.128, "grad_norm": 31.125, "grad_norm_var": 1.990625, "learning_rate": 0.0001, "loss": 7.5949, "loss/crossentropy": 2.080552561581135, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19426519125699998, "step": 5120 }, { "epoch": 0.12825, "grad_norm": 31.125, "grad_norm_var": 2.200455729166667, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 2.138309660553932, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.20698665007948874, "step": 5130 }, { "epoch": 0.1285, "grad_norm": 30.125, "grad_norm_var": 5.530989583333334, "learning_rate": 0.0001, "loss": 7.6081, "loss/crossentropy": 2.0686957597732545, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.22128727175295354, "step": 5140 }, { "epoch": 0.12875, "grad_norm": 30.75, "grad_norm_var": 2.120833333333333, "learning_rate": 0.0001, "loss": 7.5236, "loss/crossentropy": 2.0967576891183852, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.19716181065887212, "step": 5150 }, { "epoch": 0.129, "grad_norm": 32.25, "grad_norm_var": 1.8468098958333334, "learning_rate": 0.0001, "loss": 7.5679, "loss/crossentropy": 2.172739614546299, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.1919636652804911, "step": 5160 }, { "epoch": 0.12925, "grad_norm": 43.75, "grad_norm_var": 17.77890625, "learning_rate": 0.0001, "loss": 7.5027, "loss/crossentropy": 2.1427868396043777, "loss/hidden": 3.51953125, "loss/jsd": 0.0, "loss/logits": 0.20654744990170001, "step": 5170 }, { "epoch": 0.1295, "grad_norm": 33.25, "grad_norm_var": 61.1962890625, "learning_rate": 0.0001, "loss": 7.52, "loss/crossentropy": 2.1186717480421065, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1938714198768139, "step": 5180 }, { "epoch": 0.12975, "grad_norm": 33.0, "grad_norm_var": 4.07890625, "learning_rate": 0.0001, "loss": 7.6231, "loss/crossentropy": 2.072993017733097, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.18984810579568148, "step": 5190 }, { "epoch": 0.13, "grad_norm": 36.25, "grad_norm_var": 20.777083333333334, "learning_rate": 0.0001, "loss": 7.597, "loss/crossentropy": 2.206225660443306, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.2030201606452465, "step": 5200 }, { "epoch": 0.13025, "grad_norm": 43.75, "grad_norm_var": 24.43515625, "learning_rate": 0.0001, "loss": 7.5075, "loss/crossentropy": 2.228325179219246, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.20732430163770915, "step": 5210 }, { "epoch": 0.1305, "grad_norm": 30.5, "grad_norm_var": 11.99765625, "learning_rate": 0.0001, "loss": 7.5295, "loss/crossentropy": 2.129982355237007, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.19593548215925694, "step": 5220 }, { "epoch": 0.13075, "grad_norm": 30.75, "grad_norm_var": 4.292708333333334, "learning_rate": 0.0001, "loss": 7.5261, "loss/crossentropy": 2.1533152967691422, "loss/hidden": 3.3296875, "loss/jsd": 0.0, "loss/logits": 0.19487107992172242, "step": 5230 }, { "epoch": 0.131, "grad_norm": 30.875, "grad_norm_var": 12.989583333333334, "learning_rate": 0.0001, "loss": 7.5566, "loss/crossentropy": 2.195969894528389, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.2189876638352871, "step": 5240 }, { "epoch": 0.13125, "grad_norm": 33.25, "grad_norm_var": 22.575, "learning_rate": 0.0001, "loss": 7.482, "loss/crossentropy": 2.074573493748903, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.19280508980154992, "step": 5250 }, { "epoch": 0.1315, "grad_norm": 29.125, "grad_norm_var": 15.599739583333333, "learning_rate": 0.0001, "loss": 7.5395, "loss/crossentropy": 2.227403500676155, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.20514250732958317, "step": 5260 }, { "epoch": 0.13175, "grad_norm": 31.625, "grad_norm_var": 113.15618489583333, "learning_rate": 0.0001, "loss": 7.5209, "loss/crossentropy": 2.055370827019215, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18272479642182587, "step": 5270 }, { "epoch": 0.132, "grad_norm": 31.625, "grad_norm_var": 1.765625, "learning_rate": 0.0001, "loss": 7.4798, "loss/crossentropy": 2.1356967806816103, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.19737351574003698, "step": 5280 }, { "epoch": 0.13225, "grad_norm": 31.0, "grad_norm_var": 7.879622395833334, "learning_rate": 0.0001, "loss": 7.4715, "loss/crossentropy": 2.177875056862831, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.20798433478921652, "step": 5290 }, { "epoch": 0.1325, "grad_norm": 29.5, "grad_norm_var": 8.760872395833333, "learning_rate": 0.0001, "loss": 7.5419, "loss/crossentropy": 2.182955250144005, "loss/hidden": 3.354296875, "loss/jsd": 0.0, "loss/logits": 0.18478553090244532, "step": 5300 }, { "epoch": 0.13275, "grad_norm": 29.125, "grad_norm_var": 4.260416666666667, "learning_rate": 0.0001, "loss": 7.5816, "loss/crossentropy": 2.0469735309481623, "loss/hidden": 3.55859375, "loss/jsd": 0.0, "loss/logits": 0.18724320270121098, "step": 5310 }, { "epoch": 0.133, "grad_norm": 33.0, "grad_norm_var": 1.8801432291666667, "learning_rate": 0.0001, "loss": 7.7815, "loss/crossentropy": 2.1846155911684035, "loss/hidden": 3.499609375, "loss/jsd": 0.0, "loss/logits": 0.21609773077070712, "step": 5320 }, { "epoch": 0.13325, "grad_norm": 30.875, "grad_norm_var": 5.584375, "learning_rate": 0.0001, "loss": 7.7067, "loss/crossentropy": 2.0075640469789504, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.20185804851353167, "step": 5330 }, { "epoch": 0.1335, "grad_norm": 27.375, "grad_norm_var": 6.120247395833333, "learning_rate": 0.0001, "loss": 7.5345, "loss/crossentropy": 2.058630608022213, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.19536744449287652, "step": 5340 }, { "epoch": 0.13375, "grad_norm": 52.5, "grad_norm_var": 30.692643229166666, "learning_rate": 0.0001, "loss": 7.6655, "loss/crossentropy": 2.050611114501953, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19592730849981307, "step": 5350 }, { "epoch": 0.134, "grad_norm": 31.125, "grad_norm_var": 28.90390625, "learning_rate": 0.0001, "loss": 7.5509, "loss/crossentropy": 2.1665500849485397, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.19354073759168386, "step": 5360 }, { "epoch": 0.13425, "grad_norm": 33.5, "grad_norm_var": 2.9931640625, "learning_rate": 0.0001, "loss": 7.6343, "loss/crossentropy": 2.1793171644210814, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.20293663591146469, "step": 5370 }, { "epoch": 0.1345, "grad_norm": 32.5, "grad_norm_var": 1.8098307291666667, "learning_rate": 0.0001, "loss": 7.6, "loss/crossentropy": 2.152762657403946, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.21694996021687984, "step": 5380 }, { "epoch": 0.13475, "grad_norm": 30.125, "grad_norm_var": 1.4301432291666667, "learning_rate": 0.0001, "loss": 7.5121, "loss/crossentropy": 2.083320555835962, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.18746816255152227, "step": 5390 }, { "epoch": 0.135, "grad_norm": 29.625, "grad_norm_var": 5.109830729166666, "learning_rate": 0.0001, "loss": 7.4172, "loss/crossentropy": 2.0639937698841093, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.18618469405919313, "step": 5400 }, { "epoch": 0.13525, "grad_norm": 31.125, "grad_norm_var": 12.337434895833333, "learning_rate": 0.0001, "loss": 7.5749, "loss/crossentropy": 2.068472331762314, "loss/hidden": 3.524609375, "loss/jsd": 0.0, "loss/logits": 0.18952864613384007, "step": 5410 }, { "epoch": 0.1355, "grad_norm": 32.25, "grad_norm_var": 7.9541015625, "learning_rate": 0.0001, "loss": 7.5385, "loss/crossentropy": 2.085490897297859, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.1854228163138032, "step": 5420 }, { "epoch": 0.13575, "grad_norm": 30.0, "grad_norm_var": 7.678125, "learning_rate": 0.0001, "loss": 7.5583, "loss/crossentropy": 2.1747709423303605, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.20177022367715836, "step": 5430 }, { "epoch": 0.136, "grad_norm": 31.875, "grad_norm_var": 2.6541015625, "learning_rate": 0.0001, "loss": 7.4911, "loss/crossentropy": 2.167823739349842, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.2082101447507739, "step": 5440 }, { "epoch": 0.13625, "grad_norm": 30.625, "grad_norm_var": 11.2087890625, "learning_rate": 0.0001, "loss": 7.639, "loss/crossentropy": 2.1367167800664904, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.20548735409975052, "step": 5450 }, { "epoch": 0.1365, "grad_norm": 28.375, "grad_norm_var": 15.654166666666667, "learning_rate": 0.0001, "loss": 7.4951, "loss/crossentropy": 2.109811532497406, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.1994595667347312, "step": 5460 }, { "epoch": 0.13675, "grad_norm": 31.75, "grad_norm_var": 4.463997395833333, "learning_rate": 0.0001, "loss": 7.5263, "loss/crossentropy": 2.2288592010736465, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.20487434454262257, "step": 5470 }, { "epoch": 0.137, "grad_norm": 32.0, "grad_norm_var": 3.8400390625, "learning_rate": 0.0001, "loss": 7.5681, "loss/crossentropy": 2.0951829612255097, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.20034745894372463, "step": 5480 }, { "epoch": 0.13725, "grad_norm": 31.625, "grad_norm_var": 2.299934895833333, "learning_rate": 0.0001, "loss": 7.4772, "loss/crossentropy": 2.022165683656931, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18401878620497883, "step": 5490 }, { "epoch": 0.1375, "grad_norm": 31.625, "grad_norm_var": 4.205989583333333, "learning_rate": 0.0001, "loss": 7.6869, "loss/crossentropy": 2.095045933127403, "loss/hidden": 3.609375, "loss/jsd": 0.0, "loss/logits": 0.21982598043978213, "step": 5500 }, { "epoch": 0.13775, "grad_norm": 29.875, "grad_norm_var": 2.177018229166667, "learning_rate": 0.0001, "loss": 7.4483, "loss/crossentropy": 2.07875951230526, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.1898285737261176, "step": 5510 }, { "epoch": 0.138, "grad_norm": 30.0, "grad_norm_var": 3.4181640625, "learning_rate": 0.0001, "loss": 7.5258, "loss/crossentropy": 2.194947564601898, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.21397264376282693, "step": 5520 }, { "epoch": 0.13825, "grad_norm": 32.75, "grad_norm_var": 5.5541015625, "learning_rate": 0.0001, "loss": 7.6574, "loss/crossentropy": 2.2282156944274902, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.19613583907485008, "step": 5530 }, { "epoch": 0.1385, "grad_norm": 29.875, "grad_norm_var": 4.927018229166666, "learning_rate": 0.0001, "loss": 7.5361, "loss/crossentropy": 2.1092587068676947, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.1894394876435399, "step": 5540 }, { "epoch": 0.13875, "grad_norm": 30.875, "grad_norm_var": 6.080989583333333, "learning_rate": 0.0001, "loss": 7.6223, "loss/crossentropy": 2.189313694834709, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.210250511392951, "step": 5550 }, { "epoch": 0.139, "grad_norm": 41.25, "grad_norm_var": 8.976497395833333, "learning_rate": 0.0001, "loss": 7.4876, "loss/crossentropy": 2.046143325418234, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.19476603530347347, "step": 5560 }, { "epoch": 0.13925, "grad_norm": 31.25, "grad_norm_var": 9.076497395833334, "learning_rate": 0.0001, "loss": 7.5338, "loss/crossentropy": 2.1882025837898254, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.2058427443727851, "step": 5570 }, { "epoch": 0.1395, "grad_norm": 30.25, "grad_norm_var": 182.91875, "learning_rate": 0.0001, "loss": 7.6344, "loss/crossentropy": 2.160731779038906, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.22695957981050013, "step": 5580 }, { "epoch": 0.13975, "grad_norm": 31.0, "grad_norm_var": 4.140559895833333, "learning_rate": 0.0001, "loss": 7.5419, "loss/crossentropy": 1.8924851581454276, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.1802460763603449, "step": 5590 }, { "epoch": 0.14, "grad_norm": 30.25, "grad_norm_var": 2.084830729166667, "learning_rate": 0.0001, "loss": 7.5498, "loss/crossentropy": 2.242933538556099, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.22144901510328055, "step": 5600 }, { "epoch": 0.14025, "grad_norm": 29.75, "grad_norm_var": 1.5622395833333333, "learning_rate": 0.0001, "loss": 7.5133, "loss/crossentropy": 2.132666201889515, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.19677439089864493, "step": 5610 }, { "epoch": 0.1405, "grad_norm": 30.5, "grad_norm_var": 14.234375, "learning_rate": 0.0001, "loss": 7.6226, "loss/crossentropy": 2.0671760708093645, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.20731164887547493, "step": 5620 }, { "epoch": 0.14075, "grad_norm": 35.5, "grad_norm_var": 14.44375, "learning_rate": 0.0001, "loss": 7.5063, "loss/crossentropy": 2.060420323908329, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.18428542967885733, "step": 5630 }, { "epoch": 0.141, "grad_norm": 38.5, "grad_norm_var": 19.1509765625, "learning_rate": 0.0001, "loss": 7.5322, "loss/crossentropy": 2.2572733104228973, "loss/hidden": 3.291015625, "loss/jsd": 0.0, "loss/logits": 0.19184892270714043, "step": 5640 }, { "epoch": 0.14125, "grad_norm": 32.0, "grad_norm_var": 20.373893229166665, "learning_rate": 0.0001, "loss": 7.5473, "loss/crossentropy": 2.1559308364987375, "loss/hidden": 3.237109375, "loss/jsd": 0.0, "loss/logits": 0.18201812207698823, "step": 5650 }, { "epoch": 0.1415, "grad_norm": 28.5, "grad_norm_var": 10.033268229166667, "learning_rate": 0.0001, "loss": 7.4529, "loss/crossentropy": 2.1479016572237013, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.20532293617725372, "step": 5660 }, { "epoch": 0.14175, "grad_norm": 29.5, "grad_norm_var": 3.6332682291666667, "learning_rate": 0.0001, "loss": 7.6001, "loss/crossentropy": 2.140682426095009, "loss/hidden": 3.321484375, "loss/jsd": 0.0, "loss/logits": 0.18856783863157034, "step": 5670 }, { "epoch": 0.142, "grad_norm": 32.75, "grad_norm_var": 2.84140625, "learning_rate": 0.0001, "loss": 7.6348, "loss/crossentropy": 2.25071659386158, "loss/hidden": 3.312109375, "loss/jsd": 0.0, "loss/logits": 0.2029257183894515, "step": 5680 }, { "epoch": 0.14225, "grad_norm": 31.75, "grad_norm_var": 2.643684895833333, "learning_rate": 0.0001, "loss": 7.5323, "loss/crossentropy": 2.149606391787529, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.2141027996316552, "step": 5690 }, { "epoch": 0.1425, "grad_norm": 30.25, "grad_norm_var": 2.32890625, "learning_rate": 0.0001, "loss": 7.5615, "loss/crossentropy": 2.2273970007896424, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.22348886616528035, "step": 5700 }, { "epoch": 0.14275, "grad_norm": 29.875, "grad_norm_var": 1.8285807291666667, "learning_rate": 0.0001, "loss": 7.4366, "loss/crossentropy": 2.2010063380002975, "loss/hidden": 3.289453125, "loss/jsd": 0.0, "loss/logits": 0.18594364672899247, "step": 5710 }, { "epoch": 0.143, "grad_norm": 34.25, "grad_norm_var": 2.6020182291666667, "learning_rate": 0.0001, "loss": 7.508, "loss/crossentropy": 2.0483784288167954, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.2091974811628461, "step": 5720 }, { "epoch": 0.14325, "grad_norm": 33.0, "grad_norm_var": 2.6744140625, "learning_rate": 0.0001, "loss": 7.6646, "loss/crossentropy": 2.025853230059147, "loss/hidden": 3.589453125, "loss/jsd": 0.0, "loss/logits": 0.2227605242282152, "step": 5730 }, { "epoch": 0.1435, "grad_norm": 31.0, "grad_norm_var": 1.8327473958333333, "learning_rate": 0.0001, "loss": 7.6064, "loss/crossentropy": 2.075891287624836, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.1890827091410756, "step": 5740 }, { "epoch": 0.14375, "grad_norm": 31.625, "grad_norm_var": 13.943489583333333, "learning_rate": 0.0001, "loss": 7.4294, "loss/crossentropy": 2.0676268830895426, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.20394732002168894, "step": 5750 }, { "epoch": 0.144, "grad_norm": 30.0, "grad_norm_var": 14.658072916666667, "learning_rate": 0.0001, "loss": 7.5958, "loss/crossentropy": 2.1388431686908005, "loss/hidden": 3.499609375, "loss/jsd": 0.0, "loss/logits": 0.21454429486766458, "step": 5760 }, { "epoch": 0.14425, "grad_norm": 32.0, "grad_norm_var": 13.958072916666667, "learning_rate": 0.0001, "loss": 7.5342, "loss/crossentropy": 2.0246446818113326, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.18158567249774932, "step": 5770 }, { "epoch": 0.1445, "grad_norm": 28.375, "grad_norm_var": 14.469205729166667, "learning_rate": 0.0001, "loss": 7.5141, "loss/crossentropy": 2.103305173665285, "loss/hidden": 3.3234375, "loss/jsd": 0.0, "loss/logits": 0.1842075905762613, "step": 5780 }, { "epoch": 0.14475, "grad_norm": 28.875, "grad_norm_var": 3.160872395833333, "learning_rate": 0.0001, "loss": 7.6213, "loss/crossentropy": 2.096319726109505, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.19167389422655107, "step": 5790 }, { "epoch": 0.145, "grad_norm": 35.25, "grad_norm_var": 3.4916666666666667, "learning_rate": 0.0001, "loss": 7.4461, "loss/crossentropy": 2.0800597339868547, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.18269639350473882, "step": 5800 }, { "epoch": 0.14525, "grad_norm": 45.5, "grad_norm_var": 22.3306640625, "learning_rate": 0.0001, "loss": 7.4855, "loss/crossentropy": 2.036512078344822, "loss/hidden": 3.462890625, "loss/jsd": 0.0, "loss/logits": 0.1910469362512231, "step": 5810 }, { "epoch": 0.1455, "grad_norm": 33.0, "grad_norm_var": 23.284309895833335, "learning_rate": 0.0001, "loss": 7.4838, "loss/crossentropy": 2.1922324389219283, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.2080824663862586, "step": 5820 }, { "epoch": 0.14575, "grad_norm": 31.25, "grad_norm_var": 2.7171223958333335, "learning_rate": 0.0001, "loss": 7.4657, "loss/crossentropy": 2.0417690485715867, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.19891292043030262, "step": 5830 }, { "epoch": 0.146, "grad_norm": 32.75, "grad_norm_var": 6.515625, "learning_rate": 0.0001, "loss": 7.5864, "loss/crossentropy": 2.12301287651062, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.19781142249703407, "step": 5840 }, { "epoch": 0.14625, "grad_norm": 33.0, "grad_norm_var": 6.574934895833334, "learning_rate": 0.0001, "loss": 7.6264, "loss/crossentropy": 2.1600560665130617, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.22095333114266397, "step": 5850 }, { "epoch": 0.1465, "grad_norm": 31.125, "grad_norm_var": 1.3822265625, "learning_rate": 0.0001, "loss": 7.5231, "loss/crossentropy": 2.185663253068924, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.2029089467599988, "step": 5860 }, { "epoch": 0.14675, "grad_norm": 33.0, "grad_norm_var": 3.020247395833333, "learning_rate": 0.0001, "loss": 7.4938, "loss/crossentropy": 2.052996274828911, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.19343450702726842, "step": 5870 }, { "epoch": 0.147, "grad_norm": 31.875, "grad_norm_var": 6.112239583333333, "learning_rate": 0.0001, "loss": 7.5855, "loss/crossentropy": 2.2062906324863434, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.20249157436192036, "step": 5880 }, { "epoch": 0.14725, "grad_norm": 36.25, "grad_norm_var": 2.4330729166666667, "learning_rate": 0.0001, "loss": 7.5644, "loss/crossentropy": 2.0493671208620072, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.19113321453332902, "step": 5890 }, { "epoch": 0.1475, "grad_norm": 31.5, "grad_norm_var": 2.6968098958333333, "learning_rate": 0.0001, "loss": 7.5941, "loss/crossentropy": 2.20456420481205, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.20127719175070524, "step": 5900 }, { "epoch": 0.14775, "grad_norm": 29.625, "grad_norm_var": 1.8268229166666667, "learning_rate": 0.0001, "loss": 7.4958, "loss/crossentropy": 2.2630672723054888, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.20422303341329098, "step": 5910 }, { "epoch": 0.148, "grad_norm": 33.5, "grad_norm_var": 2.45, "learning_rate": 0.0001, "loss": 7.4893, "loss/crossentropy": 2.130161929130554, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.2029714561998844, "step": 5920 }, { "epoch": 0.14825, "grad_norm": 31.375, "grad_norm_var": 0.9728515625, "learning_rate": 0.0001, "loss": 7.6087, "loss/crossentropy": 2.2219777315855027, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.20015078466385602, "step": 5930 }, { "epoch": 0.1485, "grad_norm": 31.125, "grad_norm_var": 2.9514973958333335, "learning_rate": 0.0001, "loss": 7.5908, "loss/crossentropy": 2.2077976912260056, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.1996950874105096, "step": 5940 }, { "epoch": 0.14875, "grad_norm": 30.0, "grad_norm_var": 3.3619140625, "learning_rate": 0.0001, "loss": 7.5402, "loss/crossentropy": 2.1866880118846894, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.19317448064684867, "step": 5950 }, { "epoch": 0.149, "grad_norm": 31.625, "grad_norm_var": 2.6968098958333333, "learning_rate": 0.0001, "loss": 7.4202, "loss/crossentropy": 2.1943173080682756, "loss/hidden": 3.2390625, "loss/jsd": 0.0, "loss/logits": 0.18432049825787544, "step": 5960 }, { "epoch": 0.14925, "grad_norm": 30.75, "grad_norm_var": 16.936393229166665, "learning_rate": 0.0001, "loss": 7.5056, "loss/crossentropy": 2.17142014503479, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.1876060863956809, "step": 5970 }, { "epoch": 0.1495, "grad_norm": 31.5, "grad_norm_var": 4.03515625, "learning_rate": 0.0001, "loss": 7.5442, "loss/crossentropy": 2.2047942698001863, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.1988227294757962, "step": 5980 }, { "epoch": 0.14975, "grad_norm": 32.0, "grad_norm_var": 3.332291666666667, "learning_rate": 0.0001, "loss": 7.5693, "loss/crossentropy": 2.281469625234604, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.19730366189032794, "step": 5990 }, { "epoch": 0.15, "grad_norm": 30.125, "grad_norm_var": 3.7603515625, "learning_rate": 0.0001, "loss": 7.4556, "loss/crossentropy": 2.256136628985405, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.1971954844892025, "step": 6000 }, { "epoch": 0.15025, "grad_norm": 30.0, "grad_norm_var": 2.1285807291666665, "learning_rate": 0.0001, "loss": 7.5146, "loss/crossentropy": 2.105835199356079, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.19158641248941422, "step": 6010 }, { "epoch": 0.1505, "grad_norm": 40.0, "grad_norm_var": 18.542708333333334, "learning_rate": 0.0001, "loss": 7.4687, "loss/crossentropy": 2.095852518081665, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.18887621480971575, "step": 6020 }, { "epoch": 0.15075, "grad_norm": 30.5, "grad_norm_var": 19.169791666666665, "learning_rate": 0.0001, "loss": 7.6293, "loss/crossentropy": 2.0981243371963503, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.22096464578062297, "step": 6030 }, { "epoch": 0.151, "grad_norm": 30.5, "grad_norm_var": 2.716080729166667, "learning_rate": 0.0001, "loss": 7.5935, "loss/crossentropy": 2.212895154953003, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.22050899863243104, "step": 6040 }, { "epoch": 0.15125, "grad_norm": 30.625, "grad_norm_var": 3.042122395833333, "learning_rate": 0.0001, "loss": 7.5, "loss/crossentropy": 2.1280829131603243, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.18497859146445988, "step": 6050 }, { "epoch": 0.1515, "grad_norm": 29.25, "grad_norm_var": 47.15807291666667, "learning_rate": 0.0001, "loss": 7.5469, "loss/crossentropy": 2.1731619790196417, "loss/hidden": 3.4578125, "loss/jsd": 0.0, "loss/logits": 0.22194090783596038, "step": 6060 }, { "epoch": 0.15175, "grad_norm": 30.5, "grad_norm_var": 3.218489583333333, "learning_rate": 0.0001, "loss": 7.4378, "loss/crossentropy": 2.047077566385269, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.1876080146059394, "step": 6070 }, { "epoch": 0.152, "grad_norm": 33.0, "grad_norm_var": 5.45, "learning_rate": 0.0001, "loss": 7.584, "loss/crossentropy": 2.1896773248910906, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.19448568094521762, "step": 6080 }, { "epoch": 0.15225, "grad_norm": 31.125, "grad_norm_var": 3.7634765625, "learning_rate": 0.0001, "loss": 7.6047, "loss/crossentropy": 2.1123378753662108, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.1931258851662278, "step": 6090 }, { "epoch": 0.1525, "grad_norm": 30.625, "grad_norm_var": 2.5488932291666666, "learning_rate": 0.0001, "loss": 7.5301, "loss/crossentropy": 2.2152541011571882, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.19427606835961342, "step": 6100 }, { "epoch": 0.15275, "grad_norm": 34.25, "grad_norm_var": 3.7421223958333334, "learning_rate": 0.0001, "loss": 7.5936, "loss/crossentropy": 2.109275272488594, "loss/hidden": 3.531640625, "loss/jsd": 0.0, "loss/logits": 0.2041168488562107, "step": 6110 }, { "epoch": 0.153, "grad_norm": 30.0, "grad_norm_var": 2.723893229166667, "learning_rate": 0.0001, "loss": 7.5107, "loss/crossentropy": 2.0911407291889192, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.19639678560197354, "step": 6120 }, { "epoch": 0.15325, "grad_norm": 36.0, "grad_norm_var": 3.2561848958333335, "learning_rate": 0.0001, "loss": 7.4788, "loss/crossentropy": 2.1557929456233977, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.2058223508298397, "step": 6130 }, { "epoch": 0.1535, "grad_norm": 30.0, "grad_norm_var": 84.3337890625, "learning_rate": 0.0001, "loss": 7.5656, "loss/crossentropy": 2.0974055036902426, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.19299208922311664, "step": 6140 }, { "epoch": 0.15375, "grad_norm": 30.625, "grad_norm_var": 85.70149739583333, "learning_rate": 0.0001, "loss": 7.4607, "loss/crossentropy": 2.0354842752218247, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.2099142510443926, "step": 6150 }, { "epoch": 0.154, "grad_norm": 29.75, "grad_norm_var": 9.5134765625, "learning_rate": 0.0001, "loss": 7.4221, "loss/crossentropy": 1.9916995614767075, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.17939293198287487, "step": 6160 }, { "epoch": 0.15425, "grad_norm": 33.5, "grad_norm_var": 10.60390625, "learning_rate": 0.0001, "loss": 7.5836, "loss/crossentropy": 2.097960978746414, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.1908213123679161, "step": 6170 }, { "epoch": 0.1545, "grad_norm": 31.125, "grad_norm_var": 3.0942057291666667, "learning_rate": 0.0001, "loss": 7.5073, "loss/crossentropy": 2.253797325491905, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19871626775711776, "step": 6180 }, { "epoch": 0.15475, "grad_norm": 30.375, "grad_norm_var": 1.3900390625, "learning_rate": 0.0001, "loss": 7.5189, "loss/crossentropy": 2.1756670624017715, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.19654417987912892, "step": 6190 }, { "epoch": 0.155, "grad_norm": 32.75, "grad_norm_var": 1.4181640625, "learning_rate": 0.0001, "loss": 7.3147, "loss/crossentropy": 2.1110276162624357, "loss/hidden": 3.294921875, "loss/jsd": 0.0, "loss/logits": 0.1801340376958251, "step": 6200 }, { "epoch": 0.15525, "grad_norm": 28.5, "grad_norm_var": 1.6223307291666667, "learning_rate": 0.0001, "loss": 7.6015, "loss/crossentropy": 2.1547244489192963, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.18071307614445686, "step": 6210 }, { "epoch": 0.1555, "grad_norm": 33.0, "grad_norm_var": 4.36875, "learning_rate": 0.0001, "loss": 7.5559, "loss/crossentropy": 2.1889162242412565, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.185367326810956, "step": 6220 }, { "epoch": 0.15575, "grad_norm": 38.75, "grad_norm_var": 25.381184895833332, "learning_rate": 0.0001, "loss": 7.5323, "loss/crossentropy": 2.023681116104126, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.19041543006896972, "step": 6230 }, { "epoch": 0.156, "grad_norm": 31.5, "grad_norm_var": 12.545833333333333, "learning_rate": 0.0001, "loss": 7.6086, "loss/crossentropy": 2.260247975587845, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.19489852003753186, "step": 6240 }, { "epoch": 0.15625, "grad_norm": 30.5, "grad_norm_var": 6.642122395833334, "learning_rate": 0.0001, "loss": 7.5419, "loss/crossentropy": 2.1487904608249666, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.1957969294860959, "step": 6250 }, { "epoch": 0.1565, "grad_norm": 31.25, "grad_norm_var": 5.493489583333333, "learning_rate": 0.0001, "loss": 7.588, "loss/crossentropy": 2.279403430223465, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.20682295374572277, "step": 6260 }, { "epoch": 0.15675, "grad_norm": 30.125, "grad_norm_var": 8.14765625, "learning_rate": 0.0001, "loss": 7.5894, "loss/crossentropy": 2.1407866299152376, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.20337907522916793, "step": 6270 }, { "epoch": 0.157, "grad_norm": 33.25, "grad_norm_var": 1314.8994140625, "learning_rate": 0.0001, "loss": 7.6872, "loss/crossentropy": 2.1802447110414507, "loss/hidden": 3.540625, "loss/jsd": 0.0, "loss/logits": 0.22727628983557224, "step": 6280 }, { "epoch": 0.15725, "grad_norm": 30.625, "grad_norm_var": 1378.0379557291667, "learning_rate": 0.0001, "loss": 7.5596, "loss/crossentropy": 2.1761491730809213, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.18902392424643039, "step": 6290 }, { "epoch": 0.1575, "grad_norm": 42.25, "grad_norm_var": 146.03020833333332, "learning_rate": 0.0001, "loss": 7.6132, "loss/crossentropy": 2.106969301402569, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.2025001836940646, "step": 6300 }, { "epoch": 0.15775, "grad_norm": 29.25, "grad_norm_var": 14.539322916666666, "learning_rate": 0.0001, "loss": 7.514, "loss/crossentropy": 2.0654825627803803, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19361731074750424, "step": 6310 }, { "epoch": 0.158, "grad_norm": 31.75, "grad_norm_var": 8.153580729166666, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.025081543624401, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.2247559091076255, "step": 6320 }, { "epoch": 0.15825, "grad_norm": 31.875, "grad_norm_var": 2.0431640625, "learning_rate": 0.0001, "loss": 7.4657, "loss/crossentropy": 2.136493813991547, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.19502629730850457, "step": 6330 }, { "epoch": 0.1585, "grad_norm": 33.0, "grad_norm_var": 2.189322916666667, "learning_rate": 0.0001, "loss": 7.5454, "loss/crossentropy": 2.176800549030304, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.201618373952806, "step": 6340 }, { "epoch": 0.15875, "grad_norm": 28.375, "grad_norm_var": 2.97265625, "learning_rate": 0.0001, "loss": 7.5116, "loss/crossentropy": 2.2223973661661147, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.2056947773322463, "step": 6350 }, { "epoch": 0.159, "grad_norm": 31.5, "grad_norm_var": 3.0580729166666667, "learning_rate": 0.0001, "loss": 7.5133, "loss/crossentropy": 2.147826671600342, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.19723634477704763, "step": 6360 }, { "epoch": 0.15925, "grad_norm": 35.0, "grad_norm_var": 18.772916666666667, "learning_rate": 0.0001, "loss": 7.679, "loss/crossentropy": 2.079222206771374, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.2045620433986187, "step": 6370 }, { "epoch": 0.1595, "grad_norm": 28.75, "grad_norm_var": 20.639518229166665, "learning_rate": 0.0001, "loss": 7.5245, "loss/crossentropy": 2.163569709658623, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.20100143328309059, "step": 6380 }, { "epoch": 0.15975, "grad_norm": 36.75, "grad_norm_var": 18.972916666666666, "learning_rate": 0.0001, "loss": 7.5705, "loss/crossentropy": 2.0975339651107787, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.19847188219428064, "step": 6390 }, { "epoch": 0.16, "grad_norm": 31.375, "grad_norm_var": 64.11770833333334, "learning_rate": 0.0001, "loss": 7.6051, "loss/crossentropy": 2.0802886128425597, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.20291686560958624, "step": 6400 }, { "epoch": 0.16025, "grad_norm": 32.25, "grad_norm_var": 74.28958333333334, "learning_rate": 0.0001, "loss": 7.4922, "loss/crossentropy": 2.1224244251847266, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.194674601405859, "step": 6410 }, { "epoch": 0.1605, "grad_norm": 62.5, "grad_norm_var": 69.86041666666667, "learning_rate": 0.0001, "loss": 7.6584, "loss/crossentropy": 2.1936039954423903, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.19804685264825822, "step": 6420 }, { "epoch": 0.16075, "grad_norm": 36.5, "grad_norm_var": 66.04576822916667, "learning_rate": 0.0001, "loss": 7.4329, "loss/crossentropy": 2.103436988592148, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19700358025729656, "step": 6430 }, { "epoch": 0.161, "grad_norm": 28.875, "grad_norm_var": 8.187239583333334, "learning_rate": 0.0001, "loss": 7.5397, "loss/crossentropy": 2.107574874162674, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.1931281227618456, "step": 6440 }, { "epoch": 0.16125, "grad_norm": 32.75, "grad_norm_var": 20.245768229166668, "learning_rate": 0.0001, "loss": 7.5289, "loss/crossentropy": 2.251564306020737, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.20565700754523278, "step": 6450 }, { "epoch": 0.1615, "grad_norm": 29.5, "grad_norm_var": 3.187239583333333, "learning_rate": 0.0001, "loss": 7.4563, "loss/crossentropy": 2.0432717867195604, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.17879284229129552, "step": 6460 }, { "epoch": 0.16175, "grad_norm": 40.0, "grad_norm_var": 8.5416015625, "learning_rate": 0.0001, "loss": 7.5052, "loss/crossentropy": 2.130405417084694, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.1868636028841138, "step": 6470 }, { "epoch": 0.162, "grad_norm": 30.625, "grad_norm_var": 8.284375, "learning_rate": 0.0001, "loss": 7.4387, "loss/crossentropy": 2.1113599717617033, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.20160141140222548, "step": 6480 }, { "epoch": 0.16225, "grad_norm": 30.5, "grad_norm_var": 10.103125, "learning_rate": 0.0001, "loss": 7.4812, "loss/crossentropy": 2.177107959985733, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.19870908968150616, "step": 6490 }, { "epoch": 0.1625, "grad_norm": 29.0, "grad_norm_var": 12.816080729166666, "learning_rate": 0.0001, "loss": 7.5634, "loss/crossentropy": 2.2469130218029023, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.22473043091595174, "step": 6500 }, { "epoch": 0.16275, "grad_norm": 62.75, "grad_norm_var": 67.71555989583334, "learning_rate": 0.0001, "loss": 7.4504, "loss/crossentropy": 2.1134796291589737, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.20341392308473588, "step": 6510 }, { "epoch": 0.163, "grad_norm": 38.5, "grad_norm_var": 132.73723958333332, "learning_rate": 0.0001, "loss": 7.6099, "loss/crossentropy": 2.0655704945325852, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.2016578745096922, "step": 6520 }, { "epoch": 0.16325, "grad_norm": 32.0, "grad_norm_var": 117.9541015625, "learning_rate": 0.0001, "loss": 7.668, "loss/crossentropy": 2.171022225916386, "loss/hidden": 3.575, "loss/jsd": 0.0, "loss/logits": 0.23951137959957122, "step": 6530 }, { "epoch": 0.1635, "grad_norm": 32.75, "grad_norm_var": 6.513541666666667, "learning_rate": 0.0001, "loss": 7.5344, "loss/crossentropy": 2.1631700932979583, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.19134688563644886, "step": 6540 }, { "epoch": 0.16375, "grad_norm": 31.25, "grad_norm_var": 10.718489583333334, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 2.154293045401573, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.2056902015581727, "step": 6550 }, { "epoch": 0.164, "grad_norm": 30.0, "grad_norm_var": 4.233268229166667, "learning_rate": 0.0001, "loss": 7.4959, "loss/crossentropy": 2.077804160118103, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.18005848359316587, "step": 6560 }, { "epoch": 0.16425, "grad_norm": 31.75, "grad_norm_var": 3.2343098958333334, "learning_rate": 0.0001, "loss": 7.3855, "loss/crossentropy": 2.238145849108696, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.20202569160610437, "step": 6570 }, { "epoch": 0.1645, "grad_norm": 29.25, "grad_norm_var": 4.284375, "learning_rate": 0.0001, "loss": 7.5453, "loss/crossentropy": 2.1169906362891195, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.1980356415733695, "step": 6580 }, { "epoch": 0.16475, "grad_norm": 32.5, "grad_norm_var": 1.6848307291666667, "learning_rate": 0.0001, "loss": 7.5151, "loss/crossentropy": 2.165386658906937, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.19807699434459208, "step": 6590 }, { "epoch": 0.165, "grad_norm": 30.125, "grad_norm_var": 2.905989583333333, "learning_rate": 0.0001, "loss": 7.4242, "loss/crossentropy": 2.2100414454936983, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.21563767176121473, "step": 6600 }, { "epoch": 0.16525, "grad_norm": 28.25, "grad_norm_var": 21.574934895833334, "learning_rate": 0.0001, "loss": 7.4757, "loss/crossentropy": 2.1352153360843658, "loss/hidden": 3.277734375, "loss/jsd": 0.0, "loss/logits": 0.176172699034214, "step": 6610 }, { "epoch": 0.1655, "grad_norm": 32.0, "grad_norm_var": 9.960872395833333, "learning_rate": 0.0001, "loss": 7.5596, "loss/crossentropy": 2.1773680597543716, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.18952815048396587, "step": 6620 }, { "epoch": 0.16575, "grad_norm": 28.625, "grad_norm_var": 4.380143229166666, "learning_rate": 0.0001, "loss": 7.56, "loss/crossentropy": 2.1314379185438157, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.19869489334523677, "step": 6630 }, { "epoch": 0.166, "grad_norm": 32.75, "grad_norm_var": 65.24765625, "learning_rate": 0.0001, "loss": 7.5285, "loss/crossentropy": 2.1807857722043993, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.20406192895025016, "step": 6640 }, { "epoch": 0.16625, "grad_norm": 36.75, "grad_norm_var": 61.514322916666664, "learning_rate": 0.0001, "loss": 7.5774, "loss/crossentropy": 2.1761706352233885, "loss/hidden": 3.490625, "loss/jsd": 0.0, "loss/logits": 0.22523548007011412, "step": 6650 }, { "epoch": 0.1665, "grad_norm": 40.0, "grad_norm_var": 10.858072916666666, "learning_rate": 0.0001, "loss": 7.4837, "loss/crossentropy": 2.0422482162714006, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.19435476139187813, "step": 6660 }, { "epoch": 0.16675, "grad_norm": 31.125, "grad_norm_var": 9.977083333333333, "learning_rate": 0.0001, "loss": 7.6686, "loss/crossentropy": 2.182368849217892, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.20404126346111298, "step": 6670 }, { "epoch": 0.167, "grad_norm": 32.0, "grad_norm_var": 11.715625, "learning_rate": 0.0001, "loss": 7.5378, "loss/crossentropy": 2.246191081404686, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.18508785720914603, "step": 6680 }, { "epoch": 0.16725, "grad_norm": 32.25, "grad_norm_var": 7.314322916666667, "learning_rate": 0.0001, "loss": 7.5093, "loss/crossentropy": 2.1791825108230114, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.18074298240244388, "step": 6690 }, { "epoch": 0.1675, "grad_norm": 30.75, "grad_norm_var": 8.992708333333333, "learning_rate": 0.0001, "loss": 7.5574, "loss/crossentropy": 2.1036971658468246, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.1981421067379415, "step": 6700 }, { "epoch": 0.16775, "grad_norm": 29.5, "grad_norm_var": 14.267643229166667, "learning_rate": 0.0001, "loss": 7.3344, "loss/crossentropy": 2.001809497177601, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.1832934280857444, "step": 6710 }, { "epoch": 0.168, "grad_norm": 61.25, "grad_norm_var": 63.89375, "learning_rate": 0.0001, "loss": 7.5556, "loss/crossentropy": 2.1391955494880674, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.20318875089287758, "step": 6720 }, { "epoch": 0.16825, "grad_norm": 33.75, "grad_norm_var": 56.9791015625, "learning_rate": 0.0001, "loss": 7.556, "loss/crossentropy": 2.155173195898533, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19701551645994186, "step": 6730 }, { "epoch": 0.1685, "grad_norm": 30.5, "grad_norm_var": 9.07890625, "learning_rate": 0.0001, "loss": 7.444, "loss/crossentropy": 2.1643510669469834, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.19851317517459394, "step": 6740 }, { "epoch": 0.16875, "grad_norm": 32.5, "grad_norm_var": 4.964518229166667, "learning_rate": 0.0001, "loss": 7.4711, "loss/crossentropy": 2.1824121534824372, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.20471413098275662, "step": 6750 }, { "epoch": 0.169, "grad_norm": 32.75, "grad_norm_var": 6.1009765625, "learning_rate": 0.0001, "loss": 7.4304, "loss/crossentropy": 2.0600946068763735, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.18951213210821152, "step": 6760 }, { "epoch": 0.16925, "grad_norm": 34.25, "grad_norm_var": 6.034309895833333, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.1303545042872427, "loss/hidden": 3.32578125, "loss/jsd": 0.0, "loss/logits": 0.19376144148409366, "step": 6770 }, { "epoch": 0.1695, "grad_norm": 35.75, "grad_norm_var": 8.519205729166666, "learning_rate": 0.0001, "loss": 7.4825, "loss/crossentropy": 2.223961615562439, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.1993227696046233, "step": 6780 }, { "epoch": 0.16975, "grad_norm": 52.5, "grad_norm_var": 31.4837890625, "learning_rate": 0.0001, "loss": 7.4688, "loss/crossentropy": 2.1645863845944406, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.19716022843495012, "step": 6790 }, { "epoch": 0.17, "grad_norm": 32.75, "grad_norm_var": 29.0400390625, "learning_rate": 0.0001, "loss": 7.5153, "loss/crossentropy": 2.251170714199543, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.21400584764778613, "step": 6800 }, { "epoch": 0.17025, "grad_norm": 32.5, "grad_norm_var": 4.945833333333334, "learning_rate": 0.0001, "loss": 7.4593, "loss/crossentropy": 2.1392437756061553, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.19613203704357146, "step": 6810 }, { "epoch": 0.1705, "grad_norm": 31.5, "grad_norm_var": 9.330989583333333, "learning_rate": 0.0001, "loss": 7.4589, "loss/crossentropy": 2.1152432590723036, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.1848454039543867, "step": 6820 }, { "epoch": 0.17075, "grad_norm": 33.5, "grad_norm_var": 15.155208333333333, "learning_rate": 0.0001, "loss": 7.5984, "loss/crossentropy": 2.208776918053627, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.20963791087269784, "step": 6830 }, { "epoch": 0.171, "grad_norm": 31.0, "grad_norm_var": 14.967708333333333, "learning_rate": 0.0001, "loss": 7.5348, "loss/crossentropy": 2.1368553161621096, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.2036329936236143, "step": 6840 }, { "epoch": 0.17125, "grad_norm": 33.25, "grad_norm_var": 11.615625, "learning_rate": 0.0001, "loss": 7.4543, "loss/crossentropy": 2.2001425683498383, "loss/hidden": 3.291015625, "loss/jsd": 0.0, "loss/logits": 0.18403711169958115, "step": 6850 }, { "epoch": 0.1715, "grad_norm": 31.875, "grad_norm_var": 4.045247395833333, "learning_rate": 0.0001, "loss": 7.4924, "loss/crossentropy": 2.0965635985136033, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.19117785301059484, "step": 6860 }, { "epoch": 0.17175, "grad_norm": 32.0, "grad_norm_var": 6.587434895833334, "learning_rate": 0.0001, "loss": 7.4879, "loss/crossentropy": 2.106214761734009, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19950247537344695, "step": 6870 }, { "epoch": 0.172, "grad_norm": 30.75, "grad_norm_var": 8.289518229166667, "learning_rate": 0.0001, "loss": 7.4561, "loss/crossentropy": 2.1776740878820418, "loss/hidden": 3.386328125, "loss/jsd": 0.0, "loss/logits": 0.19133987911045552, "step": 6880 }, { "epoch": 0.17225, "grad_norm": 30.25, "grad_norm_var": 2.77265625, "learning_rate": 0.0001, "loss": 7.4422, "loss/crossentropy": 2.082959216833115, "loss/hidden": 3.544921875, "loss/jsd": 0.0, "loss/logits": 0.2135216325521469, "step": 6890 }, { "epoch": 0.1725, "grad_norm": 31.0, "grad_norm_var": 4.1556640625, "learning_rate": 0.0001, "loss": 7.6053, "loss/crossentropy": 2.2018980890512467, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.21013570427894593, "step": 6900 }, { "epoch": 0.17275, "grad_norm": 39.5, "grad_norm_var": 6.5322265625, "learning_rate": 0.0001, "loss": 7.5447, "loss/crossentropy": 2.2833516895771027, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.19603817090392112, "step": 6910 }, { "epoch": 0.173, "grad_norm": 31.125, "grad_norm_var": 12.677018229166666, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.166572627425194, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.19017961733043193, "step": 6920 }, { "epoch": 0.17325, "grad_norm": 32.0, "grad_norm_var": 3.78125, "learning_rate": 0.0001, "loss": 7.6572, "loss/crossentropy": 2.1658833861351012, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.22312654759734868, "step": 6930 }, { "epoch": 0.1735, "grad_norm": 31.5, "grad_norm_var": 3.5747395833333333, "learning_rate": 0.0001, "loss": 7.4706, "loss/crossentropy": 2.1533073887228964, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.20075356736779212, "step": 6940 }, { "epoch": 0.17375, "grad_norm": 33.0, "grad_norm_var": 7.611393229166667, "learning_rate": 0.0001, "loss": 7.6678, "loss/crossentropy": 2.2583969831466675, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.2209668504074216, "step": 6950 }, { "epoch": 0.174, "grad_norm": 30.0, "grad_norm_var": 11.0087890625, "learning_rate": 0.0001, "loss": 7.3805, "loss/crossentropy": 2.1416700780391693, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.20209096949547528, "step": 6960 }, { "epoch": 0.17425, "grad_norm": 29.0, "grad_norm_var": 5.171875, "learning_rate": 0.0001, "loss": 7.5472, "loss/crossentropy": 2.1253055185079575, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19859984703361988, "step": 6970 }, { "epoch": 0.1745, "grad_norm": 30.0, "grad_norm_var": 6.6603515625, "learning_rate": 0.0001, "loss": 7.5648, "loss/crossentropy": 2.182023701816797, "loss/hidden": 3.29296875, "loss/jsd": 0.0, "loss/logits": 0.18740688636898994, "step": 6980 }, { "epoch": 0.17475, "grad_norm": 30.75, "grad_norm_var": 2.4947916666666665, "learning_rate": 0.0001, "loss": 7.4586, "loss/crossentropy": 2.1650559276342394, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.1944490736350417, "step": 6990 }, { "epoch": 0.175, "grad_norm": 32.25, "grad_norm_var": 5.6587890625, "learning_rate": 0.0001, "loss": 7.569, "loss/crossentropy": 2.177471086382866, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.19053987860679628, "step": 7000 }, { "epoch": 0.17525, "grad_norm": 32.5, "grad_norm_var": 26.516080729166667, "learning_rate": 0.0001, "loss": 7.4753, "loss/crossentropy": 2.154606765508652, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.19910989850759506, "step": 7010 }, { "epoch": 0.1755, "grad_norm": 34.5, "grad_norm_var": 39.670572916666664, "learning_rate": 0.0001, "loss": 7.631, "loss/crossentropy": 2.197327023744583, "loss/hidden": 3.541796875, "loss/jsd": 0.0, "loss/logits": 0.21263533756136893, "step": 7020 }, { "epoch": 0.17575, "grad_norm": 32.25, "grad_norm_var": 19.970833333333335, "learning_rate": 0.0001, "loss": 7.5712, "loss/crossentropy": 2.143507385253906, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.20819550901651382, "step": 7030 }, { "epoch": 0.176, "grad_norm": 30.25, "grad_norm_var": 2.755989583333333, "learning_rate": 0.0001, "loss": 7.6156, "loss/crossentropy": 2.160289117693901, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.2082687295973301, "step": 7040 }, { "epoch": 0.17625, "grad_norm": 34.0, "grad_norm_var": 1.6035807291666666, "learning_rate": 0.0001, "loss": 7.5106, "loss/crossentropy": 2.096612122654915, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.2005817864090204, "step": 7050 }, { "epoch": 0.1765, "grad_norm": 29.5, "grad_norm_var": 1.7921223958333334, "learning_rate": 0.0001, "loss": 7.4678, "loss/crossentropy": 2.150402194261551, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.18803946152329445, "step": 7060 }, { "epoch": 0.17675, "grad_norm": 30.375, "grad_norm_var": 1.7754557291666666, "learning_rate": 0.0001, "loss": 7.5601, "loss/crossentropy": 2.111455664038658, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.1900298461318016, "step": 7070 }, { "epoch": 0.177, "grad_norm": 30.875, "grad_norm_var": 3.0497395833333334, "learning_rate": 0.0001, "loss": 7.5888, "loss/crossentropy": 2.172677582502365, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.1932773429900408, "step": 7080 }, { "epoch": 0.17725, "grad_norm": 29.25, "grad_norm_var": 3.131705729166667, "learning_rate": 0.0001, "loss": 7.527, "loss/crossentropy": 2.2066776901483536, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.20891736317425966, "step": 7090 }, { "epoch": 0.1775, "grad_norm": 33.0, "grad_norm_var": 5.386713240151209e+18, "learning_rate": 0.0001, "loss": 7.6693, "loss/crossentropy": 2.1923015132546424, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.2022488683462143, "step": 7100 }, { "epoch": 0.17775, "grad_norm": 31.375, "grad_norm_var": 2.9284656292120494e+18, "learning_rate": 0.0001, "loss": 7.557, "loss/crossentropy": 2.0847490578889847, "loss/hidden": 3.552734375, "loss/jsd": 0.0, "loss/logits": 0.2148862171918154, "step": 7110 }, { "epoch": 0.178, "grad_norm": 30.625, "grad_norm_var": 1.3369140625, "learning_rate": 0.0001, "loss": 7.4864, "loss/crossentropy": 2.1249582156538964, "loss/hidden": 3.296484375, "loss/jsd": 0.0, "loss/logits": 0.19168496485799552, "step": 7120 }, { "epoch": 0.17825, "grad_norm": 31.875, "grad_norm_var": 2.878125, "learning_rate": 0.0001, "loss": 7.36, "loss/crossentropy": 2.173662793636322, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.1978321423754096, "step": 7130 }, { "epoch": 0.1785, "grad_norm": 34.5, "grad_norm_var": 3.9723307291666665, "learning_rate": 0.0001, "loss": 7.5663, "loss/crossentropy": 2.1832578271627425, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.22951248846948147, "step": 7140 }, { "epoch": 0.17875, "grad_norm": 32.75, "grad_norm_var": 2.022330729166667, "learning_rate": 0.0001, "loss": 7.5795, "loss/crossentropy": 2.1229765862226486, "loss/hidden": 3.28203125, "loss/jsd": 0.0, "loss/logits": 0.1857854513451457, "step": 7150 }, { "epoch": 0.179, "grad_norm": 30.25, "grad_norm_var": 42.5978515625, "learning_rate": 0.0001, "loss": 7.4524, "loss/crossentropy": 2.1213574737310408, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.22352912444621326, "step": 7160 }, { "epoch": 0.17925, "grad_norm": 29.625, "grad_norm_var": 44.90774739583333, "learning_rate": 0.0001, "loss": 7.5702, "loss/crossentropy": 2.2016272962093355, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.19035741928964853, "step": 7170 }, { "epoch": 0.1795, "grad_norm": 33.25, "grad_norm_var": 2.08515625, "learning_rate": 0.0001, "loss": 7.5485, "loss/crossentropy": 2.1106868594884873, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.19449420645833015, "step": 7180 }, { "epoch": 0.17975, "grad_norm": 33.25, "grad_norm_var": 3.414322916666667, "learning_rate": 0.0001, "loss": 7.4434, "loss/crossentropy": 2.1368062049150467, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.1976731013506651, "step": 7190 }, { "epoch": 0.18, "grad_norm": 33.5, "grad_norm_var": 3.5483723958333333, "learning_rate": 0.0001, "loss": 7.4368, "loss/crossentropy": 2.1658909559249877, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.1897917227819562, "step": 7200 }, { "epoch": 0.18025, "grad_norm": 32.5, "grad_norm_var": 3.39765625, "learning_rate": 0.0001, "loss": 7.5235, "loss/crossentropy": 2.0003643825650217, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.1881322082132101, "step": 7210 }, { "epoch": 0.1805, "grad_norm": 32.25, "grad_norm_var": 7.2369140625, "learning_rate": 0.0001, "loss": 7.5351, "loss/crossentropy": 2.1550097078084947, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.1885005658492446, "step": 7220 }, { "epoch": 0.18075, "grad_norm": 29.75, "grad_norm_var": 3.6809895833333335, "learning_rate": 0.0001, "loss": 7.3615, "loss/crossentropy": 2.089699313044548, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.18596011865884066, "step": 7230 }, { "epoch": 0.181, "grad_norm": 29.625, "grad_norm_var": 3.780989583333333, "learning_rate": 0.0001, "loss": 7.521, "loss/crossentropy": 2.1315871119499206, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.2100736267864704, "step": 7240 }, { "epoch": 0.18125, "grad_norm": 34.75, "grad_norm_var": 4.068684895833333, "learning_rate": 0.0001, "loss": 7.4489, "loss/crossentropy": 2.0509297475218773, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.21798528153449298, "step": 7250 }, { "epoch": 0.1815, "grad_norm": 33.0, "grad_norm_var": 2.546875, "learning_rate": 0.0001, "loss": 7.5147, "loss/crossentropy": 2.097806680202484, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.1954873636364937, "step": 7260 }, { "epoch": 0.18175, "grad_norm": 31.375, "grad_norm_var": 1.9270833333333333, "learning_rate": 0.0001, "loss": 7.5335, "loss/crossentropy": 2.1867170676589014, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.20327441468834878, "step": 7270 }, { "epoch": 0.182, "grad_norm": 31.75, "grad_norm_var": 2.0434895833333333, "learning_rate": 0.0001, "loss": 7.5098, "loss/crossentropy": 2.059905408322811, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.1984263988211751, "step": 7280 }, { "epoch": 0.18225, "grad_norm": 30.875, "grad_norm_var": 3.0004557291666667, "learning_rate": 0.0001, "loss": 7.6258, "loss/crossentropy": 2.19571368098259, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.20168307051062584, "step": 7290 }, { "epoch": 0.1825, "grad_norm": 30.25, "grad_norm_var": 2.7051432291666666, "learning_rate": 0.0001, "loss": 7.5923, "loss/crossentropy": 2.12765002399683, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.19315951522439717, "step": 7300 }, { "epoch": 0.18275, "grad_norm": 29.375, "grad_norm_var": 1.746875, "learning_rate": 0.0001, "loss": 7.5356, "loss/crossentropy": 2.2317890375852585, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.19801790416240692, "step": 7310 }, { "epoch": 0.183, "grad_norm": 30.5, "grad_norm_var": 3.206184895833333, "learning_rate": 0.0001, "loss": 7.4805, "loss/crossentropy": 2.115889000892639, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.20432772096246482, "step": 7320 }, { "epoch": 0.18325, "grad_norm": 29.875, "grad_norm_var": 7.782291666666667, "learning_rate": 0.0001, "loss": 7.5582, "loss/crossentropy": 2.20442833006382, "loss/hidden": 3.488671875, "loss/jsd": 0.0, "loss/logits": 0.20083660595119, "step": 7330 }, { "epoch": 0.1835, "grad_norm": 31.5, "grad_norm_var": 3.6947916666666667, "learning_rate": 0.0001, "loss": 7.6294, "loss/crossentropy": 2.210834649205208, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.19781918153166772, "step": 7340 }, { "epoch": 0.18375, "grad_norm": 31.25, "grad_norm_var": 3.7791015625, "learning_rate": 0.0001, "loss": 7.5733, "loss/crossentropy": 2.094112278521061, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.1902735486626625, "step": 7350 }, { "epoch": 0.184, "grad_norm": 31.25, "grad_norm_var": 1.3796223958333333, "learning_rate": 0.0001, "loss": 7.4645, "loss/crossentropy": 2.1219489932060243, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.1875064203515649, "step": 7360 }, { "epoch": 0.18425, "grad_norm": 31.0, "grad_norm_var": 5.262955729166666, "learning_rate": 0.0001, "loss": 7.5375, "loss/crossentropy": 2.1768325984478, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.20637014620006083, "step": 7370 }, { "epoch": 0.1845, "grad_norm": 33.25, "grad_norm_var": 4.51015625, "learning_rate": 0.0001, "loss": 7.5972, "loss/crossentropy": 2.088278591632843, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.19868529513478278, "step": 7380 }, { "epoch": 0.18475, "grad_norm": 28.125, "grad_norm_var": 7.376041666666667, "learning_rate": 0.0001, "loss": 7.4234, "loss/crossentropy": 2.162215715646744, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.20086740422993898, "step": 7390 }, { "epoch": 0.185, "grad_norm": 34.0, "grad_norm_var": 141.65201822916666, "learning_rate": 0.0001, "loss": 7.4958, "loss/crossentropy": 2.076205277442932, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.1904180521145463, "step": 7400 }, { "epoch": 0.18525, "grad_norm": 31.125, "grad_norm_var": 138.825, "learning_rate": 0.0001, "loss": 7.574, "loss/crossentropy": 2.1355942092835902, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.2197154738008976, "step": 7410 }, { "epoch": 0.1855, "grad_norm": 30.75, "grad_norm_var": 3.405847189186238e+18, "learning_rate": 0.0001, "loss": 7.4496, "loss/crossentropy": 2.152878683805466, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.18649988640099763, "step": 7420 }, { "epoch": 0.18575, "grad_norm": 30.25, "grad_norm_var": 3.40584718949382e+18, "learning_rate": 0.0001, "loss": 7.4482, "loss/crossentropy": 2.100820633769035, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.1907477645203471, "step": 7430 }, { "epoch": 0.186, "grad_norm": 30.75, "grad_norm_var": 2.299934895833333, "learning_rate": 0.0001, "loss": 7.436, "loss/crossentropy": 2.1752007991075515, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.19189766012132167, "step": 7440 }, { "epoch": 0.18625, "grad_norm": 30.25, "grad_norm_var": 5.689322916666667, "learning_rate": 0.0001, "loss": 7.5691, "loss/crossentropy": 2.242092598974705, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.19479812681674957, "step": 7450 }, { "epoch": 0.1865, "grad_norm": 30.625, "grad_norm_var": 5.6369140625, "learning_rate": 0.0001, "loss": 7.6412, "loss/crossentropy": 2.1058298379182814, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.22841914147138595, "step": 7460 }, { "epoch": 0.18675, "grad_norm": 29.0, "grad_norm_var": 4.435872395833333, "learning_rate": 0.0001, "loss": 7.3797, "loss/crossentropy": 2.0960675440728664, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.1850858678109944, "step": 7470 }, { "epoch": 0.187, "grad_norm": 30.125, "grad_norm_var": 5.34765625, "learning_rate": 0.0001, "loss": 7.5432, "loss/crossentropy": 2.180506870150566, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.20674382094293833, "step": 7480 }, { "epoch": 0.18725, "grad_norm": 31.25, "grad_norm_var": 4.718489583333334, "learning_rate": 0.0001, "loss": 7.5242, "loss/crossentropy": 2.101006045937538, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.19774735551327466, "step": 7490 }, { "epoch": 0.1875, "grad_norm": 31.375, "grad_norm_var": 1.7791666666666666, "learning_rate": 0.0001, "loss": 7.5166, "loss/crossentropy": 2.069665388762951, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.19810831565409898, "step": 7500 }, { "epoch": 0.18775, "grad_norm": 29.75, "grad_norm_var": 2.07890625, "learning_rate": 0.0001, "loss": 7.4959, "loss/crossentropy": 2.0420530915260313, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.18399292901158332, "step": 7510 }, { "epoch": 0.188, "grad_norm": 30.875, "grad_norm_var": 4.0587890625, "learning_rate": 0.0001, "loss": 7.5325, "loss/crossentropy": 2.1312338694930077, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19504649545997382, "step": 7520 }, { "epoch": 0.18825, "grad_norm": 43.0, "grad_norm_var": 9.308333333333334, "learning_rate": 0.0001, "loss": 7.6141, "loss/crossentropy": 2.2894584849476813, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.20183086302131414, "step": 7530 }, { "epoch": 0.1885, "grad_norm": 34.0, "grad_norm_var": 10.281705729166667, "learning_rate": 0.0001, "loss": 7.5206, "loss/crossentropy": 2.1557983949780466, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.19000910818576813, "step": 7540 }, { "epoch": 0.18875, "grad_norm": 29.25, "grad_norm_var": 4.21015625, "learning_rate": 0.0001, "loss": 7.5249, "loss/crossentropy": 2.124994584918022, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.19348742607980968, "step": 7550 }, { "epoch": 0.189, "grad_norm": 29.75, "grad_norm_var": 4.334830729166667, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 2.154696786403656, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.19012149404734374, "step": 7560 }, { "epoch": 0.18925, "grad_norm": 30.25, "grad_norm_var": 3.9872395833333334, "learning_rate": 0.0001, "loss": 7.487, "loss/crossentropy": 2.1938005700707435, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19971045572310686, "step": 7570 }, { "epoch": 0.1895, "grad_norm": 30.375, "grad_norm_var": 3.5624176423088947e+18, "learning_rate": 0.0001, "loss": 7.4698, "loss/crossentropy": 2.120267179608345, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.19198095686733724, "step": 7580 }, { "epoch": 0.18975, "grad_norm": 30.75, "grad_norm_var": 4.756184895833333, "learning_rate": 0.0001, "loss": 7.4743, "loss/crossentropy": 2.1569736182689665, "loss/hidden": 3.53984375, "loss/jsd": 0.0, "loss/logits": 0.21590765863656997, "step": 7590 }, { "epoch": 0.19, "grad_norm": 30.125, "grad_norm_var": 2.9166015625, "learning_rate": 0.0001, "loss": 7.4344, "loss/crossentropy": 2.1558305487036704, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18447545487433673, "step": 7600 }, { "epoch": 0.19025, "grad_norm": 31.375, "grad_norm_var": 4.1025390625, "learning_rate": 0.0001, "loss": 7.4889, "loss/crossentropy": 2.0612099058926105, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.19961078523192555, "step": 7610 }, { "epoch": 0.1905, "grad_norm": 28.625, "grad_norm_var": 3.3087890625, "learning_rate": 0.0001, "loss": 7.6558, "loss/crossentropy": 2.04723744392395, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.22622217051684856, "step": 7620 }, { "epoch": 0.19075, "grad_norm": 29.25, "grad_norm_var": 5.770833333333333, "learning_rate": 0.0001, "loss": 7.4133, "loss/crossentropy": 2.1897004157304765, "loss/hidden": 3.312109375, "loss/jsd": 0.0, "loss/logits": 0.1826148485764861, "step": 7630 }, { "epoch": 0.191, "grad_norm": 31.0, "grad_norm_var": 4.260416666666667, "learning_rate": 0.0001, "loss": 7.4913, "loss/crossentropy": 2.157025161385536, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.22510031294077634, "step": 7640 }, { "epoch": 0.19125, "grad_norm": 29.875, "grad_norm_var": 2.2270182291666667, "learning_rate": 0.0001, "loss": 7.517, "loss/crossentropy": 2.090579715371132, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.1889673013240099, "step": 7650 }, { "epoch": 0.1915, "grad_norm": 31.375, "grad_norm_var": 3.0770833333333334, "learning_rate": 0.0001, "loss": 7.6179, "loss/crossentropy": 2.1681595921516417, "loss/hidden": 3.555078125, "loss/jsd": 0.0, "loss/logits": 0.2365275712683797, "step": 7660 }, { "epoch": 0.19175, "grad_norm": 31.875, "grad_norm_var": 3.4280598958333335, "learning_rate": 0.0001, "loss": 7.5348, "loss/crossentropy": 2.207126745581627, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.19211382810026406, "step": 7670 }, { "epoch": 0.192, "grad_norm": 28.875, "grad_norm_var": 3.796875, "learning_rate": 0.0001, "loss": 7.553, "loss/crossentropy": 2.161148224771023, "loss/hidden": 3.52734375, "loss/jsd": 0.0, "loss/logits": 0.22304172217845916, "step": 7680 }, { "epoch": 0.19225, "grad_norm": 29.0, "grad_norm_var": 3.8889973958333335, "learning_rate": 0.0001, "loss": 7.5009, "loss/crossentropy": 2.0679882526397706, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.2086691778153181, "step": 7690 }, { "epoch": 0.1925, "grad_norm": 30.75, "grad_norm_var": 3.2552083333333335, "learning_rate": 0.0001, "loss": 7.5372, "loss/crossentropy": 2.0694913983345034, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.19288846384733915, "step": 7700 }, { "epoch": 0.19275, "grad_norm": 32.0, "grad_norm_var": 1.8072916666666667, "learning_rate": 0.0001, "loss": 7.5833, "loss/crossentropy": 2.2242089927196504, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.1934451697394252, "step": 7710 }, { "epoch": 0.193, "grad_norm": 34.5, "grad_norm_var": 12.787955729166667, "learning_rate": 0.0001, "loss": 7.6402, "loss/crossentropy": 2.157019394636154, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19785581808537245, "step": 7720 }, { "epoch": 0.19325, "grad_norm": 32.0, "grad_norm_var": 12.268489583333333, "learning_rate": 0.0001, "loss": 7.3766, "loss/crossentropy": 2.226413035392761, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.20680981017649175, "step": 7730 }, { "epoch": 0.1935, "grad_norm": 35.5, "grad_norm_var": 6.34140625, "learning_rate": 0.0001, "loss": 7.5761, "loss/crossentropy": 2.0320019692182543, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.19264598488807677, "step": 7740 }, { "epoch": 0.19375, "grad_norm": 34.5, "grad_norm_var": 5.897916666666666, "learning_rate": 0.0001, "loss": 7.453, "loss/crossentropy": 2.0598455399274824, "loss/hidden": 3.284375, "loss/jsd": 0.0, "loss/logits": 0.1851517017930746, "step": 7750 }, { "epoch": 0.194, "grad_norm": 29.0, "grad_norm_var": 6.7025390625, "learning_rate": 0.0001, "loss": 7.4278, "loss/crossentropy": 2.1927490830421448, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.20509992372244595, "step": 7760 }, { "epoch": 0.19425, "grad_norm": 31.375, "grad_norm_var": 5.700455729166666, "learning_rate": 0.0001, "loss": 7.4874, "loss/crossentropy": 2.154856140911579, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.20458675045520067, "step": 7770 }, { "epoch": 0.1945, "grad_norm": 37.5, "grad_norm_var": 6.71015625, "learning_rate": 0.0001, "loss": 7.5452, "loss/crossentropy": 2.1202862530946733, "loss/hidden": 3.3078125, "loss/jsd": 0.0, "loss/logits": 0.18985650166869164, "step": 7780 }, { "epoch": 0.19475, "grad_norm": 36.25, "grad_norm_var": 6.918684895833334, "learning_rate": 0.0001, "loss": 7.5427, "loss/crossentropy": 2.1549450784921644, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.18924216069281102, "step": 7790 }, { "epoch": 0.195, "grad_norm": 35.25, "grad_norm_var": 6.2291015625, "learning_rate": 0.0001, "loss": 7.4733, "loss/crossentropy": 2.0557655200362204, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.1919045101851225, "step": 7800 }, { "epoch": 0.19525, "grad_norm": 46.75, "grad_norm_var": 50.040625, "learning_rate": 0.0001, "loss": 7.4911, "loss/crossentropy": 2.171146012097597, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.20546058733016254, "step": 7810 }, { "epoch": 0.1955, "grad_norm": 33.25, "grad_norm_var": 15.828580729166667, "learning_rate": 0.0001, "loss": 7.5692, "loss/crossentropy": 2.1003697514533997, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.20100659225136042, "step": 7820 }, { "epoch": 0.19575, "grad_norm": 36.0, "grad_norm_var": 29.226497395833334, "learning_rate": 0.0001, "loss": 7.5267, "loss/crossentropy": 2.1091859377920628, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.19987453976646066, "step": 7830 }, { "epoch": 0.196, "grad_norm": 48.5, "grad_norm_var": 42.45201822916667, "learning_rate": 0.0001, "loss": 7.4625, "loss/crossentropy": 2.111352452635765, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.18483758307993411, "step": 7840 }, { "epoch": 0.19625, "grad_norm": 30.5, "grad_norm_var": 25.65390625, "learning_rate": 0.0001, "loss": 7.5415, "loss/crossentropy": 2.0419967979192735, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.2173704007640481, "step": 7850 }, { "epoch": 0.1965, "grad_norm": 30.875, "grad_norm_var": 21.546875, "learning_rate": 0.0001, "loss": 7.4951, "loss/crossentropy": 2.084785957634449, "loss/hidden": 3.304296875, "loss/jsd": 0.0, "loss/logits": 0.17884304635226728, "step": 7860 }, { "epoch": 0.19675, "grad_norm": 31.75, "grad_norm_var": 18.153059895833334, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.1023138776421546, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.20335167981684207, "step": 7870 }, { "epoch": 0.197, "grad_norm": 36.25, "grad_norm_var": 12.093489583333334, "learning_rate": 0.0001, "loss": 7.5467, "loss/crossentropy": 2.171536546945572, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.21221924368292094, "step": 7880 }, { "epoch": 0.19725, "grad_norm": 28.75, "grad_norm_var": 9.983333333333333, "learning_rate": 0.0001, "loss": 7.3785, "loss/crossentropy": 2.1119479715824125, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.18313394635915756, "step": 7890 }, { "epoch": 0.1975, "grad_norm": 37.25, "grad_norm_var": 11.928580729166667, "learning_rate": 0.0001, "loss": 7.4945, "loss/crossentropy": 1.9945223838090897, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.19973532520234585, "step": 7900 }, { "epoch": 0.19775, "grad_norm": 32.25, "grad_norm_var": 11.839583333333334, "learning_rate": 0.0001, "loss": 7.4542, "loss/crossentropy": 2.1550634860992433, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.19187356438487768, "step": 7910 }, { "epoch": 0.198, "grad_norm": 31.5, "grad_norm_var": 12.584309895833334, "learning_rate": 0.0001, "loss": 7.4245, "loss/crossentropy": 2.092045524716377, "loss/hidden": 3.315625, "loss/jsd": 0.0, "loss/logits": 0.18641792330890894, "step": 7920 }, { "epoch": 0.19825, "grad_norm": 28.875, "grad_norm_var": 12.959375, "learning_rate": 0.0001, "loss": 7.4616, "loss/crossentropy": 2.1302111998200415, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.2105752557516098, "step": 7930 }, { "epoch": 0.1985, "grad_norm": 30.75, "grad_norm_var": 14.9431640625, "learning_rate": 0.0001, "loss": 7.4869, "loss/crossentropy": 2.1739411771297457, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.19759192429482936, "step": 7940 }, { "epoch": 0.19875, "grad_norm": 33.25, "grad_norm_var": 12.703580729166667, "learning_rate": 0.0001, "loss": 7.4435, "loss/crossentropy": 2.0400329381227493, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.18711388055235148, "step": 7950 }, { "epoch": 0.199, "grad_norm": 40.0, "grad_norm_var": 10.454622395833333, "learning_rate": 0.0001, "loss": 7.3235, "loss/crossentropy": 2.1173346668481825, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.200854654237628, "step": 7960 }, { "epoch": 0.19925, "grad_norm": 31.125, "grad_norm_var": 18.376822916666665, "learning_rate": 0.0001, "loss": 7.551, "loss/crossentropy": 2.256546127796173, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.2098003163933754, "step": 7970 }, { "epoch": 0.1995, "grad_norm": 30.625, "grad_norm_var": 11.159375, "learning_rate": 0.0001, "loss": 7.5863, "loss/crossentropy": 2.0988403081893923, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.2052627831697464, "step": 7980 }, { "epoch": 0.19975, "grad_norm": 34.75, "grad_norm_var": 3.539322916666667, "learning_rate": 0.0001, "loss": 7.5501, "loss/crossentropy": 2.157486143708229, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19415573356673121, "step": 7990 }, { "epoch": 0.2, "grad_norm": 28.75, "grad_norm_var": 5.282291666666667, "learning_rate": 0.0001, "loss": 7.5487, "loss/crossentropy": 2.204603946208954, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.19015649259090422, "step": 8000 }, { "epoch": 0.20025, "grad_norm": 29.5, "grad_norm_var": 3.59375, "learning_rate": 0.0001, "loss": 7.4113, "loss/crossentropy": 2.1252332285046576, "loss/hidden": 3.287109375, "loss/jsd": 0.0, "loss/logits": 0.1751571564003825, "step": 8010 }, { "epoch": 0.2005, "grad_norm": 31.0, "grad_norm_var": 3.603125, "learning_rate": 0.0001, "loss": 7.498, "loss/crossentropy": 2.048645743727684, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.1930268170312047, "step": 8020 }, { "epoch": 0.20075, "grad_norm": 31.5, "grad_norm_var": 3.3822265625, "learning_rate": 0.0001, "loss": 7.4496, "loss/crossentropy": 2.1435810789465903, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.19256617035716772, "step": 8030 }, { "epoch": 0.201, "grad_norm": 31.875, "grad_norm_var": 5.8837890625, "learning_rate": 0.0001, "loss": 7.5991, "loss/crossentropy": 2.06428082883358, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.18795610759407283, "step": 8040 }, { "epoch": 0.20125, "grad_norm": 31.75, "grad_norm_var": 4.08515625, "learning_rate": 0.0001, "loss": 7.5106, "loss/crossentropy": 2.2008739590644835, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.1890289282426238, "step": 8050 }, { "epoch": 0.2015, "grad_norm": 33.25, "grad_norm_var": 2.7708333333333335, "learning_rate": 0.0001, "loss": 7.4302, "loss/crossentropy": 2.1280903786420824, "loss/hidden": 3.511328125, "loss/jsd": 0.0, "loss/logits": 0.20874056480824948, "step": 8060 }, { "epoch": 0.20175, "grad_norm": 29.5, "grad_norm_var": 3.2035807291666667, "learning_rate": 0.0001, "loss": 7.4529, "loss/crossentropy": 2.1864554077386855, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19697167426347734, "step": 8070 }, { "epoch": 0.202, "grad_norm": 35.0, "grad_norm_var": 2.278125, "learning_rate": 0.0001, "loss": 7.5744, "loss/crossentropy": 2.211556833982468, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.20347684845328332, "step": 8080 }, { "epoch": 0.20225, "grad_norm": 28.875, "grad_norm_var": 33.37890625, "learning_rate": 0.0001, "loss": 7.5018, "loss/crossentropy": 2.185184845328331, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.2022322118282318, "step": 8090 }, { "epoch": 0.2025, "grad_norm": 31.875, "grad_norm_var": 3.3931640625, "learning_rate": 0.0001, "loss": 7.5218, "loss/crossentropy": 2.1828758120536804, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.1962041450664401, "step": 8100 }, { "epoch": 0.20275, "grad_norm": 29.875, "grad_norm_var": 1.9488932291666667, "learning_rate": 0.0001, "loss": 7.4982, "loss/crossentropy": 2.159992370009422, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.20945403575897217, "step": 8110 }, { "epoch": 0.203, "grad_norm": 31.25, "grad_norm_var": 4.748958333333333, "learning_rate": 0.0001, "loss": 7.4657, "loss/crossentropy": 2.1400886207818983, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.19184730034321545, "step": 8120 }, { "epoch": 0.20325, "grad_norm": 32.0, "grad_norm_var": 1.0733723958333334, "learning_rate": 0.0001, "loss": 7.5559, "loss/crossentropy": 2.112087991833687, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.20091654695570468, "step": 8130 }, { "epoch": 0.2035, "grad_norm": 30.625, "grad_norm_var": 1.921875, "learning_rate": 0.0001, "loss": 7.5644, "loss/crossentropy": 2.139628532528877, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.19385650400072335, "step": 8140 }, { "epoch": 0.20375, "grad_norm": 33.25, "grad_norm_var": 2.379166666666667, "learning_rate": 0.0001, "loss": 7.5002, "loss/crossentropy": 2.091212958097458, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.2040353151038289, "step": 8150 }, { "epoch": 0.204, "grad_norm": 32.0, "grad_norm_var": 2.2228515625, "learning_rate": 0.0001, "loss": 7.4494, "loss/crossentropy": 2.2567614316940308, "loss/hidden": 3.35078125, "loss/jsd": 0.0, "loss/logits": 0.18545747231692075, "step": 8160 }, { "epoch": 0.20425, "grad_norm": 31.125, "grad_norm_var": 229.56666666666666, "learning_rate": 0.0001, "loss": 7.4626, "loss/crossentropy": 2.1603645354509355, "loss/hidden": 3.539453125, "loss/jsd": 0.0, "loss/logits": 0.21300703901797532, "step": 8170 }, { "epoch": 0.2045, "grad_norm": 28.625, "grad_norm_var": 4.2806640625, "learning_rate": 0.0001, "loss": 7.5117, "loss/crossentropy": 2.1204755783081053, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.18535650707781315, "step": 8180 }, { "epoch": 0.20475, "grad_norm": 31.25, "grad_norm_var": 2.1072265625, "learning_rate": 0.0001, "loss": 7.5637, "loss/crossentropy": 2.1639575958251953, "loss/hidden": 3.398828125, "loss/jsd": 0.0, "loss/logits": 0.20935236364603044, "step": 8190 }, { "epoch": 0.205, "grad_norm": 28.0, "grad_norm_var": 2.2494140625, "learning_rate": 0.0001, "loss": 7.3019, "loss/crossentropy": 2.056389382481575, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.1874819153919816, "step": 8200 }, { "epoch": 0.20525, "grad_norm": 32.5, "grad_norm_var": 4.274934895833334, "learning_rate": 0.0001, "loss": 7.5162, "loss/crossentropy": 2.1393348813056945, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.20481376871466636, "step": 8210 }, { "epoch": 0.2055, "grad_norm": 29.625, "grad_norm_var": 5.553125, "learning_rate": 0.0001, "loss": 7.5046, "loss/crossentropy": 2.1898138865828516, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.1965622954070568, "step": 8220 }, { "epoch": 0.20575, "grad_norm": 34.5, "grad_norm_var": 2.899825551450536e+18, "learning_rate": 0.0001, "loss": 7.6057, "loss/crossentropy": 2.139435464143753, "loss/hidden": 3.51328125, "loss/jsd": 0.0, "loss/logits": 0.20182280670851468, "step": 8230 }, { "epoch": 0.206, "grad_norm": 37.75, "grad_norm_var": 2.899825549761839e+18, "learning_rate": 0.0001, "loss": 7.5603, "loss/crossentropy": 2.248063361644745, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.21399107333272696, "step": 8240 }, { "epoch": 0.20625, "grad_norm": 33.5, "grad_norm_var": 114.23118489583334, "learning_rate": 0.0001, "loss": 7.4545, "loss/crossentropy": 2.140778873860836, "loss/hidden": 3.3421875, "loss/jsd": 0.0, "loss/logits": 0.18022785130888225, "step": 8250 }, { "epoch": 0.2065, "grad_norm": 30.5, "grad_norm_var": 110.5072265625, "learning_rate": 0.0001, "loss": 7.5364, "loss/crossentropy": 2.259766247868538, "loss/hidden": 3.312109375, "loss/jsd": 0.0, "loss/logits": 0.19342643208801746, "step": 8260 }, { "epoch": 0.20675, "grad_norm": 31.75, "grad_norm_var": 0.7666015625, "learning_rate": 0.0001, "loss": 7.6531, "loss/crossentropy": 2.158949288725853, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19808744061738254, "step": 8270 }, { "epoch": 0.207, "grad_norm": 32.0, "grad_norm_var": 1.4311848958333333, "learning_rate": 0.0001, "loss": 7.6092, "loss/crossentropy": 2.111227548122406, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.20227425117045642, "step": 8280 }, { "epoch": 0.20725, "grad_norm": 31.5, "grad_norm_var": 1.56640625, "learning_rate": 0.0001, "loss": 7.5372, "loss/crossentropy": 2.0839753821492195, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.18574036825448276, "step": 8290 }, { "epoch": 0.2075, "grad_norm": 31.375, "grad_norm_var": 0.8384765625, "learning_rate": 0.0001, "loss": 7.536, "loss/crossentropy": 2.1465844094753264, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.2083981443196535, "step": 8300 }, { "epoch": 0.20775, "grad_norm": 32.5, "grad_norm_var": 20.8416015625, "learning_rate": 0.0001, "loss": 7.4722, "loss/crossentropy": 2.1032681286334993, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.19353157542645932, "step": 8310 }, { "epoch": 0.208, "grad_norm": 29.75, "grad_norm_var": 2.1395833333333334, "learning_rate": 0.0001, "loss": 7.4453, "loss/crossentropy": 2.092882976680994, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.20083120074123145, "step": 8320 }, { "epoch": 0.20825, "grad_norm": 29.125, "grad_norm_var": 2.42265625, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.047010327875614, "loss/hidden": 3.26484375, "loss/jsd": 0.0, "loss/logits": 0.17298451280221344, "step": 8330 }, { "epoch": 0.2085, "grad_norm": 31.625, "grad_norm_var": 2.8114583333333334, "learning_rate": 0.0001, "loss": 7.6227, "loss/crossentropy": 2.1705314934253694, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.2104344118386507, "step": 8340 }, { "epoch": 0.20875, "grad_norm": 30.75, "grad_norm_var": 15.13515625, "learning_rate": 0.0001, "loss": 7.5292, "loss/crossentropy": 2.251467025279999, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.206053901091218, "step": 8350 }, { "epoch": 0.209, "grad_norm": 30.25, "grad_norm_var": 9.75, "learning_rate": 0.0001, "loss": 7.5297, "loss/crossentropy": 2.2416324824094773, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.18910795077681541, "step": 8360 }, { "epoch": 0.20925, "grad_norm": 49.75, "grad_norm_var": 177.46145833333333, "learning_rate": 0.0001, "loss": 7.4604, "loss/crossentropy": 1.9895980581641197, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.19730695113539695, "step": 8370 }, { "epoch": 0.2095, "grad_norm": 56.25, "grad_norm_var": 334.9145182291667, "learning_rate": 0.0001, "loss": 7.5698, "loss/crossentropy": 2.2018245279788973, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.21164307594299317, "step": 8380 }, { "epoch": 0.20975, "grad_norm": 32.75, "grad_norm_var": 72.22890625, "learning_rate": 0.0001, "loss": 7.5902, "loss/crossentropy": 2.1519288808107375, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.19217228647321463, "step": 8390 }, { "epoch": 0.21, "grad_norm": 30.0, "grad_norm_var": 7.153059895833334, "learning_rate": 0.0001, "loss": 7.451, "loss/crossentropy": 2.1779698938131333, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.1994494464248419, "step": 8400 }, { "epoch": 0.21025, "grad_norm": 32.0, "grad_norm_var": 7.124739583333334, "learning_rate": 0.0001, "loss": 7.5019, "loss/crossentropy": 2.140475983917713, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.20144703481346368, "step": 8410 }, { "epoch": 0.2105, "grad_norm": 31.625, "grad_norm_var": 1.2639973958333333, "learning_rate": 0.0001, "loss": 7.4268, "loss/crossentropy": 2.2145755022764204, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.19062853194773197, "step": 8420 }, { "epoch": 0.21075, "grad_norm": 31.0, "grad_norm_var": 2.620247395833333, "learning_rate": 0.0001, "loss": 7.5149, "loss/crossentropy": 2.3047141253948213, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.20336541533470154, "step": 8430 }, { "epoch": 0.211, "grad_norm": 31.875, "grad_norm_var": 2.7514973958333333, "learning_rate": 0.0001, "loss": 7.5319, "loss/crossentropy": 2.232821634411812, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.21666326355189086, "step": 8440 }, { "epoch": 0.21125, "grad_norm": 33.5, "grad_norm_var": 1.49765625, "learning_rate": 0.0001, "loss": 7.5718, "loss/crossentropy": 2.092620450258255, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.20793931670486926, "step": 8450 }, { "epoch": 0.2115, "grad_norm": 32.5, "grad_norm_var": 3.1306640625, "learning_rate": 0.0001, "loss": 7.4344, "loss/crossentropy": 2.151240213960409, "loss/hidden": 3.40078125, "loss/jsd": 0.0, "loss/logits": 0.2046731175854802, "step": 8460 }, { "epoch": 0.21175, "grad_norm": 33.75, "grad_norm_var": 2.81640625, "learning_rate": 0.0001, "loss": 7.5581, "loss/crossentropy": 2.1520677715539933, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.19603268206119537, "step": 8470 }, { "epoch": 0.212, "grad_norm": 31.0, "grad_norm_var": 4.092643229166667, "learning_rate": 0.0001, "loss": 7.5382, "loss/crossentropy": 2.1510326713323593, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.18077819589525462, "step": 8480 }, { "epoch": 0.21225, "grad_norm": 32.75, "grad_norm_var": 4.753059895833333, "learning_rate": 0.0001, "loss": 7.4642, "loss/crossentropy": 2.0811485938727854, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.18939817287027835, "step": 8490 }, { "epoch": 0.2125, "grad_norm": 31.25, "grad_norm_var": 5.855989583333334, "learning_rate": 0.0001, "loss": 7.5158, "loss/crossentropy": 2.0734725266695024, "loss/hidden": 3.316796875, "loss/jsd": 0.0, "loss/logits": 0.18070020200684667, "step": 8500 }, { "epoch": 0.21275, "grad_norm": 29.25, "grad_norm_var": 5.5462890625, "learning_rate": 0.0001, "loss": 7.4723, "loss/crossentropy": 2.144814722239971, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19649898763746024, "step": 8510 }, { "epoch": 0.213, "grad_norm": 28.625, "grad_norm_var": 3.2309895833333333, "learning_rate": 0.0001, "loss": 7.3777, "loss/crossentropy": 2.0877407863736153, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.19544295389205218, "step": 8520 }, { "epoch": 0.21325, "grad_norm": 30.5, "grad_norm_var": 1.6583333333333334, "learning_rate": 0.0001, "loss": 7.5179, "loss/crossentropy": 2.1433625385165214, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.19359017722308636, "step": 8530 }, { "epoch": 0.2135, "grad_norm": 34.25, "grad_norm_var": 1.6510416666666667, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.221453693509102, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.20547619834542274, "step": 8540 }, { "epoch": 0.21375, "grad_norm": 32.0, "grad_norm_var": 2.5791015625, "learning_rate": 0.0001, "loss": 7.5163, "loss/crossentropy": 2.213938871026039, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.20213278364390136, "step": 8550 }, { "epoch": 0.214, "grad_norm": 34.25, "grad_norm_var": 2.784375, "learning_rate": 0.0001, "loss": 7.6361, "loss/crossentropy": 2.133354830741882, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.207733928412199, "step": 8560 }, { "epoch": 0.21425, "grad_norm": 34.0, "grad_norm_var": 7.219205729166666, "learning_rate": 0.0001, "loss": 7.5661, "loss/crossentropy": 2.0587344974279405, "loss/hidden": 3.550390625, "loss/jsd": 0.0, "loss/logits": 0.22256299555301667, "step": 8570 }, { "epoch": 0.2145, "grad_norm": 31.0, "grad_norm_var": 7.805989583333333, "learning_rate": 0.0001, "loss": 7.5549, "loss/crossentropy": 2.0949306935071945, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.19931861553341151, "step": 8580 }, { "epoch": 0.21475, "grad_norm": 28.5, "grad_norm_var": 8.095247395833333, "learning_rate": 0.0001, "loss": 7.478, "loss/crossentropy": 2.1063201159238814, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.20397470220923425, "step": 8590 }, { "epoch": 0.215, "grad_norm": 33.5, "grad_norm_var": 28.859375, "learning_rate": 0.0001, "loss": 7.5632, "loss/crossentropy": 2.177382430434227, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.2048982411623001, "step": 8600 }, { "epoch": 0.21525, "grad_norm": 31.75, "grad_norm_var": 1.7955729166666667, "learning_rate": 0.0001, "loss": 7.5325, "loss/crossentropy": 2.0719266816973687, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.19100363925099373, "step": 8610 }, { "epoch": 0.2155, "grad_norm": 29.25, "grad_norm_var": 1.7372395833333334, "learning_rate": 0.0001, "loss": 7.5877, "loss/crossentropy": 2.147695118188858, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.19447963647544383, "step": 8620 }, { "epoch": 0.21575, "grad_norm": 31.375, "grad_norm_var": 5.406184895833333, "learning_rate": 0.0001, "loss": 7.5509, "loss/crossentropy": 2.1029836043715475, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.19613635465502738, "step": 8630 }, { "epoch": 0.216, "grad_norm": 29.0, "grad_norm_var": 10.32265625, "learning_rate": 0.0001, "loss": 7.555, "loss/crossentropy": 2.1027876287698746, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.18709390722215175, "step": 8640 }, { "epoch": 0.21625, "grad_norm": 31.75, "grad_norm_var": 7.4978515625, "learning_rate": 0.0001, "loss": 7.481, "loss/crossentropy": 2.078503981232643, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.19765212927013637, "step": 8650 }, { "epoch": 0.2165, "grad_norm": 28.375, "grad_norm_var": 6.212434895833334, "learning_rate": 0.0001, "loss": 7.5951, "loss/crossentropy": 2.1425766557455064, "loss/hidden": 3.522265625, "loss/jsd": 0.0, "loss/logits": 0.2285369848832488, "step": 8660 }, { "epoch": 0.21675, "grad_norm": 37.0, "grad_norm_var": 5.02890625, "learning_rate": 0.0001, "loss": 7.4619, "loss/crossentropy": 2.094095268845558, "loss/hidden": 3.33671875, "loss/jsd": 0.0, "loss/logits": 0.18905023839324714, "step": 8670 }, { "epoch": 0.217, "grad_norm": 32.25, "grad_norm_var": 5.205989583333333, "learning_rate": 0.0001, "loss": 7.6255, "loss/crossentropy": 2.0726657152175902, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.19401923939585686, "step": 8680 }, { "epoch": 0.21725, "grad_norm": 31.125, "grad_norm_var": 1.8240281960963725e+18, "learning_rate": 0.0001, "loss": 7.476, "loss/crossentropy": 2.1063093028962614, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.18630194589495658, "step": 8690 }, { "epoch": 0.2175, "grad_norm": 33.0, "grad_norm_var": 9.2947265625, "learning_rate": 0.0001, "loss": 7.6565, "loss/crossentropy": 2.188694643974304, "loss/hidden": 3.561328125, "loss/jsd": 0.0, "loss/logits": 0.21225023418664932, "step": 8700 }, { "epoch": 0.21775, "grad_norm": 30.0, "grad_norm_var": 3.644791666666667, "learning_rate": 0.0001, "loss": 7.4697, "loss/crossentropy": 2.1252513483166693, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.19880297109484674, "step": 8710 }, { "epoch": 0.218, "grad_norm": 30.875, "grad_norm_var": 11.028059895833334, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.1700349181890486, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.20139609836041927, "step": 8720 }, { "epoch": 0.21825, "grad_norm": 32.25, "grad_norm_var": 1.4189153379885778e+18, "learning_rate": 0.0001, "loss": 7.616, "loss/crossentropy": 2.032346047461033, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.1754102895502001, "step": 8730 }, { "epoch": 0.2185, "grad_norm": 31.25, "grad_norm_var": 2.25390625, "learning_rate": 0.0001, "loss": 7.5088, "loss/crossentropy": 2.1633499920368195, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.1996068175882101, "step": 8740 }, { "epoch": 0.21875, "grad_norm": 29.375, "grad_norm_var": 1.6122395833333334, "learning_rate": 0.0001, "loss": 7.5088, "loss/crossentropy": 2.1443722933530807, "loss/hidden": 3.501171875, "loss/jsd": 0.0, "loss/logits": 0.2017217763699591, "step": 8750 }, { "epoch": 0.219, "grad_norm": 32.25, "grad_norm_var": 14.970572916666667, "learning_rate": 0.0001, "loss": 7.5224, "loss/crossentropy": 2.176077055931091, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.2048908168449998, "step": 8760 }, { "epoch": 0.21925, "grad_norm": 31.0, "grad_norm_var": 14.087955729166667, "learning_rate": 0.0001, "loss": 7.5417, "loss/crossentropy": 2.183764386177063, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.20730934757739305, "step": 8770 }, { "epoch": 0.2195, "grad_norm": 46.5, "grad_norm_var": 23.55390625, "learning_rate": 0.0001, "loss": 7.5464, "loss/crossentropy": 2.0760655224323274, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.18335200790315867, "step": 8780 }, { "epoch": 0.21975, "grad_norm": 31.125, "grad_norm_var": 16.1369140625, "learning_rate": 0.0001, "loss": 7.5001, "loss/crossentropy": 2.215297505259514, "loss/hidden": 3.33671875, "loss/jsd": 0.0, "loss/logits": 0.19053485784679652, "step": 8790 }, { "epoch": 0.22, "grad_norm": 41.75, "grad_norm_var": 10.250455729166667, "learning_rate": 0.0001, "loss": 7.5664, "loss/crossentropy": 2.1419076189398765, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.1872857511974871, "step": 8800 }, { "epoch": 0.22025, "grad_norm": 31.625, "grad_norm_var": 13.099934895833334, "learning_rate": 0.0001, "loss": 7.5152, "loss/crossentropy": 2.136802351474762, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.19524938464164734, "step": 8810 }, { "epoch": 0.2205, "grad_norm": 29.0, "grad_norm_var": 18.0634765625, "learning_rate": 0.0001, "loss": 7.6215, "loss/crossentropy": 2.231374368071556, "loss/hidden": 3.314453125, "loss/jsd": 0.0, "loss/logits": 0.18930297438055277, "step": 8820 }, { "epoch": 0.22075, "grad_norm": 30.125, "grad_norm_var": 18.358333333333334, "learning_rate": 0.0001, "loss": 7.5354, "loss/crossentropy": 2.2225961208343508, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.19855661746114492, "step": 8830 }, { "epoch": 0.221, "grad_norm": 31.625, "grad_norm_var": 18.312239583333334, "learning_rate": 0.0001, "loss": 7.6216, "loss/crossentropy": 2.1328855097293853, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.20832530297338964, "step": 8840 }, { "epoch": 0.22125, "grad_norm": 28.375, "grad_norm_var": 8.085416666666667, "learning_rate": 0.0001, "loss": 7.3697, "loss/crossentropy": 2.0526474595069883, "loss/hidden": 3.5015625, "loss/jsd": 0.0, "loss/logits": 0.21284359116107227, "step": 8850 }, { "epoch": 0.2215, "grad_norm": 37.25, "grad_norm_var": 9.942708333333334, "learning_rate": 0.0001, "loss": 7.4967, "loss/crossentropy": 2.1749773651361464, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.20905132126063108, "step": 8860 }, { "epoch": 0.22175, "grad_norm": 32.0, "grad_norm_var": 6.322330729166667, "learning_rate": 0.0001, "loss": 7.503, "loss/crossentropy": 2.108053085952997, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.20318386573344469, "step": 8870 }, { "epoch": 0.222, "grad_norm": 34.0, "grad_norm_var": 7.685416666666667, "learning_rate": 0.0001, "loss": 7.3618, "loss/crossentropy": 2.1234827637672424, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.18484860006719828, "step": 8880 }, { "epoch": 0.22225, "grad_norm": 29.75, "grad_norm_var": 8.220247395833333, "learning_rate": 0.0001, "loss": 7.393, "loss/crossentropy": 2.099182015657425, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19038616605103015, "step": 8890 }, { "epoch": 0.2225, "grad_norm": 31.0, "grad_norm_var": 7.569205729166667, "learning_rate": 0.0001, "loss": 7.4768, "loss/crossentropy": 2.148054042458534, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.2065868368372321, "step": 8900 }, { "epoch": 0.22275, "grad_norm": 28.75, "grad_norm_var": 6.009375, "learning_rate": 0.0001, "loss": 7.5001, "loss/crossentropy": 2.126410482823849, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.210067143663764, "step": 8910 }, { "epoch": 0.223, "grad_norm": 37.5, "grad_norm_var": 7.870572916666666, "learning_rate": 0.0001, "loss": 7.4678, "loss/crossentropy": 2.1258621901273727, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.19669083636254073, "step": 8920 }, { "epoch": 0.22325, "grad_norm": 30.0, "grad_norm_var": 8.908268229166667, "learning_rate": 0.0001, "loss": 7.5039, "loss/crossentropy": 2.1579898312687873, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18557702358812095, "step": 8930 }, { "epoch": 0.2235, "grad_norm": 28.5, "grad_norm_var": 8.448372395833333, "learning_rate": 0.0001, "loss": 7.5477, "loss/crossentropy": 2.2186635404825212, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.19810736775398255, "step": 8940 }, { "epoch": 0.22375, "grad_norm": 36.25, "grad_norm_var": 7.304166666666666, "learning_rate": 0.0001, "loss": 7.5522, "loss/crossentropy": 2.1860512644052505, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.18810464218258857, "step": 8950 }, { "epoch": 0.224, "grad_norm": 35.75, "grad_norm_var": 4.498372395833333, "learning_rate": 0.0001, "loss": 7.5105, "loss/crossentropy": 2.244818753004074, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19993619099259377, "step": 8960 }, { "epoch": 0.22425, "grad_norm": 33.25, "grad_norm_var": 4.34375, "learning_rate": 0.0001, "loss": 7.6077, "loss/crossentropy": 2.1209320187568665, "loss/hidden": 3.47421875, "loss/jsd": 0.0, "loss/logits": 0.2167982429265976, "step": 8970 }, { "epoch": 0.2245, "grad_norm": 31.0, "grad_norm_var": 18.992122395833334, "learning_rate": 0.0001, "loss": 7.5306, "loss/crossentropy": 2.008339713513851, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19748742282390594, "step": 8980 }, { "epoch": 0.22475, "grad_norm": 29.5, "grad_norm_var": 20.906705729166667, "learning_rate": 0.0001, "loss": 7.5837, "loss/crossentropy": 2.245701877772808, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19117865581065416, "step": 8990 }, { "epoch": 0.225, "grad_norm": 32.75, "grad_norm_var": 5.1119140625, "learning_rate": 0.0001, "loss": 7.5251, "loss/crossentropy": 2.050145834684372, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.19412143267691134, "step": 9000 }, { "epoch": 0.22525, "grad_norm": 32.75, "grad_norm_var": 7.176822916666667, "learning_rate": 0.0001, "loss": 7.5851, "loss/crossentropy": 2.190397572517395, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.20755842179059983, "step": 9010 }, { "epoch": 0.2255, "grad_norm": 30.75, "grad_norm_var": 8.977083333333333, "learning_rate": 0.0001, "loss": 7.4264, "loss/crossentropy": 2.284222900867462, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.21356079690158367, "step": 9020 }, { "epoch": 0.22575, "grad_norm": 31.75, "grad_norm_var": 3.0639973958333333, "learning_rate": 0.0001, "loss": 7.5988, "loss/crossentropy": 2.1513551443815233, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.19452885985374452, "step": 9030 }, { "epoch": 0.226, "grad_norm": 31.25, "grad_norm_var": 1.7830729166666666, "learning_rate": 0.0001, "loss": 7.4915, "loss/crossentropy": 2.2022497206926346, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.1973773717880249, "step": 9040 }, { "epoch": 0.22625, "grad_norm": 28.25, "grad_norm_var": 2.584830729166667, "learning_rate": 0.0001, "loss": 7.5443, "loss/crossentropy": 2.1286503240466117, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.18610758930444718, "step": 9050 }, { "epoch": 0.2265, "grad_norm": 27.75, "grad_norm_var": 3.345572916666667, "learning_rate": 0.0001, "loss": 7.5659, "loss/crossentropy": 2.1127908319234847, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19702097605913876, "step": 9060 }, { "epoch": 0.22675, "grad_norm": 29.0, "grad_norm_var": 2.716080729166667, "learning_rate": 0.0001, "loss": 7.5575, "loss/crossentropy": 2.0748134687542916, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.19112884253263474, "step": 9070 }, { "epoch": 0.227, "grad_norm": 30.875, "grad_norm_var": 1.0764973958333333, "learning_rate": 0.0001, "loss": 7.6302, "loss/crossentropy": 2.1477744698524477, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.198454162850976, "step": 9080 }, { "epoch": 0.22725, "grad_norm": 31.0, "grad_norm_var": 2.3197265625, "learning_rate": 0.0001, "loss": 7.4827, "loss/crossentropy": 2.1293207883834837, "loss/hidden": 3.55625, "loss/jsd": 0.0, "loss/logits": 0.21182843409478663, "step": 9090 }, { "epoch": 0.2275, "grad_norm": 34.25, "grad_norm_var": 3.1333333333333333, "learning_rate": 0.0001, "loss": 7.5027, "loss/crossentropy": 2.145271519571543, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.18950196234509348, "step": 9100 }, { "epoch": 0.22775, "grad_norm": 30.5, "grad_norm_var": 2.9337890625, "learning_rate": 0.0001, "loss": 7.5571, "loss/crossentropy": 2.1665233090519904, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.20093025229871272, "step": 9110 }, { "epoch": 0.228, "grad_norm": 31.5, "grad_norm_var": 1.9718098958333334, "learning_rate": 0.0001, "loss": 7.4992, "loss/crossentropy": 2.119904951751232, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.19110188409686088, "step": 9120 }, { "epoch": 0.22825, "grad_norm": 32.5, "grad_norm_var": 1.21875, "learning_rate": 0.0001, "loss": 7.5861, "loss/crossentropy": 2.169039398431778, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.18969318978488445, "step": 9130 }, { "epoch": 0.2285, "grad_norm": 31.125, "grad_norm_var": 2.0134765625, "learning_rate": 0.0001, "loss": 7.4457, "loss/crossentropy": 2.202808904647827, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.21587605867534876, "step": 9140 }, { "epoch": 0.22875, "grad_norm": 30.125, "grad_norm_var": 1.9014973958333334, "learning_rate": 0.0001, "loss": 7.5869, "loss/crossentropy": 2.188230502605438, "loss/hidden": 3.32578125, "loss/jsd": 0.0, "loss/logits": 0.20162757411599158, "step": 9150 }, { "epoch": 0.229, "grad_norm": 31.125, "grad_norm_var": 1.6268229166666666, "learning_rate": 0.0001, "loss": 7.5041, "loss/crossentropy": 2.166210785508156, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.19896902665495872, "step": 9160 }, { "epoch": 0.22925, "grad_norm": 31.875, "grad_norm_var": 1.3582682291666666, "learning_rate": 0.0001, "loss": 7.4795, "loss/crossentropy": 2.1638981848955154, "loss/hidden": 3.28046875, "loss/jsd": 0.0, "loss/logits": 0.18999446034431458, "step": 9170 }, { "epoch": 0.2295, "grad_norm": 34.0, "grad_norm_var": 2.6177083333333333, "learning_rate": 0.0001, "loss": 7.4529, "loss/crossentropy": 2.170040412247181, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.22052502054721118, "step": 9180 }, { "epoch": 0.22975, "grad_norm": 31.0, "grad_norm_var": 3.2400390625, "learning_rate": 0.0001, "loss": 7.4625, "loss/crossentropy": 2.114721930027008, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.19851209484040738, "step": 9190 }, { "epoch": 0.23, "grad_norm": 31.375, "grad_norm_var": 2.2457682291666665, "learning_rate": 0.0001, "loss": 7.5862, "loss/crossentropy": 2.1541184708476067, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.1960083631798625, "step": 9200 }, { "epoch": 0.23025, "grad_norm": 31.25, "grad_norm_var": 2.676041666666667, "learning_rate": 0.0001, "loss": 7.4881, "loss/crossentropy": 2.1309454582631586, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.19452987853437662, "step": 9210 }, { "epoch": 0.2305, "grad_norm": 31.75, "grad_norm_var": 3.671875, "learning_rate": 0.0001, "loss": 7.4938, "loss/crossentropy": 2.18781658411026, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.2072685670107603, "step": 9220 }, { "epoch": 0.23075, "grad_norm": 33.25, "grad_norm_var": 2.2556640625, "learning_rate": 0.0001, "loss": 7.4355, "loss/crossentropy": 2.1591438576579094, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.18837961312383414, "step": 9230 }, { "epoch": 0.231, "grad_norm": 30.25, "grad_norm_var": 2.2087890625, "learning_rate": 0.0001, "loss": 7.4499, "loss/crossentropy": 2.133663722872734, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.18666253574192523, "step": 9240 }, { "epoch": 0.23125, "grad_norm": 31.875, "grad_norm_var": 2.1806640625, "learning_rate": 0.0001, "loss": 7.5916, "loss/crossentropy": 2.12740980386734, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.19293057937175034, "step": 9250 }, { "epoch": 0.2315, "grad_norm": 32.5, "grad_norm_var": 2.1947265625, "learning_rate": 0.0001, "loss": 7.5307, "loss/crossentropy": 2.1379271537065505, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.19501187074929477, "step": 9260 }, { "epoch": 0.23175, "grad_norm": 30.5, "grad_norm_var": 2.8264973958333335, "learning_rate": 0.0001, "loss": 7.4995, "loss/crossentropy": 2.220619598031044, "loss/hidden": 3.3421875, "loss/jsd": 0.0, "loss/logits": 0.19177700616419316, "step": 9270 }, { "epoch": 0.232, "grad_norm": 31.25, "grad_norm_var": 4.855208333333334, "learning_rate": 0.0001, "loss": 7.461, "loss/crossentropy": 2.1386048540472986, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.19117407780140638, "step": 9280 }, { "epoch": 0.23225, "grad_norm": 33.75, "grad_norm_var": 3.1122395833333334, "learning_rate": 0.0001, "loss": 7.5308, "loss/crossentropy": 2.201369822025299, "loss/hidden": 3.293359375, "loss/jsd": 0.0, "loss/logits": 0.18732962422072888, "step": 9290 }, { "epoch": 0.2325, "grad_norm": 30.75, "grad_norm_var": 3.4559895833333334, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.172528564929962, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.19425926376134156, "step": 9300 }, { "epoch": 0.23275, "grad_norm": 27.5, "grad_norm_var": 2.83515625, "learning_rate": 0.0001, "loss": 7.4503, "loss/crossentropy": 2.021423862874508, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.20009233951568603, "step": 9310 }, { "epoch": 0.233, "grad_norm": 32.75, "grad_norm_var": 5.4125, "learning_rate": 0.0001, "loss": 7.5898, "loss/crossentropy": 2.197957542538643, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.21913623586297035, "step": 9320 }, { "epoch": 0.23325, "grad_norm": 31.75, "grad_norm_var": 3.70390625, "learning_rate": 0.0001, "loss": 7.5196, "loss/crossentropy": 2.00966841802001, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.21357689071446656, "step": 9330 }, { "epoch": 0.2335, "grad_norm": 32.75, "grad_norm_var": 1.5629557291666667, "learning_rate": 0.0001, "loss": 7.5156, "loss/crossentropy": 2.200144296884537, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.19718017429113388, "step": 9340 }, { "epoch": 0.23375, "grad_norm": 30.75, "grad_norm_var": 1.6541666666666666, "learning_rate": 0.0001, "loss": 7.4899, "loss/crossentropy": 1.991941288113594, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.1878137281164527, "step": 9350 }, { "epoch": 0.234, "grad_norm": 31.125, "grad_norm_var": 1.6934895833333334, "learning_rate": 0.0001, "loss": 7.5461, "loss/crossentropy": 2.0645473077893257, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.20249587260186672, "step": 9360 }, { "epoch": 0.23425, "grad_norm": 30.625, "grad_norm_var": 3.3462890625, "learning_rate": 0.0001, "loss": 7.4449, "loss/crossentropy": 2.203160837292671, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.20446450710296632, "step": 9370 }, { "epoch": 0.2345, "grad_norm": 32.25, "grad_norm_var": 4.572330729166667, "learning_rate": 0.0001, "loss": 7.451, "loss/crossentropy": 2.1672566562891005, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.22198531460016965, "step": 9380 }, { "epoch": 0.23475, "grad_norm": 30.5, "grad_norm_var": 3.42265625, "learning_rate": 0.0001, "loss": 7.6011, "loss/crossentropy": 2.0229784891009333, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.18394104316830634, "step": 9390 }, { "epoch": 0.235, "grad_norm": 30.75, "grad_norm_var": 4.330208333333333, "learning_rate": 0.0001, "loss": 7.4942, "loss/crossentropy": 2.1677876338362694, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.22315683960914612, "step": 9400 }, { "epoch": 0.23525, "grad_norm": 31.375, "grad_norm_var": 19.045572916666668, "learning_rate": 0.0001, "loss": 7.5569, "loss/crossentropy": 2.2457904130220414, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.20503853298723698, "step": 9410 }, { "epoch": 0.2355, "grad_norm": 31.0, "grad_norm_var": 18.926822916666666, "learning_rate": 0.0001, "loss": 7.4461, "loss/crossentropy": 2.142396827042103, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.1910883378237486, "step": 9420 }, { "epoch": 0.23575, "grad_norm": 31.25, "grad_norm_var": 8.741666666666667, "learning_rate": 0.0001, "loss": 7.475, "loss/crossentropy": 2.063981272280216, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.20397737752646208, "step": 9430 }, { "epoch": 0.236, "grad_norm": 30.0, "grad_norm_var": 9.062434895833333, "learning_rate": 0.0001, "loss": 7.4995, "loss/crossentropy": 1.9866131611168385, "loss/hidden": 3.516015625, "loss/jsd": 0.0, "loss/logits": 0.19697492588311433, "step": 9440 }, { "epoch": 0.23625, "grad_norm": 40.25, "grad_norm_var": 12.64140625, "learning_rate": 0.0001, "loss": 7.5931, "loss/crossentropy": 2.168704579770565, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.19889446310698985, "step": 9450 }, { "epoch": 0.2365, "grad_norm": 29.25, "grad_norm_var": 51.16223958333333, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.241490375995636, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.19278081078082324, "step": 9460 }, { "epoch": 0.23675, "grad_norm": 31.125, "grad_norm_var": 49.08118489583333, "learning_rate": 0.0001, "loss": 7.44, "loss/crossentropy": 2.0530834168195726, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.1845833698287606, "step": 9470 }, { "epoch": 0.237, "grad_norm": 35.25, "grad_norm_var": 5.190559895833333, "learning_rate": 0.0001, "loss": 7.4893, "loss/crossentropy": 2.1680017322301866, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19126822520047426, "step": 9480 }, { "epoch": 0.23725, "grad_norm": 29.75, "grad_norm_var": 4.1728515625, "learning_rate": 0.0001, "loss": 7.5044, "loss/crossentropy": 2.1760893553495406, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.1902306279167533, "step": 9490 }, { "epoch": 0.2375, "grad_norm": 30.5, "grad_norm_var": 7.610872395833334, "learning_rate": 0.0001, "loss": 7.5567, "loss/crossentropy": 2.169127979874611, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.1897792138159275, "step": 9500 }, { "epoch": 0.23775, "grad_norm": 34.25, "grad_norm_var": 31.978059895833333, "learning_rate": 0.0001, "loss": 7.5548, "loss/crossentropy": 2.0474624037742615, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.1897742820903659, "step": 9510 }, { "epoch": 0.238, "grad_norm": 28.875, "grad_norm_var": 33.4025390625, "learning_rate": 0.0001, "loss": 7.46, "loss/crossentropy": 2.167206625640392, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.1971887955442071, "step": 9520 }, { "epoch": 0.23825, "grad_norm": 38.0, "grad_norm_var": 10.677018229166666, "learning_rate": 0.0001, "loss": 7.4721, "loss/crossentropy": 2.160684567689896, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.2026047335937619, "step": 9530 }, { "epoch": 0.2385, "grad_norm": 33.25, "grad_norm_var": 6.915559895833334, "learning_rate": 0.0001, "loss": 7.4439, "loss/crossentropy": 2.101367045938969, "loss/hidden": 3.316796875, "loss/jsd": 0.0, "loss/logits": 0.18591827657073737, "step": 9540 }, { "epoch": 0.23875, "grad_norm": 29.75, "grad_norm_var": 4.285416666666666, "learning_rate": 0.0001, "loss": 7.6017, "loss/crossentropy": 2.144346782565117, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.22905603051185608, "step": 9550 }, { "epoch": 0.239, "grad_norm": 31.125, "grad_norm_var": 5.608333333333333, "learning_rate": 0.0001, "loss": 7.391, "loss/crossentropy": 2.2163674563169478, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.20480377711355685, "step": 9560 }, { "epoch": 0.23925, "grad_norm": 32.75, "grad_norm_var": 5.554166666666666, "learning_rate": 0.0001, "loss": 7.4754, "loss/crossentropy": 2.05806076079607, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.1894669458270073, "step": 9570 }, { "epoch": 0.2395, "grad_norm": 35.75, "grad_norm_var": 6.116666666666666, "learning_rate": 0.0001, "loss": 7.64, "loss/crossentropy": 2.2116071820259093, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.20237535443156957, "step": 9580 }, { "epoch": 0.23975, "grad_norm": 27.625, "grad_norm_var": 6.3166015625, "learning_rate": 0.0001, "loss": 7.4412, "loss/crossentropy": 2.0997579991817474, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.18707791212946176, "step": 9590 }, { "epoch": 0.24, "grad_norm": 31.125, "grad_norm_var": 4.391666666666667, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.1704364523291586, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.20182078052312136, "step": 9600 }, { "epoch": 0.24025, "grad_norm": 29.75, "grad_norm_var": 2.959375, "learning_rate": 0.0001, "loss": 7.5494, "loss/crossentropy": 2.1063621580600738, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.2080043438822031, "step": 9610 }, { "epoch": 0.2405, "grad_norm": 30.875, "grad_norm_var": 2.5872395833333335, "learning_rate": 0.0001, "loss": 7.4507, "loss/crossentropy": 2.1176367297768595, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.19296143669635057, "step": 9620 }, { "epoch": 0.24075, "grad_norm": 30.125, "grad_norm_var": 2.3082682291666665, "learning_rate": 0.0001, "loss": 7.5035, "loss/crossentropy": 2.216022199392319, "loss/hidden": 3.486328125, "loss/jsd": 0.0, "loss/logits": 0.2059534139931202, "step": 9630 }, { "epoch": 0.241, "grad_norm": 31.5, "grad_norm_var": 10.676041666666666, "learning_rate": 0.0001, "loss": 7.5155, "loss/crossentropy": 2.209742319583893, "loss/hidden": 3.3421875, "loss/jsd": 0.0, "loss/logits": 0.19856051169335842, "step": 9640 }, { "epoch": 0.24125, "grad_norm": 31.75, "grad_norm_var": 1.10390625, "learning_rate": 0.0001, "loss": 7.4983, "loss/crossentropy": 2.040982872247696, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.19508841075003147, "step": 9650 }, { "epoch": 0.2415, "grad_norm": 29.5, "grad_norm_var": 1.1697916666666666, "learning_rate": 0.0001, "loss": 7.4469, "loss/crossentropy": 2.1563887119293215, "loss/hidden": 3.2921875, "loss/jsd": 0.0, "loss/logits": 0.18813634980469943, "step": 9660 }, { "epoch": 0.24175, "grad_norm": 29.75, "grad_norm_var": 2.5768229166666665, "learning_rate": 0.0001, "loss": 7.5616, "loss/crossentropy": 2.056604099273682, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.19395184125751258, "step": 9670 }, { "epoch": 0.242, "grad_norm": 30.25, "grad_norm_var": 2.067643229166667, "learning_rate": 0.0001, "loss": 7.4244, "loss/crossentropy": 2.174482125043869, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.20453137308359146, "step": 9680 }, { "epoch": 0.24225, "grad_norm": 31.0, "grad_norm_var": 3.905989583333333, "learning_rate": 0.0001, "loss": 7.5003, "loss/crossentropy": 2.119745084643364, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.20908331871032715, "step": 9690 }, { "epoch": 0.2425, "grad_norm": 29.25, "grad_norm_var": 3.4785807291666666, "learning_rate": 0.0001, "loss": 7.3362, "loss/crossentropy": 2.122311297059059, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.18319036178290843, "step": 9700 }, { "epoch": 0.24275, "grad_norm": 29.875, "grad_norm_var": 1.9541666666666666, "learning_rate": 0.0001, "loss": 7.4396, "loss/crossentropy": 1.9478351414203643, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.20451803579926492, "step": 9710 }, { "epoch": 0.243, "grad_norm": 32.75, "grad_norm_var": 3.3997395833333335, "learning_rate": 0.0001, "loss": 7.4967, "loss/crossentropy": 2.0708641201257705, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.19053229298442603, "step": 9720 }, { "epoch": 0.24325, "grad_norm": 33.5, "grad_norm_var": 3.635872395833333, "learning_rate": 0.0001, "loss": 7.5016, "loss/crossentropy": 2.0913224294781685, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.1940496889874339, "step": 9730 }, { "epoch": 0.2435, "grad_norm": 30.625, "grad_norm_var": 3.3291015625, "learning_rate": 0.0001, "loss": 7.5077, "loss/crossentropy": 2.221079145371914, "loss/hidden": 3.315625, "loss/jsd": 0.0, "loss/logits": 0.1924398820847273, "step": 9740 }, { "epoch": 0.24375, "grad_norm": 31.5, "grad_norm_var": 1.56875, "learning_rate": 0.0001, "loss": 7.4463, "loss/crossentropy": 2.148006671667099, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.20756886284798384, "step": 9750 }, { "epoch": 0.244, "grad_norm": 30.25, "grad_norm_var": 8.2978515625, "learning_rate": 0.0001, "loss": 7.4544, "loss/crossentropy": 2.2184116363525392, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.19135836903005837, "step": 9760 }, { "epoch": 0.24425, "grad_norm": 29.75, "grad_norm_var": 4.690625, "learning_rate": 0.0001, "loss": 7.4095, "loss/crossentropy": 2.1005363285541536, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.19765366949141025, "step": 9770 }, { "epoch": 0.2445, "grad_norm": 30.5, "grad_norm_var": 1.5327473958333333, "learning_rate": 0.0001, "loss": 7.5002, "loss/crossentropy": 2.167180609703064, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.19258719477802516, "step": 9780 }, { "epoch": 0.24475, "grad_norm": 29.875, "grad_norm_var": 2.4697916666666666, "learning_rate": 0.0001, "loss": 7.3885, "loss/crossentropy": 1.9792610332369804, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.18384026363492012, "step": 9790 }, { "epoch": 0.245, "grad_norm": 33.75, "grad_norm_var": 2.5447265625, "learning_rate": 0.0001, "loss": 7.5547, "loss/crossentropy": 2.0596527442336083, "loss/hidden": 3.569921875, "loss/jsd": 0.0, "loss/logits": 0.20697965249419212, "step": 9800 }, { "epoch": 0.24525, "grad_norm": 32.75, "grad_norm_var": 5.042122395833333, "learning_rate": 0.0001, "loss": 7.4887, "loss/crossentropy": 2.132730546593666, "loss/hidden": 3.498828125, "loss/jsd": 0.0, "loss/logits": 0.19533955454826354, "step": 9810 }, { "epoch": 0.2455, "grad_norm": 32.25, "grad_norm_var": 5.31015625, "learning_rate": 0.0001, "loss": 7.5278, "loss/crossentropy": 2.151031643152237, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.19322178810834884, "step": 9820 }, { "epoch": 0.24575, "grad_norm": 30.75, "grad_norm_var": 1.6059895833333333, "learning_rate": 0.0001, "loss": 7.5534, "loss/crossentropy": 2.143332117795944, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.1897228866815567, "step": 9830 }, { "epoch": 0.246, "grad_norm": 31.75, "grad_norm_var": 3.4395182291666666, "learning_rate": 0.0001, "loss": 7.5657, "loss/crossentropy": 2.1097486332058906, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.20945656076073646, "step": 9840 }, { "epoch": 0.24625, "grad_norm": 32.75, "grad_norm_var": 2.9452473958333334, "learning_rate": 0.0001, "loss": 7.5013, "loss/crossentropy": 2.065919445455074, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.18999069584533573, "step": 9850 }, { "epoch": 0.2465, "grad_norm": 31.875, "grad_norm_var": 2.2322265625, "learning_rate": 0.0001, "loss": 7.5501, "loss/crossentropy": 2.183481493592262, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.21892244052141904, "step": 9860 }, { "epoch": 0.24675, "grad_norm": 32.75, "grad_norm_var": 2.4926432291666667, "learning_rate": 0.0001, "loss": 7.478, "loss/crossentropy": 2.1966081470251084, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.20018710754811764, "step": 9870 }, { "epoch": 0.247, "grad_norm": 31.0, "grad_norm_var": 2.2244140625, "learning_rate": 0.0001, "loss": 7.5127, "loss/crossentropy": 1.9914133831858636, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19236233234405517, "step": 9880 }, { "epoch": 0.24725, "grad_norm": 32.25, "grad_norm_var": 1.8900390625, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.1605743557214736, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.1983661765232682, "step": 9890 }, { "epoch": 0.2475, "grad_norm": 32.75, "grad_norm_var": 2.0666015625, "learning_rate": 0.0001, "loss": 7.5307, "loss/crossentropy": 2.0920628547668456, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19224398005753757, "step": 9900 }, { "epoch": 0.24775, "grad_norm": 29.75, "grad_norm_var": 28.910416666666666, "learning_rate": 0.0001, "loss": 7.4845, "loss/crossentropy": 2.13549183011055, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.2024377616122365, "step": 9910 }, { "epoch": 0.248, "grad_norm": 33.5, "grad_norm_var": 3.2122395833333335, "learning_rate": 0.0001, "loss": 7.5567, "loss/crossentropy": 2.077779620885849, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.1914847193285823, "step": 9920 }, { "epoch": 0.24825, "grad_norm": 32.5, "grad_norm_var": 2.3416015625, "learning_rate": 0.0001, "loss": 7.6034, "loss/crossentropy": 2.24170383810997, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.21483709737658502, "step": 9930 }, { "epoch": 0.2485, "grad_norm": 29.25, "grad_norm_var": 1.9525390625, "learning_rate": 0.0001, "loss": 7.5277, "loss/crossentropy": 2.0574432730674745, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19156391881406307, "step": 9940 }, { "epoch": 0.24875, "grad_norm": 29.375, "grad_norm_var": 1.6181640625, "learning_rate": 0.0001, "loss": 7.4432, "loss/crossentropy": 2.231716367602348, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.19992320109158754, "step": 9950 }, { "epoch": 0.249, "grad_norm": 30.625, "grad_norm_var": 2.496809895833333, "learning_rate": 0.0001, "loss": 7.5233, "loss/crossentropy": 2.2805121034383773, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.21222201287746428, "step": 9960 }, { "epoch": 0.24925, "grad_norm": 30.25, "grad_norm_var": 3.6143229166666666, "learning_rate": 0.0001, "loss": 7.5022, "loss/crossentropy": 2.1643686085939406, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.2017618851736188, "step": 9970 }, { "epoch": 0.2495, "grad_norm": 33.25, "grad_norm_var": 2.9947265625, "learning_rate": 0.0001, "loss": 7.5351, "loss/crossentropy": 2.052297804504633, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19462402295321227, "step": 9980 }, { "epoch": 0.24975, "grad_norm": 28.625, "grad_norm_var": 1.8958333333333333, "learning_rate": 0.0001, "loss": 7.4501, "loss/crossentropy": 2.1721100926399233, "loss/hidden": 3.3234375, "loss/jsd": 0.0, "loss/logits": 0.20133505929261447, "step": 9990 }, { "epoch": 0.25, "grad_norm": 28.0, "grad_norm_var": 2.2645833333333334, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 2.0236786626279355, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.18019386120140551, "step": 10000 }, { "epoch": 0.25025, "grad_norm": 28.875, "grad_norm_var": 2.1801432291666667, "learning_rate": 0.0001, "loss": 7.6, "loss/crossentropy": 2.179265484213829, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.21681770533323289, "step": 10010 }, { "epoch": 0.2505, "grad_norm": 32.75, "grad_norm_var": 2.2884765625, "learning_rate": 0.0001, "loss": 7.4947, "loss/crossentropy": 2.238062459230423, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.1930771939456463, "step": 10020 }, { "epoch": 0.25075, "grad_norm": 29.25, "grad_norm_var": 1.2957682291666666, "learning_rate": 0.0001, "loss": 7.4869, "loss/crossentropy": 2.016241580247879, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.18652069568634033, "step": 10030 }, { "epoch": 0.251, "grad_norm": 29.125, "grad_norm_var": 1.47265625, "learning_rate": 0.0001, "loss": 7.4921, "loss/crossentropy": 2.1150885075330734, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.19066351260989906, "step": 10040 }, { "epoch": 0.25125, "grad_norm": 30.25, "grad_norm_var": 1.8410807291666667, "learning_rate": 0.0001, "loss": 7.4968, "loss/crossentropy": 2.028704561293125, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.19649554397910834, "step": 10050 }, { "epoch": 0.2515, "grad_norm": 33.5, "grad_norm_var": 4.282747395833334, "learning_rate": 0.0001, "loss": 7.5077, "loss/crossentropy": 2.1834514826536178, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.19737422596663237, "step": 10060 }, { "epoch": 0.25175, "grad_norm": 30.875, "grad_norm_var": 4.7541015625, "learning_rate": 0.0001, "loss": 7.5032, "loss/crossentropy": 2.2020941093564033, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.20502295531332493, "step": 10070 }, { "epoch": 0.252, "grad_norm": 34.25, "grad_norm_var": 5.939322916666667, "learning_rate": 0.0001, "loss": 7.4971, "loss/crossentropy": 2.1619419783353804, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.19227985218167304, "step": 10080 }, { "epoch": 0.25225, "grad_norm": 30.375, "grad_norm_var": 3.762434895833333, "learning_rate": 0.0001, "loss": 7.5093, "loss/crossentropy": 2.045739908516407, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.1985299138352275, "step": 10090 }, { "epoch": 0.2525, "grad_norm": 30.25, "grad_norm_var": 2.0645182291666666, "learning_rate": 0.0001, "loss": 7.5485, "loss/crossentropy": 2.1392029881477357, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.22696843333542346, "step": 10100 }, { "epoch": 0.25275, "grad_norm": 41.25, "grad_norm_var": 8.16640625, "learning_rate": 0.0001, "loss": 7.5099, "loss/crossentropy": 2.112062802910805, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.20055703315883874, "step": 10110 }, { "epoch": 0.253, "grad_norm": 32.75, "grad_norm_var": 8.356705729166666, "learning_rate": 0.0001, "loss": 7.5021, "loss/crossentropy": 2.081932318210602, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.1885841501876712, "step": 10120 }, { "epoch": 0.25325, "grad_norm": 32.75, "grad_norm_var": 2.154166666666667, "learning_rate": 0.0001, "loss": 7.5364, "loss/crossentropy": 2.1130091354250906, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.19095001947134732, "step": 10130 }, { "epoch": 0.2535, "grad_norm": 30.625, "grad_norm_var": 1.7160807291666667, "learning_rate": 0.0001, "loss": 7.5311, "loss/crossentropy": 2.0348104894161225, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.20060632489621638, "step": 10140 }, { "epoch": 0.25375, "grad_norm": 32.0, "grad_norm_var": 1.8551432291666667, "learning_rate": 0.0001, "loss": 7.5424, "loss/crossentropy": 2.1739980176091196, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.19488737732172012, "step": 10150 }, { "epoch": 0.254, "grad_norm": 35.0, "grad_norm_var": 3.4330729166666667, "learning_rate": 0.0001, "loss": 7.5232, "loss/crossentropy": 2.1551436215639113, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.20004159081727266, "step": 10160 }, { "epoch": 0.25425, "grad_norm": 32.25, "grad_norm_var": 5.01015625, "learning_rate": 0.0001, "loss": 7.5319, "loss/crossentropy": 2.100159841030836, "loss/hidden": 3.535546875, "loss/jsd": 0.0, "loss/logits": 0.19967089565470814, "step": 10170 }, { "epoch": 0.2545, "grad_norm": 31.125, "grad_norm_var": 1.896875, "learning_rate": 0.0001, "loss": 7.4974, "loss/crossentropy": 2.1067795649170877, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.1976080872118473, "step": 10180 }, { "epoch": 0.25475, "grad_norm": 32.5, "grad_norm_var": 1.3619140625, "learning_rate": 0.0001, "loss": 7.5954, "loss/crossentropy": 2.0253565564751623, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.19032611772418023, "step": 10190 }, { "epoch": 0.255, "grad_norm": 28.25, "grad_norm_var": 1.9875, "learning_rate": 0.0001, "loss": 7.4835, "loss/crossentropy": 2.0224084883928297, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.18663749676197766, "step": 10200 }, { "epoch": 0.25525, "grad_norm": 30.0, "grad_norm_var": 1.7973307291666667, "learning_rate": 0.0001, "loss": 7.5863, "loss/crossentropy": 2.126485800743103, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.2074130540713668, "step": 10210 }, { "epoch": 0.2555, "grad_norm": 29.375, "grad_norm_var": 2.6093098958333334, "learning_rate": 0.0001, "loss": 7.6372, "loss/crossentropy": 2.0780230551958083, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.1949790535494685, "step": 10220 }, { "epoch": 0.25575, "grad_norm": 29.75, "grad_norm_var": 1.7247395833333334, "learning_rate": 0.0001, "loss": 7.4954, "loss/crossentropy": 2.0865010380744935, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.19579052459448576, "step": 10230 }, { "epoch": 0.256, "grad_norm": 31.875, "grad_norm_var": 3.9025390625, "learning_rate": 0.0001, "loss": 7.5297, "loss/crossentropy": 2.06137825101614, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.20053859017789363, "step": 10240 }, { "epoch": 0.25625, "grad_norm": 30.125, "grad_norm_var": 5.632291666666666, "learning_rate": 0.0001, "loss": 7.5228, "loss/crossentropy": 2.102967083454132, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.1902999484911561, "step": 10250 }, { "epoch": 0.2565, "grad_norm": 31.0, "grad_norm_var": 1.2010416666666666, "learning_rate": 0.0001, "loss": 7.4742, "loss/crossentropy": 2.0118507161736487, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.20069226585328578, "step": 10260 }, { "epoch": 0.25675, "grad_norm": 32.5, "grad_norm_var": 1.0728515625, "learning_rate": 0.0001, "loss": 7.5045, "loss/crossentropy": 2.161550810933113, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.1856826152652502, "step": 10270 }, { "epoch": 0.257, "grad_norm": 29.875, "grad_norm_var": 2.037239583333333, "learning_rate": 0.0001, "loss": 7.4474, "loss/crossentropy": 2.209408235549927, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.19467408265918493, "step": 10280 }, { "epoch": 0.25725, "grad_norm": 30.25, "grad_norm_var": 1.925, "learning_rate": 0.0001, "loss": 7.5425, "loss/crossentropy": 2.1496171414852143, "loss/hidden": 3.2546875, "loss/jsd": 0.0, "loss/logits": 0.18077059462666512, "step": 10290 }, { "epoch": 0.2575, "grad_norm": 30.25, "grad_norm_var": 2.8108723958333335, "learning_rate": 0.0001, "loss": 7.4462, "loss/crossentropy": 2.2561030745506288, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.20665224064141513, "step": 10300 }, { "epoch": 0.25775, "grad_norm": 28.375, "grad_norm_var": 2.11015625, "learning_rate": 0.0001, "loss": 7.3919, "loss/crossentropy": 2.1672320902347564, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.19321133568882942, "step": 10310 }, { "epoch": 0.258, "grad_norm": 28.5, "grad_norm_var": 2.8114583333333334, "learning_rate": 0.0001, "loss": 7.4607, "loss/crossentropy": 2.1774761766195296, "loss/hidden": 3.30859375, "loss/jsd": 0.0, "loss/logits": 0.1832532085478306, "step": 10320 }, { "epoch": 0.25825, "grad_norm": 30.125, "grad_norm_var": 3.006705729166667, "learning_rate": 0.0001, "loss": 7.5093, "loss/crossentropy": 2.08485603928566, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.20309378914535045, "step": 10330 }, { "epoch": 0.2585, "grad_norm": 32.25, "grad_norm_var": 2.8994140625, "learning_rate": 0.0001, "loss": 7.5961, "loss/crossentropy": 2.1903068631887437, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.19475248865783215, "step": 10340 }, { "epoch": 0.25875, "grad_norm": 31.625, "grad_norm_var": 3.6723307291666667, "learning_rate": 0.0001, "loss": 7.5157, "loss/crossentropy": 2.1206096917390824, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19404957443475723, "step": 10350 }, { "epoch": 0.259, "grad_norm": 35.75, "grad_norm_var": 7.144791666666666, "learning_rate": 0.0001, "loss": 7.5988, "loss/crossentropy": 2.1614494152367114, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.19104234427213668, "step": 10360 }, { "epoch": 0.25925, "grad_norm": 29.875, "grad_norm_var": 5.8650390625, "learning_rate": 0.0001, "loss": 7.5585, "loss/crossentropy": 2.2361265033483506, "loss/hidden": 3.293359375, "loss/jsd": 0.0, "loss/logits": 0.18775968980044128, "step": 10370 }, { "epoch": 0.2595, "grad_norm": 31.25, "grad_norm_var": 1.8166666666666667, "learning_rate": 0.0001, "loss": 7.5804, "loss/crossentropy": 2.086420488357544, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.19417675212025642, "step": 10380 }, { "epoch": 0.25975, "grad_norm": 30.375, "grad_norm_var": 2.603125, "learning_rate": 0.0001, "loss": 7.4606, "loss/crossentropy": 2.0714549839496614, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.21479819286614657, "step": 10390 }, { "epoch": 0.26, "grad_norm": 31.375, "grad_norm_var": 2.0738932291666665, "learning_rate": 0.0001, "loss": 7.6124, "loss/crossentropy": 2.0951902747154234, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.19614790324121714, "step": 10400 }, { "epoch": 0.26025, "grad_norm": 33.75, "grad_norm_var": 1.9416015625, "learning_rate": 0.0001, "loss": 7.535, "loss/crossentropy": 2.1505746722221373, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.20192769896239043, "step": 10410 }, { "epoch": 0.2605, "grad_norm": 34.0, "grad_norm_var": 3.5260416666666665, "learning_rate": 0.0001, "loss": 7.6095, "loss/crossentropy": 2.249527391791344, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.21959280855953695, "step": 10420 }, { "epoch": 0.26075, "grad_norm": 33.75, "grad_norm_var": 2.6025390625, "learning_rate": 0.0001, "loss": 7.5459, "loss/crossentropy": 2.190070366859436, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.18353844359517096, "step": 10430 }, { "epoch": 0.261, "grad_norm": 33.5, "grad_norm_var": 4.64375, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.148374534398317, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.19743462903425096, "step": 10440 }, { "epoch": 0.26125, "grad_norm": 31.125, "grad_norm_var": 4.2, "learning_rate": 0.0001, "loss": 7.5689, "loss/crossentropy": 2.2100040167570114, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.22015094980597497, "step": 10450 }, { "epoch": 0.2615, "grad_norm": 54.5, "grad_norm_var": 40.2072265625, "learning_rate": 0.0001, "loss": 7.5953, "loss/crossentropy": 2.0877301573753355, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.2032545803114772, "step": 10460 }, { "epoch": 0.26175, "grad_norm": 29.375, "grad_norm_var": 67.60807291666667, "learning_rate": 0.0001, "loss": 7.55, "loss/crossentropy": 2.1474430203437804, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.1959912359714508, "step": 10470 }, { "epoch": 0.262, "grad_norm": 31.625, "grad_norm_var": 3.184309895833333, "learning_rate": 0.0001, "loss": 7.5324, "loss/crossentropy": 2.2022393375635145, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.18873728141188623, "step": 10480 }, { "epoch": 0.26225, "grad_norm": 29.125, "grad_norm_var": 3.621809895833333, "learning_rate": 0.0001, "loss": 7.5054, "loss/crossentropy": 2.038270506262779, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.18390222508460283, "step": 10490 }, { "epoch": 0.2625, "grad_norm": 39.5, "grad_norm_var": 6.564583333333333, "learning_rate": 0.0001, "loss": 7.4524, "loss/crossentropy": 2.112567237019539, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.18860076945275067, "step": 10500 }, { "epoch": 0.26275, "grad_norm": 28.5, "grad_norm_var": 8.082747395833334, "learning_rate": 0.0001, "loss": 7.4385, "loss/crossentropy": 2.172071099281311, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.1897763295099139, "step": 10510 }, { "epoch": 0.263, "grad_norm": 33.5, "grad_norm_var": 2.86015625, "learning_rate": 0.0001, "loss": 7.5004, "loss/crossentropy": 2.2221229881048203, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.19093570075929164, "step": 10520 }, { "epoch": 0.26325, "grad_norm": 31.0, "grad_norm_var": 1.7697916666666667, "learning_rate": 0.0001, "loss": 7.5274, "loss/crossentropy": 2.121767780184746, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.20014208517968654, "step": 10530 }, { "epoch": 0.2635, "grad_norm": 30.125, "grad_norm_var": 1.1059895833333333, "learning_rate": 0.0001, "loss": 7.6305, "loss/crossentropy": 2.0758725464344026, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.21838635317981242, "step": 10540 }, { "epoch": 0.26375, "grad_norm": 31.25, "grad_norm_var": 3.246875, "learning_rate": 0.0001, "loss": 7.5622, "loss/crossentropy": 2.1207674235105514, "loss/hidden": 3.338671875, "loss/jsd": 0.0, "loss/logits": 0.19129925426095723, "step": 10550 }, { "epoch": 0.264, "grad_norm": 30.5, "grad_norm_var": 3.0947265625, "learning_rate": 0.0001, "loss": 7.4523, "loss/crossentropy": 2.0643288612365724, "loss/hidden": 3.326953125, "loss/jsd": 0.0, "loss/logits": 0.1818947261199355, "step": 10560 }, { "epoch": 0.26425, "grad_norm": 36.0, "grad_norm_var": 4.464322916666666, "learning_rate": 0.0001, "loss": 7.467, "loss/crossentropy": 2.168118804693222, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.19394876565784216, "step": 10570 }, { "epoch": 0.2645, "grad_norm": 29.875, "grad_norm_var": 5.934830729166666, "learning_rate": 0.0001, "loss": 7.5136, "loss/crossentropy": 2.1245740592479705, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.18961659409105777, "step": 10580 }, { "epoch": 0.26475, "grad_norm": 31.75, "grad_norm_var": 5.565559895833333, "learning_rate": 0.0001, "loss": 7.4291, "loss/crossentropy": 2.166120108962059, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.18523282706737518, "step": 10590 }, { "epoch": 0.265, "grad_norm": 32.0, "grad_norm_var": 2.995833333333333, "learning_rate": 0.0001, "loss": 7.5069, "loss/crossentropy": 2.200042313337326, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.1952303361147642, "step": 10600 }, { "epoch": 0.26525, "grad_norm": 30.875, "grad_norm_var": 2.0082682291666667, "learning_rate": 0.0001, "loss": 7.4612, "loss/crossentropy": 2.0848385438323023, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.19754947163164616, "step": 10610 }, { "epoch": 0.2655, "grad_norm": 30.5, "grad_norm_var": 1.125, "learning_rate": 0.0001, "loss": 7.507, "loss/crossentropy": 2.0842891573905944, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.18975816927850248, "step": 10620 }, { "epoch": 0.26575, "grad_norm": 30.375, "grad_norm_var": 1.1405598958333334, "learning_rate": 0.0001, "loss": 7.5813, "loss/crossentropy": 2.133824145793915, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.20222157016396522, "step": 10630 }, { "epoch": 0.266, "grad_norm": 30.0, "grad_norm_var": 18.200455729166666, "learning_rate": 0.0001, "loss": 7.5877, "loss/crossentropy": 2.1643952041864396, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.19804936908185483, "step": 10640 }, { "epoch": 0.26625, "grad_norm": 31.0, "grad_norm_var": 399.8686848958333, "learning_rate": 0.0001, "loss": 7.6453, "loss/crossentropy": 2.049100285768509, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.20171927530318498, "step": 10650 }, { "epoch": 0.2665, "grad_norm": 32.0, "grad_norm_var": 7.462955729166667, "learning_rate": 0.0001, "loss": 7.5871, "loss/crossentropy": 2.086188179254532, "loss/hidden": 3.53359375, "loss/jsd": 0.0, "loss/logits": 0.2075439091771841, "step": 10660 }, { "epoch": 0.26675, "grad_norm": 30.875, "grad_norm_var": 3.7143229166666667, "learning_rate": 0.0001, "loss": 7.5477, "loss/crossentropy": 2.1709790498018267, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18898459300398826, "step": 10670 }, { "epoch": 0.267, "grad_norm": 35.5, "grad_norm_var": 8.525, "learning_rate": 0.0001, "loss": 7.5583, "loss/crossentropy": 2.191944640874863, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.18916468657553195, "step": 10680 }, { "epoch": 0.26725, "grad_norm": 31.25, "grad_norm_var": 9.723372395833334, "learning_rate": 0.0001, "loss": 7.6552, "loss/crossentropy": 2.153936019539833, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.21916016209870576, "step": 10690 }, { "epoch": 0.2675, "grad_norm": 30.75, "grad_norm_var": 2.5327473958333333, "learning_rate": 0.0001, "loss": 7.4995, "loss/crossentropy": 2.144308550655842, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.18704238552600144, "step": 10700 }, { "epoch": 0.26775, "grad_norm": 29.375, "grad_norm_var": 14.4875, "learning_rate": 0.0001, "loss": 7.6781, "loss/crossentropy": 2.1939100988209246, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.19588800482451915, "step": 10710 }, { "epoch": 0.268, "grad_norm": 32.5, "grad_norm_var": 21.031705729166667, "learning_rate": 0.0001, "loss": 7.5024, "loss/crossentropy": 2.0107320092618464, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.1809020394459367, "step": 10720 }, { "epoch": 0.26825, "grad_norm": 30.125, "grad_norm_var": 12.817708333333334, "learning_rate": 0.0001, "loss": 7.5567, "loss/crossentropy": 2.21520319879055, "loss/hidden": 3.3, "loss/jsd": 0.0, "loss/logits": 0.18951662816107273, "step": 10730 }, { "epoch": 0.2685, "grad_norm": 30.625, "grad_norm_var": 31.861393229166666, "learning_rate": 0.0001, "loss": 7.5161, "loss/crossentropy": 2.1877179771661757, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.1967765673995018, "step": 10740 }, { "epoch": 0.26875, "grad_norm": 34.25, "grad_norm_var": 30.971875, "learning_rate": 0.0001, "loss": 7.549, "loss/crossentropy": 2.1257426261901857, "loss/hidden": 3.3078125, "loss/jsd": 0.0, "loss/logits": 0.18805306870490313, "step": 10750 }, { "epoch": 0.269, "grad_norm": 30.5, "grad_norm_var": 4.608072916666667, "learning_rate": 0.0001, "loss": 7.5008, "loss/crossentropy": 2.0466638416051866, "loss/hidden": 3.316796875, "loss/jsd": 0.0, "loss/logits": 0.18291978891938926, "step": 10760 }, { "epoch": 0.26925, "grad_norm": 32.5, "grad_norm_var": 15.839518229166666, "learning_rate": 0.0001, "loss": 7.6738, "loss/crossentropy": 2.177478575706482, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.2091924037784338, "step": 10770 }, { "epoch": 0.2695, "grad_norm": 30.625, "grad_norm_var": 4.19765625, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.054281398653984, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.19896940067410468, "step": 10780 }, { "epoch": 0.26975, "grad_norm": 28.0, "grad_norm_var": 4.278580729166666, "learning_rate": 0.0001, "loss": 7.5147, "loss/crossentropy": 2.2530731678009035, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.1928473612293601, "step": 10790 }, { "epoch": 0.27, "grad_norm": 31.625, "grad_norm_var": 2.033072916666667, "learning_rate": 0.0001, "loss": 7.5229, "loss/crossentropy": 2.218820883333683, "loss/hidden": 3.28515625, "loss/jsd": 0.0, "loss/logits": 0.18697738237679004, "step": 10800 }, { "epoch": 0.27025, "grad_norm": 30.75, "grad_norm_var": 2.622916666666667, "learning_rate": 0.0001, "loss": 7.6507, "loss/crossentropy": 2.100504669547081, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19196046218276025, "step": 10810 }, { "epoch": 0.2705, "grad_norm": 33.0, "grad_norm_var": 11.7212890625, "learning_rate": 0.0001, "loss": 7.6045, "loss/crossentropy": 2.0671754188835623, "loss/hidden": 3.466796875, "loss/jsd": 0.0, "loss/logits": 0.195990754943341, "step": 10820 }, { "epoch": 0.27075, "grad_norm": 29.875, "grad_norm_var": 11.322916666666666, "learning_rate": 0.0001, "loss": 7.5369, "loss/crossentropy": 2.0837562650442125, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.18965425249189138, "step": 10830 }, { "epoch": 0.271, "grad_norm": 31.0, "grad_norm_var": 4.9462890625, "learning_rate": 0.0001, "loss": 7.6529, "loss/crossentropy": 2.1033942848443985, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.21247205398976804, "step": 10840 }, { "epoch": 0.27125, "grad_norm": 32.5, "grad_norm_var": 1.7796223958333333, "learning_rate": 0.0001, "loss": 7.5122, "loss/crossentropy": 2.2221016943454743, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.20388032998889685, "step": 10850 }, { "epoch": 0.2715, "grad_norm": 33.5, "grad_norm_var": 1.4080729166666666, "learning_rate": 0.0001, "loss": 7.6508, "loss/crossentropy": 2.129495047032833, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.22404770106077193, "step": 10860 }, { "epoch": 0.27175, "grad_norm": 30.75, "grad_norm_var": 2.537239583333333, "learning_rate": 0.0001, "loss": 7.4812, "loss/crossentropy": 2.085365228354931, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.21701993197202682, "step": 10870 }, { "epoch": 0.272, "grad_norm": 30.25, "grad_norm_var": 78.16640625, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.0185007080435753, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.1912636984139681, "step": 10880 }, { "epoch": 0.27225, "grad_norm": 33.5, "grad_norm_var": 2.4389973958333333, "learning_rate": 0.0001, "loss": 7.4235, "loss/crossentropy": 2.0273312851786613, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.1935686475597322, "step": 10890 }, { "epoch": 0.2725, "grad_norm": 31.0, "grad_norm_var": 3.676041666666667, "learning_rate": 0.0001, "loss": 7.56, "loss/crossentropy": 2.1340904593467713, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.21148081459105014, "step": 10900 }, { "epoch": 0.27275, "grad_norm": 30.25, "grad_norm_var": 2.169205729166667, "learning_rate": 0.0001, "loss": 7.4542, "loss/crossentropy": 2.1168949902057648, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.19148083217442036, "step": 10910 }, { "epoch": 0.273, "grad_norm": 30.375, "grad_norm_var": 1.99765625, "learning_rate": 0.0001, "loss": 7.5497, "loss/crossentropy": 2.1183649614453315, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.1918137041851878, "step": 10920 }, { "epoch": 0.27325, "grad_norm": 33.0, "grad_norm_var": 1.7768229166666667, "learning_rate": 0.0001, "loss": 7.6078, "loss/crossentropy": 2.0465846031904222, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.21847008895128966, "step": 10930 }, { "epoch": 0.2735, "grad_norm": 31.625, "grad_norm_var": 4.513997395833333, "learning_rate": 0.0001, "loss": 7.421, "loss/crossentropy": 2.1968978524208067, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.2054054219275713, "step": 10940 }, { "epoch": 0.27375, "grad_norm": 29.25, "grad_norm_var": 4.718489583333334, "learning_rate": 0.0001, "loss": 7.5254, "loss/crossentropy": 2.284649354219437, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.1995658937841654, "step": 10950 }, { "epoch": 0.274, "grad_norm": 30.625, "grad_norm_var": 2.67890625, "learning_rate": 0.0001, "loss": 7.5708, "loss/crossentropy": 2.141575986146927, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.19440446924418212, "step": 10960 }, { "epoch": 0.27425, "grad_norm": 42.5, "grad_norm_var": 4.468696696373851e+18, "learning_rate": 0.0001, "loss": 7.5234, "loss/crossentropy": 2.131410190463066, "loss/hidden": 3.311328125, "loss/jsd": 0.0, "loss/logits": 0.1760883742943406, "step": 10970 }, { "epoch": 0.2745, "grad_norm": 29.5, "grad_norm_var": 4.468696695713248e+18, "learning_rate": 0.0001, "loss": 7.5372, "loss/crossentropy": 2.1476279973983763, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.184328780695796, "step": 10980 }, { "epoch": 0.27475, "grad_norm": 31.0, "grad_norm_var": 8.8931640625, "learning_rate": 0.0001, "loss": 7.5383, "loss/crossentropy": 2.1352312207221984, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.19372365307062864, "step": 10990 }, { "epoch": 0.275, "grad_norm": 29.375, "grad_norm_var": 7.148372395833333, "learning_rate": 0.0001, "loss": 7.4628, "loss/crossentropy": 2.0489871561527253, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.18995484188199044, "step": 11000 }, { "epoch": 0.27525, "grad_norm": 32.5, "grad_norm_var": 2.6264973958333333, "learning_rate": 0.0001, "loss": 7.4443, "loss/crossentropy": 2.1590236008167265, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.19273458905518054, "step": 11010 }, { "epoch": 0.2755, "grad_norm": 30.0, "grad_norm_var": 3.8848307291666666, "learning_rate": 0.0001, "loss": 7.4967, "loss/crossentropy": 2.1544351994991304, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.21255392767488956, "step": 11020 }, { "epoch": 0.27575, "grad_norm": 30.25, "grad_norm_var": 2.5893229166666667, "learning_rate": 0.0001, "loss": 7.5015, "loss/crossentropy": 2.018085864186287, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.19312111511826516, "step": 11030 }, { "epoch": 0.276, "grad_norm": 30.875, "grad_norm_var": 2.8645833333333335, "learning_rate": 0.0001, "loss": 7.4226, "loss/crossentropy": 2.105189894139767, "loss/hidden": 3.291796875, "loss/jsd": 0.0, "loss/logits": 0.17890707962214947, "step": 11040 }, { "epoch": 0.27625, "grad_norm": 36.25, "grad_norm_var": 64.57493489583334, "learning_rate": 0.0001, "loss": 7.6245, "loss/crossentropy": 2.0516477465629577, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.2067173158749938, "step": 11050 }, { "epoch": 0.2765, "grad_norm": 31.5, "grad_norm_var": 21.5666015625, "learning_rate": 0.0001, "loss": 7.5657, "loss/crossentropy": 2.1685635060071946, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.20545070786029102, "step": 11060 }, { "epoch": 0.27675, "grad_norm": 31.0, "grad_norm_var": 18.304622395833334, "learning_rate": 0.0001, "loss": 7.5266, "loss/crossentropy": 2.168036562204361, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.20288394428789616, "step": 11070 }, { "epoch": 0.277, "grad_norm": 30.125, "grad_norm_var": 4.6587890625, "learning_rate": 0.0001, "loss": 7.637, "loss/crossentropy": 2.175144538283348, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.2135882146656513, "step": 11080 }, { "epoch": 0.27725, "grad_norm": 31.75, "grad_norm_var": 4.45625, "learning_rate": 0.0001, "loss": 7.5509, "loss/crossentropy": 2.025794816017151, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.19755149651318787, "step": 11090 }, { "epoch": 0.2775, "grad_norm": 27.5, "grad_norm_var": 3.9025390625, "learning_rate": 0.0001, "loss": 7.5019, "loss/crossentropy": 2.1192509084939957, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.1885302910581231, "step": 11100 }, { "epoch": 0.27775, "grad_norm": 31.875, "grad_norm_var": 3.4514973958333335, "learning_rate": 0.0001, "loss": 7.508, "loss/crossentropy": 2.0708123981952666, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.21488154623657466, "step": 11110 }, { "epoch": 0.278, "grad_norm": 29.375, "grad_norm_var": 6.984375, "learning_rate": 0.0001, "loss": 7.5729, "loss/crossentropy": 2.196261405944824, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.19242637380957603, "step": 11120 }, { "epoch": 0.27825, "grad_norm": 32.0, "grad_norm_var": 13.114322916666667, "learning_rate": 0.0001, "loss": 7.501, "loss/crossentropy": 2.0530719697475432, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.19876221660524607, "step": 11130 }, { "epoch": 0.2785, "grad_norm": 29.625, "grad_norm_var": 13.57890625, "learning_rate": 0.0001, "loss": 7.5154, "loss/crossentropy": 2.1174173802137375, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.20414691772311927, "step": 11140 }, { "epoch": 0.27875, "grad_norm": 29.25, "grad_norm_var": 44.18932291666667, "learning_rate": 0.0001, "loss": 7.5442, "loss/crossentropy": 2.2288032442331316, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.2252635546028614, "step": 11150 }, { "epoch": 0.279, "grad_norm": 30.75, "grad_norm_var": 45.85182291666667, "learning_rate": 0.0001, "loss": 7.5199, "loss/crossentropy": 2.2578700333833694, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.21590106561779976, "step": 11160 }, { "epoch": 0.27925, "grad_norm": 29.0, "grad_norm_var": 65.8634765625, "learning_rate": 0.0001, "loss": 7.4833, "loss/crossentropy": 2.15876952111721, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.21022603176534177, "step": 11170 }, { "epoch": 0.2795, "grad_norm": 33.5, "grad_norm_var": 41.5337890625, "learning_rate": 0.0001, "loss": 7.5426, "loss/crossentropy": 2.2054150819778444, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.1938659494742751, "step": 11180 }, { "epoch": 0.27975, "grad_norm": 29.75, "grad_norm_var": 37.57265625, "learning_rate": 0.0001, "loss": 7.5083, "loss/crossentropy": 2.191286247968674, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.19200536869466306, "step": 11190 }, { "epoch": 0.28, "grad_norm": 30.875, "grad_norm_var": 36.5697265625, "learning_rate": 0.0001, "loss": 7.3998, "loss/crossentropy": 2.240337911248207, "loss/hidden": 3.28671875, "loss/jsd": 0.0, "loss/logits": 0.19107160083949565, "step": 11200 }, { "epoch": 0.28025, "grad_norm": 42.25, "grad_norm_var": 37.92701822916667, "learning_rate": 0.0001, "loss": 7.4565, "loss/crossentropy": 2.21739219725132, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.2041622471064329, "step": 11210 }, { "epoch": 0.2805, "grad_norm": 30.0, "grad_norm_var": 14.87890625, "learning_rate": 0.0001, "loss": 7.4073, "loss/crossentropy": 2.1360912755131722, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.19229222014546393, "step": 11220 }, { "epoch": 0.28075, "grad_norm": 36.5, "grad_norm_var": 21.35625, "learning_rate": 0.0001, "loss": 7.445, "loss/crossentropy": 2.2468197375535963, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.20960316248238087, "step": 11230 }, { "epoch": 0.281, "grad_norm": 30.5, "grad_norm_var": 24.69765625, "learning_rate": 0.0001, "loss": 7.4355, "loss/crossentropy": 2.2591811805963515, "loss/hidden": 3.298828125, "loss/jsd": 0.0, "loss/logits": 0.18697103951126337, "step": 11240 }, { "epoch": 0.28125, "grad_norm": 29.75, "grad_norm_var": 10.726822916666666, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 2.2241678714752195, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.19806783739477396, "step": 11250 }, { "epoch": 0.2815, "grad_norm": 28.25, "grad_norm_var": 11.403059895833334, "learning_rate": 0.0001, "loss": 7.4736, "loss/crossentropy": 2.28880070745945, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.19563225321471692, "step": 11260 }, { "epoch": 0.28175, "grad_norm": 33.0, "grad_norm_var": 16.953580729166667, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.1222257822752, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.20468491613864898, "step": 11270 }, { "epoch": 0.282, "grad_norm": 32.0, "grad_norm_var": 16.693684895833332, "learning_rate": 0.0001, "loss": 7.4601, "loss/crossentropy": 2.0914298713207247, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.20181928034871816, "step": 11280 }, { "epoch": 0.28225, "grad_norm": 32.5, "grad_norm_var": 14.206705729166666, "learning_rate": 0.0001, "loss": 7.4363, "loss/crossentropy": 2.058793123066425, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.1982463788241148, "step": 11290 }, { "epoch": 0.2825, "grad_norm": 31.75, "grad_norm_var": 8.758333333333333, "learning_rate": 0.0001, "loss": 7.4243, "loss/crossentropy": 2.0935801014304163, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.1959468424320221, "step": 11300 }, { "epoch": 0.28275, "grad_norm": 28.125, "grad_norm_var": 9.334309895833334, "learning_rate": 0.0001, "loss": 7.4081, "loss/crossentropy": 2.156096602976322, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.1856603730469942, "step": 11310 }, { "epoch": 0.283, "grad_norm": 30.375, "grad_norm_var": 7.953059895833333, "learning_rate": 0.0001, "loss": 7.4932, "loss/crossentropy": 2.0780559942126273, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.20045752841979264, "step": 11320 }, { "epoch": 0.28325, "grad_norm": 38.25, "grad_norm_var": 9.335872395833333, "learning_rate": 0.0001, "loss": 7.4376, "loss/crossentropy": 2.203425186872482, "loss/hidden": 3.28515625, "loss/jsd": 0.0, "loss/logits": 0.1887454966083169, "step": 11330 }, { "epoch": 0.2835, "grad_norm": 31.25, "grad_norm_var": 7.339322916666666, "learning_rate": 0.0001, "loss": 7.5769, "loss/crossentropy": 2.139732484519482, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.200992492120713, "step": 11340 }, { "epoch": 0.28375, "grad_norm": 29.375, "grad_norm_var": 8.889583333333333, "learning_rate": 0.0001, "loss": 7.4588, "loss/crossentropy": 2.1970251157879828, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.20897436924278737, "step": 11350 }, { "epoch": 0.284, "grad_norm": 29.625, "grad_norm_var": 10.83515625, "learning_rate": 0.0001, "loss": 7.556, "loss/crossentropy": 2.188813480734825, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.2200198858976364, "step": 11360 }, { "epoch": 0.28425, "grad_norm": 28.25, "grad_norm_var": 6.669205729166666, "learning_rate": 0.0001, "loss": 7.4272, "loss/crossentropy": 2.0679334342479705, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.19683563616126776, "step": 11370 }, { "epoch": 0.2845, "grad_norm": 32.5, "grad_norm_var": 6.563997395833334, "learning_rate": 0.0001, "loss": 7.5581, "loss/crossentropy": 2.29857756793499, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19470676891505717, "step": 11380 }, { "epoch": 0.28475, "grad_norm": 31.0, "grad_norm_var": 2.73515625, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.061806722730398, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.20109398122876881, "step": 11390 }, { "epoch": 0.285, "grad_norm": 31.75, "grad_norm_var": 3.0205729166666666, "learning_rate": 0.0001, "loss": 7.668, "loss/crossentropy": 2.0487837977707386, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.2249346138909459, "step": 11400 }, { "epoch": 0.28525, "grad_norm": 43.25, "grad_norm_var": 3.0152302910081336e+18, "learning_rate": 0.0001, "loss": 7.5882, "loss/crossentropy": 2.071147194504738, "loss/hidden": 3.605078125, "loss/jsd": 0.0, "loss/logits": 0.19121263399720193, "step": 11410 }, { "epoch": 0.2855, "grad_norm": 30.0, "grad_norm_var": 4.0873121650067896e+18, "learning_rate": 0.0001, "loss": 7.6125, "loss/crossentropy": 2.136325827240944, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.19542958196252586, "step": 11420 }, { "epoch": 0.28575, "grad_norm": 49.5, "grad_norm_var": 3.85311092376286e+18, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.061227947473526, "loss/hidden": 3.62578125, "loss/jsd": 0.0, "loss/logits": 0.19353212472051382, "step": 11430 }, { "epoch": 0.286, "grad_norm": 31.75, "grad_norm_var": 3.8531109241063736e+18, "learning_rate": 0.0001, "loss": 7.4906, "loss/crossentropy": 2.1435365855693815, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.19920943658798934, "step": 11440 }, { "epoch": 0.28625, "grad_norm": 30.5, "grad_norm_var": 3.358268229166667, "learning_rate": 0.0001, "loss": 7.4355, "loss/crossentropy": 2.1882937461137772, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.20106471050530672, "step": 11450 }, { "epoch": 0.2865, "grad_norm": 32.75, "grad_norm_var": 42.1759765625, "learning_rate": 0.0001, "loss": 7.5564, "loss/crossentropy": 2.3196360021829605, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.20747811421751977, "step": 11460 }, { "epoch": 0.28675, "grad_norm": 29.25, "grad_norm_var": 39.1962890625, "learning_rate": 0.0001, "loss": 7.5218, "loss/crossentropy": 2.0665975011885167, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.18888808842748403, "step": 11470 }, { "epoch": 0.287, "grad_norm": 38.25, "grad_norm_var": 257.53515625, "learning_rate": 0.0001, "loss": 7.6489, "loss/crossentropy": 2.0905568569898607, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.21102672098204495, "step": 11480 }, { "epoch": 0.28725, "grad_norm": 33.5, "grad_norm_var": 254.4853515625, "learning_rate": 0.0001, "loss": 7.4738, "loss/crossentropy": 2.091688461601734, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.17688411958515643, "step": 11490 }, { "epoch": 0.2875, "grad_norm": 32.0, "grad_norm_var": 2.16875, "learning_rate": 0.0001, "loss": 7.5848, "loss/crossentropy": 2.127404825389385, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.2071824749931693, "step": 11500 }, { "epoch": 0.28775, "grad_norm": 30.625, "grad_norm_var": 1.8989583333333333, "learning_rate": 0.0001, "loss": 7.4661, "loss/crossentropy": 2.1476680278778075, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.1993227731436491, "step": 11510 }, { "epoch": 0.288, "grad_norm": 31.125, "grad_norm_var": 1.4770833333333333, "learning_rate": 0.0001, "loss": 7.5997, "loss/crossentropy": 2.168101379275322, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.2063988609239459, "step": 11520 }, { "epoch": 0.28825, "grad_norm": 32.5, "grad_norm_var": 2.9166666666666665, "learning_rate": 0.0001, "loss": 7.5253, "loss/crossentropy": 2.1605693727731703, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.20308145191520452, "step": 11530 }, { "epoch": 0.2885, "grad_norm": 32.5, "grad_norm_var": 5.273893229166666, "learning_rate": 0.0001, "loss": 7.3857, "loss/crossentropy": 2.2063374549150465, "loss/hidden": 3.294921875, "loss/jsd": 0.0, "loss/logits": 0.1978879366070032, "step": 11540 }, { "epoch": 0.28875, "grad_norm": 30.375, "grad_norm_var": 376.1587890625, "learning_rate": 0.0001, "loss": 7.5441, "loss/crossentropy": 2.1251575350761414, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.21420632135123013, "step": 11550 }, { "epoch": 0.289, "grad_norm": 29.875, "grad_norm_var": 1.8624348958333334, "learning_rate": 0.0001, "loss": 7.5367, "loss/crossentropy": 2.152555100619793, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.209245034866035, "step": 11560 }, { "epoch": 0.28925, "grad_norm": 27.125, "grad_norm_var": 6.71875, "learning_rate": 0.0001, "loss": 7.4824, "loss/crossentropy": 2.068640360236168, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.1873430425301194, "step": 11570 }, { "epoch": 0.2895, "grad_norm": 31.875, "grad_norm_var": 7.520768229166666, "learning_rate": 0.0001, "loss": 7.4681, "loss/crossentropy": 2.0883365333080293, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.19091288559138775, "step": 11580 }, { "epoch": 0.28975, "grad_norm": 31.625, "grad_norm_var": 4.349739583333333, "learning_rate": 0.0001, "loss": 7.4169, "loss/crossentropy": 2.042503895610571, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.20744232581928373, "step": 11590 }, { "epoch": 0.29, "grad_norm": 30.75, "grad_norm_var": 1.72265625, "learning_rate": 0.0001, "loss": 7.603, "loss/crossentropy": 2.1102429538965226, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.20691623724997044, "step": 11600 }, { "epoch": 0.29025, "grad_norm": 29.375, "grad_norm_var": 4.84140625, "learning_rate": 0.0001, "loss": 7.5562, "loss/crossentropy": 2.098295146226883, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.20017402190715075, "step": 11610 }, { "epoch": 0.2905, "grad_norm": 30.125, "grad_norm_var": 2.2684895833333334, "learning_rate": 0.0001, "loss": 7.5017, "loss/crossentropy": 2.139223408699036, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.20002739522606133, "step": 11620 }, { "epoch": 0.29075, "grad_norm": 29.875, "grad_norm_var": 4.357291666666667, "learning_rate": 0.0001, "loss": 7.5172, "loss/crossentropy": 2.1825324445962906, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.20125013943761588, "step": 11630 }, { "epoch": 0.291, "grad_norm": 32.75, "grad_norm_var": 3.46015625, "learning_rate": 0.0001, "loss": 7.599, "loss/crossentropy": 2.068669117987156, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.19485772978514432, "step": 11640 }, { "epoch": 0.29125, "grad_norm": 28.625, "grad_norm_var": 1.86015625, "learning_rate": 0.0001, "loss": 7.4215, "loss/crossentropy": 2.089947706460953, "loss/hidden": 3.319921875, "loss/jsd": 0.0, "loss/logits": 0.1832545857876539, "step": 11650 }, { "epoch": 0.2915, "grad_norm": 29.75, "grad_norm_var": 2.2494140625, "learning_rate": 0.0001, "loss": 7.4485, "loss/crossentropy": 2.203251999616623, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.20148768946528434, "step": 11660 }, { "epoch": 0.29175, "grad_norm": 29.5, "grad_norm_var": 2.1489583333333333, "learning_rate": 0.0001, "loss": 7.4095, "loss/crossentropy": 2.15777850151062, "loss/hidden": 3.4890625, "loss/jsd": 0.0, "loss/logits": 0.20323845893144607, "step": 11670 }, { "epoch": 0.292, "grad_norm": 28.875, "grad_norm_var": 2.42890625, "learning_rate": 0.0001, "loss": 7.4914, "loss/crossentropy": 2.156290701031685, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.21260349955409766, "step": 11680 }, { "epoch": 0.29225, "grad_norm": 30.875, "grad_norm_var": 2.283072916666667, "learning_rate": 0.0001, "loss": 7.5226, "loss/crossentropy": 2.218624970316887, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.21999017260968684, "step": 11690 }, { "epoch": 0.2925, "grad_norm": 29.5, "grad_norm_var": 1.7957682291666666, "learning_rate": 0.0001, "loss": 7.452, "loss/crossentropy": 2.146151360869408, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.19877625294029713, "step": 11700 }, { "epoch": 0.29275, "grad_norm": 32.75, "grad_norm_var": 2.1552083333333334, "learning_rate": 0.0001, "loss": 7.5397, "loss/crossentropy": 2.176427762210369, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.220915474742651, "step": 11710 }, { "epoch": 0.293, "grad_norm": 31.875, "grad_norm_var": 2.4330729166666667, "learning_rate": 0.0001, "loss": 7.4466, "loss/crossentropy": 1.9928596645593644, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.17805732283741235, "step": 11720 }, { "epoch": 0.29325, "grad_norm": 36.0, "grad_norm_var": 3.7087890625, "learning_rate": 0.0001, "loss": 7.4407, "loss/crossentropy": 2.04853350520134, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.19008731115609406, "step": 11730 }, { "epoch": 0.2935, "grad_norm": 31.0, "grad_norm_var": 3.638541666666667, "learning_rate": 0.0001, "loss": 7.4562, "loss/crossentropy": 2.1776413440704347, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.19186513982713221, "step": 11740 }, { "epoch": 0.29375, "grad_norm": 29.875, "grad_norm_var": 2.2343098958333334, "learning_rate": 0.0001, "loss": 7.473, "loss/crossentropy": 2.151021400094032, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.19510295912623404, "step": 11750 }, { "epoch": 0.294, "grad_norm": 31.25, "grad_norm_var": 3.453125, "learning_rate": 0.0001, "loss": 7.6263, "loss/crossentropy": 2.137827117741108, "loss/hidden": 3.48359375, "loss/jsd": 0.0, "loss/logits": 0.24175900630652905, "step": 11760 }, { "epoch": 0.29425, "grad_norm": 30.375, "grad_norm_var": 3.168684895833333, "learning_rate": 0.0001, "loss": 7.4144, "loss/crossentropy": 2.104247944802046, "loss/hidden": 3.28984375, "loss/jsd": 0.0, "loss/logits": 0.19388842806220055, "step": 11770 }, { "epoch": 0.2945, "grad_norm": 30.5, "grad_norm_var": 1.7197265625, "learning_rate": 0.0001, "loss": 7.5832, "loss/crossentropy": 2.234465000033379, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.19337044432759284, "step": 11780 }, { "epoch": 0.29475, "grad_norm": 31.5, "grad_norm_var": 1.7729166666666667, "learning_rate": 0.0001, "loss": 7.5465, "loss/crossentropy": 2.253598779439926, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.19784107208251953, "step": 11790 }, { "epoch": 0.295, "grad_norm": 30.5, "grad_norm_var": 1.7108723958333334, "learning_rate": 0.0001, "loss": 7.5977, "loss/crossentropy": 2.127879926562309, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.19584042821079492, "step": 11800 }, { "epoch": 0.29525, "grad_norm": 30.0, "grad_norm_var": 1.9541666666666666, "learning_rate": 0.0001, "loss": 7.5794, "loss/crossentropy": 2.1220794349908827, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.20339306239038707, "step": 11810 }, { "epoch": 0.2955, "grad_norm": 30.75, "grad_norm_var": 1.5384765625, "learning_rate": 0.0001, "loss": 7.5332, "loss/crossentropy": 2.1137719243764876, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.17815515641123056, "step": 11820 }, { "epoch": 0.29575, "grad_norm": 36.0, "grad_norm_var": 2.2046223958333333, "learning_rate": 0.0001, "loss": 7.5875, "loss/crossentropy": 2.060352697968483, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.20077992398291827, "step": 11830 }, { "epoch": 0.296, "grad_norm": 30.875, "grad_norm_var": 451.95625, "learning_rate": 0.0001, "loss": 7.4559, "loss/crossentropy": 2.0753053151071073, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.18949195835739374, "step": 11840 }, { "epoch": 0.29625, "grad_norm": 30.125, "grad_norm_var": 464.97389322916666, "learning_rate": 0.0001, "loss": 7.5175, "loss/crossentropy": 2.216064375638962, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.19813688658177853, "step": 11850 }, { "epoch": 0.2965, "grad_norm": 33.5, "grad_norm_var": 2.785872395833333, "learning_rate": 0.0001, "loss": 7.4496, "loss/crossentropy": 2.151813006401062, "loss/hidden": 3.473046875, "loss/jsd": 0.0, "loss/logits": 0.2109227592125535, "step": 11860 }, { "epoch": 0.29675, "grad_norm": 31.25, "grad_norm_var": 8.512955729166666, "learning_rate": 0.0001, "loss": 7.6473, "loss/crossentropy": 2.2002500608563422, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.22418888993561267, "step": 11870 }, { "epoch": 0.297, "grad_norm": 28.875, "grad_norm_var": 4.9947265625, "learning_rate": 0.0001, "loss": 7.5421, "loss/crossentropy": 2.161974515020847, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.20292557254433632, "step": 11880 }, { "epoch": 0.29725, "grad_norm": 30.375, "grad_norm_var": 4.8125, "learning_rate": 0.0001, "loss": 7.4689, "loss/crossentropy": 2.1570896983146666, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.18599064461886883, "step": 11890 }, { "epoch": 0.2975, "grad_norm": 31.875, "grad_norm_var": 3.1639973958333334, "learning_rate": 0.0001, "loss": 7.602, "loss/crossentropy": 2.116308979690075, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.20236566830426456, "step": 11900 }, { "epoch": 0.29775, "grad_norm": 32.0, "grad_norm_var": 1.478125, "learning_rate": 0.0001, "loss": 7.4508, "loss/crossentropy": 2.0929686918854715, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18616765905171634, "step": 11910 }, { "epoch": 0.298, "grad_norm": 31.375, "grad_norm_var": 5.142643229166667, "learning_rate": 0.0001, "loss": 7.5382, "loss/crossentropy": 2.0649349838495255, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.22931914012879134, "step": 11920 }, { "epoch": 0.29825, "grad_norm": 28.375, "grad_norm_var": 5.2009765625, "learning_rate": 0.0001, "loss": 7.5117, "loss/crossentropy": 2.218345195055008, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19991735983639955, "step": 11930 }, { "epoch": 0.2985, "grad_norm": 29.375, "grad_norm_var": 1.8608723958333333, "learning_rate": 0.0001, "loss": 7.4722, "loss/crossentropy": 2.125768192112446, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.20784178506582976, "step": 11940 }, { "epoch": 0.29875, "grad_norm": 30.0, "grad_norm_var": 4.0625, "learning_rate": 0.0001, "loss": 7.5416, "loss/crossentropy": 2.1809418946504593, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.20119600538164378, "step": 11950 }, { "epoch": 0.299, "grad_norm": 30.75, "grad_norm_var": 2.268684895833333, "learning_rate": 0.0001, "loss": 7.4921, "loss/crossentropy": 2.1606124877929687, "loss/hidden": 3.34296875, "loss/jsd": 0.0, "loss/logits": 0.20459957644343377, "step": 11960 }, { "epoch": 0.29925, "grad_norm": 30.25, "grad_norm_var": 2.785872395833333, "learning_rate": 0.0001, "loss": 7.4666, "loss/crossentropy": 2.0705448195338247, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.1915181243792176, "step": 11970 }, { "epoch": 0.2995, "grad_norm": 32.25, "grad_norm_var": 3.0718098958333333, "learning_rate": 0.0001, "loss": 7.5573, "loss/crossentropy": 2.219760975241661, "loss/hidden": 3.5890625, "loss/jsd": 0.0, "loss/logits": 0.2002726301550865, "step": 11980 }, { "epoch": 0.29975, "grad_norm": 29.125, "grad_norm_var": 2.840625, "learning_rate": 0.0001, "loss": 7.4501, "loss/crossentropy": 2.1825525611639023, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.1877422071993351, "step": 11990 }, { "epoch": 0.3, "grad_norm": 33.5, "grad_norm_var": 3.609375, "learning_rate": 0.0001, "loss": 7.568, "loss/crossentropy": 2.034752905368805, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.19768680389970542, "step": 12000 }, { "epoch": 0.30025, "grad_norm": 32.5, "grad_norm_var": 1.9072916666666666, "learning_rate": 0.0001, "loss": 7.4314, "loss/crossentropy": 1.9573114350438119, "loss/hidden": 3.284375, "loss/jsd": 0.0, "loss/logits": 0.17485380116850138, "step": 12010 }, { "epoch": 0.3005, "grad_norm": 33.0, "grad_norm_var": 1.8921223958333333, "learning_rate": 0.0001, "loss": 7.5809, "loss/crossentropy": 2.2018728822469713, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.1915513090789318, "step": 12020 }, { "epoch": 0.30075, "grad_norm": 30.75, "grad_norm_var": 2.051041666666667, "learning_rate": 0.0001, "loss": 7.4786, "loss/crossentropy": 2.121686166524887, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.17941332049667835, "step": 12030 }, { "epoch": 0.301, "grad_norm": 31.0, "grad_norm_var": 1.1660807291666666, "learning_rate": 0.0001, "loss": 7.516, "loss/crossentropy": 2.146714040637016, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.18955296482890843, "step": 12040 }, { "epoch": 0.30125, "grad_norm": 29.875, "grad_norm_var": 4.1916015625, "learning_rate": 0.0001, "loss": 7.4308, "loss/crossentropy": 2.1516218155622484, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19275461547076703, "step": 12050 }, { "epoch": 0.3015, "grad_norm": 29.0, "grad_norm_var": 4.0244140625, "learning_rate": 0.0001, "loss": 7.4391, "loss/crossentropy": 2.082157927751541, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.2028081189841032, "step": 12060 }, { "epoch": 0.30175, "grad_norm": 8153726976.0, "grad_norm_var": 4.1552039435295524e+18, "learning_rate": 0.0001, "loss": 7.5841, "loss/crossentropy": 2.155959886312485, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.21119905970990657, "step": 12070 }, { "epoch": 0.302, "grad_norm": 31.875, "grad_norm_var": 4.1552039419327805e+18, "learning_rate": 0.0001, "loss": 7.4881, "loss/crossentropy": 2.175818865001202, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19981482364237307, "step": 12080 }, { "epoch": 0.30225, "grad_norm": 30.625, "grad_norm_var": 2.1306640625, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.153113231062889, "loss/hidden": 3.305078125, "loss/jsd": 0.0, "loss/logits": 0.18751353081315755, "step": 12090 }, { "epoch": 0.3025, "grad_norm": 29.125, "grad_norm_var": 2.55, "learning_rate": 0.0001, "loss": 7.516, "loss/crossentropy": 2.113540416955948, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.18914419934153556, "step": 12100 }, { "epoch": 0.30275, "grad_norm": 34.0, "grad_norm_var": 2.664322916666667, "learning_rate": 0.0001, "loss": 7.5051, "loss/crossentropy": 2.0748536720871926, "loss/hidden": 3.541015625, "loss/jsd": 0.0, "loss/logits": 0.21318843495100737, "step": 12110 }, { "epoch": 0.303, "grad_norm": 33.25, "grad_norm_var": 2.9830729166666665, "learning_rate": 0.0001, "loss": 7.5366, "loss/crossentropy": 2.1331150621175765, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.19423545580357313, "step": 12120 }, { "epoch": 0.30325, "grad_norm": 31.125, "grad_norm_var": 2.4400390625, "learning_rate": 0.0001, "loss": 7.4711, "loss/crossentropy": 2.220665395259857, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.21431318037211894, "step": 12130 }, { "epoch": 0.3035, "grad_norm": 32.0, "grad_norm_var": 2.9233723958333333, "learning_rate": 0.0001, "loss": 7.5293, "loss/crossentropy": 2.076695668697357, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.19359768629074098, "step": 12140 }, { "epoch": 0.30375, "grad_norm": 30.5, "grad_norm_var": 2.037239583333333, "learning_rate": 0.0001, "loss": 7.502, "loss/crossentropy": 2.007551699131727, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.18256566096097232, "step": 12150 }, { "epoch": 0.304, "grad_norm": 33.5, "grad_norm_var": 2.1858723958333335, "learning_rate": 0.0001, "loss": 7.5602, "loss/crossentropy": 2.2614489823579786, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.21236035842448472, "step": 12160 }, { "epoch": 0.30425, "grad_norm": 29.125, "grad_norm_var": 4.1150390625, "learning_rate": 0.0001, "loss": 7.4919, "loss/crossentropy": 2.102987366914749, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19789319038391112, "step": 12170 }, { "epoch": 0.3045, "grad_norm": 31.5, "grad_norm_var": 2.8921223958333333, "learning_rate": 0.0001, "loss": 7.5418, "loss/crossentropy": 2.1469786316156387, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.20192387700080872, "step": 12180 }, { "epoch": 0.30475, "grad_norm": 29.375, "grad_norm_var": 1.7893229166666667, "learning_rate": 0.0001, "loss": 7.4362, "loss/crossentropy": 2.068286693096161, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19657166097313167, "step": 12190 }, { "epoch": 0.305, "grad_norm": 31.125, "grad_norm_var": 1.6239583333333334, "learning_rate": 0.0001, "loss": 7.4976, "loss/crossentropy": 2.0656373471021654, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.18264612033963204, "step": 12200 }, { "epoch": 0.30525, "grad_norm": 32.25, "grad_norm_var": 2.4760416666666667, "learning_rate": 0.0001, "loss": 7.5079, "loss/crossentropy": 2.0986109912395476, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.18751255813986062, "step": 12210 }, { "epoch": 0.3055, "grad_norm": 32.25, "grad_norm_var": 3.784309895833333, "learning_rate": 0.0001, "loss": 7.5652, "loss/crossentropy": 2.128317725658417, "loss/hidden": 3.52890625, "loss/jsd": 0.0, "loss/logits": 0.19654161781072615, "step": 12220 }, { "epoch": 0.30575, "grad_norm": 29.375, "grad_norm_var": 4.205989583333333, "learning_rate": 0.0001, "loss": 7.4884, "loss/crossentropy": 2.1307172656059263, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.1911188881844282, "step": 12230 }, { "epoch": 0.306, "grad_norm": 29.0, "grad_norm_var": 3.9622395833333335, "learning_rate": 0.0001, "loss": 7.5641, "loss/crossentropy": 2.106894627213478, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.19271711856126786, "step": 12240 }, { "epoch": 0.30625, "grad_norm": 29.25, "grad_norm_var": 14.451822916666666, "learning_rate": 0.0001, "loss": 7.5277, "loss/crossentropy": 2.09238311201334, "loss/hidden": 3.2703125, "loss/jsd": 0.0, "loss/logits": 0.1898360250517726, "step": 12250 }, { "epoch": 0.3065, "grad_norm": 30.375, "grad_norm_var": 44.141666666666666, "learning_rate": 0.0001, "loss": 7.5351, "loss/crossentropy": 2.189305357635021, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.2375885730609298, "step": 12260 }, { "epoch": 0.30675, "grad_norm": 31.125, "grad_norm_var": 2.3309895833333334, "learning_rate": 0.0001, "loss": 7.518, "loss/crossentropy": 2.0894197657704354, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.18371071619912982, "step": 12270 }, { "epoch": 0.307, "grad_norm": 34.0, "grad_norm_var": 3.203125, "learning_rate": 0.0001, "loss": 7.5372, "loss/crossentropy": 2.2316055417060854, "loss/hidden": 3.312890625, "loss/jsd": 0.0, "loss/logits": 0.1975148657336831, "step": 12280 }, { "epoch": 0.30725, "grad_norm": 33.5, "grad_norm_var": 3.3671223958333334, "learning_rate": 0.0001, "loss": 7.499, "loss/crossentropy": 2.168405243754387, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.2092548543587327, "step": 12290 }, { "epoch": 0.3075, "grad_norm": 29.875, "grad_norm_var": 3.1372395833333333, "learning_rate": 0.0001, "loss": 7.4549, "loss/crossentropy": 2.1289770871400835, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.1914392326027155, "step": 12300 }, { "epoch": 0.30775, "grad_norm": 31.75, "grad_norm_var": 15.703125, "learning_rate": 0.0001, "loss": 7.4803, "loss/crossentropy": 1.9998106390237809, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.1981509406119585, "step": 12310 }, { "epoch": 0.308, "grad_norm": 30.75, "grad_norm_var": 2.53125, "learning_rate": 0.0001, "loss": 7.5639, "loss/crossentropy": 2.1574712097644806, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19861921556293965, "step": 12320 }, { "epoch": 0.30825, "grad_norm": 30.375, "grad_norm_var": 3.6462890625, "learning_rate": 0.0001, "loss": 7.5662, "loss/crossentropy": 2.1368797808885573, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.19260376282036304, "step": 12330 }, { "epoch": 0.3085, "grad_norm": 31.625, "grad_norm_var": 3.4353515625, "learning_rate": 0.0001, "loss": 7.5165, "loss/crossentropy": 2.157291880249977, "loss/hidden": 3.326171875, "loss/jsd": 0.0, "loss/logits": 0.19906458482146264, "step": 12340 }, { "epoch": 0.30875, "grad_norm": 31.25, "grad_norm_var": 1.84765625, "learning_rate": 0.0001, "loss": 7.5003, "loss/crossentropy": 2.198679545521736, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.20154633428901433, "step": 12350 }, { "epoch": 0.309, "grad_norm": 30.75, "grad_norm_var": 6.72890625, "learning_rate": 0.0001, "loss": 7.3993, "loss/crossentropy": 2.0015364930033686, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.17583242971450092, "step": 12360 }, { "epoch": 0.30925, "grad_norm": 32.25, "grad_norm_var": 6.652083333333334, "learning_rate": 0.0001, "loss": 7.5535, "loss/crossentropy": 2.0551007747650147, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.17546763848513364, "step": 12370 }, { "epoch": 0.3095, "grad_norm": 33.25, "grad_norm_var": 4.09140625, "learning_rate": 0.0001, "loss": 7.5495, "loss/crossentropy": 2.1019306644797324, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.21077220756560563, "step": 12380 }, { "epoch": 0.30975, "grad_norm": 28.625, "grad_norm_var": 5.5947265625, "learning_rate": 0.0001, "loss": 7.5257, "loss/crossentropy": 2.059135194122791, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.19150259550660848, "step": 12390 }, { "epoch": 0.31, "grad_norm": 29.75, "grad_norm_var": 2.9872395833333334, "learning_rate": 0.0001, "loss": 7.5651, "loss/crossentropy": 2.1537068828940393, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.20102579537779092, "step": 12400 }, { "epoch": 0.31025, "grad_norm": 31.5, "grad_norm_var": 1.4660807291666667, "learning_rate": 0.0001, "loss": 7.4731, "loss/crossentropy": 2.1616740792989733, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.19620374999940396, "step": 12410 }, { "epoch": 0.3105, "grad_norm": 28.5, "grad_norm_var": 19.721809895833335, "learning_rate": 0.0001, "loss": 7.5764, "loss/crossentropy": 2.094822147488594, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.18539618086069823, "step": 12420 }, { "epoch": 0.31075, "grad_norm": 32.25, "grad_norm_var": 9.62265625, "learning_rate": 0.0001, "loss": 7.5147, "loss/crossentropy": 2.061335799098015, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.2312490541487932, "step": 12430 }, { "epoch": 0.311, "grad_norm": 34.0, "grad_norm_var": 3.849739583333333, "learning_rate": 0.0001, "loss": 7.47, "loss/crossentropy": 2.0725695550441743, "loss/hidden": 3.35078125, "loss/jsd": 0.0, "loss/logits": 0.18395131248980762, "step": 12440 }, { "epoch": 0.31125, "grad_norm": 31.625, "grad_norm_var": 37.8478515625, "learning_rate": 0.0001, "loss": 7.5891, "loss/crossentropy": 2.177759502083063, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.19881557505577802, "step": 12450 }, { "epoch": 0.3115, "grad_norm": 30.0, "grad_norm_var": 37.426822916666666, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.215823370218277, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.21547670513391495, "step": 12460 }, { "epoch": 0.31175, "grad_norm": 29.25, "grad_norm_var": 3.113997395833333, "learning_rate": 0.0001, "loss": 7.5074, "loss/crossentropy": 2.152268370985985, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.19586993865668773, "step": 12470 }, { "epoch": 0.312, "grad_norm": 31.125, "grad_norm_var": 2.90625, "learning_rate": 0.0001, "loss": 7.4881, "loss/crossentropy": 2.1033642858266832, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.19163406621664764, "step": 12480 }, { "epoch": 0.31225, "grad_norm": 30.0, "grad_norm_var": 1.8848307291666666, "learning_rate": 0.0001, "loss": 7.4933, "loss/crossentropy": 2.1959578454494477, "loss/hidden": 3.286328125, "loss/jsd": 0.0, "loss/logits": 0.19511055387556553, "step": 12490 }, { "epoch": 0.3125, "grad_norm": 33.5, "grad_norm_var": 2.41875, "learning_rate": 0.0001, "loss": 7.4391, "loss/crossentropy": 2.195823746919632, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.17207753676921128, "step": 12500 }, { "epoch": 0.31275, "grad_norm": 29.0, "grad_norm_var": 3.8358723958333334, "learning_rate": 0.0001, "loss": 7.5033, "loss/crossentropy": 2.0963750213384627, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.17908218652009963, "step": 12510 }, { "epoch": 0.313, "grad_norm": 30.875, "grad_norm_var": 2.308333333333333, "learning_rate": 0.0001, "loss": 7.4239, "loss/crossentropy": 1.9604337126016618, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.18109768088907002, "step": 12520 }, { "epoch": 0.31325, "grad_norm": 33.5, "grad_norm_var": 5.990625, "learning_rate": 0.0001, "loss": 7.48, "loss/crossentropy": 2.1750424414873124, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.19859890583902598, "step": 12530 }, { "epoch": 0.3135, "grad_norm": 34.75, "grad_norm_var": 2.2145182291666665, "learning_rate": 0.0001, "loss": 7.5841, "loss/crossentropy": 2.2009086132049562, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.20133890733122825, "step": 12540 }, { "epoch": 0.31375, "grad_norm": 34.25, "grad_norm_var": 3.0931640625, "learning_rate": 0.0001, "loss": 7.4803, "loss/crossentropy": 2.3103501707315446, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.2070220198482275, "step": 12550 }, { "epoch": 0.314, "grad_norm": 32.75, "grad_norm_var": 20.39140625, "learning_rate": 0.0001, "loss": 7.5026, "loss/crossentropy": 2.0587240263819693, "loss/hidden": 3.28203125, "loss/jsd": 0.0, "loss/logits": 0.18765954226255416, "step": 12560 }, { "epoch": 0.31425, "grad_norm": 31.0, "grad_norm_var": 1.32265625, "learning_rate": 0.0001, "loss": 7.5031, "loss/crossentropy": 2.194789284467697, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.19717307426035405, "step": 12570 }, { "epoch": 0.3145, "grad_norm": 32.25, "grad_norm_var": 2.5416666666666665, "learning_rate": 0.0001, "loss": 7.4895, "loss/crossentropy": 2.2054634511470796, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.1899452781304717, "step": 12580 }, { "epoch": 0.31475, "grad_norm": 30.75, "grad_norm_var": 2.279166666666667, "learning_rate": 0.0001, "loss": 7.4928, "loss/crossentropy": 2.1559681832790374, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.18954651188105345, "step": 12590 }, { "epoch": 0.315, "grad_norm": 32.5, "grad_norm_var": 2.5337890625, "learning_rate": 0.0001, "loss": 7.4422, "loss/crossentropy": 2.254222446680069, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.19737180043011904, "step": 12600 }, { "epoch": 0.31525, "grad_norm": 30.375, "grad_norm_var": 2.3681640625, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.146516689658165, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.20649947822093964, "step": 12610 }, { "epoch": 0.3155, "grad_norm": 29.25, "grad_norm_var": 2.608268229166667, "learning_rate": 0.0001, "loss": 7.4844, "loss/crossentropy": 2.0874697089195253, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.19496283829212188, "step": 12620 }, { "epoch": 0.31575, "grad_norm": 31.0, "grad_norm_var": 13.2681640625, "learning_rate": 0.0001, "loss": 7.55, "loss/crossentropy": 2.1777540147304535, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.20055020954459907, "step": 12630 }, { "epoch": 0.316, "grad_norm": 29.75, "grad_norm_var": 13.5791015625, "learning_rate": 0.0001, "loss": 7.4849, "loss/crossentropy": 2.1960367292165754, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.19755847714841365, "step": 12640 }, { "epoch": 0.31625, "grad_norm": 33.0, "grad_norm_var": 2.7603515625, "learning_rate": 0.0001, "loss": 7.4898, "loss/crossentropy": 2.1551318049430845, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.2070561083033681, "step": 12650 }, { "epoch": 0.3165, "grad_norm": 32.5, "grad_norm_var": 2.3018229166666666, "learning_rate": 0.0001, "loss": 7.4701, "loss/crossentropy": 2.1339357078075407, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.19703674577176572, "step": 12660 }, { "epoch": 0.31675, "grad_norm": 29.5, "grad_norm_var": 11.317643229166666, "learning_rate": 0.0001, "loss": 7.4713, "loss/crossentropy": 2.0686328157782556, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.1992340425029397, "step": 12670 }, { "epoch": 0.317, "grad_norm": 28.5, "grad_norm_var": 2.022330729166667, "learning_rate": 0.0001, "loss": 7.3718, "loss/crossentropy": 2.1510003715753556, "loss/hidden": 3.285546875, "loss/jsd": 0.0, "loss/logits": 0.1883657006546855, "step": 12680 }, { "epoch": 0.31725, "grad_norm": 31.875, "grad_norm_var": 1.9809895833333333, "learning_rate": 0.0001, "loss": 7.4673, "loss/crossentropy": 2.0955928951501845, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.19225707659497857, "step": 12690 }, { "epoch": 0.3175, "grad_norm": 30.5, "grad_norm_var": 2.220572916666667, "learning_rate": 0.0001, "loss": 7.4942, "loss/crossentropy": 2.178106242418289, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.19251996818929912, "step": 12700 }, { "epoch": 0.31775, "grad_norm": 33.25, "grad_norm_var": 2.0405598958333333, "learning_rate": 0.0001, "loss": 7.419, "loss/crossentropy": 2.323311746120453, "loss/hidden": 3.244140625, "loss/jsd": 0.0, "loss/logits": 0.18311577010899782, "step": 12710 }, { "epoch": 0.318, "grad_norm": 30.125, "grad_norm_var": 2.255143229166667, "learning_rate": 0.0001, "loss": 7.3814, "loss/crossentropy": 2.166349285840988, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19064488224685192, "step": 12720 }, { "epoch": 0.31825, "grad_norm": 31.875, "grad_norm_var": 2.2400390625, "learning_rate": 0.0001, "loss": 7.4573, "loss/crossentropy": 2.0821331262588503, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.19217007402330638, "step": 12730 }, { "epoch": 0.3185, "grad_norm": 29.125, "grad_norm_var": 2.31640625, "learning_rate": 0.0001, "loss": 7.3945, "loss/crossentropy": 2.2248596966266634, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.1899226889014244, "step": 12740 }, { "epoch": 0.31875, "grad_norm": 28.25, "grad_norm_var": 1.87890625, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 2.2382041692733763, "loss/hidden": 3.28671875, "loss/jsd": 0.0, "loss/logits": 0.1984447505325079, "step": 12750 }, { "epoch": 0.319, "grad_norm": 29.0, "grad_norm_var": 2.644791666666667, "learning_rate": 0.0001, "loss": 7.566, "loss/crossentropy": 2.1457022726535797, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.2007513264194131, "step": 12760 }, { "epoch": 0.31925, "grad_norm": 31.75, "grad_norm_var": 2.006705729166667, "learning_rate": 0.0001, "loss": 7.3134, "loss/crossentropy": 2.0300011321902276, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.18935967702418566, "step": 12770 }, { "epoch": 0.3195, "grad_norm": 31.25, "grad_norm_var": 21.826041666666665, "learning_rate": 0.0001, "loss": 7.5243, "loss/crossentropy": 2.1192230358719826, "loss/hidden": 3.566015625, "loss/jsd": 0.0, "loss/logits": 0.23796151615679265, "step": 12780 }, { "epoch": 0.31975, "grad_norm": 29.125, "grad_norm_var": 3.03125, "learning_rate": 0.0001, "loss": 7.4538, "loss/crossentropy": 2.1349218785762787, "loss/hidden": 3.291796875, "loss/jsd": 0.0, "loss/logits": 0.18145886678248643, "step": 12790 }, { "epoch": 0.32, "grad_norm": 30.0, "grad_norm_var": 2.998893229166667, "learning_rate": 0.0001, "loss": 7.5129, "loss/crossentropy": 2.0976530820131303, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.19899471253156661, "step": 12800 }, { "epoch": 0.32025, "grad_norm": 30.375, "grad_norm_var": 2.8134765625, "learning_rate": 0.0001, "loss": 7.3964, "loss/crossentropy": 2.1131363533437253, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.19826100897043944, "step": 12810 }, { "epoch": 0.3205, "grad_norm": 29.125, "grad_norm_var": 2.3947265625, "learning_rate": 0.0001, "loss": 7.3552, "loss/crossentropy": 2.1666725143790244, "loss/hidden": 3.25625, "loss/jsd": 0.0, "loss/logits": 0.17309209685772659, "step": 12820 }, { "epoch": 0.32075, "grad_norm": 35.25, "grad_norm_var": 6.137434895833334, "learning_rate": 0.0001, "loss": 7.4774, "loss/crossentropy": 2.0877491027116775, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.19852426294237374, "step": 12830 }, { "epoch": 0.321, "grad_norm": 31.25, "grad_norm_var": 31.881184895833332, "learning_rate": 0.0001, "loss": 7.5434, "loss/crossentropy": 2.0780150018632413, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.19723174571990967, "step": 12840 }, { "epoch": 0.32125, "grad_norm": 30.625, "grad_norm_var": 7.818489583333333, "learning_rate": 0.0001, "loss": 7.393, "loss/crossentropy": 2.260560867190361, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.2075773360207677, "step": 12850 }, { "epoch": 0.3215, "grad_norm": 32.0, "grad_norm_var": 10.083072916666667, "learning_rate": 0.0001, "loss": 7.3997, "loss/crossentropy": 2.1417742699384688, "loss/hidden": 3.273828125, "loss/jsd": 0.0, "loss/logits": 0.17287890873849393, "step": 12860 }, { "epoch": 0.32175, "grad_norm": 27.75, "grad_norm_var": 2.2705729166666666, "learning_rate": 0.0001, "loss": 7.3752, "loss/crossentropy": 2.131768229603767, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.19949521739035844, "step": 12870 }, { "epoch": 0.322, "grad_norm": 31.625, "grad_norm_var": 1.3863932291666667, "learning_rate": 0.0001, "loss": 7.4512, "loss/crossentropy": 2.266862842440605, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.19267170149832963, "step": 12880 }, { "epoch": 0.32225, "grad_norm": 30.0, "grad_norm_var": 4.132291666666666, "learning_rate": 0.0001, "loss": 7.534, "loss/crossentropy": 2.0951140992343427, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.2013495948165655, "step": 12890 }, { "epoch": 0.3225, "grad_norm": 31.5, "grad_norm_var": 2.995833333333333, "learning_rate": 0.0001, "loss": 7.4035, "loss/crossentropy": 2.2091780215501786, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.2002504987642169, "step": 12900 }, { "epoch": 0.32275, "grad_norm": 30.375, "grad_norm_var": 0.6301432291666667, "learning_rate": 0.0001, "loss": 7.4548, "loss/crossentropy": 2.1198602825403214, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.18528146538883447, "step": 12910 }, { "epoch": 0.323, "grad_norm": 28.875, "grad_norm_var": 0.8921223958333333, "learning_rate": 0.0001, "loss": 7.4588, "loss/crossentropy": 2.18964803814888, "loss/hidden": 3.310546875, "loss/jsd": 0.0, "loss/logits": 0.19150267392396927, "step": 12920 }, { "epoch": 0.32325, "grad_norm": 29.625, "grad_norm_var": 1.7434895833333333, "learning_rate": 0.0001, "loss": 7.5432, "loss/crossentropy": 2.1617533951997756, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.20485697444528342, "step": 12930 }, { "epoch": 0.3235, "grad_norm": 29.75, "grad_norm_var": 1.9655598958333333, "learning_rate": 0.0001, "loss": 7.5072, "loss/crossentropy": 1.9271393403410912, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.19526469726115464, "step": 12940 }, { "epoch": 0.32375, "grad_norm": 31.875, "grad_norm_var": 2.6333333333333333, "learning_rate": 0.0001, "loss": 7.5258, "loss/crossentropy": 2.0988502144813537, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.18954623490571976, "step": 12950 }, { "epoch": 0.324, "grad_norm": 30.25, "grad_norm_var": 3.6372395833333333, "learning_rate": 0.0001, "loss": 7.4828, "loss/crossentropy": 2.0505402773618697, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.19810300767421724, "step": 12960 }, { "epoch": 0.32425, "grad_norm": 29.25, "grad_norm_var": 2.749739583333333, "learning_rate": 0.0001, "loss": 7.5484, "loss/crossentropy": 2.0827158600091935, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.18706083726137876, "step": 12970 }, { "epoch": 0.3245, "grad_norm": 30.375, "grad_norm_var": 2.0462890625, "learning_rate": 0.0001, "loss": 7.516, "loss/crossentropy": 2.1275136440992357, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.18214173689484597, "step": 12980 }, { "epoch": 0.32475, "grad_norm": 30.5, "grad_norm_var": 8.691080729166666, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.2110335171222686, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.21523924693465232, "step": 12990 }, { "epoch": 0.325, "grad_norm": 32.75, "grad_norm_var": 7.461393229166666, "learning_rate": 0.0001, "loss": 7.4891, "loss/crossentropy": 2.147765000164509, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.19382500275969505, "step": 13000 }, { "epoch": 0.32525, "grad_norm": 30.125, "grad_norm_var": 2.283072916666667, "learning_rate": 0.0001, "loss": 7.4506, "loss/crossentropy": 2.0027987882494926, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.18850265592336654, "step": 13010 }, { "epoch": 0.3255, "grad_norm": 29.25, "grad_norm_var": 4.158333333333333, "learning_rate": 0.0001, "loss": 7.4449, "loss/crossentropy": 2.239279788732529, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.2110375264659524, "step": 13020 }, { "epoch": 0.32575, "grad_norm": 29.0, "grad_norm_var": 5.57265625, "learning_rate": 0.0001, "loss": 7.3912, "loss/crossentropy": 2.1745992332696913, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.2059601791203022, "step": 13030 }, { "epoch": 0.326, "grad_norm": 29.125, "grad_norm_var": 8.201822916666666, "learning_rate": 0.0001, "loss": 7.4196, "loss/crossentropy": 2.1236157566308975, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19811425134539604, "step": 13040 }, { "epoch": 0.32625, "grad_norm": 29.125, "grad_norm_var": 14.8509765625, "learning_rate": 0.0001, "loss": 7.3974, "loss/crossentropy": 2.1588226526975633, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.2041918309405446, "step": 13050 }, { "epoch": 0.3265, "grad_norm": 33.25, "grad_norm_var": 14.5853515625, "learning_rate": 0.0001, "loss": 7.4069, "loss/crossentropy": 2.1836913257837294, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.1884943839162588, "step": 13060 }, { "epoch": 0.32675, "grad_norm": 28.875, "grad_norm_var": 15.664518229166667, "learning_rate": 0.0001, "loss": 7.5712, "loss/crossentropy": 2.173456999659538, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.20252696424722672, "step": 13070 }, { "epoch": 0.327, "grad_norm": 32.0, "grad_norm_var": 13.63515625, "learning_rate": 0.0001, "loss": 7.4706, "loss/crossentropy": 2.222018560767174, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.19077708926051856, "step": 13080 }, { "epoch": 0.32725, "grad_norm": 31.625, "grad_norm_var": 8.099934895833334, "learning_rate": 0.0001, "loss": 7.5034, "loss/crossentropy": 2.2448195964097977, "loss/hidden": 3.5734375, "loss/jsd": 0.0, "loss/logits": 0.241958281211555, "step": 13090 }, { "epoch": 0.3275, "grad_norm": 28.625, "grad_norm_var": 10.601822916666666, "learning_rate": 0.0001, "loss": 7.4343, "loss/crossentropy": 2.231740725040436, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.1907974945381284, "step": 13100 }, { "epoch": 0.32775, "grad_norm": 35.25, "grad_norm_var": 15.445247395833333, "learning_rate": 0.0001, "loss": 7.45, "loss/crossentropy": 2.116320764273405, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.17694039307534695, "step": 13110 }, { "epoch": 0.328, "grad_norm": 35.25, "grad_norm_var": 11.502018229166667, "learning_rate": 0.0001, "loss": 7.5445, "loss/crossentropy": 2.099977496266365, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.20624227696098388, "step": 13120 }, { "epoch": 0.32825, "grad_norm": 30.75, "grad_norm_var": 7.521875, "learning_rate": 0.0001, "loss": 7.4769, "loss/crossentropy": 2.1053822576999663, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.19216917939484118, "step": 13130 }, { "epoch": 0.3285, "grad_norm": 34.75, "grad_norm_var": 14.083268229166666, "learning_rate": 0.0001, "loss": 7.4462, "loss/crossentropy": 1.9960962146520616, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.18692312780767678, "step": 13140 }, { "epoch": 0.32875, "grad_norm": 29.0, "grad_norm_var": 10.630208333333334, "learning_rate": 0.0001, "loss": 7.4762, "loss/crossentropy": 2.1613951712846755, "loss/hidden": 3.537109375, "loss/jsd": 0.0, "loss/logits": 0.20941741671413183, "step": 13150 }, { "epoch": 0.329, "grad_norm": 33.5, "grad_norm_var": 6.3900390625, "learning_rate": 0.0001, "loss": 7.4808, "loss/crossentropy": 2.189284533262253, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19653193168342115, "step": 13160 }, { "epoch": 0.32925, "grad_norm": 30.625, "grad_norm_var": 2.8489583333333335, "learning_rate": 0.0001, "loss": 7.4326, "loss/crossentropy": 2.1141976565122604, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.18286358639597894, "step": 13170 }, { "epoch": 0.3295, "grad_norm": 31.25, "grad_norm_var": 5.464583333333334, "learning_rate": 0.0001, "loss": 7.3723, "loss/crossentropy": 2.1913035601377486, "loss/hidden": 3.272265625, "loss/jsd": 0.0, "loss/logits": 0.1858003603294492, "step": 13180 }, { "epoch": 0.32975, "grad_norm": 29.375, "grad_norm_var": 17.029622395833332, "learning_rate": 0.0001, "loss": 7.4759, "loss/crossentropy": 1.9873070642352104, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.1818276308476925, "step": 13190 }, { "epoch": 0.33, "grad_norm": 32.5, "grad_norm_var": 4.840559895833334, "learning_rate": 0.0001, "loss": 7.4491, "loss/crossentropy": 2.0309854298830032, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.18652353323996068, "step": 13200 }, { "epoch": 0.33025, "grad_norm": 31.625, "grad_norm_var": 3.896875, "learning_rate": 0.0001, "loss": 7.4246, "loss/crossentropy": 2.1724058866500853, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.19039318859577178, "step": 13210 }, { "epoch": 0.3305, "grad_norm": 33.25, "grad_norm_var": 3.903580729166667, "learning_rate": 0.0001, "loss": 7.5702, "loss/crossentropy": 2.144900679588318, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.19142087129876018, "step": 13220 }, { "epoch": 0.33075, "grad_norm": 31.125, "grad_norm_var": 4.763997395833333, "learning_rate": 0.0001, "loss": 7.4713, "loss/crossentropy": 2.1755593717098236, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.18869520910084248, "step": 13230 }, { "epoch": 0.331, "grad_norm": 30.25, "grad_norm_var": 2.674934895833333, "learning_rate": 0.0001, "loss": 7.3374, "loss/crossentropy": 2.2121481001377106, "loss/hidden": 3.27421875, "loss/jsd": 0.0, "loss/logits": 0.18241031859070062, "step": 13240 }, { "epoch": 0.33125, "grad_norm": 29.75, "grad_norm_var": 2.4494140625, "learning_rate": 0.0001, "loss": 7.3846, "loss/crossentropy": 2.166096642613411, "loss/hidden": 3.28203125, "loss/jsd": 0.0, "loss/logits": 0.18299505580216646, "step": 13250 }, { "epoch": 0.3315, "grad_norm": 30.625, "grad_norm_var": 1.7018229166666667, "learning_rate": 0.0001, "loss": 7.4058, "loss/crossentropy": 2.0837877124547957, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.18499783985316753, "step": 13260 }, { "epoch": 0.33175, "grad_norm": 31.375, "grad_norm_var": 1.5603515625, "learning_rate": 0.0001, "loss": 7.4647, "loss/crossentropy": 2.114008131623268, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.1959222162142396, "step": 13270 }, { "epoch": 0.332, "grad_norm": 30.5, "grad_norm_var": 5.910872395833334, "learning_rate": 0.0001, "loss": 7.511, "loss/crossentropy": 2.2248871475458145, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.18698529433459044, "step": 13280 }, { "epoch": 0.33225, "grad_norm": 33.75, "grad_norm_var": 4.077018229166667, "learning_rate": 0.0001, "loss": 7.4879, "loss/crossentropy": 2.203841781616211, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.2007746297866106, "step": 13290 }, { "epoch": 0.3325, "grad_norm": 33.25, "grad_norm_var": 2.0483723958333333, "learning_rate": 0.0001, "loss": 7.5087, "loss/crossentropy": 2.1073865801095963, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.2040329247713089, "step": 13300 }, { "epoch": 0.33275, "grad_norm": 31.5, "grad_norm_var": 5.128580729166667, "learning_rate": 0.0001, "loss": 7.425, "loss/crossentropy": 2.1313794389367104, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.1976473730057478, "step": 13310 }, { "epoch": 0.333, "grad_norm": 28.625, "grad_norm_var": 5.35390625, "learning_rate": 0.0001, "loss": 7.3878, "loss/crossentropy": 2.0418414741754534, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.19092906154692174, "step": 13320 }, { "epoch": 0.33325, "grad_norm": 30.5, "grad_norm_var": 1.6229166666666666, "learning_rate": 0.0001, "loss": 7.4116, "loss/crossentropy": 2.1283623784780503, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.1854261551052332, "step": 13330 }, { "epoch": 0.3335, "grad_norm": 30.375, "grad_norm_var": 6.0994140625, "learning_rate": 0.0001, "loss": 7.4184, "loss/crossentropy": 2.084540565311909, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.18820476066321135, "step": 13340 }, { "epoch": 0.33375, "grad_norm": 32.75, "grad_norm_var": 4.408072916666667, "learning_rate": 0.0001, "loss": 7.4188, "loss/crossentropy": 2.133148857951164, "loss/hidden": 3.27734375, "loss/jsd": 0.0, "loss/logits": 0.17596916407346724, "step": 13350 }, { "epoch": 0.334, "grad_norm": 32.5, "grad_norm_var": 3.331184895833333, "learning_rate": 0.0001, "loss": 7.4688, "loss/crossentropy": 2.070942518115044, "loss/hidden": 3.3, "loss/jsd": 0.0, "loss/logits": 0.17796068042516708, "step": 13360 }, { "epoch": 0.33425, "grad_norm": 32.25, "grad_norm_var": 2.38515625, "learning_rate": 0.0001, "loss": 7.5483, "loss/crossentropy": 2.1084318548440932, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.19390486590564252, "step": 13370 }, { "epoch": 0.3345, "grad_norm": 29.625, "grad_norm_var": 2.348372395833333, "learning_rate": 0.0001, "loss": 7.4769, "loss/crossentropy": 2.0068695291876795, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.18838447090238333, "step": 13380 }, { "epoch": 0.33475, "grad_norm": 33.5, "grad_norm_var": 2.5145833333333334, "learning_rate": 0.0001, "loss": 7.4633, "loss/crossentropy": 2.118716311454773, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.20167391393333672, "step": 13390 }, { "epoch": 0.335, "grad_norm": 30.75, "grad_norm_var": 2.232291666666667, "learning_rate": 0.0001, "loss": 7.4333, "loss/crossentropy": 2.1667033195495606, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.19795446433126926, "step": 13400 }, { "epoch": 0.33525, "grad_norm": 28.375, "grad_norm_var": 3.226822916666667, "learning_rate": 0.0001, "loss": 7.489, "loss/crossentropy": 2.1369444996118547, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.19194683507084848, "step": 13410 }, { "epoch": 0.3355, "grad_norm": 31.875, "grad_norm_var": 3.664322916666667, "learning_rate": 0.0001, "loss": 7.514, "loss/crossentropy": 2.1737091183662414, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.18577092699706554, "step": 13420 }, { "epoch": 0.33575, "grad_norm": 30.625, "grad_norm_var": 2.653059895833333, "learning_rate": 0.0001, "loss": 7.5082, "loss/crossentropy": 2.1547589361667634, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.1893642207607627, "step": 13430 }, { "epoch": 0.336, "grad_norm": 30.25, "grad_norm_var": 3.721809895833333, "learning_rate": 0.0001, "loss": 7.4585, "loss/crossentropy": 2.228061503171921, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.19817217160016298, "step": 13440 }, { "epoch": 0.33625, "grad_norm": 32.0, "grad_norm_var": 4.395768229166666, "learning_rate": 0.0001, "loss": 7.4325, "loss/crossentropy": 2.0584416151046754, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18820574525743722, "step": 13450 }, { "epoch": 0.3365, "grad_norm": 32.25, "grad_norm_var": 4.1978515625, "learning_rate": 0.0001, "loss": 7.5845, "loss/crossentropy": 2.1853963419795037, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.20326036512851714, "step": 13460 }, { "epoch": 0.33675, "grad_norm": 31.625, "grad_norm_var": 3.5145833333333334, "learning_rate": 0.0001, "loss": 7.4957, "loss/crossentropy": 2.058859869837761, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.18476009368896484, "step": 13470 }, { "epoch": 0.337, "grad_norm": 29.75, "grad_norm_var": 2.189322916666667, "learning_rate": 0.0001, "loss": 7.3814, "loss/crossentropy": 2.09515139721334, "loss/hidden": 3.30859375, "loss/jsd": 0.0, "loss/logits": 0.18534991908818482, "step": 13480 }, { "epoch": 0.33725, "grad_norm": 28.25, "grad_norm_var": 2.5666015625, "learning_rate": 0.0001, "loss": 7.4269, "loss/crossentropy": 2.1185129195451737, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.19515257980674505, "step": 13490 }, { "epoch": 0.3375, "grad_norm": 31.125, "grad_norm_var": 2.351822916666667, "learning_rate": 0.0001, "loss": 7.5122, "loss/crossentropy": 2.138470843434334, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.19903527870774268, "step": 13500 }, { "epoch": 0.33775, "grad_norm": 30.25, "grad_norm_var": 1.41875, "learning_rate": 0.0001, "loss": 7.6212, "loss/crossentropy": 2.1452796325087546, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.1878256807103753, "step": 13510 }, { "epoch": 0.338, "grad_norm": 30.5, "grad_norm_var": 1.1223307291666667, "learning_rate": 0.0001, "loss": 7.5111, "loss/crossentropy": 2.1782995223999024, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.19461661092936994, "step": 13520 }, { "epoch": 0.33825, "grad_norm": 29.0, "grad_norm_var": 1.5809895833333334, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.184438717365265, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.2046054745092988, "step": 13530 }, { "epoch": 0.3385, "grad_norm": 31.125, "grad_norm_var": 2.736458333333333, "learning_rate": 0.0001, "loss": 7.4071, "loss/crossentropy": 2.082030822336674, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.1870069233700633, "step": 13540 }, { "epoch": 0.33875, "grad_norm": 31.25, "grad_norm_var": 2.645247395833333, "learning_rate": 0.0001, "loss": 7.4356, "loss/crossentropy": 2.0698161378502844, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.18587249666452407, "step": 13550 }, { "epoch": 0.339, "grad_norm": 31.625, "grad_norm_var": 2.824934895833333, "learning_rate": 0.0001, "loss": 7.5059, "loss/crossentropy": 2.1267208635807036, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.20201743990182877, "step": 13560 }, { "epoch": 0.33925, "grad_norm": 31.0, "grad_norm_var": 4.14765625, "learning_rate": 0.0001, "loss": 7.5921, "loss/crossentropy": 2.1734737694263457, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.21583349518477918, "step": 13570 }, { "epoch": 0.3395, "grad_norm": 32.25, "grad_norm_var": 3.4989583333333334, "learning_rate": 0.0001, "loss": 7.4379, "loss/crossentropy": 2.214940372109413, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.21127836983650922, "step": 13580 }, { "epoch": 0.33975, "grad_norm": 30.375, "grad_norm_var": 2.468489583333333, "learning_rate": 0.0001, "loss": 7.5302, "loss/crossentropy": 2.086373192071915, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.1951220341026783, "step": 13590 }, { "epoch": 0.34, "grad_norm": 30.625, "grad_norm_var": 4.908333333333333, "learning_rate": 0.0001, "loss": 7.4568, "loss/crossentropy": 2.1543196618556975, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.20275593940168618, "step": 13600 }, { "epoch": 0.34025, "grad_norm": 32.25, "grad_norm_var": 2.83515625, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.0696166858077047, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.18954422865062953, "step": 13610 }, { "epoch": 0.3405, "grad_norm": 30.5, "grad_norm_var": 2.9145182291666667, "learning_rate": 0.0001, "loss": 7.5892, "loss/crossentropy": 2.1566884085536002, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.22052888944745064, "step": 13620 }, { "epoch": 0.34075, "grad_norm": 32.5, "grad_norm_var": 4.57890625, "learning_rate": 0.0001, "loss": 7.5791, "loss/crossentropy": 2.1455826848745345, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.2092026699334383, "step": 13630 }, { "epoch": 0.341, "grad_norm": 31.0, "grad_norm_var": 2.464583333333333, "learning_rate": 0.0001, "loss": 7.5429, "loss/crossentropy": 2.208886668086052, "loss/hidden": 3.29609375, "loss/jsd": 0.0, "loss/logits": 0.18887277618050574, "step": 13640 }, { "epoch": 0.34125, "grad_norm": 32.0, "grad_norm_var": 2.5747395833333333, "learning_rate": 0.0001, "loss": 7.4508, "loss/crossentropy": 2.013479822874069, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.18337813336402178, "step": 13650 }, { "epoch": 0.3415, "grad_norm": 32.75, "grad_norm_var": 2.2509765625, "learning_rate": 0.0001, "loss": 7.5422, "loss/crossentropy": 2.143598672747612, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.2065559087321162, "step": 13660 }, { "epoch": 0.34175, "grad_norm": 31.125, "grad_norm_var": 1.7927083333333333, "learning_rate": 0.0001, "loss": 7.5533, "loss/crossentropy": 2.2552105635404587, "loss/hidden": 3.4890625, "loss/jsd": 0.0, "loss/logits": 0.19607291985303163, "step": 13670 }, { "epoch": 0.342, "grad_norm": 31.125, "grad_norm_var": 1.9983723958333333, "learning_rate": 0.0001, "loss": 7.4126, "loss/crossentropy": 2.1220908224582673, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.19602216491475702, "step": 13680 }, { "epoch": 0.34225, "grad_norm": 29.125, "grad_norm_var": 2.3827473958333334, "learning_rate": 0.0001, "loss": 7.4261, "loss/crossentropy": 2.045897740870714, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.18407570132985712, "step": 13690 }, { "epoch": 0.3425, "grad_norm": 31.75, "grad_norm_var": 2.2030598958333334, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 2.1739032745361326, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.1999795723706484, "step": 13700 }, { "epoch": 0.34275, "grad_norm": 31.0, "grad_norm_var": 1.0410807291666666, "learning_rate": 0.0001, "loss": 7.4881, "loss/crossentropy": 2.120612841844559, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18889743015170096, "step": 13710 }, { "epoch": 0.343, "grad_norm": 30.875, "grad_norm_var": 1.365625, "learning_rate": 0.0001, "loss": 7.5733, "loss/crossentropy": 2.222416913509369, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.20086848884820938, "step": 13720 }, { "epoch": 0.34325, "grad_norm": 28.25, "grad_norm_var": 2.4614583333333333, "learning_rate": 0.0001, "loss": 7.5037, "loss/crossentropy": 2.1907012492418287, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.190412163361907, "step": 13730 }, { "epoch": 0.3435, "grad_norm": 30.5, "grad_norm_var": 5.7978515625, "learning_rate": 0.0001, "loss": 7.5508, "loss/crossentropy": 2.1012231409549713, "loss/hidden": 3.46328125, "loss/jsd": 0.0, "loss/logits": 0.20470567271113396, "step": 13740 }, { "epoch": 0.34375, "grad_norm": 29.5, "grad_norm_var": 14.058072916666667, "learning_rate": 0.0001, "loss": 7.4824, "loss/crossentropy": 2.1705787777900696, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.20006757453083993, "step": 13750 }, { "epoch": 0.344, "grad_norm": 30.25, "grad_norm_var": 4.3279592088337556e+18, "learning_rate": 0.0001, "loss": 7.3957, "loss/crossentropy": 2.0967624366283415, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.17579186484217643, "step": 13760 }, { "epoch": 0.34425, "grad_norm": 30.25, "grad_norm_var": 4.3279592092845036e+18, "learning_rate": 0.0001, "loss": 7.5046, "loss/crossentropy": 2.0625901982188224, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.18775402326136828, "step": 13770 }, { "epoch": 0.3445, "grad_norm": 30.875, "grad_norm_var": 3.1556640625, "learning_rate": 0.0001, "loss": 7.4818, "loss/crossentropy": 2.18047796189785, "loss/hidden": 3.250390625, "loss/jsd": 0.0, "loss/logits": 0.18007917553186417, "step": 13780 }, { "epoch": 0.34475, "grad_norm": 31.875, "grad_norm_var": 9.3416015625, "learning_rate": 0.0001, "loss": 7.5204, "loss/crossentropy": 2.1267434477806093, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.20445957724004984, "step": 13790 }, { "epoch": 0.345, "grad_norm": 31.0, "grad_norm_var": 24.017643229166666, "learning_rate": 0.0001, "loss": 7.3979, "loss/crossentropy": 2.1567360132932665, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.19118222668766977, "step": 13800 }, { "epoch": 0.34525, "grad_norm": 30.5, "grad_norm_var": 3.7625, "learning_rate": 0.0001, "loss": 7.3801, "loss/crossentropy": 2.1303461611270906, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.19525523390620947, "step": 13810 }, { "epoch": 0.3455, "grad_norm": 32.0, "grad_norm_var": 4.1275390625, "learning_rate": 0.0001, "loss": 7.5682, "loss/crossentropy": 2.1890379935503006, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.20919501278549432, "step": 13820 }, { "epoch": 0.34575, "grad_norm": 34.25, "grad_norm_var": 4.558268229166667, "learning_rate": 0.0001, "loss": 7.3675, "loss/crossentropy": 2.0688234619796275, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.2000229710713029, "step": 13830 }, { "epoch": 0.346, "grad_norm": 30.625, "grad_norm_var": 6.05390625, "learning_rate": 0.0001, "loss": 7.4978, "loss/crossentropy": 2.0757779657840727, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.177880766056478, "step": 13840 }, { "epoch": 0.34625, "grad_norm": 33.75, "grad_norm_var": 6.74765625, "learning_rate": 0.0001, "loss": 7.4687, "loss/crossentropy": 2.2370450526475905, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.18801292553544044, "step": 13850 }, { "epoch": 0.3465, "grad_norm": 31.625, "grad_norm_var": 3.43515625, "learning_rate": 0.0001, "loss": 7.5391, "loss/crossentropy": 2.2622280076146124, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.1970792453736067, "step": 13860 }, { "epoch": 0.34675, "grad_norm": 31.875, "grad_norm_var": 2.715559895833333, "learning_rate": 0.0001, "loss": 7.4578, "loss/crossentropy": 2.1148511469364166, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.19618813470005989, "step": 13870 }, { "epoch": 0.347, "grad_norm": 34.0, "grad_norm_var": 2.9916015625, "learning_rate": 0.0001, "loss": 7.4502, "loss/crossentropy": 2.0950228199362755, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.19036842305213214, "step": 13880 }, { "epoch": 0.34725, "grad_norm": 31.625, "grad_norm_var": 3.252083333333333, "learning_rate": 0.0001, "loss": 7.5902, "loss/crossentropy": 2.127346533536911, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.20207451861351727, "step": 13890 }, { "epoch": 0.3475, "grad_norm": 29.375, "grad_norm_var": 96.4712890625, "learning_rate": 0.0001, "loss": 7.4647, "loss/crossentropy": 2.1354104042053224, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.19764424189925195, "step": 13900 }, { "epoch": 0.34775, "grad_norm": 32.25, "grad_norm_var": 49.921875, "learning_rate": 0.0001, "loss": 7.4506, "loss/crossentropy": 2.0013342767953874, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.18574044182896615, "step": 13910 }, { "epoch": 0.348, "grad_norm": 31.75, "grad_norm_var": 34.174739583333334, "learning_rate": 0.0001, "loss": 7.395, "loss/crossentropy": 2.087017002701759, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.1856970646418631, "step": 13920 }, { "epoch": 0.34825, "grad_norm": 31.5, "grad_norm_var": 33.81041666666667, "learning_rate": 0.0001, "loss": 7.4898, "loss/crossentropy": 2.2239955455064773, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.19794288650155067, "step": 13930 }, { "epoch": 0.3485, "grad_norm": 31.125, "grad_norm_var": 9.250455729166667, "learning_rate": 0.0001, "loss": 7.5032, "loss/crossentropy": 2.1092188715934754, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.187043634057045, "step": 13940 }, { "epoch": 0.34875, "grad_norm": 31.0, "grad_norm_var": 5.587239583333333, "learning_rate": 0.0001, "loss": 7.5959, "loss/crossentropy": 2.134988930821419, "loss/hidden": 3.579296875, "loss/jsd": 0.0, "loss/logits": 0.24077722635120152, "step": 13950 }, { "epoch": 0.349, "grad_norm": 32.0, "grad_norm_var": 3.0708333333333333, "learning_rate": 0.0001, "loss": 7.5289, "loss/crossentropy": 2.1751519560813906, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.20373264253139495, "step": 13960 }, { "epoch": 0.34925, "grad_norm": 29.375, "grad_norm_var": 2.91640625, "learning_rate": 0.0001, "loss": 7.3548, "loss/crossentropy": 2.0371064260601996, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.17776737520471214, "step": 13970 }, { "epoch": 0.3495, "grad_norm": 31.5, "grad_norm_var": 1.8893229166666667, "learning_rate": 0.0001, "loss": 7.3986, "loss/crossentropy": 2.0750357516109945, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.18888103473000228, "step": 13980 }, { "epoch": 0.34975, "grad_norm": 51.25, "grad_norm_var": 3.1925595247275407e+18, "learning_rate": 0.0001, "loss": 7.6214, "loss/crossentropy": 2.1400484412908556, "loss/hidden": 3.57578125, "loss/jsd": 0.0, "loss/logits": 0.22168501131236554, "step": 13990 }, { "epoch": 0.35, "grad_norm": 29.875, "grad_norm_var": 34.92916666666667, "learning_rate": 0.0001, "loss": 7.3981, "loss/crossentropy": 2.1600725933909417, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.18070472478866578, "step": 14000 }, { "epoch": 0.35025, "grad_norm": 28.5, "grad_norm_var": 79.35625, "learning_rate": 0.0001, "loss": 7.375, "loss/crossentropy": 2.127374938130379, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.1964468475431204, "step": 14010 }, { "epoch": 0.3505, "grad_norm": 31.5, "grad_norm_var": 11.227018229166667, "learning_rate": 0.0001, "loss": 7.4185, "loss/crossentropy": 2.064249020814896, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.1939970314502716, "step": 14020 }, { "epoch": 0.35075, "grad_norm": 28.125, "grad_norm_var": 10.7962890625, "learning_rate": 0.0001, "loss": 7.3458, "loss/crossentropy": 2.009464919567108, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.16926231551915408, "step": 14030 }, { "epoch": 0.351, "grad_norm": 33.0, "grad_norm_var": 10.735416666666667, "learning_rate": 0.0001, "loss": 7.4381, "loss/crossentropy": 2.114129716157913, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.18966517839580774, "step": 14040 }, { "epoch": 0.35125, "grad_norm": 38.25, "grad_norm_var": 11.432747395833333, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.025939238071442, "loss/hidden": 3.561328125, "loss/jsd": 0.0, "loss/logits": 0.21536036860197783, "step": 14050 }, { "epoch": 0.3515, "grad_norm": 29.0, "grad_norm_var": 10.072330729166667, "learning_rate": 0.0001, "loss": 7.4484, "loss/crossentropy": 2.067097157239914, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.19576688967645167, "step": 14060 }, { "epoch": 0.35175, "grad_norm": 30.0, "grad_norm_var": 8.662434895833334, "learning_rate": 0.0001, "loss": 7.4599, "loss/crossentropy": 2.1595573753118513, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.18690041936933993, "step": 14070 }, { "epoch": 0.352, "grad_norm": 29.875, "grad_norm_var": 33.607747395833336, "learning_rate": 0.0001, "loss": 7.3965, "loss/crossentropy": 2.2124534368515016, "loss/hidden": 3.319921875, "loss/jsd": 0.0, "loss/logits": 0.18615365140140055, "step": 14080 }, { "epoch": 0.35225, "grad_norm": 28.5, "grad_norm_var": 34.65514322916667, "learning_rate": 0.0001, "loss": 7.4262, "loss/crossentropy": 2.1372614175081255, "loss/hidden": 3.320703125, "loss/jsd": 0.0, "loss/logits": 0.18902692012488842, "step": 14090 }, { "epoch": 0.3525, "grad_norm": 30.75, "grad_norm_var": 4.59140625, "learning_rate": 0.0001, "loss": 7.5356, "loss/crossentropy": 2.1618730440735816, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.19613058008253575, "step": 14100 }, { "epoch": 0.35275, "grad_norm": 29.125, "grad_norm_var": 5.245833333333334, "learning_rate": 0.0001, "loss": 7.4814, "loss/crossentropy": 2.159251680970192, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.22244902290403842, "step": 14110 }, { "epoch": 0.353, "grad_norm": 30.75, "grad_norm_var": 5.6, "learning_rate": 0.0001, "loss": 7.3935, "loss/crossentropy": 2.1684078365564345, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.18789836578071117, "step": 14120 }, { "epoch": 0.35325, "grad_norm": 28.875, "grad_norm_var": 5.843489583333334, "learning_rate": 0.0001, "loss": 7.419, "loss/crossentropy": 2.1756194710731505, "loss/hidden": 3.27265625, "loss/jsd": 0.0, "loss/logits": 0.1782940372824669, "step": 14130 }, { "epoch": 0.3535, "grad_norm": 33.75, "grad_norm_var": 4.78125, "learning_rate": 0.0001, "loss": 7.5005, "loss/crossentropy": 2.1395913138985634, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.19922848492860795, "step": 14140 }, { "epoch": 0.35375, "grad_norm": 32.5, "grad_norm_var": 2.9184895833333333, "learning_rate": 0.0001, "loss": 7.3926, "loss/crossentropy": 2.1444697260856627, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.18964640200138091, "step": 14150 }, { "epoch": 0.354, "grad_norm": 29.625, "grad_norm_var": 5.342708333333333, "learning_rate": 0.0001, "loss": 7.4802, "loss/crossentropy": 2.085355066135526, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.20059554176405073, "step": 14160 }, { "epoch": 0.35425, "grad_norm": 30.375, "grad_norm_var": 17.780989583333334, "learning_rate": 0.0001, "loss": 7.3967, "loss/crossentropy": 2.1756529211997986, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.1973805770277977, "step": 14170 }, { "epoch": 0.3545, "grad_norm": 35.0, "grad_norm_var": 5.43125, "learning_rate": 0.0001, "loss": 7.5431, "loss/crossentropy": 2.197963294386864, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.20036561377346515, "step": 14180 }, { "epoch": 0.35475, "grad_norm": 29.375, "grad_norm_var": 4.4775390625, "learning_rate": 0.0001, "loss": 7.4884, "loss/crossentropy": 2.1392128080129624, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.19742392897605895, "step": 14190 }, { "epoch": 0.355, "grad_norm": 31.75, "grad_norm_var": 2.24765625, "learning_rate": 0.0001, "loss": 7.4598, "loss/crossentropy": 2.064734974503517, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.21485688779503107, "step": 14200 }, { "epoch": 0.35525, "grad_norm": 28.75, "grad_norm_var": 3.8520182291666667, "learning_rate": 0.0001, "loss": 7.3441, "loss/crossentropy": 2.0820038333535194, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.1940332455560565, "step": 14210 }, { "epoch": 0.3555, "grad_norm": 29.0, "grad_norm_var": 1.7738932291666667, "learning_rate": 0.0001, "loss": 7.5072, "loss/crossentropy": 2.1766203552484513, "loss/hidden": 3.49296875, "loss/jsd": 0.0, "loss/logits": 0.23402530644088984, "step": 14220 }, { "epoch": 0.35575, "grad_norm": 29.0, "grad_norm_var": 2.2108723958333334, "learning_rate": 0.0001, "loss": 7.4591, "loss/crossentropy": 2.1537226110696794, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.20421480108052492, "step": 14230 }, { "epoch": 0.356, "grad_norm": 31.0, "grad_norm_var": 2.18515625, "learning_rate": 0.0001, "loss": 7.4918, "loss/crossentropy": 2.14600280970335, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.18569285795092583, "step": 14240 }, { "epoch": 0.35625, "grad_norm": 32.25, "grad_norm_var": 1.1848307291666667, "learning_rate": 0.0001, "loss": 7.3701, "loss/crossentropy": 1.989667421579361, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.17830973230302333, "step": 14250 }, { "epoch": 0.3565, "grad_norm": 31.125, "grad_norm_var": 3.1634765625, "learning_rate": 0.0001, "loss": 7.4071, "loss/crossentropy": 1.9839058618992567, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.19238084005191922, "step": 14260 }, { "epoch": 0.35675, "grad_norm": 32.0, "grad_norm_var": 1.7497395833333333, "learning_rate": 0.0001, "loss": 7.5671, "loss/crossentropy": 2.3518798291683196, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.22251383662223817, "step": 14270 }, { "epoch": 0.357, "grad_norm": 28.625, "grad_norm_var": 2.5697916666666667, "learning_rate": 0.0001, "loss": 7.4098, "loss/crossentropy": 2.0802294224500657, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.18012972828000784, "step": 14280 }, { "epoch": 0.35725, "grad_norm": 30.75, "grad_norm_var": 3.298958333333333, "learning_rate": 0.0001, "loss": 7.5495, "loss/crossentropy": 2.1887664824724196, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.21481433548033238, "step": 14290 }, { "epoch": 0.3575, "grad_norm": 28.625, "grad_norm_var": 3.442122395833333, "learning_rate": 0.0001, "loss": 7.4145, "loss/crossentropy": 2.164269728958607, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.18548591639846562, "step": 14300 }, { "epoch": 0.35775, "grad_norm": 30.0, "grad_norm_var": 3.4488932291666665, "learning_rate": 0.0001, "loss": 7.4602, "loss/crossentropy": 2.295723894238472, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.19964433908462526, "step": 14310 }, { "epoch": 0.358, "grad_norm": 28.625, "grad_norm_var": 1.9473307291666666, "learning_rate": 0.0001, "loss": 7.3661, "loss/crossentropy": 2.166135938465595, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.2061729446053505, "step": 14320 }, { "epoch": 0.35825, "grad_norm": 30.625, "grad_norm_var": 1.8650390625, "learning_rate": 0.0001, "loss": 7.4077, "loss/crossentropy": 2.2319384172558783, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.20099803544580935, "step": 14330 }, { "epoch": 0.3585, "grad_norm": 30.25, "grad_norm_var": 3.536393229166667, "learning_rate": 0.0001, "loss": 7.5508, "loss/crossentropy": 2.1728119254112244, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.20843122638761996, "step": 14340 }, { "epoch": 0.35875, "grad_norm": 30.5, "grad_norm_var": 2.5192057291666665, "learning_rate": 0.0001, "loss": 7.5253, "loss/crossentropy": 2.2258054494857786, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.2185509093105793, "step": 14350 }, { "epoch": 0.359, "grad_norm": 31.375, "grad_norm_var": 1.6666015625, "learning_rate": 0.0001, "loss": 7.4171, "loss/crossentropy": 2.195216101408005, "loss/hidden": 3.258984375, "loss/jsd": 0.0, "loss/logits": 0.18341686353087425, "step": 14360 }, { "epoch": 0.35925, "grad_norm": 30.25, "grad_norm_var": 7.613541666666666, "learning_rate": 0.0001, "loss": 7.4032, "loss/crossentropy": 2.1008095175027846, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.18845746479928493, "step": 14370 }, { "epoch": 0.3595, "grad_norm": 31.25, "grad_norm_var": 8.676041666666666, "learning_rate": 0.0001, "loss": 7.4559, "loss/crossentropy": 2.0987029120326044, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20877646040171385, "step": 14380 }, { "epoch": 0.35975, "grad_norm": 30.75, "grad_norm_var": 1.84765625, "learning_rate": 0.0001, "loss": 7.4529, "loss/crossentropy": 2.0258067578077315, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.1862058535218239, "step": 14390 }, { "epoch": 0.36, "grad_norm": 28.0, "grad_norm_var": 12.8400390625, "learning_rate": 0.0001, "loss": 7.4557, "loss/crossentropy": 2.126949782669544, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.1885037848725915, "step": 14400 }, { "epoch": 0.36025, "grad_norm": 40.25, "grad_norm_var": 9.422330729166667, "learning_rate": 0.0001, "loss": 7.455, "loss/crossentropy": 2.1198273450136185, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.1814750697463751, "step": 14410 }, { "epoch": 0.3605, "grad_norm": 32.5, "grad_norm_var": 439.35390625, "learning_rate": 0.0001, "loss": 7.4941, "loss/crossentropy": 2.2227646946907043, "loss/hidden": 3.30703125, "loss/jsd": 0.0, "loss/logits": 0.1837354227900505, "step": 14420 }, { "epoch": 0.36075, "grad_norm": 29.75, "grad_norm_var": 2.0145833333333334, "learning_rate": 0.0001, "loss": 7.3752, "loss/crossentropy": 2.241603446006775, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.18308813124895096, "step": 14430 }, { "epoch": 0.361, "grad_norm": 29.875, "grad_norm_var": 16.7587890625, "learning_rate": 0.0001, "loss": 7.509, "loss/crossentropy": 2.207683742046356, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.20882598627358676, "step": 14440 }, { "epoch": 0.36125, "grad_norm": 30.0, "grad_norm_var": 31.902083333333334, "learning_rate": 0.0001, "loss": 7.4703, "loss/crossentropy": 2.086098091304302, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.18962491042912005, "step": 14450 }, { "epoch": 0.3615, "grad_norm": 30.75, "grad_norm_var": 540.1921223958333, "learning_rate": 0.0001, "loss": 7.4866, "loss/crossentropy": 2.1778491258621218, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.21017426177859305, "step": 14460 }, { "epoch": 0.36175, "grad_norm": 38.75, "grad_norm_var": 1848.2552083333333, "learning_rate": 0.0001, "loss": 7.8459, "loss/crossentropy": 2.2158830940723417, "loss/hidden": 3.571875, "loss/jsd": 0.0, "loss/logits": 0.20809865184128284, "step": 14470 }, { "epoch": 0.362, "grad_norm": 27.0, "grad_norm_var": 147.39108072916667, "learning_rate": 0.0001, "loss": 7.4835, "loss/crossentropy": 2.1877679374068975, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.1906817673705518, "step": 14480 }, { "epoch": 0.36225, "grad_norm": 29.125, "grad_norm_var": 2.633072916666667, "learning_rate": 0.0001, "loss": 7.5583, "loss/crossentropy": 2.17199744284153, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19461486004292966, "step": 14490 }, { "epoch": 0.3625, "grad_norm": 33.0, "grad_norm_var": 18.4119140625, "learning_rate": 0.0001, "loss": 7.4614, "loss/crossentropy": 2.115432745218277, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.1788713935762644, "step": 14500 }, { "epoch": 0.36275, "grad_norm": 30.5, "grad_norm_var": 18.049739583333334, "learning_rate": 0.0001, "loss": 7.5055, "loss/crossentropy": 2.1718098491430284, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.18509582802653313, "step": 14510 }, { "epoch": 0.363, "grad_norm": 29.875, "grad_norm_var": 2.419205729166667, "learning_rate": 0.0001, "loss": 7.4428, "loss/crossentropy": 2.1485825031995773, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.1956698888912797, "step": 14520 }, { "epoch": 0.36325, "grad_norm": 28.0, "grad_norm_var": 3.57890625, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.121652737259865, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.17529746759682893, "step": 14530 }, { "epoch": 0.3635, "grad_norm": 29.375, "grad_norm_var": 3.726822916666667, "learning_rate": 0.0001, "loss": 7.4459, "loss/crossentropy": 2.154259353876114, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.19357640612870455, "step": 14540 }, { "epoch": 0.36375, "grad_norm": 30.75, "grad_norm_var": 2.4488932291666665, "learning_rate": 0.0001, "loss": 7.5765, "loss/crossentropy": 2.2079395562410356, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.19934294298291205, "step": 14550 }, { "epoch": 0.364, "grad_norm": 33.5, "grad_norm_var": 3.1207682291666665, "learning_rate": 0.0001, "loss": 7.5529, "loss/crossentropy": 2.1579170167446136, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.18290599882602693, "step": 14560 }, { "epoch": 0.36425, "grad_norm": 31.0, "grad_norm_var": 3.414322916666667, "learning_rate": 0.0001, "loss": 7.5717, "loss/crossentropy": 2.0264066070318223, "loss/hidden": 3.466796875, "loss/jsd": 0.0, "loss/logits": 0.2068371519446373, "step": 14570 }, { "epoch": 0.3645, "grad_norm": 33.5, "grad_norm_var": 2.905143229166667, "learning_rate": 0.0001, "loss": 7.4105, "loss/crossentropy": 2.0989100337028503, "loss/hidden": 3.289453125, "loss/jsd": 0.0, "loss/logits": 0.18423682264983654, "step": 14580 }, { "epoch": 0.36475, "grad_norm": 31.625, "grad_norm_var": 2.785872395833333, "learning_rate": 0.0001, "loss": 7.4701, "loss/crossentropy": 2.172179477661848, "loss/hidden": 3.33046875, "loss/jsd": 0.0, "loss/logits": 0.19203788777813316, "step": 14590 }, { "epoch": 0.365, "grad_norm": 31.375, "grad_norm_var": 3.4895833333333335, "learning_rate": 0.0001, "loss": 7.5094, "loss/crossentropy": 2.132567846775055, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.192837636731565, "step": 14600 }, { "epoch": 0.36525, "grad_norm": 31.25, "grad_norm_var": 3.6837890625, "learning_rate": 0.0001, "loss": 7.4883, "loss/crossentropy": 2.184123566746712, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.19760762453079223, "step": 14610 }, { "epoch": 0.3655, "grad_norm": 32.75, "grad_norm_var": 1.4098307291666667, "learning_rate": 0.0001, "loss": 7.5364, "loss/crossentropy": 2.160658273100853, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.19369914215058087, "step": 14620 }, { "epoch": 0.36575, "grad_norm": 29.0, "grad_norm_var": 2.3684895833333335, "learning_rate": 0.0001, "loss": 7.5713, "loss/crossentropy": 2.121895319223404, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.20311131626367568, "step": 14630 }, { "epoch": 0.366, "grad_norm": 31.375, "grad_norm_var": 1.7067057291666667, "learning_rate": 0.0001, "loss": 7.5138, "loss/crossentropy": 2.2521799474954607, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.21882211808115243, "step": 14640 }, { "epoch": 0.36625, "grad_norm": 33.25, "grad_norm_var": 7.970572916666667, "learning_rate": 0.0001, "loss": 7.4956, "loss/crossentropy": 2.0071460634469984, "loss/hidden": 3.548046875, "loss/jsd": 0.0, "loss/logits": 0.1984138570725918, "step": 14650 }, { "epoch": 0.3665, "grad_norm": 34.5, "grad_norm_var": 3.4879557291666665, "learning_rate": 0.0001, "loss": 7.659, "loss/crossentropy": 2.201695331931114, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.21316100917756559, "step": 14660 }, { "epoch": 0.36675, "grad_norm": 29.5, "grad_norm_var": 3.5509765625, "learning_rate": 0.0001, "loss": 7.5, "loss/crossentropy": 2.1117188185453415, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.20876432172954082, "step": 14670 }, { "epoch": 0.367, "grad_norm": 29.625, "grad_norm_var": 4.667708333333334, "learning_rate": 0.0001, "loss": 7.5143, "loss/crossentropy": 2.190372434258461, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.19909227322787046, "step": 14680 }, { "epoch": 0.36725, "grad_norm": 30.5, "grad_norm_var": 7.52265625, "learning_rate": 0.0001, "loss": 7.4599, "loss/crossentropy": 2.126342362165451, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.18715179907158017, "step": 14690 }, { "epoch": 0.3675, "grad_norm": 29.875, "grad_norm_var": 2.5072916666666667, "learning_rate": 0.0001, "loss": 7.4414, "loss/crossentropy": 2.2693688839673998, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.20018572360277176, "step": 14700 }, { "epoch": 0.36775, "grad_norm": 32.0, "grad_norm_var": 2.044791666666667, "learning_rate": 0.0001, "loss": 7.4719, "loss/crossentropy": 2.181891369819641, "loss/hidden": 3.312890625, "loss/jsd": 0.0, "loss/logits": 0.19805356562137605, "step": 14710 }, { "epoch": 0.368, "grad_norm": 31.625, "grad_norm_var": 1.7613932291666667, "learning_rate": 0.0001, "loss": 7.5656, "loss/crossentropy": 2.1095578551292418, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.22850083969533444, "step": 14720 }, { "epoch": 0.36825, "grad_norm": 30.25, "grad_norm_var": 2.870247395833333, "learning_rate": 0.0001, "loss": 7.3893, "loss/crossentropy": 1.992812941968441, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.18111684722825885, "step": 14730 }, { "epoch": 0.3685, "grad_norm": 31.125, "grad_norm_var": 2.957291666666667, "learning_rate": 0.0001, "loss": 7.465, "loss/crossentropy": 2.1468236982822417, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18966571036726237, "step": 14740 }, { "epoch": 0.36875, "grad_norm": 33.0, "grad_norm_var": 4.887434895833334, "learning_rate": 0.0001, "loss": 7.5622, "loss/crossentropy": 2.0730134949088095, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.1993141021579504, "step": 14750 }, { "epoch": 0.369, "grad_norm": 30.875, "grad_norm_var": 519.0749348958333, "learning_rate": 0.0001, "loss": 7.6123, "loss/crossentropy": 2.2167199581861494, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.20820484217256308, "step": 14760 }, { "epoch": 0.36925, "grad_norm": 32.25, "grad_norm_var": 512.0455729166666, "learning_rate": 0.0001, "loss": 7.4422, "loss/crossentropy": 2.1445954963564873, "loss/hidden": 3.280078125, "loss/jsd": 0.0, "loss/logits": 0.18091966006904842, "step": 14770 }, { "epoch": 0.3695, "grad_norm": 32.5, "grad_norm_var": 2.2997395833333334, "learning_rate": 0.0001, "loss": 7.5236, "loss/crossentropy": 2.1166233003139494, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19354630559682845, "step": 14780 }, { "epoch": 0.36975, "grad_norm": 33.0, "grad_norm_var": 3.0389973958333334, "learning_rate": 0.0001, "loss": 7.5297, "loss/crossentropy": 2.1253889203071594, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.18810804057866334, "step": 14790 }, { "epoch": 0.37, "grad_norm": 30.0, "grad_norm_var": 3.134375, "learning_rate": 0.0001, "loss": 7.494, "loss/crossentropy": 2.1236992135643957, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.21158513389527797, "step": 14800 }, { "epoch": 0.37025, "grad_norm": 30.5, "grad_norm_var": 2.37265625, "learning_rate": 0.0001, "loss": 7.6228, "loss/crossentropy": 2.1661381393671038, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.19546456951647997, "step": 14810 }, { "epoch": 0.3705, "grad_norm": 29.625, "grad_norm_var": 1.5059895833333334, "learning_rate": 0.0001, "loss": 7.5015, "loss/crossentropy": 2.1351567402482035, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.18865119516849518, "step": 14820 }, { "epoch": 0.37075, "grad_norm": 30.25, "grad_norm_var": 3.138541666666667, "learning_rate": 0.0001, "loss": 7.6212, "loss/crossentropy": 2.092208056151867, "loss/hidden": 3.49140625, "loss/jsd": 0.0, "loss/logits": 0.21708083804696798, "step": 14830 }, { "epoch": 0.371, "grad_norm": 34.0, "grad_norm_var": 4.2087890625, "learning_rate": 0.0001, "loss": 7.5208, "loss/crossentropy": 2.0589574337005616, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.19297052770853043, "step": 14840 }, { "epoch": 0.37125, "grad_norm": 29.375, "grad_norm_var": 15.303580729166667, "learning_rate": 0.0001, "loss": 7.5897, "loss/crossentropy": 2.116301205009222, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.19354041842743755, "step": 14850 }, { "epoch": 0.3715, "grad_norm": 30.125, "grad_norm_var": 2.428125, "learning_rate": 0.0001, "loss": 7.414, "loss/crossentropy": 2.136660024523735, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.19879130963236094, "step": 14860 }, { "epoch": 0.37175, "grad_norm": 30.25, "grad_norm_var": 3.9759765625, "learning_rate": 0.0001, "loss": 7.5417, "loss/crossentropy": 2.1285243958234785, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19073361102491618, "step": 14870 }, { "epoch": 0.372, "grad_norm": 32.25, "grad_norm_var": 1.7768229166666667, "learning_rate": 0.0001, "loss": 7.556, "loss/crossentropy": 2.164254853129387, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.20029650218784809, "step": 14880 }, { "epoch": 0.37225, "grad_norm": 28.0, "grad_norm_var": 3.5462890625, "learning_rate": 0.0001, "loss": 7.5072, "loss/crossentropy": 2.140005186200142, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.20111254062503575, "step": 14890 }, { "epoch": 0.3725, "grad_norm": 30.625, "grad_norm_var": 1.7018229166666667, "learning_rate": 0.0001, "loss": 7.4189, "loss/crossentropy": 2.122871424257755, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.20454877093434334, "step": 14900 }, { "epoch": 0.37275, "grad_norm": 29.875, "grad_norm_var": 9.73515625, "learning_rate": 0.0001, "loss": 7.5896, "loss/crossentropy": 2.0703314900398255, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.22251170128583908, "step": 14910 }, { "epoch": 0.373, "grad_norm": 32.25, "grad_norm_var": 9.148372395833333, "learning_rate": 0.0001, "loss": 7.4195, "loss/crossentropy": 1.9516542181372643, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.18584211356937885, "step": 14920 }, { "epoch": 0.37325, "grad_norm": 32.0, "grad_norm_var": 2.926497395833333, "learning_rate": 0.0001, "loss": 7.4303, "loss/crossentropy": 2.207534837722778, "loss/hidden": 3.295703125, "loss/jsd": 0.0, "loss/logits": 0.1845931550487876, "step": 14930 }, { "epoch": 0.3735, "grad_norm": 31.875, "grad_norm_var": 2.1087890625, "learning_rate": 0.0001, "loss": 7.4834, "loss/crossentropy": 2.2537055611610413, "loss/hidden": 3.315625, "loss/jsd": 0.0, "loss/logits": 0.1929902819916606, "step": 14940 }, { "epoch": 0.37375, "grad_norm": 28.625, "grad_norm_var": 2.155989583333333, "learning_rate": 0.0001, "loss": 7.5208, "loss/crossentropy": 2.0970467746257784, "loss/hidden": 3.53203125, "loss/jsd": 0.0, "loss/logits": 0.20525697618722916, "step": 14950 }, { "epoch": 0.374, "grad_norm": 32.0, "grad_norm_var": 1.9587890625, "learning_rate": 0.0001, "loss": 7.3624, "loss/crossentropy": 2.203538399934769, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.2022376826032996, "step": 14960 }, { "epoch": 0.37425, "grad_norm": 30.625, "grad_norm_var": 3.6488932291666667, "learning_rate": 0.0001, "loss": 7.5996, "loss/crossentropy": 2.242584636807442, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.19289392810314893, "step": 14970 }, { "epoch": 0.3745, "grad_norm": 30.0, "grad_norm_var": 4.21875, "learning_rate": 0.0001, "loss": 7.4014, "loss/crossentropy": 2.181095063686371, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19425397235900163, "step": 14980 }, { "epoch": 0.37475, "grad_norm": 32.75, "grad_norm_var": 1.5707682291666667, "learning_rate": 0.0001, "loss": 7.443, "loss/crossentropy": 2.247698572278023, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.19984396249055864, "step": 14990 }, { "epoch": 0.375, "grad_norm": 31.25, "grad_norm_var": 11.702083333333333, "learning_rate": 0.0001, "loss": 7.5215, "loss/crossentropy": 2.2001713097095488, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.19376100320369005, "step": 15000 }, { "epoch": 0.37525, "grad_norm": 31.625, "grad_norm_var": 5.531705729166666, "learning_rate": 0.0001, "loss": 7.4883, "loss/crossentropy": 2.219767712801695, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.21262812037020923, "step": 15010 }, { "epoch": 0.3755, "grad_norm": 30.25, "grad_norm_var": 14.839322916666667, "learning_rate": 0.0001, "loss": 7.506, "loss/crossentropy": 2.1578361958265306, "loss/hidden": 3.31640625, "loss/jsd": 0.0, "loss/logits": 0.1888531982898712, "step": 15020 }, { "epoch": 0.37575, "grad_norm": 34.25, "grad_norm_var": 3.611458333333333, "learning_rate": 0.0001, "loss": 7.4726, "loss/crossentropy": 2.246231307089329, "loss/hidden": 3.28515625, "loss/jsd": 0.0, "loss/logits": 0.20316961575299503, "step": 15030 }, { "epoch": 0.376, "grad_norm": 33.5, "grad_norm_var": 2.6830729166666667, "learning_rate": 0.0001, "loss": 7.4463, "loss/crossentropy": 2.2054271012544633, "loss/hidden": 3.255859375, "loss/jsd": 0.0, "loss/logits": 0.17821243554353713, "step": 15040 }, { "epoch": 0.37625, "grad_norm": 30.375, "grad_norm_var": 1.6552083333333334, "learning_rate": 0.0001, "loss": 7.4054, "loss/crossentropy": 2.198264943063259, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.1874470232054591, "step": 15050 }, { "epoch": 0.3765, "grad_norm": 34.0, "grad_norm_var": 4.098893229166666, "learning_rate": 0.0001, "loss": 7.5948, "loss/crossentropy": 2.125826768577099, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19506640397012234, "step": 15060 }, { "epoch": 0.37675, "grad_norm": 31.625, "grad_norm_var": 12.9009765625, "learning_rate": 0.0001, "loss": 7.403, "loss/crossentropy": 2.1264645174145698, "loss/hidden": 3.194921875, "loss/jsd": 0.0, "loss/logits": 0.1758699558675289, "step": 15070 }, { "epoch": 0.377, "grad_norm": 30.25, "grad_norm_var": 12.3072265625, "learning_rate": 0.0001, "loss": 7.4164, "loss/crossentropy": 2.0446790009737015, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.18656185436993838, "step": 15080 }, { "epoch": 0.37725, "grad_norm": 30.375, "grad_norm_var": 1.7770182291666667, "learning_rate": 0.0001, "loss": 7.5228, "loss/crossentropy": 2.2252503603696825, "loss/hidden": 3.354296875, "loss/jsd": 0.0, "loss/logits": 0.1904544487595558, "step": 15090 }, { "epoch": 0.3775, "grad_norm": 33.25, "grad_norm_var": 1.2988932291666666, "learning_rate": 0.0001, "loss": 7.5016, "loss/crossentropy": 2.09442842900753, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18910849764943122, "step": 15100 }, { "epoch": 0.37775, "grad_norm": 29.75, "grad_norm_var": 2.792708333333333, "learning_rate": 0.0001, "loss": 7.4727, "loss/crossentropy": 2.2667126506567, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.20034724567085505, "step": 15110 }, { "epoch": 0.378, "grad_norm": 31.25, "grad_norm_var": 2.4447265625, "learning_rate": 0.0001, "loss": 7.4735, "loss/crossentropy": 2.1522352784872054, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.19715874120593072, "step": 15120 }, { "epoch": 0.37825, "grad_norm": 30.125, "grad_norm_var": 2.1738932291666666, "learning_rate": 0.0001, "loss": 7.5987, "loss/crossentropy": 2.2373314648866653, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.20188613571226596, "step": 15130 }, { "epoch": 0.3785, "grad_norm": 31.625, "grad_norm_var": 3.076041666666667, "learning_rate": 0.0001, "loss": 7.4819, "loss/crossentropy": 2.0119318321347235, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.1882522501051426, "step": 15140 }, { "epoch": 0.37875, "grad_norm": 33.0, "grad_norm_var": 3.067643229166667, "learning_rate": 0.0001, "loss": 7.6136, "loss/crossentropy": 2.169758787751198, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.19593703038990498, "step": 15150 }, { "epoch": 0.379, "grad_norm": 32.0, "grad_norm_var": 11.815625, "learning_rate": 0.0001, "loss": 7.5821, "loss/crossentropy": 2.184238949418068, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.19631746597588062, "step": 15160 }, { "epoch": 0.37925, "grad_norm": 32.25, "grad_norm_var": 5.576822916666667, "learning_rate": 0.0001, "loss": 7.5062, "loss/crossentropy": 2.1800487637519836, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.1907597079873085, "step": 15170 }, { "epoch": 0.3795, "grad_norm": 32.75, "grad_norm_var": 7.9431640625, "learning_rate": 0.0001, "loss": 7.448, "loss/crossentropy": 2.036279045790434, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.1857758706435561, "step": 15180 }, { "epoch": 0.37975, "grad_norm": 31.25, "grad_norm_var": 8.451822916666666, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 2.133337301015854, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.19323884584009648, "step": 15190 }, { "epoch": 0.38, "grad_norm": 30.375, "grad_norm_var": 2.470572916666667, "learning_rate": 0.0001, "loss": 7.4197, "loss/crossentropy": 2.166276270151138, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.19621124118566513, "step": 15200 }, { "epoch": 0.38025, "grad_norm": 32.25, "grad_norm_var": 3.7270182291666667, "learning_rate": 0.0001, "loss": 7.5059, "loss/crossentropy": 2.2187248289585115, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.18203979916870594, "step": 15210 }, { "epoch": 0.3805, "grad_norm": 31.625, "grad_norm_var": 1.8229166666666667, "learning_rate": 0.0001, "loss": 7.4983, "loss/crossentropy": 2.1355438262224196, "loss/hidden": 3.312890625, "loss/jsd": 0.0, "loss/logits": 0.1705906637944281, "step": 15220 }, { "epoch": 0.38075, "grad_norm": 31.125, "grad_norm_var": 1.8587890625, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.20551737844944, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.19247971139848233, "step": 15230 }, { "epoch": 0.381, "grad_norm": 31.375, "grad_norm_var": 1.3666666666666667, "learning_rate": 0.0001, "loss": 7.3827, "loss/crossentropy": 2.0013756826519966, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.1825490690767765, "step": 15240 }, { "epoch": 0.38125, "grad_norm": 32.25, "grad_norm_var": 1.2139973958333334, "learning_rate": 0.0001, "loss": 7.4722, "loss/crossentropy": 2.040935108065605, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.185430452786386, "step": 15250 }, { "epoch": 0.3815, "grad_norm": 35.25, "grad_norm_var": 2.1119140625, "learning_rate": 0.0001, "loss": 7.4966, "loss/crossentropy": 2.115219935774803, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.19823081977665424, "step": 15260 }, { "epoch": 0.38175, "grad_norm": 31.125, "grad_norm_var": 3.3947265625, "learning_rate": 0.0001, "loss": 7.3673, "loss/crossentropy": 2.157605493068695, "loss/hidden": 3.278125, "loss/jsd": 0.0, "loss/logits": 0.17537485528737307, "step": 15270 }, { "epoch": 0.382, "grad_norm": 29.125, "grad_norm_var": 1.5833333333333333, "learning_rate": 0.0001, "loss": 7.3478, "loss/crossentropy": 2.1017936944961546, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.18104431293904782, "step": 15280 }, { "epoch": 0.38225, "grad_norm": 6207569920.0, "grad_norm_var": 5.670665197791722e+18, "learning_rate": 0.0001, "loss": 7.6066, "loss/crossentropy": 2.139920362830162, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.1915447361767292, "step": 15290 }, { "epoch": 0.3825, "grad_norm": 30.375, "grad_norm_var": 2.40837024446275e+18, "learning_rate": 0.0001, "loss": 7.3699, "loss/crossentropy": 2.1570042625069616, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.18620338048785925, "step": 15300 }, { "epoch": 0.38275, "grad_norm": 31.625, "grad_norm_var": 0.7934895833333333, "learning_rate": 0.0001, "loss": 7.3904, "loss/crossentropy": 2.0324711576104164, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18631240744143723, "step": 15310 }, { "epoch": 0.383, "grad_norm": 31.25, "grad_norm_var": 1.9098307291666667, "learning_rate": 0.0001, "loss": 7.4851, "loss/crossentropy": 2.094607950001955, "loss/hidden": 3.5671875, "loss/jsd": 0.0, "loss/logits": 0.19497194588184358, "step": 15320 }, { "epoch": 0.38325, "grad_norm": 29.75, "grad_norm_var": 3.1030598958333333, "learning_rate": 0.0001, "loss": 7.4493, "loss/crossentropy": 2.131368662416935, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19036126751452684, "step": 15330 }, { "epoch": 0.3835, "grad_norm": 29.625, "grad_norm_var": 2.9983723958333335, "learning_rate": 0.0001, "loss": 7.4902, "loss/crossentropy": 2.0336036890745164, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.20238237082958221, "step": 15340 }, { "epoch": 0.38375, "grad_norm": 33.25, "grad_norm_var": 1.334375, "learning_rate": 0.0001, "loss": 7.4585, "loss/crossentropy": 2.0801710695028306, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.19465604964643718, "step": 15350 }, { "epoch": 0.384, "grad_norm": 31.375, "grad_norm_var": 2.2041666666666666, "learning_rate": 0.0001, "loss": 7.4369, "loss/crossentropy": 2.10711809694767, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.19854038041085004, "step": 15360 }, { "epoch": 0.38425, "grad_norm": 33.75, "grad_norm_var": 2.4447265625, "learning_rate": 0.0001, "loss": 7.5833, "loss/crossentropy": 2.1751110404729843, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.1971418472006917, "step": 15370 }, { "epoch": 0.3845, "grad_norm": 33.5, "grad_norm_var": 2.231705729166667, "learning_rate": 0.0001, "loss": 7.5016, "loss/crossentropy": 2.1026677012443544, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.1858460023999214, "step": 15380 }, { "epoch": 0.38475, "grad_norm": 30.625, "grad_norm_var": 3.7135416666666665, "learning_rate": 0.0001, "loss": 7.483, "loss/crossentropy": 2.140875709801912, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.19617951191030442, "step": 15390 }, { "epoch": 0.385, "grad_norm": 28.75, "grad_norm_var": 4.5337890625, "learning_rate": 0.0001, "loss": 7.4622, "loss/crossentropy": 2.0783541426062584, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.20269902739673853, "step": 15400 }, { "epoch": 0.38525, "grad_norm": 29.875, "grad_norm_var": 3.095572916666667, "learning_rate": 0.0001, "loss": 7.5227, "loss/crossentropy": 2.0219787105917932, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.17434774748981, "step": 15410 }, { "epoch": 0.3855, "grad_norm": 32.0, "grad_norm_var": 1.75390625, "learning_rate": 0.0001, "loss": 7.5115, "loss/crossentropy": 2.303240016102791, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.1981578789651394, "step": 15420 }, { "epoch": 0.38575, "grad_norm": 29.375, "grad_norm_var": 1.784375, "learning_rate": 0.0001, "loss": 7.4007, "loss/crossentropy": 2.116858586668968, "loss/hidden": 3.299609375, "loss/jsd": 0.0, "loss/logits": 0.17753979042172433, "step": 15430 }, { "epoch": 0.386, "grad_norm": 29.875, "grad_norm_var": 1.3041015625, "learning_rate": 0.0001, "loss": 7.4919, "loss/crossentropy": 2.2533510133624075, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.19535556957125663, "step": 15440 }, { "epoch": 0.38625, "grad_norm": 33.25, "grad_norm_var": 21.045572916666668, "learning_rate": 0.0001, "loss": 7.4599, "loss/crossentropy": 2.1751621633768083, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.19173544868826867, "step": 15450 }, { "epoch": 0.3865, "grad_norm": 32.75, "grad_norm_var": 3.405847188086631e+18, "learning_rate": 0.0001, "loss": 7.5371, "loss/crossentropy": 2.165526232123375, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.20593719817698003, "step": 15460 }, { "epoch": 0.38675, "grad_norm": 32.5, "grad_norm_var": 2.1025390625, "learning_rate": 0.0001, "loss": 7.4792, "loss/crossentropy": 2.254777705669403, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.18779365364462136, "step": 15470 }, { "epoch": 0.387, "grad_norm": 32.75, "grad_norm_var": 3.3358723958333334, "learning_rate": 0.0001, "loss": 7.4772, "loss/crossentropy": 2.1895380139350893, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19807571023702622, "step": 15480 }, { "epoch": 0.38725, "grad_norm": 31.625, "grad_norm_var": 2.5541015625, "learning_rate": 0.0001, "loss": 7.5285, "loss/crossentropy": 2.1380341425538063, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.21063561849296092, "step": 15490 }, { "epoch": 0.3875, "grad_norm": 37.0, "grad_norm_var": 13.8134765625, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 2.0771276742219924, "loss/hidden": 3.53203125, "loss/jsd": 0.0, "loss/logits": 0.21799675188958645, "step": 15500 }, { "epoch": 0.38775, "grad_norm": 29.875, "grad_norm_var": 15.705989583333333, "learning_rate": 0.0001, "loss": 7.4049, "loss/crossentropy": 2.0267057463526728, "loss/hidden": 3.55390625, "loss/jsd": 0.0, "loss/logits": 0.21528998501598834, "step": 15510 }, { "epoch": 0.388, "grad_norm": 30.875, "grad_norm_var": 2.25, "learning_rate": 0.0001, "loss": 7.5097, "loss/crossentropy": 2.1733275309205053, "loss/hidden": 3.310546875, "loss/jsd": 0.0, "loss/logits": 0.18399418219923974, "step": 15520 }, { "epoch": 0.38825, "grad_norm": 33.0, "grad_norm_var": 4.39140625, "learning_rate": 0.0001, "loss": 7.4645, "loss/crossentropy": 2.0331453289836645, "loss/hidden": 3.499609375, "loss/jsd": 0.0, "loss/logits": 0.19905872382223605, "step": 15530 }, { "epoch": 0.3885, "grad_norm": 28.375, "grad_norm_var": 5.15390625, "learning_rate": 0.0001, "loss": 7.4482, "loss/crossentropy": 2.0666572242975234, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.200329645909369, "step": 15540 }, { "epoch": 0.38875, "grad_norm": 37.0, "grad_norm_var": 4.893489583333333, "learning_rate": 0.0001, "loss": 7.5307, "loss/crossentropy": 2.121652653813362, "loss/hidden": 3.2828125, "loss/jsd": 0.0, "loss/logits": 0.18504574447870253, "step": 15550 }, { "epoch": 0.389, "grad_norm": 30.875, "grad_norm_var": 3.8202473958333334, "learning_rate": 0.0001, "loss": 7.4378, "loss/crossentropy": 2.0938068509101866, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.18647999074310065, "step": 15560 }, { "epoch": 0.38925, "grad_norm": 31.25, "grad_norm_var": 1.5309895833333333, "learning_rate": 0.0001, "loss": 7.5078, "loss/crossentropy": 2.016429315507412, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.18244766257703304, "step": 15570 }, { "epoch": 0.3895, "grad_norm": 32.25, "grad_norm_var": 4.033268229166667, "learning_rate": 0.0001, "loss": 7.5083, "loss/crossentropy": 2.220339822769165, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.20719899125397206, "step": 15580 }, { "epoch": 0.38975, "grad_norm": 30.875, "grad_norm_var": 2.937239583333333, "learning_rate": 0.0001, "loss": 7.5042, "loss/crossentropy": 2.1800031810998917, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.1941234938800335, "step": 15590 }, { "epoch": 0.39, "grad_norm": 31.0, "grad_norm_var": 2.6353515625, "learning_rate": 0.0001, "loss": 7.6156, "loss/crossentropy": 2.226159965991974, "loss/hidden": 3.499609375, "loss/jsd": 0.0, "loss/logits": 0.2090428164228797, "step": 15600 }, { "epoch": 0.39025, "grad_norm": 30.25, "grad_norm_var": 1.77890625, "learning_rate": 0.0001, "loss": 7.539, "loss/crossentropy": 2.068866019695997, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.1953371877782047, "step": 15610 }, { "epoch": 0.3905, "grad_norm": 31.0, "grad_norm_var": 54.42916666666667, "learning_rate": 0.0001, "loss": 7.5415, "loss/crossentropy": 2.2375737935304643, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.20228417851030828, "step": 15620 }, { "epoch": 0.39075, "grad_norm": 28.375, "grad_norm_var": 67.47604166666666, "learning_rate": 0.0001, "loss": 7.5058, "loss/crossentropy": 2.1122362539172173, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.18648870382457972, "step": 15630 }, { "epoch": 0.391, "grad_norm": 27.0, "grad_norm_var": 32.35390625, "learning_rate": 0.0001, "loss": 7.4127, "loss/crossentropy": 2.062555433809757, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.18710520956665277, "step": 15640 }, { "epoch": 0.39125, "grad_norm": 30.75, "grad_norm_var": 65.40983072916667, "learning_rate": 0.0001, "loss": 7.4815, "loss/crossentropy": 2.185184709727764, "loss/hidden": 3.340234375, "loss/jsd": 0.0, "loss/logits": 0.18476471938192846, "step": 15650 }, { "epoch": 0.3915, "grad_norm": 33.0, "grad_norm_var": 80.87858072916667, "learning_rate": 0.0001, "loss": 7.531, "loss/crossentropy": 2.1845591366291046, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.21412495020776987, "step": 15660 }, { "epoch": 0.39175, "grad_norm": 33.75, "grad_norm_var": 39.119791666666664, "learning_rate": 0.0001, "loss": 7.5411, "loss/crossentropy": 2.1479857102036477, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.2234236292541027, "step": 15670 }, { "epoch": 0.392, "grad_norm": 42.25, "grad_norm_var": 156.56666666666666, "learning_rate": 0.0001, "loss": 7.4113, "loss/crossentropy": 2.134408001601696, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18845739047974347, "step": 15680 }, { "epoch": 0.39225, "grad_norm": 28.75, "grad_norm_var": 175.39524739583334, "learning_rate": 0.0001, "loss": 7.4248, "loss/crossentropy": 2.128613194823265, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.2125804962590337, "step": 15690 }, { "epoch": 0.3925, "grad_norm": 36.5, "grad_norm_var": 23.85625, "learning_rate": 0.0001, "loss": 7.4262, "loss/crossentropy": 2.224800446629524, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.20871379896998404, "step": 15700 }, { "epoch": 0.39275, "grad_norm": 35.25, "grad_norm_var": 13.641080729166667, "learning_rate": 0.0001, "loss": 7.3782, "loss/crossentropy": 2.1991523414850236, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.19195745941251516, "step": 15710 }, { "epoch": 0.393, "grad_norm": 31.875, "grad_norm_var": 11.7931640625, "learning_rate": 0.0001, "loss": 7.4026, "loss/crossentropy": 2.1917616426944733, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.19351839125156403, "step": 15720 }, { "epoch": 0.39325, "grad_norm": 47.75, "grad_norm_var": 26.167122395833335, "learning_rate": 0.0001, "loss": 7.457, "loss/crossentropy": 2.120897516608238, "loss/hidden": 3.31484375, "loss/jsd": 0.0, "loss/logits": 0.19529615007340909, "step": 15730 }, { "epoch": 0.3935, "grad_norm": 34.5, "grad_norm_var": 26.758268229166667, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.1326122283935547, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.17355557810515165, "step": 15740 }, { "epoch": 0.39375, "grad_norm": 29.75, "grad_norm_var": 10.8853515625, "learning_rate": 0.0001, "loss": 7.3956, "loss/crossentropy": 2.0307308197021485, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.20066507179290055, "step": 15750 }, { "epoch": 0.394, "grad_norm": 36.0, "grad_norm_var": 13.039518229166667, "learning_rate": 0.0001, "loss": 7.3499, "loss/crossentropy": 2.067159628868103, "loss/hidden": 3.27734375, "loss/jsd": 0.0, "loss/logits": 0.1807652136310935, "step": 15760 }, { "epoch": 0.39425, "grad_norm": 30.25, "grad_norm_var": 6.055989583333333, "learning_rate": 0.0001, "loss": 7.3641, "loss/crossentropy": 2.072852502763271, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.17794448863714935, "step": 15770 }, { "epoch": 0.3945, "grad_norm": 30.875, "grad_norm_var": 46.35598958333333, "learning_rate": 0.0001, "loss": 7.422, "loss/crossentropy": 2.1680019706487657, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.2028266169130802, "step": 15780 }, { "epoch": 0.39475, "grad_norm": 27.625, "grad_norm_var": 43.4603515625, "learning_rate": 0.0001, "loss": 7.3911, "loss/crossentropy": 2.16780024766922, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.19396517537534236, "step": 15790 }, { "epoch": 0.395, "grad_norm": 43.0, "grad_norm_var": 39.1072265625, "learning_rate": 0.0001, "loss": 7.3941, "loss/crossentropy": 2.175208270549774, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.18382846023887395, "step": 15800 }, { "epoch": 0.39525, "grad_norm": 44.5, "grad_norm_var": 36.8900390625, "learning_rate": 0.0001, "loss": 7.4574, "loss/crossentropy": 2.1617070853710176, "loss/hidden": 3.303125, "loss/jsd": 0.0, "loss/logits": 0.18401278592646123, "step": 15810 }, { "epoch": 0.3955, "grad_norm": 28.0, "grad_norm_var": 30.280989583333334, "learning_rate": 0.0001, "loss": 7.2727, "loss/crossentropy": 2.063553161919117, "loss/hidden": 3.24765625, "loss/jsd": 0.0, "loss/logits": 0.17086460664868355, "step": 15820 }, { "epoch": 0.39575, "grad_norm": 33.0, "grad_norm_var": 41.735416666666666, "learning_rate": 0.0001, "loss": 7.3294, "loss/crossentropy": 2.091352239251137, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.19158628936856986, "step": 15830 }, { "epoch": 0.396, "grad_norm": 28.75, "grad_norm_var": 42.423893229166666, "learning_rate": 0.0001, "loss": 7.3487, "loss/crossentropy": 2.1424781650304796, "loss/hidden": 3.333203125, "loss/jsd": 0.0, "loss/logits": 0.1843854110687971, "step": 15840 }, { "epoch": 0.39625, "grad_norm": 34.0, "grad_norm_var": 47.520572916666666, "learning_rate": 0.0001, "loss": 7.4008, "loss/crossentropy": 2.1899289660155774, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.1849813165143132, "step": 15850 }, { "epoch": 0.3965, "grad_norm": 27.875, "grad_norm_var": 15.66015625, "learning_rate": 0.0001, "loss": 7.3814, "loss/crossentropy": 2.181295931339264, "loss/hidden": 3.310546875, "loss/jsd": 0.0, "loss/logits": 0.18565880618989467, "step": 15860 }, { "epoch": 0.39675, "grad_norm": 36.25, "grad_norm_var": 17.7587890625, "learning_rate": 0.0001, "loss": 7.3741, "loss/crossentropy": 2.1227088302373884, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.1942040240392089, "step": 15870 }, { "epoch": 0.397, "grad_norm": 38.0, "grad_norm_var": 25.970572916666665, "learning_rate": 0.0001, "loss": 7.4482, "loss/crossentropy": 2.0236525177955627, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.20845895111560822, "step": 15880 }, { "epoch": 0.39725, "grad_norm": 40.25, "grad_norm_var": 21.3728515625, "learning_rate": 0.0001, "loss": 7.3296, "loss/crossentropy": 2.089261847734451, "loss/hidden": 3.251171875, "loss/jsd": 0.0, "loss/logits": 0.18734547607600688, "step": 15890 }, { "epoch": 0.3975, "grad_norm": 32.5, "grad_norm_var": 10.598893229166666, "learning_rate": 0.0001, "loss": 7.4169, "loss/crossentropy": 2.0026068508625032, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.18220478743314744, "step": 15900 }, { "epoch": 0.39775, "grad_norm": 29.125, "grad_norm_var": 9.208268229166666, "learning_rate": 0.0001, "loss": 7.2837, "loss/crossentropy": 2.237262162566185, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.1820806261152029, "step": 15910 }, { "epoch": 0.398, "grad_norm": 32.75, "grad_norm_var": 50.037434895833336, "learning_rate": 0.0001, "loss": 7.4485, "loss/crossentropy": 2.1642710946500303, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.18893331224098803, "step": 15920 }, { "epoch": 0.39825, "grad_norm": 29.125, "grad_norm_var": 31.555208333333333, "learning_rate": 0.0001, "loss": 7.287, "loss/crossentropy": 2.1653111398220064, "loss/hidden": 3.274609375, "loss/jsd": 0.0, "loss/logits": 0.1766128923743963, "step": 15930 }, { "epoch": 0.3985, "grad_norm": 30.5, "grad_norm_var": 42.138997395833336, "learning_rate": 0.0001, "loss": 7.3823, "loss/crossentropy": 2.0916855938732626, "loss/hidden": 3.320703125, "loss/jsd": 0.0, "loss/logits": 0.19688500016927718, "step": 15940 }, { "epoch": 0.39875, "grad_norm": 37.5, "grad_norm_var": 78.5375, "learning_rate": 0.0001, "loss": 7.2848, "loss/crossentropy": 2.1427202939987184, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.1851636625826359, "step": 15950 }, { "epoch": 0.399, "grad_norm": 38.75, "grad_norm_var": 68.78125, "learning_rate": 0.0001, "loss": 7.3317, "loss/crossentropy": 2.003783145546913, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.19389798324555158, "step": 15960 }, { "epoch": 0.39925, "grad_norm": 42.0, "grad_norm_var": 32.01712239583333, "learning_rate": 0.0001, "loss": 7.4473, "loss/crossentropy": 2.196589732170105, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.1970522753894329, "step": 15970 }, { "epoch": 0.3995, "grad_norm": 41.25, "grad_norm_var": 58.436458333333334, "learning_rate": 0.0001, "loss": 7.3478, "loss/crossentropy": 2.1799759164452555, "loss/hidden": 3.286328125, "loss/jsd": 0.0, "loss/logits": 0.18633378595113753, "step": 15980 }, { "epoch": 0.39975, "grad_norm": 30.875, "grad_norm_var": 59.228580729166666, "learning_rate": 0.0001, "loss": 7.3209, "loss/crossentropy": 2.0315997526049614, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.1865892570465803, "step": 15990 }, { "epoch": 0.4, "grad_norm": 31.375, "grad_norm_var": 3.465559895833333, "learning_rate": 0.0001, "loss": 7.3845, "loss/crossentropy": 2.1453574761748313, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.17996564768254758, "step": 16000 }, { "epoch": 0.40025, "grad_norm": 34.25, "grad_norm_var": 10.563997395833333, "learning_rate": 0.0001, "loss": 7.3941, "loss/crossentropy": 2.1078505992889403, "loss/hidden": 3.28984375, "loss/jsd": 0.0, "loss/logits": 0.18582884669303895, "step": 16010 }, { "epoch": 0.4005, "grad_norm": 39.0, "grad_norm_var": 12.724934895833334, "learning_rate": 0.0001, "loss": 7.317, "loss/crossentropy": 2.0792032331228256, "loss/hidden": 3.244140625, "loss/jsd": 0.0, "loss/logits": 0.16934819463640452, "step": 16020 }, { "epoch": 0.40075, "grad_norm": 39.75, "grad_norm_var": 13.576041666666667, "learning_rate": 0.0001, "loss": 7.3955, "loss/crossentropy": 2.1093589305877685, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.19456310402601956, "step": 16030 }, { "epoch": 0.401, "grad_norm": 38.25, "grad_norm_var": 10.633268229166667, "learning_rate": 0.0001, "loss": 7.3139, "loss/crossentropy": 2.064394600689411, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.18512170277535916, "step": 16040 }, { "epoch": 0.40125, "grad_norm": 36.25, "grad_norm_var": 13.001497395833333, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.140559023618698, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.195547959767282, "step": 16050 }, { "epoch": 0.4015, "grad_norm": 31.0, "grad_norm_var": 14.090559895833334, "learning_rate": 0.0001, "loss": 7.4652, "loss/crossentropy": 2.1010533660650252, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.20802920442074538, "step": 16060 }, { "epoch": 0.40175, "grad_norm": 31.0, "grad_norm_var": 11.946875, "learning_rate": 0.0001, "loss": 7.391, "loss/crossentropy": 2.1016806095838545, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.1986994445323944, "step": 16070 }, { "epoch": 0.402, "grad_norm": 30.375, "grad_norm_var": 3.3671223958333334, "learning_rate": 0.0001, "loss": 7.4359, "loss/crossentropy": 2.1667724169790743, "loss/hidden": 3.2984375, "loss/jsd": 0.0, "loss/logits": 0.18537729047238827, "step": 16080 }, { "epoch": 0.40225, "grad_norm": 29.625, "grad_norm_var": 3.411393229166667, "learning_rate": 0.0001, "loss": 7.5328, "loss/crossentropy": 2.1518414229154588, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.19391167853027583, "step": 16090 }, { "epoch": 0.4025, "grad_norm": 33.0, "grad_norm_var": 25.363997395833334, "learning_rate": 0.0001, "loss": 7.516, "loss/crossentropy": 2.026207575201988, "loss/hidden": 3.49453125, "loss/jsd": 0.0, "loss/logits": 0.19631834831088782, "step": 16100 }, { "epoch": 0.40275, "grad_norm": 33.0, "grad_norm_var": 28.158333333333335, "learning_rate": 0.0001, "loss": 7.4104, "loss/crossentropy": 2.1191447034478186, "loss/hidden": 3.290625, "loss/jsd": 0.0, "loss/logits": 0.17606843169778585, "step": 16110 }, { "epoch": 0.403, "grad_norm": 33.25, "grad_norm_var": 8.17265625, "learning_rate": 0.0001, "loss": 7.4121, "loss/crossentropy": 2.2775932759046555, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.2066437665373087, "step": 16120 }, { "epoch": 0.40325, "grad_norm": 31.625, "grad_norm_var": 1.3990009838940193e+18, "learning_rate": 0.0001, "loss": 7.4655, "loss/crossentropy": 2.1553472220897674, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.1962028866633773, "step": 16130 }, { "epoch": 0.4035, "grad_norm": 29.625, "grad_norm_var": 3.3436848958333334, "learning_rate": 0.0001, "loss": 7.4711, "loss/crossentropy": 2.177881157398224, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.19104315489530563, "step": 16140 }, { "epoch": 0.40375, "grad_norm": 29.875, "grad_norm_var": 6.098893229166666, "learning_rate": 0.0001, "loss": 7.4871, "loss/crossentropy": 2.1309458047151564, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.20508808381855487, "step": 16150 }, { "epoch": 0.404, "grad_norm": 32.0, "grad_norm_var": 10.82890625, "learning_rate": 0.0001, "loss": 7.4862, "loss/crossentropy": 2.149015510082245, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.21942622102797033, "step": 16160 }, { "epoch": 0.40425, "grad_norm": 28.375, "grad_norm_var": 8.847330729166666, "learning_rate": 0.0001, "loss": 7.4679, "loss/crossentropy": 2.19928492307663, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19909227825701237, "step": 16170 }, { "epoch": 0.4045, "grad_norm": 27.25, "grad_norm_var": 4.3994140625, "learning_rate": 0.0001, "loss": 7.4377, "loss/crossentropy": 2.05338691174984, "loss/hidden": 3.462890625, "loss/jsd": 0.0, "loss/logits": 0.20643874276429414, "step": 16180 }, { "epoch": 0.40475, "grad_norm": 37.0, "grad_norm_var": 6.540625, "learning_rate": 0.0001, "loss": 7.4132, "loss/crossentropy": 2.093175410479307, "loss/hidden": 3.48984375, "loss/jsd": 0.0, "loss/logits": 0.18920900514349343, "step": 16190 }, { "epoch": 0.405, "grad_norm": 28.25, "grad_norm_var": 6.5962890625, "learning_rate": 0.0001, "loss": 7.3788, "loss/crossentropy": 2.074980768561363, "loss/hidden": 3.301953125, "loss/jsd": 0.0, "loss/logits": 0.19456995707005262, "step": 16200 }, { "epoch": 0.40525, "grad_norm": 29.875, "grad_norm_var": 2.972916666666667, "learning_rate": 0.0001, "loss": 7.4665, "loss/crossentropy": 2.2107421159744263, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.19778486154973507, "step": 16210 }, { "epoch": 0.4055, "grad_norm": 32.5, "grad_norm_var": 3.1791015625, "learning_rate": 0.0001, "loss": 7.4696, "loss/crossentropy": 2.180025112628937, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.19855868369340895, "step": 16220 }, { "epoch": 0.40575, "grad_norm": 30.625, "grad_norm_var": 3.1150390625, "learning_rate": 0.0001, "loss": 7.4948, "loss/crossentropy": 2.1638258934020995, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.21891864482313395, "step": 16230 }, { "epoch": 0.406, "grad_norm": 29.75, "grad_norm_var": 9.517122395833333, "learning_rate": 0.0001, "loss": 7.5008, "loss/crossentropy": 2.145371067523956, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.22061494514346122, "step": 16240 }, { "epoch": 0.40625, "grad_norm": 30.75, "grad_norm_var": 3.1572265625, "learning_rate": 0.0001, "loss": 7.4569, "loss/crossentropy": 2.20810389816761, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.20236902087926864, "step": 16250 }, { "epoch": 0.4065, "grad_norm": 31.625, "grad_norm_var": 7.5775390625, "learning_rate": 0.0001, "loss": 7.4047, "loss/crossentropy": 2.117805525660515, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.1842184830456972, "step": 16260 }, { "epoch": 0.40675, "grad_norm": 30.125, "grad_norm_var": 10.220247395833333, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.203564515709877, "loss/hidden": 3.30859375, "loss/jsd": 0.0, "loss/logits": 0.18627342265099286, "step": 16270 }, { "epoch": 0.407, "grad_norm": 30.5, "grad_norm_var": 11.406705729166667, "learning_rate": 0.0001, "loss": 7.4088, "loss/crossentropy": 2.0998367607593535, "loss/hidden": 3.32890625, "loss/jsd": 0.0, "loss/logits": 0.18264923337846994, "step": 16280 }, { "epoch": 0.40725, "grad_norm": 30.375, "grad_norm_var": 24.726497395833334, "learning_rate": 0.0001, "loss": 7.5606, "loss/crossentropy": 2.192747640609741, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.2141475934535265, "step": 16290 }, { "epoch": 0.4075, "grad_norm": 30.25, "grad_norm_var": 9.641080729166667, "learning_rate": 0.0001, "loss": 7.5167, "loss/crossentropy": 2.167286628484726, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.19353726021945478, "step": 16300 }, { "epoch": 0.40775, "grad_norm": 29.625, "grad_norm_var": 4.417708333333334, "learning_rate": 0.0001, "loss": 7.4252, "loss/crossentropy": 2.058204619586468, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.1987381473183632, "step": 16310 }, { "epoch": 0.408, "grad_norm": 32.25, "grad_norm_var": 7.0306640625, "learning_rate": 0.0001, "loss": 7.4876, "loss/crossentropy": 2.1063947066664697, "loss/hidden": 3.30078125, "loss/jsd": 0.0, "loss/logits": 0.19293669573962688, "step": 16320 }, { "epoch": 0.40825, "grad_norm": 30.125, "grad_norm_var": 2.2030598958333334, "learning_rate": 0.0001, "loss": 7.5029, "loss/crossentropy": 2.2337026357650758, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.22151997741311788, "step": 16330 }, { "epoch": 0.4085, "grad_norm": 29.75, "grad_norm_var": 2.6958333333333333, "learning_rate": 0.0001, "loss": 7.4837, "loss/crossentropy": 2.1971309810876845, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.1970578793436289, "step": 16340 }, { "epoch": 0.40875, "grad_norm": 30.25, "grad_norm_var": 3.151497395833333, "learning_rate": 0.0001, "loss": 7.3711, "loss/crossentropy": 2.0743446350097656, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.19049311596900226, "step": 16350 }, { "epoch": 0.409, "grad_norm": 32.75, "grad_norm_var": 2.40625, "learning_rate": 0.0001, "loss": 7.4231, "loss/crossentropy": 1.9305765472352505, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.19289642302319407, "step": 16360 }, { "epoch": 0.40925, "grad_norm": 28.625, "grad_norm_var": 11.846809895833333, "learning_rate": 0.0001, "loss": 7.4518, "loss/crossentropy": 2.102352428436279, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.2020608950406313, "step": 16370 }, { "epoch": 0.4095, "grad_norm": 27.875, "grad_norm_var": 12.543489583333333, "learning_rate": 0.0001, "loss": 7.4805, "loss/crossentropy": 2.1010148346424105, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.20252777952700854, "step": 16380 }, { "epoch": 0.40975, "grad_norm": 29.625, "grad_norm_var": 8.712239583333334, "learning_rate": 0.0001, "loss": 7.435, "loss/crossentropy": 2.087410292029381, "loss/hidden": 3.54453125, "loss/jsd": 0.0, "loss/logits": 0.21575740706175567, "step": 16390 }, { "epoch": 0.41, "grad_norm": 29.125, "grad_norm_var": 6.9087890625, "learning_rate": 0.0001, "loss": 7.4989, "loss/crossentropy": 2.2157984226942062, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.2107794165611267, "step": 16400 }, { "epoch": 0.41025, "grad_norm": 32.25, "grad_norm_var": 2.8372395833333335, "learning_rate": 0.0001, "loss": 7.4437, "loss/crossentropy": 2.141116687655449, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.18751906510442495, "step": 16410 }, { "epoch": 0.4105, "grad_norm": 31.125, "grad_norm_var": 3.67890625, "learning_rate": 0.0001, "loss": 7.5811, "loss/crossentropy": 2.1563937693834303, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.19754156917333604, "step": 16420 }, { "epoch": 0.41075, "grad_norm": 32.0, "grad_norm_var": 3.380989583333333, "learning_rate": 0.0001, "loss": 7.4985, "loss/crossentropy": 2.163114275038242, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18562516067177057, "step": 16430 }, { "epoch": 0.411, "grad_norm": 29.25, "grad_norm_var": 3.005143229166667, "learning_rate": 0.0001, "loss": 7.5249, "loss/crossentropy": 2.1273641526699065, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.20784277468919754, "step": 16440 }, { "epoch": 0.41125, "grad_norm": 30.625, "grad_norm_var": 1.9393229166666666, "learning_rate": 0.0001, "loss": 7.4749, "loss/crossentropy": 2.107171121239662, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.18700738586485385, "step": 16450 }, { "epoch": 0.4115, "grad_norm": 30.25, "grad_norm_var": 1.2863932291666667, "learning_rate": 0.0001, "loss": 7.5479, "loss/crossentropy": 2.060547313094139, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.19373653624206782, "step": 16460 }, { "epoch": 0.41175, "grad_norm": 29.625, "grad_norm_var": 3.5625, "learning_rate": 0.0001, "loss": 7.5479, "loss/crossentropy": 2.1868856728076933, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.20505926571786404, "step": 16470 }, { "epoch": 0.412, "grad_norm": 30.0, "grad_norm_var": 3.034309895833333, "learning_rate": 0.0001, "loss": 7.5314, "loss/crossentropy": 2.07418372631073, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.20410634595900773, "step": 16480 }, { "epoch": 0.41225, "grad_norm": 29.375, "grad_norm_var": 20.245247395833335, "learning_rate": 0.0001, "loss": 7.4788, "loss/crossentropy": 2.1793991714715957, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.189803871512413, "step": 16490 }, { "epoch": 0.4125, "grad_norm": 42.75, "grad_norm_var": 32.5228515625, "learning_rate": 0.0001, "loss": 7.5413, "loss/crossentropy": 2.252856284379959, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19915211237967015, "step": 16500 }, { "epoch": 0.41275, "grad_norm": 29.375, "grad_norm_var": 11.610416666666667, "learning_rate": 0.0001, "loss": 7.4184, "loss/crossentropy": 2.1054195404052733, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.19910655040293931, "step": 16510 }, { "epoch": 0.413, "grad_norm": 30.5, "grad_norm_var": 2.3958333333333335, "learning_rate": 0.0001, "loss": 7.4706, "loss/crossentropy": 2.171055743098259, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.1864585865288973, "step": 16520 }, { "epoch": 0.41325, "grad_norm": 31.75, "grad_norm_var": 1.7192057291666667, "learning_rate": 0.0001, "loss": 7.5641, "loss/crossentropy": 2.087317834794521, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.20189851969480516, "step": 16530 }, { "epoch": 0.4135, "grad_norm": 30.875, "grad_norm_var": 7.19140625, "learning_rate": 0.0001, "loss": 7.494, "loss/crossentropy": 2.0119203999638557, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.19246891103684902, "step": 16540 }, { "epoch": 0.41375, "grad_norm": 28.875, "grad_norm_var": 2.47890625, "learning_rate": 0.0001, "loss": 7.3923, "loss/crossentropy": 2.1976596504449843, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.1891266519203782, "step": 16550 }, { "epoch": 0.414, "grad_norm": 33.25, "grad_norm_var": 1.7124937379258301e+18, "learning_rate": 0.0001, "loss": 7.4992, "loss/crossentropy": 2.1714249163866044, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.19376156255602836, "step": 16560 }, { "epoch": 0.41425, "grad_norm": 35.5, "grad_norm_var": 27.289322916666666, "learning_rate": 0.0001, "loss": 7.4315, "loss/crossentropy": 2.062089595198631, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19295348990708588, "step": 16570 }, { "epoch": 0.4145, "grad_norm": 28.625, "grad_norm_var": 27.770572916666666, "learning_rate": 0.0001, "loss": 7.4442, "loss/crossentropy": 2.0051744118332864, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.1870790995657444, "step": 16580 }, { "epoch": 0.41475, "grad_norm": 29.75, "grad_norm_var": 3.789322916666667, "learning_rate": 0.0001, "loss": 7.4585, "loss/crossentropy": 2.2442015290260313, "loss/hidden": 3.33046875, "loss/jsd": 0.0, "loss/logits": 0.19160873014479876, "step": 16590 }, { "epoch": 0.415, "grad_norm": 30.0, "grad_norm_var": 0.3973307291666667, "learning_rate": 0.0001, "loss": 7.3867, "loss/crossentropy": 2.0700665429234504, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.18396779522299767, "step": 16600 }, { "epoch": 0.41525, "grad_norm": 30.5, "grad_norm_var": 1.98515625, "learning_rate": 0.0001, "loss": 7.4826, "loss/crossentropy": 2.0733146488666536, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.18656179327517747, "step": 16610 }, { "epoch": 0.4155, "grad_norm": 53.5, "grad_norm_var": 37.53515625, "learning_rate": 0.0001, "loss": 7.4502, "loss/crossentropy": 2.0442128658294676, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19807227458804846, "step": 16620 }, { "epoch": 0.41575, "grad_norm": 29.5, "grad_norm_var": 39.595768229166666, "learning_rate": 0.0001, "loss": 7.3489, "loss/crossentropy": 1.989547997713089, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.20996779929846526, "step": 16630 }, { "epoch": 0.416, "grad_norm": 32.75, "grad_norm_var": 1.8098307291666667, "learning_rate": 0.0001, "loss": 7.5352, "loss/crossentropy": 2.233032874763012, "loss/hidden": 3.305859375, "loss/jsd": 0.0, "loss/logits": 0.19636076968163252, "step": 16640 }, { "epoch": 0.41625, "grad_norm": 31.25, "grad_norm_var": 2.8395182291666665, "learning_rate": 0.0001, "loss": 7.553, "loss/crossentropy": 2.121747186779976, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.19236730560660362, "step": 16650 }, { "epoch": 0.4165, "grad_norm": 33.0, "grad_norm_var": 2.1705729166666665, "learning_rate": 0.0001, "loss": 7.5506, "loss/crossentropy": 2.1418200969696044, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.20644566994160413, "step": 16660 }, { "epoch": 0.41675, "grad_norm": 32.0, "grad_norm_var": 2.7811848958333334, "learning_rate": 0.0001, "loss": 7.3972, "loss/crossentropy": 2.1746320933103562, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.2003942009061575, "step": 16670 }, { "epoch": 0.417, "grad_norm": 31.375, "grad_norm_var": 2.2718098958333335, "learning_rate": 0.0001, "loss": 7.5947, "loss/crossentropy": 2.109486374258995, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.18977900519967078, "step": 16680 }, { "epoch": 0.41725, "grad_norm": 32.75, "grad_norm_var": 2.0608723958333335, "learning_rate": 0.0001, "loss": 7.5853, "loss/crossentropy": 2.190330383181572, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.2031781405210495, "step": 16690 }, { "epoch": 0.4175, "grad_norm": 30.5, "grad_norm_var": 3.530143229166667, "learning_rate": 0.0001, "loss": 7.5202, "loss/crossentropy": 2.239956870675087, "loss/hidden": 3.297265625, "loss/jsd": 0.0, "loss/logits": 0.19103901870548726, "step": 16700 }, { "epoch": 0.41775, "grad_norm": 30.5, "grad_norm_var": 1.7889973958333334, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.041231009364128, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.20854433607310058, "step": 16710 }, { "epoch": 0.418, "grad_norm": 30.625, "grad_norm_var": 4.063541666666667, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.0960741981863977, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.19888665452599524, "step": 16720 }, { "epoch": 0.41825, "grad_norm": 31.25, "grad_norm_var": 4.901822916666666, "learning_rate": 0.0001, "loss": 7.4165, "loss/crossentropy": 2.085656823962927, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.18611110784113408, "step": 16730 }, { "epoch": 0.4185, "grad_norm": 32.75, "grad_norm_var": 4.4509765625, "learning_rate": 0.0001, "loss": 7.4439, "loss/crossentropy": 2.05613434612751, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.20244114883244038, "step": 16740 }, { "epoch": 0.41875, "grad_norm": 27.875, "grad_norm_var": 3.0869140625, "learning_rate": 0.0001, "loss": 7.5106, "loss/crossentropy": 2.124755159020424, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.1944043705239892, "step": 16750 }, { "epoch": 0.419, "grad_norm": 32.5, "grad_norm_var": 17.409375, "learning_rate": 0.0001, "loss": 7.5009, "loss/crossentropy": 2.17716204226017, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.19449110236018896, "step": 16760 }, { "epoch": 0.41925, "grad_norm": 31.0, "grad_norm_var": 3.465559895833333, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 2.08994981944561, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.1893147800117731, "step": 16770 }, { "epoch": 0.4195, "grad_norm": 28.5, "grad_norm_var": 5.300455729166667, "learning_rate": 0.0001, "loss": 7.4615, "loss/crossentropy": 2.2256261706352234, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.217317346483469, "step": 16780 }, { "epoch": 0.41975, "grad_norm": 31.0, "grad_norm_var": 3.66015625, "learning_rate": 0.0001, "loss": 7.5383, "loss/crossentropy": 2.1508295446634293, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.21005581114441157, "step": 16790 }, { "epoch": 0.42, "grad_norm": 31.5, "grad_norm_var": 1.4184895833333333, "learning_rate": 0.0001, "loss": 7.4356, "loss/crossentropy": 2.1244804561138153, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19516022335737943, "step": 16800 }, { "epoch": 0.42025, "grad_norm": 29.25, "grad_norm_var": 2.6468098958333335, "learning_rate": 0.0001, "loss": 7.3777, "loss/crossentropy": 2.1440630443394184, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.1946810379624367, "step": 16810 }, { "epoch": 0.4205, "grad_norm": 28.875, "grad_norm_var": 2.633072916666667, "learning_rate": 0.0001, "loss": 7.4624, "loss/crossentropy": 2.1416983366012574, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.21523999786004425, "step": 16820 }, { "epoch": 0.42075, "grad_norm": 29.125, "grad_norm_var": 2.5103515625, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 2.1387193992733957, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.19993042033165692, "step": 16830 }, { "epoch": 0.421, "grad_norm": 31.625, "grad_norm_var": 1.046875, "learning_rate": 0.0001, "loss": 7.5782, "loss/crossentropy": 2.1304811269044874, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.20474526304751633, "step": 16840 }, { "epoch": 0.42125, "grad_norm": 36.25, "grad_norm_var": 2.7309405658680745e+18, "learning_rate": 0.0001, "loss": 7.5442, "loss/crossentropy": 2.0812308952212333, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.20068600717931986, "step": 16850 }, { "epoch": 0.4215, "grad_norm": 30.25, "grad_norm_var": 2.7309405660953006e+18, "learning_rate": 0.0001, "loss": 7.4532, "loss/crossentropy": 2.193872679769993, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.20241283550858497, "step": 16860 }, { "epoch": 0.42175, "grad_norm": 29.125, "grad_norm_var": 1.9760416666666667, "learning_rate": 0.0001, "loss": 7.4989, "loss/crossentropy": 2.083309184014797, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.20324783604592084, "step": 16870 }, { "epoch": 0.422, "grad_norm": 29.5, "grad_norm_var": 3.5150390625, "learning_rate": 0.0001, "loss": 7.4589, "loss/crossentropy": 2.085008730739355, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.17911175787448883, "step": 16880 }, { "epoch": 0.42225, "grad_norm": 29.125, "grad_norm_var": 2.3650390625, "learning_rate": 0.0001, "loss": 7.5188, "loss/crossentropy": 2.216263073682785, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.20870072897523642, "step": 16890 }, { "epoch": 0.4225, "grad_norm": 30.75, "grad_norm_var": 2.1958333333333333, "learning_rate": 0.0001, "loss": 7.4356, "loss/crossentropy": 2.1983559757471083, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.19983625803142785, "step": 16900 }, { "epoch": 0.42275, "grad_norm": 30.875, "grad_norm_var": 2.29765625, "learning_rate": 0.0001, "loss": 7.4603, "loss/crossentropy": 2.1883961871266364, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.1971004357561469, "step": 16910 }, { "epoch": 0.423, "grad_norm": 29.75, "grad_norm_var": 5.770833333333333, "learning_rate": 0.0001, "loss": 7.6179, "loss/crossentropy": 2.219552181661129, "loss/hidden": 3.284375, "loss/jsd": 0.0, "loss/logits": 0.19232371896505357, "step": 16920 }, { "epoch": 0.42325, "grad_norm": 33.0, "grad_norm_var": 2.780989583333333, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.114982584118843, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.19648698382079602, "step": 16930 }, { "epoch": 0.4235, "grad_norm": 31.25, "grad_norm_var": 2.4525390625, "learning_rate": 0.0001, "loss": 7.4523, "loss/crossentropy": 2.1595448046922683, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.18362618330866098, "step": 16940 }, { "epoch": 0.42375, "grad_norm": 37.0, "grad_norm_var": 3.780989583333333, "learning_rate": 0.0001, "loss": 7.5867, "loss/crossentropy": 2.055322435498238, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.20943177156150342, "step": 16950 }, { "epoch": 0.424, "grad_norm": 31.25, "grad_norm_var": 6.687955729166666, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.138558331131935, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.19545765966176987, "step": 16960 }, { "epoch": 0.42425, "grad_norm": 30.125, "grad_norm_var": 2.5143229166666665, "learning_rate": 0.0001, "loss": 7.4497, "loss/crossentropy": 2.080673611164093, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.19070623591542243, "step": 16970 }, { "epoch": 0.4245, "grad_norm": 29.625, "grad_norm_var": 2.3358723958333334, "learning_rate": 0.0001, "loss": 7.3938, "loss/crossentropy": 2.1281508088111876, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.18748594932258128, "step": 16980 }, { "epoch": 0.42475, "grad_norm": 33.75, "grad_norm_var": 2.9358723958333335, "learning_rate": 0.0001, "loss": 7.4825, "loss/crossentropy": 2.2469502836465836, "loss/hidden": 3.2796875, "loss/jsd": 0.0, "loss/logits": 0.1956640187650919, "step": 16990 }, { "epoch": 0.425, "grad_norm": 40.0, "grad_norm_var": 7.366666666666666, "learning_rate": 0.0001, "loss": 7.4671, "loss/crossentropy": 2.179307234287262, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.1867943126708269, "step": 17000 }, { "epoch": 0.42525, "grad_norm": 29.5, "grad_norm_var": 6.927018229166666, "learning_rate": 0.0001, "loss": 7.5113, "loss/crossentropy": 2.101860895752907, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.18607552852481604, "step": 17010 }, { "epoch": 0.4255, "grad_norm": 30.125, "grad_norm_var": 9.673372395833333, "learning_rate": 0.0001, "loss": 7.434, "loss/crossentropy": 2.1202177673578264, "loss/hidden": 3.273046875, "loss/jsd": 0.0, "loss/logits": 0.18756014555692674, "step": 17020 }, { "epoch": 0.42575, "grad_norm": 29.0, "grad_norm_var": 8.856184895833334, "learning_rate": 0.0001, "loss": 7.4625, "loss/crossentropy": 2.1069195836782457, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.19768658615648746, "step": 17030 }, { "epoch": 0.426, "grad_norm": 32.0, "grad_norm_var": 2.8353515625, "learning_rate": 0.0001, "loss": 7.4817, "loss/crossentropy": 2.191109284758568, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.19470954779535532, "step": 17040 }, { "epoch": 0.42625, "grad_norm": 34.0, "grad_norm_var": 4.731184895833334, "learning_rate": 0.0001, "loss": 7.6624, "loss/crossentropy": 2.1684040665626525, "loss/hidden": 3.58046875, "loss/jsd": 0.0, "loss/logits": 0.23237632848322393, "step": 17050 }, { "epoch": 0.4265, "grad_norm": 28.625, "grad_norm_var": 2.0947265625, "learning_rate": 0.0001, "loss": 7.4689, "loss/crossentropy": 2.1706847339868545, "loss/hidden": 3.30234375, "loss/jsd": 0.0, "loss/logits": 0.18483849614858627, "step": 17060 }, { "epoch": 0.42675, "grad_norm": 32.25, "grad_norm_var": 1.8833333333333333, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.1269822865724564, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.20061115603893995, "step": 17070 }, { "epoch": 0.427, "grad_norm": 30.375, "grad_norm_var": 14.857291666666667, "learning_rate": 0.0001, "loss": 7.4191, "loss/crossentropy": 2.051969346404076, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.19209920186549426, "step": 17080 }, { "epoch": 0.42725, "grad_norm": 32.5, "grad_norm_var": 14.401041666666666, "learning_rate": 0.0001, "loss": 7.4766, "loss/crossentropy": 2.1646567583084106, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.1931615237146616, "step": 17090 }, { "epoch": 0.4275, "grad_norm": 32.25, "grad_norm_var": 2.0837890625, "learning_rate": 0.0001, "loss": 7.4649, "loss/crossentropy": 2.1323696315288543, "loss/hidden": 3.277734375, "loss/jsd": 0.0, "loss/logits": 0.18601270411163567, "step": 17100 }, { "epoch": 0.42775, "grad_norm": 30.375, "grad_norm_var": 1.4087890625, "learning_rate": 0.0001, "loss": 7.397, "loss/crossentropy": 2.1776672542095183, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.2124987743794918, "step": 17110 }, { "epoch": 0.428, "grad_norm": 31.125, "grad_norm_var": 1.8205729166666667, "learning_rate": 0.0001, "loss": 7.4531, "loss/crossentropy": 2.0366108894348143, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.1920434921979904, "step": 17120 }, { "epoch": 0.42825, "grad_norm": 29.625, "grad_norm_var": 7.97890625, "learning_rate": 0.0001, "loss": 7.5251, "loss/crossentropy": 2.2316421404480935, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.1984385471791029, "step": 17130 }, { "epoch": 0.4285, "grad_norm": 31.375, "grad_norm_var": 43.22604166666667, "learning_rate": 0.0001, "loss": 7.532, "loss/crossentropy": 2.1584898799657823, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19015683736652136, "step": 17140 }, { "epoch": 0.42875, "grad_norm": 34.75, "grad_norm_var": 39.980143229166664, "learning_rate": 0.0001, "loss": 7.4923, "loss/crossentropy": 2.0776416778564455, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.19441335443407298, "step": 17150 }, { "epoch": 0.429, "grad_norm": 29.375, "grad_norm_var": 8.19375, "learning_rate": 0.0001, "loss": 7.4352, "loss/crossentropy": 2.1459785103797913, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.18559253066778184, "step": 17160 }, { "epoch": 0.42925, "grad_norm": 33.5, "grad_norm_var": 3.0900390625, "learning_rate": 0.0001, "loss": 7.4055, "loss/crossentropy": 2.182371488213539, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.2002473808825016, "step": 17170 }, { "epoch": 0.4295, "grad_norm": 32.0, "grad_norm_var": 6.212434895833334, "learning_rate": 0.0001, "loss": 7.4465, "loss/crossentropy": 2.1703537315130235, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.2002835465595126, "step": 17180 }, { "epoch": 0.42975, "grad_norm": 29.75, "grad_norm_var": 1.2900390625, "learning_rate": 0.0001, "loss": 7.5187, "loss/crossentropy": 2.1651883900165556, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.19961190838366746, "step": 17190 }, { "epoch": 0.43, "grad_norm": 32.0, "grad_norm_var": 2.6395833333333334, "learning_rate": 0.0001, "loss": 7.5114, "loss/crossentropy": 2.0858636096119882, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.19053794257342815, "step": 17200 }, { "epoch": 0.43025, "grad_norm": 29.625, "grad_norm_var": 3.9145833333333333, "learning_rate": 0.0001, "loss": 7.479, "loss/crossentropy": 2.1603722527623175, "loss/hidden": 3.30859375, "loss/jsd": 0.0, "loss/logits": 0.19217769615352154, "step": 17210 }, { "epoch": 0.4305, "grad_norm": 32.75, "grad_norm_var": 3.849739583333333, "learning_rate": 0.0001, "loss": 7.3934, "loss/crossentropy": 2.0350109085440637, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.1894751563668251, "step": 17220 }, { "epoch": 0.43075, "grad_norm": 31.5, "grad_norm_var": 2.8285807291666667, "learning_rate": 0.0001, "loss": 7.4516, "loss/crossentropy": 2.0489366173744203, "loss/hidden": 3.52109375, "loss/jsd": 0.0, "loss/logits": 0.1949023339897394, "step": 17230 }, { "epoch": 0.431, "grad_norm": 36.5, "grad_norm_var": 29.027083333333334, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.0739230304956435, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.18924926947802306, "step": 17240 }, { "epoch": 0.43125, "grad_norm": 33.75, "grad_norm_var": 29.676497395833334, "learning_rate": 0.0001, "loss": 7.4699, "loss/crossentropy": 2.1982351303100587, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.21114630959928035, "step": 17250 }, { "epoch": 0.4315, "grad_norm": 29.25, "grad_norm_var": 4.301822916666667, "learning_rate": 0.0001, "loss": 7.5819, "loss/crossentropy": 2.096822480857372, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.2079928234219551, "step": 17260 }, { "epoch": 0.43175, "grad_norm": 28.625, "grad_norm_var": 8.306705729166667, "learning_rate": 0.0001, "loss": 7.593, "loss/crossentropy": 2.115600660443306, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.19750556647777556, "step": 17270 }, { "epoch": 0.432, "grad_norm": 28.5, "grad_norm_var": 2.0478515625, "learning_rate": 0.0001, "loss": 7.4366, "loss/crossentropy": 2.1597479790449143, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.19931532759219409, "step": 17280 }, { "epoch": 0.43225, "grad_norm": 31.0, "grad_norm_var": 1.2518229166666666, "learning_rate": 0.0001, "loss": 7.4325, "loss/crossentropy": 2.026200716197491, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.18456113785505296, "step": 17290 }, { "epoch": 0.4325, "grad_norm": 31.375, "grad_norm_var": 37.43118489583333, "learning_rate": 0.0001, "loss": 7.4414, "loss/crossentropy": 2.149491161108017, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.20337051767855882, "step": 17300 }, { "epoch": 0.43275, "grad_norm": 34.5, "grad_norm_var": 36.064518229166666, "learning_rate": 0.0001, "loss": 7.4859, "loss/crossentropy": 2.1269571796059608, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.2096902344375849, "step": 17310 }, { "epoch": 0.433, "grad_norm": 31.875, "grad_norm_var": 5.748958333333333, "learning_rate": 0.0001, "loss": 7.4889, "loss/crossentropy": 2.273072028160095, "loss/hidden": 3.297265625, "loss/jsd": 0.0, "loss/logits": 0.18909230902791024, "step": 17320 }, { "epoch": 0.43325, "grad_norm": 28.875, "grad_norm_var": 3.46640625, "learning_rate": 0.0001, "loss": 7.4258, "loss/crossentropy": 2.1195196583867073, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19008973110467195, "step": 17330 }, { "epoch": 0.4335, "grad_norm": 30.125, "grad_norm_var": 4.076822916666667, "learning_rate": 0.0001, "loss": 7.4625, "loss/crossentropy": 2.1201743364334105, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.21034939792007207, "step": 17340 }, { "epoch": 0.43375, "grad_norm": 29.375, "grad_norm_var": 4.154166666666667, "learning_rate": 0.0001, "loss": 7.4493, "loss/crossentropy": 2.193806689977646, "loss/hidden": 3.305859375, "loss/jsd": 0.0, "loss/logits": 0.18929022550582886, "step": 17350 }, { "epoch": 0.434, "grad_norm": 30.625, "grad_norm_var": 0.9994140625, "learning_rate": 0.0001, "loss": 7.5285, "loss/crossentropy": 2.20940330773592, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.19673640970140696, "step": 17360 }, { "epoch": 0.43425, "grad_norm": 28.625, "grad_norm_var": 2.61015625, "learning_rate": 0.0001, "loss": 7.4179, "loss/crossentropy": 2.0897106766700744, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.18753900453448297, "step": 17370 }, { "epoch": 0.4345, "grad_norm": 30.75, "grad_norm_var": 3.6556640625, "learning_rate": 0.0001, "loss": 7.4271, "loss/crossentropy": 2.1975408419966698, "loss/hidden": 3.299609375, "loss/jsd": 0.0, "loss/logits": 0.180303574539721, "step": 17380 }, { "epoch": 0.43475, "grad_norm": 31.625, "grad_norm_var": 2.5830729166666666, "learning_rate": 0.0001, "loss": 7.3879, "loss/crossentropy": 2.1741068333387377, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.197440049611032, "step": 17390 }, { "epoch": 0.435, "grad_norm": 31.75, "grad_norm_var": 2.22890625, "learning_rate": 0.0001, "loss": 7.4275, "loss/crossentropy": 2.132155954837799, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.19416624326258897, "step": 17400 }, { "epoch": 0.43525, "grad_norm": 33.0, "grad_norm_var": 2.5764973958333335, "learning_rate": 0.0001, "loss": 7.3642, "loss/crossentropy": 2.0881365656852724, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.1851549668237567, "step": 17410 }, { "epoch": 0.4355, "grad_norm": 30.375, "grad_norm_var": 2.6479166666666667, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 2.213974291086197, "loss/hidden": 3.215234375, "loss/jsd": 0.0, "loss/logits": 0.17357883155345916, "step": 17420 }, { "epoch": 0.43575, "grad_norm": 34.25, "grad_norm_var": 4.375, "learning_rate": 0.0001, "loss": 7.4587, "loss/crossentropy": 2.150649666786194, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.19460058882832526, "step": 17430 }, { "epoch": 0.436, "grad_norm": 42.0, "grad_norm_var": 10.516666666666667, "learning_rate": 0.0001, "loss": 7.4428, "loss/crossentropy": 2.1933241873979568, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.19797156639397145, "step": 17440 }, { "epoch": 0.43625, "grad_norm": 31.125, "grad_norm_var": 9.1166015625, "learning_rate": 0.0001, "loss": 7.4753, "loss/crossentropy": 2.005639246106148, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19176286831498146, "step": 17450 }, { "epoch": 0.4365, "grad_norm": 32.25, "grad_norm_var": 5.373893229166667, "learning_rate": 0.0001, "loss": 7.3735, "loss/crossentropy": 2.1921694964170455, "loss/hidden": 3.338671875, "loss/jsd": 0.0, "loss/logits": 0.1854123178869486, "step": 17460 }, { "epoch": 0.43675, "grad_norm": 30.0, "grad_norm_var": 5.0478515625, "learning_rate": 0.0001, "loss": 7.4629, "loss/crossentropy": 2.062588620185852, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.18035206217318772, "step": 17470 }, { "epoch": 0.437, "grad_norm": 33.25, "grad_norm_var": 2.0369140625, "learning_rate": 0.0001, "loss": 7.4723, "loss/crossentropy": 2.1528229281306266, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.18847114518284797, "step": 17480 }, { "epoch": 0.43725, "grad_norm": 30.375, "grad_norm_var": 3.984375, "learning_rate": 0.0001, "loss": 7.3924, "loss/crossentropy": 2.225735864043236, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.19506857246160508, "step": 17490 }, { "epoch": 0.4375, "grad_norm": 32.0, "grad_norm_var": 14.176041666666666, "learning_rate": 0.0001, "loss": 7.6873, "loss/crossentropy": 2.1927039504051207, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.21184294372797013, "step": 17500 }, { "epoch": 0.43775, "grad_norm": 32.5, "grad_norm_var": 12.9025390625, "learning_rate": 0.0001, "loss": 7.6167, "loss/crossentropy": 2.1623361676931383, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.1988366713747382, "step": 17510 }, { "epoch": 0.438, "grad_norm": 5536481280.0, "grad_norm_var": 1.915789037802619e+18, "learning_rate": 0.0001, "loss": 7.4769, "loss/crossentropy": 2.0769626796245575, "loss/hidden": 3.32890625, "loss/jsd": 0.0, "loss/logits": 0.1847306966781616, "step": 17520 }, { "epoch": 0.43825, "grad_norm": 30.125, "grad_norm_var": 1.9157890376411382e+18, "learning_rate": 0.0001, "loss": 7.4514, "loss/crossentropy": 2.204550328850746, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.18567237444221973, "step": 17530 }, { "epoch": 0.4385, "grad_norm": 33.75, "grad_norm_var": 4.995572916666666, "learning_rate": 0.0001, "loss": 7.4338, "loss/crossentropy": 2.16412447988987, "loss/hidden": 3.333203125, "loss/jsd": 0.0, "loss/logits": 0.19459595140069724, "step": 17540 }, { "epoch": 0.43875, "grad_norm": 35.0, "grad_norm_var": 11.065625, "learning_rate": 0.0001, "loss": 7.4111, "loss/crossentropy": 2.161545941233635, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.18747595753520727, "step": 17550 }, { "epoch": 0.439, "grad_norm": 30.875, "grad_norm_var": 18.93515625, "learning_rate": 0.0001, "loss": 7.4927, "loss/crossentropy": 2.0142488092184068, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.20353511590510606, "step": 17560 }, { "epoch": 0.43925, "grad_norm": 31.75, "grad_norm_var": 21.154166666666665, "learning_rate": 0.0001, "loss": 7.557, "loss/crossentropy": 2.1331784069538116, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.18347944878041744, "step": 17570 }, { "epoch": 0.4395, "grad_norm": 36.5, "grad_norm_var": 11.025455729166667, "learning_rate": 0.0001, "loss": 7.6167, "loss/crossentropy": 2.2123819947242738, "loss/hidden": 3.294140625, "loss/jsd": 0.0, "loss/logits": 0.20334599036723375, "step": 17580 }, { "epoch": 0.43975, "grad_norm": 35.0, "grad_norm_var": 26.2962890625, "learning_rate": 0.0001, "loss": 7.443, "loss/crossentropy": 2.054215854406357, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.1868831802159548, "step": 17590 }, { "epoch": 0.44, "grad_norm": 29.5, "grad_norm_var": 8.619791666666666, "learning_rate": 0.0001, "loss": 7.3659, "loss/crossentropy": 2.1821652933955193, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.1797631662338972, "step": 17600 }, { "epoch": 0.44025, "grad_norm": 33.25, "grad_norm_var": 6.815559895833333, "learning_rate": 0.0001, "loss": 7.4363, "loss/crossentropy": 2.230892112851143, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19118015244603156, "step": 17610 }, { "epoch": 0.4405, "grad_norm": 31.75, "grad_norm_var": 3.7552083333333335, "learning_rate": 0.0001, "loss": 7.516, "loss/crossentropy": 2.152178055047989, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.2148242775350809, "step": 17620 }, { "epoch": 0.44075, "grad_norm": 37.25, "grad_norm_var": 9.076497395833334, "learning_rate": 0.0001, "loss": 7.3909, "loss/crossentropy": 2.1013472080230713, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.1838392995297909, "step": 17630 }, { "epoch": 0.441, "grad_norm": 35.75, "grad_norm_var": 10.3291015625, "learning_rate": 0.0001, "loss": 7.4749, "loss/crossentropy": 2.12664113342762, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19979833103716374, "step": 17640 }, { "epoch": 0.44125, "grad_norm": 34.0, "grad_norm_var": 12.180989583333334, "learning_rate": 0.0001, "loss": 7.439, "loss/crossentropy": 2.2174566209316255, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.21133954562246798, "step": 17650 }, { "epoch": 0.4415, "grad_norm": 35.0, "grad_norm_var": 9.773958333333333, "learning_rate": 0.0001, "loss": 7.4036, "loss/crossentropy": 2.1589852809906005, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.1996469773352146, "step": 17660 }, { "epoch": 0.44175, "grad_norm": 30.125, "grad_norm_var": 4.467122395833333, "learning_rate": 0.0001, "loss": 7.3101, "loss/crossentropy": 2.1177646607160567, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.20224390178918839, "step": 17670 }, { "epoch": 0.442, "grad_norm": 33.25, "grad_norm_var": 4.853580729166667, "learning_rate": 0.0001, "loss": 7.4531, "loss/crossentropy": 2.035849825292826, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.1935986878350377, "step": 17680 }, { "epoch": 0.44225, "grad_norm": 31.25, "grad_norm_var": 6.71015625, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.0254479318857195, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19959452152252197, "step": 17690 }, { "epoch": 0.4425, "grad_norm": 30.25, "grad_norm_var": 49.7041015625, "learning_rate": 0.0001, "loss": 7.487, "loss/crossentropy": 2.128339225053787, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.19117913916707038, "step": 17700 }, { "epoch": 0.44275, "grad_norm": 31.5, "grad_norm_var": 68.46451822916667, "learning_rate": 0.0001, "loss": 7.4991, "loss/crossentropy": 2.2027375906705857, "loss/hidden": 3.288671875, "loss/jsd": 0.0, "loss/logits": 0.18826800361275672, "step": 17710 }, { "epoch": 0.443, "grad_norm": 32.0, "grad_norm_var": 39.52682291666667, "learning_rate": 0.0001, "loss": 7.4788, "loss/crossentropy": 2.1423556417226792, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.19592436235398053, "step": 17720 }, { "epoch": 0.44325, "grad_norm": 32.75, "grad_norm_var": 25.954166666666666, "learning_rate": 0.0001, "loss": 7.6508, "loss/crossentropy": 2.1767683640122413, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.20649159606546164, "step": 17730 }, { "epoch": 0.4435, "grad_norm": 28.625, "grad_norm_var": 12.77265625, "learning_rate": 0.0001, "loss": 7.4281, "loss/crossentropy": 2.1573469266295433, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.2059380615130067, "step": 17740 }, { "epoch": 0.44375, "grad_norm": 31.25, "grad_norm_var": 12.7837890625, "learning_rate": 0.0001, "loss": 7.402, "loss/crossentropy": 2.121631011366844, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.18308492042124272, "step": 17750 }, { "epoch": 0.444, "grad_norm": 27.5, "grad_norm_var": 11.298958333333333, "learning_rate": 0.0001, "loss": 7.359, "loss/crossentropy": 2.1934143617749213, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.18884631469845772, "step": 17760 }, { "epoch": 0.44425, "grad_norm": 31.625, "grad_norm_var": 10.630989583333333, "learning_rate": 0.0001, "loss": 7.4511, "loss/crossentropy": 2.2182701379060745, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.20777408666908742, "step": 17770 }, { "epoch": 0.4445, "grad_norm": 41.5, "grad_norm_var": 15.221875, "learning_rate": 0.0001, "loss": 7.5054, "loss/crossentropy": 2.15917833596468, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.20331293437629938, "step": 17780 }, { "epoch": 0.44475, "grad_norm": 29.25, "grad_norm_var": 28.752018229166666, "learning_rate": 0.0001, "loss": 7.4046, "loss/crossentropy": 2.2085522964596747, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.20968901123851538, "step": 17790 }, { "epoch": 0.445, "grad_norm": 27.625, "grad_norm_var": 16.3994140625, "learning_rate": 0.0001, "loss": 7.3779, "loss/crossentropy": 2.088705601543188, "loss/hidden": 3.264453125, "loss/jsd": 0.0, "loss/logits": 0.17547062635421753, "step": 17800 }, { "epoch": 0.44525, "grad_norm": 33.5, "grad_norm_var": 15.054622395833333, "learning_rate": 0.0001, "loss": 7.3363, "loss/crossentropy": 2.0361307077109814, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.17406579544767736, "step": 17810 }, { "epoch": 0.4455, "grad_norm": 31.25, "grad_norm_var": 4.33515625, "learning_rate": 0.0001, "loss": 7.3829, "loss/crossentropy": 2.2537280052900313, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.1972663702443242, "step": 17820 }, { "epoch": 0.44575, "grad_norm": 31.125, "grad_norm_var": 5.854166666666667, "learning_rate": 0.0001, "loss": 7.4551, "loss/crossentropy": 2.1048080518841745, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.20005077831447124, "step": 17830 }, { "epoch": 0.446, "grad_norm": 35.75, "grad_norm_var": 5.8572265625, "learning_rate": 0.0001, "loss": 7.4493, "loss/crossentropy": 2.1436916798353196, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.1947880506515503, "step": 17840 }, { "epoch": 0.44625, "grad_norm": 32.0, "grad_norm_var": 6.080989583333333, "learning_rate": 0.0001, "loss": 7.4321, "loss/crossentropy": 2.186265227198601, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.18987231217324735, "step": 17850 }, { "epoch": 0.4465, "grad_norm": 29.625, "grad_norm_var": 4.404622395833333, "learning_rate": 0.0001, "loss": 7.4713, "loss/crossentropy": 2.1789949417114256, "loss/hidden": 3.226171875, "loss/jsd": 0.0, "loss/logits": 0.16992315649986267, "step": 17860 }, { "epoch": 0.44675, "grad_norm": 35.75, "grad_norm_var": 4.260416666666667, "learning_rate": 0.0001, "loss": 7.4499, "loss/crossentropy": 2.1180329382419587, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.20158529318869114, "step": 17870 }, { "epoch": 0.447, "grad_norm": 32.75, "grad_norm_var": 5.462434895833334, "learning_rate": 0.0001, "loss": 7.4072, "loss/crossentropy": 2.1144249960780144, "loss/hidden": 3.277734375, "loss/jsd": 0.0, "loss/logits": 0.17686952501535416, "step": 17880 }, { "epoch": 0.44725, "grad_norm": 30.5, "grad_norm_var": 3.7018229166666665, "learning_rate": 0.0001, "loss": 7.4559, "loss/crossentropy": 2.1003448024392126, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.19702099692076444, "step": 17890 }, { "epoch": 0.4475, "grad_norm": 33.25, "grad_norm_var": 31.665625, "learning_rate": 0.0001, "loss": 7.5754, "loss/crossentropy": 2.165144255757332, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.20823087207973004, "step": 17900 }, { "epoch": 0.44775, "grad_norm": 29.75, "grad_norm_var": 27.611458333333335, "learning_rate": 0.0001, "loss": 7.4127, "loss/crossentropy": 2.1247920095920563, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.18894042856991292, "step": 17910 }, { "epoch": 0.448, "grad_norm": 31.375, "grad_norm_var": 16.0291015625, "learning_rate": 0.0001, "loss": 7.5839, "loss/crossentropy": 2.2031987488269804, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.18946342077106237, "step": 17920 }, { "epoch": 0.44825, "grad_norm": 30.5, "grad_norm_var": 7.564518229166667, "learning_rate": 0.0001, "loss": 7.4259, "loss/crossentropy": 2.08001571893692, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.18759096097201108, "step": 17930 }, { "epoch": 0.4485, "grad_norm": 32.5, "grad_norm_var": 3.5405598958333333, "learning_rate": 0.0001, "loss": 7.4475, "loss/crossentropy": 2.091191267967224, "loss/hidden": 3.35078125, "loss/jsd": 0.0, "loss/logits": 0.192183992639184, "step": 17940 }, { "epoch": 0.44875, "grad_norm": 31.875, "grad_norm_var": 6.664322916666666, "learning_rate": 0.0001, "loss": 7.3588, "loss/crossentropy": 2.1699838608503343, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.1987156704068184, "step": 17950 }, { "epoch": 0.449, "grad_norm": 31.0, "grad_norm_var": 5.8509765625, "learning_rate": 0.0001, "loss": 7.532, "loss/crossentropy": 2.2819882690906526, "loss/hidden": 3.2984375, "loss/jsd": 0.0, "loss/logits": 0.1946489542722702, "step": 17960 }, { "epoch": 0.44925, "grad_norm": 35.25, "grad_norm_var": 4.6931640625, "learning_rate": 0.0001, "loss": 7.5208, "loss/crossentropy": 2.188323587179184, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.20090275909751654, "step": 17970 }, { "epoch": 0.4495, "grad_norm": 28.125, "grad_norm_var": 4.03125, "learning_rate": 0.0001, "loss": 7.498, "loss/crossentropy": 2.077894252538681, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18269995506852865, "step": 17980 }, { "epoch": 0.44975, "grad_norm": 29.875, "grad_norm_var": 33.514322916666664, "learning_rate": 0.0001, "loss": 7.4175, "loss/crossentropy": 2.0424763709306717, "loss/hidden": 3.2703125, "loss/jsd": 0.0, "loss/logits": 0.1786050383001566, "step": 17990 }, { "epoch": 0.45, "grad_norm": 35.5, "grad_norm_var": 5.978580729166667, "learning_rate": 0.0001, "loss": 7.4981, "loss/crossentropy": 2.0257104620337487, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.203111675940454, "step": 18000 }, { "epoch": 0.45025, "grad_norm": 27.125, "grad_norm_var": 8.2103515625, "learning_rate": 0.0001, "loss": 7.3876, "loss/crossentropy": 2.00185070335865, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.207043712772429, "step": 18010 }, { "epoch": 0.4505, "grad_norm": 30.5, "grad_norm_var": 7.8541015625, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.163762652873993, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.18520545940846206, "step": 18020 }, { "epoch": 0.45075, "grad_norm": 31.375, "grad_norm_var": 3.8343098958333335, "learning_rate": 0.0001, "loss": 7.3585, "loss/crossentropy": 2.0801485389471055, "loss/hidden": 3.296875, "loss/jsd": 0.0, "loss/logits": 0.19670285172760488, "step": 18030 }, { "epoch": 0.451, "grad_norm": 32.5, "grad_norm_var": 4.377018229166667, "learning_rate": 0.0001, "loss": 7.3916, "loss/crossentropy": 2.063150378316641, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.18180835004895926, "step": 18040 }, { "epoch": 0.45125, "grad_norm": 28.25, "grad_norm_var": 4.050455729166667, "learning_rate": 0.0001, "loss": 7.4234, "loss/crossentropy": 2.085699537396431, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.18879668060690163, "step": 18050 }, { "epoch": 0.4515, "grad_norm": 29.75, "grad_norm_var": 4.415559895833334, "learning_rate": 0.0001, "loss": 7.4605, "loss/crossentropy": 2.197605139017105, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.1884280290454626, "step": 18060 }, { "epoch": 0.45175, "grad_norm": 35.75, "grad_norm_var": 3.74765625, "learning_rate": 0.0001, "loss": 7.383, "loss/crossentropy": 2.1910621374845505, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.18678564615547658, "step": 18070 }, { "epoch": 0.452, "grad_norm": 31.0, "grad_norm_var": 2.5136418856872443e+18, "learning_rate": 0.0001, "loss": 7.4528, "loss/crossentropy": 2.188386806845665, "loss/hidden": 3.6078125, "loss/jsd": 0.0, "loss/logits": 0.18787388410419226, "step": 18080 }, { "epoch": 0.45225, "grad_norm": 34.5, "grad_norm_var": 5.255989583333333, "learning_rate": 0.0001, "loss": 7.5167, "loss/crossentropy": 2.086820271611214, "loss/hidden": 3.521484375, "loss/jsd": 0.0, "loss/logits": 0.21262760646641254, "step": 18090 }, { "epoch": 0.4525, "grad_norm": 30.25, "grad_norm_var": 4.459309895833333, "learning_rate": 0.0001, "loss": 7.4389, "loss/crossentropy": 2.156789070367813, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.2072874901816249, "step": 18100 }, { "epoch": 0.45275, "grad_norm": 30.25, "grad_norm_var": 2.32265625, "learning_rate": 0.0001, "loss": 7.4645, "loss/crossentropy": 2.280264839529991, "loss/hidden": 3.244140625, "loss/jsd": 0.0, "loss/logits": 0.18678643852472304, "step": 18110 }, { "epoch": 0.453, "grad_norm": 32.5, "grad_norm_var": 2.63515625, "learning_rate": 0.0001, "loss": 7.5154, "loss/crossentropy": 2.167169563472271, "loss/hidden": 3.316015625, "loss/jsd": 0.0, "loss/logits": 0.19211388044059277, "step": 18120 }, { "epoch": 0.45325, "grad_norm": 30.0, "grad_norm_var": 2.9061848958333334, "learning_rate": 0.0001, "loss": 7.564, "loss/crossentropy": 2.070133328437805, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.18248395789414645, "step": 18130 }, { "epoch": 0.4535, "grad_norm": 30.375, "grad_norm_var": 2.637955729166667, "learning_rate": 0.0001, "loss": 7.4519, "loss/crossentropy": 2.068394711613655, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.19283174835145472, "step": 18140 }, { "epoch": 0.45375, "grad_norm": 31.0, "grad_norm_var": 1.7754557291666666, "learning_rate": 0.0001, "loss": 7.4121, "loss/crossentropy": 2.0715004056692123, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.21265985630452633, "step": 18150 }, { "epoch": 0.454, "grad_norm": 30.25, "grad_norm_var": 2.4551432291666666, "learning_rate": 0.0001, "loss": 7.5993, "loss/crossentropy": 2.201769821345806, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19506389424204826, "step": 18160 }, { "epoch": 0.45425, "grad_norm": 30.0, "grad_norm_var": 1.2499348958333334, "learning_rate": 0.0001, "loss": 7.4775, "loss/crossentropy": 2.22137137055397, "loss/hidden": 3.260546875, "loss/jsd": 0.0, "loss/logits": 0.18893426284193993, "step": 18170 }, { "epoch": 0.4545, "grad_norm": 31.25, "grad_norm_var": 27.433268229166668, "learning_rate": 0.0001, "loss": 7.5006, "loss/crossentropy": 2.194701671600342, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.19335605893284083, "step": 18180 }, { "epoch": 0.45475, "grad_norm": 27.375, "grad_norm_var": 33.23587239583333, "learning_rate": 0.0001, "loss": 7.5273, "loss/crossentropy": 2.224129018187523, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.21178614553064107, "step": 18190 }, { "epoch": 0.455, "grad_norm": 31.125, "grad_norm_var": 9.046875, "learning_rate": 0.0001, "loss": 7.4934, "loss/crossentropy": 2.168444353342056, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.18636115789413452, "step": 18200 }, { "epoch": 0.45525, "grad_norm": 28.25, "grad_norm_var": 9.622330729166666, "learning_rate": 0.0001, "loss": 7.5636, "loss/crossentropy": 2.1226459950208665, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19698166605085135, "step": 18210 }, { "epoch": 0.4555, "grad_norm": 29.75, "grad_norm_var": 3.7848307291666665, "learning_rate": 0.0001, "loss": 7.4006, "loss/crossentropy": 2.248943442106247, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.1881408639252186, "step": 18220 }, { "epoch": 0.45575, "grad_norm": 32.75, "grad_norm_var": 3.893684895833333, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 2.150214585661888, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.2005108630284667, "step": 18230 }, { "epoch": 0.456, "grad_norm": 30.875, "grad_norm_var": 5.280989583333334, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.149556818604469, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19575091004371642, "step": 18240 }, { "epoch": 0.45625, "grad_norm": 31.0, "grad_norm_var": 3.73125, "learning_rate": 0.0001, "loss": 7.4075, "loss/crossentropy": 2.0766668774187567, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.20460515054874123, "step": 18250 }, { "epoch": 0.4565, "grad_norm": 37.0, "grad_norm_var": 4.530989583333334, "learning_rate": 0.0001, "loss": 7.3759, "loss/crossentropy": 2.04805474281311, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.18103201519697903, "step": 18260 }, { "epoch": 0.45675, "grad_norm": 35.25, "grad_norm_var": 70.37265625, "learning_rate": 0.0001, "loss": 7.5674, "loss/crossentropy": 2.1427599877119063, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2034947697073221, "step": 18270 }, { "epoch": 0.457, "grad_norm": 29.25, "grad_norm_var": 7.63515625, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.074523624777794, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19340968001633882, "step": 18280 }, { "epoch": 0.45725, "grad_norm": 31.625, "grad_norm_var": 4.620247395833333, "learning_rate": 0.0001, "loss": 7.4549, "loss/crossentropy": 2.224767416715622, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18859731387346984, "step": 18290 }, { "epoch": 0.4575, "grad_norm": 32.75, "grad_norm_var": 16.937955729166667, "learning_rate": 0.0001, "loss": 7.4625, "loss/crossentropy": 2.1193591982126234, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.2029947843402624, "step": 18300 }, { "epoch": 0.45775, "grad_norm": 39.5, "grad_norm_var": 18.048893229166666, "learning_rate": 0.0001, "loss": 7.4101, "loss/crossentropy": 2.1422934621572494, "loss/hidden": 3.354296875, "loss/jsd": 0.0, "loss/logits": 0.19083867389708759, "step": 18310 }, { "epoch": 0.458, "grad_norm": 28.125, "grad_norm_var": 8.314322916666667, "learning_rate": 0.0001, "loss": 7.4655, "loss/crossentropy": 2.0613017052412035, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.2031024197116494, "step": 18320 }, { "epoch": 0.45825, "grad_norm": 40.5, "grad_norm_var": 15.055989583333334, "learning_rate": 0.0001, "loss": 7.3438, "loss/crossentropy": 2.208176797628403, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.1937787916511297, "step": 18330 }, { "epoch": 0.4585, "grad_norm": 29.75, "grad_norm_var": 12.668489583333333, "learning_rate": 0.0001, "loss": 7.4472, "loss/crossentropy": 2.1267407298088075, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.19318134970963002, "step": 18340 }, { "epoch": 0.45875, "grad_norm": 29.375, "grad_norm_var": 10.7384765625, "learning_rate": 0.0001, "loss": 7.3673, "loss/crossentropy": 2.0367541924118995, "loss/hidden": 3.29375, "loss/jsd": 0.0, "loss/logits": 0.1756553279235959, "step": 18350 }, { "epoch": 0.459, "grad_norm": 33.5, "grad_norm_var": 11.2697265625, "learning_rate": 0.0001, "loss": 7.5204, "loss/crossentropy": 2.187627172470093, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.192278266325593, "step": 18360 }, { "epoch": 0.45925, "grad_norm": 35.0, "grad_norm_var": 49.330989583333334, "learning_rate": 0.0001, "loss": 7.3755, "loss/crossentropy": 2.0965137451887133, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.17834257557988167, "step": 18370 }, { "epoch": 0.4595, "grad_norm": 31.625, "grad_norm_var": 14.001822916666667, "learning_rate": 0.0001, "loss": 7.454, "loss/crossentropy": 2.1809972152113914, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.19421270694583653, "step": 18380 }, { "epoch": 0.45975, "grad_norm": 29.375, "grad_norm_var": 12.6072265625, "learning_rate": 0.0001, "loss": 7.4537, "loss/crossentropy": 2.0333224818110467, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.18099675104022026, "step": 18390 }, { "epoch": 0.46, "grad_norm": 31.875, "grad_norm_var": 7.2728515625, "learning_rate": 0.0001, "loss": 7.5258, "loss/crossentropy": 2.0772110372781754, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.2270737275481224, "step": 18400 }, { "epoch": 0.46025, "grad_norm": 28.75, "grad_norm_var": 5.723893229166666, "learning_rate": 0.0001, "loss": 7.3846, "loss/crossentropy": 2.092162123322487, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.2126236293464899, "step": 18410 }, { "epoch": 0.4605, "grad_norm": 30.5, "grad_norm_var": 3.8921223958333333, "learning_rate": 0.0001, "loss": 7.4763, "loss/crossentropy": 2.118947532773018, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.20705053452402353, "step": 18420 }, { "epoch": 0.46075, "grad_norm": 30.5, "grad_norm_var": 5.431184895833334, "learning_rate": 0.0001, "loss": 7.3772, "loss/crossentropy": 2.150417110323906, "loss/hidden": 3.244140625, "loss/jsd": 0.0, "loss/logits": 0.18132503498345615, "step": 18430 }, { "epoch": 0.461, "grad_norm": 31.125, "grad_norm_var": 5.370572916666666, "learning_rate": 0.0001, "loss": 7.4958, "loss/crossentropy": 2.0525425523519516, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.178656730055809, "step": 18440 }, { "epoch": 0.46125, "grad_norm": 34.75, "grad_norm_var": 4.55390625, "learning_rate": 0.0001, "loss": 7.4681, "loss/crossentropy": 2.1539366483688354, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.20078267380595208, "step": 18450 }, { "epoch": 0.4615, "grad_norm": 28.625, "grad_norm_var": 6.175, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.238241583108902, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.2120617249980569, "step": 18460 }, { "epoch": 0.46175, "grad_norm": 34.0, "grad_norm_var": 6.612955729166667, "learning_rate": 0.0001, "loss": 7.452, "loss/crossentropy": 2.061172368377447, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.1729869638569653, "step": 18470 }, { "epoch": 0.462, "grad_norm": 32.0, "grad_norm_var": 5.0072265625, "learning_rate": 0.0001, "loss": 7.4506, "loss/crossentropy": 2.1220480680465696, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.20648412220180035, "step": 18480 }, { "epoch": 0.46225, "grad_norm": 31.0, "grad_norm_var": 4.564322916666667, "learning_rate": 0.0001, "loss": 7.5257, "loss/crossentropy": 2.1619621008634566, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.22688564024865626, "step": 18490 }, { "epoch": 0.4625, "grad_norm": 32.0, "grad_norm_var": 5.120833333333334, "learning_rate": 0.0001, "loss": 7.3694, "loss/crossentropy": 2.264406609535217, "loss/hidden": 3.298046875, "loss/jsd": 0.0, "loss/logits": 0.18825284875929355, "step": 18500 }, { "epoch": 0.46275, "grad_norm": 28.125, "grad_norm_var": 5.265625, "learning_rate": 0.0001, "loss": 7.5201, "loss/crossentropy": 2.2461962729692457, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.21331474296748637, "step": 18510 }, { "epoch": 0.463, "grad_norm": 30.75, "grad_norm_var": 4.0728515625, "learning_rate": 0.0001, "loss": 7.4195, "loss/crossentropy": 1.9451055109500885, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.18808368891477584, "step": 18520 }, { "epoch": 0.46325, "grad_norm": 31.0, "grad_norm_var": 3.0927083333333334, "learning_rate": 0.0001, "loss": 7.4772, "loss/crossentropy": 2.110605201125145, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.19988117516040801, "step": 18530 }, { "epoch": 0.4635, "grad_norm": 30.25, "grad_norm_var": 1.9395182291666666, "learning_rate": 0.0001, "loss": 7.4795, "loss/crossentropy": 1.9315606564283372, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.17853899393230677, "step": 18540 }, { "epoch": 0.46375, "grad_norm": 34.5, "grad_norm_var": 26.724934895833332, "learning_rate": 0.0001, "loss": 7.5498, "loss/crossentropy": 2.1117912903428078, "loss/hidden": 3.54453125, "loss/jsd": 0.0, "loss/logits": 0.2193370433524251, "step": 18550 }, { "epoch": 0.464, "grad_norm": 29.25, "grad_norm_var": 27.626822916666665, "learning_rate": 0.0001, "loss": 7.4152, "loss/crossentropy": 2.057279494404793, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.18574363086372614, "step": 18560 }, { "epoch": 0.46425, "grad_norm": 32.75, "grad_norm_var": 3.3452473958333333, "learning_rate": 0.0001, "loss": 7.4341, "loss/crossentropy": 2.0958516538143157, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.18902888633310794, "step": 18570 }, { "epoch": 0.4645, "grad_norm": 30.375, "grad_norm_var": 1.6830729166666667, "learning_rate": 0.0001, "loss": 7.5493, "loss/crossentropy": 2.177600572258234, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.2118171840906143, "step": 18580 }, { "epoch": 0.46475, "grad_norm": 29.25, "grad_norm_var": 4.475455729166667, "learning_rate": 0.0001, "loss": 7.4925, "loss/crossentropy": 2.087388700246811, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19477690532803535, "step": 18590 }, { "epoch": 0.465, "grad_norm": 30.75, "grad_norm_var": 10.176822916666667, "learning_rate": 0.0001, "loss": 7.5546, "loss/crossentropy": 2.076089936494827, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.18885728288441897, "step": 18600 }, { "epoch": 0.46525, "grad_norm": 30.0, "grad_norm_var": 1.8504557291666666, "learning_rate": 0.0001, "loss": 7.4298, "loss/crossentropy": 2.2245807096362116, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.2102813944220543, "step": 18610 }, { "epoch": 0.4655, "grad_norm": 30.75, "grad_norm_var": 2.06015625, "learning_rate": 0.0001, "loss": 7.5033, "loss/crossentropy": 2.037870594859123, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.19886170476675033, "step": 18620 }, { "epoch": 0.46575, "grad_norm": 29.25, "grad_norm_var": 4.106705729166666, "learning_rate": 0.0001, "loss": 7.4177, "loss/crossentropy": 2.1507607758045197, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.20115135256201028, "step": 18630 }, { "epoch": 0.466, "grad_norm": 29.25, "grad_norm_var": 3.1455729166666666, "learning_rate": 0.0001, "loss": 7.4323, "loss/crossentropy": 2.208960172533989, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.21820493452250958, "step": 18640 }, { "epoch": 0.46625, "grad_norm": 29.875, "grad_norm_var": 12.363541666666666, "learning_rate": 0.0001, "loss": 7.5667, "loss/crossentropy": 2.062282791733742, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.2069956874474883, "step": 18650 }, { "epoch": 0.4665, "grad_norm": 31.5, "grad_norm_var": 3.501822916666667, "learning_rate": 0.0001, "loss": 7.4799, "loss/crossentropy": 2.155339244008064, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.20734498463571072, "step": 18660 }, { "epoch": 0.46675, "grad_norm": 35.5, "grad_norm_var": 10.81875, "learning_rate": 0.0001, "loss": 7.4112, "loss/crossentropy": 2.116558775305748, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.22728623170405626, "step": 18670 }, { "epoch": 0.467, "grad_norm": 48.0, "grad_norm_var": 112.26555989583333, "learning_rate": 0.0001, "loss": 7.5819, "loss/crossentropy": 2.1996765822172164, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.20961329583078622, "step": 18680 }, { "epoch": 0.46725, "grad_norm": 29.625, "grad_norm_var": 121.28932291666666, "learning_rate": 0.0001, "loss": 7.4733, "loss/crossentropy": 2.178187218308449, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.20361957363784314, "step": 18690 }, { "epoch": 0.4675, "grad_norm": 29.375, "grad_norm_var": 8.421809895833333, "learning_rate": 0.0001, "loss": 7.4133, "loss/crossentropy": 2.3110476464033125, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.19236205983906984, "step": 18700 }, { "epoch": 0.46775, "grad_norm": 40.0, "grad_norm_var": 2.408370244508014e+18, "learning_rate": 0.0001, "loss": 7.5826, "loss/crossentropy": 1.9763169527053832, "loss/hidden": 3.5734375, "loss/jsd": 0.0, "loss/logits": 0.19957512021064758, "step": 18710 }, { "epoch": 0.468, "grad_norm": 33.25, "grad_norm_var": 7.312239583333334, "learning_rate": 0.0001, "loss": 7.5, "loss/crossentropy": 2.1549572572112083, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.197417950630188, "step": 18720 }, { "epoch": 0.46825, "grad_norm": 30.25, "grad_norm_var": 2.8677083333333333, "learning_rate": 0.0001, "loss": 7.4042, "loss/crossentropy": 2.168655255436897, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.19954893179237843, "step": 18730 }, { "epoch": 0.4685, "grad_norm": 31.5, "grad_norm_var": 2.6478515625, "learning_rate": 0.0001, "loss": 7.5232, "loss/crossentropy": 2.11354903280735, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.1892693081870675, "step": 18740 }, { "epoch": 0.46875, "grad_norm": 32.5, "grad_norm_var": 2.4692057291666667, "learning_rate": 0.0001, "loss": 7.4898, "loss/crossentropy": 2.1568462401628494, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.20531034972518683, "step": 18750 }, { "epoch": 0.469, "grad_norm": 32.0, "grad_norm_var": 2.2113932291666667, "learning_rate": 0.0001, "loss": 7.6474, "loss/crossentropy": 2.1167890459299086, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.18697490729391575, "step": 18760 }, { "epoch": 0.46925, "grad_norm": 28.625, "grad_norm_var": 2.9041015625, "learning_rate": 0.0001, "loss": 7.3638, "loss/crossentropy": 2.065881370007992, "loss/hidden": 3.2921875, "loss/jsd": 0.0, "loss/logits": 0.18317838590592145, "step": 18770 }, { "epoch": 0.4695, "grad_norm": 30.375, "grad_norm_var": 3.733268229166667, "learning_rate": 0.0001, "loss": 7.4584, "loss/crossentropy": 2.2059927970170974, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19830067679286004, "step": 18780 }, { "epoch": 0.46975, "grad_norm": 29.5, "grad_norm_var": 3.7934895833333333, "learning_rate": 0.0001, "loss": 7.4786, "loss/crossentropy": 2.0999852418899536, "loss/hidden": 3.26484375, "loss/jsd": 0.0, "loss/logits": 0.1729890163987875, "step": 18790 }, { "epoch": 0.47, "grad_norm": 31.25, "grad_norm_var": 1.5400390625, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.015045887231827, "loss/hidden": 3.280859375, "loss/jsd": 0.0, "loss/logits": 0.1711340345442295, "step": 18800 }, { "epoch": 0.47025, "grad_norm": 30.5, "grad_norm_var": 2.2223307291666665, "learning_rate": 0.0001, "loss": 7.4839, "loss/crossentropy": 2.0983877390623094, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.18115116618573665, "step": 18810 }, { "epoch": 0.4705, "grad_norm": 30.0, "grad_norm_var": 78.5822265625, "learning_rate": 0.0001, "loss": 7.5142, "loss/crossentropy": 2.122063784301281, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.18946684896945953, "step": 18820 }, { "epoch": 0.47075, "grad_norm": 28.375, "grad_norm_var": 97.58639322916666, "learning_rate": 0.0001, "loss": 7.6061, "loss/crossentropy": 2.237548753619194, "loss/hidden": 3.2828125, "loss/jsd": 0.0, "loss/logits": 0.19166598822921516, "step": 18830 }, { "epoch": 0.471, "grad_norm": 30.25, "grad_norm_var": 9.527083333333334, "learning_rate": 0.0001, "loss": 7.4705, "loss/crossentropy": 2.081946098059416, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.18989304369315504, "step": 18840 }, { "epoch": 0.47125, "grad_norm": 31.125, "grad_norm_var": 8.614583333333334, "learning_rate": 0.0001, "loss": 7.3761, "loss/crossentropy": 2.2694112062454224, "loss/hidden": 3.291015625, "loss/jsd": 0.0, "loss/logits": 0.1898673100396991, "step": 18850 }, { "epoch": 0.4715, "grad_norm": 35.0, "grad_norm_var": 3.9619140625, "learning_rate": 0.0001, "loss": 7.5445, "loss/crossentropy": 2.1576954081654547, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19350357558578252, "step": 18860 }, { "epoch": 0.47175, "grad_norm": 33.5, "grad_norm_var": 13.674934895833333, "learning_rate": 0.0001, "loss": 7.498, "loss/crossentropy": 2.0821513146162034, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.19236762393265963, "step": 18870 }, { "epoch": 0.472, "grad_norm": 30.0, "grad_norm_var": 14.720572916666667, "learning_rate": 0.0001, "loss": 7.44, "loss/crossentropy": 2.097795248031616, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.1873439846560359, "step": 18880 }, { "epoch": 0.47225, "grad_norm": 50.5, "grad_norm_var": 28.486458333333335, "learning_rate": 0.0001, "loss": 7.5299, "loss/crossentropy": 2.1926951453089716, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.19845539480447769, "step": 18890 }, { "epoch": 0.4725, "grad_norm": 28.0, "grad_norm_var": 27.077018229166665, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.1055196806788445, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.1981551146134734, "step": 18900 }, { "epoch": 0.47275, "grad_norm": 28.375, "grad_norm_var": 2.6311848958333335, "learning_rate": 0.0001, "loss": 7.4186, "loss/crossentropy": 2.1644329100847246, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.1863600155338645, "step": 18910 }, { "epoch": 0.473, "grad_norm": 37.25, "grad_norm_var": 5.633333333333334, "learning_rate": 0.0001, "loss": 7.5188, "loss/crossentropy": 2.111950933933258, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.19227960985153913, "step": 18920 }, { "epoch": 0.47325, "grad_norm": 30.5, "grad_norm_var": 4.974934895833333, "learning_rate": 0.0001, "loss": 7.4124, "loss/crossentropy": 2.0366090714931486, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.20001390851102768, "step": 18930 }, { "epoch": 0.4735, "grad_norm": 28.5, "grad_norm_var": 4.0509765625, "learning_rate": 0.0001, "loss": 7.4834, "loss/crossentropy": 2.136207638680935, "loss/hidden": 3.386328125, "loss/jsd": 0.0, "loss/logits": 0.18433616552501916, "step": 18940 }, { "epoch": 0.47375, "grad_norm": 33.0, "grad_norm_var": 2.98515625, "learning_rate": 0.0001, "loss": 7.6108, "loss/crossentropy": 2.1925762712955477, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.2353795062750578, "step": 18950 }, { "epoch": 0.474, "grad_norm": 32.75, "grad_norm_var": 3.55390625, "learning_rate": 0.0001, "loss": 7.6327, "loss/crossentropy": 2.1619835913181307, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.20424192007631065, "step": 18960 }, { "epoch": 0.47425, "grad_norm": 32.5, "grad_norm_var": 2.7983723958333333, "learning_rate": 0.0001, "loss": 7.538, "loss/crossentropy": 2.1776172876358033, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.20042523313313723, "step": 18970 }, { "epoch": 0.4745, "grad_norm": 30.625, "grad_norm_var": 4.616080729166667, "learning_rate": 0.0001, "loss": 7.5789, "loss/crossentropy": 2.124316268414259, "loss/hidden": 3.326953125, "loss/jsd": 0.0, "loss/logits": 0.18864221088588237, "step": 18980 }, { "epoch": 0.47475, "grad_norm": 31.875, "grad_norm_var": 2.0697916666666667, "learning_rate": 0.0001, "loss": 7.574, "loss/crossentropy": 2.073023219406605, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.2027593031525612, "step": 18990 }, { "epoch": 0.475, "grad_norm": 32.25, "grad_norm_var": 2.664322916666667, "learning_rate": 0.0001, "loss": 7.4101, "loss/crossentropy": 2.0187594324350355, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.21064169742166997, "step": 19000 }, { "epoch": 0.47525, "grad_norm": 31.25, "grad_norm_var": 1.8197265625, "learning_rate": 0.0001, "loss": 7.4446, "loss/crossentropy": 2.2003646433353423, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.18558428715914488, "step": 19010 }, { "epoch": 0.4755, "grad_norm": 33.5, "grad_norm_var": 85.84348958333334, "learning_rate": 0.0001, "loss": 7.5855, "loss/crossentropy": 2.2160910099744795, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.20706005822867155, "step": 19020 }, { "epoch": 0.47575, "grad_norm": 27.875, "grad_norm_var": 6.8462890625, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.080396056175232, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.18421638533473014, "step": 19030 }, { "epoch": 0.476, "grad_norm": 30.0, "grad_norm_var": 2.349739583333333, "learning_rate": 0.0001, "loss": 7.4877, "loss/crossentropy": 2.0878879494965075, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.20337579548358917, "step": 19040 }, { "epoch": 0.47625, "grad_norm": 28.125, "grad_norm_var": 2.975455729166667, "learning_rate": 0.0001, "loss": 7.3789, "loss/crossentropy": 2.0772477351129055, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.1744394849985838, "step": 19050 }, { "epoch": 0.4765, "grad_norm": 30.25, "grad_norm_var": 24.359375, "learning_rate": 0.0001, "loss": 7.4862, "loss/crossentropy": 2.100569760799408, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.19498176760971547, "step": 19060 }, { "epoch": 0.47675, "grad_norm": 33.5, "grad_norm_var": 23.444791666666667, "learning_rate": 0.0001, "loss": 7.4441, "loss/crossentropy": 2.088040843605995, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.1772982817143202, "step": 19070 }, { "epoch": 0.477, "grad_norm": 29.625, "grad_norm_var": 4.158072916666667, "learning_rate": 0.0001, "loss": 7.4619, "loss/crossentropy": 2.176638293266296, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.19879398066550494, "step": 19080 }, { "epoch": 0.47725, "grad_norm": 31.25, "grad_norm_var": 4.5978515625, "learning_rate": 0.0001, "loss": 7.4313, "loss/crossentropy": 2.091726778447628, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.18576510678976774, "step": 19090 }, { "epoch": 0.4775, "grad_norm": 31.5, "grad_norm_var": 5.673372395833334, "learning_rate": 0.0001, "loss": 7.4909, "loss/crossentropy": 2.235911877453327, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.21205585449934006, "step": 19100 }, { "epoch": 0.47775, "grad_norm": 29.875, "grad_norm_var": 2.5973307291666665, "learning_rate": 0.0001, "loss": 7.394, "loss/crossentropy": 2.2365425676107407, "loss/hidden": 3.309765625, "loss/jsd": 0.0, "loss/logits": 0.19166039237752558, "step": 19110 }, { "epoch": 0.478, "grad_norm": 31.25, "grad_norm_var": 3.24375, "learning_rate": 0.0001, "loss": 7.5406, "loss/crossentropy": 2.108828788995743, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.18494639433920385, "step": 19120 }, { "epoch": 0.47825, "grad_norm": 29.25, "grad_norm_var": 6.721809895833333, "learning_rate": 0.0001, "loss": 7.5922, "loss/crossentropy": 2.148023870587349, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.19042143411934376, "step": 19130 }, { "epoch": 0.4785, "grad_norm": 28.5, "grad_norm_var": 2.0660807291666665, "learning_rate": 0.0001, "loss": 7.3712, "loss/crossentropy": 2.1488536432385446, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.19772066082805395, "step": 19140 }, { "epoch": 0.47875, "grad_norm": 32.5, "grad_norm_var": 3.3541015625, "learning_rate": 0.0001, "loss": 7.542, "loss/crossentropy": 2.0818497955799105, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.20143606336787342, "step": 19150 }, { "epoch": 0.479, "grad_norm": 30.5, "grad_norm_var": 2.6796223958333334, "learning_rate": 0.0001, "loss": 7.5226, "loss/crossentropy": 2.213534101843834, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.20065031982958317, "step": 19160 }, { "epoch": 0.47925, "grad_norm": 29.625, "grad_norm_var": 5.286393229166666, "learning_rate": 0.0001, "loss": 7.4779, "loss/crossentropy": 2.094844227284193, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19940570322796702, "step": 19170 }, { "epoch": 0.4795, "grad_norm": 29.5, "grad_norm_var": 2.856705729166667, "learning_rate": 0.0001, "loss": 7.4351, "loss/crossentropy": 2.0740013778209687, "loss/hidden": 3.31640625, "loss/jsd": 0.0, "loss/logits": 0.1735646616667509, "step": 19180 }, { "epoch": 0.47975, "grad_norm": 31.125, "grad_norm_var": 4.551497395833334, "learning_rate": 0.0001, "loss": 7.3991, "loss/crossentropy": 2.1303412437438967, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.20425421688705683, "step": 19190 }, { "epoch": 0.48, "grad_norm": 28.25, "grad_norm_var": 4.960416666666666, "learning_rate": 0.0001, "loss": 7.4677, "loss/crossentropy": 2.211458620429039, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.20086632966995238, "step": 19200 }, { "epoch": 0.48025, "grad_norm": 30.5, "grad_norm_var": 2.3056640625, "learning_rate": 0.0001, "loss": 7.4356, "loss/crossentropy": 2.098607787489891, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.18264676369726657, "step": 19210 }, { "epoch": 0.4805, "grad_norm": 30.375, "grad_norm_var": 2.2363932291666666, "learning_rate": 0.0001, "loss": 7.4141, "loss/crossentropy": 2.1743356212973595, "loss/hidden": 3.23203125, "loss/jsd": 0.0, "loss/logits": 0.18333771340548993, "step": 19220 }, { "epoch": 0.48075, "grad_norm": 29.625, "grad_norm_var": 79.7166015625, "learning_rate": 0.0001, "loss": 7.4269, "loss/crossentropy": 2.269570749998093, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.1983551263809204, "step": 19230 }, { "epoch": 0.481, "grad_norm": 30.25, "grad_norm_var": 3.2462890625, "learning_rate": 0.0001, "loss": 7.4745, "loss/crossentropy": 2.0477301344275474, "loss/hidden": 3.256640625, "loss/jsd": 0.0, "loss/logits": 0.18232697807252407, "step": 19240 }, { "epoch": 0.48125, "grad_norm": 34.0, "grad_norm_var": 4.055989583333333, "learning_rate": 0.0001, "loss": 7.4972, "loss/crossentropy": 2.0904338479042055, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.19662635941058398, "step": 19250 }, { "epoch": 0.4815, "grad_norm": 33.75, "grad_norm_var": 23.349739583333335, "learning_rate": 0.0001, "loss": 7.5738, "loss/crossentropy": 2.146761494874954, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.2060481060296297, "step": 19260 }, { "epoch": 0.48175, "grad_norm": 30.75, "grad_norm_var": 25.313997395833333, "learning_rate": 0.0001, "loss": 7.436, "loss/crossentropy": 2.136366602778435, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.18981903176754714, "step": 19270 }, { "epoch": 0.482, "grad_norm": 30.875, "grad_norm_var": 10.1134765625, "learning_rate": 0.0001, "loss": 7.5383, "loss/crossentropy": 2.155537909269333, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.19545716531574725, "step": 19280 }, { "epoch": 0.48225, "grad_norm": 35.0, "grad_norm_var": 22.241666666666667, "learning_rate": 0.0001, "loss": 7.4286, "loss/crossentropy": 2.2027793377637863, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.18559762574732303, "step": 19290 }, { "epoch": 0.4825, "grad_norm": 29.0, "grad_norm_var": 22.192122395833334, "learning_rate": 0.0001, "loss": 7.4078, "loss/crossentropy": 2.146567094326019, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.19877713918685913, "step": 19300 }, { "epoch": 0.48275, "grad_norm": 28.625, "grad_norm_var": 10.4509765625, "learning_rate": 0.0001, "loss": 7.4685, "loss/crossentropy": 2.1408349931240083, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.20110496152192353, "step": 19310 }, { "epoch": 0.483, "grad_norm": 27.625, "grad_norm_var": 38.0509765625, "learning_rate": 0.0001, "loss": 7.4049, "loss/crossentropy": 2.0760223269462585, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1911366345360875, "step": 19320 }, { "epoch": 0.48325, "grad_norm": 29.125, "grad_norm_var": 6.47265625, "learning_rate": 0.0001, "loss": 7.5924, "loss/crossentropy": 2.0804271429777144, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.20118813700973986, "step": 19330 }, { "epoch": 0.4835, "grad_norm": 31.375, "grad_norm_var": 5.389322916666667, "learning_rate": 0.0001, "loss": 7.4406, "loss/crossentropy": 2.1793388813734054, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.19482165575027466, "step": 19340 }, { "epoch": 0.48375, "grad_norm": 28.375, "grad_norm_var": 1.6296223958333333, "learning_rate": 0.0001, "loss": 7.3994, "loss/crossentropy": 2.037779450416565, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.19606408532708883, "step": 19350 }, { "epoch": 0.484, "grad_norm": 29.75, "grad_norm_var": 2.1025390625, "learning_rate": 0.0001, "loss": 7.5035, "loss/crossentropy": 2.108065330982208, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.1907705569639802, "step": 19360 }, { "epoch": 0.48425, "grad_norm": 30.875, "grad_norm_var": 1.8333333333333333, "learning_rate": 0.0001, "loss": 7.5038, "loss/crossentropy": 2.17846602499485, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.19125208593904972, "step": 19370 }, { "epoch": 0.4845, "grad_norm": 32.5, "grad_norm_var": 1.4442057291666666, "learning_rate": 0.0001, "loss": 7.5134, "loss/crossentropy": 2.139002203941345, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.20099837705492973, "step": 19380 }, { "epoch": 0.48475, "grad_norm": 30.5, "grad_norm_var": 2.810416666666667, "learning_rate": 0.0001, "loss": 7.5401, "loss/crossentropy": 2.199608436226845, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.20445926804095507, "step": 19390 }, { "epoch": 0.485, "grad_norm": 31.875, "grad_norm_var": 3.9160807291666666, "learning_rate": 0.0001, "loss": 7.5383, "loss/crossentropy": 2.12961964905262, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20509012490510942, "step": 19400 }, { "epoch": 0.48525, "grad_norm": 28.125, "grad_norm_var": 2.0052083333333335, "learning_rate": 0.0001, "loss": 7.507, "loss/crossentropy": 2.1178236939013004, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.2015313227660954, "step": 19410 }, { "epoch": 0.4855, "grad_norm": 31.25, "grad_norm_var": 4.2712890625, "learning_rate": 0.0001, "loss": 7.4174, "loss/crossentropy": 2.243345336616039, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.19893982484936715, "step": 19420 }, { "epoch": 0.48575, "grad_norm": 28.875, "grad_norm_var": 4.418684895833334, "learning_rate": 0.0001, "loss": 7.4455, "loss/crossentropy": 2.2348459392786024, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.19131067730486392, "step": 19430 }, { "epoch": 0.486, "grad_norm": 31.5, "grad_norm_var": 1.3369140625, "learning_rate": 0.0001, "loss": 7.4526, "loss/crossentropy": 2.0366022780537607, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.20100958244875072, "step": 19440 }, { "epoch": 0.48625, "grad_norm": 29.25, "grad_norm_var": 1.0559895833333333, "learning_rate": 0.0001, "loss": 7.5342, "loss/crossentropy": 2.1607364803552627, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.1935076829046011, "step": 19450 }, { "epoch": 0.4865, "grad_norm": 29.625, "grad_norm_var": 1.4410807291666667, "learning_rate": 0.0001, "loss": 7.5696, "loss/crossentropy": 2.075075288116932, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.20319369733333587, "step": 19460 }, { "epoch": 0.48675, "grad_norm": 29.5, "grad_norm_var": 1.5134765625, "learning_rate": 0.0001, "loss": 7.3679, "loss/crossentropy": 2.1516528606414793, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.18369753509759904, "step": 19470 }, { "epoch": 0.487, "grad_norm": 30.25, "grad_norm_var": 10.126822916666667, "learning_rate": 0.0001, "loss": 7.5018, "loss/crossentropy": 2.059604635834694, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.19641294488683342, "step": 19480 }, { "epoch": 0.48725, "grad_norm": 30.625, "grad_norm_var": 17.711458333333333, "learning_rate": 0.0001, "loss": 7.4581, "loss/crossentropy": 2.0580811988562346, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.1850988393649459, "step": 19490 }, { "epoch": 0.4875, "grad_norm": 31.25, "grad_norm_var": 18.8494140625, "learning_rate": 0.0001, "loss": 7.4531, "loss/crossentropy": 2.213153839111328, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.1889218719676137, "step": 19500 }, { "epoch": 0.48775, "grad_norm": 29.0, "grad_norm_var": 11.201041666666667, "learning_rate": 0.0001, "loss": 7.5007, "loss/crossentropy": 2.1869974330067636, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.21722153890877963, "step": 19510 }, { "epoch": 0.488, "grad_norm": 28.625, "grad_norm_var": 11.2244140625, "learning_rate": 0.0001, "loss": 7.5173, "loss/crossentropy": 2.3184584975242615, "loss/hidden": 3.28203125, "loss/jsd": 0.0, "loss/logits": 0.19578878507018088, "step": 19520 }, { "epoch": 0.48825, "grad_norm": 31.0, "grad_norm_var": 4.477083333333334, "learning_rate": 0.0001, "loss": 7.4547, "loss/crossentropy": 2.2146874606609344, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.18467055093497037, "step": 19530 }, { "epoch": 0.4885, "grad_norm": 34.0, "grad_norm_var": 1.90390625, "learning_rate": 0.0001, "loss": 7.5217, "loss/crossentropy": 2.121046042442322, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.19369308706372976, "step": 19540 }, { "epoch": 0.48875, "grad_norm": 30.75, "grad_norm_var": 2.3322265625, "learning_rate": 0.0001, "loss": 7.3847, "loss/crossentropy": 2.03256713449955, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.18962962226942182, "step": 19550 }, { "epoch": 0.489, "grad_norm": 28.625, "grad_norm_var": 1.7330729166666667, "learning_rate": 0.0001, "loss": 7.4378, "loss/crossentropy": 2.0885259807109833, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.1941137770190835, "step": 19560 }, { "epoch": 0.48925, "grad_norm": 30.25, "grad_norm_var": 2.25625, "learning_rate": 0.0001, "loss": 7.4224, "loss/crossentropy": 2.0018733590841293, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.18955971095710994, "step": 19570 }, { "epoch": 0.4895, "grad_norm": 32.5, "grad_norm_var": 1.94375, "learning_rate": 0.0001, "loss": 7.4489, "loss/crossentropy": 2.136213929951191, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.2040450617671013, "step": 19580 }, { "epoch": 0.48975, "grad_norm": 29.875, "grad_norm_var": 1.5559895833333333, "learning_rate": 0.0001, "loss": 7.4945, "loss/crossentropy": 2.049376104027033, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.1941303763538599, "step": 19590 }, { "epoch": 0.49, "grad_norm": 32.25, "grad_norm_var": 2.3643229166666666, "learning_rate": 0.0001, "loss": 7.4543, "loss/crossentropy": 2.189481696486473, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.19288850836455823, "step": 19600 }, { "epoch": 0.49025, "grad_norm": 31.125, "grad_norm_var": 1.6760416666666667, "learning_rate": 0.0001, "loss": 7.3698, "loss/crossentropy": 2.0647163964807986, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.1837151089683175, "step": 19610 }, { "epoch": 0.4905, "grad_norm": 31.25, "grad_norm_var": 1.3643229166666666, "learning_rate": 0.0001, "loss": 7.4861, "loss/crossentropy": 2.116924060881138, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.19573113061487674, "step": 19620 }, { "epoch": 0.49075, "grad_norm": 33.75, "grad_norm_var": 2.9848307291666667, "learning_rate": 0.0001, "loss": 7.4356, "loss/crossentropy": 2.0818495750427246, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.19164716321974992, "step": 19630 }, { "epoch": 0.491, "grad_norm": 29.875, "grad_norm_var": 2.0369140625, "learning_rate": 0.0001, "loss": 7.4685, "loss/crossentropy": 2.0525588363409044, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.19741135966032744, "step": 19640 }, { "epoch": 0.49125, "grad_norm": 29.125, "grad_norm_var": 3.5171223958333333, "learning_rate": 0.0001, "loss": 7.4433, "loss/crossentropy": 2.167221999168396, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19002752266824247, "step": 19650 }, { "epoch": 0.4915, "grad_norm": 35.5, "grad_norm_var": 4.730208333333334, "learning_rate": 0.0001, "loss": 7.5538, "loss/crossentropy": 2.125367084145546, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.2023206612095237, "step": 19660 }, { "epoch": 0.49175, "grad_norm": 30.0, "grad_norm_var": 4.536458333333333, "learning_rate": 0.0001, "loss": 7.5214, "loss/crossentropy": 2.2167422816157343, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.19829270094633103, "step": 19670 }, { "epoch": 0.492, "grad_norm": 31.625, "grad_norm_var": 2.948958333333333, "learning_rate": 0.0001, "loss": 7.4018, "loss/crossentropy": 2.16324964761734, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.1941527757793665, "step": 19680 }, { "epoch": 0.49225, "grad_norm": 29.875, "grad_norm_var": 3.678580729166667, "learning_rate": 0.0001, "loss": 7.4845, "loss/crossentropy": 2.104529711604118, "loss/hidden": 3.26796875, "loss/jsd": 0.0, "loss/logits": 0.18227193616330623, "step": 19690 }, { "epoch": 0.4925, "grad_norm": 29.875, "grad_norm_var": 1.3080729166666667, "learning_rate": 0.0001, "loss": 7.4912, "loss/crossentropy": 2.0313437558710574, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19764493741095066, "step": 19700 }, { "epoch": 0.49275, "grad_norm": 32.0, "grad_norm_var": 1.9884765625, "learning_rate": 0.0001, "loss": 7.4652, "loss/crossentropy": 2.1517415940761566, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.19597405511885882, "step": 19710 }, { "epoch": 0.493, "grad_norm": 31.375, "grad_norm_var": 2.220833333333333, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 2.1508257031440734, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.19206973258405924, "step": 19720 }, { "epoch": 0.49325, "grad_norm": 29.75, "grad_norm_var": 1.0479166666666666, "learning_rate": 0.0001, "loss": 7.5137, "loss/crossentropy": 2.2038475602865217, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.19588778279721736, "step": 19730 }, { "epoch": 0.4935, "grad_norm": 28.875, "grad_norm_var": 3.449934895833333, "learning_rate": 0.0001, "loss": 7.4967, "loss/crossentropy": 2.0373061880469323, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.20586073007434608, "step": 19740 }, { "epoch": 0.49375, "grad_norm": 31.0, "grad_norm_var": 4.093489583333334, "learning_rate": 0.0001, "loss": 7.5691, "loss/crossentropy": 2.1521503664553165, "loss/hidden": 3.29921875, "loss/jsd": 0.0, "loss/logits": 0.18537704246118664, "step": 19750 }, { "epoch": 0.494, "grad_norm": 29.75, "grad_norm_var": 1.8858723958333334, "learning_rate": 0.0001, "loss": 7.4954, "loss/crossentropy": 2.0740778051316737, "loss/hidden": 3.4859375, "loss/jsd": 0.0, "loss/logits": 0.20432667913846672, "step": 19760 }, { "epoch": 0.49425, "grad_norm": 34.25, "grad_norm_var": 2.8061848958333333, "learning_rate": 0.0001, "loss": 7.5234, "loss/crossentropy": 2.1025756657123567, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.1960667993873358, "step": 19770 }, { "epoch": 0.4945, "grad_norm": 34.0, "grad_norm_var": 3.1733723958333333, "learning_rate": 0.0001, "loss": 7.416, "loss/crossentropy": 2.0789266645908357, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.17625574823468923, "step": 19780 }, { "epoch": 0.49475, "grad_norm": 32.0, "grad_norm_var": 3.1150390625, "learning_rate": 0.0001, "loss": 7.5526, "loss/crossentropy": 2.1197325706481935, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.19719154592603444, "step": 19790 }, { "epoch": 0.495, "grad_norm": 28.75, "grad_norm_var": 1.7999348958333334, "learning_rate": 0.0001, "loss": 7.4495, "loss/crossentropy": 2.1562264621257783, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.2082188345491886, "step": 19800 }, { "epoch": 0.49525, "grad_norm": 30.125, "grad_norm_var": 2.2561848958333335, "learning_rate": 0.0001, "loss": 7.5052, "loss/crossentropy": 2.1419773191213607, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.19062785096466542, "step": 19810 }, { "epoch": 0.4955, "grad_norm": 28.125, "grad_norm_var": 2.42265625, "learning_rate": 0.0001, "loss": 7.3991, "loss/crossentropy": 2.144966857135296, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.19392486391589045, "step": 19820 }, { "epoch": 0.49575, "grad_norm": 30.875, "grad_norm_var": 1.3354166666666667, "learning_rate": 0.0001, "loss": 7.5459, "loss/crossentropy": 2.317871165275574, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.20522685311734676, "step": 19830 }, { "epoch": 0.496, "grad_norm": 35.25, "grad_norm_var": 8.90625, "learning_rate": 0.0001, "loss": 7.458, "loss/crossentropy": 2.106313969194889, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.18565387930721045, "step": 19840 }, { "epoch": 0.49625, "grad_norm": 32.25, "grad_norm_var": 10.2275390625, "learning_rate": 0.0001, "loss": 7.4956, "loss/crossentropy": 2.1663008645176887, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.19335117507725955, "step": 19850 }, { "epoch": 0.4965, "grad_norm": 29.875, "grad_norm_var": 0.6684895833333333, "learning_rate": 0.0001, "loss": 7.4191, "loss/crossentropy": 2.2045235991477967, "loss/hidden": 3.314453125, "loss/jsd": 0.0, "loss/logits": 0.18433119654655455, "step": 19860 }, { "epoch": 0.49675, "grad_norm": 31.75, "grad_norm_var": 6.604166666666667, "learning_rate": 0.0001, "loss": 7.614, "loss/crossentropy": 2.102213513851166, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.19189696963876485, "step": 19870 }, { "epoch": 0.497, "grad_norm": 29.75, "grad_norm_var": 3.9853515625, "learning_rate": 0.0001, "loss": 7.4145, "loss/crossentropy": 2.1133465990424156, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.19469942338764668, "step": 19880 }, { "epoch": 0.49725, "grad_norm": 31.75, "grad_norm_var": 5.577018229166667, "learning_rate": 0.0001, "loss": 7.3473, "loss/crossentropy": 2.1288680538535116, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.18879411723464729, "step": 19890 }, { "epoch": 0.4975, "grad_norm": 31.875, "grad_norm_var": 4.814518229166667, "learning_rate": 0.0001, "loss": 7.5347, "loss/crossentropy": 2.125280204415321, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.1846629874780774, "step": 19900 }, { "epoch": 0.49775, "grad_norm": 35.25, "grad_norm_var": 20.151822916666667, "learning_rate": 0.0001, "loss": 7.4436, "loss/crossentropy": 2.0771165072917936, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.18763564769178628, "step": 19910 }, { "epoch": 0.498, "grad_norm": 29.75, "grad_norm_var": 21.658333333333335, "learning_rate": 0.0001, "loss": 7.4811, "loss/crossentropy": 1.9571763649582863, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.1826960850507021, "step": 19920 }, { "epoch": 0.49825, "grad_norm": 31.5, "grad_norm_var": 5.590559895833334, "learning_rate": 0.0001, "loss": 7.3641, "loss/crossentropy": 2.099938778579235, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19888236112892627, "step": 19930 }, { "epoch": 0.4985, "grad_norm": 30.375, "grad_norm_var": 1.5239583333333333, "learning_rate": 0.0001, "loss": 7.5621, "loss/crossentropy": 2.146530794352293, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.19427433758974075, "step": 19940 }, { "epoch": 0.49875, "grad_norm": 30.875, "grad_norm_var": 1.5035807291666667, "learning_rate": 0.0001, "loss": 7.483, "loss/crossentropy": 2.141235402226448, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.21379205845296384, "step": 19950 }, { "epoch": 0.499, "grad_norm": 30.0, "grad_norm_var": 3.6572265625, "learning_rate": 0.0001, "loss": 7.4701, "loss/crossentropy": 2.2140693843364714, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.18884072303771973, "step": 19960 }, { "epoch": 0.49925, "grad_norm": 34.25, "grad_norm_var": 4.355143229166667, "learning_rate": 0.0001, "loss": 7.4807, "loss/crossentropy": 2.106860537827015, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.20601125992834568, "step": 19970 }, { "epoch": 0.4995, "grad_norm": 30.25, "grad_norm_var": 11.820572916666666, "learning_rate": 0.0001, "loss": 7.503, "loss/crossentropy": 2.1892090171575544, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.18834524974226952, "step": 19980 }, { "epoch": 0.49975, "grad_norm": 29.75, "grad_norm_var": 4.793489583333334, "learning_rate": 0.0001, "loss": 7.5279, "loss/crossentropy": 2.103451582789421, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.2049520380795002, "step": 19990 }, { "epoch": 0.5, "grad_norm": 32.25, "grad_norm_var": 28.2822265625, "learning_rate": 0.0001, "loss": 7.5062, "loss/crossentropy": 1.972146065533161, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.19209758015349507, "step": 20000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.715020064017613e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }