{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 2128.0, "learning_rate": 1.9e-05, "loss": 69.9557, "loss/crossentropy": 12.354743599891663, "loss/hidden": 18.71875, "loss/jsd": 5.161534905433655, "loss/logits": 0.0, "step": 10 }, { "epoch": 0.001, "grad_norm": 266.0, "grad_norm_var": 15343106.783333333, "learning_rate": 2.8000000000000003e-05, "loss": 52.9613, "loss/crossentropy": 9.517439389228821, "loss/hidden": 18.68125, "loss/jsd": 3.39926393032074, "loss/logits": 0.0, "step": 20 }, { "epoch": 0.0015, "grad_norm": 186.0, "grad_norm_var": 174925.440625, "learning_rate": 3.7e-05, "loss": 48.1973, "loss/crossentropy": 8.46514676809311, "loss/hidden": 18.065625, "loss/jsd": 2.9373991966247557, "loss/logits": 0.0, "step": 30 }, { "epoch": 0.002, "grad_norm": 532.0, "grad_norm_var": 39180.229166666664, "learning_rate": 4.600000000000001e-05, "loss": 45.9066, "loss/crossentropy": 8.040922927856446, "loss/hidden": 17.096875, "loss/jsd": 2.860607051849365, "loss/logits": 0.0, "step": 40 }, { "epoch": 0.0025, "grad_norm": 334.0, "grad_norm_var": 38047.8, "learning_rate": 5.500000000000001e-05, "loss": 41.576, "loss/crossentropy": 7.545825862884522, "loss/hidden": 16.803125, "loss/jsd": 2.476469251513481, "loss/logits": 0.0, "step": 50 }, { "epoch": 0.003, "grad_norm": 238.0, "grad_norm_var": 76798.25, "learning_rate": 6.400000000000001e-05, "loss": 36.7377, "loss/crossentropy": 6.656394875049591, "loss/hidden": 15.9859375, "loss/jsd": 2.101923054456711, "loss/logits": 0.0, "step": 60 }, { "epoch": 0.0035, "grad_norm": 221.0, "grad_norm_var": 72765.58333333333, "learning_rate": 7.3e-05, "loss": 28.2567, "loss/crossentropy": 5.261470526456833, "loss/hidden": 13.6265625, "loss/jsd": 1.4439617365598678, "loss/logits": 0.0, "step": 70 }, { "epoch": 0.004, "grad_norm": 185.0, "grad_norm_var": 16524.266666666666, "learning_rate": 8.200000000000001e-05, "loss": 19.3251, "loss/crossentropy": 4.03509070277214, "loss/hidden": 11.1984375, "loss/jsd": 0.8947193071246147, "loss/logits": 0.0, "step": 80 }, { "epoch": 0.0045, "grad_norm": 107.5, "grad_norm_var": 1300.190625, "learning_rate": 9.1e-05, "loss": 14.15, "loss/crossentropy": 3.2564123183488847, "loss/hidden": 9.371875, "loss/jsd": 0.4821927219629288, "loss/logits": 0.0, "step": 90 }, { "epoch": 0.005, "grad_norm": 113.0, "grad_norm_var": 747.6072916666667, "learning_rate": 0.0001, "loss": 12.3004, "loss/crossentropy": 2.9699372231960295, "loss/hidden": 8.3890625, "loss/jsd": 0.37094187960028646, "loss/logits": 0.0, "step": 100 }, { "epoch": 0.0055, "grad_norm": 143.0, "grad_norm_var": 1498.7333333333333, "learning_rate": 0.0001, "loss": 11.0558, "loss/crossentropy": 3.028834396600723, "loss/hidden": 7.6984375, "loss/jsd": 0.32162978053092955, "loss/logits": 0.0, "step": 110 }, { "epoch": 0.006, "grad_norm": 141.0, "grad_norm_var": 384.065625, "learning_rate": 0.0001, "loss": 10.3695, "loss/crossentropy": 2.8776101738214495, "loss/hidden": 7.54375, "loss/jsd": 0.31908423118293283, "loss/logits": 0.0, "step": 120 }, { "epoch": 0.0065, "grad_norm": 126.5, "grad_norm_var": 376.53229166666665, "learning_rate": 0.0001, "loss": 9.7913, "loss/crossentropy": 2.742277052998543, "loss/hidden": 7.16328125, "loss/jsd": 0.2711725488305092, "loss/logits": 0.0, "step": 130 }, { "epoch": 0.007, "grad_norm": 129.0, "grad_norm_var": 266.0, "learning_rate": 0.0001, "loss": 9.524, "loss/crossentropy": 2.4384234696626663, "loss/hidden": 6.9765625, "loss/jsd": 0.2616196651011705, "loss/logits": 0.0, "step": 140 }, { "epoch": 0.0075, "grad_norm": 100.5, "grad_norm_var": 409.03229166666665, "learning_rate": 0.0001, "loss": 9.1046, "loss/crossentropy": 2.8043846026062966, "loss/hidden": 6.6234375, "loss/jsd": 0.2574016904458404, "loss/logits": 0.0, "step": 150 }, { "epoch": 0.008, "grad_norm": 78.0, "grad_norm_var": 385.70729166666666, "learning_rate": 0.0001, "loss": 8.961, "loss/crossentropy": 2.6820163667201995, "loss/hidden": 6.44609375, "loss/jsd": 0.22497861441224815, "loss/logits": 0.0, "step": 160 }, { "epoch": 0.0085, "grad_norm": 73.0, "grad_norm_var": 180.540625, "learning_rate": 0.0001, "loss": 8.6716, "loss/crossentropy": 2.56088288128376, "loss/hidden": 6.525, "loss/jsd": 0.23445787131786347, "loss/logits": 0.0, "step": 170 }, { "epoch": 0.009, "grad_norm": 70.5, "grad_norm_var": 66.43333333333334, "learning_rate": 0.0001, "loss": 8.3449, "loss/crossentropy": 2.5659249514341353, "loss/hidden": 6.0875, "loss/jsd": 0.20521375369280576, "loss/logits": 0.0, "step": 180 }, { "epoch": 0.0095, "grad_norm": 59.0, "grad_norm_var": 125.99895833333333, "learning_rate": 0.0001, "loss": 8.2048, "loss/crossentropy": 2.4801410123705865, "loss/hidden": 5.96875, "loss/jsd": 0.20023126248270273, "loss/logits": 0.0, "step": 190 }, { "epoch": 0.01, "grad_norm": 95.5, "grad_norm_var": 151.215625, "learning_rate": 0.0001, "loss": 7.9327, "loss/crossentropy": 2.7575797021389006, "loss/hidden": 5.9078125, "loss/jsd": 0.21125836484134197, "loss/logits": 0.0, "step": 200 }, { "epoch": 0.0105, "grad_norm": 69.5, "grad_norm_var": 81.01666666666667, "learning_rate": 0.0001, "loss": 7.867, "loss/crossentropy": 2.584353247284889, "loss/hidden": 5.79140625, "loss/jsd": 0.18391123060137032, "loss/logits": 0.0, "step": 210 }, { "epoch": 0.011, "grad_norm": 67.0, "grad_norm_var": 111.115625, "learning_rate": 0.0001, "loss": 7.5262, "loss/crossentropy": 2.5395505383610724, "loss/hidden": 5.68125, "loss/jsd": 0.17292506210505962, "loss/logits": 0.0, "step": 220 }, { "epoch": 0.0115, "grad_norm": 80.0, "grad_norm_var": 114.725, "learning_rate": 0.0001, "loss": 7.553, "loss/crossentropy": 2.469125708937645, "loss/hidden": 5.62890625, "loss/jsd": 0.1715977793559432, "loss/logits": 0.0, "step": 230 }, { "epoch": 0.012, "grad_norm": 59.75, "grad_norm_var": 93.15, "learning_rate": 0.0001, "loss": 7.3673, "loss/crossentropy": 2.572914382815361, "loss/hidden": 5.51171875, "loss/jsd": 0.18267902322113513, "loss/logits": 0.0, "step": 240 }, { "epoch": 0.0125, "grad_norm": 53.5, "grad_norm_var": 99.42395833333333, "learning_rate": 0.0001, "loss": 7.3184, "loss/crossentropy": 2.6171721309423446, "loss/hidden": 5.53515625, "loss/jsd": 0.17945121377706527, "loss/logits": 0.0, "step": 250 }, { "epoch": 0.013, "grad_norm": 48.75, "grad_norm_var": 118.975, "learning_rate": 0.0001, "loss": 7.2085, "loss/crossentropy": 2.4379070818424227, "loss/hidden": 5.509375, "loss/jsd": 0.1914055148139596, "loss/logits": 0.0, "step": 260 }, { "epoch": 0.0135, "grad_norm": 67.5, "grad_norm_var": 102.2875, "learning_rate": 0.0001, "loss": 7.0206, "loss/crossentropy": 2.5107616782188416, "loss/hidden": 5.4359375, "loss/jsd": 0.19947240259498358, "loss/logits": 0.0, "step": 270 }, { "epoch": 0.014, "grad_norm": 67.5, "grad_norm_var": 136.27057291666668, "learning_rate": 0.0001, "loss": 6.9768, "loss/crossentropy": 2.4130793780088426, "loss/hidden": 5.28359375, "loss/jsd": 0.18424466587603092, "loss/logits": 0.0, "step": 280 }, { "epoch": 0.0145, "grad_norm": 62.0, "grad_norm_var": 98.04895833333333, "learning_rate": 0.0001, "loss": 6.8743, "loss/crossentropy": 2.382996806502342, "loss/hidden": 5.20625, "loss/jsd": 0.1648038787767291, "loss/logits": 0.0, "step": 290 }, { "epoch": 0.015, "grad_norm": 57.75, "grad_norm_var": 81.59895833333333, "learning_rate": 0.0001, "loss": 6.7946, "loss/crossentropy": 2.5844862312078476, "loss/hidden": 5.22265625, "loss/jsd": 0.1599080903455615, "loss/logits": 0.0, "step": 300 }, { "epoch": 0.0155, "grad_norm": 77.0, "grad_norm_var": 103.13229166666666, "learning_rate": 0.0001, "loss": 6.7739, "loss/crossentropy": 2.4337188243865966, "loss/hidden": 4.98046875, "loss/jsd": 0.14282424729317428, "loss/logits": 0.0, "step": 310 }, { "epoch": 0.016, "grad_norm": 51.75, "grad_norm_var": 154.47265625, "learning_rate": 0.0001, "loss": 6.6113, "loss/crossentropy": 2.516378104686737, "loss/hidden": 5.03828125, "loss/jsd": 0.1448629444465041, "loss/logits": 0.0, "step": 320 }, { "epoch": 0.0165, "grad_norm": 47.0, "grad_norm_var": 43.2625, "learning_rate": 0.0001, "loss": 6.4669, "loss/crossentropy": 2.5109775930643083, "loss/hidden": 4.9265625, "loss/jsd": 0.14978713616728784, "loss/logits": 0.0, "step": 330 }, { "epoch": 0.017, "grad_norm": 57.75, "grad_norm_var": 64.21087239583333, "learning_rate": 0.0001, "loss": 6.4713, "loss/crossentropy": 2.497659134864807, "loss/hidden": 4.91796875, "loss/jsd": 0.14760203529149293, "loss/logits": 0.0, "step": 340 }, { "epoch": 0.0175, "grad_norm": 52.5, "grad_norm_var": 120.26243489583334, "learning_rate": 0.0001, "loss": 6.4978, "loss/crossentropy": 2.402846799790859, "loss/hidden": 4.7796875, "loss/jsd": 0.13832223881036043, "loss/logits": 0.0, "step": 350 }, { "epoch": 0.018, "grad_norm": 46.5, "grad_norm_var": 21.149739583333332, "learning_rate": 0.0001, "loss": 6.3607, "loss/crossentropy": 2.3924304962158205, "loss/hidden": 4.9890625, "loss/jsd": 0.1568290738388896, "loss/logits": 0.0, "step": 360 }, { "epoch": 0.0185, "grad_norm": 44.75, "grad_norm_var": 49.326822916666664, "learning_rate": 0.0001, "loss": 6.3592, "loss/crossentropy": 2.4209784388542177, "loss/hidden": 4.89765625, "loss/jsd": 0.14134480394423007, "loss/logits": 0.0, "step": 370 }, { "epoch": 0.019, "grad_norm": 43.75, "grad_norm_var": 71.22057291666667, "learning_rate": 0.0001, "loss": 6.2124, "loss/crossentropy": 2.549247406423092, "loss/hidden": 4.7390625, "loss/jsd": 0.140831589885056, "loss/logits": 0.0, "step": 380 }, { "epoch": 0.0195, "grad_norm": 92.5, "grad_norm_var": 9.065077296740351e+17, "learning_rate": 0.0001, "loss": 6.2864, "loss/crossentropy": 2.4922619298100472, "loss/hidden": 4.87734375, "loss/jsd": 0.1634673684835434, "loss/logits": 0.0, "step": 390 }, { "epoch": 0.02, "grad_norm": 43.75, "grad_norm_var": 9.065077288409414e+17, "learning_rate": 0.0001, "loss": 6.2254, "loss/crossentropy": 2.469875320792198, "loss/hidden": 4.82265625, "loss/jsd": 0.1564602382481098, "loss/logits": 0.0, "step": 400 }, { "epoch": 0.0205, "grad_norm": 56.0, "grad_norm_var": 47.498958333333334, "learning_rate": 0.0001, "loss": 6.1795, "loss/crossentropy": 2.548477476835251, "loss/hidden": 4.75703125, "loss/jsd": 0.17199970744550228, "loss/logits": 0.0, "step": 410 }, { "epoch": 0.021, "grad_norm": 54.75, "grad_norm_var": 720.9768229166667, "learning_rate": 0.0001, "loss": 6.252, "loss/crossentropy": 2.479714798927307, "loss/hidden": 4.68828125, "loss/jsd": 0.1501935562118888, "loss/logits": 0.0, "step": 420 }, { "epoch": 0.0215, "grad_norm": 47.25, "grad_norm_var": 723.6166666666667, "learning_rate": 0.0001, "loss": 6.1002, "loss/crossentropy": 2.529230397939682, "loss/hidden": 4.7921875, "loss/jsd": 0.15877617206424474, "loss/logits": 0.0, "step": 430 }, { "epoch": 0.022, "grad_norm": 53.0, "grad_norm_var": 1.207597994464615e+18, "learning_rate": 0.0001, "loss": 6.0501, "loss/crossentropy": 2.2404126971960068, "loss/hidden": 4.54140625, "loss/jsd": 0.1322522010654211, "loss/logits": 0.0, "step": 440 }, { "epoch": 0.0225, "grad_norm": 3875536896.0, "grad_norm_var": 2.004372710541947e+18, "learning_rate": 0.0001, "loss": 6.1466, "loss/crossentropy": 2.430220237374306, "loss/hidden": 4.62109375, "loss/jsd": 0.14306345414370297, "loss/logits": 0.0, "step": 450 }, { "epoch": 0.023, "grad_norm": 43.0, "grad_norm_var": 9.387366184428504e+17, "learning_rate": 0.0001, "loss": 6.0412, "loss/crossentropy": 2.345375160872936, "loss/hidden": 4.620703125, "loss/jsd": 0.1385633122175932, "loss/logits": 0.0, "step": 460 }, { "epoch": 0.0235, "grad_norm": 42.75, "grad_norm_var": 21.448958333333334, "learning_rate": 0.0001, "loss": 5.9336, "loss/crossentropy": 2.425405339896679, "loss/hidden": 4.60546875, "loss/jsd": 0.13772829296067357, "loss/logits": 0.0, "step": 470 }, { "epoch": 0.024, "grad_norm": 39.75, "grad_norm_var": 10.595572916666667, "learning_rate": 0.0001, "loss": 5.9238, "loss/crossentropy": 2.1817762181162834, "loss/hidden": 4.540234375, "loss/jsd": 0.12882032115012407, "loss/logits": 0.0, "step": 480 }, { "epoch": 0.0245, "grad_norm": 33.75, "grad_norm_var": 21.001822916666665, "learning_rate": 0.0001, "loss": 6.0109, "loss/crossentropy": 2.3736354261636734, "loss/hidden": 4.64140625, "loss/jsd": 0.1405269218608737, "loss/logits": 0.0, "step": 490 }, { "epoch": 0.025, "grad_norm": 41.25, "grad_norm_var": 220.015625, "learning_rate": 0.0001, "loss": 5.9307, "loss/crossentropy": 2.5042927861213684, "loss/hidden": 4.7546875, "loss/jsd": 0.18516455199569465, "loss/logits": 0.0, "step": 500 }, { "epoch": 0.0255, "grad_norm": 41.0, "grad_norm_var": 16.257291666666667, "learning_rate": 0.0001, "loss": 5.9019, "loss/crossentropy": 2.526998797059059, "loss/hidden": 4.47109375, "loss/jsd": 0.13217656817287207, "loss/logits": 0.0, "step": 510 }, { "epoch": 0.026, "grad_norm": 32.25, "grad_norm_var": 16.782291666666666, "learning_rate": 0.0001, "loss": 5.8327, "loss/crossentropy": 2.316130298376083, "loss/hidden": 4.387109375, "loss/jsd": 0.12394356895238161, "loss/logits": 0.0, "step": 520 }, { "epoch": 0.0265, "grad_norm": 39.0, "grad_norm_var": 24.970833333333335, "learning_rate": 0.0001, "loss": 5.8284, "loss/crossentropy": 2.214504113793373, "loss/hidden": 4.623046875, "loss/jsd": 0.15524424342438578, "loss/logits": 0.0, "step": 530 }, { "epoch": 0.027, "grad_norm": 35.75, "grad_norm_var": 11.79375, "learning_rate": 0.0001, "loss": 5.7037, "loss/crossentropy": 2.336098350584507, "loss/hidden": 4.33515625, "loss/jsd": 0.12178284991532565, "loss/logits": 0.0, "step": 540 }, { "epoch": 0.0275, "grad_norm": 38.0, "grad_norm_var": 13.470768229166667, "learning_rate": 0.0001, "loss": 5.7146, "loss/crossentropy": 2.4750932276248934, "loss/hidden": 4.41953125, "loss/jsd": 0.12415571566671132, "loss/logits": 0.0, "step": 550 }, { "epoch": 0.028, "grad_norm": 37.0, "grad_norm_var": 15.298958333333333, "learning_rate": 0.0001, "loss": 5.6597, "loss/crossentropy": 2.360400839149952, "loss/hidden": 4.45546875, "loss/jsd": 0.1331789677962661, "loss/logits": 0.0, "step": 560 }, { "epoch": 0.0285, "grad_norm": 26.75, "grad_norm_var": 108.82057291666666, "learning_rate": 0.0001, "loss": 5.6905, "loss/crossentropy": 2.547207270562649, "loss/hidden": 4.413671875, "loss/jsd": 0.13257503397762777, "loss/logits": 0.0, "step": 570 }, { "epoch": 0.029, "grad_norm": 38.25, "grad_norm_var": 82.65149739583333, "learning_rate": 0.0001, "loss": 5.707, "loss/crossentropy": 2.4661644257605078, "loss/hidden": 4.43046875, "loss/jsd": 0.13218661015853286, "loss/logits": 0.0, "step": 580 }, { "epoch": 0.0295, "grad_norm": 33.0, "grad_norm_var": 14.656705729166667, "learning_rate": 0.0001, "loss": 5.6198, "loss/crossentropy": 2.3429581418633463, "loss/hidden": 4.35390625, "loss/jsd": 0.1255058040842414, "loss/logits": 0.0, "step": 590 }, { "epoch": 0.03, "grad_norm": 30.5, "grad_norm_var": 16.014518229166665, "learning_rate": 0.0001, "loss": 5.5969, "loss/crossentropy": 2.6043634325265885, "loss/hidden": 4.3796875, "loss/jsd": 0.1311176208779216, "loss/logits": 0.0, "step": 600 }, { "epoch": 0.0305, "grad_norm": 33.0, "grad_norm_var": 10.665625, "learning_rate": 0.0001, "loss": 5.5352, "loss/crossentropy": 2.4040530994534492, "loss/hidden": 4.219140625, "loss/jsd": 0.12296068714931607, "loss/logits": 0.0, "step": 610 }, { "epoch": 0.031, "grad_norm": 38.75, "grad_norm_var": 16.33125, "learning_rate": 0.0001, "loss": 5.4814, "loss/crossentropy": 2.390893703699112, "loss/hidden": 4.291796875, "loss/jsd": 0.11912889536470175, "loss/logits": 0.0, "step": 620 }, { "epoch": 0.0315, "grad_norm": 34.75, "grad_norm_var": 19.909375, "learning_rate": 0.0001, "loss": 5.5724, "loss/crossentropy": 2.5551778227090836, "loss/hidden": 4.251171875, "loss/jsd": 0.134556083381176, "loss/logits": 0.0, "step": 630 }, { "epoch": 0.032, "grad_norm": 33.0, "grad_norm_var": 1.2447526950627446e+18, "learning_rate": 0.0001, "loss": 5.6162, "loss/crossentropy": 2.4906763210892677, "loss/hidden": 4.20234375, "loss/jsd": 0.1178798858076334, "loss/logits": 0.0, "step": 640 }, { "epoch": 0.0325, "grad_norm": 29.875, "grad_norm_var": 1.2447526957786422e+18, "learning_rate": 0.0001, "loss": 5.5184, "loss/crossentropy": 2.437400442361832, "loss/hidden": 4.23046875, "loss/jsd": 0.12956738714128732, "loss/logits": 0.0, "step": 650 }, { "epoch": 0.033, "grad_norm": 33.0, "grad_norm_var": 27.3134765625, "learning_rate": 0.0001, "loss": 5.6369, "loss/crossentropy": 2.4849177479743956, "loss/hidden": 4.262109375, "loss/jsd": 0.12098300596699119, "loss/logits": 0.0, "step": 660 }, { "epoch": 0.0335, "grad_norm": 28.5, "grad_norm_var": 17.055989583333332, "learning_rate": 0.0001, "loss": 5.4991, "loss/crossentropy": 2.4364880681037904, "loss/hidden": 4.26171875, "loss/jsd": 0.12965436410158873, "loss/logits": 0.0, "step": 670 }, { "epoch": 0.034, "grad_norm": 28.375, "grad_norm_var": 19.137955729166666, "learning_rate": 0.0001, "loss": 5.5161, "loss/crossentropy": 2.392630486190319, "loss/hidden": 4.173828125, "loss/jsd": 0.11459105852991343, "loss/logits": 0.0, "step": 680 }, { "epoch": 0.0345, "grad_norm": 27.25, "grad_norm_var": 13.9322265625, "learning_rate": 0.0001, "loss": 5.4332, "loss/crossentropy": 2.344803684949875, "loss/hidden": 4.176953125, "loss/jsd": 0.11463690986856818, "loss/logits": 0.0, "step": 690 }, { "epoch": 0.035, "grad_norm": 34.75, "grad_norm_var": 15.854622395833333, "learning_rate": 0.0001, "loss": 5.5003, "loss/crossentropy": 2.395221236348152, "loss/hidden": 4.260546875, "loss/jsd": 0.1217193447984755, "loss/logits": 0.0, "step": 700 }, { "epoch": 0.0355, "grad_norm": 25.25, "grad_norm_var": 14.663541666666667, "learning_rate": 0.0001, "loss": 5.4171, "loss/crossentropy": 2.4193977400660516, "loss/hidden": 4.23828125, "loss/jsd": 0.12150606149807572, "loss/logits": 0.0, "step": 710 }, { "epoch": 0.036, "grad_norm": 26.875, "grad_norm_var": 13.142643229166667, "learning_rate": 0.0001, "loss": 5.3761, "loss/crossentropy": 2.2133478626608847, "loss/hidden": 4.141796875, "loss/jsd": 0.11149341901764273, "loss/logits": 0.0, "step": 720 }, { "epoch": 0.0365, "grad_norm": 34.25, "grad_norm_var": 14.420572916666666, "learning_rate": 0.0001, "loss": 5.3258, "loss/crossentropy": 2.3443893820047377, "loss/hidden": 4.090234375, "loss/jsd": 0.11677124733105301, "loss/logits": 0.0, "step": 730 }, { "epoch": 0.037, "grad_norm": 31.25, "grad_norm_var": 9.551822916666667, "learning_rate": 0.0001, "loss": 5.3054, "loss/crossentropy": 2.3357947677373887, "loss/hidden": 4.194140625, "loss/jsd": 0.12083362191915512, "loss/logits": 0.0, "step": 740 }, { "epoch": 0.0375, "grad_norm": 32.25, "grad_norm_var": 9.950455729166666, "learning_rate": 0.0001, "loss": 5.2645, "loss/crossentropy": 2.4039614737033843, "loss/hidden": 4.08671875, "loss/jsd": 0.1069810570217669, "loss/logits": 0.0, "step": 750 }, { "epoch": 0.038, "grad_norm": 24.0, "grad_norm_var": 1.1710062557908698e+18, "learning_rate": 0.0001, "loss": 5.3587, "loss/crossentropy": 2.4738259255886077, "loss/hidden": 4.209765625, "loss/jsd": 0.13927901685237884, "loss/logits": 0.0, "step": 760 }, { "epoch": 0.0385, "grad_norm": 29.125, "grad_norm_var": 1.1710062386255852e+18, "learning_rate": 0.0001, "loss": 5.3753, "loss/crossentropy": 2.2876866430044176, "loss/hidden": 4.1421875, "loss/jsd": 0.11211317665874958, "loss/logits": 0.0, "step": 770 }, { "epoch": 0.039, "grad_norm": 27.875, "grad_norm_var": 485.3025390625, "learning_rate": 0.0001, "loss": 5.2875, "loss/crossentropy": 2.3789359077811243, "loss/hidden": 4.13828125, "loss/jsd": 0.11359207816421986, "loss/logits": 0.0, "step": 780 }, { "epoch": 0.0395, "grad_norm": 21.875, "grad_norm_var": 19.980208333333334, "learning_rate": 0.0001, "loss": 5.2659, "loss/crossentropy": 2.4840095818042753, "loss/hidden": 4.076953125, "loss/jsd": 0.1078010268509388, "loss/logits": 0.0, "step": 790 }, { "epoch": 0.04, "grad_norm": 32.75, "grad_norm_var": 21.772330729166665, "learning_rate": 0.0001, "loss": 5.3525, "loss/crossentropy": 2.2179358512163163, "loss/hidden": 4.16796875, "loss/jsd": 0.11819018721580506, "loss/logits": 0.0, "step": 800 }, { "epoch": 0.0405, "grad_norm": 27.0, "grad_norm_var": 22.1884765625, "learning_rate": 0.0001, "loss": 5.3043, "loss/crossentropy": 2.4508845895528792, "loss/hidden": 4.133203125, "loss/jsd": 0.11473200833424926, "loss/logits": 0.0, "step": 810 }, { "epoch": 0.041, "grad_norm": 28.625, "grad_norm_var": 62.53515625, "learning_rate": 0.0001, "loss": 5.2633, "loss/crossentropy": 2.5463142573833464, "loss/hidden": 4.076171875, "loss/jsd": 0.12316551432013512, "loss/logits": 0.0, "step": 820 }, { "epoch": 0.0415, "grad_norm": 26.625, "grad_norm_var": 29.2150390625, "learning_rate": 0.0001, "loss": 5.2498, "loss/crossentropy": 2.379361332952976, "loss/hidden": 4.125, "loss/jsd": 0.11994905360043048, "loss/logits": 0.0, "step": 830 }, { "epoch": 0.042, "grad_norm": 27.75, "grad_norm_var": 37.1197265625, "learning_rate": 0.0001, "loss": 5.25, "loss/crossentropy": 2.448214793205261, "loss/hidden": 4.233203125, "loss/jsd": 0.13263647919520735, "loss/logits": 0.0, "step": 840 }, { "epoch": 0.0425, "grad_norm": 26.25, "grad_norm_var": 13.433072916666667, "learning_rate": 0.0001, "loss": 5.1491, "loss/crossentropy": 2.4302526518702505, "loss/hidden": 4.12578125, "loss/jsd": 0.11334973787888884, "loss/logits": 0.0, "step": 850 }, { "epoch": 0.043, "grad_norm": 23.75, "grad_norm_var": 7.470572916666667, "learning_rate": 0.0001, "loss": 5.1671, "loss/crossentropy": 2.415765553712845, "loss/hidden": 4.11328125, "loss/jsd": 0.11990332859568298, "loss/logits": 0.0, "step": 860 }, { "epoch": 0.0435, "grad_norm": 25.5, "grad_norm_var": 6.077083333333333, "learning_rate": 0.0001, "loss": 5.1279, "loss/crossentropy": 2.3868868976831434, "loss/hidden": 4.0671875, "loss/jsd": 0.11438164403662085, "loss/logits": 0.0, "step": 870 }, { "epoch": 0.044, "grad_norm": 25.0, "grad_norm_var": 48.50416666666667, "learning_rate": 0.0001, "loss": 5.18, "loss/crossentropy": 2.367817610502243, "loss/hidden": 4.136328125, "loss/jsd": 0.12616985198110342, "loss/logits": 0.0, "step": 880 }, { "epoch": 0.0445, "grad_norm": 23.625, "grad_norm_var": 52.3375, "learning_rate": 0.0001, "loss": 5.1786, "loss/crossentropy": 2.4342163532972334, "loss/hidden": 4.0125, "loss/jsd": 0.12039547078311444, "loss/logits": 0.0, "step": 890 }, { "epoch": 0.045, "grad_norm": 28.125, "grad_norm_var": 6.708268229166666, "learning_rate": 0.0001, "loss": 5.1451, "loss/crossentropy": 2.4633941307663916, "loss/hidden": 4.08125, "loss/jsd": 0.11877955347299576, "loss/logits": 0.0, "step": 900 }, { "epoch": 0.0455, "grad_norm": 28.5, "grad_norm_var": 8.4603515625, "learning_rate": 0.0001, "loss": 5.1919, "loss/crossentropy": 2.3779468327760696, "loss/hidden": 4.058984375, "loss/jsd": 0.11588607728481293, "loss/logits": 0.0, "step": 910 }, { "epoch": 0.046, "grad_norm": 38.25, "grad_norm_var": 59.06295572916667, "learning_rate": 0.0001, "loss": 5.2033, "loss/crossentropy": 2.4956902295351027, "loss/hidden": 4.107421875, "loss/jsd": 0.11758697256445885, "loss/logits": 0.0, "step": 920 }, { "epoch": 0.0465, "grad_norm": 22.625, "grad_norm_var": 21.1744140625, "learning_rate": 0.0001, "loss": 5.1248, "loss/crossentropy": 2.4070512309670447, "loss/hidden": 4.123828125, "loss/jsd": 0.12089485572651029, "loss/logits": 0.0, "step": 930 }, { "epoch": 0.047, "grad_norm": 47.75, "grad_norm_var": 164.896875, "learning_rate": 0.0001, "loss": 5.1273, "loss/crossentropy": 2.1984025448560716, "loss/hidden": 3.83359375, "loss/jsd": 0.10510765034705401, "loss/logits": 0.0, "step": 940 }, { "epoch": 0.0475, "grad_norm": 24.125, "grad_norm_var": 171.48326822916667, "learning_rate": 0.0001, "loss": 5.0933, "loss/crossentropy": 2.408414696156979, "loss/hidden": 3.9015625, "loss/jsd": 0.09813609030097722, "loss/logits": 0.0, "step": 950 }, { "epoch": 0.048, "grad_norm": 25.5, "grad_norm_var": 10.351041666666667, "learning_rate": 0.0001, "loss": 5.0887, "loss/crossentropy": 2.3635219663381575, "loss/hidden": 3.983984375, "loss/jsd": 0.10892721712589264, "loss/logits": 0.0, "step": 960 }, { "epoch": 0.0485, "grad_norm": 23.25, "grad_norm_var": 15.676497395833334, "learning_rate": 0.0001, "loss": 5.0293, "loss/crossentropy": 2.182341808080673, "loss/hidden": 3.92421875, "loss/jsd": 0.10646048728376627, "loss/logits": 0.0, "step": 970 }, { "epoch": 0.049, "grad_norm": 26.625, "grad_norm_var": 7.992708333333334, "learning_rate": 0.0001, "loss": 5.1407, "loss/crossentropy": 2.4966017305850983, "loss/hidden": 3.909375, "loss/jsd": 0.11931864526122808, "loss/logits": 0.0, "step": 980 }, { "epoch": 0.0495, "grad_norm": 25.125, "grad_norm_var": 915.2077473958333, "learning_rate": 0.0001, "loss": 5.1799, "loss/crossentropy": 2.3614319562911987, "loss/hidden": 3.95390625, "loss/jsd": 0.10783975422382355, "loss/logits": 0.0, "step": 990 }, { "epoch": 0.05, "grad_norm": 24.875, "grad_norm_var": 862.96640625, "learning_rate": 0.0001, "loss": 5.1175, "loss/crossentropy": 2.3259101063013077, "loss/hidden": 4.09140625, "loss/jsd": 0.11582606900483369, "loss/logits": 0.0, "step": 1000 }, { "epoch": 0.0505, "grad_norm": 27.0, "grad_norm_var": 36.96243489583333, "learning_rate": 0.0001, "loss": 5.1445, "loss/crossentropy": 2.4153922617435457, "loss/hidden": 4.044140625, "loss/jsd": 0.11763136927038431, "loss/logits": 0.0, "step": 1010 }, { "epoch": 0.051, "grad_norm": 27.0, "grad_norm_var": 11.583333333333334, "learning_rate": 0.0001, "loss": 5.0695, "loss/crossentropy": 2.287649059295654, "loss/hidden": 3.97578125, "loss/jsd": 0.10912037892267108, "loss/logits": 0.0, "step": 1020 }, { "epoch": 0.0515, "grad_norm": 34.25, "grad_norm_var": 598.6910807291666, "learning_rate": 0.0001, "loss": 5.1531, "loss/crossentropy": 2.5355153501033785, "loss/hidden": 3.972265625, "loss/jsd": 0.11578338220715523, "loss/logits": 0.0, "step": 1030 }, { "epoch": 0.052, "grad_norm": 23.0, "grad_norm_var": 149.62389322916667, "learning_rate": 0.0001, "loss": 5.1453, "loss/crossentropy": 2.328887623548508, "loss/hidden": 3.84609375, "loss/jsd": 0.1067446961067617, "loss/logits": 0.0, "step": 1040 }, { "epoch": 0.0525, "grad_norm": 22.625, "grad_norm_var": 23.629166666666666, "learning_rate": 0.0001, "loss": 5.0775, "loss/crossentropy": 2.3245414569973946, "loss/hidden": 3.950390625, "loss/jsd": 0.11564150396734477, "loss/logits": 0.0, "step": 1050 }, { "epoch": 0.053, "grad_norm": 29.375, "grad_norm_var": 22.822330729166666, "learning_rate": 0.0001, "loss": 4.929, "loss/crossentropy": 2.5518812984228134, "loss/hidden": 3.76796875, "loss/jsd": 0.10029144948348404, "loss/logits": 0.0, "step": 1060 }, { "epoch": 0.0535, "grad_norm": 22.875, "grad_norm_var": 27.373372395833332, "learning_rate": 0.0001, "loss": 5.1682, "loss/crossentropy": 2.3814490526914596, "loss/hidden": 4.084765625, "loss/jsd": 0.13794842325150966, "loss/logits": 0.0, "step": 1070 }, { "epoch": 0.054, "grad_norm": 30.375, "grad_norm_var": 25.968684895833334, "learning_rate": 0.0001, "loss": 5.0446, "loss/crossentropy": 2.336636045575142, "loss/hidden": 3.98984375, "loss/jsd": 0.11006514001637697, "loss/logits": 0.0, "step": 1080 }, { "epoch": 0.0545, "grad_norm": 25.5, "grad_norm_var": 32.0447265625, "learning_rate": 0.0001, "loss": 5.0339, "loss/crossentropy": 2.2337013885378836, "loss/hidden": 3.945703125, "loss/jsd": 0.11723029632121325, "loss/logits": 0.0, "step": 1090 }, { "epoch": 0.055, "grad_norm": 25.375, "grad_norm_var": 102.66432291666666, "learning_rate": 0.0001, "loss": 5.0155, "loss/crossentropy": 2.443159765005112, "loss/hidden": 4.062890625, "loss/jsd": 0.11166490567848086, "loss/logits": 0.0, "step": 1100 }, { "epoch": 0.0555, "grad_norm": 26.25, "grad_norm_var": 12.558072916666667, "learning_rate": 0.0001, "loss": 5.0531, "loss/crossentropy": 2.2338882118463514, "loss/hidden": 4.025, "loss/jsd": 0.11465255348011852, "loss/logits": 0.0, "step": 1110 }, { "epoch": 0.056, "grad_norm": 25.875, "grad_norm_var": 8.347916666666666, "learning_rate": 0.0001, "loss": 5.0976, "loss/crossentropy": 2.3596479177474974, "loss/hidden": 3.940625, "loss/jsd": 0.11759824641048908, "loss/logits": 0.0, "step": 1120 }, { "epoch": 0.0565, "grad_norm": 30.25, "grad_norm_var": 188.1353515625, "learning_rate": 0.0001, "loss": 5.0785, "loss/crossentropy": 2.3698789328336716, "loss/hidden": 3.962109375, "loss/jsd": 0.1172801936045289, "loss/logits": 0.0, "step": 1130 }, { "epoch": 0.057, "grad_norm": 26.375, "grad_norm_var": 185.04765625, "learning_rate": 0.0001, "loss": 5.0927, "loss/crossentropy": 2.3481896728277207, "loss/hidden": 3.9609375, "loss/jsd": 0.10608052760362625, "loss/logits": 0.0, "step": 1140 }, { "epoch": 0.0575, "grad_norm": 22.875, "grad_norm_var": 125.32233072916667, "learning_rate": 0.0001, "loss": 5.0263, "loss/crossentropy": 2.301522643119097, "loss/hidden": 3.8, "loss/jsd": 0.10154257528483868, "loss/logits": 0.0, "step": 1150 }, { "epoch": 0.058, "grad_norm": 27.625, "grad_norm_var": 81.21432291666666, "learning_rate": 0.0001, "loss": 5.1087, "loss/crossentropy": 2.3300373941659926, "loss/hidden": 3.923828125, "loss/jsd": 0.10997985871508717, "loss/logits": 0.0, "step": 1160 }, { "epoch": 0.0585, "grad_norm": 22.0, "grad_norm_var": 37.805989583333336, "learning_rate": 0.0001, "loss": 4.9669, "loss/crossentropy": 2.3570085942745207, "loss/hidden": 3.903125, "loss/jsd": 0.12716795089654626, "loss/logits": 0.0, "step": 1170 }, { "epoch": 0.059, "grad_norm": 28.25, "grad_norm_var": 6.526822916666666, "learning_rate": 0.0001, "loss": 4.8827, "loss/crossentropy": 2.4714103788137436, "loss/hidden": 3.878125, "loss/jsd": 0.11338211484253406, "loss/logits": 0.0, "step": 1180 }, { "epoch": 0.0595, "grad_norm": 25.0, "grad_norm_var": 1.0217717449682671e+18, "learning_rate": 0.0001, "loss": 5.0544, "loss/crossentropy": 2.170953643321991, "loss/hidden": 3.91875, "loss/jsd": 0.11225487310439348, "loss/logits": 0.0, "step": 1190 }, { "epoch": 0.06, "grad_norm": 22.125, "grad_norm_var": 22.508072916666666, "learning_rate": 0.0001, "loss": 4.8895, "loss/crossentropy": 2.4479696050286295, "loss/hidden": 3.896484375, "loss/jsd": 0.10494228331372142, "loss/logits": 0.0, "step": 1200 }, { "epoch": 0.0605, "grad_norm": 22.25, "grad_norm_var": 19.080989583333334, "learning_rate": 0.0001, "loss": 4.8699, "loss/crossentropy": 2.3343143433332445, "loss/hidden": 3.787109375, "loss/jsd": 0.10432742889970541, "loss/logits": 0.0, "step": 1210 }, { "epoch": 0.061, "grad_norm": 19.0, "grad_norm_var": 7.299934895833333, "learning_rate": 0.0001, "loss": 4.9113, "loss/crossentropy": 2.2152185067534447, "loss/hidden": 3.838671875, "loss/jsd": 0.10314544131979346, "loss/logits": 0.0, "step": 1220 }, { "epoch": 0.0615, "grad_norm": 25.125, "grad_norm_var": 8.783333333333333, "learning_rate": 0.0001, "loss": 4.8793, "loss/crossentropy": 2.3982258841395376, "loss/hidden": 3.765625, "loss/jsd": 0.1033841515891254, "loss/logits": 0.0, "step": 1230 }, { "epoch": 0.062, "grad_norm": 24.25, "grad_norm_var": 8.654166666666667, "learning_rate": 0.0001, "loss": 4.936, "loss/crossentropy": 2.3861924752593042, "loss/hidden": 3.990625, "loss/jsd": 0.1316368247382343, "loss/logits": 0.0, "step": 1240 }, { "epoch": 0.0625, "grad_norm": 27.625, "grad_norm_var": 18.838997395833335, "learning_rate": 0.0001, "loss": 5.0574, "loss/crossentropy": 2.3481432244181635, "loss/hidden": 3.886328125, "loss/jsd": 0.12455893289297819, "loss/logits": 0.0, "step": 1250 }, { "epoch": 0.063, "grad_norm": 24.875, "grad_norm_var": 741.0330729166667, "learning_rate": 0.0001, "loss": 5.054, "loss/crossentropy": 2.50970872938633, "loss/hidden": 3.89375, "loss/jsd": 0.11707814577966928, "loss/logits": 0.0, "step": 1260 }, { "epoch": 0.0635, "grad_norm": 22.25, "grad_norm_var": 766.54140625, "learning_rate": 0.0001, "loss": 4.9292, "loss/crossentropy": 2.214522284269333, "loss/hidden": 3.817578125, "loss/jsd": 0.09662074805237353, "loss/logits": 0.0, "step": 1270 }, { "epoch": 0.064, "grad_norm": 27.125, "grad_norm_var": 1.2075980051835433e+18, "learning_rate": 0.0001, "loss": 4.9727, "loss/crossentropy": 2.5177758872509, "loss/hidden": 3.872265625, "loss/jsd": 0.12324077049270273, "loss/logits": 0.0, "step": 1280 }, { "epoch": 0.0645, "grad_norm": 26.25, "grad_norm_var": 4.482291666666667, "learning_rate": 0.0001, "loss": 4.8651, "loss/crossentropy": 2.4133356541395186, "loss/hidden": 3.8109375, "loss/jsd": 0.10085376175120472, "loss/logits": 0.0, "step": 1290 }, { "epoch": 0.065, "grad_norm": 20.875, "grad_norm_var": 4.7875, "learning_rate": 0.0001, "loss": 4.8874, "loss/crossentropy": 2.211686734855175, "loss/hidden": 3.82578125, "loss/jsd": 0.10324386316351593, "loss/logits": 0.0, "step": 1300 }, { "epoch": 0.0655, "grad_norm": 25.125, "grad_norm_var": 39.35045572916667, "learning_rate": 0.0001, "loss": 4.9265, "loss/crossentropy": 2.386268785595894, "loss/hidden": 3.837890625, "loss/jsd": 0.11206256924197078, "loss/logits": 0.0, "step": 1310 }, { "epoch": 0.066, "grad_norm": 25.5, "grad_norm_var": 39.68020833333333, "learning_rate": 0.0001, "loss": 4.9719, "loss/crossentropy": 2.3294328808784486, "loss/hidden": 3.8640625, "loss/jsd": 0.11526230238378048, "loss/logits": 0.0, "step": 1320 }, { "epoch": 0.0665, "grad_norm": 23.375, "grad_norm_var": 47.66295572916667, "learning_rate": 0.0001, "loss": 4.922, "loss/crossentropy": 2.38586545586586, "loss/hidden": 3.7203125, "loss/jsd": 0.09609230635687709, "loss/logits": 0.0, "step": 1330 }, { "epoch": 0.067, "grad_norm": 25.125, "grad_norm_var": 36.25305989583333, "learning_rate": 0.0001, "loss": 4.9463, "loss/crossentropy": 2.4498814970254896, "loss/hidden": 3.84296875, "loss/jsd": 0.10662997653707862, "loss/logits": 0.0, "step": 1340 }, { "epoch": 0.0675, "grad_norm": 26.625, "grad_norm_var": 34.154622395833336, "learning_rate": 0.0001, "loss": 4.89, "loss/crossentropy": 2.3147580534219743, "loss/hidden": 3.840625, "loss/jsd": 0.10548559352755546, "loss/logits": 0.0, "step": 1350 }, { "epoch": 0.068, "grad_norm": 20.25, "grad_norm_var": 5.3369140625, "learning_rate": 0.0001, "loss": 4.9495, "loss/crossentropy": 2.2381860077381135, "loss/hidden": 3.936328125, "loss/jsd": 0.1049613301642239, "loss/logits": 0.0, "step": 1360 }, { "epoch": 0.0685, "grad_norm": 21.625, "grad_norm_var": 42.57604166666667, "learning_rate": 0.0001, "loss": 4.9902, "loss/crossentropy": 2.3451401717960834, "loss/hidden": 3.97578125, "loss/jsd": 0.10501982429996133, "loss/logits": 0.0, "step": 1370 }, { "epoch": 0.069, "grad_norm": 25.5, "grad_norm_var": 13.911393229166666, "learning_rate": 0.0001, "loss": 4.8738, "loss/crossentropy": 2.2887198269367217, "loss/hidden": 3.948828125, "loss/jsd": 0.10703569920733572, "loss/logits": 0.0, "step": 1380 }, { "epoch": 0.0695, "grad_norm": 24.0, "grad_norm_var": 4.178125, "learning_rate": 0.0001, "loss": 4.908, "loss/crossentropy": 2.4341419368982313, "loss/hidden": 3.9109375, "loss/jsd": 0.13313074046745896, "loss/logits": 0.0, "step": 1390 }, { "epoch": 0.07, "grad_norm": 25.5, "grad_norm_var": 3.2643229166666665, "learning_rate": 0.0001, "loss": 4.8483, "loss/crossentropy": 2.3005983904004097, "loss/hidden": 3.794921875, "loss/jsd": 0.1167063161265105, "loss/logits": 0.0, "step": 1400 }, { "epoch": 0.0705, "grad_norm": 25.375, "grad_norm_var": 2.6684895833333333, "learning_rate": 0.0001, "loss": 4.8661, "loss/crossentropy": 2.3177727833390236, "loss/hidden": 3.76328125, "loss/jsd": 0.09948643315583468, "loss/logits": 0.0, "step": 1410 }, { "epoch": 0.071, "grad_norm": 21.25, "grad_norm_var": 6.670572916666667, "learning_rate": 0.0001, "loss": 4.8736, "loss/crossentropy": 2.2698763489723204, "loss/hidden": 3.831640625, "loss/jsd": 0.10282253352925182, "loss/logits": 0.0, "step": 1420 }, { "epoch": 0.0715, "grad_norm": 18.625, "grad_norm_var": 8.87265625, "learning_rate": 0.0001, "loss": 4.8039, "loss/crossentropy": 2.360131266713142, "loss/hidden": 3.722265625, "loss/jsd": 0.10547879729419947, "loss/logits": 0.0, "step": 1430 }, { "epoch": 0.072, "grad_norm": 21.75, "grad_norm_var": 3.8889973958333335, "learning_rate": 0.0001, "loss": 4.7269, "loss/crossentropy": 2.311430121213198, "loss/hidden": 3.7109375, "loss/jsd": 0.09480313453823327, "loss/logits": 0.0, "step": 1440 }, { "epoch": 0.0725, "grad_norm": 29.75, "grad_norm_var": 7.685416666666667, "learning_rate": 0.0001, "loss": 4.7292, "loss/crossentropy": 2.4506467133760452, "loss/hidden": 3.672265625, "loss/jsd": 0.09663807023316622, "loss/logits": 0.0, "step": 1450 }, { "epoch": 0.073, "grad_norm": 23.25, "grad_norm_var": 8.416666666666666, "learning_rate": 0.0001, "loss": 4.7346, "loss/crossentropy": 2.2691701710224152, "loss/hidden": 3.836328125, "loss/jsd": 0.1028917589224875, "loss/logits": 0.0, "step": 1460 }, { "epoch": 0.0735, "grad_norm": 20.375, "grad_norm_var": 7.246875, "learning_rate": 0.0001, "loss": 4.7517, "loss/crossentropy": 2.3083701550960543, "loss/hidden": 3.714453125, "loss/jsd": 0.09604525147005916, "loss/logits": 0.0, "step": 1470 }, { "epoch": 0.074, "grad_norm": 22.0, "grad_norm_var": 11.672916666666667, "learning_rate": 0.0001, "loss": 4.8113, "loss/crossentropy": 2.3635326638817786, "loss/hidden": 3.6703125, "loss/jsd": 0.10219773268327118, "loss/logits": 0.0, "step": 1480 }, { "epoch": 0.0745, "grad_norm": 21.375, "grad_norm_var": 5.637239583333334, "learning_rate": 0.0001, "loss": 4.798, "loss/crossentropy": 2.182288531959057, "loss/hidden": 3.784375, "loss/jsd": 0.09713765853084624, "loss/logits": 0.0, "step": 1490 }, { "epoch": 0.075, "grad_norm": 22.375, "grad_norm_var": 13.480143229166666, "learning_rate": 0.0001, "loss": 4.9073, "loss/crossentropy": 2.209014095366001, "loss/hidden": 3.77734375, "loss/jsd": 0.100444171205163, "loss/logits": 0.0, "step": 1500 }, { "epoch": 0.0755, "grad_norm": 20.25, "grad_norm_var": 15.253125, "learning_rate": 0.0001, "loss": 4.8648, "loss/crossentropy": 2.307139050960541, "loss/hidden": 3.835546875, "loss/jsd": 0.10750290956348181, "loss/logits": 0.0, "step": 1510 }, { "epoch": 0.076, "grad_norm": 22.25, "grad_norm_var": 5.84765625, "learning_rate": 0.0001, "loss": 4.7021, "loss/crossentropy": 2.4567115128040315, "loss/hidden": 3.641015625, "loss/jsd": 0.0963326326571405, "loss/logits": 0.0, "step": 1520 }, { "epoch": 0.0765, "grad_norm": 23.0, "grad_norm_var": 17.799934895833335, "learning_rate": 0.0001, "loss": 4.7726, "loss/crossentropy": 2.3501833245158195, "loss/hidden": 3.71171875, "loss/jsd": 0.09695078176446259, "loss/logits": 0.0, "step": 1530 }, { "epoch": 0.077, "grad_norm": 26.875, "grad_norm_var": 14.445572916666666, "learning_rate": 0.0001, "loss": 4.7776, "loss/crossentropy": 2.35235877931118, "loss/hidden": 3.7109375, "loss/jsd": 0.09894683174788951, "loss/logits": 0.0, "step": 1540 }, { "epoch": 0.0775, "grad_norm": 20.0, "grad_norm_var": 6.866080729166667, "learning_rate": 0.0001, "loss": 4.7465, "loss/crossentropy": 2.3319214552640917, "loss/hidden": 3.678515625, "loss/jsd": 0.10175617430359125, "loss/logits": 0.0, "step": 1550 }, { "epoch": 0.078, "grad_norm": 25.75, "grad_norm_var": 5.9306640625, "learning_rate": 0.0001, "loss": 4.7662, "loss/crossentropy": 2.312511496245861, "loss/hidden": 3.82421875, "loss/jsd": 0.10424250243231654, "loss/logits": 0.0, "step": 1560 }, { "epoch": 0.0785, "grad_norm": 19.25, "grad_norm_var": 7.527018229166667, "learning_rate": 0.0001, "loss": 4.7506, "loss/crossentropy": 2.195492114126682, "loss/hidden": 3.806640625, "loss/jsd": 0.10378086129203438, "loss/logits": 0.0, "step": 1570 }, { "epoch": 0.079, "grad_norm": 20.875, "grad_norm_var": 5.69765625, "learning_rate": 0.0001, "loss": 4.7525, "loss/crossentropy": 2.3451679602265356, "loss/hidden": 3.64609375, "loss/jsd": 0.10021187355741859, "loss/logits": 0.0, "step": 1580 }, { "epoch": 0.0795, "grad_norm": 25.125, "grad_norm_var": 4.002083333333333, "learning_rate": 0.0001, "loss": 4.7907, "loss/crossentropy": 2.235419529676437, "loss/hidden": 3.76875, "loss/jsd": 0.1017349574714899, "loss/logits": 0.0, "step": 1590 }, { "epoch": 0.08, "grad_norm": 17.5, "grad_norm_var": 6.187239583333334, "learning_rate": 0.0001, "loss": 4.7544, "loss/crossentropy": 2.349038490653038, "loss/hidden": 3.75859375, "loss/jsd": 0.10723181385546923, "loss/logits": 0.0, "step": 1600 }, { "epoch": 0.0805, "grad_norm": 21.5, "grad_norm_var": 5.036458333333333, "learning_rate": 0.0001, "loss": 4.7984, "loss/crossentropy": 2.2953826270997526, "loss/hidden": 3.735546875, "loss/jsd": 0.10434331484138966, "loss/logits": 0.0, "step": 1610 }, { "epoch": 0.081, "grad_norm": 21.625, "grad_norm_var": 2.8671223958333334, "learning_rate": 0.0001, "loss": 4.7802, "loss/crossentropy": 2.420463111996651, "loss/hidden": 3.7671875, "loss/jsd": 0.09350865064188837, "loss/logits": 0.0, "step": 1620 }, { "epoch": 0.0815, "grad_norm": 21.875, "grad_norm_var": 5.375455729166666, "learning_rate": 0.0001, "loss": 4.7768, "loss/crossentropy": 2.4329511165618896, "loss/hidden": 3.770703125, "loss/jsd": 0.11144884563982486, "loss/logits": 0.0, "step": 1630 }, { "epoch": 0.082, "grad_norm": 18.0, "grad_norm_var": 5.853059895833334, "learning_rate": 0.0001, "loss": 4.81, "loss/crossentropy": 2.3901975452899933, "loss/hidden": 3.816015625, "loss/jsd": 0.11511239362880588, "loss/logits": 0.0, "step": 1640 }, { "epoch": 0.0825, "grad_norm": 18.625, "grad_norm_var": 6.853059895833334, "learning_rate": 0.0001, "loss": 4.8666, "loss/crossentropy": 2.42452190220356, "loss/hidden": 3.791015625, "loss/jsd": 0.10730197560042143, "loss/logits": 0.0, "step": 1650 }, { "epoch": 0.083, "grad_norm": 22.125, "grad_norm_var": 5.945768229166666, "learning_rate": 0.0001, "loss": 4.825, "loss/crossentropy": 2.415967509150505, "loss/hidden": 3.7140625, "loss/jsd": 0.10223841555416584, "loss/logits": 0.0, "step": 1660 }, { "epoch": 0.0835, "grad_norm": 19.625, "grad_norm_var": 4.648372395833333, "learning_rate": 0.0001, "loss": 4.6893, "loss/crossentropy": 2.346050335466862, "loss/hidden": 3.75234375, "loss/jsd": 0.10205129384994507, "loss/logits": 0.0, "step": 1670 }, { "epoch": 0.084, "grad_norm": 26.75, "grad_norm_var": 6.887239583333334, "learning_rate": 0.0001, "loss": 4.7759, "loss/crossentropy": 2.272695492208004, "loss/hidden": 3.740234375, "loss/jsd": 0.09743564091622829, "loss/logits": 0.0, "step": 1680 }, { "epoch": 0.0845, "grad_norm": 21.25, "grad_norm_var": 8.242122395833333, "learning_rate": 0.0001, "loss": 4.7886, "loss/crossentropy": 2.421866828203201, "loss/hidden": 3.801171875, "loss/jsd": 0.10874381214380265, "loss/logits": 0.0, "step": 1690 }, { "epoch": 0.085, "grad_norm": 21.875, "grad_norm_var": 5.842643229166667, "learning_rate": 0.0001, "loss": 4.7021, "loss/crossentropy": 2.389561951160431, "loss/hidden": 3.679296875, "loss/jsd": 0.1009491034783423, "loss/logits": 0.0, "step": 1700 }, { "epoch": 0.0855, "grad_norm": 21.625, "grad_norm_var": 13.94765625, "learning_rate": 0.0001, "loss": 4.8137, "loss/crossentropy": 2.3791208446025847, "loss/hidden": 3.775, "loss/jsd": 0.11071940269321204, "loss/logits": 0.0, "step": 1710 }, { "epoch": 0.086, "grad_norm": 20.5, "grad_norm_var": 13.2603515625, "learning_rate": 0.0001, "loss": 4.738, "loss/crossentropy": 2.4374333173036575, "loss/hidden": 3.728125, "loss/jsd": 0.10198512580245733, "loss/logits": 0.0, "step": 1720 }, { "epoch": 0.0865, "grad_norm": 16.625, "grad_norm_var": 11.1853515625, "learning_rate": 0.0001, "loss": 4.7524, "loss/crossentropy": 2.3030879952013494, "loss/hidden": 3.626953125, "loss/jsd": 0.09310725582763553, "loss/logits": 0.0, "step": 1730 }, { "epoch": 0.087, "grad_norm": 18.875, "grad_norm_var": 6.285416666666666, "learning_rate": 0.0001, "loss": 4.7021, "loss/crossentropy": 2.192840526998043, "loss/hidden": 3.819140625, "loss/jsd": 0.09320764979347587, "loss/logits": 0.0, "step": 1740 }, { "epoch": 0.0875, "grad_norm": 24.625, "grad_norm_var": 6.4353515625, "learning_rate": 0.0001, "loss": 4.7059, "loss/crossentropy": 2.3610597878694533, "loss/hidden": 3.733984375, "loss/jsd": 0.10029621962457895, "loss/logits": 0.0, "step": 1750 }, { "epoch": 0.088, "grad_norm": 19.5, "grad_norm_var": 19.762239583333333, "learning_rate": 0.0001, "loss": 4.7081, "loss/crossentropy": 2.410063475370407, "loss/hidden": 3.65078125, "loss/jsd": 0.10161215299740434, "loss/logits": 0.0, "step": 1760 }, { "epoch": 0.0885, "grad_norm": 33.25, "grad_norm_var": 22.748893229166665, "learning_rate": 0.0001, "loss": 4.5743, "loss/crossentropy": 2.2984881952404974, "loss/hidden": 3.666796875, "loss/jsd": 0.09647621251642705, "loss/logits": 0.0, "step": 1770 }, { "epoch": 0.089, "grad_norm": 20.25, "grad_norm_var": 16.602083333333333, "learning_rate": 0.0001, "loss": 4.7585, "loss/crossentropy": 2.3432783752679827, "loss/hidden": 3.716796875, "loss/jsd": 0.10081057399511337, "loss/logits": 0.0, "step": 1780 }, { "epoch": 0.0895, "grad_norm": 22.875, "grad_norm_var": 5.621875, "learning_rate": 0.0001, "loss": 4.707, "loss/crossentropy": 2.352738951146603, "loss/hidden": 3.673828125, "loss/jsd": 0.09183212611824274, "loss/logits": 0.0, "step": 1790 }, { "epoch": 0.09, "grad_norm": 33.0, "grad_norm_var": 16.277083333333334, "learning_rate": 0.0001, "loss": 4.6996, "loss/crossentropy": 2.443929785490036, "loss/hidden": 3.55390625, "loss/jsd": 0.09280467573553323, "loss/logits": 0.0, "step": 1800 }, { "epoch": 0.0905, "grad_norm": 17.125, "grad_norm_var": 20.319205729166665, "learning_rate": 0.0001, "loss": 4.7503, "loss/crossentropy": 2.333281812816858, "loss/hidden": 3.687109375, "loss/jsd": 0.09856429314240814, "loss/logits": 0.0, "step": 1810 }, { "epoch": 0.091, "grad_norm": 19.625, "grad_norm_var": 14.943684895833334, "learning_rate": 0.0001, "loss": 4.8011, "loss/crossentropy": 2.3165989741683006, "loss/hidden": 3.93515625, "loss/jsd": 0.1154123242944479, "loss/logits": 0.0, "step": 1820 }, { "epoch": 0.0915, "grad_norm": 19.125, "grad_norm_var": 2.5833333333333335, "learning_rate": 0.0001, "loss": 4.7784, "loss/crossentropy": 2.3343286007642745, "loss/hidden": 3.796875, "loss/jsd": 0.11231993734836579, "loss/logits": 0.0, "step": 1830 }, { "epoch": 0.092, "grad_norm": 18.0, "grad_norm_var": 4.880989583333333, "learning_rate": 0.0001, "loss": 4.6886, "loss/crossentropy": 2.412258565425873, "loss/hidden": 3.78046875, "loss/jsd": 0.10415599066764117, "loss/logits": 0.0, "step": 1840 }, { "epoch": 0.0925, "grad_norm": 17.25, "grad_norm_var": 6.083072916666667, "learning_rate": 0.0001, "loss": 4.7485, "loss/crossentropy": 2.379472056031227, "loss/hidden": 3.6609375, "loss/jsd": 0.09712380319833755, "loss/logits": 0.0, "step": 1850 }, { "epoch": 0.093, "grad_norm": 19.125, "grad_norm_var": 9.0041015625, "learning_rate": 0.0001, "loss": 4.7145, "loss/crossentropy": 2.286051708459854, "loss/hidden": 3.671484375, "loss/jsd": 0.09749153861775994, "loss/logits": 0.0, "step": 1860 }, { "epoch": 0.0935, "grad_norm": 24.25, "grad_norm_var": 10.09765625, "learning_rate": 0.0001, "loss": 4.6597, "loss/crossentropy": 2.3485587686300278, "loss/hidden": 3.656640625, "loss/jsd": 0.09961330010555684, "loss/logits": 0.0, "step": 1870 }, { "epoch": 0.094, "grad_norm": 25.5, "grad_norm_var": 10.745833333333334, "learning_rate": 0.0001, "loss": 4.7654, "loss/crossentropy": 2.22419136762619, "loss/hidden": 3.680859375, "loss/jsd": 0.09599914094433189, "loss/logits": 0.0, "step": 1880 }, { "epoch": 0.0945, "grad_norm": 23.375, "grad_norm_var": 11.849739583333333, "learning_rate": 0.0001, "loss": 4.6586, "loss/crossentropy": 2.2319135151803495, "loss/hidden": 3.776953125, "loss/jsd": 0.1003801210783422, "loss/logits": 0.0, "step": 1890 }, { "epoch": 0.095, "grad_norm": 20.125, "grad_norm_var": 15.6884765625, "learning_rate": 0.0001, "loss": 4.7113, "loss/crossentropy": 2.466662494838238, "loss/hidden": 3.694140625, "loss/jsd": 0.09942078748717904, "loss/logits": 0.0, "step": 1900 }, { "epoch": 0.0955, "grad_norm": 20.875, "grad_norm_var": 11.4197265625, "learning_rate": 0.0001, "loss": 4.6638, "loss/crossentropy": 2.3695669680833817, "loss/hidden": 3.593359375, "loss/jsd": 0.09504008954390883, "loss/logits": 0.0, "step": 1910 }, { "epoch": 0.096, "grad_norm": 22.125, "grad_norm_var": 4.581184895833333, "learning_rate": 0.0001, "loss": 4.6473, "loss/crossentropy": 2.345889499783516, "loss/hidden": 3.691015625, "loss/jsd": 0.10475197089836001, "loss/logits": 0.0, "step": 1920 }, { "epoch": 0.0965, "grad_norm": 22.5, "grad_norm_var": 4.087239583333333, "learning_rate": 0.0001, "loss": 4.721, "loss/crossentropy": 2.256808315217495, "loss/hidden": 3.7015625, "loss/jsd": 0.09892030693590641, "loss/logits": 0.0, "step": 1930 }, { "epoch": 0.097, "grad_norm": 19.125, "grad_norm_var": 6.395572916666667, "learning_rate": 0.0001, "loss": 4.5498, "loss/crossentropy": 2.5429009228944777, "loss/hidden": 3.680859375, "loss/jsd": 0.09861663114279509, "loss/logits": 0.0, "step": 1940 }, { "epoch": 0.0975, "grad_norm": 21.25, "grad_norm_var": 5.843489583333334, "learning_rate": 0.0001, "loss": 4.6899, "loss/crossentropy": 2.272120487689972, "loss/hidden": 3.6375, "loss/jsd": 0.09743905253708363, "loss/logits": 0.0, "step": 1950 }, { "epoch": 0.098, "grad_norm": 17.625, "grad_norm_var": 6.62265625, "learning_rate": 0.0001, "loss": 4.6035, "loss/crossentropy": 2.1649394638836386, "loss/hidden": 3.584765625, "loss/jsd": 0.08881366224959493, "loss/logits": 0.0, "step": 1960 }, { "epoch": 0.0985, "grad_norm": 4328521728.0, "grad_norm_var": 1.171006260534208e+18, "learning_rate": 0.0001, "loss": 4.6906, "loss/crossentropy": 2.3182963758707045, "loss/hidden": 3.623046875, "loss/jsd": 0.10251586111262441, "loss/logits": 0.0, "step": 1970 }, { "epoch": 0.099, "grad_norm": 20.25, "grad_norm_var": 2.715501666496903e+18, "learning_rate": 0.0001, "loss": 4.7101, "loss/crossentropy": 2.407327815890312, "loss/hidden": 3.60703125, "loss/jsd": 0.09459855072200299, "loss/logits": 0.0, "step": 1980 }, { "epoch": 0.0995, "grad_norm": 19.25, "grad_norm_var": 1.7345191619224492e+18, "learning_rate": 0.0001, "loss": 4.6395, "loss/crossentropy": 2.256649875640869, "loss/hidden": 3.65234375, "loss/jsd": 0.10230031171813607, "loss/logits": 0.0, "step": 1990 }, { "epoch": 0.1, "grad_norm": 22.0, "grad_norm_var": 2.981184895833333, "learning_rate": 0.0001, "loss": 4.5112, "loss/crossentropy": 2.3214069336652754, "loss/hidden": 3.553515625, "loss/jsd": 0.09316142341122031, "loss/logits": 0.0, "step": 2000 }, { "epoch": 0.1005, "grad_norm": 20.25, "grad_norm_var": 4.611393229166667, "learning_rate": 0.0001, "loss": 4.5154, "loss/crossentropy": 2.297450725734234, "loss/hidden": 3.569140625, "loss/jsd": 0.09217815361917019, "loss/logits": 0.0, "step": 2010 }, { "epoch": 0.101, "grad_norm": 23.25, "grad_norm_var": 8.597249348958334, "learning_rate": 0.0001, "loss": 4.6108, "loss/crossentropy": 2.3132576078176497, "loss/hidden": 3.69375, "loss/jsd": 0.1215221800841391, "loss/logits": 0.0, "step": 2020 }, { "epoch": 0.1015, "grad_norm": 16.75, "grad_norm_var": 8.385400390625, "learning_rate": 0.0001, "loss": 4.5906, "loss/crossentropy": 2.42258235514164, "loss/hidden": 3.58359375, "loss/jsd": 0.09518450712785125, "loss/logits": 0.0, "step": 2030 }, { "epoch": 0.102, "grad_norm": 17.75, "grad_norm_var": 39.30416666666667, "learning_rate": 0.0001, "loss": 4.6782, "loss/crossentropy": 2.226282720267773, "loss/hidden": 3.6078125, "loss/jsd": 0.08297519264742732, "loss/logits": 0.0, "step": 2040 }, { "epoch": 0.1025, "grad_norm": 18.5, "grad_norm_var": 8.6306640625, "learning_rate": 0.0001, "loss": 4.6889, "loss/crossentropy": 2.23982213139534, "loss/hidden": 3.660546875, "loss/jsd": 0.0924127135425806, "loss/logits": 0.0, "step": 2050 }, { "epoch": 0.103, "grad_norm": 18.25, "grad_norm_var": 6.187744140625, "learning_rate": 0.0001, "loss": 4.6246, "loss/crossentropy": 2.2483278423547746, "loss/hidden": 3.60859375, "loss/jsd": 0.09513462502509355, "loss/logits": 0.0, "step": 2060 }, { "epoch": 0.1035, "grad_norm": 28.375, "grad_norm_var": 12.276416015625, "learning_rate": 0.0001, "loss": 4.6868, "loss/crossentropy": 2.2927519381046295, "loss/hidden": 3.53515625, "loss/jsd": 0.08648296073079109, "loss/logits": 0.0, "step": 2070 }, { "epoch": 0.104, "grad_norm": 24.125, "grad_norm_var": 14.239583333333334, "learning_rate": 0.0001, "loss": 4.5602, "loss/crossentropy": 2.3293472826480865, "loss/hidden": 3.599609375, "loss/jsd": 0.09772532721981406, "loss/logits": 0.0, "step": 2080 }, { "epoch": 0.1045, "grad_norm": 19.75, "grad_norm_var": 8.269205729166666, "learning_rate": 0.0001, "loss": 4.6017, "loss/crossentropy": 2.3832351714372635, "loss/hidden": 3.60078125, "loss/jsd": 0.09314336217939853, "loss/logits": 0.0, "step": 2090 }, { "epoch": 0.105, "grad_norm": 20.0, "grad_norm_var": 5.070833333333334, "learning_rate": 0.0001, "loss": 4.5706, "loss/crossentropy": 2.4874933838844298, "loss/hidden": 3.651171875, "loss/jsd": 0.09874060060828924, "loss/logits": 0.0, "step": 2100 }, { "epoch": 0.1055, "grad_norm": 16.0, "grad_norm_var": 42.828059895833334, "learning_rate": 0.0001, "loss": 4.6945, "loss/crossentropy": 2.185934893786907, "loss/hidden": 3.78046875, "loss/jsd": 0.10176362562924623, "loss/logits": 0.0, "step": 2110 }, { "epoch": 0.106, "grad_norm": 20.5, "grad_norm_var": 912.2905598958333, "learning_rate": 0.0001, "loss": 4.8579, "loss/crossentropy": 2.337796673178673, "loss/hidden": 3.673828125, "loss/jsd": 0.09400355285033583, "loss/logits": 0.0, "step": 2120 }, { "epoch": 0.1065, "grad_norm": 19.0, "grad_norm_var": 86.45514322916667, "learning_rate": 0.0001, "loss": 4.6905, "loss/crossentropy": 2.191851982474327, "loss/hidden": 3.83828125, "loss/jsd": 0.09336025016382336, "loss/logits": 0.0, "step": 2130 }, { "epoch": 0.107, "grad_norm": 20.625, "grad_norm_var": 12.3416015625, "learning_rate": 0.0001, "loss": 4.746, "loss/crossentropy": 2.321294938027859, "loss/hidden": 3.709375, "loss/jsd": 0.09611575696617365, "loss/logits": 0.0, "step": 2140 }, { "epoch": 0.1075, "grad_norm": 20.5, "grad_norm_var": 11.157291666666667, "learning_rate": 0.0001, "loss": 4.6848, "loss/crossentropy": 2.3641166269779204, "loss/hidden": 3.802734375, "loss/jsd": 0.11719204504042864, "loss/logits": 0.0, "step": 2150 }, { "epoch": 0.108, "grad_norm": 21.25, "grad_norm_var": 313.9947265625, "learning_rate": 0.0001, "loss": 4.703, "loss/crossentropy": 2.3178130373358727, "loss/hidden": 3.6640625, "loss/jsd": 0.10858506197109818, "loss/logits": 0.0, "step": 2160 }, { "epoch": 0.1085, "grad_norm": 18.125, "grad_norm_var": 505.7301432291667, "learning_rate": 0.0001, "loss": 4.7499, "loss/crossentropy": 2.2643778324127197, "loss/hidden": 3.686328125, "loss/jsd": 0.09995021363720298, "loss/logits": 0.0, "step": 2170 }, { "epoch": 0.109, "grad_norm": 19.0, "grad_norm_var": 4.88125, "learning_rate": 0.0001, "loss": 4.639, "loss/crossentropy": 2.333830028772354, "loss/hidden": 3.77421875, "loss/jsd": 0.10143324267119169, "loss/logits": 0.0, "step": 2180 }, { "epoch": 0.1095, "grad_norm": 17.625, "grad_norm_var": 1.756685316214961e+18, "learning_rate": 0.0001, "loss": 4.6091, "loss/crossentropy": 2.2005941957235335, "loss/hidden": 3.546875, "loss/jsd": 0.08694255957379937, "loss/logits": 0.0, "step": 2190 }, { "epoch": 0.11, "grad_norm": 19.25, "grad_norm_var": 219.81608072916666, "learning_rate": 0.0001, "loss": 4.6177, "loss/crossentropy": 2.3627296075224877, "loss/hidden": 3.74765625, "loss/jsd": 0.10458627291955054, "loss/logits": 0.0, "step": 2200 }, { "epoch": 0.1105, "grad_norm": 23.25, "grad_norm_var": 130.35305989583333, "learning_rate": 0.0001, "loss": 4.6516, "loss/crossentropy": 2.4541628479957582, "loss/hidden": 3.723046875, "loss/jsd": 0.09181494554504752, "loss/logits": 0.0, "step": 2210 }, { "epoch": 0.111, "grad_norm": 20.5, "grad_norm_var": 130.06015625, "learning_rate": 0.0001, "loss": 4.69, "loss/crossentropy": 2.416627970337868, "loss/hidden": 3.734765625, "loss/jsd": 0.11581595735624432, "loss/logits": 0.0, "step": 2220 }, { "epoch": 0.1115, "grad_norm": 18.625, "grad_norm_var": 5.193473307291667, "learning_rate": 0.0001, "loss": 4.685, "loss/crossentropy": 2.3696270257234575, "loss/hidden": 3.592578125, "loss/jsd": 0.09627662082202733, "loss/logits": 0.0, "step": 2230 }, { "epoch": 0.112, "grad_norm": 17.875, "grad_norm_var": 3.7067057291666665, "learning_rate": 0.0001, "loss": 4.6807, "loss/crossentropy": 2.374240705370903, "loss/hidden": 3.63671875, "loss/jsd": 0.10023370888084174, "loss/logits": 0.0, "step": 2240 }, { "epoch": 0.1125, "grad_norm": 18.875, "grad_norm_var": 6.2431640625, "learning_rate": 0.0001, "loss": 4.6202, "loss/crossentropy": 2.39550845772028, "loss/hidden": 3.655078125, "loss/jsd": 0.10500529641285539, "loss/logits": 0.0, "step": 2250 }, { "epoch": 0.113, "grad_norm": 16.75, "grad_norm_var": 5.78125, "learning_rate": 0.0001, "loss": 4.6473, "loss/crossentropy": 2.3785043194890023, "loss/hidden": 3.659375, "loss/jsd": 0.09861900489777327, "loss/logits": 0.0, "step": 2260 }, { "epoch": 0.1135, "grad_norm": 20.25, "grad_norm_var": 5.677018229166666, "learning_rate": 0.0001, "loss": 4.5771, "loss/crossentropy": 2.4541394472122193, "loss/hidden": 3.692578125, "loss/jsd": 0.10195111334323884, "loss/logits": 0.0, "step": 2270 }, { "epoch": 0.114, "grad_norm": 21.25, "grad_norm_var": 7.0228515625, "learning_rate": 0.0001, "loss": 4.597, "loss/crossentropy": 2.3176154881715774, "loss/hidden": 3.583984375, "loss/jsd": 0.09049384696409106, "loss/logits": 0.0, "step": 2280 }, { "epoch": 0.1145, "grad_norm": 15.75, "grad_norm_var": 15.241520182291667, "learning_rate": 0.0001, "loss": 4.5624, "loss/crossentropy": 2.5178518027067183, "loss/hidden": 3.528125, "loss/jsd": 0.09066717140376568, "loss/logits": 0.0, "step": 2290 }, { "epoch": 0.115, "grad_norm": 17.625, "grad_norm_var": 7.566520182291667, "learning_rate": 0.0001, "loss": 4.5471, "loss/crossentropy": 2.3759778410196306, "loss/hidden": 3.553125, "loss/jsd": 0.09599322909489275, "loss/logits": 0.0, "step": 2300 }, { "epoch": 0.1155, "grad_norm": 20.0, "grad_norm_var": 8.312434895833333, "learning_rate": 0.0001, "loss": 4.5075, "loss/crossentropy": 2.3496225073933603, "loss/hidden": 3.606640625, "loss/jsd": 0.09744280204176903, "loss/logits": 0.0, "step": 2310 }, { "epoch": 0.116, "grad_norm": 19.75, "grad_norm_var": 3.2108723958333334, "learning_rate": 0.0001, "loss": 4.5475, "loss/crossentropy": 2.4485339492559435, "loss/hidden": 3.523046875, "loss/jsd": 0.0890957485884428, "loss/logits": 0.0, "step": 2320 }, { "epoch": 0.1165, "grad_norm": 19.0, "grad_norm_var": 2.364697265625, "learning_rate": 0.0001, "loss": 4.5781, "loss/crossentropy": 2.299929490685463, "loss/hidden": 3.598828125, "loss/jsd": 0.09711863240227103, "loss/logits": 0.0, "step": 2330 }, { "epoch": 0.117, "grad_norm": 17.75, "grad_norm_var": 1.4955729166666667, "learning_rate": 0.0001, "loss": 4.5392, "loss/crossentropy": 2.298077051341534, "loss/hidden": 3.61015625, "loss/jsd": 0.0896261626854539, "loss/logits": 0.0, "step": 2340 }, { "epoch": 0.1175, "grad_norm": 17.5, "grad_norm_var": 3.0098307291666666, "learning_rate": 0.0001, "loss": 4.5949, "loss/crossentropy": 2.2876608431339265, "loss/hidden": 3.757421875, "loss/jsd": 0.10631331414915621, "loss/logits": 0.0, "step": 2350 }, { "epoch": 0.118, "grad_norm": 21.875, "grad_norm_var": 6.657291666666667, "learning_rate": 0.0001, "loss": 4.6054, "loss/crossentropy": 2.589036238193512, "loss/hidden": 3.7109375, "loss/jsd": 0.09777994276955723, "loss/logits": 0.0, "step": 2360 }, { "epoch": 0.1185, "grad_norm": 19.5, "grad_norm_var": 4.276497395833333, "learning_rate": 0.0001, "loss": 4.6578, "loss/crossentropy": 2.4440223038196565, "loss/hidden": 3.63125, "loss/jsd": 0.10012138104066252, "loss/logits": 0.0, "step": 2370 }, { "epoch": 0.119, "grad_norm": 15.9375, "grad_norm_var": 6.341520182291666, "learning_rate": 0.0001, "loss": 4.6382, "loss/crossentropy": 2.3379690438508987, "loss/hidden": 3.73046875, "loss/jsd": 0.10282904924824834, "loss/logits": 0.0, "step": 2380 }, { "epoch": 0.1195, "grad_norm": 20.5, "grad_norm_var": 7.068733723958333, "learning_rate": 0.0001, "loss": 4.5986, "loss/crossentropy": 2.358085313439369, "loss/hidden": 3.59140625, "loss/jsd": 0.0954778247512877, "loss/logits": 0.0, "step": 2390 }, { "epoch": 0.12, "grad_norm": 18.125, "grad_norm_var": 4.709375, "learning_rate": 0.0001, "loss": 4.5411, "loss/crossentropy": 2.262301415205002, "loss/hidden": 3.6265625, "loss/jsd": 0.09096273891627789, "loss/logits": 0.0, "step": 2400 }, { "epoch": 0.1205, "grad_norm": 21.125, "grad_norm_var": 2.4447916666666667, "learning_rate": 0.0001, "loss": 4.5437, "loss/crossentropy": 2.493518462777138, "loss/hidden": 3.61953125, "loss/jsd": 0.08979002349078655, "loss/logits": 0.0, "step": 2410 }, { "epoch": 0.121, "grad_norm": 16.75, "grad_norm_var": 6.323958333333334, "learning_rate": 0.0001, "loss": 4.563, "loss/crossentropy": 2.4933597564697267, "loss/hidden": 3.592578125, "loss/jsd": 0.09690459789708257, "loss/logits": 0.0, "step": 2420 }, { "epoch": 0.1215, "grad_norm": 18.875, "grad_norm_var": 6.918489583333334, "learning_rate": 0.0001, "loss": 4.5351, "loss/crossentropy": 2.516791993379593, "loss/hidden": 3.598828125, "loss/jsd": 0.09446065053343773, "loss/logits": 0.0, "step": 2430 }, { "epoch": 0.122, "grad_norm": 19.375, "grad_norm_var": 5.448942057291666, "learning_rate": 0.0001, "loss": 4.5509, "loss/crossentropy": 2.2249866664409637, "loss/hidden": 3.53203125, "loss/jsd": 0.08729059183970093, "loss/logits": 0.0, "step": 2440 }, { "epoch": 0.1225, "grad_norm": 19.25, "grad_norm_var": 5.459228515625, "learning_rate": 0.0001, "loss": 4.5135, "loss/crossentropy": 2.2651902705430986, "loss/hidden": 3.54765625, "loss/jsd": 0.08877531317993999, "loss/logits": 0.0, "step": 2450 }, { "epoch": 0.123, "grad_norm": 18.375, "grad_norm_var": 2.688004557291667, "learning_rate": 0.0001, "loss": 4.4698, "loss/crossentropy": 2.2470821171998976, "loss/hidden": 3.562890625, "loss/jsd": 0.09754009852185845, "loss/logits": 0.0, "step": 2460 }, { "epoch": 0.1235, "grad_norm": 20.0, "grad_norm_var": 4.591910807291667, "learning_rate": 0.0001, "loss": 4.5083, "loss/crossentropy": 2.1959333077073095, "loss/hidden": 3.531640625, "loss/jsd": 0.0921278445981443, "loss/logits": 0.0, "step": 2470 }, { "epoch": 0.124, "grad_norm": 19.0, "grad_norm_var": 3.9395182291666666, "learning_rate": 0.0001, "loss": 4.5378, "loss/crossentropy": 2.2659239649772642, "loss/hidden": 3.5875, "loss/jsd": 0.09199469089508057, "loss/logits": 0.0, "step": 2480 }, { "epoch": 0.1245, "grad_norm": 18.375, "grad_norm_var": 2.9302083333333333, "learning_rate": 0.0001, "loss": 4.4709, "loss/crossentropy": 2.2354795530438425, "loss/hidden": 3.49140625, "loss/jsd": 0.08955592634156347, "loss/logits": 0.0, "step": 2490 }, { "epoch": 0.125, "grad_norm": 21.75, "grad_norm_var": 3.544791666666667, "learning_rate": 0.0001, "loss": 4.546, "loss/crossentropy": 2.321756035089493, "loss/hidden": 3.491796875, "loss/jsd": 0.08398934034630656, "loss/logits": 0.0, "step": 2500 }, { "epoch": 0.1255, "grad_norm": 20.375, "grad_norm_var": 3.5992024739583335, "learning_rate": 0.0001, "loss": 4.5442, "loss/crossentropy": 2.327367161214352, "loss/hidden": 3.562109375, "loss/jsd": 0.08928178530186415, "loss/logits": 0.0, "step": 2510 }, { "epoch": 0.126, "grad_norm": 18.875, "grad_norm_var": 3.892041015625, "learning_rate": 0.0001, "loss": 4.4942, "loss/crossentropy": 2.198644478619099, "loss/hidden": 3.44921875, "loss/jsd": 0.08295171349309385, "loss/logits": 0.0, "step": 2520 }, { "epoch": 0.1265, "grad_norm": 16.125, "grad_norm_var": 5.773811848958333, "learning_rate": 0.0001, "loss": 4.576, "loss/crossentropy": 2.472541335225105, "loss/hidden": 3.597265625, "loss/jsd": 0.10432412773370743, "loss/logits": 0.0, "step": 2530 }, { "epoch": 0.127, "grad_norm": 20.875, "grad_norm_var": 5.364583333333333, "learning_rate": 0.0001, "loss": 4.5337, "loss/crossentropy": 2.3647551596164704, "loss/hidden": 3.618359375, "loss/jsd": 0.10374335153028369, "loss/logits": 0.0, "step": 2540 }, { "epoch": 0.1275, "grad_norm": 37.25, "grad_norm_var": 1281.695947265625, "learning_rate": 0.0001, "loss": 4.5825, "loss/crossentropy": 2.2414861261844634, "loss/hidden": 3.4828125, "loss/jsd": 0.09403842501342297, "loss/logits": 0.0, "step": 2550 }, { "epoch": 0.128, "grad_norm": 14.875, "grad_norm_var": 1240.8051432291666, "learning_rate": 0.0001, "loss": 4.4589, "loss/crossentropy": 2.234823814034462, "loss/hidden": 3.509765625, "loss/jsd": 0.08673453908413649, "loss/logits": 0.0, "step": 2560 }, { "epoch": 0.1285, "grad_norm": 22.75, "grad_norm_var": 16.616080729166665, "learning_rate": 0.0001, "loss": 4.4816, "loss/crossentropy": 2.387620323896408, "loss/hidden": 3.55078125, "loss/jsd": 0.08936102241277695, "loss/logits": 0.0, "step": 2570 }, { "epoch": 0.129, "grad_norm": 16.125, "grad_norm_var": 9.396809895833334, "learning_rate": 0.0001, "loss": 4.4124, "loss/crossentropy": 2.1731797240674497, "loss/hidden": 3.40390625, "loss/jsd": 0.07968775480985642, "loss/logits": 0.0, "step": 2580 }, { "epoch": 0.1295, "grad_norm": 20.625, "grad_norm_var": 8.490559895833334, "learning_rate": 0.0001, "loss": 4.4807, "loss/crossentropy": 2.1817662701010705, "loss/hidden": 3.676953125, "loss/jsd": 0.09472927646711468, "loss/logits": 0.0, "step": 2590 }, { "epoch": 0.13, "grad_norm": 21.375, "grad_norm_var": 4.510416666666667, "learning_rate": 0.0001, "loss": 4.5324, "loss/crossentropy": 2.2697513103485107, "loss/hidden": 3.570703125, "loss/jsd": 0.08940641283988952, "loss/logits": 0.0, "step": 2600 }, { "epoch": 0.1305, "grad_norm": 18.375, "grad_norm_var": 9.0634765625, "learning_rate": 0.0001, "loss": 4.4845, "loss/crossentropy": 2.2707223266363146, "loss/hidden": 3.52109375, "loss/jsd": 0.09460832485929131, "loss/logits": 0.0, "step": 2610 }, { "epoch": 0.131, "grad_norm": 66.0, "grad_norm_var": 143.3619140625, "learning_rate": 0.0001, "loss": 4.5179, "loss/crossentropy": 2.254822887480259, "loss/hidden": 3.531640625, "loss/jsd": 0.09221142884343862, "loss/logits": 0.0, "step": 2620 }, { "epoch": 0.1315, "grad_norm": 21.625, "grad_norm_var": 143.6431640625, "learning_rate": 0.0001, "loss": 4.4947, "loss/crossentropy": 2.347915455698967, "loss/hidden": 3.58203125, "loss/jsd": 0.09412752091884613, "loss/logits": 0.0, "step": 2630 }, { "epoch": 0.132, "grad_norm": 17.375, "grad_norm_var": 3.1708333333333334, "learning_rate": 0.0001, "loss": 4.528, "loss/crossentropy": 2.2751280948519708, "loss/hidden": 3.56328125, "loss/jsd": 0.08851864533498884, "loss/logits": 0.0, "step": 2640 }, { "epoch": 0.1325, "grad_norm": 19.0, "grad_norm_var": 13.576155598958334, "learning_rate": 0.0001, "loss": 4.4741, "loss/crossentropy": 2.2658936589956284, "loss/hidden": 3.54296875, "loss/jsd": 0.08702889690175653, "loss/logits": 0.0, "step": 2650 }, { "epoch": 0.133, "grad_norm": 21.625, "grad_norm_var": 722.3559895833333, "learning_rate": 0.0001, "loss": 4.5137, "loss/crossentropy": 2.1781817600131035, "loss/hidden": 3.56484375, "loss/jsd": 0.09932177630253136, "loss/logits": 0.0, "step": 2660 }, { "epoch": 0.1335, "grad_norm": 17.5, "grad_norm_var": 2.5122395833333333, "learning_rate": 0.0001, "loss": 4.4842, "loss/crossentropy": 2.3243243932724, "loss/hidden": 3.6015625, "loss/jsd": 0.09606684306636452, "loss/logits": 0.0, "step": 2670 }, { "epoch": 0.134, "grad_norm": 16.75, "grad_norm_var": 38.6853515625, "learning_rate": 0.0001, "loss": 4.5164, "loss/crossentropy": 2.389857916533947, "loss/hidden": 3.610546875, "loss/jsd": 0.09261430930346251, "loss/logits": 0.0, "step": 2680 }, { "epoch": 0.1345, "grad_norm": 19.25, "grad_norm_var": 4.7369140625, "learning_rate": 0.0001, "loss": 4.4902, "loss/crossentropy": 2.3873065978288652, "loss/hidden": 3.496875, "loss/jsd": 0.08840383114293218, "loss/logits": 0.0, "step": 2690 }, { "epoch": 0.135, "grad_norm": 20.5, "grad_norm_var": 3.177018229166667, "learning_rate": 0.0001, "loss": 4.4057, "loss/crossentropy": 2.338147234916687, "loss/hidden": 3.534375, "loss/jsd": 0.09641889259219169, "loss/logits": 0.0, "step": 2700 }, { "epoch": 0.1355, "grad_norm": 17.75, "grad_norm_var": 8.540999348958334, "learning_rate": 0.0001, "loss": 4.513, "loss/crossentropy": 2.2362188696861267, "loss/hidden": 3.6625, "loss/jsd": 0.10190682755783201, "loss/logits": 0.0, "step": 2710 }, { "epoch": 0.136, "grad_norm": 22.75, "grad_norm_var": 38.66183268229167, "learning_rate": 0.0001, "loss": 4.6171, "loss/crossentropy": 2.222167354822159, "loss/hidden": 3.628125, "loss/jsd": 0.11221090480685234, "loss/logits": 0.0, "step": 2720 }, { "epoch": 0.1365, "grad_norm": 16.375, "grad_norm_var": 37.61015625, "learning_rate": 0.0001, "loss": 4.539, "loss/crossentropy": 2.3914648950099946, "loss/hidden": 3.59765625, "loss/jsd": 0.09164496380835771, "loss/logits": 0.0, "step": 2730 }, { "epoch": 0.137, "grad_norm": 16.5, "grad_norm_var": 2.32890625, "learning_rate": 0.0001, "loss": 4.6123, "loss/crossentropy": 2.385912075638771, "loss/hidden": 3.541796875, "loss/jsd": 0.08835116708651185, "loss/logits": 0.0, "step": 2740 }, { "epoch": 0.1375, "grad_norm": 17.75, "grad_norm_var": 6.953125, "learning_rate": 0.0001, "loss": 4.5453, "loss/crossentropy": 2.291456125676632, "loss/hidden": 3.590625, "loss/jsd": 0.0944554246030748, "loss/logits": 0.0, "step": 2750 }, { "epoch": 0.138, "grad_norm": 18.875, "grad_norm_var": 23.817643229166666, "learning_rate": 0.0001, "loss": 4.4631, "loss/crossentropy": 2.2219670079648495, "loss/hidden": 3.562890625, "loss/jsd": 0.08628002055920661, "loss/logits": 0.0, "step": 2760 }, { "epoch": 0.1385, "grad_norm": 17.5, "grad_norm_var": 22.864518229166666, "learning_rate": 0.0001, "loss": 4.4415, "loss/crossentropy": 2.3799121528863907, "loss/hidden": 3.5, "loss/jsd": 0.09274168154224753, "loss/logits": 0.0, "step": 2770 }, { "epoch": 0.139, "grad_norm": 20.25, "grad_norm_var": 6.460791015625, "learning_rate": 0.0001, "loss": 4.5309, "loss/crossentropy": 2.1055190823972225, "loss/hidden": 3.534765625, "loss/jsd": 0.08359876750037074, "loss/logits": 0.0, "step": 2780 }, { "epoch": 0.1395, "grad_norm": 17.875, "grad_norm_var": 5.627587890625, "learning_rate": 0.0001, "loss": 4.4825, "loss/crossentropy": 2.3334684520959854, "loss/hidden": 3.533984375, "loss/jsd": 0.0967961790971458, "loss/logits": 0.0, "step": 2790 }, { "epoch": 0.14, "grad_norm": 17.125, "grad_norm_var": 4.620556640625, "learning_rate": 0.0001, "loss": 4.4282, "loss/crossentropy": 2.42918943464756, "loss/hidden": 3.526953125, "loss/jsd": 0.08855977468192577, "loss/logits": 0.0, "step": 2800 }, { "epoch": 0.1405, "grad_norm": 18.5, "grad_norm_var": 5.042643229166667, "learning_rate": 0.0001, "loss": 4.4503, "loss/crossentropy": 2.272622914612293, "loss/hidden": 3.58671875, "loss/jsd": 0.0873057721182704, "loss/logits": 0.0, "step": 2810 }, { "epoch": 0.141, "grad_norm": 18.125, "grad_norm_var": 3.658837890625, "learning_rate": 0.0001, "loss": 4.4984, "loss/crossentropy": 2.1938667565584185, "loss/hidden": 3.553515625, "loss/jsd": 0.08576443083584309, "loss/logits": 0.0, "step": 2820 }, { "epoch": 0.1415, "grad_norm": 20.375, "grad_norm_var": 3.8306640625, "learning_rate": 0.0001, "loss": 4.5033, "loss/crossentropy": 2.3123946458101274, "loss/hidden": 3.56171875, "loss/jsd": 0.0924573240801692, "loss/logits": 0.0, "step": 2830 }, { "epoch": 0.142, "grad_norm": 22.75, "grad_norm_var": 5.495768229166667, "learning_rate": 0.0001, "loss": 4.4168, "loss/crossentropy": 2.3987593173980715, "loss/hidden": 3.509765625, "loss/jsd": 0.0879776468500495, "loss/logits": 0.0, "step": 2840 }, { "epoch": 0.1425, "grad_norm": 17.375, "grad_norm_var": 9.082747395833334, "learning_rate": 0.0001, "loss": 4.4669, "loss/crossentropy": 2.2637423157691954, "loss/hidden": 3.5609375, "loss/jsd": 0.10491609480232, "loss/logits": 0.0, "step": 2850 }, { "epoch": 0.143, "grad_norm": 16.625, "grad_norm_var": 5.2478515625, "learning_rate": 0.0001, "loss": 4.3874, "loss/crossentropy": 2.36127190887928, "loss/hidden": 3.5421875, "loss/jsd": 0.08668355047702789, "loss/logits": 0.0, "step": 2860 }, { "epoch": 0.1435, "grad_norm": 20.625, "grad_norm_var": 7.6447265625, "learning_rate": 0.0001, "loss": 4.4512, "loss/crossentropy": 2.4082365155220034, "loss/hidden": 3.561328125, "loss/jsd": 0.09474811758846044, "loss/logits": 0.0, "step": 2870 }, { "epoch": 0.144, "grad_norm": 21.125, "grad_norm_var": 8.8734375, "learning_rate": 0.0001, "loss": 4.4895, "loss/crossentropy": 2.2012635439634325, "loss/hidden": 3.4703125, "loss/jsd": 0.08498403234407306, "loss/logits": 0.0, "step": 2880 }, { "epoch": 0.1445, "grad_norm": 21.625, "grad_norm_var": 3.4480305989583333, "learning_rate": 0.0001, "loss": 4.447, "loss/crossentropy": 2.298944839835167, "loss/hidden": 3.555078125, "loss/jsd": 0.09882149025797844, "loss/logits": 0.0, "step": 2890 }, { "epoch": 0.145, "grad_norm": 16.75, "grad_norm_var": 2.8893229166666665, "learning_rate": 0.0001, "loss": 4.5259, "loss/crossentropy": 2.4170736342668535, "loss/hidden": 3.62265625, "loss/jsd": 0.10080426596105099, "loss/logits": 0.0, "step": 2900 }, { "epoch": 0.1455, "grad_norm": 24.75, "grad_norm_var": 9.176041666666666, "learning_rate": 0.0001, "loss": 4.4133, "loss/crossentropy": 2.266545096039772, "loss/hidden": 3.488671875, "loss/jsd": 0.08616750100627542, "loss/logits": 0.0, "step": 2910 }, { "epoch": 0.146, "grad_norm": 23.5, "grad_norm_var": 10.60859375, "learning_rate": 0.0001, "loss": 4.4536, "loss/crossentropy": 2.3165148913860323, "loss/hidden": 3.42890625, "loss/jsd": 0.07857409287244081, "loss/logits": 0.0, "step": 2920 }, { "epoch": 0.1465, "grad_norm": 18.125, "grad_norm_var": 4.756884765625, "learning_rate": 0.0001, "loss": 4.4494, "loss/crossentropy": 2.275170993804932, "loss/hidden": 3.52421875, "loss/jsd": 0.08944948101416231, "loss/logits": 0.0, "step": 2930 }, { "epoch": 0.147, "grad_norm": 19.375, "grad_norm_var": 4.254931640625, "learning_rate": 0.0001, "loss": 4.4176, "loss/crossentropy": 2.0632098406553268, "loss/hidden": 3.54296875, "loss/jsd": 0.09174134442582726, "loss/logits": 0.0, "step": 2940 }, { "epoch": 0.1475, "grad_norm": 20.75, "grad_norm_var": 3.086442057291667, "learning_rate": 0.0001, "loss": 4.4606, "loss/crossentropy": 2.251816061139107, "loss/hidden": 3.43046875, "loss/jsd": 0.08638259647414089, "loss/logits": 0.0, "step": 2950 }, { "epoch": 0.148, "grad_norm": 44.75, "grad_norm_var": 49.42265625, "learning_rate": 0.0001, "loss": 4.4651, "loss/crossentropy": 2.1415898591279983, "loss/hidden": 3.41796875, "loss/jsd": 0.08234252617694438, "loss/logits": 0.0, "step": 2960 }, { "epoch": 0.1485, "grad_norm": 21.0, "grad_norm_var": 47.44993489583333, "learning_rate": 0.0001, "loss": 4.4563, "loss/crossentropy": 2.2365823119878767, "loss/hidden": 3.542578125, "loss/jsd": 0.09432788556441665, "loss/logits": 0.0, "step": 2970 }, { "epoch": 0.149, "grad_norm": 16.875, "grad_norm_var": 3.926155598958333, "learning_rate": 0.0001, "loss": 4.4151, "loss/crossentropy": 2.397647699713707, "loss/hidden": 3.58984375, "loss/jsd": 0.09230242855846882, "loss/logits": 0.0, "step": 2980 }, { "epoch": 0.1495, "grad_norm": 18.25, "grad_norm_var": 2.0455729166666665, "learning_rate": 0.0001, "loss": 4.4923, "loss/crossentropy": 2.270119884610176, "loss/hidden": 3.588671875, "loss/jsd": 0.09977766564115882, "loss/logits": 0.0, "step": 2990 }, { "epoch": 0.15, "grad_norm": 20.875, "grad_norm_var": 5.620947265625, "learning_rate": 0.0001, "loss": 4.424, "loss/crossentropy": 2.274160121381283, "loss/hidden": 3.493359375, "loss/jsd": 0.08368044728413224, "loss/logits": 0.0, "step": 3000 }, { "epoch": 0.1505, "grad_norm": 18.0, "grad_norm_var": 5.805843098958333, "learning_rate": 0.0001, "loss": 4.4366, "loss/crossentropy": 2.2047403126955034, "loss/hidden": 3.48125, "loss/jsd": 0.09485792317427695, "loss/logits": 0.0, "step": 3010 }, { "epoch": 0.151, "grad_norm": 15.75, "grad_norm_var": 3.952018229166667, "learning_rate": 0.0001, "loss": 4.4361, "loss/crossentropy": 2.426369333267212, "loss/hidden": 3.505078125, "loss/jsd": 0.09415734186768532, "loss/logits": 0.0, "step": 3020 }, { "epoch": 0.1515, "grad_norm": 20.5, "grad_norm_var": 27.647916666666667, "learning_rate": 0.0001, "loss": 4.386, "loss/crossentropy": 2.1474297270178795, "loss/hidden": 3.485546875, "loss/jsd": 0.08619686132296919, "loss/logits": 0.0, "step": 3030 }, { "epoch": 0.152, "grad_norm": 18.125, "grad_norm_var": 7.828369140625, "learning_rate": 0.0001, "loss": 4.5285, "loss/crossentropy": 2.343987912684679, "loss/hidden": 3.58515625, "loss/jsd": 0.09171235403046012, "loss/logits": 0.0, "step": 3040 }, { "epoch": 0.1525, "grad_norm": 18.25, "grad_norm_var": 8.158707682291666, "learning_rate": 0.0001, "loss": 4.4394, "loss/crossentropy": 2.3131785288453104, "loss/hidden": 3.59453125, "loss/jsd": 0.09000935666263103, "loss/logits": 0.0, "step": 3050 }, { "epoch": 0.153, "grad_norm": 37.0, "grad_norm_var": 68.850244140625, "learning_rate": 0.0001, "loss": 4.4266, "loss/crossentropy": 2.333009423315525, "loss/hidden": 3.5515625, "loss/jsd": 0.10508564142510295, "loss/logits": 0.0, "step": 3060 }, { "epoch": 0.1535, "grad_norm": 18.75, "grad_norm_var": 69.41712239583333, "learning_rate": 0.0001, "loss": 4.4924, "loss/crossentropy": 2.3548025131225585, "loss/hidden": 3.560546875, "loss/jsd": 0.08788978308439255, "loss/logits": 0.0, "step": 3070 }, { "epoch": 0.154, "grad_norm": 19.125, "grad_norm_var": 2.856494140625, "learning_rate": 0.0001, "loss": 4.4327, "loss/crossentropy": 2.196867881715298, "loss/hidden": 3.598046875, "loss/jsd": 0.09646046198904515, "loss/logits": 0.0, "step": 3080 }, { "epoch": 0.1545, "grad_norm": 22.375, "grad_norm_var": 3.7356770833333335, "learning_rate": 0.0001, "loss": 4.4178, "loss/crossentropy": 2.2703019440174104, "loss/hidden": 3.5359375, "loss/jsd": 0.09918619338423014, "loss/logits": 0.0, "step": 3090 }, { "epoch": 0.155, "grad_norm": 21.5, "grad_norm_var": 10.3275390625, "learning_rate": 0.0001, "loss": 4.3676, "loss/crossentropy": 2.2433530882000925, "loss/hidden": 3.416015625, "loss/jsd": 0.08071139380335808, "loss/logits": 0.0, "step": 3100 }, { "epoch": 0.1555, "grad_norm": 18.5, "grad_norm_var": 10.381510416666666, "learning_rate": 0.0001, "loss": 4.3876, "loss/crossentropy": 2.229444594681263, "loss/hidden": 3.46640625, "loss/jsd": 0.08576273424550891, "loss/logits": 0.0, "step": 3110 }, { "epoch": 0.156, "grad_norm": 15.6875, "grad_norm_var": 4.5712890625, "learning_rate": 0.0001, "loss": 4.345, "loss/crossentropy": 2.2901594534516336, "loss/hidden": 3.35859375, "loss/jsd": 0.07994853192940354, "loss/logits": 0.0, "step": 3120 }, { "epoch": 0.1565, "grad_norm": 15.625, "grad_norm_var": 7.159358723958333, "learning_rate": 0.0001, "loss": 4.3626, "loss/crossentropy": 2.351148310303688, "loss/hidden": 3.499609375, "loss/jsd": 0.089451711345464, "loss/logits": 0.0, "step": 3130 }, { "epoch": 0.157, "grad_norm": 16.375, "grad_norm_var": 6.505192057291667, "learning_rate": 0.0001, "loss": 4.4232, "loss/crossentropy": 2.1818712055683136, "loss/hidden": 3.45234375, "loss/jsd": 0.0778524660039693, "loss/logits": 0.0, "step": 3140 }, { "epoch": 0.1575, "grad_norm": 16.75, "grad_norm_var": 2.1102701822916665, "learning_rate": 0.0001, "loss": 4.4401, "loss/crossentropy": 2.349280393123627, "loss/hidden": 3.570703125, "loss/jsd": 0.09622437562793493, "loss/logits": 0.0, "step": 3150 }, { "epoch": 0.158, "grad_norm": 17.625, "grad_norm_var": 2.701546223958333, "learning_rate": 0.0001, "loss": 4.3796, "loss/crossentropy": 2.329416874051094, "loss/hidden": 3.539453125, "loss/jsd": 0.08784733964130283, "loss/logits": 0.0, "step": 3160 }, { "epoch": 0.1585, "grad_norm": 16.375, "grad_norm_var": 2.156363932291667, "learning_rate": 0.0001, "loss": 4.2973, "loss/crossentropy": 2.2838528990745544, "loss/hidden": 3.38984375, "loss/jsd": 0.08452709410339594, "loss/logits": 0.0, "step": 3170 }, { "epoch": 0.159, "grad_norm": 19.0, "grad_norm_var": 4.3259765625, "learning_rate": 0.0001, "loss": 4.4271, "loss/crossentropy": 2.1395395755767823, "loss/hidden": 3.5203125, "loss/jsd": 0.10172450188547373, "loss/logits": 0.0, "step": 3180 }, { "epoch": 0.1595, "grad_norm": 19.375, "grad_norm_var": 4.408268229166667, "learning_rate": 0.0001, "loss": 4.4574, "loss/crossentropy": 2.394977739453316, "loss/hidden": 3.6234375, "loss/jsd": 0.10171002727001906, "loss/logits": 0.0, "step": 3190 }, { "epoch": 0.16, "grad_norm": 16.875, "grad_norm_var": 6.147379557291667, "learning_rate": 0.0001, "loss": 4.4832, "loss/crossentropy": 2.355364751815796, "loss/hidden": 3.50546875, "loss/jsd": 0.08951211860403419, "loss/logits": 0.0, "step": 3200 }, { "epoch": 0.1605, "grad_norm": 28.625, "grad_norm_var": 1.244752705334018e+18, "learning_rate": 0.0001, "loss": 4.4816, "loss/crossentropy": 2.305925354361534, "loss/hidden": 3.58828125, "loss/jsd": 0.08834340209141374, "loss/logits": 0.0, "step": 3210 }, { "epoch": 0.161, "grad_norm": 18.25, "grad_norm_var": 9.839518229166666, "learning_rate": 0.0001, "loss": 4.4863, "loss/crossentropy": 2.3078080981969835, "loss/hidden": 3.5203125, "loss/jsd": 0.0940877721644938, "loss/logits": 0.0, "step": 3220 }, { "epoch": 0.1615, "grad_norm": 16.125, "grad_norm_var": 2.7150390625, "learning_rate": 0.0001, "loss": 4.4064, "loss/crossentropy": 2.4253244906663896, "loss/hidden": 3.58125, "loss/jsd": 0.09865610068663955, "loss/logits": 0.0, "step": 3230 }, { "epoch": 0.162, "grad_norm": 18.75, "grad_norm_var": 4.934749348958333, "learning_rate": 0.0001, "loss": 4.3174, "loss/crossentropy": 2.4989412158727644, "loss/hidden": 3.4234375, "loss/jsd": 0.08196726078167557, "loss/logits": 0.0, "step": 3240 }, { "epoch": 0.1625, "grad_norm": 16.375, "grad_norm_var": 2.9952962239583334, "learning_rate": 0.0001, "loss": 4.3693, "loss/crossentropy": 2.2475881457328795, "loss/hidden": 3.47109375, "loss/jsd": 0.09224425395950675, "loss/logits": 0.0, "step": 3250 }, { "epoch": 0.163, "grad_norm": 15.3125, "grad_norm_var": 1.9930826822916667, "learning_rate": 0.0001, "loss": 4.3783, "loss/crossentropy": 2.2365039557218553, "loss/hidden": 3.49921875, "loss/jsd": 0.09003520868718624, "loss/logits": 0.0, "step": 3260 }, { "epoch": 0.1635, "grad_norm": 17.375, "grad_norm_var": 2.0921223958333335, "learning_rate": 0.0001, "loss": 4.4329, "loss/crossentropy": 2.266706997156143, "loss/hidden": 3.542578125, "loss/jsd": 0.09133774926885962, "loss/logits": 0.0, "step": 3270 }, { "epoch": 0.164, "grad_norm": 17.0, "grad_norm_var": 2.531510416666667, "learning_rate": 0.0001, "loss": 4.5264, "loss/crossentropy": 2.292432078719139, "loss/hidden": 3.67265625, "loss/jsd": 0.11363288760185242, "loss/logits": 0.0, "step": 3280 }, { "epoch": 0.1645, "grad_norm": 20.125, "grad_norm_var": 2.6884765625, "learning_rate": 0.0001, "loss": 4.3941, "loss/crossentropy": 2.308723744750023, "loss/hidden": 3.60234375, "loss/jsd": 0.09796320544555784, "loss/logits": 0.0, "step": 3290 }, { "epoch": 0.165, "grad_norm": 19.0, "grad_norm_var": 3.3436848958333334, "learning_rate": 0.0001, "loss": 4.3738, "loss/crossentropy": 2.3898502081632613, "loss/hidden": 3.471484375, "loss/jsd": 0.08741156700998545, "loss/logits": 0.0, "step": 3300 }, { "epoch": 0.1655, "grad_norm": 19.875, "grad_norm_var": 3.4898274739583335, "learning_rate": 0.0001, "loss": 4.3816, "loss/crossentropy": 2.306541550159454, "loss/hidden": 3.592578125, "loss/jsd": 0.09893495552241802, "loss/logits": 0.0, "step": 3310 }, { "epoch": 0.166, "grad_norm": 14.8125, "grad_norm_var": 2.6378743489583334, "learning_rate": 0.0001, "loss": 4.37, "loss/crossentropy": 2.3887303933501243, "loss/hidden": 3.480859375, "loss/jsd": 0.08493705298751593, "loss/logits": 0.0, "step": 3320 }, { "epoch": 0.1665, "grad_norm": 21.875, "grad_norm_var": 3.0442545572916666, "learning_rate": 0.0001, "loss": 4.4504, "loss/crossentropy": 2.3878179833292963, "loss/hidden": 3.632421875, "loss/jsd": 0.09913788838312029, "loss/logits": 0.0, "step": 3330 }, { "epoch": 0.167, "grad_norm": 20.25, "grad_norm_var": 10.913785807291667, "learning_rate": 0.0001, "loss": 4.4553, "loss/crossentropy": 2.2205622404813767, "loss/hidden": 3.636328125, "loss/jsd": 0.10992270009592175, "loss/logits": 0.0, "step": 3340 }, { "epoch": 0.1675, "grad_norm": 22.0, "grad_norm_var": 7.966520182291666, "learning_rate": 0.0001, "loss": 4.3545, "loss/crossentropy": 2.108812813460827, "loss/hidden": 3.40234375, "loss/jsd": 0.0757693353574723, "loss/logits": 0.0, "step": 3350 }, { "epoch": 0.168, "grad_norm": 15.6875, "grad_norm_var": 3.8549479166666667, "learning_rate": 0.0001, "loss": 4.3713, "loss/crossentropy": 2.289568629860878, "loss/hidden": 3.541015625, "loss/jsd": 0.10061329454183579, "loss/logits": 0.0, "step": 3360 }, { "epoch": 0.1685, "grad_norm": 17.5, "grad_norm_var": 4.482145182291666, "learning_rate": 0.0001, "loss": 4.4519, "loss/crossentropy": 2.352386988699436, "loss/hidden": 3.42734375, "loss/jsd": 0.08975700601004064, "loss/logits": 0.0, "step": 3370 }, { "epoch": 0.169, "grad_norm": 18.875, "grad_norm_var": 4.2978515625, "learning_rate": 0.0001, "loss": 4.3503, "loss/crossentropy": 2.31557312309742, "loss/hidden": 3.548828125, "loss/jsd": 0.0878440142609179, "loss/logits": 0.0, "step": 3380 }, { "epoch": 0.1695, "grad_norm": 20.875, "grad_norm_var": 5.702604166666666, "learning_rate": 0.0001, "loss": 4.3864, "loss/crossentropy": 2.339674559235573, "loss/hidden": 3.464453125, "loss/jsd": 0.0880395533517003, "loss/logits": 0.0, "step": 3390 }, { "epoch": 0.17, "grad_norm": 15.1875, "grad_norm_var": 5.098551432291667, "learning_rate": 0.0001, "loss": 4.3726, "loss/crossentropy": 2.2533027648925783, "loss/hidden": 3.4921875, "loss/jsd": 0.08741035936400295, "loss/logits": 0.0, "step": 3400 }, { "epoch": 0.1705, "grad_norm": 21.875, "grad_norm_var": 3.4983723958333335, "learning_rate": 0.0001, "loss": 4.3701, "loss/crossentropy": 2.280165506899357, "loss/hidden": 3.52890625, "loss/jsd": 0.09410012043081224, "loss/logits": 0.0, "step": 3410 }, { "epoch": 0.171, "grad_norm": 15.75, "grad_norm_var": 3.486962890625, "learning_rate": 0.0001, "loss": 4.4465, "loss/crossentropy": 2.3110105454921723, "loss/hidden": 3.54453125, "loss/jsd": 0.10350852748379111, "loss/logits": 0.0, "step": 3420 }, { "epoch": 0.1715, "grad_norm": 15.0625, "grad_norm_var": 1.7901041666666666, "learning_rate": 0.0001, "loss": 4.2987, "loss/crossentropy": 2.5183032125234606, "loss/hidden": 3.541015625, "loss/jsd": 0.0940008645877242, "loss/logits": 0.0, "step": 3430 }, { "epoch": 0.172, "grad_norm": 16.625, "grad_norm_var": 1.3207509498935816e+18, "learning_rate": 0.0001, "loss": 4.3968, "loss/crossentropy": 2.298141914606094, "loss/hidden": 3.538671875, "loss/jsd": 0.09232875565066934, "loss/logits": 0.0, "step": 3440 }, { "epoch": 0.1725, "grad_norm": 25.875, "grad_norm_var": 10.539176432291667, "learning_rate": 0.0001, "loss": 4.405, "loss/crossentropy": 2.4251868039369584, "loss/hidden": 3.620703125, "loss/jsd": 0.09764928705990314, "loss/logits": 0.0, "step": 3450 }, { "epoch": 0.173, "grad_norm": 16.5, "grad_norm_var": 37.917301432291666, "learning_rate": 0.0001, "loss": 4.3521, "loss/crossentropy": 2.4465878754854202, "loss/hidden": 3.41328125, "loss/jsd": 0.09263761136680841, "loss/logits": 0.0, "step": 3460 }, { "epoch": 0.1735, "grad_norm": 19.5, "grad_norm_var": 10.325895182291667, "learning_rate": 0.0001, "loss": 4.3055, "loss/crossentropy": 2.341625288128853, "loss/hidden": 3.46171875, "loss/jsd": 0.08955673705786467, "loss/logits": 0.0, "step": 3470 }, { "epoch": 0.174, "grad_norm": 18.25, "grad_norm_var": 9.155582682291667, "learning_rate": 0.0001, "loss": 4.257, "loss/crossentropy": 2.2626075088977813, "loss/hidden": 3.38046875, "loss/jsd": 0.08382235984317958, "loss/logits": 0.0, "step": 3480 }, { "epoch": 0.1745, "grad_norm": 24.25, "grad_norm_var": 40.449853515625, "learning_rate": 0.0001, "loss": 4.365, "loss/crossentropy": 2.22887095361948, "loss/hidden": 3.495703125, "loss/jsd": 0.08516010586172343, "loss/logits": 0.0, "step": 3490 }, { "epoch": 0.175, "grad_norm": 29.625, "grad_norm_var": 22.202848307291667, "learning_rate": 0.0001, "loss": 4.4211, "loss/crossentropy": 2.325581954419613, "loss/hidden": 3.626171875, "loss/jsd": 0.10623239502310752, "loss/logits": 0.0, "step": 3500 }, { "epoch": 0.1755, "grad_norm": 18.25, "grad_norm_var": 20.083268229166666, "learning_rate": 0.0001, "loss": 4.4183, "loss/crossentropy": 2.3138944447040557, "loss/hidden": 3.605078125, "loss/jsd": 0.09691860349848866, "loss/logits": 0.0, "step": 3510 }, { "epoch": 0.176, "grad_norm": 14.75, "grad_norm_var": 3.0729166666666665, "learning_rate": 0.0001, "loss": 4.3327, "loss/crossentropy": 2.335177455097437, "loss/hidden": 3.516796875, "loss/jsd": 0.08317867233417928, "loss/logits": 0.0, "step": 3520 }, { "epoch": 0.1765, "grad_norm": 29.375, "grad_norm_var": 313.28943684895836, "learning_rate": 0.0001, "loss": 4.3804, "loss/crossentropy": 2.2869663372635842, "loss/hidden": 3.3875, "loss/jsd": 0.07983446251600981, "loss/logits": 0.0, "step": 3530 }, { "epoch": 0.177, "grad_norm": 18.75, "grad_norm_var": 294.8395182291667, "learning_rate": 0.0001, "loss": 4.4268, "loss/crossentropy": 2.3664276599884033, "loss/hidden": 3.541796875, "loss/jsd": 0.09534696582704782, "loss/logits": 0.0, "step": 3540 }, { "epoch": 0.1775, "grad_norm": 16.875, "grad_norm_var": 9.458707682291667, "learning_rate": 0.0001, "loss": 4.2841, "loss/crossentropy": 2.3651267111301424, "loss/hidden": 3.37890625, "loss/jsd": 0.08079936136491597, "loss/logits": 0.0, "step": 3550 }, { "epoch": 0.178, "grad_norm": 21.125, "grad_norm_var": 4.593473307291666, "learning_rate": 0.0001, "loss": 4.2825, "loss/crossentropy": 2.392011249065399, "loss/hidden": 3.41015625, "loss/jsd": 0.09476534733548761, "loss/logits": 0.0, "step": 3560 }, { "epoch": 0.1785, "grad_norm": 23.0, "grad_norm_var": 3.734375, "learning_rate": 0.0001, "loss": 4.3928, "loss/crossentropy": 2.4183569096028803, "loss/hidden": 3.3953125, "loss/jsd": 0.0872859289869666, "loss/logits": 0.0, "step": 3570 }, { "epoch": 0.179, "grad_norm": 17.375, "grad_norm_var": 6.357405598958334, "learning_rate": 0.0001, "loss": 4.366, "loss/crossentropy": 2.228242626786232, "loss/hidden": 3.61484375, "loss/jsd": 0.09655670188367367, "loss/logits": 0.0, "step": 3580 }, { "epoch": 0.1795, "grad_norm": 19.625, "grad_norm_var": 4.720247395833334, "learning_rate": 0.0001, "loss": 4.3721, "loss/crossentropy": 2.3514621019363404, "loss/hidden": 3.55703125, "loss/jsd": 0.08998525207862258, "loss/logits": 0.0, "step": 3590 }, { "epoch": 0.18, "grad_norm": 17.75, "grad_norm_var": 86.24099934895834, "learning_rate": 0.0001, "loss": 4.2887, "loss/crossentropy": 2.3266086250543596, "loss/hidden": 3.3921875, "loss/jsd": 0.08203610377386213, "loss/logits": 0.0, "step": 3600 }, { "epoch": 0.1805, "grad_norm": 17.125, "grad_norm_var": 6.689428671005983e+17, "learning_rate": 0.0001, "loss": 4.2832, "loss/crossentropy": 2.358739697933197, "loss/hidden": 3.381640625, "loss/jsd": 0.07689286703243851, "loss/logits": 0.0, "step": 3610 }, { "epoch": 0.181, "grad_norm": 19.75, "grad_norm_var": 6.689428671857951e+17, "learning_rate": 0.0001, "loss": 4.2562, "loss/crossentropy": 2.299892693758011, "loss/hidden": 3.358203125, "loss/jsd": 0.07548968028277159, "loss/logits": 0.0, "step": 3620 }, { "epoch": 0.1815, "grad_norm": 16.5, "grad_norm_var": 3.8056640625, "learning_rate": 0.0001, "loss": 4.3453, "loss/crossentropy": 2.1600609093904497, "loss/hidden": 3.521484375, "loss/jsd": 0.0914209995418787, "loss/logits": 0.0, "step": 3630 }, { "epoch": 0.182, "grad_norm": 18.375, "grad_norm_var": 5.445833333333334, "learning_rate": 0.0001, "loss": 4.261, "loss/crossentropy": 2.2630960240960123, "loss/hidden": 3.2734375, "loss/jsd": 0.07350569609552622, "loss/logits": 0.0, "step": 3640 }, { "epoch": 0.1825, "grad_norm": 19.5, "grad_norm_var": 4.786393229166666, "learning_rate": 0.0001, "loss": 4.2941, "loss/crossentropy": 2.26744422018528, "loss/hidden": 3.434765625, "loss/jsd": 0.09071792410686612, "loss/logits": 0.0, "step": 3650 }, { "epoch": 0.183, "grad_norm": 16.375, "grad_norm_var": 3.675520833333333, "learning_rate": 0.0001, "loss": 4.216, "loss/crossentropy": 2.392577236890793, "loss/hidden": 3.39140625, "loss/jsd": 0.0848071664571762, "loss/logits": 0.0, "step": 3660 }, { "epoch": 0.1835, "grad_norm": 18.625, "grad_norm_var": 2.8169270833333333, "learning_rate": 0.0001, "loss": 4.2512, "loss/crossentropy": 2.3719822376966477, "loss/hidden": 3.4421875, "loss/jsd": 0.08796066055074334, "loss/logits": 0.0, "step": 3670 }, { "epoch": 0.184, "grad_norm": 18.125, "grad_norm_var": 5.067692057291667, "learning_rate": 0.0001, "loss": 4.2015, "loss/crossentropy": 2.2736427552998064, "loss/hidden": 3.362109375, "loss/jsd": 0.0805484069045633, "loss/logits": 0.0, "step": 3680 }, { "epoch": 0.1845, "grad_norm": 14.8125, "grad_norm_var": 5.062434895833333, "learning_rate": 0.0001, "loss": 4.2791, "loss/crossentropy": 2.298249673843384, "loss/hidden": 3.403515625, "loss/jsd": 0.08579938132315874, "loss/logits": 0.0, "step": 3690 }, { "epoch": 0.185, "grad_norm": 17.75, "grad_norm_var": 5.132747395833333, "learning_rate": 0.0001, "loss": 4.2845, "loss/crossentropy": 2.29532730281353, "loss/hidden": 3.43359375, "loss/jsd": 0.08520804699510336, "loss/logits": 0.0, "step": 3700 }, { "epoch": 0.1855, "grad_norm": 21.375, "grad_norm_var": 3.1105305989583334, "learning_rate": 0.0001, "loss": 4.2881, "loss/crossentropy": 2.266036620736122, "loss/hidden": 3.4421875, "loss/jsd": 0.08662721011787652, "loss/logits": 0.0, "step": 3710 }, { "epoch": 0.186, "grad_norm": 17.875, "grad_norm_var": 2.1442057291666665, "learning_rate": 0.0001, "loss": 4.3015, "loss/crossentropy": 2.3964017778635025, "loss/hidden": 3.443359375, "loss/jsd": 0.08621067805215717, "loss/logits": 0.0, "step": 3720 }, { "epoch": 0.1865, "grad_norm": 16.5, "grad_norm_var": 3.620556640625, "learning_rate": 0.0001, "loss": 4.2411, "loss/crossentropy": 2.3594220340251923, "loss/hidden": 3.336328125, "loss/jsd": 0.0772560654208064, "loss/logits": 0.0, "step": 3730 }, { "epoch": 0.187, "grad_norm": 16.75, "grad_norm_var": 3.252978515625, "learning_rate": 0.0001, "loss": 4.184, "loss/crossentropy": 2.2494852378964425, "loss/hidden": 3.32421875, "loss/jsd": 0.08321888605132699, "loss/logits": 0.0, "step": 3740 }, { "epoch": 0.1875, "grad_norm": 18.875, "grad_norm_var": 3.7570149739583334, "learning_rate": 0.0001, "loss": 4.2329, "loss/crossentropy": 2.178547790646553, "loss/hidden": 3.322265625, "loss/jsd": 0.07429210902191699, "loss/logits": 0.0, "step": 3750 }, { "epoch": 0.188, "grad_norm": 16.125, "grad_norm_var": 4.394645182291667, "learning_rate": 0.0001, "loss": 4.2641, "loss/crossentropy": 2.2659785449504852, "loss/hidden": 3.4265625, "loss/jsd": 0.08605121849104762, "loss/logits": 0.0, "step": 3760 }, { "epoch": 0.1885, "grad_norm": 19.375, "grad_norm_var": 4.303889973958333, "learning_rate": 0.0001, "loss": 4.2343, "loss/crossentropy": 2.3981280818581583, "loss/hidden": 3.384375, "loss/jsd": 0.0853766439948231, "loss/logits": 0.0, "step": 3770 }, { "epoch": 0.189, "grad_norm": 15.75, "grad_norm_var": 3.1048014322916666, "learning_rate": 0.0001, "loss": 4.2052, "loss/crossentropy": 2.2695484533905983, "loss/hidden": 3.512109375, "loss/jsd": 0.09265543352812529, "loss/logits": 0.0, "step": 3780 }, { "epoch": 0.1895, "grad_norm": 15.3125, "grad_norm_var": 2.510139973958333, "learning_rate": 0.0001, "loss": 4.2526, "loss/crossentropy": 2.1818419501185415, "loss/hidden": 3.376171875, "loss/jsd": 0.08593555409461259, "loss/logits": 0.0, "step": 3790 }, { "epoch": 0.19, "grad_norm": 15.0625, "grad_norm_var": 2.105729166666667, "learning_rate": 0.0001, "loss": 4.2301, "loss/crossentropy": 2.218023180961609, "loss/hidden": 3.334765625, "loss/jsd": 0.07895527156069874, "loss/logits": 0.0, "step": 3800 }, { "epoch": 0.1905, "grad_norm": 15.5625, "grad_norm_var": 6.692708333333333, "learning_rate": 0.0001, "loss": 4.241, "loss/crossentropy": 2.2893342286348344, "loss/hidden": 3.296875, "loss/jsd": 0.07659890875220299, "loss/logits": 0.0, "step": 3810 }, { "epoch": 0.191, "grad_norm": 17.25, "grad_norm_var": 5.272509765625, "learning_rate": 0.0001, "loss": 4.2655, "loss/crossentropy": 2.207420842349529, "loss/hidden": 3.36484375, "loss/jsd": 0.08572290684096515, "loss/logits": 0.0, "step": 3820 }, { "epoch": 0.1915, "grad_norm": 21.625, "grad_norm_var": 5.355712890625, "learning_rate": 0.0001, "loss": 4.2474, "loss/crossentropy": 2.3076944231986998, "loss/hidden": 3.31796875, "loss/jsd": 0.07485279012471438, "loss/logits": 0.0, "step": 3830 }, { "epoch": 0.192, "grad_norm": 15.25, "grad_norm_var": 6.646858723958333, "learning_rate": 0.0001, "loss": 4.2634, "loss/crossentropy": 2.42186721265316, "loss/hidden": 3.4078125, "loss/jsd": 0.08714157855138183, "loss/logits": 0.0, "step": 3840 }, { "epoch": 0.1925, "grad_norm": 4076863488.0, "grad_norm_var": 1.0388009843068502e+18, "learning_rate": 0.0001, "loss": 4.2739, "loss/crossentropy": 2.3014174938201903, "loss/hidden": 3.364453125, "loss/jsd": 0.07954654460772873, "loss/logits": 0.0, "step": 3850 }, { "epoch": 0.193, "grad_norm": 17.5, "grad_norm_var": 1.0388009847272768e+18, "learning_rate": 0.0001, "loss": 4.2215, "loss/crossentropy": 2.348978337645531, "loss/hidden": 3.45625, "loss/jsd": 0.08323998479172587, "loss/logits": 0.0, "step": 3860 }, { "epoch": 0.1935, "grad_norm": 18.75, "grad_norm_var": 3.981884765625, "learning_rate": 0.0001, "loss": 4.1528, "loss/crossentropy": 2.3754432618618013, "loss/hidden": 3.401171875, "loss/jsd": 0.08722320841625333, "loss/logits": 0.0, "step": 3870 }, { "epoch": 0.194, "grad_norm": 18.875, "grad_norm_var": 3.824072265625, "learning_rate": 0.0001, "loss": 4.2868, "loss/crossentropy": 2.3063534289598464, "loss/hidden": 3.43515625, "loss/jsd": 0.08732216758653522, "loss/logits": 0.0, "step": 3880 }, { "epoch": 0.1945, "grad_norm": 18.25, "grad_norm_var": 3.3018229166666666, "learning_rate": 0.0001, "loss": 4.2463, "loss/crossentropy": 2.4114058747887612, "loss/hidden": 3.395703125, "loss/jsd": 0.08345712553709746, "loss/logits": 0.0, "step": 3890 }, { "epoch": 0.195, "grad_norm": 18.0, "grad_norm_var": 3.595833333333333, "learning_rate": 0.0001, "loss": 4.2128, "loss/crossentropy": 2.1565380930900573, "loss/hidden": 3.23984375, "loss/jsd": 0.07183347269892693, "loss/logits": 0.0, "step": 3900 }, { "epoch": 0.1955, "grad_norm": 19.5, "grad_norm_var": 1.669775390625, "learning_rate": 0.0001, "loss": 4.2174, "loss/crossentropy": 2.4012755006551743, "loss/hidden": 3.35859375, "loss/jsd": 0.08356887567788363, "loss/logits": 0.0, "step": 3910 }, { "epoch": 0.196, "grad_norm": 16.75, "grad_norm_var": 2.569775390625, "learning_rate": 0.0001, "loss": 4.2516, "loss/crossentropy": 2.4133204758167266, "loss/hidden": 3.405078125, "loss/jsd": 0.08416441585868598, "loss/logits": 0.0, "step": 3920 }, { "epoch": 0.1965, "grad_norm": 16.125, "grad_norm_var": 4.249072265625, "learning_rate": 0.0001, "loss": 4.2424, "loss/crossentropy": 2.2017408296465875, "loss/hidden": 3.434375, "loss/jsd": 0.08481362634338438, "loss/logits": 0.0, "step": 3930 }, { "epoch": 0.197, "grad_norm": 18.0, "grad_norm_var": 13.563541666666667, "learning_rate": 0.0001, "loss": 4.2145, "loss/crossentropy": 2.1327252730727198, "loss/hidden": 3.358203125, "loss/jsd": 0.08263032594695688, "loss/logits": 0.0, "step": 3940 }, { "epoch": 0.1975, "grad_norm": 15.9375, "grad_norm_var": 13.279801432291666, "learning_rate": 0.0001, "loss": 4.271, "loss/crossentropy": 2.3732340067625044, "loss/hidden": 3.38984375, "loss/jsd": 0.09080582885071635, "loss/logits": 0.0, "step": 3950 }, { "epoch": 0.198, "grad_norm": 14.125, "grad_norm_var": 3.5541015625, "learning_rate": 0.0001, "loss": 4.3277, "loss/crossentropy": 2.2829252019524575, "loss/hidden": 3.504296875, "loss/jsd": 0.09264815384522081, "loss/logits": 0.0, "step": 3960 }, { "epoch": 0.1985, "grad_norm": 20.125, "grad_norm_var": 5.493212890625, "learning_rate": 0.0001, "loss": 4.3215, "loss/crossentropy": 2.284733434021473, "loss/hidden": 3.394921875, "loss/jsd": 0.08987429440021515, "loss/logits": 0.0, "step": 3970 }, { "epoch": 0.199, "grad_norm": 17.0, "grad_norm_var": 5.512223307291666, "learning_rate": 0.0001, "loss": 4.2933, "loss/crossentropy": 2.2337097018957137, "loss/hidden": 3.3640625, "loss/jsd": 0.0808649729937315, "loss/logits": 0.0, "step": 3980 }, { "epoch": 0.1995, "grad_norm": 16.625, "grad_norm_var": 12.917122395833333, "learning_rate": 0.0001, "loss": 4.2148, "loss/crossentropy": 2.3057729706168173, "loss/hidden": 3.411328125, "loss/jsd": 0.08738104859367013, "loss/logits": 0.0, "step": 3990 }, { "epoch": 0.2, "grad_norm": 15.9375, "grad_norm_var": 5.007145182291667, "learning_rate": 0.0001, "loss": 4.217, "loss/crossentropy": 2.3626988530158997, "loss/hidden": 3.44296875, "loss/jsd": 0.09443312305957079, "loss/logits": 0.0, "step": 4000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1430040128035226e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }