| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 2000, |
| "global_step": 4000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0005, |
| "grad_norm": 2128.0, |
| "learning_rate": 1.9e-05, |
| "loss": 69.9557, |
| "loss/crossentropy": 12.354743599891663, |
| "loss/hidden": 18.71875, |
| "loss/jsd": 5.161534905433655, |
| "loss/logits": 0.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 266.0, |
| "grad_norm_var": 15343106.783333333, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 52.9613, |
| "loss/crossentropy": 9.517439389228821, |
| "loss/hidden": 18.68125, |
| "loss/jsd": 3.39926393032074, |
| "loss/logits": 0.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 186.0, |
| "grad_norm_var": 174925.440625, |
| "learning_rate": 3.7e-05, |
| "loss": 48.1973, |
| "loss/crossentropy": 8.46514676809311, |
| "loss/hidden": 18.065625, |
| "loss/jsd": 2.9373991966247557, |
| "loss/logits": 0.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 532.0, |
| "grad_norm_var": 39180.229166666664, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 45.9066, |
| "loss/crossentropy": 8.040922927856446, |
| "loss/hidden": 17.096875, |
| "loss/jsd": 2.860607051849365, |
| "loss/logits": 0.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 334.0, |
| "grad_norm_var": 38047.8, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 41.576, |
| "loss/crossentropy": 7.545825862884522, |
| "loss/hidden": 16.803125, |
| "loss/jsd": 2.476469251513481, |
| "loss/logits": 0.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 238.0, |
| "grad_norm_var": 76798.25, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 36.7377, |
| "loss/crossentropy": 6.656394875049591, |
| "loss/hidden": 15.9859375, |
| "loss/jsd": 2.101923054456711, |
| "loss/logits": 0.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 221.0, |
| "grad_norm_var": 72765.58333333333, |
| "learning_rate": 7.3e-05, |
| "loss": 28.2567, |
| "loss/crossentropy": 5.261470526456833, |
| "loss/hidden": 13.6265625, |
| "loss/jsd": 1.4439617365598678, |
| "loss/logits": 0.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 185.0, |
| "grad_norm_var": 16524.266666666666, |
| "learning_rate": 8.200000000000001e-05, |
| "loss": 19.3251, |
| "loss/crossentropy": 4.03509070277214, |
| "loss/hidden": 11.1984375, |
| "loss/jsd": 0.8947193071246147, |
| "loss/logits": 0.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 107.5, |
| "grad_norm_var": 1300.190625, |
| "learning_rate": 9.1e-05, |
| "loss": 14.15, |
| "loss/crossentropy": 3.2564123183488847, |
| "loss/hidden": 9.371875, |
| "loss/jsd": 0.4821927219629288, |
| "loss/logits": 0.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 113.0, |
| "grad_norm_var": 747.6072916666667, |
| "learning_rate": 0.0001, |
| "loss": 12.3004, |
| "loss/crossentropy": 2.9699372231960295, |
| "loss/hidden": 8.3890625, |
| "loss/jsd": 0.37094187960028646, |
| "loss/logits": 0.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 143.0, |
| "grad_norm_var": 1498.7333333333333, |
| "learning_rate": 0.0001, |
| "loss": 11.0558, |
| "loss/crossentropy": 3.028834396600723, |
| "loss/hidden": 7.6984375, |
| "loss/jsd": 0.32162978053092955, |
| "loss/logits": 0.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 141.0, |
| "grad_norm_var": 384.065625, |
| "learning_rate": 0.0001, |
| "loss": 10.3695, |
| "loss/crossentropy": 2.8776101738214495, |
| "loss/hidden": 7.54375, |
| "loss/jsd": 0.31908423118293283, |
| "loss/logits": 0.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 126.5, |
| "grad_norm_var": 376.53229166666665, |
| "learning_rate": 0.0001, |
| "loss": 9.7913, |
| "loss/crossentropy": 2.742277052998543, |
| "loss/hidden": 7.16328125, |
| "loss/jsd": 0.2711725488305092, |
| "loss/logits": 0.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 129.0, |
| "grad_norm_var": 266.0, |
| "learning_rate": 0.0001, |
| "loss": 9.524, |
| "loss/crossentropy": 2.4384234696626663, |
| "loss/hidden": 6.9765625, |
| "loss/jsd": 0.2616196651011705, |
| "loss/logits": 0.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 100.5, |
| "grad_norm_var": 409.03229166666665, |
| "learning_rate": 0.0001, |
| "loss": 9.1046, |
| "loss/crossentropy": 2.8043846026062966, |
| "loss/hidden": 6.6234375, |
| "loss/jsd": 0.2574016904458404, |
| "loss/logits": 0.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 78.0, |
| "grad_norm_var": 385.70729166666666, |
| "learning_rate": 0.0001, |
| "loss": 8.961, |
| "loss/crossentropy": 2.6820163667201995, |
| "loss/hidden": 6.44609375, |
| "loss/jsd": 0.22497861441224815, |
| "loss/logits": 0.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 73.0, |
| "grad_norm_var": 180.540625, |
| "learning_rate": 0.0001, |
| "loss": 8.6716, |
| "loss/crossentropy": 2.56088288128376, |
| "loss/hidden": 6.525, |
| "loss/jsd": 0.23445787131786347, |
| "loss/logits": 0.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 70.5, |
| "grad_norm_var": 66.43333333333334, |
| "learning_rate": 0.0001, |
| "loss": 8.3449, |
| "loss/crossentropy": 2.5659249514341353, |
| "loss/hidden": 6.0875, |
| "loss/jsd": 0.20521375369280576, |
| "loss/logits": 0.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 59.0, |
| "grad_norm_var": 125.99895833333333, |
| "learning_rate": 0.0001, |
| "loss": 8.2048, |
| "loss/crossentropy": 2.4801410123705865, |
| "loss/hidden": 5.96875, |
| "loss/jsd": 0.20023126248270273, |
| "loss/logits": 0.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 95.5, |
| "grad_norm_var": 151.215625, |
| "learning_rate": 0.0001, |
| "loss": 7.9327, |
| "loss/crossentropy": 2.7575797021389006, |
| "loss/hidden": 5.9078125, |
| "loss/jsd": 0.21125836484134197, |
| "loss/logits": 0.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 69.5, |
| "grad_norm_var": 81.01666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.867, |
| "loss/crossentropy": 2.584353247284889, |
| "loss/hidden": 5.79140625, |
| "loss/jsd": 0.18391123060137032, |
| "loss/logits": 0.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 67.0, |
| "grad_norm_var": 111.115625, |
| "learning_rate": 0.0001, |
| "loss": 7.5262, |
| "loss/crossentropy": 2.5395505383610724, |
| "loss/hidden": 5.68125, |
| "loss/jsd": 0.17292506210505962, |
| "loss/logits": 0.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 80.0, |
| "grad_norm_var": 114.725, |
| "learning_rate": 0.0001, |
| "loss": 7.553, |
| "loss/crossentropy": 2.469125708937645, |
| "loss/hidden": 5.62890625, |
| "loss/jsd": 0.1715977793559432, |
| "loss/logits": 0.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 59.75, |
| "grad_norm_var": 93.15, |
| "learning_rate": 0.0001, |
| "loss": 7.3673, |
| "loss/crossentropy": 2.572914382815361, |
| "loss/hidden": 5.51171875, |
| "loss/jsd": 0.18267902322113513, |
| "loss/logits": 0.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 53.5, |
| "grad_norm_var": 99.42395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3184, |
| "loss/crossentropy": 2.6171721309423446, |
| "loss/hidden": 5.53515625, |
| "loss/jsd": 0.17945121377706527, |
| "loss/logits": 0.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 48.75, |
| "grad_norm_var": 118.975, |
| "learning_rate": 0.0001, |
| "loss": 7.2085, |
| "loss/crossentropy": 2.4379070818424227, |
| "loss/hidden": 5.509375, |
| "loss/jsd": 0.1914055148139596, |
| "loss/logits": 0.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 67.5, |
| "grad_norm_var": 102.2875, |
| "learning_rate": 0.0001, |
| "loss": 7.0206, |
| "loss/crossentropy": 2.5107616782188416, |
| "loss/hidden": 5.4359375, |
| "loss/jsd": 0.19947240259498358, |
| "loss/logits": 0.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 67.5, |
| "grad_norm_var": 136.27057291666668, |
| "learning_rate": 0.0001, |
| "loss": 6.9768, |
| "loss/crossentropy": 2.4130793780088426, |
| "loss/hidden": 5.28359375, |
| "loss/jsd": 0.18424466587603092, |
| "loss/logits": 0.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 62.0, |
| "grad_norm_var": 98.04895833333333, |
| "learning_rate": 0.0001, |
| "loss": 6.8743, |
| "loss/crossentropy": 2.382996806502342, |
| "loss/hidden": 5.20625, |
| "loss/jsd": 0.1648038787767291, |
| "loss/logits": 0.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 57.75, |
| "grad_norm_var": 81.59895833333333, |
| "learning_rate": 0.0001, |
| "loss": 6.7946, |
| "loss/crossentropy": 2.5844862312078476, |
| "loss/hidden": 5.22265625, |
| "loss/jsd": 0.1599080903455615, |
| "loss/logits": 0.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 77.0, |
| "grad_norm_var": 103.13229166666666, |
| "learning_rate": 0.0001, |
| "loss": 6.7739, |
| "loss/crossentropy": 2.4337188243865966, |
| "loss/hidden": 4.98046875, |
| "loss/jsd": 0.14282424729317428, |
| "loss/logits": 0.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 51.75, |
| "grad_norm_var": 154.47265625, |
| "learning_rate": 0.0001, |
| "loss": 6.6113, |
| "loss/crossentropy": 2.516378104686737, |
| "loss/hidden": 5.03828125, |
| "loss/jsd": 0.1448629444465041, |
| "loss/logits": 0.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 47.0, |
| "grad_norm_var": 43.2625, |
| "learning_rate": 0.0001, |
| "loss": 6.4669, |
| "loss/crossentropy": 2.5109775930643083, |
| "loss/hidden": 4.9265625, |
| "loss/jsd": 0.14978713616728784, |
| "loss/logits": 0.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 57.75, |
| "grad_norm_var": 64.21087239583333, |
| "learning_rate": 0.0001, |
| "loss": 6.4713, |
| "loss/crossentropy": 2.497659134864807, |
| "loss/hidden": 4.91796875, |
| "loss/jsd": 0.14760203529149293, |
| "loss/logits": 0.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 52.5, |
| "grad_norm_var": 120.26243489583334, |
| "learning_rate": 0.0001, |
| "loss": 6.4978, |
| "loss/crossentropy": 2.402846799790859, |
| "loss/hidden": 4.7796875, |
| "loss/jsd": 0.13832223881036043, |
| "loss/logits": 0.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 46.5, |
| "grad_norm_var": 21.149739583333332, |
| "learning_rate": 0.0001, |
| "loss": 6.3607, |
| "loss/crossentropy": 2.3924304962158205, |
| "loss/hidden": 4.9890625, |
| "loss/jsd": 0.1568290738388896, |
| "loss/logits": 0.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 44.75, |
| "grad_norm_var": 49.326822916666664, |
| "learning_rate": 0.0001, |
| "loss": 6.3592, |
| "loss/crossentropy": 2.4209784388542177, |
| "loss/hidden": 4.89765625, |
| "loss/jsd": 0.14134480394423007, |
| "loss/logits": 0.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 43.75, |
| "grad_norm_var": 71.22057291666667, |
| "learning_rate": 0.0001, |
| "loss": 6.2124, |
| "loss/crossentropy": 2.549247406423092, |
| "loss/hidden": 4.7390625, |
| "loss/jsd": 0.140831589885056, |
| "loss/logits": 0.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 92.5, |
| "grad_norm_var": 9.065077296740351e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.2864, |
| "loss/crossentropy": 2.4922619298100472, |
| "loss/hidden": 4.87734375, |
| "loss/jsd": 0.1634673684835434, |
| "loss/logits": 0.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 43.75, |
| "grad_norm_var": 9.065077288409414e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.2254, |
| "loss/crossentropy": 2.469875320792198, |
| "loss/hidden": 4.82265625, |
| "loss/jsd": 0.1564602382481098, |
| "loss/logits": 0.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 56.0, |
| "grad_norm_var": 47.498958333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.1795, |
| "loss/crossentropy": 2.548477476835251, |
| "loss/hidden": 4.75703125, |
| "loss/jsd": 0.17199970744550228, |
| "loss/logits": 0.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 54.75, |
| "grad_norm_var": 720.9768229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.252, |
| "loss/crossentropy": 2.479714798927307, |
| "loss/hidden": 4.68828125, |
| "loss/jsd": 0.1501935562118888, |
| "loss/logits": 0.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 47.25, |
| "grad_norm_var": 723.6166666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.1002, |
| "loss/crossentropy": 2.529230397939682, |
| "loss/hidden": 4.7921875, |
| "loss/jsd": 0.15877617206424474, |
| "loss/logits": 0.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 53.0, |
| "grad_norm_var": 1.207597994464615e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.0501, |
| "loss/crossentropy": 2.2404126971960068, |
| "loss/hidden": 4.54140625, |
| "loss/jsd": 0.1322522010654211, |
| "loss/logits": 0.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 3875536896.0, |
| "grad_norm_var": 2.004372710541947e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.1466, |
| "loss/crossentropy": 2.430220237374306, |
| "loss/hidden": 4.62109375, |
| "loss/jsd": 0.14306345414370297, |
| "loss/logits": 0.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 43.0, |
| "grad_norm_var": 9.387366184428504e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.0412, |
| "loss/crossentropy": 2.345375160872936, |
| "loss/hidden": 4.620703125, |
| "loss/jsd": 0.1385633122175932, |
| "loss/logits": 0.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 42.75, |
| "grad_norm_var": 21.448958333333334, |
| "learning_rate": 0.0001, |
| "loss": 5.9336, |
| "loss/crossentropy": 2.425405339896679, |
| "loss/hidden": 4.60546875, |
| "loss/jsd": 0.13772829296067357, |
| "loss/logits": 0.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 39.75, |
| "grad_norm_var": 10.595572916666667, |
| "learning_rate": 0.0001, |
| "loss": 5.9238, |
| "loss/crossentropy": 2.1817762181162834, |
| "loss/hidden": 4.540234375, |
| "loss/jsd": 0.12882032115012407, |
| "loss/logits": 0.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 33.75, |
| "grad_norm_var": 21.001822916666665, |
| "learning_rate": 0.0001, |
| "loss": 6.0109, |
| "loss/crossentropy": 2.3736354261636734, |
| "loss/hidden": 4.64140625, |
| "loss/jsd": 0.1405269218608737, |
| "loss/logits": 0.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 41.25, |
| "grad_norm_var": 220.015625, |
| "learning_rate": 0.0001, |
| "loss": 5.9307, |
| "loss/crossentropy": 2.5042927861213684, |
| "loss/hidden": 4.7546875, |
| "loss/jsd": 0.18516455199569465, |
| "loss/logits": 0.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 41.0, |
| "grad_norm_var": 16.257291666666667, |
| "learning_rate": 0.0001, |
| "loss": 5.9019, |
| "loss/crossentropy": 2.526998797059059, |
| "loss/hidden": 4.47109375, |
| "loss/jsd": 0.13217656817287207, |
| "loss/logits": 0.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 32.25, |
| "grad_norm_var": 16.782291666666666, |
| "learning_rate": 0.0001, |
| "loss": 5.8327, |
| "loss/crossentropy": 2.316130298376083, |
| "loss/hidden": 4.387109375, |
| "loss/jsd": 0.12394356895238161, |
| "loss/logits": 0.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 39.0, |
| "grad_norm_var": 24.970833333333335, |
| "learning_rate": 0.0001, |
| "loss": 5.8284, |
| "loss/crossentropy": 2.214504113793373, |
| "loss/hidden": 4.623046875, |
| "loss/jsd": 0.15524424342438578, |
| "loss/logits": 0.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 35.75, |
| "grad_norm_var": 11.79375, |
| "learning_rate": 0.0001, |
| "loss": 5.7037, |
| "loss/crossentropy": 2.336098350584507, |
| "loss/hidden": 4.33515625, |
| "loss/jsd": 0.12178284991532565, |
| "loss/logits": 0.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 38.0, |
| "grad_norm_var": 13.470768229166667, |
| "learning_rate": 0.0001, |
| "loss": 5.7146, |
| "loss/crossentropy": 2.4750932276248934, |
| "loss/hidden": 4.41953125, |
| "loss/jsd": 0.12415571566671132, |
| "loss/logits": 0.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 37.0, |
| "grad_norm_var": 15.298958333333333, |
| "learning_rate": 0.0001, |
| "loss": 5.6597, |
| "loss/crossentropy": 2.360400839149952, |
| "loss/hidden": 4.45546875, |
| "loss/jsd": 0.1331789677962661, |
| "loss/logits": 0.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 26.75, |
| "grad_norm_var": 108.82057291666666, |
| "learning_rate": 0.0001, |
| "loss": 5.6905, |
| "loss/crossentropy": 2.547207270562649, |
| "loss/hidden": 4.413671875, |
| "loss/jsd": 0.13257503397762777, |
| "loss/logits": 0.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 38.25, |
| "grad_norm_var": 82.65149739583333, |
| "learning_rate": 0.0001, |
| "loss": 5.707, |
| "loss/crossentropy": 2.4661644257605078, |
| "loss/hidden": 4.43046875, |
| "loss/jsd": 0.13218661015853286, |
| "loss/logits": 0.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 33.0, |
| "grad_norm_var": 14.656705729166667, |
| "learning_rate": 0.0001, |
| "loss": 5.6198, |
| "loss/crossentropy": 2.3429581418633463, |
| "loss/hidden": 4.35390625, |
| "loss/jsd": 0.1255058040842414, |
| "loss/logits": 0.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 30.5, |
| "grad_norm_var": 16.014518229166665, |
| "learning_rate": 0.0001, |
| "loss": 5.5969, |
| "loss/crossentropy": 2.6043634325265885, |
| "loss/hidden": 4.3796875, |
| "loss/jsd": 0.1311176208779216, |
| "loss/logits": 0.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 33.0, |
| "grad_norm_var": 10.665625, |
| "learning_rate": 0.0001, |
| "loss": 5.5352, |
| "loss/crossentropy": 2.4040530994534492, |
| "loss/hidden": 4.219140625, |
| "loss/jsd": 0.12296068714931607, |
| "loss/logits": 0.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 38.75, |
| "grad_norm_var": 16.33125, |
| "learning_rate": 0.0001, |
| "loss": 5.4814, |
| "loss/crossentropy": 2.390893703699112, |
| "loss/hidden": 4.291796875, |
| "loss/jsd": 0.11912889536470175, |
| "loss/logits": 0.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 34.75, |
| "grad_norm_var": 19.909375, |
| "learning_rate": 0.0001, |
| "loss": 5.5724, |
| "loss/crossentropy": 2.5551778227090836, |
| "loss/hidden": 4.251171875, |
| "loss/jsd": 0.134556083381176, |
| "loss/logits": 0.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.2447526950627446e+18, |
| "learning_rate": 0.0001, |
| "loss": 5.6162, |
| "loss/crossentropy": 2.4906763210892677, |
| "loss/hidden": 4.20234375, |
| "loss/jsd": 0.1178798858076334, |
| "loss/logits": 0.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.2447526957786422e+18, |
| "learning_rate": 0.0001, |
| "loss": 5.5184, |
| "loss/crossentropy": 2.437400442361832, |
| "loss/hidden": 4.23046875, |
| "loss/jsd": 0.12956738714128732, |
| "loss/logits": 0.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 33.0, |
| "grad_norm_var": 27.3134765625, |
| "learning_rate": 0.0001, |
| "loss": 5.6369, |
| "loss/crossentropy": 2.4849177479743956, |
| "loss/hidden": 4.262109375, |
| "loss/jsd": 0.12098300596699119, |
| "loss/logits": 0.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 28.5, |
| "grad_norm_var": 17.055989583333332, |
| "learning_rate": 0.0001, |
| "loss": 5.4991, |
| "loss/crossentropy": 2.4364880681037904, |
| "loss/hidden": 4.26171875, |
| "loss/jsd": 0.12965436410158873, |
| "loss/logits": 0.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 28.375, |
| "grad_norm_var": 19.137955729166666, |
| "learning_rate": 0.0001, |
| "loss": 5.5161, |
| "loss/crossentropy": 2.392630486190319, |
| "loss/hidden": 4.173828125, |
| "loss/jsd": 0.11459105852991343, |
| "loss/logits": 0.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 27.25, |
| "grad_norm_var": 13.9322265625, |
| "learning_rate": 0.0001, |
| "loss": 5.4332, |
| "loss/crossentropy": 2.344803684949875, |
| "loss/hidden": 4.176953125, |
| "loss/jsd": 0.11463690986856818, |
| "loss/logits": 0.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 34.75, |
| "grad_norm_var": 15.854622395833333, |
| "learning_rate": 0.0001, |
| "loss": 5.5003, |
| "loss/crossentropy": 2.395221236348152, |
| "loss/hidden": 4.260546875, |
| "loss/jsd": 0.1217193447984755, |
| "loss/logits": 0.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 25.25, |
| "grad_norm_var": 14.663541666666667, |
| "learning_rate": 0.0001, |
| "loss": 5.4171, |
| "loss/crossentropy": 2.4193977400660516, |
| "loss/hidden": 4.23828125, |
| "loss/jsd": 0.12150606149807572, |
| "loss/logits": 0.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 26.875, |
| "grad_norm_var": 13.142643229166667, |
| "learning_rate": 0.0001, |
| "loss": 5.3761, |
| "loss/crossentropy": 2.2133478626608847, |
| "loss/hidden": 4.141796875, |
| "loss/jsd": 0.11149341901764273, |
| "loss/logits": 0.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 34.25, |
| "grad_norm_var": 14.420572916666666, |
| "learning_rate": 0.0001, |
| "loss": 5.3258, |
| "loss/crossentropy": 2.3443893820047377, |
| "loss/hidden": 4.090234375, |
| "loss/jsd": 0.11677124733105301, |
| "loss/logits": 0.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 31.25, |
| "grad_norm_var": 9.551822916666667, |
| "learning_rate": 0.0001, |
| "loss": 5.3054, |
| "loss/crossentropy": 2.3357947677373887, |
| "loss/hidden": 4.194140625, |
| "loss/jsd": 0.12083362191915512, |
| "loss/logits": 0.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.950455729166666, |
| "learning_rate": 0.0001, |
| "loss": 5.2645, |
| "loss/crossentropy": 2.4039614737033843, |
| "loss/hidden": 4.08671875, |
| "loss/jsd": 0.1069810570217669, |
| "loss/logits": 0.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 24.0, |
| "grad_norm_var": 1.1710062557908698e+18, |
| "learning_rate": 0.0001, |
| "loss": 5.3587, |
| "loss/crossentropy": 2.4738259255886077, |
| "loss/hidden": 4.209765625, |
| "loss/jsd": 0.13927901685237884, |
| "loss/logits": 0.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 29.125, |
| "grad_norm_var": 1.1710062386255852e+18, |
| "learning_rate": 0.0001, |
| "loss": 5.3753, |
| "loss/crossentropy": 2.2876866430044176, |
| "loss/hidden": 4.1421875, |
| "loss/jsd": 0.11211317665874958, |
| "loss/logits": 0.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 27.875, |
| "grad_norm_var": 485.3025390625, |
| "learning_rate": 0.0001, |
| "loss": 5.2875, |
| "loss/crossentropy": 2.3789359077811243, |
| "loss/hidden": 4.13828125, |
| "loss/jsd": 0.11359207816421986, |
| "loss/logits": 0.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 21.875, |
| "grad_norm_var": 19.980208333333334, |
| "learning_rate": 0.0001, |
| "loss": 5.2659, |
| "loss/crossentropy": 2.4840095818042753, |
| "loss/hidden": 4.076953125, |
| "loss/jsd": 0.1078010268509388, |
| "loss/logits": 0.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 32.75, |
| "grad_norm_var": 21.772330729166665, |
| "learning_rate": 0.0001, |
| "loss": 5.3525, |
| "loss/crossentropy": 2.2179358512163163, |
| "loss/hidden": 4.16796875, |
| "loss/jsd": 0.11819018721580506, |
| "loss/logits": 0.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 27.0, |
| "grad_norm_var": 22.1884765625, |
| "learning_rate": 0.0001, |
| "loss": 5.3043, |
| "loss/crossentropy": 2.4508845895528792, |
| "loss/hidden": 4.133203125, |
| "loss/jsd": 0.11473200833424926, |
| "loss/logits": 0.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 28.625, |
| "grad_norm_var": 62.53515625, |
| "learning_rate": 0.0001, |
| "loss": 5.2633, |
| "loss/crossentropy": 2.5463142573833464, |
| "loss/hidden": 4.076171875, |
| "loss/jsd": 0.12316551432013512, |
| "loss/logits": 0.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 26.625, |
| "grad_norm_var": 29.2150390625, |
| "learning_rate": 0.0001, |
| "loss": 5.2498, |
| "loss/crossentropy": 2.379361332952976, |
| "loss/hidden": 4.125, |
| "loss/jsd": 0.11994905360043048, |
| "loss/logits": 0.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 27.75, |
| "grad_norm_var": 37.1197265625, |
| "learning_rate": 0.0001, |
| "loss": 5.25, |
| "loss/crossentropy": 2.448214793205261, |
| "loss/hidden": 4.233203125, |
| "loss/jsd": 0.13263647919520735, |
| "loss/logits": 0.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 26.25, |
| "grad_norm_var": 13.433072916666667, |
| "learning_rate": 0.0001, |
| "loss": 5.1491, |
| "loss/crossentropy": 2.4302526518702505, |
| "loss/hidden": 4.12578125, |
| "loss/jsd": 0.11334973787888884, |
| "loss/logits": 0.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 23.75, |
| "grad_norm_var": 7.470572916666667, |
| "learning_rate": 0.0001, |
| "loss": 5.1671, |
| "loss/crossentropy": 2.415765553712845, |
| "loss/hidden": 4.11328125, |
| "loss/jsd": 0.11990332859568298, |
| "loss/logits": 0.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 25.5, |
| "grad_norm_var": 6.077083333333333, |
| "learning_rate": 0.0001, |
| "loss": 5.1279, |
| "loss/crossentropy": 2.3868868976831434, |
| "loss/hidden": 4.0671875, |
| "loss/jsd": 0.11438164403662085, |
| "loss/logits": 0.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 25.0, |
| "grad_norm_var": 48.50416666666667, |
| "learning_rate": 0.0001, |
| "loss": 5.18, |
| "loss/crossentropy": 2.367817610502243, |
| "loss/hidden": 4.136328125, |
| "loss/jsd": 0.12616985198110342, |
| "loss/logits": 0.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 23.625, |
| "grad_norm_var": 52.3375, |
| "learning_rate": 0.0001, |
| "loss": 5.1786, |
| "loss/crossentropy": 2.4342163532972334, |
| "loss/hidden": 4.0125, |
| "loss/jsd": 0.12039547078311444, |
| "loss/logits": 0.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 28.125, |
| "grad_norm_var": 6.708268229166666, |
| "learning_rate": 0.0001, |
| "loss": 5.1451, |
| "loss/crossentropy": 2.4633941307663916, |
| "loss/hidden": 4.08125, |
| "loss/jsd": 0.11877955347299576, |
| "loss/logits": 0.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 28.5, |
| "grad_norm_var": 8.4603515625, |
| "learning_rate": 0.0001, |
| "loss": 5.1919, |
| "loss/crossentropy": 2.3779468327760696, |
| "loss/hidden": 4.058984375, |
| "loss/jsd": 0.11588607728481293, |
| "loss/logits": 0.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 38.25, |
| "grad_norm_var": 59.06295572916667, |
| "learning_rate": 0.0001, |
| "loss": 5.2033, |
| "loss/crossentropy": 2.4956902295351027, |
| "loss/hidden": 4.107421875, |
| "loss/jsd": 0.11758697256445885, |
| "loss/logits": 0.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 22.625, |
| "grad_norm_var": 21.1744140625, |
| "learning_rate": 0.0001, |
| "loss": 5.1248, |
| "loss/crossentropy": 2.4070512309670447, |
| "loss/hidden": 4.123828125, |
| "loss/jsd": 0.12089485572651029, |
| "loss/logits": 0.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 47.75, |
| "grad_norm_var": 164.896875, |
| "learning_rate": 0.0001, |
| "loss": 5.1273, |
| "loss/crossentropy": 2.1984025448560716, |
| "loss/hidden": 3.83359375, |
| "loss/jsd": 0.10510765034705401, |
| "loss/logits": 0.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 24.125, |
| "grad_norm_var": 171.48326822916667, |
| "learning_rate": 0.0001, |
| "loss": 5.0933, |
| "loss/crossentropy": 2.408414696156979, |
| "loss/hidden": 3.9015625, |
| "loss/jsd": 0.09813609030097722, |
| "loss/logits": 0.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 25.5, |
| "grad_norm_var": 10.351041666666667, |
| "learning_rate": 0.0001, |
| "loss": 5.0887, |
| "loss/crossentropy": 2.3635219663381575, |
| "loss/hidden": 3.983984375, |
| "loss/jsd": 0.10892721712589264, |
| "loss/logits": 0.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 23.25, |
| "grad_norm_var": 15.676497395833334, |
| "learning_rate": 0.0001, |
| "loss": 5.0293, |
| "loss/crossentropy": 2.182341808080673, |
| "loss/hidden": 3.92421875, |
| "loss/jsd": 0.10646048728376627, |
| "loss/logits": 0.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 26.625, |
| "grad_norm_var": 7.992708333333334, |
| "learning_rate": 0.0001, |
| "loss": 5.1407, |
| "loss/crossentropy": 2.4966017305850983, |
| "loss/hidden": 3.909375, |
| "loss/jsd": 0.11931864526122808, |
| "loss/logits": 0.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 25.125, |
| "grad_norm_var": 915.2077473958333, |
| "learning_rate": 0.0001, |
| "loss": 5.1799, |
| "loss/crossentropy": 2.3614319562911987, |
| "loss/hidden": 3.95390625, |
| "loss/jsd": 0.10783975422382355, |
| "loss/logits": 0.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 24.875, |
| "grad_norm_var": 862.96640625, |
| "learning_rate": 0.0001, |
| "loss": 5.1175, |
| "loss/crossentropy": 2.3259101063013077, |
| "loss/hidden": 4.09140625, |
| "loss/jsd": 0.11582606900483369, |
| "loss/logits": 0.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 27.0, |
| "grad_norm_var": 36.96243489583333, |
| "learning_rate": 0.0001, |
| "loss": 5.1445, |
| "loss/crossentropy": 2.4153922617435457, |
| "loss/hidden": 4.044140625, |
| "loss/jsd": 0.11763136927038431, |
| "loss/logits": 0.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 27.0, |
| "grad_norm_var": 11.583333333333334, |
| "learning_rate": 0.0001, |
| "loss": 5.0695, |
| "loss/crossentropy": 2.287649059295654, |
| "loss/hidden": 3.97578125, |
| "loss/jsd": 0.10912037892267108, |
| "loss/logits": 0.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 34.25, |
| "grad_norm_var": 598.6910807291666, |
| "learning_rate": 0.0001, |
| "loss": 5.1531, |
| "loss/crossentropy": 2.5355153501033785, |
| "loss/hidden": 3.972265625, |
| "loss/jsd": 0.11578338220715523, |
| "loss/logits": 0.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 23.0, |
| "grad_norm_var": 149.62389322916667, |
| "learning_rate": 0.0001, |
| "loss": 5.1453, |
| "loss/crossentropy": 2.328887623548508, |
| "loss/hidden": 3.84609375, |
| "loss/jsd": 0.1067446961067617, |
| "loss/logits": 0.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 22.625, |
| "grad_norm_var": 23.629166666666666, |
| "learning_rate": 0.0001, |
| "loss": 5.0775, |
| "loss/crossentropy": 2.3245414569973946, |
| "loss/hidden": 3.950390625, |
| "loss/jsd": 0.11564150396734477, |
| "loss/logits": 0.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 29.375, |
| "grad_norm_var": 22.822330729166666, |
| "learning_rate": 0.0001, |
| "loss": 4.929, |
| "loss/crossentropy": 2.5518812984228134, |
| "loss/hidden": 3.76796875, |
| "loss/jsd": 0.10029144948348404, |
| "loss/logits": 0.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 22.875, |
| "grad_norm_var": 27.373372395833332, |
| "learning_rate": 0.0001, |
| "loss": 5.1682, |
| "loss/crossentropy": 2.3814490526914596, |
| "loss/hidden": 4.084765625, |
| "loss/jsd": 0.13794842325150966, |
| "loss/logits": 0.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 30.375, |
| "grad_norm_var": 25.968684895833334, |
| "learning_rate": 0.0001, |
| "loss": 5.0446, |
| "loss/crossentropy": 2.336636045575142, |
| "loss/hidden": 3.98984375, |
| "loss/jsd": 0.11006514001637697, |
| "loss/logits": 0.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 25.5, |
| "grad_norm_var": 32.0447265625, |
| "learning_rate": 0.0001, |
| "loss": 5.0339, |
| "loss/crossentropy": 2.2337013885378836, |
| "loss/hidden": 3.945703125, |
| "loss/jsd": 0.11723029632121325, |
| "loss/logits": 0.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 25.375, |
| "grad_norm_var": 102.66432291666666, |
| "learning_rate": 0.0001, |
| "loss": 5.0155, |
| "loss/crossentropy": 2.443159765005112, |
| "loss/hidden": 4.062890625, |
| "loss/jsd": 0.11166490567848086, |
| "loss/logits": 0.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 26.25, |
| "grad_norm_var": 12.558072916666667, |
| "learning_rate": 0.0001, |
| "loss": 5.0531, |
| "loss/crossentropy": 2.2338882118463514, |
| "loss/hidden": 4.025, |
| "loss/jsd": 0.11465255348011852, |
| "loss/logits": 0.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 25.875, |
| "grad_norm_var": 8.347916666666666, |
| "learning_rate": 0.0001, |
| "loss": 5.0976, |
| "loss/crossentropy": 2.3596479177474974, |
| "loss/hidden": 3.940625, |
| "loss/jsd": 0.11759824641048908, |
| "loss/logits": 0.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 30.25, |
| "grad_norm_var": 188.1353515625, |
| "learning_rate": 0.0001, |
| "loss": 5.0785, |
| "loss/crossentropy": 2.3698789328336716, |
| "loss/hidden": 3.962109375, |
| "loss/jsd": 0.1172801936045289, |
| "loss/logits": 0.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 26.375, |
| "grad_norm_var": 185.04765625, |
| "learning_rate": 0.0001, |
| "loss": 5.0927, |
| "loss/crossentropy": 2.3481896728277207, |
| "loss/hidden": 3.9609375, |
| "loss/jsd": 0.10608052760362625, |
| "loss/logits": 0.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 22.875, |
| "grad_norm_var": 125.32233072916667, |
| "learning_rate": 0.0001, |
| "loss": 5.0263, |
| "loss/crossentropy": 2.301522643119097, |
| "loss/hidden": 3.8, |
| "loss/jsd": 0.10154257528483868, |
| "loss/logits": 0.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 27.625, |
| "grad_norm_var": 81.21432291666666, |
| "learning_rate": 0.0001, |
| "loss": 5.1087, |
| "loss/crossentropy": 2.3300373941659926, |
| "loss/hidden": 3.923828125, |
| "loss/jsd": 0.10997985871508717, |
| "loss/logits": 0.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 22.0, |
| "grad_norm_var": 37.805989583333336, |
| "learning_rate": 0.0001, |
| "loss": 4.9669, |
| "loss/crossentropy": 2.3570085942745207, |
| "loss/hidden": 3.903125, |
| "loss/jsd": 0.12716795089654626, |
| "loss/logits": 0.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 28.25, |
| "grad_norm_var": 6.526822916666666, |
| "learning_rate": 0.0001, |
| "loss": 4.8827, |
| "loss/crossentropy": 2.4714103788137436, |
| "loss/hidden": 3.878125, |
| "loss/jsd": 0.11338211484253406, |
| "loss/logits": 0.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 25.0, |
| "grad_norm_var": 1.0217717449682671e+18, |
| "learning_rate": 0.0001, |
| "loss": 5.0544, |
| "loss/crossentropy": 2.170953643321991, |
| "loss/hidden": 3.91875, |
| "loss/jsd": 0.11225487310439348, |
| "loss/logits": 0.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 22.125, |
| "grad_norm_var": 22.508072916666666, |
| "learning_rate": 0.0001, |
| "loss": 4.8895, |
| "loss/crossentropy": 2.4479696050286295, |
| "loss/hidden": 3.896484375, |
| "loss/jsd": 0.10494228331372142, |
| "loss/logits": 0.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 22.25, |
| "grad_norm_var": 19.080989583333334, |
| "learning_rate": 0.0001, |
| "loss": 4.8699, |
| "loss/crossentropy": 2.3343143433332445, |
| "loss/hidden": 3.787109375, |
| "loss/jsd": 0.10432742889970541, |
| "loss/logits": 0.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 19.0, |
| "grad_norm_var": 7.299934895833333, |
| "learning_rate": 0.0001, |
| "loss": 4.9113, |
| "loss/crossentropy": 2.2152185067534447, |
| "loss/hidden": 3.838671875, |
| "loss/jsd": 0.10314544131979346, |
| "loss/logits": 0.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 25.125, |
| "grad_norm_var": 8.783333333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.8793, |
| "loss/crossentropy": 2.3982258841395376, |
| "loss/hidden": 3.765625, |
| "loss/jsd": 0.1033841515891254, |
| "loss/logits": 0.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 24.25, |
| "grad_norm_var": 8.654166666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.936, |
| "loss/crossentropy": 2.3861924752593042, |
| "loss/hidden": 3.990625, |
| "loss/jsd": 0.1316368247382343, |
| "loss/logits": 0.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 27.625, |
| "grad_norm_var": 18.838997395833335, |
| "learning_rate": 0.0001, |
| "loss": 5.0574, |
| "loss/crossentropy": 2.3481432244181635, |
| "loss/hidden": 3.886328125, |
| "loss/jsd": 0.12455893289297819, |
| "loss/logits": 0.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 24.875, |
| "grad_norm_var": 741.0330729166667, |
| "learning_rate": 0.0001, |
| "loss": 5.054, |
| "loss/crossentropy": 2.50970872938633, |
| "loss/hidden": 3.89375, |
| "loss/jsd": 0.11707814577966928, |
| "loss/logits": 0.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.0635, |
| "grad_norm": 22.25, |
| "grad_norm_var": 766.54140625, |
| "learning_rate": 0.0001, |
| "loss": 4.9292, |
| "loss/crossentropy": 2.214522284269333, |
| "loss/hidden": 3.817578125, |
| "loss/jsd": 0.09662074805237353, |
| "loss/logits": 0.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 27.125, |
| "grad_norm_var": 1.2075980051835433e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.9727, |
| "loss/crossentropy": 2.5177758872509, |
| "loss/hidden": 3.872265625, |
| "loss/jsd": 0.12324077049270273, |
| "loss/logits": 0.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0645, |
| "grad_norm": 26.25, |
| "grad_norm_var": 4.482291666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.8651, |
| "loss/crossentropy": 2.4133356541395186, |
| "loss/hidden": 3.8109375, |
| "loss/jsd": 0.10085376175120472, |
| "loss/logits": 0.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 20.875, |
| "grad_norm_var": 4.7875, |
| "learning_rate": 0.0001, |
| "loss": 4.8874, |
| "loss/crossentropy": 2.211686734855175, |
| "loss/hidden": 3.82578125, |
| "loss/jsd": 0.10324386316351593, |
| "loss/logits": 0.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.0655, |
| "grad_norm": 25.125, |
| "grad_norm_var": 39.35045572916667, |
| "learning_rate": 0.0001, |
| "loss": 4.9265, |
| "loss/crossentropy": 2.386268785595894, |
| "loss/hidden": 3.837890625, |
| "loss/jsd": 0.11206256924197078, |
| "loss/logits": 0.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 25.5, |
| "grad_norm_var": 39.68020833333333, |
| "learning_rate": 0.0001, |
| "loss": 4.9719, |
| "loss/crossentropy": 2.3294328808784486, |
| "loss/hidden": 3.8640625, |
| "loss/jsd": 0.11526230238378048, |
| "loss/logits": 0.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.0665, |
| "grad_norm": 23.375, |
| "grad_norm_var": 47.66295572916667, |
| "learning_rate": 0.0001, |
| "loss": 4.922, |
| "loss/crossentropy": 2.38586545586586, |
| "loss/hidden": 3.7203125, |
| "loss/jsd": 0.09609230635687709, |
| "loss/logits": 0.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 25.125, |
| "grad_norm_var": 36.25305989583333, |
| "learning_rate": 0.0001, |
| "loss": 4.9463, |
| "loss/crossentropy": 2.4498814970254896, |
| "loss/hidden": 3.84296875, |
| "loss/jsd": 0.10662997653707862, |
| "loss/logits": 0.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 26.625, |
| "grad_norm_var": 34.154622395833336, |
| "learning_rate": 0.0001, |
| "loss": 4.89, |
| "loss/crossentropy": 2.3147580534219743, |
| "loss/hidden": 3.840625, |
| "loss/jsd": 0.10548559352755546, |
| "loss/logits": 0.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 20.25, |
| "grad_norm_var": 5.3369140625, |
| "learning_rate": 0.0001, |
| "loss": 4.9495, |
| "loss/crossentropy": 2.2381860077381135, |
| "loss/hidden": 3.936328125, |
| "loss/jsd": 0.1049613301642239, |
| "loss/logits": 0.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.0685, |
| "grad_norm": 21.625, |
| "grad_norm_var": 42.57604166666667, |
| "learning_rate": 0.0001, |
| "loss": 4.9902, |
| "loss/crossentropy": 2.3451401717960834, |
| "loss/hidden": 3.97578125, |
| "loss/jsd": 0.10501982429996133, |
| "loss/logits": 0.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 25.5, |
| "grad_norm_var": 13.911393229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.8738, |
| "loss/crossentropy": 2.2887198269367217, |
| "loss/hidden": 3.948828125, |
| "loss/jsd": 0.10703569920733572, |
| "loss/logits": 0.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.0695, |
| "grad_norm": 24.0, |
| "grad_norm_var": 4.178125, |
| "learning_rate": 0.0001, |
| "loss": 4.908, |
| "loss/crossentropy": 2.4341419368982313, |
| "loss/hidden": 3.9109375, |
| "loss/jsd": 0.13313074046745896, |
| "loss/logits": 0.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 25.5, |
| "grad_norm_var": 3.2643229166666665, |
| "learning_rate": 0.0001, |
| "loss": 4.8483, |
| "loss/crossentropy": 2.3005983904004097, |
| "loss/hidden": 3.794921875, |
| "loss/jsd": 0.1167063161265105, |
| "loss/logits": 0.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.0705, |
| "grad_norm": 25.375, |
| "grad_norm_var": 2.6684895833333333, |
| "learning_rate": 0.0001, |
| "loss": 4.8661, |
| "loss/crossentropy": 2.3177727833390236, |
| "loss/hidden": 3.76328125, |
| "loss/jsd": 0.09948643315583468, |
| "loss/logits": 0.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 21.25, |
| "grad_norm_var": 6.670572916666667, |
| "learning_rate": 0.0001, |
| "loss": 4.8736, |
| "loss/crossentropy": 2.2698763489723204, |
| "loss/hidden": 3.831640625, |
| "loss/jsd": 0.10282253352925182, |
| "loss/logits": 0.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.0715, |
| "grad_norm": 18.625, |
| "grad_norm_var": 8.87265625, |
| "learning_rate": 0.0001, |
| "loss": 4.8039, |
| "loss/crossentropy": 2.360131266713142, |
| "loss/hidden": 3.722265625, |
| "loss/jsd": 0.10547879729419947, |
| "loss/logits": 0.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 21.75, |
| "grad_norm_var": 3.8889973958333335, |
| "learning_rate": 0.0001, |
| "loss": 4.7269, |
| "loss/crossentropy": 2.311430121213198, |
| "loss/hidden": 3.7109375, |
| "loss/jsd": 0.09480313453823327, |
| "loss/logits": 0.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 29.75, |
| "grad_norm_var": 7.685416666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.7292, |
| "loss/crossentropy": 2.4506467133760452, |
| "loss/hidden": 3.672265625, |
| "loss/jsd": 0.09663807023316622, |
| "loss/logits": 0.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 23.25, |
| "grad_norm_var": 8.416666666666666, |
| "learning_rate": 0.0001, |
| "loss": 4.7346, |
| "loss/crossentropy": 2.2691701710224152, |
| "loss/hidden": 3.836328125, |
| "loss/jsd": 0.1028917589224875, |
| "loss/logits": 0.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.0735, |
| "grad_norm": 20.375, |
| "grad_norm_var": 7.246875, |
| "learning_rate": 0.0001, |
| "loss": 4.7517, |
| "loss/crossentropy": 2.3083701550960543, |
| "loss/hidden": 3.714453125, |
| "loss/jsd": 0.09604525147005916, |
| "loss/logits": 0.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 22.0, |
| "grad_norm_var": 11.672916666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.8113, |
| "loss/crossentropy": 2.3635326638817786, |
| "loss/hidden": 3.6703125, |
| "loss/jsd": 0.10219773268327118, |
| "loss/logits": 0.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.0745, |
| "grad_norm": 21.375, |
| "grad_norm_var": 5.637239583333334, |
| "learning_rate": 0.0001, |
| "loss": 4.798, |
| "loss/crossentropy": 2.182288531959057, |
| "loss/hidden": 3.784375, |
| "loss/jsd": 0.09713765853084624, |
| "loss/logits": 0.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 22.375, |
| "grad_norm_var": 13.480143229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.9073, |
| "loss/crossentropy": 2.209014095366001, |
| "loss/hidden": 3.77734375, |
| "loss/jsd": 0.100444171205163, |
| "loss/logits": 0.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.0755, |
| "grad_norm": 20.25, |
| "grad_norm_var": 15.253125, |
| "learning_rate": 0.0001, |
| "loss": 4.8648, |
| "loss/crossentropy": 2.307139050960541, |
| "loss/hidden": 3.835546875, |
| "loss/jsd": 0.10750290956348181, |
| "loss/logits": 0.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 22.25, |
| "grad_norm_var": 5.84765625, |
| "learning_rate": 0.0001, |
| "loss": 4.7021, |
| "loss/crossentropy": 2.4567115128040315, |
| "loss/hidden": 3.641015625, |
| "loss/jsd": 0.0963326326571405, |
| "loss/logits": 0.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.0765, |
| "grad_norm": 23.0, |
| "grad_norm_var": 17.799934895833335, |
| "learning_rate": 0.0001, |
| "loss": 4.7726, |
| "loss/crossentropy": 2.3501833245158195, |
| "loss/hidden": 3.71171875, |
| "loss/jsd": 0.09695078176446259, |
| "loss/logits": 0.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 26.875, |
| "grad_norm_var": 14.445572916666666, |
| "learning_rate": 0.0001, |
| "loss": 4.7776, |
| "loss/crossentropy": 2.35235877931118, |
| "loss/hidden": 3.7109375, |
| "loss/jsd": 0.09894683174788951, |
| "loss/logits": 0.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 20.0, |
| "grad_norm_var": 6.866080729166667, |
| "learning_rate": 0.0001, |
| "loss": 4.7465, |
| "loss/crossentropy": 2.3319214552640917, |
| "loss/hidden": 3.678515625, |
| "loss/jsd": 0.10175617430359125, |
| "loss/logits": 0.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 25.75, |
| "grad_norm_var": 5.9306640625, |
| "learning_rate": 0.0001, |
| "loss": 4.7662, |
| "loss/crossentropy": 2.312511496245861, |
| "loss/hidden": 3.82421875, |
| "loss/jsd": 0.10424250243231654, |
| "loss/logits": 0.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.0785, |
| "grad_norm": 19.25, |
| "grad_norm_var": 7.527018229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.7506, |
| "loss/crossentropy": 2.195492114126682, |
| "loss/hidden": 3.806640625, |
| "loss/jsd": 0.10378086129203438, |
| "loss/logits": 0.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 20.875, |
| "grad_norm_var": 5.69765625, |
| "learning_rate": 0.0001, |
| "loss": 4.7525, |
| "loss/crossentropy": 2.3451679602265356, |
| "loss/hidden": 3.64609375, |
| "loss/jsd": 0.10021187355741859, |
| "loss/logits": 0.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.0795, |
| "grad_norm": 25.125, |
| "grad_norm_var": 4.002083333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.7907, |
| "loss/crossentropy": 2.235419529676437, |
| "loss/hidden": 3.76875, |
| "loss/jsd": 0.1017349574714899, |
| "loss/logits": 0.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 17.5, |
| "grad_norm_var": 6.187239583333334, |
| "learning_rate": 0.0001, |
| "loss": 4.7544, |
| "loss/crossentropy": 2.349038490653038, |
| "loss/hidden": 3.75859375, |
| "loss/jsd": 0.10723181385546923, |
| "loss/logits": 0.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.0805, |
| "grad_norm": 21.5, |
| "grad_norm_var": 5.036458333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.7984, |
| "loss/crossentropy": 2.2953826270997526, |
| "loss/hidden": 3.735546875, |
| "loss/jsd": 0.10434331484138966, |
| "loss/logits": 0.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 21.625, |
| "grad_norm_var": 2.8671223958333334, |
| "learning_rate": 0.0001, |
| "loss": 4.7802, |
| "loss/crossentropy": 2.420463111996651, |
| "loss/hidden": 3.7671875, |
| "loss/jsd": 0.09350865064188837, |
| "loss/logits": 0.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.0815, |
| "grad_norm": 21.875, |
| "grad_norm_var": 5.375455729166666, |
| "learning_rate": 0.0001, |
| "loss": 4.7768, |
| "loss/crossentropy": 2.4329511165618896, |
| "loss/hidden": 3.770703125, |
| "loss/jsd": 0.11144884563982486, |
| "loss/logits": 0.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 18.0, |
| "grad_norm_var": 5.853059895833334, |
| "learning_rate": 0.0001, |
| "loss": 4.81, |
| "loss/crossentropy": 2.3901975452899933, |
| "loss/hidden": 3.816015625, |
| "loss/jsd": 0.11511239362880588, |
| "loss/logits": 0.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 18.625, |
| "grad_norm_var": 6.853059895833334, |
| "learning_rate": 0.0001, |
| "loss": 4.8666, |
| "loss/crossentropy": 2.42452190220356, |
| "loss/hidden": 3.791015625, |
| "loss/jsd": 0.10730197560042143, |
| "loss/logits": 0.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 22.125, |
| "grad_norm_var": 5.945768229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.825, |
| "loss/crossentropy": 2.415967509150505, |
| "loss/hidden": 3.7140625, |
| "loss/jsd": 0.10223841555416584, |
| "loss/logits": 0.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.0835, |
| "grad_norm": 19.625, |
| "grad_norm_var": 4.648372395833333, |
| "learning_rate": 0.0001, |
| "loss": 4.6893, |
| "loss/crossentropy": 2.346050335466862, |
| "loss/hidden": 3.75234375, |
| "loss/jsd": 0.10205129384994507, |
| "loss/logits": 0.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 26.75, |
| "grad_norm_var": 6.887239583333334, |
| "learning_rate": 0.0001, |
| "loss": 4.7759, |
| "loss/crossentropy": 2.272695492208004, |
| "loss/hidden": 3.740234375, |
| "loss/jsd": 0.09743564091622829, |
| "loss/logits": 0.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.0845, |
| "grad_norm": 21.25, |
| "grad_norm_var": 8.242122395833333, |
| "learning_rate": 0.0001, |
| "loss": 4.7886, |
| "loss/crossentropy": 2.421866828203201, |
| "loss/hidden": 3.801171875, |
| "loss/jsd": 0.10874381214380265, |
| "loss/logits": 0.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 21.875, |
| "grad_norm_var": 5.842643229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.7021, |
| "loss/crossentropy": 2.389561951160431, |
| "loss/hidden": 3.679296875, |
| "loss/jsd": 0.1009491034783423, |
| "loss/logits": 0.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.0855, |
| "grad_norm": 21.625, |
| "grad_norm_var": 13.94765625, |
| "learning_rate": 0.0001, |
| "loss": 4.8137, |
| "loss/crossentropy": 2.3791208446025847, |
| "loss/hidden": 3.775, |
| "loss/jsd": 0.11071940269321204, |
| "loss/logits": 0.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 20.5, |
| "grad_norm_var": 13.2603515625, |
| "learning_rate": 0.0001, |
| "loss": 4.738, |
| "loss/crossentropy": 2.4374333173036575, |
| "loss/hidden": 3.728125, |
| "loss/jsd": 0.10198512580245733, |
| "loss/logits": 0.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.0865, |
| "grad_norm": 16.625, |
| "grad_norm_var": 11.1853515625, |
| "learning_rate": 0.0001, |
| "loss": 4.7524, |
| "loss/crossentropy": 2.3030879952013494, |
| "loss/hidden": 3.626953125, |
| "loss/jsd": 0.09310725582763553, |
| "loss/logits": 0.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 18.875, |
| "grad_norm_var": 6.285416666666666, |
| "learning_rate": 0.0001, |
| "loss": 4.7021, |
| "loss/crossentropy": 2.192840526998043, |
| "loss/hidden": 3.819140625, |
| "loss/jsd": 0.09320764979347587, |
| "loss/logits": 0.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 24.625, |
| "grad_norm_var": 6.4353515625, |
| "learning_rate": 0.0001, |
| "loss": 4.7059, |
| "loss/crossentropy": 2.3610597878694533, |
| "loss/hidden": 3.733984375, |
| "loss/jsd": 0.10029621962457895, |
| "loss/logits": 0.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 19.5, |
| "grad_norm_var": 19.762239583333333, |
| "learning_rate": 0.0001, |
| "loss": 4.7081, |
| "loss/crossentropy": 2.410063475370407, |
| "loss/hidden": 3.65078125, |
| "loss/jsd": 0.10161215299740434, |
| "loss/logits": 0.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.0885, |
| "grad_norm": 33.25, |
| "grad_norm_var": 22.748893229166665, |
| "learning_rate": 0.0001, |
| "loss": 4.5743, |
| "loss/crossentropy": 2.2984881952404974, |
| "loss/hidden": 3.666796875, |
| "loss/jsd": 0.09647621251642705, |
| "loss/logits": 0.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 20.25, |
| "grad_norm_var": 16.602083333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.7585, |
| "loss/crossentropy": 2.3432783752679827, |
| "loss/hidden": 3.716796875, |
| "loss/jsd": 0.10081057399511337, |
| "loss/logits": 0.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.0895, |
| "grad_norm": 22.875, |
| "grad_norm_var": 5.621875, |
| "learning_rate": 0.0001, |
| "loss": 4.707, |
| "loss/crossentropy": 2.352738951146603, |
| "loss/hidden": 3.673828125, |
| "loss/jsd": 0.09183212611824274, |
| "loss/logits": 0.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 33.0, |
| "grad_norm_var": 16.277083333333334, |
| "learning_rate": 0.0001, |
| "loss": 4.6996, |
| "loss/crossentropy": 2.443929785490036, |
| "loss/hidden": 3.55390625, |
| "loss/jsd": 0.09280467573553323, |
| "loss/logits": 0.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.0905, |
| "grad_norm": 17.125, |
| "grad_norm_var": 20.319205729166665, |
| "learning_rate": 0.0001, |
| "loss": 4.7503, |
| "loss/crossentropy": 2.333281812816858, |
| "loss/hidden": 3.687109375, |
| "loss/jsd": 0.09856429314240814, |
| "loss/logits": 0.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 19.625, |
| "grad_norm_var": 14.943684895833334, |
| "learning_rate": 0.0001, |
| "loss": 4.8011, |
| "loss/crossentropy": 2.3165989741683006, |
| "loss/hidden": 3.93515625, |
| "loss/jsd": 0.1154123242944479, |
| "loss/logits": 0.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.0915, |
| "grad_norm": 19.125, |
| "grad_norm_var": 2.5833333333333335, |
| "learning_rate": 0.0001, |
| "loss": 4.7784, |
| "loss/crossentropy": 2.3343286007642745, |
| "loss/hidden": 3.796875, |
| "loss/jsd": 0.11231993734836579, |
| "loss/logits": 0.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 18.0, |
| "grad_norm_var": 4.880989583333333, |
| "learning_rate": 0.0001, |
| "loss": 4.6886, |
| "loss/crossentropy": 2.412258565425873, |
| "loss/hidden": 3.78046875, |
| "loss/jsd": 0.10415599066764117, |
| "loss/logits": 0.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 17.25, |
| "grad_norm_var": 6.083072916666667, |
| "learning_rate": 0.0001, |
| "loss": 4.7485, |
| "loss/crossentropy": 2.379472056031227, |
| "loss/hidden": 3.6609375, |
| "loss/jsd": 0.09712380319833755, |
| "loss/logits": 0.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 19.125, |
| "grad_norm_var": 9.0041015625, |
| "learning_rate": 0.0001, |
| "loss": 4.7145, |
| "loss/crossentropy": 2.286051708459854, |
| "loss/hidden": 3.671484375, |
| "loss/jsd": 0.09749153861775994, |
| "loss/logits": 0.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.0935, |
| "grad_norm": 24.25, |
| "grad_norm_var": 10.09765625, |
| "learning_rate": 0.0001, |
| "loss": 4.6597, |
| "loss/crossentropy": 2.3485587686300278, |
| "loss/hidden": 3.656640625, |
| "loss/jsd": 0.09961330010555684, |
| "loss/logits": 0.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 25.5, |
| "grad_norm_var": 10.745833333333334, |
| "learning_rate": 0.0001, |
| "loss": 4.7654, |
| "loss/crossentropy": 2.22419136762619, |
| "loss/hidden": 3.680859375, |
| "loss/jsd": 0.09599914094433189, |
| "loss/logits": 0.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.0945, |
| "grad_norm": 23.375, |
| "grad_norm_var": 11.849739583333333, |
| "learning_rate": 0.0001, |
| "loss": 4.6586, |
| "loss/crossentropy": 2.2319135151803495, |
| "loss/hidden": 3.776953125, |
| "loss/jsd": 0.1003801210783422, |
| "loss/logits": 0.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 20.125, |
| "grad_norm_var": 15.6884765625, |
| "learning_rate": 0.0001, |
| "loss": 4.7113, |
| "loss/crossentropy": 2.466662494838238, |
| "loss/hidden": 3.694140625, |
| "loss/jsd": 0.09942078748717904, |
| "loss/logits": 0.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.0955, |
| "grad_norm": 20.875, |
| "grad_norm_var": 11.4197265625, |
| "learning_rate": 0.0001, |
| "loss": 4.6638, |
| "loss/crossentropy": 2.3695669680833817, |
| "loss/hidden": 3.593359375, |
| "loss/jsd": 0.09504008954390883, |
| "loss/logits": 0.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 22.125, |
| "grad_norm_var": 4.581184895833333, |
| "learning_rate": 0.0001, |
| "loss": 4.6473, |
| "loss/crossentropy": 2.345889499783516, |
| "loss/hidden": 3.691015625, |
| "loss/jsd": 0.10475197089836001, |
| "loss/logits": 0.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.0965, |
| "grad_norm": 22.5, |
| "grad_norm_var": 4.087239583333333, |
| "learning_rate": 0.0001, |
| "loss": 4.721, |
| "loss/crossentropy": 2.256808315217495, |
| "loss/hidden": 3.7015625, |
| "loss/jsd": 0.09892030693590641, |
| "loss/logits": 0.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 19.125, |
| "grad_norm_var": 6.395572916666667, |
| "learning_rate": 0.0001, |
| "loss": 4.5498, |
| "loss/crossentropy": 2.5429009228944777, |
| "loss/hidden": 3.680859375, |
| "loss/jsd": 0.09861663114279509, |
| "loss/logits": 0.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 21.25, |
| "grad_norm_var": 5.843489583333334, |
| "learning_rate": 0.0001, |
| "loss": 4.6899, |
| "loss/crossentropy": 2.272120487689972, |
| "loss/hidden": 3.6375, |
| "loss/jsd": 0.09743905253708363, |
| "loss/logits": 0.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 17.625, |
| "grad_norm_var": 6.62265625, |
| "learning_rate": 0.0001, |
| "loss": 4.6035, |
| "loss/crossentropy": 2.1649394638836386, |
| "loss/hidden": 3.584765625, |
| "loss/jsd": 0.08881366224959493, |
| "loss/logits": 0.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.0985, |
| "grad_norm": 4328521728.0, |
| "grad_norm_var": 1.171006260534208e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.6906, |
| "loss/crossentropy": 2.3182963758707045, |
| "loss/hidden": 3.623046875, |
| "loss/jsd": 0.10251586111262441, |
| "loss/logits": 0.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 20.25, |
| "grad_norm_var": 2.715501666496903e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.7101, |
| "loss/crossentropy": 2.407327815890312, |
| "loss/hidden": 3.60703125, |
| "loss/jsd": 0.09459855072200299, |
| "loss/logits": 0.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.0995, |
| "grad_norm": 19.25, |
| "grad_norm_var": 1.7345191619224492e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.6395, |
| "loss/crossentropy": 2.256649875640869, |
| "loss/hidden": 3.65234375, |
| "loss/jsd": 0.10230031171813607, |
| "loss/logits": 0.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 22.0, |
| "grad_norm_var": 2.981184895833333, |
| "learning_rate": 0.0001, |
| "loss": 4.5112, |
| "loss/crossentropy": 2.3214069336652754, |
| "loss/hidden": 3.553515625, |
| "loss/jsd": 0.09316142341122031, |
| "loss/logits": 0.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1005, |
| "grad_norm": 20.25, |
| "grad_norm_var": 4.611393229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.5154, |
| "loss/crossentropy": 2.297450725734234, |
| "loss/hidden": 3.569140625, |
| "loss/jsd": 0.09217815361917019, |
| "loss/logits": 0.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 23.25, |
| "grad_norm_var": 8.597249348958334, |
| "learning_rate": 0.0001, |
| "loss": 4.6108, |
| "loss/crossentropy": 2.3132576078176497, |
| "loss/hidden": 3.69375, |
| "loss/jsd": 0.1215221800841391, |
| "loss/logits": 0.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.1015, |
| "grad_norm": 16.75, |
| "grad_norm_var": 8.385400390625, |
| "learning_rate": 0.0001, |
| "loss": 4.5906, |
| "loss/crossentropy": 2.42258235514164, |
| "loss/hidden": 3.58359375, |
| "loss/jsd": 0.09518450712785125, |
| "loss/logits": 0.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 17.75, |
| "grad_norm_var": 39.30416666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.6782, |
| "loss/crossentropy": 2.226282720267773, |
| "loss/hidden": 3.6078125, |
| "loss/jsd": 0.08297519264742732, |
| "loss/logits": 0.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 18.5, |
| "grad_norm_var": 8.6306640625, |
| "learning_rate": 0.0001, |
| "loss": 4.6889, |
| "loss/crossentropy": 2.23982213139534, |
| "loss/hidden": 3.660546875, |
| "loss/jsd": 0.0924127135425806, |
| "loss/logits": 0.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 18.25, |
| "grad_norm_var": 6.187744140625, |
| "learning_rate": 0.0001, |
| "loss": 4.6246, |
| "loss/crossentropy": 2.2483278423547746, |
| "loss/hidden": 3.60859375, |
| "loss/jsd": 0.09513462502509355, |
| "loss/logits": 0.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.1035, |
| "grad_norm": 28.375, |
| "grad_norm_var": 12.276416015625, |
| "learning_rate": 0.0001, |
| "loss": 4.6868, |
| "loss/crossentropy": 2.2927519381046295, |
| "loss/hidden": 3.53515625, |
| "loss/jsd": 0.08648296073079109, |
| "loss/logits": 0.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 24.125, |
| "grad_norm_var": 14.239583333333334, |
| "learning_rate": 0.0001, |
| "loss": 4.5602, |
| "loss/crossentropy": 2.3293472826480865, |
| "loss/hidden": 3.599609375, |
| "loss/jsd": 0.09772532721981406, |
| "loss/logits": 0.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.1045, |
| "grad_norm": 19.75, |
| "grad_norm_var": 8.269205729166666, |
| "learning_rate": 0.0001, |
| "loss": 4.6017, |
| "loss/crossentropy": 2.3832351714372635, |
| "loss/hidden": 3.60078125, |
| "loss/jsd": 0.09314336217939853, |
| "loss/logits": 0.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 20.0, |
| "grad_norm_var": 5.070833333333334, |
| "learning_rate": 0.0001, |
| "loss": 4.5706, |
| "loss/crossentropy": 2.4874933838844298, |
| "loss/hidden": 3.651171875, |
| "loss/jsd": 0.09874060060828924, |
| "loss/logits": 0.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.1055, |
| "grad_norm": 16.0, |
| "grad_norm_var": 42.828059895833334, |
| "learning_rate": 0.0001, |
| "loss": 4.6945, |
| "loss/crossentropy": 2.185934893786907, |
| "loss/hidden": 3.78046875, |
| "loss/jsd": 0.10176362562924623, |
| "loss/logits": 0.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 20.5, |
| "grad_norm_var": 912.2905598958333, |
| "learning_rate": 0.0001, |
| "loss": 4.8579, |
| "loss/crossentropy": 2.337796673178673, |
| "loss/hidden": 3.673828125, |
| "loss/jsd": 0.09400355285033583, |
| "loss/logits": 0.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.1065, |
| "grad_norm": 19.0, |
| "grad_norm_var": 86.45514322916667, |
| "learning_rate": 0.0001, |
| "loss": 4.6905, |
| "loss/crossentropy": 2.191851982474327, |
| "loss/hidden": 3.83828125, |
| "loss/jsd": 0.09336025016382336, |
| "loss/logits": 0.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 20.625, |
| "grad_norm_var": 12.3416015625, |
| "learning_rate": 0.0001, |
| "loss": 4.746, |
| "loss/crossentropy": 2.321294938027859, |
| "loss/hidden": 3.709375, |
| "loss/jsd": 0.09611575696617365, |
| "loss/logits": 0.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 20.5, |
| "grad_norm_var": 11.157291666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.6848, |
| "loss/crossentropy": 2.3641166269779204, |
| "loss/hidden": 3.802734375, |
| "loss/jsd": 0.11719204504042864, |
| "loss/logits": 0.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 21.25, |
| "grad_norm_var": 313.9947265625, |
| "learning_rate": 0.0001, |
| "loss": 4.703, |
| "loss/crossentropy": 2.3178130373358727, |
| "loss/hidden": 3.6640625, |
| "loss/jsd": 0.10858506197109818, |
| "loss/logits": 0.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.1085, |
| "grad_norm": 18.125, |
| "grad_norm_var": 505.7301432291667, |
| "learning_rate": 0.0001, |
| "loss": 4.7499, |
| "loss/crossentropy": 2.2643778324127197, |
| "loss/hidden": 3.686328125, |
| "loss/jsd": 0.09995021363720298, |
| "loss/logits": 0.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 19.0, |
| "grad_norm_var": 4.88125, |
| "learning_rate": 0.0001, |
| "loss": 4.639, |
| "loss/crossentropy": 2.333830028772354, |
| "loss/hidden": 3.77421875, |
| "loss/jsd": 0.10143324267119169, |
| "loss/logits": 0.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.1095, |
| "grad_norm": 17.625, |
| "grad_norm_var": 1.756685316214961e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.6091, |
| "loss/crossentropy": 2.2005941957235335, |
| "loss/hidden": 3.546875, |
| "loss/jsd": 0.08694255957379937, |
| "loss/logits": 0.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 19.25, |
| "grad_norm_var": 219.81608072916666, |
| "learning_rate": 0.0001, |
| "loss": 4.6177, |
| "loss/crossentropy": 2.3627296075224877, |
| "loss/hidden": 3.74765625, |
| "loss/jsd": 0.10458627291955054, |
| "loss/logits": 0.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.1105, |
| "grad_norm": 23.25, |
| "grad_norm_var": 130.35305989583333, |
| "learning_rate": 0.0001, |
| "loss": 4.6516, |
| "loss/crossentropy": 2.4541628479957582, |
| "loss/hidden": 3.723046875, |
| "loss/jsd": 0.09181494554504752, |
| "loss/logits": 0.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 20.5, |
| "grad_norm_var": 130.06015625, |
| "learning_rate": 0.0001, |
| "loss": 4.69, |
| "loss/crossentropy": 2.416627970337868, |
| "loss/hidden": 3.734765625, |
| "loss/jsd": 0.11581595735624432, |
| "loss/logits": 0.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.1115, |
| "grad_norm": 18.625, |
| "grad_norm_var": 5.193473307291667, |
| "learning_rate": 0.0001, |
| "loss": 4.685, |
| "loss/crossentropy": 2.3696270257234575, |
| "loss/hidden": 3.592578125, |
| "loss/jsd": 0.09627662082202733, |
| "loss/logits": 0.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 17.875, |
| "grad_norm_var": 3.7067057291666665, |
| "learning_rate": 0.0001, |
| "loss": 4.6807, |
| "loss/crossentropy": 2.374240705370903, |
| "loss/hidden": 3.63671875, |
| "loss/jsd": 0.10023370888084174, |
| "loss/logits": 0.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 18.875, |
| "grad_norm_var": 6.2431640625, |
| "learning_rate": 0.0001, |
| "loss": 4.6202, |
| "loss/crossentropy": 2.39550845772028, |
| "loss/hidden": 3.655078125, |
| "loss/jsd": 0.10500529641285539, |
| "loss/logits": 0.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 16.75, |
| "grad_norm_var": 5.78125, |
| "learning_rate": 0.0001, |
| "loss": 4.6473, |
| "loss/crossentropy": 2.3785043194890023, |
| "loss/hidden": 3.659375, |
| "loss/jsd": 0.09861900489777327, |
| "loss/logits": 0.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.1135, |
| "grad_norm": 20.25, |
| "grad_norm_var": 5.677018229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.5771, |
| "loss/crossentropy": 2.4541394472122193, |
| "loss/hidden": 3.692578125, |
| "loss/jsd": 0.10195111334323884, |
| "loss/logits": 0.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 21.25, |
| "grad_norm_var": 7.0228515625, |
| "learning_rate": 0.0001, |
| "loss": 4.597, |
| "loss/crossentropy": 2.3176154881715774, |
| "loss/hidden": 3.583984375, |
| "loss/jsd": 0.09049384696409106, |
| "loss/logits": 0.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.1145, |
| "grad_norm": 15.75, |
| "grad_norm_var": 15.241520182291667, |
| "learning_rate": 0.0001, |
| "loss": 4.5624, |
| "loss/crossentropy": 2.5178518027067183, |
| "loss/hidden": 3.528125, |
| "loss/jsd": 0.09066717140376568, |
| "loss/logits": 0.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 17.625, |
| "grad_norm_var": 7.566520182291667, |
| "learning_rate": 0.0001, |
| "loss": 4.5471, |
| "loss/crossentropy": 2.3759778410196306, |
| "loss/hidden": 3.553125, |
| "loss/jsd": 0.09599322909489275, |
| "loss/logits": 0.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.1155, |
| "grad_norm": 20.0, |
| "grad_norm_var": 8.312434895833333, |
| "learning_rate": 0.0001, |
| "loss": 4.5075, |
| "loss/crossentropy": 2.3496225073933603, |
| "loss/hidden": 3.606640625, |
| "loss/jsd": 0.09744280204176903, |
| "loss/logits": 0.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 19.75, |
| "grad_norm_var": 3.2108723958333334, |
| "learning_rate": 0.0001, |
| "loss": 4.5475, |
| "loss/crossentropy": 2.4485339492559435, |
| "loss/hidden": 3.523046875, |
| "loss/jsd": 0.0890957485884428, |
| "loss/logits": 0.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.1165, |
| "grad_norm": 19.0, |
| "grad_norm_var": 2.364697265625, |
| "learning_rate": 0.0001, |
| "loss": 4.5781, |
| "loss/crossentropy": 2.299929490685463, |
| "loss/hidden": 3.598828125, |
| "loss/jsd": 0.09711863240227103, |
| "loss/logits": 0.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 17.75, |
| "grad_norm_var": 1.4955729166666667, |
| "learning_rate": 0.0001, |
| "loss": 4.5392, |
| "loss/crossentropy": 2.298077051341534, |
| "loss/hidden": 3.61015625, |
| "loss/jsd": 0.0896261626854539, |
| "loss/logits": 0.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 17.5, |
| "grad_norm_var": 3.0098307291666666, |
| "learning_rate": 0.0001, |
| "loss": 4.5949, |
| "loss/crossentropy": 2.2876608431339265, |
| "loss/hidden": 3.757421875, |
| "loss/jsd": 0.10631331414915621, |
| "loss/logits": 0.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 21.875, |
| "grad_norm_var": 6.657291666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.6054, |
| "loss/crossentropy": 2.589036238193512, |
| "loss/hidden": 3.7109375, |
| "loss/jsd": 0.09777994276955723, |
| "loss/logits": 0.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.1185, |
| "grad_norm": 19.5, |
| "grad_norm_var": 4.276497395833333, |
| "learning_rate": 0.0001, |
| "loss": 4.6578, |
| "loss/crossentropy": 2.4440223038196565, |
| "loss/hidden": 3.63125, |
| "loss/jsd": 0.10012138104066252, |
| "loss/logits": 0.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 15.9375, |
| "grad_norm_var": 6.341520182291666, |
| "learning_rate": 0.0001, |
| "loss": 4.6382, |
| "loss/crossentropy": 2.3379690438508987, |
| "loss/hidden": 3.73046875, |
| "loss/jsd": 0.10282904924824834, |
| "loss/logits": 0.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.1195, |
| "grad_norm": 20.5, |
| "grad_norm_var": 7.068733723958333, |
| "learning_rate": 0.0001, |
| "loss": 4.5986, |
| "loss/crossentropy": 2.358085313439369, |
| "loss/hidden": 3.59140625, |
| "loss/jsd": 0.0954778247512877, |
| "loss/logits": 0.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 18.125, |
| "grad_norm_var": 4.709375, |
| "learning_rate": 0.0001, |
| "loss": 4.5411, |
| "loss/crossentropy": 2.262301415205002, |
| "loss/hidden": 3.6265625, |
| "loss/jsd": 0.09096273891627789, |
| "loss/logits": 0.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.1205, |
| "grad_norm": 21.125, |
| "grad_norm_var": 2.4447916666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.5437, |
| "loss/crossentropy": 2.493518462777138, |
| "loss/hidden": 3.61953125, |
| "loss/jsd": 0.08979002349078655, |
| "loss/logits": 0.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 16.75, |
| "grad_norm_var": 6.323958333333334, |
| "learning_rate": 0.0001, |
| "loss": 4.563, |
| "loss/crossentropy": 2.4933597564697267, |
| "loss/hidden": 3.592578125, |
| "loss/jsd": 0.09690459789708257, |
| "loss/logits": 0.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.1215, |
| "grad_norm": 18.875, |
| "grad_norm_var": 6.918489583333334, |
| "learning_rate": 0.0001, |
| "loss": 4.5351, |
| "loss/crossentropy": 2.516791993379593, |
| "loss/hidden": 3.598828125, |
| "loss/jsd": 0.09446065053343773, |
| "loss/logits": 0.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 19.375, |
| "grad_norm_var": 5.448942057291666, |
| "learning_rate": 0.0001, |
| "loss": 4.5509, |
| "loss/crossentropy": 2.2249866664409637, |
| "loss/hidden": 3.53203125, |
| "loss/jsd": 0.08729059183970093, |
| "loss/logits": 0.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 19.25, |
| "grad_norm_var": 5.459228515625, |
| "learning_rate": 0.0001, |
| "loss": 4.5135, |
| "loss/crossentropy": 2.2651902705430986, |
| "loss/hidden": 3.54765625, |
| "loss/jsd": 0.08877531317993999, |
| "loss/logits": 0.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 18.375, |
| "grad_norm_var": 2.688004557291667, |
| "learning_rate": 0.0001, |
| "loss": 4.4698, |
| "loss/crossentropy": 2.2470821171998976, |
| "loss/hidden": 3.562890625, |
| "loss/jsd": 0.09754009852185845, |
| "loss/logits": 0.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.1235, |
| "grad_norm": 20.0, |
| "grad_norm_var": 4.591910807291667, |
| "learning_rate": 0.0001, |
| "loss": 4.5083, |
| "loss/crossentropy": 2.1959333077073095, |
| "loss/hidden": 3.531640625, |
| "loss/jsd": 0.0921278445981443, |
| "loss/logits": 0.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 19.0, |
| "grad_norm_var": 3.9395182291666666, |
| "learning_rate": 0.0001, |
| "loss": 4.5378, |
| "loss/crossentropy": 2.2659239649772642, |
| "loss/hidden": 3.5875, |
| "loss/jsd": 0.09199469089508057, |
| "loss/logits": 0.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.1245, |
| "grad_norm": 18.375, |
| "grad_norm_var": 2.9302083333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.4709, |
| "loss/crossentropy": 2.2354795530438425, |
| "loss/hidden": 3.49140625, |
| "loss/jsd": 0.08955592634156347, |
| "loss/logits": 0.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 21.75, |
| "grad_norm_var": 3.544791666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.546, |
| "loss/crossentropy": 2.321756035089493, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.08398934034630656, |
| "loss/logits": 0.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.1255, |
| "grad_norm": 20.375, |
| "grad_norm_var": 3.5992024739583335, |
| "learning_rate": 0.0001, |
| "loss": 4.5442, |
| "loss/crossentropy": 2.327367161214352, |
| "loss/hidden": 3.562109375, |
| "loss/jsd": 0.08928178530186415, |
| "loss/logits": 0.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 18.875, |
| "grad_norm_var": 3.892041015625, |
| "learning_rate": 0.0001, |
| "loss": 4.4942, |
| "loss/crossentropy": 2.198644478619099, |
| "loss/hidden": 3.44921875, |
| "loss/jsd": 0.08295171349309385, |
| "loss/logits": 0.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.1265, |
| "grad_norm": 16.125, |
| "grad_norm_var": 5.773811848958333, |
| "learning_rate": 0.0001, |
| "loss": 4.576, |
| "loss/crossentropy": 2.472541335225105, |
| "loss/hidden": 3.597265625, |
| "loss/jsd": 0.10432412773370743, |
| "loss/logits": 0.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 20.875, |
| "grad_norm_var": 5.364583333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.5337, |
| "loss/crossentropy": 2.3647551596164704, |
| "loss/hidden": 3.618359375, |
| "loss/jsd": 0.10374335153028369, |
| "loss/logits": 0.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 37.25, |
| "grad_norm_var": 1281.695947265625, |
| "learning_rate": 0.0001, |
| "loss": 4.5825, |
| "loss/crossentropy": 2.2414861261844634, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.09403842501342297, |
| "loss/logits": 0.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 14.875, |
| "grad_norm_var": 1240.8051432291666, |
| "learning_rate": 0.0001, |
| "loss": 4.4589, |
| "loss/crossentropy": 2.234823814034462, |
| "loss/hidden": 3.509765625, |
| "loss/jsd": 0.08673453908413649, |
| "loss/logits": 0.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.1285, |
| "grad_norm": 22.75, |
| "grad_norm_var": 16.616080729166665, |
| "learning_rate": 0.0001, |
| "loss": 4.4816, |
| "loss/crossentropy": 2.387620323896408, |
| "loss/hidden": 3.55078125, |
| "loss/jsd": 0.08936102241277695, |
| "loss/logits": 0.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 16.125, |
| "grad_norm_var": 9.396809895833334, |
| "learning_rate": 0.0001, |
| "loss": 4.4124, |
| "loss/crossentropy": 2.1731797240674497, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.07968775480985642, |
| "loss/logits": 0.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.1295, |
| "grad_norm": 20.625, |
| "grad_norm_var": 8.490559895833334, |
| "learning_rate": 0.0001, |
| "loss": 4.4807, |
| "loss/crossentropy": 2.1817662701010705, |
| "loss/hidden": 3.676953125, |
| "loss/jsd": 0.09472927646711468, |
| "loss/logits": 0.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 21.375, |
| "grad_norm_var": 4.510416666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.5324, |
| "loss/crossentropy": 2.2697513103485107, |
| "loss/hidden": 3.570703125, |
| "loss/jsd": 0.08940641283988952, |
| "loss/logits": 0.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.1305, |
| "grad_norm": 18.375, |
| "grad_norm_var": 9.0634765625, |
| "learning_rate": 0.0001, |
| "loss": 4.4845, |
| "loss/crossentropy": 2.2707223266363146, |
| "loss/hidden": 3.52109375, |
| "loss/jsd": 0.09460832485929131, |
| "loss/logits": 0.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 66.0, |
| "grad_norm_var": 143.3619140625, |
| "learning_rate": 0.0001, |
| "loss": 4.5179, |
| "loss/crossentropy": 2.254822887480259, |
| "loss/hidden": 3.531640625, |
| "loss/jsd": 0.09221142884343862, |
| "loss/logits": 0.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.1315, |
| "grad_norm": 21.625, |
| "grad_norm_var": 143.6431640625, |
| "learning_rate": 0.0001, |
| "loss": 4.4947, |
| "loss/crossentropy": 2.347915455698967, |
| "loss/hidden": 3.58203125, |
| "loss/jsd": 0.09412752091884613, |
| "loss/logits": 0.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 17.375, |
| "grad_norm_var": 3.1708333333333334, |
| "learning_rate": 0.0001, |
| "loss": 4.528, |
| "loss/crossentropy": 2.2751280948519708, |
| "loss/hidden": 3.56328125, |
| "loss/jsd": 0.08851864533498884, |
| "loss/logits": 0.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 19.0, |
| "grad_norm_var": 13.576155598958334, |
| "learning_rate": 0.0001, |
| "loss": 4.4741, |
| "loss/crossentropy": 2.2658936589956284, |
| "loss/hidden": 3.54296875, |
| "loss/jsd": 0.08702889690175653, |
| "loss/logits": 0.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 21.625, |
| "grad_norm_var": 722.3559895833333, |
| "learning_rate": 0.0001, |
| "loss": 4.5137, |
| "loss/crossentropy": 2.1781817600131035, |
| "loss/hidden": 3.56484375, |
| "loss/jsd": 0.09932177630253136, |
| "loss/logits": 0.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.1335, |
| "grad_norm": 17.5, |
| "grad_norm_var": 2.5122395833333333, |
| "learning_rate": 0.0001, |
| "loss": 4.4842, |
| "loss/crossentropy": 2.3243243932724, |
| "loss/hidden": 3.6015625, |
| "loss/jsd": 0.09606684306636452, |
| "loss/logits": 0.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 16.75, |
| "grad_norm_var": 38.6853515625, |
| "learning_rate": 0.0001, |
| "loss": 4.5164, |
| "loss/crossentropy": 2.389857916533947, |
| "loss/hidden": 3.610546875, |
| "loss/jsd": 0.09261430930346251, |
| "loss/logits": 0.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.1345, |
| "grad_norm": 19.25, |
| "grad_norm_var": 4.7369140625, |
| "learning_rate": 0.0001, |
| "loss": 4.4902, |
| "loss/crossentropy": 2.3873065978288652, |
| "loss/hidden": 3.496875, |
| "loss/jsd": 0.08840383114293218, |
| "loss/logits": 0.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 20.5, |
| "grad_norm_var": 3.177018229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.4057, |
| "loss/crossentropy": 2.338147234916687, |
| "loss/hidden": 3.534375, |
| "loss/jsd": 0.09641889259219169, |
| "loss/logits": 0.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.1355, |
| "grad_norm": 17.75, |
| "grad_norm_var": 8.540999348958334, |
| "learning_rate": 0.0001, |
| "loss": 4.513, |
| "loss/crossentropy": 2.2362188696861267, |
| "loss/hidden": 3.6625, |
| "loss/jsd": 0.10190682755783201, |
| "loss/logits": 0.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 22.75, |
| "grad_norm_var": 38.66183268229167, |
| "learning_rate": 0.0001, |
| "loss": 4.6171, |
| "loss/crossentropy": 2.222167354822159, |
| "loss/hidden": 3.628125, |
| "loss/jsd": 0.11221090480685234, |
| "loss/logits": 0.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.1365, |
| "grad_norm": 16.375, |
| "grad_norm_var": 37.61015625, |
| "learning_rate": 0.0001, |
| "loss": 4.539, |
| "loss/crossentropy": 2.3914648950099946, |
| "loss/hidden": 3.59765625, |
| "loss/jsd": 0.09164496380835771, |
| "loss/logits": 0.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 16.5, |
| "grad_norm_var": 2.32890625, |
| "learning_rate": 0.0001, |
| "loss": 4.6123, |
| "loss/crossentropy": 2.385912075638771, |
| "loss/hidden": 3.541796875, |
| "loss/jsd": 0.08835116708651185, |
| "loss/logits": 0.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 17.75, |
| "grad_norm_var": 6.953125, |
| "learning_rate": 0.0001, |
| "loss": 4.5453, |
| "loss/crossentropy": 2.291456125676632, |
| "loss/hidden": 3.590625, |
| "loss/jsd": 0.0944554246030748, |
| "loss/logits": 0.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 18.875, |
| "grad_norm_var": 23.817643229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.4631, |
| "loss/crossentropy": 2.2219670079648495, |
| "loss/hidden": 3.562890625, |
| "loss/jsd": 0.08628002055920661, |
| "loss/logits": 0.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.1385, |
| "grad_norm": 17.5, |
| "grad_norm_var": 22.864518229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.4415, |
| "loss/crossentropy": 2.3799121528863907, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.09274168154224753, |
| "loss/logits": 0.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 20.25, |
| "grad_norm_var": 6.460791015625, |
| "learning_rate": 0.0001, |
| "loss": 4.5309, |
| "loss/crossentropy": 2.1055190823972225, |
| "loss/hidden": 3.534765625, |
| "loss/jsd": 0.08359876750037074, |
| "loss/logits": 0.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.1395, |
| "grad_norm": 17.875, |
| "grad_norm_var": 5.627587890625, |
| "learning_rate": 0.0001, |
| "loss": 4.4825, |
| "loss/crossentropy": 2.3334684520959854, |
| "loss/hidden": 3.533984375, |
| "loss/jsd": 0.0967961790971458, |
| "loss/logits": 0.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 17.125, |
| "grad_norm_var": 4.620556640625, |
| "learning_rate": 0.0001, |
| "loss": 4.4282, |
| "loss/crossentropy": 2.42918943464756, |
| "loss/hidden": 3.526953125, |
| "loss/jsd": 0.08855977468192577, |
| "loss/logits": 0.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.1405, |
| "grad_norm": 18.5, |
| "grad_norm_var": 5.042643229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.4503, |
| "loss/crossentropy": 2.272622914612293, |
| "loss/hidden": 3.58671875, |
| "loss/jsd": 0.0873057721182704, |
| "loss/logits": 0.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 18.125, |
| "grad_norm_var": 3.658837890625, |
| "learning_rate": 0.0001, |
| "loss": 4.4984, |
| "loss/crossentropy": 2.1938667565584185, |
| "loss/hidden": 3.553515625, |
| "loss/jsd": 0.08576443083584309, |
| "loss/logits": 0.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.1415, |
| "grad_norm": 20.375, |
| "grad_norm_var": 3.8306640625, |
| "learning_rate": 0.0001, |
| "loss": 4.5033, |
| "loss/crossentropy": 2.3123946458101274, |
| "loss/hidden": 3.56171875, |
| "loss/jsd": 0.0924573240801692, |
| "loss/logits": 0.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 22.75, |
| "grad_norm_var": 5.495768229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.4168, |
| "loss/crossentropy": 2.3987593173980715, |
| "loss/hidden": 3.509765625, |
| "loss/jsd": 0.0879776468500495, |
| "loss/logits": 0.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 17.375, |
| "grad_norm_var": 9.082747395833334, |
| "learning_rate": 0.0001, |
| "loss": 4.4669, |
| "loss/crossentropy": 2.2637423157691954, |
| "loss/hidden": 3.5609375, |
| "loss/jsd": 0.10491609480232, |
| "loss/logits": 0.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 16.625, |
| "grad_norm_var": 5.2478515625, |
| "learning_rate": 0.0001, |
| "loss": 4.3874, |
| "loss/crossentropy": 2.36127190887928, |
| "loss/hidden": 3.5421875, |
| "loss/jsd": 0.08668355047702789, |
| "loss/logits": 0.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.1435, |
| "grad_norm": 20.625, |
| "grad_norm_var": 7.6447265625, |
| "learning_rate": 0.0001, |
| "loss": 4.4512, |
| "loss/crossentropy": 2.4082365155220034, |
| "loss/hidden": 3.561328125, |
| "loss/jsd": 0.09474811758846044, |
| "loss/logits": 0.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 21.125, |
| "grad_norm_var": 8.8734375, |
| "learning_rate": 0.0001, |
| "loss": 4.4895, |
| "loss/crossentropy": 2.2012635439634325, |
| "loss/hidden": 3.4703125, |
| "loss/jsd": 0.08498403234407306, |
| "loss/logits": 0.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.1445, |
| "grad_norm": 21.625, |
| "grad_norm_var": 3.4480305989583333, |
| "learning_rate": 0.0001, |
| "loss": 4.447, |
| "loss/crossentropy": 2.298944839835167, |
| "loss/hidden": 3.555078125, |
| "loss/jsd": 0.09882149025797844, |
| "loss/logits": 0.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 16.75, |
| "grad_norm_var": 2.8893229166666665, |
| "learning_rate": 0.0001, |
| "loss": 4.5259, |
| "loss/crossentropy": 2.4170736342668535, |
| "loss/hidden": 3.62265625, |
| "loss/jsd": 0.10080426596105099, |
| "loss/logits": 0.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.1455, |
| "grad_norm": 24.75, |
| "grad_norm_var": 9.176041666666666, |
| "learning_rate": 0.0001, |
| "loss": 4.4133, |
| "loss/crossentropy": 2.266545096039772, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.08616750100627542, |
| "loss/logits": 0.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 23.5, |
| "grad_norm_var": 10.60859375, |
| "learning_rate": 0.0001, |
| "loss": 4.4536, |
| "loss/crossentropy": 2.3165148913860323, |
| "loss/hidden": 3.42890625, |
| "loss/jsd": 0.07857409287244081, |
| "loss/logits": 0.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.1465, |
| "grad_norm": 18.125, |
| "grad_norm_var": 4.756884765625, |
| "learning_rate": 0.0001, |
| "loss": 4.4494, |
| "loss/crossentropy": 2.275170993804932, |
| "loss/hidden": 3.52421875, |
| "loss/jsd": 0.08944948101416231, |
| "loss/logits": 0.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 19.375, |
| "grad_norm_var": 4.254931640625, |
| "learning_rate": 0.0001, |
| "loss": 4.4176, |
| "loss/crossentropy": 2.0632098406553268, |
| "loss/hidden": 3.54296875, |
| "loss/jsd": 0.09174134442582726, |
| "loss/logits": 0.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 20.75, |
| "grad_norm_var": 3.086442057291667, |
| "learning_rate": 0.0001, |
| "loss": 4.4606, |
| "loss/crossentropy": 2.251816061139107, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.08638259647414089, |
| "loss/logits": 0.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 44.75, |
| "grad_norm_var": 49.42265625, |
| "learning_rate": 0.0001, |
| "loss": 4.4651, |
| "loss/crossentropy": 2.1415898591279983, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.08234252617694438, |
| "loss/logits": 0.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.1485, |
| "grad_norm": 21.0, |
| "grad_norm_var": 47.44993489583333, |
| "learning_rate": 0.0001, |
| "loss": 4.4563, |
| "loss/crossentropy": 2.2365823119878767, |
| "loss/hidden": 3.542578125, |
| "loss/jsd": 0.09432788556441665, |
| "loss/logits": 0.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 16.875, |
| "grad_norm_var": 3.926155598958333, |
| "learning_rate": 0.0001, |
| "loss": 4.4151, |
| "loss/crossentropy": 2.397647699713707, |
| "loss/hidden": 3.58984375, |
| "loss/jsd": 0.09230242855846882, |
| "loss/logits": 0.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.1495, |
| "grad_norm": 18.25, |
| "grad_norm_var": 2.0455729166666665, |
| "learning_rate": 0.0001, |
| "loss": 4.4923, |
| "loss/crossentropy": 2.270119884610176, |
| "loss/hidden": 3.588671875, |
| "loss/jsd": 0.09977766564115882, |
| "loss/logits": 0.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 20.875, |
| "grad_norm_var": 5.620947265625, |
| "learning_rate": 0.0001, |
| "loss": 4.424, |
| "loss/crossentropy": 2.274160121381283, |
| "loss/hidden": 3.493359375, |
| "loss/jsd": 0.08368044728413224, |
| "loss/logits": 0.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.1505, |
| "grad_norm": 18.0, |
| "grad_norm_var": 5.805843098958333, |
| "learning_rate": 0.0001, |
| "loss": 4.4366, |
| "loss/crossentropy": 2.2047403126955034, |
| "loss/hidden": 3.48125, |
| "loss/jsd": 0.09485792317427695, |
| "loss/logits": 0.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 15.75, |
| "grad_norm_var": 3.952018229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.4361, |
| "loss/crossentropy": 2.426369333267212, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.09415734186768532, |
| "loss/logits": 0.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.1515, |
| "grad_norm": 20.5, |
| "grad_norm_var": 27.647916666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.386, |
| "loss/crossentropy": 2.1474297270178795, |
| "loss/hidden": 3.485546875, |
| "loss/jsd": 0.08619686132296919, |
| "loss/logits": 0.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 18.125, |
| "grad_norm_var": 7.828369140625, |
| "learning_rate": 0.0001, |
| "loss": 4.5285, |
| "loss/crossentropy": 2.343987912684679, |
| "loss/hidden": 3.58515625, |
| "loss/jsd": 0.09171235403046012, |
| "loss/logits": 0.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 18.25, |
| "grad_norm_var": 8.158707682291666, |
| "learning_rate": 0.0001, |
| "loss": 4.4394, |
| "loss/crossentropy": 2.3131785288453104, |
| "loss/hidden": 3.59453125, |
| "loss/jsd": 0.09000935666263103, |
| "loss/logits": 0.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 37.0, |
| "grad_norm_var": 68.850244140625, |
| "learning_rate": 0.0001, |
| "loss": 4.4266, |
| "loss/crossentropy": 2.333009423315525, |
| "loss/hidden": 3.5515625, |
| "loss/jsd": 0.10508564142510295, |
| "loss/logits": 0.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.1535, |
| "grad_norm": 18.75, |
| "grad_norm_var": 69.41712239583333, |
| "learning_rate": 0.0001, |
| "loss": 4.4924, |
| "loss/crossentropy": 2.3548025131225585, |
| "loss/hidden": 3.560546875, |
| "loss/jsd": 0.08788978308439255, |
| "loss/logits": 0.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 19.125, |
| "grad_norm_var": 2.856494140625, |
| "learning_rate": 0.0001, |
| "loss": 4.4327, |
| "loss/crossentropy": 2.196867881715298, |
| "loss/hidden": 3.598046875, |
| "loss/jsd": 0.09646046198904515, |
| "loss/logits": 0.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.1545, |
| "grad_norm": 22.375, |
| "grad_norm_var": 3.7356770833333335, |
| "learning_rate": 0.0001, |
| "loss": 4.4178, |
| "loss/crossentropy": 2.2703019440174104, |
| "loss/hidden": 3.5359375, |
| "loss/jsd": 0.09918619338423014, |
| "loss/logits": 0.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 21.5, |
| "grad_norm_var": 10.3275390625, |
| "learning_rate": 0.0001, |
| "loss": 4.3676, |
| "loss/crossentropy": 2.2433530882000925, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.08071139380335808, |
| "loss/logits": 0.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.1555, |
| "grad_norm": 18.5, |
| "grad_norm_var": 10.381510416666666, |
| "learning_rate": 0.0001, |
| "loss": 4.3876, |
| "loss/crossentropy": 2.229444594681263, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.08576273424550891, |
| "loss/logits": 0.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 15.6875, |
| "grad_norm_var": 4.5712890625, |
| "learning_rate": 0.0001, |
| "loss": 4.345, |
| "loss/crossentropy": 2.2901594534516336, |
| "loss/hidden": 3.35859375, |
| "loss/jsd": 0.07994853192940354, |
| "loss/logits": 0.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.1565, |
| "grad_norm": 15.625, |
| "grad_norm_var": 7.159358723958333, |
| "learning_rate": 0.0001, |
| "loss": 4.3626, |
| "loss/crossentropy": 2.351148310303688, |
| "loss/hidden": 3.499609375, |
| "loss/jsd": 0.089451711345464, |
| "loss/logits": 0.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 16.375, |
| "grad_norm_var": 6.505192057291667, |
| "learning_rate": 0.0001, |
| "loss": 4.4232, |
| "loss/crossentropy": 2.1818712055683136, |
| "loss/hidden": 3.45234375, |
| "loss/jsd": 0.0778524660039693, |
| "loss/logits": 0.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 16.75, |
| "grad_norm_var": 2.1102701822916665, |
| "learning_rate": 0.0001, |
| "loss": 4.4401, |
| "loss/crossentropy": 2.349280393123627, |
| "loss/hidden": 3.570703125, |
| "loss/jsd": 0.09622437562793493, |
| "loss/logits": 0.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 17.625, |
| "grad_norm_var": 2.701546223958333, |
| "learning_rate": 0.0001, |
| "loss": 4.3796, |
| "loss/crossentropy": 2.329416874051094, |
| "loss/hidden": 3.539453125, |
| "loss/jsd": 0.08784733964130283, |
| "loss/logits": 0.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.1585, |
| "grad_norm": 16.375, |
| "grad_norm_var": 2.156363932291667, |
| "learning_rate": 0.0001, |
| "loss": 4.2973, |
| "loss/crossentropy": 2.2838528990745544, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.08452709410339594, |
| "loss/logits": 0.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 19.0, |
| "grad_norm_var": 4.3259765625, |
| "learning_rate": 0.0001, |
| "loss": 4.4271, |
| "loss/crossentropy": 2.1395395755767823, |
| "loss/hidden": 3.5203125, |
| "loss/jsd": 0.10172450188547373, |
| "loss/logits": 0.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.1595, |
| "grad_norm": 19.375, |
| "grad_norm_var": 4.408268229166667, |
| "learning_rate": 0.0001, |
| "loss": 4.4574, |
| "loss/crossentropy": 2.394977739453316, |
| "loss/hidden": 3.6234375, |
| "loss/jsd": 0.10171002727001906, |
| "loss/logits": 0.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 16.875, |
| "grad_norm_var": 6.147379557291667, |
| "learning_rate": 0.0001, |
| "loss": 4.4832, |
| "loss/crossentropy": 2.355364751815796, |
| "loss/hidden": 3.50546875, |
| "loss/jsd": 0.08951211860403419, |
| "loss/logits": 0.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.1605, |
| "grad_norm": 28.625, |
| "grad_norm_var": 1.244752705334018e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.4816, |
| "loss/crossentropy": 2.305925354361534, |
| "loss/hidden": 3.58828125, |
| "loss/jsd": 0.08834340209141374, |
| "loss/logits": 0.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 18.25, |
| "grad_norm_var": 9.839518229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.4863, |
| "loss/crossentropy": 2.3078080981969835, |
| "loss/hidden": 3.5203125, |
| "loss/jsd": 0.0940877721644938, |
| "loss/logits": 0.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.1615, |
| "grad_norm": 16.125, |
| "grad_norm_var": 2.7150390625, |
| "learning_rate": 0.0001, |
| "loss": 4.4064, |
| "loss/crossentropy": 2.4253244906663896, |
| "loss/hidden": 3.58125, |
| "loss/jsd": 0.09865610068663955, |
| "loss/logits": 0.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 18.75, |
| "grad_norm_var": 4.934749348958333, |
| "learning_rate": 0.0001, |
| "loss": 4.3174, |
| "loss/crossentropy": 2.4989412158727644, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.08196726078167557, |
| "loss/logits": 0.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 16.375, |
| "grad_norm_var": 2.9952962239583334, |
| "learning_rate": 0.0001, |
| "loss": 4.3693, |
| "loss/crossentropy": 2.2475881457328795, |
| "loss/hidden": 3.47109375, |
| "loss/jsd": 0.09224425395950675, |
| "loss/logits": 0.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 15.3125, |
| "grad_norm_var": 1.9930826822916667, |
| "learning_rate": 0.0001, |
| "loss": 4.3783, |
| "loss/crossentropy": 2.2365039557218553, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.09003520868718624, |
| "loss/logits": 0.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.1635, |
| "grad_norm": 17.375, |
| "grad_norm_var": 2.0921223958333335, |
| "learning_rate": 0.0001, |
| "loss": 4.4329, |
| "loss/crossentropy": 2.266706997156143, |
| "loss/hidden": 3.542578125, |
| "loss/jsd": 0.09133774926885962, |
| "loss/logits": 0.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 17.0, |
| "grad_norm_var": 2.531510416666667, |
| "learning_rate": 0.0001, |
| "loss": 4.5264, |
| "loss/crossentropy": 2.292432078719139, |
| "loss/hidden": 3.67265625, |
| "loss/jsd": 0.11363288760185242, |
| "loss/logits": 0.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.1645, |
| "grad_norm": 20.125, |
| "grad_norm_var": 2.6884765625, |
| "learning_rate": 0.0001, |
| "loss": 4.3941, |
| "loss/crossentropy": 2.308723744750023, |
| "loss/hidden": 3.60234375, |
| "loss/jsd": 0.09796320544555784, |
| "loss/logits": 0.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 19.0, |
| "grad_norm_var": 3.3436848958333334, |
| "learning_rate": 0.0001, |
| "loss": 4.3738, |
| "loss/crossentropy": 2.3898502081632613, |
| "loss/hidden": 3.471484375, |
| "loss/jsd": 0.08741156700998545, |
| "loss/logits": 0.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.1655, |
| "grad_norm": 19.875, |
| "grad_norm_var": 3.4898274739583335, |
| "learning_rate": 0.0001, |
| "loss": 4.3816, |
| "loss/crossentropy": 2.306541550159454, |
| "loss/hidden": 3.592578125, |
| "loss/jsd": 0.09893495552241802, |
| "loss/logits": 0.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 14.8125, |
| "grad_norm_var": 2.6378743489583334, |
| "learning_rate": 0.0001, |
| "loss": 4.37, |
| "loss/crossentropy": 2.3887303933501243, |
| "loss/hidden": 3.480859375, |
| "loss/jsd": 0.08493705298751593, |
| "loss/logits": 0.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.1665, |
| "grad_norm": 21.875, |
| "grad_norm_var": 3.0442545572916666, |
| "learning_rate": 0.0001, |
| "loss": 4.4504, |
| "loss/crossentropy": 2.3878179833292963, |
| "loss/hidden": 3.632421875, |
| "loss/jsd": 0.09913788838312029, |
| "loss/logits": 0.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 20.25, |
| "grad_norm_var": 10.913785807291667, |
| "learning_rate": 0.0001, |
| "loss": 4.4553, |
| "loss/crossentropy": 2.2205622404813767, |
| "loss/hidden": 3.636328125, |
| "loss/jsd": 0.10992270009592175, |
| "loss/logits": 0.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 22.0, |
| "grad_norm_var": 7.966520182291666, |
| "learning_rate": 0.0001, |
| "loss": 4.3545, |
| "loss/crossentropy": 2.108812813460827, |
| "loss/hidden": 3.40234375, |
| "loss/jsd": 0.0757693353574723, |
| "loss/logits": 0.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 15.6875, |
| "grad_norm_var": 3.8549479166666667, |
| "learning_rate": 0.0001, |
| "loss": 4.3713, |
| "loss/crossentropy": 2.289568629860878, |
| "loss/hidden": 3.541015625, |
| "loss/jsd": 0.10061329454183579, |
| "loss/logits": 0.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.1685, |
| "grad_norm": 17.5, |
| "grad_norm_var": 4.482145182291666, |
| "learning_rate": 0.0001, |
| "loss": 4.4519, |
| "loss/crossentropy": 2.352386988699436, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.08975700601004064, |
| "loss/logits": 0.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 18.875, |
| "grad_norm_var": 4.2978515625, |
| "learning_rate": 0.0001, |
| "loss": 4.3503, |
| "loss/crossentropy": 2.31557312309742, |
| "loss/hidden": 3.548828125, |
| "loss/jsd": 0.0878440142609179, |
| "loss/logits": 0.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.1695, |
| "grad_norm": 20.875, |
| "grad_norm_var": 5.702604166666666, |
| "learning_rate": 0.0001, |
| "loss": 4.3864, |
| "loss/crossentropy": 2.339674559235573, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0880395533517003, |
| "loss/logits": 0.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 15.1875, |
| "grad_norm_var": 5.098551432291667, |
| "learning_rate": 0.0001, |
| "loss": 4.3726, |
| "loss/crossentropy": 2.2533027648925783, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.08741035936400295, |
| "loss/logits": 0.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.1705, |
| "grad_norm": 21.875, |
| "grad_norm_var": 3.4983723958333335, |
| "learning_rate": 0.0001, |
| "loss": 4.3701, |
| "loss/crossentropy": 2.280165506899357, |
| "loss/hidden": 3.52890625, |
| "loss/jsd": 0.09410012043081224, |
| "loss/logits": 0.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 15.75, |
| "grad_norm_var": 3.486962890625, |
| "learning_rate": 0.0001, |
| "loss": 4.4465, |
| "loss/crossentropy": 2.3110105454921723, |
| "loss/hidden": 3.54453125, |
| "loss/jsd": 0.10350852748379111, |
| "loss/logits": 0.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.1715, |
| "grad_norm": 15.0625, |
| "grad_norm_var": 1.7901041666666666, |
| "learning_rate": 0.0001, |
| "loss": 4.2987, |
| "loss/crossentropy": 2.5183032125234606, |
| "loss/hidden": 3.541015625, |
| "loss/jsd": 0.0940008645877242, |
| "loss/logits": 0.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 16.625, |
| "grad_norm_var": 1.3207509498935816e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.3968, |
| "loss/crossentropy": 2.298141914606094, |
| "loss/hidden": 3.538671875, |
| "loss/jsd": 0.09232875565066934, |
| "loss/logits": 0.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 25.875, |
| "grad_norm_var": 10.539176432291667, |
| "learning_rate": 0.0001, |
| "loss": 4.405, |
| "loss/crossentropy": 2.4251868039369584, |
| "loss/hidden": 3.620703125, |
| "loss/jsd": 0.09764928705990314, |
| "loss/logits": 0.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 16.5, |
| "grad_norm_var": 37.917301432291666, |
| "learning_rate": 0.0001, |
| "loss": 4.3521, |
| "loss/crossentropy": 2.4465878754854202, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.09263761136680841, |
| "loss/logits": 0.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.1735, |
| "grad_norm": 19.5, |
| "grad_norm_var": 10.325895182291667, |
| "learning_rate": 0.0001, |
| "loss": 4.3055, |
| "loss/crossentropy": 2.341625288128853, |
| "loss/hidden": 3.46171875, |
| "loss/jsd": 0.08955673705786467, |
| "loss/logits": 0.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 18.25, |
| "grad_norm_var": 9.155582682291667, |
| "learning_rate": 0.0001, |
| "loss": 4.257, |
| "loss/crossentropy": 2.2626075088977813, |
| "loss/hidden": 3.38046875, |
| "loss/jsd": 0.08382235984317958, |
| "loss/logits": 0.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.1745, |
| "grad_norm": 24.25, |
| "grad_norm_var": 40.449853515625, |
| "learning_rate": 0.0001, |
| "loss": 4.365, |
| "loss/crossentropy": 2.22887095361948, |
| "loss/hidden": 3.495703125, |
| "loss/jsd": 0.08516010586172343, |
| "loss/logits": 0.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 22.202848307291667, |
| "learning_rate": 0.0001, |
| "loss": 4.4211, |
| "loss/crossentropy": 2.325581954419613, |
| "loss/hidden": 3.626171875, |
| "loss/jsd": 0.10623239502310752, |
| "loss/logits": 0.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.1755, |
| "grad_norm": 18.25, |
| "grad_norm_var": 20.083268229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.4183, |
| "loss/crossentropy": 2.3138944447040557, |
| "loss/hidden": 3.605078125, |
| "loss/jsd": 0.09691860349848866, |
| "loss/logits": 0.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 14.75, |
| "grad_norm_var": 3.0729166666666665, |
| "learning_rate": 0.0001, |
| "loss": 4.3327, |
| "loss/crossentropy": 2.335177455097437, |
| "loss/hidden": 3.516796875, |
| "loss/jsd": 0.08317867233417928, |
| "loss/logits": 0.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.1765, |
| "grad_norm": 29.375, |
| "grad_norm_var": 313.28943684895836, |
| "learning_rate": 0.0001, |
| "loss": 4.3804, |
| "loss/crossentropy": 2.2869663372635842, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.07983446251600981, |
| "loss/logits": 0.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 18.75, |
| "grad_norm_var": 294.8395182291667, |
| "learning_rate": 0.0001, |
| "loss": 4.4268, |
| "loss/crossentropy": 2.3664276599884033, |
| "loss/hidden": 3.541796875, |
| "loss/jsd": 0.09534696582704782, |
| "loss/logits": 0.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 16.875, |
| "grad_norm_var": 9.458707682291667, |
| "learning_rate": 0.0001, |
| "loss": 4.2841, |
| "loss/crossentropy": 2.3651267111301424, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.08079936136491597, |
| "loss/logits": 0.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 21.125, |
| "grad_norm_var": 4.593473307291666, |
| "learning_rate": 0.0001, |
| "loss": 4.2825, |
| "loss/crossentropy": 2.392011249065399, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.09476534733548761, |
| "loss/logits": 0.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.1785, |
| "grad_norm": 23.0, |
| "grad_norm_var": 3.734375, |
| "learning_rate": 0.0001, |
| "loss": 4.3928, |
| "loss/crossentropy": 2.4183569096028803, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0872859289869666, |
| "loss/logits": 0.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 17.375, |
| "grad_norm_var": 6.357405598958334, |
| "learning_rate": 0.0001, |
| "loss": 4.366, |
| "loss/crossentropy": 2.228242626786232, |
| "loss/hidden": 3.61484375, |
| "loss/jsd": 0.09655670188367367, |
| "loss/logits": 0.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.1795, |
| "grad_norm": 19.625, |
| "grad_norm_var": 4.720247395833334, |
| "learning_rate": 0.0001, |
| "loss": 4.3721, |
| "loss/crossentropy": 2.3514621019363404, |
| "loss/hidden": 3.55703125, |
| "loss/jsd": 0.08998525207862258, |
| "loss/logits": 0.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 17.75, |
| "grad_norm_var": 86.24099934895834, |
| "learning_rate": 0.0001, |
| "loss": 4.2887, |
| "loss/crossentropy": 2.3266086250543596, |
| "loss/hidden": 3.3921875, |
| "loss/jsd": 0.08203610377386213, |
| "loss/logits": 0.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.1805, |
| "grad_norm": 17.125, |
| "grad_norm_var": 6.689428671005983e+17, |
| "learning_rate": 0.0001, |
| "loss": 4.2832, |
| "loss/crossentropy": 2.358739697933197, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.07689286703243851, |
| "loss/logits": 0.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 19.75, |
| "grad_norm_var": 6.689428671857951e+17, |
| "learning_rate": 0.0001, |
| "loss": 4.2562, |
| "loss/crossentropy": 2.299892693758011, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.07548968028277159, |
| "loss/logits": 0.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.1815, |
| "grad_norm": 16.5, |
| "grad_norm_var": 3.8056640625, |
| "learning_rate": 0.0001, |
| "loss": 4.3453, |
| "loss/crossentropy": 2.1600609093904497, |
| "loss/hidden": 3.521484375, |
| "loss/jsd": 0.0914209995418787, |
| "loss/logits": 0.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 18.375, |
| "grad_norm_var": 5.445833333333334, |
| "learning_rate": 0.0001, |
| "loss": 4.261, |
| "loss/crossentropy": 2.2630960240960123, |
| "loss/hidden": 3.2734375, |
| "loss/jsd": 0.07350569609552622, |
| "loss/logits": 0.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 19.5, |
| "grad_norm_var": 4.786393229166666, |
| "learning_rate": 0.0001, |
| "loss": 4.2941, |
| "loss/crossentropy": 2.26744422018528, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.09071792410686612, |
| "loss/logits": 0.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 16.375, |
| "grad_norm_var": 3.675520833333333, |
| "learning_rate": 0.0001, |
| "loss": 4.216, |
| "loss/crossentropy": 2.392577236890793, |
| "loss/hidden": 3.39140625, |
| "loss/jsd": 0.0848071664571762, |
| "loss/logits": 0.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.1835, |
| "grad_norm": 18.625, |
| "grad_norm_var": 2.8169270833333333, |
| "learning_rate": 0.0001, |
| "loss": 4.2512, |
| "loss/crossentropy": 2.3719822376966477, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.08796066055074334, |
| "loss/logits": 0.0, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 18.125, |
| "grad_norm_var": 5.067692057291667, |
| "learning_rate": 0.0001, |
| "loss": 4.2015, |
| "loss/crossentropy": 2.2736427552998064, |
| "loss/hidden": 3.362109375, |
| "loss/jsd": 0.0805484069045633, |
| "loss/logits": 0.0, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.1845, |
| "grad_norm": 14.8125, |
| "grad_norm_var": 5.062434895833333, |
| "learning_rate": 0.0001, |
| "loss": 4.2791, |
| "loss/crossentropy": 2.298249673843384, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.08579938132315874, |
| "loss/logits": 0.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 17.75, |
| "grad_norm_var": 5.132747395833333, |
| "learning_rate": 0.0001, |
| "loss": 4.2845, |
| "loss/crossentropy": 2.29532730281353, |
| "loss/hidden": 3.43359375, |
| "loss/jsd": 0.08520804699510336, |
| "loss/logits": 0.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.1855, |
| "grad_norm": 21.375, |
| "grad_norm_var": 3.1105305989583334, |
| "learning_rate": 0.0001, |
| "loss": 4.2881, |
| "loss/crossentropy": 2.266036620736122, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.08662721011787652, |
| "loss/logits": 0.0, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 17.875, |
| "grad_norm_var": 2.1442057291666665, |
| "learning_rate": 0.0001, |
| "loss": 4.3015, |
| "loss/crossentropy": 2.3964017778635025, |
| "loss/hidden": 3.443359375, |
| "loss/jsd": 0.08621067805215717, |
| "loss/logits": 0.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.1865, |
| "grad_norm": 16.5, |
| "grad_norm_var": 3.620556640625, |
| "learning_rate": 0.0001, |
| "loss": 4.2411, |
| "loss/crossentropy": 2.3594220340251923, |
| "loss/hidden": 3.336328125, |
| "loss/jsd": 0.0772560654208064, |
| "loss/logits": 0.0, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 16.75, |
| "grad_norm_var": 3.252978515625, |
| "learning_rate": 0.0001, |
| "loss": 4.184, |
| "loss/crossentropy": 2.2494852378964425, |
| "loss/hidden": 3.32421875, |
| "loss/jsd": 0.08321888605132699, |
| "loss/logits": 0.0, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 18.875, |
| "grad_norm_var": 3.7570149739583334, |
| "learning_rate": 0.0001, |
| "loss": 4.2329, |
| "loss/crossentropy": 2.178547790646553, |
| "loss/hidden": 3.322265625, |
| "loss/jsd": 0.07429210902191699, |
| "loss/logits": 0.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 16.125, |
| "grad_norm_var": 4.394645182291667, |
| "learning_rate": 0.0001, |
| "loss": 4.2641, |
| "loss/crossentropy": 2.2659785449504852, |
| "loss/hidden": 3.4265625, |
| "loss/jsd": 0.08605121849104762, |
| "loss/logits": 0.0, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.1885, |
| "grad_norm": 19.375, |
| "grad_norm_var": 4.303889973958333, |
| "learning_rate": 0.0001, |
| "loss": 4.2343, |
| "loss/crossentropy": 2.3981280818581583, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0853766439948231, |
| "loss/logits": 0.0, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 15.75, |
| "grad_norm_var": 3.1048014322916666, |
| "learning_rate": 0.0001, |
| "loss": 4.2052, |
| "loss/crossentropy": 2.2695484533905983, |
| "loss/hidden": 3.512109375, |
| "loss/jsd": 0.09265543352812529, |
| "loss/logits": 0.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.1895, |
| "grad_norm": 15.3125, |
| "grad_norm_var": 2.510139973958333, |
| "learning_rate": 0.0001, |
| "loss": 4.2526, |
| "loss/crossentropy": 2.1818419501185415, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.08593555409461259, |
| "loss/logits": 0.0, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 15.0625, |
| "grad_norm_var": 2.105729166666667, |
| "learning_rate": 0.0001, |
| "loss": 4.2301, |
| "loss/crossentropy": 2.218023180961609, |
| "loss/hidden": 3.334765625, |
| "loss/jsd": 0.07895527156069874, |
| "loss/logits": 0.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.1905, |
| "grad_norm": 15.5625, |
| "grad_norm_var": 6.692708333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.241, |
| "loss/crossentropy": 2.2893342286348344, |
| "loss/hidden": 3.296875, |
| "loss/jsd": 0.07659890875220299, |
| "loss/logits": 0.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 17.25, |
| "grad_norm_var": 5.272509765625, |
| "learning_rate": 0.0001, |
| "loss": 4.2655, |
| "loss/crossentropy": 2.207420842349529, |
| "loss/hidden": 3.36484375, |
| "loss/jsd": 0.08572290684096515, |
| "loss/logits": 0.0, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.1915, |
| "grad_norm": 21.625, |
| "grad_norm_var": 5.355712890625, |
| "learning_rate": 0.0001, |
| "loss": 4.2474, |
| "loss/crossentropy": 2.3076944231986998, |
| "loss/hidden": 3.31796875, |
| "loss/jsd": 0.07485279012471438, |
| "loss/logits": 0.0, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 15.25, |
| "grad_norm_var": 6.646858723958333, |
| "learning_rate": 0.0001, |
| "loss": 4.2634, |
| "loss/crossentropy": 2.42186721265316, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.08714157855138183, |
| "loss/logits": 0.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 4076863488.0, |
| "grad_norm_var": 1.0388009843068502e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.2739, |
| "loss/crossentropy": 2.3014174938201903, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.07954654460772873, |
| "loss/logits": 0.0, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 17.5, |
| "grad_norm_var": 1.0388009847272768e+18, |
| "learning_rate": 0.0001, |
| "loss": 4.2215, |
| "loss/crossentropy": 2.348978337645531, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.08323998479172587, |
| "loss/logits": 0.0, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.1935, |
| "grad_norm": 18.75, |
| "grad_norm_var": 3.981884765625, |
| "learning_rate": 0.0001, |
| "loss": 4.1528, |
| "loss/crossentropy": 2.3754432618618013, |
| "loss/hidden": 3.401171875, |
| "loss/jsd": 0.08722320841625333, |
| "loss/logits": 0.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 18.875, |
| "grad_norm_var": 3.824072265625, |
| "learning_rate": 0.0001, |
| "loss": 4.2868, |
| "loss/crossentropy": 2.3063534289598464, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.08732216758653522, |
| "loss/logits": 0.0, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.1945, |
| "grad_norm": 18.25, |
| "grad_norm_var": 3.3018229166666666, |
| "learning_rate": 0.0001, |
| "loss": 4.2463, |
| "loss/crossentropy": 2.4114058747887612, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.08345712553709746, |
| "loss/logits": 0.0, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 18.0, |
| "grad_norm_var": 3.595833333333333, |
| "learning_rate": 0.0001, |
| "loss": 4.2128, |
| "loss/crossentropy": 2.1565380930900573, |
| "loss/hidden": 3.23984375, |
| "loss/jsd": 0.07183347269892693, |
| "loss/logits": 0.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.1955, |
| "grad_norm": 19.5, |
| "grad_norm_var": 1.669775390625, |
| "learning_rate": 0.0001, |
| "loss": 4.2174, |
| "loss/crossentropy": 2.4012755006551743, |
| "loss/hidden": 3.35859375, |
| "loss/jsd": 0.08356887567788363, |
| "loss/logits": 0.0, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 16.75, |
| "grad_norm_var": 2.569775390625, |
| "learning_rate": 0.0001, |
| "loss": 4.2516, |
| "loss/crossentropy": 2.4133204758167266, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.08416441585868598, |
| "loss/logits": 0.0, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.1965, |
| "grad_norm": 16.125, |
| "grad_norm_var": 4.249072265625, |
| "learning_rate": 0.0001, |
| "loss": 4.2424, |
| "loss/crossentropy": 2.2017408296465875, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.08481362634338438, |
| "loss/logits": 0.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 18.0, |
| "grad_norm_var": 13.563541666666667, |
| "learning_rate": 0.0001, |
| "loss": 4.2145, |
| "loss/crossentropy": 2.1327252730727198, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.08263032594695688, |
| "loss/logits": 0.0, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 15.9375, |
| "grad_norm_var": 13.279801432291666, |
| "learning_rate": 0.0001, |
| "loss": 4.271, |
| "loss/crossentropy": 2.3732340067625044, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.09080582885071635, |
| "loss/logits": 0.0, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 14.125, |
| "grad_norm_var": 3.5541015625, |
| "learning_rate": 0.0001, |
| "loss": 4.3277, |
| "loss/crossentropy": 2.2829252019524575, |
| "loss/hidden": 3.504296875, |
| "loss/jsd": 0.09264815384522081, |
| "loss/logits": 0.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.1985, |
| "grad_norm": 20.125, |
| "grad_norm_var": 5.493212890625, |
| "learning_rate": 0.0001, |
| "loss": 4.3215, |
| "loss/crossentropy": 2.284733434021473, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.08987429440021515, |
| "loss/logits": 0.0, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 17.0, |
| "grad_norm_var": 5.512223307291666, |
| "learning_rate": 0.0001, |
| "loss": 4.2933, |
| "loss/crossentropy": 2.2337097018957137, |
| "loss/hidden": 3.3640625, |
| "loss/jsd": 0.0808649729937315, |
| "loss/logits": 0.0, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.1995, |
| "grad_norm": 16.625, |
| "grad_norm_var": 12.917122395833333, |
| "learning_rate": 0.0001, |
| "loss": 4.2148, |
| "loss/crossentropy": 2.3057729706168173, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.08738104859367013, |
| "loss/logits": 0.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 15.9375, |
| "grad_norm_var": 5.007145182291667, |
| "learning_rate": 0.0001, |
| "loss": 4.217, |
| "loss/crossentropy": 2.3626988530158997, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.09443312305957079, |
| "loss/logits": 0.0, |
| "step": 4000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 20000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1430040128035226e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|