diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12033 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 2000, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00025, + "grad_norm": 39.5, + "learning_rate": 0.0001, + "loss": 7.8298, + "loss/crossentropy": 2.313796639442444, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.21518087349832057, + "step": 10 + }, + { + "epoch": 0.0005, + "grad_norm": 31.5, + "grad_norm_var": 5.698893229166667, + "learning_rate": 0.0001, + "loss": 7.8693, + "loss/crossentropy": 2.1564369201660156, + "loss/hidden": 3.587109375, + "loss/jsd": 0.0, + "loss/logits": 0.21401480734348297, + "step": 20 + }, + { + "epoch": 0.00075, + "grad_norm": 36.0, + "grad_norm_var": 6.930143229166666, + "learning_rate": 0.0001, + "loss": 7.8779, + "loss/crossentropy": 2.179039953649044, + "loss/hidden": 3.709375, + "loss/jsd": 0.0, + "loss/logits": 0.22207003347575666, + "step": 30 + }, + { + "epoch": 0.001, + "grad_norm": 32.75, + "grad_norm_var": 40.942708333333336, + "learning_rate": 0.0001, + "loss": 7.7653, + "loss/crossentropy": 2.074952059984207, + "loss/hidden": 3.55625, + "loss/jsd": 0.0, + "loss/logits": 0.20403100922703743, + "step": 40 + }, + { + "epoch": 0.00125, + "grad_norm": 35.75, + "grad_norm_var": 94.25729166666666, + "learning_rate": 0.0001, + "loss": 7.8641, + "loss/crossentropy": 2.087546107172966, + "loss/hidden": 3.50546875, + "loss/jsd": 0.0, + "loss/logits": 0.19412125833332539, + "step": 50 + }, + { + "epoch": 0.0015, + "grad_norm": 30.25, + "grad_norm_var": 110.89140625, + "learning_rate": 0.0001, + "loss": 7.8652, + "loss/crossentropy": 2.2259810894727705, + "loss/hidden": 3.528125, + "loss/jsd": 0.0, + "loss/logits": 0.21063638497143983, + "step": 60 + }, + { + "epoch": 0.00175, + "grad_norm": 36.0, + "grad_norm_var": 62.02805989583333, + "learning_rate": 0.0001, + "loss": 7.751, + "loss/crossentropy": 2.164659637212753, + "loss/hidden": 3.47109375, + "loss/jsd": 0.0, + "loss/logits": 0.19977533183991908, + "step": 70 + }, + { + "epoch": 0.002, + "grad_norm": 34.25, + "grad_norm_var": 8.6759765625, + "learning_rate": 0.0001, + "loss": 7.7596, + "loss/crossentropy": 2.097026476264, + "loss/hidden": 3.478125, + "loss/jsd": 0.0, + "loss/logits": 0.20290330462157727, + "step": 80 + }, + { + "epoch": 0.00225, + "grad_norm": 39.75, + "grad_norm_var": 71.35598958333334, + "learning_rate": 0.0001, + "loss": 7.8106, + "loss/crossentropy": 2.1291788890957832, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.19771635457873343, + "step": 90 + }, + { + "epoch": 0.0025, + "grad_norm": 34.25, + "grad_norm_var": 9.158072916666667, + "learning_rate": 0.0001, + "loss": 7.7473, + "loss/crossentropy": 2.147798593342304, + "loss/hidden": 3.558203125, + "loss/jsd": 0.0, + "loss/logits": 0.20517258979380132, + "step": 100 + }, + { + "epoch": 0.00275, + "grad_norm": 31.625, + "grad_norm_var": 9.737239583333333, + "learning_rate": 0.0001, + "loss": 7.7738, + "loss/crossentropy": 2.1884776622056963, + "loss/hidden": 3.458203125, + "loss/jsd": 0.0, + "loss/logits": 0.20624704901129007, + "step": 110 + }, + { + "epoch": 0.003, + "grad_norm": 37.75, + "grad_norm_var": 335.18170572916665, + "learning_rate": 0.0001, + "loss": 7.8546, + "loss/crossentropy": 2.2224678859114646, + "loss/hidden": 3.5296875, + "loss/jsd": 0.0, + "loss/logits": 0.22259013392031193, + "step": 120 + }, + { + "epoch": 0.00325, + "grad_norm": 136.0, + "grad_norm_var": 1014.1374348958333, + "learning_rate": 0.0001, + "loss": 7.7227, + "loss/crossentropy": 2.135145714879036, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.21895913481712342, + "step": 130 + }, + { + "epoch": 0.0035, + "grad_norm": 36.25, + "grad_norm_var": 663.2072916666667, + "learning_rate": 0.0001, + "loss": 7.6794, + "loss/crossentropy": 2.2155070066452027, + "loss/hidden": 3.428515625, + "loss/jsd": 0.0, + "loss/logits": 0.18895817659795283, + "step": 140 + }, + { + "epoch": 0.00375, + "grad_norm": 38.5, + "grad_norm_var": 54.0384765625, + "learning_rate": 0.0001, + "loss": 7.7461, + "loss/crossentropy": 2.1793935388326644, + "loss/hidden": 3.5140625, + "loss/jsd": 0.0, + "loss/logits": 0.19897108823060988, + "step": 150 + }, + { + "epoch": 0.004, + "grad_norm": 62.0, + "grad_norm_var": 99.1447265625, + "learning_rate": 0.0001, + "loss": 7.7956, + "loss/crossentropy": 2.194957372546196, + "loss/hidden": 3.597265625, + "loss/jsd": 0.0, + "loss/logits": 0.22534323409199714, + "step": 160 + }, + { + "epoch": 0.00425, + "grad_norm": 32.5, + "grad_norm_var": 62.60390625, + "learning_rate": 0.0001, + "loss": 7.7866, + "loss/crossentropy": 2.1939920127391814, + "loss/hidden": 3.529296875, + "loss/jsd": 0.0, + "loss/logits": 0.2311840608716011, + "step": 170 + }, + { + "epoch": 0.0045, + "grad_norm": 33.5, + "grad_norm_var": 10.049934895833333, + "learning_rate": 0.0001, + "loss": 7.6691, + "loss/crossentropy": 2.1646964073181154, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.2066217228770256, + "step": 180 + }, + { + "epoch": 0.00475, + "grad_norm": 32.75, + "grad_norm_var": 13.889322916666666, + "learning_rate": 0.0001, + "loss": 7.8529, + "loss/crossentropy": 2.135753521323204, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.23793395943939685, + "step": 190 + }, + { + "epoch": 0.005, + "grad_norm": 30.375, + "grad_norm_var": 14.12265625, + "learning_rate": 0.0001, + "loss": 7.7357, + "loss/crossentropy": 2.1783783614635466, + "loss/hidden": 3.540234375, + "loss/jsd": 0.0, + "loss/logits": 0.2111268475651741, + "step": 200 + }, + { + "epoch": 0.00525, + "grad_norm": 47.75, + "grad_norm_var": 167.33014322916668, + "learning_rate": 0.0001, + "loss": 7.8414, + "loss/crossentropy": 2.091558237373829, + "loss/hidden": 3.49765625, + "loss/jsd": 0.0, + "loss/logits": 0.2065280582755804, + "step": 210 + }, + { + "epoch": 0.0055, + "grad_norm": 30.625, + "grad_norm_var": 185.38899739583334, + "learning_rate": 0.0001, + "loss": 7.7181, + "loss/crossentropy": 2.1866263896226883, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.20596572011709213, + "step": 220 + }, + { + "epoch": 0.00575, + "grad_norm": 30.125, + "grad_norm_var": 45.847330729166664, + "learning_rate": 0.0001, + "loss": 7.6276, + "loss/crossentropy": 2.1170753836631775, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.20233637914061547, + "step": 230 + }, + { + "epoch": 0.006, + "grad_norm": 33.75, + "grad_norm_var": 17.477083333333333, + "learning_rate": 0.0001, + "loss": 7.7487, + "loss/crossentropy": 2.1388430804014207, + "loss/hidden": 3.55703125, + "loss/jsd": 0.0, + "loss/logits": 0.20581382531672715, + "step": 240 + }, + { + "epoch": 0.00625, + "grad_norm": 31.75, + "grad_norm_var": 1.54765625, + "learning_rate": 0.0001, + "loss": 7.6568, + "loss/crossentropy": 2.2856020241975785, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.2351478708907962, + "step": 250 + }, + { + "epoch": 0.0065, + "grad_norm": 28.375, + "grad_norm_var": 28.54375, + "learning_rate": 0.0001, + "loss": 7.6993, + "loss/crossentropy": 2.0653378486633303, + "loss/hidden": 3.493359375, + "loss/jsd": 0.0, + "loss/logits": 0.19755732025951148, + "step": 260 + }, + { + "epoch": 0.00675, + "grad_norm": 33.25, + "grad_norm_var": 28.384375, + "learning_rate": 0.0001, + "loss": 7.7075, + "loss/crossentropy": 2.1598333328962327, + "loss/hidden": 3.44765625, + "loss/jsd": 0.0, + "loss/logits": 0.19795978404581546, + "step": 270 + }, + { + "epoch": 0.007, + "grad_norm": 32.5, + "grad_norm_var": 20.862955729166668, + "learning_rate": 0.0001, + "loss": 7.6852, + "loss/crossentropy": 2.138056221604347, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.1999417580664158, + "step": 280 + }, + { + "epoch": 0.00725, + "grad_norm": 42.0, + "grad_norm_var": 20.856705729166666, + "learning_rate": 0.0001, + "loss": 7.8621, + "loss/crossentropy": 2.1779348880052565, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.1947902340441942, + "step": 290 + }, + { + "epoch": 0.0075, + "grad_norm": 31.75, + "grad_norm_var": 8.949739583333333, + "learning_rate": 0.0001, + "loss": 7.6542, + "loss/crossentropy": 2.174051034450531, + "loss/hidden": 3.49375, + "loss/jsd": 0.0, + "loss/logits": 0.2129627451300621, + "step": 300 + }, + { + "epoch": 0.00775, + "grad_norm": 33.5, + "grad_norm_var": 4.620247395833333, + "learning_rate": 0.0001, + "loss": 7.6721, + "loss/crossentropy": 2.0598735958337784, + "loss/hidden": 3.54921875, + "loss/jsd": 0.0, + "loss/logits": 0.20318429488688708, + "step": 310 + }, + { + "epoch": 0.008, + "grad_norm": 35.5, + "grad_norm_var": 2.059375, + "learning_rate": 0.0001, + "loss": 7.6655, + "loss/crossentropy": 2.1254130959510804, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.19875272288918494, + "step": 320 + }, + { + "epoch": 0.00825, + "grad_norm": 35.0, + "grad_norm_var": 1.9639973958333334, + "learning_rate": 0.0001, + "loss": 7.7461, + "loss/crossentropy": 2.1635933369398117, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.2012931451201439, + "step": 330 + }, + { + "epoch": 0.0085, + "grad_norm": 33.75, + "grad_norm_var": 2.255989583333333, + "learning_rate": 0.0001, + "loss": 7.6974, + "loss/crossentropy": 2.214476653933525, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.1954287003725767, + "step": 340 + }, + { + "epoch": 0.00875, + "grad_norm": 30.625, + "grad_norm_var": 2.4942057291666666, + "learning_rate": 0.0001, + "loss": 7.6918, + "loss/crossentropy": 2.216859245300293, + "loss/hidden": 3.393359375, + "loss/jsd": 0.0, + "loss/logits": 0.20083636604249477, + "step": 350 + }, + { + "epoch": 0.009, + "grad_norm": 31.625, + "grad_norm_var": 1.7905598958333333, + "learning_rate": 0.0001, + "loss": 7.6896, + "loss/crossentropy": 2.2161539107561112, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.19574192687869071, + "step": 360 + }, + { + "epoch": 0.00925, + "grad_norm": 29.75, + "grad_norm_var": 7.141080729166666, + "learning_rate": 0.0001, + "loss": 7.8109, + "loss/crossentropy": 2.153403599560261, + "loss/hidden": 3.559765625, + "loss/jsd": 0.0, + "loss/logits": 0.21339697316288947, + "step": 370 + }, + { + "epoch": 0.0095, + "grad_norm": 37.5, + "grad_norm_var": 10.9244140625, + "learning_rate": 0.0001, + "loss": 7.7615, + "loss/crossentropy": 2.253763607144356, + "loss/hidden": 3.494140625, + "loss/jsd": 0.0, + "loss/logits": 0.2074073076248169, + "step": 380 + }, + { + "epoch": 0.00975, + "grad_norm": 33.75, + "grad_norm_var": 13.8400390625, + "learning_rate": 0.0001, + "loss": 7.7209, + "loss/crossentropy": 2.1363648414611816, + "loss/hidden": 3.478125, + "loss/jsd": 0.0, + "loss/logits": 0.20775138661265374, + "step": 390 + }, + { + "epoch": 0.01, + "grad_norm": 31.5, + "grad_norm_var": 14.397330729166667, + "learning_rate": 0.0001, + "loss": 7.6574, + "loss/crossentropy": 2.1789979085326197, + "loss/hidden": 3.5109375, + "loss/jsd": 0.0, + "loss/logits": 0.20811444334685802, + "step": 400 + }, + { + "epoch": 0.01025, + "grad_norm": 33.0, + "grad_norm_var": 9.6837890625, + "learning_rate": 0.0001, + "loss": 7.7272, + "loss/crossentropy": 2.232848098874092, + "loss/hidden": 3.5328125, + "loss/jsd": 0.0, + "loss/logits": 0.21815686002373696, + "step": 410 + }, + { + "epoch": 0.0105, + "grad_norm": 30.0, + "grad_norm_var": 73.00201822916667, + "learning_rate": 0.0001, + "loss": 7.7767, + "loss/crossentropy": 2.064501041173935, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.21888567861169578, + "step": 420 + }, + { + "epoch": 0.01075, + "grad_norm": 29.625, + "grad_norm_var": 73.21015625, + "learning_rate": 0.0001, + "loss": 7.7084, + "loss/crossentropy": 2.1248373448848725, + "loss/hidden": 3.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.20307110175490378, + "step": 430 + }, + { + "epoch": 0.011, + "grad_norm": 31.875, + "grad_norm_var": 8.3259765625, + "learning_rate": 0.0001, + "loss": 7.6488, + "loss/crossentropy": 2.1684874832630157, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.18734447471797466, + "step": 440 + }, + { + "epoch": 0.01125, + "grad_norm": 40.75, + "grad_norm_var": 27.13125, + "learning_rate": 0.0001, + "loss": 7.831, + "loss/crossentropy": 2.1310232520103454, + "loss/hidden": 3.631640625, + "loss/jsd": 0.0, + "loss/logits": 0.20724854618310928, + "step": 450 + }, + { + "epoch": 0.0115, + "grad_norm": 36.0, + "grad_norm_var": 27.680208333333333, + "learning_rate": 0.0001, + "loss": 7.7446, + "loss/crossentropy": 2.134530597925186, + "loss/hidden": 3.43984375, + "loss/jsd": 0.0, + "loss/logits": 0.21913636103272438, + "step": 460 + }, + { + "epoch": 0.01175, + "grad_norm": 33.0, + "grad_norm_var": 7.48125, + "learning_rate": 0.0001, + "loss": 7.6288, + "loss/crossentropy": 2.2641385555267335, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.1849798161536455, + "step": 470 + }, + { + "epoch": 0.012, + "grad_norm": 31.75, + "grad_norm_var": 11.4837890625, + "learning_rate": 0.0001, + "loss": 7.6493, + "loss/crossentropy": 2.2282994374632836, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.2014446135610342, + "step": 480 + }, + { + "epoch": 0.01225, + "grad_norm": 37.25, + "grad_norm_var": 6.677083333333333, + "learning_rate": 0.0001, + "loss": 7.7612, + "loss/crossentropy": 2.1222758114337923, + "loss/hidden": 3.5625, + "loss/jsd": 0.0, + "loss/logits": 0.2013396628201008, + "step": 490 + }, + { + "epoch": 0.0125, + "grad_norm": 32.5, + "grad_norm_var": 4.7744140625, + "learning_rate": 0.0001, + "loss": 7.6421, + "loss/crossentropy": 2.069608175754547, + "loss/hidden": 3.45078125, + "loss/jsd": 0.0, + "loss/logits": 0.1966784244403243, + "step": 500 + }, + { + "epoch": 0.01275, + "grad_norm": 41.5, + "grad_norm_var": 10.3697265625, + "learning_rate": 0.0001, + "loss": 7.6818, + "loss/crossentropy": 2.1589883297681807, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.18451723456382751, + "step": 510 + }, + { + "epoch": 0.013, + "grad_norm": 31.125, + "grad_norm_var": 6.667122395833333, + "learning_rate": 0.0001, + "loss": 7.7136, + "loss/crossentropy": 2.149793979898095, + "loss/hidden": 3.598828125, + "loss/jsd": 0.0, + "loss/logits": 0.2093389181420207, + "step": 520 + }, + { + "epoch": 0.01325, + "grad_norm": 35.25, + "grad_norm_var": 20.768489583333334, + "learning_rate": 0.0001, + "loss": 7.7677, + "loss/crossentropy": 2.195904017984867, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.19832278694957495, + "step": 530 + }, + { + "epoch": 0.0135, + "grad_norm": 35.5, + "grad_norm_var": 17.8619140625, + "learning_rate": 0.0001, + "loss": 7.6679, + "loss/crossentropy": 2.1850160747766494, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.20512696355581284, + "step": 540 + }, + { + "epoch": 0.01375, + "grad_norm": 35.25, + "grad_norm_var": 25.736393229166666, + "learning_rate": 0.0001, + "loss": 7.6426, + "loss/crossentropy": 2.1822438329458236, + "loss/hidden": 3.441796875, + "loss/jsd": 0.0, + "loss/logits": 0.20032773297280074, + "step": 550 + }, + { + "epoch": 0.014, + "grad_norm": 35.25, + "grad_norm_var": 30.77890625, + "learning_rate": 0.0001, + "loss": 7.7347, + "loss/crossentropy": 2.1990185409784315, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.19729668814688922, + "step": 560 + }, + { + "epoch": 0.01425, + "grad_norm": 41.25, + "grad_norm_var": 46.00149739583333, + "learning_rate": 0.0001, + "loss": 7.6662, + "loss/crossentropy": 2.0567948162555694, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.18916799686849117, + "step": 570 + }, + { + "epoch": 0.0145, + "grad_norm": 31.875, + "grad_norm_var": 18.383072916666666, + "learning_rate": 0.0001, + "loss": 7.662, + "loss/crossentropy": 2.1589747786521913, + "loss/hidden": 3.6359375, + "loss/jsd": 0.0, + "loss/logits": 0.21064655482769012, + "step": 580 + }, + { + "epoch": 0.01475, + "grad_norm": 34.0, + "grad_norm_var": 7.5025390625, + "learning_rate": 0.0001, + "loss": 7.6739, + "loss/crossentropy": 2.053135275095701, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.20297051095403731, + "step": 590 + }, + { + "epoch": 0.015, + "grad_norm": 42.0, + "grad_norm_var": 58.718684895833334, + "learning_rate": 0.0001, + "loss": 7.6386, + "loss/crossentropy": 2.0670476451516153, + "loss/hidden": 3.590234375, + "loss/jsd": 0.0, + "loss/logits": 0.208550613373518, + "step": 600 + }, + { + "epoch": 0.01525, + "grad_norm": 29.75, + "grad_norm_var": 57.89993489583333, + "learning_rate": 0.0001, + "loss": 7.6461, + "loss/crossentropy": 2.1219205021858216, + "loss/hidden": 3.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.20205040834844112, + "step": 610 + }, + { + "epoch": 0.0155, + "grad_norm": 29.25, + "grad_norm_var": 10.6228515625, + "learning_rate": 0.0001, + "loss": 7.5641, + "loss/crossentropy": 2.114127852022648, + "loss/hidden": 3.4171875, + "loss/jsd": 0.0, + "loss/logits": 0.19217339344322681, + "step": 620 + }, + { + "epoch": 0.01575, + "grad_norm": 30.375, + "grad_norm_var": 14.542643229166666, + "learning_rate": 0.0001, + "loss": 7.6319, + "loss/crossentropy": 2.2160026699304582, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.19281109217554332, + "step": 630 + }, + { + "epoch": 0.016, + "grad_norm": 34.0, + "grad_norm_var": 8.1869140625, + "learning_rate": 0.0001, + "loss": 7.6546, + "loss/crossentropy": 2.1914512276649476, + "loss/hidden": 3.47109375, + "loss/jsd": 0.0, + "loss/logits": 0.2007790008559823, + "step": 640 + }, + { + "epoch": 0.01625, + "grad_norm": 30.375, + "grad_norm_var": 5.4587890625, + "learning_rate": 0.0001, + "loss": 7.573, + "loss/crossentropy": 2.0619212985038757, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.19594881720840931, + "step": 650 + }, + { + "epoch": 0.0165, + "grad_norm": 34.75, + "grad_norm_var": 6.121809895833334, + "learning_rate": 0.0001, + "loss": 7.7448, + "loss/crossentropy": 2.1046764492988586, + "loss/hidden": 3.5921875, + "loss/jsd": 0.0, + "loss/logits": 0.20083924774080514, + "step": 660 + }, + { + "epoch": 0.01675, + "grad_norm": 34.5, + "grad_norm_var": 5.715559895833334, + "learning_rate": 0.0001, + "loss": 7.8327, + "loss/crossentropy": 2.2835423797369003, + "loss/hidden": 3.604296875, + "loss/jsd": 0.0, + "loss/logits": 0.23483402598649264, + "step": 670 + }, + { + "epoch": 0.017, + "grad_norm": 31.875, + "grad_norm_var": 10.7650390625, + "learning_rate": 0.0001, + "loss": 7.8138, + "loss/crossentropy": 2.0907129019498827, + "loss/hidden": 3.518359375, + "loss/jsd": 0.0, + "loss/logits": 0.1924523524940014, + "step": 680 + }, + { + "epoch": 0.01725, + "grad_norm": 33.0, + "grad_norm_var": 1.42265625, + "learning_rate": 0.0001, + "loss": 7.6162, + "loss/crossentropy": 2.127697338163853, + "loss/hidden": 3.50859375, + "loss/jsd": 0.0, + "loss/logits": 0.2057236723601818, + "step": 690 + }, + { + "epoch": 0.0175, + "grad_norm": 31.0, + "grad_norm_var": 8.9822265625, + "learning_rate": 0.0001, + "loss": 7.727, + "loss/crossentropy": 2.088846719264984, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.21837750263512135, + "step": 700 + }, + { + "epoch": 0.01775, + "grad_norm": 31.125, + "grad_norm_var": 1.7385416666666667, + "learning_rate": 0.0001, + "loss": 7.504, + "loss/crossentropy": 2.1813240855932237, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.18769481666386129, + "step": 710 + }, + { + "epoch": 0.018, + "grad_norm": 34.0, + "grad_norm_var": 2.569205729166667, + "learning_rate": 0.0001, + "loss": 7.6713, + "loss/crossentropy": 2.127785587310791, + "loss/hidden": 3.45546875, + "loss/jsd": 0.0, + "loss/logits": 0.20503429286181926, + "step": 720 + }, + { + "epoch": 0.01825, + "grad_norm": 29.875, + "grad_norm_var": 12.484375, + "learning_rate": 0.0001, + "loss": 7.6481, + "loss/crossentropy": 2.171098938584328, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.20801318623125553, + "step": 730 + }, + { + "epoch": 0.0185, + "grad_norm": 37.25, + "grad_norm_var": 14.9978515625, + "learning_rate": 0.0001, + "loss": 7.6067, + "loss/crossentropy": 2.1487890854477882, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.18985433727502823, + "step": 740 + }, + { + "epoch": 0.01875, + "grad_norm": 32.0, + "grad_norm_var": 7.299739583333333, + "learning_rate": 0.0001, + "loss": 7.4877, + "loss/crossentropy": 2.2428383469581603, + "loss/hidden": 3.24375, + "loss/jsd": 0.0, + "loss/logits": 0.18733534589409828, + "step": 750 + }, + { + "epoch": 0.019, + "grad_norm": 30.0, + "grad_norm_var": 3.379622395833333, + "learning_rate": 0.0001, + "loss": 7.6509, + "loss/crossentropy": 2.1872796684503557, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.21135813258588315, + "step": 760 + }, + { + "epoch": 0.01925, + "grad_norm": 30.5, + "grad_norm_var": 46.3072265625, + "learning_rate": 0.0001, + "loss": 7.7384, + "loss/crossentropy": 2.2427982538938522, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.1998496226966381, + "step": 770 + }, + { + "epoch": 0.0195, + "grad_norm": 41.5, + "grad_norm_var": 45.49264322916667, + "learning_rate": 0.0001, + "loss": 7.6688, + "loss/crossentropy": 2.207463192939758, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.18665656447410583, + "step": 780 + }, + { + "epoch": 0.01975, + "grad_norm": 31.625, + "grad_norm_var": 53.53951822916667, + "learning_rate": 0.0001, + "loss": 7.6327, + "loss/crossentropy": 2.075051838159561, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.19726874344050885, + "step": 790 + }, + { + "epoch": 0.02, + "grad_norm": 35.25, + "grad_norm_var": 15.802083333333334, + "learning_rate": 0.0001, + "loss": 7.6833, + "loss/crossentropy": 2.0811705768108366, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.19689124524593354, + "step": 800 + }, + { + "epoch": 0.02025, + "grad_norm": 36.25, + "grad_norm_var": 2.7916015625, + "learning_rate": 0.0001, + "loss": 7.6998, + "loss/crossentropy": 2.139931133389473, + "loss/hidden": 3.56640625, + "loss/jsd": 0.0, + "loss/logits": 0.20113225914537908, + "step": 810 + }, + { + "epoch": 0.0205, + "grad_norm": 33.25, + "grad_norm_var": 3.21640625, + "learning_rate": 0.0001, + "loss": 7.6125, + "loss/crossentropy": 2.3070268869400024, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.19928287118673324, + "step": 820 + }, + { + "epoch": 0.02075, + "grad_norm": 31.375, + "grad_norm_var": 4.70390625, + "learning_rate": 0.0001, + "loss": 7.7169, + "loss/crossentropy": 2.1359834372997284, + "loss/hidden": 3.6421875, + "loss/jsd": 0.0, + "loss/logits": 0.22523897737264634, + "step": 830 + }, + { + "epoch": 0.021, + "grad_norm": 33.75, + "grad_norm_var": 7.06015625, + "learning_rate": 0.0001, + "loss": 7.6629, + "loss/crossentropy": 2.1498879536986353, + "loss/hidden": 3.599609375, + "loss/jsd": 0.0, + "loss/logits": 0.21073536314070224, + "step": 840 + }, + { + "epoch": 0.02125, + "grad_norm": 31.125, + "grad_norm_var": 11.855143229166666, + "learning_rate": 0.0001, + "loss": 7.7246, + "loss/crossentropy": 2.154731386899948, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.18697260301560165, + "step": 850 + }, + { + "epoch": 0.0215, + "grad_norm": 28.0, + "grad_norm_var": 3.8988932291666667, + "learning_rate": 0.0001, + "loss": 7.4616, + "loss/crossentropy": 2.209418597817421, + "loss/hidden": 3.441015625, + "loss/jsd": 0.0, + "loss/logits": 0.19536950960755348, + "step": 860 + }, + { + "epoch": 0.02175, + "grad_norm": 36.25, + "grad_norm_var": 28.367708333333333, + "learning_rate": 0.0001, + "loss": 7.6221, + "loss/crossentropy": 2.106307029724121, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.20511649739928545, + "step": 870 + }, + { + "epoch": 0.022, + "grad_norm": 30.875, + "grad_norm_var": 25.5978515625, + "learning_rate": 0.0001, + "loss": 7.57, + "loss/crossentropy": 2.170385852456093, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.18977888114750385, + "step": 880 + }, + { + "epoch": 0.02225, + "grad_norm": 31.0, + "grad_norm_var": 4.178580729166667, + "learning_rate": 0.0001, + "loss": 7.659, + "loss/crossentropy": 2.022993338108063, + "loss/hidden": 3.580859375, + "loss/jsd": 0.0, + "loss/logits": 0.2017082829028368, + "step": 890 + }, + { + "epoch": 0.0225, + "grad_norm": 30.25, + "grad_norm_var": 4.118684895833334, + "learning_rate": 0.0001, + "loss": 7.6471, + "loss/crossentropy": 2.1982390731573105, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.186830697581172, + "step": 900 + }, + { + "epoch": 0.02275, + "grad_norm": 36.0, + "grad_norm_var": 9.886393229166666, + "learning_rate": 0.0001, + "loss": 7.5929, + "loss/crossentropy": 2.1351534157991408, + "loss/hidden": 3.44296875, + "loss/jsd": 0.0, + "loss/logits": 0.19507032372057437, + "step": 910 + }, + { + "epoch": 0.023, + "grad_norm": 30.25, + "grad_norm_var": 69.10182291666666, + "learning_rate": 0.0001, + "loss": 7.7006, + "loss/crossentropy": 2.1805424720048903, + "loss/hidden": 3.60546875, + "loss/jsd": 0.0, + "loss/logits": 0.2387762701138854, + "step": 920 + }, + { + "epoch": 0.02325, + "grad_norm": 29.625, + "grad_norm_var": 148.09765625, + "learning_rate": 0.0001, + "loss": 7.603, + "loss/crossentropy": 2.1993222564458845, + "loss/hidden": 3.5421875, + "loss/jsd": 0.0, + "loss/logits": 0.2252051206305623, + "step": 930 + }, + { + "epoch": 0.0235, + "grad_norm": 33.0, + "grad_norm_var": 149.05670572916668, + "learning_rate": 0.0001, + "loss": 7.6101, + "loss/crossentropy": 2.132229286432266, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.19997731409966946, + "step": 940 + }, + { + "epoch": 0.02375, + "grad_norm": 33.25, + "grad_norm_var": 2.372330729166667, + "learning_rate": 0.0001, + "loss": 7.6309, + "loss/crossentropy": 2.057620918750763, + "loss/hidden": 3.525, + "loss/jsd": 0.0, + "loss/logits": 0.19510896243155001, + "step": 950 + }, + { + "epoch": 0.024, + "grad_norm": 30.125, + "grad_norm_var": 2.9291666666666667, + "learning_rate": 0.0001, + "loss": 7.5599, + "loss/crossentropy": 2.1666407614946364, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19047823324799537, + "step": 960 + }, + { + "epoch": 0.02425, + "grad_norm": 34.0, + "grad_norm_var": 750.4874348958333, + "learning_rate": 0.0001, + "loss": 7.667, + "loss/crossentropy": 2.199223425984383, + "loss/hidden": 3.561328125, + "loss/jsd": 0.0, + "loss/logits": 0.22954254262149335, + "step": 970 + }, + { + "epoch": 0.0245, + "grad_norm": 31.0, + "grad_norm_var": 736.4518229166666, + "learning_rate": 0.0001, + "loss": 7.6648, + "loss/crossentropy": 2.052691954374313, + "loss/hidden": 3.612109375, + "loss/jsd": 0.0, + "loss/logits": 0.20743414014577866, + "step": 980 + }, + { + "epoch": 0.02475, + "grad_norm": 29.625, + "grad_norm_var": 13.204166666666667, + "learning_rate": 0.0001, + "loss": 7.587, + "loss/crossentropy": 2.2363356560468675, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.19663754627108573, + "step": 990 + }, + { + "epoch": 0.025, + "grad_norm": 29.5, + "grad_norm_var": 8.623958333333333, + "learning_rate": 0.0001, + "loss": 7.5999, + "loss/crossentropy": 2.096450260281563, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.1896633107215166, + "step": 1000 + }, + { + "epoch": 0.02525, + "grad_norm": 30.0, + "grad_norm_var": 26.456184895833335, + "learning_rate": 0.0001, + "loss": 7.6469, + "loss/crossentropy": 2.219489449262619, + "loss/hidden": 3.461328125, + "loss/jsd": 0.0, + "loss/logits": 0.20298976600170135, + "step": 1010 + }, + { + "epoch": 0.0255, + "grad_norm": 32.75, + "grad_norm_var": 22.0525390625, + "learning_rate": 0.0001, + "loss": 7.6634, + "loss/crossentropy": 2.157027468085289, + "loss/hidden": 3.641015625, + "loss/jsd": 0.0, + "loss/logits": 0.21785753238946198, + "step": 1020 + }, + { + "epoch": 0.02575, + "grad_norm": 33.0, + "grad_norm_var": 31.347330729166668, + "learning_rate": 0.0001, + "loss": 7.6449, + "loss/crossentropy": 2.1728423804044725, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.19077841471880674, + "step": 1030 + }, + { + "epoch": 0.026, + "grad_norm": 74.5, + "grad_norm_var": 122.20045572916666, + "learning_rate": 0.0001, + "loss": 7.6132, + "loss/crossentropy": 2.1822386175394057, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.1930880568921566, + "step": 1040 + }, + { + "epoch": 0.02625, + "grad_norm": 67.5, + "grad_norm_var": 178.43125, + "learning_rate": 0.0001, + "loss": 7.6429, + "loss/crossentropy": 2.2600297421216964, + "loss/hidden": 3.458203125, + "loss/jsd": 0.0, + "loss/logits": 0.20767511576414108, + "step": 1050 + }, + { + "epoch": 0.0265, + "grad_norm": 31.875, + "grad_norm_var": 92.06087239583333, + "learning_rate": 0.0001, + "loss": 7.5718, + "loss/crossentropy": 2.118990848958492, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.19327420592308045, + "step": 1060 + }, + { + "epoch": 0.02675, + "grad_norm": 30.875, + "grad_norm_var": 35.283268229166666, + "learning_rate": 0.0001, + "loss": 7.603, + "loss/crossentropy": 2.2320737928152083, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.18573360554873944, + "step": 1070 + }, + { + "epoch": 0.027, + "grad_norm": 35.25, + "grad_norm_var": 3795.8796223958334, + "learning_rate": 0.0001, + "loss": 7.6632, + "loss/crossentropy": 2.1329027831554415, + "loss/hidden": 3.500390625, + "loss/jsd": 0.0, + "loss/logits": 0.25382886435836555, + "step": 1080 + }, + { + "epoch": 0.02725, + "grad_norm": 41.0, + "grad_norm_var": 3810.5869140625, + "learning_rate": 0.0001, + "loss": 7.591, + "loss/crossentropy": 2.147709222137928, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.19441522471606731, + "step": 1090 + }, + { + "epoch": 0.0275, + "grad_norm": 34.0, + "grad_norm_var": 10.347916666666666, + "learning_rate": 0.0001, + "loss": 7.459, + "loss/crossentropy": 2.134738603234291, + "loss/hidden": 3.45, + "loss/jsd": 0.0, + "loss/logits": 0.18917258959263564, + "step": 1100 + }, + { + "epoch": 0.02775, + "grad_norm": 30.5, + "grad_norm_var": 5.362239583333333, + "learning_rate": 0.0001, + "loss": 7.4625, + "loss/crossentropy": 2.072269695997238, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.19563193432986736, + "step": 1110 + }, + { + "epoch": 0.028, + "grad_norm": 29.375, + "grad_norm_var": 14.4478515625, + "learning_rate": 0.0001, + "loss": 7.514, + "loss/crossentropy": 2.131034165620804, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.18194433208554983, + "step": 1120 + }, + { + "epoch": 0.02825, + "grad_norm": 33.0, + "grad_norm_var": 23.925, + "learning_rate": 0.0001, + "loss": 7.5884, + "loss/crossentropy": 2.023801653087139, + "loss/hidden": 3.637109375, + "loss/jsd": 0.0, + "loss/logits": 0.20569879673421382, + "step": 1130 + }, + { + "epoch": 0.0285, + "grad_norm": 32.0, + "grad_norm_var": 7.2744140625, + "learning_rate": 0.0001, + "loss": 7.6524, + "loss/crossentropy": 2.1517456393688916, + "loss/hidden": 3.537890625, + "loss/jsd": 0.0, + "loss/logits": 0.2079196309670806, + "step": 1140 + }, + { + "epoch": 0.02875, + "grad_norm": 35.75, + "grad_norm_var": 7.99140625, + "learning_rate": 0.0001, + "loss": 7.6074, + "loss/crossentropy": 2.004653300344944, + "loss/hidden": 3.583984375, + "loss/jsd": 0.0, + "loss/logits": 0.19787274841219188, + "step": 1150 + }, + { + "epoch": 0.029, + "grad_norm": 32.75, + "grad_norm_var": 4.120572916666666, + "learning_rate": 0.0001, + "loss": 7.6348, + "loss/crossentropy": 2.1528601229190825, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.1927174234762788, + "step": 1160 + }, + { + "epoch": 0.02925, + "grad_norm": 36.25, + "grad_norm_var": 7.069205729166667, + "learning_rate": 0.0001, + "loss": 7.6205, + "loss/crossentropy": 2.05783154964447, + "loss/hidden": 3.614453125, + "loss/jsd": 0.0, + "loss/logits": 0.22374887801706791, + "step": 1170 + }, + { + "epoch": 0.0295, + "grad_norm": 31.0, + "grad_norm_var": 41.484309895833334, + "learning_rate": 0.0001, + "loss": 7.7356, + "loss/crossentropy": 2.126041141152382, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.2197611417621374, + "step": 1180 + }, + { + "epoch": 0.02975, + "grad_norm": 38.5, + "grad_norm_var": 43.91223958333333, + "learning_rate": 0.0001, + "loss": 7.6157, + "loss/crossentropy": 2.2476495057344437, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.19447711408138274, + "step": 1190 + }, + { + "epoch": 0.03, + "grad_norm": 47.5, + "grad_norm_var": 95.72057291666667, + "learning_rate": 0.0001, + "loss": 7.7237, + "loss/crossentropy": 2.092088536918163, + "loss/hidden": 3.433203125, + "loss/jsd": 0.0, + "loss/logits": 0.19047515615820884, + "step": 1200 + }, + { + "epoch": 0.03025, + "grad_norm": 31.5, + "grad_norm_var": 99.6822265625, + "learning_rate": 0.0001, + "loss": 7.5678, + "loss/crossentropy": 2.1249007523059844, + "loss/hidden": 3.505078125, + "loss/jsd": 0.0, + "loss/logits": 0.19545839354395866, + "step": 1210 + }, + { + "epoch": 0.0305, + "grad_norm": 30.625, + "grad_norm_var": 6.409309895833333, + "learning_rate": 0.0001, + "loss": 7.5862, + "loss/crossentropy": 2.2506365835666657, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.19677093252539635, + "step": 1220 + }, + { + "epoch": 0.03075, + "grad_norm": 33.25, + "grad_norm_var": 6.676822916666667, + "learning_rate": 0.0001, + "loss": 7.5504, + "loss/crossentropy": 2.133887434005737, + "loss/hidden": 3.459375, + "loss/jsd": 0.0, + "loss/logits": 0.1896925836801529, + "step": 1230 + }, + { + "epoch": 0.031, + "grad_norm": 38.0, + "grad_norm_var": 5.2337890625, + "learning_rate": 0.0001, + "loss": 7.5956, + "loss/crossentropy": 2.0669597774744033, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.19292720556259155, + "step": 1240 + }, + { + "epoch": 0.03125, + "grad_norm": 30.625, + "grad_norm_var": 3.97265625, + "learning_rate": 0.0001, + "loss": 7.5722, + "loss/crossentropy": 2.261713761091232, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.19570228308439255, + "step": 1250 + }, + { + "epoch": 0.0315, + "grad_norm": 29.25, + "grad_norm_var": 4.916080729166667, + "learning_rate": 0.0001, + "loss": 7.5566, + "loss/crossentropy": 2.0476513862609864, + "loss/hidden": 3.668359375, + "loss/jsd": 0.0, + "loss/logits": 0.2093046260997653, + "step": 1260 + }, + { + "epoch": 0.03175, + "grad_norm": 34.75, + "grad_norm_var": 10.875455729166667, + "learning_rate": 0.0001, + "loss": 7.6172, + "loss/crossentropy": 2.1128817319869997, + "loss/hidden": 3.5703125, + "loss/jsd": 0.0, + "loss/logits": 0.21670667603611946, + "step": 1270 + }, + { + "epoch": 0.032, + "grad_norm": 33.5, + "grad_norm_var": 3.824934895833333, + "learning_rate": 0.0001, + "loss": 7.6787, + "loss/crossentropy": 2.2115501552820205, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.1827129926532507, + "step": 1280 + }, + { + "epoch": 0.03225, + "grad_norm": 30.375, + "grad_norm_var": 13.828125, + "learning_rate": 0.0001, + "loss": 7.6339, + "loss/crossentropy": 2.176504462957382, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.2160520726814866, + "step": 1290 + }, + { + "epoch": 0.0325, + "grad_norm": 32.25, + "grad_norm_var": 5.916080729166667, + "learning_rate": 0.0001, + "loss": 7.6438, + "loss/crossentropy": 2.173138880729675, + "loss/hidden": 3.529296875, + "loss/jsd": 0.0, + "loss/logits": 0.2056989949196577, + "step": 1300 + }, + { + "epoch": 0.03275, + "grad_norm": 32.25, + "grad_norm_var": 5.78515625, + "learning_rate": 0.0001, + "loss": 7.7146, + "loss/crossentropy": 2.247766065597534, + "loss/hidden": 3.488671875, + "loss/jsd": 0.0, + "loss/logits": 0.20310762114822864, + "step": 1310 + }, + { + "epoch": 0.033, + "grad_norm": 32.75, + "grad_norm_var": 4.51640625, + "learning_rate": 0.0001, + "loss": 7.6452, + "loss/crossentropy": 2.0862443327903746, + "loss/hidden": 3.406640625, + "loss/jsd": 0.0, + "loss/logits": 0.18912406917661428, + "step": 1320 + }, + { + "epoch": 0.03325, + "grad_norm": 34.5, + "grad_norm_var": 7.220833333333333, + "learning_rate": 0.0001, + "loss": 7.7715, + "loss/crossentropy": 2.093398702144623, + "loss/hidden": 3.570703125, + "loss/jsd": 0.0, + "loss/logits": 0.2104920681566, + "step": 1330 + }, + { + "epoch": 0.0335, + "grad_norm": 38.0, + "grad_norm_var": 9.108268229166667, + "learning_rate": 0.0001, + "loss": 7.7368, + "loss/crossentropy": 2.17246213555336, + "loss/hidden": 3.576953125, + "loss/jsd": 0.0, + "loss/logits": 0.21665989980101585, + "step": 1340 + }, + { + "epoch": 0.03375, + "grad_norm": 33.5, + "grad_norm_var": 4.794205729166666, + "learning_rate": 0.0001, + "loss": 7.5892, + "loss/crossentropy": 2.1238946616649628, + "loss/hidden": 3.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.2172164160758257, + "step": 1350 + }, + { + "epoch": 0.034, + "grad_norm": 32.5, + "grad_norm_var": 101.1431640625, + "learning_rate": 0.0001, + "loss": 7.6341, + "loss/crossentropy": 2.194270025193691, + "loss/hidden": 3.4859375, + "loss/jsd": 0.0, + "loss/logits": 0.19632596522569656, + "step": 1360 + }, + { + "epoch": 0.03425, + "grad_norm": 33.25, + "grad_norm_var": 3.9119140625, + "learning_rate": 0.0001, + "loss": 7.5496, + "loss/crossentropy": 2.1282873928546904, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.17983752395957708, + "step": 1370 + }, + { + "epoch": 0.0345, + "grad_norm": 34.5, + "grad_norm_var": 4.324934895833334, + "learning_rate": 0.0001, + "loss": 7.6348, + "loss/crossentropy": 2.140147662162781, + "loss/hidden": 3.471484375, + "loss/jsd": 0.0, + "loss/logits": 0.20302014388144016, + "step": 1380 + }, + { + "epoch": 0.03475, + "grad_norm": 30.75, + "grad_norm_var": 2.818489583333333, + "learning_rate": 0.0001, + "loss": 7.7271, + "loss/crossentropy": 2.128489089012146, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.19602114744484425, + "step": 1390 + }, + { + "epoch": 0.035, + "grad_norm": 31.25, + "grad_norm_var": 2.1458333333333335, + "learning_rate": 0.0001, + "loss": 7.6417, + "loss/crossentropy": 2.1306474581360817, + "loss/hidden": 3.582421875, + "loss/jsd": 0.0, + "loss/logits": 0.19735200479626655, + "step": 1400 + }, + { + "epoch": 0.03525, + "grad_norm": 31.25, + "grad_norm_var": 9.2009765625, + "learning_rate": 0.0001, + "loss": 7.7002, + "loss/crossentropy": 2.173697289824486, + "loss/hidden": 3.480859375, + "loss/jsd": 0.0, + "loss/logits": 0.20366120263934134, + "step": 1410 + }, + { + "epoch": 0.0355, + "grad_norm": 31.75, + "grad_norm_var": 9.913997395833333, + "learning_rate": 0.0001, + "loss": 7.6577, + "loss/crossentropy": 2.26003720164299, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.20924863480031491, + "step": 1420 + }, + { + "epoch": 0.03575, + "grad_norm": 30.25, + "grad_norm_var": 26.66015625, + "learning_rate": 0.0001, + "loss": 7.5718, + "loss/crossentropy": 2.2352112770080566, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.19357634484767913, + "step": 1430 + }, + { + "epoch": 0.036, + "grad_norm": 34.0, + "grad_norm_var": 30.602083333333333, + "learning_rate": 0.0001, + "loss": 7.5861, + "loss/crossentropy": 2.0770506739616392, + "loss/hidden": 3.5453125, + "loss/jsd": 0.0, + "loss/logits": 0.21514309681951999, + "step": 1440 + }, + { + "epoch": 0.03625, + "grad_norm": 37.5, + "grad_norm_var": 12.3875, + "learning_rate": 0.0001, + "loss": 7.4986, + "loss/crossentropy": 2.0542123883962633, + "loss/hidden": 3.510546875, + "loss/jsd": 0.0, + "loss/logits": 0.19684152901172638, + "step": 1450 + }, + { + "epoch": 0.0365, + "grad_norm": 32.75, + "grad_norm_var": 8.548372395833333, + "learning_rate": 0.0001, + "loss": 7.6006, + "loss/crossentropy": 2.175110411643982, + "loss/hidden": 3.37578125, + "loss/jsd": 0.0, + "loss/logits": 0.19031002502888442, + "step": 1460 + }, + { + "epoch": 0.03675, + "grad_norm": 34.25, + "grad_norm_var": 2.8429676028135214e+18, + "learning_rate": 0.0001, + "loss": 7.7702, + "loss/crossentropy": 2.1691948026418686, + "loss/hidden": 3.5859375, + "loss/jsd": 0.0, + "loss/logits": 0.22843880020081997, + "step": 1470 + }, + { + "epoch": 0.037, + "grad_norm": 36.25, + "grad_norm_var": 2.842967603101565e+18, + "learning_rate": 0.0001, + "loss": 7.6046, + "loss/crossentropy": 2.0826233722269536, + "loss/hidden": 3.520703125, + "loss/jsd": 0.0, + "loss/logits": 0.1938928204588592, + "step": 1480 + }, + { + "epoch": 0.03725, + "grad_norm": 32.75, + "grad_norm_var": 8.939518229166667, + "learning_rate": 0.0001, + "loss": 7.6265, + "loss/crossentropy": 2.2077848985791206, + "loss/hidden": 3.435546875, + "loss/jsd": 0.0, + "loss/logits": 0.1943045362830162, + "step": 1490 + }, + { + "epoch": 0.0375, + "grad_norm": 34.25, + "grad_norm_var": 7.7125, + "learning_rate": 0.0001, + "loss": 7.6217, + "loss/crossentropy": 2.1079602181911468, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.19446163363754748, + "step": 1500 + }, + { + "epoch": 0.03775, + "grad_norm": 34.25, + "grad_norm_var": 5.7931640625, + "learning_rate": 0.0001, + "loss": 7.5893, + "loss/crossentropy": 2.078600898385048, + "loss/hidden": 3.5734375, + "loss/jsd": 0.0, + "loss/logits": 0.21464722994714974, + "step": 1510 + }, + { + "epoch": 0.038, + "grad_norm": 35.75, + "grad_norm_var": 6.186458333333333, + "learning_rate": 0.0001, + "loss": 7.6365, + "loss/crossentropy": 2.1014960765838624, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.19028044641017913, + "step": 1520 + }, + { + "epoch": 0.03825, + "grad_norm": 36.0, + "grad_norm_var": 4.455989583333333, + "learning_rate": 0.0001, + "loss": 7.5414, + "loss/crossentropy": 2.1112293377518654, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.18414278626441954, + "step": 1530 + }, + { + "epoch": 0.0385, + "grad_norm": 32.25, + "grad_norm_var": 3.7249348958333335, + "learning_rate": 0.0001, + "loss": 7.5681, + "loss/crossentropy": 2.166412356495857, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.19420933350920677, + "step": 1540 + }, + { + "epoch": 0.03875, + "grad_norm": 31.625, + "grad_norm_var": 5.330989583333333, + "learning_rate": 0.0001, + "loss": 7.6663, + "loss/crossentropy": 2.0857026129961014, + "loss/hidden": 3.574609375, + "loss/jsd": 0.0, + "loss/logits": 0.2175652377307415, + "step": 1550 + }, + { + "epoch": 0.039, + "grad_norm": 30.75, + "grad_norm_var": 7.401822916666666, + "learning_rate": 0.0001, + "loss": 7.5661, + "loss/crossentropy": 2.1806214213371278, + "loss/hidden": 3.562890625, + "loss/jsd": 0.0, + "loss/logits": 0.21507157981395722, + "step": 1560 + }, + { + "epoch": 0.03925, + "grad_norm": 37.5, + "grad_norm_var": 9.043684895833334, + "learning_rate": 0.0001, + "loss": 7.5649, + "loss/crossentropy": 2.073585295677185, + "loss/hidden": 3.520703125, + "loss/jsd": 0.0, + "loss/logits": 0.19737527389079332, + "step": 1570 + }, + { + "epoch": 0.0395, + "grad_norm": 32.5, + "grad_norm_var": 4.607747395833333, + "learning_rate": 0.0001, + "loss": 7.6893, + "loss/crossentropy": 2.262183803319931, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.22450251020491124, + "step": 1580 + }, + { + "epoch": 0.03975, + "grad_norm": 31.25, + "grad_norm_var": 1.3983723958333334, + "learning_rate": 0.0001, + "loss": 7.7101, + "loss/crossentropy": 2.1200410187244416, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.20633359774947166, + "step": 1590 + }, + { + "epoch": 0.04, + "grad_norm": 32.25, + "grad_norm_var": 28.868684895833333, + "learning_rate": 0.0001, + "loss": 7.6821, + "loss/crossentropy": 2.1520946115255355, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.20241751577705144, + "step": 1600 + }, + { + "epoch": 0.04025, + "grad_norm": 34.5, + "grad_norm_var": 24.308072916666667, + "learning_rate": 0.0001, + "loss": 7.6482, + "loss/crossentropy": 2.109182408452034, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.19619097150862216, + "step": 1610 + }, + { + "epoch": 0.0405, + "grad_norm": 33.25, + "grad_norm_var": 1.83125, + "learning_rate": 0.0001, + "loss": 7.614, + "loss/crossentropy": 2.220561644434929, + "loss/hidden": 3.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.20333079397678375, + "step": 1620 + }, + { + "epoch": 0.04075, + "grad_norm": 31.0, + "grad_norm_var": 7.183072916666666, + "learning_rate": 0.0001, + "loss": 7.7748, + "loss/crossentropy": 2.2026446878910066, + "loss/hidden": 3.472265625, + "loss/jsd": 0.0, + "loss/logits": 0.20862093791365624, + "step": 1630 + }, + { + "epoch": 0.041, + "grad_norm": 36.25, + "grad_norm_var": 8.080989583333333, + "learning_rate": 0.0001, + "loss": 7.6592, + "loss/crossentropy": 2.2313437908887863, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.19155636206269264, + "step": 1640 + }, + { + "epoch": 0.04125, + "grad_norm": 31.5, + "grad_norm_var": 3.8863932291666665, + "learning_rate": 0.0001, + "loss": 7.6964, + "loss/crossentropy": 2.0529640942811964, + "loss/hidden": 3.658984375, + "loss/jsd": 0.0, + "loss/logits": 0.23474433943629264, + "step": 1650 + }, + { + "epoch": 0.0415, + "grad_norm": 33.25, + "grad_norm_var": 1.0374348958333333, + "learning_rate": 0.0001, + "loss": 7.6483, + "loss/crossentropy": 2.175355441868305, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.19280093312263488, + "step": 1660 + }, + { + "epoch": 0.04175, + "grad_norm": 29.625, + "grad_norm_var": 2.4268229166666666, + "learning_rate": 0.0001, + "loss": 7.6277, + "loss/crossentropy": 2.121490114927292, + "loss/hidden": 3.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.20306031554937362, + "step": 1670 + }, + { + "epoch": 0.042, + "grad_norm": 36.75, + "grad_norm_var": 186.4869140625, + "learning_rate": 0.0001, + "loss": 7.7912, + "loss/crossentropy": 2.123810574412346, + "loss/hidden": 3.596875, + "loss/jsd": 0.0, + "loss/logits": 0.20700039602816106, + "step": 1680 + }, + { + "epoch": 0.04225, + "grad_norm": 33.0, + "grad_norm_var": 194.72708333333333, + "learning_rate": 0.0001, + "loss": 7.5507, + "loss/crossentropy": 2.198003688454628, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.19452486634254457, + "step": 1690 + }, + { + "epoch": 0.0425, + "grad_norm": 32.25, + "grad_norm_var": 3.842643229166667, + "learning_rate": 0.0001, + "loss": 7.6641, + "loss/crossentropy": 2.1328449815511705, + "loss/hidden": 3.508984375, + "loss/jsd": 0.0, + "loss/logits": 0.20609250776469706, + "step": 1700 + }, + { + "epoch": 0.04275, + "grad_norm": 36.5, + "grad_norm_var": 23.064322916666665, + "learning_rate": 0.0001, + "loss": 7.5838, + "loss/crossentropy": 2.1690568923950195, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.19301791079342365, + "step": 1710 + }, + { + "epoch": 0.043, + "grad_norm": 30.0, + "grad_norm_var": 5.9712890625, + "learning_rate": 0.0001, + "loss": 7.5441, + "loss/crossentropy": 2.1149508744478225, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.19253603778779507, + "step": 1720 + }, + { + "epoch": 0.04325, + "grad_norm": 33.0, + "grad_norm_var": 3.82890625, + "learning_rate": 0.0001, + "loss": 7.6644, + "loss/crossentropy": 2.070442554354668, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.19407737776637077, + "step": 1730 + }, + { + "epoch": 0.0435, + "grad_norm": 30.625, + "grad_norm_var": 1.5947265625, + "learning_rate": 0.0001, + "loss": 7.6468, + "loss/crossentropy": 2.2249585568904875, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.20079109650105237, + "step": 1740 + }, + { + "epoch": 0.04375, + "grad_norm": 32.25, + "grad_norm_var": 4.1384765625, + "learning_rate": 0.0001, + "loss": 7.6522, + "loss/crossentropy": 2.1712467283010484, + "loss/hidden": 3.481640625, + "loss/jsd": 0.0, + "loss/logits": 0.21240621842443944, + "step": 1750 + }, + { + "epoch": 0.044, + "grad_norm": 30.75, + "grad_norm_var": 2.7018229166666665, + "learning_rate": 0.0001, + "loss": 7.6326, + "loss/crossentropy": 2.1433209091424943, + "loss/hidden": 3.50546875, + "loss/jsd": 0.0, + "loss/logits": 0.19783576354384422, + "step": 1760 + }, + { + "epoch": 0.04425, + "grad_norm": 32.25, + "grad_norm_var": 10.2634765625, + "learning_rate": 0.0001, + "loss": 7.7252, + "loss/crossentropy": 2.1377856612205504, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.19519764352589847, + "step": 1770 + }, + { + "epoch": 0.0445, + "grad_norm": 32.25, + "grad_norm_var": 9.817122395833334, + "learning_rate": 0.0001, + "loss": 7.6165, + "loss/crossentropy": 2.2387378960847855, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.2087532427161932, + "step": 1780 + }, + { + "epoch": 0.04475, + "grad_norm": 31.0, + "grad_norm_var": 1.9650390625, + "learning_rate": 0.0001, + "loss": 7.6701, + "loss/crossentropy": 2.280033028125763, + "loss/hidden": 3.31875, + "loss/jsd": 0.0, + "loss/logits": 0.1907376278191805, + "step": 1790 + }, + { + "epoch": 0.045, + "grad_norm": 33.75, + "grad_norm_var": 2.4197265625, + "learning_rate": 0.0001, + "loss": 7.6553, + "loss/crossentropy": 2.205285739898682, + "loss/hidden": 3.448828125, + "loss/jsd": 0.0, + "loss/logits": 0.1980523556470871, + "step": 1800 + }, + { + "epoch": 0.04525, + "grad_norm": 31.0, + "grad_norm_var": 3.2462890625, + "learning_rate": 0.0001, + "loss": 7.6001, + "loss/crossentropy": 2.047496220469475, + "loss/hidden": 3.548046875, + "loss/jsd": 0.0, + "loss/logits": 0.19389633461833, + "step": 1810 + }, + { + "epoch": 0.0455, + "grad_norm": 31.125, + "grad_norm_var": 2.562239583333333, + "learning_rate": 0.0001, + "loss": 7.615, + "loss/crossentropy": 2.174453580379486, + "loss/hidden": 3.516796875, + "loss/jsd": 0.0, + "loss/logits": 0.20545508041977883, + "step": 1820 + }, + { + "epoch": 0.04575, + "grad_norm": 34.25, + "grad_norm_var": 3.4296223958333334, + "learning_rate": 0.0001, + "loss": 7.638, + "loss/crossentropy": 2.0722746759653092, + "loss/hidden": 3.437109375, + "loss/jsd": 0.0, + "loss/logits": 0.19747158586978913, + "step": 1830 + }, + { + "epoch": 0.046, + "grad_norm": 34.75, + "grad_norm_var": 3.0666015625, + "learning_rate": 0.0001, + "loss": 7.7087, + "loss/crossentropy": 2.1196924835443496, + "loss/hidden": 3.622265625, + "loss/jsd": 0.0, + "loss/logits": 0.20298538953065873, + "step": 1840 + }, + { + "epoch": 0.04625, + "grad_norm": 29.75, + "grad_norm_var": 2.6119140625, + "learning_rate": 0.0001, + "loss": 7.6036, + "loss/crossentropy": 2.1688392132520677, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.17962730433791876, + "step": 1850 + }, + { + "epoch": 0.0465, + "grad_norm": 33.75, + "grad_norm_var": 1.3374348958333333, + "learning_rate": 0.0001, + "loss": 7.7051, + "loss/crossentropy": 2.1148360162973403, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.2195219134911895, + "step": 1860 + }, + { + "epoch": 0.04675, + "grad_norm": 33.5, + "grad_norm_var": 2.45625, + "learning_rate": 0.0001, + "loss": 7.6397, + "loss/crossentropy": 2.016439202427864, + "loss/hidden": 3.529296875, + "loss/jsd": 0.0, + "loss/logits": 0.2061541959643364, + "step": 1870 + }, + { + "epoch": 0.047, + "grad_norm": 29.75, + "grad_norm_var": 4.173893229166667, + "learning_rate": 0.0001, + "loss": 7.7054, + "loss/crossentropy": 2.1470705419778824, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.21371309272944927, + "step": 1880 + }, + { + "epoch": 0.04725, + "grad_norm": 32.5, + "grad_norm_var": 4.054622395833333, + "learning_rate": 0.0001, + "loss": 7.5959, + "loss/crossentropy": 2.265937978029251, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.1929181769490242, + "step": 1890 + }, + { + "epoch": 0.0475, + "grad_norm": 30.125, + "grad_norm_var": 6.6197265625, + "learning_rate": 0.0001, + "loss": 7.6012, + "loss/crossentropy": 2.0853475779294968, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.20215214397758247, + "step": 1900 + }, + { + "epoch": 0.04775, + "grad_norm": 34.0, + "grad_norm_var": 23.695572916666666, + "learning_rate": 0.0001, + "loss": 7.7544, + "loss/crossentropy": 2.162308484315872, + "loss/hidden": 3.4796875, + "loss/jsd": 0.0, + "loss/logits": 0.2013952497392893, + "step": 1910 + }, + { + "epoch": 0.048, + "grad_norm": 30.75, + "grad_norm_var": 4.120768229166667, + "learning_rate": 0.0001, + "loss": 7.6092, + "loss/crossentropy": 2.088267083466053, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.2007219024002552, + "step": 1920 + }, + { + "epoch": 0.04825, + "grad_norm": 35.75, + "grad_norm_var": 2.97890625, + "learning_rate": 0.0001, + "loss": 7.7141, + "loss/crossentropy": 2.0617689430713653, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1928685350343585, + "step": 1930 + }, + { + "epoch": 0.0485, + "grad_norm": 35.0, + "grad_norm_var": 6.073893229166667, + "learning_rate": 0.0001, + "loss": 7.7068, + "loss/crossentropy": 2.1201131522655485, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.20757155679166317, + "step": 1940 + }, + { + "epoch": 0.04875, + "grad_norm": 31.5, + "grad_norm_var": 21.345572916666665, + "learning_rate": 0.0001, + "loss": 7.6198, + "loss/crossentropy": 2.235423868894577, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.1959926813840866, + "step": 1950 + }, + { + "epoch": 0.049, + "grad_norm": 30.625, + "grad_norm_var": 28.99765625, + "learning_rate": 0.0001, + "loss": 7.6389, + "loss/crossentropy": 2.205905148386955, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.2116202499717474, + "step": 1960 + }, + { + "epoch": 0.04925, + "grad_norm": 33.25, + "grad_norm_var": 9.9369140625, + "learning_rate": 0.0001, + "loss": 7.7121, + "loss/crossentropy": 2.163422483205795, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.1981559544801712, + "step": 1970 + }, + { + "epoch": 0.0495, + "grad_norm": 29.0, + "grad_norm_var": 9.118489583333334, + "learning_rate": 0.0001, + "loss": 7.6772, + "loss/crossentropy": 2.1636913806200027, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.1968079771846533, + "step": 1980 + }, + { + "epoch": 0.04975, + "grad_norm": 35.25, + "grad_norm_var": 4.377018229166667, + "learning_rate": 0.0001, + "loss": 7.5948, + "loss/crossentropy": 2.174500140547752, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1923616673797369, + "step": 1990 + }, + { + "epoch": 0.05, + "grad_norm": 30.75, + "grad_norm_var": 6.15625, + "learning_rate": 0.0001, + "loss": 7.5639, + "loss/crossentropy": 2.1197956264019013, + "loss/hidden": 3.50703125, + "loss/jsd": 0.0, + "loss/logits": 0.20423812307417394, + "step": 2000 + }, + { + "epoch": 0.05025, + "grad_norm": 33.5, + "grad_norm_var": 4.725455729166667, + "learning_rate": 0.0001, + "loss": 7.6238, + "loss/crossentropy": 2.1442053347826002, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.20131820477545262, + "step": 2010 + }, + { + "epoch": 0.0505, + "grad_norm": 33.0, + "grad_norm_var": 3.2212890625, + "learning_rate": 0.0001, + "loss": 7.6024, + "loss/crossentropy": 2.1970301985740663, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.19248049296438693, + "step": 2020 + }, + { + "epoch": 0.05075, + "grad_norm": 31.5, + "grad_norm_var": 2.2853515625, + "learning_rate": 0.0001, + "loss": 7.6279, + "loss/crossentropy": 2.0732986360788344, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.19802382439374924, + "step": 2030 + }, + { + "epoch": 0.051, + "grad_norm": 35.25, + "grad_norm_var": 3.2129557291666666, + "learning_rate": 0.0001, + "loss": 7.643, + "loss/crossentropy": 2.196815450489521, + "loss/hidden": 3.48203125, + "loss/jsd": 0.0, + "loss/logits": 0.20899684820324183, + "step": 2040 + }, + { + "epoch": 0.05125, + "grad_norm": 34.5, + "grad_norm_var": 4.093489583333334, + "learning_rate": 0.0001, + "loss": 7.6321, + "loss/crossentropy": 2.083095496892929, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.18292178437113762, + "step": 2050 + }, + { + "epoch": 0.0515, + "grad_norm": 31.875, + "grad_norm_var": 19.478059895833333, + "learning_rate": 0.0001, + "loss": 7.5882, + "loss/crossentropy": 2.2153579622507094, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.19245057981461286, + "step": 2060 + }, + { + "epoch": 0.05175, + "grad_norm": 34.5, + "grad_norm_var": 16.97265625, + "learning_rate": 0.0001, + "loss": 7.6006, + "loss/crossentropy": 2.2516845196485518, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.1975632380694151, + "step": 2070 + }, + { + "epoch": 0.052, + "grad_norm": 29.75, + "grad_norm_var": 2.71015625, + "learning_rate": 0.0001, + "loss": 7.6393, + "loss/crossentropy": 2.1561204314231874, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.2124529665336013, + "step": 2080 + }, + { + "epoch": 0.05225, + "grad_norm": 35.75, + "grad_norm_var": 36.984375, + "learning_rate": 0.0001, + "loss": 7.7289, + "loss/crossentropy": 2.2129232093691824, + "loss/hidden": 3.43125, + "loss/jsd": 0.0, + "loss/logits": 0.1984808323904872, + "step": 2090 + }, + { + "epoch": 0.0525, + "grad_norm": 28.875, + "grad_norm_var": 38.076822916666664, + "learning_rate": 0.0001, + "loss": 7.5446, + "loss/crossentropy": 2.281945154070854, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.1915616899728775, + "step": 2100 + }, + { + "epoch": 0.05275, + "grad_norm": 30.5, + "grad_norm_var": 2.10625, + "learning_rate": 0.0001, + "loss": 7.6215, + "loss/crossentropy": 2.0773366719484327, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.18790210355073214, + "step": 2110 + }, + { + "epoch": 0.053, + "grad_norm": 31.25, + "grad_norm_var": 1.24765625, + "learning_rate": 0.0001, + "loss": 7.4968, + "loss/crossentropy": 2.186223568022251, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.1900737203657627, + "step": 2120 + }, + { + "epoch": 0.05325, + "grad_norm": 30.75, + "grad_norm_var": 4.638541666666667, + "learning_rate": 0.0001, + "loss": 7.6574, + "loss/crossentropy": 2.2741902500391005, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.1898935280740261, + "step": 2130 + }, + { + "epoch": 0.0535, + "grad_norm": 33.75, + "grad_norm_var": 18.001041666666666, + "learning_rate": 0.0001, + "loss": 7.6903, + "loss/crossentropy": 2.1332941919565203, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.1954928996041417, + "step": 2140 + }, + { + "epoch": 0.05375, + "grad_norm": 34.5, + "grad_norm_var": 17.939583333333335, + "learning_rate": 0.0001, + "loss": 7.5786, + "loss/crossentropy": 2.2076333969831468, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.20088096596300603, + "step": 2150 + }, + { + "epoch": 0.054, + "grad_norm": 33.5, + "grad_norm_var": 8.947916666666666, + "learning_rate": 0.0001, + "loss": 7.5995, + "loss/crossentropy": 2.201739010214806, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.20021349862217902, + "step": 2160 + }, + { + "epoch": 0.05425, + "grad_norm": 30.0, + "grad_norm_var": 185.5900390625, + "learning_rate": 0.0001, + "loss": 7.6214, + "loss/crossentropy": 2.1913442850112914, + "loss/hidden": 3.407421875, + "loss/jsd": 0.0, + "loss/logits": 0.1996122680604458, + "step": 2170 + }, + { + "epoch": 0.0545, + "grad_norm": 30.625, + "grad_norm_var": 186.84166666666667, + "learning_rate": 0.0001, + "loss": 7.7375, + "loss/crossentropy": 2.173484447598457, + "loss/hidden": 3.428515625, + "loss/jsd": 0.0, + "loss/logits": 0.18794310167431832, + "step": 2180 + }, + { + "epoch": 0.05475, + "grad_norm": 31.375, + "grad_norm_var": 8.699739583333333, + "learning_rate": 0.0001, + "loss": 7.6023, + "loss/crossentropy": 2.207549235224724, + "loss/hidden": 3.494140625, + "loss/jsd": 0.0, + "loss/logits": 0.21360519118607044, + "step": 2190 + }, + { + "epoch": 0.055, + "grad_norm": 32.0, + "grad_norm_var": 5.228059895833334, + "learning_rate": 0.0001, + "loss": 7.5821, + "loss/crossentropy": 2.168141430988908, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.1850608481094241, + "step": 2200 + }, + { + "epoch": 0.05525, + "grad_norm": 29.625, + "grad_norm_var": 10.708268229166666, + "learning_rate": 0.0001, + "loss": 7.6191, + "loss/crossentropy": 2.26080215126276, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.19979589320719243, + "step": 2210 + }, + { + "epoch": 0.0555, + "grad_norm": 31.25, + "grad_norm_var": 10.148958333333333, + "learning_rate": 0.0001, + "loss": 7.613, + "loss/crossentropy": 2.2105998665094377, + "loss/hidden": 3.430859375, + "loss/jsd": 0.0, + "loss/logits": 0.20264392383396626, + "step": 2220 + }, + { + "epoch": 0.05575, + "grad_norm": 29.5, + "grad_norm_var": 9.128059895833333, + "learning_rate": 0.0001, + "loss": 7.5296, + "loss/crossentropy": 2.076467031240463, + "loss/hidden": 3.59296875, + "loss/jsd": 0.0, + "loss/logits": 0.22621012963354586, + "step": 2230 + }, + { + "epoch": 0.056, + "grad_norm": 34.5, + "grad_norm_var": 3.99375, + "learning_rate": 0.0001, + "loss": 7.6686, + "loss/crossentropy": 2.0320577889680864, + "loss/hidden": 3.575, + "loss/jsd": 0.0, + "loss/logits": 0.19981470778584481, + "step": 2240 + }, + { + "epoch": 0.05625, + "grad_norm": 30.75, + "grad_norm_var": 14.473893229166666, + "learning_rate": 0.0001, + "loss": 7.5723, + "loss/crossentropy": 2.084241083264351, + "loss/hidden": 3.545703125, + "loss/jsd": 0.0, + "loss/logits": 0.2100257944315672, + "step": 2250 + }, + { + "epoch": 0.0565, + "grad_norm": 31.5, + "grad_norm_var": 44.7041015625, + "learning_rate": 0.0001, + "loss": 7.6107, + "loss/crossentropy": 2.2695932418107985, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.19781356416642665, + "step": 2260 + }, + { + "epoch": 0.05675, + "grad_norm": 30.875, + "grad_norm_var": 9.897916666666667, + "learning_rate": 0.0001, + "loss": 7.6187, + "loss/crossentropy": 2.188571906089783, + "loss/hidden": 3.43984375, + "loss/jsd": 0.0, + "loss/logits": 0.2015857521444559, + "step": 2270 + }, + { + "epoch": 0.057, + "grad_norm": 28.375, + "grad_norm_var": 3.490625, + "learning_rate": 0.0001, + "loss": 7.6139, + "loss/crossentropy": 2.1134210243821143, + "loss/hidden": 3.481640625, + "loss/jsd": 0.0, + "loss/logits": 0.22417646870017052, + "step": 2280 + }, + { + "epoch": 0.05725, + "grad_norm": 32.0, + "grad_norm_var": 6.479622395833333, + "learning_rate": 0.0001, + "loss": 7.6591, + "loss/crossentropy": 2.1894455403089523, + "loss/hidden": 3.465625, + "loss/jsd": 0.0, + "loss/logits": 0.19847002141177655, + "step": 2290 + }, + { + "epoch": 0.0575, + "grad_norm": 31.25, + "grad_norm_var": 8.809309895833334, + "learning_rate": 0.0001, + "loss": 7.657, + "loss/crossentropy": 2.1524556159973143, + "loss/hidden": 3.319921875, + "loss/jsd": 0.0, + "loss/logits": 0.1857963975518942, + "step": 2300 + }, + { + "epoch": 0.05775, + "grad_norm": 32.25, + "grad_norm_var": 3.121809895833333, + "learning_rate": 0.0001, + "loss": 7.6475, + "loss/crossentropy": 2.2037901908159254, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.22191528491675855, + "step": 2310 + }, + { + "epoch": 0.058, + "grad_norm": 32.5, + "grad_norm_var": 2.6348307291666666, + "learning_rate": 0.0001, + "loss": 7.5846, + "loss/crossentropy": 2.214809921383858, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.19421134144067764, + "step": 2320 + }, + { + "epoch": 0.05825, + "grad_norm": 29.375, + "grad_norm_var": 2.939322916666667, + "learning_rate": 0.0001, + "loss": 7.6893, + "loss/crossentropy": 2.2204942047595977, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.21439925488084555, + "step": 2330 + }, + { + "epoch": 0.0585, + "grad_norm": 28.625, + "grad_norm_var": 3.8744140625, + "learning_rate": 0.0001, + "loss": 7.6038, + "loss/crossentropy": 2.1540059238672256, + "loss/hidden": 3.56953125, + "loss/jsd": 0.0, + "loss/logits": 0.24175845962017775, + "step": 2340 + }, + { + "epoch": 0.05875, + "grad_norm": 31.875, + "grad_norm_var": 2.1426432291666666, + "learning_rate": 0.0001, + "loss": 7.6379, + "loss/crossentropy": 2.1948168754577635, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.2041913490742445, + "step": 2350 + }, + { + "epoch": 0.059, + "grad_norm": 32.5, + "grad_norm_var": 1.7869140625, + "learning_rate": 0.0001, + "loss": 7.7135, + "loss/crossentropy": 2.1938526153564455, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.1992840923368931, + "step": 2360 + }, + { + "epoch": 0.05925, + "grad_norm": 33.75, + "grad_norm_var": 1.1343098958333333, + "learning_rate": 0.0001, + "loss": 7.6931, + "loss/crossentropy": 2.12769907861948, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.18500677905976773, + "step": 2370 + }, + { + "epoch": 0.0595, + "grad_norm": 33.0, + "grad_norm_var": 3.1884765625, + "learning_rate": 0.0001, + "loss": 7.6297, + "loss/crossentropy": 2.1268584340810777, + "loss/hidden": 3.485546875, + "loss/jsd": 0.0, + "loss/logits": 0.20107861533761023, + "step": 2380 + }, + { + "epoch": 0.05975, + "grad_norm": 31.0, + "grad_norm_var": 5.368489583333333, + "learning_rate": 0.0001, + "loss": 7.555, + "loss/crossentropy": 2.198070913553238, + "loss/hidden": 3.4171875, + "loss/jsd": 0.0, + "loss/logits": 0.19437791910022498, + "step": 2390 + }, + { + "epoch": 0.06, + "grad_norm": 31.0, + "grad_norm_var": 3.218489583333333, + "learning_rate": 0.0001, + "loss": 7.633, + "loss/crossentropy": 2.1521017968654634, + "loss/hidden": 3.472265625, + "loss/jsd": 0.0, + "loss/logits": 0.19696612432599067, + "step": 2400 + }, + { + "epoch": 0.06025, + "grad_norm": 31.625, + "grad_norm_var": 1.4098307291666667, + "learning_rate": 0.0001, + "loss": 7.6634, + "loss/crossentropy": 2.0935733556747436, + "loss/hidden": 3.46171875, + "loss/jsd": 0.0, + "loss/logits": 0.19025789983570576, + "step": 2410 + }, + { + "epoch": 0.0605, + "grad_norm": 31.75, + "grad_norm_var": 4.812434895833333, + "learning_rate": 0.0001, + "loss": 7.6523, + "loss/crossentropy": 2.206766763329506, + "loss/hidden": 3.421484375, + "loss/jsd": 0.0, + "loss/logits": 0.19223052635788918, + "step": 2420 + }, + { + "epoch": 0.06075, + "grad_norm": 32.0, + "grad_norm_var": 5.545247395833333, + "learning_rate": 0.0001, + "loss": 7.6703, + "loss/crossentropy": 2.2091148614883425, + "loss/hidden": 3.471875, + "loss/jsd": 0.0, + "loss/logits": 0.2191623793914914, + "step": 2430 + }, + { + "epoch": 0.061, + "grad_norm": 31.875, + "grad_norm_var": 3.06640625, + "learning_rate": 0.0001, + "loss": 7.6194, + "loss/crossentropy": 2.2076220482587816, + "loss/hidden": 3.52265625, + "loss/jsd": 0.0, + "loss/logits": 0.21331611163914205, + "step": 2440 + }, + { + "epoch": 0.06125, + "grad_norm": 33.75, + "grad_norm_var": 3.753125, + "learning_rate": 0.0001, + "loss": 7.6143, + "loss/crossentropy": 2.1473243802785875, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.20034591071307659, + "step": 2450 + }, + { + "epoch": 0.0615, + "grad_norm": 31.25, + "grad_norm_var": 4.09765625, + "learning_rate": 0.0001, + "loss": 7.6166, + "loss/crossentropy": 2.2176205784082414, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.1890367180109024, + "step": 2460 + }, + { + "epoch": 0.06175, + "grad_norm": 32.75, + "grad_norm_var": 2.56015625, + "learning_rate": 0.0001, + "loss": 7.5864, + "loss/crossentropy": 2.139193335175514, + "loss/hidden": 3.55234375, + "loss/jsd": 0.0, + "loss/logits": 0.19700367711484432, + "step": 2470 + }, + { + "epoch": 0.062, + "grad_norm": 30.625, + "grad_norm_var": 3.5400390625, + "learning_rate": 0.0001, + "loss": 7.6061, + "loss/crossentropy": 2.101886364817619, + "loss/hidden": 3.543359375, + "loss/jsd": 0.0, + "loss/logits": 0.20392275378108024, + "step": 2480 + }, + { + "epoch": 0.06225, + "grad_norm": 31.625, + "grad_norm_var": 2.3353515625, + "learning_rate": 0.0001, + "loss": 7.5912, + "loss/crossentropy": 2.1105535492300986, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.20442402064800264, + "step": 2490 + }, + { + "epoch": 0.0625, + "grad_norm": 32.25, + "grad_norm_var": 2.183333333333333, + "learning_rate": 0.0001, + "loss": 7.6553, + "loss/crossentropy": 2.1315447479486465, + "loss/hidden": 3.471484375, + "loss/jsd": 0.0, + "loss/logits": 0.20164060425013303, + "step": 2500 + }, + { + "epoch": 0.06275, + "grad_norm": 33.25, + "grad_norm_var": 147.15149739583333, + "learning_rate": 0.0001, + "loss": 7.6542, + "loss/crossentropy": 2.0641630738973618, + "loss/hidden": 3.468359375, + "loss/jsd": 0.0, + "loss/logits": 0.20385651774704455, + "step": 2510 + }, + { + "epoch": 0.063, + "grad_norm": 30.5, + "grad_norm_var": 150.99166666666667, + "learning_rate": 0.0001, + "loss": 7.5568, + "loss/crossentropy": 2.191535955667496, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19768227599561214, + "step": 2520 + }, + { + "epoch": 0.06325, + "grad_norm": 29.0, + "grad_norm_var": 2.13515625, + "learning_rate": 0.0001, + "loss": 7.6544, + "loss/crossentropy": 2.199158227443695, + "loss/hidden": 3.45, + "loss/jsd": 0.0, + "loss/logits": 0.2097537014633417, + "step": 2530 + }, + { + "epoch": 0.0635, + "grad_norm": 31.5, + "grad_norm_var": 2.731705729166667, + "learning_rate": 0.0001, + "loss": 7.5063, + "loss/crossentropy": 2.1456793427467344, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.19175102189183235, + "step": 2540 + }, + { + "epoch": 0.06375, + "grad_norm": 32.5, + "grad_norm_var": 6.859830729166666, + "learning_rate": 0.0001, + "loss": 7.5828, + "loss/crossentropy": 2.255453732609749, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.19693338237702845, + "step": 2550 + }, + { + "epoch": 0.064, + "grad_norm": 31.625, + "grad_norm_var": 5.178125, + "learning_rate": 0.0001, + "loss": 7.5702, + "loss/crossentropy": 2.2270909011363984, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.20889390334486962, + "step": 2560 + }, + { + "epoch": 0.06425, + "grad_norm": 33.5, + "grad_norm_var": 3.6372395833333333, + "learning_rate": 0.0001, + "loss": 7.5904, + "loss/crossentropy": 2.190132850408554, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.20386817157268525, + "step": 2570 + }, + { + "epoch": 0.0645, + "grad_norm": 34.25, + "grad_norm_var": 10.79765625, + "learning_rate": 0.0001, + "loss": 7.5854, + "loss/crossentropy": 2.07715407460928, + "loss/hidden": 3.581640625, + "loss/jsd": 0.0, + "loss/logits": 0.20797281824052333, + "step": 2580 + }, + { + "epoch": 0.06475, + "grad_norm": 33.5, + "grad_norm_var": 12.35625, + "learning_rate": 0.0001, + "loss": 7.6279, + "loss/crossentropy": 2.1247923612594604, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.21620508767664431, + "step": 2590 + }, + { + "epoch": 0.065, + "grad_norm": 32.75, + "grad_norm_var": 5.094791666666667, + "learning_rate": 0.0001, + "loss": 7.5996, + "loss/crossentropy": 2.087959203124046, + "loss/hidden": 3.521875, + "loss/jsd": 0.0, + "loss/logits": 0.19923710729926825, + "step": 2600 + }, + { + "epoch": 0.06525, + "grad_norm": 30.0, + "grad_norm_var": 7.6150390625, + "learning_rate": 0.0001, + "loss": 7.682, + "loss/crossentropy": 2.1805250465869905, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.2021130472421646, + "step": 2610 + }, + { + "epoch": 0.0655, + "grad_norm": 34.75, + "grad_norm_var": 7.06015625, + "learning_rate": 0.0001, + "loss": 7.7244, + "loss/crossentropy": 2.1178730964660644, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19405451826751233, + "step": 2620 + }, + { + "epoch": 0.06575, + "grad_norm": 29.75, + "grad_norm_var": 3.065559895833333, + "learning_rate": 0.0001, + "loss": 7.6483, + "loss/crossentropy": 2.1593512505292893, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.2096536297351122, + "step": 2630 + }, + { + "epoch": 0.066, + "grad_norm": 33.25, + "grad_norm_var": 4.623372395833333, + "learning_rate": 0.0001, + "loss": 7.5982, + "loss/crossentropy": 2.159625916182995, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.18939675595611333, + "step": 2640 + }, + { + "epoch": 0.06625, + "grad_norm": 53.0, + "grad_norm_var": 49.99524739583333, + "learning_rate": 0.0001, + "loss": 7.6918, + "loss/crossentropy": 2.114516945183277, + "loss/hidden": 3.51484375, + "loss/jsd": 0.0, + "loss/logits": 0.2009023107588291, + "step": 2650 + }, + { + "epoch": 0.0665, + "grad_norm": 30.125, + "grad_norm_var": 38.81399739583333, + "learning_rate": 0.0001, + "loss": 7.5543, + "loss/crossentropy": 2.132766366004944, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.1965734062716365, + "step": 2660 + }, + { + "epoch": 0.06675, + "grad_norm": 30.875, + "grad_norm_var": 1.9559895833333334, + "learning_rate": 0.0001, + "loss": 7.6338, + "loss/crossentropy": 2.1092610150575637, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.17978871315717698, + "step": 2670 + }, + { + "epoch": 0.067, + "grad_norm": 29.875, + "grad_norm_var": 4.47890625, + "learning_rate": 0.0001, + "loss": 7.617, + "loss/crossentropy": 2.2271903961896897, + "loss/hidden": 3.461328125, + "loss/jsd": 0.0, + "loss/logits": 0.2073811784386635, + "step": 2680 + }, + { + "epoch": 0.06725, + "grad_norm": 30.125, + "grad_norm_var": 3.2249348958333335, + "learning_rate": 0.0001, + "loss": 7.641, + "loss/crossentropy": 2.0155764549970625, + "loss/hidden": 3.603515625, + "loss/jsd": 0.0, + "loss/logits": 0.2049756994470954, + "step": 2690 + }, + { + "epoch": 0.0675, + "grad_norm": 33.5, + "grad_norm_var": 3.0061848958333335, + "learning_rate": 0.0001, + "loss": 7.6253, + "loss/crossentropy": 2.221065053343773, + "loss/hidden": 3.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.2112014289945364, + "step": 2700 + }, + { + "epoch": 0.06775, + "grad_norm": 32.25, + "grad_norm_var": 18.753125, + "learning_rate": 0.0001, + "loss": 7.6427, + "loss/crossentropy": 2.180001160502434, + "loss/hidden": 3.353125, + "loss/jsd": 0.0, + "loss/logits": 0.193130424618721, + "step": 2710 + }, + { + "epoch": 0.068, + "grad_norm": 32.5, + "grad_norm_var": 20.773893229166667, + "learning_rate": 0.0001, + "loss": 7.6163, + "loss/crossentropy": 2.283226564526558, + "loss/hidden": 3.48515625, + "loss/jsd": 0.0, + "loss/logits": 0.19388929307460784, + "step": 2720 + }, + { + "epoch": 0.06825, + "grad_norm": 31.25, + "grad_norm_var": 1.61640625, + "learning_rate": 0.0001, + "loss": 7.587, + "loss/crossentropy": 2.1378406554460527, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.2115953892469406, + "step": 2730 + }, + { + "epoch": 0.0685, + "grad_norm": 32.5, + "grad_norm_var": 1.8684895833333333, + "learning_rate": 0.0001, + "loss": 7.6011, + "loss/crossentropy": 2.0985760882496836, + "loss/hidden": 3.48125, + "loss/jsd": 0.0, + "loss/logits": 0.19036055766046048, + "step": 2740 + }, + { + "epoch": 0.06875, + "grad_norm": 32.25, + "grad_norm_var": 2.9535807291666667, + "learning_rate": 0.0001, + "loss": 7.6583, + "loss/crossentropy": 2.1665745437145234, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.18649150040000678, + "step": 2750 + }, + { + "epoch": 0.069, + "grad_norm": 34.5, + "grad_norm_var": 6.343489583333334, + "learning_rate": 0.0001, + "loss": 7.5495, + "loss/crossentropy": 2.176983141899109, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.20285341441631316, + "step": 2760 + }, + { + "epoch": 0.06925, + "grad_norm": 32.5, + "grad_norm_var": 4.972916666666666, + "learning_rate": 0.0001, + "loss": 7.6597, + "loss/crossentropy": 2.1060123026371, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.19327255934476853, + "step": 2770 + }, + { + "epoch": 0.0695, + "grad_norm": 31.625, + "grad_norm_var": 32.33639322916667, + "learning_rate": 0.0001, + "loss": 7.5862, + "loss/crossentropy": 2.1663936868309976, + "loss/hidden": 3.512109375, + "loss/jsd": 0.0, + "loss/logits": 0.22126073129475116, + "step": 2780 + }, + { + "epoch": 0.06975, + "grad_norm": 32.5, + "grad_norm_var": 5.694791666666666, + "learning_rate": 0.0001, + "loss": 7.5124, + "loss/crossentropy": 2.225750984251499, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.19473073966801166, + "step": 2790 + }, + { + "epoch": 0.07, + "grad_norm": 31.5, + "grad_norm_var": 4.237434895833333, + "learning_rate": 0.0001, + "loss": 7.6174, + "loss/crossentropy": 2.0647315263748167, + "loss/hidden": 3.4734375, + "loss/jsd": 0.0, + "loss/logits": 0.2136565549299121, + "step": 2800 + }, + { + "epoch": 0.07025, + "grad_norm": 36.0, + "grad_norm_var": 4.792708333333334, + "learning_rate": 0.0001, + "loss": 7.6789, + "loss/crossentropy": 2.1971701353788378, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.21544951274991037, + "step": 2810 + }, + { + "epoch": 0.0705, + "grad_norm": 31.125, + "grad_norm_var": 11.145247395833334, + "learning_rate": 0.0001, + "loss": 7.7043, + "loss/crossentropy": 2.2537077218294144, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.19961411394178868, + "step": 2820 + }, + { + "epoch": 0.07075, + "grad_norm": 30.5, + "grad_norm_var": 85.65182291666666, + "learning_rate": 0.0001, + "loss": 7.6427, + "loss/crossentropy": 2.0513558954000475, + "loss/hidden": 3.615234375, + "loss/jsd": 0.0, + "loss/logits": 0.23545071221888064, + "step": 2830 + }, + { + "epoch": 0.071, + "grad_norm": 31.375, + "grad_norm_var": 64.77180989583333, + "learning_rate": 0.0001, + "loss": 7.6378, + "loss/crossentropy": 2.186201846599579, + "loss/hidden": 3.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.20769538041204214, + "step": 2840 + }, + { + "epoch": 0.07125, + "grad_norm": 32.25, + "grad_norm_var": 2.0268229166666667, + "learning_rate": 0.0001, + "loss": 7.5525, + "loss/crossentropy": 2.161085495352745, + "loss/hidden": 3.27421875, + "loss/jsd": 0.0, + "loss/logits": 0.18488222286105155, + "step": 2850 + }, + { + "epoch": 0.0715, + "grad_norm": 33.0, + "grad_norm_var": 10.437434895833333, + "learning_rate": 0.0001, + "loss": 7.6376, + "loss/crossentropy": 2.09626332372427, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.179809108376503, + "step": 2860 + }, + { + "epoch": 0.07175, + "grad_norm": 33.25, + "grad_norm_var": 9.233072916666666, + "learning_rate": 0.0001, + "loss": 7.62, + "loss/crossentropy": 2.2382488936185836, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.20872681811451912, + "step": 2870 + }, + { + "epoch": 0.072, + "grad_norm": 28.5, + "grad_norm_var": 8.784375, + "learning_rate": 0.0001, + "loss": 7.6516, + "loss/crossentropy": 2.1699771240353583, + "loss/hidden": 3.470703125, + "loss/jsd": 0.0, + "loss/logits": 0.2100867312401533, + "step": 2880 + }, + { + "epoch": 0.07225, + "grad_norm": 33.75, + "grad_norm_var": 9.269791666666666, + "learning_rate": 0.0001, + "loss": 7.5839, + "loss/crossentropy": 2.1368533104658125, + "loss/hidden": 3.426953125, + "loss/jsd": 0.0, + "loss/logits": 0.21750828213989734, + "step": 2890 + }, + { + "epoch": 0.0725, + "grad_norm": 36.0, + "grad_norm_var": 5.518489583333333, + "learning_rate": 0.0001, + "loss": 7.6849, + "loss/crossentropy": 2.12222815155983, + "loss/hidden": 3.580859375, + "loss/jsd": 0.0, + "loss/logits": 0.21114687696099282, + "step": 2900 + }, + { + "epoch": 0.07275, + "grad_norm": 31.125, + "grad_norm_var": 5.622916666666667, + "learning_rate": 0.0001, + "loss": 7.5109, + "loss/crossentropy": 2.171084225177765, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.1979156408458948, + "step": 2910 + }, + { + "epoch": 0.073, + "grad_norm": 31.125, + "grad_norm_var": 1.5619140625, + "learning_rate": 0.0001, + "loss": 7.6895, + "loss/crossentropy": 2.164732736349106, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.20426477529108525, + "step": 2920 + }, + { + "epoch": 0.07325, + "grad_norm": 29.375, + "grad_norm_var": 1.7854166666666667, + "learning_rate": 0.0001, + "loss": 7.5573, + "loss/crossentropy": 2.1073058575391768, + "loss/hidden": 3.50234375, + "loss/jsd": 0.0, + "loss/logits": 0.2097570365294814, + "step": 2930 + }, + { + "epoch": 0.0735, + "grad_norm": 30.875, + "grad_norm_var": 2.4955729166666667, + "learning_rate": 0.0001, + "loss": 7.5697, + "loss/crossentropy": 2.153279659152031, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.1900124330073595, + "step": 2940 + }, + { + "epoch": 0.07375, + "grad_norm": 49.5, + "grad_norm_var": 22.75390625, + "learning_rate": 0.0001, + "loss": 7.671, + "loss/crossentropy": 2.2612457245588304, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.18990697022527456, + "step": 2950 + }, + { + "epoch": 0.074, + "grad_norm": 32.0, + "grad_norm_var": 24.510416666666668, + "learning_rate": 0.0001, + "loss": 7.5814, + "loss/crossentropy": 2.123460465669632, + "loss/hidden": 3.496484375, + "loss/jsd": 0.0, + "loss/logits": 0.20658994875848294, + "step": 2960 + }, + { + "epoch": 0.07425, + "grad_norm": 30.125, + "grad_norm_var": 118.29837239583334, + "learning_rate": 0.0001, + "loss": 7.5481, + "loss/crossentropy": 2.2275219768285752, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.18647960387170315, + "step": 2970 + }, + { + "epoch": 0.0745, + "grad_norm": 29.375, + "grad_norm_var": 21.989322916666666, + "learning_rate": 0.0001, + "loss": 7.5209, + "loss/crossentropy": 2.1731285482645033, + "loss/hidden": 3.4171875, + "loss/jsd": 0.0, + "loss/logits": 0.1889862149953842, + "step": 2980 + }, + { + "epoch": 0.07475, + "grad_norm": 31.125, + "grad_norm_var": 4.253059895833333, + "learning_rate": 0.0001, + "loss": 7.5815, + "loss/crossentropy": 2.2546483501791954, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.18856723569333553, + "step": 2990 + }, + { + "epoch": 0.075, + "grad_norm": 32.0, + "grad_norm_var": 5.730143229166667, + "learning_rate": 0.0001, + "loss": 7.5835, + "loss/crossentropy": 2.092367857694626, + "loss/hidden": 3.52421875, + "loss/jsd": 0.0, + "loss/logits": 0.20502115599811077, + "step": 3000 + }, + { + "epoch": 0.07525, + "grad_norm": 29.5, + "grad_norm_var": 15.926822916666667, + "learning_rate": 0.0001, + "loss": 7.5849, + "loss/crossentropy": 2.109961675107479, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.20341113824397325, + "step": 3010 + }, + { + "epoch": 0.0755, + "grad_norm": 32.5, + "grad_norm_var": 3.314322916666667, + "learning_rate": 0.0001, + "loss": 7.5896, + "loss/crossentropy": 2.1648701071739196, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.20436643473803998, + "step": 3020 + }, + { + "epoch": 0.07575, + "grad_norm": 32.75, + "grad_norm_var": 1.4754557291666666, + "learning_rate": 0.0001, + "loss": 7.5959, + "loss/crossentropy": 2.2054502993822096, + "loss/hidden": 3.440234375, + "loss/jsd": 0.0, + "loss/logits": 0.20024821683764457, + "step": 3030 + }, + { + "epoch": 0.076, + "grad_norm": 30.875, + "grad_norm_var": 6.4931640625, + "learning_rate": 0.0001, + "loss": 7.6957, + "loss/crossentropy": 2.166448511183262, + "loss/hidden": 3.484765625, + "loss/jsd": 0.0, + "loss/logits": 0.20373598877340554, + "step": 3040 + }, + { + "epoch": 0.07625, + "grad_norm": 36.5, + "grad_norm_var": 9.442122395833334, + "learning_rate": 0.0001, + "loss": 7.5957, + "loss/crossentropy": 2.218970799446106, + "loss/hidden": 3.503515625, + "loss/jsd": 0.0, + "loss/logits": 0.20884830448776484, + "step": 3050 + }, + { + "epoch": 0.0765, + "grad_norm": 32.25, + "grad_norm_var": 6.062955729166666, + "learning_rate": 0.0001, + "loss": 7.5458, + "loss/crossentropy": 2.080473840236664, + "loss/hidden": 3.57109375, + "loss/jsd": 0.0, + "loss/logits": 0.20190774220973254, + "step": 3060 + }, + { + "epoch": 0.07675, + "grad_norm": 29.5, + "grad_norm_var": 2.6093098958333334, + "learning_rate": 0.0001, + "loss": 7.5646, + "loss/crossentropy": 2.1775156021118165, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.19825822599232196, + "step": 3070 + }, + { + "epoch": 0.077, + "grad_norm": 30.125, + "grad_norm_var": 2.690625, + "learning_rate": 0.0001, + "loss": 7.7174, + "loss/crossentropy": 2.246141794323921, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.20639744736254215, + "step": 3080 + }, + { + "epoch": 0.07725, + "grad_norm": 30.875, + "grad_norm_var": 4.271809895833333, + "learning_rate": 0.0001, + "loss": 7.6196, + "loss/crossentropy": 2.060313332080841, + "loss/hidden": 3.481640625, + "loss/jsd": 0.0, + "loss/logits": 0.21316638588905334, + "step": 3090 + }, + { + "epoch": 0.0775, + "grad_norm": 34.0, + "grad_norm_var": 2.873893229166667, + "learning_rate": 0.0001, + "loss": 7.5637, + "loss/crossentropy": 2.153154730796814, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.20764457508921624, + "step": 3100 + }, + { + "epoch": 0.07775, + "grad_norm": 33.25, + "grad_norm_var": 2.2708333333333335, + "learning_rate": 0.0001, + "loss": 7.5924, + "loss/crossentropy": 2.2558963537216186, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.19444480016827584, + "step": 3110 + }, + { + "epoch": 0.078, + "grad_norm": 29.5, + "grad_norm_var": 2.2301432291666665, + "learning_rate": 0.0001, + "loss": 7.7202, + "loss/crossentropy": 2.190881980955601, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.2061827789992094, + "step": 3120 + }, + { + "epoch": 0.07825, + "grad_norm": 31.375, + "grad_norm_var": 2.991080729166667, + "learning_rate": 0.0001, + "loss": 7.5479, + "loss/crossentropy": 2.1357465982437134, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.19807269163429736, + "step": 3130 + }, + { + "epoch": 0.0785, + "grad_norm": 29.125, + "grad_norm_var": 5.457291666666666, + "learning_rate": 0.0001, + "loss": 7.641, + "loss/crossentropy": 2.166859371960163, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.1900594387203455, + "step": 3140 + }, + { + "epoch": 0.07875, + "grad_norm": 32.75, + "grad_norm_var": 27.958072916666666, + "learning_rate": 0.0001, + "loss": 7.5223, + "loss/crossentropy": 2.1595762044191362, + "loss/hidden": 3.551171875, + "loss/jsd": 0.0, + "loss/logits": 0.22254167906939984, + "step": 3150 + }, + { + "epoch": 0.079, + "grad_norm": 29.75, + "grad_norm_var": 3.4056640625, + "learning_rate": 0.0001, + "loss": 7.6761, + "loss/crossentropy": 2.156154304742813, + "loss/hidden": 3.407421875, + "loss/jsd": 0.0, + "loss/logits": 0.2026256375014782, + "step": 3160 + }, + { + "epoch": 0.07925, + "grad_norm": 32.0, + "grad_norm_var": 7.246875, + "learning_rate": 0.0001, + "loss": 7.4683, + "loss/crossentropy": 2.1108324408531187, + "loss/hidden": 3.5015625, + "loss/jsd": 0.0, + "loss/logits": 0.18734413515776396, + "step": 3170 + }, + { + "epoch": 0.0795, + "grad_norm": 28.375, + "grad_norm_var": 3.9212890625, + "learning_rate": 0.0001, + "loss": 7.5591, + "loss/crossentropy": 2.1108986347913743, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.19466390162706376, + "step": 3180 + }, + { + "epoch": 0.07975, + "grad_norm": 32.5, + "grad_norm_var": 15.6962890625, + "learning_rate": 0.0001, + "loss": 7.6375, + "loss/crossentropy": 2.1180114537477492, + "loss/hidden": 3.478125, + "loss/jsd": 0.0, + "loss/logits": 0.21690767258405685, + "step": 3190 + }, + { + "epoch": 0.08, + "grad_norm": 31.875, + "grad_norm_var": 12.757747395833333, + "learning_rate": 0.0001, + "loss": 7.5977, + "loss/crossentropy": 2.1203838691115378, + "loss/hidden": 3.5421875, + "loss/jsd": 0.0, + "loss/logits": 0.20782926268875598, + "step": 3200 + }, + { + "epoch": 0.08025, + "grad_norm": 28.375, + "grad_norm_var": 3.3218098958333333, + "learning_rate": 0.0001, + "loss": 7.6303, + "loss/crossentropy": 2.1929849207401277, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.19219291880726813, + "step": 3210 + }, + { + "epoch": 0.0805, + "grad_norm": 30.0, + "grad_norm_var": 3.1958333333333333, + "learning_rate": 0.0001, + "loss": 7.5282, + "loss/crossentropy": 2.2013367488980293, + "loss/hidden": 3.6, + "loss/jsd": 0.0, + "loss/logits": 0.21040805242955685, + "step": 3220 + }, + { + "epoch": 0.08075, + "grad_norm": 32.25, + "grad_norm_var": 1.8684895833333333, + "learning_rate": 0.0001, + "loss": 7.5331, + "loss/crossentropy": 2.184007254242897, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.19979026056826116, + "step": 3230 + }, + { + "epoch": 0.081, + "grad_norm": 30.75, + "grad_norm_var": 2.130989583333333, + "learning_rate": 0.0001, + "loss": 7.5964, + "loss/crossentropy": 2.2199858695268633, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.21446770764887332, + "step": 3240 + }, + { + "epoch": 0.08125, + "grad_norm": 32.75, + "grad_norm_var": 2.6483723958333334, + "learning_rate": 0.0001, + "loss": 7.602, + "loss/crossentropy": 2.1263694643974302, + "loss/hidden": 3.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.19737922623753548, + "step": 3250 + }, + { + "epoch": 0.0815, + "grad_norm": 30.75, + "grad_norm_var": 3.207291666666667, + "learning_rate": 0.0001, + "loss": 7.5927, + "loss/crossentropy": 2.184669151902199, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.18790993094444275, + "step": 3260 + }, + { + "epoch": 0.08175, + "grad_norm": 29.875, + "grad_norm_var": 2.857291666666667, + "learning_rate": 0.0001, + "loss": 7.615, + "loss/crossentropy": 2.0831361666321753, + "loss/hidden": 3.48203125, + "loss/jsd": 0.0, + "loss/logits": 0.19330178536474704, + "step": 3270 + }, + { + "epoch": 0.082, + "grad_norm": 30.625, + "grad_norm_var": 15.702018229166667, + "learning_rate": 0.0001, + "loss": 7.6216, + "loss/crossentropy": 2.158697286248207, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.18888361509889365, + "step": 3280 + }, + { + "epoch": 0.08225, + "grad_norm": 32.75, + "grad_norm_var": 18.211393229166667, + "learning_rate": 0.0001, + "loss": 7.564, + "loss/crossentropy": 2.2913430631160736, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.2058469709008932, + "step": 3290 + }, + { + "epoch": 0.0825, + "grad_norm": 35.5, + "grad_norm_var": 4.24140625, + "learning_rate": 0.0001, + "loss": 7.5168, + "loss/crossentropy": 2.2065580666065214, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.18786473274230958, + "step": 3300 + }, + { + "epoch": 0.08275, + "grad_norm": 32.5, + "grad_norm_var": 3.692643229166667, + "learning_rate": 0.0001, + "loss": 7.5099, + "loss/crossentropy": 2.1358665406703947, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.18491616416722537, + "step": 3310 + }, + { + "epoch": 0.083, + "grad_norm": 31.75, + "grad_norm_var": 2.50390625, + "learning_rate": 0.0001, + "loss": 7.6092, + "loss/crossentropy": 2.206757593154907, + "loss/hidden": 3.4328125, + "loss/jsd": 0.0, + "loss/logits": 0.20058272033929825, + "step": 3320 + }, + { + "epoch": 0.08325, + "grad_norm": 34.5, + "grad_norm_var": 1.7497395833333333, + "learning_rate": 0.0001, + "loss": 7.5493, + "loss/crossentropy": 2.0709328591823577, + "loss/hidden": 3.450390625, + "loss/jsd": 0.0, + "loss/logits": 0.1953151250258088, + "step": 3330 + }, + { + "epoch": 0.0835, + "grad_norm": 34.5, + "grad_norm_var": 2.9395182291666666, + "learning_rate": 0.0001, + "loss": 7.7584, + "loss/crossentropy": 2.1559954971075057, + "loss/hidden": 3.5484375, + "loss/jsd": 0.0, + "loss/logits": 0.20604321975260972, + "step": 3340 + }, + { + "epoch": 0.08375, + "grad_norm": 33.75, + "grad_norm_var": 17.864322916666666, + "learning_rate": 0.0001, + "loss": 7.6969, + "loss/crossentropy": 2.1975975424051284, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.2027706265449524, + "step": 3350 + }, + { + "epoch": 0.084, + "grad_norm": 33.25, + "grad_norm_var": 2.7997395833333334, + "learning_rate": 0.0001, + "loss": 7.61, + "loss/crossentropy": 2.018556122481823, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.18029189426451922, + "step": 3360 + }, + { + "epoch": 0.08425, + "grad_norm": 33.0, + "grad_norm_var": 2.5994140625, + "learning_rate": 0.0001, + "loss": 7.5397, + "loss/crossentropy": 2.1838466703891752, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.2002351511269808, + "step": 3370 + }, + { + "epoch": 0.0845, + "grad_norm": 32.75, + "grad_norm_var": 2.912955729166667, + "learning_rate": 0.0001, + "loss": 7.5982, + "loss/crossentropy": 2.184953287243843, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.19311312437057496, + "step": 3380 + }, + { + "epoch": 0.08475, + "grad_norm": 34.25, + "grad_norm_var": 3.309375, + "learning_rate": 0.0001, + "loss": 7.5841, + "loss/crossentropy": 2.2160476714372637, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.1966065490618348, + "step": 3390 + }, + { + "epoch": 0.085, + "grad_norm": 31.75, + "grad_norm_var": 2.1936848958333335, + "learning_rate": 0.0001, + "loss": 7.588, + "loss/crossentropy": 2.2071674168109894, + "loss/hidden": 3.423828125, + "loss/jsd": 0.0, + "loss/logits": 0.19414376243948936, + "step": 3400 + }, + { + "epoch": 0.08525, + "grad_norm": 33.0, + "grad_norm_var": 1.6301432291666667, + "learning_rate": 0.0001, + "loss": 7.5966, + "loss/crossentropy": 2.117925961315632, + "loss/hidden": 3.44609375, + "loss/jsd": 0.0, + "loss/logits": 0.21105701606720687, + "step": 3410 + }, + { + "epoch": 0.0855, + "grad_norm": 32.25, + "grad_norm_var": 5.82265625, + "learning_rate": 0.0001, + "loss": 7.62, + "loss/crossentropy": 2.0512605965137483, + "loss/hidden": 3.509375, + "loss/jsd": 0.0, + "loss/logits": 0.20283049941062928, + "step": 3420 + }, + { + "epoch": 0.08575, + "grad_norm": 31.75, + "grad_norm_var": 6.539583333333334, + "learning_rate": 0.0001, + "loss": 7.5429, + "loss/crossentropy": 2.074887050688267, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.18658901005983353, + "step": 3430 + }, + { + "epoch": 0.086, + "grad_norm": 33.5, + "grad_norm_var": 8.426497395833334, + "learning_rate": 0.0001, + "loss": 7.5759, + "loss/crossentropy": 2.1776267290115356, + "loss/hidden": 3.466796875, + "loss/jsd": 0.0, + "loss/logits": 0.20649599879980088, + "step": 3440 + }, + { + "epoch": 0.08625, + "grad_norm": 43.5, + "grad_norm_var": 14.962434895833333, + "learning_rate": 0.0001, + "loss": 7.6429, + "loss/crossentropy": 2.12281953394413, + "loss/hidden": 3.5265625, + "loss/jsd": 0.0, + "loss/logits": 0.20702828094363213, + "step": 3450 + }, + { + "epoch": 0.0865, + "grad_norm": 31.0, + "grad_norm_var": 196.96555989583334, + "learning_rate": 0.0001, + "loss": 7.7919, + "loss/crossentropy": 2.1028707295656206, + "loss/hidden": 3.536328125, + "loss/jsd": 0.0, + "loss/logits": 0.22825684808194638, + "step": 3460 + }, + { + "epoch": 0.08675, + "grad_norm": 32.25, + "grad_norm_var": 206.46979166666668, + "learning_rate": 0.0001, + "loss": 7.5698, + "loss/crossentropy": 2.1032180160284044, + "loss/hidden": 3.45625, + "loss/jsd": 0.0, + "loss/logits": 0.1899772472679615, + "step": 3470 + }, + { + "epoch": 0.087, + "grad_norm": 37.0, + "grad_norm_var": 15.6322265625, + "learning_rate": 0.0001, + "loss": 7.5884, + "loss/crossentropy": 2.0837722390890123, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.20534363873302935, + "step": 3480 + }, + { + "epoch": 0.08725, + "grad_norm": 30.0, + "grad_norm_var": 16.8806640625, + "learning_rate": 0.0001, + "loss": 7.5719, + "loss/crossentropy": 2.1673771381378173, + "loss/hidden": 3.45234375, + "loss/jsd": 0.0, + "loss/logits": 0.21180946305394172, + "step": 3490 + }, + { + "epoch": 0.0875, + "grad_norm": 33.0, + "grad_norm_var": 16.212955729166666, + "learning_rate": 0.0001, + "loss": 7.5171, + "loss/crossentropy": 2.2269717276096346, + "loss/hidden": 3.294140625, + "loss/jsd": 0.0, + "loss/logits": 0.18251859862357378, + "step": 3500 + }, + { + "epoch": 0.08775, + "grad_norm": 33.5, + "grad_norm_var": 395.25390625, + "learning_rate": 0.0001, + "loss": 7.6825, + "loss/crossentropy": 2.2768601924180984, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.17959882766008378, + "step": 3510 + }, + { + "epoch": 0.088, + "grad_norm": 31.875, + "grad_norm_var": 400.7280598958333, + "learning_rate": 0.0001, + "loss": 7.5387, + "loss/crossentropy": 2.174117147922516, + "loss/hidden": 3.325390625, + "loss/jsd": 0.0, + "loss/logits": 0.1909211568534374, + "step": 3520 + }, + { + "epoch": 0.08825, + "grad_norm": 34.0, + "grad_norm_var": 3.2527951689747005e+18, + "learning_rate": 0.0001, + "loss": 7.5099, + "loss/crossentropy": 2.102574473619461, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.18131749220192434, + "step": 3530 + }, + { + "epoch": 0.0885, + "grad_norm": 34.75, + "grad_norm_var": 3.252795168997245e+18, + "learning_rate": 0.0001, + "loss": 7.5664, + "loss/crossentropy": 2.121107617020607, + "loss/hidden": 3.433203125, + "loss/jsd": 0.0, + "loss/logits": 0.18711038120090961, + "step": 3540 + }, + { + "epoch": 0.08875, + "grad_norm": 35.25, + "grad_norm_var": 26.5384765625, + "learning_rate": 0.0001, + "loss": 7.5577, + "loss/crossentropy": 2.1354643225669863, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.18915031235665083, + "step": 3550 + }, + { + "epoch": 0.089, + "grad_norm": 29.25, + "grad_norm_var": 39.177083333333336, + "learning_rate": 0.0001, + "loss": 7.5603, + "loss/crossentropy": 2.111011874675751, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.20049102939665317, + "step": 3560 + }, + { + "epoch": 0.08925, + "grad_norm": 30.5, + "grad_norm_var": 24.4916015625, + "learning_rate": 0.0001, + "loss": 7.5407, + "loss/crossentropy": 2.091498665511608, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.19228591658174993, + "step": 3570 + }, + { + "epoch": 0.0895, + "grad_norm": 30.0, + "grad_norm_var": 21.582291666666666, + "learning_rate": 0.0001, + "loss": 7.5146, + "loss/crossentropy": 2.1603414684534075, + "loss/hidden": 3.586328125, + "loss/jsd": 0.0, + "loss/logits": 0.22721791528165342, + "step": 3580 + }, + { + "epoch": 0.08975, + "grad_norm": 29.25, + "grad_norm_var": 18.798893229166666, + "learning_rate": 0.0001, + "loss": 7.5322, + "loss/crossentropy": 2.1110543325543403, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.19287437647581102, + "step": 3590 + }, + { + "epoch": 0.09, + "grad_norm": 40.75, + "grad_norm_var": 15.987955729166666, + "learning_rate": 0.0001, + "loss": 7.55, + "loss/crossentropy": 2.211816768348217, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.19003268536180257, + "step": 3600 + }, + { + "epoch": 0.09025, + "grad_norm": 29.75, + "grad_norm_var": 14.333268229166666, + "learning_rate": 0.0001, + "loss": 7.6044, + "loss/crossentropy": 2.2199724197387694, + "loss/hidden": 3.45234375, + "loss/jsd": 0.0, + "loss/logits": 0.19937946014106273, + "step": 3610 + }, + { + "epoch": 0.0905, + "grad_norm": 29.5, + "grad_norm_var": 7.994205729166667, + "learning_rate": 0.0001, + "loss": 7.5672, + "loss/crossentropy": 2.1754990458488463, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.1872939633205533, + "step": 3620 + }, + { + "epoch": 0.09075, + "grad_norm": 29.625, + "grad_norm_var": 8.087955729166667, + "learning_rate": 0.0001, + "loss": 7.443, + "loss/crossentropy": 2.2714238941669462, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.1929878756403923, + "step": 3630 + }, + { + "epoch": 0.091, + "grad_norm": 30.25, + "grad_norm_var": 6.550455729166667, + "learning_rate": 0.0001, + "loss": 7.5694, + "loss/crossentropy": 2.1840985506772994, + "loss/hidden": 3.504296875, + "loss/jsd": 0.0, + "loss/logits": 0.19485698137432336, + "step": 3640 + }, + { + "epoch": 0.09125, + "grad_norm": 32.25, + "grad_norm_var": 7.7494140625, + "learning_rate": 0.0001, + "loss": 7.5571, + "loss/crossentropy": 2.1566817820072175, + "loss/hidden": 3.503125, + "loss/jsd": 0.0, + "loss/logits": 0.21431526727974415, + "step": 3650 + }, + { + "epoch": 0.0915, + "grad_norm": 34.75, + "grad_norm_var": 5.333268229166666, + "learning_rate": 0.0001, + "loss": 7.5377, + "loss/crossentropy": 2.0471107825636863, + "loss/hidden": 3.497265625, + "loss/jsd": 0.0, + "loss/logits": 0.20124074276536702, + "step": 3660 + }, + { + "epoch": 0.09175, + "grad_norm": 33.0, + "grad_norm_var": 7.4353515625, + "learning_rate": 0.0001, + "loss": 7.6125, + "loss/crossentropy": 2.1804106384515762, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.22469761371612548, + "step": 3670 + }, + { + "epoch": 0.092, + "grad_norm": 33.5, + "grad_norm_var": 4.3572265625, + "learning_rate": 0.0001, + "loss": 7.6491, + "loss/crossentropy": 2.2595307737588883, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.1894306108355522, + "step": 3680 + }, + { + "epoch": 0.09225, + "grad_norm": 36.25, + "grad_norm_var": 8.666666666666666, + "learning_rate": 0.0001, + "loss": 7.5857, + "loss/crossentropy": 2.0454846382141114, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.1902542944997549, + "step": 3690 + }, + { + "epoch": 0.0925, + "grad_norm": 28.625, + "grad_norm_var": 6.204166666666667, + "learning_rate": 0.0001, + "loss": 7.6065, + "loss/crossentropy": 2.1835698932409286, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.20310410112142563, + "step": 3700 + }, + { + "epoch": 0.09275, + "grad_norm": 35.25, + "grad_norm_var": 7.305989583333333, + "learning_rate": 0.0001, + "loss": 7.6004, + "loss/crossentropy": 2.0759357810020447, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.21512960288673638, + "step": 3710 + }, + { + "epoch": 0.093, + "grad_norm": 38.25, + "grad_norm_var": 19.737239583333334, + "learning_rate": 0.0001, + "loss": 7.6564, + "loss/crossentropy": 2.2961436778306963, + "loss/hidden": 3.318359375, + "loss/jsd": 0.0, + "loss/logits": 0.198493617400527, + "step": 3720 + }, + { + "epoch": 0.09325, + "grad_norm": 30.5, + "grad_norm_var": 17.01015625, + "learning_rate": 0.0001, + "loss": 7.6998, + "loss/crossentropy": 2.1192551463842393, + "loss/hidden": 3.510546875, + "loss/jsd": 0.0, + "loss/logits": 0.19986802861094474, + "step": 3730 + }, + { + "epoch": 0.0935, + "grad_norm": 36.25, + "grad_norm_var": 10.20625, + "learning_rate": 0.0001, + "loss": 7.4855, + "loss/crossentropy": 2.0999212980270388, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.19102167561650277, + "step": 3740 + }, + { + "epoch": 0.09375, + "grad_norm": 33.75, + "grad_norm_var": 7.556705729166667, + "learning_rate": 0.0001, + "loss": 7.6165, + "loss/crossentropy": 2.1783443093299866, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.1862858783453703, + "step": 3750 + }, + { + "epoch": 0.094, + "grad_norm": 28.125, + "grad_norm_var": 5.3603515625, + "learning_rate": 0.0001, + "loss": 7.5372, + "loss/crossentropy": 2.0993641003966332, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.17717746701091527, + "step": 3760 + }, + { + "epoch": 0.09425, + "grad_norm": 30.625, + "grad_norm_var": 5.297330729166666, + "learning_rate": 0.0001, + "loss": 7.614, + "loss/crossentropy": 2.1238688945770265, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.20928110517561435, + "step": 3770 + }, + { + "epoch": 0.0945, + "grad_norm": 29.75, + "grad_norm_var": 4.573893229166667, + "learning_rate": 0.0001, + "loss": 7.5672, + "loss/crossentropy": 2.203542584180832, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.19581303521990776, + "step": 3780 + }, + { + "epoch": 0.09475, + "grad_norm": 38.0, + "grad_norm_var": 5.322330729166667, + "learning_rate": 0.0001, + "loss": 7.5661, + "loss/crossentropy": 2.159272998571396, + "loss/hidden": 3.46953125, + "loss/jsd": 0.0, + "loss/logits": 0.1973846558481455, + "step": 3790 + }, + { + "epoch": 0.095, + "grad_norm": 30.75, + "grad_norm_var": 5.291666666666667, + "learning_rate": 0.0001, + "loss": 7.5495, + "loss/crossentropy": 2.188914805650711, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2011850569397211, + "step": 3800 + }, + { + "epoch": 0.09525, + "grad_norm": 31.125, + "grad_norm_var": 6.112434895833333, + "learning_rate": 0.0001, + "loss": 7.5776, + "loss/crossentropy": 2.1599004954099654, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.21134469993412494, + "step": 3810 + }, + { + "epoch": 0.0955, + "grad_norm": 35.25, + "grad_norm_var": 29.9431640625, + "learning_rate": 0.0001, + "loss": 7.6779, + "loss/crossentropy": 2.1196573287248612, + "loss/hidden": 3.576171875, + "loss/jsd": 0.0, + "loss/logits": 0.2261866919696331, + "step": 3820 + }, + { + "epoch": 0.09575, + "grad_norm": 37.5, + "grad_norm_var": 11.21640625, + "learning_rate": 0.0001, + "loss": 7.4703, + "loss/crossentropy": 2.149521693587303, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.1926643056795001, + "step": 3830 + }, + { + "epoch": 0.096, + "grad_norm": 32.0, + "grad_norm_var": 4.3494140625, + "learning_rate": 0.0001, + "loss": 7.5513, + "loss/crossentropy": 2.1707202911376955, + "loss/hidden": 3.425, + "loss/jsd": 0.0, + "loss/logits": 0.19597616009414195, + "step": 3840 + }, + { + "epoch": 0.09625, + "grad_norm": 29.5, + "grad_norm_var": 630.7197916666667, + "learning_rate": 0.0001, + "loss": 7.6191, + "loss/crossentropy": 2.0859180808067324, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.20257378201931714, + "step": 3850 + }, + { + "epoch": 0.0965, + "grad_norm": 59.5, + "grad_norm_var": 100.66223958333333, + "learning_rate": 0.0001, + "loss": 7.5774, + "loss/crossentropy": 2.1815837740898134, + "loss/hidden": 3.426953125, + "loss/jsd": 0.0, + "loss/logits": 0.18438388928771018, + "step": 3860 + }, + { + "epoch": 0.09675, + "grad_norm": 36.75, + "grad_norm_var": 66.29212239583333, + "learning_rate": 0.0001, + "loss": 7.5808, + "loss/crossentropy": 2.0505243610590695, + "loss/hidden": 3.440234375, + "loss/jsd": 0.0, + "loss/logits": 0.18692483827471734, + "step": 3870 + }, + { + "epoch": 0.097, + "grad_norm": 30.375, + "grad_norm_var": 4.266080729166666, + "learning_rate": 0.0001, + "loss": 7.5664, + "loss/crossentropy": 2.2033773183822634, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.19963842574507, + "step": 3880 + }, + { + "epoch": 0.09725, + "grad_norm": 32.25, + "grad_norm_var": 5.0025390625, + "learning_rate": 0.0001, + "loss": 7.59, + "loss/crossentropy": 2.1328989803791045, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.19634215533733368, + "step": 3890 + }, + { + "epoch": 0.0975, + "grad_norm": 34.0, + "grad_norm_var": 2.1322916666666667, + "learning_rate": 0.0001, + "loss": 7.6407, + "loss/crossentropy": 2.170455330610275, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.19931643791496753, + "step": 3900 + }, + { + "epoch": 0.09775, + "grad_norm": 34.75, + "grad_norm_var": 3.558333333333333, + "learning_rate": 0.0001, + "loss": 7.5899, + "loss/crossentropy": 2.1301774442195893, + "loss/hidden": 3.468359375, + "loss/jsd": 0.0, + "loss/logits": 0.19248898830264807, + "step": 3910 + }, + { + "epoch": 0.098, + "grad_norm": 33.75, + "grad_norm_var": 3.3478515625, + "learning_rate": 0.0001, + "loss": 7.6526, + "loss/crossentropy": 2.1600559651851654, + "loss/hidden": 3.541796875, + "loss/jsd": 0.0, + "loss/logits": 0.22827934101223946, + "step": 3920 + }, + { + "epoch": 0.09825, + "grad_norm": 30.75, + "grad_norm_var": 6.117643229166666, + "learning_rate": 0.0001, + "loss": 7.5635, + "loss/crossentropy": 2.0728287249803543, + "loss/hidden": 3.534765625, + "loss/jsd": 0.0, + "loss/logits": 0.20216128267347813, + "step": 3930 + }, + { + "epoch": 0.0985, + "grad_norm": 33.25, + "grad_norm_var": 7.5087890625, + "learning_rate": 0.0001, + "loss": 7.7253, + "loss/crossentropy": 2.1514860481023788, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.21096254773437978, + "step": 3940 + }, + { + "epoch": 0.09875, + "grad_norm": 34.25, + "grad_norm_var": 2.6207682291666665, + "learning_rate": 0.0001, + "loss": 7.538, + "loss/crossentropy": 2.169696259498596, + "loss/hidden": 3.48671875, + "loss/jsd": 0.0, + "loss/logits": 0.21556729041039943, + "step": 3950 + }, + { + "epoch": 0.099, + "grad_norm": 31.25, + "grad_norm_var": 5.3150390625, + "learning_rate": 0.0001, + "loss": 7.54, + "loss/crossentropy": 2.1874313950538635, + "loss/hidden": 3.399609375, + "loss/jsd": 0.0, + "loss/logits": 0.19146509394049643, + "step": 3960 + }, + { + "epoch": 0.09925, + "grad_norm": 32.0, + "grad_norm_var": 23.437239583333334, + "learning_rate": 0.0001, + "loss": 7.6292, + "loss/crossentropy": 2.165771406888962, + "loss/hidden": 3.592578125, + "loss/jsd": 0.0, + "loss/logits": 0.20717886611819267, + "step": 3970 + }, + { + "epoch": 0.0995, + "grad_norm": 31.375, + "grad_norm_var": 407.4603515625, + "learning_rate": 0.0001, + "loss": 7.7177, + "loss/crossentropy": 2.1150890797376634, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.19676875434815883, + "step": 3980 + }, + { + "epoch": 0.09975, + "grad_norm": 33.5, + "grad_norm_var": 8.463997395833333, + "learning_rate": 0.0001, + "loss": 7.6397, + "loss/crossentropy": 2.13585125207901, + "loss/hidden": 3.597265625, + "loss/jsd": 0.0, + "loss/logits": 0.2018281053751707, + "step": 3990 + }, + { + "epoch": 0.1, + "grad_norm": 36.0, + "grad_norm_var": 8.787434895833334, + "learning_rate": 0.0001, + "loss": 7.6957, + "loss/crossentropy": 2.062576304376125, + "loss/hidden": 3.556640625, + "loss/jsd": 0.0, + "loss/logits": 0.20351322293281554, + "step": 4000 + }, + { + "epoch": 0.10025, + "grad_norm": 32.25, + "grad_norm_var": 2.6931640625, + "learning_rate": 0.0001, + "loss": 7.5171, + "loss/crossentropy": 2.1093045681715012, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.19258121848106385, + "step": 4010 + }, + { + "epoch": 0.1005, + "grad_norm": 38.5, + "grad_norm_var": 6.976497395833333, + "learning_rate": 0.0001, + "loss": 7.7046, + "loss/crossentropy": 2.1054726734757425, + "loss/hidden": 3.515234375, + "loss/jsd": 0.0, + "loss/logits": 0.18498583231121302, + "step": 4020 + }, + { + "epoch": 0.10075, + "grad_norm": 32.25, + "grad_norm_var": 16.50390625, + "learning_rate": 0.0001, + "loss": 7.6242, + "loss/crossentropy": 2.0566830962896345, + "loss/hidden": 3.537890625, + "loss/jsd": 0.0, + "loss/logits": 0.21257028207182885, + "step": 4030 + }, + { + "epoch": 0.101, + "grad_norm": 30.125, + "grad_norm_var": 21.61640625, + "learning_rate": 0.0001, + "loss": 7.5611, + "loss/crossentropy": 2.0847130313515665, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.19317954257130623, + "step": 4040 + }, + { + "epoch": 0.10125, + "grad_norm": 31.0, + "grad_norm_var": 16.408268229166666, + "learning_rate": 0.0001, + "loss": 7.5456, + "loss/crossentropy": 2.116552269458771, + "loss/hidden": 3.444140625, + "loss/jsd": 0.0, + "loss/logits": 0.1943613938987255, + "step": 4050 + }, + { + "epoch": 0.1015, + "grad_norm": 31.25, + "grad_norm_var": 17.984375, + "learning_rate": 0.0001, + "loss": 7.6259, + "loss/crossentropy": 2.2868128657341003, + "loss/hidden": 3.494921875, + "loss/jsd": 0.0, + "loss/logits": 0.2375142715871334, + "step": 4060 + }, + { + "epoch": 0.10175, + "grad_norm": 29.625, + "grad_norm_var": 2.2025390625, + "learning_rate": 0.0001, + "loss": 7.5637, + "loss/crossentropy": 2.092506285011768, + "loss/hidden": 3.466796875, + "loss/jsd": 0.0, + "loss/logits": 0.1903899708762765, + "step": 4070 + }, + { + "epoch": 0.102, + "grad_norm": 30.25, + "grad_norm_var": 55.06764322916667, + "learning_rate": 0.0001, + "loss": 7.6365, + "loss/crossentropy": 2.2538520216941835, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.2204372201114893, + "step": 4080 + }, + { + "epoch": 0.10225, + "grad_norm": 48.0, + "grad_norm_var": 66.06555989583333, + "learning_rate": 0.0001, + "loss": 7.6539, + "loss/crossentropy": 2.198161965608597, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.18744452036917209, + "step": 4090 + }, + { + "epoch": 0.1025, + "grad_norm": 31.625, + "grad_norm_var": 25.937239583333334, + "learning_rate": 0.0001, + "loss": 7.5872, + "loss/crossentropy": 2.161240801215172, + "loss/hidden": 3.548046875, + "loss/jsd": 0.0, + "loss/logits": 0.19324529767036439, + "step": 4100 + }, + { + "epoch": 0.10275, + "grad_norm": 31.125, + "grad_norm_var": 2.9613932291666667, + "learning_rate": 0.0001, + "loss": 7.5854, + "loss/crossentropy": 2.185439817607403, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.19476189762353896, + "step": 4110 + }, + { + "epoch": 0.103, + "grad_norm": 29.0, + "grad_norm_var": 5.4556640625, + "learning_rate": 0.0001, + "loss": 7.6728, + "loss/crossentropy": 2.1513148337602614, + "loss/hidden": 3.4875, + "loss/jsd": 0.0, + "loss/logits": 0.20135847330093384, + "step": 4120 + }, + { + "epoch": 0.10325, + "grad_norm": 36.25, + "grad_norm_var": 4.3994140625, + "learning_rate": 0.0001, + "loss": 7.6006, + "loss/crossentropy": 2.0776968479156492, + "loss/hidden": 3.49140625, + "loss/jsd": 0.0, + "loss/logits": 0.19831380508840085, + "step": 4130 + }, + { + "epoch": 0.1035, + "grad_norm": 33.0, + "grad_norm_var": 4.205143229166667, + "learning_rate": 0.0001, + "loss": 7.6157, + "loss/crossentropy": 2.0971890702843665, + "loss/hidden": 3.64140625, + "loss/jsd": 0.0, + "loss/logits": 0.2015662420541048, + "step": 4140 + }, + { + "epoch": 0.10375, + "grad_norm": 35.5, + "grad_norm_var": 23.512239583333333, + "learning_rate": 0.0001, + "loss": 7.6638, + "loss/crossentropy": 2.128816670179367, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.19698726907372474, + "step": 4150 + }, + { + "epoch": 0.104, + "grad_norm": 30.75, + "grad_norm_var": 22.026822916666667, + "learning_rate": 0.0001, + "loss": 7.6175, + "loss/crossentropy": 2.0965539067983627, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.22271894477307796, + "step": 4160 + }, + { + "epoch": 0.10425, + "grad_norm": 32.5, + "grad_norm_var": 2.426041666666667, + "learning_rate": 0.0001, + "loss": 7.555, + "loss/crossentropy": 2.215752348303795, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.1962002281099558, + "step": 4170 + }, + { + "epoch": 0.1045, + "grad_norm": 39.5, + "grad_norm_var": 23.042708333333334, + "learning_rate": 0.0001, + "loss": 7.6246, + "loss/crossentropy": 2.0542988061904905, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.19908196646720172, + "step": 4180 + }, + { + "epoch": 0.10475, + "grad_norm": 34.75, + "grad_norm_var": 6.342643229166667, + "learning_rate": 0.0001, + "loss": 7.487, + "loss/crossentropy": 2.2133218079805372, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.18823296912014484, + "step": 4190 + }, + { + "epoch": 0.105, + "grad_norm": 31.125, + "grad_norm_var": 173.98430989583332, + "learning_rate": 0.0001, + "loss": 7.643, + "loss/crossentropy": 2.1089532509446145, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.20591201409697532, + "step": 4200 + }, + { + "epoch": 0.10525, + "grad_norm": 36.75, + "grad_norm_var": 7.351041666666666, + "learning_rate": 0.0001, + "loss": 7.5643, + "loss/crossentropy": 2.289369744062424, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.19611021652817726, + "step": 4210 + }, + { + "epoch": 0.1055, + "grad_norm": 35.75, + "grad_norm_var": 6.539322916666666, + "learning_rate": 0.0001, + "loss": 7.6756, + "loss/crossentropy": 2.1963788866996765, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.19495316371321678, + "step": 4220 + }, + { + "epoch": 0.10575, + "grad_norm": 31.875, + "grad_norm_var": 6.276041666666667, + "learning_rate": 0.0001, + "loss": 7.5552, + "loss/crossentropy": 2.078681927919388, + "loss/hidden": 3.441015625, + "loss/jsd": 0.0, + "loss/logits": 0.1941295877099037, + "step": 4230 + }, + { + "epoch": 0.106, + "grad_norm": 44.5, + "grad_norm_var": 34.80520833333333, + "learning_rate": 0.0001, + "loss": 7.6072, + "loss/crossentropy": 2.222626182436943, + "loss/hidden": 3.2828125, + "loss/jsd": 0.0, + "loss/logits": 0.1848284311592579, + "step": 4240 + }, + { + "epoch": 0.10625, + "grad_norm": 31.375, + "grad_norm_var": 35.084375, + "learning_rate": 0.0001, + "loss": 7.6435, + "loss/crossentropy": 2.152433153986931, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.1829435657709837, + "step": 4250 + }, + { + "epoch": 0.1065, + "grad_norm": 31.625, + "grad_norm_var": 4.431705729166667, + "learning_rate": 0.0001, + "loss": 7.5977, + "loss/crossentropy": 2.1493207842111586, + "loss/hidden": 3.466015625, + "loss/jsd": 0.0, + "loss/logits": 0.19450047723948954, + "step": 4260 + }, + { + "epoch": 0.10675, + "grad_norm": 30.0, + "grad_norm_var": 8.024739583333334, + "learning_rate": 0.0001, + "loss": 7.5423, + "loss/crossentropy": 2.0623584628105163, + "loss/hidden": 3.42890625, + "loss/jsd": 0.0, + "loss/logits": 0.19236240349709988, + "step": 4270 + }, + { + "epoch": 0.107, + "grad_norm": 51.75, + "grad_norm_var": 105.76145833333334, + "learning_rate": 0.0001, + "loss": 7.577, + "loss/crossentropy": 2.103591626882553, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.18201838787645103, + "step": 4280 + }, + { + "epoch": 0.10725, + "grad_norm": 33.25, + "grad_norm_var": 144.54973958333332, + "learning_rate": 0.0001, + "loss": 7.7078, + "loss/crossentropy": 2.1076686546206473, + "loss/hidden": 3.56328125, + "loss/jsd": 0.0, + "loss/logits": 0.19488887619227171, + "step": 4290 + }, + { + "epoch": 0.1075, + "grad_norm": 32.0, + "grad_norm_var": 190.62057291666667, + "learning_rate": 0.0001, + "loss": 7.5811, + "loss/crossentropy": 2.168550156056881, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.1908732896670699, + "step": 4300 + }, + { + "epoch": 0.10775, + "grad_norm": 28.875, + "grad_norm_var": 149.1822265625, + "learning_rate": 0.0001, + "loss": 7.5999, + "loss/crossentropy": 2.100285217165947, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.19184609185904264, + "step": 4310 + }, + { + "epoch": 0.108, + "grad_norm": 44.25, + "grad_norm_var": 12.502018229166667, + "learning_rate": 0.0001, + "loss": 7.7075, + "loss/crossentropy": 2.0968768775463102, + "loss/hidden": 3.51796875, + "loss/jsd": 0.0, + "loss/logits": 0.20168912429362534, + "step": 4320 + }, + { + "epoch": 0.10825, + "grad_norm": 31.25, + "grad_norm_var": 12.81875, + "learning_rate": 0.0001, + "loss": 7.5576, + "loss/crossentropy": 2.1037441343069077, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.17891897186636924, + "step": 4330 + }, + { + "epoch": 0.1085, + "grad_norm": 33.5, + "grad_norm_var": 2.700455729166667, + "learning_rate": 0.0001, + "loss": 7.5097, + "loss/crossentropy": 2.210876139998436, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.19316814988851547, + "step": 4340 + }, + { + "epoch": 0.10875, + "grad_norm": 31.125, + "grad_norm_var": 17.4666015625, + "learning_rate": 0.0001, + "loss": 7.5755, + "loss/crossentropy": 2.1331328481435774, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.20047767795622348, + "step": 4350 + }, + { + "epoch": 0.109, + "grad_norm": 32.75, + "grad_norm_var": 3.198372395833333, + "learning_rate": 0.0001, + "loss": 7.5385, + "loss/crossentropy": 2.153067779541016, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.20173794813454152, + "step": 4360 + }, + { + "epoch": 0.10925, + "grad_norm": 32.0, + "grad_norm_var": 4.010416666666667, + "learning_rate": 0.0001, + "loss": 7.5426, + "loss/crossentropy": 2.165090653300285, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.18712956104427575, + "step": 4370 + }, + { + "epoch": 0.1095, + "grad_norm": 30.0, + "grad_norm_var": 1.77265625, + "learning_rate": 0.0001, + "loss": 7.605, + "loss/crossentropy": 2.217823189496994, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.20003505125641824, + "step": 4380 + }, + { + "epoch": 0.10975, + "grad_norm": 29.875, + "grad_norm_var": 1.7603515625, + "learning_rate": 0.0001, + "loss": 7.507, + "loss/crossentropy": 2.138795481622219, + "loss/hidden": 3.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.1892871480435133, + "step": 4390 + }, + { + "epoch": 0.11, + "grad_norm": 30.375, + "grad_norm_var": 22.449739583333333, + "learning_rate": 0.0001, + "loss": 7.6965, + "loss/crossentropy": 2.22740375995636, + "loss/hidden": 3.5578125, + "loss/jsd": 0.0, + "loss/logits": 0.2264870759099722, + "step": 4400 + }, + { + "epoch": 0.11025, + "grad_norm": 29.5, + "grad_norm_var": 37.71666666666667, + "learning_rate": 0.0001, + "loss": 7.604, + "loss/crossentropy": 2.1781785815954207, + "loss/hidden": 3.471875, + "loss/jsd": 0.0, + "loss/logits": 0.2012148156762123, + "step": 4410 + }, + { + "epoch": 0.1105, + "grad_norm": 33.0, + "grad_norm_var": 25.305989583333332, + "learning_rate": 0.0001, + "loss": 7.5767, + "loss/crossentropy": 2.0333445832133292, + "loss/hidden": 3.55703125, + "loss/jsd": 0.0, + "loss/logits": 0.1852614250034094, + "step": 4420 + }, + { + "epoch": 0.11075, + "grad_norm": 32.0, + "grad_norm_var": 0.9760416666666667, + "learning_rate": 0.0001, + "loss": 7.6218, + "loss/crossentropy": 2.2101993292570112, + "loss/hidden": 3.505078125, + "loss/jsd": 0.0, + "loss/logits": 0.2069159124046564, + "step": 4430 + }, + { + "epoch": 0.111, + "grad_norm": 30.5, + "grad_norm_var": 7.8837890625, + "learning_rate": 0.0001, + "loss": 7.6543, + "loss/crossentropy": 2.0182371377944945, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.19055260960012674, + "step": 4440 + }, + { + "epoch": 0.11125, + "grad_norm": 29.0, + "grad_norm_var": 18.167643229166668, + "learning_rate": 0.0001, + "loss": 7.5559, + "loss/crossentropy": 2.209046494960785, + "loss/hidden": 3.4546875, + "loss/jsd": 0.0, + "loss/logits": 0.1926161792129278, + "step": 4450 + }, + { + "epoch": 0.1115, + "grad_norm": 30.375, + "grad_norm_var": 19.9634765625, + "learning_rate": 0.0001, + "loss": 7.5527, + "loss/crossentropy": 2.2265418380498887, + "loss/hidden": 3.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.17907681576907636, + "step": 4460 + }, + { + "epoch": 0.11175, + "grad_norm": 35.25, + "grad_norm_var": 3.0434895833333333, + "learning_rate": 0.0001, + "loss": 7.6046, + "loss/crossentropy": 2.1534146428108216, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.19181067440658808, + "step": 4470 + }, + { + "epoch": 0.112, + "grad_norm": 31.375, + "grad_norm_var": 2.161393229166667, + "learning_rate": 0.0001, + "loss": 7.5151, + "loss/crossentropy": 2.2303753718733788, + "loss/hidden": 3.46796875, + "loss/jsd": 0.0, + "loss/logits": 0.19312014058232307, + "step": 4480 + }, + { + "epoch": 0.11225, + "grad_norm": 31.25, + "grad_norm_var": 2.071875, + "learning_rate": 0.0001, + "loss": 7.5733, + "loss/crossentropy": 2.2565354451537134, + "loss/hidden": 3.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.19555974584072827, + "step": 4490 + }, + { + "epoch": 0.1125, + "grad_norm": 30.125, + "grad_norm_var": 6.21015625, + "learning_rate": 0.0001, + "loss": 7.5721, + "loss/crossentropy": 2.1691703468561174, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.19526711832731963, + "step": 4500 + }, + { + "epoch": 0.11275, + "grad_norm": 30.75, + "grad_norm_var": 34.985416666666666, + "learning_rate": 0.0001, + "loss": 7.6031, + "loss/crossentropy": 2.191486781835556, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.2022854283452034, + "step": 4510 + }, + { + "epoch": 0.113, + "grad_norm": 32.75, + "grad_norm_var": 34.91041666666667, + "learning_rate": 0.0001, + "loss": 7.566, + "loss/crossentropy": 2.07875557243824, + "loss/hidden": 3.510546875, + "loss/jsd": 0.0, + "loss/logits": 0.20517632961273194, + "step": 4520 + }, + { + "epoch": 0.11325, + "grad_norm": 30.25, + "grad_norm_var": 3.1869140625, + "learning_rate": 0.0001, + "loss": 7.6204, + "loss/crossentropy": 2.1490323692560196, + "loss/hidden": 3.55390625, + "loss/jsd": 0.0, + "loss/logits": 0.20650937724858523, + "step": 4530 + }, + { + "epoch": 0.1135, + "grad_norm": 31.625, + "grad_norm_var": 3.6639973958333334, + "learning_rate": 0.0001, + "loss": 7.5605, + "loss/crossentropy": 2.19907369017601, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.18378095962107183, + "step": 4540 + }, + { + "epoch": 0.11375, + "grad_norm": 33.0, + "grad_norm_var": 3.34140625, + "learning_rate": 0.0001, + "loss": 7.5943, + "loss/crossentropy": 2.0509427756071092, + "loss/hidden": 3.46796875, + "loss/jsd": 0.0, + "loss/logits": 0.2012764386832714, + "step": 4550 + }, + { + "epoch": 0.114, + "grad_norm": 32.25, + "grad_norm_var": 55.52916666666667, + "learning_rate": 0.0001, + "loss": 7.5349, + "loss/crossentropy": 2.247987303137779, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.2166461084038019, + "step": 4560 + }, + { + "epoch": 0.11425, + "grad_norm": 31.0, + "grad_norm_var": 59.064518229166666, + "learning_rate": 0.0001, + "loss": 7.5699, + "loss/crossentropy": 2.256947749853134, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.19454225115478038, + "step": 4570 + }, + { + "epoch": 0.1145, + "grad_norm": 30.5, + "grad_norm_var": 12.089583333333334, + "learning_rate": 0.0001, + "loss": 7.5587, + "loss/crossentropy": 2.230518189072609, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.19083393104374408, + "step": 4580 + }, + { + "epoch": 0.11475, + "grad_norm": 31.5, + "grad_norm_var": 18.3056640625, + "learning_rate": 0.0001, + "loss": 7.664, + "loss/crossentropy": 2.113222661614418, + "loss/hidden": 3.51640625, + "loss/jsd": 0.0, + "loss/logits": 0.20051947552710772, + "step": 4590 + }, + { + "epoch": 0.115, + "grad_norm": 29.25, + "grad_norm_var": 26.7837890625, + "learning_rate": 0.0001, + "loss": 7.6508, + "loss/crossentropy": 2.2963487923145296, + "loss/hidden": 3.430859375, + "loss/jsd": 0.0, + "loss/logits": 0.21824662014842033, + "step": 4600 + }, + { + "epoch": 0.11525, + "grad_norm": 31.75, + "grad_norm_var": 3.249739583333333, + "learning_rate": 0.0001, + "loss": 7.5491, + "loss/crossentropy": 2.2380147099494936, + "loss/hidden": 3.488671875, + "loss/jsd": 0.0, + "loss/logits": 0.2021485272794962, + "step": 4610 + }, + { + "epoch": 0.1155, + "grad_norm": 34.0, + "grad_norm_var": 3.2072265625, + "learning_rate": 0.0001, + "loss": 7.6035, + "loss/crossentropy": 2.1933206588029863, + "loss/hidden": 3.519921875, + "loss/jsd": 0.0, + "loss/logits": 0.2073045803233981, + "step": 4620 + }, + { + "epoch": 0.11575, + "grad_norm": 30.375, + "grad_norm_var": 25.005989583333335, + "learning_rate": 0.0001, + "loss": 7.577, + "loss/crossentropy": 2.3104471057653426, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.20087463557720184, + "step": 4630 + }, + { + "epoch": 0.116, + "grad_norm": 33.5, + "grad_norm_var": 539.2056640625, + "learning_rate": 0.0001, + "loss": 7.5764, + "loss/crossentropy": 2.1786745607852938, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.19096632562577726, + "step": 4640 + }, + { + "epoch": 0.11625, + "grad_norm": 30.5, + "grad_norm_var": 132.26139322916666, + "learning_rate": 0.0001, + "loss": 7.7424, + "loss/crossentropy": 2.1628643572330475, + "loss/hidden": 3.526171875, + "loss/jsd": 0.0, + "loss/logits": 0.1994694285094738, + "step": 4650 + }, + { + "epoch": 0.1165, + "grad_norm": 40.25, + "grad_norm_var": 12.672330729166667, + "learning_rate": 0.0001, + "loss": 7.6456, + "loss/crossentropy": 2.0927803248167036, + "loss/hidden": 3.507421875, + "loss/jsd": 0.0, + "loss/logits": 0.20613454841077328, + "step": 4660 + }, + { + "epoch": 0.11675, + "grad_norm": 37.25, + "grad_norm_var": 7.458072916666667, + "learning_rate": 0.0001, + "loss": 7.622, + "loss/crossentropy": 2.311227411031723, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.21675110273063183, + "step": 4670 + }, + { + "epoch": 0.117, + "grad_norm": 34.5, + "grad_norm_var": 138.99524739583333, + "learning_rate": 0.0001, + "loss": 7.6633, + "loss/crossentropy": 2.1860357582569123, + "loss/hidden": 3.509375, + "loss/jsd": 0.0, + "loss/logits": 0.20624178424477577, + "step": 4680 + }, + { + "epoch": 0.11725, + "grad_norm": 38.25, + "grad_norm_var": 11.161393229166666, + "learning_rate": 0.0001, + "loss": 7.654, + "loss/crossentropy": 2.246461641788483, + "loss/hidden": 3.435546875, + "loss/jsd": 0.0, + "loss/logits": 0.19816880766302347, + "step": 4690 + }, + { + "epoch": 0.1175, + "grad_norm": 98.5, + "grad_norm_var": 275.63645833333334, + "learning_rate": 0.0001, + "loss": 7.6871, + "loss/crossentropy": 2.1662321478128432, + "loss/hidden": 3.52421875, + "loss/jsd": 0.0, + "loss/logits": 0.23473294898867608, + "step": 4700 + }, + { + "epoch": 0.11775, + "grad_norm": 32.25, + "grad_norm_var": 273.496875, + "learning_rate": 0.0001, + "loss": 7.5664, + "loss/crossentropy": 2.1411470264196395, + "loss/hidden": 3.466015625, + "loss/jsd": 0.0, + "loss/logits": 0.20816716887056827, + "step": 4710 + }, + { + "epoch": 0.118, + "grad_norm": 30.625, + "grad_norm_var": 1.4697916666666666, + "learning_rate": 0.0001, + "loss": 7.5106, + "loss/crossentropy": 2.184460151195526, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.19597234334796668, + "step": 4720 + }, + { + "epoch": 0.11825, + "grad_norm": 32.75, + "grad_norm_var": 151.39368489583333, + "learning_rate": 0.0001, + "loss": 7.7143, + "loss/crossentropy": 2.18603872358799, + "loss/hidden": 3.509375, + "loss/jsd": 0.0, + "loss/logits": 0.21333505641669034, + "step": 4730 + }, + { + "epoch": 0.1185, + "grad_norm": 31.125, + "grad_norm_var": 41.80774739583333, + "learning_rate": 0.0001, + "loss": 7.5158, + "loss/crossentropy": 2.038896057009697, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.18354782909154893, + "step": 4740 + }, + { + "epoch": 0.11875, + "grad_norm": 31.5, + "grad_norm_var": 20.727083333333333, + "learning_rate": 0.0001, + "loss": 7.6044, + "loss/crossentropy": 2.07361024916172, + "loss/hidden": 3.505078125, + "loss/jsd": 0.0, + "loss/logits": 0.2144785810261965, + "step": 4750 + }, + { + "epoch": 0.119, + "grad_norm": 32.0, + "grad_norm_var": 14.9994140625, + "learning_rate": 0.0001, + "loss": 7.5936, + "loss/crossentropy": 2.172766661643982, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.1904382836073637, + "step": 4760 + }, + { + "epoch": 0.11925, + "grad_norm": 33.0, + "grad_norm_var": 11.213541666666666, + "learning_rate": 0.0001, + "loss": 7.6024, + "loss/crossentropy": 2.2863214761018753, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.20077989026904106, + "step": 4770 + }, + { + "epoch": 0.1195, + "grad_norm": 32.0, + "grad_norm_var": 23.308268229166668, + "learning_rate": 0.0001, + "loss": 7.5743, + "loss/crossentropy": 2.172411371767521, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.1879224268719554, + "step": 4780 + }, + { + "epoch": 0.11975, + "grad_norm": 29.625, + "grad_norm_var": 27.662239583333335, + "learning_rate": 0.0001, + "loss": 7.6003, + "loss/crossentropy": 2.061740070581436, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.196690865047276, + "step": 4790 + }, + { + "epoch": 0.12, + "grad_norm": 34.5, + "grad_norm_var": 9.820768229166667, + "learning_rate": 0.0001, + "loss": 7.5893, + "loss/crossentropy": 2.1725818127393723, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18248203694820403, + "step": 4800 + }, + { + "epoch": 0.12025, + "grad_norm": 31.375, + "grad_norm_var": 5.5478515625, + "learning_rate": 0.0001, + "loss": 7.5542, + "loss/crossentropy": 2.0823758363723757, + "loss/hidden": 3.503125, + "loss/jsd": 0.0, + "loss/logits": 0.19437449853867292, + "step": 4810 + }, + { + "epoch": 0.1205, + "grad_norm": 31.125, + "grad_norm_var": 2.8754557291666667, + "learning_rate": 0.0001, + "loss": 7.7018, + "loss/crossentropy": 2.220066267251968, + "loss/hidden": 3.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.20239269211888314, + "step": 4820 + }, + { + "epoch": 0.12075, + "grad_norm": 33.25, + "grad_norm_var": 6.868489583333333, + "learning_rate": 0.0001, + "loss": 7.5765, + "loss/crossentropy": 2.0765403911471365, + "loss/hidden": 3.435546875, + "loss/jsd": 0.0, + "loss/logits": 0.19827509336173535, + "step": 4830 + }, + { + "epoch": 0.121, + "grad_norm": 34.0, + "grad_norm_var": 27.168489583333333, + "learning_rate": 0.0001, + "loss": 7.7248, + "loss/crossentropy": 2.1397932201623915, + "loss/hidden": 3.50546875, + "loss/jsd": 0.0, + "loss/logits": 0.204028557613492, + "step": 4840 + }, + { + "epoch": 0.12125, + "grad_norm": 38.0, + "grad_norm_var": 22.633072916666666, + "learning_rate": 0.0001, + "loss": 7.6658, + "loss/crossentropy": 2.3317618519067764, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.19565313905477524, + "step": 4850 + }, + { + "epoch": 0.1215, + "grad_norm": 29.875, + "grad_norm_var": 4.6041015625, + "learning_rate": 0.0001, + "loss": 7.5415, + "loss/crossentropy": 2.060061091184616, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.19083615019917488, + "step": 4860 + }, + { + "epoch": 0.12175, + "grad_norm": 33.75, + "grad_norm_var": 5.06875, + "learning_rate": 0.0001, + "loss": 7.651, + "loss/crossentropy": 2.158045071363449, + "loss/hidden": 3.52421875, + "loss/jsd": 0.0, + "loss/logits": 0.2148456061258912, + "step": 4870 + }, + { + "epoch": 0.122, + "grad_norm": 30.625, + "grad_norm_var": 16.091666666666665, + "learning_rate": 0.0001, + "loss": 7.5793, + "loss/crossentropy": 2.0583921030163763, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.19111349806189537, + "step": 4880 + }, + { + "epoch": 0.12225, + "grad_norm": 33.25, + "grad_norm_var": 17.422330729166667, + "learning_rate": 0.0001, + "loss": 7.6588, + "loss/crossentropy": 2.1186117827892303, + "loss/hidden": 3.500390625, + "loss/jsd": 0.0, + "loss/logits": 0.19436944983899593, + "step": 4890 + }, + { + "epoch": 0.1225, + "grad_norm": 36.75, + "grad_norm_var": 3.2676432291666666, + "learning_rate": 0.0001, + "loss": 7.4748, + "loss/crossentropy": 2.2382855489850044, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.19216692261397839, + "step": 4900 + }, + { + "epoch": 0.12275, + "grad_norm": 29.375, + "grad_norm_var": 31.048958333333335, + "learning_rate": 0.0001, + "loss": 7.5018, + "loss/crossentropy": 2.1136436641216276, + "loss/hidden": 3.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.19329534620046615, + "step": 4910 + }, + { + "epoch": 0.123, + "grad_norm": 40.5, + "grad_norm_var": 8.3572265625, + "learning_rate": 0.0001, + "loss": 7.5661, + "loss/crossentropy": 2.043731611967087, + "loss/hidden": 3.472265625, + "loss/jsd": 0.0, + "loss/logits": 0.18812808189541103, + "step": 4920 + }, + { + "epoch": 0.12325, + "grad_norm": 31.25, + "grad_norm_var": 16.280989583333334, + "learning_rate": 0.0001, + "loss": 7.5565, + "loss/crossentropy": 2.129016649723053, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.1941742904484272, + "step": 4930 + }, + { + "epoch": 0.1235, + "grad_norm": 30.125, + "grad_norm_var": 1.4504557291666667, + "learning_rate": 0.0001, + "loss": 7.5356, + "loss/crossentropy": 2.1981059461832047, + "loss/hidden": 3.533203125, + "loss/jsd": 0.0, + "loss/logits": 0.19572316966950892, + "step": 4940 + }, + { + "epoch": 0.12375, + "grad_norm": 54.5, + "grad_norm_var": 36.024739583333336, + "learning_rate": 0.0001, + "loss": 7.5463, + "loss/crossentropy": 2.107844803482294, + "loss/hidden": 3.551953125, + "loss/jsd": 0.0, + "loss/logits": 0.18902508020401002, + "step": 4950 + }, + { + "epoch": 0.124, + "grad_norm": 33.0, + "grad_norm_var": 64.778125, + "learning_rate": 0.0001, + "loss": 7.6053, + "loss/crossentropy": 2.1164773657917975, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.19264463931322098, + "step": 4960 + }, + { + "epoch": 0.12425, + "grad_norm": 31.5, + "grad_norm_var": 42.0666015625, + "learning_rate": 0.0001, + "loss": 7.5774, + "loss/crossentropy": 2.176864555478096, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.20114411041140556, + "step": 4970 + }, + { + "epoch": 0.1245, + "grad_norm": 32.25, + "grad_norm_var": 33.07180989583333, + "learning_rate": 0.0001, + "loss": 7.5698, + "loss/crossentropy": 2.1456878036260605, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.19218399338424205, + "step": 4980 + }, + { + "epoch": 0.12475, + "grad_norm": 33.25, + "grad_norm_var": 9.938997395833333, + "learning_rate": 0.0001, + "loss": 7.5467, + "loss/crossentropy": 2.3402266025543215, + "loss/hidden": 3.290625, + "loss/jsd": 0.0, + "loss/logits": 0.18540082685649395, + "step": 4990 + }, + { + "epoch": 0.125, + "grad_norm": 32.5, + "grad_norm_var": 1.6580729166666666, + "learning_rate": 0.0001, + "loss": 7.5883, + "loss/crossentropy": 2.186999189853668, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.21516974158585073, + "step": 5000 + }, + { + "epoch": 0.12525, + "grad_norm": 33.5, + "grad_norm_var": 4.194205729166667, + "learning_rate": 0.0001, + "loss": 7.5936, + "loss/crossentropy": 2.171919286251068, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.19737605061382055, + "step": 5010 + }, + { + "epoch": 0.1255, + "grad_norm": 32.0, + "grad_norm_var": 0.67265625, + "learning_rate": 0.0001, + "loss": 7.509, + "loss/crossentropy": 2.2188323110342028, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.18663883544504642, + "step": 5020 + }, + { + "epoch": 0.12575, + "grad_norm": 29.75, + "grad_norm_var": 10.642643229166667, + "learning_rate": 0.0001, + "loss": 7.6301, + "loss/crossentropy": 2.191204625368118, + "loss/hidden": 3.348046875, + "loss/jsd": 0.0, + "loss/logits": 0.1946074590086937, + "step": 5030 + }, + { + "epoch": 0.126, + "grad_norm": 31.125, + "grad_norm_var": 1.6541015625, + "learning_rate": 0.0001, + "loss": 7.502, + "loss/crossentropy": 2.1200972706079484, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.19346977435052395, + "step": 5040 + }, + { + "epoch": 0.12625, + "grad_norm": 44.75, + "grad_norm_var": 16.30390625, + "learning_rate": 0.0001, + "loss": 7.5705, + "loss/crossentropy": 2.1876573234796526, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.18829105645418168, + "step": 5050 + }, + { + "epoch": 0.1265, + "grad_norm": 31.875, + "grad_norm_var": 20.667708333333334, + "learning_rate": 0.0001, + "loss": 7.5126, + "loss/crossentropy": 2.2233003705739973, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.19192412812262774, + "step": 5060 + }, + { + "epoch": 0.12675, + "grad_norm": 33.0, + "grad_norm_var": 2.3285807291666667, + "learning_rate": 0.0001, + "loss": 7.5683, + "loss/crossentropy": 2.0495440497994424, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.18455710131675004, + "step": 5070 + }, + { + "epoch": 0.127, + "grad_norm": 31.5, + "grad_norm_var": 2.6684895833333333, + "learning_rate": 0.0001, + "loss": 7.4892, + "loss/crossentropy": 2.1829854756593705, + "loss/hidden": 3.415234375, + "loss/jsd": 0.0, + "loss/logits": 0.1906156621873379, + "step": 5080 + }, + { + "epoch": 0.12725, + "grad_norm": 31.75, + "grad_norm_var": 26.134375, + "learning_rate": 0.0001, + "loss": 7.7141, + "loss/crossentropy": 2.150173208117485, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.20331819988787175, + "step": 5090 + }, + { + "epoch": 0.1275, + "grad_norm": 36.5, + "grad_norm_var": 25.376822916666665, + "learning_rate": 0.0001, + "loss": 7.572, + "loss/crossentropy": 2.2022222489118577, + "loss/hidden": 3.522265625, + "loss/jsd": 0.0, + "loss/logits": 0.20444649420678615, + "step": 5100 + }, + { + "epoch": 0.12775, + "grad_norm": 31.375, + "grad_norm_var": 15.917643229166666, + "learning_rate": 0.0001, + "loss": 7.5598, + "loss/crossentropy": 2.0867941707372664, + "loss/hidden": 3.56328125, + "loss/jsd": 0.0, + "loss/logits": 0.19711919017136098, + "step": 5110 + }, + { + "epoch": 0.128, + "grad_norm": 30.125, + "grad_norm_var": 16.92265625, + "learning_rate": 0.0001, + "loss": 7.4871, + "loss/crossentropy": 2.1122621968388557, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.1813073743134737, + "step": 5120 + }, + { + "epoch": 0.12825, + "grad_norm": 30.75, + "grad_norm_var": 5.108072916666667, + "learning_rate": 0.0001, + "loss": 7.5229, + "loss/crossentropy": 2.0356661707162855, + "loss/hidden": 3.480859375, + "loss/jsd": 0.0, + "loss/logits": 0.19353711605072021, + "step": 5130 + }, + { + "epoch": 0.1285, + "grad_norm": 32.5, + "grad_norm_var": 34.81920572916667, + "learning_rate": 0.0001, + "loss": 7.64, + "loss/crossentropy": 2.1762280851602553, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.20253173671662808, + "step": 5140 + }, + { + "epoch": 0.12875, + "grad_norm": 32.25, + "grad_norm_var": 51.16223958333333, + "learning_rate": 0.0001, + "loss": 7.6475, + "loss/crossentropy": 2.14297553896904, + "loss/hidden": 3.48671875, + "loss/jsd": 0.0, + "loss/logits": 0.2105777282267809, + "step": 5150 + }, + { + "epoch": 0.129, + "grad_norm": 34.0, + "grad_norm_var": 2.513641882357806e+18, + "learning_rate": 0.0001, + "loss": 7.6232, + "loss/crossentropy": 2.190713110566139, + "loss/hidden": 3.689453125, + "loss/jsd": 0.0, + "loss/logits": 0.2259815253317356, + "step": 5160 + }, + { + "epoch": 0.12925, + "grad_norm": 50.0, + "grad_norm_var": 2.513641882159625e+18, + "learning_rate": 0.0001, + "loss": 7.5304, + "loss/crossentropy": 2.287049275636673, + "loss/hidden": 3.22109375, + "loss/jsd": 0.0, + "loss/logits": 0.1791717953979969, + "step": 5170 + }, + { + "epoch": 0.1295, + "grad_norm": 32.25, + "grad_norm_var": 57.03932291666667, + "learning_rate": 0.0001, + "loss": 7.5666, + "loss/crossentropy": 2.205572660267353, + "loss/hidden": 3.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.1940192885696888, + "step": 5180 + }, + { + "epoch": 0.12975, + "grad_norm": 33.25, + "grad_norm_var": 2.115625, + "learning_rate": 0.0001, + "loss": 7.5783, + "loss/crossentropy": 2.170276886224747, + "loss/hidden": 3.498828125, + "loss/jsd": 0.0, + "loss/logits": 0.19651044271886348, + "step": 5190 + }, + { + "epoch": 0.13, + "grad_norm": 31.5, + "grad_norm_var": 3.549934895833333, + "learning_rate": 0.0001, + "loss": 7.6073, + "loss/crossentropy": 2.2128631293773653, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.19390376433730125, + "step": 5200 + }, + { + "epoch": 0.13025, + "grad_norm": 35.75, + "grad_norm_var": 9.099934895833334, + "learning_rate": 0.0001, + "loss": 7.575, + "loss/crossentropy": 2.171933504939079, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.1874479927122593, + "step": 5210 + }, + { + "epoch": 0.1305, + "grad_norm": 38.25, + "grad_norm_var": 12.2869140625, + "learning_rate": 0.0001, + "loss": 7.5635, + "loss/crossentropy": 2.1169356971979143, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.1857094492763281, + "step": 5220 + }, + { + "epoch": 0.13075, + "grad_norm": 51.25, + "grad_norm_var": 892.175, + "learning_rate": 0.0001, + "loss": 7.5924, + "loss/crossentropy": 2.1072397351264955, + "loss/hidden": 3.505078125, + "loss/jsd": 0.0, + "loss/logits": 0.20098341945558787, + "step": 5230 + }, + { + "epoch": 0.131, + "grad_norm": 32.5, + "grad_norm_var": 895.24140625, + "learning_rate": 0.0001, + "loss": 7.5454, + "loss/crossentropy": 2.260575148463249, + "loss/hidden": 3.348046875, + "loss/jsd": 0.0, + "loss/logits": 0.18017468005418777, + "step": 5240 + }, + { + "epoch": 0.13125, + "grad_norm": 48.0, + "grad_norm_var": 114.225, + "learning_rate": 0.0001, + "loss": 7.6162, + "loss/crossentropy": 2.2057667702436445, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.20519790165126323, + "step": 5250 + }, + { + "epoch": 0.1315, + "grad_norm": 74.0, + "grad_norm_var": 349.00983072916665, + "learning_rate": 0.0001, + "loss": 7.6702, + "loss/crossentropy": 2.154927045106888, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.21574116442352534, + "step": 5260 + }, + { + "epoch": 0.13175, + "grad_norm": 31.0, + "grad_norm_var": 229.61243489583333, + "learning_rate": 0.0001, + "loss": 7.4891, + "loss/crossentropy": 2.107641798257828, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.18632576130330564, + "step": 5270 + }, + { + "epoch": 0.132, + "grad_norm": 33.75, + "grad_norm_var": 7.731705729166666, + "learning_rate": 0.0001, + "loss": 7.5382, + "loss/crossentropy": 2.245188394188881, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.18824921660125254, + "step": 5280 + }, + { + "epoch": 0.13225, + "grad_norm": 33.75, + "grad_norm_var": 3.63125, + "learning_rate": 0.0001, + "loss": 7.4886, + "loss/crossentropy": 2.2113157629966738, + "loss/hidden": 3.294140625, + "loss/jsd": 0.0, + "loss/logits": 0.1719427563250065, + "step": 5290 + }, + { + "epoch": 0.1325, + "grad_norm": 31.125, + "grad_norm_var": 5.176822916666667, + "learning_rate": 0.0001, + "loss": 7.5181, + "loss/crossentropy": 2.2539247930049897, + "loss/hidden": 3.31640625, + "loss/jsd": 0.0, + "loss/logits": 0.18716043829917908, + "step": 5300 + }, + { + "epoch": 0.13275, + "grad_norm": 30.5, + "grad_norm_var": 5.7791015625, + "learning_rate": 0.0001, + "loss": 7.4139, + "loss/crossentropy": 2.232303848862648, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.18582747615873813, + "step": 5310 + }, + { + "epoch": 0.133, + "grad_norm": 29.25, + "grad_norm_var": 2.792708333333333, + "learning_rate": 0.0001, + "loss": 7.4902, + "loss/crossentropy": 2.2038251549005508, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.19186565633863212, + "step": 5320 + }, + { + "epoch": 0.13325, + "grad_norm": 31.0, + "grad_norm_var": 6.145768229166666, + "learning_rate": 0.0001, + "loss": 7.568, + "loss/crossentropy": 2.258873853087425, + "loss/hidden": 3.490625, + "loss/jsd": 0.0, + "loss/logits": 0.2075445156544447, + "step": 5330 + }, + { + "epoch": 0.1335, + "grad_norm": 35.25, + "grad_norm_var": 2.1952473958333334, + "learning_rate": 0.0001, + "loss": 7.5231, + "loss/crossentropy": 2.1799694120883943, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.18682638984173536, + "step": 5340 + }, + { + "epoch": 0.13375, + "grad_norm": 30.0, + "grad_norm_var": 1.9739583333333333, + "learning_rate": 0.0001, + "loss": 7.5416, + "loss/crossentropy": 2.2539006620645523, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.18906075097620487, + "step": 5350 + }, + { + "epoch": 0.134, + "grad_norm": 31.125, + "grad_norm_var": 2.880989583333333, + "learning_rate": 0.0001, + "loss": 7.5205, + "loss/crossentropy": 2.111976405978203, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.18815945349633695, + "step": 5360 + }, + { + "epoch": 0.13425, + "grad_norm": 41.75, + "grad_norm_var": 8.989322916666667, + "learning_rate": 0.0001, + "loss": 7.5645, + "loss/crossentropy": 2.150078758597374, + "loss/hidden": 3.305859375, + "loss/jsd": 0.0, + "loss/logits": 0.18150232955813408, + "step": 5370 + }, + { + "epoch": 0.1345, + "grad_norm": 32.25, + "grad_norm_var": 9.262239583333333, + "learning_rate": 0.0001, + "loss": 7.5691, + "loss/crossentropy": 2.2475525766611097, + "loss/hidden": 3.469921875, + "loss/jsd": 0.0, + "loss/logits": 0.22073253151029348, + "step": 5380 + }, + { + "epoch": 0.13475, + "grad_norm": 34.25, + "grad_norm_var": 4.981705729166666, + "learning_rate": 0.0001, + "loss": 7.5575, + "loss/crossentropy": 2.1171315133571627, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.20035637486726046, + "step": 5390 + }, + { + "epoch": 0.135, + "grad_norm": 212.0, + "grad_norm_var": 2028.59375, + "learning_rate": 0.0001, + "loss": 7.6561, + "loss/crossentropy": 2.147063474357128, + "loss/hidden": 3.507421875, + "loss/jsd": 0.0, + "loss/logits": 0.20187063701450825, + "step": 5400 + }, + { + "epoch": 0.13525, + "grad_norm": 28.5, + "grad_norm_var": 2002.4431640625, + "learning_rate": 0.0001, + "loss": 7.5921, + "loss/crossentropy": 2.248355305194855, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.2067353159189224, + "step": 5410 + }, + { + "epoch": 0.1355, + "grad_norm": 30.375, + "grad_norm_var": 2.59765625, + "learning_rate": 0.0001, + "loss": 7.5656, + "loss/crossentropy": 2.140727072954178, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.20080696120858194, + "step": 5420 + }, + { + "epoch": 0.13575, + "grad_norm": 30.875, + "grad_norm_var": 2.299934895833333, + "learning_rate": 0.0001, + "loss": 7.6261, + "loss/crossentropy": 2.2185936748981474, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.1954452872276306, + "step": 5430 + }, + { + "epoch": 0.136, + "grad_norm": 32.0, + "grad_norm_var": 2.5455729166666665, + "learning_rate": 0.0001, + "loss": 7.5373, + "loss/crossentropy": 2.0910344183444978, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.1913301758468151, + "step": 5440 + }, + { + "epoch": 0.13625, + "grad_norm": 34.5, + "grad_norm_var": 2.590559895833333, + "learning_rate": 0.0001, + "loss": 7.5366, + "loss/crossentropy": 2.278426119685173, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.1943113673478365, + "step": 5450 + }, + { + "epoch": 0.1365, + "grad_norm": 31.625, + "grad_norm_var": 2.5580729166666667, + "learning_rate": 0.0001, + "loss": 7.5847, + "loss/crossentropy": 2.076927217841148, + "loss/hidden": 3.4703125, + "loss/jsd": 0.0, + "loss/logits": 0.18883342035114764, + "step": 5460 + }, + { + "epoch": 0.13675, + "grad_norm": 34.75, + "grad_norm_var": 28.4197265625, + "learning_rate": 0.0001, + "loss": 7.6209, + "loss/crossentropy": 2.234089860320091, + "loss/hidden": 3.47734375, + "loss/jsd": 0.0, + "loss/logits": 0.21088924966752529, + "step": 5470 + }, + { + "epoch": 0.137, + "grad_norm": 31.25, + "grad_norm_var": 169.49479166666666, + "learning_rate": 0.0001, + "loss": 7.5375, + "loss/crossentropy": 2.117088034749031, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.1874922074377537, + "step": 5480 + }, + { + "epoch": 0.13725, + "grad_norm": 31.0, + "grad_norm_var": 168.70462239583333, + "learning_rate": 0.0001, + "loss": 7.5082, + "loss/crossentropy": 2.1952930808067324, + "loss/hidden": 3.319921875, + "loss/jsd": 0.0, + "loss/logits": 0.19221495129168034, + "step": 5490 + }, + { + "epoch": 0.1375, + "grad_norm": 30.5, + "grad_norm_var": 27.639518229166665, + "learning_rate": 0.0001, + "loss": 7.4408, + "loss/crossentropy": 2.184998545050621, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.17893593553453685, + "step": 5500 + }, + { + "epoch": 0.13775, + "grad_norm": 32.5, + "grad_norm_var": 1.6910807291666667, + "learning_rate": 0.0001, + "loss": 7.6031, + "loss/crossentropy": 2.280049467086792, + "loss/hidden": 3.506640625, + "loss/jsd": 0.0, + "loss/logits": 0.21599141787737608, + "step": 5510 + }, + { + "epoch": 0.138, + "grad_norm": 31.5, + "grad_norm_var": 2.730208333333333, + "learning_rate": 0.0001, + "loss": 7.5417, + "loss/crossentropy": 2.159082019329071, + "loss/hidden": 3.31640625, + "loss/jsd": 0.0, + "loss/logits": 0.19252310022711755, + "step": 5520 + }, + { + "epoch": 0.13825, + "grad_norm": 31.125, + "grad_norm_var": 2.520768229166667, + "learning_rate": 0.0001, + "loss": 7.6031, + "loss/crossentropy": 2.2485590517520904, + "loss/hidden": 3.448828125, + "loss/jsd": 0.0, + "loss/logits": 0.19249292369931936, + "step": 5530 + }, + { + "epoch": 0.1385, + "grad_norm": 31.75, + "grad_norm_var": 2.939583333333333, + "learning_rate": 0.0001, + "loss": 7.5843, + "loss/crossentropy": 2.0814964517951013, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.19598778411746026, + "step": 5540 + }, + { + "epoch": 0.13875, + "grad_norm": 30.25, + "grad_norm_var": 515.5889973958333, + "learning_rate": 0.0001, + "loss": 7.6637, + "loss/crossentropy": 2.1314420223236086, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.20185216665267944, + "step": 5550 + }, + { + "epoch": 0.139, + "grad_norm": 33.0, + "grad_norm_var": 3.4643229166666667, + "learning_rate": 0.0001, + "loss": 7.5407, + "loss/crossentropy": 2.190821570158005, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.1984243031591177, + "step": 5560 + }, + { + "epoch": 0.13925, + "grad_norm": 30.875, + "grad_norm_var": 3.1483723958333334, + "learning_rate": 0.0001, + "loss": 7.587, + "loss/crossentropy": 2.2163518011569976, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.19457473792135715, + "step": 5570 + }, + { + "epoch": 0.1395, + "grad_norm": 28.625, + "grad_norm_var": 3.090559895833333, + "learning_rate": 0.0001, + "loss": 7.5224, + "loss/crossentropy": 2.077855309844017, + "loss/hidden": 3.551171875, + "loss/jsd": 0.0, + "loss/logits": 0.19357334338128568, + "step": 5580 + }, + { + "epoch": 0.13975, + "grad_norm": 31.25, + "grad_norm_var": 18.9041015625, + "learning_rate": 0.0001, + "loss": 7.6508, + "loss/crossentropy": 2.0995124436914923, + "loss/hidden": 3.54921875, + "loss/jsd": 0.0, + "loss/logits": 0.20311756529845298, + "step": 5590 + }, + { + "epoch": 0.14, + "grad_norm": 30.875, + "grad_norm_var": 16.024739583333332, + "learning_rate": 0.0001, + "loss": 7.5486, + "loss/crossentropy": 2.1742469370365143, + "loss/hidden": 3.501171875, + "loss/jsd": 0.0, + "loss/logits": 0.2091912193223834, + "step": 5600 + }, + { + "epoch": 0.14025, + "grad_norm": 31.0, + "grad_norm_var": 1.0989583333333333, + "learning_rate": 0.0001, + "loss": 7.5563, + "loss/crossentropy": 2.1435637921094894, + "loss/hidden": 3.525390625, + "loss/jsd": 0.0, + "loss/logits": 0.20397210270166397, + "step": 5610 + }, + { + "epoch": 0.1405, + "grad_norm": 44.0, + "grad_norm_var": 11.0041015625, + "learning_rate": 0.0001, + "loss": 7.593, + "loss/crossentropy": 2.151739400625229, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.20091456174850464, + "step": 5620 + }, + { + "epoch": 0.14075, + "grad_norm": 31.125, + "grad_norm_var": 16.405208333333334, + "learning_rate": 0.0001, + "loss": 7.5337, + "loss/crossentropy": 2.2840585201978683, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.21120928078889847, + "step": 5630 + }, + { + "epoch": 0.141, + "grad_norm": 29.0, + "grad_norm_var": 2.7375, + "learning_rate": 0.0001, + "loss": 7.4997, + "loss/crossentropy": 2.041360355913639, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.2038326717913151, + "step": 5640 + }, + { + "epoch": 0.14125, + "grad_norm": 29.25, + "grad_norm_var": 23.368684895833333, + "learning_rate": 0.0001, + "loss": 7.5633, + "loss/crossentropy": 2.111186498403549, + "loss/hidden": 3.463671875, + "loss/jsd": 0.0, + "loss/logits": 0.20292142927646636, + "step": 5650 + }, + { + "epoch": 0.1415, + "grad_norm": 33.5, + "grad_norm_var": 4.765559895833333, + "learning_rate": 0.0001, + "loss": 7.577, + "loss/crossentropy": 2.1156825721263885, + "loss/hidden": 3.423828125, + "loss/jsd": 0.0, + "loss/logits": 0.19096483811736106, + "step": 5660 + }, + { + "epoch": 0.14175, + "grad_norm": 29.625, + "grad_norm_var": 6.212434895833334, + "learning_rate": 0.0001, + "loss": 7.561, + "loss/crossentropy": 2.1884111180901527, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.19188922494649888, + "step": 5670 + }, + { + "epoch": 0.142, + "grad_norm": 31.75, + "grad_norm_var": 4.145572916666667, + "learning_rate": 0.0001, + "loss": 7.5785, + "loss/crossentropy": 2.1473594516515733, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.19657318461686374, + "step": 5680 + }, + { + "epoch": 0.14225, + "grad_norm": 32.75, + "grad_norm_var": 9.868489583333334, + "learning_rate": 0.0001, + "loss": 7.5749, + "loss/crossentropy": 2.224351739883423, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.19284768104553224, + "step": 5690 + }, + { + "epoch": 0.1425, + "grad_norm": 29.375, + "grad_norm_var": 24.6166015625, + "learning_rate": 0.0001, + "loss": 7.4841, + "loss/crossentropy": 2.0835042744874954, + "loss/hidden": 3.505859375, + "loss/jsd": 0.0, + "loss/logits": 0.1889321893453598, + "step": 5700 + }, + { + "epoch": 0.14275, + "grad_norm": 32.5, + "grad_norm_var": 3.8934895833333334, + "learning_rate": 0.0001, + "loss": 7.5411, + "loss/crossentropy": 2.132970982789993, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.20858928225934506, + "step": 5710 + }, + { + "epoch": 0.143, + "grad_norm": 33.0, + "grad_norm_var": 96.9619140625, + "learning_rate": 0.0001, + "loss": 7.5633, + "loss/crossentropy": 2.2334523528814314, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.20192687548696994, + "step": 5720 + }, + { + "epoch": 0.14325, + "grad_norm": 32.0, + "grad_norm_var": 90.8759765625, + "learning_rate": 0.0001, + "loss": 7.5, + "loss/crossentropy": 2.181543472409248, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.20608940124511718, + "step": 5730 + }, + { + "epoch": 0.1435, + "grad_norm": 29.125, + "grad_norm_var": 68.45416666666667, + "learning_rate": 0.0001, + "loss": 7.6044, + "loss/crossentropy": 2.2480223774909973, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.18675435222685338, + "step": 5740 + }, + { + "epoch": 0.14375, + "grad_norm": 39.25, + "grad_norm_var": 94.02057291666667, + "learning_rate": 0.0001, + "loss": 7.5096, + "loss/crossentropy": 2.2112644970417024, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.21228924561291934, + "step": 5750 + }, + { + "epoch": 0.144, + "grad_norm": 31.5, + "grad_norm_var": 2.1797421917742129e+18, + "learning_rate": 0.0001, + "loss": 7.6201, + "loss/crossentropy": 2.2231735616922377, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.19756054263561965, + "step": 5760 + }, + { + "epoch": 0.14425, + "grad_norm": 38.75, + "grad_norm_var": 58.33743489583333, + "learning_rate": 0.0001, + "loss": 7.5131, + "loss/crossentropy": 2.10022853910923, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.1919796233996749, + "step": 5770 + }, + { + "epoch": 0.1445, + "grad_norm": 34.25, + "grad_norm_var": 46.984375, + "learning_rate": 0.0001, + "loss": 7.6109, + "loss/crossentropy": 2.190520279109478, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.20563599281013012, + "step": 5780 + }, + { + "epoch": 0.14475, + "grad_norm": 32.75, + "grad_norm_var": 6.2166015625, + "learning_rate": 0.0001, + "loss": 7.6612, + "loss/crossentropy": 2.0708674401044846, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.20486081317067145, + "step": 5790 + }, + { + "epoch": 0.145, + "grad_norm": 30.5, + "grad_norm_var": 21.637239583333333, + "learning_rate": 0.0001, + "loss": 7.5095, + "loss/crossentropy": 2.2081795185804367, + "loss/hidden": 3.50703125, + "loss/jsd": 0.0, + "loss/logits": 0.20764970332384108, + "step": 5800 + }, + { + "epoch": 0.14525, + "grad_norm": 32.25, + "grad_norm_var": 2.1119140625, + "learning_rate": 0.0001, + "loss": 7.5625, + "loss/crossentropy": 2.134955820441246, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.18278160132467747, + "step": 5810 + }, + { + "epoch": 0.1455, + "grad_norm": 32.0, + "grad_norm_var": 26.182291666666668, + "learning_rate": 0.0001, + "loss": 7.5741, + "loss/crossentropy": 2.101625883579254, + "loss/hidden": 3.490234375, + "loss/jsd": 0.0, + "loss/logits": 0.21009023115038872, + "step": 5820 + }, + { + "epoch": 0.14575, + "grad_norm": 32.25, + "grad_norm_var": 12.02890625, + "learning_rate": 0.0001, + "loss": 7.5459, + "loss/crossentropy": 2.269702708721161, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.1850876223295927, + "step": 5830 + }, + { + "epoch": 0.146, + "grad_norm": 31.625, + "grad_norm_var": 10.504166666666666, + "learning_rate": 0.0001, + "loss": 7.5756, + "loss/crossentropy": 2.1922702878713607, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.1936382047832012, + "step": 5840 + }, + { + "epoch": 0.14625, + "grad_norm": 34.25, + "grad_norm_var": 2.6372395833333333, + "learning_rate": 0.0001, + "loss": 7.5806, + "loss/crossentropy": 2.0927012979984285, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.200397995300591, + "step": 5850 + }, + { + "epoch": 0.1465, + "grad_norm": 31.375, + "grad_norm_var": 16.130143229166666, + "learning_rate": 0.0001, + "loss": 7.6031, + "loss/crossentropy": 2.150560998916626, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.20800711959600449, + "step": 5860 + }, + { + "epoch": 0.14675, + "grad_norm": 30.875, + "grad_norm_var": 15.814583333333333, + "learning_rate": 0.0001, + "loss": 7.4831, + "loss/crossentropy": 2.138367956876755, + "loss/hidden": 3.299609375, + "loss/jsd": 0.0, + "loss/logits": 0.1825962917879224, + "step": 5870 + }, + { + "epoch": 0.147, + "grad_norm": 31.375, + "grad_norm_var": 11.920833333333333, + "learning_rate": 0.0001, + "loss": 7.5279, + "loss/crossentropy": 2.0560067892074585, + "loss/hidden": 3.4578125, + "loss/jsd": 0.0, + "loss/logits": 0.18732867166399955, + "step": 5880 + }, + { + "epoch": 0.14725, + "grad_norm": 31.625, + "grad_norm_var": 11.8087890625, + "learning_rate": 0.0001, + "loss": 7.4841, + "loss/crossentropy": 2.204834724962711, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.18124654777348043, + "step": 5890 + }, + { + "epoch": 0.1475, + "grad_norm": 29.625, + "grad_norm_var": 12.66015625, + "learning_rate": 0.0001, + "loss": 7.5171, + "loss/crossentropy": 2.185581070184708, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.19726393837481737, + "step": 5900 + }, + { + "epoch": 0.14775, + "grad_norm": 29.625, + "grad_norm_var": 2.1747395833333334, + "learning_rate": 0.0001, + "loss": 7.4905, + "loss/crossentropy": 2.0858706533908844, + "loss/hidden": 3.57734375, + "loss/jsd": 0.0, + "loss/logits": 0.20108112394809724, + "step": 5910 + }, + { + "epoch": 0.148, + "grad_norm": 30.0, + "grad_norm_var": 3.87265625, + "learning_rate": 0.0001, + "loss": 7.5801, + "loss/crossentropy": 2.11753663122654, + "loss/hidden": 3.512890625, + "loss/jsd": 0.0, + "loss/logits": 0.20907512214034796, + "step": 5920 + }, + { + "epoch": 0.14825, + "grad_norm": 31.25, + "grad_norm_var": 2.6212890625, + "learning_rate": 0.0001, + "loss": 7.5707, + "loss/crossentropy": 2.1879894763231276, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.19724611584097146, + "step": 5930 + }, + { + "epoch": 0.1485, + "grad_norm": 31.625, + "grad_norm_var": 5.4134765625, + "learning_rate": 0.0001, + "loss": 7.5137, + "loss/crossentropy": 2.0121699988842012, + "loss/hidden": 3.653515625, + "loss/jsd": 0.0, + "loss/logits": 0.2113606294617057, + "step": 5940 + }, + { + "epoch": 0.14875, + "grad_norm": 30.375, + "grad_norm_var": 7.517643229166667, + "learning_rate": 0.0001, + "loss": 7.5851, + "loss/crossentropy": 2.1624063462018968, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.19148119539022446, + "step": 5950 + }, + { + "epoch": 0.149, + "grad_norm": 40.5, + "grad_norm_var": 18.078580729166667, + "learning_rate": 0.0001, + "loss": 7.4704, + "loss/crossentropy": 2.100359010696411, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.18530675377696754, + "step": 5960 + }, + { + "epoch": 0.14925, + "grad_norm": 33.0, + "grad_norm_var": 106.165625, + "learning_rate": 0.0001, + "loss": 7.5243, + "loss/crossentropy": 2.088457000255585, + "loss/hidden": 3.45, + "loss/jsd": 0.0, + "loss/logits": 0.1950376622378826, + "step": 5970 + }, + { + "epoch": 0.1495, + "grad_norm": 35.5, + "grad_norm_var": 110.30618489583334, + "learning_rate": 0.0001, + "loss": 7.566, + "loss/crossentropy": 2.25458045899868, + "loss/hidden": 3.35, + "loss/jsd": 0.0, + "loss/logits": 0.192095298320055, + "step": 5980 + }, + { + "epoch": 0.14975, + "grad_norm": 33.25, + "grad_norm_var": 21.141080729166667, + "learning_rate": 0.0001, + "loss": 7.5733, + "loss/crossentropy": 2.2489352226257324, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.19246331304311753, + "step": 5990 + }, + { + "epoch": 0.15, + "grad_norm": 33.25, + "grad_norm_var": 1.6747395833333334, + "learning_rate": 0.0001, + "loss": 7.6863, + "loss/crossentropy": 2.0839652568101883, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.20588702652603388, + "step": 6000 + }, + { + "epoch": 0.15025, + "grad_norm": 29.875, + "grad_norm_var": 9.457291666666666, + "learning_rate": 0.0001, + "loss": 7.5802, + "loss/crossentropy": 2.1618823766708375, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.19899031519889832, + "step": 6010 + }, + { + "epoch": 0.1505, + "grad_norm": 32.5, + "grad_norm_var": 10.827018229166667, + "learning_rate": 0.0001, + "loss": 7.6105, + "loss/crossentropy": 2.0799810975790023, + "loss/hidden": 3.6, + "loss/jsd": 0.0, + "loss/logits": 0.20440249070525168, + "step": 6020 + }, + { + "epoch": 0.15075, + "grad_norm": 30.875, + "grad_norm_var": 3.2405598958333335, + "learning_rate": 0.0001, + "loss": 7.6337, + "loss/crossentropy": 2.131713417172432, + "loss/hidden": 3.492578125, + "loss/jsd": 0.0, + "loss/logits": 0.20057708621025086, + "step": 6030 + }, + { + "epoch": 0.151, + "grad_norm": 31.125, + "grad_norm_var": 17.879166666666666, + "learning_rate": 0.0001, + "loss": 7.7142, + "loss/crossentropy": 2.116998878121376, + "loss/hidden": 3.485546875, + "loss/jsd": 0.0, + "loss/logits": 0.2165115473791957, + "step": 6040 + }, + { + "epoch": 0.15125, + "grad_norm": 31.375, + "grad_norm_var": 19.562239583333334, + "learning_rate": 0.0001, + "loss": 7.5782, + "loss/crossentropy": 2.187731945514679, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.1970472853630781, + "step": 6050 + }, + { + "epoch": 0.1515, + "grad_norm": 51.0, + "grad_norm_var": 25.872330729166666, + "learning_rate": 0.0001, + "loss": 7.5339, + "loss/crossentropy": 2.0746294870972632, + "loss/hidden": 3.49375, + "loss/jsd": 0.0, + "loss/logits": 0.18855103328824044, + "step": 6060 + }, + { + "epoch": 0.15175, + "grad_norm": 31.75, + "grad_norm_var": 24.65390625, + "learning_rate": 0.0001, + "loss": 7.6192, + "loss/crossentropy": 2.223821607232094, + "loss/hidden": 3.48203125, + "loss/jsd": 0.0, + "loss/logits": 0.20149823743849993, + "step": 6070 + }, + { + "epoch": 0.152, + "grad_norm": 32.5, + "grad_norm_var": 1.99765625, + "learning_rate": 0.0001, + "loss": 7.634, + "loss/crossentropy": 2.187976914644241, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.19613044820725917, + "step": 6080 + }, + { + "epoch": 0.15225, + "grad_norm": 32.0, + "grad_norm_var": 2.8854166666666665, + "learning_rate": 0.0001, + "loss": 7.5475, + "loss/crossentropy": 2.2121584147214888, + "loss/hidden": 3.28828125, + "loss/jsd": 0.0, + "loss/logits": 0.1814923081547022, + "step": 6090 + }, + { + "epoch": 0.1525, + "grad_norm": 33.0, + "grad_norm_var": 2.9692057291666667, + "learning_rate": 0.0001, + "loss": 7.5725, + "loss/crossentropy": 2.189678418636322, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.19301872439682483, + "step": 6100 + }, + { + "epoch": 0.15275, + "grad_norm": 31.0, + "grad_norm_var": 3.457747395833333, + "learning_rate": 0.0001, + "loss": 7.5315, + "loss/crossentropy": 2.1482601583004, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.1997425738722086, + "step": 6110 + }, + { + "epoch": 0.153, + "grad_norm": 36.0, + "grad_norm_var": 4.5072265625, + "learning_rate": 0.0001, + "loss": 7.571, + "loss/crossentropy": 2.1208418533205986, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.19502468593418598, + "step": 6120 + }, + { + "epoch": 0.15325, + "grad_norm": 33.0, + "grad_norm_var": 17.070572916666666, + "learning_rate": 0.0001, + "loss": 7.5239, + "loss/crossentropy": 2.160035288333893, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.189887585490942, + "step": 6130 + }, + { + "epoch": 0.1535, + "grad_norm": 30.625, + "grad_norm_var": 22.48515625, + "learning_rate": 0.0001, + "loss": 7.5566, + "loss/crossentropy": 2.173004740476608, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.2055901188403368, + "step": 6140 + }, + { + "epoch": 0.15375, + "grad_norm": 35.5, + "grad_norm_var": 16.00390625, + "learning_rate": 0.0001, + "loss": 7.6958, + "loss/crossentropy": 2.0565848529338835, + "loss/hidden": 3.53203125, + "loss/jsd": 0.0, + "loss/logits": 0.22364541105926036, + "step": 6150 + }, + { + "epoch": 0.154, + "grad_norm": 32.0, + "grad_norm_var": 11.542122395833333, + "learning_rate": 0.0001, + "loss": 7.6093, + "loss/crossentropy": 2.085195133090019, + "loss/hidden": 3.45546875, + "loss/jsd": 0.0, + "loss/logits": 0.19985817223787308, + "step": 6160 + }, + { + "epoch": 0.15425, + "grad_norm": 29.5, + "grad_norm_var": 2.1666666666666665, + "learning_rate": 0.0001, + "loss": 7.5611, + "loss/crossentropy": 2.1293098747730257, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19878034461289645, + "step": 6170 + }, + { + "epoch": 0.1545, + "grad_norm": 31.25, + "grad_norm_var": 20.2822265625, + "learning_rate": 0.0001, + "loss": 7.6405, + "loss/crossentropy": 2.2258552461862564, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.20626316573470832, + "step": 6180 + }, + { + "epoch": 0.15475, + "grad_norm": 31.25, + "grad_norm_var": 20.730208333333334, + "learning_rate": 0.0001, + "loss": 7.4726, + "loss/crossentropy": 2.1775721326470374, + "loss/hidden": 3.449609375, + "loss/jsd": 0.0, + "loss/logits": 0.18398564979434012, + "step": 6190 + }, + { + "epoch": 0.155, + "grad_norm": 32.75, + "grad_norm_var": 3.1958333333333333, + "learning_rate": 0.0001, + "loss": 7.5734, + "loss/crossentropy": 2.223562794923782, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.190413929335773, + "step": 6200 + }, + { + "epoch": 0.15525, + "grad_norm": 30.125, + "grad_norm_var": 5.166080729166667, + "learning_rate": 0.0001, + "loss": 7.6525, + "loss/crossentropy": 2.085880035161972, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.1801287617534399, + "step": 6210 + }, + { + "epoch": 0.1555, + "grad_norm": 30.875, + "grad_norm_var": 2.7035807291666667, + "learning_rate": 0.0001, + "loss": 7.5305, + "loss/crossentropy": 2.1930001616477965, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.1861238319426775, + "step": 6220 + }, + { + "epoch": 0.15575, + "grad_norm": 31.0, + "grad_norm_var": 4.468489583333334, + "learning_rate": 0.0001, + "loss": 7.6889, + "loss/crossentropy": 2.22431803047657, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.19135152641683817, + "step": 6230 + }, + { + "epoch": 0.156, + "grad_norm": 33.75, + "grad_norm_var": 4.770572916666667, + "learning_rate": 0.0001, + "loss": 7.5177, + "loss/crossentropy": 2.093687379360199, + "loss/hidden": 3.599609375, + "loss/jsd": 0.0, + "loss/logits": 0.1883873265236616, + "step": 6240 + }, + { + "epoch": 0.15625, + "grad_norm": 31.5, + "grad_norm_var": 1.1872395833333333, + "learning_rate": 0.0001, + "loss": 7.6073, + "loss/crossentropy": 2.2236929804086687, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.213064675219357, + "step": 6250 + }, + { + "epoch": 0.1565, + "grad_norm": 31.0, + "grad_norm_var": 1.5947265625, + "learning_rate": 0.0001, + "loss": 7.5079, + "loss/crossentropy": 2.0539269253611563, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.1872910862788558, + "step": 6260 + }, + { + "epoch": 0.15675, + "grad_norm": 40.75, + "grad_norm_var": 21.492708333333333, + "learning_rate": 0.0001, + "loss": 7.6006, + "loss/crossentropy": 2.266616016626358, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.1959751147776842, + "step": 6270 + }, + { + "epoch": 0.157, + "grad_norm": 31.375, + "grad_norm_var": 25.213541666666668, + "learning_rate": 0.0001, + "loss": 7.6284, + "loss/crossentropy": 2.2850263684988024, + "loss/hidden": 3.5203125, + "loss/jsd": 0.0, + "loss/logits": 0.2038384210318327, + "step": 6280 + }, + { + "epoch": 0.15725, + "grad_norm": 31.75, + "grad_norm_var": 2.0010416666666666, + "learning_rate": 0.0001, + "loss": 7.5067, + "loss/crossentropy": 2.122395873069763, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.18144308719784022, + "step": 6290 + }, + { + "epoch": 0.1575, + "grad_norm": 34.0, + "grad_norm_var": 3.7462890625, + "learning_rate": 0.0001, + "loss": 7.5297, + "loss/crossentropy": 1.9984511777758598, + "loss/hidden": 3.563671875, + "loss/jsd": 0.0, + "loss/logits": 0.19643332287669182, + "step": 6300 + }, + { + "epoch": 0.15775, + "grad_norm": 30.5, + "grad_norm_var": 5.603059895833334, + "learning_rate": 0.0001, + "loss": 7.5455, + "loss/crossentropy": 2.029903215169907, + "loss/hidden": 3.496484375, + "loss/jsd": 0.0, + "loss/logits": 0.20556226037442685, + "step": 6310 + }, + { + "epoch": 0.158, + "grad_norm": 32.0, + "grad_norm_var": 35.5853515625, + "learning_rate": 0.0001, + "loss": 7.593, + "loss/crossentropy": 2.19532273709774, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.1972879134118557, + "step": 6320 + }, + { + "epoch": 0.15825, + "grad_norm": 38.75, + "grad_norm_var": 18.0791015625, + "learning_rate": 0.0001, + "loss": 7.6415, + "loss/crossentropy": 2.0865643858909606, + "loss/hidden": 3.5046875, + "loss/jsd": 0.0, + "loss/logits": 0.2021018836647272, + "step": 6330 + }, + { + "epoch": 0.1585, + "grad_norm": 30.875, + "grad_norm_var": 12.580989583333333, + "learning_rate": 0.0001, + "loss": 7.658, + "loss/crossentropy": 2.31261685192585, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.2027669247239828, + "step": 6340 + }, + { + "epoch": 0.15875, + "grad_norm": 31.875, + "grad_norm_var": 4.453580729166666, + "learning_rate": 0.0001, + "loss": 7.5713, + "loss/crossentropy": 2.1463931113481522, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.18860225863754748, + "step": 6350 + }, + { + "epoch": 0.159, + "grad_norm": 32.25, + "grad_norm_var": 2.1, + "learning_rate": 0.0001, + "loss": 7.5179, + "loss/crossentropy": 2.134768417477608, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.19954264387488366, + "step": 6360 + }, + { + "epoch": 0.15925, + "grad_norm": 33.5, + "grad_norm_var": 7.157747395833334, + "learning_rate": 0.0001, + "loss": 7.5755, + "loss/crossentropy": 2.144196245074272, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.18948693461716176, + "step": 6370 + }, + { + "epoch": 0.1595, + "grad_norm": 31.375, + "grad_norm_var": 1.9546223958333333, + "learning_rate": 0.0001, + "loss": 7.6119, + "loss/crossentropy": 2.147921970486641, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.19279659036546945, + "step": 6380 + }, + { + "epoch": 0.15975, + "grad_norm": 33.25, + "grad_norm_var": 1.9129557291666666, + "learning_rate": 0.0001, + "loss": 7.4797, + "loss/crossentropy": 2.1023626953363417, + "loss/hidden": 3.486328125, + "loss/jsd": 0.0, + "loss/logits": 0.1855178650468588, + "step": 6390 + }, + { + "epoch": 0.16, + "grad_norm": 31.375, + "grad_norm_var": 1.7577473958333334, + "learning_rate": 0.0001, + "loss": 7.5115, + "loss/crossentropy": 2.1468859046697615, + "loss/hidden": 3.398828125, + "loss/jsd": 0.0, + "loss/logits": 0.19039052687585353, + "step": 6400 + }, + { + "epoch": 0.16025, + "grad_norm": 31.875, + "grad_norm_var": 2.43515625, + "learning_rate": 0.0001, + "loss": 7.473, + "loss/crossentropy": 2.1335047364234923, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.17829085066914557, + "step": 6410 + }, + { + "epoch": 0.1605, + "grad_norm": 31.5, + "grad_norm_var": 2.027083333333333, + "learning_rate": 0.0001, + "loss": 7.5477, + "loss/crossentropy": 2.0842413723468782, + "loss/hidden": 3.537109375, + "loss/jsd": 0.0, + "loss/logits": 0.20616262052208184, + "step": 6420 + }, + { + "epoch": 0.16075, + "grad_norm": 33.75, + "grad_norm_var": 33.49791666666667, + "learning_rate": 0.0001, + "loss": 7.5695, + "loss/crossentropy": 2.153767225146294, + "loss/hidden": 3.493359375, + "loss/jsd": 0.0, + "loss/logits": 0.20842270255088807, + "step": 6430 + }, + { + "epoch": 0.161, + "grad_norm": 32.75, + "grad_norm_var": 15.109375, + "learning_rate": 0.0001, + "loss": 7.6661, + "loss/crossentropy": 2.0475671708583834, + "loss/hidden": 3.6484375, + "loss/jsd": 0.0, + "loss/logits": 0.2175968911498785, + "step": 6440 + }, + { + "epoch": 0.16125, + "grad_norm": 32.5, + "grad_norm_var": 8402.958268229168, + "learning_rate": 0.0001, + "loss": 7.6392, + "loss/crossentropy": 2.2127752989530562, + "loss/hidden": 3.532421875, + "loss/jsd": 0.0, + "loss/logits": 0.21682111844420432, + "step": 6450 + }, + { + "epoch": 0.1615, + "grad_norm": 33.25, + "grad_norm_var": 24.9375, + "learning_rate": 0.0001, + "loss": 7.5701, + "loss/crossentropy": 2.1261292159557343, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.20704152658581734, + "step": 6460 + }, + { + "epoch": 0.16175, + "grad_norm": 35.5, + "grad_norm_var": 30.0875, + "learning_rate": 0.0001, + "loss": 7.5408, + "loss/crossentropy": 2.124222718179226, + "loss/hidden": 3.5984375, + "loss/jsd": 0.0, + "loss/logits": 0.18728400766849518, + "step": 6470 + }, + { + "epoch": 0.162, + "grad_norm": 32.25, + "grad_norm_var": 3.1145833333333335, + "learning_rate": 0.0001, + "loss": 7.5831, + "loss/crossentropy": 2.2058008939027784, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.19860644564032554, + "step": 6480 + }, + { + "epoch": 0.16225, + "grad_norm": 31.875, + "grad_norm_var": 28.412239583333335, + "learning_rate": 0.0001, + "loss": 7.5668, + "loss/crossentropy": 2.1960059702396393, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.1868469040840864, + "step": 6490 + }, + { + "epoch": 0.1625, + "grad_norm": 32.5, + "grad_norm_var": 3.2416015625, + "learning_rate": 0.0001, + "loss": 7.4873, + "loss/crossentropy": 2.180153116583824, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.18321371413767337, + "step": 6500 + }, + { + "epoch": 0.16275, + "grad_norm": 31.125, + "grad_norm_var": 2.113997395833333, + "learning_rate": 0.0001, + "loss": 7.5498, + "loss/crossentropy": 2.085334287583828, + "loss/hidden": 3.481640625, + "loss/jsd": 0.0, + "loss/logits": 0.20047369822859765, + "step": 6510 + }, + { + "epoch": 0.163, + "grad_norm": 33.75, + "grad_norm_var": 14.7166015625, + "learning_rate": 0.0001, + "loss": 7.5356, + "loss/crossentropy": 2.081576499342918, + "loss/hidden": 3.52890625, + "loss/jsd": 0.0, + "loss/logits": 0.19775602114386856, + "step": 6520 + }, + { + "epoch": 0.16325, + "grad_norm": 32.25, + "grad_norm_var": 13.976822916666666, + "learning_rate": 0.0001, + "loss": 7.4858, + "loss/crossentropy": 2.0600350558757783, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.18539784867316483, + "step": 6530 + }, + { + "epoch": 0.1635, + "grad_norm": 30.375, + "grad_norm_var": 3.846875, + "learning_rate": 0.0001, + "loss": 7.5135, + "loss/crossentropy": 2.117923478782177, + "loss/hidden": 3.5109375, + "loss/jsd": 0.0, + "loss/logits": 0.2071079235523939, + "step": 6540 + }, + { + "epoch": 0.16375, + "grad_norm": 33.25, + "grad_norm_var": 21.288997395833334, + "learning_rate": 0.0001, + "loss": 7.5807, + "loss/crossentropy": 2.087580367922783, + "loss/hidden": 3.45390625, + "loss/jsd": 0.0, + "loss/logits": 0.18587088529020548, + "step": 6550 + }, + { + "epoch": 0.164, + "grad_norm": 33.0, + "grad_norm_var": 5.292708333333334, + "learning_rate": 0.0001, + "loss": 7.4736, + "loss/crossentropy": 2.060797114670277, + "loss/hidden": 3.485546875, + "loss/jsd": 0.0, + "loss/logits": 0.1906617671251297, + "step": 6560 + }, + { + "epoch": 0.16425, + "grad_norm": 30.375, + "grad_norm_var": 1.934375, + "learning_rate": 0.0001, + "loss": 7.5351, + "loss/crossentropy": 2.177751311659813, + "loss/hidden": 3.426953125, + "loss/jsd": 0.0, + "loss/logits": 0.20191191136837006, + "step": 6570 + }, + { + "epoch": 0.1645, + "grad_norm": 33.0, + "grad_norm_var": 1.4184895833333333, + "learning_rate": 0.0001, + "loss": 7.6058, + "loss/crossentropy": 2.1490555882453917, + "loss/hidden": 3.52109375, + "loss/jsd": 0.0, + "loss/logits": 0.19388978108763694, + "step": 6580 + }, + { + "epoch": 0.16475, + "grad_norm": 29.125, + "grad_norm_var": 2.8478515625, + "learning_rate": 0.0001, + "loss": 7.5149, + "loss/crossentropy": 2.2254546850919725, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.18252336494624616, + "step": 6590 + }, + { + "epoch": 0.165, + "grad_norm": 38.5, + "grad_norm_var": 207.35201822916667, + "learning_rate": 0.0001, + "loss": 7.6059, + "loss/crossentropy": 2.1390285924077035, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.1844609746709466, + "step": 6600 + }, + { + "epoch": 0.16525, + "grad_norm": 35.0, + "grad_norm_var": 203.78098958333334, + "learning_rate": 0.0001, + "loss": 7.5921, + "loss/crossentropy": 2.092223954200745, + "loss/hidden": 3.58828125, + "loss/jsd": 0.0, + "loss/logits": 0.17789147663861513, + "step": 6610 + }, + { + "epoch": 0.1655, + "grad_norm": 30.0, + "grad_norm_var": 65.76640625, + "learning_rate": 0.0001, + "loss": 7.558, + "loss/crossentropy": 2.138624146580696, + "loss/hidden": 3.508984375, + "loss/jsd": 0.0, + "loss/logits": 0.19836988989263774, + "step": 6620 + }, + { + "epoch": 0.16575, + "grad_norm": 30.875, + "grad_norm_var": 68.97337239583334, + "learning_rate": 0.0001, + "loss": 7.581, + "loss/crossentropy": 2.1231719397008417, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.18597114123404027, + "step": 6630 + }, + { + "epoch": 0.166, + "grad_norm": 32.25, + "grad_norm_var": 125.8041015625, + "learning_rate": 0.0001, + "loss": 7.5804, + "loss/crossentropy": 2.130907243490219, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.17537819929420947, + "step": 6640 + }, + { + "epoch": 0.16625, + "grad_norm": 40.25, + "grad_norm_var": 399.22057291666664, + "learning_rate": 0.0001, + "loss": 7.6269, + "loss/crossentropy": 2.0258643075823786, + "loss/hidden": 3.596875, + "loss/jsd": 0.0, + "loss/logits": 0.23089794162660837, + "step": 6650 + }, + { + "epoch": 0.1665, + "grad_norm": 31.0, + "grad_norm_var": 408.07233072916665, + "learning_rate": 0.0001, + "loss": 7.4616, + "loss/crossentropy": 2.23043432533741, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.19158051013946534, + "step": 6660 + }, + { + "epoch": 0.16675, + "grad_norm": 32.0, + "grad_norm_var": 8.009830729166667, + "learning_rate": 0.0001, + "loss": 7.5931, + "loss/crossentropy": 2.1956572234630585, + "loss/hidden": 3.50234375, + "loss/jsd": 0.0, + "loss/logits": 0.20411487035453318, + "step": 6670 + }, + { + "epoch": 0.167, + "grad_norm": 34.5, + "grad_norm_var": 8.894791666666666, + "learning_rate": 0.0001, + "loss": 7.6195, + "loss/crossentropy": 2.2672122746706007, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.18957087770104408, + "step": 6680 + }, + { + "epoch": 0.16725, + "grad_norm": 31.0, + "grad_norm_var": 66.73333333333333, + "learning_rate": 0.0001, + "loss": 7.5632, + "loss/crossentropy": 2.2279567658901214, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.18730072304606438, + "step": 6690 + }, + { + "epoch": 0.1675, + "grad_norm": 42.0, + "grad_norm_var": 71.18854166666667, + "learning_rate": 0.0001, + "loss": 7.7089, + "loss/crossentropy": 2.282482776045799, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.20141587276011705, + "step": 6700 + }, + { + "epoch": 0.16775, + "grad_norm": 31.5, + "grad_norm_var": 9.892122395833333, + "learning_rate": 0.0001, + "loss": 7.3758, + "loss/crossentropy": 2.2182126119732857, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.1786667076870799, + "step": 6710 + }, + { + "epoch": 0.168, + "grad_norm": 33.75, + "grad_norm_var": 3.6957682291666667, + "learning_rate": 0.0001, + "loss": 7.5636, + "loss/crossentropy": 2.1939273923635483, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.18930760622024537, + "step": 6720 + }, + { + "epoch": 0.16825, + "grad_norm": 32.25, + "grad_norm_var": 3.9009765625, + "learning_rate": 0.0001, + "loss": 7.5783, + "loss/crossentropy": 2.0783598124980927, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.18662143610417842, + "step": 6730 + }, + { + "epoch": 0.1685, + "grad_norm": 35.5, + "grad_norm_var": 5.1353515625, + "learning_rate": 0.0001, + "loss": 7.5009, + "loss/crossentropy": 2.217881241440773, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.19149041585624219, + "step": 6740 + }, + { + "epoch": 0.16875, + "grad_norm": 31.25, + "grad_norm_var": 5.264322916666667, + "learning_rate": 0.0001, + "loss": 7.6346, + "loss/crossentropy": 2.1645133450627325, + "loss/hidden": 3.47890625, + "loss/jsd": 0.0, + "loss/logits": 0.20458366964012384, + "step": 6750 + }, + { + "epoch": 0.169, + "grad_norm": 35.5, + "grad_norm_var": 4.1775390625, + "learning_rate": 0.0001, + "loss": 7.5251, + "loss/crossentropy": 2.171855625510216, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.1922192147001624, + "step": 6760 + }, + { + "epoch": 0.16925, + "grad_norm": 30.25, + "grad_norm_var": 50.66223958333333, + "learning_rate": 0.0001, + "loss": 7.5615, + "loss/crossentropy": 2.269451642036438, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.19260151647031307, + "step": 6770 + }, + { + "epoch": 0.1695, + "grad_norm": 32.5, + "grad_norm_var": 52.72057291666667, + "learning_rate": 0.0001, + "loss": 7.5146, + "loss/crossentropy": 2.2029780715703966, + "loss/hidden": 3.32734375, + "loss/jsd": 0.0, + "loss/logits": 0.18055486269295215, + "step": 6780 + }, + { + "epoch": 0.16975, + "grad_norm": 32.75, + "grad_norm_var": 5.893684895833333, + "learning_rate": 0.0001, + "loss": 7.6728, + "loss/crossentropy": 2.194507023692131, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.21081659942865372, + "step": 6790 + }, + { + "epoch": 0.17, + "grad_norm": 39.0, + "grad_norm_var": 9.028580729166666, + "learning_rate": 0.0001, + "loss": 7.6095, + "loss/crossentropy": 2.228949736058712, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.19297500066459178, + "step": 6800 + }, + { + "epoch": 0.17025, + "grad_norm": 51.25, + "grad_norm_var": 26.128125, + "learning_rate": 0.0001, + "loss": 7.5173, + "loss/crossentropy": 2.0145166903734206, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.19940503742545843, + "step": 6810 + }, + { + "epoch": 0.1705, + "grad_norm": 30.0, + "grad_norm_var": 45.58932291666667, + "learning_rate": 0.0001, + "loss": 7.5887, + "loss/crossentropy": 2.1203667253255842, + "loss/hidden": 3.576953125, + "loss/jsd": 0.0, + "loss/logits": 0.24707065224647523, + "step": 6820 + }, + { + "epoch": 0.17075, + "grad_norm": 32.5, + "grad_norm_var": 13.638997395833334, + "learning_rate": 0.0001, + "loss": 7.6031, + "loss/crossentropy": 2.1143920481204987, + "loss/hidden": 3.6578125, + "loss/jsd": 0.0, + "loss/logits": 0.2371676929295063, + "step": 6830 + }, + { + "epoch": 0.171, + "grad_norm": 37.75, + "grad_norm_var": 11.876822916666667, + "learning_rate": 0.0001, + "loss": 7.4937, + "loss/crossentropy": 2.3085153490304946, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.19867490641772748, + "step": 6840 + }, + { + "epoch": 0.17125, + "grad_norm": 31.625, + "grad_norm_var": 7.063541666666667, + "learning_rate": 0.0001, + "loss": 7.3836, + "loss/crossentropy": 2.2163452029228212, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.19262240454554558, + "step": 6850 + }, + { + "epoch": 0.1715, + "grad_norm": 29.375, + "grad_norm_var": 7.3353515625, + "learning_rate": 0.0001, + "loss": 7.5696, + "loss/crossentropy": 2.1725872844457625, + "loss/hidden": 3.282421875, + "loss/jsd": 0.0, + "loss/logits": 0.18168828263878822, + "step": 6860 + }, + { + "epoch": 0.17175, + "grad_norm": 33.25, + "grad_norm_var": 6.418489583333334, + "learning_rate": 0.0001, + "loss": 7.5374, + "loss/crossentropy": 2.1833366841077804, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.18841250911355018, + "step": 6870 + }, + { + "epoch": 0.172, + "grad_norm": 36.5, + "grad_norm_var": 21.055208333333333, + "learning_rate": 0.0001, + "loss": 7.5021, + "loss/crossentropy": 2.166957159340382, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.1870790719985962, + "step": 6880 + }, + { + "epoch": 0.17225, + "grad_norm": 33.75, + "grad_norm_var": 24.2744140625, + "learning_rate": 0.0001, + "loss": 7.463, + "loss/crossentropy": 2.2468337625265122, + "loss/hidden": 3.28359375, + "loss/jsd": 0.0, + "loss/logits": 0.19030643235892059, + "step": 6890 + }, + { + "epoch": 0.1725, + "grad_norm": 27.875, + "grad_norm_var": 5.3275390625, + "learning_rate": 0.0001, + "loss": 7.453, + "loss/crossentropy": 2.1895847231149674, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.19813059270381927, + "step": 6900 + }, + { + "epoch": 0.17275, + "grad_norm": 33.25, + "grad_norm_var": 12.654166666666667, + "learning_rate": 0.0001, + "loss": 7.5002, + "loss/crossentropy": 2.2333322286605837, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.20021227821707727, + "step": 6910 + }, + { + "epoch": 0.173, + "grad_norm": 30.75, + "grad_norm_var": 11.69765625, + "learning_rate": 0.0001, + "loss": 7.4872, + "loss/crossentropy": 2.0374642267823218, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.18903901670128107, + "step": 6920 + }, + { + "epoch": 0.17325, + "grad_norm": 31.875, + "grad_norm_var": 2.220572916666667, + "learning_rate": 0.0001, + "loss": 7.5288, + "loss/crossentropy": 2.2860298246145248, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.20242451801896094, + "step": 6930 + }, + { + "epoch": 0.1735, + "grad_norm": 30.375, + "grad_norm_var": 1.9811848958333333, + "learning_rate": 0.0001, + "loss": 7.6115, + "loss/crossentropy": 2.174576237797737, + "loss/hidden": 3.502734375, + "loss/jsd": 0.0, + "loss/logits": 0.20389086604118348, + "step": 6940 + }, + { + "epoch": 0.17375, + "grad_norm": 29.5, + "grad_norm_var": 5.17265625, + "learning_rate": 0.0001, + "loss": 7.6028, + "loss/crossentropy": 2.160076954960823, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.18584340140223504, + "step": 6950 + }, + { + "epoch": 0.174, + "grad_norm": 32.25, + "grad_norm_var": 3.4150390625, + "learning_rate": 0.0001, + "loss": 7.6249, + "loss/crossentropy": 2.2141772389411924, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.2044496938586235, + "step": 6960 + }, + { + "epoch": 0.17425, + "grad_norm": 30.25, + "grad_norm_var": 2.63125, + "learning_rate": 0.0001, + "loss": 7.5556, + "loss/crossentropy": 2.1613443583250045, + "loss/hidden": 3.461328125, + "loss/jsd": 0.0, + "loss/logits": 0.20456040017306804, + "step": 6970 + }, + { + "epoch": 0.1745, + "grad_norm": 32.0, + "grad_norm_var": 1.9184895833333333, + "learning_rate": 0.0001, + "loss": 7.5504, + "loss/crossentropy": 2.179374423623085, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.18155127875506877, + "step": 6980 + }, + { + "epoch": 0.17475, + "grad_norm": 31.125, + "grad_norm_var": 1.1302083333333333, + "learning_rate": 0.0001, + "loss": 7.4596, + "loss/crossentropy": 2.2060728073120117, + "loss/hidden": 3.326171875, + "loss/jsd": 0.0, + "loss/logits": 0.18002954982221125, + "step": 6990 + }, + { + "epoch": 0.175, + "grad_norm": 27.875, + "grad_norm_var": 2.27265625, + "learning_rate": 0.0001, + "loss": 7.5016, + "loss/crossentropy": 2.1729022413492203, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.20215826146304608, + "step": 7000 + }, + { + "epoch": 0.17525, + "grad_norm": 32.75, + "grad_norm_var": 2.187239583333333, + "learning_rate": 0.0001, + "loss": 7.5186, + "loss/crossentropy": 2.1925390481948854, + "loss/hidden": 3.564453125, + "loss/jsd": 0.0, + "loss/logits": 0.21080133505165577, + "step": 7010 + }, + { + "epoch": 0.1755, + "grad_norm": 33.0, + "grad_norm_var": 3.3681640625, + "learning_rate": 0.0001, + "loss": 7.6812, + "loss/crossentropy": 2.1888799130916596, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.1895390186458826, + "step": 7020 + }, + { + "epoch": 0.17575, + "grad_norm": 29.0, + "grad_norm_var": 3.4625, + "learning_rate": 0.0001, + "loss": 7.4658, + "loss/crossentropy": 2.1911296755075456, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.18743323888629676, + "step": 7030 + }, + { + "epoch": 0.176, + "grad_norm": 36.75, + "grad_norm_var": 4.143489583333333, + "learning_rate": 0.0001, + "loss": 7.5154, + "loss/crossentropy": 2.2533592522144317, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.18201965987682342, + "step": 7040 + }, + { + "epoch": 0.17625, + "grad_norm": 34.0, + "grad_norm_var": 3.7556640625, + "learning_rate": 0.0001, + "loss": 7.4748, + "loss/crossentropy": 2.259921830892563, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.1965817864984274, + "step": 7050 + }, + { + "epoch": 0.1765, + "grad_norm": 32.75, + "grad_norm_var": 76.703125, + "learning_rate": 0.0001, + "loss": 7.6429, + "loss/crossentropy": 2.18757144510746, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.18871748261153698, + "step": 7060 + }, + { + "epoch": 0.17675, + "grad_norm": 29.125, + "grad_norm_var": 18.242708333333333, + "learning_rate": 0.0001, + "loss": 7.4899, + "loss/crossentropy": 2.1853384137153626, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.18886385671794415, + "step": 7070 + }, + { + "epoch": 0.177, + "grad_norm": 33.5, + "grad_norm_var": 17.101497395833334, + "learning_rate": 0.0001, + "loss": 7.5441, + "loss/crossentropy": 2.1331110268831255, + "loss/hidden": 3.295703125, + "loss/jsd": 0.0, + "loss/logits": 0.1878782594576478, + "step": 7080 + }, + { + "epoch": 0.17725, + "grad_norm": 35.25, + "grad_norm_var": 6.013997395833333, + "learning_rate": 0.0001, + "loss": 7.463, + "loss/crossentropy": 2.252199110388756, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.197531633451581, + "step": 7090 + }, + { + "epoch": 0.1775, + "grad_norm": 32.75, + "grad_norm_var": 6.199739583333334, + "learning_rate": 0.0001, + "loss": 7.5179, + "loss/crossentropy": 2.1631226271390913, + "loss/hidden": 3.53828125, + "loss/jsd": 0.0, + "loss/logits": 0.20652975142002106, + "step": 7100 + }, + { + "epoch": 0.17775, + "grad_norm": 30.875, + "grad_norm_var": 2.31015625, + "learning_rate": 0.0001, + "loss": 7.5643, + "loss/crossentropy": 2.153787222504616, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.2014656089246273, + "step": 7110 + }, + { + "epoch": 0.178, + "grad_norm": 29.875, + "grad_norm_var": 4.601822916666666, + "learning_rate": 0.0001, + "loss": 7.5516, + "loss/crossentropy": 2.1199822768568994, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.1905859999358654, + "step": 7120 + }, + { + "epoch": 0.17825, + "grad_norm": 32.25, + "grad_norm_var": 3.4760416666666667, + "learning_rate": 0.0001, + "loss": 7.5994, + "loss/crossentropy": 2.26395897269249, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19645936116576196, + "step": 7130 + }, + { + "epoch": 0.1785, + "grad_norm": 36.25, + "grad_norm_var": 3.8854166666666665, + "learning_rate": 0.0001, + "loss": 7.5969, + "loss/crossentropy": 2.1285897165536882, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.18263876978307964, + "step": 7140 + }, + { + "epoch": 0.17875, + "grad_norm": 34.0, + "grad_norm_var": 4.657291666666667, + "learning_rate": 0.0001, + "loss": 7.5153, + "loss/crossentropy": 2.127173164486885, + "loss/hidden": 3.401171875, + "loss/jsd": 0.0, + "loss/logits": 0.19018456861376762, + "step": 7150 + }, + { + "epoch": 0.179, + "grad_norm": 31.125, + "grad_norm_var": 4.314322916666667, + "learning_rate": 0.0001, + "loss": 7.5891, + "loss/crossentropy": 2.187330016493797, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.19513612650334836, + "step": 7160 + }, + { + "epoch": 0.17925, + "grad_norm": 32.5, + "grad_norm_var": 2.4479166666666665, + "learning_rate": 0.0001, + "loss": 7.5653, + "loss/crossentropy": 2.0964292854070665, + "loss/hidden": 3.467578125, + "loss/jsd": 0.0, + "loss/logits": 0.20541836731135846, + "step": 7170 + }, + { + "epoch": 0.1795, + "grad_norm": 33.0, + "grad_norm_var": 2.4025390625, + "learning_rate": 0.0001, + "loss": 7.5849, + "loss/crossentropy": 2.1456298559904097, + "loss/hidden": 3.515234375, + "loss/jsd": 0.0, + "loss/logits": 0.21056269146502019, + "step": 7180 + }, + { + "epoch": 0.17975, + "grad_norm": 33.5, + "grad_norm_var": 2.78515625, + "learning_rate": 0.0001, + "loss": 7.6594, + "loss/crossentropy": 2.0873524725437163, + "loss/hidden": 3.595703125, + "loss/jsd": 0.0, + "loss/logits": 0.2302736073732376, + "step": 7190 + }, + { + "epoch": 0.18, + "grad_norm": 32.0, + "grad_norm_var": 1.4697916666666666, + "learning_rate": 0.0001, + "loss": 7.5801, + "loss/crossentropy": 2.0368966817855836, + "loss/hidden": 3.572265625, + "loss/jsd": 0.0, + "loss/logits": 0.20361636602319777, + "step": 7200 + }, + { + "epoch": 0.18025, + "grad_norm": 32.75, + "grad_norm_var": 1.9942057291666666, + "learning_rate": 0.0001, + "loss": 7.6054, + "loss/crossentropy": 2.079051211476326, + "loss/hidden": 3.55546875, + "loss/jsd": 0.0, + "loss/logits": 0.21228218004107474, + "step": 7210 + }, + { + "epoch": 0.1805, + "grad_norm": 33.25, + "grad_norm_var": 4.237955729166667, + "learning_rate": 0.0001, + "loss": 7.6754, + "loss/crossentropy": 2.130552776157856, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.1921296551823616, + "step": 7220 + }, + { + "epoch": 0.18075, + "grad_norm": 30.5, + "grad_norm_var": 2.526497395833333, + "learning_rate": 0.0001, + "loss": 7.5296, + "loss/crossentropy": 2.0738827764987944, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.19692382737994193, + "step": 7230 + }, + { + "epoch": 0.181, + "grad_norm": 32.75, + "grad_norm_var": 2.584375, + "learning_rate": 0.0001, + "loss": 7.6061, + "loss/crossentropy": 2.1579408079385756, + "loss/hidden": 3.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.21399259492754935, + "step": 7240 + }, + { + "epoch": 0.18125, + "grad_norm": 33.5, + "grad_norm_var": 2.8416015625, + "learning_rate": 0.0001, + "loss": 7.6824, + "loss/crossentropy": 2.221544751524925, + "loss/hidden": 3.53203125, + "loss/jsd": 0.0, + "loss/logits": 0.2030305091291666, + "step": 7250 + }, + { + "epoch": 0.1815, + "grad_norm": 35.5, + "grad_norm_var": 1.8848307291666666, + "learning_rate": 0.0001, + "loss": 7.5215, + "loss/crossentropy": 2.180348289012909, + "loss/hidden": 3.46796875, + "loss/jsd": 0.0, + "loss/logits": 0.2144750364124775, + "step": 7260 + }, + { + "epoch": 0.18175, + "grad_norm": 53.25, + "grad_norm_var": 31.4353515625, + "learning_rate": 0.0001, + "loss": 7.6027, + "loss/crossentropy": 2.1652086317539214, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.20979799777269365, + "step": 7270 + }, + { + "epoch": 0.182, + "grad_norm": 31.375, + "grad_norm_var": 31.370768229166668, + "learning_rate": 0.0001, + "loss": 7.5252, + "loss/crossentropy": 2.0706035763025286, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.18490511737763882, + "step": 7280 + }, + { + "epoch": 0.18225, + "grad_norm": 32.25, + "grad_norm_var": 8.717643229166667, + "learning_rate": 0.0001, + "loss": 7.5688, + "loss/crossentropy": 2.2949971139431, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.20526532903313638, + "step": 7290 + }, + { + "epoch": 0.1825, + "grad_norm": 32.0, + "grad_norm_var": 4.91640625, + "learning_rate": 0.0001, + "loss": 7.5752, + "loss/crossentropy": 2.2193857818841933, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.1804453806951642, + "step": 7300 + }, + { + "epoch": 0.18275, + "grad_norm": 31.375, + "grad_norm_var": 3.701041666666667, + "learning_rate": 0.0001, + "loss": 7.6028, + "loss/crossentropy": 2.207156080007553, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.2132449522614479, + "step": 7310 + }, + { + "epoch": 0.183, + "grad_norm": 31.75, + "grad_norm_var": 3.289322916666667, + "learning_rate": 0.0001, + "loss": 7.5818, + "loss/crossentropy": 2.271843919157982, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.19545750357210637, + "step": 7320 + }, + { + "epoch": 0.18325, + "grad_norm": 31.25, + "grad_norm_var": 1.0124348958333333, + "learning_rate": 0.0001, + "loss": 7.502, + "loss/crossentropy": 2.1379879862070084, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.1861495029181242, + "step": 7330 + }, + { + "epoch": 0.1835, + "grad_norm": 31.25, + "grad_norm_var": 0.7556640625, + "learning_rate": 0.0001, + "loss": 7.5501, + "loss/crossentropy": 2.080298659205437, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.18716856762766837, + "step": 7340 + }, + { + "epoch": 0.18375, + "grad_norm": 31.25, + "grad_norm_var": 2.450455729166667, + "learning_rate": 0.0001, + "loss": 7.3934, + "loss/crossentropy": 2.0731329679489137, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.17404056414961816, + "step": 7350 + }, + { + "epoch": 0.184, + "grad_norm": 34.25, + "grad_norm_var": 3.0947265625, + "learning_rate": 0.0001, + "loss": 7.5861, + "loss/crossentropy": 2.0820507287979124, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.1926917627453804, + "step": 7360 + }, + { + "epoch": 0.18425, + "grad_norm": 30.875, + "grad_norm_var": 2.1624348958333335, + "learning_rate": 0.0001, + "loss": 7.5492, + "loss/crossentropy": 2.2114842593669892, + "loss/hidden": 3.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.2127979423850775, + "step": 7370 + }, + { + "epoch": 0.1845, + "grad_norm": 33.5, + "grad_norm_var": 4.882291666666666, + "learning_rate": 0.0001, + "loss": 7.5255, + "loss/crossentropy": 2.075804352760315, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.18906766772270203, + "step": 7380 + }, + { + "epoch": 0.18475, + "grad_norm": 35.75, + "grad_norm_var": 34.35729166666667, + "learning_rate": 0.0001, + "loss": 7.6434, + "loss/crossentropy": 2.136875703930855, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.18598171565681695, + "step": 7390 + }, + { + "epoch": 0.185, + "grad_norm": 32.75, + "grad_norm_var": 4.624739583333334, + "learning_rate": 0.0001, + "loss": 7.6191, + "loss/crossentropy": 2.170763599872589, + "loss/hidden": 3.5109375, + "loss/jsd": 0.0, + "loss/logits": 0.2151478011161089, + "step": 7400 + }, + { + "epoch": 0.18525, + "grad_norm": 30.75, + "grad_norm_var": 2.051497395833333, + "learning_rate": 0.0001, + "loss": 7.4116, + "loss/crossentropy": 2.143643561005592, + "loss/hidden": 3.46796875, + "loss/jsd": 0.0, + "loss/logits": 0.19506660383194685, + "step": 7410 + }, + { + "epoch": 0.1855, + "grad_norm": 36.0, + "grad_norm_var": 5.089322916666666, + "learning_rate": 0.0001, + "loss": 7.534, + "loss/crossentropy": 2.1735941752791406, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.17996873259544371, + "step": 7420 + }, + { + "epoch": 0.18575, + "grad_norm": 29.0, + "grad_norm_var": 4.81640625, + "learning_rate": 0.0001, + "loss": 7.5846, + "loss/crossentropy": 2.15124252140522, + "loss/hidden": 3.45546875, + "loss/jsd": 0.0, + "loss/logits": 0.1944795411080122, + "step": 7430 + }, + { + "epoch": 0.186, + "grad_norm": 30.125, + "grad_norm_var": 1.7900390625, + "learning_rate": 0.0001, + "loss": 7.5394, + "loss/crossentropy": 2.189228793978691, + "loss/hidden": 3.428125, + "loss/jsd": 0.0, + "loss/logits": 0.19658807516098023, + "step": 7440 + }, + { + "epoch": 0.18625, + "grad_norm": 31.25, + "grad_norm_var": 1.1572916666666666, + "learning_rate": 0.0001, + "loss": 7.5156, + "loss/crossentropy": 2.1805285453796386, + "loss/hidden": 3.43984375, + "loss/jsd": 0.0, + "loss/logits": 0.20203232783824204, + "step": 7450 + }, + { + "epoch": 0.1865, + "grad_norm": 32.75, + "grad_norm_var": 1.015625, + "learning_rate": 0.0001, + "loss": 7.5803, + "loss/crossentropy": 2.031705692410469, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.19695232547819613, + "step": 7460 + }, + { + "epoch": 0.18675, + "grad_norm": 31.75, + "grad_norm_var": 1.6389973958333333, + "learning_rate": 0.0001, + "loss": 7.5008, + "loss/crossentropy": 2.0725282967090606, + "loss/hidden": 3.53515625, + "loss/jsd": 0.0, + "loss/logits": 0.19746688194572926, + "step": 7470 + }, + { + "epoch": 0.187, + "grad_norm": 33.75, + "grad_norm_var": 30.643684895833335, + "learning_rate": 0.0001, + "loss": 7.5746, + "loss/crossentropy": 2.2134982645511627, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.2143145205453038, + "step": 7480 + }, + { + "epoch": 0.18725, + "grad_norm": 28.75, + "grad_norm_var": 173.95358072916667, + "learning_rate": 0.0001, + "loss": 7.5472, + "loss/crossentropy": 2.2404279142618178, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.18395285904407502, + "step": 7490 + }, + { + "epoch": 0.1875, + "grad_norm": 33.0, + "grad_norm_var": 295.3212890625, + "learning_rate": 0.0001, + "loss": 7.6232, + "loss/crossentropy": 2.1231855720281603, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.19835165105760097, + "step": 7500 + }, + { + "epoch": 0.18775, + "grad_norm": 33.5, + "grad_norm_var": 9.3119140625, + "learning_rate": 0.0001, + "loss": 7.5797, + "loss/crossentropy": 2.1263521701097488, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.18789808079600334, + "step": 7510 + }, + { + "epoch": 0.188, + "grad_norm": 29.75, + "grad_norm_var": 30.567708333333332, + "learning_rate": 0.0001, + "loss": 7.544, + "loss/crossentropy": 2.2000538021326066, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.19643234014511107, + "step": 7520 + }, + { + "epoch": 0.18825, + "grad_norm": 30.75, + "grad_norm_var": 26.5056640625, + "learning_rate": 0.0001, + "loss": 7.4594, + "loss/crossentropy": 2.1121160596609116, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.18368990309536457, + "step": 7530 + }, + { + "epoch": 0.1885, + "grad_norm": 30.75, + "grad_norm_var": 2.661393229166667, + "learning_rate": 0.0001, + "loss": 7.4737, + "loss/crossentropy": 2.2245935589075088, + "loss/hidden": 3.24140625, + "loss/jsd": 0.0, + "loss/logits": 0.17950727473944425, + "step": 7540 + }, + { + "epoch": 0.18875, + "grad_norm": 32.75, + "grad_norm_var": 3.11015625, + "learning_rate": 0.0001, + "loss": 7.5744, + "loss/crossentropy": 2.2126697182655333, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.19009165093302727, + "step": 7550 + }, + { + "epoch": 0.189, + "grad_norm": 34.25, + "grad_norm_var": 2.104622395833333, + "learning_rate": 0.0001, + "loss": 7.6006, + "loss/crossentropy": 2.0755111388862133, + "loss/hidden": 3.523046875, + "loss/jsd": 0.0, + "loss/logits": 0.20379672143608332, + "step": 7560 + }, + { + "epoch": 0.18925, + "grad_norm": 32.5, + "grad_norm_var": 16.618684895833333, + "learning_rate": 0.0001, + "loss": 7.528, + "loss/crossentropy": 2.205251544713974, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.18352425061166286, + "step": 7570 + }, + { + "epoch": 0.1895, + "grad_norm": 29.875, + "grad_norm_var": 12.086393229166667, + "learning_rate": 0.0001, + "loss": 7.4678, + "loss/crossentropy": 2.0505823358893394, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.2034569911658764, + "step": 7580 + }, + { + "epoch": 0.18975, + "grad_norm": 35.0, + "grad_norm_var": 15.94140625, + "learning_rate": 0.0001, + "loss": 7.506, + "loss/crossentropy": 2.2195076823234556, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.19098728708922863, + "step": 7590 + }, + { + "epoch": 0.19, + "grad_norm": 31.375, + "grad_norm_var": 16.2853515625, + "learning_rate": 0.0001, + "loss": 7.4491, + "loss/crossentropy": 2.185887323319912, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.19378383718430997, + "step": 7600 + }, + { + "epoch": 0.19025, + "grad_norm": 33.0, + "grad_norm_var": 3.0389973958333334, + "learning_rate": 0.0001, + "loss": 7.5015, + "loss/crossentropy": 2.057955393195152, + "loss/hidden": 3.536328125, + "loss/jsd": 0.0, + "loss/logits": 0.20849357955157757, + "step": 7610 + }, + { + "epoch": 0.1905, + "grad_norm": 31.375, + "grad_norm_var": 17.470833333333335, + "learning_rate": 0.0001, + "loss": 7.6086, + "loss/crossentropy": 2.1172866210341454, + "loss/hidden": 3.476171875, + "loss/jsd": 0.0, + "loss/logits": 0.20152031816542149, + "step": 7620 + }, + { + "epoch": 0.19075, + "grad_norm": 32.5, + "grad_norm_var": 20.54765625, + "learning_rate": 0.0001, + "loss": 7.5686, + "loss/crossentropy": 2.2929946899414064, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.18747647628188133, + "step": 7630 + }, + { + "epoch": 0.191, + "grad_norm": 37.25, + "grad_norm_var": 40.75807291666667, + "learning_rate": 0.0001, + "loss": 7.6427, + "loss/crossentropy": 2.180236041545868, + "loss/hidden": 3.54140625, + "loss/jsd": 0.0, + "loss/logits": 0.20883531272411346, + "step": 7640 + }, + { + "epoch": 0.19125, + "grad_norm": 29.625, + "grad_norm_var": 40.85520833333333, + "learning_rate": 0.0001, + "loss": 7.4917, + "loss/crossentropy": 2.1129625350236894, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.18379813842475415, + "step": 7650 + }, + { + "epoch": 0.1915, + "grad_norm": 39.75, + "grad_norm_var": 6.581184895833333, + "learning_rate": 0.0001, + "loss": 7.5268, + "loss/crossentropy": 2.228352552652359, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.2048925407230854, + "step": 7660 + }, + { + "epoch": 0.19175, + "grad_norm": 34.75, + "grad_norm_var": 6.0181640625, + "learning_rate": 0.0001, + "loss": 7.5951, + "loss/crossentropy": 2.2298452496528625, + "loss/hidden": 3.5171875, + "loss/jsd": 0.0, + "loss/logits": 0.21614472325891257, + "step": 7670 + }, + { + "epoch": 0.192, + "grad_norm": 29.625, + "grad_norm_var": 3.6025390625, + "learning_rate": 0.0001, + "loss": 7.5215, + "loss/crossentropy": 2.167386993765831, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.1897386133670807, + "step": 7680 + }, + { + "epoch": 0.19225, + "grad_norm": 31.25, + "grad_norm_var": 1.5754557291666667, + "learning_rate": 0.0001, + "loss": 7.4768, + "loss/crossentropy": 2.1173581033945084, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.19473140574991704, + "step": 7690 + }, + { + "epoch": 0.1925, + "grad_norm": 38.0, + "grad_norm_var": 16.32890625, + "learning_rate": 0.0001, + "loss": 7.5827, + "loss/crossentropy": 2.205728626251221, + "loss/hidden": 3.40859375, + "loss/jsd": 0.0, + "loss/logits": 0.20203282944858075, + "step": 7700 + }, + { + "epoch": 0.19275, + "grad_norm": 7583301632.0, + "grad_norm_var": 3.594153946076571e+18, + "learning_rate": 0.0001, + "loss": 7.4851, + "loss/crossentropy": 2.0537545680999756, + "loss/hidden": 3.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.19121616017073392, + "step": 7710 + }, + { + "epoch": 0.193, + "grad_norm": 31.5, + "grad_norm_var": 3.594153947356253e+18, + "learning_rate": 0.0001, + "loss": 7.6091, + "loss/crossentropy": 2.2010452926158903, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.19013969153165816, + "step": 7720 + }, + { + "epoch": 0.19325, + "grad_norm": 32.25, + "grad_norm_var": 127.8494140625, + "learning_rate": 0.0001, + "loss": 7.5129, + "loss/crossentropy": 2.1829321801662447, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18250775039196016, + "step": 7730 + }, + { + "epoch": 0.1935, + "grad_norm": 32.5, + "grad_norm_var": 6.54765625, + "learning_rate": 0.0001, + "loss": 7.5028, + "loss/crossentropy": 2.059948954731226, + "loss/hidden": 3.469921875, + "loss/jsd": 0.0, + "loss/logits": 0.1829567258246243, + "step": 7740 + }, + { + "epoch": 0.19375, + "grad_norm": 34.25, + "grad_norm_var": 65.40305989583334, + "learning_rate": 0.0001, + "loss": 7.5152, + "loss/crossentropy": 2.113444189727306, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.19062405563890933, + "step": 7750 + }, + { + "epoch": 0.194, + "grad_norm": 32.25, + "grad_norm_var": 61.19166666666667, + "learning_rate": 0.0001, + "loss": 7.5399, + "loss/crossentropy": 2.0975135535001757, + "loss/hidden": 3.5359375, + "loss/jsd": 0.0, + "loss/logits": 0.21106049697846174, + "step": 7760 + }, + { + "epoch": 0.19425, + "grad_norm": 55.5, + "grad_norm_var": 35.5087890625, + "learning_rate": 0.0001, + "loss": 7.6208, + "loss/crossentropy": 2.168052741885185, + "loss/hidden": 3.458984375, + "loss/jsd": 0.0, + "loss/logits": 0.20290581844747066, + "step": 7770 + }, + { + "epoch": 0.1945, + "grad_norm": 32.5, + "grad_norm_var": 34.890625, + "learning_rate": 0.0001, + "loss": 7.6052, + "loss/crossentropy": 2.203343018889427, + "loss/hidden": 3.51953125, + "loss/jsd": 0.0, + "loss/logits": 0.2262148879468441, + "step": 7780 + }, + { + "epoch": 0.19475, + "grad_norm": 33.75, + "grad_norm_var": 1.90390625, + "learning_rate": 0.0001, + "loss": 7.5521, + "loss/crossentropy": 2.096657195687294, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.19364065770059824, + "step": 7790 + }, + { + "epoch": 0.195, + "grad_norm": 32.25, + "grad_norm_var": 1.4916015625, + "learning_rate": 0.0001, + "loss": 7.607, + "loss/crossentropy": 2.0858742713928224, + "loss/hidden": 3.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.195220067165792, + "step": 7800 + }, + { + "epoch": 0.19525, + "grad_norm": 30.5, + "grad_norm_var": 30.91640625, + "learning_rate": 0.0001, + "loss": 7.5047, + "loss/crossentropy": 2.1515824437141418, + "loss/hidden": 3.44765625, + "loss/jsd": 0.0, + "loss/logits": 0.19715734478086233, + "step": 7810 + }, + { + "epoch": 0.1955, + "grad_norm": 34.75, + "grad_norm_var": 29.525455729166666, + "learning_rate": 0.0001, + "loss": 7.5755, + "loss/crossentropy": 2.197349172830582, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.1922721391543746, + "step": 7820 + }, + { + "epoch": 0.19575, + "grad_norm": 29.75, + "grad_norm_var": 2.6962890625, + "learning_rate": 0.0001, + "loss": 7.5252, + "loss/crossentropy": 2.1433851540088655, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.2196674931794405, + "step": 7830 + }, + { + "epoch": 0.196, + "grad_norm": 31.625, + "grad_norm_var": 4.008072916666666, + "learning_rate": 0.0001, + "loss": 7.5378, + "loss/crossentropy": 2.151882603764534, + "loss/hidden": 3.528125, + "loss/jsd": 0.0, + "loss/logits": 0.2241852417588234, + "step": 7840 + }, + { + "epoch": 0.19625, + "grad_norm": 31.125, + "grad_norm_var": 23.979166666666668, + "learning_rate": 0.0001, + "loss": 7.4767, + "loss/crossentropy": 2.100368928909302, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.18492705803364515, + "step": 7850 + }, + { + "epoch": 0.1965, + "grad_norm": 35.25, + "grad_norm_var": 4.2900390625, + "learning_rate": 0.0001, + "loss": 7.5525, + "loss/crossentropy": 2.1549850702285767, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.19525696001946927, + "step": 7860 + }, + { + "epoch": 0.19675, + "grad_norm": 29.125, + "grad_norm_var": 3.1264973958333333, + "learning_rate": 0.0001, + "loss": 7.4894, + "loss/crossentropy": 2.085528630018234, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.19448419120162724, + "step": 7870 + }, + { + "epoch": 0.197, + "grad_norm": 31.125, + "grad_norm_var": 2.0509765625, + "learning_rate": 0.0001, + "loss": 7.6104, + "loss/crossentropy": 2.197956010699272, + "loss/hidden": 3.54609375, + "loss/jsd": 0.0, + "loss/logits": 0.20526408050209283, + "step": 7880 + }, + { + "epoch": 0.19725, + "grad_norm": 28.5, + "grad_norm_var": 4.287955729166667, + "learning_rate": 0.0001, + "loss": 7.5082, + "loss/crossentropy": 2.093092533946037, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.18932582587003707, + "step": 7890 + }, + { + "epoch": 0.1975, + "grad_norm": 31.875, + "grad_norm_var": 3.9583333333333335, + "learning_rate": 0.0001, + "loss": 7.5156, + "loss/crossentropy": 2.1209671765565874, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18670345041900874, + "step": 7900 + }, + { + "epoch": 0.19775, + "grad_norm": 42.75, + "grad_norm_var": 14.573893229166666, + "learning_rate": 0.0001, + "loss": 7.628, + "loss/crossentropy": 2.2129906579852103, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.18701824955642224, + "step": 7910 + }, + { + "epoch": 0.198, + "grad_norm": 29.375, + "grad_norm_var": 13.91015625, + "learning_rate": 0.0001, + "loss": 7.5283, + "loss/crossentropy": 2.063839703798294, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.2156384490430355, + "step": 7920 + }, + { + "epoch": 0.19825, + "grad_norm": 30.0, + "grad_norm_var": 13.59140625, + "learning_rate": 0.0001, + "loss": 7.5872, + "loss/crossentropy": 2.150103223323822, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.2093698762357235, + "step": 7930 + }, + { + "epoch": 0.1985, + "grad_norm": 30.75, + "grad_norm_var": 8.720833333333333, + "learning_rate": 0.0001, + "loss": 7.5207, + "loss/crossentropy": 2.1579747438430785, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.20063298791646958, + "step": 7940 + }, + { + "epoch": 0.19875, + "grad_norm": 29.25, + "grad_norm_var": 2.7270182291666667, + "learning_rate": 0.0001, + "loss": 7.5105, + "loss/crossentropy": 2.104949194192886, + "loss/hidden": 3.462890625, + "loss/jsd": 0.0, + "loss/logits": 0.18702716194093227, + "step": 7950 + }, + { + "epoch": 0.199, + "grad_norm": 31.375, + "grad_norm_var": 2.7018229166666665, + "learning_rate": 0.0001, + "loss": 7.6138, + "loss/crossentropy": 2.1542328625917433, + "loss/hidden": 3.401171875, + "loss/jsd": 0.0, + "loss/logits": 0.1854570686817169, + "step": 7960 + }, + { + "epoch": 0.19925, + "grad_norm": 36.25, + "grad_norm_var": 78.99889322916667, + "learning_rate": 0.0001, + "loss": 7.5929, + "loss/crossentropy": 2.196703353524208, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.20412985123693944, + "step": 7970 + }, + { + "epoch": 0.1995, + "grad_norm": 33.0, + "grad_norm_var": 79.22083333333333, + "learning_rate": 0.0001, + "loss": 7.5479, + "loss/crossentropy": 2.1077481478452684, + "loss/hidden": 3.441796875, + "loss/jsd": 0.0, + "loss/logits": 0.20956829860806464, + "step": 7980 + }, + { + "epoch": 0.19975, + "grad_norm": 30.875, + "grad_norm_var": 2.403059895833333, + "learning_rate": 0.0001, + "loss": 7.5393, + "loss/crossentropy": 2.2737906739115714, + "loss/hidden": 3.325390625, + "loss/jsd": 0.0, + "loss/logits": 0.19662595726549625, + "step": 7990 + }, + { + "epoch": 0.2, + "grad_norm": 31.625, + "grad_norm_var": 7.572916666666667, + "learning_rate": 0.0001, + "loss": 7.576, + "loss/crossentropy": 2.187049573659897, + "loss/hidden": 3.481640625, + "loss/jsd": 0.0, + "loss/logits": 0.1929944284260273, + "step": 8000 + }, + { + "epoch": 0.20025, + "grad_norm": 31.125, + "grad_norm_var": 1.81875, + "learning_rate": 0.0001, + "loss": 7.584, + "loss/crossentropy": 2.1443400979042053, + "loss/hidden": 3.587890625, + "loss/jsd": 0.0, + "loss/logits": 0.21116771101951598, + "step": 8010 + }, + { + "epoch": 0.2005, + "grad_norm": 33.25, + "grad_norm_var": 2.2309895833333333, + "learning_rate": 0.0001, + "loss": 7.619, + "loss/crossentropy": 2.1093345403671266, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.19506766367703676, + "step": 8020 + }, + { + "epoch": 0.20075, + "grad_norm": 34.25, + "grad_norm_var": 2.2285807291666666, + "learning_rate": 0.0001, + "loss": 7.6101, + "loss/crossentropy": 2.3067246288061143, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.21037033908069133, + "step": 8030 + }, + { + "epoch": 0.201, + "grad_norm": 31.25, + "grad_norm_var": 8.986393229166667, + "learning_rate": 0.0001, + "loss": 7.5813, + "loss/crossentropy": 2.0566743701696395, + "loss/hidden": 3.6375, + "loss/jsd": 0.0, + "loss/logits": 0.21147819980978966, + "step": 8040 + }, + { + "epoch": 0.20125, + "grad_norm": 30.875, + "grad_norm_var": 58.540625, + "learning_rate": 0.0001, + "loss": 7.6354, + "loss/crossentropy": 2.119761574268341, + "loss/hidden": 3.5140625, + "loss/jsd": 0.0, + "loss/logits": 0.21174631416797637, + "step": 8050 + }, + { + "epoch": 0.2015, + "grad_norm": 31.125, + "grad_norm_var": 1.9830729166666667, + "learning_rate": 0.0001, + "loss": 7.5129, + "loss/crossentropy": 2.098930720984936, + "loss/hidden": 3.46796875, + "loss/jsd": 0.0, + "loss/logits": 0.1940738322213292, + "step": 8060 + }, + { + "epoch": 0.20175, + "grad_norm": 31.125, + "grad_norm_var": 1.82265625, + "learning_rate": 0.0001, + "loss": 7.5747, + "loss/crossentropy": 2.1186475455760956, + "loss/hidden": 3.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.19831476360559464, + "step": 8070 + }, + { + "epoch": 0.202, + "grad_norm": 31.5, + "grad_norm_var": 1.5416666666666667, + "learning_rate": 0.0001, + "loss": 7.5639, + "loss/crossentropy": 2.2280248433351515, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.18766701072454453, + "step": 8080 + }, + { + "epoch": 0.20225, + "grad_norm": 31.375, + "grad_norm_var": 29.02265625, + "learning_rate": 0.0001, + "loss": 7.4765, + "loss/crossentropy": 2.135748690366745, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.20170771703124046, + "step": 8090 + }, + { + "epoch": 0.2025, + "grad_norm": 38.25, + "grad_norm_var": 4.775, + "learning_rate": 0.0001, + "loss": 7.6228, + "loss/crossentropy": 2.08721085190773, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.20192455761134626, + "step": 8100 + }, + { + "epoch": 0.20275, + "grad_norm": 29.5, + "grad_norm_var": 15.781184895833333, + "learning_rate": 0.0001, + "loss": 7.5354, + "loss/crossentropy": 2.230104002356529, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.20827409140765668, + "step": 8110 + }, + { + "epoch": 0.203, + "grad_norm": 30.75, + "grad_norm_var": 2.0233723958333334, + "learning_rate": 0.0001, + "loss": 7.4638, + "loss/crossentropy": 2.071944323182106, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.1925355602055788, + "step": 8120 + }, + { + "epoch": 0.20325, + "grad_norm": 29.375, + "grad_norm_var": 2.0625, + "learning_rate": 0.0001, + "loss": 7.5385, + "loss/crossentropy": 2.0866906702518464, + "loss/hidden": 3.621484375, + "loss/jsd": 0.0, + "loss/logits": 0.2152187094092369, + "step": 8130 + }, + { + "epoch": 0.2035, + "grad_norm": 31.625, + "grad_norm_var": 6.661458333333333, + "learning_rate": 0.0001, + "loss": 7.5217, + "loss/crossentropy": 2.1989782720804216, + "loss/hidden": 3.502734375, + "loss/jsd": 0.0, + "loss/logits": 0.1997136753052473, + "step": 8140 + }, + { + "epoch": 0.20375, + "grad_norm": 30.5, + "grad_norm_var": 5.3197265625, + "learning_rate": 0.0001, + "loss": 7.4881, + "loss/crossentropy": 2.0995738029479982, + "loss/hidden": 3.456640625, + "loss/jsd": 0.0, + "loss/logits": 0.18900278601795434, + "step": 8150 + }, + { + "epoch": 0.204, + "grad_norm": 30.75, + "grad_norm_var": 0.8395833333333333, + "learning_rate": 0.0001, + "loss": 7.5243, + "loss/crossentropy": 2.0287352964282035, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.19610616452991964, + "step": 8160 + }, + { + "epoch": 0.20425, + "grad_norm": 31.25, + "grad_norm_var": 5.362239583333333, + "learning_rate": 0.0001, + "loss": 7.5562, + "loss/crossentropy": 2.165475571155548, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.19938987717032433, + "step": 8170 + }, + { + "epoch": 0.2045, + "grad_norm": 31.375, + "grad_norm_var": 1.571875, + "learning_rate": 0.0001, + "loss": 7.5421, + "loss/crossentropy": 2.2981997221708297, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.18277304694056512, + "step": 8180 + }, + { + "epoch": 0.20475, + "grad_norm": 30.375, + "grad_norm_var": 4.543489583333334, + "learning_rate": 0.0001, + "loss": 7.5629, + "loss/crossentropy": 2.0597861796617507, + "loss/hidden": 3.441015625, + "loss/jsd": 0.0, + "loss/logits": 0.20162123087793588, + "step": 8190 + }, + { + "epoch": 0.205, + "grad_norm": 32.0, + "grad_norm_var": 2.0572916666666665, + "learning_rate": 0.0001, + "loss": 7.5376, + "loss/crossentropy": 2.1517196536064147, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.19018489569425584, + "step": 8200 + }, + { + "epoch": 0.20525, + "grad_norm": 29.375, + "grad_norm_var": 1.4994140625, + "learning_rate": 0.0001, + "loss": 7.4639, + "loss/crossentropy": 2.2632179886102675, + "loss/hidden": 3.25625, + "loss/jsd": 0.0, + "loss/logits": 0.18489395044744014, + "step": 8210 + }, + { + "epoch": 0.2055, + "grad_norm": 36.5, + "grad_norm_var": 2.61640625, + "learning_rate": 0.0001, + "loss": 7.7838, + "loss/crossentropy": 2.1116667434573175, + "loss/hidden": 3.573828125, + "loss/jsd": 0.0, + "loss/logits": 0.19244479853659868, + "step": 8220 + }, + { + "epoch": 0.20575, + "grad_norm": 31.25, + "grad_norm_var": 1.8582682291666666, + "learning_rate": 0.0001, + "loss": 7.5409, + "loss/crossentropy": 2.1676986277103425, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.19751499071717263, + "step": 8230 + }, + { + "epoch": 0.206, + "grad_norm": 35.5, + "grad_norm_var": 3.9962890625, + "learning_rate": 0.0001, + "loss": 7.5365, + "loss/crossentropy": 2.2388996213674544, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.19586583338677882, + "step": 8240 + }, + { + "epoch": 0.20625, + "grad_norm": 30.375, + "grad_norm_var": 5.229166666666667, + "learning_rate": 0.0001, + "loss": 7.4937, + "loss/crossentropy": 2.173008766770363, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.18703010510653256, + "step": 8250 + }, + { + "epoch": 0.2065, + "grad_norm": 31.625, + "grad_norm_var": 14.615625, + "learning_rate": 0.0001, + "loss": 7.5587, + "loss/crossentropy": 2.1825241267681124, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.19528221413493158, + "step": 8260 + }, + { + "epoch": 0.20675, + "grad_norm": 30.375, + "grad_norm_var": 8.207747395833334, + "learning_rate": 0.0001, + "loss": 7.479, + "loss/crossentropy": 2.0385641396045684, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.20498643592000007, + "step": 8270 + }, + { + "epoch": 0.207, + "grad_norm": 36.25, + "grad_norm_var": 5.3650390625, + "learning_rate": 0.0001, + "loss": 7.5835, + "loss/crossentropy": 2.2289943635463714, + "loss/hidden": 3.53046875, + "loss/jsd": 0.0, + "loss/logits": 0.20597830023616553, + "step": 8280 + }, + { + "epoch": 0.20725, + "grad_norm": 32.5, + "grad_norm_var": 72.63723958333334, + "learning_rate": 0.0001, + "loss": 7.4999, + "loss/crossentropy": 2.201861135661602, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.1963998381048441, + "step": 8290 + }, + { + "epoch": 0.2075, + "grad_norm": 33.5, + "grad_norm_var": 75.33932291666666, + "learning_rate": 0.0001, + "loss": 7.4931, + "loss/crossentropy": 2.0565819829702376, + "loss/hidden": 3.587890625, + "loss/jsd": 0.0, + "loss/logits": 0.2153871137648821, + "step": 8300 + }, + { + "epoch": 0.20775, + "grad_norm": 33.75, + "grad_norm_var": 2.57265625, + "learning_rate": 0.0001, + "loss": 7.5969, + "loss/crossentropy": 2.2017314821481704, + "loss/hidden": 3.509765625, + "loss/jsd": 0.0, + "loss/logits": 0.20366298444569111, + "step": 8310 + }, + { + "epoch": 0.208, + "grad_norm": 31.5, + "grad_norm_var": 93.0625, + "learning_rate": 0.0001, + "loss": 7.5622, + "loss/crossentropy": 2.139016662538052, + "loss/hidden": 3.40859375, + "loss/jsd": 0.0, + "loss/logits": 0.18943111039698124, + "step": 8320 + }, + { + "epoch": 0.20825, + "grad_norm": 48.25, + "grad_norm_var": 20.628125, + "learning_rate": 0.0001, + "loss": 7.4914, + "loss/crossentropy": 2.1512654572725296, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.20672829449176788, + "step": 8330 + }, + { + "epoch": 0.2085, + "grad_norm": 32.25, + "grad_norm_var": 19.8759765625, + "learning_rate": 0.0001, + "loss": 7.5845, + "loss/crossentropy": 2.0606055706739426, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.20263293255120515, + "step": 8340 + }, + { + "epoch": 0.20875, + "grad_norm": 31.375, + "grad_norm_var": 55.4369140625, + "learning_rate": 0.0001, + "loss": 7.4669, + "loss/crossentropy": 2.074323023855686, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.17513209469616414, + "step": 8350 + }, + { + "epoch": 0.209, + "grad_norm": 29.5, + "grad_norm_var": 54.5603515625, + "learning_rate": 0.0001, + "loss": 7.5259, + "loss/crossentropy": 2.197512632608414, + "loss/hidden": 3.438671875, + "loss/jsd": 0.0, + "loss/logits": 0.20681734681129454, + "step": 8360 + }, + { + "epoch": 0.20925, + "grad_norm": 29.625, + "grad_norm_var": 3.138541666666667, + "learning_rate": 0.0001, + "loss": 7.58, + "loss/crossentropy": 2.2005858927965165, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.2138770181685686, + "step": 8370 + }, + { + "epoch": 0.2095, + "grad_norm": 33.0, + "grad_norm_var": 7.381184895833333, + "learning_rate": 0.0001, + "loss": 7.604, + "loss/crossentropy": 2.2078860282897947, + "loss/hidden": 3.570703125, + "loss/jsd": 0.0, + "loss/logits": 0.21495202817022802, + "step": 8380 + }, + { + "epoch": 0.20975, + "grad_norm": 31.875, + "grad_norm_var": 6.4931640625, + "learning_rate": 0.0001, + "loss": 7.5081, + "loss/crossentropy": 2.1586494892835617, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.19594298098236324, + "step": 8390 + }, + { + "epoch": 0.21, + "grad_norm": 32.5, + "grad_norm_var": 2.05390625, + "learning_rate": 0.0001, + "loss": 7.5932, + "loss/crossentropy": 2.1475181549787523, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.19644266795367002, + "step": 8400 + }, + { + "epoch": 0.21025, + "grad_norm": 32.5, + "grad_norm_var": 1.1671223958333334, + "learning_rate": 0.0001, + "loss": 7.4963, + "loss/crossentropy": 2.2497402161359785, + "loss/hidden": 3.29296875, + "loss/jsd": 0.0, + "loss/logits": 0.20083448998630046, + "step": 8410 + }, + { + "epoch": 0.2105, + "grad_norm": 36.0, + "grad_norm_var": 4.269205729166667, + "learning_rate": 0.0001, + "loss": 7.4909, + "loss/crossentropy": 2.1163347721099854, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.19131154641509057, + "step": 8420 + }, + { + "epoch": 0.21075, + "grad_norm": 32.25, + "grad_norm_var": 5.568489583333333, + "learning_rate": 0.0001, + "loss": 7.5108, + "loss/crossentropy": 2.1301738530397416, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.18502578027546407, + "step": 8430 + }, + { + "epoch": 0.211, + "grad_norm": 30.625, + "grad_norm_var": 15.867708333333333, + "learning_rate": 0.0001, + "loss": 7.5275, + "loss/crossentropy": 2.127982833981514, + "loss/hidden": 3.399609375, + "loss/jsd": 0.0, + "loss/logits": 0.2027182461693883, + "step": 8440 + }, + { + "epoch": 0.21125, + "grad_norm": 29.625, + "grad_norm_var": 18.677018229166666, + "learning_rate": 0.0001, + "loss": 7.5249, + "loss/crossentropy": 2.1977096766233446, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.20560352243483065, + "step": 8450 + }, + { + "epoch": 0.2115, + "grad_norm": 29.75, + "grad_norm_var": 16.110872395833333, + "learning_rate": 0.0001, + "loss": 7.4708, + "loss/crossentropy": 2.1555270701646805, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.18343626484274864, + "step": 8460 + }, + { + "epoch": 0.21175, + "grad_norm": 29.5, + "grad_norm_var": 25.607291666666665, + "learning_rate": 0.0001, + "loss": 7.4635, + "loss/crossentropy": 2.1936387956142425, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.21095133386552334, + "step": 8470 + }, + { + "epoch": 0.212, + "grad_norm": 33.0, + "grad_norm_var": 16.8197265625, + "learning_rate": 0.0001, + "loss": 7.5373, + "loss/crossentropy": 2.231749877333641, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.1943896021693945, + "step": 8480 + }, + { + "epoch": 0.21225, + "grad_norm": 31.75, + "grad_norm_var": 7.643489583333333, + "learning_rate": 0.0001, + "loss": 7.5582, + "loss/crossentropy": 2.171339076757431, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.20933733694255352, + "step": 8490 + }, + { + "epoch": 0.2125, + "grad_norm": 36.5, + "grad_norm_var": 3.863541666666667, + "learning_rate": 0.0001, + "loss": 7.5613, + "loss/crossentropy": 2.170750407129526, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.19806241411715747, + "step": 8500 + }, + { + "epoch": 0.21275, + "grad_norm": 30.625, + "grad_norm_var": 6.814518229166667, + "learning_rate": 0.0001, + "loss": 7.475, + "loss/crossentropy": 2.1906991213560105, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.19974222220480442, + "step": 8510 + }, + { + "epoch": 0.213, + "grad_norm": 31.75, + "grad_norm_var": 1.0955729166666666, + "learning_rate": 0.0001, + "loss": 7.5352, + "loss/crossentropy": 2.101140005886555, + "loss/hidden": 3.46953125, + "loss/jsd": 0.0, + "loss/logits": 0.20742072239518167, + "step": 8520 + }, + { + "epoch": 0.21325, + "grad_norm": 33.0, + "grad_norm_var": 2.234375, + "learning_rate": 0.0001, + "loss": 7.6097, + "loss/crossentropy": 2.142820453643799, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.2011982973664999, + "step": 8530 + }, + { + "epoch": 0.2135, + "grad_norm": 30.75, + "grad_norm_var": 4.988541666666666, + "learning_rate": 0.0001, + "loss": 7.5074, + "loss/crossentropy": 2.274955728650093, + "loss/hidden": 3.254296875, + "loss/jsd": 0.0, + "loss/logits": 0.18379125297069548, + "step": 8540 + }, + { + "epoch": 0.21375, + "grad_norm": 29.875, + "grad_norm_var": 5.245572916666666, + "learning_rate": 0.0001, + "loss": 7.4441, + "loss/crossentropy": 2.2132862359285355, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.20189897604286672, + "step": 8550 + }, + { + "epoch": 0.214, + "grad_norm": 29.375, + "grad_norm_var": 1.3567057291666667, + "learning_rate": 0.0001, + "loss": 7.5183, + "loss/crossentropy": 2.1324101746082307, + "loss/hidden": 3.542578125, + "loss/jsd": 0.0, + "loss/logits": 0.2091756235808134, + "step": 8560 + }, + { + "epoch": 0.21425, + "grad_norm": 37.25, + "grad_norm_var": 26.8353515625, + "learning_rate": 0.0001, + "loss": 7.5208, + "loss/crossentropy": 2.2107133328914643, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.19691390544176102, + "step": 8570 + }, + { + "epoch": 0.2145, + "grad_norm": 32.25, + "grad_norm_var": 8.3525390625, + "learning_rate": 0.0001, + "loss": 7.4517, + "loss/crossentropy": 2.1903372198343276, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.21582610420882703, + "step": 8580 + }, + { + "epoch": 0.21475, + "grad_norm": 31.5, + "grad_norm_var": 1.2587890625, + "learning_rate": 0.0001, + "loss": 7.5885, + "loss/crossentropy": 2.1499848544597624, + "loss/hidden": 3.508984375, + "loss/jsd": 0.0, + "loss/logits": 0.2200299922376871, + "step": 8590 + }, + { + "epoch": 0.215, + "grad_norm": 44.75, + "grad_norm_var": 11.795833333333333, + "learning_rate": 0.0001, + "loss": 7.5716, + "loss/crossentropy": 2.2607037901878355, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.19387190453708172, + "step": 8600 + }, + { + "epoch": 0.21525, + "grad_norm": 31.875, + "grad_norm_var": 13.584830729166667, + "learning_rate": 0.0001, + "loss": 7.4842, + "loss/crossentropy": 2.2367573767900466, + "loss/hidden": 3.291796875, + "loss/jsd": 0.0, + "loss/logits": 0.1797945935279131, + "step": 8610 + }, + { + "epoch": 0.2155, + "grad_norm": 31.0, + "grad_norm_var": 6.67890625, + "learning_rate": 0.0001, + "loss": 7.5594, + "loss/crossentropy": 2.132373479008675, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.19787863213568926, + "step": 8620 + }, + { + "epoch": 0.21575, + "grad_norm": 32.75, + "grad_norm_var": 1.5483723958333333, + "learning_rate": 0.0001, + "loss": 7.5224, + "loss/crossentropy": 2.1131544440984724, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.18490277007222175, + "step": 8630 + }, + { + "epoch": 0.216, + "grad_norm": 28.125, + "grad_norm_var": 2.9827473958333335, + "learning_rate": 0.0001, + "loss": 7.5298, + "loss/crossentropy": 2.1318382740020754, + "loss/hidden": 3.45625, + "loss/jsd": 0.0, + "loss/logits": 0.20170295760035514, + "step": 8640 + }, + { + "epoch": 0.21625, + "grad_norm": 29.5, + "grad_norm_var": 2.6635416666666667, + "learning_rate": 0.0001, + "loss": 7.5196, + "loss/crossentropy": 2.143068727850914, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.2085008706897497, + "step": 8650 + }, + { + "epoch": 0.2165, + "grad_norm": 31.75, + "grad_norm_var": 4.72265625, + "learning_rate": 0.0001, + "loss": 7.583, + "loss/crossentropy": 2.191588431596756, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.19564641676843167, + "step": 8660 + }, + { + "epoch": 0.21675, + "grad_norm": 36.75, + "grad_norm_var": 5.292122395833333, + "learning_rate": 0.0001, + "loss": 7.5915, + "loss/crossentropy": 2.1791785418987275, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.19772212151437998, + "step": 8670 + }, + { + "epoch": 0.217, + "grad_norm": 29.75, + "grad_norm_var": 4.042708333333334, + "learning_rate": 0.0001, + "loss": 7.496, + "loss/crossentropy": 2.1210457414388655, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.20252102315425874, + "step": 8680 + }, + { + "epoch": 0.21725, + "grad_norm": 30.25, + "grad_norm_var": 2.02265625, + "learning_rate": 0.0001, + "loss": 7.4529, + "loss/crossentropy": 2.1803869009017944, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.19335724450647832, + "step": 8690 + }, + { + "epoch": 0.2175, + "grad_norm": 32.5, + "grad_norm_var": 3.9624348958333333, + "learning_rate": 0.0001, + "loss": 7.5914, + "loss/crossentropy": 2.134222483634949, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.19238188080489635, + "step": 8700 + }, + { + "epoch": 0.21775, + "grad_norm": 32.75, + "grad_norm_var": 6.13515625, + "learning_rate": 0.0001, + "loss": 7.5587, + "loss/crossentropy": 2.1801154255867004, + "loss/hidden": 3.456640625, + "loss/jsd": 0.0, + "loss/logits": 0.19446788169443607, + "step": 8710 + }, + { + "epoch": 0.218, + "grad_norm": 30.0, + "grad_norm_var": 3.06015625, + "learning_rate": 0.0001, + "loss": 7.561, + "loss/crossentropy": 2.1410862773656847, + "loss/hidden": 3.444140625, + "loss/jsd": 0.0, + "loss/logits": 0.20139614418148993, + "step": 8720 + }, + { + "epoch": 0.21825, + "grad_norm": 33.5, + "grad_norm_var": 2.0707682291666667, + "learning_rate": 0.0001, + "loss": 7.527, + "loss/crossentropy": 2.137886345386505, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.2020585484802723, + "step": 8730 + }, + { + "epoch": 0.2185, + "grad_norm": 31.125, + "grad_norm_var": 2.278125, + "learning_rate": 0.0001, + "loss": 7.5367, + "loss/crossentropy": 2.1495157063007353, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.2042895916849375, + "step": 8740 + }, + { + "epoch": 0.21875, + "grad_norm": 31.25, + "grad_norm_var": 4.108072916666667, + "learning_rate": 0.0001, + "loss": 7.5061, + "loss/crossentropy": 2.2290263891220095, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.18052256256341934, + "step": 8750 + }, + { + "epoch": 0.219, + "grad_norm": 30.75, + "grad_norm_var": 3.0518229166666666, + "learning_rate": 0.0001, + "loss": 7.5422, + "loss/crossentropy": 2.204638335108757, + "loss/hidden": 3.273046875, + "loss/jsd": 0.0, + "loss/logits": 0.1816064776852727, + "step": 8760 + }, + { + "epoch": 0.21925, + "grad_norm": 33.75, + "grad_norm_var": 1.0259765625, + "learning_rate": 0.0001, + "loss": 7.5936, + "loss/crossentropy": 2.173864471912384, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.2168831005692482, + "step": 8770 + }, + { + "epoch": 0.2195, + "grad_norm": 33.0, + "grad_norm_var": 1.4957682291666667, + "learning_rate": 0.0001, + "loss": 7.6125, + "loss/crossentropy": 2.2631371796131132, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.1991407833993435, + "step": 8780 + }, + { + "epoch": 0.21975, + "grad_norm": 30.625, + "grad_norm_var": 1.62890625, + "learning_rate": 0.0001, + "loss": 7.486, + "loss/crossentropy": 2.156681847572327, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.18595759831368924, + "step": 8790 + }, + { + "epoch": 0.22, + "grad_norm": 33.5, + "grad_norm_var": 1.2202473958333333, + "learning_rate": 0.0001, + "loss": 7.4714, + "loss/crossentropy": 2.069344013929367, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.19667117949575186, + "step": 8800 + }, + { + "epoch": 0.22025, + "grad_norm": 30.625, + "grad_norm_var": 3.7875, + "learning_rate": 0.0001, + "loss": 7.6153, + "loss/crossentropy": 2.159384399652481, + "loss/hidden": 3.43125, + "loss/jsd": 0.0, + "loss/logits": 0.19717039167881012, + "step": 8810 + }, + { + "epoch": 0.2205, + "grad_norm": 31.25, + "grad_norm_var": 2.7080729166666666, + "learning_rate": 0.0001, + "loss": 7.5037, + "loss/crossentropy": 2.06855808198452, + "loss/hidden": 3.596875, + "loss/jsd": 0.0, + "loss/logits": 0.19925107434391975, + "step": 8820 + }, + { + "epoch": 0.22075, + "grad_norm": 33.0, + "grad_norm_var": 14.142122395833333, + "learning_rate": 0.0001, + "loss": 7.61, + "loss/crossentropy": 2.1856627821922303, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.19198091998696326, + "step": 8830 + }, + { + "epoch": 0.221, + "grad_norm": 29.75, + "grad_norm_var": 23.12265625, + "learning_rate": 0.0001, + "loss": 7.5265, + "loss/crossentropy": 2.215584135055542, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.193273763731122, + "step": 8840 + }, + { + "epoch": 0.22125, + "grad_norm": 32.5, + "grad_norm_var": 23.365625, + "learning_rate": 0.0001, + "loss": 7.6189, + "loss/crossentropy": 2.217027261853218, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.1961042732000351, + "step": 8850 + }, + { + "epoch": 0.2215, + "grad_norm": 31.25, + "grad_norm_var": 2.9160807291666666, + "learning_rate": 0.0001, + "loss": 7.441, + "loss/crossentropy": 2.1981815114617347, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.19394037276506423, + "step": 8860 + }, + { + "epoch": 0.22175, + "grad_norm": 27.75, + "grad_norm_var": 9.90390625, + "learning_rate": 0.0001, + "loss": 7.4553, + "loss/crossentropy": 2.112216001749039, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.18079342246055602, + "step": 8870 + }, + { + "epoch": 0.222, + "grad_norm": 31.375, + "grad_norm_var": 7.8384765625, + "learning_rate": 0.0001, + "loss": 7.4818, + "loss/crossentropy": 2.101804518699646, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1867048390209675, + "step": 8880 + }, + { + "epoch": 0.22225, + "grad_norm": 32.25, + "grad_norm_var": 9.934309895833334, + "learning_rate": 0.0001, + "loss": 7.5937, + "loss/crossentropy": 2.193597176671028, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.20191702879965306, + "step": 8890 + }, + { + "epoch": 0.2225, + "grad_norm": 30.0, + "grad_norm_var": 32.552083333333336, + "learning_rate": 0.0001, + "loss": 7.4298, + "loss/crossentropy": 2.2074760258197785, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.19682303816080093, + "step": 8900 + }, + { + "epoch": 0.22275, + "grad_norm": 36.0, + "grad_norm_var": 28.680143229166667, + "learning_rate": 0.0001, + "loss": 7.5468, + "loss/crossentropy": 2.008703652024269, + "loss/hidden": 3.46015625, + "loss/jsd": 0.0, + "loss/logits": 0.2202893177047372, + "step": 8910 + }, + { + "epoch": 0.223, + "grad_norm": 33.25, + "grad_norm_var": 10.238541666666666, + "learning_rate": 0.0001, + "loss": 7.5334, + "loss/crossentropy": 2.0829237014055253, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.1849207304418087, + "step": 8920 + }, + { + "epoch": 0.22325, + "grad_norm": 33.25, + "grad_norm_var": 3.3676432291666667, + "learning_rate": 0.0001, + "loss": 7.5348, + "loss/crossentropy": 2.1565073817968368, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.19267312660813332, + "step": 8930 + }, + { + "epoch": 0.2235, + "grad_norm": 42.5, + "grad_norm_var": 14.748372395833334, + "learning_rate": 0.0001, + "loss": 7.5472, + "loss/crossentropy": 2.0750382035970687, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.1899881549179554, + "step": 8940 + }, + { + "epoch": 0.22375, + "grad_norm": 35.5, + "grad_norm_var": 12.167122395833333, + "learning_rate": 0.0001, + "loss": 7.6047, + "loss/crossentropy": 2.1998844176530836, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.1992990154772997, + "step": 8950 + }, + { + "epoch": 0.224, + "grad_norm": 39.5, + "grad_norm_var": 8.703125, + "learning_rate": 0.0001, + "loss": 7.5236, + "loss/crossentropy": 2.1398618072271347, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.2044746194034815, + "step": 8960 + }, + { + "epoch": 0.22425, + "grad_norm": 29.5, + "grad_norm_var": 7.711458333333334, + "learning_rate": 0.0001, + "loss": 7.5588, + "loss/crossentropy": 2.0922764956951143, + "loss/hidden": 3.363671875, + "loss/jsd": 0.0, + "loss/logits": 0.18962055966258048, + "step": 8970 + }, + { + "epoch": 0.2245, + "grad_norm": 38.25, + "grad_norm_var": 8.417122395833333, + "learning_rate": 0.0001, + "loss": 7.5705, + "loss/crossentropy": 2.2088142544031144, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.2040243223309517, + "step": 8980 + }, + { + "epoch": 0.22475, + "grad_norm": 31.25, + "grad_norm_var": 6.708268229166666, + "learning_rate": 0.0001, + "loss": 7.5883, + "loss/crossentropy": 2.0866949379444124, + "loss/hidden": 3.488671875, + "loss/jsd": 0.0, + "loss/logits": 0.21130447909235955, + "step": 8990 + }, + { + "epoch": 0.225, + "grad_norm": 28.375, + "grad_norm_var": 7.254166666666666, + "learning_rate": 0.0001, + "loss": 7.5008, + "loss/crossentropy": 2.117489975690842, + "loss/hidden": 3.49375, + "loss/jsd": 0.0, + "loss/logits": 0.18843676745891572, + "step": 9000 + }, + { + "epoch": 0.22525, + "grad_norm": 32.5, + "grad_norm_var": 18.155143229166665, + "learning_rate": 0.0001, + "loss": 7.4718, + "loss/crossentropy": 2.1349531918764115, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.18441876713186503, + "step": 9010 + }, + { + "epoch": 0.2255, + "grad_norm": 30.25, + "grad_norm_var": 7.642708333333333, + "learning_rate": 0.0001, + "loss": 7.4722, + "loss/crossentropy": 2.158918860554695, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.1831710446625948, + "step": 9020 + }, + { + "epoch": 0.22575, + "grad_norm": 30.625, + "grad_norm_var": 5.802083333333333, + "learning_rate": 0.0001, + "loss": 7.5065, + "loss/crossentropy": 2.1577743917703627, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.19996861293911933, + "step": 9030 + }, + { + "epoch": 0.226, + "grad_norm": 35.75, + "grad_norm_var": 6.050455729166667, + "learning_rate": 0.0001, + "loss": 7.54, + "loss/crossentropy": 2.1675800561904905, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.19423422291874887, + "step": 9040 + }, + { + "epoch": 0.22625, + "grad_norm": 30.25, + "grad_norm_var": 8.01015625, + "learning_rate": 0.0001, + "loss": 7.5053, + "loss/crossentropy": 2.1748566299676897, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.19081774912774563, + "step": 9050 + }, + { + "epoch": 0.2265, + "grad_norm": 107.5, + "grad_norm_var": 363.3160807291667, + "learning_rate": 0.0001, + "loss": 7.5204, + "loss/crossentropy": 2.1024440199136736, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.19131676983088255, + "step": 9060 + }, + { + "epoch": 0.22675, + "grad_norm": 43.0, + "grad_norm_var": 382.7749348958333, + "learning_rate": 0.0001, + "loss": 7.4636, + "loss/crossentropy": 2.0358673214912413, + "loss/hidden": 3.53671875, + "loss/jsd": 0.0, + "loss/logits": 0.20718471184372902, + "step": 9070 + }, + { + "epoch": 0.227, + "grad_norm": 38.0, + "grad_norm_var": 34.826497395833336, + "learning_rate": 0.0001, + "loss": 7.5132, + "loss/crossentropy": 2.1334098488092423, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.1900653975084424, + "step": 9080 + }, + { + "epoch": 0.22725, + "grad_norm": 47.5, + "grad_norm_var": 35.35182291666667, + "learning_rate": 0.0001, + "loss": 7.5471, + "loss/crossentropy": 2.158940353989601, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.19873067885637283, + "step": 9090 + }, + { + "epoch": 0.2275, + "grad_norm": 32.5, + "grad_norm_var": 25.093489583333334, + "learning_rate": 0.0001, + "loss": 7.4687, + "loss/crossentropy": 2.110513925552368, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.18377724830061198, + "step": 9100 + }, + { + "epoch": 0.22775, + "grad_norm": 47.75, + "grad_norm_var": 31.540625, + "learning_rate": 0.0001, + "loss": 7.5241, + "loss/crossentropy": 2.323571813106537, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.18574214577674866, + "step": 9110 + }, + { + "epoch": 0.228, + "grad_norm": 32.0, + "grad_norm_var": 52.509375, + "learning_rate": 0.0001, + "loss": 7.5911, + "loss/crossentropy": 2.2099075824022294, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.1919841269031167, + "step": 9120 + }, + { + "epoch": 0.22825, + "grad_norm": 28.5, + "grad_norm_var": 44.95104166666667, + "learning_rate": 0.0001, + "loss": 7.5406, + "loss/crossentropy": 2.146890181303024, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.19247145019471645, + "step": 9130 + }, + { + "epoch": 0.2285, + "grad_norm": 39.0, + "grad_norm_var": 14.4625, + "learning_rate": 0.0001, + "loss": 7.4844, + "loss/crossentropy": 2.127192445099354, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.17968443408608437, + "step": 9140 + }, + { + "epoch": 0.22875, + "grad_norm": 28.75, + "grad_norm_var": 15.970572916666667, + "learning_rate": 0.0001, + "loss": 7.5806, + "loss/crossentropy": 2.0811379849910736, + "loss/hidden": 3.437109375, + "loss/jsd": 0.0, + "loss/logits": 0.1876799188554287, + "step": 9150 + }, + { + "epoch": 0.229, + "grad_norm": 36.25, + "grad_norm_var": 16.017643229166666, + "learning_rate": 0.0001, + "loss": 7.4714, + "loss/crossentropy": 2.1674265801906585, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19961154460906982, + "step": 9160 + }, + { + "epoch": 0.22925, + "grad_norm": 30.25, + "grad_norm_var": 14.31640625, + "learning_rate": 0.0001, + "loss": 7.4578, + "loss/crossentropy": 2.1701185166835786, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.19284930936992167, + "step": 9170 + }, + { + "epoch": 0.2295, + "grad_norm": 33.0, + "grad_norm_var": 24.164322916666666, + "learning_rate": 0.0001, + "loss": 7.5706, + "loss/crossentropy": 2.2007659181952475, + "loss/hidden": 3.483984375, + "loss/jsd": 0.0, + "loss/logits": 0.20873381607234479, + "step": 9180 + }, + { + "epoch": 0.22975, + "grad_norm": 33.5, + "grad_norm_var": 167.6619140625, + "learning_rate": 0.0001, + "loss": 7.5246, + "loss/crossentropy": 2.013364678621292, + "loss/hidden": 3.480859375, + "loss/jsd": 0.0, + "loss/logits": 0.1943978626281023, + "step": 9190 + }, + { + "epoch": 0.23, + "grad_norm": 32.5, + "grad_norm_var": 161.8853515625, + "learning_rate": 0.0001, + "loss": 7.5824, + "loss/crossentropy": 2.1492466554045677, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.19080942254513503, + "step": 9200 + }, + { + "epoch": 0.23025, + "grad_norm": 35.0, + "grad_norm_var": 25.631184895833332, + "learning_rate": 0.0001, + "loss": 7.4525, + "loss/crossentropy": 2.1856732040643694, + "loss/hidden": 3.508203125, + "loss/jsd": 0.0, + "loss/logits": 0.21295880153775215, + "step": 9210 + }, + { + "epoch": 0.2305, + "grad_norm": 29.625, + "grad_norm_var": 44.44348958333333, + "learning_rate": 0.0001, + "loss": 7.5668, + "loss/crossentropy": 2.1970660746097566, + "loss/hidden": 3.31171875, + "loss/jsd": 0.0, + "loss/logits": 0.1991808257997036, + "step": 9220 + }, + { + "epoch": 0.23075, + "grad_norm": 33.25, + "grad_norm_var": 29.702083333333334, + "learning_rate": 0.0001, + "loss": 7.5854, + "loss/crossentropy": 2.1504436887800695, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.18685424784198404, + "step": 9230 + }, + { + "epoch": 0.231, + "grad_norm": 29.625, + "grad_norm_var": 10.211393229166667, + "learning_rate": 0.0001, + "loss": 7.3922, + "loss/crossentropy": 2.1555099219083784, + "loss/hidden": 3.510546875, + "loss/jsd": 0.0, + "loss/logits": 0.20160883199423552, + "step": 9240 + }, + { + "epoch": 0.23125, + "grad_norm": 30.75, + "grad_norm_var": 6.043489583333334, + "learning_rate": 0.0001, + "loss": 7.4663, + "loss/crossentropy": 2.153862714767456, + "loss/hidden": 3.34765625, + "loss/jsd": 0.0, + "loss/logits": 0.18072083070874215, + "step": 9250 + }, + { + "epoch": 0.2315, + "grad_norm": 35.25, + "grad_norm_var": 5.314322916666667, + "learning_rate": 0.0001, + "loss": 7.4845, + "loss/crossentropy": 2.2198209404945373, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.19124398212879895, + "step": 9260 + }, + { + "epoch": 0.23175, + "grad_norm": 29.25, + "grad_norm_var": 8.071875, + "learning_rate": 0.0001, + "loss": 7.6265, + "loss/crossentropy": 2.1362643599510194, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.18900094255805017, + "step": 9270 + }, + { + "epoch": 0.232, + "grad_norm": 30.375, + "grad_norm_var": 9.327018229166667, + "learning_rate": 0.0001, + "loss": 7.4949, + "loss/crossentropy": 2.226493775844574, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.19106594361364843, + "step": 9280 + }, + { + "epoch": 0.23225, + "grad_norm": 42.75, + "grad_norm_var": 11.729622395833333, + "learning_rate": 0.0001, + "loss": 7.5367, + "loss/crossentropy": 2.212230810523033, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.20718637369573117, + "step": 9290 + }, + { + "epoch": 0.2325, + "grad_norm": 34.5, + "grad_norm_var": 13.130989583333333, + "learning_rate": 0.0001, + "loss": 7.4321, + "loss/crossentropy": 2.021953631937504, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.19819956757128238, + "step": 9300 + }, + { + "epoch": 0.23275, + "grad_norm": 31.125, + "grad_norm_var": 5.062239583333334, + "learning_rate": 0.0001, + "loss": 7.5474, + "loss/crossentropy": 2.147363981604576, + "loss/hidden": 3.435546875, + "loss/jsd": 0.0, + "loss/logits": 0.1922474455088377, + "step": 9310 + }, + { + "epoch": 0.233, + "grad_norm": 32.25, + "grad_norm_var": 3.471875, + "learning_rate": 0.0001, + "loss": 7.5815, + "loss/crossentropy": 2.1315445095300674, + "loss/hidden": 3.546875, + "loss/jsd": 0.0, + "loss/logits": 0.22608386687934398, + "step": 9320 + }, + { + "epoch": 0.23325, + "grad_norm": 29.0, + "grad_norm_var": 4.233072916666667, + "learning_rate": 0.0001, + "loss": 7.5392, + "loss/crossentropy": 2.0831587575376034, + "loss/hidden": 3.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.20386858209967612, + "step": 9330 + }, + { + "epoch": 0.2335, + "grad_norm": 31.75, + "grad_norm_var": 225.54895833333333, + "learning_rate": 0.0001, + "loss": 7.5913, + "loss/crossentropy": 2.2117609918117522, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.2390334837138653, + "step": 9340 + }, + { + "epoch": 0.23375, + "grad_norm": 30.75, + "grad_norm_var": 10.939518229166667, + "learning_rate": 0.0001, + "loss": 7.4838, + "loss/crossentropy": 2.2391141802072525, + "loss/hidden": 3.35, + "loss/jsd": 0.0, + "loss/logits": 0.1901895135641098, + "step": 9350 + }, + { + "epoch": 0.234, + "grad_norm": 36.0, + "grad_norm_var": 8.078059895833333, + "learning_rate": 0.0001, + "loss": 7.4995, + "loss/crossentropy": 2.179610106348991, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.19948177523910998, + "step": 9360 + }, + { + "epoch": 0.23425, + "grad_norm": 37.0, + "grad_norm_var": 4.773372395833333, + "learning_rate": 0.0001, + "loss": 7.566, + "loss/crossentropy": 2.265758016705513, + "loss/hidden": 3.49140625, + "loss/jsd": 0.0, + "loss/logits": 0.2015686921775341, + "step": 9370 + }, + { + "epoch": 0.2345, + "grad_norm": 33.25, + "grad_norm_var": 5.334309895833333, + "learning_rate": 0.0001, + "loss": 7.5378, + "loss/crossentropy": 2.164343351125717, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.2006633374840021, + "step": 9380 + }, + { + "epoch": 0.23475, + "grad_norm": 32.5, + "grad_norm_var": 29.55625, + "learning_rate": 0.0001, + "loss": 7.5607, + "loss/crossentropy": 2.150678759813309, + "loss/hidden": 3.463671875, + "loss/jsd": 0.0, + "loss/logits": 0.22009918540716172, + "step": 9390 + }, + { + "epoch": 0.235, + "grad_norm": 31.75, + "grad_norm_var": 275.2309895833333, + "learning_rate": 0.0001, + "loss": 7.6283, + "loss/crossentropy": 2.2133118510246277, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.18500468730926514, + "step": 9400 + }, + { + "epoch": 0.23525, + "grad_norm": 29.125, + "grad_norm_var": 3.2697265625, + "learning_rate": 0.0001, + "loss": 7.5132, + "loss/crossentropy": 2.225054568052292, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.20218575857579707, + "step": 9410 + }, + { + "epoch": 0.2355, + "grad_norm": 30.875, + "grad_norm_var": 7.542122395833333, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 2.1811843127012254, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.19548335634171962, + "step": 9420 + }, + { + "epoch": 0.23575, + "grad_norm": 31.875, + "grad_norm_var": 1.2934895833333333, + "learning_rate": 0.0001, + "loss": 7.5001, + "loss/crossentropy": 1.9795912995934486, + "loss/hidden": 3.547265625, + "loss/jsd": 0.0, + "loss/logits": 0.19292352311313152, + "step": 9430 + }, + { + "epoch": 0.236, + "grad_norm": 31.125, + "grad_norm_var": 95.365625, + "learning_rate": 0.0001, + "loss": 7.5911, + "loss/crossentropy": 2.032686772942543, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.1814481422305107, + "step": 9440 + }, + { + "epoch": 0.23625, + "grad_norm": 31.375, + "grad_norm_var": 199.14348958333332, + "learning_rate": 0.0001, + "loss": 7.4887, + "loss/crossentropy": 2.1834616482257845, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.1900908298790455, + "step": 9450 + }, + { + "epoch": 0.2365, + "grad_norm": 42.5, + "grad_norm_var": 9.148893229166667, + "learning_rate": 0.0001, + "loss": 7.5461, + "loss/crossentropy": 2.1409550577402117, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.19009583443403244, + "step": 9460 + }, + { + "epoch": 0.23675, + "grad_norm": 30.0, + "grad_norm_var": 11.903125, + "learning_rate": 0.0001, + "loss": 7.5157, + "loss/crossentropy": 2.0411314353346826, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.18683939017355441, + "step": 9470 + }, + { + "epoch": 0.237, + "grad_norm": 35.75, + "grad_norm_var": 14.804622395833333, + "learning_rate": 0.0001, + "loss": 7.5184, + "loss/crossentropy": 2.0848400443792343, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.2053835943341255, + "step": 9480 + }, + { + "epoch": 0.23725, + "grad_norm": 31.875, + "grad_norm_var": 4.326822916666667, + "learning_rate": 0.0001, + "loss": 7.5465, + "loss/crossentropy": 2.198624536395073, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.20486781597137452, + "step": 9490 + }, + { + "epoch": 0.2375, + "grad_norm": 39.75, + "grad_norm_var": 23.5791015625, + "learning_rate": 0.0001, + "loss": 7.5646, + "loss/crossentropy": 2.136742886900902, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.19704439640045165, + "step": 9500 + }, + { + "epoch": 0.23775, + "grad_norm": 45.5, + "grad_norm_var": 35.976497395833334, + "learning_rate": 0.0001, + "loss": 7.4662, + "loss/crossentropy": 2.1736690044403075, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.19949170239269734, + "step": 9510 + }, + { + "epoch": 0.238, + "grad_norm": 31.625, + "grad_norm_var": 23.130989583333335, + "learning_rate": 0.0001, + "loss": 7.5822, + "loss/crossentropy": 2.1334351167082786, + "loss/hidden": 3.53828125, + "loss/jsd": 0.0, + "loss/logits": 0.20324139203876257, + "step": 9520 + }, + { + "epoch": 0.23825, + "grad_norm": 29.25, + "grad_norm_var": 17.6181640625, + "learning_rate": 0.0001, + "loss": 7.5358, + "loss/crossentropy": 2.142412620782852, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.2030354391783476, + "step": 9530 + }, + { + "epoch": 0.2385, + "grad_norm": 39.75, + "grad_norm_var": 18.976041666666667, + "learning_rate": 0.0001, + "loss": 7.494, + "loss/crossentropy": 2.181875669956207, + "loss/hidden": 3.4671875, + "loss/jsd": 0.0, + "loss/logits": 0.1956815965473652, + "step": 9540 + }, + { + "epoch": 0.23875, + "grad_norm": 34.75, + "grad_norm_var": 8.832747395833334, + "learning_rate": 0.0001, + "loss": 7.648, + "loss/crossentropy": 2.2528909504413606, + "loss/hidden": 3.435546875, + "loss/jsd": 0.0, + "loss/logits": 0.20324745066463948, + "step": 9550 + }, + { + "epoch": 0.239, + "grad_norm": 35.5, + "grad_norm_var": 3.3268229166666665, + "learning_rate": 0.0001, + "loss": 7.5439, + "loss/crossentropy": 2.2434193670749663, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.20051232390105725, + "step": 9560 + }, + { + "epoch": 0.23925, + "grad_norm": 31.75, + "grad_norm_var": 1.7962890625, + "learning_rate": 0.0001, + "loss": 7.5977, + "loss/crossentropy": 2.0810982078313827, + "loss/hidden": 3.519921875, + "loss/jsd": 0.0, + "loss/logits": 0.1919446151703596, + "step": 9570 + }, + { + "epoch": 0.2395, + "grad_norm": 31.5, + "grad_norm_var": 18.6556640625, + "learning_rate": 0.0001, + "loss": 7.4919, + "loss/crossentropy": 2.11810165643692, + "loss/hidden": 3.272265625, + "loss/jsd": 0.0, + "loss/logits": 0.17909912299364805, + "step": 9580 + }, + { + "epoch": 0.23975, + "grad_norm": 30.25, + "grad_norm_var": 19.439322916666665, + "learning_rate": 0.0001, + "loss": 7.5892, + "loss/crossentropy": 2.2458123177289964, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.2052942331880331, + "step": 9590 + }, + { + "epoch": 0.24, + "grad_norm": 32.5, + "grad_norm_var": 5.358268229166667, + "learning_rate": 0.0001, + "loss": 7.4833, + "loss/crossentropy": 2.294092634320259, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.18821652382612228, + "step": 9600 + }, + { + "epoch": 0.24025, + "grad_norm": 34.0, + "grad_norm_var": 15.582747395833334, + "learning_rate": 0.0001, + "loss": 7.623, + "loss/crossentropy": 2.152405506372452, + "loss/hidden": 3.583203125, + "loss/jsd": 0.0, + "loss/logits": 0.20593744479119777, + "step": 9610 + }, + { + "epoch": 0.2405, + "grad_norm": 30.5, + "grad_norm_var": 11.377018229166667, + "learning_rate": 0.0001, + "loss": 7.6211, + "loss/crossentropy": 2.0907988399267197, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.19778051003813743, + "step": 9620 + }, + { + "epoch": 0.24075, + "grad_norm": 31.25, + "grad_norm_var": 4.201822916666667, + "learning_rate": 0.0001, + "loss": 7.5435, + "loss/crossentropy": 2.1334788501262665, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.1883639894425869, + "step": 9630 + }, + { + "epoch": 0.241, + "grad_norm": 32.25, + "grad_norm_var": 9.456705729166666, + "learning_rate": 0.0001, + "loss": 7.4907, + "loss/crossentropy": 2.1615520387887956, + "loss/hidden": 3.52734375, + "loss/jsd": 0.0, + "loss/logits": 0.212527497112751, + "step": 9640 + }, + { + "epoch": 0.24125, + "grad_norm": 30.875, + "grad_norm_var": 9.05, + "learning_rate": 0.0001, + "loss": 7.4189, + "loss/crossentropy": 2.182580092549324, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.1890213243663311, + "step": 9650 + }, + { + "epoch": 0.2415, + "grad_norm": 29.75, + "grad_norm_var": 4.44765625, + "learning_rate": 0.0001, + "loss": 7.5499, + "loss/crossentropy": 2.1858610659837723, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1913149781525135, + "step": 9660 + }, + { + "epoch": 0.24175, + "grad_norm": 29.875, + "grad_norm_var": 7.605989583333334, + "learning_rate": 0.0001, + "loss": 7.4714, + "loss/crossentropy": 2.212157425284386, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.1864764802157879, + "step": 9670 + }, + { + "epoch": 0.242, + "grad_norm": 30.375, + "grad_norm_var": 6.686393229166667, + "learning_rate": 0.0001, + "loss": 7.4193, + "loss/crossentropy": 2.226425829529762, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.19140857569873332, + "step": 9680 + }, + { + "epoch": 0.24225, + "grad_norm": 31.0, + "grad_norm_var": 2.3114583333333334, + "learning_rate": 0.0001, + "loss": 7.5538, + "loss/crossentropy": 2.123721697926521, + "loss/hidden": 3.612109375, + "loss/jsd": 0.0, + "loss/logits": 0.21398749127984046, + "step": 9690 + }, + { + "epoch": 0.2425, + "grad_norm": 31.25, + "grad_norm_var": 2.03515625, + "learning_rate": 0.0001, + "loss": 7.4686, + "loss/crossentropy": 2.189563122391701, + "loss/hidden": 3.285546875, + "loss/jsd": 0.0, + "loss/logits": 0.19756122268736362, + "step": 9700 + }, + { + "epoch": 0.24275, + "grad_norm": 36.75, + "grad_norm_var": 4326.120768229167, + "learning_rate": 0.0001, + "loss": 7.6897, + "loss/crossentropy": 2.137082815170288, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.18757453747093678, + "step": 9710 + }, + { + "epoch": 0.243, + "grad_norm": 32.5, + "grad_norm_var": 4357.42265625, + "learning_rate": 0.0001, + "loss": 7.4598, + "loss/crossentropy": 2.1567126482725145, + "loss/hidden": 3.319921875, + "loss/jsd": 0.0, + "loss/logits": 0.19823984801769257, + "step": 9720 + }, + { + "epoch": 0.24325, + "grad_norm": 30.625, + "grad_norm_var": 3.7192057291666667, + "learning_rate": 0.0001, + "loss": 7.5681, + "loss/crossentropy": 2.0657265201210975, + "loss/hidden": 3.423828125, + "loss/jsd": 0.0, + "loss/logits": 0.21410678885877132, + "step": 9730 + }, + { + "epoch": 0.2435, + "grad_norm": 30.875, + "grad_norm_var": 3.54765625, + "learning_rate": 0.0001, + "loss": 7.4632, + "loss/crossentropy": 2.231111526489258, + "loss/hidden": 3.218359375, + "loss/jsd": 0.0, + "loss/logits": 0.1727623924612999, + "step": 9740 + }, + { + "epoch": 0.24375, + "grad_norm": 33.25, + "grad_norm_var": 32.34348958333333, + "learning_rate": 0.0001, + "loss": 7.5244, + "loss/crossentropy": 2.184462660551071, + "loss/hidden": 3.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.19121489115059376, + "step": 9750 + }, + { + "epoch": 0.244, + "grad_norm": 29.5, + "grad_norm_var": 13.9212890625, + "learning_rate": 0.0001, + "loss": 7.5373, + "loss/crossentropy": 2.237063002586365, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.19552523456513882, + "step": 9760 + }, + { + "epoch": 0.24425, + "grad_norm": 31.0, + "grad_norm_var": 12.137239583333333, + "learning_rate": 0.0001, + "loss": 7.5296, + "loss/crossentropy": 2.1166174903512003, + "loss/hidden": 3.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.19640736747533083, + "step": 9770 + }, + { + "epoch": 0.2445, + "grad_norm": 31.625, + "grad_norm_var": 1.8927083333333334, + "learning_rate": 0.0001, + "loss": 7.4695, + "loss/crossentropy": 2.215598449110985, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.1820572379976511, + "step": 9780 + }, + { + "epoch": 0.24475, + "grad_norm": 28.375, + "grad_norm_var": 6.012239583333334, + "learning_rate": 0.0001, + "loss": 7.4857, + "loss/crossentropy": 2.19299538731575, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.20531897619366646, + "step": 9790 + }, + { + "epoch": 0.245, + "grad_norm": 36.5, + "grad_norm_var": 2.2799472864610222e+18, + "learning_rate": 0.0001, + "loss": 7.6354, + "loss/crossentropy": 2.141967089474201, + "loss/hidden": 3.656640625, + "loss/jsd": 0.0, + "loss/logits": 0.19617959037423133, + "step": 9800 + }, + { + "epoch": 0.24525, + "grad_norm": 31.75, + "grad_norm_var": 2.2799472865365197e+18, + "learning_rate": 0.0001, + "loss": 7.5381, + "loss/crossentropy": 2.1641202688217165, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.19467307589948177, + "step": 9810 + }, + { + "epoch": 0.2455, + "grad_norm": 32.0, + "grad_norm_var": 1.9330729166666667, + "learning_rate": 0.0001, + "loss": 7.4407, + "loss/crossentropy": 2.1282627910375593, + "loss/hidden": 3.428515625, + "loss/jsd": 0.0, + "loss/logits": 0.1983230970799923, + "step": 9820 + }, + { + "epoch": 0.24575, + "grad_norm": 30.375, + "grad_norm_var": 1.5020833333333334, + "learning_rate": 0.0001, + "loss": 7.5029, + "loss/crossentropy": 2.1581913977861404, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.20057737156748773, + "step": 9830 + }, + { + "epoch": 0.246, + "grad_norm": 32.0, + "grad_norm_var": 3.36640625, + "learning_rate": 0.0001, + "loss": 7.5631, + "loss/crossentropy": 2.111289617419243, + "loss/hidden": 3.56328125, + "loss/jsd": 0.0, + "loss/logits": 0.19521272610872983, + "step": 9840 + }, + { + "epoch": 0.24625, + "grad_norm": 34.0, + "grad_norm_var": 2.7947265625, + "learning_rate": 0.0001, + "loss": 7.4617, + "loss/crossentropy": 2.1353485763072966, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.18625408709049224, + "step": 9850 + }, + { + "epoch": 0.2465, + "grad_norm": 29.875, + "grad_norm_var": 2.5582682291666665, + "learning_rate": 0.0001, + "loss": 7.4291, + "loss/crossentropy": 2.2995183020830154, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1992196377366781, + "step": 9860 + }, + { + "epoch": 0.24675, + "grad_norm": 31.25, + "grad_norm_var": 1.32890625, + "learning_rate": 0.0001, + "loss": 7.6033, + "loss/crossentropy": 2.2881169497966765, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.1932061992585659, + "step": 9870 + }, + { + "epoch": 0.247, + "grad_norm": 30.0, + "grad_norm_var": 14.017643229166667, + "learning_rate": 0.0001, + "loss": 7.5277, + "loss/crossentropy": 2.1658580511808396, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.19005278386175634, + "step": 9880 + }, + { + "epoch": 0.24725, + "grad_norm": 29.625, + "grad_norm_var": 11.984830729166667, + "learning_rate": 0.0001, + "loss": 7.4924, + "loss/crossentropy": 2.2023943603038787, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.19262171424925328, + "step": 9890 + }, + { + "epoch": 0.2475, + "grad_norm": 31.625, + "grad_norm_var": 6.539518229166666, + "learning_rate": 0.0001, + "loss": 7.3901, + "loss/crossentropy": 2.1616971135139464, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.19074857234954834, + "step": 9900 + }, + { + "epoch": 0.24775, + "grad_norm": 31.625, + "grad_norm_var": 2.436393229166667, + "learning_rate": 0.0001, + "loss": 7.4839, + "loss/crossentropy": 2.21129602342844, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.1898047223687172, + "step": 9910 + }, + { + "epoch": 0.248, + "grad_norm": 31.125, + "grad_norm_var": 1.9580729166666666, + "learning_rate": 0.0001, + "loss": 7.5074, + "loss/crossentropy": 2.1486201629042627, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.19970939867198467, + "step": 9920 + }, + { + "epoch": 0.24825, + "grad_norm": 31.75, + "grad_norm_var": 1.9666666666666666, + "learning_rate": 0.0001, + "loss": 7.4634, + "loss/crossentropy": 2.073464626073837, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.21017258744686843, + "step": 9930 + }, + { + "epoch": 0.2485, + "grad_norm": 31.25, + "grad_norm_var": 1.4166666666666667, + "learning_rate": 0.0001, + "loss": 7.4696, + "loss/crossentropy": 2.156512539088726, + "loss/hidden": 3.529296875, + "loss/jsd": 0.0, + "loss/logits": 0.20021300427615643, + "step": 9940 + }, + { + "epoch": 0.24875, + "grad_norm": 31.375, + "grad_norm_var": 2.6681640625, + "learning_rate": 0.0001, + "loss": 7.5618, + "loss/crossentropy": 2.098437860608101, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.2031120091676712, + "step": 9950 + }, + { + "epoch": 0.249, + "grad_norm": 29.125, + "grad_norm_var": 4.5259765625, + "learning_rate": 0.0001, + "loss": 7.5073, + "loss/crossentropy": 2.054112070798874, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.18417572602629662, + "step": 9960 + }, + { + "epoch": 0.24925, + "grad_norm": 32.25, + "grad_norm_var": 9.677083333333334, + "learning_rate": 0.0001, + "loss": 7.4506, + "loss/crossentropy": 2.173594242334366, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.1788831666111946, + "step": 9970 + }, + { + "epoch": 0.2495, + "grad_norm": 31.625, + "grad_norm_var": 12.448958333333334, + "learning_rate": 0.0001, + "loss": 7.5566, + "loss/crossentropy": 2.1865617662668226, + "loss/hidden": 3.54375, + "loss/jsd": 0.0, + "loss/logits": 0.219557130523026, + "step": 9980 + }, + { + "epoch": 0.24975, + "grad_norm": 31.125, + "grad_norm_var": 7.601822916666666, + "learning_rate": 0.0001, + "loss": 7.6451, + "loss/crossentropy": 2.161116376519203, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.20050363764166831, + "step": 9990 + }, + { + "epoch": 0.25, + "grad_norm": 30.375, + "grad_norm_var": 1.603125, + "learning_rate": 0.0001, + "loss": 7.4863, + "loss/crossentropy": 2.056240776181221, + "loss/hidden": 3.48203125, + "loss/jsd": 0.0, + "loss/logits": 0.21083669643849134, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 40000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8575100320088064e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}