diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1875, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000125, + "grad_norm": 2.377527952194214, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.2768, + "loss/crossentropy": 2.697097063064575, + "loss/hidden": 1.1171875, + "loss/logits": 0.15893849730491638, + "loss/reg": 6.247002602322027e-05, + "step": 1 + }, + { + "epoch": 0.00025, + "grad_norm": 4.216994762420654, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3752, + "loss/crossentropy": 3.101844310760498, + "loss/hidden": 1.1796875, + "loss/logits": 0.1949012577533722, + "loss/reg": 6.247002602322027e-05, + "step": 2 + }, + { + "epoch": 0.000375, + "grad_norm": 2.3287529945373535, + "learning_rate": 3e-06, + "loss": 1.2785, + "loss/crossentropy": 2.63712477684021, + "loss/hidden": 1.09375, + "loss/logits": 0.18410107493400574, + "loss/reg": 6.246996053960174e-05, + "step": 3 + }, + { + "epoch": 0.0005, + "grad_norm": 5.415231227874756, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4285, + "loss/crossentropy": 2.5702285766601562, + "loss/hidden": 1.265625, + "loss/logits": 0.16228657960891724, + "loss/reg": 6.246980774449185e-05, + "step": 4 + }, + { + "epoch": 0.000625, + "grad_norm": 4.888370513916016, + "learning_rate": 5e-06, + "loss": 1.5121, + "loss/crossentropy": 2.439383029937744, + "loss/hidden": 1.3125, + "loss/logits": 0.19899356365203857, + "loss/reg": 6.24695821898058e-05, + "step": 5 + }, + { + "epoch": 0.00075, + "grad_norm": 2.608705997467041, + "learning_rate": 6e-06, + "loss": 1.293, + "loss/crossentropy": 2.668699026107788, + "loss/hidden": 1.109375, + "loss/logits": 0.18298496305942535, + "loss/reg": 6.246933480724692e-05, + "step": 6 + }, + { + "epoch": 0.000875, + "grad_norm": 2.8447623252868652, + "learning_rate": 7.000000000000001e-06, + "loss": 1.5339, + "loss/crossentropy": 2.5219366550445557, + "loss/hidden": 1.296875, + "loss/logits": 0.2364223599433899, + "loss/reg": 6.246914563234895e-05, + "step": 7 + }, + { + "epoch": 0.001, + "grad_norm": 3.7877628803253174, + "learning_rate": 8.000000000000001e-06, + "loss": 1.8218, + "loss/crossentropy": 2.1927688121795654, + "loss/hidden": 1.5546875, + "loss/logits": 0.2664879262447357, + "loss/reg": 6.246889097383246e-05, + "step": 8 + }, + { + "epoch": 0.001125, + "grad_norm": 2.988516330718994, + "learning_rate": 9e-06, + "loss": 1.7373, + "loss/crossentropy": 2.3826897144317627, + "loss/hidden": 1.421875, + "loss/logits": 0.314752995967865, + "loss/reg": 6.246858538361266e-05, + "step": 9 + }, + { + "epoch": 0.00125, + "grad_norm": 2.143723726272583, + "learning_rate": 1e-05, + "loss": 1.405, + "loss/crossentropy": 2.2246415615081787, + "loss/hidden": 1.234375, + "loss/logits": 0.16997714340686798, + "loss/reg": 6.246842531254515e-05, + "step": 10 + }, + { + "epoch": 0.001375, + "grad_norm": 2.4413657188415527, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.4206, + "loss/crossentropy": 2.4612021446228027, + "loss/hidden": 1.1796875, + "loss/logits": 0.24033024907112122, + "loss/reg": 6.246819975785911e-05, + "step": 11 + }, + { + "epoch": 0.0015, + "grad_norm": 2.483156204223633, + "learning_rate": 1.2e-05, + "loss": 1.6449, + "loss/crossentropy": 2.2882771492004395, + "loss/hidden": 1.4140625, + "loss/logits": 0.23023059964179993, + "loss/reg": 6.246790871955454e-05, + "step": 12 + }, + { + "epoch": 0.001625, + "grad_norm": 2.7368147373199463, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.4981, + "loss/crossentropy": 2.6942052841186523, + "loss/hidden": 1.265625, + "loss/logits": 0.23185348510742188, + "loss/reg": 6.24675813014619e-05, + "step": 13 + }, + { + "epoch": 0.00175, + "grad_norm": 5.189184665679932, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.946, + "loss/crossentropy": 2.3771214485168457, + "loss/hidden": 1.625, + "loss/logits": 0.320385217666626, + "loss/reg": 6.246678822208196e-05, + "step": 14 + }, + { + "epoch": 0.001875, + "grad_norm": 2.305589437484741, + "learning_rate": 1.5e-05, + "loss": 1.4982, + "loss/crossentropy": 2.7562549114227295, + "loss/hidden": 1.25, + "loss/logits": 0.2476150244474411, + "loss/reg": 6.246620614547282e-05, + "step": 15 + }, + { + "epoch": 0.002, + "grad_norm": 2.3378520011901855, + "grad_norm_var": 1.2675163586822178, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3302, + "loss/crossentropy": 2.445441961288452, + "loss/hidden": 1.125, + "loss/logits": 0.20453599095344543, + "loss/reg": 6.246585689950734e-05, + "step": 16 + }, + { + "epoch": 0.002125, + "grad_norm": 1.7903435230255127, + "grad_norm_var": 1.3529406709866008, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.1333, + "loss/crossentropy": 2.323503017425537, + "loss/hidden": 0.984375, + "loss/logits": 0.14828170835971832, + "loss/reg": 6.246510747587308e-05, + "step": 17 + }, + { + "epoch": 0.00225, + "grad_norm": 3.363795518875122, + "grad_norm_var": 1.277817936381435, + "learning_rate": 1.8e-05, + "loss": 1.7292, + "loss/crossentropy": 2.6075525283813477, + "loss/hidden": 1.46875, + "loss/logits": 0.25987327098846436, + "loss/reg": 6.24642925686203e-05, + "step": 18 + }, + { + "epoch": 0.002375, + "grad_norm": 2.162050724029541, + "grad_norm_var": 1.2967721886362786, + "learning_rate": 1.9e-05, + "loss": 1.3146, + "loss/crossentropy": 2.570558786392212, + "loss/hidden": 1.125, + "loss/logits": 0.18898281455039978, + "loss/reg": 6.246323027880862e-05, + "step": 19 + }, + { + "epoch": 0.0025, + "grad_norm": 2.147024393081665, + "grad_norm_var": 0.9523869945360727, + "learning_rate": 2e-05, + "loss": 1.3484, + "loss/crossentropy": 2.6676244735717773, + "loss/hidden": 1.1484375, + "loss/logits": 0.19929195940494537, + "loss/reg": 6.246233533602208e-05, + "step": 20 + }, + { + "epoch": 0.002625, + "grad_norm": 2.0668728351593018, + "grad_norm_var": 0.6976603751830339, + "learning_rate": 2.1e-05, + "loss": 1.1929, + "loss/crossentropy": 2.401143789291382, + "loss/hidden": 1.03125, + "loss/logits": 0.1610003113746643, + "loss/reg": 6.246144039323553e-05, + "step": 21 + }, + { + "epoch": 0.00275, + "grad_norm": 2.8019566535949707, + "grad_norm_var": 0.6973240463492516, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.419, + "loss/crossentropy": 2.627523183822632, + "loss/hidden": 1.203125, + "loss/logits": 0.2152642011642456, + "loss/reg": 6.246032717172056e-05, + "step": 22 + }, + { + "epoch": 0.002875, + "grad_norm": 3.8118937015533447, + "grad_norm_var": 0.7713008187193999, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.4284, + "loss/crossentropy": 2.7227890491485596, + "loss/hidden": 1.1640625, + "loss/logits": 0.2637593150138855, + "loss/reg": 6.245896656764671e-05, + "step": 23 + }, + { + "epoch": 0.003, + "grad_norm": 2.1418018341064453, + "grad_norm_var": 0.7205284729945551, + "learning_rate": 2.4e-05, + "loss": 1.3002, + "loss/crossentropy": 2.545552968978882, + "loss/hidden": 1.1328125, + "loss/logits": 0.16680249571800232, + "loss/reg": 6.245774420676753e-05, + "step": 24 + }, + { + "epoch": 0.003125, + "grad_norm": 3.5331156253814697, + "grad_norm_var": 0.7613226543465996, + "learning_rate": 2.5e-05, + "loss": 1.3224, + "loss/crossentropy": 2.2371270656585693, + "loss/hidden": 1.15625, + "loss/logits": 0.16548338532447815, + "loss/reg": 6.245705299079418e-05, + "step": 25 + }, + { + "epoch": 0.00325, + "grad_norm": 1.9795947074890137, + "grad_norm_var": 0.7755306597344306, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.3209, + "loss/crossentropy": 2.7113037109375, + "loss/hidden": 1.1328125, + "loss/logits": 0.18742361664772034, + "loss/reg": 6.245569966267794e-05, + "step": 26 + }, + { + "epoch": 0.003375, + "grad_norm": 2.6044108867645264, + "grad_norm_var": 0.7714440385524235, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.4566, + "loss/crossentropy": 2.6034419536590576, + "loss/hidden": 1.2265625, + "loss/logits": 0.22937631607055664, + "loss/reg": 6.245376425795257e-05, + "step": 27 + }, + { + "epoch": 0.0035, + "grad_norm": 2.48085355758667, + "grad_norm_var": 0.7715158471256792, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.4579, + "loss/crossentropy": 2.5794363021850586, + "loss/hidden": 1.2421875, + "loss/logits": 0.21509718894958496, + "loss/reg": 6.245166878215969e-05, + "step": 28 + }, + { + "epoch": 0.003625, + "grad_norm": 3.0413854122161865, + "grad_norm_var": 0.7781660489700184, + "learning_rate": 2.9e-05, + "loss": 1.6102, + "loss/crossentropy": 2.4173922538757324, + "loss/hidden": 1.375, + "loss/logits": 0.23455965518951416, + "loss/reg": 6.244902033358812e-05, + "step": 29 + }, + { + "epoch": 0.00375, + "grad_norm": 2.1076390743255615, + "grad_norm_var": 0.36324525064493024, + "learning_rate": 3e-05, + "loss": 1.0735, + "loss/crossentropy": 2.4064886569976807, + "loss/hidden": 0.9453125, + "loss/logits": 0.12752822041511536, + "loss/reg": 6.244838004931808e-05, + "step": 30 + }, + { + "epoch": 0.003875, + "grad_norm": 2.5296630859375, + "grad_norm_var": 0.359312391151574, + "learning_rate": 3.1e-05, + "loss": 1.3467, + "loss/crossentropy": 2.61391544342041, + "loss/hidden": 1.15625, + "loss/logits": 0.18978667259216309, + "loss/reg": 6.244736141525209e-05, + "step": 31 + }, + { + "epoch": 0.004, + "grad_norm": 2.123671054840088, + "grad_norm_var": 0.3684168280400947, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.2191, + "loss/crossentropy": 2.6056668758392334, + "loss/hidden": 1.0546875, + "loss/logits": 0.16381201148033142, + "loss/reg": 6.244605174288154e-05, + "step": 32 + }, + { + "epoch": 0.004125, + "grad_norm": 3.685770034790039, + "grad_norm_var": 0.4027733703548923, + "learning_rate": 3.3e-05, + "loss": 1.6794, + "loss/crossentropy": 2.519561290740967, + "loss/hidden": 1.3828125, + "loss/logits": 0.29592496156692505, + "loss/reg": 6.24443418928422e-05, + "step": 33 + }, + { + "epoch": 0.00425, + "grad_norm": 1.9660468101501465, + "grad_norm_var": 0.393966226946808, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.3395, + "loss/crossentropy": 2.638051986694336, + "loss/hidden": 1.15625, + "loss/logits": 0.18261724710464478, + "loss/reg": 6.244314135983586e-05, + "step": 34 + }, + { + "epoch": 0.004375, + "grad_norm": 2.3111677169799805, + "grad_norm_var": 0.38716579449971367, + "learning_rate": 3.5e-05, + "loss": 1.3501, + "loss/crossentropy": 2.599940776824951, + "loss/hidden": 1.15625, + "loss/logits": 0.19327056407928467, + "loss/reg": 6.244215182960033e-05, + "step": 35 + }, + { + "epoch": 0.0045, + "grad_norm": 2.5357542037963867, + "grad_norm_var": 0.3739975607775089, + "learning_rate": 3.6e-05, + "loss": 1.287, + "loss/crossentropy": 2.9884798526763916, + "loss/hidden": 1.1171875, + "loss/logits": 0.16922441124916077, + "loss/reg": 6.244022370083258e-05, + "step": 36 + }, + { + "epoch": 0.004625, + "grad_norm": 1.7781621217727661, + "grad_norm_var": 0.40002233468076764, + "learning_rate": 3.7e-05, + "loss": 1.074, + "loss/crossentropy": 2.669071674346924, + "loss/hidden": 0.93359375, + "loss/logits": 0.13981276750564575, + "loss/reg": 6.243858661036938e-05, + "step": 37 + }, + { + "epoch": 0.00475, + "grad_norm": 24.6973819732666, + "grad_norm_var": 30.983207545217457, + "learning_rate": 3.8e-05, + "loss": 1.3637, + "loss/crossentropy": 2.482579469680786, + "loss/hidden": 1.1953125, + "loss/logits": 0.16777344048023224, + "loss/reg": 6.243725511012599e-05, + "step": 38 + }, + { + "epoch": 0.004875, + "grad_norm": 2.5728342533111572, + "grad_norm_var": 31.103302953089262, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.3424, + "loss/crossentropy": 2.2785422801971436, + "loss/hidden": 1.171875, + "loss/logits": 0.16988611221313477, + "loss/reg": 6.243555981200188e-05, + "step": 39 + }, + { + "epoch": 0.005, + "grad_norm": 1.7385622262954712, + "grad_norm_var": 31.206951393275006, + "learning_rate": 4e-05, + "loss": 1.077, + "loss/crossentropy": 2.7017714977264404, + "loss/hidden": 0.9453125, + "loss/logits": 0.13102804124355316, + "loss/reg": 6.243350071599707e-05, + "step": 40 + }, + { + "epoch": 0.005125, + "grad_norm": 2.455116033554077, + "grad_norm_var": 31.325901099338942, + "learning_rate": 4.1e-05, + "loss": 1.178, + "loss/crossentropy": 2.6521873474121094, + "loss/hidden": 1.015625, + "loss/logits": 0.16170336306095123, + "loss/reg": 6.243147072382271e-05, + "step": 41 + }, + { + "epoch": 0.00525, + "grad_norm": 3.0441935062408447, + "grad_norm_var": 31.14003983168487, + "learning_rate": 4.2e-05, + "loss": 1.488, + "loss/crossentropy": 2.5000290870666504, + "loss/hidden": 1.265625, + "loss/logits": 0.2217317819595337, + "loss/reg": 6.24291569693014e-05, + "step": 42 + }, + { + "epoch": 0.005375, + "grad_norm": 2.6227200031280518, + "grad_norm_var": 31.137008952861066, + "learning_rate": 4.3e-05, + "loss": 1.3106, + "loss/crossentropy": 2.6832528114318848, + "loss/hidden": 1.1171875, + "loss/logits": 0.19276997447013855, + "loss/reg": 6.242711242521182e-05, + "step": 43 + }, + { + "epoch": 0.0055, + "grad_norm": 2.9194633960723877, + "grad_norm_var": 31.06863081080745, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.5396, + "loss/crossentropy": 2.483938455581665, + "loss/hidden": 1.3046875, + "loss/logits": 0.23424991965293884, + "loss/reg": 6.242513336474076e-05, + "step": 44 + }, + { + "epoch": 0.005625, + "grad_norm": 2.2491037845611572, + "grad_norm_var": 31.196778907875057, + "learning_rate": 4.5e-05, + "loss": 1.2321, + "loss/crossentropy": 2.9735186100006104, + "loss/hidden": 1.0625, + "loss/logits": 0.1689363420009613, + "loss/reg": 6.242344534257427e-05, + "step": 45 + }, + { + "epoch": 0.00575, + "grad_norm": 2.687225103378296, + "grad_norm_var": 31.084396554405373, + "learning_rate": 4.600000000000001e-05, + "loss": 1.2443, + "loss/crossentropy": 2.913846254348755, + "loss/hidden": 1.0625, + "loss/logits": 0.18112678825855255, + "loss/reg": 6.242193921934813e-05, + "step": 46 + }, + { + "epoch": 0.005875, + "grad_norm": 2.3648312091827393, + "grad_norm_var": 31.1155476706496, + "learning_rate": 4.7e-05, + "loss": 1.2044, + "loss/crossentropy": 2.374119520187378, + "loss/hidden": 1.046875, + "loss/logits": 0.15688437223434448, + "loss/reg": 6.242006929824129e-05, + "step": 47 + }, + { + "epoch": 0.006, + "grad_norm": 1.896540880203247, + "grad_norm_var": 31.171339818602494, + "learning_rate": 4.8e-05, + "loss": 1.238, + "loss/crossentropy": 2.613962173461914, + "loss/hidden": 1.0546875, + "loss/logits": 0.1826920211315155, + "loss/reg": 6.24187450739555e-05, + "step": 48 + }, + { + "epoch": 0.006125, + "grad_norm": 1.7585434913635254, + "grad_norm_var": 31.44447201393312, + "learning_rate": 4.9e-05, + "loss": 1.1411, + "loss/crossentropy": 2.5672757625579834, + "loss/hidden": 1.0, + "loss/logits": 0.14043202996253967, + "loss/reg": 6.241785740712658e-05, + "step": 49 + }, + { + "epoch": 0.00625, + "grad_norm": 1.8257592916488647, + "grad_norm_var": 31.47860052328912, + "learning_rate": 5e-05, + "loss": 1.2643, + "loss/crossentropy": 2.4829366207122803, + "loss/hidden": 1.0859375, + "loss/logits": 0.1777852475643158, + "loss/reg": 6.2416227592621e-05, + "step": 50 + }, + { + "epoch": 0.006375, + "grad_norm": 1.9530550241470337, + "grad_norm_var": 31.553698309541367, + "learning_rate": 5.1000000000000006e-05, + "loss": 1.1787, + "loss/crossentropy": 2.501922369003296, + "loss/hidden": 1.015625, + "loss/logits": 0.16241338849067688, + "loss/reg": 6.241373193915933e-05, + "step": 51 + }, + { + "epoch": 0.0065, + "grad_norm": 2.366898536682129, + "grad_norm_var": 31.58155048439878, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.476, + "loss/crossentropy": 2.557314872741699, + "loss/hidden": 1.234375, + "loss/logits": 0.24098029732704163, + "loss/reg": 6.241213122848421e-05, + "step": 52 + }, + { + "epoch": 0.006625, + "grad_norm": 2.139944553375244, + "grad_norm_var": 31.497838767117898, + "learning_rate": 5.300000000000001e-05, + "loss": 1.3057, + "loss/crossentropy": 2.5664379596710205, + "loss/hidden": 1.125, + "loss/logits": 0.18005570769309998, + "loss/reg": 6.241026130737737e-05, + "step": 53 + }, + { + "epoch": 0.00675, + "grad_norm": 2.2614963054656982, + "grad_norm_var": 0.16298419379227144, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.2081, + "loss/crossentropy": 2.5651533603668213, + "loss/hidden": 1.046875, + "loss/logits": 0.1606135070323944, + "loss/reg": 6.240784568944946e-05, + "step": 54 + }, + { + "epoch": 0.006875, + "grad_norm": 1.88372802734375, + "grad_norm_var": 0.16791840248250048, + "learning_rate": 5.500000000000001e-05, + "loss": 1.2037, + "loss/crossentropy": 2.0431623458862305, + "loss/hidden": 1.0703125, + "loss/logits": 0.13271506130695343, + "loss/reg": 6.240410584723577e-05, + "step": 55 + }, + { + "epoch": 0.007, + "grad_norm": 1.7579172849655151, + "grad_norm_var": 0.16659499666655736, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.0787, + "loss/crossentropy": 2.5805883407592773, + "loss/hidden": 0.94140625, + "loss/logits": 0.13670633733272552, + "loss/reg": 6.240163202164695e-05, + "step": 56 + }, + { + "epoch": 0.007125, + "grad_norm": 2.740758180618286, + "grad_norm_var": 0.17906241043444873, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.2499, + "loss/crossentropy": 2.821078062057495, + "loss/hidden": 1.0859375, + "loss/logits": 0.16337308287620544, + "loss/reg": 6.239958747755736e-05, + "step": 57 + }, + { + "epoch": 0.00725, + "grad_norm": 3.3393216133117676, + "grad_norm_var": 0.21459676497742203, + "learning_rate": 5.8e-05, + "loss": 1.5094, + "loss/crossentropy": 2.6574273109436035, + "loss/hidden": 1.2265625, + "loss/logits": 0.2822623550891876, + "loss/reg": 6.239775393623859e-05, + "step": 58 + }, + { + "epoch": 0.007375, + "grad_norm": 2.1151742935180664, + "grad_norm_var": 0.20871929877454623, + "learning_rate": 5.9e-05, + "loss": 1.31, + "loss/crossentropy": 2.28176212310791, + "loss/hidden": 1.125, + "loss/logits": 0.18433833122253418, + "loss/reg": 6.239649519557133e-05, + "step": 59 + }, + { + "epoch": 0.0075, + "grad_norm": 1.9203850030899048, + "grad_norm_var": 0.18408730894700795, + "learning_rate": 6e-05, + "loss": 1.2862, + "loss/crossentropy": 2.319091558456421, + "loss/hidden": 1.09375, + "loss/logits": 0.1918697953224182, + "loss/reg": 6.239335925783962e-05, + "step": 60 + }, + { + "epoch": 0.007625, + "grad_norm": 2.689425230026245, + "grad_norm_var": 0.1988651894699956, + "learning_rate": 6.1e-05, + "loss": 1.2077, + "loss/crossentropy": 2.396440029144287, + "loss/hidden": 1.0546875, + "loss/logits": 0.1523526906967163, + "loss/reg": 6.239157664822415e-05, + "step": 61 + }, + { + "epoch": 0.00775, + "grad_norm": 2.0848548412323, + "grad_norm_var": 0.184926237897677, + "learning_rate": 6.2e-05, + "loss": 1.1889, + "loss/crossentropy": 2.375331401824951, + "loss/hidden": 1.03125, + "loss/logits": 0.15707406401634216, + "loss/reg": 6.238814967218786e-05, + "step": 62 + }, + { + "epoch": 0.007875, + "grad_norm": 1.9770179986953735, + "grad_norm_var": 0.18547542502594508, + "learning_rate": 6.3e-05, + "loss": 1.1255, + "loss/crossentropy": 2.5883288383483887, + "loss/hidden": 0.984375, + "loss/logits": 0.14046350121498108, + "loss/reg": 6.238514470169321e-05, + "step": 63 + }, + { + "epoch": 0.008, + "grad_norm": 1.9654349088668823, + "grad_norm_var": 0.1832653842408547, + "learning_rate": 6.400000000000001e-05, + "loss": 1.1315, + "loss/crossentropy": 2.6122260093688965, + "loss/hidden": 0.9765625, + "loss/logits": 0.1543133556842804, + "loss/reg": 6.238299101823941e-05, + "step": 64 + }, + { + "epoch": 0.008125, + "grad_norm": 2.110621690750122, + "grad_norm_var": 0.1715223081433841, + "learning_rate": 6.500000000000001e-05, + "loss": 1.1513, + "loss/crossentropy": 2.3829517364501953, + "loss/hidden": 1.0, + "loss/logits": 0.15063607692718506, + "loss/reg": 6.237896013772115e-05, + "step": 65 + }, + { + "epoch": 0.00825, + "grad_norm": 3.1477179527282715, + "grad_norm_var": 0.21553302023151552, + "learning_rate": 6.6e-05, + "loss": 1.4659, + "loss/crossentropy": 2.2805211544036865, + "loss/hidden": 1.2421875, + "loss/logits": 0.22310970723628998, + "loss/reg": 6.237393972696736e-05, + "step": 66 + }, + { + "epoch": 0.008375, + "grad_norm": 2.482203722000122, + "grad_norm_var": 0.21008166056666275, + "learning_rate": 6.7e-05, + "loss": 1.0839, + "loss/crossentropy": 2.982119560241699, + "loss/hidden": 0.94140625, + "loss/logits": 0.14186254143714905, + "loss/reg": 6.236990884644911e-05, + "step": 67 + }, + { + "epoch": 0.0085, + "grad_norm": 2.198028087615967, + "grad_norm_var": 0.21061508280485744, + "learning_rate": 6.800000000000001e-05, + "loss": 1.2007, + "loss/crossentropy": 2.725332498550415, + "loss/hidden": 1.0390625, + "loss/logits": 0.1610267162322998, + "loss/reg": 6.236397166503593e-05, + "step": 68 + }, + { + "epoch": 0.008625, + "grad_norm": 1.9412530660629272, + "grad_norm_var": 0.21734592747188602, + "learning_rate": 6.9e-05, + "loss": 1.1269, + "loss/crossentropy": 2.682379722595215, + "loss/hidden": 0.984375, + "loss/logits": 0.14185243844985962, + "loss/reg": 6.235777982510626e-05, + "step": 69 + }, + { + "epoch": 0.00875, + "grad_norm": 2.223443031311035, + "grad_norm_var": 0.21757323137186588, + "learning_rate": 7e-05, + "loss": 1.3663, + "loss/crossentropy": 2.6186935901641846, + "loss/hidden": 1.1640625, + "loss/logits": 0.2016535997390747, + "loss/reg": 6.23530286247842e-05, + "step": 70 + }, + { + "epoch": 0.008875, + "grad_norm": 3.4456241130828857, + "grad_norm_var": 0.28625219910078287, + "learning_rate": 7.1e-05, + "loss": 1.6214, + "loss/crossentropy": 2.054266929626465, + "loss/hidden": 1.421875, + "loss/logits": 0.19887767732143402, + "loss/reg": 6.234741158550605e-05, + "step": 71 + }, + { + "epoch": 0.009, + "grad_norm": 1.9013352394104004, + "grad_norm_var": 0.27557130255187207, + "learning_rate": 7.2e-05, + "loss": 1.1365, + "loss/crossentropy": 2.422841787338257, + "loss/hidden": 0.9765625, + "loss/logits": 0.15926527976989746, + "loss/reg": 6.234211468836293e-05, + "step": 72 + }, + { + "epoch": 0.009125, + "grad_norm": 2.4032697677612305, + "grad_norm_var": 0.267026183625853, + "learning_rate": 7.3e-05, + "loss": 1.4414, + "loss/crossentropy": 2.4159440994262695, + "loss/hidden": 1.21875, + "loss/logits": 0.22204136848449707, + "loss/reg": 6.233662861632183e-05, + "step": 73 + }, + { + "epoch": 0.00925, + "grad_norm": 1.915128231048584, + "grad_norm_var": 0.21002777018266153, + "learning_rate": 7.4e-05, + "loss": 1.2439, + "loss/crossentropy": 2.587275505065918, + "loss/hidden": 1.0625, + "loss/logits": 0.1807810664176941, + "loss/reg": 6.232755549717695e-05, + "step": 74 + }, + { + "epoch": 0.009375, + "grad_norm": 3.4048879146575928, + "grad_norm_var": 0.28520435687560547, + "learning_rate": 7.500000000000001e-05, + "loss": 1.2774, + "loss/crossentropy": 2.6182703971862793, + "loss/hidden": 1.125, + "loss/logits": 0.15172982215881348, + "loss/reg": 6.231923180166632e-05, + "step": 75 + }, + { + "epoch": 0.0095, + "grad_norm": 2.3605074882507324, + "grad_norm_var": 0.27132747056331724, + "learning_rate": 7.6e-05, + "loss": 1.1409, + "loss/crossentropy": 2.6013262271881104, + "loss/hidden": 0.98828125, + "loss/logits": 0.151985764503479, + "loss/reg": 6.231063889572397e-05, + "step": 76 + }, + { + "epoch": 0.009625, + "grad_norm": 2.6056039333343506, + "grad_norm_var": 0.2684276793201585, + "learning_rate": 7.7e-05, + "loss": 1.1, + "loss/crossentropy": 2.534158945083618, + "loss/hidden": 0.94921875, + "loss/logits": 0.1501779407262802, + "loss/reg": 6.230256258277223e-05, + "step": 77 + }, + { + "epoch": 0.00975, + "grad_norm": 1.7923972606658936, + "grad_norm_var": 0.285494251958092, + "learning_rate": 7.800000000000001e-05, + "loss": 1.1471, + "loss/crossentropy": 2.3036601543426514, + "loss/hidden": 0.98828125, + "loss/logits": 0.15817409753799438, + "loss/reg": 6.229766586329788e-05, + "step": 78 + }, + { + "epoch": 0.009875, + "grad_norm": 2.0376312732696533, + "grad_norm_var": 0.2825708803585835, + "learning_rate": 7.900000000000001e-05, + "loss": 1.2985, + "loss/crossentropy": 2.5548579692840576, + "loss/hidden": 1.140625, + "loss/logits": 0.1572834551334381, + "loss/reg": 6.229063728824258e-05, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.998662233352661, + "grad_norm_var": 0.29342903010298654, + "learning_rate": 8e-05, + "loss": 1.5504, + "loss/crossentropy": 2.4098215103149414, + "loss/hidden": 1.3046875, + "loss/logits": 0.24512597918510437, + "loss/reg": 6.22822335571982e-05, + "step": 80 + }, + { + "epoch": 0.010125, + "grad_norm": 2.103449583053589, + "grad_norm_var": 0.29374293883859787, + "learning_rate": 8.1e-05, + "loss": 1.2985, + "loss/crossentropy": 2.380378484725952, + "loss/hidden": 1.125, + "loss/logits": 0.17282900214195251, + "loss/reg": 6.227292760740966e-05, + "step": 81 + }, + { + "epoch": 0.01025, + "grad_norm": 2.6376256942749023, + "grad_norm_var": 0.2615363410208279, + "learning_rate": 8.2e-05, + "loss": 1.266, + "loss/crossentropy": 2.4291374683380127, + "loss/hidden": 1.1015625, + "loss/logits": 0.16384728252887726, + "loss/reg": 6.226752884685993e-05, + "step": 82 + }, + { + "epoch": 0.010375, + "grad_norm": 2.0763561725616455, + "grad_norm_var": 0.2675552215302521, + "learning_rate": 8.3e-05, + "loss": 1.1733, + "loss/crossentropy": 2.423896312713623, + "loss/hidden": 1.015625, + "loss/logits": 0.15705125033855438, + "loss/reg": 6.225931429071352e-05, + "step": 83 + }, + { + "epoch": 0.0105, + "grad_norm": 4.398110866546631, + "grad_norm_var": 0.5173355174320988, + "learning_rate": 8.4e-05, + "loss": 1.5654, + "loss/crossentropy": 2.230816602706909, + "loss/hidden": 1.296875, + "loss/logits": 0.26791903376579285, + "loss/reg": 6.225006654858589e-05, + "step": 84 + }, + { + "epoch": 0.010625, + "grad_norm": 2.7163784503936768, + "grad_norm_var": 0.4955558090734691, + "learning_rate": 8.5e-05, + "loss": 1.2008, + "loss/crossentropy": 2.1671087741851807, + "loss/hidden": 1.0546875, + "loss/logits": 0.145525261759758, + "loss/reg": 6.224414391908795e-05, + "step": 85 + }, + { + "epoch": 0.01075, + "grad_norm": 1.9465394020080566, + "grad_norm_var": 0.5129132822581631, + "learning_rate": 8.6e-05, + "loss": 1.0109, + "loss/crossentropy": 2.218550443649292, + "loss/hidden": 0.90234375, + "loss/logits": 0.10795612633228302, + "loss/reg": 6.22385778115131e-05, + "step": 86 + }, + { + "epoch": 0.010875, + "grad_norm": 5.668015956878662, + "grad_norm_var": 1.0880389746416426, + "learning_rate": 8.7e-05, + "loss": 1.2925, + "loss/crossentropy": 2.360995292663574, + "loss/hidden": 1.1484375, + "loss/logits": 0.1434704214334488, + "loss/reg": 6.223141826922074e-05, + "step": 87 + }, + { + "epoch": 0.011, + "grad_norm": 3.4049394130706787, + "grad_norm_var": 1.0721571012465496, + "learning_rate": 8.800000000000001e-05, + "loss": 1.6353, + "loss/crossentropy": 1.9898579120635986, + "loss/hidden": 1.3828125, + "loss/logits": 0.25186440348625183, + "loss/reg": 6.222462252480909e-05, + "step": 88 + }, + { + "epoch": 0.011125, + "grad_norm": 1.885895013809204, + "grad_norm_var": 1.1148297312339375, + "learning_rate": 8.900000000000001e-05, + "loss": 1.0561, + "loss/crossentropy": 2.670912027359009, + "loss/hidden": 0.92578125, + "loss/logits": 0.12972213327884674, + "loss/reg": 6.221828516572714e-05, + "step": 89 + }, + { + "epoch": 0.01125, + "grad_norm": 1.886960506439209, + "grad_norm_var": 1.118003608268531, + "learning_rate": 9e-05, + "loss": 1.1335, + "loss/crossentropy": 2.5691866874694824, + "loss/hidden": 0.97265625, + "loss/logits": 0.16021151840686798, + "loss/reg": 6.221193325472996e-05, + "step": 90 + }, + { + "epoch": 0.011375, + "grad_norm": 3.117880344390869, + "grad_norm_var": 1.0979090394478965, + "learning_rate": 9.1e-05, + "loss": 1.3175, + "loss/crossentropy": 2.7383711338043213, + "loss/hidden": 1.140625, + "loss/logits": 0.1762513369321823, + "loss/reg": 6.220516661414877e-05, + "step": 91 + }, + { + "epoch": 0.0115, + "grad_norm": 2.5928220748901367, + "grad_norm_var": 1.0899203711980436, + "learning_rate": 9.200000000000001e-05, + "loss": 1.3898, + "loss/crossentropy": 2.255321741104126, + "loss/hidden": 1.171875, + "loss/logits": 0.21727776527404785, + "loss/reg": 6.2199542298913e-05, + "step": 92 + }, + { + "epoch": 0.011625, + "grad_norm": 2.5842387676239014, + "grad_norm_var": 1.09033696415262, + "learning_rate": 9.300000000000001e-05, + "loss": 1.3599, + "loss/crossentropy": 2.7780256271362305, + "loss/hidden": 1.15625, + "loss/logits": 0.203078031539917, + "loss/reg": 6.219152419362217e-05, + "step": 93 + }, + { + "epoch": 0.01175, + "grad_norm": 2.497912645339966, + "grad_norm_var": 1.032260222561935, + "learning_rate": 9.4e-05, + "loss": 1.2791, + "loss/crossentropy": 2.0482513904571533, + "loss/hidden": 1.109375, + "loss/logits": 0.16910339891910553, + "loss/reg": 6.218066846486181e-05, + "step": 94 + }, + { + "epoch": 0.011875, + "grad_norm": 2.1033713817596436, + "grad_norm_var": 1.0259829914817806, + "learning_rate": 9.5e-05, + "loss": 1.0875, + "loss/crossentropy": 2.427816152572632, + "loss/hidden": 0.94921875, + "loss/logits": 0.13770164549350739, + "loss/reg": 6.21745057287626e-05, + "step": 95 + }, + { + "epoch": 0.012, + "grad_norm": 2.063559055328369, + "grad_norm_var": 1.0544556100156115, + "learning_rate": 9.6e-05, + "loss": 1.217, + "loss/crossentropy": 2.498270034790039, + "loss/hidden": 1.046875, + "loss/logits": 0.16950619220733643, + "loss/reg": 6.216309702722356e-05, + "step": 96 + }, + { + "epoch": 0.012125, + "grad_norm": 2.3693654537200928, + "grad_norm_var": 1.036651450071012, + "learning_rate": 9.7e-05, + "loss": 1.2016, + "loss/crossentropy": 2.8368701934814453, + "loss/hidden": 1.0390625, + "loss/logits": 0.16189493238925934, + "loss/reg": 6.215785833774135e-05, + "step": 97 + }, + { + "epoch": 0.01225, + "grad_norm": 2.2980258464813232, + "grad_norm_var": 1.0488061784492646, + "learning_rate": 9.8e-05, + "loss": 1.5249, + "loss/crossentropy": 2.194488525390625, + "loss/hidden": 1.2421875, + "loss/logits": 0.2820858359336853, + "loss/reg": 6.215048051672056e-05, + "step": 98 + }, + { + "epoch": 0.012375, + "grad_norm": 3.147524833679199, + "grad_norm_var": 1.0277853179901806, + "learning_rate": 9.900000000000001e-05, + "loss": 1.7374, + "loss/crossentropy": 2.7856016159057617, + "loss/hidden": 1.4609375, + "loss/logits": 0.27581536769866943, + "loss/reg": 6.214459426701069e-05, + "step": 99 + }, + { + "epoch": 0.0125, + "grad_norm": 2.1317031383514404, + "grad_norm_var": 0.8636563030021608, + "learning_rate": 0.0001, + "loss": 1.3633, + "loss/crossentropy": 2.282402753829956, + "loss/hidden": 1.1484375, + "loss/logits": 0.2142634242773056, + "loss/reg": 6.213640881469473e-05, + "step": 100 + }, + { + "epoch": 0.012625, + "grad_norm": 2.2720911502838135, + "grad_norm_var": 0.8721171319962743, + "learning_rate": 0.0001, + "loss": 1.2405, + "loss/crossentropy": 2.8501064777374268, + "loss/hidden": 1.0625, + "loss/logits": 0.17741592228412628, + "loss/reg": 6.21288490947336e-05, + "step": 101 + }, + { + "epoch": 0.01275, + "grad_norm": 2.879110097885132, + "grad_norm_var": 0.8423375514351165, + "learning_rate": 0.0001, + "loss": 1.3486, + "loss/crossentropy": 2.4649596214294434, + "loss/hidden": 1.171875, + "loss/logits": 0.1761254221200943, + "loss/reg": 6.211963773239404e-05, + "step": 102 + }, + { + "epoch": 0.012875, + "grad_norm": 2.2214345932006836, + "grad_norm_var": 0.2123174305005847, + "learning_rate": 0.0001, + "loss": 1.1049, + "loss/crossentropy": 2.513540029525757, + "loss/hidden": 0.96484375, + "loss/logits": 0.13943374156951904, + "loss/reg": 6.21131548541598e-05, + "step": 103 + }, + { + "epoch": 0.013, + "grad_norm": 1.9674383401870728, + "grad_norm_var": 0.16151448650877043, + "learning_rate": 0.0001, + "loss": 1.2055, + "loss/crossentropy": 2.4960575103759766, + "loss/hidden": 1.03125, + "loss/logits": 0.17365112900733948, + "loss/reg": 6.210394349182025e-05, + "step": 104 + }, + { + "epoch": 0.013125, + "grad_norm": 2.152989387512207, + "grad_norm_var": 0.1485118756217919, + "learning_rate": 0.0001, + "loss": 1.3728, + "loss/crossentropy": 2.651463508605957, + "loss/hidden": 1.1796875, + "loss/logits": 0.1924474835395813, + "loss/reg": 6.209702405612916e-05, + "step": 105 + }, + { + "epoch": 0.01325, + "grad_norm": 2.591555118560791, + "grad_norm_var": 0.13200909593287988, + "learning_rate": 0.0001, + "loss": 1.5933, + "loss/crossentropy": 2.1848952770233154, + "loss/hidden": 1.375, + "loss/logits": 0.21770122647285461, + "loss/reg": 6.208720878930762e-05, + "step": 106 + }, + { + "epoch": 0.013375, + "grad_norm": 2.205780029296875, + "grad_norm_var": 0.10119294371901374, + "learning_rate": 0.0001, + "loss": 0.9785, + "loss/crossentropy": 2.4988999366760254, + "loss/hidden": 0.8671875, + "loss/logits": 0.1106652021408081, + "loss/reg": 6.207643309608102e-05, + "step": 107 + }, + { + "epoch": 0.0135, + "grad_norm": 2.427882671356201, + "grad_norm_var": 0.09821140867718908, + "learning_rate": 0.0001, + "loss": 1.2968, + "loss/crossentropy": 2.5072600841522217, + "loss/hidden": 1.09375, + "loss/logits": 0.20241403579711914, + "loss/reg": 6.206895341165364e-05, + "step": 108 + }, + { + "epoch": 0.013625, + "grad_norm": 2.4435040950775146, + "grad_norm_var": 0.09542213222792188, + "learning_rate": 0.0001, + "loss": 1.2803, + "loss/crossentropy": 2.2629339694976807, + "loss/hidden": 1.1015625, + "loss/logits": 0.17810457944869995, + "loss/reg": 6.205752288224176e-05, + "step": 109 + }, + { + "epoch": 0.01375, + "grad_norm": 2.9938735961914062, + "grad_norm_var": 0.11986086275213564, + "learning_rate": 0.0001, + "loss": 1.2708, + "loss/crossentropy": 2.5084388256073, + "loss/hidden": 1.09375, + "loss/logits": 0.1764756739139557, + "loss/reg": 6.204319652169943e-05, + "step": 110 + }, + { + "epoch": 0.013875, + "grad_norm": 2.499802827835083, + "grad_norm_var": 0.11443625726480532, + "learning_rate": 0.0001, + "loss": 1.3281, + "loss/crossentropy": 2.342087507247925, + "loss/hidden": 1.15625, + "loss/logits": 0.17120838165283203, + "loss/reg": 6.20328210061416e-05, + "step": 111 + }, + { + "epoch": 0.014, + "grad_norm": 3.28193736076355, + "grad_norm_var": 0.149862047644675, + "learning_rate": 0.0001, + "loss": 1.3891, + "loss/crossentropy": 2.396040916442871, + "loss/hidden": 1.1953125, + "loss/logits": 0.193180650472641, + "loss/reg": 6.202506483532488e-05, + "step": 112 + }, + { + "epoch": 0.014125, + "grad_norm": 2.2074780464172363, + "grad_norm_var": 0.15416329735346365, + "learning_rate": 0.0001, + "loss": 1.2137, + "loss/crossentropy": 2.501718759536743, + "loss/hidden": 1.0546875, + "loss/logits": 0.1583903729915619, + "loss/reg": 6.201667565619573e-05, + "step": 113 + }, + { + "epoch": 0.01425, + "grad_norm": 2.888498306274414, + "grad_norm_var": 0.1614203311265588, + "learning_rate": 0.0001, + "loss": 1.3498, + "loss/crossentropy": 3.097370147705078, + "loss/hidden": 1.15625, + "loss/logits": 0.19293376803398132, + "loss/reg": 6.200573989190161e-05, + "step": 114 + }, + { + "epoch": 0.014375, + "grad_norm": 2.385442018508911, + "grad_norm_var": 0.1339080451651928, + "learning_rate": 0.0001, + "loss": 1.3415, + "loss/crossentropy": 2.4950473308563232, + "loss/hidden": 1.15625, + "loss/logits": 0.18464481830596924, + "loss/reg": 6.199457857292145e-05, + "step": 115 + }, + { + "epoch": 0.0145, + "grad_norm": 3.3269190788269043, + "grad_norm_var": 0.16897616880053803, + "learning_rate": 0.0001, + "loss": 1.6405, + "loss/crossentropy": 2.19484806060791, + "loss/hidden": 1.3828125, + "loss/logits": 0.2570968270301819, + "loss/reg": 6.198590563144535e-05, + "step": 116 + }, + { + "epoch": 0.014625, + "grad_norm": 2.2415361404418945, + "grad_norm_var": 0.17015290356553733, + "learning_rate": 0.0001, + "loss": 1.2381, + "loss/crossentropy": 2.540816068649292, + "loss/hidden": 1.0625, + "loss/logits": 0.1749531626701355, + "loss/reg": 6.197726906975731e-05, + "step": 117 + }, + { + "epoch": 0.01475, + "grad_norm": 2.397615671157837, + "grad_norm_var": 0.1631737555736056, + "learning_rate": 0.0001, + "loss": 1.2192, + "loss/crossentropy": 2.6213266849517822, + "loss/hidden": 1.0546875, + "loss/logits": 0.16386428475379944, + "loss/reg": 6.197066250024363e-05, + "step": 118 + }, + { + "epoch": 0.014875, + "grad_norm": 2.75325345993042, + "grad_norm_var": 0.16006220619054398, + "learning_rate": 0.0001, + "loss": 1.5693, + "loss/crossentropy": 2.3850035667419434, + "loss/hidden": 1.34375, + "loss/logits": 0.2249460369348526, + "loss/reg": 6.196285539772362e-05, + "step": 119 + }, + { + "epoch": 0.015, + "grad_norm": 2.675480842590332, + "grad_norm_var": 0.13660137165245084, + "learning_rate": 0.0001, + "loss": 1.299, + "loss/crossentropy": 2.380896806716919, + "loss/hidden": 1.125, + "loss/logits": 0.17339974641799927, + "loss/reg": 6.195474998094141e-05, + "step": 120 + }, + { + "epoch": 0.015125, + "grad_norm": 2.611541509628296, + "grad_norm_var": 0.12289609882195597, + "learning_rate": 0.0001, + "loss": 1.2924, + "loss/crossentropy": 2.7064404487609863, + "loss/hidden": 1.109375, + "loss/logits": 0.18236055970191956, + "loss/reg": 6.194705929374322e-05, + "step": 121 + }, + { + "epoch": 0.01525, + "grad_norm": 2.3449323177337646, + "grad_norm_var": 0.12765774775469155, + "learning_rate": 0.0001, + "loss": 1.2957, + "loss/crossentropy": 2.5846447944641113, + "loss/hidden": 1.1171875, + "loss/logits": 0.17786133289337158, + "loss/reg": 6.193818262545392e-05, + "step": 122 + }, + { + "epoch": 0.015375, + "grad_norm": 2.1001734733581543, + "grad_norm_var": 0.13398098136615483, + "learning_rate": 0.0001, + "loss": 1.1704, + "loss/crossentropy": 2.504185676574707, + "loss/hidden": 1.015625, + "loss/logits": 0.15416675806045532, + "loss/reg": 6.192670116433874e-05, + "step": 123 + }, + { + "epoch": 0.0155, + "grad_norm": 2.365839719772339, + "grad_norm_var": 0.13563497966163046, + "learning_rate": 0.0001, + "loss": 1.3773, + "loss/crossentropy": 2.3259832859039307, + "loss/hidden": 1.171875, + "loss/logits": 0.20480972528457642, + "loss/reg": 6.19165730313398e-05, + "step": 124 + }, + { + "epoch": 0.015625, + "grad_norm": 2.1480026245117188, + "grad_norm_var": 0.1470561705316013, + "learning_rate": 0.0001, + "loss": 1.2768, + "loss/crossentropy": 2.288093090057373, + "loss/hidden": 1.109375, + "loss/logits": 0.16683252155780792, + "loss/reg": 6.19063139311038e-05, + "step": 125 + }, + { + "epoch": 0.01575, + "grad_norm": 2.2346343994140625, + "grad_norm_var": 0.14082182611320845, + "learning_rate": 0.0001, + "loss": 1.1441, + "loss/crossentropy": 2.6062135696411133, + "loss/hidden": 1.0, + "loss/logits": 0.14351129531860352, + "loss/reg": 6.189729174366221e-05, + "step": 126 + }, + { + "epoch": 0.015875, + "grad_norm": 3.187627077102661, + "grad_norm_var": 0.16771827237098264, + "learning_rate": 0.0001, + "loss": 1.4505, + "loss/crossentropy": 2.3607077598571777, + "loss/hidden": 1.2265625, + "loss/logits": 0.22327345609664917, + "loss/reg": 6.189044506754726e-05, + "step": 127 + }, + { + "epoch": 0.016, + "grad_norm": 2.1208789348602295, + "grad_norm_var": 0.1420574537193353, + "learning_rate": 0.0001, + "loss": 1.1414, + "loss/crossentropy": 2.408287286758423, + "loss/hidden": 1.0, + "loss/logits": 0.14076298475265503, + "loss/reg": 6.188445695443079e-05, + "step": 128 + }, + { + "epoch": 0.016125, + "grad_norm": 2.4475457668304443, + "grad_norm_var": 0.13631644029428572, + "learning_rate": 0.0001, + "loss": 1.2863, + "loss/crossentropy": 2.4705042839050293, + "loss/hidden": 1.1171875, + "loss/logits": 0.16846278309822083, + "loss/reg": 6.187462713569403e-05, + "step": 129 + }, + { + "epoch": 0.01625, + "grad_norm": 2.3132476806640625, + "grad_norm_var": 0.128302854564951, + "learning_rate": 0.0001, + "loss": 1.2265, + "loss/crossentropy": 2.323221445083618, + "loss/hidden": 1.0625, + "loss/logits": 0.16340406239032745, + "loss/reg": 6.18634803686291e-05, + "step": 130 + }, + { + "epoch": 0.016375, + "grad_norm": 2.6015546321868896, + "grad_norm_var": 0.12854282273958592, + "learning_rate": 0.0001, + "loss": 1.0946, + "loss/crossentropy": 2.554730176925659, + "loss/hidden": 0.9609375, + "loss/logits": 0.13307343423366547, + "loss/reg": 6.185180245665833e-05, + "step": 131 + }, + { + "epoch": 0.0165, + "grad_norm": 2.040545701980591, + "grad_norm_var": 0.08874970269449302, + "learning_rate": 0.0001, + "loss": 1.1715, + "loss/crossentropy": 2.6177141666412354, + "loss/hidden": 1.0078125, + "loss/logits": 0.163020521402359, + "loss/reg": 6.184292578836903e-05, + "step": 132 + }, + { + "epoch": 0.016625, + "grad_norm": 2.4451427459716797, + "grad_norm_var": 0.08672588329890019, + "learning_rate": 0.0001, + "loss": 1.2794, + "loss/crossentropy": 2.6671459674835205, + "loss/hidden": 1.109375, + "loss/logits": 0.16941678524017334, + "loss/reg": 6.18349076830782e-05, + "step": 133 + }, + { + "epoch": 0.01675, + "grad_norm": 2.5730879306793213, + "grad_norm_var": 0.08802712142174655, + "learning_rate": 0.0001, + "loss": 1.356, + "loss/crossentropy": 2.483858585357666, + "loss/hidden": 1.171875, + "loss/logits": 0.1835438758134842, + "loss/reg": 6.182605284266174e-05, + "step": 134 + }, + { + "epoch": 0.016875, + "grad_norm": 2.996643543243408, + "grad_norm_var": 0.10205043083370029, + "learning_rate": 0.0001, + "loss": 1.5067, + "loss/crossentropy": 2.267930507659912, + "loss/hidden": 1.3046875, + "loss/logits": 0.20140591263771057, + "loss/reg": 6.181577919051051e-05, + "step": 135 + }, + { + "epoch": 0.017, + "grad_norm": 2.2333881855010986, + "grad_norm_var": 0.10100001995976887, + "learning_rate": 0.0001, + "loss": 1.23, + "loss/crossentropy": 2.552584648132324, + "loss/hidden": 1.0546875, + "loss/logits": 0.17466390132904053, + "loss/reg": 6.180404307087883e-05, + "step": 136 + }, + { + "epoch": 0.017125, + "grad_norm": 2.476086378097534, + "grad_norm_var": 0.09873795942098601, + "learning_rate": 0.0001, + "loss": 1.2347, + "loss/crossentropy": 2.2955551147460938, + "loss/hidden": 1.09375, + "loss/logits": 0.1402929574251175, + "loss/reg": 6.179526099003851e-05, + "step": 137 + }, + { + "epoch": 0.01725, + "grad_norm": 2.9701859951019287, + "grad_norm_var": 0.11738609069977789, + "learning_rate": 0.0001, + "loss": 1.1041, + "loss/crossentropy": 2.4560158252716064, + "loss/hidden": 0.97265625, + "loss/logits": 0.1307787150144577, + "loss/reg": 6.178120383992791e-05, + "step": 138 + }, + { + "epoch": 0.017375, + "grad_norm": 2.151567220687866, + "grad_norm_var": 0.11513060923898569, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.6192235946655273, + "loss/hidden": 0.98828125, + "loss/logits": 0.15172292292118073, + "loss/reg": 6.176753231557086e-05, + "step": 139 + }, + { + "epoch": 0.0175, + "grad_norm": 2.0209085941314697, + "grad_norm_var": 0.1267419293205286, + "learning_rate": 0.0001, + "loss": 1.0928, + "loss/crossentropy": 2.6628799438476562, + "loss/hidden": 0.94921875, + "loss/logits": 0.14296585321426392, + "loss/reg": 6.175567978061736e-05, + "step": 140 + }, + { + "epoch": 0.017625, + "grad_norm": 3.458299398422241, + "grad_norm_var": 0.18389511336323494, + "learning_rate": 0.0001, + "loss": 1.3966, + "loss/crossentropy": 2.885798692703247, + "loss/hidden": 1.171875, + "loss/logits": 0.22411209344863892, + "loss/reg": 6.174653390189633e-05, + "step": 141 + }, + { + "epoch": 0.01775, + "grad_norm": 2.608558177947998, + "grad_norm_var": 0.17855808227350187, + "learning_rate": 0.0001, + "loss": 1.1734, + "loss/crossentropy": 2.2689590454101562, + "loss/hidden": 1.0234375, + "loss/logits": 0.1493585705757141, + "loss/reg": 6.1732207541354e-05, + "step": 142 + }, + { + "epoch": 0.017875, + "grad_norm": 2.7264318466186523, + "grad_norm_var": 0.1520478077633771, + "learning_rate": 0.0001, + "loss": 1.2868, + "loss/crossentropy": 2.3888814449310303, + "loss/hidden": 1.1171875, + "loss/logits": 0.16896918416023254, + "loss/reg": 6.172260327730328e-05, + "step": 143 + }, + { + "epoch": 0.018, + "grad_norm": 2.4999561309814453, + "grad_norm_var": 0.14128539295791806, + "learning_rate": 0.0001, + "loss": 1.3804, + "loss/crossentropy": 2.442732572555542, + "loss/hidden": 1.1875, + "loss/logits": 0.19230639934539795, + "loss/reg": 6.171311542857438e-05, + "step": 144 + }, + { + "epoch": 0.018125, + "grad_norm": 3.084848642349243, + "grad_norm_var": 0.1592220375940921, + "learning_rate": 0.0001, + "loss": 1.5124, + "loss/crossentropy": 2.6801810264587402, + "loss/hidden": 1.2421875, + "loss/logits": 0.2696050703525543, + "loss/reg": 6.170615233713761e-05, + "step": 145 + }, + { + "epoch": 0.01825, + "grad_norm": 3.0833539962768555, + "grad_norm_var": 0.16940866671487811, + "learning_rate": 0.0001, + "loss": 1.294, + "loss/crossentropy": 2.434020519256592, + "loss/hidden": 1.140625, + "loss/logits": 0.15272179245948792, + "loss/reg": 6.170049164211378e-05, + "step": 146 + }, + { + "epoch": 0.018375, + "grad_norm": 2.2046446800231934, + "grad_norm_var": 0.18039814292173043, + "learning_rate": 0.0001, + "loss": 1.1769, + "loss/crossentropy": 2.5624289512634277, + "loss/hidden": 1.015625, + "loss/logits": 0.160653755068779, + "loss/reg": 6.169131665956229e-05, + "step": 147 + }, + { + "epoch": 0.0185, + "grad_norm": 1.9920902252197266, + "grad_norm_var": 0.18414873169562326, + "learning_rate": 0.0001, + "loss": 1.1186, + "loss/crossentropy": 2.709728479385376, + "loss/hidden": 0.96875, + "loss/logits": 0.1492651402950287, + "loss/reg": 6.168704567244276e-05, + "step": 148 + }, + { + "epoch": 0.018625, + "grad_norm": 2.7053756713867188, + "grad_norm_var": 0.18317033653553666, + "learning_rate": 0.0001, + "loss": 1.2849, + "loss/crossentropy": 2.594032049179077, + "loss/hidden": 1.09375, + "loss/logits": 0.1905450075864792, + "loss/reg": 6.168089748825878e-05, + "step": 149 + }, + { + "epoch": 0.01875, + "grad_norm": 2.1234872341156006, + "grad_norm_var": 0.1981121598309187, + "learning_rate": 0.0001, + "loss": 1.2526, + "loss/crossentropy": 2.5880792140960693, + "loss/hidden": 1.0703125, + "loss/logits": 0.18171370029449463, + "loss/reg": 6.167205719975755e-05, + "step": 150 + }, + { + "epoch": 0.018875, + "grad_norm": 2.4820902347564697, + "grad_norm_var": 0.18631464898325945, + "learning_rate": 0.0001, + "loss": 1.1869, + "loss/crossentropy": 2.2422618865966797, + "loss/hidden": 1.0234375, + "loss/logits": 0.16288068890571594, + "loss/reg": 6.166584353195503e-05, + "step": 151 + }, + { + "epoch": 0.019, + "grad_norm": 2.5669338703155518, + "grad_norm_var": 0.17912821539433874, + "learning_rate": 0.0001, + "loss": 1.0968, + "loss/crossentropy": 2.5655312538146973, + "loss/hidden": 0.953125, + "loss/logits": 0.1430792212486267, + "loss/reg": 6.165904778754339e-05, + "step": 152 + }, + { + "epoch": 0.019125, + "grad_norm": 2.191638469696045, + "grad_norm_var": 0.18782946638749062, + "learning_rate": 0.0001, + "loss": 1.297, + "loss/crossentropy": 2.3935883045196533, + "loss/hidden": 1.109375, + "loss/logits": 0.18698745965957642, + "loss/reg": 6.165434024296701e-05, + "step": 153 + }, + { + "epoch": 0.01925, + "grad_norm": 1.9139376878738403, + "grad_norm_var": 0.19900155234911943, + "learning_rate": 0.0001, + "loss": 1.1497, + "loss/crossentropy": 2.5978732109069824, + "loss/hidden": 0.99609375, + "loss/logits": 0.1530168354511261, + "loss/reg": 6.164138176245615e-05, + "step": 154 + }, + { + "epoch": 0.019375, + "grad_norm": 2.061805486679077, + "grad_norm_var": 0.20353621009625153, + "learning_rate": 0.0001, + "loss": 1.034, + "loss/crossentropy": 2.29733943939209, + "loss/hidden": 0.91015625, + "loss/logits": 0.12318030744791031, + "loss/reg": 6.162770296214148e-05, + "step": 155 + }, + { + "epoch": 0.0195, + "grad_norm": 2.686328649520874, + "grad_norm_var": 0.19023239802865194, + "learning_rate": 0.0001, + "loss": 1.4235, + "loss/crossentropy": 2.2928433418273926, + "loss/hidden": 1.2265625, + "loss/logits": 0.19631928205490112, + "loss/reg": 6.16170436842367e-05, + "step": 156 + }, + { + "epoch": 0.019625, + "grad_norm": 2.6863300800323486, + "grad_norm_var": 0.13134889378527811, + "learning_rate": 0.0001, + "loss": 1.4147, + "loss/crossentropy": 2.289113759994507, + "loss/hidden": 1.21875, + "loss/logits": 0.19536322355270386, + "loss/reg": 6.160605698823929e-05, + "step": 157 + }, + { + "epoch": 0.01975, + "grad_norm": 3.7774782180786133, + "grad_norm_var": 0.2373896188726722, + "learning_rate": 0.0001, + "loss": 1.3606, + "loss/crossentropy": 2.4960098266601562, + "loss/hidden": 1.171875, + "loss/logits": 0.18812544643878937, + "loss/reg": 6.159812619443983e-05, + "step": 158 + }, + { + "epoch": 0.019875, + "grad_norm": 2.5556654930114746, + "grad_norm_var": 0.23517615853210802, + "learning_rate": 0.0001, + "loss": 1.1015, + "loss/crossentropy": 2.4794013500213623, + "loss/hidden": 0.9609375, + "loss/logits": 0.1399209052324295, + "loss/reg": 6.158895121188834e-05, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 2.3351266384124756, + "grad_norm_var": 0.23772124659223212, + "learning_rate": 0.0001, + "loss": 1.1072, + "loss/crossentropy": 2.402188301086426, + "loss/hidden": 0.96484375, + "loss/logits": 0.14173097908496857, + "loss/reg": 6.158249016152695e-05, + "step": 160 + }, + { + "epoch": 0.020125, + "grad_norm": 2.319366455078125, + "grad_norm_var": 0.21752957054554395, + "learning_rate": 0.0001, + "loss": 1.1774, + "loss/crossentropy": 2.1729917526245117, + "loss/hidden": 1.0234375, + "loss/logits": 0.15335121750831604, + "loss/reg": 6.157202733447775e-05, + "step": 161 + }, + { + "epoch": 0.02025, + "grad_norm": 2.0917341709136963, + "grad_norm_var": 0.19926011430610652, + "learning_rate": 0.0001, + "loss": 1.2443, + "loss/crossentropy": 2.276581048965454, + "loss/hidden": 1.0859375, + "loss/logits": 0.1577274203300476, + "loss/reg": 6.156737799756229e-05, + "step": 162 + }, + { + "epoch": 0.020375, + "grad_norm": 4.31035041809082, + "grad_norm_var": 0.41637723338655513, + "learning_rate": 0.0001, + "loss": 1.8974, + "loss/crossentropy": 2.6449058055877686, + "loss/hidden": 1.5625, + "loss/logits": 0.33430173993110657, + "loss/reg": 6.156211748020723e-05, + "step": 163 + }, + { + "epoch": 0.0205, + "grad_norm": 2.145301342010498, + "grad_norm_var": 0.4064476055559296, + "learning_rate": 0.0001, + "loss": 1.2636, + "loss/crossentropy": 2.613586664199829, + "loss/hidden": 1.078125, + "loss/logits": 0.1848127692937851, + "loss/reg": 6.155785376904532e-05, + "step": 164 + }, + { + "epoch": 0.020625, + "grad_norm": 3.6308248043060303, + "grad_norm_var": 0.47796885273955964, + "learning_rate": 0.0001, + "loss": 1.2327, + "loss/crossentropy": 2.599729537963867, + "loss/hidden": 1.046875, + "loss/logits": 0.1852511763572693, + "loss/reg": 6.154972652439028e-05, + "step": 165 + }, + { + "epoch": 0.02075, + "grad_norm": 2.812910556793213, + "grad_norm_var": 0.4622733920417279, + "learning_rate": 0.0001, + "loss": 1.3898, + "loss/crossentropy": 2.7171225547790527, + "loss/hidden": 1.1875, + "loss/logits": 0.20167264342308044, + "loss/reg": 6.154461152618751e-05, + "step": 166 + }, + { + "epoch": 0.020875, + "grad_norm": 2.4922893047332764, + "grad_norm_var": 0.46203729327833537, + "learning_rate": 0.0001, + "loss": 1.3528, + "loss/crossentropy": 2.648606777191162, + "loss/hidden": 1.140625, + "loss/logits": 0.21159711480140686, + "loss/reg": 6.153558933874592e-05, + "step": 167 + }, + { + "epoch": 0.021, + "grad_norm": 2.2380781173706055, + "grad_norm_var": 0.47292652355391496, + "learning_rate": 0.0001, + "loss": 1.3863, + "loss/crossentropy": 2.5556812286376953, + "loss/hidden": 1.1796875, + "loss/logits": 0.20603393018245697, + "loss/reg": 6.152570131234825e-05, + "step": 168 + }, + { + "epoch": 0.021125, + "grad_norm": 2.8179726600646973, + "grad_norm_var": 0.4599538691877346, + "learning_rate": 0.0001, + "loss": 1.3315, + "loss/crossentropy": 2.285341262817383, + "loss/hidden": 1.140625, + "loss/logits": 0.19030849635601044, + "loss/reg": 6.151832349132746e-05, + "step": 169 + }, + { + "epoch": 0.02125, + "grad_norm": 2.933023691177368, + "grad_norm_var": 0.42080948451517297, + "learning_rate": 0.0001, + "loss": 1.5924, + "loss/crossentropy": 2.254920482635498, + "loss/hidden": 1.3828125, + "loss/logits": 0.20900759100914001, + "loss/reg": 6.151078559923917e-05, + "step": 170 + }, + { + "epoch": 0.021375, + "grad_norm": 2.9309163093566895, + "grad_norm_var": 0.38903358238886365, + "learning_rate": 0.0001, + "loss": 1.2104, + "loss/crossentropy": 2.771516799926758, + "loss/hidden": 1.0546875, + "loss/logits": 0.15512725710868835, + "loss/reg": 6.14999225945212e-05, + "step": 171 + }, + { + "epoch": 0.0215, + "grad_norm": 2.7658286094665527, + "grad_norm_var": 0.3882477326935183, + "learning_rate": 0.0001, + "loss": 1.2183, + "loss/crossentropy": 2.565211296081543, + "loss/hidden": 1.0546875, + "loss/logits": 0.16297924518585205, + "loss/reg": 6.149257387733087e-05, + "step": 172 + }, + { + "epoch": 0.021625, + "grad_norm": 3.39176344871521, + "grad_norm_var": 0.40840451933244426, + "learning_rate": 0.0001, + "loss": 1.3931, + "loss/crossentropy": 2.4181013107299805, + "loss/hidden": 1.1875, + "loss/logits": 0.2049458771944046, + "loss/reg": 6.148203829070553e-05, + "step": 173 + }, + { + "epoch": 0.02175, + "grad_norm": 2.7971994876861572, + "grad_norm_var": 0.3468190736041642, + "learning_rate": 0.0001, + "loss": 1.2467, + "loss/crossentropy": 2.644824981689453, + "loss/hidden": 1.0703125, + "loss/logits": 0.17579111456871033, + "loss/reg": 6.147275416878983e-05, + "step": 174 + }, + { + "epoch": 0.021875, + "grad_norm": 7.143955707550049, + "grad_norm_var": 1.5219747541806836, + "learning_rate": 0.0001, + "loss": 1.3279, + "loss/crossentropy": 2.6274638175964355, + "loss/hidden": 1.171875, + "loss/logits": 0.15536972880363464, + "loss/reg": 6.146173836896196e-05, + "step": 175 + }, + { + "epoch": 0.022, + "grad_norm": 8.911324501037598, + "grad_norm_var": 3.578509022301667, + "learning_rate": 0.0001, + "loss": 1.8863, + "loss/crossentropy": 1.8980119228363037, + "loss/hidden": 1.765625, + "loss/logits": 0.12003660202026367, + "loss/reg": 6.145203224150464e-05, + "step": 176 + }, + { + "epoch": 0.022125, + "grad_norm": 2.14353609085083, + "grad_norm_var": 3.6077286646662734, + "learning_rate": 0.0001, + "loss": 1.1573, + "loss/crossentropy": 2.1538591384887695, + "loss/hidden": 1.015625, + "loss/logits": 0.1410439908504486, + "loss/reg": 6.144325743662193e-05, + "step": 177 + }, + { + "epoch": 0.02225, + "grad_norm": 4.625613212585449, + "grad_norm_var": 3.542583274880191, + "learning_rate": 0.0001, + "loss": 1.6226, + "loss/crossentropy": 2.7923362255096436, + "loss/hidden": 1.375, + "loss/logits": 0.24694563448429108, + "loss/reg": 6.143252539914101e-05, + "step": 178 + }, + { + "epoch": 0.022375, + "grad_norm": 2.543745517730713, + "grad_norm_var": 3.5775446556342367, + "learning_rate": 0.0001, + "loss": 1.4192, + "loss/crossentropy": 2.3237483501434326, + "loss/hidden": 1.203125, + "loss/logits": 0.21549411118030548, + "loss/reg": 6.14215387031436e-05, + "step": 179 + }, + { + "epoch": 0.0225, + "grad_norm": 2.3068995475769043, + "grad_norm_var": 3.5495511663474453, + "learning_rate": 0.0001, + "loss": 1.2428, + "loss/crossentropy": 2.7135560512542725, + "loss/hidden": 1.0859375, + "loss/logits": 0.1562565714120865, + "loss/reg": 6.141421181382611e-05, + "step": 180 + }, + { + "epoch": 0.022625, + "grad_norm": 3.465264081954956, + "grad_norm_var": 3.5490467443763025, + "learning_rate": 0.0001, + "loss": 1.4771, + "loss/crossentropy": 3.3183774948120117, + "loss/hidden": 1.234375, + "loss/logits": 0.2421126663684845, + "loss/reg": 6.140418554423377e-05, + "step": 181 + }, + { + "epoch": 0.02275, + "grad_norm": 2.696394205093384, + "grad_norm_var": 3.5608805573030993, + "learning_rate": 0.0001, + "loss": 1.2269, + "loss/crossentropy": 2.609964370727539, + "loss/hidden": 1.0546875, + "loss/logits": 0.17162814736366272, + "loss/reg": 6.139430479379371e-05, + "step": 182 + }, + { + "epoch": 0.022875, + "grad_norm": 2.3278727531433105, + "grad_norm_var": 3.5849405900569513, + "learning_rate": 0.0001, + "loss": 1.0795, + "loss/crossentropy": 2.753383159637451, + "loss/hidden": 0.9453125, + "loss/logits": 0.1335984170436859, + "loss/reg": 6.138216122053564e-05, + "step": 183 + }, + { + "epoch": 0.023, + "grad_norm": 2.4336531162261963, + "grad_norm_var": 3.554360278579671, + "learning_rate": 0.0001, + "loss": 1.3948, + "loss/crossentropy": 2.4162991046905518, + "loss/hidden": 1.171875, + "loss/logits": 0.22235547006130219, + "loss/reg": 6.137174204923213e-05, + "step": 184 + }, + { + "epoch": 0.023125, + "grad_norm": 2.420710802078247, + "grad_norm_var": 3.601127481620784, + "learning_rate": 0.0001, + "loss": 1.4926, + "loss/crossentropy": 2.30292010307312, + "loss/hidden": 1.296875, + "loss/logits": 0.19511133432388306, + "loss/reg": 6.136245065135881e-05, + "step": 185 + }, + { + "epoch": 0.02325, + "grad_norm": 2.727184534072876, + "grad_norm_var": 3.6190579859970224, + "learning_rate": 0.0001, + "loss": 1.2816, + "loss/crossentropy": 2.4605464935302734, + "loss/hidden": 1.0703125, + "loss/logits": 0.2107134908437729, + "loss/reg": 6.135714647825807e-05, + "step": 186 + }, + { + "epoch": 0.023375, + "grad_norm": 1.9292963743209839, + "grad_norm_var": 3.754688597499932, + "learning_rate": 0.0001, + "loss": 1.1628, + "loss/crossentropy": 2.5925047397613525, + "loss/hidden": 1.0, + "loss/logits": 0.16220712661743164, + "loss/reg": 6.134893919806927e-05, + "step": 187 + }, + { + "epoch": 0.0235, + "grad_norm": 2.1395771503448486, + "grad_norm_var": 3.833355540800866, + "learning_rate": 0.0001, + "loss": 1.2712, + "loss/crossentropy": 2.227994441986084, + "loss/hidden": 1.0859375, + "loss/logits": 0.18463259935379028, + "loss/reg": 6.134230352472514e-05, + "step": 188 + }, + { + "epoch": 0.023625, + "grad_norm": 3.552602529525757, + "grad_norm_var": 3.8353265135005175, + "learning_rate": 0.0001, + "loss": 1.2518, + "loss/crossentropy": 2.562777280807495, + "loss/hidden": 1.0859375, + "loss/logits": 0.16521015763282776, + "loss/reg": 6.13337178947404e-05, + "step": 189 + }, + { + "epoch": 0.02375, + "grad_norm": 2.766602039337158, + "grad_norm_var": 3.8377842837978386, + "learning_rate": 0.0001, + "loss": 1.3731, + "loss/crossentropy": 2.4200425148010254, + "loss/hidden": 1.203125, + "loss/logits": 0.1694013774394989, + "loss/reg": 6.132431008154526e-05, + "step": 190 + }, + { + "epoch": 0.023875, + "grad_norm": 2.403444528579712, + "grad_norm_var": 2.8653780273055327, + "learning_rate": 0.0001, + "loss": 1.1651, + "loss/crossentropy": 2.6963400840759277, + "loss/hidden": 1.0078125, + "loss/logits": 0.1566968709230423, + "loss/reg": 6.132054841145873e-05, + "step": 191 + }, + { + "epoch": 0.024, + "grad_norm": 2.0356028079986572, + "grad_norm_var": 0.4806738598539164, + "learning_rate": 0.0001, + "loss": 1.4298, + "loss/crossentropy": 2.174285650253296, + "loss/hidden": 1.21875, + "loss/logits": 0.21048110723495483, + "loss/reg": 6.13146330579184e-05, + "step": 192 + }, + { + "epoch": 0.024125, + "grad_norm": 2.501723051071167, + "grad_norm_var": 0.4641524277019669, + "learning_rate": 0.0001, + "loss": 1.2669, + "loss/crossentropy": 2.6477620601654053, + "loss/hidden": 1.09375, + "loss/logits": 0.17256709933280945, + "loss/reg": 6.130609108367935e-05, + "step": 193 + }, + { + "epoch": 0.02425, + "grad_norm": 2.8256325721740723, + "grad_norm_var": 0.19964871735684203, + "learning_rate": 0.0001, + "loss": 1.364, + "loss/crossentropy": 2.4205310344696045, + "loss/hidden": 1.1875, + "loss/logits": 0.17588719725608826, + "loss/reg": 6.129377288743854e-05, + "step": 194 + }, + { + "epoch": 0.024375, + "grad_norm": 3.715850353240967, + "grad_norm_var": 0.28183777248683595, + "learning_rate": 0.0001, + "loss": 1.4108, + "loss/crossentropy": 2.5872642993927, + "loss/hidden": 1.234375, + "loss/logits": 0.1758473813533783, + "loss/reg": 6.128078530309722e-05, + "step": 195 + }, + { + "epoch": 0.0245, + "grad_norm": 3.3498318195343018, + "grad_norm_var": 0.3034271167360647, + "learning_rate": 0.0001, + "loss": 1.3691, + "loss/crossentropy": 2.6444506645202637, + "loss/hidden": 1.171875, + "loss/logits": 0.19665929675102234, + "loss/reg": 6.126934749772772e-05, + "step": 196 + }, + { + "epoch": 0.024625, + "grad_norm": 2.0526957511901855, + "grad_norm_var": 0.2850787945150557, + "learning_rate": 0.0001, + "loss": 1.2051, + "loss/crossentropy": 2.592327117919922, + "loss/hidden": 1.0390625, + "loss/logits": 0.16540399193763733, + "loss/reg": 6.125810614321381e-05, + "step": 197 + }, + { + "epoch": 0.02475, + "grad_norm": 2.4300317764282227, + "grad_norm_var": 0.28670823409057716, + "learning_rate": 0.0001, + "loss": 1.5286, + "loss/crossentropy": 2.36305570602417, + "loss/hidden": 1.2890625, + "loss/logits": 0.2389371693134308, + "loss/reg": 6.124811625340953e-05, + "step": 198 + }, + { + "epoch": 0.024875, + "grad_norm": 2.3255856037139893, + "grad_norm_var": 0.28679178178242776, + "learning_rate": 0.0001, + "loss": 1.1743, + "loss/crossentropy": 2.0803394317626953, + "loss/hidden": 1.03125, + "loss/logits": 0.1424179971218109, + "loss/reg": 6.124229548731819e-05, + "step": 199 + }, + { + "epoch": 0.025, + "grad_norm": 2.2634005546569824, + "grad_norm_var": 0.2923937566916393, + "learning_rate": 0.0001, + "loss": 1.2619, + "loss/crossentropy": 2.427354574203491, + "loss/hidden": 1.0859375, + "loss/logits": 0.1753256618976593, + "loss/reg": 6.123317871242762e-05, + "step": 200 + }, + { + "epoch": 0.025125, + "grad_norm": 2.789698839187622, + "grad_norm_var": 0.292575209213462, + "learning_rate": 0.0001, + "loss": 1.2794, + "loss/crossentropy": 2.4137160778045654, + "loss/hidden": 1.1328125, + "loss/logits": 0.14599566161632538, + "loss/reg": 6.122920603957027e-05, + "step": 201 + }, + { + "epoch": 0.02525, + "grad_norm": 2.23150897026062, + "grad_norm_var": 0.3003877767651639, + "learning_rate": 0.0001, + "loss": 1.2906, + "loss/crossentropy": 2.502619743347168, + "loss/hidden": 1.09375, + "loss/logits": 0.19620737433433533, + "loss/reg": 6.122409831732512e-05, + "step": 202 + }, + { + "epoch": 0.025375, + "grad_norm": 3.3167238235473633, + "grad_norm_var": 0.2999410613935005, + "learning_rate": 0.0001, + "loss": 1.4511, + "loss/crossentropy": 2.5889461040496826, + "loss/hidden": 1.2265625, + "loss/logits": 0.2239363044500351, + "loss/reg": 6.122187187429518e-05, + "step": 203 + }, + { + "epoch": 0.0255, + "grad_norm": 2.5847971439361572, + "grad_norm_var": 0.28091485279191464, + "learning_rate": 0.0001, + "loss": 1.248, + "loss/crossentropy": 2.4720451831817627, + "loss/hidden": 1.078125, + "loss/logits": 0.16930653154850006, + "loss/reg": 6.120974285295233e-05, + "step": 204 + }, + { + "epoch": 0.025625, + "grad_norm": 2.071563243865967, + "grad_norm_var": 0.24897236933793085, + "learning_rate": 0.0001, + "loss": 1.1016, + "loss/crossentropy": 2.5648884773254395, + "loss/hidden": 0.96875, + "loss/logits": 0.13218875229358673, + "loss/reg": 6.120166654000059e-05, + "step": 205 + }, + { + "epoch": 0.02575, + "grad_norm": 2.9454479217529297, + "grad_norm_var": 0.2548478796483238, + "learning_rate": 0.0001, + "loss": 1.3574, + "loss/crossentropy": 2.607356309890747, + "loss/hidden": 1.15625, + "loss/logits": 0.20053817331790924, + "loss/reg": 6.119644967839122e-05, + "step": 206 + }, + { + "epoch": 0.025875, + "grad_norm": 3.396070718765259, + "grad_norm_var": 0.28840087929906133, + "learning_rate": 0.0001, + "loss": 1.1743, + "loss/crossentropy": 2.682058334350586, + "loss/hidden": 1.0078125, + "loss/logits": 0.16590501368045807, + "loss/reg": 6.11838695476763e-05, + "step": 207 + }, + { + "epoch": 0.026, + "grad_norm": 2.4477601051330566, + "grad_norm_var": 0.26375613878289506, + "learning_rate": 0.0001, + "loss": 1.3022, + "loss/crossentropy": 2.819031000137329, + "loss/hidden": 1.109375, + "loss/logits": 0.19222432374954224, + "loss/reg": 6.117635348346084e-05, + "step": 208 + }, + { + "epoch": 0.026125, + "grad_norm": 2.5916216373443604, + "grad_norm_var": 0.2618484053528464, + "learning_rate": 0.0001, + "loss": 1.353, + "loss/crossentropy": 2.529510259628296, + "loss/hidden": 1.15625, + "loss/logits": 0.19612029194831848, + "loss/reg": 6.116151052992791e-05, + "step": 209 + }, + { + "epoch": 0.02625, + "grad_norm": 2.108261823654175, + "grad_norm_var": 0.28282181699858694, + "learning_rate": 0.0001, + "loss": 1.2782, + "loss/crossentropy": 2.3222012519836426, + "loss/hidden": 1.09375, + "loss/logits": 0.18379396200180054, + "loss/reg": 6.114997813710943e-05, + "step": 210 + }, + { + "epoch": 0.026375, + "grad_norm": 2.48710560798645, + "grad_norm_var": 0.20482550381518247, + "learning_rate": 0.0001, + "loss": 1.2718, + "loss/crossentropy": 2.6183624267578125, + "loss/hidden": 1.0859375, + "loss/logits": 0.18522073328495026, + "loss/reg": 6.114102870924398e-05, + "step": 211 + }, + { + "epoch": 0.0265, + "grad_norm": 2.63779616355896, + "grad_norm_var": 0.1640915083279668, + "learning_rate": 0.0001, + "loss": 1.3499, + "loss/crossentropy": 2.391116142272949, + "loss/hidden": 1.1640625, + "loss/logits": 0.18524512648582458, + "loss/reg": 6.112866685725749e-05, + "step": 212 + }, + { + "epoch": 0.026625, + "grad_norm": 2.7476329803466797, + "grad_norm_var": 0.14889028663519804, + "learning_rate": 0.0001, + "loss": 1.2842, + "loss/crossentropy": 2.5770251750946045, + "loss/hidden": 1.1171875, + "loss/logits": 0.16641706228256226, + "loss/reg": 6.111864786362275e-05, + "step": 213 + }, + { + "epoch": 0.02675, + "grad_norm": 2.565723419189453, + "grad_norm_var": 0.14722036218699916, + "learning_rate": 0.0001, + "loss": 1.2381, + "loss/crossentropy": 2.80257248878479, + "loss/hidden": 1.0546875, + "loss/logits": 0.18279102444648743, + "loss/reg": 6.110716640250757e-05, + "step": 214 + }, + { + "epoch": 0.026875, + "grad_norm": 4.107775688171387, + "grad_norm_var": 0.2818514081658729, + "learning_rate": 0.0001, + "loss": 1.5243, + "loss/crossentropy": 2.4806065559387207, + "loss/hidden": 1.3046875, + "loss/logits": 0.2190462350845337, + "loss/reg": 6.109999230829999e-05, + "step": 215 + }, + { + "epoch": 0.027, + "grad_norm": 2.3829445838928223, + "grad_norm_var": 0.27569299833046823, + "learning_rate": 0.0001, + "loss": 1.2079, + "loss/crossentropy": 2.466684579849243, + "loss/hidden": 1.046875, + "loss/logits": 0.16046380996704102, + "loss/reg": 6.108790694270283e-05, + "step": 216 + }, + { + "epoch": 0.027125, + "grad_norm": 2.554863929748535, + "grad_norm_var": 0.2767468455530223, + "learning_rate": 0.0001, + "loss": 1.1988, + "loss/crossentropy": 2.582035541534424, + "loss/hidden": 1.046875, + "loss/logits": 0.15130122005939484, + "loss/reg": 6.1076192650944e-05, + "step": 217 + }, + { + "epoch": 0.02725, + "grad_norm": 2.7898809909820557, + "grad_norm_var": 0.26145832144768877, + "learning_rate": 0.0001, + "loss": 1.6592, + "loss/crossentropy": 2.655186414718628, + "loss/hidden": 1.3984375, + "loss/logits": 0.26013702154159546, + "loss/reg": 6.107001536292955e-05, + "step": 218 + }, + { + "epoch": 0.027375, + "grad_norm": 2.7881548404693604, + "grad_norm_var": 0.2378165583524293, + "learning_rate": 0.0001, + "loss": 1.5451, + "loss/crossentropy": 2.4413743019104004, + "loss/hidden": 1.3203125, + "loss/logits": 0.2241469919681549, + "loss/reg": 6.106249202275649e-05, + "step": 219 + }, + { + "epoch": 0.0275, + "grad_norm": 2.2896728515625, + "grad_norm_var": 0.24781162791184835, + "learning_rate": 0.0001, + "loss": 1.2198, + "loss/crossentropy": 2.4421772956848145, + "loss/hidden": 1.0703125, + "loss/logits": 0.14890027046203613, + "loss/reg": 6.105640932219103e-05, + "step": 220 + }, + { + "epoch": 0.027625, + "grad_norm": 2.324869155883789, + "grad_norm_var": 0.23120432182346703, + "learning_rate": 0.0001, + "loss": 1.3402, + "loss/crossentropy": 2.526216745376587, + "loss/hidden": 1.140625, + "loss/logits": 0.19898337125778198, + "loss/reg": 6.10438291914761e-05, + "step": 221 + }, + { + "epoch": 0.02775, + "grad_norm": 2.88158917427063, + "grad_norm_var": 0.22935101127255847, + "learning_rate": 0.0001, + "loss": 1.372, + "loss/crossentropy": 2.361729621887207, + "loss/hidden": 1.15625, + "loss/logits": 0.21510916948318481, + "loss/reg": 6.10318202234339e-05, + "step": 222 + }, + { + "epoch": 0.027875, + "grad_norm": 2.9760019779205322, + "grad_norm_var": 0.20104925696453316, + "learning_rate": 0.0001, + "loss": 1.2925, + "loss/crossentropy": 2.5573909282684326, + "loss/hidden": 1.1171875, + "loss/logits": 0.1747477501630783, + "loss/reg": 6.1027145420666784e-05, + "step": 223 + }, + { + "epoch": 0.028, + "grad_norm": 2.702091932296753, + "grad_norm_var": 0.19763696198550798, + "learning_rate": 0.0001, + "loss": 1.3524, + "loss/crossentropy": 2.717195510864258, + "loss/hidden": 1.15625, + "loss/logits": 0.19553202390670776, + "loss/reg": 6.1014961829641834e-05, + "step": 224 + }, + { + "epoch": 0.028125, + "grad_norm": 2.1232945919036865, + "grad_norm_var": 0.21708226542899425, + "learning_rate": 0.0001, + "loss": 1.2661, + "loss/crossentropy": 2.4481968879699707, + "loss/hidden": 1.0859375, + "loss/logits": 0.1795472800731659, + "loss/reg": 6.100164682720788e-05, + "step": 225 + }, + { + "epoch": 0.02825, + "grad_norm": 2.191066026687622, + "grad_norm_var": 0.2114830183011783, + "learning_rate": 0.0001, + "loss": 1.1895, + "loss/crossentropy": 2.34470534324646, + "loss/hidden": 1.03125, + "loss/logits": 0.15763415396213531, + "loss/reg": 6.099118763813749e-05, + "step": 226 + }, + { + "epoch": 0.028375, + "grad_norm": 2.3068013191223145, + "grad_norm_var": 0.21765702233228598, + "learning_rate": 0.0001, + "loss": 1.539, + "loss/crossentropy": 2.5549845695495605, + "loss/hidden": 1.328125, + "loss/logits": 0.21025767922401428, + "loss/reg": 6.09817034273874e-05, + "step": 227 + }, + { + "epoch": 0.0285, + "grad_norm": 2.890655279159546, + "grad_norm_var": 0.221304562186567, + "learning_rate": 0.0001, + "loss": 1.5638, + "loss/crossentropy": 2.2339606285095215, + "loss/hidden": 1.34375, + "loss/logits": 0.21939440071582794, + "loss/reg": 6.096933429944329e-05, + "step": 228 + }, + { + "epoch": 0.028625, + "grad_norm": 2.182521343231201, + "grad_norm_var": 0.2349577927735633, + "learning_rate": 0.0001, + "loss": 1.2085, + "loss/crossentropy": 2.641230583190918, + "loss/hidden": 1.046875, + "loss/logits": 0.161014586687088, + "loss/reg": 6.095720891607925e-05, + "step": 229 + }, + { + "epoch": 0.02875, + "grad_norm": 2.704406976699829, + "grad_norm_var": 0.23499684870281476, + "learning_rate": 0.0001, + "loss": 1.3456, + "loss/crossentropy": 2.6833486557006836, + "loss/hidden": 1.15625, + "loss/logits": 0.18876385688781738, + "loss/reg": 6.094613127061166e-05, + "step": 230 + }, + { + "epoch": 0.028875, + "grad_norm": 3.4925310611724854, + "grad_norm_var": 0.13802667852219105, + "learning_rate": 0.0001, + "loss": 1.3709, + "loss/crossentropy": 2.1604089736938477, + "loss/hidden": 1.1953125, + "loss/logits": 0.17500904202461243, + "loss/reg": 6.093499541748315e-05, + "step": 231 + }, + { + "epoch": 0.029, + "grad_norm": 2.344773530960083, + "grad_norm_var": 0.13921650701028032, + "learning_rate": 0.0001, + "loss": 1.4725, + "loss/crossentropy": 2.493307113647461, + "loss/hidden": 1.25, + "loss/logits": 0.22193682193756104, + "loss/reg": 6.092391777201556e-05, + "step": 232 + }, + { + "epoch": 0.029125, + "grad_norm": 1.8828089237213135, + "grad_norm_var": 0.17117140448626647, + "learning_rate": 0.0001, + "loss": 1.1104, + "loss/crossentropy": 2.5302743911743164, + "loss/hidden": 0.9765625, + "loss/logits": 0.1331850290298462, + "loss/reg": 6.0912472690688446e-05, + "step": 233 + }, + { + "epoch": 0.02925, + "grad_norm": 2.747770071029663, + "grad_norm_var": 0.16996031408720758, + "learning_rate": 0.0001, + "loss": 1.1371, + "loss/crossentropy": 2.4189980030059814, + "loss/hidden": 0.99609375, + "loss/logits": 0.14035619795322418, + "loss/reg": 6.089695307309739e-05, + "step": 234 + }, + { + "epoch": 0.029375, + "grad_norm": 1.8742481470108032, + "grad_norm_var": 0.1933626604088189, + "learning_rate": 0.0001, + "loss": 1.1601, + "loss/crossentropy": 2.2694003582000732, + "loss/hidden": 1.015625, + "loss/logits": 0.14385350048542023, + "loss/reg": 6.088387090130709e-05, + "step": 235 + }, + { + "epoch": 0.0295, + "grad_norm": 2.0313689708709717, + "grad_norm_var": 0.20459374724346724, + "learning_rate": 0.0001, + "loss": 1.2446, + "loss/crossentropy": 2.4902865886688232, + "loss/hidden": 1.0703125, + "loss/logits": 0.17369529604911804, + "loss/reg": 6.086897337809205e-05, + "step": 236 + }, + { + "epoch": 0.029625, + "grad_norm": 2.3882880210876465, + "grad_norm_var": 0.20354561810974156, + "learning_rate": 0.0001, + "loss": 1.3947, + "loss/crossentropy": 2.4032340049743652, + "loss/hidden": 1.1875, + "loss/logits": 0.20656049251556396, + "loss/reg": 6.085408676881343e-05, + "step": 237 + }, + { + "epoch": 0.02975, + "grad_norm": 1.7327938079833984, + "grad_norm_var": 0.22490130088653987, + "learning_rate": 0.0001, + "loss": 1.1777, + "loss/crossentropy": 2.4949777126312256, + "loss/hidden": 1.015625, + "loss/logits": 0.1614799201488495, + "loss/reg": 6.084307824494317e-05, + "step": 238 + }, + { + "epoch": 0.029875, + "grad_norm": 2.2483370304107666, + "grad_norm_var": 0.20314943964483845, + "learning_rate": 0.0001, + "loss": 1.331, + "loss/crossentropy": 2.5907418727874756, + "loss/hidden": 1.1328125, + "loss/logits": 0.19753864407539368, + "loss/reg": 6.0828475398011506e-05, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 2.5151193141937256, + "grad_norm_var": 0.19693662117647784, + "learning_rate": 0.0001, + "loss": 1.2278, + "loss/crossentropy": 2.6233856678009033, + "loss/hidden": 1.0546875, + "loss/logits": 0.1725194901227951, + "loss/reg": 6.0820282669737935e-05, + "step": 240 + }, + { + "epoch": 0.030125, + "grad_norm": 2.198249101638794, + "grad_norm_var": 0.19498660957211478, + "learning_rate": 0.0001, + "loss": 1.1441, + "loss/crossentropy": 2.368884563446045, + "loss/hidden": 0.99609375, + "loss/logits": 0.1473642736673355, + "loss/reg": 6.0812566516688094e-05, + "step": 241 + }, + { + "epoch": 0.03025, + "grad_norm": 2.195218563079834, + "grad_norm_var": 0.1948951313244331, + "learning_rate": 0.0001, + "loss": 1.2993, + "loss/crossentropy": 2.352041721343994, + "loss/hidden": 1.1171875, + "loss/logits": 0.1815069168806076, + "loss/reg": 6.080829552956857e-05, + "step": 242 + }, + { + "epoch": 0.030375, + "grad_norm": 2.6142425537109375, + "grad_norm_var": 0.19868367561009795, + "learning_rate": 0.0001, + "loss": 1.3644, + "loss/crossentropy": 2.497286558151245, + "loss/hidden": 1.1875, + "loss/logits": 0.17629210650920868, + "loss/reg": 6.0799306083936244e-05, + "step": 243 + }, + { + "epoch": 0.0305, + "grad_norm": 2.342033624649048, + "grad_norm_var": 0.1799734399041227, + "learning_rate": 0.0001, + "loss": 1.1311, + "loss/crossentropy": 2.5182478427886963, + "loss/hidden": 0.984375, + "loss/logits": 0.1461625099182129, + "loss/reg": 6.078776277718134e-05, + "step": 244 + }, + { + "epoch": 0.030625, + "grad_norm": 2.3943874835968018, + "grad_norm_var": 0.17823371257387344, + "learning_rate": 0.0001, + "loss": 1.1773, + "loss/crossentropy": 2.575707197189331, + "loss/hidden": 1.015625, + "loss/logits": 0.1610667109489441, + "loss/reg": 6.078143633203581e-05, + "step": 245 + }, + { + "epoch": 0.03075, + "grad_norm": 2.2752902507781982, + "grad_norm_var": 0.16984605758260846, + "learning_rate": 0.0001, + "loss": 1.3322, + "loss/crossentropy": 2.228628635406494, + "loss/hidden": 1.1484375, + "loss/logits": 0.18314987421035767, + "loss/reg": 6.077219222788699e-05, + "step": 246 + }, + { + "epoch": 0.030875, + "grad_norm": 2.1779940128326416, + "grad_norm_var": 0.07406002979102144, + "learning_rate": 0.0001, + "loss": 1.179, + "loss/crossentropy": 2.4325718879699707, + "loss/hidden": 1.0078125, + "loss/logits": 0.17062756419181824, + "loss/reg": 6.076457793824375e-05, + "step": 247 + }, + { + "epoch": 0.031, + "grad_norm": 2.031386613845825, + "grad_norm_var": 0.07614130749575872, + "learning_rate": 0.0001, + "loss": 1.3177, + "loss/crossentropy": 2.3050920963287354, + "loss/hidden": 1.1328125, + "loss/logits": 0.18426315486431122, + "loss/reg": 6.075216515455395e-05, + "step": 248 + }, + { + "epoch": 0.031125, + "grad_norm": 2.4880683422088623, + "grad_norm_var": 0.07117238958467732, + "learning_rate": 0.0001, + "loss": 1.2617, + "loss/crossentropy": 2.690160036087036, + "loss/hidden": 1.0625, + "loss/logits": 0.1985635757446289, + "loss/reg": 6.0742688219761476e-05, + "step": 249 + }, + { + "epoch": 0.03125, + "grad_norm": 2.631229877471924, + "grad_norm_var": 0.06453399427719399, + "learning_rate": 0.0001, + "loss": 1.3072, + "loss/crossentropy": 2.4459030628204346, + "loss/hidden": 1.109375, + "loss/logits": 0.1971898078918457, + "loss/reg": 6.0733007558155805e-05, + "step": 250 + }, + { + "epoch": 0.031375, + "grad_norm": 2.7028048038482666, + "grad_norm_var": 0.06497512863382227, + "learning_rate": 0.0001, + "loss": 1.3656, + "loss/crossentropy": 2.7830824851989746, + "loss/hidden": 1.1796875, + "loss/logits": 0.18533006310462952, + "loss/reg": 6.0722686612280086e-05, + "step": 251 + }, + { + "epoch": 0.0315, + "grad_norm": 3.7025880813598633, + "grad_norm_var": 0.17735395269518506, + "learning_rate": 0.0001, + "loss": 1.2542, + "loss/crossentropy": 2.4722542762756348, + "loss/hidden": 1.078125, + "loss/logits": 0.17551761865615845, + "loss/reg": 6.0708127421094105e-05, + "step": 252 + }, + { + "epoch": 0.031625, + "grad_norm": 2.1496498584747314, + "grad_norm_var": 0.18175923180052275, + "learning_rate": 0.0001, + "loss": 1.0403, + "loss/crossentropy": 2.4383487701416016, + "loss/hidden": 0.91015625, + "loss/logits": 0.12949630618095398, + "loss/reg": 6.069323717383668e-05, + "step": 253 + }, + { + "epoch": 0.03175, + "grad_norm": 3.212991237640381, + "grad_norm_var": 0.18702365671043306, + "learning_rate": 0.0001, + "loss": 1.3555, + "loss/crossentropy": 2.1896352767944336, + "loss/hidden": 1.1953125, + "loss/logits": 0.1595323085784912, + "loss/reg": 6.067836147849448e-05, + "step": 254 + }, + { + "epoch": 0.031875, + "grad_norm": 2.53044056892395, + "grad_norm_var": 0.18281462084492142, + "learning_rate": 0.0001, + "loss": 1.2462, + "loss/crossentropy": 2.8005239963531494, + "loss/hidden": 1.0625, + "loss/logits": 0.18304814398288727, + "loss/reg": 6.0668298829114065e-05, + "step": 255 + }, + { + "epoch": 0.032, + "grad_norm": 5.920226573944092, + "grad_norm_var": 0.9097630014084027, + "learning_rate": 0.0001, + "loss": 1.9011, + "loss/crossentropy": 2.2827932834625244, + "loss/hidden": 1.59375, + "loss/logits": 0.3067648708820343, + "loss/reg": 6.0657377616735175e-05, + "step": 256 + }, + { + "epoch": 0.032125, + "grad_norm": 3.144649028778076, + "grad_norm_var": 0.8995354429829506, + "learning_rate": 0.0001, + "loss": 1.2361, + "loss/crossentropy": 2.9163215160369873, + "loss/hidden": 1.078125, + "loss/logits": 0.15732741355895996, + "loss/reg": 6.064687840989791e-05, + "step": 257 + }, + { + "epoch": 0.03225, + "grad_norm": 2.677065849304199, + "grad_norm_var": 0.8763431299745091, + "learning_rate": 0.0001, + "loss": 1.3123, + "loss/crossentropy": 2.9036660194396973, + "loss/hidden": 1.125, + "loss/logits": 0.18664765357971191, + "loss/reg": 6.0635462432401255e-05, + "step": 258 + }, + { + "epoch": 0.032375, + "grad_norm": 1.9815617799758911, + "grad_norm_var": 0.9180593253885627, + "learning_rate": 0.0001, + "loss": 1.2567, + "loss/crossentropy": 2.6647751331329346, + "loss/hidden": 1.0703125, + "loss/logits": 0.18578888475894928, + "loss/reg": 6.062128159101121e-05, + "step": 259 + }, + { + "epoch": 0.0325, + "grad_norm": 2.6094260215759277, + "grad_norm_var": 0.9071755924568459, + "learning_rate": 0.0001, + "loss": 1.4176, + "loss/crossentropy": 2.9915220737457275, + "loss/hidden": 1.21875, + "loss/logits": 0.19824379682540894, + "loss/reg": 6.060625673853792e-05, + "step": 260 + }, + { + "epoch": 0.032625, + "grad_norm": 2.4859585762023926, + "grad_norm_var": 0.9028772625757899, + "learning_rate": 0.0001, + "loss": 1.2047, + "loss/crossentropy": 2.325611114501953, + "loss/hidden": 1.03125, + "loss/logits": 0.17281952500343323, + "loss/reg": 6.0591693909373134e-05, + "step": 261 + }, + { + "epoch": 0.03275, + "grad_norm": 4.910043716430664, + "grad_norm_var": 1.154144117287072, + "learning_rate": 0.0001, + "loss": 1.2858, + "loss/crossentropy": 2.568098306655884, + "loss/hidden": 1.109375, + "loss/logits": 0.17582398653030396, + "loss/reg": 6.057979408069514e-05, + "step": 262 + }, + { + "epoch": 0.032875, + "grad_norm": 2.2592694759368896, + "grad_norm_var": 1.1460852387432343, + "learning_rate": 0.0001, + "loss": 1.3156, + "loss/crossentropy": 2.5264766216278076, + "loss/hidden": 1.1171875, + "loss/logits": 0.19776055216789246, + "loss/reg": 6.056776692275889e-05, + "step": 263 + }, + { + "epoch": 0.033, + "grad_norm": 2.6964571475982666, + "grad_norm_var": 1.0909556269012999, + "learning_rate": 0.0001, + "loss": 1.0468, + "loss/crossentropy": 2.740647792816162, + "loss/hidden": 0.91796875, + "loss/logits": 0.12825211882591248, + "loss/reg": 6.0556718381121755e-05, + "step": 264 + }, + { + "epoch": 0.033125, + "grad_norm": 2.112201690673828, + "grad_norm_var": 1.125761935491216, + "learning_rate": 0.0001, + "loss": 1.2175, + "loss/crossentropy": 2.475130081176758, + "loss/hidden": 1.0390625, + "loss/logits": 0.1778050661087036, + "loss/reg": 6.0543683503055945e-05, + "step": 265 + }, + { + "epoch": 0.03325, + "grad_norm": 1.8527328968048096, + "grad_norm_var": 1.2001448152569836, + "learning_rate": 0.0001, + "loss": 1.1913, + "loss/crossentropy": 2.2017788887023926, + "loss/hidden": 1.0234375, + "loss/logits": 0.16727614402770996, + "loss/reg": 6.053145989426412e-05, + "step": 266 + }, + { + "epoch": 0.033375, + "grad_norm": 2.2294929027557373, + "grad_norm_var": 1.2287526925730277, + "learning_rate": 0.0001, + "loss": 1.3521, + "loss/crossentropy": 2.268073558807373, + "loss/hidden": 1.1640625, + "loss/logits": 0.18739831447601318, + "loss/reg": 6.052442768123001e-05, + "step": 267 + }, + { + "epoch": 0.0335, + "grad_norm": 2.185410499572754, + "grad_norm_var": 1.2112062552861744, + "learning_rate": 0.0001, + "loss": 1.44, + "loss/crossentropy": 2.390622138977051, + "loss/hidden": 1.234375, + "loss/logits": 0.20500804483890533, + "loss/reg": 6.051711898180656e-05, + "step": 268 + }, + { + "epoch": 0.033625, + "grad_norm": 2.616452693939209, + "grad_norm_var": 1.1837342905938153, + "learning_rate": 0.0001, + "loss": 1.3338, + "loss/crossentropy": 2.3374340534210205, + "loss/hidden": 1.15625, + "loss/logits": 0.17693625390529633, + "loss/reg": 6.0506343288579956e-05, + "step": 269 + }, + { + "epoch": 0.03375, + "grad_norm": 2.5214874744415283, + "grad_norm_var": 1.1791403953024882, + "learning_rate": 0.0001, + "loss": 1.4572, + "loss/crossentropy": 2.6334807872772217, + "loss/hidden": 1.25, + "loss/logits": 0.20655225217342377, + "loss/reg": 6.0493421187857166e-05, + "step": 270 + }, + { + "epoch": 0.033875, + "grad_norm": 2.3426766395568848, + "grad_norm_var": 1.18798729537596, + "learning_rate": 0.0001, + "loss": 1.2858, + "loss/crossentropy": 2.362666130065918, + "loss/hidden": 1.1171875, + "loss/logits": 0.16799038648605347, + "loss/reg": 6.047951683285646e-05, + "step": 271 + }, + { + "epoch": 0.034, + "grad_norm": 2.483227491378784, + "grad_norm_var": 0.4891016266434789, + "learning_rate": 0.0001, + "loss": 1.4126, + "loss/crossentropy": 2.6330323219299316, + "loss/hidden": 1.203125, + "loss/logits": 0.20882482826709747, + "loss/reg": 6.046749331289902e-05, + "step": 272 + }, + { + "epoch": 0.034125, + "grad_norm": 3.3453869819641113, + "grad_norm_var": 0.5070205087741229, + "learning_rate": 0.0001, + "loss": 1.3731, + "loss/crossentropy": 2.6637308597564697, + "loss/hidden": 1.171875, + "loss/logits": 0.20059773325920105, + "loss/reg": 6.0458773077698424e-05, + "step": 273 + }, + { + "epoch": 0.03425, + "grad_norm": 2.2971482276916504, + "grad_norm_var": 0.5112160036914843, + "learning_rate": 0.0001, + "loss": 1.3516, + "loss/crossentropy": 2.400428533554077, + "loss/hidden": 1.1640625, + "loss/logits": 0.18688717484474182, + "loss/reg": 6.0452930483734235e-05, + "step": 274 + }, + { + "epoch": 0.034375, + "grad_norm": 11.117164611816406, + "grad_norm_var": 5.025199240890341, + "learning_rate": 0.0001, + "loss": 2.1956, + "loss/crossentropy": 2.7653286457061768, + "loss/hidden": 1.8984375, + "loss/logits": 0.2965186834335327, + "loss/reg": 6.045090049155988e-05, + "step": 275 + }, + { + "epoch": 0.0345, + "grad_norm": 3.6517550945281982, + "grad_norm_var": 5.020888752799834, + "learning_rate": 0.0001, + "loss": 1.4104, + "loss/crossentropy": 2.8897998332977295, + "loss/hidden": 1.1484375, + "loss/logits": 0.26139265298843384, + "loss/reg": 6.0451366152847186e-05, + "step": 276 + }, + { + "epoch": 0.034625, + "grad_norm": 2.6342201232910156, + "grad_norm_var": 5.008262345647254, + "learning_rate": 0.0001, + "loss": 1.272, + "loss/crossentropy": 2.662801504135132, + "loss/hidden": 1.09375, + "loss/logits": 0.17764705419540405, + "loss/reg": 6.0443973779911175e-05, + "step": 277 + }, + { + "epoch": 0.03475, + "grad_norm": 2.613866090774536, + "grad_norm_var": 4.815302301096653, + "learning_rate": 0.0001, + "loss": 1.3, + "loss/crossentropy": 2.2599401473999023, + "loss/hidden": 1.125, + "loss/logits": 0.1744215488433838, + "loss/reg": 6.04407032369636e-05, + "step": 278 + }, + { + "epoch": 0.034875, + "grad_norm": 2.4121639728546143, + "grad_norm_var": 4.800441045565859, + "learning_rate": 0.0001, + "loss": 1.2736, + "loss/crossentropy": 2.3868885040283203, + "loss/hidden": 1.109375, + "loss/logits": 0.16360533237457275, + "loss/reg": 6.0438182117650285e-05, + "step": 279 + }, + { + "epoch": 0.035, + "grad_norm": 2.257427930831909, + "grad_norm_var": 4.834324037466968, + "learning_rate": 0.0001, + "loss": 1.3236, + "loss/crossentropy": 2.452359914779663, + "loss/hidden": 1.1328125, + "loss/logits": 0.19017404317855835, + "loss/reg": 6.043619578122161e-05, + "step": 280 + }, + { + "epoch": 0.035125, + "grad_norm": 2.3916571140289307, + "grad_norm_var": 4.8045581397439525, + "learning_rate": 0.0001, + "loss": 1.3161, + "loss/crossentropy": 2.4834201335906982, + "loss/hidden": 1.109375, + "loss/logits": 0.20611721277236938, + "loss/reg": 6.043669054633938e-05, + "step": 281 + }, + { + "epoch": 0.03525, + "grad_norm": 2.815398931503296, + "grad_norm_var": 4.707581175884913, + "learning_rate": 0.0001, + "loss": 1.1312, + "loss/crossentropy": 3.0801713466644287, + "loss/hidden": 0.98828125, + "loss/logits": 0.14229975640773773, + "loss/reg": 6.044648034730926e-05, + "step": 282 + }, + { + "epoch": 0.035375, + "grad_norm": 3.1715469360351562, + "grad_norm_var": 4.651233430019207, + "learning_rate": 0.0001, + "loss": 1.409, + "loss/crossentropy": 2.354785919189453, + "loss/hidden": 1.1953125, + "loss/logits": 0.21305763721466064, + "loss/reg": 6.0437832871684805e-05, + "step": 283 + }, + { + "epoch": 0.0355, + "grad_norm": 2.5010037422180176, + "grad_norm_var": 4.615667456235268, + "learning_rate": 0.0001, + "loss": 1.3572, + "loss/crossentropy": 2.492047071456909, + "loss/hidden": 1.1640625, + "loss/logits": 0.1925477683544159, + "loss/reg": 6.044709516572766e-05, + "step": 284 + }, + { + "epoch": 0.035625, + "grad_norm": 1.964429259300232, + "grad_norm_var": 4.6928209367171645, + "learning_rate": 0.0001, + "loss": 1.1671, + "loss/crossentropy": 2.3351125717163086, + "loss/hidden": 0.99609375, + "loss/logits": 0.1704423427581787, + "loss/reg": 6.0453679907368496e-05, + "step": 285 + }, + { + "epoch": 0.03575, + "grad_norm": 2.3656678199768066, + "grad_norm_var": 4.707552916907375, + "learning_rate": 0.0001, + "loss": 1.5385, + "loss/crossentropy": 2.4216158390045166, + "loss/hidden": 1.28125, + "loss/logits": 0.2566841244697571, + "loss/reg": 6.0443537222454324e-05, + "step": 286 + }, + { + "epoch": 0.035875, + "grad_norm": 3.140928030014038, + "grad_norm_var": 4.661686527481659, + "learning_rate": 0.0001, + "loss": 1.3637, + "loss/crossentropy": 2.8347983360290527, + "loss/hidden": 1.15625, + "loss/logits": 0.20682096481323242, + "loss/reg": 6.043089888407849e-05, + "step": 287 + }, + { + "epoch": 0.036, + "grad_norm": 2.6460797786712646, + "grad_norm_var": 4.647830565858565, + "learning_rate": 0.0001, + "loss": 1.3928, + "loss/crossentropy": 2.108215093612671, + "loss/hidden": 1.2109375, + "loss/logits": 0.18129181861877441, + "loss/reg": 6.042820677976124e-05, + "step": 288 + }, + { + "epoch": 0.036125, + "grad_norm": 2.879531145095825, + "grad_norm_var": 4.652852381956769, + "learning_rate": 0.0001, + "loss": 1.4359, + "loss/crossentropy": 2.90163516998291, + "loss/hidden": 1.25, + "loss/logits": 0.1853410005569458, + "loss/reg": 6.042792301741429e-05, + "step": 289 + }, + { + "epoch": 0.03625, + "grad_norm": 2.5701370239257812, + "grad_norm_var": 4.625421100051376, + "learning_rate": 0.0001, + "loss": 1.3639, + "loss/crossentropy": 2.6896326541900635, + "loss/hidden": 1.15625, + "loss/logits": 0.2070741057395935, + "loss/reg": 6.0414979088818654e-05, + "step": 290 + }, + { + "epoch": 0.036375, + "grad_norm": 2.988196849822998, + "grad_norm_var": 0.16977142791367086, + "learning_rate": 0.0001, + "loss": 1.103, + "loss/crossentropy": 2.8485705852508545, + "loss/hidden": 0.96875, + "loss/logits": 0.13362044095993042, + "loss/reg": 6.0413527535274625e-05, + "step": 291 + }, + { + "epoch": 0.0365, + "grad_norm": 5.9153923988342285, + "grad_norm_var": 0.7809789933836029, + "learning_rate": 0.0001, + "loss": 1.6292, + "loss/crossentropy": 2.607590436935425, + "loss/hidden": 1.4375, + "loss/logits": 0.19109681248664856, + "loss/reg": 6.041422238922678e-05, + "step": 292 + }, + { + "epoch": 0.036625, + "grad_norm": 1.932381510734558, + "grad_norm_var": 0.8300136192923785, + "learning_rate": 0.0001, + "loss": 1.1314, + "loss/crossentropy": 2.2319207191467285, + "loss/hidden": 0.9921875, + "loss/logits": 0.13856041431427002, + "loss/reg": 6.041810775059275e-05, + "step": 293 + }, + { + "epoch": 0.03675, + "grad_norm": 2.1218042373657227, + "grad_norm_var": 0.8563980373093443, + "learning_rate": 0.0001, + "loss": 1.1898, + "loss/crossentropy": 2.7033910751342773, + "loss/hidden": 1.03125, + "loss/logits": 0.15791726112365723, + "loss/reg": 6.0404745454434305e-05, + "step": 294 + }, + { + "epoch": 0.036875, + "grad_norm": 3.239748954772949, + "grad_norm_var": 0.8614170936653748, + "learning_rate": 0.0001, + "loss": 1.6186, + "loss/crossentropy": 2.3478281497955322, + "loss/hidden": 1.3671875, + "loss/logits": 0.2507687509059906, + "loss/reg": 6.039286745362915e-05, + "step": 295 + }, + { + "epoch": 0.037, + "grad_norm": 2.361431121826172, + "grad_norm_var": 0.8544814148079373, + "learning_rate": 0.0001, + "loss": 1.2822, + "loss/crossentropy": 2.4396111965179443, + "loss/hidden": 1.0859375, + "loss/logits": 0.19569119811058044, + "loss/reg": 6.0390335420379415e-05, + "step": 296 + }, + { + "epoch": 0.037125, + "grad_norm": 2.6921112537384033, + "grad_norm_var": 0.8432509023111928, + "learning_rate": 0.0001, + "loss": 1.3584, + "loss/crossentropy": 2.3235762119293213, + "loss/hidden": 1.15625, + "loss/logits": 0.20157676935195923, + "loss/reg": 6.037576531525701e-05, + "step": 297 + }, + { + "epoch": 0.03725, + "grad_norm": 2.2376601696014404, + "grad_norm_var": 0.8653611900667765, + "learning_rate": 0.0001, + "loss": 1.3703, + "loss/crossentropy": 2.441978693008423, + "loss/hidden": 1.1875, + "loss/logits": 0.1821848303079605, + "loss/reg": 6.036146805854514e-05, + "step": 298 + }, + { + "epoch": 0.037375, + "grad_norm": 2.5022082328796387, + "grad_norm_var": 0.8598019948407729, + "learning_rate": 0.0001, + "loss": 1.2909, + "loss/crossentropy": 2.4099972248077393, + "loss/hidden": 1.09375, + "loss/logits": 0.1965959370136261, + "loss/reg": 6.035445403540507e-05, + "step": 299 + }, + { + "epoch": 0.0375, + "grad_norm": 2.323599338531494, + "grad_norm_var": 0.8677455500426021, + "learning_rate": 0.0001, + "loss": 1.2301, + "loss/crossentropy": 2.714334011077881, + "loss/hidden": 1.0625, + "loss/logits": 0.16703477501869202, + "loss/reg": 6.034153193468228e-05, + "step": 300 + }, + { + "epoch": 0.037625, + "grad_norm": 2.902794361114502, + "grad_norm_var": 0.8254198045813945, + "learning_rate": 0.0001, + "loss": 1.287, + "loss/crossentropy": 2.5897319316864014, + "loss/hidden": 1.1171875, + "loss/logits": 0.1692187488079071, + "loss/reg": 6.032464443705976e-05, + "step": 301 + }, + { + "epoch": 0.03775, + "grad_norm": 2.455423355102539, + "grad_norm_var": 0.8207107650276014, + "learning_rate": 0.0001, + "loss": 1.3118, + "loss/crossentropy": 2.2553625106811523, + "loss/hidden": 1.15625, + "loss/logits": 0.15494795143604279, + "loss/reg": 6.031416342011653e-05, + "step": 302 + }, + { + "epoch": 0.037875, + "grad_norm": 2.70770001411438, + "grad_norm_var": 0.8131429553718594, + "learning_rate": 0.0001, + "loss": 1.3645, + "loss/crossentropy": 2.298628807067871, + "loss/hidden": 1.1875, + "loss/logits": 0.17642799019813538, + "loss/reg": 6.029937867424451e-05, + "step": 303 + }, + { + "epoch": 0.038, + "grad_norm": 2.4096872806549072, + "grad_norm_var": 0.8208490888498592, + "learning_rate": 0.0001, + "loss": 1.2573, + "loss/crossentropy": 2.6787161827087402, + "loss/hidden": 1.078125, + "loss/logits": 0.17861339449882507, + "loss/reg": 6.027881318004802e-05, + "step": 304 + }, + { + "epoch": 0.038125, + "grad_norm": 2.364800214767456, + "grad_norm_var": 0.8295471446711137, + "learning_rate": 0.0001, + "loss": 1.3251, + "loss/crossentropy": 2.351970911026001, + "loss/hidden": 1.140625, + "loss/logits": 0.18391045928001404, + "loss/reg": 6.026409027981572e-05, + "step": 305 + }, + { + "epoch": 0.03825, + "grad_norm": 2.0991923809051514, + "grad_norm_var": 0.8536240669336511, + "learning_rate": 0.0001, + "loss": 1.078, + "loss/crossentropy": 2.7187068462371826, + "loss/hidden": 0.9453125, + "loss/logits": 0.13205038011074066, + "loss/reg": 6.0248257796047255e-05, + "step": 306 + }, + { + "epoch": 0.038375, + "grad_norm": 2.7471582889556885, + "grad_norm_var": 0.8481018158238611, + "learning_rate": 0.0001, + "loss": 1.4035, + "loss/crossentropy": 2.1265523433685303, + "loss/hidden": 1.21875, + "loss/logits": 0.18416792154312134, + "loss/reg": 6.0230733652133495e-05, + "step": 307 + }, + { + "epoch": 0.0385, + "grad_norm": 2.2592687606811523, + "grad_norm_var": 0.11041007633642194, + "learning_rate": 0.0001, + "loss": 1.271, + "loss/crossentropy": 2.66719651222229, + "loss/hidden": 1.0859375, + "loss/logits": 0.184452086687088, + "loss/reg": 6.0217109421500936e-05, + "step": 308 + }, + { + "epoch": 0.038625, + "grad_norm": 2.2400615215301514, + "grad_norm_var": 0.09468951175299385, + "learning_rate": 0.0001, + "loss": 1.2348, + "loss/crossentropy": 2.3710193634033203, + "loss/hidden": 1.0625, + "loss/logits": 0.1717246174812317, + "loss/reg": 6.020214277668856e-05, + "step": 309 + }, + { + "epoch": 0.03875, + "grad_norm": 2.0783209800720215, + "grad_norm_var": 0.09687885973874776, + "learning_rate": 0.0001, + "loss": 1.2085, + "loss/crossentropy": 2.2699692249298096, + "loss/hidden": 1.03125, + "loss/logits": 0.17665645480155945, + "loss/reg": 6.018438944010995e-05, + "step": 310 + }, + { + "epoch": 0.038875, + "grad_norm": 2.077648162841797, + "grad_norm_var": 0.06299334570375853, + "learning_rate": 0.0001, + "loss": 1.2169, + "loss/crossentropy": 2.334127426147461, + "loss/hidden": 1.0625, + "loss/logits": 0.15378312766551971, + "loss/reg": 6.0161146393511444e-05, + "step": 311 + }, + { + "epoch": 0.039, + "grad_norm": 2.440629482269287, + "grad_norm_var": 0.06293910816862744, + "learning_rate": 0.0001, + "loss": 1.2956, + "loss/crossentropy": 2.791874408721924, + "loss/hidden": 1.1171875, + "loss/logits": 0.1777758002281189, + "loss/reg": 6.014638711349107e-05, + "step": 312 + }, + { + "epoch": 0.039125, + "grad_norm": 2.853940963745117, + "grad_norm_var": 0.07069242228717272, + "learning_rate": 0.0001, + "loss": 1.2688, + "loss/crossentropy": 2.5036516189575195, + "loss/hidden": 1.0859375, + "loss/logits": 0.18226328492164612, + "loss/reg": 6.013087840983644e-05, + "step": 313 + }, + { + "epoch": 0.03925, + "grad_norm": 3.287529230117798, + "grad_norm_var": 0.11423125477930943, + "learning_rate": 0.0001, + "loss": 1.2435, + "loss/crossentropy": 2.696265697479248, + "loss/hidden": 1.0703125, + "loss/logits": 0.17254707217216492, + "loss/reg": 6.011854929965921e-05, + "step": 314 + }, + { + "epoch": 0.039375, + "grad_norm": 3.1080963611602783, + "grad_norm_var": 0.1386158794861321, + "learning_rate": 0.0001, + "loss": 1.473, + "loss/crossentropy": 2.1882760524749756, + "loss/hidden": 1.25, + "loss/logits": 0.2224160134792328, + "loss/reg": 6.0103353462181985e-05, + "step": 315 + }, + { + "epoch": 0.0395, + "grad_norm": 2.7303977012634277, + "grad_norm_var": 0.13818442385569654, + "learning_rate": 0.0001, + "loss": 1.4029, + "loss/crossentropy": 2.361660957336426, + "loss/hidden": 1.2109375, + "loss/logits": 0.19139324128627777, + "loss/reg": 6.008424679748714e-05, + "step": 316 + }, + { + "epoch": 0.039625, + "grad_norm": 1.7651097774505615, + "grad_norm_var": 0.16520987140884788, + "learning_rate": 0.0001, + "loss": 1.0765, + "loss/crossentropy": 2.435858964920044, + "loss/hidden": 0.953125, + "loss/logits": 0.1227254569530487, + "loss/reg": 6.007165211485699e-05, + "step": 317 + }, + { + "epoch": 0.03975, + "grad_norm": 2.128772258758545, + "grad_norm_var": 0.17279926669385734, + "learning_rate": 0.0001, + "loss": 1.1848, + "loss/crossentropy": 2.334495782852173, + "loss/hidden": 1.0546875, + "loss/logits": 0.12953956425189972, + "loss/reg": 6.005321120028384e-05, + "step": 318 + }, + { + "epoch": 0.039875, + "grad_norm": 2.1308538913726807, + "grad_norm_var": 0.1742483958439737, + "learning_rate": 0.0001, + "loss": 1.3191, + "loss/crossentropy": 2.3873021602630615, + "loss/hidden": 1.125, + "loss/logits": 0.19348952174186707, + "loss/reg": 6.0041034885216504e-05, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 2.706742286682129, + "grad_norm_var": 0.17935140917835876, + "learning_rate": 0.0001, + "loss": 1.4123, + "loss/crossentropy": 2.5321033000946045, + "loss/hidden": 1.203125, + "loss/logits": 0.20852993428707123, + "loss/reg": 6.002993177389726e-05, + "step": 320 + }, + { + "epoch": 0.040125, + "grad_norm": 6.118154525756836, + "grad_norm_var": 1.0228689502418715, + "learning_rate": 0.0001, + "loss": 1.7298, + "loss/crossentropy": 2.457045316696167, + "loss/hidden": 1.515625, + "loss/logits": 0.2136228382587433, + "loss/reg": 6.001694418955594e-05, + "step": 321 + }, + { + "epoch": 0.04025, + "grad_norm": 3.091947317123413, + "grad_norm_var": 1.0084811477178388, + "learning_rate": 0.0001, + "loss": 1.6635, + "loss/crossentropy": 2.6943020820617676, + "loss/hidden": 1.40625, + "loss/logits": 0.25662127137184143, + "loss/reg": 6.0004946135450155e-05, + "step": 322 + }, + { + "epoch": 0.040375, + "grad_norm": 2.488391637802124, + "grad_norm_var": 1.0122566583255546, + "learning_rate": 0.0001, + "loss": 1.2065, + "loss/crossentropy": 2.646897792816162, + "loss/hidden": 1.0546875, + "loss/logits": 0.15123483538627625, + "loss/reg": 5.9991711168549955e-05, + "step": 323 + }, + { + "epoch": 0.0405, + "grad_norm": 3.0675456523895264, + "grad_norm_var": 1.0035307165437406, + "learning_rate": 0.0001, + "loss": 1.4832, + "loss/crossentropy": 2.4176406860351562, + "loss/hidden": 1.2421875, + "loss/logits": 0.24040505290031433, + "loss/reg": 5.9981128288200125e-05, + "step": 324 + }, + { + "epoch": 0.040625, + "grad_norm": 2.424546957015991, + "grad_norm_var": 0.9926314451715664, + "learning_rate": 0.0001, + "loss": 1.07, + "loss/crossentropy": 2.703134059906006, + "loss/hidden": 0.94140625, + "loss/logits": 0.12803316116333008, + "loss/reg": 5.997138941893354e-05, + "step": 325 + }, + { + "epoch": 0.04075, + "grad_norm": 2.9345507621765137, + "grad_norm_var": 0.9582126623175621, + "learning_rate": 0.0001, + "loss": 1.4247, + "loss/crossentropy": 2.8940789699554443, + "loss/hidden": 1.21875, + "loss/logits": 0.20539763569831848, + "loss/reg": 5.996019172016531e-05, + "step": 326 + }, + { + "epoch": 0.040875, + "grad_norm": 3.069572925567627, + "grad_norm_var": 0.9195850402401864, + "learning_rate": 0.0001, + "loss": 1.3896, + "loss/crossentropy": 2.4416871070861816, + "loss/hidden": 1.1875, + "loss/logits": 0.20154833793640137, + "loss/reg": 5.9947429690510035e-05, + "step": 327 + }, + { + "epoch": 0.041, + "grad_norm": 2.323606491088867, + "grad_norm_var": 0.9275566292830253, + "learning_rate": 0.0001, + "loss": 1.2888, + "loss/crossentropy": 2.811528444290161, + "loss/hidden": 1.1015625, + "loss/logits": 0.18662354350090027, + "loss/reg": 5.9936231991741806e-05, + "step": 328 + }, + { + "epoch": 0.041125, + "grad_norm": 3.1679723262786865, + "grad_norm_var": 0.9322370885273564, + "learning_rate": 0.0001, + "loss": 1.5559, + "loss/crossentropy": 2.3170981407165527, + "loss/hidden": 1.3046875, + "loss/logits": 0.2506353557109833, + "loss/reg": 5.991987563902512e-05, + "step": 329 + }, + { + "epoch": 0.04125, + "grad_norm": 2.7683303356170654, + "grad_norm_var": 0.9228798875820224, + "learning_rate": 0.0001, + "loss": 1.3127, + "loss/crossentropy": 2.51680850982666, + "loss/hidden": 1.1171875, + "loss/logits": 0.1949077993631363, + "loss/reg": 5.990756835672073e-05, + "step": 330 + }, + { + "epoch": 0.041375, + "grad_norm": 2.4825031757354736, + "grad_norm_var": 0.9280253827718864, + "learning_rate": 0.0001, + "loss": 1.3408, + "loss/crossentropy": 2.605055332183838, + "loss/hidden": 1.140625, + "loss/logits": 0.19955970346927643, + "loss/reg": 5.989522469462827e-05, + "step": 331 + }, + { + "epoch": 0.0415, + "grad_norm": 3.2399041652679443, + "grad_norm_var": 0.9369785308922095, + "learning_rate": 0.0001, + "loss": 1.5753, + "loss/crossentropy": 2.7269279956817627, + "loss/hidden": 1.3515625, + "loss/logits": 0.22315430641174316, + "loss/reg": 5.988113844068721e-05, + "step": 332 + }, + { + "epoch": 0.041625, + "grad_norm": 2.8936927318573, + "grad_norm_var": 0.8504314928241191, + "learning_rate": 0.0001, + "loss": 1.3222, + "loss/crossentropy": 2.812412738800049, + "loss/hidden": 1.140625, + "loss/logits": 0.1809367835521698, + "loss/reg": 5.9867059462703764e-05, + "step": 333 + }, + { + "epoch": 0.04175, + "grad_norm": 2.432213068008423, + "grad_norm_var": 0.8233723477256942, + "learning_rate": 0.0001, + "loss": 1.4094, + "loss/crossentropy": 2.6377694606781006, + "loss/hidden": 1.203125, + "loss/logits": 0.20563456416130066, + "loss/reg": 5.9853711718460545e-05, + "step": 334 + }, + { + "epoch": 0.041875, + "grad_norm": 2.422299861907959, + "grad_norm_var": 0.7965082638815336, + "learning_rate": 0.0001, + "loss": 1.2328, + "loss/crossentropy": 2.5352189540863037, + "loss/hidden": 1.078125, + "loss/logits": 0.15405428409576416, + "loss/reg": 5.984482049825601e-05, + "step": 335 + }, + { + "epoch": 0.042, + "grad_norm": 2.703420877456665, + "grad_norm_var": 0.7966286375145801, + "learning_rate": 0.0001, + "loss": 1.2981, + "loss/crossentropy": 2.525949716567993, + "loss/hidden": 1.1015625, + "loss/logits": 0.1959662139415741, + "loss/reg": 5.983649680274539e-05, + "step": 336 + }, + { + "epoch": 0.042125, + "grad_norm": 3.625760078430176, + "grad_norm_var": 0.14094485019601447, + "learning_rate": 0.0001, + "loss": 1.6517, + "loss/crossentropy": 1.9824917316436768, + "loss/hidden": 1.3828125, + "loss/logits": 0.2682979702949524, + "loss/reg": 5.9825455537065864e-05, + "step": 337 + }, + { + "epoch": 0.04225, + "grad_norm": 2.2066762447357178, + "grad_norm_var": 0.1579467344221198, + "learning_rate": 0.0001, + "loss": 1.1768, + "loss/crossentropy": 2.5151102542877197, + "loss/hidden": 1.0078125, + "loss/logits": 0.16843904554843903, + "loss/reg": 5.981199865345843e-05, + "step": 338 + }, + { + "epoch": 0.042375, + "grad_norm": 2.961968421936035, + "grad_norm_var": 0.15445451920782696, + "learning_rate": 0.0001, + "loss": 1.5446, + "loss/crossentropy": 2.397102117538452, + "loss/hidden": 1.3046875, + "loss/logits": 0.23934724926948547, + "loss/reg": 5.979971319902688e-05, + "step": 339 + }, + { + "epoch": 0.0425, + "grad_norm": 2.4696779251098633, + "grad_norm_var": 0.15509145555751214, + "learning_rate": 0.0001, + "loss": 1.2907, + "loss/crossentropy": 2.518648624420166, + "loss/hidden": 1.1171875, + "loss/logits": 0.17289261519908905, + "loss/reg": 5.979237175779417e-05, + "step": 340 + }, + { + "epoch": 0.042625, + "grad_norm": 2.2886741161346436, + "grad_norm_var": 0.16228478040589658, + "learning_rate": 0.0001, + "loss": 1.2915, + "loss/crossentropy": 2.4755570888519287, + "loss/hidden": 1.109375, + "loss/logits": 0.18152545392513275, + "loss/reg": 5.978640547255054e-05, + "step": 341 + }, + { + "epoch": 0.04275, + "grad_norm": 2.4154622554779053, + "grad_norm_var": 0.16631279956205466, + "learning_rate": 0.0001, + "loss": 1.1361, + "loss/crossentropy": 2.620903730392456, + "loss/hidden": 0.9921875, + "loss/logits": 0.1432739496231079, + "loss/reg": 5.977362161502242e-05, + "step": 342 + }, + { + "epoch": 0.042875, + "grad_norm": 3.9107778072357178, + "grad_norm_var": 0.25008606934497735, + "learning_rate": 0.0001, + "loss": 1.6206, + "loss/crossentropy": 3.3820858001708984, + "loss/hidden": 1.40625, + "loss/logits": 0.21375682950019836, + "loss/reg": 5.976331885904074e-05, + "step": 343 + }, + { + "epoch": 0.043, + "grad_norm": 2.2201833724975586, + "grad_norm_var": 0.25690416036597197, + "learning_rate": 0.0001, + "loss": 1.2446, + "loss/crossentropy": 2.467216730117798, + "loss/hidden": 1.0625, + "loss/logits": 0.18146604299545288, + "loss/reg": 5.975304884486832e-05, + "step": 344 + }, + { + "epoch": 0.043125, + "grad_norm": 2.1915907859802246, + "grad_norm_var": 0.26377805805320803, + "learning_rate": 0.0001, + "loss": 1.4337, + "loss/crossentropy": 2.3638522624969482, + "loss/hidden": 1.203125, + "loss/logits": 0.22996577620506287, + "loss/reg": 5.974585292278789e-05, + "step": 345 + }, + { + "epoch": 0.04325, + "grad_norm": 2.2508416175842285, + "grad_norm_var": 0.27594342104869135, + "learning_rate": 0.0001, + "loss": 1.1804, + "loss/crossentropy": 2.5332260131835938, + "loss/hidden": 1.0234375, + "loss/logits": 0.15640094876289368, + "loss/reg": 5.973771112621762e-05, + "step": 346 + }, + { + "epoch": 0.043375, + "grad_norm": 2.0090150833129883, + "grad_norm_var": 0.30177518099136, + "learning_rate": 0.0001, + "loss": 1.2994, + "loss/crossentropy": 2.511950731277466, + "loss/hidden": 1.125, + "loss/logits": 0.17384442687034607, + "loss/reg": 5.9728798078140244e-05, + "step": 347 + }, + { + "epoch": 0.0435, + "grad_norm": 2.7306134700775146, + "grad_norm_var": 0.277258656834267, + "learning_rate": 0.0001, + "loss": 1.4199, + "loss/crossentropy": 2.6389760971069336, + "loss/hidden": 1.1875, + "loss/logits": 0.2318291962146759, + "loss/reg": 5.9719615819631144e-05, + "step": 348 + }, + { + "epoch": 0.043625, + "grad_norm": 2.270148515701294, + "grad_norm_var": 0.27783213891549774, + "learning_rate": 0.0001, + "loss": 1.5484, + "loss/crossentropy": 2.312831163406372, + "loss/hidden": 1.2890625, + "loss/logits": 0.2587454915046692, + "loss/reg": 5.970869824523106e-05, + "step": 349 + }, + { + "epoch": 0.04375, + "grad_norm": 2.0988070964813232, + "grad_norm_var": 0.2908751450016543, + "learning_rate": 0.0001, + "loss": 1.2681, + "loss/crossentropy": 2.378908634185791, + "loss/hidden": 1.109375, + "loss/logits": 0.1581004559993744, + "loss/reg": 5.970033089397475e-05, + "step": 350 + }, + { + "epoch": 0.043875, + "grad_norm": 2.045546770095825, + "grad_norm_var": 0.30608582246859417, + "learning_rate": 0.0001, + "loss": 1.1063, + "loss/crossentropy": 2.4011952877044678, + "loss/hidden": 0.96875, + "loss/logits": 0.13691341876983643, + "loss/reg": 5.969877020106651e-05, + "step": 351 + }, + { + "epoch": 0.044, + "grad_norm": 2.9582409858703613, + "grad_norm_var": 0.316207957574548, + "learning_rate": 0.0001, + "loss": 1.2072, + "loss/crossentropy": 2.643101215362549, + "loss/hidden": 1.046875, + "loss/logits": 0.15975871682167053, + "loss/reg": 5.9694295487133786e-05, + "step": 352 + }, + { + "epoch": 0.044125, + "grad_norm": 2.125020742416382, + "grad_norm_var": 0.23988746234485703, + "learning_rate": 0.0001, + "loss": 1.2268, + "loss/crossentropy": 2.5923550128936768, + "loss/hidden": 1.0390625, + "loss/logits": 0.18714120984077454, + "loss/reg": 5.968381810816936e-05, + "step": 353 + }, + { + "epoch": 0.04425, + "grad_norm": 2.2348685264587402, + "grad_norm_var": 0.2390334750954897, + "learning_rate": 0.0001, + "loss": 1.3948, + "loss/crossentropy": 2.549100637435913, + "loss/hidden": 1.1953125, + "loss/logits": 0.198894202709198, + "loss/reg": 5.9669990150723606e-05, + "step": 354 + }, + { + "epoch": 0.044375, + "grad_norm": 2.6807351112365723, + "grad_norm_var": 0.2247355561703434, + "learning_rate": 0.0001, + "loss": 1.5721, + "loss/crossentropy": 2.2256884574890137, + "loss/hidden": 1.3046875, + "loss/logits": 0.26683151721954346, + "loss/reg": 5.966486787656322e-05, + "step": 355 + }, + { + "epoch": 0.0445, + "grad_norm": 3.1524059772491455, + "grad_norm_var": 0.2573648537337417, + "learning_rate": 0.0001, + "loss": 1.5458, + "loss/crossentropy": 2.4026124477386475, + "loss/hidden": 1.3203125, + "loss/logits": 0.22484509646892548, + "loss/reg": 5.965128730167635e-05, + "step": 356 + }, + { + "epoch": 0.044625, + "grad_norm": 3.806107759475708, + "grad_norm_var": 0.3637951956534662, + "learning_rate": 0.0001, + "loss": 1.2257, + "loss/crossentropy": 2.534790277481079, + "loss/hidden": 1.1015625, + "loss/logits": 0.12353114783763885, + "loss/reg": 5.9637932281475514e-05, + "step": 357 + }, + { + "epoch": 0.04475, + "grad_norm": 2.6499619483947754, + "grad_norm_var": 0.36243857175732047, + "learning_rate": 0.0001, + "loss": 1.2577, + "loss/crossentropy": 2.786536931991577, + "loss/hidden": 1.078125, + "loss/logits": 0.17896610498428345, + "loss/reg": 5.962959403404966e-05, + "step": 358 + }, + { + "epoch": 0.044875, + "grad_norm": 2.750371217727661, + "grad_norm_var": 0.24122897908522703, + "learning_rate": 0.0001, + "loss": 1.3213, + "loss/crossentropy": 2.5112698078155518, + "loss/hidden": 1.125, + "loss/logits": 0.19569161534309387, + "loss/reg": 5.961711940472014e-05, + "step": 359 + }, + { + "epoch": 0.045, + "grad_norm": 2.4145219326019287, + "grad_norm_var": 0.23605635737508593, + "learning_rate": 0.0001, + "loss": 1.4067, + "loss/crossentropy": 2.4327914714813232, + "loss/hidden": 1.171875, + "loss/logits": 0.23425719141960144, + "loss/reg": 5.960506314295344e-05, + "step": 360 + }, + { + "epoch": 0.045125, + "grad_norm": 2.7820589542388916, + "grad_norm_var": 0.2317516785903725, + "learning_rate": 0.0001, + "loss": 1.5833, + "loss/crossentropy": 2.6201419830322266, + "loss/hidden": 1.3359375, + "loss/logits": 0.2468121349811554, + "loss/reg": 5.959635382168926e-05, + "step": 361 + }, + { + "epoch": 0.04525, + "grad_norm": 3.0179331302642822, + "grad_norm_var": 0.23691283979908515, + "learning_rate": 0.0001, + "loss": 1.3921, + "loss/crossentropy": 2.728665351867676, + "loss/hidden": 1.1953125, + "loss/logits": 0.19617268443107605, + "loss/reg": 5.958346446277574e-05, + "step": 362 + }, + { + "epoch": 0.045375, + "grad_norm": 2.577760934829712, + "grad_norm_var": 0.21171492452191767, + "learning_rate": 0.0001, + "loss": 1.3343, + "loss/crossentropy": 2.4396915435791016, + "loss/hidden": 1.140625, + "loss/logits": 0.19306717813014984, + "loss/reg": 5.957194298389368e-05, + "step": 363 + }, + { + "epoch": 0.0455, + "grad_norm": 2.2478973865509033, + "grad_norm_var": 0.22066793284785244, + "learning_rate": 0.0001, + "loss": 1.2107, + "loss/crossentropy": 2.5703125, + "loss/hidden": 1.046875, + "loss/logits": 0.16320618987083435, + "loss/reg": 5.955886445008218e-05, + "step": 364 + }, + { + "epoch": 0.045625, + "grad_norm": 2.8303184509277344, + "grad_norm_var": 0.21465200545435412, + "learning_rate": 0.0001, + "loss": 1.3184, + "loss/crossentropy": 2.5793418884277344, + "loss/hidden": 1.1171875, + "loss/logits": 0.20061752200126648, + "loss/reg": 5.954650623607449e-05, + "step": 365 + }, + { + "epoch": 0.04575, + "grad_norm": 2.3407225608825684, + "grad_norm_var": 0.20058607793752117, + "learning_rate": 0.0001, + "loss": 1.2154, + "loss/crossentropy": 2.5118396282196045, + "loss/hidden": 1.0546875, + "loss/logits": 0.16011780500411987, + "loss/reg": 5.9531517763389274e-05, + "step": 366 + }, + { + "epoch": 0.045875, + "grad_norm": 2.9164462089538574, + "grad_norm_var": 0.17624459628143327, + "learning_rate": 0.0001, + "loss": 2.3079, + "loss/crossentropy": 2.530949831008911, + "loss/hidden": 1.7890625, + "loss/logits": 0.5182523727416992, + "loss/reg": 5.9519883507164195e-05, + "step": 367 + }, + { + "epoch": 0.046, + "grad_norm": 2.6031134128570557, + "grad_norm_var": 0.17274354994838556, + "learning_rate": 0.0001, + "loss": 1.3529, + "loss/crossentropy": 2.5211331844329834, + "loss/hidden": 1.140625, + "loss/logits": 0.21172133088111877, + "loss/reg": 5.950441482127644e-05, + "step": 368 + }, + { + "epoch": 0.046125, + "grad_norm": 2.2432241439819336, + "grad_norm_var": 0.16462358021652007, + "learning_rate": 0.0001, + "loss": 1.2433, + "loss/crossentropy": 2.469212055206299, + "loss/hidden": 1.0625, + "loss/logits": 0.18018998205661774, + "loss/reg": 5.9490499552339315e-05, + "step": 369 + }, + { + "epoch": 0.04625, + "grad_norm": 3.287365674972534, + "grad_norm_var": 0.16815977224474163, + "learning_rate": 0.0001, + "loss": 1.3311, + "loss/crossentropy": 2.7330899238586426, + "loss/hidden": 1.140625, + "loss/logits": 0.18986304104328156, + "loss/reg": 5.9471924032550305e-05, + "step": 370 + }, + { + "epoch": 0.046375, + "grad_norm": 2.6555063724517822, + "grad_norm_var": 0.16849581874392333, + "learning_rate": 0.0001, + "loss": 1.3069, + "loss/crossentropy": 2.4908649921417236, + "loss/hidden": 1.1171875, + "loss/logits": 0.1890988051891327, + "loss/reg": 5.9457710449351e-05, + "step": 371 + }, + { + "epoch": 0.0465, + "grad_norm": 2.2832915782928467, + "grad_norm_var": 0.1710711381355336, + "learning_rate": 0.0001, + "loss": 1.251, + "loss/crossentropy": 2.6485414505004883, + "loss/hidden": 1.0859375, + "loss/logits": 0.16444087028503418, + "loss/reg": 5.9437123127281666e-05, + "step": 372 + }, + { + "epoch": 0.046625, + "grad_norm": 1.9312299489974976, + "grad_norm_var": 0.11748808484953574, + "learning_rate": 0.0001, + "loss": 1.3104, + "loss/crossentropy": 2.4345285892486572, + "loss/hidden": 1.125, + "loss/logits": 0.1848057061433792, + "loss/reg": 5.941649214946665e-05, + "step": 373 + }, + { + "epoch": 0.04675, + "grad_norm": 2.2687668800354004, + "grad_norm_var": 0.12381368567199799, + "learning_rate": 0.0001, + "loss": 1.2697, + "loss/crossentropy": 2.514896869659424, + "loss/hidden": 1.0859375, + "loss/logits": 0.18321493268013, + "loss/reg": 5.939120819675736e-05, + "step": 374 + }, + { + "epoch": 0.046875, + "grad_norm": 2.1616384983062744, + "grad_norm_var": 0.131467626574521, + "learning_rate": 0.0001, + "loss": 1.2405, + "loss/crossentropy": 2.698112964630127, + "loss/hidden": 1.0625, + "loss/logits": 0.1773754358291626, + "loss/reg": 5.936667002970353e-05, + "step": 375 + }, + { + "epoch": 0.047, + "grad_norm": 2.6922011375427246, + "grad_norm_var": 0.13182201209023336, + "learning_rate": 0.0001, + "loss": 1.3426, + "loss/crossentropy": 2.538865327835083, + "loss/hidden": 1.15625, + "loss/logits": 0.1857489049434662, + "loss/reg": 5.9345431509427726e-05, + "step": 376 + }, + { + "epoch": 0.047125, + "grad_norm": 2.2630982398986816, + "grad_norm_var": 0.13276797957838743, + "learning_rate": 0.0001, + "loss": 1.2869, + "loss/crossentropy": 2.4644358158111572, + "loss/hidden": 1.109375, + "loss/logits": 0.17694343626499176, + "loss/reg": 5.933275315328501e-05, + "step": 377 + }, + { + "epoch": 0.04725, + "grad_norm": 2.479646682739258, + "grad_norm_var": 0.11514238570119009, + "learning_rate": 0.0001, + "loss": 1.1618, + "loss/crossentropy": 2.5582141876220703, + "loss/hidden": 1.015625, + "loss/logits": 0.14554372429847717, + "loss/reg": 5.931046689511277e-05, + "step": 378 + }, + { + "epoch": 0.047375, + "grad_norm": 2.466947317123413, + "grad_norm_var": 0.114559834161389, + "learning_rate": 0.0001, + "loss": 1.48, + "loss/crossentropy": 2.4128925800323486, + "loss/hidden": 1.2421875, + "loss/logits": 0.23724211752414703, + "loss/reg": 5.929026156081818e-05, + "step": 379 + }, + { + "epoch": 0.0475, + "grad_norm": 2.538424015045166, + "grad_norm_var": 0.11086504579424972, + "learning_rate": 0.0001, + "loss": 1.4136, + "loss/crossentropy": 2.0768887996673584, + "loss/hidden": 1.234375, + "loss/logits": 0.17867109179496765, + "loss/reg": 5.92764736211393e-05, + "step": 380 + }, + { + "epoch": 0.047625, + "grad_norm": 2.654524564743042, + "grad_norm_var": 0.1049983644074643, + "learning_rate": 0.0001, + "loss": 1.3221, + "loss/crossentropy": 2.1216413974761963, + "loss/hidden": 1.15625, + "loss/logits": 0.16521546244621277, + "loss/reg": 5.926107769482769e-05, + "step": 381 + }, + { + "epoch": 0.04775, + "grad_norm": 2.237818717956543, + "grad_norm_var": 0.10766217194697697, + "learning_rate": 0.0001, + "loss": 1.2236, + "loss/crossentropy": 2.6475207805633545, + "loss/hidden": 1.0546875, + "loss/logits": 0.16833502054214478, + "loss/reg": 5.924178913119249e-05, + "step": 382 + }, + { + "epoch": 0.047875, + "grad_norm": 2.7116799354553223, + "grad_norm_var": 0.09837235459497572, + "learning_rate": 0.0001, + "loss": 1.3102, + "loss/crossentropy": 2.6615209579467773, + "loss/hidden": 1.1171875, + "loss/logits": 0.1924624741077423, + "loss/reg": 5.921960837440565e-05, + "step": 383 + }, + { + "epoch": 0.048, + "grad_norm": 2.5439391136169434, + "grad_norm_var": 0.09752047633307553, + "learning_rate": 0.0001, + "loss": 1.3258, + "loss/crossentropy": 2.10198974609375, + "loss/hidden": 1.15625, + "loss/logits": 0.1689702719449997, + "loss/reg": 5.919525210629217e-05, + "step": 384 + }, + { + "epoch": 0.048125, + "grad_norm": 2.617921829223633, + "grad_norm_var": 0.09528014676361156, + "learning_rate": 0.0001, + "loss": 1.61, + "loss/crossentropy": 2.445833206176758, + "loss/hidden": 1.328125, + "loss/logits": 0.28133296966552734, + "loss/reg": 5.917950693401508e-05, + "step": 385 + }, + { + "epoch": 0.04825, + "grad_norm": 2.514899730682373, + "grad_norm_var": 0.05015297139615639, + "learning_rate": 0.0001, + "loss": 1.1964, + "loss/crossentropy": 2.4887778759002686, + "loss/hidden": 1.0390625, + "loss/logits": 0.1567072868347168, + "loss/reg": 5.916162990615703e-05, + "step": 386 + }, + { + "epoch": 0.048375, + "grad_norm": 2.1075565814971924, + "grad_norm_var": 0.053089324895933446, + "learning_rate": 0.0001, + "loss": 1.0537, + "loss/crossentropy": 2.4045815467834473, + "loss/hidden": 0.921875, + "loss/logits": 0.1312153935432434, + "loss/reg": 5.914089342695661e-05, + "step": 387 + }, + { + "epoch": 0.0485, + "grad_norm": 2.475404739379883, + "grad_norm_var": 0.05228874002812057, + "learning_rate": 0.0001, + "loss": 1.3003, + "loss/crossentropy": 2.591153383255005, + "loss/hidden": 1.109375, + "loss/logits": 0.19037862122058868, + "loss/reg": 5.9116682677995414e-05, + "step": 388 + }, + { + "epoch": 0.048625, + "grad_norm": 4.638079643249512, + "grad_norm_var": 0.33504973194641535, + "learning_rate": 0.0001, + "loss": 1.7407, + "loss/crossentropy": 2.992236852645874, + "loss/hidden": 1.4609375, + "loss/logits": 0.2792096734046936, + "loss/reg": 5.9097284974996e-05, + "step": 389 + }, + { + "epoch": 0.04875, + "grad_norm": 2.4662392139434814, + "grad_norm_var": 0.32913998556907487, + "learning_rate": 0.0001, + "loss": 1.1454, + "loss/crossentropy": 2.9239540100097656, + "loss/hidden": 0.9921875, + "loss/logits": 0.15260137617588043, + "loss/reg": 5.907983722863719e-05, + "step": 390 + }, + { + "epoch": 0.048875, + "grad_norm": 2.439119338989258, + "grad_norm_var": 0.31780327994806234, + "learning_rate": 0.0001, + "loss": 1.3638, + "loss/crossentropy": 2.450254440307617, + "loss/hidden": 1.1640625, + "loss/logits": 0.19915927946567535, + "loss/reg": 5.9063841035822406e-05, + "step": 391 + }, + { + "epoch": 0.049, + "grad_norm": 2.3475067615509033, + "grad_norm_var": 0.3217026075593497, + "learning_rate": 0.0001, + "loss": 1.533, + "loss/crossentropy": 2.617830753326416, + "loss/hidden": 1.265625, + "loss/logits": 0.26678475737571716, + "loss/reg": 5.90429590374697e-05, + "step": 392 + }, + { + "epoch": 0.049125, + "grad_norm": 4.364901065826416, + "grad_norm_var": 0.5050899240629005, + "learning_rate": 0.0001, + "loss": 1.4632, + "loss/crossentropy": 2.4607560634613037, + "loss/hidden": 1.2421875, + "loss/logits": 0.22039487957954407, + "loss/reg": 5.902666089241393e-05, + "step": 393 + }, + { + "epoch": 0.04925, + "grad_norm": 2.338758707046509, + "grad_norm_var": 0.5109449021123245, + "learning_rate": 0.0001, + "loss": 1.2995, + "loss/crossentropy": 2.6618576049804688, + "loss/hidden": 1.1171875, + "loss/logits": 0.1817541867494583, + "loss/reg": 5.9010566474171355e-05, + "step": 394 + }, + { + "epoch": 0.049375, + "grad_norm": 3.5642833709716797, + "grad_norm_var": 0.549694181009107, + "learning_rate": 0.0001, + "loss": 1.3152, + "loss/crossentropy": 2.300379753112793, + "loss/hidden": 1.1171875, + "loss/logits": 0.1974020004272461, + "loss/reg": 5.8987676311517134e-05, + "step": 395 + }, + { + "epoch": 0.0495, + "grad_norm": 2.1328978538513184, + "grad_norm_var": 0.573308372527261, + "learning_rate": 0.0001, + "loss": 1.244, + "loss/crossentropy": 2.4386301040649414, + "loss/hidden": 1.0625, + "loss/logits": 0.18090221285820007, + "loss/reg": 5.8964946219930425e-05, + "step": 396 + }, + { + "epoch": 0.049625, + "grad_norm": 3.0894107818603516, + "grad_norm_var": 0.5790289690992334, + "learning_rate": 0.0001, + "loss": 1.5661, + "loss/crossentropy": 2.365107297897339, + "loss/hidden": 1.3671875, + "loss/logits": 0.1983477920293808, + "loss/reg": 5.8950212405761704e-05, + "step": 397 + }, + { + "epoch": 0.04975, + "grad_norm": 3.194427967071533, + "grad_norm_var": 0.566188494588774, + "learning_rate": 0.0001, + "loss": 1.4269, + "loss/crossentropy": 2.384216547012329, + "loss/hidden": 1.21875, + "loss/logits": 0.20751546323299408, + "loss/reg": 5.8928319049300626e-05, + "step": 398 + }, + { + "epoch": 0.049875, + "grad_norm": 2.5108933448791504, + "grad_norm_var": 0.5723226037333423, + "learning_rate": 0.0001, + "loss": 1.2127, + "loss/crossentropy": 2.5466771125793457, + "loss/hidden": 1.0546875, + "loss/logits": 0.1574660688638687, + "loss/reg": 5.891324326512404e-05, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 2.9769773483276367, + "grad_norm_var": 0.5672869916808385, + "learning_rate": 0.0001, + "loss": 1.3045, + "loss/crossentropy": 2.7223000526428223, + "loss/hidden": 1.1171875, + "loss/logits": 0.18677057325839996, + "loss/reg": 5.889027670491487e-05, + "step": 400 + }, + { + "epoch": 0.050125, + "grad_norm": 3.31915283203125, + "grad_norm_var": 0.5752734489563172, + "learning_rate": 0.0001, + "loss": 1.2639, + "loss/crossentropy": 2.4886364936828613, + "loss/hidden": 1.078125, + "loss/logits": 0.1851940155029297, + "loss/reg": 5.887265797355212e-05, + "step": 401 + }, + { + "epoch": 0.05025, + "grad_norm": 1.8946937322616577, + "grad_norm_var": 0.6315760522485537, + "learning_rate": 0.0001, + "loss": 1.2326, + "loss/crossentropy": 2.414213180541992, + "loss/hidden": 1.0703125, + "loss/logits": 0.16165336966514587, + "loss/reg": 5.885552673134953e-05, + "step": 402 + }, + { + "epoch": 0.050375, + "grad_norm": 2.5370404720306396, + "grad_norm_var": 0.5996572790739425, + "learning_rate": 0.0001, + "loss": 1.5079, + "loss/crossentropy": 2.3421835899353027, + "loss/hidden": 1.28125, + "loss/logits": 0.22602099180221558, + "loss/reg": 5.88419679843355e-05, + "step": 403 + }, + { + "epoch": 0.0505, + "grad_norm": 2.4215445518493652, + "grad_norm_var": 0.6028382899137373, + "learning_rate": 0.0001, + "loss": 1.4975, + "loss/crossentropy": 2.7361152172088623, + "loss/hidden": 1.2421875, + "loss/logits": 0.25468122959136963, + "loss/reg": 5.88247858104296e-05, + "step": 404 + }, + { + "epoch": 0.050625, + "grad_norm": 2.049978733062744, + "grad_norm_var": 0.4181645547932513, + "learning_rate": 0.0001, + "loss": 1.1088, + "loss/crossentropy": 2.350353717803955, + "loss/hidden": 0.953125, + "loss/logits": 0.15509989857673645, + "loss/reg": 5.880888784304261e-05, + "step": 405 + }, + { + "epoch": 0.05075, + "grad_norm": 2.7967936992645264, + "grad_norm_var": 0.41345734870287976, + "learning_rate": 0.0001, + "loss": 1.3869, + "loss/crossentropy": 2.5875766277313232, + "loss/hidden": 1.171875, + "loss/logits": 0.21445012092590332, + "loss/reg": 5.8793633797904477e-05, + "step": 406 + }, + { + "epoch": 0.050875, + "grad_norm": 2.169900894165039, + "grad_norm_var": 0.429098064205416, + "learning_rate": 0.0001, + "loss": 1.0776, + "loss/crossentropy": 2.398125410079956, + "loss/hidden": 0.93359375, + "loss/logits": 0.14346018433570862, + "loss/reg": 5.877741932636127e-05, + "step": 407 + }, + { + "epoch": 0.051, + "grad_norm": 2.5045695304870605, + "grad_norm_var": 0.4225916301522199, + "learning_rate": 0.0001, + "loss": 1.5355, + "loss/crossentropy": 2.1697590351104736, + "loss/hidden": 1.328125, + "loss/logits": 0.20677754282951355, + "loss/reg": 5.876670911675319e-05, + "step": 408 + }, + { + "epoch": 0.051125, + "grad_norm": 18.23008918762207, + "grad_norm_var": 15.43871781968465, + "learning_rate": 0.0001, + "loss": 1.4882, + "loss/crossentropy": 2.602886438369751, + "loss/hidden": 1.3046875, + "loss/logits": 0.18289120495319366, + "loss/reg": 5.874884300283156e-05, + "step": 409 + }, + { + "epoch": 0.05125, + "grad_norm": 2.8436660766601562, + "grad_norm_var": 15.369190103974788, + "learning_rate": 0.0001, + "loss": 1.3294, + "loss/crossentropy": 2.4684174060821533, + "loss/hidden": 1.140625, + "loss/logits": 0.18818634748458862, + "loss/reg": 5.873553891433403e-05, + "step": 410 + }, + { + "epoch": 0.051375, + "grad_norm": 2.2729334831237793, + "grad_norm_var": 15.486411427985377, + "learning_rate": 0.0001, + "loss": 1.2331, + "loss/crossentropy": 2.550140857696533, + "loss/hidden": 1.0390625, + "loss/logits": 0.19348369538784027, + "loss/reg": 5.8720732340589166e-05, + "step": 411 + }, + { + "epoch": 0.0515, + "grad_norm": 2.5612359046936035, + "grad_norm_var": 15.416427881560285, + "learning_rate": 0.0001, + "loss": 1.3333, + "loss/crossentropy": 2.4774818420410156, + "loss/hidden": 1.125, + "loss/logits": 0.2077203392982483, + "loss/reg": 5.8710702433018014e-05, + "step": 412 + }, + { + "epoch": 0.051625, + "grad_norm": 4.02579927444458, + "grad_norm_var": 15.409250289477422, + "learning_rate": 0.0001, + "loss": 1.507, + "loss/crossentropy": 2.555722713470459, + "loss/hidden": 1.3046875, + "loss/logits": 0.20171231031417847, + "loss/reg": 5.870195309398696e-05, + "step": 413 + }, + { + "epoch": 0.05175, + "grad_norm": 2.443574905395508, + "grad_norm_var": 15.489530544756628, + "learning_rate": 0.0001, + "loss": 1.2774, + "loss/crossentropy": 2.706422805786133, + "loss/hidden": 1.09375, + "loss/logits": 0.1831112802028656, + "loss/reg": 5.8690613514045253e-05, + "step": 414 + }, + { + "epoch": 0.051875, + "grad_norm": 2.079418897628784, + "grad_norm_var": 15.563674426313279, + "learning_rate": 0.0001, + "loss": 1.1798, + "loss/crossentropy": 2.6763839721679688, + "loss/hidden": 1.03125, + "loss/logits": 0.14800235629081726, + "loss/reg": 5.8675475884228945e-05, + "step": 415 + }, + { + "epoch": 0.052, + "grad_norm": 2.7786471843719482, + "grad_norm_var": 15.581826938638233, + "learning_rate": 0.0001, + "loss": 1.2465, + "loss/crossentropy": 2.6709306240081787, + "loss/hidden": 1.078125, + "loss/logits": 0.1678304374217987, + "loss/reg": 5.866462379344739e-05, + "step": 416 + }, + { + "epoch": 0.052125, + "grad_norm": 2.770376443862915, + "grad_norm_var": 15.618130403520784, + "learning_rate": 0.0001, + "loss": 1.3111, + "loss/crossentropy": 2.646826982498169, + "loss/hidden": 1.1328125, + "loss/logits": 0.1777157187461853, + "loss/reg": 5.865520142833702e-05, + "step": 417 + }, + { + "epoch": 0.05225, + "grad_norm": 2.092414617538452, + "grad_norm_var": 15.57762685735369, + "learning_rate": 0.0001, + "loss": 1.3353, + "loss/crossentropy": 2.62361741065979, + "loss/hidden": 1.15625, + "loss/logits": 0.17848479747772217, + "loss/reg": 5.8638761402107775e-05, + "step": 418 + }, + { + "epoch": 0.052375, + "grad_norm": 2.05226731300354, + "grad_norm_var": 15.656891853986265, + "learning_rate": 0.0001, + "loss": 1.14, + "loss/crossentropy": 2.697723865509033, + "loss/hidden": 0.9921875, + "loss/logits": 0.14726917445659637, + "loss/reg": 5.862316902494058e-05, + "step": 419 + }, + { + "epoch": 0.0525, + "grad_norm": 2.6924796104431152, + "grad_norm_var": 15.622310414474152, + "learning_rate": 0.0001, + "loss": 1.404, + "loss/crossentropy": 2.601827383041382, + "loss/hidden": 1.2109375, + "loss/logits": 0.19246245920658112, + "loss/reg": 5.860950841451995e-05, + "step": 420 + }, + { + "epoch": 0.052625, + "grad_norm": 5.301983833312988, + "grad_norm_var": 15.644682914862404, + "learning_rate": 0.0001, + "loss": 1.4862, + "loss/crossentropy": 2.6217854022979736, + "loss/hidden": 1.296875, + "loss/logits": 0.18871337175369263, + "loss/reg": 5.8600846386980265e-05, + "step": 421 + }, + { + "epoch": 0.05275, + "grad_norm": 2.114091634750366, + "grad_norm_var": 15.758396712898662, + "learning_rate": 0.0001, + "loss": 1.2033, + "loss/crossentropy": 2.5663623809814453, + "loss/hidden": 1.0390625, + "loss/logits": 0.16362521052360535, + "loss/reg": 5.8592915593180805e-05, + "step": 422 + }, + { + "epoch": 0.052875, + "grad_norm": 2.757091999053955, + "grad_norm_var": 15.661455859551703, + "learning_rate": 0.0001, + "loss": 1.1223, + "loss/crossentropy": 2.4681971073150635, + "loss/hidden": 0.97265625, + "loss/logits": 0.14905983209609985, + "loss/reg": 5.858425720361993e-05, + "step": 423 + }, + { + "epoch": 0.053, + "grad_norm": 2.4524407386779785, + "grad_norm_var": 15.670073831964206, + "learning_rate": 0.0001, + "loss": 1.2938, + "loss/crossentropy": 2.4758145809173584, + "loss/hidden": 1.1015625, + "loss/logits": 0.19164547324180603, + "loss/reg": 5.857350697624497e-05, + "step": 424 + }, + { + "epoch": 0.053125, + "grad_norm": 2.3052892684936523, + "grad_norm_var": 0.7038252417895506, + "learning_rate": 0.0001, + "loss": 1.2565, + "loss/crossentropy": 2.597487211227417, + "loss/hidden": 1.0703125, + "loss/logits": 0.18559187650680542, + "loss/reg": 5.855830750078894e-05, + "step": 425 + }, + { + "epoch": 0.05325, + "grad_norm": 2.7276995182037354, + "grad_norm_var": 0.7027765205874381, + "learning_rate": 0.0001, + "loss": 1.4141, + "loss/crossentropy": 2.6818253993988037, + "loss/hidden": 1.21875, + "loss/logits": 0.1948131024837494, + "loss/reg": 5.854442133568227e-05, + "step": 426 + }, + { + "epoch": 0.053375, + "grad_norm": 1.725293517112732, + "grad_norm_var": 0.7537440425638384, + "learning_rate": 0.0001, + "loss": 1.1664, + "loss/crossentropy": 2.4244258403778076, + "loss/hidden": 1.015625, + "loss/logits": 0.1502000093460083, + "loss/reg": 5.85384841542691e-05, + "step": 427 + }, + { + "epoch": 0.0535, + "grad_norm": 2.6642932891845703, + "grad_norm_var": 0.7527758186064119, + "learning_rate": 0.0001, + "loss": 1.5211, + "loss/crossentropy": 2.1209182739257812, + "loss/hidden": 1.328125, + "loss/logits": 0.192403644323349, + "loss/reg": 5.852692629559897e-05, + "step": 428 + }, + { + "epoch": 0.053625, + "grad_norm": 2.7787868976593018, + "grad_norm_var": 0.6272740663233074, + "learning_rate": 0.0001, + "loss": 1.3046, + "loss/crossentropy": 2.3020565509796143, + "loss/hidden": 1.125, + "loss/logits": 0.179016575217247, + "loss/reg": 5.851646346854977e-05, + "step": 429 + }, + { + "epoch": 0.05375, + "grad_norm": 2.891101360321045, + "grad_norm_var": 0.6299498912530666, + "learning_rate": 0.0001, + "loss": 1.4198, + "loss/crossentropy": 2.33249568939209, + "loss/hidden": 1.203125, + "loss/logits": 0.21604114770889282, + "loss/reg": 5.850956222275272e-05, + "step": 430 + }, + { + "epoch": 0.053875, + "grad_norm": 2.7940289974212646, + "grad_norm_var": 0.608789107013446, + "learning_rate": 0.0001, + "loss": 1.1825, + "loss/crossentropy": 2.6553549766540527, + "loss/hidden": 1.03125, + "loss/logits": 0.15064392983913422, + "loss/reg": 5.8500536397332326e-05, + "step": 431 + }, + { + "epoch": 0.054, + "grad_norm": 25.06597328186035, + "grad_norm_var": 31.943843646690855, + "learning_rate": 0.0001, + "loss": 2.4055, + "loss/crossentropy": 2.7126245498657227, + "loss/hidden": 2.03125, + "loss/logits": 0.3736712336540222, + "loss/reg": 5.849341687280685e-05, + "step": 432 + }, + { + "epoch": 0.054125, + "grad_norm": 2.4612748622894287, + "grad_norm_var": 32.003546233579016, + "learning_rate": 0.0001, + "loss": 1.4832, + "loss/crossentropy": 2.6244633197784424, + "loss/hidden": 1.25, + "loss/logits": 0.23266229033470154, + "loss/reg": 5.847978172823787e-05, + "step": 433 + }, + { + "epoch": 0.05425, + "grad_norm": 2.413149356842041, + "grad_norm_var": 31.926055741483236, + "learning_rate": 0.0001, + "loss": 1.405, + "loss/crossentropy": 2.513383626937866, + "loss/hidden": 1.1953125, + "loss/logits": 0.2091376930475235, + "loss/reg": 5.847239663125947e-05, + "step": 434 + }, + { + "epoch": 0.054375, + "grad_norm": 2.1266605854034424, + "grad_norm_var": 31.906339652731415, + "learning_rate": 0.0001, + "loss": 1.2307, + "loss/crossentropy": 2.645113706588745, + "loss/hidden": 1.0546875, + "loss/logits": 0.17538747191429138, + "loss/reg": 5.8466725022299215e-05, + "step": 435 + }, + { + "epoch": 0.0545, + "grad_norm": 2.693485975265503, + "grad_norm_var": 31.906153605922054, + "learning_rate": 0.0001, + "loss": 1.3491, + "loss/crossentropy": 2.5616350173950195, + "loss/hidden": 1.171875, + "loss/logits": 0.1766662299633026, + "loss/reg": 5.845691339345649e-05, + "step": 436 + }, + { + "epoch": 0.054625, + "grad_norm": 3.594322681427002, + "grad_norm_var": 31.81007436255887, + "learning_rate": 0.0001, + "loss": 1.4456, + "loss/crossentropy": 2.320868492126465, + "loss/hidden": 1.171875, + "loss/logits": 0.2730950713157654, + "loss/reg": 5.845166742801666e-05, + "step": 437 + }, + { + "epoch": 0.05475, + "grad_norm": 2.725066900253296, + "grad_norm_var": 31.681987454427826, + "learning_rate": 0.0001, + "loss": 1.4368, + "loss/crossentropy": 2.4526007175445557, + "loss/hidden": 1.21875, + "loss/logits": 0.21745863556861877, + "loss/reg": 5.844476982019842e-05, + "step": 438 + }, + { + "epoch": 0.054875, + "grad_norm": 2.615208625793457, + "grad_norm_var": 31.706966746538818, + "learning_rate": 0.0001, + "loss": 1.2902, + "loss/crossentropy": 2.5873489379882812, + "loss/hidden": 1.109375, + "loss/logits": 0.18027284741401672, + "loss/reg": 5.843998587806709e-05, + "step": 439 + }, + { + "epoch": 0.055, + "grad_norm": 2.679504632949829, + "grad_norm_var": 31.66327199965654, + "learning_rate": 0.0001, + "loss": 1.4142, + "loss/crossentropy": 2.171384811401367, + "loss/hidden": 1.21875, + "loss/logits": 0.1948787271976471, + "loss/reg": 5.8425270253792405e-05, + "step": 440 + }, + { + "epoch": 0.055125, + "grad_norm": 2.781118869781494, + "grad_norm_var": 31.56886824166385, + "learning_rate": 0.0001, + "loss": 1.2261, + "loss/crossentropy": 2.616610050201416, + "loss/hidden": 1.0625, + "loss/logits": 0.16300562024116516, + "loss/reg": 5.841004167450592e-05, + "step": 441 + }, + { + "epoch": 0.05525, + "grad_norm": 2.8343710899353027, + "grad_norm_var": 31.550828531904, + "learning_rate": 0.0001, + "loss": 1.6654, + "loss/crossentropy": 2.254971504211426, + "loss/hidden": 1.390625, + "loss/logits": 0.27416497468948364, + "loss/reg": 5.840086305397563e-05, + "step": 442 + }, + { + "epoch": 0.055375, + "grad_norm": 2.943516254425049, + "grad_norm_var": 31.26553828771242, + "learning_rate": 0.0001, + "loss": 1.3037, + "loss/crossentropy": 2.607365131378174, + "loss/hidden": 1.140625, + "loss/logits": 0.16250211000442505, + "loss/reg": 5.8392772189108655e-05, + "step": 443 + }, + { + "epoch": 0.0555, + "grad_norm": 4.3494696617126465, + "grad_norm_var": 31.11395178262311, + "learning_rate": 0.0001, + "loss": 1.4874, + "loss/crossentropy": 2.803809642791748, + "loss/hidden": 1.265625, + "loss/logits": 0.22114460170269012, + "loss/reg": 5.8383415307616815e-05, + "step": 444 + }, + { + "epoch": 0.055625, + "grad_norm": 2.3149962425231934, + "grad_norm_var": 31.21739595793184, + "learning_rate": 0.0001, + "loss": 1.1723, + "loss/crossentropy": 2.7661781311035156, + "loss/hidden": 1.015625, + "loss/logits": 0.1560768485069275, + "loss/reg": 5.8376208471599966e-05, + "step": 445 + }, + { + "epoch": 0.05575, + "grad_norm": 2.5312862396240234, + "grad_norm_var": 31.288532129977195, + "learning_rate": 0.0001, + "loss": 1.4583, + "loss/crossentropy": 2.3608808517456055, + "loss/hidden": 1.234375, + "loss/logits": 0.22338923811912537, + "loss/reg": 5.83621695113834e-05, + "step": 446 + }, + { + "epoch": 0.055875, + "grad_norm": 2.0245697498321533, + "grad_norm_var": 31.468007952235922, + "learning_rate": 0.0001, + "loss": 1.2537, + "loss/crossentropy": 2.6646907329559326, + "loss/hidden": 1.0859375, + "loss/logits": 0.1671399027109146, + "loss/reg": 5.835363481310196e-05, + "step": 447 + }, + { + "epoch": 0.056, + "grad_norm": 4.180586338043213, + "grad_norm_var": 0.4425575902395887, + "learning_rate": 0.0001, + "loss": 1.4287, + "loss/crossentropy": 2.478865623474121, + "loss/hidden": 1.1796875, + "loss/logits": 0.2484455555677414, + "loss/reg": 5.833926479681395e-05, + "step": 448 + }, + { + "epoch": 0.056125, + "grad_norm": 2.2291383743286133, + "grad_norm_var": 0.4573160813014281, + "learning_rate": 0.0001, + "loss": 1.2997, + "loss/crossentropy": 2.244389295578003, + "loss/hidden": 1.15625, + "loss/logits": 0.14287710189819336, + "loss/reg": 5.833054456161335e-05, + "step": 449 + }, + { + "epoch": 0.05625, + "grad_norm": 2.204925060272217, + "grad_norm_var": 0.47117643459253195, + "learning_rate": 0.0001, + "loss": 1.2876, + "loss/crossentropy": 2.3469107151031494, + "loss/hidden": 1.1015625, + "loss/logits": 0.1854255050420761, + "loss/reg": 5.832717943121679e-05, + "step": 450 + }, + { + "epoch": 0.056375, + "grad_norm": 2.5266880989074707, + "grad_norm_var": 0.4451698073358396, + "learning_rate": 0.0001, + "loss": 1.4392, + "loss/crossentropy": 2.440885305404663, + "loss/hidden": 1.2109375, + "loss/logits": 0.22769977152347565, + "loss/reg": 5.8323836128693074e-05, + "step": 451 + }, + { + "epoch": 0.0565, + "grad_norm": 2.410515785217285, + "grad_norm_var": 0.455202882380185, + "learning_rate": 0.0001, + "loss": 1.4083, + "loss/crossentropy": 2.4578142166137695, + "loss/hidden": 1.203125, + "loss/logits": 0.20461352169513702, + "loss/reg": 5.830869122291915e-05, + "step": 452 + }, + { + "epoch": 0.056625, + "grad_norm": 2.0389811992645264, + "grad_norm_var": 0.4435531519851603, + "learning_rate": 0.0001, + "loss": 1.1318, + "loss/crossentropy": 2.139033317565918, + "loss/hidden": 0.9921875, + "loss/logits": 0.1390083134174347, + "loss/reg": 5.829246947541833e-05, + "step": 453 + }, + { + "epoch": 0.05675, + "grad_norm": 1.979454517364502, + "grad_norm_var": 0.47698744011981165, + "learning_rate": 0.0001, + "loss": 1.3115, + "loss/crossentropy": 2.546844005584717, + "loss/hidden": 1.125, + "loss/logits": 0.18587306141853333, + "loss/reg": 5.8282243116991594e-05, + "step": 454 + }, + { + "epoch": 0.056875, + "grad_norm": 2.0210747718811035, + "grad_norm_var": 0.5030154373593951, + "learning_rate": 0.0001, + "loss": 1.21, + "loss/crossentropy": 2.6095550060272217, + "loss/hidden": 1.046875, + "loss/logits": 0.16256017982959747, + "loss/reg": 5.8266243286198005e-05, + "step": 455 + }, + { + "epoch": 0.057, + "grad_norm": 2.0944671630859375, + "grad_norm_var": 0.520400331750174, + "learning_rate": 0.0001, + "loss": 1.1407, + "loss/crossentropy": 2.450681447982788, + "loss/hidden": 0.98828125, + "loss/logits": 0.15184549987316132, + "loss/reg": 5.8250909205526114e-05, + "step": 456 + }, + { + "epoch": 0.057125, + "grad_norm": 2.5854806900024414, + "grad_norm_var": 0.5178481401493921, + "learning_rate": 0.0001, + "loss": 1.1308, + "loss/crossentropy": 2.8090949058532715, + "loss/hidden": 0.97265625, + "loss/logits": 0.15754011273384094, + "loss/reg": 5.8233421441400424e-05, + "step": 457 + }, + { + "epoch": 0.05725, + "grad_norm": 6.832178592681885, + "grad_norm_var": 1.6526915128701443, + "learning_rate": 0.0001, + "loss": 1.7544, + "loss/crossentropy": 2.4325008392333984, + "loss/hidden": 1.5625, + "loss/logits": 0.1913643479347229, + "loss/reg": 5.821782906423323e-05, + "step": 458 + }, + { + "epoch": 0.057375, + "grad_norm": 2.4911727905273438, + "grad_norm_var": 1.6585857165051416, + "learning_rate": 0.0001, + "loss": 1.277, + "loss/crossentropy": 2.5682671070098877, + "loss/hidden": 1.09375, + "loss/logits": 0.18270117044448853, + "loss/reg": 5.820325532113202e-05, + "step": 459 + }, + { + "epoch": 0.0575, + "grad_norm": 2.2592287063598633, + "grad_norm_var": 1.5000806172221008, + "learning_rate": 0.0001, + "loss": 1.149, + "loss/crossentropy": 2.3300366401672363, + "loss/hidden": 0.98828125, + "loss/logits": 0.16016384959220886, + "loss/reg": 5.8191151765640825e-05, + "step": 460 + }, + { + "epoch": 0.057625, + "grad_norm": 2.6110737323760986, + "grad_norm_var": 1.4915332961489087, + "learning_rate": 0.0001, + "loss": 1.4344, + "loss/crossentropy": 2.560197591781616, + "loss/hidden": 1.21875, + "loss/logits": 0.21507461369037628, + "loss/reg": 5.817634882987477e-05, + "step": 461 + }, + { + "epoch": 0.05775, + "grad_norm": 2.6446752548217773, + "grad_norm_var": 1.48995546498276, + "learning_rate": 0.0001, + "loss": 1.2381, + "loss/crossentropy": 2.5068211555480957, + "loss/hidden": 1.0546875, + "loss/logits": 0.1828281581401825, + "loss/reg": 5.816355405841023e-05, + "step": 462 + }, + { + "epoch": 0.057875, + "grad_norm": 2.498300075531006, + "grad_norm_var": 1.4615785550667995, + "learning_rate": 0.0001, + "loss": 1.3019, + "loss/crossentropy": 2.3765523433685303, + "loss/hidden": 1.1328125, + "loss/logits": 0.16848215460777283, + "loss/reg": 5.814860560349189e-05, + "step": 463 + }, + { + "epoch": 0.058, + "grad_norm": 2.4674289226531982, + "grad_norm_var": 1.3126372255276026, + "learning_rate": 0.0001, + "loss": 1.3472, + "loss/crossentropy": 2.714657783508301, + "loss/hidden": 1.1640625, + "loss/logits": 0.18256625533103943, + "loss/reg": 5.81321437493898e-05, + "step": 464 + }, + { + "epoch": 0.058125, + "grad_norm": 3.7482964992523193, + "grad_norm_var": 1.3780257940909062, + "learning_rate": 0.0001, + "loss": 1.4579, + "loss/crossentropy": 2.7645256519317627, + "loss/hidden": 1.2109375, + "loss/logits": 0.24636635184288025, + "loss/reg": 5.811548908241093e-05, + "step": 465 + }, + { + "epoch": 0.05825, + "grad_norm": 3.1881492137908936, + "grad_norm_var": 1.3717908440858895, + "learning_rate": 0.0001, + "loss": 1.2469, + "loss/crossentropy": 2.6280384063720703, + "loss/hidden": 1.078125, + "loss/logits": 0.16818463802337646, + "loss/reg": 5.8095396525459364e-05, + "step": 466 + }, + { + "epoch": 0.058375, + "grad_norm": 3.4882731437683105, + "grad_norm_var": 1.3977675144088226, + "learning_rate": 0.0001, + "loss": 1.5403, + "loss/crossentropy": 1.8358429670333862, + "loss/hidden": 1.3203125, + "loss/logits": 0.21941694617271423, + "loss/reg": 5.807522757095285e-05, + "step": 467 + }, + { + "epoch": 0.0585, + "grad_norm": 2.530682325363159, + "grad_norm_var": 1.391870091660969, + "learning_rate": 0.0001, + "loss": 1.1578, + "loss/crossentropy": 2.3950142860412598, + "loss/hidden": 0.99609375, + "loss/logits": 0.1611400693655014, + "loss/reg": 5.80518099013716e-05, + "step": 468 + }, + { + "epoch": 0.058625, + "grad_norm": 3.4676575660705566, + "grad_norm_var": 1.366390295617852, + "learning_rate": 0.0001, + "loss": 1.5162, + "loss/crossentropy": 2.851280689239502, + "loss/hidden": 1.234375, + "loss/logits": 0.28122612833976746, + "loss/reg": 5.8030982472701e-05, + "step": 469 + }, + { + "epoch": 0.05875, + "grad_norm": 2.9446208477020264, + "grad_norm_var": 1.302065384350945, + "learning_rate": 0.0001, + "loss": 1.3015, + "loss/crossentropy": 2.740093469619751, + "loss/hidden": 1.125, + "loss/logits": 0.17590749263763428, + "loss/reg": 5.800585859105922e-05, + "step": 470 + }, + { + "epoch": 0.058875, + "grad_norm": 2.7597243785858154, + "grad_norm_var": 1.2405377686230998, + "learning_rate": 0.0001, + "loss": 1.1651, + "loss/crossentropy": 2.440762996673584, + "loss/hidden": 1.015625, + "loss/logits": 0.14888577163219452, + "loss/reg": 5.7990357163362205e-05, + "step": 471 + }, + { + "epoch": 0.059, + "grad_norm": 2.8147523403167725, + "grad_norm_var": 1.182327943249795, + "learning_rate": 0.0001, + "loss": 1.3195, + "loss/crossentropy": 2.5801327228546143, + "loss/hidden": 1.140625, + "loss/logits": 0.17824885249137878, + "loss/reg": 5.7975972595158964e-05, + "step": 472 + }, + { + "epoch": 0.059125, + "grad_norm": 2.4511027336120605, + "grad_norm_var": 1.1923747545104257, + "learning_rate": 0.0001, + "loss": 1.4217, + "loss/crossentropy": 2.5711913108825684, + "loss/hidden": 1.203125, + "loss/logits": 0.2180328667163849, + "loss/reg": 5.796052937512286e-05, + "step": 473 + }, + { + "epoch": 0.05925, + "grad_norm": 2.9213221073150635, + "grad_norm_var": 0.1890407192544025, + "learning_rate": 0.0001, + "loss": 1.2735, + "loss/crossentropy": 2.5805675983428955, + "loss/hidden": 1.1015625, + "loss/logits": 0.17132875323295593, + "loss/reg": 5.794024036731571e-05, + "step": 474 + }, + { + "epoch": 0.059375, + "grad_norm": 2.6587464809417725, + "grad_norm_var": 0.1832162860499608, + "learning_rate": 0.0001, + "loss": 1.6569, + "loss/crossentropy": 2.356299638748169, + "loss/hidden": 1.40625, + "loss/logits": 0.25005391240119934, + "loss/reg": 5.791860894532874e-05, + "step": 475 + }, + { + "epoch": 0.0595, + "grad_norm": 3.5978729724884033, + "grad_norm_var": 0.19139826910290647, + "learning_rate": 0.0001, + "loss": 1.7357, + "loss/crossentropy": 2.0626883506774902, + "loss/hidden": 1.4765625, + "loss/logits": 0.2585859000682831, + "loss/reg": 5.790415525552817e-05, + "step": 476 + }, + { + "epoch": 0.059625, + "grad_norm": 2.8491876125335693, + "grad_norm_var": 0.18498974202791843, + "learning_rate": 0.0001, + "loss": 1.5276, + "loss/crossentropy": 2.5583596229553223, + "loss/hidden": 1.2734375, + "loss/logits": 0.25358158349990845, + "loss/reg": 5.788617272628471e-05, + "step": 477 + }, + { + "epoch": 0.05975, + "grad_norm": 2.5821259021759033, + "grad_norm_var": 0.1876924518839881, + "learning_rate": 0.0001, + "loss": 1.3568, + "loss/crossentropy": 2.486640453338623, + "loss/hidden": 1.1640625, + "loss/logits": 0.19216927886009216, + "loss/reg": 5.786680776509456e-05, + "step": 478 + }, + { + "epoch": 0.059875, + "grad_norm": 2.877934217453003, + "grad_norm_var": 0.17456917708907038, + "learning_rate": 0.0001, + "loss": 1.5607, + "loss/crossentropy": 2.3836066722869873, + "loss/hidden": 1.3203125, + "loss/logits": 0.23981472849845886, + "loss/reg": 5.785070243291557e-05, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 2.3281009197235107, + "grad_norm_var": 0.1849188959934999, + "learning_rate": 0.0001, + "loss": 1.2716, + "loss/crossentropy": 2.508988380432129, + "loss/hidden": 1.078125, + "loss/logits": 0.19294525682926178, + "loss/reg": 5.783725646324456e-05, + "step": 480 + }, + { + "epoch": 0.060125, + "grad_norm": 2.8099567890167236, + "grad_norm_var": 0.14013939438571937, + "learning_rate": 0.0001, + "loss": 1.5081, + "loss/crossentropy": 2.3855881690979004, + "loss/hidden": 1.25, + "loss/logits": 0.2575419545173645, + "loss/reg": 5.782474545412697e-05, + "step": 481 + }, + { + "epoch": 0.06025, + "grad_norm": 2.9827277660369873, + "grad_norm_var": 0.134662315913679, + "learning_rate": 0.0001, + "loss": 1.4593, + "loss/crossentropy": 2.5487606525421143, + "loss/hidden": 1.25, + "loss/logits": 0.2087090015411377, + "loss/reg": 5.7816720072878525e-05, + "step": 482 + }, + { + "epoch": 0.060375, + "grad_norm": 2.306149959564209, + "grad_norm_var": 0.1259770764512929, + "learning_rate": 0.0001, + "loss": 1.2076, + "loss/crossentropy": 2.4755747318267822, + "loss/hidden": 1.046875, + "loss/logits": 0.16014963388442993, + "loss/reg": 5.781082290923223e-05, + "step": 483 + }, + { + "epoch": 0.0605, + "grad_norm": 2.4719114303588867, + "grad_norm_var": 0.12834384378027816, + "learning_rate": 0.0001, + "loss": 1.3745, + "loss/crossentropy": 2.8203346729278564, + "loss/hidden": 1.171875, + "loss/logits": 0.20208273828029633, + "loss/reg": 5.7795077736955136e-05, + "step": 484 + }, + { + "epoch": 0.060625, + "grad_norm": 2.300952911376953, + "grad_norm_var": 0.10978991346620433, + "learning_rate": 0.0001, + "loss": 1.464, + "loss/crossentropy": 2.610508680343628, + "loss/hidden": 1.2265625, + "loss/logits": 0.2368427813053131, + "loss/reg": 5.778546983492561e-05, + "step": 485 + }, + { + "epoch": 0.06075, + "grad_norm": 3.3388009071350098, + "grad_norm_var": 0.13085586368501342, + "learning_rate": 0.0001, + "loss": 1.5116, + "loss/crossentropy": 2.763427972793579, + "loss/hidden": 1.296875, + "loss/logits": 0.21419215202331543, + "loss/reg": 5.7770797866396606e-05, + "step": 486 + }, + { + "epoch": 0.060875, + "grad_norm": 2.102293014526367, + "grad_norm_var": 0.1572983810037916, + "learning_rate": 0.0001, + "loss": 1.1595, + "loss/crossentropy": 2.204011917114258, + "loss/hidden": 1.0, + "loss/logits": 0.158901646733284, + "loss/reg": 5.7755187299335375e-05, + "step": 487 + }, + { + "epoch": 0.061, + "grad_norm": 2.766934633255005, + "grad_norm_var": 0.15678694409689248, + "learning_rate": 0.0001, + "loss": 1.4246, + "loss/crossentropy": 2.537151575088501, + "loss/hidden": 1.2109375, + "loss/logits": 0.2130882441997528, + "loss/reg": 5.774224700871855e-05, + "step": 488 + }, + { + "epoch": 0.061125, + "grad_norm": 2.0001540184020996, + "grad_norm_var": 0.18501104247654798, + "learning_rate": 0.0001, + "loss": 1.103, + "loss/crossentropy": 2.3592050075531006, + "loss/hidden": 0.96484375, + "loss/logits": 0.13754940032958984, + "loss/reg": 5.77289865759667e-05, + "step": 489 + }, + { + "epoch": 0.06125, + "grad_norm": 2.3166351318359375, + "grad_norm_var": 0.18848381138351228, + "learning_rate": 0.0001, + "loss": 1.3329, + "loss/crossentropy": 2.7236411571502686, + "loss/hidden": 1.15625, + "loss/logits": 0.1761033535003662, + "loss/reg": 5.771181167801842e-05, + "step": 490 + }, + { + "epoch": 0.061375, + "grad_norm": 2.357775926589966, + "grad_norm_var": 0.19351960086170053, + "learning_rate": 0.0001, + "loss": 1.1437, + "loss/crossentropy": 2.866445779800415, + "loss/hidden": 0.98828125, + "loss/logits": 0.15484049916267395, + "loss/reg": 5.769642666564323e-05, + "step": 491 + }, + { + "epoch": 0.0615, + "grad_norm": 3.680264949798584, + "grad_norm_var": 0.20463866822373877, + "learning_rate": 0.0001, + "loss": 1.2002, + "loss/crossentropy": 3.115431308746338, + "loss/hidden": 1.0390625, + "loss/logits": 0.16054463386535645, + "loss/reg": 5.7679084420669824e-05, + "step": 492 + }, + { + "epoch": 0.061625, + "grad_norm": 2.3650856018066406, + "grad_norm_var": 0.2051052996774897, + "learning_rate": 0.0001, + "loss": 1.1996, + "loss/crossentropy": 2.6519298553466797, + "loss/hidden": 1.0234375, + "loss/logits": 0.17554257810115814, + "loss/reg": 5.766074173152447e-05, + "step": 493 + }, + { + "epoch": 0.06175, + "grad_norm": 2.7080323696136475, + "grad_norm_var": 0.2058088113620099, + "learning_rate": 0.0001, + "loss": 1.365, + "loss/crossentropy": 2.329538106918335, + "loss/hidden": 1.15625, + "loss/logits": 0.20815491676330566, + "loss/reg": 5.764625166193582e-05, + "step": 494 + }, + { + "epoch": 0.061875, + "grad_norm": 2.2859530448913574, + "grad_norm_var": 0.2063347958167308, + "learning_rate": 0.0001, + "loss": 1.2994, + "loss/crossentropy": 2.6445348262786865, + "loss/hidden": 1.125, + "loss/logits": 0.1738019585609436, + "loss/reg": 5.763155422755517e-05, + "step": 495 + }, + { + "epoch": 0.062, + "grad_norm": 2.771320343017578, + "grad_norm_var": 0.20431087500909348, + "learning_rate": 0.0001, + "loss": 1.4714, + "loss/crossentropy": 2.340728282928467, + "loss/hidden": 1.2578125, + "loss/logits": 0.21303007006645203, + "loss/reg": 5.761897409684025e-05, + "step": 496 + }, + { + "epoch": 0.062125, + "grad_norm": 3.022183656692505, + "grad_norm_var": 0.21312900983479016, + "learning_rate": 0.0001, + "loss": 1.4858, + "loss/crossentropy": 2.6772336959838867, + "loss/hidden": 1.265625, + "loss/logits": 0.2196260541677475, + "loss/reg": 5.761081411037594e-05, + "step": 497 + }, + { + "epoch": 0.06225, + "grad_norm": 13.948429107666016, + "grad_norm_var": 8.27193520122967, + "learning_rate": 0.0001, + "loss": 1.3633, + "loss/crossentropy": 2.862323760986328, + "loss/hidden": 1.171875, + "loss/logits": 0.19083081185817719, + "loss/reg": 5.7596374972490594e-05, + "step": 498 + }, + { + "epoch": 0.062375, + "grad_norm": 2.6107678413391113, + "grad_norm_var": 8.237513777759569, + "learning_rate": 0.0001, + "loss": 1.6771, + "loss/crossentropy": 2.1725099086761475, + "loss/hidden": 1.40625, + "loss/logits": 0.2702314555644989, + "loss/reg": 5.7586628827266395e-05, + "step": 499 + }, + { + "epoch": 0.0625, + "grad_norm": 2.5658040046691895, + "grad_norm_var": 8.22750426778598, + "learning_rate": 0.0001, + "loss": 1.4381, + "loss/crossentropy": 2.246595859527588, + "loss/hidden": 1.25, + "loss/logits": 0.18755751848220825, + "loss/reg": 5.756897371611558e-05, + "step": 500 + }, + { + "epoch": 0.062625, + "grad_norm": 2.179478168487549, + "grad_norm_var": 8.244953306240525, + "learning_rate": 0.0001, + "loss": 1.291, + "loss/crossentropy": 2.488560199737549, + "loss/hidden": 1.1015625, + "loss/logits": 0.1888759732246399, + "loss/reg": 5.755467645940371e-05, + "step": 501 + }, + { + "epoch": 0.06275, + "grad_norm": 2.0030248165130615, + "grad_norm_var": 8.352009291243755, + "learning_rate": 0.0001, + "loss": 1.2442, + "loss/crossentropy": 2.5138843059539795, + "loss/hidden": 1.0546875, + "loss/logits": 0.18891112506389618, + "loss/reg": 5.754067751695402e-05, + "step": 502 + }, + { + "epoch": 0.062875, + "grad_norm": 2.3097050189971924, + "grad_norm_var": 8.323504212905837, + "learning_rate": 0.0001, + "loss": 1.0807, + "loss/crossentropy": 2.534362554550171, + "loss/hidden": 0.93359375, + "loss/logits": 0.1465301215648651, + "loss/reg": 5.752982178819366e-05, + "step": 503 + }, + { + "epoch": 0.063, + "grad_norm": 2.1386523246765137, + "grad_norm_var": 8.388074418328216, + "learning_rate": 0.0001, + "loss": 1.267, + "loss/crossentropy": 2.5574960708618164, + "loss/hidden": 1.0859375, + "loss/logits": 0.18048033118247986, + "loss/reg": 5.7512213970767334e-05, + "step": 504 + }, + { + "epoch": 0.063125, + "grad_norm": 2.0644736289978027, + "grad_norm_var": 8.378009254211024, + "learning_rate": 0.0001, + "loss": 1.1193, + "loss/crossentropy": 2.4307045936584473, + "loss/hidden": 0.96484375, + "loss/logits": 0.15389274060726166, + "loss/reg": 5.749760748585686e-05, + "step": 505 + }, + { + "epoch": 0.06325, + "grad_norm": 3.5246083736419678, + "grad_norm_var": 8.32564739400077, + "learning_rate": 0.0001, + "loss": 1.904, + "loss/crossentropy": 2.737135171890259, + "loss/hidden": 1.5078125, + "loss/logits": 0.395632803440094, + "loss/reg": 5.74878795305267e-05, + "step": 506 + }, + { + "epoch": 0.063375, + "grad_norm": 2.487663745880127, + "grad_norm_var": 8.310670261508536, + "learning_rate": 0.0001, + "loss": 1.3095, + "loss/crossentropy": 2.5345706939697266, + "loss/hidden": 1.1171875, + "loss/logits": 0.19169974327087402, + "loss/reg": 5.747407703893259e-05, + "step": 507 + }, + { + "epoch": 0.0635, + "grad_norm": 2.319613218307495, + "grad_norm_var": 8.355867662618861, + "learning_rate": 0.0001, + "loss": 1.4296, + "loss/crossentropy": 2.439621925354004, + "loss/hidden": 1.1875, + "loss/logits": 0.24147875607013702, + "loss/reg": 5.746008537244052e-05, + "step": 508 + }, + { + "epoch": 0.063625, + "grad_norm": 2.239403009414673, + "grad_norm_var": 8.370955905049472, + "learning_rate": 0.0001, + "loss": 1.307, + "loss/crossentropy": 2.4812705516815186, + "loss/hidden": 1.1171875, + "loss/logits": 0.18924759328365326, + "loss/reg": 5.744830923504196e-05, + "step": 509 + }, + { + "epoch": 0.06375, + "grad_norm": 2.440845489501953, + "grad_norm_var": 8.39289750619492, + "learning_rate": 0.0001, + "loss": 1.1623, + "loss/crossentropy": 2.5835447311401367, + "loss/hidden": 0.984375, + "loss/logits": 0.17732426524162292, + "loss/reg": 5.744034206145443e-05, + "step": 510 + }, + { + "epoch": 0.063875, + "grad_norm": 3.7700071334838867, + "grad_norm_var": 8.353245171235498, + "learning_rate": 0.0001, + "loss": 1.5529, + "loss/crossentropy": 2.9319827556610107, + "loss/hidden": 1.2734375, + "loss/logits": 0.27884694933891296, + "loss/reg": 5.742744542658329e-05, + "step": 511 + }, + { + "epoch": 0.064, + "grad_norm": 3.187791347503662, + "grad_norm_var": 8.3361305665004, + "learning_rate": 0.0001, + "loss": 1.6948, + "loss/crossentropy": 2.616928815841675, + "loss/hidden": 1.40625, + "loss/logits": 0.2879628539085388, + "loss/reg": 5.7413530157646164e-05, + "step": 512 + }, + { + "epoch": 0.064125, + "grad_norm": 2.2276997566223145, + "grad_norm_var": 8.405092706710946, + "learning_rate": 0.0001, + "loss": 1.2498, + "loss/crossentropy": 2.5594594478607178, + "loss/hidden": 1.0625, + "loss/logits": 0.18669450283050537, + "loss/reg": 5.740176129620522e-05, + "step": 513 + }, + { + "epoch": 0.06425, + "grad_norm": 2.571033239364624, + "grad_norm_var": 0.26774881553773466, + "learning_rate": 0.0001, + "loss": 1.5606, + "loss/crossentropy": 2.3838601112365723, + "loss/hidden": 1.296875, + "loss/logits": 0.2631247639656067, + "loss/reg": 5.7387296692468226e-05, + "step": 514 + }, + { + "epoch": 0.064375, + "grad_norm": 2.2190117835998535, + "grad_norm_var": 0.2736462331703469, + "learning_rate": 0.0001, + "loss": 1.2127, + "loss/crossentropy": 2.3376893997192383, + "loss/hidden": 1.046875, + "loss/logits": 0.16529923677444458, + "loss/reg": 5.7369947171537206e-05, + "step": 515 + }, + { + "epoch": 0.0645, + "grad_norm": 3.6509628295898438, + "grad_norm_var": 0.3545153452463372, + "learning_rate": 0.0001, + "loss": 1.4324, + "loss/crossentropy": 2.669734001159668, + "loss/hidden": 1.2421875, + "loss/logits": 0.18968772888183594, + "loss/reg": 5.7353197917109355e-05, + "step": 516 + }, + { + "epoch": 0.064625, + "grad_norm": 2.248657703399658, + "grad_norm_var": 0.351088953977406, + "learning_rate": 0.0001, + "loss": 1.2818, + "loss/crossentropy": 2.6037304401397705, + "loss/hidden": 1.09375, + "loss/logits": 0.1875147819519043, + "loss/reg": 5.7342589570907876e-05, + "step": 517 + }, + { + "epoch": 0.06475, + "grad_norm": 2.4198105335235596, + "grad_norm_var": 0.32945477622143454, + "learning_rate": 0.0001, + "loss": 1.1107, + "loss/crossentropy": 2.6085071563720703, + "loss/hidden": 0.96875, + "loss/logits": 0.1413734257221222, + "loss/reg": 5.7321107306052e-05, + "step": 518 + }, + { + "epoch": 0.064875, + "grad_norm": 2.1648852825164795, + "grad_norm_var": 0.33663639522773486, + "learning_rate": 0.0001, + "loss": 1.4, + "loss/crossentropy": 2.177536964416504, + "loss/hidden": 1.1796875, + "loss/logits": 0.2197086662054062, + "loss/reg": 5.7304925576318055e-05, + "step": 519 + }, + { + "epoch": 0.065, + "grad_norm": 3.105713129043579, + "grad_norm_var": 0.33499459859275643, + "learning_rate": 0.0001, + "loss": 1.3224, + "loss/crossentropy": 3.021796941757202, + "loss/hidden": 1.1328125, + "loss/logits": 0.1890622079372406, + "loss/reg": 5.7283985370304435e-05, + "step": 520 + }, + { + "epoch": 0.065125, + "grad_norm": 2.680781364440918, + "grad_norm_var": 0.3093752297956028, + "learning_rate": 0.0001, + "loss": 1.4408, + "loss/crossentropy": 2.4209651947021484, + "loss/hidden": 1.203125, + "loss/logits": 0.23711565136909485, + "loss/reg": 5.726591916754842e-05, + "step": 521 + }, + { + "epoch": 0.06525, + "grad_norm": 2.415611505508423, + "grad_norm_var": 0.2648511354846446, + "learning_rate": 0.0001, + "loss": 1.2277, + "loss/crossentropy": 2.623185873031616, + "loss/hidden": 1.0546875, + "loss/logits": 0.172480508685112, + "loss/reg": 5.724430957343429e-05, + "step": 522 + }, + { + "epoch": 0.065375, + "grad_norm": 2.7129733562469482, + "grad_norm_var": 0.2636174732540553, + "learning_rate": 0.0001, + "loss": 1.5511, + "loss/crossentropy": 2.6856141090393066, + "loss/hidden": 1.328125, + "loss/logits": 0.22241194546222687, + "loss/reg": 5.722355126636103e-05, + "step": 523 + }, + { + "epoch": 0.0655, + "grad_norm": 2.5169126987457275, + "grad_norm_var": 0.25740049578529633, + "learning_rate": 0.0001, + "loss": 1.4756, + "loss/crossentropy": 2.5400278568267822, + "loss/hidden": 1.2421875, + "loss/logits": 0.23280034959316254, + "loss/reg": 5.720969784306362e-05, + "step": 524 + }, + { + "epoch": 0.065625, + "grad_norm": 3.141322612762451, + "grad_norm_var": 0.25757144722960346, + "learning_rate": 0.0001, + "loss": 1.3953, + "loss/crossentropy": 2.6088011264801025, + "loss/hidden": 1.203125, + "loss/logits": 0.1916118562221527, + "loss/reg": 5.71877826587297e-05, + "step": 525 + }, + { + "epoch": 0.06575, + "grad_norm": 2.1077466011047363, + "grad_norm_var": 0.276776634481363, + "learning_rate": 0.0001, + "loss": 1.2543, + "loss/crossentropy": 2.3197500705718994, + "loss/hidden": 1.078125, + "loss/logits": 0.1755562126636505, + "loss/reg": 5.716781743103638e-05, + "step": 526 + }, + { + "epoch": 0.065875, + "grad_norm": 3.1689445972442627, + "grad_norm_var": 0.2133083163409907, + "learning_rate": 0.0001, + "loss": 1.4275, + "loss/crossentropy": 2.9355862140655518, + "loss/hidden": 1.21875, + "loss/logits": 0.20817086100578308, + "loss/reg": 5.714903454645537e-05, + "step": 527 + }, + { + "epoch": 0.066, + "grad_norm": 2.285956859588623, + "grad_norm_var": 0.20052447759750577, + "learning_rate": 0.0001, + "loss": 1.1824, + "loss/crossentropy": 2.760286331176758, + "loss/hidden": 1.0234375, + "loss/logits": 0.1584203690290451, + "loss/reg": 5.712690472137183e-05, + "step": 528 + }, + { + "epoch": 0.066125, + "grad_norm": 3.018244981765747, + "grad_norm_var": 0.2000914000663156, + "learning_rate": 0.0001, + "loss": 1.6053, + "loss/crossentropy": 2.4221365451812744, + "loss/hidden": 1.3359375, + "loss/logits": 0.26882410049438477, + "loss/reg": 5.7109886256512254e-05, + "step": 529 + }, + { + "epoch": 0.06625, + "grad_norm": 2.6761245727539062, + "grad_norm_var": 0.19965014586136837, + "learning_rate": 0.0001, + "loss": 1.4026, + "loss/crossentropy": 2.5661122798919678, + "loss/hidden": 1.203125, + "loss/logits": 0.19894230365753174, + "loss/reg": 5.709614561055787e-05, + "step": 530 + }, + { + "epoch": 0.066375, + "grad_norm": 3.7308688163757324, + "grad_norm_var": 0.25394415558544003, + "learning_rate": 0.0001, + "loss": 1.3714, + "loss/crossentropy": 2.674487352371216, + "loss/hidden": 1.1875, + "loss/logits": 0.18333487212657928, + "loss/reg": 5.708081880584359e-05, + "step": 531 + }, + { + "epoch": 0.0665, + "grad_norm": 5.148390293121338, + "grad_norm_var": 0.5734027576420241, + "learning_rate": 0.0001, + "loss": 2.0224, + "loss/crossentropy": 2.5511114597320557, + "loss/hidden": 1.6875, + "loss/logits": 0.3343617022037506, + "loss/reg": 5.706860974896699e-05, + "step": 532 + }, + { + "epoch": 0.066625, + "grad_norm": 4.639659881591797, + "grad_norm_var": 0.7401371960890089, + "learning_rate": 0.0001, + "loss": 2.0703, + "loss/crossentropy": 2.8264873027801514, + "loss/hidden": 1.609375, + "loss/logits": 0.4603120684623718, + "loss/reg": 5.705539297196083e-05, + "step": 533 + }, + { + "epoch": 0.06675, + "grad_norm": 2.6590423583984375, + "grad_norm_var": 0.7253392327299117, + "learning_rate": 0.0001, + "loss": 1.1927, + "loss/crossentropy": 2.692796230316162, + "loss/hidden": 1.03125, + "loss/logits": 0.1608980894088745, + "loss/reg": 5.7040437241084874e-05, + "step": 534 + }, + { + "epoch": 0.066875, + "grad_norm": 2.4473471641540527, + "grad_norm_var": 0.6984663971172343, + "learning_rate": 0.0001, + "loss": 1.2469, + "loss/crossentropy": 2.6443886756896973, + "loss/hidden": 1.0703125, + "loss/logits": 0.1760546863079071, + "loss/reg": 5.702269481844269e-05, + "step": 535 + }, + { + "epoch": 0.067, + "grad_norm": 2.676015853881836, + "grad_norm_var": 0.7055813256444667, + "learning_rate": 0.0001, + "loss": 1.5843, + "loss/crossentropy": 2.431217908859253, + "loss/hidden": 1.3359375, + "loss/logits": 0.24780328571796417, + "loss/reg": 5.7007055147551e-05, + "step": 536 + }, + { + "epoch": 0.067125, + "grad_norm": 2.392594575881958, + "grad_norm_var": 0.723100302829251, + "learning_rate": 0.0001, + "loss": 1.4829, + "loss/crossentropy": 2.6035330295562744, + "loss/hidden": 1.25, + "loss/logits": 0.23232683539390564, + "loss/reg": 5.699544635717757e-05, + "step": 537 + }, + { + "epoch": 0.06725, + "grad_norm": 3.0852084159851074, + "grad_norm_var": 0.7004121508792098, + "learning_rate": 0.0001, + "loss": 1.662, + "loss/crossentropy": 2.6200292110443115, + "loss/hidden": 1.359375, + "loss/logits": 0.30209141969680786, + "loss/reg": 5.697782398783602e-05, + "step": 538 + }, + { + "epoch": 0.067375, + "grad_norm": 3.3582072257995605, + "grad_norm_var": 0.6995490112188842, + "learning_rate": 0.0001, + "loss": 1.4034, + "loss/crossentropy": 2.590505599975586, + "loss/hidden": 1.1875, + "loss/logits": 0.2153635025024414, + "loss/reg": 5.69607327634003e-05, + "step": 539 + }, + { + "epoch": 0.0675, + "grad_norm": 2.4375123977661133, + "grad_norm_var": 0.705753805030601, + "learning_rate": 0.0001, + "loss": 1.2346, + "loss/crossentropy": 2.815932035446167, + "loss/hidden": 1.0703125, + "loss/logits": 0.16367268562316895, + "loss/reg": 5.6944831158034503e-05, + "step": 540 + }, + { + "epoch": 0.067625, + "grad_norm": 3.832122564315796, + "grad_norm_var": 0.742993530751691, + "learning_rate": 0.0001, + "loss": 1.3468, + "loss/crossentropy": 2.4388320446014404, + "loss/hidden": 1.15625, + "loss/logits": 0.1899527907371521, + "loss/reg": 5.6931155995698646e-05, + "step": 541 + }, + { + "epoch": 0.06775, + "grad_norm": 2.635655641555786, + "grad_norm_var": 0.6902874276451028, + "learning_rate": 0.0001, + "loss": 1.3026, + "loss/crossentropy": 2.51385760307312, + "loss/hidden": 1.125, + "loss/logits": 0.17700721323490143, + "loss/reg": 5.6917735491879284e-05, + "step": 542 + }, + { + "epoch": 0.067875, + "grad_norm": 2.1058261394500732, + "grad_norm_var": 0.7563971927113601, + "learning_rate": 0.0001, + "loss": 1.112, + "loss/crossentropy": 2.427570343017578, + "loss/hidden": 0.9765625, + "loss/logits": 0.13487987220287323, + "loss/reg": 5.690442776540294e-05, + "step": 543 + }, + { + "epoch": 0.068, + "grad_norm": 2.943103551864624, + "grad_norm_var": 0.7146417206132497, + "learning_rate": 0.0001, + "loss": 1.3356, + "loss/crossentropy": 2.8370819091796875, + "loss/hidden": 1.1484375, + "loss/logits": 0.186607226729393, + "loss/reg": 5.68877840123605e-05, + "step": 544 + }, + { + "epoch": 0.068125, + "grad_norm": 3.8723862171173096, + "grad_norm_var": 0.7496049567117694, + "learning_rate": 0.0001, + "loss": 1.5018, + "loss/crossentropy": 2.479180097579956, + "loss/hidden": 1.25, + "loss/logits": 0.2512153685092926, + "loss/reg": 5.6872839195420966e-05, + "step": 545 + }, + { + "epoch": 0.06825, + "grad_norm": 2.1730518341064453, + "grad_norm_var": 0.7982148549638083, + "learning_rate": 0.0001, + "loss": 1.3274, + "loss/crossentropy": 2.7552671432495117, + "loss/hidden": 1.140625, + "loss/logits": 0.18620190024375916, + "loss/reg": 5.685817450284958e-05, + "step": 546 + }, + { + "epoch": 0.068375, + "grad_norm": 2.5830624103546143, + "grad_norm_var": 0.7891437401193322, + "learning_rate": 0.0001, + "loss": 1.3184, + "loss/crossentropy": 2.976935386657715, + "loss/hidden": 1.1171875, + "loss/logits": 0.20067663490772247, + "loss/reg": 5.684147254214622e-05, + "step": 547 + }, + { + "epoch": 0.0685, + "grad_norm": 2.8029427528381348, + "grad_norm_var": 0.48043981243007633, + "learning_rate": 0.0001, + "loss": 1.509, + "loss/crossentropy": 2.4375596046447754, + "loss/hidden": 1.2578125, + "loss/logits": 0.25059816241264343, + "loss/reg": 5.682710980181582e-05, + "step": 548 + }, + { + "epoch": 0.068625, + "grad_norm": 2.4281182289123535, + "grad_norm_var": 0.27763671155671144, + "learning_rate": 0.0001, + "loss": 1.432, + "loss/crossentropy": 2.510115623474121, + "loss/hidden": 1.21875, + "loss/logits": 0.21264401078224182, + "loss/reg": 5.681176844518632e-05, + "step": 549 + }, + { + "epoch": 0.06875, + "grad_norm": 2.535102605819702, + "grad_norm_var": 0.2805462672149124, + "learning_rate": 0.0001, + "loss": 1.6199, + "loss/crossentropy": 2.3414230346679688, + "loss/hidden": 1.3828125, + "loss/logits": 0.2365313321352005, + "loss/reg": 5.6794018746586516e-05, + "step": 550 + }, + { + "epoch": 0.068875, + "grad_norm": 2.8906142711639404, + "grad_norm_var": 0.2738004819147721, + "learning_rate": 0.0001, + "loss": 1.3797, + "loss/crossentropy": 2.3271195888519287, + "loss/hidden": 1.1796875, + "loss/logits": 0.19946351647377014, + "loss/reg": 5.6774406402837485e-05, + "step": 551 + }, + { + "epoch": 0.069, + "grad_norm": 3.370306968688965, + "grad_norm_var": 0.2927309791110661, + "learning_rate": 0.0001, + "loss": 1.3856, + "loss/crossentropy": 2.3809690475463867, + "loss/hidden": 1.171875, + "loss/logits": 0.2131776064634323, + "loss/reg": 5.675842476193793e-05, + "step": 552 + }, + { + "epoch": 0.069125, + "grad_norm": 4.104588508605957, + "grad_norm_var": 0.37370332603284834, + "learning_rate": 0.0001, + "loss": 1.4949, + "loss/crossentropy": 2.32270884513855, + "loss/hidden": 1.296875, + "loss/logits": 0.19749879837036133, + "loss/reg": 5.6740394938969985e-05, + "step": 553 + }, + { + "epoch": 0.06925, + "grad_norm": 2.1966323852539062, + "grad_norm_var": 0.40671981467929375, + "learning_rate": 0.0001, + "loss": 1.2987, + "loss/crossentropy": 2.56730580329895, + "loss/hidden": 1.1171875, + "loss/logits": 0.18095816671848297, + "loss/reg": 5.6720054999459535e-05, + "step": 554 + }, + { + "epoch": 0.069375, + "grad_norm": 3.2739336490631104, + "grad_norm_var": 0.40192322247623313, + "learning_rate": 0.0001, + "loss": 1.7776, + "loss/crossentropy": 2.051370143890381, + "loss/hidden": 1.515625, + "loss/logits": 0.26144689321517944, + "loss/reg": 5.670300015481189e-05, + "step": 555 + }, + { + "epoch": 0.0695, + "grad_norm": 2.814973831176758, + "grad_norm_var": 0.3882282893863246, + "learning_rate": 0.0001, + "loss": 1.3358, + "loss/crossentropy": 2.2743093967437744, + "loss/hidden": 1.171875, + "loss/logits": 0.16339904069900513, + "loss/reg": 5.66886410524603e-05, + "step": 556 + }, + { + "epoch": 0.069625, + "grad_norm": 2.559269428253174, + "grad_norm_var": 0.3330167895556078, + "learning_rate": 0.0001, + "loss": 1.4237, + "loss/crossentropy": 2.19891357421875, + "loss/hidden": 1.2109375, + "loss/logits": 0.212164968252182, + "loss/reg": 5.667324876412749e-05, + "step": 557 + }, + { + "epoch": 0.06975, + "grad_norm": 2.4474921226501465, + "grad_norm_var": 0.34012043993938973, + "learning_rate": 0.0001, + "loss": 1.3187, + "loss/crossentropy": 2.409031867980957, + "loss/hidden": 1.140625, + "loss/logits": 0.17748260498046875, + "loss/reg": 5.6656310334801674e-05, + "step": 558 + }, + { + "epoch": 0.069875, + "grad_norm": 2.838435173034668, + "grad_norm_var": 0.3040173512426804, + "learning_rate": 0.0001, + "loss": 1.2111, + "loss/crossentropy": 2.6684110164642334, + "loss/hidden": 1.0390625, + "loss/logits": 0.17149032652378082, + "loss/reg": 5.664560740115121e-05, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 2.46317458152771, + "grad_norm_var": 0.3133912614469295, + "learning_rate": 0.0001, + "loss": 1.414, + "loss/crossentropy": 2.6221396923065186, + "loss/hidden": 1.203125, + "loss/logits": 0.21026402711868286, + "loss/reg": 5.663983756676316e-05, + "step": 560 + }, + { + "epoch": 0.070125, + "grad_norm": 2.4146885871887207, + "grad_norm_var": 0.24449850015313715, + "learning_rate": 0.0001, + "loss": 1.2718, + "loss/crossentropy": 2.4179940223693848, + "loss/hidden": 1.09375, + "loss/logits": 0.17744633555412292, + "loss/reg": 5.66331364098005e-05, + "step": 561 + }, + { + "epoch": 0.07025, + "grad_norm": 2.1373863220214844, + "grad_norm_var": 0.24729082719934822, + "learning_rate": 0.0001, + "loss": 1.3358, + "loss/crossentropy": 2.3687002658843994, + "loss/hidden": 1.15625, + "loss/logits": 0.17901673913002014, + "loss/reg": 5.6618908274685964e-05, + "step": 562 + }, + { + "epoch": 0.070375, + "grad_norm": 2.2190897464752197, + "grad_norm_var": 0.26324956728146254, + "learning_rate": 0.0001, + "loss": 1.4028, + "loss/crossentropy": 2.2975175380706787, + "loss/hidden": 1.1953125, + "loss/logits": 0.20690736174583435, + "loss/reg": 5.660299211740494e-05, + "step": 563 + }, + { + "epoch": 0.0705, + "grad_norm": 2.7386221885681152, + "grad_norm_var": 0.26278435237145437, + "learning_rate": 0.0001, + "loss": 1.2775, + "loss/crossentropy": 2.603506565093994, + "loss/hidden": 1.1015625, + "loss/logits": 0.17539449036121368, + "loss/reg": 5.658900772687048e-05, + "step": 564 + }, + { + "epoch": 0.070625, + "grad_norm": 2.46307635307312, + "grad_norm_var": 0.2615257576478134, + "learning_rate": 0.0001, + "loss": 1.2791, + "loss/crossentropy": 2.4369289875030518, + "loss/hidden": 1.1171875, + "loss/logits": 0.16132491827011108, + "loss/reg": 5.6575612688902766e-05, + "step": 565 + }, + { + "epoch": 0.07075, + "grad_norm": 2.45520281791687, + "grad_norm_var": 0.26385949291743505, + "learning_rate": 0.0001, + "loss": 1.4484, + "loss/crossentropy": 2.494572877883911, + "loss/hidden": 1.234375, + "loss/logits": 0.21350392699241638, + "loss/reg": 5.656494977301918e-05, + "step": 566 + }, + { + "epoch": 0.070875, + "grad_norm": 2.569112539291382, + "grad_norm_var": 0.26265097215405053, + "learning_rate": 0.0001, + "loss": 1.2421, + "loss/crossentropy": 2.3923959732055664, + "loss/hidden": 1.0625, + "loss/logits": 0.17903804779052734, + "loss/reg": 5.65506998100318e-05, + "step": 567 + }, + { + "epoch": 0.071, + "grad_norm": 2.2171080112457275, + "grad_norm_var": 0.24141352450480985, + "learning_rate": 0.0001, + "loss": 1.1494, + "loss/crossentropy": 2.3929827213287354, + "loss/hidden": 0.9921875, + "loss/logits": 0.15663662552833557, + "loss/reg": 5.653856715071015e-05, + "step": 568 + }, + { + "epoch": 0.071125, + "grad_norm": 3.1210575103759766, + "grad_norm_var": 0.10712755072980235, + "learning_rate": 0.0001, + "loss": 1.6912, + "loss/crossentropy": 2.2765614986419678, + "loss/hidden": 1.453125, + "loss/logits": 0.23751139640808105, + "loss/reg": 5.652818435919471e-05, + "step": 569 + }, + { + "epoch": 0.07125, + "grad_norm": 3.839294672012329, + "grad_norm_var": 0.19660925262190102, + "learning_rate": 0.0001, + "loss": 1.8286, + "loss/crossentropy": 2.0632450580596924, + "loss/hidden": 1.546875, + "loss/logits": 0.28118443489074707, + "loss/reg": 5.651290121022612e-05, + "step": 570 + }, + { + "epoch": 0.071375, + "grad_norm": 3.240445613861084, + "grad_norm_var": 0.19394141138966817, + "learning_rate": 0.0001, + "loss": 1.4813, + "loss/crossentropy": 2.642064094543457, + "loss/hidden": 1.2265625, + "loss/logits": 0.2541462779045105, + "loss/reg": 5.650040111504495e-05, + "step": 571 + }, + { + "epoch": 0.0715, + "grad_norm": 2.4979491233825684, + "grad_norm_var": 0.19361522865255718, + "learning_rate": 0.0001, + "loss": 1.4798, + "loss/crossentropy": 2.3248634338378906, + "loss/hidden": 1.234375, + "loss/logits": 0.2448451817035675, + "loss/reg": 5.648669321089983e-05, + "step": 572 + }, + { + "epoch": 0.071625, + "grad_norm": 2.3329668045043945, + "grad_norm_var": 0.19921690431924213, + "learning_rate": 0.0001, + "loss": 1.363, + "loss/crossentropy": 2.8348701000213623, + "loss/hidden": 1.1484375, + "loss/logits": 0.21398252248764038, + "loss/reg": 5.6470726121915504e-05, + "step": 573 + }, + { + "epoch": 0.07175, + "grad_norm": 2.494943857192993, + "grad_norm_var": 0.19823649604586155, + "learning_rate": 0.0001, + "loss": 1.4279, + "loss/crossentropy": 2.077019214630127, + "loss/hidden": 1.21875, + "loss/logits": 0.20862454175949097, + "loss/reg": 5.645084456773475e-05, + "step": 574 + }, + { + "epoch": 0.071875, + "grad_norm": 2.4467947483062744, + "grad_norm_var": 0.1968164545969214, + "learning_rate": 0.0001, + "loss": 1.2799, + "loss/crossentropy": 2.764270067214966, + "loss/hidden": 1.109375, + "loss/logits": 0.16992923617362976, + "loss/reg": 5.643080658046529e-05, + "step": 575 + }, + { + "epoch": 0.072, + "grad_norm": 4.124429702758789, + "grad_norm_var": 0.33829023147095444, + "learning_rate": 0.0001, + "loss": 1.8319, + "loss/crossentropy": 2.614333152770996, + "loss/hidden": 1.5859375, + "loss/logits": 0.245355024933815, + "loss/reg": 5.64096771995537e-05, + "step": 576 + }, + { + "epoch": 0.072125, + "grad_norm": 2.9513115882873535, + "grad_norm_var": 0.3353724391757993, + "learning_rate": 0.0001, + "loss": 1.4848, + "loss/crossentropy": 2.291598320007324, + "loss/hidden": 1.234375, + "loss/logits": 0.24986042082309723, + "loss/reg": 5.639591472572647e-05, + "step": 577 + }, + { + "epoch": 0.07225, + "grad_norm": 3.004474639892578, + "grad_norm_var": 0.31262981045543464, + "learning_rate": 0.0001, + "loss": 1.4425, + "loss/crossentropy": 2.2871763706207275, + "loss/hidden": 1.203125, + "loss/logits": 0.23880186676979065, + "loss/reg": 5.638147922581993e-05, + "step": 578 + }, + { + "epoch": 0.072375, + "grad_norm": 3.2634410858154297, + "grad_norm_var": 0.3006388387902421, + "learning_rate": 0.0001, + "loss": 1.5696, + "loss/crossentropy": 2.5439798831939697, + "loss/hidden": 1.3125, + "loss/logits": 0.2565382122993469, + "loss/reg": 5.637051799567416e-05, + "step": 579 + }, + { + "epoch": 0.0725, + "grad_norm": 2.787332534790039, + "grad_norm_var": 0.2999987245038954, + "learning_rate": 0.0001, + "loss": 1.3747, + "loss/crossentropy": 2.5118601322174072, + "loss/hidden": 1.1796875, + "loss/logits": 0.19444304704666138, + "loss/reg": 5.635723573504947e-05, + "step": 580 + }, + { + "epoch": 0.072625, + "grad_norm": 2.7229959964752197, + "grad_norm_var": 0.2903593389163989, + "learning_rate": 0.0001, + "loss": 1.5141, + "loss/crossentropy": 2.7853024005889893, + "loss/hidden": 1.3046875, + "loss/logits": 0.20882655680179596, + "loss/reg": 5.634501940221526e-05, + "step": 581 + }, + { + "epoch": 0.07275, + "grad_norm": 2.594968318939209, + "grad_norm_var": 0.28367694660223985, + "learning_rate": 0.0001, + "loss": 1.093, + "loss/crossentropy": 2.6438136100769043, + "loss/hidden": 0.94921875, + "loss/logits": 0.1432032436132431, + "loss/reg": 5.633091495838016e-05, + "step": 582 + }, + { + "epoch": 0.072875, + "grad_norm": 2.3869121074676514, + "grad_norm_var": 0.2934995682895912, + "learning_rate": 0.0001, + "loss": 1.3018, + "loss/crossentropy": 2.4067459106445312, + "loss/hidden": 1.1171875, + "loss/logits": 0.18405942618846893, + "loss/reg": 5.631797466776334e-05, + "step": 583 + }, + { + "epoch": 0.073, + "grad_norm": 3.3360087871551514, + "grad_norm_var": 0.2733505680051763, + "learning_rate": 0.0001, + "loss": 1.1895, + "loss/crossentropy": 2.7274060249328613, + "loss/hidden": 1.015625, + "loss/logits": 0.17334823310375214, + "loss/reg": 5.630190935335122e-05, + "step": 584 + }, + { + "epoch": 0.073125, + "grad_norm": 2.636382818222046, + "grad_norm_var": 0.27675729437985763, + "learning_rate": 0.0001, + "loss": 1.2592, + "loss/crossentropy": 2.4599273204803467, + "loss/hidden": 1.0859375, + "loss/logits": 0.17271864414215088, + "loss/reg": 5.6286880862899125e-05, + "step": 585 + }, + { + "epoch": 0.07325, + "grad_norm": 2.3085126876831055, + "grad_norm_var": 0.2348241054879698, + "learning_rate": 0.0001, + "loss": 1.1438, + "loss/crossentropy": 2.4817559719085693, + "loss/hidden": 0.98046875, + "loss/logits": 0.16277402639389038, + "loss/reg": 5.6276072427863255e-05, + "step": 586 + }, + { + "epoch": 0.073375, + "grad_norm": 2.8122873306274414, + "grad_norm_var": 0.22231448974219556, + "learning_rate": 0.0001, + "loss": 1.6107, + "loss/crossentropy": 2.3126115798950195, + "loss/hidden": 1.3515625, + "loss/logits": 0.25852900743484497, + "loss/reg": 5.625975609291345e-05, + "step": 587 + }, + { + "epoch": 0.0735, + "grad_norm": 2.5446043014526367, + "grad_norm_var": 0.22060978250349308, + "learning_rate": 0.0001, + "loss": 1.3722, + "loss/crossentropy": 2.3330647945404053, + "loss/hidden": 1.171875, + "loss/logits": 0.1998036503791809, + "loss/reg": 5.6243337894557044e-05, + "step": 588 + }, + { + "epoch": 0.073625, + "grad_norm": 3.0254905223846436, + "grad_norm_var": 0.20775786644308383, + "learning_rate": 0.0001, + "loss": 1.4257, + "loss/crossentropy": 2.4669790267944336, + "loss/hidden": 1.1953125, + "loss/logits": 0.22982466220855713, + "loss/reg": 5.6233355280710384e-05, + "step": 589 + }, + { + "epoch": 0.07375, + "grad_norm": 2.742598295211792, + "grad_norm_var": 0.20019536457655604, + "learning_rate": 0.0001, + "loss": 1.3087, + "loss/crossentropy": 2.350428581237793, + "loss/hidden": 1.125, + "loss/logits": 0.1831551194190979, + "loss/reg": 5.621850868919864e-05, + "step": 590 + }, + { + "epoch": 0.073875, + "grad_norm": 2.2741334438323975, + "grad_norm_var": 0.21146840571566727, + "learning_rate": 0.0001, + "loss": 1.3572, + "loss/crossentropy": 2.237168550491333, + "loss/hidden": 1.1484375, + "loss/logits": 0.20822051167488098, + "loss/reg": 5.6203607528004795e-05, + "step": 591 + }, + { + "epoch": 0.074, + "grad_norm": 4.630364894866943, + "grad_norm_var": 0.3137917114378768, + "learning_rate": 0.0001, + "loss": 1.8838, + "loss/crossentropy": 2.4884331226348877, + "loss/hidden": 1.5859375, + "loss/logits": 0.2973060607910156, + "loss/reg": 5.619114017463289e-05, + "step": 592 + }, + { + "epoch": 0.074125, + "grad_norm": 2.7753021717071533, + "grad_norm_var": 0.3139690476205639, + "learning_rate": 0.0001, + "loss": 1.465, + "loss/crossentropy": 2.409294843673706, + "loss/hidden": 1.2421875, + "loss/logits": 0.22228561341762543, + "loss/reg": 5.617353235720657e-05, + "step": 593 + }, + { + "epoch": 0.07425, + "grad_norm": 2.694066286087036, + "grad_norm_var": 0.3142336147439567, + "learning_rate": 0.0001, + "loss": 1.6625, + "loss/crossentropy": 2.588970899581909, + "loss/hidden": 1.3828125, + "loss/logits": 0.27915188670158386, + "loss/reg": 5.6154247431550175e-05, + "step": 594 + }, + { + "epoch": 0.074375, + "grad_norm": 7.720087051391602, + "grad_norm_var": 1.8036632855610812, + "learning_rate": 0.0001, + "loss": 2.0674, + "loss/crossentropy": 2.8772268295288086, + "loss/hidden": 1.71875, + "loss/logits": 0.3481142520904541, + "loss/reg": 5.61414854018949e-05, + "step": 595 + }, + { + "epoch": 0.0745, + "grad_norm": 2.24833083152771, + "grad_norm_var": 1.8460523547946992, + "learning_rate": 0.0001, + "loss": 1.3332, + "loss/crossentropy": 2.568021297454834, + "loss/hidden": 1.140625, + "loss/logits": 0.19198307394981384, + "loss/reg": 5.612680615740828e-05, + "step": 596 + }, + { + "epoch": 0.074625, + "grad_norm": 2.1800825595855713, + "grad_norm_var": 1.8911004193173881, + "learning_rate": 0.0001, + "loss": 1.1532, + "loss/crossentropy": 2.43203067779541, + "loss/hidden": 1.0, + "loss/logits": 0.1526886522769928, + "loss/reg": 5.6111755839083344e-05, + "step": 597 + }, + { + "epoch": 0.07475, + "grad_norm": 2.212115526199341, + "grad_norm_var": 1.9238408264416513, + "learning_rate": 0.0001, + "loss": 1.2472, + "loss/crossentropy": 2.493605613708496, + "loss/hidden": 1.0703125, + "loss/logits": 0.17635077238082886, + "loss/reg": 5.609134313999675e-05, + "step": 598 + }, + { + "epoch": 0.074875, + "grad_norm": 3.4206509590148926, + "grad_norm_var": 1.9015840455427668, + "learning_rate": 0.0001, + "loss": 1.6227, + "loss/crossentropy": 2.383911371231079, + "loss/hidden": 1.3515625, + "loss/logits": 0.2705824673175812, + "loss/reg": 5.607017010333948e-05, + "step": 599 + }, + { + "epoch": 0.075, + "grad_norm": 2.6518476009368896, + "grad_norm_var": 1.9090875079187366, + "learning_rate": 0.0001, + "loss": 1.311, + "loss/crossentropy": 2.764665126800537, + "loss/hidden": 1.1328125, + "loss/logits": 0.17762941122055054, + "loss/reg": 5.605430851574056e-05, + "step": 600 + }, + { + "epoch": 0.075125, + "grad_norm": 2.1637043952941895, + "grad_norm_var": 1.9494220257467947, + "learning_rate": 0.0001, + "loss": 1.2314, + "loss/crossentropy": 2.510344982147217, + "loss/hidden": 1.0625, + "loss/logits": 0.16831059753894806, + "loss/reg": 5.603917452390306e-05, + "step": 601 + }, + { + "epoch": 0.07525, + "grad_norm": 3.601780891418457, + "grad_norm_var": 1.930362870052075, + "learning_rate": 0.0001, + "loss": 1.6586, + "loss/crossentropy": 2.4858670234680176, + "loss/hidden": 1.375, + "loss/logits": 0.2830356955528259, + "loss/reg": 5.602244709734805e-05, + "step": 602 + }, + { + "epoch": 0.075375, + "grad_norm": 2.3047375679016113, + "grad_norm_var": 1.9663459192058854, + "learning_rate": 0.0001, + "loss": 1.2404, + "loss/crossentropy": 2.6084353923797607, + "loss/hidden": 1.078125, + "loss/logits": 0.16176369786262512, + "loss/reg": 5.6005988881224766e-05, + "step": 603 + }, + { + "epoch": 0.0755, + "grad_norm": 2.468519449234009, + "grad_norm_var": 1.972081997343082, + "learning_rate": 0.0001, + "loss": 1.32, + "loss/crossentropy": 2.4244751930236816, + "loss/hidden": 1.140625, + "loss/logits": 0.17885854840278625, + "loss/reg": 5.598864299827255e-05, + "step": 604 + }, + { + "epoch": 0.075625, + "grad_norm": 3.1042160987854004, + "grad_norm_var": 1.9720062093686368, + "learning_rate": 0.0001, + "loss": 1.1799, + "loss/crossentropy": 2.981356143951416, + "loss/hidden": 1.0, + "loss/logits": 0.17934700846672058, + "loss/reg": 5.5970504035940394e-05, + "step": 605 + }, + { + "epoch": 0.07575, + "grad_norm": 2.198490858078003, + "grad_norm_var": 2.0145906467974495, + "learning_rate": 0.0001, + "loss": 1.3087, + "loss/crossentropy": 2.4534380435943604, + "loss/hidden": 1.125, + "loss/logits": 0.18310286104679108, + "loss/reg": 5.595075344899669e-05, + "step": 606 + }, + { + "epoch": 0.075875, + "grad_norm": 3.01007080078125, + "grad_norm_var": 1.973238539473501, + "learning_rate": 0.0001, + "loss": 1.3278, + "loss/crossentropy": 2.601656913757324, + "loss/hidden": 1.1328125, + "loss/logits": 0.19443120062351227, + "loss/reg": 5.5936940043466166e-05, + "step": 607 + }, + { + "epoch": 0.076, + "grad_norm": 2.328502655029297, + "grad_norm_var": 1.8305709674660562, + "learning_rate": 0.0001, + "loss": 1.2808, + "loss/crossentropy": 2.5165281295776367, + "loss/hidden": 1.1015625, + "loss/logits": 0.17871087789535522, + "loss/reg": 5.592526576947421e-05, + "step": 608 + }, + { + "epoch": 0.076125, + "grad_norm": 2.4415786266326904, + "grad_norm_var": 1.844978362281832, + "learning_rate": 0.0001, + "loss": 1.4699, + "loss/crossentropy": 2.3549885749816895, + "loss/hidden": 1.234375, + "loss/logits": 0.23492830991744995, + "loss/reg": 5.591001536231488e-05, + "step": 609 + }, + { + "epoch": 0.07625, + "grad_norm": 2.1607983112335205, + "grad_norm_var": 1.8789441166626564, + "learning_rate": 0.0001, + "loss": 1.5289, + "loss/crossentropy": 2.310319185256958, + "loss/hidden": 1.3203125, + "loss/logits": 0.20807045698165894, + "loss/reg": 5.5894925026223063e-05, + "step": 610 + }, + { + "epoch": 0.076375, + "grad_norm": 2.4163475036621094, + "grad_norm_var": 0.22029539262357123, + "learning_rate": 0.0001, + "loss": 1.326, + "loss/crossentropy": 2.5177974700927734, + "loss/hidden": 1.1484375, + "loss/logits": 0.1770188808441162, + "loss/reg": 5.587563646258786e-05, + "step": 611 + }, + { + "epoch": 0.0765, + "grad_norm": 5.376523494720459, + "grad_norm_var": 0.7031570506973231, + "learning_rate": 0.0001, + "loss": 1.7307, + "loss/crossentropy": 2.593522548675537, + "loss/hidden": 1.46875, + "loss/logits": 0.26142174005508423, + "loss/reg": 5.5857744882814586e-05, + "step": 612 + }, + { + "epoch": 0.076625, + "grad_norm": 2.5275163650512695, + "grad_norm_var": 0.6841845799993801, + "learning_rate": 0.0001, + "loss": 1.322, + "loss/crossentropy": 2.5519967079162598, + "loss/hidden": 1.125, + "loss/logits": 0.1964191198348999, + "loss/reg": 5.583597521763295e-05, + "step": 613 + }, + { + "epoch": 0.07675, + "grad_norm": 2.870879888534546, + "grad_norm_var": 0.6619358019877306, + "learning_rate": 0.0001, + "loss": 1.4565, + "loss/crossentropy": 2.4661271572113037, + "loss/hidden": 1.2265625, + "loss/logits": 0.22941797971725464, + "loss/reg": 5.581411096500233e-05, + "step": 614 + }, + { + "epoch": 0.076875, + "grad_norm": 2.530247926712036, + "grad_norm_var": 0.6396295206762201, + "learning_rate": 0.0001, + "loss": 1.3657, + "loss/crossentropy": 2.298511028289795, + "loss/hidden": 1.1875, + "loss/logits": 0.17764486372470856, + "loss/reg": 5.579264689004049e-05, + "step": 615 + }, + { + "epoch": 0.077, + "grad_norm": 2.1187071800231934, + "grad_norm_var": 0.665063668545568, + "learning_rate": 0.0001, + "loss": 1.1568, + "loss/crossentropy": 2.4262094497680664, + "loss/hidden": 1.015625, + "loss/logits": 0.14061546325683594, + "loss/reg": 5.5783228162908927e-05, + "step": 616 + }, + { + "epoch": 0.077125, + "grad_norm": 2.3431754112243652, + "grad_norm_var": 0.6536114449405801, + "learning_rate": 0.0001, + "loss": 1.4278, + "loss/crossentropy": 2.3483335971832275, + "loss/hidden": 1.2109375, + "loss/logits": 0.21630419790744781, + "loss/reg": 5.57744933757931e-05, + "step": 617 + }, + { + "epoch": 0.07725, + "grad_norm": 2.5985348224639893, + "grad_norm_var": 0.6009238397412027, + "learning_rate": 0.0001, + "loss": 1.7645, + "loss/crossentropy": 2.279369831085205, + "loss/hidden": 1.546875, + "loss/logits": 0.2171006053686142, + "loss/reg": 5.575196701101959e-05, + "step": 618 + }, + { + "epoch": 0.077375, + "grad_norm": 4.004838943481445, + "grad_norm_var": 0.697655562382554, + "learning_rate": 0.0001, + "loss": 1.6666, + "loss/crossentropy": 2.558466911315918, + "loss/hidden": 1.3984375, + "loss/logits": 0.26759639382362366, + "loss/reg": 5.5740612879162654e-05, + "step": 619 + }, + { + "epoch": 0.0775, + "grad_norm": 2.264273166656494, + "grad_norm_var": 0.7087775967170871, + "learning_rate": 0.0001, + "loss": 1.1433, + "loss/crossentropy": 2.5713441371917725, + "loss/hidden": 0.9921875, + "loss/logits": 0.1505858451128006, + "loss/reg": 5.5717180657666177e-05, + "step": 620 + }, + { + "epoch": 0.077625, + "grad_norm": 2.5551795959472656, + "grad_norm_var": 0.7030356734291255, + "learning_rate": 0.0001, + "loss": 1.1309, + "loss/crossentropy": 2.5923280715942383, + "loss/hidden": 0.98046875, + "loss/logits": 0.14986979961395264, + "loss/reg": 5.57043167646043e-05, + "step": 621 + }, + { + "epoch": 0.07775, + "grad_norm": 2.652561902999878, + "grad_norm_var": 0.6834944271410867, + "learning_rate": 0.0001, + "loss": 1.3254, + "loss/crossentropy": 2.8501980304718018, + "loss/hidden": 1.140625, + "loss/logits": 0.18422411382198334, + "loss/reg": 5.5684457038296387e-05, + "step": 622 + }, + { + "epoch": 0.077875, + "grad_norm": 2.6352500915527344, + "grad_norm_var": 0.6799016428975733, + "learning_rate": 0.0001, + "loss": 1.1878, + "loss/crossentropy": 2.7150673866271973, + "loss/hidden": 1.03125, + "loss/logits": 0.15596626698970795, + "loss/reg": 5.5671138397883624e-05, + "step": 623 + }, + { + "epoch": 0.078, + "grad_norm": 2.724209785461426, + "grad_norm_var": 0.6680269008360408, + "learning_rate": 0.0001, + "loss": 1.5681, + "loss/crossentropy": 2.3070638179779053, + "loss/hidden": 1.34375, + "loss/logits": 0.22377389669418335, + "loss/reg": 5.56498380319681e-05, + "step": 624 + }, + { + "epoch": 0.078125, + "grad_norm": 9.59151840209961, + "grad_norm_var": 3.5559580820469003, + "learning_rate": 0.0001, + "loss": 1.568, + "loss/crossentropy": 2.6425082683563232, + "loss/hidden": 1.375, + "loss/logits": 0.19247561693191528, + "loss/reg": 5.562700243899599e-05, + "step": 625 + }, + { + "epoch": 0.07825, + "grad_norm": 2.7146146297454834, + "grad_norm_var": 3.4976035299386314, + "learning_rate": 0.0001, + "loss": 1.2429, + "loss/crossentropy": 2.7501344680786133, + "loss/hidden": 1.0703125, + "loss/logits": 0.17199140787124634, + "loss/reg": 5.560599674936384e-05, + "step": 626 + }, + { + "epoch": 0.078375, + "grad_norm": 2.4837982654571533, + "grad_norm_var": 3.4904329865295534, + "learning_rate": 0.0001, + "loss": 1.3951, + "loss/crossentropy": 2.4862208366394043, + "loss/hidden": 1.21875, + "loss/logits": 0.17582716047763824, + "loss/reg": 5.558757402468473e-05, + "step": 627 + }, + { + "epoch": 0.0785, + "grad_norm": 2.1883833408355713, + "grad_norm_var": 3.2215267842479833, + "learning_rate": 0.0001, + "loss": 1.0922, + "loss/crossentropy": 2.6592166423797607, + "loss/hidden": 0.953125, + "loss/logits": 0.1385241448879242, + "loss/reg": 5.5569151300005615e-05, + "step": 628 + }, + { + "epoch": 0.078625, + "grad_norm": 2.8857290744781494, + "grad_norm_var": 3.2045808378220917, + "learning_rate": 0.0001, + "loss": 1.3705, + "loss/crossentropy": 2.581509828567505, + "loss/hidden": 1.1796875, + "loss/logits": 0.19029666483402252, + "loss/reg": 5.555087045649998e-05, + "step": 629 + }, + { + "epoch": 0.07875, + "grad_norm": 1.9136121273040771, + "grad_norm_var": 3.287602536208832, + "learning_rate": 0.0001, + "loss": 1.1828, + "loss/crossentropy": 2.4255590438842773, + "loss/hidden": 1.03125, + "loss/logits": 0.15101952850818634, + "loss/reg": 5.5534914281452075e-05, + "step": 630 + }, + { + "epoch": 0.078875, + "grad_norm": 2.1227009296417236, + "grad_norm_var": 3.3242045708283325, + "learning_rate": 0.0001, + "loss": 1.3325, + "loss/crossentropy": 2.5583293437957764, + "loss/hidden": 1.140625, + "loss/logits": 0.19136284291744232, + "loss/reg": 5.551309368456714e-05, + "step": 631 + }, + { + "epoch": 0.079, + "grad_norm": 2.6222591400146484, + "grad_norm_var": 3.2817336007922213, + "learning_rate": 0.0001, + "loss": 1.5832, + "loss/crossentropy": 2.2433364391326904, + "loss/hidden": 1.3515625, + "loss/logits": 0.23112601041793823, + "loss/reg": 5.549259003601037e-05, + "step": 632 + }, + { + "epoch": 0.079125, + "grad_norm": 2.6109414100646973, + "grad_norm_var": 3.2620938839767395, + "learning_rate": 0.0001, + "loss": 1.2217, + "loss/crossentropy": 2.715541124343872, + "loss/hidden": 1.046875, + "loss/logits": 0.17423760890960693, + "loss/reg": 5.547174077946693e-05, + "step": 633 + }, + { + "epoch": 0.07925, + "grad_norm": 2.377688407897949, + "grad_norm_var": 3.2780099106994003, + "learning_rate": 0.0001, + "loss": 1.3809, + "loss/crossentropy": 2.317793130874634, + "loss/hidden": 1.1875, + "loss/logits": 0.19284963607788086, + "loss/reg": 5.545308886212297e-05, + "step": 634 + }, + { + "epoch": 0.079375, + "grad_norm": 2.2899274826049805, + "grad_norm_var": 3.2370231277032433, + "learning_rate": 0.0001, + "loss": 1.2669, + "loss/crossentropy": 2.6336915493011475, + "loss/hidden": 1.09375, + "loss/logits": 0.17255498468875885, + "loss/reg": 5.543839870369993e-05, + "step": 635 + }, + { + "epoch": 0.0795, + "grad_norm": 2.190656900405884, + "grad_norm_var": 3.2437445376370118, + "learning_rate": 0.0001, + "loss": 1.2508, + "loss/crossentropy": 2.450021505355835, + "loss/hidden": 1.09375, + "loss/logits": 0.15647029876708984, + "loss/reg": 5.542321741813794e-05, + "step": 636 + }, + { + "epoch": 0.079625, + "grad_norm": 1.986946940422058, + "grad_norm_var": 3.2908032121113298, + "learning_rate": 0.0001, + "loss": 1.2266, + "loss/crossentropy": 2.7081422805786133, + "loss/hidden": 1.0546875, + "loss/logits": 0.17138829827308655, + "loss/reg": 5.5409673223039135e-05, + "step": 637 + }, + { + "epoch": 0.07975, + "grad_norm": 2.3423492908477783, + "grad_norm_var": 3.3059943314168536, + "learning_rate": 0.0001, + "loss": 1.2321, + "loss/crossentropy": 2.384462833404541, + "loss/hidden": 1.0546875, + "loss/logits": 0.17684702575206757, + "loss/reg": 5.5391312343999743e-05, + "step": 638 + }, + { + "epoch": 0.079875, + "grad_norm": 2.3519933223724365, + "grad_norm_var": 3.31930978488029, + "learning_rate": 0.0001, + "loss": 1.3743, + "loss/crossentropy": 2.6151647567749023, + "loss/hidden": 1.1640625, + "loss/logits": 0.2097240835428238, + "loss/reg": 5.537479228223674e-05, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 2.447504997253418, + "grad_norm_var": 3.3282686991442922, + "learning_rate": 0.0001, + "loss": 1.4717, + "loss/crossentropy": 2.5143349170684814, + "loss/hidden": 1.2109375, + "loss/logits": 0.2602579593658447, + "loss/reg": 5.535819582291879e-05, + "step": 640 + }, + { + "epoch": 0.080125, + "grad_norm": 2.7342166900634766, + "grad_norm_var": 0.07597010378907397, + "learning_rate": 0.0001, + "loss": 1.5484, + "loss/crossentropy": 2.4136970043182373, + "loss/hidden": 1.3046875, + "loss/logits": 0.24319079518318176, + "loss/reg": 5.5341512052109465e-05, + "step": 641 + }, + { + "epoch": 0.08025, + "grad_norm": 7.95797061920166, + "grad_norm_var": 2.020192568164274, + "learning_rate": 0.0001, + "loss": 1.4043, + "loss/crossentropy": 2.4566597938537598, + "loss/hidden": 1.2265625, + "loss/logits": 0.1772194355726242, + "loss/reg": 5.532177601708099e-05, + "step": 642 + }, + { + "epoch": 0.080375, + "grad_norm": 2.706076145172119, + "grad_norm_var": 2.0163048861974153, + "learning_rate": 0.0001, + "loss": 1.5589, + "loss/crossentropy": 2.313260078430176, + "loss/hidden": 1.3203125, + "loss/logits": 0.23800881206989288, + "loss/reg": 5.5301832617260516e-05, + "step": 643 + }, + { + "epoch": 0.0805, + "grad_norm": 2.8865914344787598, + "grad_norm_var": 1.9960669600053733, + "learning_rate": 0.0001, + "loss": 1.4925, + "loss/crossentropy": 2.48056697845459, + "loss/hidden": 1.28125, + "loss/logits": 0.2106875777244568, + "loss/reg": 5.528131077880971e-05, + "step": 644 + }, + { + "epoch": 0.080625, + "grad_norm": 2.7675015926361084, + "grad_norm_var": 1.995221836304215, + "learning_rate": 0.0001, + "loss": 1.4228, + "loss/crossentropy": 2.414571523666382, + "loss/hidden": 1.21875, + "loss/logits": 0.20353971421718597, + "loss/reg": 5.5263531976379454e-05, + "step": 645 + }, + { + "epoch": 0.08075, + "grad_norm": 2.2879977226257324, + "grad_norm_var": 1.9612673982614381, + "learning_rate": 0.0001, + "loss": 1.4466, + "loss/crossentropy": 2.5126829147338867, + "loss/hidden": 1.234375, + "loss/logits": 0.21164974570274353, + "loss/reg": 5.523953586816788e-05, + "step": 646 + }, + { + "epoch": 0.080875, + "grad_norm": 2.0180585384368896, + "grad_norm_var": 1.9712999230632604, + "learning_rate": 0.0001, + "loss": 1.2852, + "loss/crossentropy": 2.544811248779297, + "loss/hidden": 1.1015625, + "loss/logits": 0.18305200338363647, + "loss/reg": 5.5218009947566316e-05, + "step": 647 + }, + { + "epoch": 0.081, + "grad_norm": 2.460339069366455, + "grad_norm_var": 1.9764772295131516, + "learning_rate": 0.0001, + "loss": 1.4663, + "loss/crossentropy": 2.384995937347412, + "loss/hidden": 1.2578125, + "loss/logits": 0.20790857076644897, + "loss/reg": 5.520256672753021e-05, + "step": 648 + }, + { + "epoch": 0.081125, + "grad_norm": 2.6601645946502686, + "grad_norm_var": 1.975545055561849, + "learning_rate": 0.0001, + "loss": 1.2009, + "loss/crossentropy": 2.5382332801818848, + "loss/hidden": 1.0625, + "loss/logits": 0.13785940408706665, + "loss/reg": 5.518757097888738e-05, + "step": 649 + }, + { + "epoch": 0.08125, + "grad_norm": 2.0407180786132812, + "grad_norm_var": 2.0006781186858693, + "learning_rate": 0.0001, + "loss": 1.3368, + "loss/crossentropy": 2.314056158065796, + "loss/hidden": 1.171875, + "loss/logits": 0.1643456667661667, + "loss/reg": 5.5170850828289986e-05, + "step": 650 + }, + { + "epoch": 0.081375, + "grad_norm": 2.672747850418091, + "grad_norm_var": 1.9859426578962365, + "learning_rate": 0.0001, + "loss": 1.4244, + "loss/crossentropy": 2.495609760284424, + "loss/hidden": 1.21875, + "loss/logits": 0.2050689160823822, + "loss/reg": 5.5157281167339534e-05, + "step": 651 + }, + { + "epoch": 0.0815, + "grad_norm": 2.3654656410217285, + "grad_norm_var": 1.9740698553443072, + "learning_rate": 0.0001, + "loss": 1.5368, + "loss/crossentropy": 2.4721033573150635, + "loss/hidden": 1.328125, + "loss/logits": 0.20810337364673615, + "loss/reg": 5.5145894293673337e-05, + "step": 652 + }, + { + "epoch": 0.081625, + "grad_norm": 2.0404245853424072, + "grad_norm_var": 1.9685017588802374, + "learning_rate": 0.0001, + "loss": 1.2802, + "loss/crossentropy": 2.340548515319824, + "loss/hidden": 1.1015625, + "loss/logits": 0.17806307971477509, + "loss/reg": 5.5128544772742316e-05, + "step": 653 + }, + { + "epoch": 0.08175, + "grad_norm": 2.6265711784362793, + "grad_norm_var": 1.9563492188252354, + "learning_rate": 0.0001, + "loss": 1.1856, + "loss/crossentropy": 2.6735007762908936, + "loss/hidden": 1.0, + "loss/logits": 0.18507899343967438, + "loss/reg": 5.5116473959060386e-05, + "step": 654 + }, + { + "epoch": 0.081875, + "grad_norm": 3.466085433959961, + "grad_norm_var": 1.9652920541676069, + "learning_rate": 0.0001, + "loss": 1.5681, + "loss/crossentropy": 2.62325119972229, + "loss/hidden": 1.3515625, + "loss/logits": 0.21598073840141296, + "loss/reg": 5.510494884219952e-05, + "step": 655 + }, + { + "epoch": 0.082, + "grad_norm": 1.9770357608795166, + "grad_norm_var": 2.0064850603907494, + "learning_rate": 0.0001, + "loss": 1.2097, + "loss/crossentropy": 2.2057158946990967, + "loss/hidden": 1.0625, + "loss/logits": 0.14666706323623657, + "loss/reg": 5.509403126779944e-05, + "step": 656 + }, + { + "epoch": 0.082125, + "grad_norm": 2.3105995655059814, + "grad_norm_var": 2.024480408785202, + "learning_rate": 0.0001, + "loss": 1.2902, + "loss/crossentropy": 2.493821144104004, + "loss/hidden": 1.1015625, + "loss/logits": 0.1881340891122818, + "loss/reg": 5.5078246077755466e-05, + "step": 657 + }, + { + "epoch": 0.08225, + "grad_norm": 2.718162775039673, + "grad_norm_var": 0.15628703716138168, + "learning_rate": 0.0001, + "loss": 1.3117, + "loss/crossentropy": 2.4584882259368896, + "loss/hidden": 1.1171875, + "loss/logits": 0.19399182498455048, + "loss/reg": 5.506265733856708e-05, + "step": 658 + }, + { + "epoch": 0.082375, + "grad_norm": 3.373929977416992, + "grad_norm_var": 0.2024890656434612, + "learning_rate": 0.0001, + "loss": 1.657, + "loss/crossentropy": 2.3616418838500977, + "loss/hidden": 1.375, + "loss/logits": 0.2814520597457886, + "loss/reg": 5.505214721779339e-05, + "step": 659 + }, + { + "epoch": 0.0825, + "grad_norm": 3.0150086879730225, + "grad_norm_var": 0.20941952923840457, + "learning_rate": 0.0001, + "loss": 1.4792, + "loss/crossentropy": 2.703028440475464, + "loss/hidden": 1.2578125, + "loss/logits": 0.2208247035741806, + "loss/reg": 5.504006549017504e-05, + "step": 660 + }, + { + "epoch": 0.082625, + "grad_norm": 3.6257362365722656, + "grad_norm_var": 0.2803381345532055, + "learning_rate": 0.0001, + "loss": 1.3149, + "loss/crossentropy": 2.6234936714172363, + "loss/hidden": 1.1328125, + "loss/logits": 0.18155357241630554, + "loss/reg": 5.502764179254882e-05, + "step": 661 + }, + { + "epoch": 0.08275, + "grad_norm": 8.147649765014648, + "grad_norm_var": 2.179661731968463, + "learning_rate": 0.0001, + "loss": 1.4767, + "loss/crossentropy": 2.5337109565734863, + "loss/hidden": 1.296875, + "loss/logits": 0.17923393845558167, + "loss/reg": 5.501080886460841e-05, + "step": 662 + }, + { + "epoch": 0.082875, + "grad_norm": 2.901944398880005, + "grad_norm_var": 2.1163120327356095, + "learning_rate": 0.0001, + "loss": 1.2957, + "loss/crossentropy": 2.814661979675293, + "loss/hidden": 1.09375, + "loss/logits": 0.20143108069896698, + "loss/reg": 5.4996402468532324e-05, + "step": 663 + }, + { + "epoch": 0.083, + "grad_norm": 2.2565462589263916, + "grad_norm_var": 2.1342553181424, + "learning_rate": 0.0001, + "loss": 1.1747, + "loss/crossentropy": 2.4640450477600098, + "loss/hidden": 1.015625, + "loss/logits": 0.158490851521492, + "loss/reg": 5.4978750995360315e-05, + "step": 664 + }, + { + "epoch": 0.083125, + "grad_norm": 2.5376529693603516, + "grad_norm_var": 2.140947510021911, + "learning_rate": 0.0001, + "loss": 1.2111, + "loss/crossentropy": 2.433427095413208, + "loss/hidden": 1.0546875, + "loss/logits": 0.1558808833360672, + "loss/reg": 5.4963678849162534e-05, + "step": 665 + }, + { + "epoch": 0.08325, + "grad_norm": 2.327512741088867, + "grad_norm_var": 2.1092236468841206, + "learning_rate": 0.0001, + "loss": 1.371, + "loss/crossentropy": 2.400848627090454, + "loss/hidden": 1.171875, + "loss/logits": 0.19855274260044098, + "loss/reg": 5.494604556588456e-05, + "step": 666 + }, + { + "epoch": 0.083375, + "grad_norm": 2.6282968521118164, + "grad_norm_var": 2.1114211896703217, + "learning_rate": 0.0001, + "loss": 1.3032, + "loss/crossentropy": 2.4893503189086914, + "loss/hidden": 1.125, + "loss/logits": 0.17762264609336853, + "loss/reg": 5.492940545082092e-05, + "step": 667 + }, + { + "epoch": 0.0835, + "grad_norm": 15.134915351867676, + "grad_norm_var": 11.188339796523456, + "learning_rate": 0.0001, + "loss": 2.056, + "loss/crossentropy": 1.2935031652450562, + "loss/hidden": 1.9921875, + "loss/logits": 0.0632929727435112, + "loss/reg": 5.491078627528623e-05, + "step": 668 + }, + { + "epoch": 0.083625, + "grad_norm": 2.7197494506835938, + "grad_norm_var": 11.056175204029886, + "learning_rate": 0.0001, + "loss": 1.2911, + "loss/crossentropy": 2.4398157596588135, + "loss/hidden": 1.1171875, + "loss/logits": 0.17337752878665924, + "loss/reg": 5.489288014359772e-05, + "step": 669 + }, + { + "epoch": 0.08375, + "grad_norm": 4.744369029998779, + "grad_norm_var": 10.98807433162251, + "learning_rate": 0.0001, + "loss": 1.3603, + "loss/crossentropy": 2.813530683517456, + "loss/hidden": 1.1875, + "loss/logits": 0.1722554862499237, + "loss/reg": 5.487642920343205e-05, + "step": 670 + }, + { + "epoch": 0.083875, + "grad_norm": 2.2052688598632812, + "grad_norm_var": 11.1759775305449, + "learning_rate": 0.0001, + "loss": 1.2101, + "loss/crossentropy": 2.712545394897461, + "loss/hidden": 1.046875, + "loss/logits": 0.1627039760351181, + "loss/reg": 5.4856664064573124e-05, + "step": 671 + }, + { + "epoch": 0.084, + "grad_norm": 2.127044677734375, + "grad_norm_var": 11.138641886695007, + "learning_rate": 0.0001, + "loss": 1.1681, + "loss/crossentropy": 2.7978134155273438, + "loss/hidden": 1.0, + "loss/logits": 0.16759660840034485, + "loss/reg": 5.4841766541358083e-05, + "step": 672 + }, + { + "epoch": 0.084125, + "grad_norm": 2.57954478263855, + "grad_norm_var": 11.0853286400312, + "learning_rate": 0.0001, + "loss": 1.2535, + "loss/crossentropy": 2.8341479301452637, + "loss/hidden": 1.0859375, + "loss/logits": 0.16698572039604187, + "loss/reg": 5.482636333908886e-05, + "step": 673 + }, + { + "epoch": 0.08425, + "grad_norm": 14.73951244354248, + "grad_norm_var": 18.158630087103635, + "learning_rate": 0.0001, + "loss": 1.745, + "loss/crossentropy": 2.6920087337493896, + "loss/hidden": 1.5625, + "loss/logits": 0.18198290467262268, + "loss/reg": 5.481092011905275e-05, + "step": 674 + }, + { + "epoch": 0.084375, + "grad_norm": 2.964245319366455, + "grad_norm_var": 18.241094275290784, + "learning_rate": 0.0001, + "loss": 1.3768, + "loss/crossentropy": 2.4664485454559326, + "loss/hidden": 1.203125, + "loss/logits": 0.1731320172548294, + "loss/reg": 5.479659739648923e-05, + "step": 675 + }, + { + "epoch": 0.0845, + "grad_norm": 2.0473501682281494, + "grad_norm_var": 18.512621656660862, + "learning_rate": 0.0001, + "loss": 1.2978, + "loss/crossentropy": 2.4481780529022217, + "loss/hidden": 1.1171875, + "loss/logits": 0.18001943826675415, + "loss/reg": 5.47790368727874e-05, + "step": 676 + }, + { + "epoch": 0.084625, + "grad_norm": 2.1904749870300293, + "grad_norm_var": 18.82885777793808, + "learning_rate": 0.0001, + "loss": 1.3854, + "loss/crossentropy": 2.406938076019287, + "loss/hidden": 1.1484375, + "loss/logits": 0.23644110560417175, + "loss/reg": 5.476039950735867e-05, + "step": 677 + }, + { + "epoch": 0.08475, + "grad_norm": 2.6025397777557373, + "grad_norm_var": 18.06538886174356, + "learning_rate": 0.0001, + "loss": 1.1031, + "loss/crossentropy": 2.539059638977051, + "loss/hidden": 0.96484375, + "loss/logits": 0.13775205612182617, + "loss/reg": 5.474198769661598e-05, + "step": 678 + }, + { + "epoch": 0.084875, + "grad_norm": 3.0288338661193848, + "grad_norm_var": 18.044955230468542, + "learning_rate": 0.0001, + "loss": 1.44, + "loss/crossentropy": 2.4128031730651855, + "loss/hidden": 1.25, + "loss/logits": 0.18944835662841797, + "loss/reg": 5.472711563925259e-05, + "step": 679 + }, + { + "epoch": 0.085, + "grad_norm": 6.3264336585998535, + "grad_norm_var": 18.038003798487622, + "learning_rate": 0.0001, + "loss": 1.6715, + "loss/crossentropy": 2.146503210067749, + "loss/hidden": 1.4140625, + "loss/logits": 0.2568877935409546, + "loss/reg": 5.471197800943628e-05, + "step": 680 + }, + { + "epoch": 0.085125, + "grad_norm": 2.3084750175476074, + "grad_norm_var": 18.099156367515864, + "learning_rate": 0.0001, + "loss": 1.2205, + "loss/crossentropy": 2.5457520484924316, + "loss/hidden": 1.046875, + "loss/logits": 0.173065185546875, + "loss/reg": 5.469706593430601e-05, + "step": 681 + }, + { + "epoch": 0.08525, + "grad_norm": 2.4326412677764893, + "grad_norm_var": 18.07055624015896, + "learning_rate": 0.0001, + "loss": 1.3589, + "loss/crossentropy": 2.703038215637207, + "loss/hidden": 1.15625, + "loss/logits": 0.20215150713920593, + "loss/reg": 5.468199015012942e-05, + "step": 682 + }, + { + "epoch": 0.085375, + "grad_norm": 2.473970890045166, + "grad_norm_var": 18.10898905123375, + "learning_rate": 0.0001, + "loss": 1.1869, + "loss/crossentropy": 2.4774723052978516, + "loss/hidden": 1.0078125, + "loss/logits": 0.17857202887535095, + "loss/reg": 5.466764559969306e-05, + "step": 683 + }, + { + "epoch": 0.0855, + "grad_norm": 3.152622699737549, + "grad_norm_var": 9.95443167979618, + "learning_rate": 0.0001, + "loss": 1.3118, + "loss/crossentropy": 2.385828971862793, + "loss/hidden": 1.1328125, + "loss/logits": 0.17848467826843262, + "loss/reg": 5.465377034852281e-05, + "step": 684 + }, + { + "epoch": 0.085625, + "grad_norm": 3.110018014907837, + "grad_norm_var": 9.914754143381142, + "learning_rate": 0.0001, + "loss": 1.3367, + "loss/crossentropy": 2.453490972518921, + "loss/hidden": 1.15625, + "loss/logits": 0.17990395426750183, + "loss/reg": 5.463728302856907e-05, + "step": 685 + }, + { + "epoch": 0.08575, + "grad_norm": 2.377671003341675, + "grad_norm_var": 9.93198520749056, + "learning_rate": 0.0001, + "loss": 1.3087, + "loss/crossentropy": 2.7564713954925537, + "loss/hidden": 1.1171875, + "loss/logits": 0.1909516155719757, + "loss/reg": 5.4623284086119384e-05, + "step": 686 + }, + { + "epoch": 0.085875, + "grad_norm": 2.422139883041382, + "grad_norm_var": 9.896281345994009, + "learning_rate": 0.0001, + "loss": 1.3778, + "loss/crossentropy": 2.1622204780578613, + "loss/hidden": 1.1796875, + "loss/logits": 0.19756248593330383, + "loss/reg": 5.46108276466839e-05, + "step": 687 + }, + { + "epoch": 0.086, + "grad_norm": 2.239145517349243, + "grad_norm_var": 9.875720139459439, + "learning_rate": 0.0001, + "loss": 1.2286, + "loss/crossentropy": 2.7086918354034424, + "loss/hidden": 1.0546875, + "loss/logits": 0.17338880896568298, + "loss/reg": 5.45964103366714e-05, + "step": 688 + }, + { + "epoch": 0.086125, + "grad_norm": 2.24678111076355, + "grad_norm_var": 9.92624095879092, + "learning_rate": 0.0001, + "loss": 1.4267, + "loss/crossentropy": 2.4056406021118164, + "loss/hidden": 1.1875, + "loss/logits": 0.23862136900424957, + "loss/reg": 5.457905717776157e-05, + "step": 689 + }, + { + "epoch": 0.08625, + "grad_norm": 3.519033432006836, + "grad_norm_var": 1.0418889707029915, + "learning_rate": 0.0001, + "loss": 1.5521, + "loss/crossentropy": 2.2903122901916504, + "loss/hidden": 1.3515625, + "loss/logits": 0.19997593760490417, + "loss/reg": 5.456155486172065e-05, + "step": 690 + }, + { + "epoch": 0.086375, + "grad_norm": 2.253674268722534, + "grad_norm_var": 1.061688644486465, + "learning_rate": 0.0001, + "loss": 1.3563, + "loss/crossentropy": 2.185561418533325, + "loss/hidden": 1.15625, + "loss/logits": 0.1994805932044983, + "loss/reg": 5.454723577713594e-05, + "step": 691 + }, + { + "epoch": 0.0865, + "grad_norm": 2.2427175045013428, + "grad_norm_var": 1.0445794349169109, + "learning_rate": 0.0001, + "loss": 1.2501, + "loss/crossentropy": 2.4699208736419678, + "loss/hidden": 1.0859375, + "loss/logits": 0.16366711258888245, + "loss/reg": 5.452951154438779e-05, + "step": 692 + }, + { + "epoch": 0.086625, + "grad_norm": 2.7868127822875977, + "grad_norm_var": 1.0177092507570797, + "learning_rate": 0.0001, + "loss": 1.3042, + "loss/crossentropy": 2.8428211212158203, + "loss/hidden": 1.125, + "loss/logits": 0.17867949604988098, + "loss/reg": 5.450921526062302e-05, + "step": 693 + }, + { + "epoch": 0.08675, + "grad_norm": 2.493699789047241, + "grad_norm_var": 1.0219714012832202, + "learning_rate": 0.0001, + "loss": 1.4317, + "loss/crossentropy": 2.2749669551849365, + "loss/hidden": 1.2265625, + "loss/logits": 0.20456844568252563, + "loss/reg": 5.449183299788274e-05, + "step": 694 + }, + { + "epoch": 0.086875, + "grad_norm": 2.647190570831299, + "grad_norm_var": 1.0213851131010148, + "learning_rate": 0.0001, + "loss": 1.2026, + "loss/crossentropy": 2.4777019023895264, + "loss/hidden": 1.0390625, + "loss/logits": 0.16303668916225433, + "loss/reg": 5.447701914818026e-05, + "step": 695 + }, + { + "epoch": 0.087, + "grad_norm": 2.7858493328094482, + "grad_norm_var": 0.14699271023528884, + "learning_rate": 0.0001, + "loss": 1.6292, + "loss/crossentropy": 2.0768496990203857, + "loss/hidden": 1.3671875, + "loss/logits": 0.26145654916763306, + "loss/reg": 5.4463806009152904e-05, + "step": 696 + }, + { + "epoch": 0.087125, + "grad_norm": 2.994112968444824, + "grad_norm_var": 0.15033771969499318, + "learning_rate": 0.0001, + "loss": 1.3258, + "loss/crossentropy": 2.7410616874694824, + "loss/hidden": 1.125, + "loss/logits": 0.20025156438350677, + "loss/reg": 5.4450483730761334e-05, + "step": 697 + }, + { + "epoch": 0.08725, + "grad_norm": 5.041564464569092, + "grad_norm_var": 0.5049578494311615, + "learning_rate": 0.0001, + "loss": 1.3599, + "loss/crossentropy": 2.6277389526367188, + "loss/hidden": 1.171875, + "loss/logits": 0.1875256896018982, + "loss/reg": 5.443237751023844e-05, + "step": 698 + }, + { + "epoch": 0.087375, + "grad_norm": 2.892210006713867, + "grad_norm_var": 0.49775480774286673, + "learning_rate": 0.0001, + "loss": 1.5111, + "loss/crossentropy": 2.3542768955230713, + "loss/hidden": 1.28125, + "loss/logits": 0.22926336526870728, + "loss/reg": 5.441823668661527e-05, + "step": 699 + }, + { + "epoch": 0.0875, + "grad_norm": 2.094099283218384, + "grad_norm_var": 0.52159104183775, + "learning_rate": 0.0001, + "loss": 1.2078, + "loss/crossentropy": 2.344701051712036, + "loss/hidden": 1.0390625, + "loss/logits": 0.16822829842567444, + "loss/reg": 5.440011591417715e-05, + "step": 700 + }, + { + "epoch": 0.087625, + "grad_norm": 3.5102314949035645, + "grad_norm_var": 0.5503235995769403, + "learning_rate": 0.0001, + "loss": 1.5397, + "loss/crossentropy": 1.9948920011520386, + "loss/hidden": 1.3203125, + "loss/logits": 0.21885889768600464, + "loss/reg": 5.4386586270993575e-05, + "step": 701 + }, + { + "epoch": 0.08775, + "grad_norm": 2.873744249343872, + "grad_norm_var": 0.5388161553597188, + "learning_rate": 0.0001, + "loss": 1.3158, + "loss/crossentropy": 2.527198076248169, + "loss/hidden": 1.140625, + "loss/logits": 0.17459964752197266, + "loss/reg": 5.4375817853724584e-05, + "step": 702 + }, + { + "epoch": 0.087875, + "grad_norm": 2.0903828144073486, + "grad_norm_var": 0.5630812725031336, + "learning_rate": 0.0001, + "loss": 1.3679, + "loss/crossentropy": 2.401954412460327, + "loss/hidden": 1.1875, + "loss/logits": 0.17989476025104523, + "loss/reg": 5.4366686526918784e-05, + "step": 703 + }, + { + "epoch": 0.088, + "grad_norm": 2.791881799697876, + "grad_norm_var": 0.5412509567410152, + "learning_rate": 0.0001, + "loss": 1.4039, + "loss/crossentropy": 2.5304641723632812, + "loss/hidden": 1.1875, + "loss/logits": 0.21581397950649261, + "loss/reg": 5.434821287053637e-05, + "step": 704 + }, + { + "epoch": 0.088125, + "grad_norm": 2.373342514038086, + "grad_norm_var": 0.532427224823193, + "learning_rate": 0.0001, + "loss": 1.1941, + "loss/crossentropy": 2.376497268676758, + "loss/hidden": 1.03125, + "loss/logits": 0.16228163242340088, + "loss/reg": 5.4336887842509896e-05, + "step": 705 + }, + { + "epoch": 0.08825, + "grad_norm": 3.0839779376983643, + "grad_norm_var": 0.504688552634502, + "learning_rate": 0.0001, + "loss": 1.3383, + "loss/crossentropy": 2.65484881401062, + "loss/hidden": 1.140625, + "loss/logits": 0.1971454918384552, + "loss/reg": 5.43265778105706e-05, + "step": 706 + }, + { + "epoch": 0.088375, + "grad_norm": 2.2295589447021484, + "grad_norm_var": 0.5065127901642389, + "learning_rate": 0.0001, + "loss": 1.3799, + "loss/crossentropy": 2.480161666870117, + "loss/hidden": 1.171875, + "loss/logits": 0.20748105645179749, + "loss/reg": 5.4318323236657307e-05, + "step": 707 + }, + { + "epoch": 0.0885, + "grad_norm": 3.5498087406158447, + "grad_norm_var": 0.5147397082983834, + "learning_rate": 0.0001, + "loss": 1.5722, + "loss/crossentropy": 1.904441475868225, + "loss/hidden": 1.375, + "loss/logits": 0.19664257764816284, + "loss/reg": 5.4309970437316224e-05, + "step": 708 + }, + { + "epoch": 0.088625, + "grad_norm": 4.781530857086182, + "grad_norm_var": 0.7360025205661221, + "learning_rate": 0.0001, + "loss": 1.5755, + "loss/crossentropy": 2.7621335983276367, + "loss/hidden": 1.3046875, + "loss/logits": 0.27026501297950745, + "loss/reg": 5.429290467873216e-05, + "step": 709 + }, + { + "epoch": 0.08875, + "grad_norm": 2.2828309535980225, + "grad_norm_var": 0.7534264462205973, + "learning_rate": 0.0001, + "loss": 1.2681, + "loss/crossentropy": 2.4517247676849365, + "loss/hidden": 1.09375, + "loss/logits": 0.1738508641719818, + "loss/reg": 5.427583528216928e-05, + "step": 710 + }, + { + "epoch": 0.088875, + "grad_norm": 2.650787115097046, + "grad_norm_var": 0.7532573998370441, + "learning_rate": 0.0001, + "loss": 1.3073, + "loss/crossentropy": 2.8212263584136963, + "loss/hidden": 1.1171875, + "loss/logits": 0.18959403038024902, + "loss/reg": 5.4265576181933284e-05, + "step": 711 + }, + { + "epoch": 0.089, + "grad_norm": 2.8599400520324707, + "grad_norm_var": 0.7514689463424626, + "learning_rate": 0.0001, + "loss": 1.4851, + "loss/crossentropy": 2.485013246536255, + "loss/hidden": 1.2421875, + "loss/logits": 0.24233944714069366, + "loss/reg": 5.425211202236824e-05, + "step": 712 + }, + { + "epoch": 0.089125, + "grad_norm": 3.3207690715789795, + "grad_norm_var": 0.7576093299695834, + "learning_rate": 0.0001, + "loss": 1.4396, + "loss/crossentropy": 2.5560240745544434, + "loss/hidden": 1.2421875, + "loss/logits": 0.19684939086437225, + "loss/reg": 5.4234227718552575e-05, + "step": 713 + }, + { + "epoch": 0.08925, + "grad_norm": 2.121156692504883, + "grad_norm_var": 0.5060815970166267, + "learning_rate": 0.0001, + "loss": 1.2187, + "loss/crossentropy": 2.7032556533813477, + "loss/hidden": 1.046875, + "loss/logits": 0.1712983250617981, + "loss/reg": 5.4215375712374225e-05, + "step": 714 + }, + { + "epoch": 0.089375, + "grad_norm": 2.606315851211548, + "grad_norm_var": 0.509357702424964, + "learning_rate": 0.0001, + "loss": 1.3993, + "loss/crossentropy": 2.3998241424560547, + "loss/hidden": 1.1796875, + "loss/logits": 0.21911802887916565, + "loss/reg": 5.4198477300815284e-05, + "step": 715 + }, + { + "epoch": 0.0895, + "grad_norm": 2.922720193862915, + "grad_norm_var": 0.4713784636638735, + "learning_rate": 0.0001, + "loss": 1.4542, + "loss/crossentropy": 2.6829943656921387, + "loss/hidden": 1.234375, + "loss/logits": 0.21930286288261414, + "loss/reg": 5.4178999562282115e-05, + "step": 716 + }, + { + "epoch": 0.089625, + "grad_norm": 2.1098432540893555, + "grad_norm_var": 0.47590856989839947, + "learning_rate": 0.0001, + "loss": 1.233, + "loss/crossentropy": 2.6072769165039062, + "loss/hidden": 1.0625, + "loss/logits": 0.16997693479061127, + "loss/reg": 5.41587796760723e-05, + "step": 717 + }, + { + "epoch": 0.08975, + "grad_norm": 3.364995241165161, + "grad_norm_var": 0.4964416307179249, + "learning_rate": 0.0001, + "loss": 1.3734, + "loss/crossentropy": 2.735403060913086, + "loss/hidden": 1.1640625, + "loss/logits": 0.20879969000816345, + "loss/reg": 5.4137595725478604e-05, + "step": 718 + }, + { + "epoch": 0.089875, + "grad_norm": 2.06780743598938, + "grad_norm_var": 0.49867340108694785, + "learning_rate": 0.0001, + "loss": 1.3704, + "loss/crossentropy": 2.5375735759735107, + "loss/hidden": 1.15625, + "loss/logits": 0.21360768377780914, + "loss/reg": 5.411420715972781e-05, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 2.5839052200317383, + "grad_norm_var": 0.5021517785446536, + "learning_rate": 0.0001, + "loss": 1.201, + "loss/crossentropy": 2.7017717361450195, + "loss/hidden": 1.015625, + "loss/logits": 0.18482069671154022, + "loss/reg": 5.409633740782738e-05, + "step": 720 + }, + { + "epoch": 0.090125, + "grad_norm": 2.7051029205322266, + "grad_norm_var": 0.48985561320671017, + "learning_rate": 0.0001, + "loss": 1.3077, + "loss/crossentropy": 2.4120161533355713, + "loss/hidden": 1.1328125, + "loss/logits": 0.17430010437965393, + "loss/reg": 5.407804201240651e-05, + "step": 721 + }, + { + "epoch": 0.09025, + "grad_norm": 3.305095911026001, + "grad_norm_var": 0.5004710841579763, + "learning_rate": 0.0001, + "loss": 1.232, + "loss/crossentropy": 2.5663864612579346, + "loss/hidden": 1.046875, + "loss/logits": 0.18463225662708282, + "loss/reg": 5.405680713010952e-05, + "step": 722 + }, + { + "epoch": 0.090375, + "grad_norm": 2.5852112770080566, + "grad_norm_var": 0.47936361363680907, + "learning_rate": 0.0001, + "loss": 1.2172, + "loss/crossentropy": 2.4206771850585938, + "loss/hidden": 1.046875, + "loss/logits": 0.16979727149009705, + "loss/reg": 5.403965406003408e-05, + "step": 723 + }, + { + "epoch": 0.0905, + "grad_norm": 5.01464319229126, + "grad_norm_var": 0.7474939605767976, + "learning_rate": 0.0001, + "loss": 1.8636, + "loss/crossentropy": 2.4939754009246826, + "loss/hidden": 1.53125, + "loss/logits": 0.33179470896720886, + "loss/reg": 5.4016720241634175e-05, + "step": 724 + }, + { + "epoch": 0.090625, + "grad_norm": 2.617537021636963, + "grad_norm_var": 0.5132076404113444, + "learning_rate": 0.0001, + "loss": 1.4005, + "loss/crossentropy": 2.3498785495758057, + "loss/hidden": 1.1953125, + "loss/logits": 0.20461852848529816, + "loss/reg": 5.3998457588022575e-05, + "step": 725 + }, + { + "epoch": 0.09075, + "grad_norm": 2.8471455574035645, + "grad_norm_var": 0.49269947606489545, + "learning_rate": 0.0001, + "loss": 1.4197, + "loss/crossentropy": 2.580679178237915, + "loss/hidden": 1.2109375, + "loss/logits": 0.20819300413131714, + "loss/reg": 5.397558561526239e-05, + "step": 726 + }, + { + "epoch": 0.090875, + "grad_norm": 21.460710525512695, + "grad_norm_var": 22.0933953279752, + "learning_rate": 0.0001, + "loss": 1.8327, + "loss/crossentropy": 2.223879814147949, + "loss/hidden": 1.546875, + "loss/logits": 0.28528282046318054, + "loss/reg": 5.395858170231804e-05, + "step": 727 + }, + { + "epoch": 0.091, + "grad_norm": 2.9006693363189697, + "grad_norm_var": 22.08714053553672, + "learning_rate": 0.0001, + "loss": 1.5806, + "loss/crossentropy": 2.5028350353240967, + "loss/hidden": 1.3203125, + "loss/logits": 0.25977060198783875, + "loss/reg": 5.394085019361228e-05, + "step": 728 + }, + { + "epoch": 0.091125, + "grad_norm": 3.1578407287597656, + "grad_norm_var": 22.10427962795151, + "learning_rate": 0.0001, + "loss": 1.3831, + "loss/crossentropy": 2.9056754112243652, + "loss/hidden": 1.1796875, + "loss/logits": 0.2028241753578186, + "loss/reg": 5.392428647610359e-05, + "step": 729 + }, + { + "epoch": 0.09125, + "grad_norm": 2.310551643371582, + "grad_norm_var": 22.0584906663241, + "learning_rate": 0.0001, + "loss": 1.2775, + "loss/crossentropy": 2.683490753173828, + "loss/hidden": 1.1015625, + "loss/logits": 0.17539778351783752, + "loss/reg": 5.390584919950925e-05, + "step": 730 + }, + { + "epoch": 0.091375, + "grad_norm": 3.0188450813293457, + "grad_norm_var": 21.990543415264224, + "learning_rate": 0.0001, + "loss": 1.4888, + "loss/crossentropy": 2.2403600215911865, + "loss/hidden": 1.296875, + "loss/logits": 0.19139915704727173, + "loss/reg": 5.389019497670233e-05, + "step": 731 + }, + { + "epoch": 0.0915, + "grad_norm": 2.768749713897705, + "grad_norm_var": 22.015388964459888, + "learning_rate": 0.0001, + "loss": 1.3743, + "loss/crossentropy": 2.5775599479675293, + "loss/hidden": 1.1953125, + "loss/logits": 0.17840096354484558, + "loss/reg": 5.38753520231694e-05, + "step": 732 + }, + { + "epoch": 0.091625, + "grad_norm": 2.750666856765747, + "grad_norm_var": 21.87518218062818, + "learning_rate": 0.0001, + "loss": 1.2523, + "loss/crossentropy": 2.4601376056671143, + "loss/hidden": 1.09375, + "loss/logits": 0.15799552202224731, + "loss/reg": 5.3856590966461226e-05, + "step": 733 + }, + { + "epoch": 0.09175, + "grad_norm": 2.343596935272217, + "grad_norm_var": 22.03928719159194, + "learning_rate": 0.0001, + "loss": 1.395, + "loss/crossentropy": 2.3660390377044678, + "loss/hidden": 1.21875, + "loss/logits": 0.17570561170578003, + "loss/reg": 5.383471216191538e-05, + "step": 734 + }, + { + "epoch": 0.091875, + "grad_norm": 2.5348923206329346, + "grad_norm_var": 21.930884482184023, + "learning_rate": 0.0001, + "loss": 1.2834, + "loss/crossentropy": 2.6442513465881348, + "loss/hidden": 1.1015625, + "loss/logits": 0.18133598566055298, + "loss/reg": 5.381777373258956e-05, + "step": 735 + }, + { + "epoch": 0.092, + "grad_norm": 2.759902000427246, + "grad_norm_var": 21.89826244514976, + "learning_rate": 0.0001, + "loss": 1.461, + "loss/crossentropy": 2.3467421531677246, + "loss/hidden": 1.2578125, + "loss/logits": 0.20265132188796997, + "loss/reg": 5.3800951718585566e-05, + "step": 736 + }, + { + "epoch": 0.092125, + "grad_norm": 3.2894835472106934, + "grad_norm_var": 21.813446124750236, + "learning_rate": 0.0001, + "loss": 1.3914, + "loss/crossentropy": 2.2499518394470215, + "loss/hidden": 1.203125, + "loss/logits": 0.18771812319755554, + "loss/reg": 5.378201967687346e-05, + "step": 737 + }, + { + "epoch": 0.09225, + "grad_norm": 2.6464269161224365, + "grad_norm_var": 21.910731669963607, + "learning_rate": 0.0001, + "loss": 1.5077, + "loss/crossentropy": 2.420416831970215, + "loss/hidden": 1.28125, + "loss/logits": 0.2258935570716858, + "loss/reg": 5.376638000598177e-05, + "step": 738 + }, + { + "epoch": 0.092375, + "grad_norm": 2.278536796569824, + "grad_norm_var": 21.97703354471374, + "learning_rate": 0.0001, + "loss": 1.2114, + "loss/crossentropy": 2.5701427459716797, + "loss/hidden": 1.0390625, + "loss/logits": 0.1718250811100006, + "loss/reg": 5.3752868552692235e-05, + "step": 739 + }, + { + "epoch": 0.0925, + "grad_norm": 2.3287525177001953, + "grad_norm_var": 22.080218462231635, + "learning_rate": 0.0001, + "loss": 1.2548, + "loss/crossentropy": 2.382229804992676, + "loss/hidden": 1.1015625, + "loss/logits": 0.15274423360824585, + "loss/reg": 5.3735657274955884e-05, + "step": 740 + }, + { + "epoch": 0.092625, + "grad_norm": 2.697570562362671, + "grad_norm_var": 22.067190693445564, + "learning_rate": 0.0001, + "loss": 1.4246, + "loss/crossentropy": 2.500366687774658, + "loss/hidden": 1.234375, + "loss/logits": 0.18973666429519653, + "loss/reg": 5.372005034587346e-05, + "step": 741 + }, + { + "epoch": 0.09275, + "grad_norm": 2.773134469985962, + "grad_norm_var": 22.077734248370113, + "learning_rate": 0.0001, + "loss": 1.3596, + "loss/crossentropy": 2.606679916381836, + "loss/hidden": 1.1640625, + "loss/logits": 0.1950472891330719, + "loss/reg": 5.370312646846287e-05, + "step": 742 + }, + { + "epoch": 0.092875, + "grad_norm": 2.5400867462158203, + "grad_norm_var": 0.09096660622796264, + "learning_rate": 0.0001, + "loss": 1.3041, + "loss/crossentropy": 2.1913156509399414, + "loss/hidden": 1.125, + "loss/logits": 0.1785746067762375, + "loss/reg": 5.3685631428379565e-05, + "step": 743 + }, + { + "epoch": 0.093, + "grad_norm": 2.3412983417510986, + "grad_norm_var": 0.09508860759598577, + "learning_rate": 0.0001, + "loss": 1.4625, + "loss/crossentropy": 2.1983559131622314, + "loss/hidden": 1.2421875, + "loss/logits": 0.21976345777511597, + "loss/reg": 5.366921686800197e-05, + "step": 744 + }, + { + "epoch": 0.093125, + "grad_norm": 2.2313711643218994, + "grad_norm_var": 0.08708549521983074, + "learning_rate": 0.0001, + "loss": 1.4097, + "loss/crossentropy": 2.288938522338867, + "loss/hidden": 1.203125, + "loss/logits": 0.20603150129318237, + "loss/reg": 5.365387551137246e-05, + "step": 745 + }, + { + "epoch": 0.09325, + "grad_norm": 2.251176118850708, + "grad_norm_var": 0.08960418307721056, + "learning_rate": 0.0001, + "loss": 1.2055, + "loss/crossentropy": 2.539421796798706, + "loss/hidden": 1.046875, + "loss/logits": 0.15807852149009705, + "loss/reg": 5.3636584198102355e-05, + "step": 746 + }, + { + "epoch": 0.093375, + "grad_norm": 2.9313182830810547, + "grad_norm_var": 0.0851617748558122, + "learning_rate": 0.0001, + "loss": 1.4474, + "loss/crossentropy": 2.4747750759124756, + "loss/hidden": 1.25, + "loss/logits": 0.1968374252319336, + "loss/reg": 5.362145748222247e-05, + "step": 747 + }, + { + "epoch": 0.0935, + "grad_norm": 2.4297850131988525, + "grad_norm_var": 0.0843403592222972, + "learning_rate": 0.0001, + "loss": 1.1399, + "loss/crossentropy": 2.758875846862793, + "loss/hidden": 0.99609375, + "loss/logits": 0.14325925707817078, + "loss/reg": 5.360745853977278e-05, + "step": 748 + }, + { + "epoch": 0.093625, + "grad_norm": 4.064235687255859, + "grad_norm_var": 0.223736692323439, + "learning_rate": 0.0001, + "loss": 1.3673, + "loss/crossentropy": 2.235731840133667, + "loss/hidden": 1.171875, + "loss/logits": 0.19489188492298126, + "loss/reg": 5.359313581720926e-05, + "step": 749 + }, + { + "epoch": 0.09375, + "grad_norm": 2.2421138286590576, + "grad_norm_var": 0.22856148654111977, + "learning_rate": 0.0001, + "loss": 1.2226, + "loss/crossentropy": 2.5793533325195312, + "loss/hidden": 1.0546875, + "loss/logits": 0.16737329959869385, + "loss/reg": 5.35770996066276e-05, + "step": 750 + }, + { + "epoch": 0.093875, + "grad_norm": 2.058133602142334, + "grad_norm_var": 0.2498467671842178, + "learning_rate": 0.0001, + "loss": 1.1218, + "loss/crossentropy": 2.627385377883911, + "loss/hidden": 0.97265625, + "loss/logits": 0.14862656593322754, + "loss/reg": 5.355522807803936e-05, + "step": 751 + }, + { + "epoch": 0.094, + "grad_norm": 3.1811017990112305, + "grad_norm_var": 0.26899066622994783, + "learning_rate": 0.0001, + "loss": 1.2385, + "loss/crossentropy": 2.3661398887634277, + "loss/hidden": 1.09375, + "loss/logits": 0.14422640204429626, + "loss/reg": 5.3536150517174974e-05, + "step": 752 + }, + { + "epoch": 0.094125, + "grad_norm": 2.043060302734375, + "grad_norm_var": 0.25861380812167456, + "learning_rate": 0.0001, + "loss": 1.3259, + "loss/crossentropy": 2.3880302906036377, + "loss/hidden": 1.140625, + "loss/logits": 0.184707909822464, + "loss/reg": 5.35179533471819e-05, + "step": 753 + }, + { + "epoch": 0.09425, + "grad_norm": 16.624187469482422, + "grad_norm_var": 12.621702210829204, + "learning_rate": 0.0001, + "loss": 2.518, + "loss/crossentropy": 2.336055040359497, + "loss/hidden": 2.09375, + "loss/logits": 0.42366719245910645, + "loss/reg": 5.34973805770278e-05, + "step": 754 + }, + { + "epoch": 0.094375, + "grad_norm": 2.8955469131469727, + "grad_norm_var": 12.550068888672113, + "learning_rate": 0.0001, + "loss": 1.663, + "loss/crossentropy": 2.07586932182312, + "loss/hidden": 1.453125, + "loss/logits": 0.20937134325504303, + "loss/reg": 5.347675323719159e-05, + "step": 755 + }, + { + "epoch": 0.0945, + "grad_norm": 2.708143711090088, + "grad_norm_var": 12.500977569673056, + "learning_rate": 0.0001, + "loss": 1.3643, + "loss/crossentropy": 2.9162800312042236, + "loss/hidden": 1.140625, + "loss/logits": 0.22310392558574677, + "loss/reg": 5.345011595636606e-05, + "step": 756 + }, + { + "epoch": 0.094625, + "grad_norm": 2.578479051589966, + "grad_norm_var": 12.51461783628701, + "learning_rate": 0.0001, + "loss": 1.3496, + "loss/crossentropy": 2.7447614669799805, + "loss/hidden": 1.15625, + "loss/logits": 0.1928335428237915, + "loss/reg": 5.3432562708621845e-05, + "step": 757 + }, + { + "epoch": 0.09475, + "grad_norm": 2.6751816272735596, + "grad_norm_var": 12.524623447598344, + "learning_rate": 0.0001, + "loss": 1.2677, + "loss/crossentropy": 2.4300589561462402, + "loss/hidden": 1.0859375, + "loss/logits": 0.18121860921382904, + "loss/reg": 5.34126374986954e-05, + "step": 758 + }, + { + "epoch": 0.094875, + "grad_norm": 2.838064432144165, + "grad_norm_var": 12.492543668855477, + "learning_rate": 0.0001, + "loss": 1.3323, + "loss/crossentropy": 2.4878199100494385, + "loss/hidden": 1.1484375, + "loss/logits": 0.1833486557006836, + "loss/reg": 5.339576819096692e-05, + "step": 759 + }, + { + "epoch": 0.095, + "grad_norm": 1.874202013015747, + "grad_norm_var": 12.578705995031372, + "learning_rate": 0.0001, + "loss": 1.2521, + "loss/crossentropy": 2.6370134353637695, + "loss/hidden": 1.078125, + "loss/logits": 0.17343951761722565, + "loss/reg": 5.3375784773379564e-05, + "step": 760 + }, + { + "epoch": 0.095125, + "grad_norm": 2.471583843231201, + "grad_norm_var": 12.54242874137393, + "learning_rate": 0.0001, + "loss": 1.3698, + "loss/crossentropy": 2.6770365238189697, + "loss/hidden": 1.1953125, + "loss/logits": 0.1740024983882904, + "loss/reg": 5.3358369768830016e-05, + "step": 761 + }, + { + "epoch": 0.09525, + "grad_norm": 2.321331739425659, + "grad_norm_var": 12.531132909698341, + "learning_rate": 0.0001, + "loss": 1.3088, + "loss/crossentropy": 2.7517244815826416, + "loss/hidden": 1.1328125, + "loss/logits": 0.17546439170837402, + "loss/reg": 5.334003799362108e-05, + "step": 762 + }, + { + "epoch": 0.095375, + "grad_norm": 2.227292776107788, + "grad_norm_var": 12.615120618713426, + "learning_rate": 0.0001, + "loss": 1.1134, + "loss/crossentropy": 2.4684908390045166, + "loss/hidden": 0.9609375, + "loss/logits": 0.15190817415714264, + "loss/reg": 5.331678767106496e-05, + "step": 763 + }, + { + "epoch": 0.0955, + "grad_norm": 3.2276973724365234, + "grad_norm_var": 12.546157446449454, + "learning_rate": 0.0001, + "loss": 1.3265, + "loss/crossentropy": 2.317460775375366, + "loss/hidden": 1.1328125, + "loss/logits": 0.19315344095230103, + "loss/reg": 5.328991392161697e-05, + "step": 764 + }, + { + "epoch": 0.095625, + "grad_norm": 2.525618553161621, + "grad_norm_var": 12.578753225816566, + "learning_rate": 0.0001, + "loss": 1.2785, + "loss/crossentropy": 2.763885021209717, + "loss/hidden": 1.109375, + "loss/logits": 0.1685691773891449, + "loss/reg": 5.3272808145266026e-05, + "step": 765 + }, + { + "epoch": 0.09575, + "grad_norm": 2.8182990550994873, + "grad_norm_var": 12.510107821183945, + "learning_rate": 0.0001, + "loss": 1.4041, + "loss/crossentropy": 2.609581232070923, + "loss/hidden": 1.2109375, + "loss/logits": 0.1925983875989914, + "loss/reg": 5.325373786035925e-05, + "step": 766 + }, + { + "epoch": 0.095875, + "grad_norm": 3.1149539947509766, + "grad_norm_var": 12.384948285453223, + "learning_rate": 0.0001, + "loss": 1.4725, + "loss/crossentropy": 2.8335139751434326, + "loss/hidden": 1.2578125, + "loss/logits": 0.21420395374298096, + "loss/reg": 5.3225699957692996e-05, + "step": 767 + }, + { + "epoch": 0.096, + "grad_norm": 2.8965165615081787, + "grad_norm_var": 12.402406416217548, + "learning_rate": 0.0001, + "loss": 1.4859, + "loss/crossentropy": 2.5122146606445312, + "loss/hidden": 1.265625, + "loss/logits": 0.2197187840938568, + "loss/reg": 5.320890340954065e-05, + "step": 768 + }, + { + "epoch": 0.096125, + "grad_norm": 2.1168019771575928, + "grad_norm_var": 12.388519548771136, + "learning_rate": 0.0001, + "loss": 1.1191, + "loss/crossentropy": 2.4875917434692383, + "loss/hidden": 0.98046875, + "loss/logits": 0.13811561465263367, + "loss/reg": 5.319330739439465e-05, + "step": 769 + }, + { + "epoch": 0.09625, + "grad_norm": 3.6164140701293945, + "grad_norm_var": 0.1921279196482864, + "learning_rate": 0.0001, + "loss": 1.506, + "loss/crossentropy": 2.443065643310547, + "loss/hidden": 1.2734375, + "loss/logits": 0.23207004368305206, + "loss/reg": 5.3170962928561494e-05, + "step": 770 + }, + { + "epoch": 0.096375, + "grad_norm": 2.91166090965271, + "grad_norm_var": 0.19260374956815268, + "learning_rate": 0.0001, + "loss": 1.3272, + "loss/crossentropy": 2.334723472595215, + "loss/hidden": 1.1484375, + "loss/logits": 0.17818781733512878, + "loss/reg": 5.315555972629227e-05, + "step": 771 + }, + { + "epoch": 0.0965, + "grad_norm": 2.8536345958709717, + "grad_norm_var": 0.1944214633678918, + "learning_rate": 0.0001, + "loss": 1.356, + "loss/crossentropy": 2.735614538192749, + "loss/hidden": 1.15625, + "loss/logits": 0.19926324486732483, + "loss/reg": 5.313804649631493e-05, + "step": 772 + }, + { + "epoch": 0.096625, + "grad_norm": 2.7003872394561768, + "grad_norm_var": 0.19350943129851217, + "learning_rate": 0.0001, + "loss": 1.2365, + "loss/crossentropy": 2.695556879043579, + "loss/hidden": 1.078125, + "loss/logits": 0.15788228809833527, + "loss/reg": 5.3116473281988874e-05, + "step": 773 + }, + { + "epoch": 0.09675, + "grad_norm": 2.7257320880889893, + "grad_norm_var": 0.193506227128938, + "learning_rate": 0.0001, + "loss": 1.2951, + "loss/crossentropy": 2.7179250717163086, + "loss/hidden": 1.1171875, + "loss/logits": 0.17742162942886353, + "loss/reg": 5.30926845385693e-05, + "step": 774 + }, + { + "epoch": 0.096875, + "grad_norm": 2.576854944229126, + "grad_norm_var": 0.19304961436835294, + "learning_rate": 0.0001, + "loss": 1.3194, + "loss/crossentropy": 2.3818206787109375, + "loss/hidden": 1.140625, + "loss/logits": 0.17822806537151337, + "loss/reg": 5.3073516028234735e-05, + "step": 775 + }, + { + "epoch": 0.097, + "grad_norm": 2.4969394207000732, + "grad_norm_var": 0.1498668282970544, + "learning_rate": 0.0001, + "loss": 1.3081, + "loss/crossentropy": 2.0176634788513184, + "loss/hidden": 1.15625, + "loss/logits": 0.15133124589920044, + "loss/reg": 5.305654849507846e-05, + "step": 776 + }, + { + "epoch": 0.097125, + "grad_norm": 3.22451114654541, + "grad_norm_var": 0.15984673617916367, + "learning_rate": 0.0001, + "loss": 1.5053, + "loss/crossentropy": 2.820338010787964, + "loss/hidden": 1.25, + "loss/logits": 0.25477665662765503, + "loss/reg": 5.3042218496557325e-05, + "step": 777 + }, + { + "epoch": 0.09725, + "grad_norm": 24.615375518798828, + "grad_norm_var": 29.883750264758483, + "learning_rate": 0.0001, + "loss": 1.3336, + "loss/crossentropy": 1.9187242984771729, + "loss/hidden": 1.1484375, + "loss/logits": 0.1846262812614441, + "loss/reg": 5.302110002958216e-05, + "step": 778 + }, + { + "epoch": 0.097375, + "grad_norm": 2.6715848445892334, + "grad_norm_var": 29.781267578163263, + "learning_rate": 0.0001, + "loss": 1.4791, + "loss/crossentropy": 2.532146692276001, + "loss/hidden": 1.265625, + "loss/logits": 0.2129930853843689, + "loss/reg": 5.30023971805349e-05, + "step": 779 + }, + { + "epoch": 0.0975, + "grad_norm": 2.747297763824463, + "grad_norm_var": 29.85754231101701, + "learning_rate": 0.0001, + "loss": 1.3347, + "loss/crossentropy": 2.687166929244995, + "loss/hidden": 1.140625, + "loss/logits": 0.19358965754508972, + "loss/reg": 5.2986379159847274e-05, + "step": 780 + }, + { + "epoch": 0.097625, + "grad_norm": 2.6517791748046875, + "grad_norm_var": 29.83098919964196, + "learning_rate": 0.0001, + "loss": 1.4492, + "loss/crossentropy": 2.927424907684326, + "loss/hidden": 1.234375, + "loss/logits": 0.21428784728050232, + "loss/reg": 5.296709787216969e-05, + "step": 781 + }, + { + "epoch": 0.09775, + "grad_norm": 2.4107885360717773, + "grad_norm_var": 29.914876215687343, + "learning_rate": 0.0001, + "loss": 1.4283, + "loss/crossentropy": 2.3049545288085938, + "loss/hidden": 1.2265625, + "loss/logits": 0.20118646323680878, + "loss/reg": 5.2940118621336296e-05, + "step": 782 + }, + { + "epoch": 0.097875, + "grad_norm": 4.856544017791748, + "grad_norm_var": 29.865095133338084, + "learning_rate": 0.0001, + "loss": 2.0985, + "loss/crossentropy": 2.8381922245025635, + "loss/hidden": 1.65625, + "loss/logits": 0.4416726231575012, + "loss/reg": 5.291615889291279e-05, + "step": 783 + }, + { + "epoch": 0.098, + "grad_norm": 3.376915216445923, + "grad_norm_var": 29.792532646292965, + "learning_rate": 0.0001, + "loss": 1.4062, + "loss/crossentropy": 3.5869369506835938, + "loss/hidden": 1.234375, + "loss/logits": 0.17128312587738037, + "loss/reg": 5.289655382512137e-05, + "step": 784 + }, + { + "epoch": 0.098125, + "grad_norm": 2.383514881134033, + "grad_norm_var": 29.719888845997076, + "learning_rate": 0.0001, + "loss": 1.1903, + "loss/crossentropy": 2.9117846488952637, + "loss/hidden": 1.03125, + "loss/logits": 0.15850681066513062, + "loss/reg": 5.287636668072082e-05, + "step": 785 + }, + { + "epoch": 0.09825, + "grad_norm": 2.088331937789917, + "grad_norm_var": 30.00535910434075, + "learning_rate": 0.0001, + "loss": 1.2191, + "loss/crossentropy": 2.4488675594329834, + "loss/hidden": 1.0546875, + "loss/logits": 0.1639135628938675, + "loss/reg": 5.285735096549615e-05, + "step": 786 + }, + { + "epoch": 0.098375, + "grad_norm": 3.4909255504608154, + "grad_norm_var": 29.926382197605413, + "learning_rate": 0.0001, + "loss": 1.7858, + "loss/crossentropy": 2.534886121749878, + "loss/hidden": 1.484375, + "loss/logits": 0.3009305000305176, + "loss/reg": 5.283685095491819e-05, + "step": 787 + }, + { + "epoch": 0.0985, + "grad_norm": 2.4427733421325684, + "grad_norm_var": 30.01298634962116, + "learning_rate": 0.0001, + "loss": 1.1909, + "loss/crossentropy": 2.2383270263671875, + "loss/hidden": 1.0390625, + "loss/logits": 0.1513344794511795, + "loss/reg": 5.281960329739377e-05, + "step": 788 + }, + { + "epoch": 0.098625, + "grad_norm": 2.3890647888183594, + "grad_norm_var": 30.08196756498999, + "learning_rate": 0.0001, + "loss": 1.3743, + "loss/crossentropy": 2.4775261878967285, + "loss/hidden": 1.171875, + "loss/logits": 0.20193596184253693, + "loss/reg": 5.280092591419816e-05, + "step": 789 + }, + { + "epoch": 0.09875, + "grad_norm": 5.748532295227051, + "grad_norm_var": 30.06014752680334, + "learning_rate": 0.0001, + "loss": 1.9235, + "loss/crossentropy": 2.484543561935425, + "loss/hidden": 1.6640625, + "loss/logits": 0.2588757276535034, + "loss/reg": 5.27824777236674e-05, + "step": 790 + }, + { + "epoch": 0.098875, + "grad_norm": 2.519845485687256, + "grad_norm_var": 30.074100413727034, + "learning_rate": 0.0001, + "loss": 1.3943, + "loss/crossentropy": 2.480121374130249, + "loss/hidden": 1.203125, + "loss/logits": 0.19063332676887512, + "loss/reg": 5.276537558529526e-05, + "step": 791 + }, + { + "epoch": 0.099, + "grad_norm": 3.150681495666504, + "grad_norm_var": 29.936484287726426, + "learning_rate": 0.0001, + "loss": 1.7175, + "loss/crossentropy": 2.2747299671173096, + "loss/hidden": 1.4921875, + "loss/logits": 0.22481489181518555, + "loss/reg": 5.27442607562989e-05, + "step": 792 + }, + { + "epoch": 0.099125, + "grad_norm": 3.151745557785034, + "grad_norm_var": 29.948443330167883, + "learning_rate": 0.0001, + "loss": 1.5235, + "loss/crossentropy": 2.7657337188720703, + "loss/hidden": 1.2734375, + "loss/logits": 0.24951426684856415, + "loss/reg": 5.272776979836635e-05, + "step": 793 + }, + { + "epoch": 0.09925, + "grad_norm": 2.513519525527954, + "grad_norm_var": 0.960682649799162, + "learning_rate": 0.0001, + "loss": 1.3274, + "loss/crossentropy": 2.569284200668335, + "loss/hidden": 1.1484375, + "loss/logits": 0.17844460904598236, + "loss/reg": 5.271006011753343e-05, + "step": 794 + }, + { + "epoch": 0.099375, + "grad_norm": 2.5686886310577393, + "grad_norm_var": 0.966359269696044, + "learning_rate": 0.0001, + "loss": 1.3463, + "loss/crossentropy": 2.590020179748535, + "loss/hidden": 1.140625, + "loss/logits": 0.20516559481620789, + "loss/reg": 5.269322718959302e-05, + "step": 795 + }, + { + "epoch": 0.0995, + "grad_norm": 3.2609806060791016, + "grad_norm_var": 0.9634417109839205, + "learning_rate": 0.0001, + "loss": 2.0341, + "loss/crossentropy": 2.2520804405212402, + "loss/hidden": 1.5625, + "loss/logits": 0.4710923433303833, + "loss/reg": 5.267909000394866e-05, + "step": 796 + }, + { + "epoch": 0.099625, + "grad_norm": 4.763014793395996, + "grad_norm_var": 1.1263253492342187, + "learning_rate": 0.0001, + "loss": 1.9775, + "loss/crossentropy": 2.288787364959717, + "loss/hidden": 1.640625, + "loss/logits": 0.33635812997817993, + "loss/reg": 5.266653897706419e-05, + "step": 797 + }, + { + "epoch": 0.09975, + "grad_norm": 3.378089189529419, + "grad_norm_var": 1.0836956421815953, + "learning_rate": 0.0001, + "loss": 1.2208, + "loss/crossentropy": 2.697234869003296, + "loss/hidden": 1.0546875, + "loss/logits": 0.16561515629291534, + "loss/reg": 5.264836363494396e-05, + "step": 798 + }, + { + "epoch": 0.099875, + "grad_norm": 2.9043984413146973, + "grad_norm_var": 0.905067080343201, + "learning_rate": 0.0001, + "loss": 1.3825, + "loss/crossentropy": 2.6060903072357178, + "loss/hidden": 1.171875, + "loss/logits": 0.21010896563529968, + "loss/reg": 5.2631796279456466e-05, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 4.528866291046143, + "grad_norm_var": 1.0254388138747639, + "learning_rate": 0.0001, + "loss": 1.598, + "loss/crossentropy": 2.841477632522583, + "loss/hidden": 1.359375, + "loss/logits": 0.23809757828712463, + "loss/reg": 5.261685873847455e-05, + "step": 800 + }, + { + "epoch": 0.100125, + "grad_norm": 3.34147047996521, + "grad_norm_var": 0.9778438459070872, + "learning_rate": 0.0001, + "loss": 1.7202, + "loss/crossentropy": 2.2488768100738525, + "loss/hidden": 1.4453125, + "loss/logits": 0.2743300199508667, + "loss/reg": 5.2600626077037305e-05, + "step": 801 + }, + { + "epoch": 0.10025, + "grad_norm": 2.4430441856384277, + "grad_norm_var": 0.9300544238136648, + "learning_rate": 0.0001, + "loss": 1.2715, + "loss/crossentropy": 3.037824869155884, + "loss/hidden": 1.09375, + "loss/logits": 0.17723414301872253, + "loss/reg": 5.258737655822188e-05, + "step": 802 + }, + { + "epoch": 0.100375, + "grad_norm": 2.2626993656158447, + "grad_norm_var": 0.9909798492162054, + "learning_rate": 0.0001, + "loss": 1.5159, + "loss/crossentropy": 2.764657497406006, + "loss/hidden": 1.28125, + "loss/logits": 0.23410022258758545, + "loss/reg": 5.2564399084076285e-05, + "step": 803 + }, + { + "epoch": 0.1005, + "grad_norm": 2.310553789138794, + "grad_norm_var": 1.005606293107261, + "learning_rate": 0.0001, + "loss": 1.3199, + "loss/crossentropy": 2.452549457550049, + "loss/hidden": 1.140625, + "loss/logits": 0.1787756383419037, + "loss/reg": 5.254061034065671e-05, + "step": 804 + }, + { + "epoch": 0.100625, + "grad_norm": 2.7235846519470215, + "grad_norm_var": 0.9763322945012208, + "learning_rate": 0.0001, + "loss": 1.3767, + "loss/crossentropy": 2.6927273273468018, + "loss/hidden": 1.1875, + "loss/logits": 0.18868786096572876, + "loss/reg": 5.252664050203748e-05, + "step": 805 + }, + { + "epoch": 0.10075, + "grad_norm": 3.140831232070923, + "grad_norm_var": 0.5232650102161784, + "learning_rate": 0.0001, + "loss": 1.278, + "loss/crossentropy": 2.5507137775421143, + "loss/hidden": 1.1171875, + "loss/logits": 0.16028936207294464, + "loss/reg": 5.2507621148834005e-05, + "step": 806 + }, + { + "epoch": 0.100875, + "grad_norm": 2.3224105834960938, + "grad_norm_var": 0.5399239876549131, + "learning_rate": 0.0001, + "loss": 1.4063, + "loss/crossentropy": 2.2676374912261963, + "loss/hidden": 1.21875, + "loss/logits": 0.18703754246234894, + "loss/reg": 5.249512832961045e-05, + "step": 807 + }, + { + "epoch": 0.101, + "grad_norm": 2.5600266456604004, + "grad_norm_var": 0.5536251437135727, + "learning_rate": 0.0001, + "loss": 1.2854, + "loss/crossentropy": 2.1395981311798096, + "loss/hidden": 1.1171875, + "loss/logits": 0.1677340865135193, + "loss/reg": 5.2473347750492394e-05, + "step": 808 + }, + { + "epoch": 0.101125, + "grad_norm": 3.5100274085998535, + "grad_norm_var": 0.5683777537285927, + "learning_rate": 0.0001, + "loss": 1.3423, + "loss/crossentropy": 2.3729093074798584, + "loss/hidden": 1.15625, + "loss/logits": 0.18547970056533813, + "loss/reg": 5.2468130888883024e-05, + "step": 809 + }, + { + "epoch": 0.10125, + "grad_norm": 2.435258626937866, + "grad_norm_var": 0.5741839625022178, + "learning_rate": 0.0001, + "loss": 1.284, + "loss/crossentropy": 2.6526637077331543, + "loss/hidden": 1.109375, + "loss/logits": 0.17410364747047424, + "loss/reg": 5.2465042244875804e-05, + "step": 810 + }, + { + "epoch": 0.101375, + "grad_norm": 2.5132720470428467, + "grad_norm_var": 0.5777724408661865, + "learning_rate": 0.0001, + "loss": 1.1777, + "loss/crossentropy": 2.680278778076172, + "loss/hidden": 1.015625, + "loss/logits": 0.16152815520763397, + "loss/reg": 5.245738066150807e-05, + "step": 811 + }, + { + "epoch": 0.1015, + "grad_norm": 2.755100965499878, + "grad_norm_var": 0.5778438371123987, + "learning_rate": 0.0001, + "loss": 1.3688, + "loss/crossentropy": 2.2540314197540283, + "loss/hidden": 1.171875, + "loss/logits": 0.19639912247657776, + "loss/reg": 5.2445950132096186e-05, + "step": 812 + }, + { + "epoch": 0.101625, + "grad_norm": 3.0264639854431152, + "grad_norm_var": 0.35655723794493405, + "learning_rate": 0.0001, + "loss": 1.4901, + "loss/crossentropy": 2.477457284927368, + "loss/hidden": 1.28125, + "loss/logits": 0.20836040377616882, + "loss/reg": 5.24301067343913e-05, + "step": 813 + }, + { + "epoch": 0.10175, + "grad_norm": 2.25610089302063, + "grad_norm_var": 0.3614339888761369, + "learning_rate": 0.0001, + "loss": 1.2316, + "loss/crossentropy": 2.53898549079895, + "loss/hidden": 1.0546875, + "loss/logits": 0.17634719610214233, + "loss/reg": 5.240976679488085e-05, + "step": 814 + }, + { + "epoch": 0.101875, + "grad_norm": 2.8859362602233887, + "grad_norm_var": 0.3612343205244988, + "learning_rate": 0.0001, + "loss": 1.2416, + "loss/crossentropy": 2.5830461978912354, + "loss/hidden": 1.0859375, + "loss/logits": 0.1551593840122223, + "loss/reg": 5.2394090744201094e-05, + "step": 815 + }, + { + "epoch": 0.102, + "grad_norm": 2.317923069000244, + "grad_norm_var": 0.16106769833787193, + "learning_rate": 0.0001, + "loss": 1.3219, + "loss/crossentropy": 2.636214256286621, + "loss/hidden": 1.1484375, + "loss/logits": 0.17291927337646484, + "loss/reg": 5.2378440159372985e-05, + "step": 816 + }, + { + "epoch": 0.102125, + "grad_norm": 2.4304862022399902, + "grad_norm_var": 0.132019131991211, + "learning_rate": 0.0001, + "loss": 1.2323, + "loss/crossentropy": 2.396676778793335, + "loss/hidden": 1.0546875, + "loss/logits": 0.17712949216365814, + "loss/reg": 5.236340803094208e-05, + "step": 817 + }, + { + "epoch": 0.10225, + "grad_norm": 3.4825210571289062, + "grad_norm_var": 0.17525325841580727, + "learning_rate": 0.0001, + "loss": 1.394, + "loss/crossentropy": 2.8964662551879883, + "loss/hidden": 1.203125, + "loss/logits": 0.1903287172317505, + "loss/reg": 5.2351682825246826e-05, + "step": 818 + }, + { + "epoch": 0.102375, + "grad_norm": 3.8344733715057373, + "grad_norm_var": 0.24150743745623965, + "learning_rate": 0.0001, + "loss": 1.4858, + "loss/crossentropy": 2.266521453857422, + "loss/hidden": 1.2578125, + "loss/logits": 0.22744080424308777, + "loss/reg": 5.2338960813358426e-05, + "step": 819 + }, + { + "epoch": 0.1025, + "grad_norm": 16.969898223876953, + "grad_norm_var": 12.751910852860423, + "learning_rate": 0.0001, + "loss": 1.3032, + "loss/crossentropy": 2.4637362957000732, + "loss/hidden": 1.125, + "loss/logits": 0.17766262590885162, + "loss/reg": 5.232635885477066e-05, + "step": 820 + }, + { + "epoch": 0.102625, + "grad_norm": 2.360180139541626, + "grad_norm_var": 12.807367879393507, + "learning_rate": 0.0001, + "loss": 1.1473, + "loss/crossentropy": 2.466048240661621, + "loss/hidden": 1.0, + "loss/logits": 0.14673739671707153, + "loss/reg": 5.230958413449116e-05, + "step": 821 + }, + { + "epoch": 0.10275, + "grad_norm": 2.3272242546081543, + "grad_norm_var": 12.906693448577307, + "learning_rate": 0.0001, + "loss": 1.2033, + "loss/crossentropy": 2.551889419555664, + "loss/hidden": 1.0390625, + "loss/logits": 0.1636783480644226, + "loss/reg": 5.229478847468272e-05, + "step": 822 + }, + { + "epoch": 0.102875, + "grad_norm": 2.406409502029419, + "grad_norm_var": 12.892554510856643, + "learning_rate": 0.0001, + "loss": 1.278, + "loss/crossentropy": 2.602328062057495, + "loss/hidden": 1.09375, + "loss/logits": 0.18373973667621613, + "loss/reg": 5.227828660281375e-05, + "step": 823 + }, + { + "epoch": 0.103, + "grad_norm": 2.1223392486572266, + "grad_norm_var": 12.966937776264512, + "learning_rate": 0.0001, + "loss": 1.2087, + "loss/crossentropy": 2.498873472213745, + "loss/hidden": 1.046875, + "loss/logits": 0.16126872599124908, + "loss/reg": 5.226361099630594e-05, + "step": 824 + }, + { + "epoch": 0.103125, + "grad_norm": 3.208017110824585, + "grad_norm_var": 12.976346036172204, + "learning_rate": 0.0001, + "loss": 1.6108, + "loss/crossentropy": 2.687594175338745, + "loss/hidden": 1.390625, + "loss/logits": 0.21968355774879456, + "loss/reg": 5.2248811698518693e-05, + "step": 825 + }, + { + "epoch": 0.10325, + "grad_norm": 5.490570545196533, + "grad_norm_var": 13.092126380129832, + "learning_rate": 0.0001, + "loss": 1.7116, + "loss/crossentropy": 2.1563971042633057, + "loss/hidden": 1.453125, + "loss/logits": 0.257944792509079, + "loss/reg": 5.22348600497935e-05, + "step": 826 + }, + { + "epoch": 0.103375, + "grad_norm": 2.088407278060913, + "grad_norm_var": 13.174837105670752, + "learning_rate": 0.0001, + "loss": 1.1401, + "loss/crossentropy": 2.368215560913086, + "loss/hidden": 0.99609375, + "loss/logits": 0.1434709131717682, + "loss/reg": 5.222256004344672e-05, + "step": 827 + }, + { + "epoch": 0.1035, + "grad_norm": 2.4680469036102295, + "grad_norm_var": 13.217974973219603, + "learning_rate": 0.0001, + "loss": 1.4306, + "loss/crossentropy": 2.192209482192993, + "loss/hidden": 1.234375, + "loss/logits": 0.19567325711250305, + "loss/reg": 5.220507227932103e-05, + "step": 828 + }, + { + "epoch": 0.103625, + "grad_norm": 2.515068531036377, + "grad_norm_var": 13.282270337982398, + "learning_rate": 0.0001, + "loss": 1.1004, + "loss/crossentropy": 2.653474807739258, + "loss/hidden": 0.96484375, + "loss/logits": 0.13500887155532837, + "loss/reg": 5.2188064728397876e-05, + "step": 829 + }, + { + "epoch": 0.10375, + "grad_norm": 2.164379358291626, + "grad_norm_var": 13.30042653920423, + "learning_rate": 0.0001, + "loss": 1.1057, + "loss/crossentropy": 2.469616651535034, + "loss/hidden": 0.97265625, + "loss/logits": 0.13253287971019745, + "loss/reg": 5.217095895204693e-05, + "step": 830 + }, + { + "epoch": 0.103875, + "grad_norm": 2.072486400604248, + "grad_norm_var": 13.429207683172454, + "learning_rate": 0.0001, + "loss": 1.2587, + "loss/crossentropy": 2.185154914855957, + "loss/hidden": 1.1015625, + "loss/logits": 0.15661650896072388, + "loss/reg": 5.2154366130707785e-05, + "step": 831 + }, + { + "epoch": 0.104, + "grad_norm": 2.822169542312622, + "grad_norm_var": 13.356134748586637, + "learning_rate": 0.0001, + "loss": 1.4411, + "loss/crossentropy": 2.312222719192505, + "loss/hidden": 1.25, + "loss/logits": 0.19062718749046326, + "loss/reg": 5.213710755924694e-05, + "step": 832 + }, + { + "epoch": 0.104125, + "grad_norm": 3.2380270957946777, + "grad_norm_var": 13.263144115005607, + "learning_rate": 0.0001, + "loss": 1.3954, + "loss/crossentropy": 2.565952777862549, + "loss/hidden": 1.1875, + "loss/logits": 0.20734888315200806, + "loss/reg": 5.212176256463863e-05, + "step": 833 + }, + { + "epoch": 0.10425, + "grad_norm": 2.9629056453704834, + "grad_norm_var": 13.29668960799979, + "learning_rate": 0.0001, + "loss": 1.3472, + "loss/crossentropy": 2.4239981174468994, + "loss/hidden": 1.1875, + "loss/logits": 0.15917940437793732, + "loss/reg": 5.2105944632785395e-05, + "step": 834 + }, + { + "epoch": 0.104375, + "grad_norm": 2.65922474861145, + "grad_norm_var": 13.360480084554695, + "learning_rate": 0.0001, + "loss": 1.4703, + "loss/crossentropy": 2.5808305740356445, + "loss/hidden": 1.2265625, + "loss/logits": 0.243194580078125, + "loss/reg": 5.209133814787492e-05, + "step": 835 + }, + { + "epoch": 0.1045, + "grad_norm": 2.3345870971679688, + "grad_norm_var": 0.691400615292184, + "learning_rate": 0.0001, + "loss": 1.3773, + "loss/crossentropy": 2.2131404876708984, + "loss/hidden": 1.171875, + "loss/logits": 0.20489053428173065, + "loss/reg": 5.207399954088032e-05, + "step": 836 + }, + { + "epoch": 0.104625, + "grad_norm": 2.3921146392822266, + "grad_norm_var": 0.6900067668765199, + "learning_rate": 0.0001, + "loss": 1.1512, + "loss/crossentropy": 3.2045135498046875, + "loss/hidden": 0.99609375, + "loss/logits": 0.15453840792179108, + "loss/reg": 5.205388879403472e-05, + "step": 837 + }, + { + "epoch": 0.10475, + "grad_norm": 2.016071319580078, + "grad_norm_var": 0.7117097796198171, + "learning_rate": 0.0001, + "loss": 1.3153, + "loss/crossentropy": 2.6639490127563477, + "loss/hidden": 1.1328125, + "loss/logits": 0.18196584284305573, + "loss/reg": 5.2032042731298134e-05, + "step": 838 + }, + { + "epoch": 0.104875, + "grad_norm": 2.5177884101867676, + "grad_norm_var": 0.7083471286799463, + "learning_rate": 0.0001, + "loss": 1.3959, + "loss/crossentropy": 2.569981336593628, + "loss/hidden": 1.1953125, + "loss/logits": 0.20005394518375397, + "loss/reg": 5.200940722716041e-05, + "step": 839 + }, + { + "epoch": 0.105, + "grad_norm": 1.9999221563339233, + "grad_norm_var": 0.7185821198972163, + "learning_rate": 0.0001, + "loss": 1.1947, + "loss/crossentropy": 2.6757583618164062, + "loss/hidden": 1.0234375, + "loss/logits": 0.17078331112861633, + "loss/reg": 5.199360748520121e-05, + "step": 840 + }, + { + "epoch": 0.105125, + "grad_norm": 2.270017147064209, + "grad_norm_var": 0.708080528199305, + "learning_rate": 0.0001, + "loss": 1.2045, + "loss/crossentropy": 2.5654280185699463, + "loss/hidden": 1.0390625, + "loss/logits": 0.16487887501716614, + "loss/reg": 5.197878272156231e-05, + "step": 841 + }, + { + "epoch": 0.10525, + "grad_norm": 2.2202858924865723, + "grad_norm_var": 0.12732683712733267, + "learning_rate": 0.0001, + "loss": 1.3008, + "loss/crossentropy": 2.4787936210632324, + "loss/hidden": 1.109375, + "loss/logits": 0.19092029333114624, + "loss/reg": 5.196038546273485e-05, + "step": 842 + }, + { + "epoch": 0.105375, + "grad_norm": 2.4694182872772217, + "grad_norm_var": 0.11948625558178154, + "learning_rate": 0.0001, + "loss": 1.2657, + "loss/crossentropy": 2.716890335083008, + "loss/hidden": 1.09375, + "loss/logits": 0.17146353423595428, + "loss/reg": 5.1946241001132876e-05, + "step": 843 + }, + { + "epoch": 0.1055, + "grad_norm": 2.3803436756134033, + "grad_norm_var": 0.11969932832842947, + "learning_rate": 0.0001, + "loss": 1.1049, + "loss/crossentropy": 2.3049373626708984, + "loss/hidden": 0.9609375, + "loss/logits": 0.1434895098209381, + "loss/reg": 5.193064862396568e-05, + "step": 844 + }, + { + "epoch": 0.105625, + "grad_norm": 2.839191198348999, + "grad_norm_var": 0.1295235040782898, + "learning_rate": 0.0001, + "loss": 1.3633, + "loss/crossentropy": 2.3535118103027344, + "loss/hidden": 1.171875, + "loss/logits": 0.1908886432647705, + "loss/reg": 5.1912767958128825e-05, + "step": 845 + }, + { + "epoch": 0.10575, + "grad_norm": 2.440533399581909, + "grad_norm_var": 0.12340736502353161, + "learning_rate": 0.0001, + "loss": 1.3406, + "loss/crossentropy": 2.587473154067993, + "loss/hidden": 1.125, + "loss/logits": 0.21512514352798462, + "loss/reg": 5.18912602274213e-05, + "step": 846 + }, + { + "epoch": 0.105875, + "grad_norm": 2.274799108505249, + "grad_norm_var": 0.11504854753899843, + "learning_rate": 0.0001, + "loss": 1.3655, + "loss/crossentropy": 2.5222935676574707, + "loss/hidden": 1.171875, + "loss/logits": 0.19314169883728027, + "loss/reg": 5.187144415685907e-05, + "step": 847 + }, + { + "epoch": 0.106, + "grad_norm": 3.504936456680298, + "grad_norm_var": 0.17443826044681046, + "learning_rate": 0.0001, + "loss": 1.5789, + "loss/crossentropy": 2.4715349674224854, + "loss/hidden": 1.3359375, + "loss/logits": 0.24247828125953674, + "loss/reg": 5.1850674935849383e-05, + "step": 848 + }, + { + "epoch": 0.106125, + "grad_norm": 2.178030490875244, + "grad_norm_var": 0.14495010255316926, + "learning_rate": 0.0001, + "loss": 1.1547, + "loss/crossentropy": 2.576503038406372, + "loss/hidden": 1.0078125, + "loss/logits": 0.1463319957256317, + "loss/reg": 5.182998575037345e-05, + "step": 849 + }, + { + "epoch": 0.10625, + "grad_norm": 4.153827667236328, + "grad_norm_var": 0.3124556252586463, + "learning_rate": 0.0001, + "loss": 1.5262, + "loss/crossentropy": 2.7432754039764404, + "loss/hidden": 1.28125, + "loss/logits": 0.24444353580474854, + "loss/reg": 5.180889638722874e-05, + "step": 850 + }, + { + "epoch": 0.106375, + "grad_norm": 6.0633087158203125, + "grad_norm_var": 1.09049118560782, + "learning_rate": 0.0001, + "loss": 1.7417, + "loss/crossentropy": 3.1306426525115967, + "loss/hidden": 1.484375, + "loss/logits": 0.25679004192352295, + "loss/reg": 5.178620995138772e-05, + "step": 851 + }, + { + "epoch": 0.1065, + "grad_norm": 2.4886646270751953, + "grad_norm_var": 1.0833699781585693, + "learning_rate": 0.0001, + "loss": 1.265, + "loss/crossentropy": 2.5716023445129395, + "loss/hidden": 1.078125, + "loss/logits": 0.1863655000925064, + "loss/reg": 5.177418643143028e-05, + "step": 852 + }, + { + "epoch": 0.106625, + "grad_norm": 4.484250068664551, + "grad_norm_var": 1.2534535582414965, + "learning_rate": 0.0001, + "loss": 1.9001, + "loss/crossentropy": 3.1852643489837646, + "loss/hidden": 1.515625, + "loss/logits": 0.3839457631111145, + "loss/reg": 5.1762908697128296e-05, + "step": 853 + }, + { + "epoch": 0.10675, + "grad_norm": 2.6472911834716797, + "grad_norm_var": 1.2044808988564493, + "learning_rate": 0.0001, + "loss": 1.2718, + "loss/crossentropy": 2.6136245727539062, + "loss/hidden": 1.1015625, + "loss/logits": 0.1696871519088745, + "loss/reg": 5.174713805899955e-05, + "step": 854 + }, + { + "epoch": 0.106875, + "grad_norm": 4.49519681930542, + "grad_norm_var": 1.3393165741714852, + "learning_rate": 0.0001, + "loss": 1.6395, + "loss/crossentropy": 2.5587494373321533, + "loss/hidden": 1.3828125, + "loss/logits": 0.2561890184879303, + "loss/reg": 5.173370664124377e-05, + "step": 855 + }, + { + "epoch": 0.107, + "grad_norm": 2.924689531326294, + "grad_norm_var": 1.2624413783622968, + "learning_rate": 0.0001, + "loss": 1.4652, + "loss/crossentropy": 2.696464776992798, + "loss/hidden": 1.265625, + "loss/logits": 0.1990174502134323, + "loss/reg": 5.172126475372352e-05, + "step": 856 + }, + { + "epoch": 0.107125, + "grad_norm": 2.4132745265960693, + "grad_norm_var": 1.2475902683594655, + "learning_rate": 0.0001, + "loss": 1.3195, + "loss/crossentropy": 2.5237464904785156, + "loss/hidden": 1.140625, + "loss/logits": 0.17832911014556885, + "loss/reg": 5.170765507500619e-05, + "step": 857 + }, + { + "epoch": 0.10725, + "grad_norm": 2.395402193069458, + "grad_norm_var": 1.228414894644508, + "learning_rate": 0.0001, + "loss": 1.1678, + "loss/crossentropy": 2.910306453704834, + "loss/hidden": 1.0078125, + "loss/logits": 0.15949246287345886, + "loss/reg": 5.169427822693251e-05, + "step": 858 + }, + { + "epoch": 0.107375, + "grad_norm": 2.0667219161987305, + "grad_norm_var": 1.274264185741049, + "learning_rate": 0.0001, + "loss": 1.3264, + "loss/crossentropy": 2.495528221130371, + "loss/hidden": 1.125, + "loss/logits": 0.2009069323539734, + "loss/reg": 5.168099596630782e-05, + "step": 859 + }, + { + "epoch": 0.1075, + "grad_norm": 2.2085459232330322, + "grad_norm_var": 1.29280895985072, + "learning_rate": 0.0001, + "loss": 1.3652, + "loss/crossentropy": 2.452284574508667, + "loss/hidden": 1.171875, + "loss/logits": 0.19278863072395325, + "loss/reg": 5.1672195695573464e-05, + "step": 860 + }, + { + "epoch": 0.107625, + "grad_norm": 2.144191026687622, + "grad_norm_var": 1.347042753481233, + "learning_rate": 0.0001, + "loss": 1.3302, + "loss/crossentropy": 2.5444531440734863, + "loss/hidden": 1.15625, + "loss/logits": 0.17347240447998047, + "loss/reg": 5.165613038116135e-05, + "step": 861 + }, + { + "epoch": 0.10775, + "grad_norm": 2.207613229751587, + "grad_norm_var": 1.3695234911406715, + "learning_rate": 0.0001, + "loss": 1.1685, + "loss/crossentropy": 2.5881083011627197, + "loss/hidden": 1.015625, + "loss/logits": 0.15239441394805908, + "loss/reg": 5.1647028158186004e-05, + "step": 862 + }, + { + "epoch": 0.107875, + "grad_norm": 3.2510950565338135, + "grad_norm_var": 1.3293998581318258, + "learning_rate": 0.0001, + "loss": 1.3857, + "loss/crossentropy": 2.6068148612976074, + "loss/hidden": 1.1953125, + "loss/logits": 0.189855694770813, + "loss/reg": 5.1638893637573346e-05, + "step": 863 + }, + { + "epoch": 0.108, + "grad_norm": 2.065004587173462, + "grad_norm_var": 1.3815679315585072, + "learning_rate": 0.0001, + "loss": 1.2268, + "loss/crossentropy": 2.1115543842315674, + "loss/hidden": 1.0625, + "loss/logits": 0.16378697752952576, + "loss/reg": 5.16266591148451e-05, + "step": 864 + }, + { + "epoch": 0.108125, + "grad_norm": 2.9799695014953613, + "grad_norm_var": 1.3326224051682771, + "learning_rate": 0.0001, + "loss": 1.2981, + "loss/crossentropy": 2.548758029937744, + "loss/hidden": 1.1484375, + "loss/logits": 0.14914320409297943, + "loss/reg": 5.1615705160656944e-05, + "step": 865 + }, + { + "epoch": 0.10825, + "grad_norm": 2.810962438583374, + "grad_norm_var": 1.2498044722821606, + "learning_rate": 0.0001, + "loss": 1.5109, + "loss/crossentropy": 2.707660436630249, + "loss/hidden": 1.28125, + "loss/logits": 0.2291330099105835, + "loss/reg": 5.1602582971099764e-05, + "step": 866 + }, + { + "epoch": 0.108375, + "grad_norm": 2.15248966217041, + "grad_norm_var": 0.5968405914629725, + "learning_rate": 0.0001, + "loss": 1.248, + "loss/crossentropy": 2.542229413986206, + "loss/hidden": 1.0703125, + "loss/logits": 0.17721006274223328, + "loss/reg": 5.158934072824195e-05, + "step": 867 + }, + { + "epoch": 0.1085, + "grad_norm": 2.480729818344116, + "grad_norm_var": 0.5971035139504882, + "learning_rate": 0.0001, + "loss": 1.2536, + "loss/crossentropy": 2.53440523147583, + "loss/hidden": 1.0859375, + "loss/logits": 0.16716773808002472, + "loss/reg": 5.157275154488161e-05, + "step": 868 + }, + { + "epoch": 0.108625, + "grad_norm": 7.901394367218018, + "grad_norm_var": 2.1248277393899104, + "learning_rate": 0.0001, + "loss": 1.5415, + "loss/crossentropy": 2.556912660598755, + "loss/hidden": 1.3125, + "loss/logits": 0.228495255112648, + "loss/reg": 5.155721737537533e-05, + "step": 869 + }, + { + "epoch": 0.10875, + "grad_norm": 2.86877179145813, + "grad_norm_var": 2.1190566777217703, + "learning_rate": 0.0001, + "loss": 1.4311, + "loss/crossentropy": 2.6490020751953125, + "loss/hidden": 1.1953125, + "loss/logits": 0.23524877429008484, + "loss/reg": 5.154574682819657e-05, + "step": 870 + }, + { + "epoch": 0.108875, + "grad_norm": 2.6678075790405273, + "grad_norm_var": 1.953804689788177, + "learning_rate": 0.0001, + "loss": 1.334, + "loss/crossentropy": 2.4952635765075684, + "loss/hidden": 1.140625, + "loss/logits": 0.19286967813968658, + "loss/reg": 5.152893209015019e-05, + "step": 871 + }, + { + "epoch": 0.109, + "grad_norm": 3.3251094818115234, + "grad_norm_var": 1.9680179929503008, + "learning_rate": 0.0001, + "loss": 1.6556, + "loss/crossentropy": 2.4640188217163086, + "loss/hidden": 1.3828125, + "loss/logits": 0.2722957730293274, + "loss/reg": 5.151686491444707e-05, + "step": 872 + }, + { + "epoch": 0.109125, + "grad_norm": 2.9334840774536133, + "grad_norm_var": 1.9531698292946447, + "learning_rate": 0.0001, + "loss": 1.6151, + "loss/crossentropy": 2.1477441787719727, + "loss/hidden": 1.4140625, + "loss/logits": 0.2005080133676529, + "loss/reg": 5.15064675710164e-05, + "step": 873 + }, + { + "epoch": 0.10925, + "grad_norm": 2.6843268871307373, + "grad_norm_var": 1.9388056435329775, + "learning_rate": 0.0001, + "loss": 1.3321, + "loss/crossentropy": 2.6752922534942627, + "loss/hidden": 1.1484375, + "loss/logits": 0.18317534029483795, + "loss/reg": 5.148894706508145e-05, + "step": 874 + }, + { + "epoch": 0.109375, + "grad_norm": 2.666318416595459, + "grad_norm_var": 1.8929180590094554, + "learning_rate": 0.0001, + "loss": 1.181, + "loss/crossentropy": 2.3448598384857178, + "loss/hidden": 1.015625, + "loss/logits": 0.16483411192893982, + "loss/reg": 5.147013871464878e-05, + "step": 875 + }, + { + "epoch": 0.1095, + "grad_norm": 2.9108211994171143, + "grad_norm_var": 1.8534501036204496, + "learning_rate": 0.0001, + "loss": 1.6481, + "loss/crossentropy": 2.2253129482269287, + "loss/hidden": 1.390625, + "loss/logits": 0.2569289207458496, + "loss/reg": 5.1448183512547985e-05, + "step": 876 + }, + { + "epoch": 0.109625, + "grad_norm": 2.372954845428467, + "grad_norm_var": 1.8305216702505192, + "learning_rate": 0.0001, + "loss": 1.0989, + "loss/crossentropy": 2.879788398742676, + "loss/hidden": 0.9609375, + "loss/logits": 0.13748227059841156, + "loss/reg": 5.143091766512953e-05, + "step": 877 + }, + { + "epoch": 0.10975, + "grad_norm": 2.7331626415252686, + "grad_norm_var": 1.7910379283106483, + "learning_rate": 0.0001, + "loss": 1.34, + "loss/crossentropy": 2.64849853515625, + "loss/hidden": 1.15625, + "loss/logits": 0.18327152729034424, + "loss/reg": 5.141158544574864e-05, + "step": 878 + }, + { + "epoch": 0.109875, + "grad_norm": 2.3987855911254883, + "grad_norm_var": 1.8136184643926978, + "learning_rate": 0.0001, + "loss": 1.2782, + "loss/crossentropy": 2.5128934383392334, + "loss/hidden": 1.109375, + "loss/logits": 0.16826963424682617, + "loss/reg": 5.139862696523778e-05, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 2.94789981842041, + "grad_norm_var": 1.7526228729189588, + "learning_rate": 0.0001, + "loss": 1.323, + "loss/crossentropy": 2.762059211730957, + "loss/hidden": 1.140625, + "loss/logits": 0.18185189366340637, + "loss/reg": 5.138712003827095e-05, + "step": 880 + }, + { + "epoch": 0.110125, + "grad_norm": 2.8312692642211914, + "grad_norm_var": 1.7554366876979393, + "learning_rate": 0.0001, + "loss": 1.5558, + "loss/crossentropy": 2.3510918617248535, + "loss/hidden": 1.3046875, + "loss/logits": 0.2505726218223572, + "loss/reg": 5.1374005124671385e-05, + "step": 881 + }, + { + "epoch": 0.11025, + "grad_norm": 2.362635612487793, + "grad_norm_var": 1.781863088516642, + "learning_rate": 0.0001, + "loss": 1.3719, + "loss/crossentropy": 2.6087443828582764, + "loss/hidden": 1.1640625, + "loss/logits": 0.207294762134552, + "loss/reg": 5.1361905207159e-05, + "step": 882 + }, + { + "epoch": 0.110375, + "grad_norm": 3.219357967376709, + "grad_norm_var": 1.730327889053624, + "learning_rate": 0.0001, + "loss": 1.625, + "loss/crossentropy": 2.5749869346618652, + "loss/hidden": 1.34375, + "loss/logits": 0.2807803452014923, + "loss/reg": 5.134905586601235e-05, + "step": 883 + }, + { + "epoch": 0.1105, + "grad_norm": 3.389406442642212, + "grad_norm_var": 1.709139991612549, + "learning_rate": 0.0001, + "loss": 1.3304, + "loss/crossentropy": 2.8171286582946777, + "loss/hidden": 1.140625, + "loss/logits": 0.18924608826637268, + "loss/reg": 5.133213926455937e-05, + "step": 884 + }, + { + "epoch": 0.110625, + "grad_norm": 12.887139320373535, + "grad_norm_var": 6.4290571159925625, + "learning_rate": 0.0001, + "loss": 1.6923, + "loss/crossentropy": 2.3453848361968994, + "loss/hidden": 1.453125, + "loss/logits": 0.23862439393997192, + "loss/reg": 5.1321330829523504e-05, + "step": 885 + }, + { + "epoch": 0.11075, + "grad_norm": 2.346778154373169, + "grad_norm_var": 6.486536682635502, + "learning_rate": 0.0001, + "loss": 1.4458, + "loss/crossentropy": 2.5474321842193604, + "loss/hidden": 1.2109375, + "loss/logits": 0.23435597121715546, + "loss/reg": 5.130986392032355e-05, + "step": 886 + }, + { + "epoch": 0.110875, + "grad_norm": 2.487971067428589, + "grad_norm_var": 6.506530171472066, + "learning_rate": 0.0001, + "loss": 1.357, + "loss/crossentropy": 2.5673558712005615, + "loss/hidden": 1.1484375, + "loss/logits": 0.20802843570709229, + "loss/reg": 5.1305341912666336e-05, + "step": 887 + }, + { + "epoch": 0.111, + "grad_norm": 2.816526174545288, + "grad_norm_var": 6.528187529959202, + "learning_rate": 0.0001, + "loss": 1.4391, + "loss/crossentropy": 2.6126768589019775, + "loss/hidden": 1.2109375, + "loss/logits": 0.22769951820373535, + "loss/reg": 5.12950646225363e-05, + "step": 888 + }, + { + "epoch": 0.111125, + "grad_norm": 2.141483783721924, + "grad_norm_var": 6.613941985095446, + "learning_rate": 0.0001, + "loss": 1.2903, + "loss/crossentropy": 2.3378610610961914, + "loss/hidden": 1.109375, + "loss/logits": 0.18040552735328674, + "loss/reg": 5.1290844567120075e-05, + "step": 889 + }, + { + "epoch": 0.11125, + "grad_norm": 2.6674747467041016, + "grad_norm_var": 6.615398852360909, + "learning_rate": 0.0001, + "loss": 1.2673, + "loss/crossentropy": 2.5276970863342285, + "loss/hidden": 1.09375, + "loss/logits": 0.17306920886039734, + "loss/reg": 5.129120108904317e-05, + "step": 890 + }, + { + "epoch": 0.111375, + "grad_norm": 3.5922231674194336, + "grad_norm_var": 6.5878176563605235, + "learning_rate": 0.0001, + "loss": 1.5823, + "loss/crossentropy": 2.5318856239318848, + "loss/hidden": 1.3359375, + "loss/logits": 0.2458970844745636, + "loss/reg": 5.128348493599333e-05, + "step": 891 + }, + { + "epoch": 0.1115, + "grad_norm": 4.822789192199707, + "grad_norm_var": 6.696274189555324, + "learning_rate": 0.0001, + "loss": 1.4305, + "loss/crossentropy": 2.564779281616211, + "loss/hidden": 1.234375, + "loss/logits": 0.1956566870212555, + "loss/reg": 5.1267714297864586e-05, + "step": 892 + }, + { + "epoch": 0.111625, + "grad_norm": 2.7208211421966553, + "grad_norm_var": 6.651510803659902, + "learning_rate": 0.0001, + "loss": 1.2779, + "loss/crossentropy": 2.423957109451294, + "loss/hidden": 1.1015625, + "loss/logits": 0.17579975724220276, + "loss/reg": 5.1254595746286213e-05, + "step": 893 + }, + { + "epoch": 0.11175, + "grad_norm": 3.0301880836486816, + "grad_norm_var": 6.62575020535945, + "learning_rate": 0.0001, + "loss": 1.5594, + "loss/crossentropy": 2.4482266902923584, + "loss/hidden": 1.296875, + "loss/logits": 0.262008935213089, + "loss/reg": 5.123936352902092e-05, + "step": 894 + }, + { + "epoch": 0.111875, + "grad_norm": 3.360795021057129, + "grad_norm_var": 6.537028009081854, + "learning_rate": 0.0001, + "loss": 1.3794, + "loss/crossentropy": 2.7706689834594727, + "loss/hidden": 1.203125, + "loss/logits": 0.17581212520599365, + "loss/reg": 5.1222959882579744e-05, + "step": 895 + }, + { + "epoch": 0.112, + "grad_norm": 2.2740559577941895, + "grad_norm_var": 6.6241346303160356, + "learning_rate": 0.0001, + "loss": 1.1855, + "loss/crossentropy": 2.6317555904388428, + "loss/hidden": 1.046875, + "loss/logits": 0.13808242976665497, + "loss/reg": 5.120660716784187e-05, + "step": 896 + }, + { + "epoch": 0.112125, + "grad_norm": 2.4496805667877197, + "grad_norm_var": 6.670283083692273, + "learning_rate": 0.0001, + "loss": 1.42, + "loss/crossentropy": 2.499284505844116, + "loss/hidden": 1.1875, + "loss/logits": 0.2319883406162262, + "loss/reg": 5.1194838306400925e-05, + "step": 897 + }, + { + "epoch": 0.11225, + "grad_norm": 2.3721413612365723, + "grad_norm_var": 6.668802098851164, + "learning_rate": 0.0001, + "loss": 1.2307, + "loss/crossentropy": 2.375321388244629, + "loss/hidden": 1.0703125, + "loss/logits": 0.15983566641807556, + "loss/reg": 5.118764966027811e-05, + "step": 898 + }, + { + "epoch": 0.112375, + "grad_norm": 1.9777494668960571, + "grad_norm_var": 6.817600273546391, + "learning_rate": 0.0001, + "loss": 1.2443, + "loss/crossentropy": 2.382485866546631, + "loss/hidden": 1.0625, + "loss/logits": 0.18131142854690552, + "loss/reg": 5.117098771734163e-05, + "step": 899 + }, + { + "epoch": 0.1125, + "grad_norm": 3.215449571609497, + "grad_norm_var": 6.821095932665104, + "learning_rate": 0.0001, + "loss": 1.7545, + "loss/crossentropy": 2.471181869506836, + "loss/hidden": 1.4921875, + "loss/logits": 0.26175931096076965, + "loss/reg": 5.11566577188205e-05, + "step": 900 + }, + { + "epoch": 0.112625, + "grad_norm": 2.2360520362854004, + "grad_norm_var": 0.5060833487590211, + "learning_rate": 0.0001, + "loss": 1.1463, + "loss/crossentropy": 2.5280520915985107, + "loss/hidden": 1.0078125, + "loss/logits": 0.13795122504234314, + "loss/reg": 5.114359737490304e-05, + "step": 901 + }, + { + "epoch": 0.11275, + "grad_norm": 2.2849807739257812, + "grad_norm_var": 0.5099081994552771, + "learning_rate": 0.0001, + "loss": 1.318, + "loss/crossentropy": 2.583923578262329, + "loss/hidden": 1.140625, + "loss/logits": 0.1768278032541275, + "loss/reg": 5.113161387271248e-05, + "step": 902 + }, + { + "epoch": 0.112875, + "grad_norm": 2.292587995529175, + "grad_norm_var": 0.5198535528811049, + "learning_rate": 0.0001, + "loss": 1.4232, + "loss/crossentropy": 2.196122884750366, + "loss/hidden": 1.21875, + "loss/logits": 0.2038969099521637, + "loss/reg": 5.111364953336306e-05, + "step": 903 + }, + { + "epoch": 0.113, + "grad_norm": 2.8599159717559814, + "grad_norm_var": 0.5202638913613247, + "learning_rate": 0.0001, + "loss": 1.2203, + "loss/crossentropy": 2.5006511211395264, + "loss/hidden": 1.0625, + "loss/logits": 0.15732157230377197, + "loss/reg": 5.110198981128633e-05, + "step": 904 + }, + { + "epoch": 0.113125, + "grad_norm": 2.5395283699035645, + "grad_norm_var": 0.49688104773360486, + "learning_rate": 0.0001, + "loss": 1.2577, + "loss/crossentropy": 2.845609426498413, + "loss/hidden": 1.078125, + "loss/logits": 0.17907723784446716, + "loss/reg": 5.108524055685848e-05, + "step": 905 + }, + { + "epoch": 0.11325, + "grad_norm": 2.205470085144043, + "grad_norm_var": 0.5179864695758818, + "learning_rate": 0.0001, + "loss": 1.4047, + "loss/crossentropy": 2.444505214691162, + "loss/hidden": 1.2109375, + "loss/logits": 0.19329029321670532, + "loss/reg": 5.106762910145335e-05, + "step": 906 + }, + { + "epoch": 0.113375, + "grad_norm": 2.612846612930298, + "grad_norm_var": 0.4698679222391608, + "learning_rate": 0.0001, + "loss": 1.246, + "loss/crossentropy": 2.8539822101593018, + "loss/hidden": 1.09375, + "loss/logits": 0.15170122683048248, + "loss/reg": 5.1049682951997966e-05, + "step": 907 + }, + { + "epoch": 0.1135, + "grad_norm": 2.0672478675842285, + "grad_norm_var": 0.1657706313496551, + "learning_rate": 0.0001, + "loss": 1.0483, + "loss/crossentropy": 2.453460216522217, + "loss/hidden": 0.921875, + "loss/logits": 0.12594252824783325, + "loss/reg": 5.103057628730312e-05, + "step": 908 + }, + { + "epoch": 0.113625, + "grad_norm": 2.428112030029297, + "grad_norm_var": 0.1637257922027207, + "learning_rate": 0.0001, + "loss": 1.465, + "loss/crossentropy": 2.6168901920318604, + "loss/hidden": 1.2421875, + "loss/logits": 0.22226470708847046, + "loss/reg": 5.1011342293350026e-05, + "step": 909 + }, + { + "epoch": 0.11375, + "grad_norm": 2.6597177982330322, + "grad_norm_var": 0.14675306523261794, + "learning_rate": 0.0001, + "loss": 1.3562, + "loss/crossentropy": 2.513742685317993, + "loss/hidden": 1.171875, + "loss/logits": 0.18382194638252258, + "loss/reg": 5.0994767661904916e-05, + "step": 910 + }, + { + "epoch": 0.113875, + "grad_norm": 2.3672919273376465, + "grad_norm_var": 0.09306154474314253, + "learning_rate": 0.0001, + "loss": 1.3962, + "loss/crossentropy": 2.329597234725952, + "loss/hidden": 1.1875, + "loss/logits": 0.20819973945617676, + "loss/reg": 5.097627581562847e-05, + "step": 911 + }, + { + "epoch": 0.114, + "grad_norm": 2.389256000518799, + "grad_norm_var": 0.091531368737714, + "learning_rate": 0.0001, + "loss": 1.3408, + "loss/crossentropy": 2.712867021560669, + "loss/hidden": 1.125, + "loss/logits": 0.21526110172271729, + "loss/reg": 5.0954702601302415e-05, + "step": 912 + }, + { + "epoch": 0.114125, + "grad_norm": 2.9275593757629395, + "grad_norm_var": 0.10674763413478458, + "learning_rate": 0.0001, + "loss": 1.5311, + "loss/crossentropy": 2.5449111461639404, + "loss/hidden": 1.3125, + "loss/logits": 0.21804079413414001, + "loss/reg": 5.093521031085402e-05, + "step": 913 + }, + { + "epoch": 0.11425, + "grad_norm": 2.181626558303833, + "grad_norm_var": 0.11136842221632476, + "learning_rate": 0.0001, + "loss": 1.2699, + "loss/crossentropy": 2.8895819187164307, + "loss/hidden": 1.109375, + "loss/logits": 0.16004905104637146, + "loss/reg": 5.091511411592364e-05, + "step": 914 + }, + { + "epoch": 0.114375, + "grad_norm": 2.1453068256378174, + "grad_norm_var": 0.10250921674971565, + "learning_rate": 0.0001, + "loss": 1.36, + "loss/crossentropy": 2.5813710689544678, + "loss/hidden": 1.15625, + "loss/logits": 0.20324170589447021, + "loss/reg": 5.089937985758297e-05, + "step": 915 + }, + { + "epoch": 0.1145, + "grad_norm": 2.3220584392547607, + "grad_norm_var": 0.06279939654988856, + "learning_rate": 0.0001, + "loss": 1.5004, + "loss/crossentropy": 2.2584786415100098, + "loss/hidden": 1.296875, + "loss/logits": 0.20300912857055664, + "loss/reg": 5.0884518714156e-05, + "step": 916 + }, + { + "epoch": 0.114625, + "grad_norm": 2.3360254764556885, + "grad_norm_var": 0.0611390665759463, + "learning_rate": 0.0001, + "loss": 1.2846, + "loss/crossentropy": 2.713193416595459, + "loss/hidden": 1.1015625, + "loss/logits": 0.18253561854362488, + "loss/reg": 5.086654346087016e-05, + "step": 917 + }, + { + "epoch": 0.11475, + "grad_norm": 2.8872170448303223, + "grad_norm_var": 0.07346951449265286, + "learning_rate": 0.0001, + "loss": 1.5614, + "loss/crossentropy": 2.5153021812438965, + "loss/hidden": 1.3203125, + "loss/logits": 0.24056395888328552, + "loss/reg": 5.084551958134398e-05, + "step": 918 + }, + { + "epoch": 0.114875, + "grad_norm": 2.3302507400512695, + "grad_norm_var": 0.07276086174920783, + "learning_rate": 0.0001, + "loss": 1.2503, + "loss/crossentropy": 2.6722702980041504, + "loss/hidden": 1.0859375, + "loss/logits": 0.16384665668010712, + "loss/reg": 5.0823444325942546e-05, + "step": 919 + }, + { + "epoch": 0.115, + "grad_norm": 2.578481674194336, + "grad_norm_var": 0.06246865190141004, + "learning_rate": 0.0001, + "loss": 1.3916, + "loss/crossentropy": 2.56015682220459, + "loss/hidden": 1.21875, + "loss/logits": 0.1723623275756836, + "loss/reg": 5.080721894046292e-05, + "step": 920 + }, + { + "epoch": 0.115125, + "grad_norm": 2.512157917022705, + "grad_norm_var": 0.062138112924693206, + "learning_rate": 0.0001, + "loss": 1.1445, + "loss/crossentropy": 2.7548606395721436, + "loss/hidden": 0.99609375, + "loss/logits": 0.14792391657829285, + "loss/reg": 5.078666072222404e-05, + "step": 921 + }, + { + "epoch": 0.11525, + "grad_norm": 2.1921284198760986, + "grad_norm_var": 0.0625565039341834, + "learning_rate": 0.0001, + "loss": 1.0968, + "loss/crossentropy": 2.660780668258667, + "loss/hidden": 0.95703125, + "loss/logits": 0.1393047422170639, + "loss/reg": 5.077124296803959e-05, + "step": 922 + }, + { + "epoch": 0.115375, + "grad_norm": 2.3002402782440186, + "grad_norm_var": 0.06119220238923783, + "learning_rate": 0.0001, + "loss": 1.4776, + "loss/crossentropy": 2.416215419769287, + "loss/hidden": 1.265625, + "loss/logits": 0.2114565074443817, + "loss/reg": 5.075276203569956e-05, + "step": 923 + }, + { + "epoch": 0.1155, + "grad_norm": 2.2460763454437256, + "grad_norm_var": 0.05492203051156442, + "learning_rate": 0.0001, + "loss": 1.4112, + "loss/crossentropy": 2.3629891872406006, + "loss/hidden": 1.1875, + "loss/logits": 0.22322767972946167, + "loss/reg": 5.0733655371004716e-05, + "step": 924 + }, + { + "epoch": 0.115625, + "grad_norm": 1.99420166015625, + "grad_norm_var": 0.06652205345829619, + "learning_rate": 0.0001, + "loss": 1.3674, + "loss/crossentropy": 2.4530787467956543, + "loss/hidden": 1.1875, + "loss/logits": 0.17934995889663696, + "loss/reg": 5.071550185675733e-05, + "step": 925 + }, + { + "epoch": 0.11575, + "grad_norm": 3.024604558944702, + "grad_norm_var": 0.0875715770421029, + "learning_rate": 0.0001, + "loss": 1.5185, + "loss/crossentropy": 2.496591091156006, + "loss/hidden": 1.2890625, + "loss/logits": 0.22892767190933228, + "loss/reg": 5.070024053566158e-05, + "step": 926 + }, + { + "epoch": 0.115875, + "grad_norm": 2.557852268218994, + "grad_norm_var": 0.08847894622657862, + "learning_rate": 0.0001, + "loss": 1.1789, + "loss/crossentropy": 2.917902708053589, + "loss/hidden": 1.0234375, + "loss/logits": 0.1549963504076004, + "loss/reg": 5.068165046395734e-05, + "step": 927 + }, + { + "epoch": 0.116, + "grad_norm": 2.644162893295288, + "grad_norm_var": 0.09105956863669439, + "learning_rate": 0.0001, + "loss": 1.4557, + "loss/crossentropy": 2.3896613121032715, + "loss/hidden": 1.234375, + "loss/logits": 0.22077350318431854, + "loss/reg": 5.066774247097783e-05, + "step": 928 + }, + { + "epoch": 0.116125, + "grad_norm": 2.3583998680114746, + "grad_norm_var": 0.07496988833996525, + "learning_rate": 0.0001, + "loss": 1.3946, + "loss/crossentropy": 2.5956573486328125, + "loss/hidden": 1.203125, + "loss/logits": 0.19093617796897888, + "loss/reg": 5.065161531092599e-05, + "step": 929 + }, + { + "epoch": 0.11625, + "grad_norm": 3.7659823894500732, + "grad_norm_var": 0.18294245356447975, + "learning_rate": 0.0001, + "loss": 1.4272, + "loss/crossentropy": 3.112509250640869, + "loss/hidden": 1.21875, + "loss/logits": 0.20792649686336517, + "loss/reg": 5.063477874500677e-05, + "step": 930 + }, + { + "epoch": 0.116375, + "grad_norm": 3.4702413082122803, + "grad_norm_var": 0.22784416332236573, + "learning_rate": 0.0001, + "loss": 1.4236, + "loss/crossentropy": 2.6097307205200195, + "loss/hidden": 1.2265625, + "loss/logits": 0.19651469588279724, + "loss/reg": 5.061656338511966e-05, + "step": 931 + }, + { + "epoch": 0.1165, + "grad_norm": 10.169079780578613, + "grad_norm_var": 3.7907524102504455, + "learning_rate": 0.0001, + "loss": 1.7727, + "loss/crossentropy": 2.720949649810791, + "loss/hidden": 1.46875, + "loss/logits": 0.3034912347793579, + "loss/reg": 5.059718750999309e-05, + "step": 932 + }, + { + "epoch": 0.116625, + "grad_norm": 2.715127944946289, + "grad_norm_var": 3.761853977240573, + "learning_rate": 0.0001, + "loss": 1.2669, + "loss/crossentropy": 2.721991777420044, + "loss/hidden": 1.0703125, + "loss/logits": 0.19605056941509247, + "loss/reg": 5.0578131777001545e-05, + "step": 933 + }, + { + "epoch": 0.11675, + "grad_norm": 2.626614570617676, + "grad_norm_var": 3.7738096606882756, + "learning_rate": 0.0001, + "loss": 1.561, + "loss/crossentropy": 2.431546449661255, + "loss/hidden": 1.3125, + "loss/logits": 0.2480194866657257, + "loss/reg": 5.056073496234603e-05, + "step": 934 + }, + { + "epoch": 0.116875, + "grad_norm": 2.2887232303619385, + "grad_norm_var": 3.7781399580603714, + "learning_rate": 0.0001, + "loss": 1.4852, + "loss/crossentropy": 2.362403631210327, + "loss/hidden": 1.2578125, + "loss/logits": 0.2268555462360382, + "loss/reg": 5.0543880206532776e-05, + "step": 935 + }, + { + "epoch": 0.117, + "grad_norm": 2.2659595012664795, + "grad_norm_var": 3.8055697286814714, + "learning_rate": 0.0001, + "loss": 1.38, + "loss/crossentropy": 2.4935097694396973, + "loss/hidden": 1.1953125, + "loss/logits": 0.18415585160255432, + "loss/reg": 5.0521102821221575e-05, + "step": 936 + }, + { + "epoch": 0.117125, + "grad_norm": 2.611288070678711, + "grad_norm_var": 3.7988011630033136, + "learning_rate": 0.0001, + "loss": 1.2173, + "loss/crossentropy": 2.497889995574951, + "loss/hidden": 1.0546875, + "loss/logits": 0.1621410995721817, + "loss/reg": 5.049499304732308e-05, + "step": 937 + }, + { + "epoch": 0.11725, + "grad_norm": 2.490511894226074, + "grad_norm_var": 3.769164840295244, + "learning_rate": 0.0001, + "loss": 1.3398, + "loss/crossentropy": 2.580732583999634, + "loss/hidden": 1.1328125, + "loss/logits": 0.20645791292190552, + "loss/reg": 5.047738159191795e-05, + "step": 938 + }, + { + "epoch": 0.117375, + "grad_norm": 2.6626391410827637, + "grad_norm_var": 3.7389430985960397, + "learning_rate": 0.0001, + "loss": 1.3499, + "loss/crossentropy": 2.3543365001678467, + "loss/hidden": 1.1484375, + "loss/logits": 0.20099475979804993, + "loss/reg": 5.046039950684644e-05, + "step": 939 + }, + { + "epoch": 0.1175, + "grad_norm": 2.3437256813049316, + "grad_norm_var": 3.728183871902979, + "learning_rate": 0.0001, + "loss": 1.56, + "loss/crossentropy": 2.385300636291504, + "loss/hidden": 1.3046875, + "loss/logits": 0.2547788918018341, + "loss/reg": 5.044609497417696e-05, + "step": 940 + }, + { + "epoch": 0.117625, + "grad_norm": 2.517521858215332, + "grad_norm_var": 3.6664452294798027, + "learning_rate": 0.0001, + "loss": 1.3269, + "loss/crossentropy": 2.5399513244628906, + "loss/hidden": 1.140625, + "loss/logits": 0.18573379516601562, + "loss/reg": 5.042907650931738e-05, + "step": 941 + }, + { + "epoch": 0.11775, + "grad_norm": 2.5876271724700928, + "grad_norm_var": 3.6860949824849625, + "learning_rate": 0.0001, + "loss": 1.2793, + "loss/crossentropy": 2.377321481704712, + "loss/hidden": 1.09375, + "loss/logits": 0.18503513932228088, + "loss/reg": 5.0405546062393114e-05, + "step": 942 + }, + { + "epoch": 0.117875, + "grad_norm": 2.671018123626709, + "grad_norm_var": 3.6782666614773385, + "learning_rate": 0.0001, + "loss": 1.2238, + "loss/crossentropy": 2.802436113357544, + "loss/hidden": 1.046875, + "loss/logits": 0.17646291851997375, + "loss/reg": 5.039115058025345e-05, + "step": 943 + }, + { + "epoch": 0.118, + "grad_norm": 2.572817325592041, + "grad_norm_var": 3.683271023247499, + "learning_rate": 0.0001, + "loss": 1.5448, + "loss/crossentropy": 2.634742498397827, + "loss/hidden": 1.28125, + "loss/logits": 0.26302051544189453, + "loss/reg": 5.037582013756037e-05, + "step": 944 + }, + { + "epoch": 0.118125, + "grad_norm": 3.284419059753418, + "grad_norm_var": 3.641308957185208, + "learning_rate": 0.0001, + "loss": 1.4113, + "loss/crossentropy": 2.7402541637420654, + "loss/hidden": 1.1875, + "loss/logits": 0.22326701879501343, + "loss/reg": 5.035655340179801e-05, + "step": 945 + }, + { + "epoch": 0.11825, + "grad_norm": 2.1808974742889404, + "grad_norm_var": 3.676652595263378, + "learning_rate": 0.0001, + "loss": 1.345, + "loss/crossentropy": 2.1766951084136963, + "loss/hidden": 1.1640625, + "loss/logits": 0.18044030666351318, + "loss/reg": 5.033357228967361e-05, + "step": 946 + }, + { + "epoch": 0.118375, + "grad_norm": 2.5026612281799316, + "grad_norm_var": 3.6862574547537976, + "learning_rate": 0.0001, + "loss": 1.3828, + "loss/crossentropy": 2.443451404571533, + "loss/hidden": 1.171875, + "loss/logits": 0.21045450866222382, + "loss/reg": 5.0314705731580034e-05, + "step": 947 + }, + { + "epoch": 0.1185, + "grad_norm": 2.595186233520508, + "grad_norm_var": 0.06275260077619957, + "learning_rate": 0.0001, + "loss": 1.3268, + "loss/crossentropy": 2.5740835666656494, + "loss/hidden": 1.140625, + "loss/logits": 0.18566593527793884, + "loss/reg": 5.0293325330130756e-05, + "step": 948 + }, + { + "epoch": 0.118625, + "grad_norm": 2.7742605209350586, + "grad_norm_var": 0.06421554214945217, + "learning_rate": 0.0001, + "loss": 1.3436, + "loss/crossentropy": 2.6714890003204346, + "loss/hidden": 1.15625, + "loss/logits": 0.18687836825847626, + "loss/reg": 5.027998849982396e-05, + "step": 949 + }, + { + "epoch": 0.11875, + "grad_norm": 2.370361328125, + "grad_norm_var": 0.06607751509905029, + "learning_rate": 0.0001, + "loss": 1.3589, + "loss/crossentropy": 2.659331798553467, + "loss/hidden": 1.1640625, + "loss/logits": 0.19433243572711945, + "loss/reg": 5.026358849136159e-05, + "step": 950 + }, + { + "epoch": 0.118875, + "grad_norm": 2.5973522663116455, + "grad_norm_var": 0.06148581360481557, + "learning_rate": 0.0001, + "loss": 1.4772, + "loss/crossentropy": 2.323859930038452, + "loss/hidden": 1.25, + "loss/logits": 0.2266673892736435, + "loss/reg": 5.024715210311115e-05, + "step": 951 + }, + { + "epoch": 0.119, + "grad_norm": 2.227506637573242, + "grad_norm_var": 0.06310765648726502, + "learning_rate": 0.0001, + "loss": 1.1943, + "loss/crossentropy": 2.683986186981201, + "loss/hidden": 1.03125, + "loss/logits": 0.16250261664390564, + "loss/reg": 5.023390258429572e-05, + "step": 952 + }, + { + "epoch": 0.119125, + "grad_norm": 2.293842315673828, + "grad_norm_var": 0.06731388693757458, + "learning_rate": 0.0001, + "loss": 1.2523, + "loss/crossentropy": 2.6023056507110596, + "loss/hidden": 1.0859375, + "loss/logits": 0.16582387685775757, + "loss/reg": 5.021702963858843e-05, + "step": 953 + }, + { + "epoch": 0.11925, + "grad_norm": 2.304372549057007, + "grad_norm_var": 0.07075777977412372, + "learning_rate": 0.0001, + "loss": 1.3426, + "loss/crossentropy": 2.63718318939209, + "loss/hidden": 1.15625, + "loss/logits": 0.1858091950416565, + "loss/reg": 5.020539538236335e-05, + "step": 954 + }, + { + "epoch": 0.119375, + "grad_norm": 2.330775737762451, + "grad_norm_var": 0.07178920620747628, + "learning_rate": 0.0001, + "loss": 1.3944, + "loss/crossentropy": 2.5988028049468994, + "loss/hidden": 1.1875, + "loss/logits": 0.20635762810707092, + "loss/reg": 5.0196507800137624e-05, + "step": 955 + }, + { + "epoch": 0.1195, + "grad_norm": 2.9979050159454346, + "grad_norm_var": 0.08406384780934838, + "learning_rate": 0.0001, + "loss": 1.4253, + "loss/crossentropy": 2.7068779468536377, + "loss/hidden": 1.1953125, + "loss/logits": 0.22944168746471405, + "loss/reg": 5.018553929403424e-05, + "step": 956 + }, + { + "epoch": 0.119625, + "grad_norm": 2.523890495300293, + "grad_norm_var": 0.08403835148358946, + "learning_rate": 0.0001, + "loss": 1.3006, + "loss/crossentropy": 2.4745264053344727, + "loss/hidden": 1.1328125, + "loss/logits": 0.16726145148277283, + "loss/reg": 5.017763396608643e-05, + "step": 957 + }, + { + "epoch": 0.11975, + "grad_norm": 2.2493395805358887, + "grad_norm_var": 0.08953556901061101, + "learning_rate": 0.0001, + "loss": 1.3401, + "loss/crossentropy": 2.6549365520477295, + "loss/hidden": 1.15625, + "loss/logits": 0.1833769977092743, + "loss/reg": 5.017070361645892e-05, + "step": 958 + }, + { + "epoch": 0.119875, + "grad_norm": 2.9584262371063232, + "grad_norm_var": 0.10011037915958786, + "learning_rate": 0.0001, + "loss": 1.4029, + "loss/crossentropy": 2.5109822750091553, + "loss/hidden": 1.1953125, + "loss/logits": 0.20707273483276367, + "loss/reg": 5.0166461733169854e-05, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 3.285101890563965, + "grad_norm_var": 0.13420030325027574, + "learning_rate": 0.0001, + "loss": 1.7876, + "loss/crossentropy": 2.0449328422546387, + "loss/hidden": 1.5703125, + "loss/logits": 0.2167491614818573, + "loss/reg": 5.015368151362054e-05, + "step": 960 + }, + { + "epoch": 0.120125, + "grad_norm": 2.2315921783447266, + "grad_norm_var": 0.10631614140368863, + "learning_rate": 0.0001, + "loss": 1.2062, + "loss/crossentropy": 2.7129147052764893, + "loss/hidden": 1.0390625, + "loss/logits": 0.16663849353790283, + "loss/reg": 5.013948612031527e-05, + "step": 961 + }, + { + "epoch": 0.12025, + "grad_norm": 2.6541385650634766, + "grad_norm_var": 0.0985084366826707, + "learning_rate": 0.0001, + "loss": 1.4258, + "loss/crossentropy": 2.565558433532715, + "loss/hidden": 1.2109375, + "loss/logits": 0.21438126266002655, + "loss/reg": 5.0122467655455694e-05, + "step": 962 + }, + { + "epoch": 0.120375, + "grad_norm": 3.077623128890991, + "grad_norm_var": 0.11507731082550829, + "learning_rate": 0.0001, + "loss": 1.3409, + "loss/crossentropy": 2.8010950088500977, + "loss/hidden": 1.140625, + "loss/logits": 0.1998089998960495, + "loss/reg": 5.010988388676196e-05, + "step": 963 + }, + { + "epoch": 0.1205, + "grad_norm": 2.849869728088379, + "grad_norm_var": 0.11924017889962192, + "learning_rate": 0.0001, + "loss": 1.6222, + "loss/crossentropy": 2.570343255996704, + "loss/hidden": 1.3515625, + "loss/logits": 0.27010178565979004, + "loss/reg": 5.009587403037585e-05, + "step": 964 + }, + { + "epoch": 0.120625, + "grad_norm": 2.347888469696045, + "grad_norm_var": 0.12114457046454064, + "learning_rate": 0.0001, + "loss": 1.2266, + "loss/crossentropy": 2.685359239578247, + "loss/hidden": 1.046875, + "loss/logits": 0.17924004793167114, + "loss/reg": 5.00831774843391e-05, + "step": 965 + }, + { + "epoch": 0.12075, + "grad_norm": 3.009894609451294, + "grad_norm_var": 0.12872461062677967, + "learning_rate": 0.0001, + "loss": 1.1479, + "loss/crossentropy": 2.1576528549194336, + "loss/hidden": 1.0, + "loss/logits": 0.14742408692836761, + "loss/reg": 5.006848732591607e-05, + "step": 966 + }, + { + "epoch": 0.120875, + "grad_norm": 2.795736789703369, + "grad_norm_var": 0.1305530559419573, + "learning_rate": 0.0001, + "loss": 1.277, + "loss/crossentropy": 2.619136333465576, + "loss/hidden": 1.09375, + "loss/logits": 0.18277448415756226, + "loss/reg": 5.005668936064467e-05, + "step": 967 + }, + { + "epoch": 0.121, + "grad_norm": 3.669609785079956, + "grad_norm_var": 0.18244444432157156, + "learning_rate": 0.0001, + "loss": 1.4086, + "loss/crossentropy": 2.494560956954956, + "loss/hidden": 1.2109375, + "loss/logits": 0.19715511798858643, + "loss/reg": 5.0041482609231025e-05, + "step": 968 + }, + { + "epoch": 0.121125, + "grad_norm": 2.406184196472168, + "grad_norm_var": 0.17679367962298234, + "learning_rate": 0.0001, + "loss": 1.4467, + "loss/crossentropy": 2.6060521602630615, + "loss/hidden": 1.2265625, + "loss/logits": 0.2196432203054428, + "loss/reg": 5.002524994779378e-05, + "step": 969 + }, + { + "epoch": 0.12125, + "grad_norm": 2.449737071990967, + "grad_norm_var": 0.16984991405668073, + "learning_rate": 0.0001, + "loss": 1.3311, + "loss/crossentropy": 2.571929931640625, + "loss/hidden": 1.140625, + "loss/logits": 0.18994662165641785, + "loss/reg": 5.001126555725932e-05, + "step": 970 + }, + { + "epoch": 0.121375, + "grad_norm": 4.657814025878906, + "grad_norm_var": 0.38136771698041777, + "learning_rate": 0.0001, + "loss": 1.4828, + "loss/crossentropy": 1.9526991844177246, + "loss/hidden": 1.34375, + "loss/logits": 0.1385582983493805, + "loss/reg": 4.999621523893438e-05, + "step": 971 + }, + { + "epoch": 0.1215, + "grad_norm": 2.4782066345214844, + "grad_norm_var": 0.39044515597160984, + "learning_rate": 0.0001, + "loss": 1.5125, + "loss/crossentropy": 2.691642999649048, + "loss/hidden": 1.28125, + "loss/logits": 0.23075540363788605, + "loss/reg": 4.9980124458670616e-05, + "step": 972 + }, + { + "epoch": 0.121625, + "grad_norm": 2.052445650100708, + "grad_norm_var": 0.4250124419864655, + "learning_rate": 0.0001, + "loss": 1.1691, + "loss/crossentropy": 2.250474452972412, + "loss/hidden": 1.015625, + "loss/logits": 0.15293559432029724, + "loss/reg": 4.996646021027118e-05, + "step": 973 + }, + { + "epoch": 0.12175, + "grad_norm": 3.020663261413574, + "grad_norm_var": 0.4031631069299368, + "learning_rate": 0.0001, + "loss": 1.6364, + "loss/crossentropy": 2.6588315963745117, + "loss/hidden": 1.3515625, + "loss/logits": 0.28431302309036255, + "loss/reg": 4.995078415959142e-05, + "step": 974 + }, + { + "epoch": 0.121875, + "grad_norm": 2.3784873485565186, + "grad_norm_var": 0.41746659447213474, + "learning_rate": 0.0001, + "loss": 1.3054, + "loss/crossentropy": 2.7177300453186035, + "loss/hidden": 1.1171875, + "loss/logits": 0.1876787394285202, + "loss/reg": 4.993857510271482e-05, + "step": 975 + }, + { + "epoch": 0.122, + "grad_norm": 2.3851540088653564, + "grad_norm_var": 0.41411408010637746, + "learning_rate": 0.0001, + "loss": 1.3903, + "loss/crossentropy": 2.4946720600128174, + "loss/hidden": 1.1875, + "loss/logits": 0.20234179496765137, + "loss/reg": 4.9920308811124414e-05, + "step": 976 + }, + { + "epoch": 0.122125, + "grad_norm": 3.0380489826202393, + "grad_norm_var": 0.39589390524756496, + "learning_rate": 0.0001, + "loss": 1.4918, + "loss/crossentropy": 2.525160312652588, + "loss/hidden": 1.2578125, + "loss/logits": 0.23351210355758667, + "loss/reg": 4.9907186621567234e-05, + "step": 977 + }, + { + "epoch": 0.12225, + "grad_norm": 3.0249407291412354, + "grad_norm_var": 0.39581891364688315, + "learning_rate": 0.0001, + "loss": 1.3042, + "loss/crossentropy": 2.2865822315216064, + "loss/hidden": 1.125, + "loss/logits": 0.17868509888648987, + "loss/reg": 4.989467197447084e-05, + "step": 978 + }, + { + "epoch": 0.122375, + "grad_norm": 2.5235769748687744, + "grad_norm_var": 0.3983845190744196, + "learning_rate": 0.0001, + "loss": 1.4039, + "loss/crossentropy": 2.571760416030884, + "loss/hidden": 1.203125, + "loss/logits": 0.20029743015766144, + "loss/reg": 4.9882932216860354e-05, + "step": 979 + }, + { + "epoch": 0.1225, + "grad_norm": 2.345391035079956, + "grad_norm_var": 0.4121480969686348, + "learning_rate": 0.0001, + "loss": 1.3082, + "loss/crossentropy": 2.435070037841797, + "loss/hidden": 1.125, + "loss/logits": 0.18273219466209412, + "loss/reg": 4.9867430789163336e-05, + "step": 980 + }, + { + "epoch": 0.122625, + "grad_norm": 2.638735294342041, + "grad_norm_var": 0.40042645398898813, + "learning_rate": 0.0001, + "loss": 1.3063, + "loss/crossentropy": 2.5130980014801025, + "loss/hidden": 1.1328125, + "loss/logits": 0.17303845286369324, + "loss/reg": 4.985083796782419e-05, + "step": 981 + }, + { + "epoch": 0.12275, + "grad_norm": 2.5330209732055664, + "grad_norm_var": 0.40159028364493377, + "learning_rate": 0.0001, + "loss": 1.2562, + "loss/crossentropy": 2.6750576496124268, + "loss/hidden": 1.0859375, + "loss/logits": 0.169732928276062, + "loss/reg": 4.983275357517414e-05, + "step": 982 + }, + { + "epoch": 0.122875, + "grad_norm": 2.729797840118408, + "grad_norm_var": 0.4016784804234855, + "learning_rate": 0.0001, + "loss": 1.2532, + "loss/crossentropy": 2.4468331336975098, + "loss/hidden": 1.0859375, + "loss/logits": 0.1667352169752121, + "loss/reg": 4.9810380005510524e-05, + "step": 983 + }, + { + "epoch": 0.123, + "grad_norm": 2.0886266231536865, + "grad_norm_var": 0.368417637633604, + "learning_rate": 0.0001, + "loss": 1.153, + "loss/crossentropy": 2.3423781394958496, + "loss/hidden": 1.0078125, + "loss/logits": 0.14471980929374695, + "loss/reg": 4.978798096999526e-05, + "step": 984 + }, + { + "epoch": 0.123125, + "grad_norm": 4.569364070892334, + "grad_norm_var": 0.5842302621168443, + "learning_rate": 0.0001, + "loss": 2.2018, + "loss/crossentropy": 3.0823965072631836, + "loss/hidden": 1.859375, + "loss/logits": 0.3418978154659271, + "loss/reg": 4.976466516382061e-05, + "step": 985 + }, + { + "epoch": 0.12325, + "grad_norm": 2.2796969413757324, + "grad_norm_var": 0.5941400852345159, + "learning_rate": 0.0001, + "loss": 1.3454, + "loss/crossentropy": 2.238121509552002, + "loss/hidden": 1.171875, + "loss/logits": 0.17299339175224304, + "loss/reg": 4.9749010941013694e-05, + "step": 986 + }, + { + "epoch": 0.123375, + "grad_norm": 2.2396790981292725, + "grad_norm_var": 0.3594793940281323, + "learning_rate": 0.0001, + "loss": 1.2469, + "loss/crossentropy": 2.4478795528411865, + "loss/hidden": 1.0703125, + "loss/logits": 0.1760822981595993, + "loss/reg": 4.9731748731574044e-05, + "step": 987 + }, + { + "epoch": 0.1235, + "grad_norm": 2.769818067550659, + "grad_norm_var": 0.3582948597206647, + "learning_rate": 0.0001, + "loss": 1.4051, + "loss/crossentropy": 2.5798401832580566, + "loss/hidden": 1.1875, + "loss/logits": 0.2170933485031128, + "loss/reg": 4.971622547600418e-05, + "step": 988 + }, + { + "epoch": 0.123625, + "grad_norm": 7.930779457092285, + "grad_norm_var": 2.038968644334443, + "learning_rate": 0.0001, + "loss": 1.7555, + "loss/crossentropy": 2.4812231063842773, + "loss/hidden": 1.5234375, + "loss/logits": 0.23152852058410645, + "loss/reg": 4.969895235262811e-05, + "step": 989 + }, + { + "epoch": 0.12375, + "grad_norm": 2.6717734336853027, + "grad_norm_var": 2.047056614809465, + "learning_rate": 0.0001, + "loss": 1.3037, + "loss/crossentropy": 2.765683889389038, + "loss/hidden": 1.109375, + "loss/logits": 0.1938176304101944, + "loss/reg": 4.968183202436194e-05, + "step": 990 + }, + { + "epoch": 0.123875, + "grad_norm": 2.1470675468444824, + "grad_norm_var": 2.0698644668564423, + "learning_rate": 0.0001, + "loss": 1.1412, + "loss/crossentropy": 2.3901665210723877, + "loss/hidden": 1.0078125, + "loss/logits": 0.13293424248695374, + "loss/reg": 4.966145206708461e-05, + "step": 991 + }, + { + "epoch": 0.124, + "grad_norm": 3.5537383556365967, + "grad_norm_var": 2.0602370425069307, + "learning_rate": 0.0001, + "loss": 1.5701, + "loss/crossentropy": 2.7112905979156494, + "loss/hidden": 1.3125, + "loss/logits": 0.2570686340332031, + "loss/reg": 4.964391700923443e-05, + "step": 992 + }, + { + "epoch": 0.124125, + "grad_norm": 2.8144960403442383, + "grad_norm_var": 2.0642459406096196, + "learning_rate": 0.0001, + "loss": 1.4034, + "loss/crossentropy": 2.398422956466675, + "loss/hidden": 1.1796875, + "loss/logits": 0.2232397496700287, + "loss/reg": 4.962670573149808e-05, + "step": 993 + }, + { + "epoch": 0.12425, + "grad_norm": 2.438244342803955, + "grad_norm_var": 2.0880153272663686, + "learning_rate": 0.0001, + "loss": 1.2807, + "loss/crossentropy": 2.8859853744506836, + "loss/hidden": 1.09375, + "loss/logits": 0.186467245221138, + "loss/reg": 4.960416481480934e-05, + "step": 994 + }, + { + "epoch": 0.124375, + "grad_norm": 2.4503746032714844, + "grad_norm_var": 2.0931673054725968, + "learning_rate": 0.0001, + "loss": 1.2689, + "loss/crossentropy": 2.5799286365509033, + "loss/hidden": 1.0859375, + "loss/logits": 0.18246138095855713, + "loss/reg": 4.958092904416844e-05, + "step": 995 + }, + { + "epoch": 0.1245, + "grad_norm": 2.7326478958129883, + "grad_norm_var": 2.0680926796305954, + "learning_rate": 0.0001, + "loss": 1.2971, + "loss/crossentropy": 2.5156333446502686, + "loss/hidden": 1.1015625, + "loss/logits": 0.19501572847366333, + "loss/reg": 4.9566217057872564e-05, + "step": 996 + }, + { + "epoch": 0.124625, + "grad_norm": 2.3510055541992188, + "grad_norm_var": 2.0885360429345687, + "learning_rate": 0.0001, + "loss": 1.3411, + "loss/crossentropy": 2.4484684467315674, + "loss/hidden": 1.15625, + "loss/logits": 0.1843767762184143, + "loss/reg": 4.954791802447289e-05, + "step": 997 + }, + { + "epoch": 0.12475, + "grad_norm": 2.5405001640319824, + "grad_norm_var": 2.088055149578788, + "learning_rate": 0.0001, + "loss": 1.3355, + "loss/crossentropy": 2.7601253986358643, + "loss/hidden": 1.1328125, + "loss/logits": 0.2021464705467224, + "loss/reg": 4.952655945089646e-05, + "step": 998 + }, + { + "epoch": 0.124875, + "grad_norm": 2.3194994926452637, + "grad_norm_var": 2.1144102611494255, + "learning_rate": 0.0001, + "loss": 1.3942, + "loss/crossentropy": 2.220510721206665, + "loss/hidden": 1.203125, + "loss/logits": 0.19055192172527313, + "loss/reg": 4.9505808419780806e-05, + "step": 999 + }, + { + "epoch": 0.125, + "grad_norm": 2.5603721141815186, + "grad_norm_var": 2.071398101249299, + "learning_rate": 0.0001, + "loss": 1.36, + "loss/crossentropy": 2.8782269954681396, + "loss/hidden": 1.1484375, + "loss/logits": 0.2110556960105896, + "loss/reg": 4.9486519856145605e-05, + "step": 1000 + }, + { + "epoch": 0.125125, + "grad_norm": 2.2395670413970947, + "grad_norm_var": 1.9303038412412072, + "learning_rate": 0.0001, + "loss": 1.2975, + "loss/crossentropy": 2.69596529006958, + "loss/hidden": 1.125, + "loss/logits": 0.1720152497291565, + "loss/reg": 4.9466805648989975e-05, + "step": 1001 + }, + { + "epoch": 0.12525, + "grad_norm": 2.61370849609375, + "grad_norm_var": 1.9106555491120425, + "learning_rate": 0.0001, + "loss": 1.3189, + "loss/crossentropy": 2.220857620239258, + "loss/hidden": 1.1328125, + "loss/logits": 0.1856231689453125, + "loss/reg": 4.944577085552737e-05, + "step": 1002 + }, + { + "epoch": 0.125375, + "grad_norm": 2.382514715194702, + "grad_norm_var": 1.8993868437643942, + "learning_rate": 0.0001, + "loss": 1.3043, + "loss/crossentropy": 2.4896750450134277, + "loss/hidden": 1.1171875, + "loss/logits": 0.18666106462478638, + "loss/reg": 4.9421672883909196e-05, + "step": 1003 + }, + { + "epoch": 0.1255, + "grad_norm": 2.090632200241089, + "grad_norm_var": 1.9406638681660182, + "learning_rate": 0.0001, + "loss": 1.223, + "loss/crossentropy": 2.4760797023773193, + "loss/hidden": 1.0546875, + "loss/logits": 0.1677936315536499, + "loss/reg": 4.9405876779928803e-05, + "step": 1004 + }, + { + "epoch": 0.125625, + "grad_norm": 2.207566976547241, + "grad_norm_var": 0.12204364862270628, + "learning_rate": 0.0001, + "loss": 1.2431, + "loss/crossentropy": 2.4455795288085938, + "loss/hidden": 1.0703125, + "loss/logits": 0.17229218780994415, + "loss/reg": 4.938853089697659e-05, + "step": 1005 + }, + { + "epoch": 0.12575, + "grad_norm": 2.572543144226074, + "grad_norm_var": 0.12048040871569204, + "learning_rate": 0.0001, + "loss": 1.4118, + "loss/crossentropy": 2.363635540008545, + "loss/hidden": 1.1875, + "loss/logits": 0.22376522421836853, + "loss/reg": 4.936993354931474e-05, + "step": 1006 + }, + { + "epoch": 0.125875, + "grad_norm": 1.9542391300201416, + "grad_norm_var": 0.13190165361677916, + "learning_rate": 0.0001, + "loss": 1.3172, + "loss/crossentropy": 2.2691657543182373, + "loss/hidden": 1.15625, + "loss/logits": 0.1604304313659668, + "loss/reg": 4.934666503686458e-05, + "step": 1007 + }, + { + "epoch": 0.126, + "grad_norm": 2.453639268875122, + "grad_norm_var": 0.05134304514072573, + "learning_rate": 0.0001, + "loss": 1.1705, + "loss/crossentropy": 2.5322656631469727, + "loss/hidden": 1.03125, + "loss/logits": 0.1387622058391571, + "loss/reg": 4.932629963150248e-05, + "step": 1008 + }, + { + "epoch": 0.126125, + "grad_norm": 4.550439834594727, + "grad_norm_var": 0.33097413609364873, + "learning_rate": 0.0001, + "loss": 1.7293, + "loss/crossentropy": 2.4164276123046875, + "loss/hidden": 1.4609375, + "loss/logits": 0.26790663599967957, + "loss/reg": 4.9302150728181005e-05, + "step": 1009 + }, + { + "epoch": 0.12625, + "grad_norm": 2.1454691886901855, + "grad_norm_var": 0.33985839605951634, + "learning_rate": 0.0001, + "loss": 1.2495, + "loss/crossentropy": 2.3248133659362793, + "loss/hidden": 1.09375, + "loss/logits": 0.15521638095378876, + "loss/reg": 4.9280359235126525e-05, + "step": 1010 + }, + { + "epoch": 0.126375, + "grad_norm": 3.380275249481201, + "grad_norm_var": 0.38647376277448847, + "learning_rate": 0.0001, + "loss": 1.4021, + "loss/crossentropy": 2.729059934616089, + "loss/hidden": 1.203125, + "loss/logits": 0.19849233329296112, + "loss/reg": 4.9261008825851604e-05, + "step": 1011 + }, + { + "epoch": 0.1265, + "grad_norm": 2.9117698669433594, + "grad_norm_var": 0.39240144713572794, + "learning_rate": 0.0001, + "loss": 1.4475, + "loss/crossentropy": 2.5537755489349365, + "loss/hidden": 1.25, + "loss/logits": 0.19704851508140564, + "loss/reg": 4.923251617583446e-05, + "step": 1012 + }, + { + "epoch": 0.126625, + "grad_norm": 3.525696277618408, + "grad_norm_var": 0.4428399929631228, + "learning_rate": 0.0001, + "loss": 1.3903, + "loss/crossentropy": 2.463677406311035, + "loss/hidden": 1.171875, + "loss/logits": 0.21794721484184265, + "loss/reg": 4.921284562442452e-05, + "step": 1013 + }, + { + "epoch": 0.12675, + "grad_norm": 2.036464214324951, + "grad_norm_var": 0.46628060550236017, + "learning_rate": 0.0001, + "loss": 1.2705, + "loss/crossentropy": 2.529911756515503, + "loss/hidden": 1.09375, + "loss/logits": 0.17623832821846008, + "loss/reg": 4.9188893171958625e-05, + "step": 1014 + }, + { + "epoch": 0.126875, + "grad_norm": 2.6179473400115967, + "grad_norm_var": 0.45982904228581656, + "learning_rate": 0.0001, + "loss": 1.1577, + "loss/crossentropy": 2.501230001449585, + "loss/hidden": 1.0078125, + "loss/logits": 0.1493779569864273, + "loss/reg": 4.916240504826419e-05, + "step": 1015 + }, + { + "epoch": 0.127, + "grad_norm": 2.0810256004333496, + "grad_norm_var": 0.47929047113658096, + "learning_rate": 0.0001, + "loss": 1.342, + "loss/crossentropy": 2.4926223754882812, + "loss/hidden": 1.15625, + "loss/logits": 0.1852131187915802, + "loss/reg": 4.9135691369883716e-05, + "step": 1016 + }, + { + "epoch": 0.127125, + "grad_norm": 7.561760425567627, + "grad_norm_var": 1.9866254273241934, + "learning_rate": 0.0001, + "loss": 2.1473, + "loss/crossentropy": 2.5160434246063232, + "loss/hidden": 1.75, + "loss/logits": 0.3968074321746826, + "loss/reg": 4.910998904961161e-05, + "step": 1017 + }, + { + "epoch": 0.12725, + "grad_norm": 2.6571788787841797, + "grad_norm_var": 1.9848357777071584, + "learning_rate": 0.0001, + "loss": 1.4137, + "loss/crossentropy": 2.503122329711914, + "loss/hidden": 1.1875, + "loss/logits": 0.2257142961025238, + "loss/reg": 4.908502523903735e-05, + "step": 1018 + }, + { + "epoch": 0.127375, + "grad_norm": 2.9156904220581055, + "grad_norm_var": 1.962575207346818, + "learning_rate": 0.0001, + "loss": 1.3309, + "loss/crossentropy": 2.6177690029144287, + "loss/hidden": 1.1015625, + "loss/logits": 0.22888478636741638, + "loss/reg": 4.906098547508009e-05, + "step": 1019 + }, + { + "epoch": 0.1275, + "grad_norm": 2.439221143722534, + "grad_norm_var": 1.928884650271243, + "learning_rate": 0.0001, + "loss": 1.3874, + "loss/crossentropy": 2.2906293869018555, + "loss/hidden": 1.203125, + "loss/logits": 0.18382297456264496, + "loss/reg": 4.9045229388866574e-05, + "step": 1020 + }, + { + "epoch": 0.127625, + "grad_norm": 2.555960178375244, + "grad_norm_var": 1.899628603116729, + "learning_rate": 0.0001, + "loss": 1.4984, + "loss/crossentropy": 2.429898262023926, + "loss/hidden": 1.265625, + "loss/logits": 0.2322796881198883, + "loss/reg": 4.902849468635395e-05, + "step": 1021 + }, + { + "epoch": 0.12775, + "grad_norm": 3.4673826694488525, + "grad_norm_var": 1.89599455975474, + "learning_rate": 0.0001, + "loss": 1.3344, + "loss/crossentropy": 2.338287353515625, + "loss/hidden": 1.140625, + "loss/logits": 0.19326989352703094, + "loss/reg": 4.901745342067443e-05, + "step": 1022 + }, + { + "epoch": 0.127875, + "grad_norm": 3.8274197578430176, + "grad_norm_var": 1.8345311497725865, + "learning_rate": 0.0001, + "loss": 1.6114, + "loss/crossentropy": 2.693875312805176, + "loss/hidden": 1.359375, + "loss/logits": 0.2515537440776825, + "loss/reg": 4.899735358776525e-05, + "step": 1023 + }, + { + "epoch": 0.128, + "grad_norm": 2.0832443237304688, + "grad_norm_var": 1.8797411681812697, + "learning_rate": 0.0001, + "loss": 1.2648, + "loss/crossentropy": 2.5829262733459473, + "loss/hidden": 1.1015625, + "loss/logits": 0.16278542578220367, + "loss/reg": 4.8977166443364695e-05, + "step": 1024 + }, + { + "epoch": 0.128125, + "grad_norm": 2.5200939178466797, + "grad_norm_var": 1.7643075835401967, + "learning_rate": 0.0001, + "loss": 1.5924, + "loss/crossentropy": 2.269489288330078, + "loss/hidden": 1.328125, + "loss/logits": 0.263778954744339, + "loss/reg": 4.896056270808913e-05, + "step": 1025 + }, + { + "epoch": 0.12825, + "grad_norm": 4.524285793304443, + "grad_norm_var": 1.8325406094582772, + "learning_rate": 0.0001, + "loss": 1.6757, + "loss/crossentropy": 2.3938968181610107, + "loss/hidden": 1.4375, + "loss/logits": 0.23774707317352295, + "loss/reg": 4.8936548409983516e-05, + "step": 1026 + }, + { + "epoch": 0.128375, + "grad_norm": 15.092655181884766, + "grad_norm_var": 10.697039493485823, + "learning_rate": 0.0001, + "loss": 2.1874, + "loss/crossentropy": 2.30786395072937, + "loss/hidden": 1.96875, + "loss/logits": 0.21819457411766052, + "loss/reg": 4.8918100219452754e-05, + "step": 1027 + }, + { + "epoch": 0.1285, + "grad_norm": 4.296483993530273, + "grad_norm_var": 10.629602505750489, + "learning_rate": 0.0001, + "loss": 1.6954, + "loss/crossentropy": 2.2013046741485596, + "loss/hidden": 1.4765625, + "loss/logits": 0.21837672591209412, + "loss/reg": 4.8897407395998016e-05, + "step": 1028 + }, + { + "epoch": 0.128625, + "grad_norm": 2.0280611515045166, + "grad_norm_var": 10.867023015671123, + "learning_rate": 0.0001, + "loss": 1.2187, + "loss/crossentropy": 2.3579764366149902, + "loss/hidden": 1.046875, + "loss/logits": 0.17130854725837708, + "loss/reg": 4.887663453700952e-05, + "step": 1029 + }, + { + "epoch": 0.12875, + "grad_norm": 3.1296451091766357, + "grad_norm_var": 10.66731170329762, + "learning_rate": 0.0001, + "loss": 1.4486, + "loss/crossentropy": 2.4010651111602783, + "loss/hidden": 1.2734375, + "loss/logits": 0.1747232973575592, + "loss/reg": 4.8857429646886885e-05, + "step": 1030 + }, + { + "epoch": 0.128875, + "grad_norm": 2.333757162094116, + "grad_norm_var": 10.724249974607462, + "learning_rate": 0.0001, + "loss": 1.3921, + "loss/crossentropy": 2.4793214797973633, + "loss/hidden": 1.171875, + "loss/logits": 0.21972517669200897, + "loss/reg": 4.883726069238037e-05, + "step": 1031 + }, + { + "epoch": 0.129, + "grad_norm": 2.856081962585449, + "grad_norm_var": 10.566625900721201, + "learning_rate": 0.0001, + "loss": 1.6614, + "loss/crossentropy": 2.073331594467163, + "loss/hidden": 1.375, + "loss/logits": 0.2858877182006836, + "loss/reg": 4.881904897047207e-05, + "step": 1032 + }, + { + "epoch": 0.129125, + "grad_norm": 2.5459816455841064, + "grad_norm_var": 9.769079293862243, + "learning_rate": 0.0001, + "loss": 1.217, + "loss/crossentropy": 2.49267315864563, + "loss/hidden": 1.046875, + "loss/logits": 0.16961437463760376, + "loss/reg": 4.880000778939575e-05, + "step": 1033 + }, + { + "epoch": 0.12925, + "grad_norm": 2.683567523956299, + "grad_norm_var": 9.76543758025689, + "learning_rate": 0.0001, + "loss": 1.4644, + "loss/crossentropy": 2.461768627166748, + "loss/hidden": 1.2734375, + "loss/logits": 0.19044238328933716, + "loss/reg": 4.8780999350128695e-05, + "step": 1034 + }, + { + "epoch": 0.129375, + "grad_norm": 3.0184967517852783, + "grad_norm_var": 9.755261948776834, + "learning_rate": 0.0001, + "loss": 1.6364, + "loss/crossentropy": 2.4083235263824463, + "loss/hidden": 1.34375, + "loss/logits": 0.292146235704422, + "loss/reg": 4.8763413360575214e-05, + "step": 1035 + }, + { + "epoch": 0.1295, + "grad_norm": 2.9849438667297363, + "grad_norm_var": 9.681217018438769, + "learning_rate": 0.0001, + "loss": 1.4003, + "loss/crossentropy": 2.462160348892212, + "loss/hidden": 1.1875, + "loss/logits": 0.21227726340293884, + "loss/reg": 4.8741494538262486e-05, + "step": 1036 + }, + { + "epoch": 0.129625, + "grad_norm": 2.4786312580108643, + "grad_norm_var": 9.69386845836604, + "learning_rate": 0.0001, + "loss": 1.4985, + "loss/crossentropy": 2.27028489112854, + "loss/hidden": 1.2578125, + "loss/logits": 0.24016925692558289, + "loss/reg": 4.8723446525400504e-05, + "step": 1037 + }, + { + "epoch": 0.12975, + "grad_norm": 2.5076568126678467, + "grad_norm_var": 9.78656640570568, + "learning_rate": 0.0001, + "loss": 1.2807, + "loss/crossentropy": 2.4636549949645996, + "loss/hidden": 1.09375, + "loss/logits": 0.18648827075958252, + "loss/reg": 4.8708836402511224e-05, + "step": 1038 + }, + { + "epoch": 0.129875, + "grad_norm": 5.714248180389404, + "grad_norm_var": 10.04567390941182, + "learning_rate": 0.0001, + "loss": 1.4724, + "loss/crossentropy": 2.7502284049987793, + "loss/hidden": 1.296875, + "loss/logits": 0.1750672310590744, + "loss/reg": 4.8695965233491734e-05, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 5.00759220123291, + "grad_norm_var": 9.910829392800613, + "learning_rate": 0.0001, + "loss": 1.9101, + "loss/crossentropy": 2.573537826538086, + "loss/hidden": 1.640625, + "loss/logits": 0.2690110206604004, + "loss/reg": 4.868546238867566e-05, + "step": 1040 + }, + { + "epoch": 0.130125, + "grad_norm": 2.8802566528320312, + "grad_norm_var": 9.848702943805838, + "learning_rate": 0.0001, + "loss": 1.2756, + "loss/crossentropy": 2.78995680809021, + "loss/hidden": 1.078125, + "loss/logits": 0.19703038036823273, + "loss/reg": 4.867479583481327e-05, + "step": 1041 + }, + { + "epoch": 0.13025, + "grad_norm": 2.6452934741973877, + "grad_norm_var": 9.939305055834435, + "learning_rate": 0.0001, + "loss": 1.5079, + "loss/crossentropy": 2.455906867980957, + "loss/hidden": 1.265625, + "loss/logits": 0.24182063341140747, + "loss/reg": 4.866902600042522e-05, + "step": 1042 + }, + { + "epoch": 0.130375, + "grad_norm": 2.370833396911621, + "grad_norm_var": 1.0482923897399208, + "learning_rate": 0.0001, + "loss": 1.2825, + "loss/crossentropy": 2.5100162029266357, + "loss/hidden": 1.1015625, + "loss/logits": 0.18049922585487366, + "loss/reg": 4.866467497777194e-05, + "step": 1043 + }, + { + "epoch": 0.1305, + "grad_norm": 2.5938727855682373, + "grad_norm_var": 0.9561722032163684, + "learning_rate": 0.0001, + "loss": 1.4683, + "loss/crossentropy": 2.258338212966919, + "loss/hidden": 1.2578125, + "loss/logits": 0.2099648416042328, + "loss/reg": 4.86554745293688e-05, + "step": 1044 + }, + { + "epoch": 0.130625, + "grad_norm": 2.603637218475342, + "grad_norm_var": 0.9033481292550277, + "learning_rate": 0.0001, + "loss": 1.3901, + "loss/crossentropy": 2.691969871520996, + "loss/hidden": 1.1875, + "loss/logits": 0.202120840549469, + "loss/reg": 4.865137816523202e-05, + "step": 1045 + }, + { + "epoch": 0.13075, + "grad_norm": 2.4388020038604736, + "grad_norm_var": 0.9232760601983566, + "learning_rate": 0.0001, + "loss": 1.2329, + "loss/crossentropy": 2.3171486854553223, + "loss/hidden": 1.0625, + "loss/logits": 0.16995665431022644, + "loss/reg": 4.864386573899537e-05, + "step": 1046 + }, + { + "epoch": 0.130875, + "grad_norm": 2.494767904281616, + "grad_norm_var": 0.9110446675235295, + "learning_rate": 0.0001, + "loss": 1.6476, + "loss/crossentropy": 2.3494441509246826, + "loss/hidden": 1.3984375, + "loss/logits": 0.24871033430099487, + "loss/reg": 4.862891000811942e-05, + "step": 1047 + }, + { + "epoch": 0.131, + "grad_norm": 49.92427444458008, + "grad_norm_var": 138.5400442659784, + "learning_rate": 0.0001, + "loss": 1.2997, + "loss/crossentropy": 2.663750648498535, + "loss/hidden": 1.1328125, + "loss/logits": 0.16641995310783386, + "loss/reg": 4.861348133999854e-05, + "step": 1048 + }, + { + "epoch": 0.131125, + "grad_norm": 2.3657724857330322, + "grad_norm_var": 138.62340409534175, + "learning_rate": 0.0001, + "loss": 1.6284, + "loss/crossentropy": 2.2861435413360596, + "loss/hidden": 1.359375, + "loss/logits": 0.2685868740081787, + "loss/reg": 4.8597343266010284e-05, + "step": 1049 + }, + { + "epoch": 0.13125, + "grad_norm": 2.4345011711120605, + "grad_norm_var": 138.73474415279924, + "learning_rate": 0.0001, + "loss": 1.4715, + "loss/crossentropy": 2.8440446853637695, + "loss/hidden": 1.2265625, + "loss/logits": 0.24448415637016296, + "loss/reg": 4.858469037571922e-05, + "step": 1050 + }, + { + "epoch": 0.131375, + "grad_norm": 2.356966018676758, + "grad_norm_var": 139.01660648328053, + "learning_rate": 0.0001, + "loss": 1.225, + "loss/crossentropy": 2.7689743041992188, + "loss/hidden": 1.0625, + "loss/logits": 0.16196824610233307, + "loss/reg": 4.856809391640127e-05, + "step": 1051 + }, + { + "epoch": 0.1315, + "grad_norm": 4.138930797576904, + "grad_norm_var": 138.65706217924568, + "learning_rate": 0.0001, + "loss": 1.4593, + "loss/crossentropy": 2.3509294986724854, + "loss/hidden": 1.2734375, + "loss/logits": 0.18540024757385254, + "loss/reg": 4.855730730923824e-05, + "step": 1052 + }, + { + "epoch": 0.131625, + "grad_norm": 3.313295364379883, + "grad_norm_var": 138.315976999055, + "learning_rate": 0.0001, + "loss": 1.2929, + "loss/crossentropy": 2.486267566680908, + "loss/hidden": 1.125, + "loss/logits": 0.16744676232337952, + "loss/reg": 4.854327562497929e-05, + "step": 1053 + }, + { + "epoch": 0.13175, + "grad_norm": 2.193730354309082, + "grad_norm_var": 138.46776734896258, + "learning_rate": 0.0001, + "loss": 1.0996, + "loss/crossentropy": 2.776700496673584, + "loss/hidden": 0.97265625, + "loss/logits": 0.12648262083530426, + "loss/reg": 4.8529618652537465e-05, + "step": 1054 + }, + { + "epoch": 0.131875, + "grad_norm": 5.068596839904785, + "grad_norm_var": 138.51560574772913, + "learning_rate": 0.0001, + "loss": 2.2526, + "loss/crossentropy": 2.9333205223083496, + "loss/hidden": 1.890625, + "loss/logits": 0.36145755648612976, + "loss/reg": 4.8516998504055664e-05, + "step": 1055 + }, + { + "epoch": 0.132, + "grad_norm": 2.5417041778564453, + "grad_norm_var": 139.19791301979916, + "learning_rate": 0.0001, + "loss": 1.4094, + "loss/crossentropy": 2.6531319618225098, + "loss/hidden": 1.1875, + "loss/logits": 0.2214501053094864, + "loss/reg": 4.8505764425499365e-05, + "step": 1056 + }, + { + "epoch": 0.132125, + "grad_norm": 2.918754816055298, + "grad_norm_var": 139.18315783121213, + "learning_rate": 0.0001, + "loss": 1.9489, + "loss/crossentropy": 2.144564151763916, + "loss/hidden": 1.6484375, + "loss/logits": 0.29992836713790894, + "loss/reg": 4.849358811043203e-05, + "step": 1057 + }, + { + "epoch": 0.13225, + "grad_norm": 2.9262378215789795, + "grad_norm_var": 139.07084575115195, + "learning_rate": 0.0001, + "loss": 1.4915, + "loss/crossentropy": 2.6257550716400146, + "loss/hidden": 1.2890625, + "loss/logits": 0.20195814967155457, + "loss/reg": 4.848901880905032e-05, + "step": 1058 + }, + { + "epoch": 0.132375, + "grad_norm": 2.1910111904144287, + "grad_norm_var": 139.15491264737338, + "learning_rate": 0.0001, + "loss": 1.3347, + "loss/crossentropy": 2.4378821849823, + "loss/hidden": 1.140625, + "loss/logits": 0.19360756874084473, + "loss/reg": 4.8472938942722976e-05, + "step": 1059 + }, + { + "epoch": 0.1325, + "grad_norm": 2.1801326274871826, + "grad_norm_var": 139.34146098904594, + "learning_rate": 0.0001, + "loss": 1.346, + "loss/crossentropy": 2.6777944564819336, + "loss/hidden": 1.1328125, + "loss/logits": 0.21269740164279938, + "loss/reg": 4.8464396968483925e-05, + "step": 1060 + }, + { + "epoch": 0.132625, + "grad_norm": 2.5178987979888916, + "grad_norm_var": 139.37795408866836, + "learning_rate": 0.0001, + "loss": 1.2985, + "loss/crossentropy": 2.4651262760162354, + "loss/hidden": 1.1171875, + "loss/logits": 0.18079692125320435, + "loss/reg": 4.844791328650899e-05, + "step": 1061 + }, + { + "epoch": 0.13275, + "grad_norm": 2.6272640228271484, + "grad_norm_var": 139.29696084046898, + "learning_rate": 0.0001, + "loss": 1.2467, + "loss/crossentropy": 2.454970121383667, + "loss/hidden": 1.078125, + "loss/logits": 0.16807857155799866, + "loss/reg": 4.843333954340778e-05, + "step": 1062 + }, + { + "epoch": 0.132875, + "grad_norm": 3.2516655921936035, + "grad_norm_var": 139.00302706804425, + "learning_rate": 0.0001, + "loss": 1.5799, + "loss/crossentropy": 2.5497400760650635, + "loss/hidden": 1.34375, + "loss/logits": 0.2356692999601364, + "loss/reg": 4.84242700622417e-05, + "step": 1063 + }, + { + "epoch": 0.133, + "grad_norm": 3.2384908199310303, + "grad_norm_var": 0.6208098056257557, + "learning_rate": 0.0001, + "loss": 1.5082, + "loss/crossentropy": 2.2672436237335205, + "loss/hidden": 1.296875, + "loss/logits": 0.2108006477355957, + "loss/reg": 4.841904592467472e-05, + "step": 1064 + }, + { + "epoch": 0.133125, + "grad_norm": 5.589666366577148, + "grad_norm_var": 1.0443921444605961, + "learning_rate": 0.0001, + "loss": 1.442, + "loss/crossentropy": 2.660480499267578, + "loss/hidden": 1.265625, + "loss/logits": 0.17584452033042908, + "loss/reg": 4.8413534386781976e-05, + "step": 1065 + }, + { + "epoch": 0.13325, + "grad_norm": 2.072868824005127, + "grad_norm_var": 1.0843195820781981, + "learning_rate": 0.0001, + "loss": 1.3041, + "loss/crossentropy": 2.5538711547851562, + "loss/hidden": 1.1171875, + "loss/logits": 0.18640504777431488, + "loss/reg": 4.8400739615317434e-05, + "step": 1066 + }, + { + "epoch": 0.133375, + "grad_norm": 3.5696866512298584, + "grad_norm_var": 1.0608700784999008, + "learning_rate": 0.0001, + "loss": 1.2464, + "loss/crossentropy": 2.4085769653320312, + "loss/hidden": 1.0859375, + "loss/logits": 0.1599891036748886, + "loss/reg": 4.838715904043056e-05, + "step": 1067 + }, + { + "epoch": 0.1335, + "grad_norm": 2.6294519901275635, + "grad_norm_var": 1.0034864033576762, + "learning_rate": 0.0001, + "loss": 1.2444, + "loss/crossentropy": 2.6341211795806885, + "loss/hidden": 1.0703125, + "loss/logits": 0.17361275851726532, + "loss/reg": 4.837429878534749e-05, + "step": 1068 + }, + { + "epoch": 0.133625, + "grad_norm": 2.570727825164795, + "grad_norm_var": 1.0120691658735572, + "learning_rate": 0.0001, + "loss": 1.1159, + "loss/crossentropy": 2.428523540496826, + "loss/hidden": 0.96484375, + "loss/logits": 0.15061385929584503, + "loss/reg": 4.8361835069954395e-05, + "step": 1069 + }, + { + "epoch": 0.13375, + "grad_norm": 2.9712700843811035, + "grad_norm_var": 0.9656976354251573, + "learning_rate": 0.0001, + "loss": 1.7233, + "loss/crossentropy": 2.2465953826904297, + "loss/hidden": 1.4375, + "loss/logits": 0.28532207012176514, + "loss/reg": 4.834888386540115e-05, + "step": 1070 + }, + { + "epoch": 0.133875, + "grad_norm": 2.2101681232452393, + "grad_norm_var": 0.7085842206157051, + "learning_rate": 0.0001, + "loss": 1.2741, + "loss/crossentropy": 2.525843381881714, + "loss/hidden": 1.109375, + "loss/logits": 0.16421258449554443, + "loss/reg": 4.834130231756717e-05, + "step": 1071 + }, + { + "epoch": 0.134, + "grad_norm": 2.3994717597961426, + "grad_norm_var": 0.7161776254130849, + "learning_rate": 0.0001, + "loss": 1.4061, + "loss/crossentropy": 2.706874370574951, + "loss/hidden": 1.1796875, + "loss/logits": 0.2258806824684143, + "loss/reg": 4.8336529289372265e-05, + "step": 1072 + }, + { + "epoch": 0.134125, + "grad_norm": 3.553044080734253, + "grad_norm_var": 0.7457380382287337, + "learning_rate": 0.0001, + "loss": 1.5817, + "loss/crossentropy": 2.382352590560913, + "loss/hidden": 1.3203125, + "loss/logits": 0.2608606517314911, + "loss/reg": 4.83308540424332e-05, + "step": 1073 + }, + { + "epoch": 0.13425, + "grad_norm": 2.454843282699585, + "grad_norm_var": 0.7583663462414731, + "learning_rate": 0.0001, + "loss": 1.3007, + "loss/crossentropy": 2.62082839012146, + "loss/hidden": 1.1328125, + "loss/logits": 0.1674119234085083, + "loss/reg": 4.8327397962566465e-05, + "step": 1074 + }, + { + "epoch": 0.134375, + "grad_norm": 2.4312965869903564, + "grad_norm_var": 0.74000585371445, + "learning_rate": 0.0001, + "loss": 1.633, + "loss/crossentropy": 2.220055103302002, + "loss/hidden": 1.375, + "loss/logits": 0.2574748992919922, + "loss/reg": 4.832783088204451e-05, + "step": 1075 + }, + { + "epoch": 0.1345, + "grad_norm": 2.182128667831421, + "grad_norm_var": 0.7398167146684991, + "learning_rate": 0.0001, + "loss": 1.2387, + "loss/crossentropy": 2.543452501296997, + "loss/hidden": 1.0625, + "loss/logits": 0.17573592066764832, + "loss/reg": 4.831590558751486e-05, + "step": 1076 + }, + { + "epoch": 0.134625, + "grad_norm": 5.24340295791626, + "grad_norm_var": 1.068188147100082, + "learning_rate": 0.0001, + "loss": 1.9491, + "loss/crossentropy": 2.9112777709960938, + "loss/hidden": 1.6484375, + "loss/logits": 0.3002144396305084, + "loss/reg": 4.831279147765599e-05, + "step": 1077 + }, + { + "epoch": 0.13475, + "grad_norm": 7.397027015686035, + "grad_norm_var": 2.2134877049840185, + "learning_rate": 0.0001, + "loss": 2.1408, + "loss/crossentropy": 1.7057814598083496, + "loss/hidden": 1.8984375, + "loss/logits": 0.2418610006570816, + "loss/reg": 4.831252226722427e-05, + "step": 1078 + }, + { + "epoch": 0.134875, + "grad_norm": 2.7105135917663574, + "grad_norm_var": 2.2396307633405352, + "learning_rate": 0.0001, + "loss": 1.3127, + "loss/crossentropy": 2.462306261062622, + "loss/hidden": 1.125, + "loss/logits": 0.18726205825805664, + "loss/reg": 4.830381294596009e-05, + "step": 1079 + }, + { + "epoch": 0.135, + "grad_norm": 2.3571834564208984, + "grad_norm_var": 2.298516862523117, + "learning_rate": 0.0001, + "loss": 1.2161, + "loss/crossentropy": 2.7559218406677246, + "loss/hidden": 1.0546875, + "loss/logits": 0.16088250279426575, + "loss/reg": 4.828806049772538e-05, + "step": 1080 + }, + { + "epoch": 0.135125, + "grad_norm": 2.9629805088043213, + "grad_norm_var": 1.9178276329659905, + "learning_rate": 0.0001, + "loss": 1.2992, + "loss/crossentropy": 2.526937484741211, + "loss/hidden": 1.1171875, + "loss/logits": 0.18156106770038605, + "loss/reg": 4.828528471989557e-05, + "step": 1081 + }, + { + "epoch": 0.13525, + "grad_norm": 2.0817148685455322, + "grad_norm_var": 1.9166124946652812, + "learning_rate": 0.0001, + "loss": 1.1721, + "loss/crossentropy": 2.451180934906006, + "loss/hidden": 1.015625, + "loss/logits": 0.1559874713420868, + "loss/reg": 4.828141754842363e-05, + "step": 1082 + }, + { + "epoch": 0.135375, + "grad_norm": 2.3504202365875244, + "grad_norm_var": 1.9344384047775915, + "learning_rate": 0.0001, + "loss": 1.1707, + "loss/crossentropy": 2.8747832775115967, + "loss/hidden": 1.015625, + "loss/logits": 0.1546313613653183, + "loss/reg": 4.827776865568012e-05, + "step": 1083 + }, + { + "epoch": 0.1355, + "grad_norm": 2.17263126373291, + "grad_norm_var": 1.971976005860794, + "learning_rate": 0.0001, + "loss": 1.3889, + "loss/crossentropy": 2.723773717880249, + "loss/hidden": 1.1796875, + "loss/logits": 0.20874163508415222, + "loss/reg": 4.827152588404715e-05, + "step": 1084 + }, + { + "epoch": 0.135625, + "grad_norm": 2.8484668731689453, + "grad_norm_var": 1.9607874358662902, + "learning_rate": 0.0001, + "loss": 1.4451, + "loss/crossentropy": 2.3394646644592285, + "loss/hidden": 1.2578125, + "loss/logits": 0.18679557740688324, + "loss/reg": 4.826381700695492e-05, + "step": 1085 + }, + { + "epoch": 0.13575, + "grad_norm": 2.0447754859924316, + "grad_norm_var": 2.0205073590326026, + "learning_rate": 0.0001, + "loss": 1.2954, + "loss/crossentropy": 2.6775341033935547, + "loss/hidden": 1.1171875, + "loss/logits": 0.17772985994815826, + "loss/reg": 4.8260139010380954e-05, + "step": 1086 + }, + { + "epoch": 0.135875, + "grad_norm": 2.259357452392578, + "grad_norm_var": 2.015724328520027, + "learning_rate": 0.0001, + "loss": 1.4122, + "loss/crossentropy": 2.410541534423828, + "loss/hidden": 1.1875, + "loss/logits": 0.2242109179496765, + "loss/reg": 4.8245739890262485e-05, + "step": 1087 + }, + { + "epoch": 0.136, + "grad_norm": 2.8727777004241943, + "grad_norm_var": 1.9939999196141713, + "learning_rate": 0.0001, + "loss": 1.313, + "loss/crossentropy": 2.701556921005249, + "loss/hidden": 1.125, + "loss/logits": 0.1874905824661255, + "loss/reg": 4.823635390494019e-05, + "step": 1088 + }, + { + "epoch": 0.136125, + "grad_norm": 2.3779656887054443, + "grad_norm_var": 1.9928928653171767, + "learning_rate": 0.0001, + "loss": 1.2705, + "loss/crossentropy": 2.656157970428467, + "loss/hidden": 1.1015625, + "loss/logits": 0.16845184564590454, + "loss/reg": 4.823903873329982e-05, + "step": 1089 + }, + { + "epoch": 0.13625, + "grad_norm": 3.045264720916748, + "grad_norm_var": 1.9779265068353404, + "learning_rate": 0.0001, + "loss": 1.4187, + "loss/crossentropy": 2.417032241821289, + "loss/hidden": 1.2265625, + "loss/logits": 0.19169865548610687, + "loss/reg": 4.823317431146279e-05, + "step": 1090 + }, + { + "epoch": 0.136375, + "grad_norm": 2.2823352813720703, + "grad_norm_var": 1.9897867705807812, + "learning_rate": 0.0001, + "loss": 1.1742, + "loss/crossentropy": 2.6444296836853027, + "loss/hidden": 1.03125, + "loss/logits": 0.14250019192695618, + "loss/reg": 4.8217982111964375e-05, + "step": 1091 + }, + { + "epoch": 0.1365, + "grad_norm": 3.824678897857666, + "grad_norm_var": 1.9903923191744959, + "learning_rate": 0.0001, + "loss": 1.3644, + "loss/crossentropy": 2.6071150302886963, + "loss/hidden": 1.171875, + "loss/logits": 0.19199731945991516, + "loss/reg": 4.8215104470727965e-05, + "step": 1092 + }, + { + "epoch": 0.136625, + "grad_norm": 2.549513578414917, + "grad_norm_var": 1.656826383552368, + "learning_rate": 0.0001, + "loss": 1.3147, + "loss/crossentropy": 2.4254698753356934, + "loss/hidden": 1.1484375, + "loss/logits": 0.16581851243972778, + "loss/reg": 4.82136856589932e-05, + "step": 1093 + }, + { + "epoch": 0.13675, + "grad_norm": 6.654994964599609, + "grad_norm_var": 1.244691979844697, + "learning_rate": 0.0001, + "loss": 1.2894, + "loss/crossentropy": 2.5099565982818604, + "loss/hidden": 1.125, + "loss/logits": 0.1638849973678589, + "loss/reg": 4.8204721679212525e-05, + "step": 1094 + }, + { + "epoch": 0.136875, + "grad_norm": 2.3230693340301514, + "grad_norm_var": 1.2606197778757762, + "learning_rate": 0.0001, + "loss": 1.3462, + "loss/crossentropy": 2.512430429458618, + "loss/hidden": 1.1484375, + "loss/logits": 0.19730091094970703, + "loss/reg": 4.8203397454926744e-05, + "step": 1095 + }, + { + "epoch": 0.137, + "grad_norm": 2.261275053024292, + "grad_norm_var": 1.267023668315869, + "learning_rate": 0.0001, + "loss": 1.2825, + "loss/crossentropy": 2.6225202083587646, + "loss/hidden": 1.109375, + "loss/logits": 0.17268520593643188, + "loss/reg": 4.8197605792665854e-05, + "step": 1096 + }, + { + "epoch": 0.137125, + "grad_norm": 2.195023775100708, + "grad_norm_var": 1.2879134307282092, + "learning_rate": 0.0001, + "loss": 1.3726, + "loss/crossentropy": 2.742640256881714, + "loss/hidden": 1.15625, + "loss/logits": 0.2158210277557373, + "loss/reg": 4.818299203179777e-05, + "step": 1097 + }, + { + "epoch": 0.13725, + "grad_norm": 3.191065549850464, + "grad_norm_var": 1.26464759974198, + "learning_rate": 0.0001, + "loss": 1.3751, + "loss/crossentropy": 2.2995545864105225, + "loss/hidden": 1.1875, + "loss/logits": 0.18707019090652466, + "loss/reg": 4.817119406652637e-05, + "step": 1098 + }, + { + "epoch": 0.137375, + "grad_norm": 2.409144401550293, + "grad_norm_var": 1.2611209881187542, + "learning_rate": 0.0001, + "loss": 1.2264, + "loss/crossentropy": 2.800138473510742, + "loss/hidden": 1.0703125, + "loss/logits": 0.15558959543704987, + "loss/reg": 4.815995634999126e-05, + "step": 1099 + }, + { + "epoch": 0.1375, + "grad_norm": 2.1808769702911377, + "grad_norm_var": 1.2604002860858665, + "learning_rate": 0.0001, + "loss": 1.2945, + "loss/crossentropy": 2.440044641494751, + "loss/hidden": 1.109375, + "loss/logits": 0.18468278646469116, + "loss/reg": 4.814787462237291e-05, + "step": 1100 + }, + { + "epoch": 0.137625, + "grad_norm": 2.423313617706299, + "grad_norm_var": 1.2707944512029181, + "learning_rate": 0.0001, + "loss": 1.4005, + "loss/crossentropy": 2.3712844848632812, + "loss/hidden": 1.203125, + "loss/logits": 0.19688689708709717, + "loss/reg": 4.81365823361557e-05, + "step": 1101 + }, + { + "epoch": 0.13775, + "grad_norm": 2.9854326248168945, + "grad_norm_var": 1.230627637633081, + "learning_rate": 0.0001, + "loss": 1.3837, + "loss/crossentropy": 2.38476300239563, + "loss/hidden": 1.203125, + "loss/logits": 0.18007422983646393, + "loss/reg": 4.8126166802830994e-05, + "step": 1102 + }, + { + "epoch": 0.137875, + "grad_norm": 2.0251779556274414, + "grad_norm_var": 1.2529580510886253, + "learning_rate": 0.0001, + "loss": 1.2151, + "loss/crossentropy": 2.37416410446167, + "loss/hidden": 1.0546875, + "loss/logits": 0.15991759300231934, + "loss/reg": 4.811226244783029e-05, + "step": 1103 + }, + { + "epoch": 0.138, + "grad_norm": 2.8228797912597656, + "grad_norm_var": 1.2529629166446565, + "learning_rate": 0.0001, + "loss": 1.3447, + "loss/crossentropy": 2.926377296447754, + "loss/hidden": 1.1484375, + "loss/logits": 0.19579367339611053, + "loss/reg": 4.809819074580446e-05, + "step": 1104 + }, + { + "epoch": 0.138125, + "grad_norm": 2.234987735748291, + "grad_norm_var": 1.263182141719084, + "learning_rate": 0.0001, + "loss": 1.2051, + "loss/crossentropy": 2.458136558532715, + "loss/hidden": 1.0390625, + "loss/logits": 0.16551420092582703, + "loss/reg": 4.8088575567817315e-05, + "step": 1105 + }, + { + "epoch": 0.13825, + "grad_norm": 2.385833740234375, + "grad_norm_var": 1.272142330921156, + "learning_rate": 0.0001, + "loss": 1.3504, + "loss/crossentropy": 2.558030843734741, + "loss/hidden": 1.1484375, + "loss/logits": 0.2014528512954712, + "loss/reg": 4.807797813555226e-05, + "step": 1106 + }, + { + "epoch": 0.138375, + "grad_norm": 3.3867454528808594, + "grad_norm_var": 1.272610209843083, + "learning_rate": 0.0001, + "loss": 1.5406, + "loss/crossentropy": 2.503751516342163, + "loss/hidden": 1.3203125, + "loss/logits": 0.21979767084121704, + "loss/reg": 4.8066383897094056e-05, + "step": 1107 + }, + { + "epoch": 0.1385, + "grad_norm": 2.790325880050659, + "grad_norm_var": 1.2072459836922649, + "learning_rate": 0.0001, + "loss": 1.4099, + "loss/crossentropy": 2.3122177124023438, + "loss/hidden": 1.21875, + "loss/logits": 0.19064576923847198, + "loss/reg": 4.805479693459347e-05, + "step": 1108 + }, + { + "epoch": 0.138625, + "grad_norm": 2.8147926330566406, + "grad_norm_var": 1.202740992103356, + "learning_rate": 0.0001, + "loss": 1.3872, + "loss/crossentropy": 2.715533494949341, + "loss/hidden": 1.203125, + "loss/logits": 0.18357136845588684, + "loss/reg": 4.8043169954326004e-05, + "step": 1109 + }, + { + "epoch": 0.13875, + "grad_norm": 3.2109782695770264, + "grad_norm_var": 0.18202471306677784, + "learning_rate": 0.0001, + "loss": 1.4888, + "loss/crossentropy": 2.2935917377471924, + "loss/hidden": 1.296875, + "loss/logits": 0.19139942526817322, + "loss/reg": 4.802411058335565e-05, + "step": 1110 + }, + { + "epoch": 0.138875, + "grad_norm": 2.9282126426696777, + "grad_norm_var": 0.18236138139209668, + "learning_rate": 0.0001, + "loss": 1.7108, + "loss/crossentropy": 2.1880927085876465, + "loss/hidden": 1.4609375, + "loss/logits": 0.2493777573108673, + "loss/reg": 4.800450551556423e-05, + "step": 1111 + }, + { + "epoch": 0.139, + "grad_norm": 2.2871787548065186, + "grad_norm_var": 0.1810939591389835, + "learning_rate": 0.0001, + "loss": 1.2958, + "loss/crossentropy": 2.5252413749694824, + "loss/hidden": 1.109375, + "loss/logits": 0.1859796941280365, + "loss/reg": 4.798595546162687e-05, + "step": 1112 + }, + { + "epoch": 0.139125, + "grad_norm": 1.9749583005905151, + "grad_norm_var": 0.19723590923434664, + "learning_rate": 0.0001, + "loss": 1.3081, + "loss/crossentropy": 2.4636902809143066, + "loss/hidden": 1.1171875, + "loss/logits": 0.19039222598075867, + "loss/reg": 4.796446955879219e-05, + "step": 1113 + }, + { + "epoch": 0.13925, + "grad_norm": 4.308104515075684, + "grad_norm_var": 0.35904772291688963, + "learning_rate": 0.0001, + "loss": 1.3787, + "loss/crossentropy": 2.9549102783203125, + "loss/hidden": 1.2109375, + "loss/logits": 0.1673247218132019, + "loss/reg": 4.794850246980786e-05, + "step": 1114 + }, + { + "epoch": 0.139375, + "grad_norm": 2.7217111587524414, + "grad_norm_var": 0.3531131684974496, + "learning_rate": 0.0001, + "loss": 1.3919, + "loss/crossentropy": 2.6249146461486816, + "loss/hidden": 1.1953125, + "loss/logits": 0.1961226463317871, + "loss/reg": 4.7932728193700314e-05, + "step": 1115 + }, + { + "epoch": 0.1395, + "grad_norm": 2.2082791328430176, + "grad_norm_var": 0.35119913605219605, + "learning_rate": 0.0001, + "loss": 1.3636, + "loss/crossentropy": 2.314013719558716, + "loss/hidden": 1.1875, + "loss/logits": 0.17566710710525513, + "loss/reg": 4.791447281604633e-05, + "step": 1116 + }, + { + "epoch": 0.139625, + "grad_norm": 3.60855770111084, + "grad_norm_var": 0.39222276775296944, + "learning_rate": 0.0001, + "loss": 1.6117, + "loss/crossentropy": 2.6529369354248047, + "loss/hidden": 1.375, + "loss/logits": 0.23623248934745789, + "loss/reg": 4.789578088093549e-05, + "step": 1117 + }, + { + "epoch": 0.13975, + "grad_norm": 2.7757325172424316, + "grad_norm_var": 0.3896014903684961, + "learning_rate": 0.0001, + "loss": 1.2422, + "loss/crossentropy": 2.7011404037475586, + "loss/hidden": 1.078125, + "loss/logits": 0.1636306345462799, + "loss/reg": 4.787558282259852e-05, + "step": 1118 + }, + { + "epoch": 0.139875, + "grad_norm": 2.764423370361328, + "grad_norm_var": 0.34932944368847996, + "learning_rate": 0.0001, + "loss": 1.3844, + "loss/crossentropy": 2.8270928859710693, + "loss/hidden": 1.1796875, + "loss/logits": 0.2042662799358368, + "loss/reg": 4.785611236002296e-05, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 2.6311185359954834, + "grad_norm_var": 0.3517198026279449, + "learning_rate": 0.0001, + "loss": 1.4001, + "loss/crossentropy": 2.745588541030884, + "loss/hidden": 1.234375, + "loss/logits": 0.16527175903320312, + "loss/reg": 4.7835168516030535e-05, + "step": 1120 + }, + { + "epoch": 0.140125, + "grad_norm": 2.0787642002105713, + "grad_norm_var": 0.36531621433395045, + "learning_rate": 0.0001, + "loss": 1.2194, + "loss/crossentropy": 2.6111185550689697, + "loss/hidden": 1.0546875, + "loss/logits": 0.16425329446792603, + "loss/reg": 4.781952884513885e-05, + "step": 1121 + }, + { + "epoch": 0.14025, + "grad_norm": 2.299438238143921, + "grad_norm_var": 0.37060818594784395, + "learning_rate": 0.0001, + "loss": 1.2904, + "loss/crossentropy": 2.3531951904296875, + "loss/hidden": 1.109375, + "loss/logits": 0.18054065108299255, + "loss/reg": 4.780311792274006e-05, + "step": 1122 + }, + { + "epoch": 0.140375, + "grad_norm": 2.123501777648926, + "grad_norm_var": 0.3714053097766362, + "learning_rate": 0.0001, + "loss": 1.2384, + "loss/crossentropy": 2.5495216846466064, + "loss/hidden": 1.0546875, + "loss/logits": 0.1832682490348816, + "loss/reg": 4.778530274052173e-05, + "step": 1123 + }, + { + "epoch": 0.1405, + "grad_norm": 2.496851921081543, + "grad_norm_var": 0.3740512666854229, + "learning_rate": 0.0001, + "loss": 1.3948, + "loss/crossentropy": 3.104501962661743, + "loss/hidden": 1.203125, + "loss/logits": 0.19115296006202698, + "loss/reg": 4.776769856107421e-05, + "step": 1124 + }, + { + "epoch": 0.140625, + "grad_norm": 3.203948497772217, + "grad_norm_var": 0.38936697390070374, + "learning_rate": 0.0001, + "loss": 1.4557, + "loss/crossentropy": 2.750394105911255, + "loss/hidden": 1.25, + "loss/logits": 0.20525826513767242, + "loss/reg": 4.7750996600370854e-05, + "step": 1125 + }, + { + "epoch": 0.14075, + "grad_norm": 2.309300184249878, + "grad_norm_var": 0.3819183078721844, + "learning_rate": 0.0001, + "loss": 1.308, + "loss/crossentropy": 2.725071430206299, + "loss/hidden": 1.109375, + "loss/logits": 0.19812864065170288, + "loss/reg": 4.773392356582917e-05, + "step": 1126 + }, + { + "epoch": 0.140875, + "grad_norm": 3.0331461429595947, + "grad_norm_var": 0.3862191141394532, + "learning_rate": 0.0001, + "loss": 1.1116, + "loss/crossentropy": 2.7448618412017822, + "loss/hidden": 0.9609375, + "loss/logits": 0.1502147614955902, + "loss/reg": 4.771806197823025e-05, + "step": 1127 + }, + { + "epoch": 0.141, + "grad_norm": 6.447192668914795, + "grad_norm_var": 1.2518469248685036, + "learning_rate": 0.0001, + "loss": 1.5841, + "loss/crossentropy": 2.4020206928253174, + "loss/hidden": 1.390625, + "loss/logits": 0.19297294318675995, + "loss/reg": 4.770241503138095e-05, + "step": 1128 + }, + { + "epoch": 0.141125, + "grad_norm": 3.1637861728668213, + "grad_norm_var": 1.1877543708808314, + "learning_rate": 0.0001, + "loss": 1.7336, + "loss/crossentropy": 2.453598737716675, + "loss/hidden": 1.4375, + "loss/logits": 0.2955983281135559, + "loss/reg": 4.768367944052443e-05, + "step": 1129 + }, + { + "epoch": 0.14125, + "grad_norm": 2.668591022491455, + "grad_norm_var": 1.0721759885566438, + "learning_rate": 0.0001, + "loss": 1.2714, + "loss/crossentropy": 2.3829445838928223, + "loss/hidden": 1.1015625, + "loss/logits": 0.16937227547168732, + "loss/reg": 4.766715210280381e-05, + "step": 1130 + }, + { + "epoch": 0.141375, + "grad_norm": 2.74442195892334, + "grad_norm_var": 1.0716429218470986, + "learning_rate": 0.0001, + "loss": 1.1947, + "loss/crossentropy": 2.5616579055786133, + "loss/hidden": 1.03125, + "loss/logits": 0.16293516755104065, + "loss/reg": 4.76537061331328e-05, + "step": 1131 + }, + { + "epoch": 0.1415, + "grad_norm": 2.376505136489868, + "grad_norm_var": 1.057676108050079, + "learning_rate": 0.0001, + "loss": 1.2084, + "loss/crossentropy": 2.4909427165985107, + "loss/hidden": 1.03125, + "loss/logits": 0.17672239243984222, + "loss/reg": 4.763679316965863e-05, + "step": 1132 + }, + { + "epoch": 0.141625, + "grad_norm": 3.554159641265869, + "grad_norm_var": 1.0528692879887234, + "learning_rate": 0.0001, + "loss": 1.5053, + "loss/crossentropy": 2.6319496631622314, + "loss/hidden": 1.2734375, + "loss/logits": 0.2313893735408783, + "loss/reg": 4.7620640543755144e-05, + "step": 1133 + }, + { + "epoch": 0.14175, + "grad_norm": 2.9882657527923584, + "grad_norm_var": 1.0516912119352735, + "learning_rate": 0.0001, + "loss": 1.4679, + "loss/crossentropy": 2.4339728355407715, + "loss/hidden": 1.2421875, + "loss/logits": 0.22522477805614471, + "loss/reg": 4.760462979902513e-05, + "step": 1134 + }, + { + "epoch": 0.141875, + "grad_norm": 2.1122543811798096, + "grad_norm_var": 1.0926904062794485, + "learning_rate": 0.0001, + "loss": 1.3286, + "loss/crossentropy": 2.5192081928253174, + "loss/hidden": 1.1328125, + "loss/logits": 0.1953265368938446, + "loss/reg": 4.758898649015464e-05, + "step": 1135 + }, + { + "epoch": 0.142, + "grad_norm": 3.5696256160736084, + "grad_norm_var": 1.115413644842679, + "learning_rate": 0.0001, + "loss": 1.3965, + "loss/crossentropy": 2.285820245742798, + "loss/hidden": 1.2109375, + "loss/logits": 0.18508857488632202, + "loss/reg": 4.756827183882706e-05, + "step": 1136 + }, + { + "epoch": 0.142125, + "grad_norm": 3.564345359802246, + "grad_norm_var": 1.081150356805938, + "learning_rate": 0.0001, + "loss": 1.6697, + "loss/crossentropy": 2.237314462661743, + "loss/hidden": 1.4765625, + "loss/logits": 0.19265148043632507, + "loss/reg": 4.755084592034109e-05, + "step": 1137 + }, + { + "epoch": 0.14225, + "grad_norm": 2.2548444271087646, + "grad_norm_var": 1.085683606162062, + "learning_rate": 0.0001, + "loss": 1.1842, + "loss/crossentropy": 2.4809651374816895, + "loss/hidden": 1.03125, + "loss/logits": 0.15246474742889404, + "loss/reg": 4.753081884700805e-05, + "step": 1138 + }, + { + "epoch": 0.142375, + "grad_norm": 2.6867403984069824, + "grad_norm_var": 1.036820672443695, + "learning_rate": 0.0001, + "loss": 1.2994, + "loss/crossentropy": 2.91271710395813, + "loss/hidden": 1.125, + "loss/logits": 0.17396080493927002, + "loss/reg": 4.7511359298368916e-05, + "step": 1139 + }, + { + "epoch": 0.1425, + "grad_norm": 2.4465537071228027, + "grad_norm_var": 1.0408451939787444, + "learning_rate": 0.0001, + "loss": 1.2231, + "loss/crossentropy": 2.5878028869628906, + "loss/hidden": 1.0234375, + "loss/logits": 0.1992160677909851, + "loss/reg": 4.749055369757116e-05, + "step": 1140 + }, + { + "epoch": 0.142625, + "grad_norm": 4.528571128845215, + "grad_norm_var": 1.1741261249691728, + "learning_rate": 0.0001, + "loss": 1.9586, + "loss/crossentropy": 2.373525619506836, + "loss/hidden": 1.6953125, + "loss/logits": 0.2627871334552765, + "loss/reg": 4.7472818550886586e-05, + "step": 1141 + }, + { + "epoch": 0.14275, + "grad_norm": 2.7354013919830322, + "grad_norm_var": 1.1375391518044278, + "learning_rate": 0.0001, + "loss": 1.3563, + "loss/crossentropy": 2.648790121078491, + "loss/hidden": 1.1640625, + "loss/logits": 0.19176684319972992, + "loss/reg": 4.745826663565822e-05, + "step": 1142 + }, + { + "epoch": 0.142875, + "grad_norm": 2.713024854660034, + "grad_norm_var": 1.1501972178076196, + "learning_rate": 0.0001, + "loss": 1.3524, + "loss/crossentropy": 2.4191818237304688, + "loss/hidden": 1.171875, + "loss/logits": 0.1800951361656189, + "loss/reg": 4.743799945572391e-05, + "step": 1143 + }, + { + "epoch": 0.143, + "grad_norm": 2.346231460571289, + "grad_norm_var": 0.40369959007739076, + "learning_rate": 0.0001, + "loss": 1.2126, + "loss/crossentropy": 2.2274231910705566, + "loss/hidden": 1.046875, + "loss/logits": 0.16525804996490479, + "loss/reg": 4.7417455789400265e-05, + "step": 1144 + }, + { + "epoch": 0.143125, + "grad_norm": 2.744516372680664, + "grad_norm_var": 0.40012624841660693, + "learning_rate": 0.0001, + "loss": 1.603, + "loss/crossentropy": 2.5323281288146973, + "loss/hidden": 1.34375, + "loss/logits": 0.2587292492389679, + "loss/reg": 4.739963696920313e-05, + "step": 1145 + }, + { + "epoch": 0.14325, + "grad_norm": 2.8817338943481445, + "grad_norm_var": 0.3970391852633552, + "learning_rate": 0.0001, + "loss": 1.3588, + "loss/crossentropy": 2.4774110317230225, + "loss/hidden": 1.15625, + "loss/logits": 0.20203325152397156, + "loss/reg": 4.737896233564243e-05, + "step": 1146 + }, + { + "epoch": 0.143375, + "grad_norm": 2.2984557151794434, + "grad_norm_var": 0.41815268022830854, + "learning_rate": 0.0001, + "loss": 1.4007, + "loss/crossentropy": 2.5607006549835205, + "loss/hidden": 1.1875, + "loss/logits": 0.21274057030677795, + "loss/reg": 4.7362878831336275e-05, + "step": 1147 + }, + { + "epoch": 0.1435, + "grad_norm": 2.0969078540802, + "grad_norm_var": 0.441159171760544, + "learning_rate": 0.0001, + "loss": 1.1426, + "loss/crossentropy": 2.3691599369049072, + "loss/hidden": 0.9921875, + "loss/logits": 0.1498948335647583, + "loss/reg": 4.734646063297987e-05, + "step": 1148 + }, + { + "epoch": 0.143625, + "grad_norm": 2.029167652130127, + "grad_norm_var": 0.4423349102668773, + "learning_rate": 0.0001, + "loss": 1.2621, + "loss/crossentropy": 2.385744571685791, + "loss/hidden": 1.09375, + "loss/logits": 0.1678312122821808, + "loss/reg": 4.732477100333199e-05, + "step": 1149 + }, + { + "epoch": 0.14375, + "grad_norm": 1.9239691495895386, + "grad_norm_var": 0.47928917254578013, + "learning_rate": 0.0001, + "loss": 1.0533, + "loss/crossentropy": 2.613713502883911, + "loss/hidden": 0.91796875, + "loss/logits": 0.1348608434200287, + "loss/reg": 4.730945511255413e-05, + "step": 1150 + }, + { + "epoch": 0.143875, + "grad_norm": 2.0164811611175537, + "grad_norm_var": 0.4871542069837379, + "learning_rate": 0.0001, + "loss": 1.342, + "loss/crossentropy": 2.0422286987304688, + "loss/hidden": 1.1640625, + "loss/logits": 0.17750707268714905, + "loss/reg": 4.7292855015257373e-05, + "step": 1151 + }, + { + "epoch": 0.144, + "grad_norm": 2.2749154567718506, + "grad_norm_var": 0.437878471389033, + "learning_rate": 0.0001, + "loss": 1.217, + "loss/crossentropy": 2.351731300354004, + "loss/hidden": 1.0390625, + "loss/logits": 0.17749223113059998, + "loss/reg": 4.7281155275413767e-05, + "step": 1152 + }, + { + "epoch": 0.144125, + "grad_norm": 2.5271859169006348, + "grad_norm_var": 0.3712498798941301, + "learning_rate": 0.0001, + "loss": 1.4337, + "loss/crossentropy": 2.5493454933166504, + "loss/hidden": 1.203125, + "loss/logits": 0.23008278012275696, + "loss/reg": 4.7269335482269526e-05, + "step": 1153 + }, + { + "epoch": 0.14425, + "grad_norm": 2.2880935668945312, + "grad_norm_var": 0.3700923052297336, + "learning_rate": 0.0001, + "loss": 1.2332, + "loss/crossentropy": 2.7758634090423584, + "loss/hidden": 1.0625, + "loss/logits": 0.1702326536178589, + "loss/reg": 4.725826875073835e-05, + "step": 1154 + }, + { + "epoch": 0.144375, + "grad_norm": 4.250509262084961, + "grad_norm_var": 0.5548537228186133, + "learning_rate": 0.0001, + "loss": 1.744, + "loss/crossentropy": 2.3507156372070312, + "loss/hidden": 1.4609375, + "loss/logits": 0.2826390266418457, + "loss/reg": 4.7243731387425214e-05, + "step": 1155 + }, + { + "epoch": 0.1445, + "grad_norm": 2.425056219100952, + "grad_norm_var": 0.555412315408905, + "learning_rate": 0.0001, + "loss": 1.2433, + "loss/crossentropy": 2.7652292251586914, + "loss/hidden": 1.0703125, + "loss/logits": 0.17255419492721558, + "loss/reg": 4.7228229959728196e-05, + "step": 1156 + }, + { + "epoch": 0.144625, + "grad_norm": 2.7023041248321533, + "grad_norm_var": 0.3015625034546047, + "learning_rate": 0.0001, + "loss": 1.4738, + "loss/crossentropy": 2.423006772994995, + "loss/hidden": 1.2734375, + "loss/logits": 0.1999123990535736, + "loss/reg": 4.721076038549654e-05, + "step": 1157 + }, + { + "epoch": 0.14475, + "grad_norm": 2.7887837886810303, + "grad_norm_var": 0.30330314157064175, + "learning_rate": 0.0001, + "loss": 1.4482, + "loss/crossentropy": 2.6713080406188965, + "loss/hidden": 1.21875, + "loss/logits": 0.2290123999118805, + "loss/reg": 4.7194982471410185e-05, + "step": 1158 + }, + { + "epoch": 0.144875, + "grad_norm": 2.315005302429199, + "grad_norm_var": 0.30291867264976885, + "learning_rate": 0.0001, + "loss": 1.2302, + "loss/crossentropy": 2.9456934928894043, + "loss/hidden": 1.0625, + "loss/logits": 0.16724364459514618, + "loss/reg": 4.717716728919186e-05, + "step": 1159 + }, + { + "epoch": 0.145, + "grad_norm": 20.321826934814453, + "grad_norm_var": 20.14308559505051, + "learning_rate": 0.0001, + "loss": 1.5487, + "loss/crossentropy": 2.416504383087158, + "loss/hidden": 1.3359375, + "loss/logits": 0.21224364638328552, + "loss/reg": 4.7157529479591176e-05, + "step": 1160 + }, + { + "epoch": 0.145125, + "grad_norm": 2.4590742588043213, + "grad_norm_var": 20.181414443983293, + "learning_rate": 0.0001, + "loss": 1.3333, + "loss/crossentropy": 2.6246085166931152, + "loss/hidden": 1.140625, + "loss/logits": 0.1921812891960144, + "loss/reg": 4.713963426183909e-05, + "step": 1161 + }, + { + "epoch": 0.14525, + "grad_norm": 6.681092739105225, + "grad_norm_var": 20.719766602422965, + "learning_rate": 0.0001, + "loss": 1.8658, + "loss/crossentropy": 2.900252342224121, + "loss/hidden": 1.59375, + "loss/logits": 0.2715913951396942, + "loss/reg": 4.712406007456593e-05, + "step": 1162 + }, + { + "epoch": 0.145375, + "grad_norm": 2.4327285289764404, + "grad_norm_var": 20.6933411626458, + "learning_rate": 0.0001, + "loss": 1.4701, + "loss/crossentropy": 2.718841314315796, + "loss/hidden": 1.25, + "loss/logits": 0.21964167058467865, + "loss/reg": 4.7109646402532235e-05, + "step": 1163 + }, + { + "epoch": 0.1455, + "grad_norm": 3.1247708797454834, + "grad_norm_var": 20.51968710018897, + "learning_rate": 0.0001, + "loss": 1.6086, + "loss/crossentropy": 2.7490997314453125, + "loss/hidden": 1.3515625, + "loss/logits": 0.2565191686153412, + "loss/reg": 4.709674612968229e-05, + "step": 1164 + }, + { + "epoch": 0.145625, + "grad_norm": 3.131579875946045, + "grad_norm_var": 20.319174937109523, + "learning_rate": 0.0001, + "loss": 1.4654, + "loss/crossentropy": 2.797558546066284, + "loss/hidden": 1.2265625, + "loss/logits": 0.23839589953422546, + "loss/reg": 4.7083391109481454e-05, + "step": 1165 + }, + { + "epoch": 0.14575, + "grad_norm": 3.672037124633789, + "grad_norm_var": 20.031190047272485, + "learning_rate": 0.0001, + "loss": 1.5083, + "loss/crossentropy": 2.388136386871338, + "loss/hidden": 1.296875, + "loss/logits": 0.21094849705696106, + "loss/reg": 4.707129119196907e-05, + "step": 1166 + }, + { + "epoch": 0.145875, + "grad_norm": 2.3486030101776123, + "grad_norm_var": 19.946341680930896, + "learning_rate": 0.0001, + "loss": 1.403, + "loss/crossentropy": 2.45896577835083, + "loss/hidden": 1.2109375, + "loss/logits": 0.19156183302402496, + "loss/reg": 4.7055495087988675e-05, + "step": 1167 + }, + { + "epoch": 0.146, + "grad_norm": 2.5459513664245605, + "grad_norm_var": 19.884653568287856, + "learning_rate": 0.0001, + "loss": 1.3022, + "loss/crossentropy": 2.4945719242095947, + "loss/hidden": 1.1328125, + "loss/logits": 0.16890111565589905, + "loss/reg": 4.704431557911448e-05, + "step": 1168 + }, + { + "epoch": 0.146125, + "grad_norm": 2.185032606124878, + "grad_norm_var": 19.964904994517646, + "learning_rate": 0.0001, + "loss": 1.3203, + "loss/crossentropy": 2.267711877822876, + "loss/hidden": 1.125, + "loss/logits": 0.19480320811271667, + "loss/reg": 4.703476224676706e-05, + "step": 1169 + }, + { + "epoch": 0.14625, + "grad_norm": 2.453258514404297, + "grad_norm_var": 19.926608452200593, + "learning_rate": 0.0001, + "loss": 1.5579, + "loss/crossentropy": 2.6074061393737793, + "loss/hidden": 1.3046875, + "loss/logits": 0.2527133822441101, + "loss/reg": 4.7016856115078554e-05, + "step": 1170 + }, + { + "epoch": 0.146375, + "grad_norm": 2.9324119091033936, + "grad_norm_var": 20.011353286130436, + "learning_rate": 0.0001, + "loss": 1.3058, + "loss/crossentropy": 2.5551066398620605, + "loss/hidden": 1.140625, + "loss/logits": 0.16465714573860168, + "loss/reg": 4.6998691686894745e-05, + "step": 1171 + }, + { + "epoch": 0.1465, + "grad_norm": 2.0241332054138184, + "grad_norm_var": 20.107326037621984, + "learning_rate": 0.0001, + "loss": 1.2586, + "loss/crossentropy": 2.7565040588378906, + "loss/hidden": 1.0703125, + "loss/logits": 0.18783336877822876, + "loss/reg": 4.698050543083809e-05, + "step": 1172 + }, + { + "epoch": 0.146625, + "grad_norm": 2.8346903324127197, + "grad_norm_var": 20.08538431269884, + "learning_rate": 0.0001, + "loss": 1.5072, + "loss/crossentropy": 2.2326302528381348, + "loss/hidden": 1.28125, + "loss/logits": 0.2255048155784607, + "loss/reg": 4.696170799434185e-05, + "step": 1173 + }, + { + "epoch": 0.14675, + "grad_norm": 2.174600601196289, + "grad_norm_var": 20.209433008289647, + "learning_rate": 0.0001, + "loss": 1.182, + "loss/crossentropy": 2.47452449798584, + "loss/hidden": 1.03125, + "loss/logits": 0.15031439065933228, + "loss/reg": 4.6942925109760836e-05, + "step": 1174 + }, + { + "epoch": 0.146875, + "grad_norm": 2.85469651222229, + "grad_norm_var": 20.108020405367256, + "learning_rate": 0.0001, + "loss": 1.3719, + "loss/crossentropy": 2.4068410396575928, + "loss/hidden": 1.1875, + "loss/logits": 0.18392156064510345, + "loss/reg": 4.692598668043502e-05, + "step": 1175 + }, + { + "epoch": 0.147, + "grad_norm": 3.0080974102020264, + "grad_norm_var": 1.1899183007710952, + "learning_rate": 0.0001, + "loss": 1.3194, + "loss/crossentropy": 2.6427528858184814, + "loss/hidden": 1.125, + "loss/logits": 0.1939159333705902, + "loss/reg": 4.690660352935083e-05, + "step": 1176 + }, + { + "epoch": 0.147125, + "grad_norm": 1.8658075332641602, + "grad_norm_var": 1.2490821768597584, + "learning_rate": 0.0001, + "loss": 1.3452, + "loss/crossentropy": 2.4428977966308594, + "loss/hidden": 1.15625, + "loss/logits": 0.18848666548728943, + "loss/reg": 4.68892467324622e-05, + "step": 1177 + }, + { + "epoch": 0.14725, + "grad_norm": 2.0562570095062256, + "grad_norm_var": 0.2492804212330249, + "learning_rate": 0.0001, + "loss": 1.26, + "loss/crossentropy": 2.4440927505493164, + "loss/hidden": 1.078125, + "loss/logits": 0.18139450252056122, + "loss/reg": 4.6868422941770405e-05, + "step": 1178 + }, + { + "epoch": 0.147375, + "grad_norm": 2.4248452186584473, + "grad_norm_var": 0.24946305945295155, + "learning_rate": 0.0001, + "loss": 1.3291, + "loss/crossentropy": 2.495669364929199, + "loss/hidden": 1.140625, + "loss/logits": 0.18804886937141418, + "loss/reg": 4.6847046178299934e-05, + "step": 1179 + }, + { + "epoch": 0.1475, + "grad_norm": 2.193629503250122, + "grad_norm_var": 0.23878596668150653, + "learning_rate": 0.0001, + "loss": 1.2206, + "loss/crossentropy": 2.5296146869659424, + "loss/hidden": 1.0546875, + "loss/logits": 0.16542188823223114, + "loss/reg": 4.682805956690572e-05, + "step": 1180 + }, + { + "epoch": 0.147625, + "grad_norm": 2.19338059425354, + "grad_norm_var": 0.22031007335769434, + "learning_rate": 0.0001, + "loss": 1.353, + "loss/crossentropy": 2.6205079555511475, + "loss/hidden": 1.1796875, + "loss/logits": 0.17287296056747437, + "loss/reg": 4.68104517494794e-05, + "step": 1181 + }, + { + "epoch": 0.14775, + "grad_norm": 2.7266340255737305, + "grad_norm_var": 0.1265998407663306, + "learning_rate": 0.0001, + "loss": 1.7818, + "loss/crossentropy": 2.054272413253784, + "loss/hidden": 1.515625, + "loss/logits": 0.26566898822784424, + "loss/reg": 4.679444100474939e-05, + "step": 1182 + }, + { + "epoch": 0.147875, + "grad_norm": 10.878344535827637, + "grad_norm_var": 4.585428414128689, + "learning_rate": 0.0001, + "loss": 2.6308, + "loss/crossentropy": 2.638014554977417, + "loss/hidden": 2.171875, + "loss/logits": 0.4585029184818268, + "loss/reg": 4.677903052652255e-05, + "step": 1183 + }, + { + "epoch": 0.148, + "grad_norm": 2.57718825340271, + "grad_norm_var": 4.583767061458208, + "learning_rate": 0.0001, + "loss": 1.426, + "loss/crossentropy": 2.631078004837036, + "loss/hidden": 1.203125, + "loss/logits": 0.2224457859992981, + "loss/reg": 4.676307435147464e-05, + "step": 1184 + }, + { + "epoch": 0.148125, + "grad_norm": 2.099421262741089, + "grad_norm_var": 4.593087690510012, + "learning_rate": 0.0001, + "loss": 1.1609, + "loss/crossentropy": 2.6211211681365967, + "loss/hidden": 1.0, + "loss/logits": 0.16039219498634338, + "loss/reg": 4.674717638408765e-05, + "step": 1185 + }, + { + "epoch": 0.14825, + "grad_norm": 2.1765975952148438, + "grad_norm_var": 4.616419928519187, + "learning_rate": 0.0001, + "loss": 1.2765, + "loss/crossentropy": 2.382640838623047, + "loss/hidden": 1.09375, + "loss/logits": 0.18227428197860718, + "loss/reg": 4.673180228564888e-05, + "step": 1186 + }, + { + "epoch": 0.148375, + "grad_norm": 5.076205253601074, + "grad_norm_var": 4.901835733529481, + "learning_rate": 0.0001, + "loss": 1.5855, + "loss/crossentropy": 2.359898328781128, + "loss/hidden": 1.28125, + "loss/logits": 0.3037688732147217, + "loss/reg": 4.6718851081095636e-05, + "step": 1187 + }, + { + "epoch": 0.1485, + "grad_norm": 3.1118228435516357, + "grad_norm_var": 4.823696787247346, + "learning_rate": 0.0001, + "loss": 1.2903, + "loss/crossentropy": 2.5006351470947266, + "loss/hidden": 1.109375, + "loss/logits": 0.1804269552230835, + "loss/reg": 4.67036988993641e-05, + "step": 1188 + }, + { + "epoch": 0.148625, + "grad_norm": 2.433318614959717, + "grad_norm_var": 4.850145380757316, + "learning_rate": 0.0001, + "loss": 1.357, + "loss/crossentropy": 2.6935362815856934, + "loss/hidden": 1.140625, + "loss/logits": 0.2159041315317154, + "loss/reg": 4.669180998462252e-05, + "step": 1189 + }, + { + "epoch": 0.14875, + "grad_norm": 2.3695571422576904, + "grad_norm_var": 4.828058326793973, + "learning_rate": 0.0001, + "loss": 1.4454, + "loss/crossentropy": 2.7090983390808105, + "loss/hidden": 1.234375, + "loss/logits": 0.2105472981929779, + "loss/reg": 4.6679560909979045e-05, + "step": 1190 + }, + { + "epoch": 0.148875, + "grad_norm": 2.809579372406006, + "grad_norm_var": 4.829828812715443, + "learning_rate": 0.0001, + "loss": 1.3304, + "loss/crossentropy": 2.592134714126587, + "loss/hidden": 1.140625, + "loss/logits": 0.18935655057430267, + "loss/reg": 4.6665063564432785e-05, + "step": 1191 + }, + { + "epoch": 0.149, + "grad_norm": 3.6269266605377197, + "grad_norm_var": 4.844113927837752, + "learning_rate": 0.0001, + "loss": 1.5038, + "loss/crossentropy": 2.5124716758728027, + "loss/hidden": 1.28125, + "loss/logits": 0.2220713496208191, + "loss/reg": 4.664724110625684e-05, + "step": 1192 + }, + { + "epoch": 0.149125, + "grad_norm": 2.287263870239258, + "grad_norm_var": 4.78228040964938, + "learning_rate": 0.0001, + "loss": 1.2169, + "loss/crossentropy": 2.4629602432250977, + "loss/hidden": 1.0546875, + "loss/logits": 0.16175468266010284, + "loss/reg": 4.6629724238300696e-05, + "step": 1193 + }, + { + "epoch": 0.14925, + "grad_norm": 2.4541614055633545, + "grad_norm_var": 4.732023172385815, + "learning_rate": 0.0001, + "loss": 1.4253, + "loss/crossentropy": 2.29724383354187, + "loss/hidden": 1.2421875, + "loss/logits": 0.1826433539390564, + "loss/reg": 4.6614863094873726e-05, + "step": 1194 + }, + { + "epoch": 0.149375, + "grad_norm": 2.073838710784912, + "grad_norm_var": 4.776700162502281, + "learning_rate": 0.0001, + "loss": 1.2284, + "loss/crossentropy": 2.670344352722168, + "loss/hidden": 1.0546875, + "loss/logits": 0.17323313653469086, + "loss/reg": 4.660046033677645e-05, + "step": 1195 + }, + { + "epoch": 0.1495, + "grad_norm": 2.8066813945770264, + "grad_norm_var": 4.718501570878416, + "learning_rate": 0.0001, + "loss": 1.3393, + "loss/crossentropy": 2.8886072635650635, + "loss/hidden": 1.15625, + "loss/logits": 0.1826225221157074, + "loss/reg": 4.658956459024921e-05, + "step": 1196 + }, + { + "epoch": 0.149625, + "grad_norm": 3.8294754028320312, + "grad_norm_var": 4.6593823625693265, + "learning_rate": 0.0001, + "loss": 1.5133, + "loss/crossentropy": 2.2249755859375, + "loss/hidden": 1.265625, + "loss/logits": 0.24719995260238647, + "loss/reg": 4.657540921471082e-05, + "step": 1197 + }, + { + "epoch": 0.14975, + "grad_norm": 2.4204254150390625, + "grad_norm_var": 4.690022199661033, + "learning_rate": 0.0001, + "loss": 1.3452, + "loss/crossentropy": 2.817233085632324, + "loss/hidden": 1.140625, + "loss/logits": 0.20412875711917877, + "loss/reg": 4.6560286136809736e-05, + "step": 1198 + }, + { + "epoch": 0.149875, + "grad_norm": 1.9155974388122559, + "grad_norm_var": 0.6715669493519023, + "learning_rate": 0.0001, + "loss": 1.1469, + "loss/crossentropy": 2.4378252029418945, + "loss/hidden": 0.9921875, + "loss/logits": 0.15422964096069336, + "loss/reg": 4.654593067243695e-05, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 2.4836931228637695, + "grad_norm_var": 0.6743205851249285, + "learning_rate": 0.0001, + "loss": 1.398, + "loss/crossentropy": 2.3904969692230225, + "loss/hidden": 1.2109375, + "loss/logits": 0.1865827888250351, + "loss/reg": 4.653136420529336e-05, + "step": 1200 + }, + { + "epoch": 0.150125, + "grad_norm": 3.4455342292785645, + "grad_norm_var": 0.6710901601970398, + "learning_rate": 0.0001, + "loss": 1.8156, + "loss/crossentropy": 2.0689449310302734, + "loss/hidden": 1.5234375, + "loss/logits": 0.2916872501373291, + "loss/reg": 4.651547351386398e-05, + "step": 1201 + }, + { + "epoch": 0.15025, + "grad_norm": 2.7757627964019775, + "grad_norm_var": 0.6411250200226505, + "learning_rate": 0.0001, + "loss": 1.5134, + "loss/crossentropy": 2.8068742752075195, + "loss/hidden": 1.28125, + "loss/logits": 0.23169484734535217, + "loss/reg": 4.650075425161049e-05, + "step": 1202 + }, + { + "epoch": 0.150375, + "grad_norm": 2.3012502193450928, + "grad_norm_var": 0.3061121534919548, + "learning_rate": 0.0001, + "loss": 1.3758, + "loss/crossentropy": 2.697503089904785, + "loss/hidden": 1.15625, + "loss/logits": 0.2190503478050232, + "loss/reg": 4.648489993996918e-05, + "step": 1203 + }, + { + "epoch": 0.1505, + "grad_norm": 2.1723225116729736, + "grad_norm_var": 0.3092592888203074, + "learning_rate": 0.0001, + "loss": 1.3397, + "loss/crossentropy": 2.3358473777770996, + "loss/hidden": 1.15625, + "loss/logits": 0.18293632566928864, + "loss/reg": 4.6468485379591584e-05, + "step": 1204 + }, + { + "epoch": 0.150625, + "grad_norm": 3.029073715209961, + "grad_norm_var": 0.31519634973794436, + "learning_rate": 0.0001, + "loss": 1.6785, + "loss/crossentropy": 2.385075330734253, + "loss/hidden": 1.4453125, + "loss/logits": 0.23269888758659363, + "loss/reg": 4.645344597520307e-05, + "step": 1205 + }, + { + "epoch": 0.15075, + "grad_norm": 3.408116102218628, + "grad_norm_var": 0.3403031929612437, + "learning_rate": 0.0001, + "loss": 1.6782, + "loss/crossentropy": 2.7597949504852295, + "loss/hidden": 1.4296875, + "loss/logits": 0.24803972244262695, + "loss/reg": 4.6434746764134616e-05, + "step": 1206 + }, + { + "epoch": 0.150875, + "grad_norm": 2.469802141189575, + "grad_norm_var": 0.3443656874500211, + "learning_rate": 0.0001, + "loss": 1.3171, + "loss/crossentropy": 2.701486349105835, + "loss/hidden": 1.125, + "loss/logits": 0.19166097044944763, + "loss/reg": 4.641970372176729e-05, + "step": 1207 + }, + { + "epoch": 0.151, + "grad_norm": 1.992879867553711, + "grad_norm_var": 0.31337938768562573, + "learning_rate": 0.0001, + "loss": 1.2277, + "loss/crossentropy": 2.489752769470215, + "loss/hidden": 1.0546875, + "loss/logits": 0.17256753146648407, + "loss/reg": 4.6402536099776626e-05, + "step": 1208 + }, + { + "epoch": 0.151125, + "grad_norm": 2.535855293273926, + "grad_norm_var": 0.30632514875860556, + "learning_rate": 0.0001, + "loss": 1.3068, + "loss/crossentropy": 2.897026777267456, + "loss/hidden": 1.1328125, + "loss/logits": 0.17352746427059174, + "loss/reg": 4.638026439351961e-05, + "step": 1209 + }, + { + "epoch": 0.15125, + "grad_norm": 2.456472158432007, + "grad_norm_var": 0.3062706427848125, + "learning_rate": 0.0001, + "loss": 1.3139, + "loss/crossentropy": 2.676337957382202, + "loss/hidden": 1.125, + "loss/logits": 0.18842476606369019, + "loss/reg": 4.6367600589292124e-05, + "step": 1210 + }, + { + "epoch": 0.151375, + "grad_norm": 3.8645594120025635, + "grad_norm_var": 0.37334871398992014, + "learning_rate": 0.0001, + "loss": 1.6388, + "loss/crossentropy": 2.2264482975006104, + "loss/hidden": 1.3984375, + "loss/logits": 0.2399100363254547, + "loss/reg": 4.6349210606422275e-05, + "step": 1211 + }, + { + "epoch": 0.1515, + "grad_norm": 3.0219388008117676, + "grad_norm_var": 0.37803743581498545, + "learning_rate": 0.0001, + "loss": 1.5009, + "loss/crossentropy": 3.0072405338287354, + "loss/hidden": 1.296875, + "loss/logits": 0.20359638333320618, + "loss/reg": 4.6335251681739464e-05, + "step": 1212 + }, + { + "epoch": 0.151625, + "grad_norm": 3.7400689125061035, + "grad_norm_var": 0.3657602117088911, + "learning_rate": 0.0001, + "loss": 1.8148, + "loss/crossentropy": 2.6893434524536133, + "loss/hidden": 1.5390625, + "loss/logits": 0.275297075510025, + "loss/reg": 4.631928095477633e-05, + "step": 1213 + }, + { + "epoch": 0.15175, + "grad_norm": 2.2333054542541504, + "grad_norm_var": 0.3762232507342539, + "learning_rate": 0.0001, + "loss": 1.3913, + "loss/crossentropy": 2.447011947631836, + "loss/hidden": 1.203125, + "loss/logits": 0.18775393068790436, + "loss/reg": 4.630190960597247e-05, + "step": 1214 + }, + { + "epoch": 0.151875, + "grad_norm": 2.458369493484497, + "grad_norm_var": 0.33494596633352763, + "learning_rate": 0.0001, + "loss": 1.3824, + "loss/crossentropy": 2.6085517406463623, + "loss/hidden": 1.171875, + "loss/logits": 0.21003778278827667, + "loss/reg": 4.628559690900147e-05, + "step": 1215 + }, + { + "epoch": 0.152, + "grad_norm": 2.192143201828003, + "grad_norm_var": 0.35155590225111055, + "learning_rate": 0.0001, + "loss": 1.4439, + "loss/crossentropy": 2.356393575668335, + "loss/hidden": 1.234375, + "loss/logits": 0.20902931690216064, + "loss/reg": 4.626429654308595e-05, + "step": 1216 + }, + { + "epoch": 0.152125, + "grad_norm": 2.258812665939331, + "grad_norm_var": 0.3304848535876232, + "learning_rate": 0.0001, + "loss": 1.1278, + "loss/crossentropy": 2.5807323455810547, + "loss/hidden": 0.9765625, + "loss/logits": 0.15082184970378876, + "loss/reg": 4.624574285116978e-05, + "step": 1217 + }, + { + "epoch": 0.15225, + "grad_norm": 2.9527230262756348, + "grad_norm_var": 0.3346562098525602, + "learning_rate": 0.0001, + "loss": 1.6524, + "loss/crossentropy": 2.108830213546753, + "loss/hidden": 1.3671875, + "loss/logits": 0.2847442626953125, + "loss/reg": 4.6231474698288366e-05, + "step": 1218 + }, + { + "epoch": 0.152375, + "grad_norm": 2.7327449321746826, + "grad_norm_var": 0.32375564974886756, + "learning_rate": 0.0001, + "loss": 1.5971, + "loss/crossentropy": 2.2259297370910645, + "loss/hidden": 1.359375, + "loss/logits": 0.2372676432132721, + "loss/reg": 4.621966218110174e-05, + "step": 1219 + }, + { + "epoch": 0.1525, + "grad_norm": 2.4477243423461914, + "grad_norm_var": 0.3083870484826993, + "learning_rate": 0.0001, + "loss": 1.3617, + "loss/crossentropy": 2.6080846786499023, + "loss/hidden": 1.171875, + "loss/logits": 0.1893840879201889, + "loss/reg": 4.6210447180783376e-05, + "step": 1220 + }, + { + "epoch": 0.152625, + "grad_norm": 3.3036723136901855, + "grad_norm_var": 0.3237876349353845, + "learning_rate": 0.0001, + "loss": 1.5448, + "loss/crossentropy": 2.5666184425354004, + "loss/hidden": 1.3046875, + "loss/logits": 0.2396336793899536, + "loss/reg": 4.619457831722684e-05, + "step": 1221 + }, + { + "epoch": 0.15275, + "grad_norm": 2.905449867248535, + "grad_norm_var": 0.2957611742412664, + "learning_rate": 0.0001, + "loss": 1.228, + "loss/crossentropy": 2.7263970375061035, + "loss/hidden": 1.0625, + "loss/logits": 0.16505266726016998, + "loss/reg": 4.618103776010685e-05, + "step": 1222 + }, + { + "epoch": 0.152875, + "grad_norm": 2.9071547985076904, + "grad_norm_var": 0.29295649472309315, + "learning_rate": 0.0001, + "loss": 1.3722, + "loss/crossentropy": 2.2898879051208496, + "loss/hidden": 1.1875, + "loss/logits": 0.1842312216758728, + "loss/reg": 4.6172623115126044e-05, + "step": 1223 + }, + { + "epoch": 0.153, + "grad_norm": 3.5174379348754883, + "grad_norm_var": 0.2842714538848438, + "learning_rate": 0.0001, + "loss": 1.5973, + "loss/crossentropy": 2.2778470516204834, + "loss/hidden": 1.390625, + "loss/logits": 0.2062428593635559, + "loss/reg": 4.616468140739016e-05, + "step": 1224 + }, + { + "epoch": 0.153125, + "grad_norm": 3.030709981918335, + "grad_norm_var": 0.2791441912567497, + "learning_rate": 0.0001, + "loss": 1.5098, + "loss/crossentropy": 2.477696180343628, + "loss/hidden": 1.3359375, + "loss/logits": 0.17343661189079285, + "loss/reg": 4.615149737219326e-05, + "step": 1225 + }, + { + "epoch": 0.15325, + "grad_norm": 2.864184856414795, + "grad_norm_var": 0.2667025408972182, + "learning_rate": 0.0001, + "loss": 1.232, + "loss/crossentropy": 2.1104869842529297, + "loss/hidden": 1.0546875, + "loss/logits": 0.176896333694458, + "loss/reg": 4.613979399437085e-05, + "step": 1226 + }, + { + "epoch": 0.153375, + "grad_norm": 2.8904950618743896, + "grad_norm_var": 0.20098186745316274, + "learning_rate": 0.0001, + "loss": 1.2044, + "loss/crossentropy": 2.806056022644043, + "loss/hidden": 1.0546875, + "loss/logits": 0.14926595985889435, + "loss/reg": 4.612850534613244e-05, + "step": 1227 + }, + { + "epoch": 0.1535, + "grad_norm": 3.037811040878296, + "grad_norm_var": 0.20138040974156488, + "learning_rate": 0.0001, + "loss": 1.2008, + "loss/crossentropy": 2.2585337162017822, + "loss/hidden": 1.046875, + "loss/logits": 0.15345463156700134, + "loss/reg": 4.6119672333588824e-05, + "step": 1228 + }, + { + "epoch": 0.153625, + "grad_norm": 2.884638786315918, + "grad_norm_var": 0.14468985219164002, + "learning_rate": 0.0001, + "loss": 1.477, + "loss/crossentropy": 2.2203421592712402, + "loss/hidden": 1.234375, + "loss/logits": 0.24218516051769257, + "loss/reg": 4.6111101255519316e-05, + "step": 1229 + }, + { + "epoch": 0.15375, + "grad_norm": 3.115166425704956, + "grad_norm_var": 0.12800406371613893, + "learning_rate": 0.0001, + "loss": 1.3236, + "loss/crossentropy": 2.7177720069885254, + "loss/hidden": 1.1328125, + "loss/logits": 0.1902787983417511, + "loss/reg": 4.609820825862698e-05, + "step": 1230 + }, + { + "epoch": 0.153875, + "grad_norm": 2.872023344039917, + "grad_norm_var": 0.11744581476523853, + "learning_rate": 0.0001, + "loss": 1.4699, + "loss/crossentropy": 2.6862807273864746, + "loss/hidden": 1.2421875, + "loss/logits": 0.22726118564605713, + "loss/reg": 4.608508970704861e-05, + "step": 1231 + }, + { + "epoch": 0.154, + "grad_norm": 2.3777549266815186, + "grad_norm_var": 0.10283428435944278, + "learning_rate": 0.0001, + "loss": 1.2628, + "loss/crossentropy": 2.3350508213043213, + "loss/hidden": 1.1015625, + "loss/logits": 0.16080161929130554, + "loss/reg": 4.6070923417573795e-05, + "step": 1232 + }, + { + "epoch": 0.154125, + "grad_norm": 2.02388858795166, + "grad_norm_var": 0.12577742446186732, + "learning_rate": 0.0001, + "loss": 1.3127, + "loss/crossentropy": 2.5232856273651123, + "loss/hidden": 1.1328125, + "loss/logits": 0.17940716445446014, + "loss/reg": 4.606059519574046e-05, + "step": 1233 + }, + { + "epoch": 0.15425, + "grad_norm": 2.5482373237609863, + "grad_norm_var": 0.13135142140041864, + "learning_rate": 0.0001, + "loss": 1.433, + "loss/crossentropy": 2.3807077407836914, + "loss/hidden": 1.234375, + "loss/logits": 0.19818373024463654, + "loss/reg": 4.605152935255319e-05, + "step": 1234 + }, + { + "epoch": 0.154375, + "grad_norm": 2.203364133834839, + "grad_norm_var": 0.15652141199913389, + "learning_rate": 0.0001, + "loss": 1.2164, + "loss/crossentropy": 2.735185146331787, + "loss/hidden": 1.0546875, + "loss/logits": 0.16127313673496246, + "loss/reg": 4.6041928726481274e-05, + "step": 1235 + }, + { + "epoch": 0.1545, + "grad_norm": 2.136706590652466, + "grad_norm_var": 0.17751188961389416, + "learning_rate": 0.0001, + "loss": 1.0936, + "loss/crossentropy": 2.520498037338257, + "loss/hidden": 0.953125, + "loss/logits": 0.1400274932384491, + "loss/reg": 4.6028115320950747e-05, + "step": 1236 + }, + { + "epoch": 0.154625, + "grad_norm": 54.65066909790039, + "grad_norm_var": 168.48549504499928, + "learning_rate": 0.0001, + "loss": 2.3815, + "loss/crossentropy": 2.986631393432617, + "loss/hidden": 2.125, + "loss/logits": 0.2560478150844574, + "loss/reg": 4.601667387760244e-05, + "step": 1237 + }, + { + "epoch": 0.15475, + "grad_norm": 2.784250497817993, + "grad_norm_var": 168.5363861452807, + "learning_rate": 0.0001, + "loss": 1.4229, + "loss/crossentropy": 2.738603115081787, + "loss/hidden": 1.1953125, + "loss/logits": 0.22707870602607727, + "loss/reg": 4.6003900934010744e-05, + "step": 1238 + }, + { + "epoch": 0.154875, + "grad_norm": 2.7723941802978516, + "grad_norm_var": 168.5929190345867, + "learning_rate": 0.0001, + "loss": 1.4481, + "loss/crossentropy": 2.760061740875244, + "loss/hidden": 1.2109375, + "loss/logits": 0.23672153055667877, + "loss/reg": 4.599518433678895e-05, + "step": 1239 + }, + { + "epoch": 0.155, + "grad_norm": 2.8237342834472656, + "grad_norm_var": 168.85093923579436, + "learning_rate": 0.0001, + "loss": 1.8431, + "loss/crossentropy": 2.110563278198242, + "loss/hidden": 1.5625, + "loss/logits": 0.2801324427127838, + "loss/reg": 4.5983535528648645e-05, + "step": 1240 + }, + { + "epoch": 0.155125, + "grad_norm": 2.586320161819458, + "grad_norm_var": 169.03557429254678, + "learning_rate": 0.0001, + "loss": 1.2401, + "loss/crossentropy": 2.4837148189544678, + "loss/hidden": 1.078125, + "loss/logits": 0.16152258217334747, + "loss/reg": 4.59690963907633e-05, + "step": 1241 + }, + { + "epoch": 0.15525, + "grad_norm": 2.43038272857666, + "grad_norm_var": 169.2235486987413, + "learning_rate": 0.0001, + "loss": 1.3523, + "loss/crossentropy": 2.467975378036499, + "loss/hidden": 1.171875, + "loss/logits": 0.17995983362197876, + "loss/reg": 4.59553484688513e-05, + "step": 1242 + }, + { + "epoch": 0.155375, + "grad_norm": 2.66363525390625, + "grad_norm_var": 169.31730109442543, + "learning_rate": 0.0001, + "loss": 1.3432, + "loss/crossentropy": 2.583726167678833, + "loss/hidden": 1.1328125, + "loss/logits": 0.2098957747220993, + "loss/reg": 4.59427283203695e-05, + "step": 1243 + }, + { + "epoch": 0.1555, + "grad_norm": 2.068486213684082, + "grad_norm_var": 169.74199410245245, + "learning_rate": 0.0001, + "loss": 1.2218, + "loss/crossentropy": 2.5289151668548584, + "loss/hidden": 1.0546875, + "loss/logits": 0.16663600504398346, + "loss/reg": 4.592623736243695e-05, + "step": 1244 + }, + { + "epoch": 0.155625, + "grad_norm": 2.142481565475464, + "grad_norm_var": 170.06578252348226, + "learning_rate": 0.0001, + "loss": 1.2824, + "loss/crossentropy": 2.624850273132324, + "loss/hidden": 1.09375, + "loss/logits": 0.1881803572177887, + "loss/reg": 4.5915359805803746e-05, + "step": 1245 + }, + { + "epoch": 0.15575, + "grad_norm": 3.677988052368164, + "grad_norm_var": 169.88691935686097, + "learning_rate": 0.0001, + "loss": 1.4276, + "loss/crossentropy": 2.7901053428649902, + "loss/hidden": 1.2109375, + "loss/logits": 0.2161804437637329, + "loss/reg": 4.5899174438090995e-05, + "step": 1246 + }, + { + "epoch": 0.155875, + "grad_norm": 2.5624334812164307, + "grad_norm_var": 170.01367542428878, + "learning_rate": 0.0001, + "loss": 1.3775, + "loss/crossentropy": 2.4989280700683594, + "loss/hidden": 1.171875, + "loss/logits": 0.20520368218421936, + "loss/reg": 4.588349111145362e-05, + "step": 1247 + }, + { + "epoch": 0.156, + "grad_norm": 2.208247661590576, + "grad_norm_var": 170.0923267285343, + "learning_rate": 0.0001, + "loss": 1.6388, + "loss/crossentropy": 2.471464157104492, + "loss/hidden": 1.3671875, + "loss/logits": 0.27117836475372314, + "loss/reg": 4.587349030771293e-05, + "step": 1248 + }, + { + "epoch": 0.156125, + "grad_norm": 2.627300500869751, + "grad_norm_var": 169.81387519584032, + "learning_rate": 0.0001, + "loss": 1.3158, + "loss/crossentropy": 2.702277898788452, + "loss/hidden": 1.1171875, + "loss/logits": 0.198166161775589, + "loss/reg": 4.586328941513784e-05, + "step": 1249 + }, + { + "epoch": 0.15625, + "grad_norm": 2.6705386638641357, + "grad_norm_var": 169.76169576274927, + "learning_rate": 0.0001, + "loss": 1.5095, + "loss/crossentropy": 2.6403021812438965, + "loss/hidden": 1.296875, + "loss/logits": 0.21217921376228333, + "loss/reg": 4.5855969801777974e-05, + "step": 1250 + }, + { + "epoch": 0.156375, + "grad_norm": 2.4441046714782715, + "grad_norm_var": 169.6494513840449, + "learning_rate": 0.0001, + "loss": 1.4097, + "loss/crossentropy": 2.6433634757995605, + "loss/hidden": 1.203125, + "loss/logits": 0.20615389943122864, + "loss/reg": 4.58406975667458e-05, + "step": 1251 + }, + { + "epoch": 0.1565, + "grad_norm": 2.4537088871002197, + "grad_norm_var": 169.49970781805342, + "learning_rate": 0.0001, + "loss": 1.4022, + "loss/crossentropy": 2.6877176761627197, + "loss/hidden": 1.1875, + "loss/logits": 0.21425047516822815, + "loss/reg": 4.58277172583621e-05, + "step": 1252 + }, + { + "epoch": 0.156625, + "grad_norm": 2.9511280059814453, + "grad_norm_var": 0.1417171540344287, + "learning_rate": 0.0001, + "loss": 1.2724, + "loss/crossentropy": 2.56616473197937, + "loss/hidden": 1.1015625, + "loss/logits": 0.17037302255630493, + "loss/reg": 4.5814376790076494e-05, + "step": 1253 + }, + { + "epoch": 0.15675, + "grad_norm": 2.333775520324707, + "grad_norm_var": 0.1443362499984147, + "learning_rate": 0.0001, + "loss": 1.2437, + "loss/crossentropy": 2.5075337886810303, + "loss/hidden": 1.078125, + "loss/logits": 0.16508585214614868, + "loss/reg": 4.5802942622685805e-05, + "step": 1254 + }, + { + "epoch": 0.156875, + "grad_norm": 2.4434492588043213, + "grad_norm_var": 0.14303538209416022, + "learning_rate": 0.0001, + "loss": 1.2319, + "loss/crossentropy": 2.753591537475586, + "loss/hidden": 1.0703125, + "loss/logits": 0.16108646988868713, + "loss/reg": 4.579335654852912e-05, + "step": 1255 + }, + { + "epoch": 0.157, + "grad_norm": 2.68575119972229, + "grad_norm_var": 0.13952007848767303, + "learning_rate": 0.0001, + "loss": 1.2704, + "loss/crossentropy": 2.274879217147827, + "loss/hidden": 1.109375, + "loss/logits": 0.1605929583311081, + "loss/reg": 4.578304287861101e-05, + "step": 1256 + }, + { + "epoch": 0.157125, + "grad_norm": 1.9222602844238281, + "grad_norm_var": 0.16469380439607614, + "learning_rate": 0.0001, + "loss": 1.1729, + "loss/crossentropy": 2.367164134979248, + "loss/hidden": 1.03125, + "loss/logits": 0.14119181036949158, + "loss/reg": 4.576797437039204e-05, + "step": 1257 + }, + { + "epoch": 0.15725, + "grad_norm": 2.551128387451172, + "grad_norm_var": 0.16419677919077813, + "learning_rate": 0.0001, + "loss": 1.4752, + "loss/crossentropy": 2.5899417400360107, + "loss/hidden": 1.2109375, + "loss/logits": 0.26381391286849976, + "loss/reg": 4.5751668949378654e-05, + "step": 1258 + }, + { + "epoch": 0.157375, + "grad_norm": 2.118729829788208, + "grad_norm_var": 0.17271112727045193, + "learning_rate": 0.0001, + "loss": 1.288, + "loss/crossentropy": 2.8331987857818604, + "loss/hidden": 1.1015625, + "loss/logits": 0.18601545691490173, + "loss/reg": 4.5734541345154867e-05, + "step": 1259 + }, + { + "epoch": 0.1575, + "grad_norm": 2.093118667602539, + "grad_norm_var": 0.17136024462738533, + "learning_rate": 0.0001, + "loss": 1.2783, + "loss/crossentropy": 2.581794261932373, + "loss/hidden": 1.1015625, + "loss/logits": 0.17623090744018555, + "loss/reg": 4.572012767312117e-05, + "step": 1260 + }, + { + "epoch": 0.157625, + "grad_norm": 2.452829122543335, + "grad_norm_var": 0.16288042975729317, + "learning_rate": 0.0001, + "loss": 1.2639, + "loss/crossentropy": 2.4306862354278564, + "loss/hidden": 1.09375, + "loss/logits": 0.16968411207199097, + "loss/reg": 4.5699918700847775e-05, + "step": 1261 + }, + { + "epoch": 0.15775, + "grad_norm": 2.0749716758728027, + "grad_norm_var": 0.07433122353475691, + "learning_rate": 0.0001, + "loss": 1.3118, + "loss/crossentropy": 2.5065548419952393, + "loss/hidden": 1.1328125, + "loss/logits": 0.17850646376609802, + "loss/reg": 4.568080112221651e-05, + "step": 1262 + }, + { + "epoch": 0.157875, + "grad_norm": 1.962320327758789, + "grad_norm_var": 0.08481014322264452, + "learning_rate": 0.0001, + "loss": 1.2214, + "loss/crossentropy": 2.2913718223571777, + "loss/hidden": 1.0625, + "loss/logits": 0.15845662355422974, + "loss/reg": 4.566472489386797e-05, + "step": 1263 + }, + { + "epoch": 0.158, + "grad_norm": 2.977077007293701, + "grad_norm_var": 0.10470244938228059, + "learning_rate": 0.0001, + "loss": 1.238, + "loss/crossentropy": 2.6589906215667725, + "loss/hidden": 1.0859375, + "loss/logits": 0.15159624814987183, + "loss/reg": 4.56501220469363e-05, + "step": 1264 + }, + { + "epoch": 0.158125, + "grad_norm": 2.3853960037231445, + "grad_norm_var": 0.101758608177992, + "learning_rate": 0.0001, + "loss": 1.4115, + "loss/crossentropy": 2.3630943298339844, + "loss/hidden": 1.21875, + "loss/logits": 0.1922955960035324, + "loss/reg": 4.5634922571480274e-05, + "step": 1265 + }, + { + "epoch": 0.15825, + "grad_norm": 2.4945428371429443, + "grad_norm_var": 0.09752244376290188, + "learning_rate": 0.0001, + "loss": 1.2384, + "loss/crossentropy": 2.578355073928833, + "loss/hidden": 1.078125, + "loss/logits": 0.1598268449306488, + "loss/reg": 4.5620046876138076e-05, + "step": 1266 + }, + { + "epoch": 0.158375, + "grad_norm": 2.6602394580841064, + "grad_norm_var": 0.10181342884065975, + "learning_rate": 0.0001, + "loss": 1.4403, + "loss/crossentropy": 2.4018986225128174, + "loss/hidden": 1.25, + "loss/logits": 0.18987661600112915, + "loss/reg": 4.5601722376886755e-05, + "step": 1267 + }, + { + "epoch": 0.1585, + "grad_norm": 2.1531805992126465, + "grad_norm_var": 0.10570789087357374, + "learning_rate": 0.0001, + "loss": 1.2901, + "loss/crossentropy": 2.259855031967163, + "loss/hidden": 1.1171875, + "loss/logits": 0.17248710989952087, + "loss/reg": 4.558489308692515e-05, + "step": 1268 + }, + { + "epoch": 0.158625, + "grad_norm": 6.556189060211182, + "grad_norm_var": 1.18710927748879, + "learning_rate": 0.0001, + "loss": 2.2171, + "loss/crossentropy": 3.0120301246643066, + "loss/hidden": 1.6796875, + "loss/logits": 0.5369682312011719, + "loss/reg": 4.5569711801363155e-05, + "step": 1269 + }, + { + "epoch": 0.15875, + "grad_norm": 1.949015736579895, + "grad_norm_var": 1.210868993450559, + "learning_rate": 0.0001, + "loss": 1.1905, + "loss/crossentropy": 2.266023635864258, + "loss/hidden": 1.0390625, + "loss/logits": 0.15093478560447693, + "loss/reg": 4.555568375508301e-05, + "step": 1270 + }, + { + "epoch": 0.158875, + "grad_norm": 2.2778539657592773, + "grad_norm_var": 1.2158740780819797, + "learning_rate": 0.0001, + "loss": 1.3343, + "loss/crossentropy": 2.4026403427124023, + "loss/hidden": 1.15625, + "loss/logits": 0.1775573194026947, + "loss/reg": 4.554086262942292e-05, + "step": 1271 + }, + { + "epoch": 0.159, + "grad_norm": 2.177086591720581, + "grad_norm_var": 1.2250197385653658, + "learning_rate": 0.0001, + "loss": 1.217, + "loss/crossentropy": 2.674293279647827, + "loss/hidden": 1.0546875, + "loss/logits": 0.16184410452842712, + "loss/reg": 4.552335667540319e-05, + "step": 1272 + }, + { + "epoch": 0.159125, + "grad_norm": 8.303696632385254, + "grad_norm_var": 3.235757025826163, + "learning_rate": 0.0001, + "loss": 1.8709, + "loss/crossentropy": 2.4282426834106445, + "loss/hidden": 1.6328125, + "loss/logits": 0.237609401345253, + "loss/reg": 4.551160236587748e-05, + "step": 1273 + }, + { + "epoch": 0.15925, + "grad_norm": 2.404670000076294, + "grad_norm_var": 3.244871326073413, + "learning_rate": 0.0001, + "loss": 1.2833, + "loss/crossentropy": 2.5967812538146973, + "loss/hidden": 1.09375, + "loss/logits": 0.18913458287715912, + "loss/reg": 4.549379809759557e-05, + "step": 1274 + }, + { + "epoch": 0.159375, + "grad_norm": 5.456787109375, + "grad_norm_var": 3.575733704160466, + "learning_rate": 0.0001, + "loss": 1.5074, + "loss/crossentropy": 2.9396886825561523, + "loss/hidden": 1.3125, + "loss/logits": 0.19442874193191528, + "loss/reg": 4.547737626126036e-05, + "step": 1275 + }, + { + "epoch": 0.1595, + "grad_norm": 4.377321720123291, + "grad_norm_var": 3.580348684788946, + "learning_rate": 0.0001, + "loss": 1.3265, + "loss/crossentropy": 2.5567522048950195, + "loss/hidden": 1.15625, + "loss/logits": 0.1698286086320877, + "loss/reg": 4.5460172259481624e-05, + "step": 1276 + }, + { + "epoch": 0.159625, + "grad_norm": 2.861487865447998, + "grad_norm_var": 3.545091749430006, + "learning_rate": 0.0001, + "loss": 1.2936, + "loss/crossentropy": 2.5974984169006348, + "loss/hidden": 1.1328125, + "loss/logits": 0.16036538779735565, + "loss/reg": 4.5443983253790066e-05, + "step": 1277 + }, + { + "epoch": 0.15975, + "grad_norm": 2.5587146282196045, + "grad_norm_var": 3.4796082011222356, + "learning_rate": 0.0001, + "loss": 1.3762, + "loss/crossentropy": 2.2575130462646484, + "loss/hidden": 1.1875, + "loss/logits": 0.18820002675056458, + "loss/reg": 4.5428696466842666e-05, + "step": 1278 + }, + { + "epoch": 0.159875, + "grad_norm": 2.16434907913208, + "grad_norm_var": 3.4448538033595457, + "learning_rate": 0.0001, + "loss": 1.4059, + "loss/crossentropy": 2.222012758255005, + "loss/hidden": 1.2109375, + "loss/logits": 0.1944960057735443, + "loss/reg": 4.5414264604914933e-05, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 3.1996350288391113, + "grad_norm_var": 3.436590982541448, + "learning_rate": 0.0001, + "loss": 1.509, + "loss/crossentropy": 2.7933011054992676, + "loss/hidden": 1.265625, + "loss/logits": 0.24294595420360565, + "loss/reg": 4.5400753151625395e-05, + "step": 1280 + }, + { + "epoch": 0.160125, + "grad_norm": 2.10770583152771, + "grad_norm_var": 3.4780050157497047, + "learning_rate": 0.0001, + "loss": 1.2331, + "loss/crossentropy": 2.5146090984344482, + "loss/hidden": 1.0703125, + "loss/logits": 0.16236549615859985, + "loss/reg": 4.538971552392468e-05, + "step": 1281 + }, + { + "epoch": 0.16025, + "grad_norm": 2.3782973289489746, + "grad_norm_var": 3.4922079229987424, + "learning_rate": 0.0001, + "loss": 1.2909, + "loss/crossentropy": 2.5038681030273438, + "loss/hidden": 1.0859375, + "loss/logits": 0.20447075366973877, + "loss/reg": 4.53733628091868e-05, + "step": 1282 + }, + { + "epoch": 0.160375, + "grad_norm": 3.447486162185669, + "grad_norm_var": 3.4586315294422163, + "learning_rate": 0.0001, + "loss": 1.5448, + "loss/crossentropy": 2.573171854019165, + "loss/hidden": 1.296875, + "loss/logits": 0.247447669506073, + "loss/reg": 4.53554603154771e-05, + "step": 1283 + }, + { + "epoch": 0.1605, + "grad_norm": 3.238802194595337, + "grad_norm_var": 3.3520558241327163, + "learning_rate": 0.0001, + "loss": 1.5793, + "loss/crossentropy": 3.2341203689575195, + "loss/hidden": 1.359375, + "loss/logits": 0.2195206880569458, + "loss/reg": 4.533507308224216e-05, + "step": 1284 + }, + { + "epoch": 0.160625, + "grad_norm": 2.7456347942352295, + "grad_norm_var": 2.6896300538670217, + "learning_rate": 0.0001, + "loss": 1.5458, + "loss/crossentropy": 2.585313081741333, + "loss/hidden": 1.296875, + "loss/logits": 0.24851781129837036, + "loss/reg": 4.5315591705730185e-05, + "step": 1285 + }, + { + "epoch": 0.16075, + "grad_norm": 2.555786609649658, + "grad_norm_var": 2.609164594143196, + "learning_rate": 0.0001, + "loss": 1.354, + "loss/crossentropy": 2.4461801052093506, + "loss/hidden": 1.1484375, + "loss/logits": 0.20508697628974915, + "loss/reg": 4.529834768618457e-05, + "step": 1286 + }, + { + "epoch": 0.160875, + "grad_norm": 2.5963664054870605, + "grad_norm_var": 2.5735421395473925, + "learning_rate": 0.0001, + "loss": 1.318, + "loss/crossentropy": 2.4623584747314453, + "loss/hidden": 1.125, + "loss/logits": 0.19257178902626038, + "loss/reg": 4.528291538008489e-05, + "step": 1287 + }, + { + "epoch": 0.161, + "grad_norm": 2.6578991413116455, + "grad_norm_var": 2.5169090388190605, + "learning_rate": 0.0001, + "loss": 1.3076, + "loss/crossentropy": 2.818201780319214, + "loss/hidden": 1.125, + "loss/logits": 0.18214286863803864, + "loss/reg": 4.5269247493706644e-05, + "step": 1288 + }, + { + "epoch": 0.161125, + "grad_norm": 2.2922048568725586, + "grad_norm_var": 0.7776683827363757, + "learning_rate": 0.0001, + "loss": 1.2893, + "loss/crossentropy": 2.677027940750122, + "loss/hidden": 1.125, + "loss/logits": 0.1638355702161789, + "loss/reg": 4.525210533756763e-05, + "step": 1289 + }, + { + "epoch": 0.16125, + "grad_norm": 2.2950809001922607, + "grad_norm_var": 0.7862440467010436, + "learning_rate": 0.0001, + "loss": 1.5369, + "loss/crossentropy": 2.3946428298950195, + "loss/hidden": 1.3046875, + "loss/logits": 0.2318038046360016, + "loss/reg": 4.523458119365387e-05, + "step": 1290 + }, + { + "epoch": 0.161375, + "grad_norm": 2.176048517227173, + "grad_norm_var": 0.35511413265154773, + "learning_rate": 0.0001, + "loss": 1.2962, + "loss/crossentropy": 2.3253607749938965, + "loss/hidden": 1.125, + "loss/logits": 0.17075207829475403, + "loss/reg": 4.521861774264835e-05, + "step": 1291 + }, + { + "epoch": 0.1615, + "grad_norm": 13.866178512573242, + "grad_norm_var": 8.06882346208631, + "learning_rate": 0.0001, + "loss": 1.379, + "loss/crossentropy": 2.5671613216400146, + "loss/hidden": 1.1953125, + "loss/logits": 0.1832769513130188, + "loss/reg": 4.5201886678114533e-05, + "step": 1292 + }, + { + "epoch": 0.161625, + "grad_norm": 2.2611589431762695, + "grad_norm_var": 8.128157666131413, + "learning_rate": 0.0001, + "loss": 1.3256, + "loss/crossentropy": 2.3536336421966553, + "loss/hidden": 1.1484375, + "loss/logits": 0.176661878824234, + "loss/reg": 4.518694549915381e-05, + "step": 1293 + }, + { + "epoch": 0.16175, + "grad_norm": 2.574432134628296, + "grad_norm_var": 8.126653496369219, + "learning_rate": 0.0001, + "loss": 1.6454, + "loss/crossentropy": 2.236233711242676, + "loss/hidden": 1.3828125, + "loss/logits": 0.2621437907218933, + "loss/reg": 4.517089473665692e-05, + "step": 1294 + }, + { + "epoch": 0.161875, + "grad_norm": 2.8371546268463135, + "grad_norm_var": 8.054430963011198, + "learning_rate": 0.0001, + "loss": 1.1682, + "loss/crossentropy": 2.739943027496338, + "loss/hidden": 1.015625, + "loss/logits": 0.15212970972061157, + "loss/reg": 4.515535692917183e-05, + "step": 1295 + }, + { + "epoch": 0.162, + "grad_norm": 2.6954469680786133, + "grad_norm_var": 8.078871991774545, + "learning_rate": 0.0001, + "loss": 1.3803, + "loss/crossentropy": 2.3508806228637695, + "loss/hidden": 1.1953125, + "loss/logits": 0.18451997637748718, + "loss/reg": 4.513954991125502e-05, + "step": 1296 + }, + { + "epoch": 0.162125, + "grad_norm": 3.7373878955841064, + "grad_norm_var": 7.986798008871676, + "learning_rate": 0.0001, + "loss": 1.466, + "loss/crossentropy": 2.8382225036621094, + "loss/hidden": 1.25, + "loss/logits": 0.21551957726478577, + "loss/reg": 4.512203304329887e-05, + "step": 1297 + }, + { + "epoch": 0.16225, + "grad_norm": 4.052486419677734, + "grad_norm_var": 7.934532747645363, + "learning_rate": 0.0001, + "loss": 1.2291, + "loss/crossentropy": 2.3494935035705566, + "loss/hidden": 1.078125, + "loss/logits": 0.1504923701286316, + "loss/reg": 4.510927101364359e-05, + "step": 1298 + }, + { + "epoch": 0.162375, + "grad_norm": 2.686509132385254, + "grad_norm_var": 7.976241291204222, + "learning_rate": 0.0001, + "loss": 1.2428, + "loss/crossentropy": 2.5574803352355957, + "loss/hidden": 1.078125, + "loss/logits": 0.1642056405544281, + "loss/reg": 4.508991332841106e-05, + "step": 1299 + }, + { + "epoch": 0.1625, + "grad_norm": 2.7092995643615723, + "grad_norm_var": 8.008977847206319, + "learning_rate": 0.0001, + "loss": 1.3322, + "loss/crossentropy": 2.4917750358581543, + "loss/hidden": 1.140625, + "loss/logits": 0.19108353555202484, + "loss/reg": 4.507151243160479e-05, + "step": 1300 + }, + { + "epoch": 0.162625, + "grad_norm": 3.4720537662506104, + "grad_norm_var": 7.976526433044462, + "learning_rate": 0.0001, + "loss": 1.3601, + "loss/crossentropy": 2.49690318107605, + "loss/hidden": 1.1796875, + "loss/logits": 0.17998597025871277, + "loss/reg": 4.505674951360561e-05, + "step": 1301 + }, + { + "epoch": 0.16275, + "grad_norm": 2.996471881866455, + "grad_norm_var": 7.935146933941526, + "learning_rate": 0.0001, + "loss": 1.2871, + "loss/crossentropy": 2.723876714706421, + "loss/hidden": 1.109375, + "loss/logits": 0.1772668957710266, + "loss/reg": 4.5041644625598565e-05, + "step": 1302 + }, + { + "epoch": 0.162875, + "grad_norm": 3.104691505432129, + "grad_norm_var": 7.890448726347673, + "learning_rate": 0.0001, + "loss": 1.6074, + "loss/crossentropy": 2.483220100402832, + "loss/hidden": 1.3515625, + "loss/logits": 0.2554011046886444, + "loss/reg": 4.502179945120588e-05, + "step": 1303 + }, + { + "epoch": 0.163, + "grad_norm": 3.673701286315918, + "grad_norm_var": 7.837376429810394, + "learning_rate": 0.0001, + "loss": 1.3646, + "loss/crossentropy": 2.6128814220428467, + "loss/hidden": 1.171875, + "loss/logits": 0.192288339138031, + "loss/reg": 4.500124487094581e-05, + "step": 1304 + }, + { + "epoch": 0.163125, + "grad_norm": 3.023348331451416, + "grad_norm_var": 7.7443295688751315, + "learning_rate": 0.0001, + "loss": 1.3105, + "loss/crossentropy": 2.4459805488586426, + "loss/hidden": 1.125, + "loss/logits": 0.1850828230381012, + "loss/reg": 4.497506597544998e-05, + "step": 1305 + }, + { + "epoch": 0.16325, + "grad_norm": 2.8581490516662598, + "grad_norm_var": 7.663542686186688, + "learning_rate": 0.0001, + "loss": 1.8724, + "loss/crossentropy": 2.4504129886627197, + "loss/hidden": 1.5703125, + "loss/logits": 0.30162930488586426, + "loss/reg": 4.495858229347505e-05, + "step": 1306 + }, + { + "epoch": 0.163375, + "grad_norm": 2.220228672027588, + "grad_norm_var": 7.65486261444942, + "learning_rate": 0.0001, + "loss": 1.3188, + "loss/crossentropy": 2.48659348487854, + "loss/hidden": 1.15625, + "loss/logits": 0.16205593943595886, + "loss/reg": 4.493745291256346e-05, + "step": 1307 + }, + { + "epoch": 0.1635, + "grad_norm": 2.5835015773773193, + "grad_norm_var": 0.27692455359477297, + "learning_rate": 0.0001, + "loss": 1.4295, + "loss/crossentropy": 2.087364435195923, + "loss/hidden": 1.25, + "loss/logits": 0.1790969967842102, + "loss/reg": 4.491967774811201e-05, + "step": 1308 + }, + { + "epoch": 0.163625, + "grad_norm": 3.2623960971832275, + "grad_norm_var": 0.24523372884796876, + "learning_rate": 0.0001, + "loss": 1.413, + "loss/crossentropy": 2.5019094944000244, + "loss/hidden": 1.2265625, + "loss/logits": 0.18602579832077026, + "loss/reg": 4.489835919230245e-05, + "step": 1309 + }, + { + "epoch": 0.16375, + "grad_norm": 3.2299067974090576, + "grad_norm_var": 0.23223192578486382, + "learning_rate": 0.0001, + "loss": 1.281, + "loss/crossentropy": 2.6735575199127197, + "loss/hidden": 1.109375, + "loss/logits": 0.17119300365447998, + "loss/reg": 4.4880165660288185e-05, + "step": 1310 + }, + { + "epoch": 0.163875, + "grad_norm": 3.0332422256469727, + "grad_norm_var": 0.22851017898726292, + "learning_rate": 0.0001, + "loss": 1.6158, + "loss/crossentropy": 2.3394935131073, + "loss/hidden": 1.3828125, + "loss/logits": 0.23254308104515076, + "loss/reg": 4.485783938434906e-05, + "step": 1311 + }, + { + "epoch": 0.164, + "grad_norm": 2.899348735809326, + "grad_norm_var": 0.2205539210923611, + "learning_rate": 0.0001, + "loss": 1.4468, + "loss/crossentropy": 2.635579824447632, + "loss/hidden": 1.234375, + "loss/logits": 0.2120014727115631, + "loss/reg": 4.483287193579599e-05, + "step": 1312 + }, + { + "epoch": 0.164125, + "grad_norm": 2.1863510608673096, + "grad_norm_var": 0.2383558542244522, + "learning_rate": 0.0001, + "loss": 1.2466, + "loss/crossentropy": 2.3679277896881104, + "loss/hidden": 1.0859375, + "loss/logits": 0.16022589802742004, + "loss/reg": 4.4809763494413346e-05, + "step": 1313 + }, + { + "epoch": 0.16425, + "grad_norm": 1.8936806917190552, + "grad_norm_var": 0.22653542770815271, + "learning_rate": 0.0001, + "loss": 1.3183, + "loss/crossentropy": 2.203962564468384, + "loss/hidden": 1.15625, + "loss/logits": 0.1615590900182724, + "loss/reg": 4.479134804569185e-05, + "step": 1314 + }, + { + "epoch": 0.164375, + "grad_norm": 2.3853209018707275, + "grad_norm_var": 0.23935511818615438, + "learning_rate": 0.0001, + "loss": 1.2934, + "loss/crossentropy": 2.761049509048462, + "loss/hidden": 1.1328125, + "loss/logits": 0.16009873151779175, + "loss/reg": 4.476767935557291e-05, + "step": 1315 + }, + { + "epoch": 0.1645, + "grad_norm": 2.4226620197296143, + "grad_norm_var": 0.24970435950411762, + "learning_rate": 0.0001, + "loss": 1.2592, + "loss/crossentropy": 2.514742612838745, + "loss/hidden": 1.09375, + "loss/logits": 0.16496333479881287, + "loss/reg": 4.4750799133908004e-05, + "step": 1316 + }, + { + "epoch": 0.164625, + "grad_norm": 2.6166725158691406, + "grad_norm_var": 0.22195831312392228, + "learning_rate": 0.0001, + "loss": 1.4806, + "loss/crossentropy": 2.658413887023926, + "loss/hidden": 1.2578125, + "loss/logits": 0.22230298817157745, + "loss/reg": 4.47302772954572e-05, + "step": 1317 + }, + { + "epoch": 0.16475, + "grad_norm": 2.779916763305664, + "grad_norm_var": 0.21847590222987534, + "learning_rate": 0.0001, + "loss": 1.1961, + "loss/crossentropy": 2.6549179553985596, + "loss/hidden": 1.0390625, + "loss/logits": 0.15657024085521698, + "loss/reg": 4.4711912778439e-05, + "step": 1318 + }, + { + "epoch": 0.164875, + "grad_norm": 2.44938325881958, + "grad_norm_var": 0.21526962094278013, + "learning_rate": 0.0001, + "loss": 1.2925, + "loss/crossentropy": 2.5260696411132812, + "loss/hidden": 1.1171875, + "loss/logits": 0.17487144470214844, + "loss/reg": 4.468999759410508e-05, + "step": 1319 + }, + { + "epoch": 0.165, + "grad_norm": 2.3545005321502686, + "grad_norm_var": 0.15626391559457595, + "learning_rate": 0.0001, + "loss": 1.3639, + "loss/crossentropy": 2.4437973499298096, + "loss/hidden": 1.171875, + "loss/logits": 0.19156785309314728, + "loss/reg": 4.466849713935517e-05, + "step": 1320 + }, + { + "epoch": 0.165125, + "grad_norm": 2.07038950920105, + "grad_norm_var": 0.16398468550203796, + "learning_rate": 0.0001, + "loss": 1.4093, + "loss/crossentropy": 2.3855857849121094, + "loss/hidden": 1.21875, + "loss/logits": 0.19014930725097656, + "loss/reg": 4.465408710530028e-05, + "step": 1321 + }, + { + "epoch": 0.16525, + "grad_norm": 3.916027545928955, + "grad_norm_var": 0.2734647347174624, + "learning_rate": 0.0001, + "loss": 1.6608, + "loss/crossentropy": 2.469409942626953, + "loss/hidden": 1.4140625, + "loss/logits": 0.24630755186080933, + "loss/reg": 4.4640266423812136e-05, + "step": 1322 + }, + { + "epoch": 0.165375, + "grad_norm": 2.7967050075531006, + "grad_norm_var": 0.261664755882631, + "learning_rate": 0.0001, + "loss": 1.3672, + "loss/crossentropy": 2.4856441020965576, + "loss/hidden": 1.1875, + "loss/logits": 0.17921873927116394, + "loss/reg": 4.462606375454925e-05, + "step": 1323 + }, + { + "epoch": 0.1655, + "grad_norm": 2.528658151626587, + "grad_norm_var": 0.2625583864054059, + "learning_rate": 0.0001, + "loss": 1.2424, + "loss/crossentropy": 2.6862435340881348, + "loss/hidden": 1.0703125, + "loss/logits": 0.17163142561912537, + "loss/reg": 4.4612395868171006e-05, + "step": 1324 + }, + { + "epoch": 0.165625, + "grad_norm": 2.49149751663208, + "grad_norm_var": 0.23948644297048335, + "learning_rate": 0.0001, + "loss": 1.3718, + "loss/crossentropy": 2.6498684883117676, + "loss/hidden": 1.1875, + "loss/logits": 0.183807834982872, + "loss/reg": 4.460258787730709e-05, + "step": 1325 + }, + { + "epoch": 0.16575, + "grad_norm": 2.6475727558135986, + "grad_norm_var": 0.21397661985775135, + "learning_rate": 0.0001, + "loss": 1.274, + "loss/crossentropy": 2.4302077293395996, + "loss/hidden": 1.1015625, + "loss/logits": 0.1719643473625183, + "loss/reg": 4.458642069948837e-05, + "step": 1326 + }, + { + "epoch": 0.165875, + "grad_norm": 2.360180377960205, + "grad_norm_var": 0.20269171402897346, + "learning_rate": 0.0001, + "loss": 1.2778, + "loss/crossentropy": 2.512448787689209, + "loss/hidden": 1.1015625, + "loss/logits": 0.17575357854366302, + "loss/reg": 4.456798706087284e-05, + "step": 1327 + }, + { + "epoch": 0.166, + "grad_norm": 3.0969111919403076, + "grad_norm_var": 0.21433543744030548, + "learning_rate": 0.0001, + "loss": 1.3271, + "loss/crossentropy": 2.6267106533050537, + "loss/hidden": 1.140625, + "loss/logits": 0.186046302318573, + "loss/reg": 4.455425005289726e-05, + "step": 1328 + }, + { + "epoch": 0.166125, + "grad_norm": 2.580076217651367, + "grad_norm_var": 0.20428929677158855, + "learning_rate": 0.0001, + "loss": 1.3796, + "loss/crossentropy": 2.5159518718719482, + "loss/hidden": 1.1953125, + "loss/logits": 0.18387523293495178, + "loss/reg": 4.453653309610672e-05, + "step": 1329 + }, + { + "epoch": 0.16625, + "grad_norm": 2.2171709537506104, + "grad_norm_var": 0.1809303697723879, + "learning_rate": 0.0001, + "loss": 1.2144, + "loss/crossentropy": 2.364938735961914, + "loss/hidden": 1.0546875, + "loss/logits": 0.159266397356987, + "loss/reg": 4.452219945960678e-05, + "step": 1330 + }, + { + "epoch": 0.166375, + "grad_norm": 2.7640013694763184, + "grad_norm_var": 0.17869486976307958, + "learning_rate": 0.0001, + "loss": 1.3287, + "loss/crossentropy": 2.547267436981201, + "loss/hidden": 1.140625, + "loss/logits": 0.18760260939598083, + "loss/reg": 4.4505995901999995e-05, + "step": 1331 + }, + { + "epoch": 0.1665, + "grad_norm": 2.513836145401001, + "grad_norm_var": 0.17668453543018642, + "learning_rate": 0.0001, + "loss": 1.3596, + "loss/crossentropy": 2.403233289718628, + "loss/hidden": 1.1796875, + "loss/logits": 0.17943502962589264, + "loss/reg": 4.449459811439738e-05, + "step": 1332 + }, + { + "epoch": 0.166625, + "grad_norm": 2.718043804168701, + "grad_norm_var": 0.17705922491783213, + "learning_rate": 0.0001, + "loss": 1.5178, + "loss/crossentropy": 2.420300006866455, + "loss/hidden": 1.2734375, + "loss/logits": 0.24389630556106567, + "loss/reg": 4.448366235010326e-05, + "step": 1333 + }, + { + "epoch": 0.16675, + "grad_norm": 2.5478909015655518, + "grad_norm_var": 0.17618216107275256, + "learning_rate": 0.0001, + "loss": 1.4965, + "loss/crossentropy": 2.492011785507202, + "loss/hidden": 1.265625, + "loss/logits": 0.2303885817527771, + "loss/reg": 4.4475800677901134e-05, + "step": 1334 + }, + { + "epoch": 0.166875, + "grad_norm": 2.264066457748413, + "grad_norm_var": 0.1827494628185671, + "learning_rate": 0.0001, + "loss": 1.3425, + "loss/crossentropy": 2.515028715133667, + "loss/hidden": 1.140625, + "loss/logits": 0.20141394436359406, + "loss/reg": 4.446614548214711e-05, + "step": 1335 + }, + { + "epoch": 0.167, + "grad_norm": 2.9766416549682617, + "grad_norm_var": 0.18518897405885634, + "learning_rate": 0.0001, + "loss": 1.4471, + "loss/crossentropy": 2.330453872680664, + "loss/hidden": 1.2421875, + "loss/logits": 0.2045084834098816, + "loss/reg": 4.4459426135290414e-05, + "step": 1336 + }, + { + "epoch": 0.167125, + "grad_norm": 3.2450225353240967, + "grad_norm_var": 0.1797691221482142, + "learning_rate": 0.0001, + "loss": 1.2603, + "loss/crossentropy": 2.5466201305389404, + "loss/hidden": 1.078125, + "loss/logits": 0.18173721432685852, + "loss/reg": 4.445154991117306e-05, + "step": 1337 + }, + { + "epoch": 0.16725, + "grad_norm": 2.0674796104431152, + "grad_norm_var": 0.1007740659870798, + "learning_rate": 0.0001, + "loss": 1.2446, + "loss/crossentropy": 2.6964364051818848, + "loss/hidden": 1.0625, + "loss/logits": 0.18164029717445374, + "loss/reg": 4.444210571818985e-05, + "step": 1338 + }, + { + "epoch": 0.167375, + "grad_norm": 2.396139621734619, + "grad_norm_var": 0.10101679166968154, + "learning_rate": 0.0001, + "loss": 1.3855, + "loss/crossentropy": 2.9879798889160156, + "loss/hidden": 1.171875, + "loss/logits": 0.21313458681106567, + "loss/reg": 4.443850411917083e-05, + "step": 1339 + }, + { + "epoch": 0.1675, + "grad_norm": 1.7887595891952515, + "grad_norm_var": 0.14113099684256217, + "learning_rate": 0.0001, + "loss": 1.1985, + "loss/crossentropy": 2.2846755981445312, + "loss/hidden": 1.0390625, + "loss/logits": 0.15899279713630676, + "loss/reg": 4.4424450607039034e-05, + "step": 1340 + }, + { + "epoch": 0.167625, + "grad_norm": 3.145375967025757, + "grad_norm_var": 0.163432382007852, + "learning_rate": 0.0001, + "loss": 1.3871, + "loss/crossentropy": 2.4436898231506348, + "loss/hidden": 1.1875, + "loss/logits": 0.1991751343011856, + "loss/reg": 4.4411804992705584e-05, + "step": 1341 + }, + { + "epoch": 0.16775, + "grad_norm": 2.3595798015594482, + "grad_norm_var": 0.16613940110398026, + "learning_rate": 0.0001, + "loss": 1.389, + "loss/crossentropy": 2.584183692932129, + "loss/hidden": 1.171875, + "loss/logits": 0.21666912734508514, + "loss/reg": 4.439669646671973e-05, + "step": 1342 + }, + { + "epoch": 0.167875, + "grad_norm": 2.5877060890197754, + "grad_norm_var": 0.16315910377839907, + "learning_rate": 0.0001, + "loss": 1.3234, + "loss/crossentropy": 2.7484211921691895, + "loss/hidden": 1.1484375, + "loss/logits": 0.17448198795318604, + "loss/reg": 4.438200267031789e-05, + "step": 1343 + }, + { + "epoch": 0.168, + "grad_norm": 2.4293344020843506, + "grad_norm_var": 0.144939535521361, + "learning_rate": 0.0001, + "loss": 1.2442, + "loss/crossentropy": 2.207484245300293, + "loss/hidden": 1.078125, + "loss/logits": 0.16565534472465515, + "loss/reg": 4.437361712916754e-05, + "step": 1344 + }, + { + "epoch": 0.168125, + "grad_norm": 2.433609962463379, + "grad_norm_var": 0.14545021764670352, + "learning_rate": 0.0001, + "loss": 1.2619, + "loss/crossentropy": 2.1073293685913086, + "loss/hidden": 1.078125, + "loss/logits": 0.18332740664482117, + "loss/reg": 4.4370663090376183e-05, + "step": 1345 + }, + { + "epoch": 0.16825, + "grad_norm": 2.080322027206421, + "grad_norm_var": 0.15229983777140596, + "learning_rate": 0.0001, + "loss": 1.2279, + "loss/crossentropy": 2.59745717048645, + "loss/hidden": 1.0625, + "loss/logits": 0.16493362188339233, + "loss/reg": 4.436418385012075e-05, + "step": 1346 + }, + { + "epoch": 0.168375, + "grad_norm": 2.801445245742798, + "grad_norm_var": 0.1536063298279311, + "learning_rate": 0.0001, + "loss": 1.3217, + "loss/crossentropy": 2.6507155895233154, + "loss/hidden": 1.125, + "loss/logits": 0.1962617039680481, + "loss/reg": 4.43481003458146e-05, + "step": 1347 + }, + { + "epoch": 0.1685, + "grad_norm": 2.6921567916870117, + "grad_norm_var": 0.15539478093565381, + "learning_rate": 0.0001, + "loss": 1.5269, + "loss/crossentropy": 2.371269702911377, + "loss/hidden": 1.3203125, + "loss/logits": 0.2061552107334137, + "loss/reg": 4.434393486008048e-05, + "step": 1348 + }, + { + "epoch": 0.168625, + "grad_norm": 2.6421940326690674, + "grad_norm_var": 0.15388647465415672, + "learning_rate": 0.0001, + "loss": 1.259, + "loss/crossentropy": 2.309507369995117, + "loss/hidden": 1.078125, + "loss/logits": 0.18043813109397888, + "loss/reg": 4.433002322912216e-05, + "step": 1349 + }, + { + "epoch": 0.16875, + "grad_norm": 2.754995584487915, + "grad_norm_var": 0.1570997294501703, + "learning_rate": 0.0001, + "loss": 1.3353, + "loss/crossentropy": 2.3387720584869385, + "loss/hidden": 1.140625, + "loss/logits": 0.19423820078372955, + "loss/reg": 4.432429341250099e-05, + "step": 1350 + }, + { + "epoch": 0.168875, + "grad_norm": 2.669968366622925, + "grad_norm_var": 0.15237942264023818, + "learning_rate": 0.0001, + "loss": 1.4666, + "loss/crossentropy": 2.56152606010437, + "loss/hidden": 1.234375, + "loss/logits": 0.23174414038658142, + "loss/reg": 4.4314343540463597e-05, + "step": 1351 + }, + { + "epoch": 0.169, + "grad_norm": 3.638333559036255, + "grad_norm_var": 0.21589205502504516, + "learning_rate": 0.0001, + "loss": 1.3802, + "loss/crossentropy": 2.711392879486084, + "loss/hidden": 1.1875, + "loss/logits": 0.19228197634220123, + "loss/reg": 4.430532862897962e-05, + "step": 1352 + }, + { + "epoch": 0.169125, + "grad_norm": 2.1712546348571777, + "grad_norm_var": 0.19679081461784917, + "learning_rate": 0.0001, + "loss": 1.3556, + "loss/crossentropy": 2.6300976276397705, + "loss/hidden": 1.1640625, + "loss/logits": 0.19107992947101593, + "loss/reg": 4.4293406972428784e-05, + "step": 1353 + }, + { + "epoch": 0.16925, + "grad_norm": 2.3168716430664062, + "grad_norm_var": 0.18492694202072874, + "learning_rate": 0.0001, + "loss": 1.3021, + "loss/crossentropy": 2.3884122371673584, + "loss/hidden": 1.1171875, + "loss/logits": 0.18451938033103943, + "loss/reg": 4.427470048540272e-05, + "step": 1354 + }, + { + "epoch": 0.169375, + "grad_norm": 2.2110610008239746, + "grad_norm_var": 0.19103130230434662, + "learning_rate": 0.0001, + "loss": 1.3532, + "loss/crossentropy": 2.588346242904663, + "loss/hidden": 1.140625, + "loss/logits": 0.21214817464351654, + "loss/reg": 4.4263808376854286e-05, + "step": 1355 + }, + { + "epoch": 0.1695, + "grad_norm": 2.515874147415161, + "grad_norm_var": 0.15074033294804298, + "learning_rate": 0.0001, + "loss": 1.3287, + "loss/crossentropy": 2.5218629837036133, + "loss/hidden": 1.15625, + "loss/logits": 0.17196393013000488, + "loss/reg": 4.4249900383874774e-05, + "step": 1356 + }, + { + "epoch": 0.169625, + "grad_norm": 2.6412770748138428, + "grad_norm_var": 0.12933633378730272, + "learning_rate": 0.0001, + "loss": 1.2391, + "loss/crossentropy": 3.1147713661193848, + "loss/hidden": 1.0703125, + "loss/logits": 0.16838082671165466, + "loss/reg": 4.4234649976715446e-05, + "step": 1357 + }, + { + "epoch": 0.16975, + "grad_norm": 2.0776495933532715, + "grad_norm_var": 0.14180512977350357, + "learning_rate": 0.0001, + "loss": 1.1746, + "loss/crossentropy": 2.6043543815612793, + "loss/hidden": 1.015625, + "loss/logits": 0.15855082869529724, + "loss/reg": 4.422090933076106e-05, + "step": 1358 + }, + { + "epoch": 0.169875, + "grad_norm": 3.2153241634368896, + "grad_norm_var": 0.17029051137356627, + "learning_rate": 0.0001, + "loss": 1.2748, + "loss/crossentropy": 2.0572850704193115, + "loss/hidden": 1.125, + "loss/logits": 0.14936628937721252, + "loss/reg": 4.420821642270312e-05, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 2.1766624450683594, + "grad_norm_var": 0.17938114614681974, + "learning_rate": 0.0001, + "loss": 1.3318, + "loss/crossentropy": 2.3410725593566895, + "loss/hidden": 1.140625, + "loss/logits": 0.19073010981082916, + "loss/reg": 4.4193744543008506e-05, + "step": 1360 + }, + { + "epoch": 0.170125, + "grad_norm": 3.1066315174102783, + "grad_norm_var": 0.1959061853200069, + "learning_rate": 0.0001, + "loss": 1.3065, + "loss/crossentropy": 2.4431891441345215, + "loss/hidden": 1.1328125, + "loss/logits": 0.1732902228832245, + "loss/reg": 4.4178697862662375e-05, + "step": 1361 + }, + { + "epoch": 0.17025, + "grad_norm": 2.798218250274658, + "grad_norm_var": 0.17770364110440345, + "learning_rate": 0.0001, + "loss": 1.3664, + "loss/crossentropy": 2.451411008834839, + "loss/hidden": 1.15625, + "loss/logits": 0.2097121775150299, + "loss/reg": 4.416131559992209e-05, + "step": 1362 + }, + { + "epoch": 0.170375, + "grad_norm": 3.6879491806030273, + "grad_norm_var": 0.2445016046832573, + "learning_rate": 0.0001, + "loss": 2.0203, + "loss/crossentropy": 2.753589630126953, + "loss/hidden": 1.6953125, + "loss/logits": 0.3245903253555298, + "loss/reg": 4.414744034875184e-05, + "step": 1363 + }, + { + "epoch": 0.1705, + "grad_norm": 3.903742551803589, + "grad_norm_var": 0.33380536863192345, + "learning_rate": 0.0001, + "loss": 1.6308, + "loss/crossentropy": 2.876974582672119, + "loss/hidden": 1.3828125, + "loss/logits": 0.24756835401058197, + "loss/reg": 4.413632632349618e-05, + "step": 1364 + }, + { + "epoch": 0.170625, + "grad_norm": 2.2541909217834473, + "grad_norm_var": 0.3504989650026715, + "learning_rate": 0.0001, + "loss": 1.2854, + "loss/crossentropy": 2.5404322147369385, + "loss/hidden": 1.09375, + "loss/logits": 0.19123107194900513, + "loss/reg": 4.412614362081513e-05, + "step": 1365 + }, + { + "epoch": 0.17075, + "grad_norm": 2.7338268756866455, + "grad_norm_var": 0.3505375697769665, + "learning_rate": 0.0001, + "loss": 1.3595, + "loss/crossentropy": 2.5987484455108643, + "loss/hidden": 1.1640625, + "loss/logits": 0.19501641392707825, + "loss/reg": 4.4113399781053886e-05, + "step": 1366 + }, + { + "epoch": 0.170875, + "grad_norm": 2.3233795166015625, + "grad_norm_var": 0.3620869455068522, + "learning_rate": 0.0001, + "loss": 1.3391, + "loss/crossentropy": 2.3241047859191895, + "loss/hidden": 1.15625, + "loss/logits": 0.18244224786758423, + "loss/reg": 4.4100765080656856e-05, + "step": 1367 + }, + { + "epoch": 0.171, + "grad_norm": 2.498671293258667, + "grad_norm_var": 0.3061141155101474, + "learning_rate": 0.0001, + "loss": 1.4606, + "loss/crossentropy": 2.457289218902588, + "loss/hidden": 1.234375, + "loss/logits": 0.2258085310459137, + "loss/reg": 4.408616223372519e-05, + "step": 1368 + }, + { + "epoch": 0.171125, + "grad_norm": 4.238934516906738, + "grad_norm_var": 0.4373271589653179, + "learning_rate": 0.0001, + "loss": 1.5951, + "loss/crossentropy": 2.760684013366699, + "loss/hidden": 1.3671875, + "loss/logits": 0.2275162637233734, + "loss/reg": 4.407550295582041e-05, + "step": 1369 + }, + { + "epoch": 0.17125, + "grad_norm": 4.71045446395874, + "grad_norm_var": 0.6432062535952201, + "learning_rate": 0.0001, + "loss": 1.6466, + "loss/crossentropy": 2.5298707485198975, + "loss/hidden": 1.4296875, + "loss/logits": 0.21650245785713196, + "loss/reg": 4.406468724482693e-05, + "step": 1370 + }, + { + "epoch": 0.171375, + "grad_norm": 2.916440486907959, + "grad_norm_var": 0.6054300939970363, + "learning_rate": 0.0001, + "loss": 1.3805, + "loss/crossentropy": 2.6951613426208496, + "loss/hidden": 1.171875, + "loss/logits": 0.20820212364196777, + "loss/reg": 4.4051706936443225e-05, + "step": 1371 + }, + { + "epoch": 0.1715, + "grad_norm": 2.497441530227661, + "grad_norm_var": 0.6066103168523642, + "learning_rate": 0.0001, + "loss": 1.1551, + "loss/crossentropy": 2.323773145675659, + "loss/hidden": 1.015625, + "loss/logits": 0.13906535506248474, + "loss/reg": 4.403495404403657e-05, + "step": 1372 + }, + { + "epoch": 0.171625, + "grad_norm": 2.6485416889190674, + "grad_norm_var": 0.6062794211515701, + "learning_rate": 0.0001, + "loss": 1.4911, + "loss/crossentropy": 2.572211980819702, + "loss/hidden": 1.2578125, + "loss/logits": 0.23286481201648712, + "loss/reg": 4.402002741699107e-05, + "step": 1373 + }, + { + "epoch": 0.17175, + "grad_norm": 2.163757801055908, + "grad_norm_var": 0.596305325230626, + "learning_rate": 0.0001, + "loss": 1.2345, + "loss/crossentropy": 2.4492759704589844, + "loss/hidden": 1.0546875, + "loss/logits": 0.17937690019607544, + "loss/reg": 4.40061412518844e-05, + "step": 1374 + }, + { + "epoch": 0.171875, + "grad_norm": 2.115994930267334, + "grad_norm_var": 0.6391237393217495, + "learning_rate": 0.0001, + "loss": 1.3079, + "loss/crossentropy": 2.676708459854126, + "loss/hidden": 1.1328125, + "loss/logits": 0.17461000382900238, + "loss/reg": 4.399328099680133e-05, + "step": 1375 + }, + { + "epoch": 0.172, + "grad_norm": 2.555100440979004, + "grad_norm_var": 0.6103941335775583, + "learning_rate": 0.0001, + "loss": 1.2241, + "loss/crossentropy": 2.4660518169403076, + "loss/hidden": 1.0703125, + "loss/logits": 0.15334627032279968, + "loss/reg": 4.398010059958324e-05, + "step": 1376 + }, + { + "epoch": 0.172125, + "grad_norm": 2.0078580379486084, + "grad_norm_var": 0.6624757473026042, + "learning_rate": 0.0001, + "loss": 1.307, + "loss/crossentropy": 2.5937724113464355, + "loss/hidden": 1.125, + "loss/logits": 0.18153205513954163, + "loss/reg": 4.3968833779217675e-05, + "step": 1377 + }, + { + "epoch": 0.17225, + "grad_norm": 2.7480297088623047, + "grad_norm_var": 0.6631697814477386, + "learning_rate": 0.0001, + "loss": 1.2397, + "loss/crossentropy": 2.588857412338257, + "loss/hidden": 1.0703125, + "loss/logits": 0.16894888877868652, + "loss/reg": 4.395194991957396e-05, + "step": 1378 + }, + { + "epoch": 0.172375, + "grad_norm": 3.1688647270202637, + "grad_norm_var": 0.6237637466773701, + "learning_rate": 0.0001, + "loss": 1.4138, + "loss/crossentropy": 2.757561206817627, + "loss/hidden": 1.21875, + "loss/logits": 0.19462066888809204, + "loss/reg": 4.3936484871665016e-05, + "step": 1379 + }, + { + "epoch": 0.1725, + "grad_norm": 15.428227424621582, + "grad_norm_var": 10.554824158588978, + "learning_rate": 0.0001, + "loss": 1.6149, + "loss/crossentropy": 2.402078151702881, + "loss/hidden": 1.3984375, + "loss/logits": 0.21606966853141785, + "loss/reg": 4.392436676425859e-05, + "step": 1380 + }, + { + "epoch": 0.172625, + "grad_norm": 2.6017940044403076, + "grad_norm_var": 10.501711460516692, + "learning_rate": 0.0001, + "loss": 1.2484, + "loss/crossentropy": 2.660430431365967, + "loss/hidden": 1.0703125, + "loss/logits": 0.17760685086250305, + "loss/reg": 4.391232141642831e-05, + "step": 1381 + }, + { + "epoch": 0.17275, + "grad_norm": 2.525033950805664, + "grad_norm_var": 10.528127305203704, + "learning_rate": 0.0001, + "loss": 1.4136, + "loss/crossentropy": 2.454327344894409, + "loss/hidden": 1.203125, + "loss/logits": 0.210074782371521, + "loss/reg": 4.389778769109398e-05, + "step": 1382 + }, + { + "epoch": 0.172875, + "grad_norm": 2.833066463470459, + "grad_norm_var": 10.459524290972332, + "learning_rate": 0.0001, + "loss": 1.4644, + "loss/crossentropy": 2.8289997577667236, + "loss/hidden": 1.234375, + "loss/logits": 0.22955310344696045, + "loss/reg": 4.3889413063880056e-05, + "step": 1383 + }, + { + "epoch": 0.173, + "grad_norm": 2.6454131603240967, + "grad_norm_var": 10.439250793189025, + "learning_rate": 0.0001, + "loss": 1.211, + "loss/crossentropy": 2.5164341926574707, + "loss/hidden": 1.0625, + "loss/logits": 0.1480443775653839, + "loss/reg": 4.388386514619924e-05, + "step": 1384 + }, + { + "epoch": 0.173125, + "grad_norm": 2.4730100631713867, + "grad_norm_var": 10.48673112258549, + "learning_rate": 0.0001, + "loss": 1.2959, + "loss/crossentropy": 2.454392433166504, + "loss/hidden": 1.1171875, + "loss/logits": 0.17824172973632812, + "loss/reg": 4.3870826630154625e-05, + "step": 1385 + }, + { + "epoch": 0.17325, + "grad_norm": 4.211810111999512, + "grad_norm_var": 10.42195551797722, + "learning_rate": 0.0001, + "loss": 1.4687, + "loss/crossentropy": 2.650273323059082, + "loss/hidden": 1.28125, + "loss/logits": 0.1870349645614624, + "loss/reg": 4.385815191199072e-05, + "step": 1386 + }, + { + "epoch": 0.173375, + "grad_norm": 2.2195355892181396, + "grad_norm_var": 10.50386579069449, + "learning_rate": 0.0001, + "loss": 1.1958, + "loss/crossentropy": 2.6540579795837402, + "loss/hidden": 1.03125, + "loss/logits": 0.16409122943878174, + "loss/reg": 4.384495332487859e-05, + "step": 1387 + }, + { + "epoch": 0.1735, + "grad_norm": 2.2435309886932373, + "grad_norm_var": 10.53938945014739, + "learning_rate": 0.0001, + "loss": 1.2039, + "loss/crossentropy": 2.4984354972839355, + "loss/hidden": 1.0546875, + "loss/logits": 0.1487729251384735, + "loss/reg": 4.3836476834258065e-05, + "step": 1388 + }, + { + "epoch": 0.173625, + "grad_norm": 6.076218128204346, + "grad_norm_var": 10.924850838611649, + "learning_rate": 0.0001, + "loss": 1.775, + "loss/crossentropy": 2.60779070854187, + "loss/hidden": 1.4375, + "loss/logits": 0.3370318114757538, + "loss/reg": 4.383291889098473e-05, + "step": 1389 + }, + { + "epoch": 0.17375, + "grad_norm": 2.962207317352295, + "grad_norm_var": 10.809017442849843, + "learning_rate": 0.0001, + "loss": 1.3568, + "loss/crossentropy": 2.5193846225738525, + "loss/hidden": 1.171875, + "loss/logits": 0.1845148503780365, + "loss/reg": 4.3830696085933596e-05, + "step": 1390 + }, + { + "epoch": 0.173875, + "grad_norm": 2.499361991882324, + "grad_norm_var": 10.738463453127084, + "learning_rate": 0.0001, + "loss": 1.2684, + "loss/crossentropy": 2.727522373199463, + "loss/hidden": 1.078125, + "loss/logits": 0.1898011714220047, + "loss/reg": 4.38306997239124e-05, + "step": 1391 + }, + { + "epoch": 0.174, + "grad_norm": 2.902444839477539, + "grad_norm_var": 10.692983416262415, + "learning_rate": 0.0001, + "loss": 1.2714, + "loss/crossentropy": 2.5181994438171387, + "loss/hidden": 1.09375, + "loss/logits": 0.17718875408172607, + "loss/reg": 4.3814598029712215e-05, + "step": 1392 + }, + { + "epoch": 0.174125, + "grad_norm": 2.5764389038085938, + "grad_norm_var": 10.583264738967724, + "learning_rate": 0.0001, + "loss": 1.2864, + "loss/crossentropy": 2.609466552734375, + "loss/hidden": 1.1171875, + "loss/logits": 0.16876953840255737, + "loss/reg": 4.38142305938527e-05, + "step": 1393 + }, + { + "epoch": 0.17425, + "grad_norm": 1.9676804542541504, + "grad_norm_var": 10.726323120818574, + "learning_rate": 0.0001, + "loss": 1.2885, + "loss/crossentropy": 2.610501527786255, + "loss/hidden": 1.109375, + "loss/logits": 0.178666889667511, + "loss/reg": 4.380764585221186e-05, + "step": 1394 + }, + { + "epoch": 0.174375, + "grad_norm": 3.298060178756714, + "grad_norm_var": 10.71807201389054, + "learning_rate": 0.0001, + "loss": 1.5811, + "loss/crossentropy": 2.571620464324951, + "loss/hidden": 1.328125, + "loss/logits": 0.2525365352630615, + "loss/reg": 4.379446909297258e-05, + "step": 1395 + }, + { + "epoch": 0.1745, + "grad_norm": 2.5956883430480957, + "grad_norm_var": 0.9713562693941827, + "learning_rate": 0.0001, + "loss": 1.2787, + "loss/crossentropy": 2.4515395164489746, + "loss/hidden": 1.109375, + "loss/logits": 0.16887177526950836, + "loss/reg": 4.378123412607238e-05, + "step": 1396 + }, + { + "epoch": 0.174625, + "grad_norm": 6.457098484039307, + "grad_norm_var": 1.7395961483986715, + "learning_rate": 0.0001, + "loss": 1.4139, + "loss/crossentropy": 2.481762170791626, + "loss/hidden": 1.234375, + "loss/logits": 0.17910394072532654, + "loss/reg": 4.376547803985886e-05, + "step": 1397 + }, + { + "epoch": 0.17475, + "grad_norm": 2.6334445476531982, + "grad_norm_var": 1.7312187409571094, + "learning_rate": 0.0001, + "loss": 1.1642, + "loss/crossentropy": 2.4399666786193848, + "loss/hidden": 1.0078125, + "loss/logits": 0.15591077506542206, + "loss/reg": 4.375265780254267e-05, + "step": 1398 + }, + { + "epoch": 0.174875, + "grad_norm": 4.455918788909912, + "grad_norm_var": 1.824606404052923, + "learning_rate": 0.0001, + "loss": 1.4613, + "loss/crossentropy": 2.47601580619812, + "loss/hidden": 1.2265625, + "loss/logits": 0.23428377509117126, + "loss/reg": 4.3734627979574725e-05, + "step": 1399 + }, + { + "epoch": 0.175, + "grad_norm": 12.483902931213379, + "grad_norm_var": 7.063390839893884, + "learning_rate": 0.0001, + "loss": 1.2458, + "loss/crossentropy": 2.957716941833496, + "loss/hidden": 1.09375, + "loss/logits": 0.151652991771698, + "loss/reg": 4.3715503124985844e-05, + "step": 1400 + }, + { + "epoch": 0.175125, + "grad_norm": 2.1250927448272705, + "grad_norm_var": 7.136156501883171, + "learning_rate": 0.0001, + "loss": 1.361, + "loss/crossentropy": 2.3386826515197754, + "loss/hidden": 1.1640625, + "loss/logits": 0.19654320180416107, + "loss/reg": 4.36952177551575e-05, + "step": 1401 + }, + { + "epoch": 0.17525, + "grad_norm": 2.5386850833892822, + "grad_norm_var": 7.231913773217872, + "learning_rate": 0.0001, + "loss": 1.4102, + "loss/crossentropy": 2.5731849670410156, + "loss/hidden": 1.2109375, + "loss/logits": 0.19881173968315125, + "loss/reg": 4.368073132354766e-05, + "step": 1402 + }, + { + "epoch": 0.175375, + "grad_norm": 2.7075517177581787, + "grad_norm_var": 7.147069652233706, + "learning_rate": 0.0001, + "loss": 1.2989, + "loss/crossentropy": 2.923799753189087, + "loss/hidden": 1.1171875, + "loss/logits": 0.1812412142753601, + "loss/reg": 4.366214852780104e-05, + "step": 1403 + }, + { + "epoch": 0.1755, + "grad_norm": 2.7373690605163574, + "grad_norm_var": 7.06096468766825, + "learning_rate": 0.0001, + "loss": 1.2479, + "loss/crossentropy": 2.924968719482422, + "loss/hidden": 1.0859375, + "loss/logits": 0.1615295708179474, + "loss/reg": 4.3649502913467586e-05, + "step": 1404 + }, + { + "epoch": 0.175625, + "grad_norm": 2.0970749855041504, + "grad_norm_var": 6.850111452164137, + "learning_rate": 0.0001, + "loss": 1.3082, + "loss/crossentropy": 2.4840149879455566, + "loss/hidden": 1.125, + "loss/logits": 0.18273839354515076, + "loss/reg": 4.363973130239174e-05, + "step": 1405 + }, + { + "epoch": 0.17575, + "grad_norm": 3.3448736667633057, + "grad_norm_var": 6.828514064197718, + "learning_rate": 0.0001, + "loss": 1.2617, + "loss/crossentropy": 2.663756847381592, + "loss/hidden": 1.09375, + "loss/logits": 0.16753330826759338, + "loss/reg": 4.362704567029141e-05, + "step": 1406 + }, + { + "epoch": 0.175875, + "grad_norm": 2.696303606033325, + "grad_norm_var": 6.802330951091016, + "learning_rate": 0.0001, + "loss": 1.4086, + "loss/crossentropy": 2.449850559234619, + "loss/hidden": 1.1875, + "loss/logits": 0.2206811010837555, + "loss/reg": 4.361617538961582e-05, + "step": 1407 + }, + { + "epoch": 0.176, + "grad_norm": 2.2869067192077637, + "grad_norm_var": 6.88335139626389, + "learning_rate": 0.0001, + "loss": 1.33, + "loss/crossentropy": 2.6695072650909424, + "loss/hidden": 1.15625, + "loss/logits": 0.1733335256576538, + "loss/reg": 4.3601157813100144e-05, + "step": 1408 + }, + { + "epoch": 0.176125, + "grad_norm": 2.6789941787719727, + "grad_norm_var": 6.870523523354868, + "learning_rate": 0.0001, + "loss": 1.3967, + "loss/crossentropy": 2.765038251876831, + "loss/hidden": 1.1796875, + "loss/logits": 0.21654057502746582, + "loss/reg": 4.358700243756175e-05, + "step": 1409 + }, + { + "epoch": 0.17625, + "grad_norm": 2.1456291675567627, + "grad_norm_var": 6.834507974821424, + "learning_rate": 0.0001, + "loss": 1.1905, + "loss/crossentropy": 2.0626509189605713, + "loss/hidden": 1.03125, + "loss/logits": 0.1588020920753479, + "loss/reg": 4.357034413260408e-05, + "step": 1410 + }, + { + "epoch": 0.176375, + "grad_norm": 5.724753379821777, + "grad_norm_var": 7.111283813958933, + "learning_rate": 0.0001, + "loss": 1.7126, + "loss/crossentropy": 2.2581207752227783, + "loss/hidden": 1.484375, + "loss/logits": 0.22774820029735565, + "loss/reg": 4.355575583758764e-05, + "step": 1411 + }, + { + "epoch": 0.1765, + "grad_norm": 2.65604305267334, + "grad_norm_var": 7.102368611780612, + "learning_rate": 0.0001, + "loss": 1.23, + "loss/crossentropy": 2.4369256496429443, + "loss/hidden": 1.0546875, + "loss/logits": 0.17484912276268005, + "loss/reg": 4.353715121396817e-05, + "step": 1412 + }, + { + "epoch": 0.176625, + "grad_norm": 2.5243303775787354, + "grad_norm_var": 6.641966894564712, + "learning_rate": 0.0001, + "loss": 1.256, + "loss/crossentropy": 2.585545539855957, + "loss/hidden": 1.0859375, + "loss/logits": 0.169620543718338, + "loss/reg": 4.352208998170681e-05, + "step": 1413 + }, + { + "epoch": 0.17675, + "grad_norm": 3.367011070251465, + "grad_norm_var": 6.591839773502852, + "learning_rate": 0.0001, + "loss": 1.5165, + "loss/crossentropy": 2.5050852298736572, + "loss/hidden": 1.2890625, + "loss/logits": 0.22704654932022095, + "loss/reg": 4.350413291831501e-05, + "step": 1414 + }, + { + "epoch": 0.176875, + "grad_norm": 4.210506439208984, + "grad_norm_var": 6.565491347616695, + "learning_rate": 0.0001, + "loss": 1.3093, + "loss/crossentropy": 2.5862507820129395, + "loss/hidden": 1.125, + "loss/logits": 0.18390105664730072, + "loss/reg": 4.3487238144734874e-05, + "step": 1415 + }, + { + "epoch": 0.177, + "grad_norm": 3.892646312713623, + "grad_norm_var": 0.9107982589900016, + "learning_rate": 0.0001, + "loss": 1.8682, + "loss/crossentropy": 2.8675537109375, + "loss/hidden": 1.5, + "loss/logits": 0.3677327632904053, + "loss/reg": 4.347108188085258e-05, + "step": 1416 + }, + { + "epoch": 0.177125, + "grad_norm": 3.503170967102051, + "grad_norm_var": 0.8717905952754526, + "learning_rate": 0.0001, + "loss": 1.6121, + "loss/crossentropy": 2.618744134902954, + "loss/hidden": 1.3671875, + "loss/logits": 0.24444371461868286, + "loss/reg": 4.3455151171656325e-05, + "step": 1417 + }, + { + "epoch": 0.17725, + "grad_norm": 3.0146777629852295, + "grad_norm_var": 0.8522632202887498, + "learning_rate": 0.0001, + "loss": 1.3245, + "loss/crossentropy": 2.8389289379119873, + "loss/hidden": 1.125, + "loss/logits": 0.19902637600898743, + "loss/reg": 4.343886030255817e-05, + "step": 1418 + }, + { + "epoch": 0.177375, + "grad_norm": 2.4084014892578125, + "grad_norm_var": 0.8734795570176286, + "learning_rate": 0.0001, + "loss": 1.4294, + "loss/crossentropy": 2.539510726928711, + "loss/hidden": 1.2265625, + "loss/logits": 0.20238348841667175, + "loss/reg": 4.3425516196293756e-05, + "step": 1419 + }, + { + "epoch": 0.1775, + "grad_norm": 2.334496021270752, + "grad_norm_var": 0.9020578094969264, + "learning_rate": 0.0001, + "loss": 1.2769, + "loss/crossentropy": 2.2953195571899414, + "loss/hidden": 1.109375, + "loss/logits": 0.16713181138038635, + "loss/reg": 4.341412568464875e-05, + "step": 1420 + }, + { + "epoch": 0.177625, + "grad_norm": 2.5287718772888184, + "grad_norm_var": 0.8585467461433038, + "learning_rate": 0.0001, + "loss": 1.5263, + "loss/crossentropy": 2.703735589981079, + "loss/hidden": 1.3046875, + "loss/logits": 0.2211885154247284, + "loss/reg": 4.339984297985211e-05, + "step": 1421 + }, + { + "epoch": 0.17775, + "grad_norm": 3.0021073818206787, + "grad_norm_var": 0.8538916502450261, + "learning_rate": 0.0001, + "loss": 1.5981, + "loss/crossentropy": 2.4952101707458496, + "loss/hidden": 1.3828125, + "loss/logits": 0.2148495614528656, + "loss/reg": 4.338718645158224e-05, + "step": 1422 + }, + { + "epoch": 0.177875, + "grad_norm": 2.6841554641723633, + "grad_norm_var": 0.8544914650704224, + "learning_rate": 0.0001, + "loss": 1.3463, + "loss/crossentropy": 2.640184164047241, + "loss/hidden": 1.1328125, + "loss/logits": 0.21307799220085144, + "loss/reg": 4.337489735917188e-05, + "step": 1423 + }, + { + "epoch": 0.178, + "grad_norm": 2.777501344680786, + "grad_norm_var": 0.8189534671629093, + "learning_rate": 0.0001, + "loss": 1.5327, + "loss/crossentropy": 2.3427610397338867, + "loss/hidden": 1.2890625, + "loss/logits": 0.243157759308815, + "loss/reg": 4.3363947042962536e-05, + "step": 1424 + }, + { + "epoch": 0.178125, + "grad_norm": 2.5543341636657715, + "grad_norm_var": 0.8267698989523389, + "learning_rate": 0.0001, + "loss": 1.3129, + "loss/crossentropy": 2.744206666946411, + "loss/hidden": 1.1484375, + "loss/logits": 0.1639942228794098, + "loss/reg": 4.334727782406844e-05, + "step": 1425 + }, + { + "epoch": 0.17825, + "grad_norm": 2.4885315895080566, + "grad_norm_var": 0.7912603488188845, + "learning_rate": 0.0001, + "loss": 1.307, + "loss/crossentropy": 2.6813228130340576, + "loss/hidden": 1.140625, + "loss/logits": 0.16597452759742737, + "loss/reg": 4.333686229074374e-05, + "step": 1426 + }, + { + "epoch": 0.178375, + "grad_norm": 3.5424156188964844, + "grad_norm_var": 0.3264754697171298, + "learning_rate": 0.0001, + "loss": 1.3473, + "loss/crossentropy": 2.712770938873291, + "loss/hidden": 1.15625, + "loss/logits": 0.19061842560768127, + "loss/reg": 4.332158641773276e-05, + "step": 1427 + }, + { + "epoch": 0.1785, + "grad_norm": 2.0955185890197754, + "grad_norm_var": 0.3694319419413773, + "learning_rate": 0.0001, + "loss": 1.3663, + "loss/crossentropy": 2.5209343433380127, + "loss/hidden": 1.171875, + "loss/logits": 0.19394898414611816, + "loss/reg": 4.3307081796228886e-05, + "step": 1428 + }, + { + "epoch": 0.178625, + "grad_norm": 2.6297364234924316, + "grad_norm_var": 0.3643823378726265, + "learning_rate": 0.0001, + "loss": 1.2339, + "loss/crossentropy": 2.5593748092651367, + "loss/hidden": 1.0546875, + "loss/logits": 0.17882663011550903, + "loss/reg": 4.329224975663237e-05, + "step": 1429 + }, + { + "epoch": 0.17875, + "grad_norm": 2.8635098934173584, + "grad_norm_var": 0.3515349356239161, + "learning_rate": 0.0001, + "loss": 1.4032, + "loss/crossentropy": 2.8081912994384766, + "loss/hidden": 1.2109375, + "loss/logits": 0.19186900556087494, + "loss/reg": 4.327530041337013e-05, + "step": 1430 + }, + { + "epoch": 0.178875, + "grad_norm": 2.7731873989105225, + "grad_norm_var": 0.23106689203328087, + "learning_rate": 0.0001, + "loss": 1.4425, + "loss/crossentropy": 2.60203218460083, + "loss/hidden": 1.1953125, + "loss/logits": 0.24671795964241028, + "loss/reg": 4.325600093579851e-05, + "step": 1431 + }, + { + "epoch": 0.179, + "grad_norm": 2.6906487941741943, + "grad_norm_var": 0.1491888512825502, + "learning_rate": 0.0001, + "loss": 1.4079, + "loss/crossentropy": 2.4658117294311523, + "loss/hidden": 1.2265625, + "loss/logits": 0.18089887499809265, + "loss/reg": 4.324036490288563e-05, + "step": 1432 + }, + { + "epoch": 0.179125, + "grad_norm": 2.9456405639648438, + "grad_norm_var": 0.1121219410924103, + "learning_rate": 0.0001, + "loss": 1.2971, + "loss/crossentropy": 2.6626834869384766, + "loss/hidden": 1.125, + "loss/logits": 0.17169824242591858, + "loss/reg": 4.322533277445473e-05, + "step": 1433 + }, + { + "epoch": 0.17925, + "grad_norm": 3.171668767929077, + "grad_norm_var": 0.12007437587654597, + "learning_rate": 0.0001, + "loss": 1.4177, + "loss/crossentropy": 2.836962938308716, + "loss/hidden": 1.203125, + "loss/logits": 0.21418796479701996, + "loss/reg": 4.3210424337303266e-05, + "step": 1434 + }, + { + "epoch": 0.179375, + "grad_norm": 3.561671018600464, + "grad_norm_var": 0.15556932022715816, + "learning_rate": 0.0001, + "loss": 1.4663, + "loss/crossentropy": 2.6809699535369873, + "loss/hidden": 1.2109375, + "loss/logits": 0.2549501657485962, + "loss/reg": 4.319531217333861e-05, + "step": 1435 + }, + { + "epoch": 0.1795, + "grad_norm": 2.716172695159912, + "grad_norm_var": 0.14148105049478066, + "learning_rate": 0.0001, + "loss": 1.3285, + "loss/crossentropy": 2.245272397994995, + "loss/hidden": 1.15625, + "loss/logits": 0.1718417853116989, + "loss/reg": 4.317987259128131e-05, + "step": 1436 + }, + { + "epoch": 0.179625, + "grad_norm": 17.232587814331055, + "grad_norm_var": 13.09473393360591, + "learning_rate": 0.0001, + "loss": 1.4851, + "loss/crossentropy": 2.410290241241455, + "loss/hidden": 1.3046875, + "loss/logits": 0.17997747659683228, + "loss/reg": 4.316571357776411e-05, + "step": 1437 + }, + { + "epoch": 0.17975, + "grad_norm": 2.3736934661865234, + "grad_norm_var": 13.180663115120598, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.472769260406494, + "loss/hidden": 1.0, + "loss/logits": 0.14021781086921692, + "loss/reg": 4.3151158024556935e-05, + "step": 1438 + }, + { + "epoch": 0.179875, + "grad_norm": 2.5714287757873535, + "grad_norm_var": 13.19663266950907, + "learning_rate": 0.0001, + "loss": 1.2952, + "loss/crossentropy": 2.3950321674346924, + "loss/hidden": 1.109375, + "loss/logits": 0.18539366126060486, + "loss/reg": 4.3139560148119926e-05, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 3.797583818435669, + "grad_norm_var": 13.137998270690733, + "learning_rate": 0.0001, + "loss": 1.5306, + "loss/crossentropy": 2.4296462535858154, + "loss/hidden": 1.3125, + "loss/logits": 0.21766766905784607, + "loss/reg": 4.312453165766783e-05, + "step": 1440 + }, + { + "epoch": 0.180125, + "grad_norm": 3.5081300735473633, + "grad_norm_var": 13.042733823147271, + "learning_rate": 0.0001, + "loss": 2.1133, + "loss/crossentropy": 2.5396132469177246, + "loss/hidden": 1.7109375, + "loss/logits": 0.40195000171661377, + "loss/reg": 4.311450902605429e-05, + "step": 1441 + }, + { + "epoch": 0.18025, + "grad_norm": 2.7153725624084473, + "grad_norm_var": 13.005977433302935, + "learning_rate": 0.0001, + "loss": 1.2424, + "loss/crossentropy": 2.7652928829193115, + "loss/hidden": 1.078125, + "loss/logits": 0.16388410329818726, + "loss/reg": 4.3100644688820466e-05, + "step": 1442 + }, + { + "epoch": 0.180375, + "grad_norm": 2.1137332916259766, + "grad_norm_var": 13.187246668576858, + "learning_rate": 0.0001, + "loss": 1.2525, + "loss/crossentropy": 2.5129823684692383, + "loss/hidden": 1.078125, + "loss/logits": 0.17393165826797485, + "loss/reg": 4.308501229388639e-05, + "step": 1443 + }, + { + "epoch": 0.1805, + "grad_norm": 2.8767898082733154, + "grad_norm_var": 13.054609912670541, + "learning_rate": 0.0001, + "loss": 1.4468, + "loss/crossentropy": 2.0135140419006348, + "loss/hidden": 1.2578125, + "loss/logits": 0.1885446310043335, + "loss/reg": 4.307483322918415e-05, + "step": 1444 + }, + { + "epoch": 0.180625, + "grad_norm": 2.403043508529663, + "grad_norm_var": 13.09270559894486, + "learning_rate": 0.0001, + "loss": 1.5386, + "loss/crossentropy": 2.0192453861236572, + "loss/hidden": 1.265625, + "loss/logits": 0.27254170179367065, + "loss/reg": 4.306132541387342e-05, + "step": 1445 + }, + { + "epoch": 0.18075, + "grad_norm": 1.994828701019287, + "grad_norm_var": 13.244824799331905, + "learning_rate": 0.0001, + "loss": 1.188, + "loss/crossentropy": 2.5380778312683105, + "loss/hidden": 1.015625, + "loss/logits": 0.17196664214134216, + "loss/reg": 4.305263428250328e-05, + "step": 1446 + }, + { + "epoch": 0.180875, + "grad_norm": 2.4696993827819824, + "grad_norm_var": 13.28870750435451, + "learning_rate": 0.0001, + "loss": 1.2176, + "loss/crossentropy": 2.686527967453003, + "loss/hidden": 1.0625, + "loss/logits": 0.154626727104187, + "loss/reg": 4.3038289732066914e-05, + "step": 1447 + }, + { + "epoch": 0.181, + "grad_norm": 2.959864377975464, + "grad_norm_var": 13.2571348082626, + "learning_rate": 0.0001, + "loss": 1.4701, + "loss/crossentropy": 2.482746124267578, + "loss/hidden": 1.2421875, + "loss/logits": 0.22751757502555847, + "loss/reg": 4.3027317587984726e-05, + "step": 1448 + }, + { + "epoch": 0.181125, + "grad_norm": 2.3715078830718994, + "grad_norm_var": 13.336497430498211, + "learning_rate": 0.0001, + "loss": 1.2266, + "loss/crossentropy": 2.3657546043395996, + "loss/hidden": 1.078125, + "loss/logits": 0.14807792007923126, + "loss/reg": 4.3015385017497465e-05, + "step": 1449 + }, + { + "epoch": 0.18125, + "grad_norm": 2.3996551036834717, + "grad_norm_var": 13.425801257168224, + "learning_rate": 0.0001, + "loss": 1.3934, + "loss/crossentropy": 2.3912100791931152, + "loss/hidden": 1.1796875, + "loss/logits": 0.21327857673168182, + "loss/reg": 4.300205546314828e-05, + "step": 1450 + }, + { + "epoch": 0.181375, + "grad_norm": 8.881511688232422, + "grad_norm_var": 15.146759918124577, + "learning_rate": 0.0001, + "loss": 1.5182, + "loss/crossentropy": 2.411609411239624, + "loss/hidden": 1.3359375, + "loss/logits": 0.18182581663131714, + "loss/reg": 4.298591738916002e-05, + "step": 1451 + }, + { + "epoch": 0.1815, + "grad_norm": 2.5281503200531006, + "grad_norm_var": 15.180191875245258, + "learning_rate": 0.0001, + "loss": 1.343, + "loss/crossentropy": 2.5658814907073975, + "loss/hidden": 1.140625, + "loss/logits": 0.20197615027427673, + "loss/reg": 4.2972216760972515e-05, + "step": 1452 + }, + { + "epoch": 0.181625, + "grad_norm": 2.931814432144165, + "grad_norm_var": 2.6350739014278046, + "learning_rate": 0.0001, + "loss": 1.5714, + "loss/crossentropy": 2.623166561126709, + "loss/hidden": 1.3046875, + "loss/logits": 0.2662585973739624, + "loss/reg": 4.295963663025759e-05, + "step": 1453 + }, + { + "epoch": 0.18175, + "grad_norm": 2.307128667831421, + "grad_norm_var": 2.6414069582859265, + "learning_rate": 0.0001, + "loss": 1.3099, + "loss/crossentropy": 2.6455140113830566, + "loss/hidden": 1.1171875, + "loss/logits": 0.19229310750961304, + "loss/reg": 4.29393767262809e-05, + "step": 1454 + }, + { + "epoch": 0.181875, + "grad_norm": 2.719261884689331, + "grad_norm_var": 2.6333024593927794, + "learning_rate": 0.0001, + "loss": 1.2597, + "loss/crossentropy": 2.699171543121338, + "loss/hidden": 1.0859375, + "loss/logits": 0.17330314218997955, + "loss/reg": 4.29198844358325e-05, + "step": 1455 + }, + { + "epoch": 0.182, + "grad_norm": 2.2648873329162598, + "grad_norm_var": 2.629623452031683, + "learning_rate": 0.0001, + "loss": 1.3001, + "loss/crossentropy": 2.819680690765381, + "loss/hidden": 1.1171875, + "loss/logits": 0.1824684888124466, + "loss/reg": 4.289634307497181e-05, + "step": 1456 + }, + { + "epoch": 0.182125, + "grad_norm": 3.51459002494812, + "grad_norm_var": 2.6300935831663605, + "learning_rate": 0.0001, + "loss": 1.8678, + "loss/crossentropy": 2.649803638458252, + "loss/hidden": 1.546875, + "loss/logits": 0.32052403688430786, + "loss/reg": 4.287390038371086e-05, + "step": 1457 + }, + { + "epoch": 0.18225, + "grad_norm": 2.2351443767547607, + "grad_norm_var": 2.660538406812168, + "learning_rate": 0.0001, + "loss": 1.4079, + "loss/crossentropy": 2.6567535400390625, + "loss/hidden": 1.203125, + "loss/logits": 0.20430999994277954, + "loss/reg": 4.285141403670423e-05, + "step": 1458 + }, + { + "epoch": 0.182375, + "grad_norm": 2.742172956466675, + "grad_norm_var": 2.6163455836101916, + "learning_rate": 0.0001, + "loss": 1.393, + "loss/crossentropy": 2.60951828956604, + "loss/hidden": 1.1796875, + "loss/logits": 0.21288588643074036, + "loss/reg": 4.283638554625213e-05, + "step": 1459 + }, + { + "epoch": 0.1825, + "grad_norm": 2.7413218021392822, + "grad_norm_var": 2.6192665262027277, + "learning_rate": 0.0001, + "loss": 1.1963, + "loss/crossentropy": 2.773202657699585, + "loss/hidden": 1.03125, + "loss/logits": 0.16459418833255768, + "loss/reg": 4.281382280169055e-05, + "step": 1460 + }, + { + "epoch": 0.182625, + "grad_norm": 3.7331223487854004, + "grad_norm_var": 2.6299038870939264, + "learning_rate": 0.0001, + "loss": 1.4449, + "loss/crossentropy": 2.539788246154785, + "loss/hidden": 1.21875, + "loss/logits": 0.22573140263557434, + "loss/reg": 4.279471977497451e-05, + "step": 1461 + }, + { + "epoch": 0.18275, + "grad_norm": 28.74688720703125, + "grad_norm_var": 43.596899801988485, + "learning_rate": 0.0001, + "loss": 1.6068, + "loss/crossentropy": 2.5083324909210205, + "loss/hidden": 1.3984375, + "loss/logits": 0.20789732038974762, + "loss/reg": 4.277807965991087e-05, + "step": 1462 + }, + { + "epoch": 0.182875, + "grad_norm": 3.0596749782562256, + "grad_norm_var": 43.441506559109, + "learning_rate": 0.0001, + "loss": 1.5056, + "loss/crossentropy": 2.5228803157806396, + "loss/hidden": 1.2734375, + "loss/logits": 0.231749027967453, + "loss/reg": 4.275907122064382e-05, + "step": 1463 + }, + { + "epoch": 0.183, + "grad_norm": 2.790465831756592, + "grad_norm_var": 43.48392586707514, + "learning_rate": 0.0001, + "loss": 1.2947, + "loss/crossentropy": 2.5042221546173096, + "loss/hidden": 1.1171875, + "loss/logits": 0.17708972096443176, + "loss/reg": 4.274360981071368e-05, + "step": 1464 + }, + { + "epoch": 0.183125, + "grad_norm": 2.082412004470825, + "grad_norm_var": 43.58075224329326, + "learning_rate": 0.0001, + "loss": 1.2976, + "loss/crossentropy": 2.425107479095459, + "loss/hidden": 1.1171875, + "loss/logits": 0.17995816469192505, + "loss/reg": 4.272775549907237e-05, + "step": 1465 + }, + { + "epoch": 0.18325, + "grad_norm": 2.225287675857544, + "grad_norm_var": 43.636828045239, + "learning_rate": 0.0001, + "loss": 1.2741, + "loss/crossentropy": 2.5390968322753906, + "loss/hidden": 1.0859375, + "loss/logits": 0.18771487474441528, + "loss/reg": 4.2712516005849466e-05, + "step": 1466 + }, + { + "epoch": 0.183375, + "grad_norm": 2.2095344066619873, + "grad_norm_var": 42.716066053442496, + "learning_rate": 0.0001, + "loss": 1.3697, + "loss/crossentropy": 2.1581828594207764, + "loss/hidden": 1.1484375, + "loss/logits": 0.2208089530467987, + "loss/reg": 4.269627606845461e-05, + "step": 1467 + }, + { + "epoch": 0.1835, + "grad_norm": 3.168578863143921, + "grad_norm_var": 42.590231253385056, + "learning_rate": 0.0001, + "loss": 1.4461, + "loss/crossentropy": 2.5799078941345215, + "loss/hidden": 1.25, + "loss/logits": 0.19567829370498657, + "loss/reg": 4.268271732144058e-05, + "step": 1468 + }, + { + "epoch": 0.183625, + "grad_norm": 2.506560802459717, + "grad_norm_var": 42.681493007397286, + "learning_rate": 0.0001, + "loss": 1.4977, + "loss/crossentropy": 2.5367000102996826, + "loss/hidden": 1.2734375, + "loss/logits": 0.2238171398639679, + "loss/reg": 4.266422183718532e-05, + "step": 1469 + }, + { + "epoch": 0.18375, + "grad_norm": 7.600844383239746, + "grad_norm_var": 43.01543362550186, + "learning_rate": 0.0001, + "loss": 1.7192, + "loss/crossentropy": 3.0554118156433105, + "loss/hidden": 1.5390625, + "loss/logits": 0.17968618869781494, + "loss/reg": 4.265028110239655e-05, + "step": 1470 + }, + { + "epoch": 0.183875, + "grad_norm": 2.2054150104522705, + "grad_norm_var": 43.16396281278411, + "learning_rate": 0.0001, + "loss": 1.2623, + "loss/crossentropy": 2.5536673069000244, + "loss/hidden": 1.09375, + "loss/logits": 0.16808277368545532, + "loss/reg": 4.263762821210548e-05, + "step": 1471 + }, + { + "epoch": 0.184, + "grad_norm": 2.2131237983703613, + "grad_norm_var": 43.18034464683377, + "learning_rate": 0.0001, + "loss": 1.3421, + "loss/crossentropy": 2.7410731315612793, + "loss/hidden": 1.15625, + "loss/logits": 0.185453400015831, + "loss/reg": 4.262782385922037e-05, + "step": 1472 + }, + { + "epoch": 0.184125, + "grad_norm": 2.219083070755005, + "grad_norm_var": 43.4746190323492, + "learning_rate": 0.0001, + "loss": 1.162, + "loss/crossentropy": 2.3957958221435547, + "loss/hidden": 1.0234375, + "loss/logits": 0.13811297714710236, + "loss/reg": 4.262103539076634e-05, + "step": 1473 + }, + { + "epoch": 0.18425, + "grad_norm": 2.394564628601074, + "grad_norm_var": 43.42742842239302, + "learning_rate": 0.0001, + "loss": 1.2506, + "loss/crossentropy": 2.809195041656494, + "loss/hidden": 1.09375, + "loss/logits": 0.15642398595809937, + "loss/reg": 4.260565037839115e-05, + "step": 1474 + }, + { + "epoch": 0.184375, + "grad_norm": 9.86483097076416, + "grad_norm_var": 44.89087660481017, + "learning_rate": 0.0001, + "loss": 1.9438, + "loss/crossentropy": 2.620823383331299, + "loss/hidden": 1.765625, + "loss/logits": 0.1777106523513794, + "loss/reg": 4.258997432771139e-05, + "step": 1475 + }, + { + "epoch": 0.1845, + "grad_norm": 2.3290772438049316, + "grad_norm_var": 45.0248299538665, + "learning_rate": 0.0001, + "loss": 1.3243, + "loss/crossentropy": 2.333923816680908, + "loss/hidden": 1.1328125, + "loss/logits": 0.19102182984352112, + "loss/reg": 4.2582487367326394e-05, + "step": 1476 + }, + { + "epoch": 0.184625, + "grad_norm": 2.4037561416625977, + "grad_norm_var": 45.35262675926802, + "learning_rate": 0.0001, + "loss": 1.2645, + "loss/crossentropy": 2.9200727939605713, + "loss/hidden": 1.0859375, + "loss/logits": 0.17812412977218628, + "loss/reg": 4.257016189512797e-05, + "step": 1477 + }, + { + "epoch": 0.18475, + "grad_norm": 2.5915751457214355, + "grad_norm_var": 4.863057685649219, + "learning_rate": 0.0001, + "loss": 1.3253, + "loss/crossentropy": 2.2665810585021973, + "loss/hidden": 1.15625, + "loss/logits": 0.16864070296287537, + "loss/reg": 4.2560521251289174e-05, + "step": 1478 + }, + { + "epoch": 0.184875, + "grad_norm": 2.981412649154663, + "grad_norm_var": 4.865338349555102, + "learning_rate": 0.0001, + "loss": 1.3654, + "loss/crossentropy": 2.818727970123291, + "loss/hidden": 1.171875, + "loss/logits": 0.19310520589351654, + "loss/reg": 4.2553452658466995e-05, + "step": 1479 + }, + { + "epoch": 0.185, + "grad_norm": 2.439756393432617, + "grad_norm_var": 4.893890160529382, + "learning_rate": 0.0001, + "loss": 1.3272, + "loss/crossentropy": 2.6037240028381348, + "loss/hidden": 1.140625, + "loss/logits": 0.18612533807754517, + "loss/reg": 4.254809027770534e-05, + "step": 1480 + }, + { + "epoch": 0.185125, + "grad_norm": 2.447357654571533, + "grad_norm_var": 4.847115901511771, + "learning_rate": 0.0001, + "loss": 1.4924, + "loss/crossentropy": 2.919430732727051, + "loss/hidden": 1.265625, + "loss/logits": 0.22634942829608917, + "loss/reg": 4.254366285749711e-05, + "step": 1481 + }, + { + "epoch": 0.18525, + "grad_norm": 2.3883841037750244, + "grad_norm_var": 4.826765636031229, + "learning_rate": 0.0001, + "loss": 1.4694, + "loss/crossentropy": 2.2129786014556885, + "loss/hidden": 1.2421875, + "loss/logits": 0.22677311301231384, + "loss/reg": 4.253921360941604e-05, + "step": 1482 + }, + { + "epoch": 0.185375, + "grad_norm": 18.632322311401367, + "grad_norm_var": 19.410147172166884, + "learning_rate": 0.0001, + "loss": 1.9694, + "loss/crossentropy": 2.3374440670013428, + "loss/hidden": 1.71875, + "loss/logits": 0.25020337104797363, + "loss/reg": 4.2520878196228296e-05, + "step": 1483 + }, + { + "epoch": 0.1855, + "grad_norm": 2.408111095428467, + "grad_norm_var": 19.558393326740877, + "learning_rate": 0.0001, + "loss": 1.2343, + "loss/crossentropy": 2.4988977909088135, + "loss/hidden": 1.046875, + "loss/logits": 0.18700845539569855, + "loss/reg": 4.251228892826475e-05, + "step": 1484 + }, + { + "epoch": 0.185625, + "grad_norm": 2.6058828830718994, + "grad_norm_var": 19.536231022308367, + "learning_rate": 0.0001, + "loss": 1.3138, + "loss/crossentropy": 2.569941759109497, + "loss/hidden": 1.125, + "loss/logits": 0.188359797000885, + "loss/reg": 4.249310586601496e-05, + "step": 1485 + }, + { + "epoch": 0.18575, + "grad_norm": 2.337801694869995, + "grad_norm_var": 18.903999577234515, + "learning_rate": 0.0001, + "loss": 1.2814, + "loss/crossentropy": 2.5755937099456787, + "loss/hidden": 1.1015625, + "loss/logits": 0.17937231063842773, + "loss/reg": 4.2484189179958776e-05, + "step": 1486 + }, + { + "epoch": 0.185875, + "grad_norm": 2.2206108570098877, + "grad_norm_var": 18.900572680100932, + "learning_rate": 0.0001, + "loss": 1.3908, + "loss/crossentropy": 2.497291326522827, + "loss/hidden": 1.1875, + "loss/logits": 0.20290207862854004, + "loss/reg": 4.24723511969205e-05, + "step": 1487 + }, + { + "epoch": 0.186, + "grad_norm": 2.805840253829956, + "grad_norm_var": 18.78883428537014, + "learning_rate": 0.0001, + "loss": 1.2726, + "loss/crossentropy": 2.637183666229248, + "loss/hidden": 1.1015625, + "loss/logits": 0.17063820362091064, + "loss/reg": 4.2465177102712914e-05, + "step": 1488 + }, + { + "epoch": 0.186125, + "grad_norm": 2.482452630996704, + "grad_norm_var": 18.73267123963991, + "learning_rate": 0.0001, + "loss": 1.4955, + "loss/crossentropy": 2.3506133556365967, + "loss/hidden": 1.25, + "loss/logits": 0.2450503706932068, + "loss/reg": 4.244835872668773e-05, + "step": 1489 + }, + { + "epoch": 0.18625, + "grad_norm": 2.404538154602051, + "grad_norm_var": 18.730597918024976, + "learning_rate": 0.0001, + "loss": 1.3089, + "loss/crossentropy": 2.5639541149139404, + "loss/hidden": 1.125, + "loss/logits": 0.18347838521003723, + "loss/reg": 4.243505100021139e-05, + "step": 1490 + }, + { + "epoch": 0.186375, + "grad_norm": 2.1632773876190186, + "grad_norm_var": 16.373156635802495, + "learning_rate": 0.0001, + "loss": 1.2372, + "loss/crossentropy": 2.5014357566833496, + "loss/hidden": 1.0625, + "loss/logits": 0.17423637211322784, + "loss/reg": 4.242025897838175e-05, + "step": 1491 + }, + { + "epoch": 0.1865, + "grad_norm": 2.3475615978240967, + "grad_norm_var": 16.370347277694776, + "learning_rate": 0.0001, + "loss": 1.3586, + "loss/crossentropy": 2.4183082580566406, + "loss/hidden": 1.1875, + "loss/logits": 0.17070849239826202, + "loss/reg": 4.240818088874221e-05, + "step": 1492 + }, + { + "epoch": 0.186625, + "grad_norm": 2.2167327404022217, + "grad_norm_var": 16.39934092054266, + "learning_rate": 0.0001, + "loss": 1.2318, + "loss/crossentropy": 2.737880229949951, + "loss/hidden": 1.0703125, + "loss/logits": 0.161026269197464, + "loss/reg": 4.2394876800244674e-05, + "step": 1493 + }, + { + "epoch": 0.18675, + "grad_norm": 2.2914958000183105, + "grad_norm_var": 16.43999919701839, + "learning_rate": 0.0001, + "loss": 1.3837, + "loss/crossentropy": 2.581719398498535, + "loss/hidden": 1.171875, + "loss/logits": 0.21139121055603027, + "loss/reg": 4.2386356653878465e-05, + "step": 1494 + }, + { + "epoch": 0.186875, + "grad_norm": 2.403869390487671, + "grad_norm_var": 16.496803032325875, + "learning_rate": 0.0001, + "loss": 1.3391, + "loss/crossentropy": 2.3859739303588867, + "loss/hidden": 1.1484375, + "loss/logits": 0.19020606577396393, + "loss/reg": 4.237349276081659e-05, + "step": 1495 + }, + { + "epoch": 0.187, + "grad_norm": 2.8205666542053223, + "grad_norm_var": 16.456488504250864, + "learning_rate": 0.0001, + "loss": 1.5081, + "loss/crossentropy": 2.8649418354034424, + "loss/hidden": 1.2734375, + "loss/logits": 0.23426686227321625, + "loss/reg": 4.2357834900030866e-05, + "step": 1496 + }, + { + "epoch": 0.187125, + "grad_norm": 2.3977134227752686, + "grad_norm_var": 16.463186923695265, + "learning_rate": 0.0001, + "loss": 1.3612, + "loss/crossentropy": 2.7040863037109375, + "loss/hidden": 1.1796875, + "loss/logits": 0.18106544017791748, + "loss/reg": 4.234022708260454e-05, + "step": 1497 + }, + { + "epoch": 0.18725, + "grad_norm": 9.282831192016602, + "grad_norm_var": 18.473799466194198, + "learning_rate": 0.0001, + "loss": 2.1135, + "loss/crossentropy": 2.3761186599731445, + "loss/hidden": 1.828125, + "loss/logits": 0.2849646806716919, + "loss/reg": 4.23242527176626e-05, + "step": 1498 + }, + { + "epoch": 0.187375, + "grad_norm": 2.448495864868164, + "grad_norm_var": 2.9755130882248815, + "learning_rate": 0.0001, + "loss": 1.6106, + "loss/crossentropy": 2.541566848754883, + "loss/hidden": 1.3671875, + "loss/logits": 0.24301382899284363, + "loss/reg": 4.2308161937398836e-05, + "step": 1499 + }, + { + "epoch": 0.1875, + "grad_norm": 2.0536553859710693, + "grad_norm_var": 3.004361121628405, + "learning_rate": 0.0001, + "loss": 1.1144, + "loss/crossentropy": 2.4763259887695312, + "loss/hidden": 0.9765625, + "loss/logits": 0.1373748481273651, + "loss/reg": 4.2289339035050943e-05, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 8000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.6608792346624e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}