| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0625, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.000125, |
| "grad_norm": 2.377527952194214, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.2768, |
| "loss/crossentropy": 2.697097063064575, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.15893849730491638, |
| "loss/reg": 6.247002602322027e-05, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 4.216994762420654, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.3752, |
| "loss/crossentropy": 3.101844310760498, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.1949012577533722, |
| "loss/reg": 6.247002602322027e-05, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 2.3287529945373535, |
| "learning_rate": 3e-06, |
| "loss": 1.2785, |
| "loss/crossentropy": 2.63712477684021, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.18410107493400574, |
| "loss/reg": 6.246996053960174e-05, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 5.415231227874756, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.4285, |
| "loss/crossentropy": 2.5702285766601562, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.16228657960891724, |
| "loss/reg": 6.246980774449185e-05, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 4.888370513916016, |
| "learning_rate": 5e-06, |
| "loss": 1.5121, |
| "loss/crossentropy": 2.439383029937744, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.19899356365203857, |
| "loss/reg": 6.24695821898058e-05, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 2.608705997467041, |
| "learning_rate": 6e-06, |
| "loss": 1.293, |
| "loss/crossentropy": 2.668699026107788, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18298496305942535, |
| "loss/reg": 6.246933480724692e-05, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 2.8447623252868652, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 1.5339, |
| "loss/crossentropy": 2.5219366550445557, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.2364223599433899, |
| "loss/reg": 6.246914563234895e-05, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 3.7877628803253174, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.8218, |
| "loss/crossentropy": 2.1927688121795654, |
| "loss/hidden": 1.5546875, |
| "loss/logits": 0.2664879262447357, |
| "loss/reg": 6.246889097383246e-05, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 2.988516330718994, |
| "learning_rate": 9e-06, |
| "loss": 1.7373, |
| "loss/crossentropy": 2.3826897144317627, |
| "loss/hidden": 1.421875, |
| "loss/logits": 0.314752995967865, |
| "loss/reg": 6.246858538361266e-05, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 2.143723726272583, |
| "learning_rate": 1e-05, |
| "loss": 1.405, |
| "loss/crossentropy": 2.2246415615081787, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.16997714340686798, |
| "loss/reg": 6.246842531254515e-05, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 2.4413657188415527, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 1.4206, |
| "loss/crossentropy": 2.4612021446228027, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.24033024907112122, |
| "loss/reg": 6.246819975785911e-05, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 2.483156204223633, |
| "learning_rate": 1.2e-05, |
| "loss": 1.6449, |
| "loss/crossentropy": 2.2882771492004395, |
| "loss/hidden": 1.4140625, |
| "loss/logits": 0.23023059964179993, |
| "loss/reg": 6.246790871955454e-05, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 2.7368147373199463, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 1.4981, |
| "loss/crossentropy": 2.6942052841186523, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.23185348510742188, |
| "loss/reg": 6.24675813014619e-05, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 5.189184665679932, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 1.946, |
| "loss/crossentropy": 2.3771214485168457, |
| "loss/hidden": 1.625, |
| "loss/logits": 0.320385217666626, |
| "loss/reg": 6.246678822208196e-05, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 2.305589437484741, |
| "learning_rate": 1.5e-05, |
| "loss": 1.4982, |
| "loss/crossentropy": 2.7562549114227295, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.2476150244474411, |
| "loss/reg": 6.246620614547282e-05, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 2.3378520011901855, |
| "grad_norm_var": 1.2675163586822178, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.3302, |
| "loss/crossentropy": 2.445441961288452, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.20453599095344543, |
| "loss/reg": 6.246585689950734e-05, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 1.7903435230255127, |
| "grad_norm_var": 1.3529406709866008, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 1.1333, |
| "loss/crossentropy": 2.323503017425537, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.14828170835971832, |
| "loss/reg": 6.246510747587308e-05, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 3.363795518875122, |
| "grad_norm_var": 1.277817936381435, |
| "learning_rate": 1.8e-05, |
| "loss": 1.7292, |
| "loss/crossentropy": 2.6075525283813477, |
| "loss/hidden": 1.46875, |
| "loss/logits": 0.25987327098846436, |
| "loss/reg": 6.24642925686203e-05, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 2.162050724029541, |
| "grad_norm_var": 1.2967721886362786, |
| "learning_rate": 1.9e-05, |
| "loss": 1.3146, |
| "loss/crossentropy": 2.570558786392212, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18898281455039978, |
| "loss/reg": 6.246323027880862e-05, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 2.147024393081665, |
| "grad_norm_var": 0.9523869945360727, |
| "learning_rate": 2e-05, |
| "loss": 1.3484, |
| "loss/crossentropy": 2.6676244735717773, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.19929195940494537, |
| "loss/reg": 6.246233533602208e-05, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.002625, |
| "grad_norm": 2.0668728351593018, |
| "grad_norm_var": 0.6976603751830339, |
| "learning_rate": 2.1e-05, |
| "loss": 1.1929, |
| "loss/crossentropy": 2.401143789291382, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1610003113746643, |
| "loss/reg": 6.246144039323553e-05, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 2.8019566535949707, |
| "grad_norm_var": 0.6973240463492516, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 1.419, |
| "loss/crossentropy": 2.627523183822632, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.2152642011642456, |
| "loss/reg": 6.246032717172056e-05, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.002875, |
| "grad_norm": 3.8118937015533447, |
| "grad_norm_var": 0.7713008187193999, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 1.4284, |
| "loss/crossentropy": 2.7227890491485596, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.2637593150138855, |
| "loss/reg": 6.245896656764671e-05, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 2.1418018341064453, |
| "grad_norm_var": 0.7205284729945551, |
| "learning_rate": 2.4e-05, |
| "loss": 1.3002, |
| "loss/crossentropy": 2.545552968978882, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.16680249571800232, |
| "loss/reg": 6.245774420676753e-05, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.003125, |
| "grad_norm": 3.5331156253814697, |
| "grad_norm_var": 0.7613226543465996, |
| "learning_rate": 2.5e-05, |
| "loss": 1.3224, |
| "loss/crossentropy": 2.2371270656585693, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.16548338532447815, |
| "loss/reg": 6.245705299079418e-05, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 1.9795947074890137, |
| "grad_norm_var": 0.7755306597344306, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 1.3209, |
| "loss/crossentropy": 2.7113037109375, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.18742361664772034, |
| "loss/reg": 6.245569966267794e-05, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.003375, |
| "grad_norm": 2.6044108867645264, |
| "grad_norm_var": 0.7714440385524235, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 1.4566, |
| "loss/crossentropy": 2.6034419536590576, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.22937631607055664, |
| "loss/reg": 6.245376425795257e-05, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 2.48085355758667, |
| "grad_norm_var": 0.7715158471256792, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.4579, |
| "loss/crossentropy": 2.5794363021850586, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.21509718894958496, |
| "loss/reg": 6.245166878215969e-05, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.003625, |
| "grad_norm": 3.0413854122161865, |
| "grad_norm_var": 0.7781660489700184, |
| "learning_rate": 2.9e-05, |
| "loss": 1.6102, |
| "loss/crossentropy": 2.4173922538757324, |
| "loss/hidden": 1.375, |
| "loss/logits": 0.23455965518951416, |
| "loss/reg": 6.244902033358812e-05, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 2.1076390743255615, |
| "grad_norm_var": 0.36324525064493024, |
| "learning_rate": 3e-05, |
| "loss": 1.0735, |
| "loss/crossentropy": 2.4064886569976807, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.12752822041511536, |
| "loss/reg": 6.244838004931808e-05, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.003875, |
| "grad_norm": 2.5296630859375, |
| "grad_norm_var": 0.359312391151574, |
| "learning_rate": 3.1e-05, |
| "loss": 1.3467, |
| "loss/crossentropy": 2.61391544342041, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18978667259216309, |
| "loss/reg": 6.244736141525209e-05, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 2.123671054840088, |
| "grad_norm_var": 0.3684168280400947, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.2191, |
| "loss/crossentropy": 2.6056668758392334, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16381201148033142, |
| "loss/reg": 6.244605174288154e-05, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.004125, |
| "grad_norm": 3.685770034790039, |
| "grad_norm_var": 0.4027733703548923, |
| "learning_rate": 3.3e-05, |
| "loss": 1.6794, |
| "loss/crossentropy": 2.519561290740967, |
| "loss/hidden": 1.3828125, |
| "loss/logits": 0.29592496156692505, |
| "loss/reg": 6.24443418928422e-05, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 1.9660468101501465, |
| "grad_norm_var": 0.393966226946808, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.3395, |
| "loss/crossentropy": 2.638051986694336, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18261724710464478, |
| "loss/reg": 6.244314135983586e-05, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.004375, |
| "grad_norm": 2.3111677169799805, |
| "grad_norm_var": 0.38716579449971367, |
| "learning_rate": 3.5e-05, |
| "loss": 1.3501, |
| "loss/crossentropy": 2.599940776824951, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.19327056407928467, |
| "loss/reg": 6.244215182960033e-05, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 2.5357542037963867, |
| "grad_norm_var": 0.3739975607775089, |
| "learning_rate": 3.6e-05, |
| "loss": 1.287, |
| "loss/crossentropy": 2.9884798526763916, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16922441124916077, |
| "loss/reg": 6.244022370083258e-05, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.004625, |
| "grad_norm": 1.7781621217727661, |
| "grad_norm_var": 0.40002233468076764, |
| "learning_rate": 3.7e-05, |
| "loss": 1.074, |
| "loss/crossentropy": 2.669071674346924, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.13981276750564575, |
| "loss/reg": 6.243858661036938e-05, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 24.6973819732666, |
| "grad_norm_var": 30.983207545217457, |
| "learning_rate": 3.8e-05, |
| "loss": 1.3637, |
| "loss/crossentropy": 2.482579469680786, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.16777344048023224, |
| "loss/reg": 6.243725511012599e-05, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.004875, |
| "grad_norm": 2.5728342533111572, |
| "grad_norm_var": 31.103302953089262, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 1.3424, |
| "loss/crossentropy": 2.2785422801971436, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.16988611221313477, |
| "loss/reg": 6.243555981200188e-05, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 1.7385622262954712, |
| "grad_norm_var": 31.206951393275006, |
| "learning_rate": 4e-05, |
| "loss": 1.077, |
| "loss/crossentropy": 2.7017714977264404, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.13102804124355316, |
| "loss/reg": 6.243350071599707e-05, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005125, |
| "grad_norm": 2.455116033554077, |
| "grad_norm_var": 31.325901099338942, |
| "learning_rate": 4.1e-05, |
| "loss": 1.178, |
| "loss/crossentropy": 2.6521873474121094, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16170336306095123, |
| "loss/reg": 6.243147072382271e-05, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 3.0441935062408447, |
| "grad_norm_var": 31.14003983168487, |
| "learning_rate": 4.2e-05, |
| "loss": 1.488, |
| "loss/crossentropy": 2.5000290870666504, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.2217317819595337, |
| "loss/reg": 6.24291569693014e-05, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.005375, |
| "grad_norm": 2.6227200031280518, |
| "grad_norm_var": 31.137008952861066, |
| "learning_rate": 4.3e-05, |
| "loss": 1.3106, |
| "loss/crossentropy": 2.6832528114318848, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.19276997447013855, |
| "loss/reg": 6.242711242521182e-05, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 2.9194633960723877, |
| "grad_norm_var": 31.06863081080745, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 1.5396, |
| "loss/crossentropy": 2.483938455581665, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.23424991965293884, |
| "loss/reg": 6.242513336474076e-05, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.005625, |
| "grad_norm": 2.2491037845611572, |
| "grad_norm_var": 31.196778907875057, |
| "learning_rate": 4.5e-05, |
| "loss": 1.2321, |
| "loss/crossentropy": 2.9735186100006104, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1689363420009613, |
| "loss/reg": 6.242344534257427e-05, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 2.687225103378296, |
| "grad_norm_var": 31.084396554405373, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 1.2443, |
| "loss/crossentropy": 2.913846254348755, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18112678825855255, |
| "loss/reg": 6.242193921934813e-05, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.005875, |
| "grad_norm": 2.3648312091827393, |
| "grad_norm_var": 31.1155476706496, |
| "learning_rate": 4.7e-05, |
| "loss": 1.2044, |
| "loss/crossentropy": 2.374119520187378, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.15688437223434448, |
| "loss/reg": 6.242006929824129e-05, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1.896540880203247, |
| "grad_norm_var": 31.171339818602494, |
| "learning_rate": 4.8e-05, |
| "loss": 1.238, |
| "loss/crossentropy": 2.613962173461914, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1826920211315155, |
| "loss/reg": 6.24187450739555e-05, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.006125, |
| "grad_norm": 1.7585434913635254, |
| "grad_norm_var": 31.44447201393312, |
| "learning_rate": 4.9e-05, |
| "loss": 1.1411, |
| "loss/crossentropy": 2.5672757625579834, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.14043202996253967, |
| "loss/reg": 6.241785740712658e-05, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 1.8257592916488647, |
| "grad_norm_var": 31.47860052328912, |
| "learning_rate": 5e-05, |
| "loss": 1.2643, |
| "loss/crossentropy": 2.4829366207122803, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1777852475643158, |
| "loss/reg": 6.2416227592621e-05, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006375, |
| "grad_norm": 1.9530550241470337, |
| "grad_norm_var": 31.553698309541367, |
| "learning_rate": 5.1000000000000006e-05, |
| "loss": 1.1787, |
| "loss/crossentropy": 2.501922369003296, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16241338849067688, |
| "loss/reg": 6.241373193915933e-05, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 2.366898536682129, |
| "grad_norm_var": 31.58155048439878, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 1.476, |
| "loss/crossentropy": 2.557314872741699, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.24098029732704163, |
| "loss/reg": 6.241213122848421e-05, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.006625, |
| "grad_norm": 2.139944553375244, |
| "grad_norm_var": 31.497838767117898, |
| "learning_rate": 5.300000000000001e-05, |
| "loss": 1.3057, |
| "loss/crossentropy": 2.5664379596710205, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18005570769309998, |
| "loss/reg": 6.241026130737737e-05, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 2.2614963054656982, |
| "grad_norm_var": 0.16298419379227144, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 1.2081, |
| "loss/crossentropy": 2.5651533603668213, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1606135070323944, |
| "loss/reg": 6.240784568944946e-05, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.006875, |
| "grad_norm": 1.88372802734375, |
| "grad_norm_var": 0.16791840248250048, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 1.2037, |
| "loss/crossentropy": 2.0431623458862305, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.13271506130695343, |
| "loss/reg": 6.240410584723577e-05, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 1.7579172849655151, |
| "grad_norm_var": 0.16659499666655736, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 1.0787, |
| "loss/crossentropy": 2.5805883407592773, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.13670633733272552, |
| "loss/reg": 6.240163202164695e-05, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.007125, |
| "grad_norm": 2.740758180618286, |
| "grad_norm_var": 0.17906241043444873, |
| "learning_rate": 5.6999999999999996e-05, |
| "loss": 1.2499, |
| "loss/crossentropy": 2.821078062057495, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16337308287620544, |
| "loss/reg": 6.239958747755736e-05, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 3.3393216133117676, |
| "grad_norm_var": 0.21459676497742203, |
| "learning_rate": 5.8e-05, |
| "loss": 1.5094, |
| "loss/crossentropy": 2.6574273109436035, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2822623550891876, |
| "loss/reg": 6.239775393623859e-05, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.007375, |
| "grad_norm": 2.1151742935180664, |
| "grad_norm_var": 0.20871929877454623, |
| "learning_rate": 5.9e-05, |
| "loss": 1.31, |
| "loss/crossentropy": 2.28176212310791, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18433833122253418, |
| "loss/reg": 6.239649519557133e-05, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 1.9203850030899048, |
| "grad_norm_var": 0.18408730894700795, |
| "learning_rate": 6e-05, |
| "loss": 1.2862, |
| "loss/crossentropy": 2.319091558456421, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1918697953224182, |
| "loss/reg": 6.239335925783962e-05, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007625, |
| "grad_norm": 2.689425230026245, |
| "grad_norm_var": 0.1988651894699956, |
| "learning_rate": 6.1e-05, |
| "loss": 1.2077, |
| "loss/crossentropy": 2.396440029144287, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1523526906967163, |
| "loss/reg": 6.239157664822415e-05, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 2.0848548412323, |
| "grad_norm_var": 0.184926237897677, |
| "learning_rate": 6.2e-05, |
| "loss": 1.1889, |
| "loss/crossentropy": 2.375331401824951, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15707406401634216, |
| "loss/reg": 6.238814967218786e-05, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.007875, |
| "grad_norm": 1.9770179986953735, |
| "grad_norm_var": 0.18547542502594508, |
| "learning_rate": 6.3e-05, |
| "loss": 1.1255, |
| "loss/crossentropy": 2.5883288383483887, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.14046350121498108, |
| "loss/reg": 6.238514470169321e-05, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 1.9654349088668823, |
| "grad_norm_var": 0.1832653842408547, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 1.1315, |
| "loss/crossentropy": 2.6122260093688965, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1543133556842804, |
| "loss/reg": 6.238299101823941e-05, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.008125, |
| "grad_norm": 2.110621690750122, |
| "grad_norm_var": 0.1715223081433841, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 1.1513, |
| "loss/crossentropy": 2.3829517364501953, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15063607692718506, |
| "loss/reg": 6.237896013772115e-05, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 3.1477179527282715, |
| "grad_norm_var": 0.21553302023151552, |
| "learning_rate": 6.6e-05, |
| "loss": 1.4659, |
| "loss/crossentropy": 2.2805211544036865, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.22310970723628998, |
| "loss/reg": 6.237393972696736e-05, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.008375, |
| "grad_norm": 2.482203722000122, |
| "grad_norm_var": 0.21008166056666275, |
| "learning_rate": 6.7e-05, |
| "loss": 1.0839, |
| "loss/crossentropy": 2.982119560241699, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.14186254143714905, |
| "loss/reg": 6.236990884644911e-05, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 2.198028087615967, |
| "grad_norm_var": 0.21061508280485744, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 1.2007, |
| "loss/crossentropy": 2.725332498550415, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.1610267162322998, |
| "loss/reg": 6.236397166503593e-05, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.008625, |
| "grad_norm": 1.9412530660629272, |
| "grad_norm_var": 0.21734592747188602, |
| "learning_rate": 6.9e-05, |
| "loss": 1.1269, |
| "loss/crossentropy": 2.682379722595215, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.14185243844985962, |
| "loss/reg": 6.235777982510626e-05, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 2.223443031311035, |
| "grad_norm_var": 0.21757323137186588, |
| "learning_rate": 7e-05, |
| "loss": 1.3663, |
| "loss/crossentropy": 2.6186935901641846, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.2016535997390747, |
| "loss/reg": 6.23530286247842e-05, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008875, |
| "grad_norm": 3.4456241130828857, |
| "grad_norm_var": 0.28625219910078287, |
| "learning_rate": 7.1e-05, |
| "loss": 1.6214, |
| "loss/crossentropy": 2.054266929626465, |
| "loss/hidden": 1.421875, |
| "loss/logits": 0.19887767732143402, |
| "loss/reg": 6.234741158550605e-05, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 1.9013352394104004, |
| "grad_norm_var": 0.27557130255187207, |
| "learning_rate": 7.2e-05, |
| "loss": 1.1365, |
| "loss/crossentropy": 2.422841787338257, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.15926527976989746, |
| "loss/reg": 6.234211468836293e-05, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.009125, |
| "grad_norm": 2.4032697677612305, |
| "grad_norm_var": 0.267026183625853, |
| "learning_rate": 7.3e-05, |
| "loss": 1.4414, |
| "loss/crossentropy": 2.4159440994262695, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.22204136848449707, |
| "loss/reg": 6.233662861632183e-05, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 1.915128231048584, |
| "grad_norm_var": 0.21002777018266153, |
| "learning_rate": 7.4e-05, |
| "loss": 1.2439, |
| "loss/crossentropy": 2.587275505065918, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1807810664176941, |
| "loss/reg": 6.232755549717695e-05, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.009375, |
| "grad_norm": 3.4048879146575928, |
| "grad_norm_var": 0.28520435687560547, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 1.2774, |
| "loss/crossentropy": 2.6182703971862793, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.15172982215881348, |
| "loss/reg": 6.231923180166632e-05, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 2.3605074882507324, |
| "grad_norm_var": 0.27132747056331724, |
| "learning_rate": 7.6e-05, |
| "loss": 1.1409, |
| "loss/crossentropy": 2.6013262271881104, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.151985764503479, |
| "loss/reg": 6.231063889572397e-05, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.009625, |
| "grad_norm": 2.6056039333343506, |
| "grad_norm_var": 0.2684276793201585, |
| "learning_rate": 7.7e-05, |
| "loss": 1.1, |
| "loss/crossentropy": 2.534158945083618, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.1501779407262802, |
| "loss/reg": 6.230256258277223e-05, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 1.7923972606658936, |
| "grad_norm_var": 0.285494251958092, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 1.1471, |
| "loss/crossentropy": 2.3036601543426514, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.15817409753799438, |
| "loss/reg": 6.229766586329788e-05, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.009875, |
| "grad_norm": 2.0376312732696533, |
| "grad_norm_var": 0.2825708803585835, |
| "learning_rate": 7.900000000000001e-05, |
| "loss": 1.2985, |
| "loss/crossentropy": 2.5548579692840576, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1572834551334381, |
| "loss/reg": 6.229063728824258e-05, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 2.998662233352661, |
| "grad_norm_var": 0.29342903010298654, |
| "learning_rate": 8e-05, |
| "loss": 1.5504, |
| "loss/crossentropy": 2.4098215103149414, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.24512597918510437, |
| "loss/reg": 6.22822335571982e-05, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.010125, |
| "grad_norm": 2.103449583053589, |
| "grad_norm_var": 0.29374293883859787, |
| "learning_rate": 8.1e-05, |
| "loss": 1.2985, |
| "loss/crossentropy": 2.380378484725952, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.17282900214195251, |
| "loss/reg": 6.227292760740966e-05, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 2.6376256942749023, |
| "grad_norm_var": 0.2615363410208279, |
| "learning_rate": 8.2e-05, |
| "loss": 1.266, |
| "loss/crossentropy": 2.4291374683380127, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.16384728252887726, |
| "loss/reg": 6.226752884685993e-05, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.010375, |
| "grad_norm": 2.0763561725616455, |
| "grad_norm_var": 0.2675552215302521, |
| "learning_rate": 8.3e-05, |
| "loss": 1.1733, |
| "loss/crossentropy": 2.423896312713623, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15705125033855438, |
| "loss/reg": 6.225931429071352e-05, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 4.398110866546631, |
| "grad_norm_var": 0.5173355174320988, |
| "learning_rate": 8.4e-05, |
| "loss": 1.5654, |
| "loss/crossentropy": 2.230816602706909, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.26791903376579285, |
| "loss/reg": 6.225006654858589e-05, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.010625, |
| "grad_norm": 2.7163784503936768, |
| "grad_norm_var": 0.4955558090734691, |
| "learning_rate": 8.5e-05, |
| "loss": 1.2008, |
| "loss/crossentropy": 2.1671087741851807, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.145525261759758, |
| "loss/reg": 6.224414391908795e-05, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 1.9465394020080566, |
| "grad_norm_var": 0.5129132822581631, |
| "learning_rate": 8.6e-05, |
| "loss": 1.0109, |
| "loss/crossentropy": 2.218550443649292, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.10795612633228302, |
| "loss/reg": 6.22385778115131e-05, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.010875, |
| "grad_norm": 5.668015956878662, |
| "grad_norm_var": 1.0880389746416426, |
| "learning_rate": 8.7e-05, |
| "loss": 1.2925, |
| "loss/crossentropy": 2.360995292663574, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.1434704214334488, |
| "loss/reg": 6.223141826922074e-05, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 3.4049394130706787, |
| "grad_norm_var": 1.0721571012465496, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 1.6353, |
| "loss/crossentropy": 1.9898579120635986, |
| "loss/hidden": 1.3828125, |
| "loss/logits": 0.25186440348625183, |
| "loss/reg": 6.222462252480909e-05, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.011125, |
| "grad_norm": 1.885895013809204, |
| "grad_norm_var": 1.1148297312339375, |
| "learning_rate": 8.900000000000001e-05, |
| "loss": 1.0561, |
| "loss/crossentropy": 2.670912027359009, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.12972213327884674, |
| "loss/reg": 6.221828516572714e-05, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 1.886960506439209, |
| "grad_norm_var": 1.118003608268531, |
| "learning_rate": 9e-05, |
| "loss": 1.1335, |
| "loss/crossentropy": 2.5691866874694824, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.16021151840686798, |
| "loss/reg": 6.221193325472996e-05, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.011375, |
| "grad_norm": 3.117880344390869, |
| "grad_norm_var": 1.0979090394478965, |
| "learning_rate": 9.1e-05, |
| "loss": 1.3175, |
| "loss/crossentropy": 2.7383711338043213, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1762513369321823, |
| "loss/reg": 6.220516661414877e-05, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 2.5928220748901367, |
| "grad_norm_var": 1.0899203711980436, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 1.3898, |
| "loss/crossentropy": 2.255321741104126, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.21727776527404785, |
| "loss/reg": 6.2199542298913e-05, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.011625, |
| "grad_norm": 2.5842387676239014, |
| "grad_norm_var": 1.09033696415262, |
| "learning_rate": 9.300000000000001e-05, |
| "loss": 1.3599, |
| "loss/crossentropy": 2.7780256271362305, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.203078031539917, |
| "loss/reg": 6.219152419362217e-05, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 2.497912645339966, |
| "grad_norm_var": 1.032260222561935, |
| "learning_rate": 9.4e-05, |
| "loss": 1.2791, |
| "loss/crossentropy": 2.0482513904571533, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.16910339891910553, |
| "loss/reg": 6.218066846486181e-05, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.011875, |
| "grad_norm": 2.1033713817596436, |
| "grad_norm_var": 1.0259829914817806, |
| "learning_rate": 9.5e-05, |
| "loss": 1.0875, |
| "loss/crossentropy": 2.427816152572632, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.13770164549350739, |
| "loss/reg": 6.21745057287626e-05, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 2.063559055328369, |
| "grad_norm_var": 1.0544556100156115, |
| "learning_rate": 9.6e-05, |
| "loss": 1.217, |
| "loss/crossentropy": 2.498270034790039, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.16950619220733643, |
| "loss/reg": 6.216309702722356e-05, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.012125, |
| "grad_norm": 2.3693654537200928, |
| "grad_norm_var": 1.036651450071012, |
| "learning_rate": 9.7e-05, |
| "loss": 1.2016, |
| "loss/crossentropy": 2.8368701934814453, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16189493238925934, |
| "loss/reg": 6.215785833774135e-05, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 2.2980258464813232, |
| "grad_norm_var": 1.0488061784492646, |
| "learning_rate": 9.8e-05, |
| "loss": 1.5249, |
| "loss/crossentropy": 2.194488525390625, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.2820858359336853, |
| "loss/reg": 6.215048051672056e-05, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.012375, |
| "grad_norm": 3.147524833679199, |
| "grad_norm_var": 1.0277853179901806, |
| "learning_rate": 9.900000000000001e-05, |
| "loss": 1.7374, |
| "loss/crossentropy": 2.7856016159057617, |
| "loss/hidden": 1.4609375, |
| "loss/logits": 0.27581536769866943, |
| "loss/reg": 6.214459426701069e-05, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 2.1317031383514404, |
| "grad_norm_var": 0.8636563030021608, |
| "learning_rate": 0.0001, |
| "loss": 1.3633, |
| "loss/crossentropy": 2.282402753829956, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.2142634242773056, |
| "loss/reg": 6.213640881469473e-05, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012625, |
| "grad_norm": 2.2720911502838135, |
| "grad_norm_var": 0.8721171319962743, |
| "learning_rate": 0.0001, |
| "loss": 1.2405, |
| "loss/crossentropy": 2.8501064777374268, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.17741592228412628, |
| "loss/reg": 6.21288490947336e-05, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 2.879110097885132, |
| "grad_norm_var": 0.8423375514351165, |
| "learning_rate": 0.0001, |
| "loss": 1.3486, |
| "loss/crossentropy": 2.4649596214294434, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1761254221200943, |
| "loss/reg": 6.211963773239404e-05, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.012875, |
| "grad_norm": 2.2214345932006836, |
| "grad_norm_var": 0.2123174305005847, |
| "learning_rate": 0.0001, |
| "loss": 1.1049, |
| "loss/crossentropy": 2.513540029525757, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.13943374156951904, |
| "loss/reg": 6.21131548541598e-05, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 1.9674383401870728, |
| "grad_norm_var": 0.16151448650877043, |
| "learning_rate": 0.0001, |
| "loss": 1.2055, |
| "loss/crossentropy": 2.4960575103759766, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.17365112900733948, |
| "loss/reg": 6.210394349182025e-05, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.013125, |
| "grad_norm": 2.152989387512207, |
| "grad_norm_var": 0.1485118756217919, |
| "learning_rate": 0.0001, |
| "loss": 1.3728, |
| "loss/crossentropy": 2.651463508605957, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.1924474835395813, |
| "loss/reg": 6.209702405612916e-05, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 2.591555118560791, |
| "grad_norm_var": 0.13200909593287988, |
| "learning_rate": 0.0001, |
| "loss": 1.5933, |
| "loss/crossentropy": 2.1848952770233154, |
| "loss/hidden": 1.375, |
| "loss/logits": 0.21770122647285461, |
| "loss/reg": 6.208720878930762e-05, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.013375, |
| "grad_norm": 2.205780029296875, |
| "grad_norm_var": 0.10119294371901374, |
| "learning_rate": 0.0001, |
| "loss": 0.9785, |
| "loss/crossentropy": 2.4988999366760254, |
| "loss/hidden": 0.8671875, |
| "loss/logits": 0.1106652021408081, |
| "loss/reg": 6.207643309608102e-05, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 2.427882671356201, |
| "grad_norm_var": 0.09821140867718908, |
| "learning_rate": 0.0001, |
| "loss": 1.2968, |
| "loss/crossentropy": 2.5072600841522217, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.20241403579711914, |
| "loss/reg": 6.206895341165364e-05, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.013625, |
| "grad_norm": 2.4435040950775146, |
| "grad_norm_var": 0.09542213222792188, |
| "learning_rate": 0.0001, |
| "loss": 1.2803, |
| "loss/crossentropy": 2.2629339694976807, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.17810457944869995, |
| "loss/reg": 6.205752288224176e-05, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 2.9938735961914062, |
| "grad_norm_var": 0.11986086275213564, |
| "learning_rate": 0.0001, |
| "loss": 1.2708, |
| "loss/crossentropy": 2.5084388256073, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1764756739139557, |
| "loss/reg": 6.204319652169943e-05, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.013875, |
| "grad_norm": 2.499802827835083, |
| "grad_norm_var": 0.11443625726480532, |
| "learning_rate": 0.0001, |
| "loss": 1.3281, |
| "loss/crossentropy": 2.342087507247925, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.17120838165283203, |
| "loss/reg": 6.20328210061416e-05, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 3.28193736076355, |
| "grad_norm_var": 0.149862047644675, |
| "learning_rate": 0.0001, |
| "loss": 1.3891, |
| "loss/crossentropy": 2.396040916442871, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.193180650472641, |
| "loss/reg": 6.202506483532488e-05, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.014125, |
| "grad_norm": 2.2074780464172363, |
| "grad_norm_var": 0.15416329735346365, |
| "learning_rate": 0.0001, |
| "loss": 1.2137, |
| "loss/crossentropy": 2.501718759536743, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1583903729915619, |
| "loss/reg": 6.201667565619573e-05, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 2.888498306274414, |
| "grad_norm_var": 0.1614203311265588, |
| "learning_rate": 0.0001, |
| "loss": 1.3498, |
| "loss/crossentropy": 3.097370147705078, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.19293376803398132, |
| "loss/reg": 6.200573989190161e-05, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.014375, |
| "grad_norm": 2.385442018508911, |
| "grad_norm_var": 0.1339080451651928, |
| "learning_rate": 0.0001, |
| "loss": 1.3415, |
| "loss/crossentropy": 2.4950473308563232, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18464481830596924, |
| "loss/reg": 6.199457857292145e-05, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 3.3269190788269043, |
| "grad_norm_var": 0.16897616880053803, |
| "learning_rate": 0.0001, |
| "loss": 1.6405, |
| "loss/crossentropy": 2.19484806060791, |
| "loss/hidden": 1.3828125, |
| "loss/logits": 0.2570968270301819, |
| "loss/reg": 6.198590563144535e-05, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.014625, |
| "grad_norm": 2.2415361404418945, |
| "grad_norm_var": 0.17015290356553733, |
| "learning_rate": 0.0001, |
| "loss": 1.2381, |
| "loss/crossentropy": 2.540816068649292, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1749531626701355, |
| "loss/reg": 6.197726906975731e-05, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 2.397615671157837, |
| "grad_norm_var": 0.1631737555736056, |
| "learning_rate": 0.0001, |
| "loss": 1.2192, |
| "loss/crossentropy": 2.6213266849517822, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16386428475379944, |
| "loss/reg": 6.197066250024363e-05, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.014875, |
| "grad_norm": 2.75325345993042, |
| "grad_norm_var": 0.16006220619054398, |
| "learning_rate": 0.0001, |
| "loss": 1.5693, |
| "loss/crossentropy": 2.3850035667419434, |
| "loss/hidden": 1.34375, |
| "loss/logits": 0.2249460369348526, |
| "loss/reg": 6.196285539772362e-05, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 2.675480842590332, |
| "grad_norm_var": 0.13660137165245084, |
| "learning_rate": 0.0001, |
| "loss": 1.299, |
| "loss/crossentropy": 2.380896806716919, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.17339974641799927, |
| "loss/reg": 6.195474998094141e-05, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.015125, |
| "grad_norm": 2.611541509628296, |
| "grad_norm_var": 0.12289609882195597, |
| "learning_rate": 0.0001, |
| "loss": 1.2924, |
| "loss/crossentropy": 2.7064404487609863, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18236055970191956, |
| "loss/reg": 6.194705929374322e-05, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 2.3449323177337646, |
| "grad_norm_var": 0.12765774775469155, |
| "learning_rate": 0.0001, |
| "loss": 1.2957, |
| "loss/crossentropy": 2.5846447944641113, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.17786133289337158, |
| "loss/reg": 6.193818262545392e-05, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.015375, |
| "grad_norm": 2.1001734733581543, |
| "grad_norm_var": 0.13398098136615483, |
| "learning_rate": 0.0001, |
| "loss": 1.1704, |
| "loss/crossentropy": 2.504185676574707, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15416675806045532, |
| "loss/reg": 6.192670116433874e-05, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 2.365839719772339, |
| "grad_norm_var": 0.13563497966163046, |
| "learning_rate": 0.0001, |
| "loss": 1.3773, |
| "loss/crossentropy": 2.3259832859039307, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.20480972528457642, |
| "loss/reg": 6.19165730313398e-05, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.015625, |
| "grad_norm": 2.1480026245117188, |
| "grad_norm_var": 0.1470561705316013, |
| "learning_rate": 0.0001, |
| "loss": 1.2768, |
| "loss/crossentropy": 2.288093090057373, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.16683252155780792, |
| "loss/reg": 6.19063139311038e-05, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 2.2346343994140625, |
| "grad_norm_var": 0.14082182611320845, |
| "learning_rate": 0.0001, |
| "loss": 1.1441, |
| "loss/crossentropy": 2.6062135696411133, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.14351129531860352, |
| "loss/reg": 6.189729174366221e-05, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.015875, |
| "grad_norm": 3.187627077102661, |
| "grad_norm_var": 0.16771827237098264, |
| "learning_rate": 0.0001, |
| "loss": 1.4505, |
| "loss/crossentropy": 2.3607077598571777, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.22327345609664917, |
| "loss/reg": 6.189044506754726e-05, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 2.1208789348602295, |
| "grad_norm_var": 0.1420574537193353, |
| "learning_rate": 0.0001, |
| "loss": 1.1414, |
| "loss/crossentropy": 2.408287286758423, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.14076298475265503, |
| "loss/reg": 6.188445695443079e-05, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.016125, |
| "grad_norm": 2.4475457668304443, |
| "grad_norm_var": 0.13631644029428572, |
| "learning_rate": 0.0001, |
| "loss": 1.2863, |
| "loss/crossentropy": 2.4705042839050293, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16846278309822083, |
| "loss/reg": 6.187462713569403e-05, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 2.3132476806640625, |
| "grad_norm_var": 0.128302854564951, |
| "learning_rate": 0.0001, |
| "loss": 1.2265, |
| "loss/crossentropy": 2.323221445083618, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16340406239032745, |
| "loss/reg": 6.18634803686291e-05, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.016375, |
| "grad_norm": 2.6015546321868896, |
| "grad_norm_var": 0.12854282273958592, |
| "learning_rate": 0.0001, |
| "loss": 1.0946, |
| "loss/crossentropy": 2.554730176925659, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.13307343423366547, |
| "loss/reg": 6.185180245665833e-05, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 2.040545701980591, |
| "grad_norm_var": 0.08874970269449302, |
| "learning_rate": 0.0001, |
| "loss": 1.1715, |
| "loss/crossentropy": 2.6177141666412354, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.163020521402359, |
| "loss/reg": 6.184292578836903e-05, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.016625, |
| "grad_norm": 2.4451427459716797, |
| "grad_norm_var": 0.08672588329890019, |
| "learning_rate": 0.0001, |
| "loss": 1.2794, |
| "loss/crossentropy": 2.6671459674835205, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.16941678524017334, |
| "loss/reg": 6.18349076830782e-05, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 2.5730879306793213, |
| "grad_norm_var": 0.08802712142174655, |
| "learning_rate": 0.0001, |
| "loss": 1.356, |
| "loss/crossentropy": 2.483858585357666, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1835438758134842, |
| "loss/reg": 6.182605284266174e-05, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.016875, |
| "grad_norm": 2.996643543243408, |
| "grad_norm_var": 0.10205043083370029, |
| "learning_rate": 0.0001, |
| "loss": 1.5067, |
| "loss/crossentropy": 2.267930507659912, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.20140591263771057, |
| "loss/reg": 6.181577919051051e-05, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 2.2333881855010986, |
| "grad_norm_var": 0.10100001995976887, |
| "learning_rate": 0.0001, |
| "loss": 1.23, |
| "loss/crossentropy": 2.552584648132324, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.17466390132904053, |
| "loss/reg": 6.180404307087883e-05, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.017125, |
| "grad_norm": 2.476086378097534, |
| "grad_norm_var": 0.09873795942098601, |
| "learning_rate": 0.0001, |
| "loss": 1.2347, |
| "loss/crossentropy": 2.2955551147460938, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1402929574251175, |
| "loss/reg": 6.179526099003851e-05, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 2.9701859951019287, |
| "grad_norm_var": 0.11738609069977789, |
| "learning_rate": 0.0001, |
| "loss": 1.1041, |
| "loss/crossentropy": 2.4560158252716064, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.1307787150144577, |
| "loss/reg": 6.178120383992791e-05, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.017375, |
| "grad_norm": 2.151567220687866, |
| "grad_norm_var": 0.11513060923898569, |
| "learning_rate": 0.0001, |
| "loss": 1.1406, |
| "loss/crossentropy": 2.6192235946655273, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.15172292292118073, |
| "loss/reg": 6.176753231557086e-05, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 2.0209085941314697, |
| "grad_norm_var": 0.1267419293205286, |
| "learning_rate": 0.0001, |
| "loss": 1.0928, |
| "loss/crossentropy": 2.6628799438476562, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.14296585321426392, |
| "loss/reg": 6.175567978061736e-05, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.017625, |
| "grad_norm": 3.458299398422241, |
| "grad_norm_var": 0.18389511336323494, |
| "learning_rate": 0.0001, |
| "loss": 1.3966, |
| "loss/crossentropy": 2.885798692703247, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.22411209344863892, |
| "loss/reg": 6.174653390189633e-05, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 2.608558177947998, |
| "grad_norm_var": 0.17855808227350187, |
| "learning_rate": 0.0001, |
| "loss": 1.1734, |
| "loss/crossentropy": 2.2689590454101562, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1493585705757141, |
| "loss/reg": 6.1732207541354e-05, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.017875, |
| "grad_norm": 2.7264318466186523, |
| "grad_norm_var": 0.1520478077633771, |
| "learning_rate": 0.0001, |
| "loss": 1.2868, |
| "loss/crossentropy": 2.3888814449310303, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16896918416023254, |
| "loss/reg": 6.172260327730328e-05, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 2.4999561309814453, |
| "grad_norm_var": 0.14128539295791806, |
| "learning_rate": 0.0001, |
| "loss": 1.3804, |
| "loss/crossentropy": 2.442732572555542, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.19230639934539795, |
| "loss/reg": 6.171311542857438e-05, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.018125, |
| "grad_norm": 3.084848642349243, |
| "grad_norm_var": 0.1592220375940921, |
| "learning_rate": 0.0001, |
| "loss": 1.5124, |
| "loss/crossentropy": 2.6801810264587402, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.2696050703525543, |
| "loss/reg": 6.170615233713761e-05, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 3.0833539962768555, |
| "grad_norm_var": 0.16940866671487811, |
| "learning_rate": 0.0001, |
| "loss": 1.294, |
| "loss/crossentropy": 2.434020519256592, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.15272179245948792, |
| "loss/reg": 6.170049164211378e-05, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.018375, |
| "grad_norm": 2.2046446800231934, |
| "grad_norm_var": 0.18039814292173043, |
| "learning_rate": 0.0001, |
| "loss": 1.1769, |
| "loss/crossentropy": 2.5624289512634277, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.160653755068779, |
| "loss/reg": 6.169131665956229e-05, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 1.9920902252197266, |
| "grad_norm_var": 0.18414873169562326, |
| "learning_rate": 0.0001, |
| "loss": 1.1186, |
| "loss/crossentropy": 2.709728479385376, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.1492651402950287, |
| "loss/reg": 6.168704567244276e-05, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.018625, |
| "grad_norm": 2.7053756713867188, |
| "grad_norm_var": 0.18317033653553666, |
| "learning_rate": 0.0001, |
| "loss": 1.2849, |
| "loss/crossentropy": 2.594032049179077, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1905450075864792, |
| "loss/reg": 6.168089748825878e-05, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 2.1234872341156006, |
| "grad_norm_var": 0.1981121598309187, |
| "learning_rate": 0.0001, |
| "loss": 1.2526, |
| "loss/crossentropy": 2.5880792140960693, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.18171370029449463, |
| "loss/reg": 6.167205719975755e-05, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.018875, |
| "grad_norm": 2.4820902347564697, |
| "grad_norm_var": 0.18631464898325945, |
| "learning_rate": 0.0001, |
| "loss": 1.1869, |
| "loss/crossentropy": 2.2422618865966797, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.16288068890571594, |
| "loss/reg": 6.166584353195503e-05, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 2.5669338703155518, |
| "grad_norm_var": 0.17912821539433874, |
| "learning_rate": 0.0001, |
| "loss": 1.0968, |
| "loss/crossentropy": 2.5655312538146973, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.1430792212486267, |
| "loss/reg": 6.165904778754339e-05, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.019125, |
| "grad_norm": 2.191638469696045, |
| "grad_norm_var": 0.18782946638749062, |
| "learning_rate": 0.0001, |
| "loss": 1.297, |
| "loss/crossentropy": 2.3935883045196533, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18698745965957642, |
| "loss/reg": 6.165434024296701e-05, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 1.9139376878738403, |
| "grad_norm_var": 0.19900155234911943, |
| "learning_rate": 0.0001, |
| "loss": 1.1497, |
| "loss/crossentropy": 2.5978732109069824, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1530168354511261, |
| "loss/reg": 6.164138176245615e-05, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.019375, |
| "grad_norm": 2.061805486679077, |
| "grad_norm_var": 0.20353621009625153, |
| "learning_rate": 0.0001, |
| "loss": 1.034, |
| "loss/crossentropy": 2.29733943939209, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.12318030744791031, |
| "loss/reg": 6.162770296214148e-05, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 2.686328649520874, |
| "grad_norm_var": 0.19023239802865194, |
| "learning_rate": 0.0001, |
| "loss": 1.4235, |
| "loss/crossentropy": 2.2928433418273926, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.19631928205490112, |
| "loss/reg": 6.16170436842367e-05, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.019625, |
| "grad_norm": 2.6863300800323486, |
| "grad_norm_var": 0.13134889378527811, |
| "learning_rate": 0.0001, |
| "loss": 1.4147, |
| "loss/crossentropy": 2.289113759994507, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.19536322355270386, |
| "loss/reg": 6.160605698823929e-05, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 3.7774782180786133, |
| "grad_norm_var": 0.2373896188726722, |
| "learning_rate": 0.0001, |
| "loss": 1.3606, |
| "loss/crossentropy": 2.4960098266601562, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.18812544643878937, |
| "loss/reg": 6.159812619443983e-05, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.019875, |
| "grad_norm": 2.5556654930114746, |
| "grad_norm_var": 0.23517615853210802, |
| "learning_rate": 0.0001, |
| "loss": 1.1015, |
| "loss/crossentropy": 2.4794013500213623, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.1399209052324295, |
| "loss/reg": 6.158895121188834e-05, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 2.3351266384124756, |
| "grad_norm_var": 0.23772124659223212, |
| "learning_rate": 0.0001, |
| "loss": 1.1072, |
| "loss/crossentropy": 2.402188301086426, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.14173097908496857, |
| "loss/reg": 6.158249016152695e-05, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.020125, |
| "grad_norm": 2.319366455078125, |
| "grad_norm_var": 0.21752957054554395, |
| "learning_rate": 0.0001, |
| "loss": 1.1774, |
| "loss/crossentropy": 2.1729917526245117, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15335121750831604, |
| "loss/reg": 6.157202733447775e-05, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 2.0917341709136963, |
| "grad_norm_var": 0.19926011430610652, |
| "learning_rate": 0.0001, |
| "loss": 1.2443, |
| "loss/crossentropy": 2.276581048965454, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1577274203300476, |
| "loss/reg": 6.156737799756229e-05, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.020375, |
| "grad_norm": 4.31035041809082, |
| "grad_norm_var": 0.41637723338655513, |
| "learning_rate": 0.0001, |
| "loss": 1.8974, |
| "loss/crossentropy": 2.6449058055877686, |
| "loss/hidden": 1.5625, |
| "loss/logits": 0.33430173993110657, |
| "loss/reg": 6.156211748020723e-05, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 2.145301342010498, |
| "grad_norm_var": 0.4064476055559296, |
| "learning_rate": 0.0001, |
| "loss": 1.2636, |
| "loss/crossentropy": 2.613586664199829, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.1848127692937851, |
| "loss/reg": 6.155785376904532e-05, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.020625, |
| "grad_norm": 3.6308248043060303, |
| "grad_norm_var": 0.47796885273955964, |
| "learning_rate": 0.0001, |
| "loss": 1.2327, |
| "loss/crossentropy": 2.599729537963867, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1852511763572693, |
| "loss/reg": 6.154972652439028e-05, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 2.812910556793213, |
| "grad_norm_var": 0.4622733920417279, |
| "learning_rate": 0.0001, |
| "loss": 1.3898, |
| "loss/crossentropy": 2.7171225547790527, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20167264342308044, |
| "loss/reg": 6.154461152618751e-05, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.020875, |
| "grad_norm": 2.4922893047332764, |
| "grad_norm_var": 0.46203729327833537, |
| "learning_rate": 0.0001, |
| "loss": 1.3528, |
| "loss/crossentropy": 2.648606777191162, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.21159711480140686, |
| "loss/reg": 6.153558933874592e-05, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 2.2380781173706055, |
| "grad_norm_var": 0.47292652355391496, |
| "learning_rate": 0.0001, |
| "loss": 1.3863, |
| "loss/crossentropy": 2.5556812286376953, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.20603393018245697, |
| "loss/reg": 6.152570131234825e-05, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.021125, |
| "grad_norm": 2.8179726600646973, |
| "grad_norm_var": 0.4599538691877346, |
| "learning_rate": 0.0001, |
| "loss": 1.3315, |
| "loss/crossentropy": 2.285341262817383, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19030849635601044, |
| "loss/reg": 6.151832349132746e-05, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 2.933023691177368, |
| "grad_norm_var": 0.42080948451517297, |
| "learning_rate": 0.0001, |
| "loss": 1.5924, |
| "loss/crossentropy": 2.254920482635498, |
| "loss/hidden": 1.3828125, |
| "loss/logits": 0.20900759100914001, |
| "loss/reg": 6.151078559923917e-05, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.021375, |
| "grad_norm": 2.9309163093566895, |
| "grad_norm_var": 0.38903358238886365, |
| "learning_rate": 0.0001, |
| "loss": 1.2104, |
| "loss/crossentropy": 2.771516799926758, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.15512725710868835, |
| "loss/reg": 6.14999225945212e-05, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 2.7658286094665527, |
| "grad_norm_var": 0.3882477326935183, |
| "learning_rate": 0.0001, |
| "loss": 1.2183, |
| "loss/crossentropy": 2.565211296081543, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16297924518585205, |
| "loss/reg": 6.149257387733087e-05, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.021625, |
| "grad_norm": 3.39176344871521, |
| "grad_norm_var": 0.40840451933244426, |
| "learning_rate": 0.0001, |
| "loss": 1.3931, |
| "loss/crossentropy": 2.4181013107299805, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.2049458771944046, |
| "loss/reg": 6.148203829070553e-05, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 2.7971994876861572, |
| "grad_norm_var": 0.3468190736041642, |
| "learning_rate": 0.0001, |
| "loss": 1.2467, |
| "loss/crossentropy": 2.644824981689453, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.17579111456871033, |
| "loss/reg": 6.147275416878983e-05, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.021875, |
| "grad_norm": 7.143955707550049, |
| "grad_norm_var": 1.5219747541806836, |
| "learning_rate": 0.0001, |
| "loss": 1.3279, |
| "loss/crossentropy": 2.6274638175964355, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.15536972880363464, |
| "loss/reg": 6.146173836896196e-05, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 8.911324501037598, |
| "grad_norm_var": 3.578509022301667, |
| "learning_rate": 0.0001, |
| "loss": 1.8863, |
| "loss/crossentropy": 1.8980119228363037, |
| "loss/hidden": 1.765625, |
| "loss/logits": 0.12003660202026367, |
| "loss/reg": 6.145203224150464e-05, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.022125, |
| "grad_norm": 2.14353609085083, |
| "grad_norm_var": 3.6077286646662734, |
| "learning_rate": 0.0001, |
| "loss": 1.1573, |
| "loss/crossentropy": 2.1538591384887695, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1410439908504486, |
| "loss/reg": 6.144325743662193e-05, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 4.625613212585449, |
| "grad_norm_var": 3.542583274880191, |
| "learning_rate": 0.0001, |
| "loss": 1.6226, |
| "loss/crossentropy": 2.7923362255096436, |
| "loss/hidden": 1.375, |
| "loss/logits": 0.24694563448429108, |
| "loss/reg": 6.143252539914101e-05, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.022375, |
| "grad_norm": 2.543745517730713, |
| "grad_norm_var": 3.5775446556342367, |
| "learning_rate": 0.0001, |
| "loss": 1.4192, |
| "loss/crossentropy": 2.3237483501434326, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.21549411118030548, |
| "loss/reg": 6.14215387031436e-05, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 2.3068995475769043, |
| "grad_norm_var": 3.5495511663474453, |
| "learning_rate": 0.0001, |
| "loss": 1.2428, |
| "loss/crossentropy": 2.7135560512542725, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1562565714120865, |
| "loss/reg": 6.141421181382611e-05, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.022625, |
| "grad_norm": 3.465264081954956, |
| "grad_norm_var": 3.5490467443763025, |
| "learning_rate": 0.0001, |
| "loss": 1.4771, |
| "loss/crossentropy": 3.3183774948120117, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.2421126663684845, |
| "loss/reg": 6.140418554423377e-05, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 2.696394205093384, |
| "grad_norm_var": 3.5608805573030993, |
| "learning_rate": 0.0001, |
| "loss": 1.2269, |
| "loss/crossentropy": 2.609964370727539, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.17162814736366272, |
| "loss/reg": 6.139430479379371e-05, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.022875, |
| "grad_norm": 2.3278727531433105, |
| "grad_norm_var": 3.5849405900569513, |
| "learning_rate": 0.0001, |
| "loss": 1.0795, |
| "loss/crossentropy": 2.753383159637451, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.1335984170436859, |
| "loss/reg": 6.138216122053564e-05, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 2.4336531162261963, |
| "grad_norm_var": 3.554360278579671, |
| "learning_rate": 0.0001, |
| "loss": 1.3948, |
| "loss/crossentropy": 2.4162991046905518, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.22235547006130219, |
| "loss/reg": 6.137174204923213e-05, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.023125, |
| "grad_norm": 2.420710802078247, |
| "grad_norm_var": 3.601127481620784, |
| "learning_rate": 0.0001, |
| "loss": 1.4926, |
| "loss/crossentropy": 2.30292010307312, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.19511133432388306, |
| "loss/reg": 6.136245065135881e-05, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 2.727184534072876, |
| "grad_norm_var": 3.6190579859970224, |
| "learning_rate": 0.0001, |
| "loss": 1.2816, |
| "loss/crossentropy": 2.4605464935302734, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.2107134908437729, |
| "loss/reg": 6.135714647825807e-05, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.023375, |
| "grad_norm": 1.9292963743209839, |
| "grad_norm_var": 3.754688597499932, |
| "learning_rate": 0.0001, |
| "loss": 1.1628, |
| "loss/crossentropy": 2.5925047397613525, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.16220712661743164, |
| "loss/reg": 6.134893919806927e-05, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 2.1395771503448486, |
| "grad_norm_var": 3.833355540800866, |
| "learning_rate": 0.0001, |
| "loss": 1.2712, |
| "loss/crossentropy": 2.227994441986084, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.18463259935379028, |
| "loss/reg": 6.134230352472514e-05, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.023625, |
| "grad_norm": 3.552602529525757, |
| "grad_norm_var": 3.8353265135005175, |
| "learning_rate": 0.0001, |
| "loss": 1.2518, |
| "loss/crossentropy": 2.562777280807495, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16521015763282776, |
| "loss/reg": 6.13337178947404e-05, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 2.766602039337158, |
| "grad_norm_var": 3.8377842837978386, |
| "learning_rate": 0.0001, |
| "loss": 1.3731, |
| "loss/crossentropy": 2.4200425148010254, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.1694013774394989, |
| "loss/reg": 6.132431008154526e-05, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.023875, |
| "grad_norm": 2.403444528579712, |
| "grad_norm_var": 2.8653780273055327, |
| "learning_rate": 0.0001, |
| "loss": 1.1651, |
| "loss/crossentropy": 2.6963400840759277, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.1566968709230423, |
| "loss/reg": 6.132054841145873e-05, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 2.0356028079986572, |
| "grad_norm_var": 0.4806738598539164, |
| "learning_rate": 0.0001, |
| "loss": 1.4298, |
| "loss/crossentropy": 2.174285650253296, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.21048110723495483, |
| "loss/reg": 6.13146330579184e-05, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.024125, |
| "grad_norm": 2.501723051071167, |
| "grad_norm_var": 0.4641524277019669, |
| "learning_rate": 0.0001, |
| "loss": 1.2669, |
| "loss/crossentropy": 2.6477620601654053, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.17256709933280945, |
| "loss/reg": 6.130609108367935e-05, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 2.8256325721740723, |
| "grad_norm_var": 0.19964871735684203, |
| "learning_rate": 0.0001, |
| "loss": 1.364, |
| "loss/crossentropy": 2.4205310344696045, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.17588719725608826, |
| "loss/reg": 6.129377288743854e-05, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.024375, |
| "grad_norm": 3.715850353240967, |
| "grad_norm_var": 0.28183777248683595, |
| "learning_rate": 0.0001, |
| "loss": 1.4108, |
| "loss/crossentropy": 2.5872642993927, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.1758473813533783, |
| "loss/reg": 6.128078530309722e-05, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 3.3498318195343018, |
| "grad_norm_var": 0.3034271167360647, |
| "learning_rate": 0.0001, |
| "loss": 1.3691, |
| "loss/crossentropy": 2.6444506645202637, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.19665929675102234, |
| "loss/reg": 6.126934749772772e-05, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.024625, |
| "grad_norm": 2.0526957511901855, |
| "grad_norm_var": 0.2850787945150557, |
| "learning_rate": 0.0001, |
| "loss": 1.2051, |
| "loss/crossentropy": 2.592327117919922, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16540399193763733, |
| "loss/reg": 6.125810614321381e-05, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 2.4300317764282227, |
| "grad_norm_var": 0.28670823409057716, |
| "learning_rate": 0.0001, |
| "loss": 1.5286, |
| "loss/crossentropy": 2.36305570602417, |
| "loss/hidden": 1.2890625, |
| "loss/logits": 0.2389371693134308, |
| "loss/reg": 6.124811625340953e-05, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.024875, |
| "grad_norm": 2.3255856037139893, |
| "grad_norm_var": 0.28679178178242776, |
| "learning_rate": 0.0001, |
| "loss": 1.1743, |
| "loss/crossentropy": 2.0803394317626953, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1424179971218109, |
| "loss/reg": 6.124229548731819e-05, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 2.2634005546569824, |
| "grad_norm_var": 0.2923937566916393, |
| "learning_rate": 0.0001, |
| "loss": 1.2619, |
| "loss/crossentropy": 2.427354574203491, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1753256618976593, |
| "loss/reg": 6.123317871242762e-05, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.025125, |
| "grad_norm": 2.789698839187622, |
| "grad_norm_var": 0.292575209213462, |
| "learning_rate": 0.0001, |
| "loss": 1.2794, |
| "loss/crossentropy": 2.4137160778045654, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.14599566161632538, |
| "loss/reg": 6.122920603957027e-05, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 2.23150897026062, |
| "grad_norm_var": 0.3003877767651639, |
| "learning_rate": 0.0001, |
| "loss": 1.2906, |
| "loss/crossentropy": 2.502619743347168, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.19620737433433533, |
| "loss/reg": 6.122409831732512e-05, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.025375, |
| "grad_norm": 3.3167238235473633, |
| "grad_norm_var": 0.2999410613935005, |
| "learning_rate": 0.0001, |
| "loss": 1.4511, |
| "loss/crossentropy": 2.5889461040496826, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2239363044500351, |
| "loss/reg": 6.122187187429518e-05, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 2.5847971439361572, |
| "grad_norm_var": 0.28091485279191464, |
| "learning_rate": 0.0001, |
| "loss": 1.248, |
| "loss/crossentropy": 2.4720451831817627, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.16930653154850006, |
| "loss/reg": 6.120974285295233e-05, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.025625, |
| "grad_norm": 2.071563243865967, |
| "grad_norm_var": 0.24897236933793085, |
| "learning_rate": 0.0001, |
| "loss": 1.1016, |
| "loss/crossentropy": 2.5648884773254395, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13218875229358673, |
| "loss/reg": 6.120166654000059e-05, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 2.9454479217529297, |
| "grad_norm_var": 0.2548478796483238, |
| "learning_rate": 0.0001, |
| "loss": 1.3574, |
| "loss/crossentropy": 2.607356309890747, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20053817331790924, |
| "loss/reg": 6.119644967839122e-05, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.025875, |
| "grad_norm": 3.396070718765259, |
| "grad_norm_var": 0.28840087929906133, |
| "learning_rate": 0.0001, |
| "loss": 1.1743, |
| "loss/crossentropy": 2.682058334350586, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.16590501368045807, |
| "loss/reg": 6.11838695476763e-05, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 2.4477601051330566, |
| "grad_norm_var": 0.26375613878289506, |
| "learning_rate": 0.0001, |
| "loss": 1.3022, |
| "loss/crossentropy": 2.819031000137329, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.19222432374954224, |
| "loss/reg": 6.117635348346084e-05, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.026125, |
| "grad_norm": 2.5916216373443604, |
| "grad_norm_var": 0.2618484053528464, |
| "learning_rate": 0.0001, |
| "loss": 1.353, |
| "loss/crossentropy": 2.529510259628296, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.19612029194831848, |
| "loss/reg": 6.116151052992791e-05, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 2.108261823654175, |
| "grad_norm_var": 0.28282181699858694, |
| "learning_rate": 0.0001, |
| "loss": 1.2782, |
| "loss/crossentropy": 2.3222012519836426, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.18379396200180054, |
| "loss/reg": 6.114997813710943e-05, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.026375, |
| "grad_norm": 2.48710560798645, |
| "grad_norm_var": 0.20482550381518247, |
| "learning_rate": 0.0001, |
| "loss": 1.2718, |
| "loss/crossentropy": 2.6183624267578125, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.18522073328495026, |
| "loss/reg": 6.114102870924398e-05, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 2.63779616355896, |
| "grad_norm_var": 0.1640915083279668, |
| "learning_rate": 0.0001, |
| "loss": 1.3499, |
| "loss/crossentropy": 2.391116142272949, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.18524512648582458, |
| "loss/reg": 6.112866685725749e-05, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.026625, |
| "grad_norm": 2.7476329803466797, |
| "grad_norm_var": 0.14889028663519804, |
| "learning_rate": 0.0001, |
| "loss": 1.2842, |
| "loss/crossentropy": 2.5770251750946045, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16641706228256226, |
| "loss/reg": 6.111864786362275e-05, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 2.565723419189453, |
| "grad_norm_var": 0.14722036218699916, |
| "learning_rate": 0.0001, |
| "loss": 1.2381, |
| "loss/crossentropy": 2.80257248878479, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.18279102444648743, |
| "loss/reg": 6.110716640250757e-05, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.026875, |
| "grad_norm": 4.107775688171387, |
| "grad_norm_var": 0.2818514081658729, |
| "learning_rate": 0.0001, |
| "loss": 1.5243, |
| "loss/crossentropy": 2.4806065559387207, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.2190462350845337, |
| "loss/reg": 6.109999230829999e-05, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 2.3829445838928223, |
| "grad_norm_var": 0.27569299833046823, |
| "learning_rate": 0.0001, |
| "loss": 1.2079, |
| "loss/crossentropy": 2.466684579849243, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.16046380996704102, |
| "loss/reg": 6.108790694270283e-05, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.027125, |
| "grad_norm": 2.554863929748535, |
| "grad_norm_var": 0.2767468455530223, |
| "learning_rate": 0.0001, |
| "loss": 1.1988, |
| "loss/crossentropy": 2.582035541534424, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.15130122005939484, |
| "loss/reg": 6.1076192650944e-05, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 2.7898809909820557, |
| "grad_norm_var": 0.26145832144768877, |
| "learning_rate": 0.0001, |
| "loss": 1.6592, |
| "loss/crossentropy": 2.655186414718628, |
| "loss/hidden": 1.3984375, |
| "loss/logits": 0.26013702154159546, |
| "loss/reg": 6.107001536292955e-05, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.027375, |
| "grad_norm": 2.7881548404693604, |
| "grad_norm_var": 0.2378165583524293, |
| "learning_rate": 0.0001, |
| "loss": 1.5451, |
| "loss/crossentropy": 2.4413743019104004, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.2241469919681549, |
| "loss/reg": 6.106249202275649e-05, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 2.2896728515625, |
| "grad_norm_var": 0.24781162791184835, |
| "learning_rate": 0.0001, |
| "loss": 1.2198, |
| "loss/crossentropy": 2.4421772956848145, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.14890027046203613, |
| "loss/reg": 6.105640932219103e-05, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.027625, |
| "grad_norm": 2.324869155883789, |
| "grad_norm_var": 0.23120432182346703, |
| "learning_rate": 0.0001, |
| "loss": 1.3402, |
| "loss/crossentropy": 2.526216745376587, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19898337125778198, |
| "loss/reg": 6.10438291914761e-05, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 2.88158917427063, |
| "grad_norm_var": 0.22935101127255847, |
| "learning_rate": 0.0001, |
| "loss": 1.372, |
| "loss/crossentropy": 2.361729621887207, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.21510916948318481, |
| "loss/reg": 6.10318202234339e-05, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.027875, |
| "grad_norm": 2.9760019779205322, |
| "grad_norm_var": 0.20104925696453316, |
| "learning_rate": 0.0001, |
| "loss": 1.2925, |
| "loss/crossentropy": 2.5573909282684326, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1747477501630783, |
| "loss/reg": 6.1027145420666784e-05, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 2.702091932296753, |
| "grad_norm_var": 0.19763696198550798, |
| "learning_rate": 0.0001, |
| "loss": 1.3524, |
| "loss/crossentropy": 2.717195510864258, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.19553202390670776, |
| "loss/reg": 6.1014961829641834e-05, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.028125, |
| "grad_norm": 2.1232945919036865, |
| "grad_norm_var": 0.21708226542899425, |
| "learning_rate": 0.0001, |
| "loss": 1.2661, |
| "loss/crossentropy": 2.4481968879699707, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1795472800731659, |
| "loss/reg": 6.100164682720788e-05, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 2.191066026687622, |
| "grad_norm_var": 0.2114830183011783, |
| "learning_rate": 0.0001, |
| "loss": 1.1895, |
| "loss/crossentropy": 2.34470534324646, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15763415396213531, |
| "loss/reg": 6.099118763813749e-05, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.028375, |
| "grad_norm": 2.3068013191223145, |
| "grad_norm_var": 0.21765702233228598, |
| "learning_rate": 0.0001, |
| "loss": 1.539, |
| "loss/crossentropy": 2.5549845695495605, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.21025767922401428, |
| "loss/reg": 6.09817034273874e-05, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 2.890655279159546, |
| "grad_norm_var": 0.221304562186567, |
| "learning_rate": 0.0001, |
| "loss": 1.5638, |
| "loss/crossentropy": 2.2339606285095215, |
| "loss/hidden": 1.34375, |
| "loss/logits": 0.21939440071582794, |
| "loss/reg": 6.096933429944329e-05, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.028625, |
| "grad_norm": 2.182521343231201, |
| "grad_norm_var": 0.2349577927735633, |
| "learning_rate": 0.0001, |
| "loss": 1.2085, |
| "loss/crossentropy": 2.641230583190918, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.161014586687088, |
| "loss/reg": 6.095720891607925e-05, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 2.704406976699829, |
| "grad_norm_var": 0.23499684870281476, |
| "learning_rate": 0.0001, |
| "loss": 1.3456, |
| "loss/crossentropy": 2.6833486557006836, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18876385688781738, |
| "loss/reg": 6.094613127061166e-05, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.028875, |
| "grad_norm": 3.4925310611724854, |
| "grad_norm_var": 0.13802667852219105, |
| "learning_rate": 0.0001, |
| "loss": 1.3709, |
| "loss/crossentropy": 2.1604089736938477, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.17500904202461243, |
| "loss/reg": 6.093499541748315e-05, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 2.344773530960083, |
| "grad_norm_var": 0.13921650701028032, |
| "learning_rate": 0.0001, |
| "loss": 1.4725, |
| "loss/crossentropy": 2.493307113647461, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.22193682193756104, |
| "loss/reg": 6.092391777201556e-05, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.029125, |
| "grad_norm": 1.8828089237213135, |
| "grad_norm_var": 0.17117140448626647, |
| "learning_rate": 0.0001, |
| "loss": 1.1104, |
| "loss/crossentropy": 2.5302743911743164, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1331850290298462, |
| "loss/reg": 6.0912472690688446e-05, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 2.747770071029663, |
| "grad_norm_var": 0.16996031408720758, |
| "learning_rate": 0.0001, |
| "loss": 1.1371, |
| "loss/crossentropy": 2.4189980030059814, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.14035619795322418, |
| "loss/reg": 6.089695307309739e-05, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.029375, |
| "grad_norm": 1.8742481470108032, |
| "grad_norm_var": 0.1933626604088189, |
| "learning_rate": 0.0001, |
| "loss": 1.1601, |
| "loss/crossentropy": 2.2694003582000732, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.14385350048542023, |
| "loss/reg": 6.088387090130709e-05, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 2.0313689708709717, |
| "grad_norm_var": 0.20459374724346724, |
| "learning_rate": 0.0001, |
| "loss": 1.2446, |
| "loss/crossentropy": 2.4902865886688232, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.17369529604911804, |
| "loss/reg": 6.086897337809205e-05, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.029625, |
| "grad_norm": 2.3882880210876465, |
| "grad_norm_var": 0.20354561810974156, |
| "learning_rate": 0.0001, |
| "loss": 1.3947, |
| "loss/crossentropy": 2.4032340049743652, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20656049251556396, |
| "loss/reg": 6.085408676881343e-05, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 1.7327938079833984, |
| "grad_norm_var": 0.22490130088653987, |
| "learning_rate": 0.0001, |
| "loss": 1.1777, |
| "loss/crossentropy": 2.4949777126312256, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1614799201488495, |
| "loss/reg": 6.084307824494317e-05, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.029875, |
| "grad_norm": 2.2483370304107666, |
| "grad_norm_var": 0.20314943964483845, |
| "learning_rate": 0.0001, |
| "loss": 1.331, |
| "loss/crossentropy": 2.5907418727874756, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.19753864407539368, |
| "loss/reg": 6.0828475398011506e-05, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 2.5151193141937256, |
| "grad_norm_var": 0.19693662117647784, |
| "learning_rate": 0.0001, |
| "loss": 1.2278, |
| "loss/crossentropy": 2.6233856678009033, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1725194901227951, |
| "loss/reg": 6.0820282669737935e-05, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.030125, |
| "grad_norm": 2.198249101638794, |
| "grad_norm_var": 0.19498660957211478, |
| "learning_rate": 0.0001, |
| "loss": 1.1441, |
| "loss/crossentropy": 2.368884563446045, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1473642736673355, |
| "loss/reg": 6.0812566516688094e-05, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 2.195218563079834, |
| "grad_norm_var": 0.1948951313244331, |
| "learning_rate": 0.0001, |
| "loss": 1.2993, |
| "loss/crossentropy": 2.352041721343994, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1815069168806076, |
| "loss/reg": 6.080829552956857e-05, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.030375, |
| "grad_norm": 2.6142425537109375, |
| "grad_norm_var": 0.19868367561009795, |
| "learning_rate": 0.0001, |
| "loss": 1.3644, |
| "loss/crossentropy": 2.497286558151245, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.17629210650920868, |
| "loss/reg": 6.0799306083936244e-05, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 2.342033624649048, |
| "grad_norm_var": 0.1799734399041227, |
| "learning_rate": 0.0001, |
| "loss": 1.1311, |
| "loss/crossentropy": 2.5182478427886963, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1461625099182129, |
| "loss/reg": 6.078776277718134e-05, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.030625, |
| "grad_norm": 2.3943874835968018, |
| "grad_norm_var": 0.17823371257387344, |
| "learning_rate": 0.0001, |
| "loss": 1.1773, |
| "loss/crossentropy": 2.575707197189331, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1610667109489441, |
| "loss/reg": 6.078143633203581e-05, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 2.2752902507781982, |
| "grad_norm_var": 0.16984605758260846, |
| "learning_rate": 0.0001, |
| "loss": 1.3322, |
| "loss/crossentropy": 2.228628635406494, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.18314987421035767, |
| "loss/reg": 6.077219222788699e-05, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.030875, |
| "grad_norm": 2.1779940128326416, |
| "grad_norm_var": 0.07406002979102144, |
| "learning_rate": 0.0001, |
| "loss": 1.179, |
| "loss/crossentropy": 2.4325718879699707, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.17062756419181824, |
| "loss/reg": 6.076457793824375e-05, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 2.031386613845825, |
| "grad_norm_var": 0.07614130749575872, |
| "learning_rate": 0.0001, |
| "loss": 1.3177, |
| "loss/crossentropy": 2.3050920963287354, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.18426315486431122, |
| "loss/reg": 6.075216515455395e-05, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.031125, |
| "grad_norm": 2.4880683422088623, |
| "grad_norm_var": 0.07117238958467732, |
| "learning_rate": 0.0001, |
| "loss": 1.2617, |
| "loss/crossentropy": 2.690160036087036, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1985635757446289, |
| "loss/reg": 6.0742688219761476e-05, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 2.631229877471924, |
| "grad_norm_var": 0.06453399427719399, |
| "learning_rate": 0.0001, |
| "loss": 1.3072, |
| "loss/crossentropy": 2.4459030628204346, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1971898078918457, |
| "loss/reg": 6.0733007558155805e-05, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.031375, |
| "grad_norm": 2.7028048038482666, |
| "grad_norm_var": 0.06497512863382227, |
| "learning_rate": 0.0001, |
| "loss": 1.3656, |
| "loss/crossentropy": 2.7830824851989746, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.18533006310462952, |
| "loss/reg": 6.0722686612280086e-05, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 3.7025880813598633, |
| "grad_norm_var": 0.17735395269518506, |
| "learning_rate": 0.0001, |
| "loss": 1.2542, |
| "loss/crossentropy": 2.4722542762756348, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.17551761865615845, |
| "loss/reg": 6.0708127421094105e-05, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.031625, |
| "grad_norm": 2.1496498584747314, |
| "grad_norm_var": 0.18175923180052275, |
| "learning_rate": 0.0001, |
| "loss": 1.0403, |
| "loss/crossentropy": 2.4383487701416016, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.12949630618095398, |
| "loss/reg": 6.069323717383668e-05, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 3.212991237640381, |
| "grad_norm_var": 0.18702365671043306, |
| "learning_rate": 0.0001, |
| "loss": 1.3555, |
| "loss/crossentropy": 2.1896352767944336, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.1595323085784912, |
| "loss/reg": 6.067836147849448e-05, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.031875, |
| "grad_norm": 2.53044056892395, |
| "grad_norm_var": 0.18281462084492142, |
| "learning_rate": 0.0001, |
| "loss": 1.2462, |
| "loss/crossentropy": 2.8005239963531494, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18304814398288727, |
| "loss/reg": 6.0668298829114065e-05, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 5.920226573944092, |
| "grad_norm_var": 0.9097630014084027, |
| "learning_rate": 0.0001, |
| "loss": 1.9011, |
| "loss/crossentropy": 2.2827932834625244, |
| "loss/hidden": 1.59375, |
| "loss/logits": 0.3067648708820343, |
| "loss/reg": 6.0657377616735175e-05, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.032125, |
| "grad_norm": 3.144649028778076, |
| "grad_norm_var": 0.8995354429829506, |
| "learning_rate": 0.0001, |
| "loss": 1.2361, |
| "loss/crossentropy": 2.9163215160369873, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.15732741355895996, |
| "loss/reg": 6.064687840989791e-05, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 2.677065849304199, |
| "grad_norm_var": 0.8763431299745091, |
| "learning_rate": 0.0001, |
| "loss": 1.3123, |
| "loss/crossentropy": 2.9036660194396973, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18664765357971191, |
| "loss/reg": 6.0635462432401255e-05, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.032375, |
| "grad_norm": 1.9815617799758911, |
| "grad_norm_var": 0.9180593253885627, |
| "learning_rate": 0.0001, |
| "loss": 1.2567, |
| "loss/crossentropy": 2.6647751331329346, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.18578888475894928, |
| "loss/reg": 6.062128159101121e-05, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 2.6094260215759277, |
| "grad_norm_var": 0.9071755924568459, |
| "learning_rate": 0.0001, |
| "loss": 1.4176, |
| "loss/crossentropy": 2.9915220737457275, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.19824379682540894, |
| "loss/reg": 6.060625673853792e-05, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.032625, |
| "grad_norm": 2.4859585762023926, |
| "grad_norm_var": 0.9028772625757899, |
| "learning_rate": 0.0001, |
| "loss": 1.2047, |
| "loss/crossentropy": 2.325611114501953, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.17281952500343323, |
| "loss/reg": 6.0591693909373134e-05, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 4.910043716430664, |
| "grad_norm_var": 1.154144117287072, |
| "learning_rate": 0.0001, |
| "loss": 1.2858, |
| "loss/crossentropy": 2.568098306655884, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.17582398653030396, |
| "loss/reg": 6.057979408069514e-05, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.032875, |
| "grad_norm": 2.2592694759368896, |
| "grad_norm_var": 1.1460852387432343, |
| "learning_rate": 0.0001, |
| "loss": 1.3156, |
| "loss/crossentropy": 2.5264766216278076, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.19776055216789246, |
| "loss/reg": 6.056776692275889e-05, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 2.6964571475982666, |
| "grad_norm_var": 1.0909556269012999, |
| "learning_rate": 0.0001, |
| "loss": 1.0468, |
| "loss/crossentropy": 2.740647792816162, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.12825211882591248, |
| "loss/reg": 6.0556718381121755e-05, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.033125, |
| "grad_norm": 2.112201690673828, |
| "grad_norm_var": 1.125761935491216, |
| "learning_rate": 0.0001, |
| "loss": 1.2175, |
| "loss/crossentropy": 2.475130081176758, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.1778050661087036, |
| "loss/reg": 6.0543683503055945e-05, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 1.8527328968048096, |
| "grad_norm_var": 1.2001448152569836, |
| "learning_rate": 0.0001, |
| "loss": 1.1913, |
| "loss/crossentropy": 2.2017788887023926, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.16727614402770996, |
| "loss/reg": 6.053145989426412e-05, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.033375, |
| "grad_norm": 2.2294929027557373, |
| "grad_norm_var": 1.2287526925730277, |
| "learning_rate": 0.0001, |
| "loss": 1.3521, |
| "loss/crossentropy": 2.268073558807373, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.18739831447601318, |
| "loss/reg": 6.052442768123001e-05, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 2.185410499572754, |
| "grad_norm_var": 1.2112062552861744, |
| "learning_rate": 0.0001, |
| "loss": 1.44, |
| "loss/crossentropy": 2.390622138977051, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.20500804483890533, |
| "loss/reg": 6.051711898180656e-05, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.033625, |
| "grad_norm": 2.616452693939209, |
| "grad_norm_var": 1.1837342905938153, |
| "learning_rate": 0.0001, |
| "loss": 1.3338, |
| "loss/crossentropy": 2.3374340534210205, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.17693625390529633, |
| "loss/reg": 6.0506343288579956e-05, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 2.5214874744415283, |
| "grad_norm_var": 1.1791403953024882, |
| "learning_rate": 0.0001, |
| "loss": 1.4572, |
| "loss/crossentropy": 2.6334807872772217, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.20655225217342377, |
| "loss/reg": 6.0493421187857166e-05, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.033875, |
| "grad_norm": 2.3426766395568848, |
| "grad_norm_var": 1.18798729537596, |
| "learning_rate": 0.0001, |
| "loss": 1.2858, |
| "loss/crossentropy": 2.362666130065918, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16799038648605347, |
| "loss/reg": 6.047951683285646e-05, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 2.483227491378784, |
| "grad_norm_var": 0.4891016266434789, |
| "learning_rate": 0.0001, |
| "loss": 1.4126, |
| "loss/crossentropy": 2.6330323219299316, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.20882482826709747, |
| "loss/reg": 6.046749331289902e-05, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.034125, |
| "grad_norm": 3.3453869819641113, |
| "grad_norm_var": 0.5070205087741229, |
| "learning_rate": 0.0001, |
| "loss": 1.3731, |
| "loss/crossentropy": 2.6637308597564697, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.20059773325920105, |
| "loss/reg": 6.0458773077698424e-05, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 2.2971482276916504, |
| "grad_norm_var": 0.5112160036914843, |
| "learning_rate": 0.0001, |
| "loss": 1.3516, |
| "loss/crossentropy": 2.400428533554077, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.18688717484474182, |
| "loss/reg": 6.0452930483734235e-05, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.034375, |
| "grad_norm": 11.117164611816406, |
| "grad_norm_var": 5.025199240890341, |
| "learning_rate": 0.0001, |
| "loss": 2.1956, |
| "loss/crossentropy": 2.7653286457061768, |
| "loss/hidden": 1.8984375, |
| "loss/logits": 0.2965186834335327, |
| "loss/reg": 6.045090049155988e-05, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 3.6517550945281982, |
| "grad_norm_var": 5.020888752799834, |
| "learning_rate": 0.0001, |
| "loss": 1.4104, |
| "loss/crossentropy": 2.8897998332977295, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.26139265298843384, |
| "loss/reg": 6.0451366152847186e-05, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.034625, |
| "grad_norm": 2.6342201232910156, |
| "grad_norm_var": 5.008262345647254, |
| "learning_rate": 0.0001, |
| "loss": 1.272, |
| "loss/crossentropy": 2.662801504135132, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.17764705419540405, |
| "loss/reg": 6.0443973779911175e-05, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 2.613866090774536, |
| "grad_norm_var": 4.815302301096653, |
| "learning_rate": 0.0001, |
| "loss": 1.3, |
| "loss/crossentropy": 2.2599401473999023, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.1744215488433838, |
| "loss/reg": 6.04407032369636e-05, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.034875, |
| "grad_norm": 2.4121639728546143, |
| "grad_norm_var": 4.800441045565859, |
| "learning_rate": 0.0001, |
| "loss": 1.2736, |
| "loss/crossentropy": 2.3868885040283203, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.16360533237457275, |
| "loss/reg": 6.0438182117650285e-05, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 2.257427930831909, |
| "grad_norm_var": 4.834324037466968, |
| "learning_rate": 0.0001, |
| "loss": 1.3236, |
| "loss/crossentropy": 2.452359914779663, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.19017404317855835, |
| "loss/reg": 6.043619578122161e-05, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.035125, |
| "grad_norm": 2.3916571140289307, |
| "grad_norm_var": 4.8045581397439525, |
| "learning_rate": 0.0001, |
| "loss": 1.3161, |
| "loss/crossentropy": 2.4834201335906982, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.20611721277236938, |
| "loss/reg": 6.043669054633938e-05, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 2.815398931503296, |
| "grad_norm_var": 4.707581175884913, |
| "learning_rate": 0.0001, |
| "loss": 1.1312, |
| "loss/crossentropy": 3.0801713466644287, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.14229975640773773, |
| "loss/reg": 6.044648034730926e-05, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.035375, |
| "grad_norm": 3.1715469360351562, |
| "grad_norm_var": 4.651233430019207, |
| "learning_rate": 0.0001, |
| "loss": 1.409, |
| "loss/crossentropy": 2.354785919189453, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.21305763721466064, |
| "loss/reg": 6.0437832871684805e-05, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 2.5010037422180176, |
| "grad_norm_var": 4.615667456235268, |
| "learning_rate": 0.0001, |
| "loss": 1.3572, |
| "loss/crossentropy": 2.492047071456909, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.1925477683544159, |
| "loss/reg": 6.044709516572766e-05, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.035625, |
| "grad_norm": 1.964429259300232, |
| "grad_norm_var": 4.6928209367171645, |
| "learning_rate": 0.0001, |
| "loss": 1.1671, |
| "loss/crossentropy": 2.3351125717163086, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1704423427581787, |
| "loss/reg": 6.0453679907368496e-05, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 2.3656678199768066, |
| "grad_norm_var": 4.707552916907375, |
| "learning_rate": 0.0001, |
| "loss": 1.5385, |
| "loss/crossentropy": 2.4216158390045166, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.2566841244697571, |
| "loss/reg": 6.0443537222454324e-05, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.035875, |
| "grad_norm": 3.140928030014038, |
| "grad_norm_var": 4.661686527481659, |
| "learning_rate": 0.0001, |
| "loss": 1.3637, |
| "loss/crossentropy": 2.8347983360290527, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20682096481323242, |
| "loss/reg": 6.043089888407849e-05, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 2.6460797786712646, |
| "grad_norm_var": 4.647830565858565, |
| "learning_rate": 0.0001, |
| "loss": 1.3928, |
| "loss/crossentropy": 2.108215093612671, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.18129181861877441, |
| "loss/reg": 6.042820677976124e-05, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.036125, |
| "grad_norm": 2.879531145095825, |
| "grad_norm_var": 4.652852381956769, |
| "learning_rate": 0.0001, |
| "loss": 1.4359, |
| "loss/crossentropy": 2.90163516998291, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.1853410005569458, |
| "loss/reg": 6.042792301741429e-05, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 2.5701370239257812, |
| "grad_norm_var": 4.625421100051376, |
| "learning_rate": 0.0001, |
| "loss": 1.3639, |
| "loss/crossentropy": 2.6896326541900635, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.2070741057395935, |
| "loss/reg": 6.0414979088818654e-05, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.036375, |
| "grad_norm": 2.988196849822998, |
| "grad_norm_var": 0.16977142791367086, |
| "learning_rate": 0.0001, |
| "loss": 1.103, |
| "loss/crossentropy": 2.8485705852508545, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13362044095993042, |
| "loss/reg": 6.0413527535274625e-05, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 5.9153923988342285, |
| "grad_norm_var": 0.7809789933836029, |
| "learning_rate": 0.0001, |
| "loss": 1.6292, |
| "loss/crossentropy": 2.607590436935425, |
| "loss/hidden": 1.4375, |
| "loss/logits": 0.19109681248664856, |
| "loss/reg": 6.041422238922678e-05, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.036625, |
| "grad_norm": 1.932381510734558, |
| "grad_norm_var": 0.8300136192923785, |
| "learning_rate": 0.0001, |
| "loss": 1.1314, |
| "loss/crossentropy": 2.2319207191467285, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.13856041431427002, |
| "loss/reg": 6.041810775059275e-05, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 2.1218042373657227, |
| "grad_norm_var": 0.8563980373093443, |
| "learning_rate": 0.0001, |
| "loss": 1.1898, |
| "loss/crossentropy": 2.7033910751342773, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15791726112365723, |
| "loss/reg": 6.0404745454434305e-05, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.036875, |
| "grad_norm": 3.239748954772949, |
| "grad_norm_var": 0.8614170936653748, |
| "learning_rate": 0.0001, |
| "loss": 1.6186, |
| "loss/crossentropy": 2.3478281497955322, |
| "loss/hidden": 1.3671875, |
| "loss/logits": 0.2507687509059906, |
| "loss/reg": 6.039286745362915e-05, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 2.361431121826172, |
| "grad_norm_var": 0.8544814148079373, |
| "learning_rate": 0.0001, |
| "loss": 1.2822, |
| "loss/crossentropy": 2.4396111965179443, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.19569119811058044, |
| "loss/reg": 6.0390335420379415e-05, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.037125, |
| "grad_norm": 2.6921112537384033, |
| "grad_norm_var": 0.8432509023111928, |
| "learning_rate": 0.0001, |
| "loss": 1.3584, |
| "loss/crossentropy": 2.3235762119293213, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20157676935195923, |
| "loss/reg": 6.037576531525701e-05, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 2.2376601696014404, |
| "grad_norm_var": 0.8653611900667765, |
| "learning_rate": 0.0001, |
| "loss": 1.3703, |
| "loss/crossentropy": 2.441978693008423, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.1821848303079605, |
| "loss/reg": 6.036146805854514e-05, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.037375, |
| "grad_norm": 2.5022082328796387, |
| "grad_norm_var": 0.8598019948407729, |
| "learning_rate": 0.0001, |
| "loss": 1.2909, |
| "loss/crossentropy": 2.4099972248077393, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1965959370136261, |
| "loss/reg": 6.035445403540507e-05, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 2.323599338531494, |
| "grad_norm_var": 0.8677455500426021, |
| "learning_rate": 0.0001, |
| "loss": 1.2301, |
| "loss/crossentropy": 2.714334011077881, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16703477501869202, |
| "loss/reg": 6.034153193468228e-05, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.037625, |
| "grad_norm": 2.902794361114502, |
| "grad_norm_var": 0.8254198045813945, |
| "learning_rate": 0.0001, |
| "loss": 1.287, |
| "loss/crossentropy": 2.5897319316864014, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1692187488079071, |
| "loss/reg": 6.032464443705976e-05, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 2.455423355102539, |
| "grad_norm_var": 0.8207107650276014, |
| "learning_rate": 0.0001, |
| "loss": 1.3118, |
| "loss/crossentropy": 2.2553625106811523, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.15494795143604279, |
| "loss/reg": 6.031416342011653e-05, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.037875, |
| "grad_norm": 2.70770001411438, |
| "grad_norm_var": 0.8131429553718594, |
| "learning_rate": 0.0001, |
| "loss": 1.3645, |
| "loss/crossentropy": 2.298628807067871, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.17642799019813538, |
| "loss/reg": 6.029937867424451e-05, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 2.4096872806549072, |
| "grad_norm_var": 0.8208490888498592, |
| "learning_rate": 0.0001, |
| "loss": 1.2573, |
| "loss/crossentropy": 2.6787161827087402, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.17861339449882507, |
| "loss/reg": 6.027881318004802e-05, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.038125, |
| "grad_norm": 2.364800214767456, |
| "grad_norm_var": 0.8295471446711137, |
| "learning_rate": 0.0001, |
| "loss": 1.3251, |
| "loss/crossentropy": 2.351970911026001, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18391045928001404, |
| "loss/reg": 6.026409027981572e-05, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 2.0991923809051514, |
| "grad_norm_var": 0.8536240669336511, |
| "learning_rate": 0.0001, |
| "loss": 1.078, |
| "loss/crossentropy": 2.7187068462371826, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.13205038011074066, |
| "loss/reg": 6.0248257796047255e-05, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.038375, |
| "grad_norm": 2.7471582889556885, |
| "grad_norm_var": 0.8481018158238611, |
| "learning_rate": 0.0001, |
| "loss": 1.4035, |
| "loss/crossentropy": 2.1265523433685303, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.18416792154312134, |
| "loss/reg": 6.0230733652133495e-05, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 2.2592687606811523, |
| "grad_norm_var": 0.11041007633642194, |
| "learning_rate": 0.0001, |
| "loss": 1.271, |
| "loss/crossentropy": 2.66719651222229, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.184452086687088, |
| "loss/reg": 6.0217109421500936e-05, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.038625, |
| "grad_norm": 2.2400615215301514, |
| "grad_norm_var": 0.09468951175299385, |
| "learning_rate": 0.0001, |
| "loss": 1.2348, |
| "loss/crossentropy": 2.3710193634033203, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1717246174812317, |
| "loss/reg": 6.020214277668856e-05, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 2.0783209800720215, |
| "grad_norm_var": 0.09687885973874776, |
| "learning_rate": 0.0001, |
| "loss": 1.2085, |
| "loss/crossentropy": 2.2699692249298096, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.17665645480155945, |
| "loss/reg": 6.018438944010995e-05, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.038875, |
| "grad_norm": 2.077648162841797, |
| "grad_norm_var": 0.06299334570375853, |
| "learning_rate": 0.0001, |
| "loss": 1.2169, |
| "loss/crossentropy": 2.334127426147461, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.15378312766551971, |
| "loss/reg": 6.0161146393511444e-05, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 2.440629482269287, |
| "grad_norm_var": 0.06293910816862744, |
| "learning_rate": 0.0001, |
| "loss": 1.2956, |
| "loss/crossentropy": 2.791874408721924, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1777758002281189, |
| "loss/reg": 6.014638711349107e-05, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.039125, |
| "grad_norm": 2.853940963745117, |
| "grad_norm_var": 0.07069242228717272, |
| "learning_rate": 0.0001, |
| "loss": 1.2688, |
| "loss/crossentropy": 2.5036516189575195, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.18226328492164612, |
| "loss/reg": 6.013087840983644e-05, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 3.287529230117798, |
| "grad_norm_var": 0.11423125477930943, |
| "learning_rate": 0.0001, |
| "loss": 1.2435, |
| "loss/crossentropy": 2.696265697479248, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.17254707217216492, |
| "loss/reg": 6.011854929965921e-05, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.039375, |
| "grad_norm": 3.1080963611602783, |
| "grad_norm_var": 0.1386158794861321, |
| "learning_rate": 0.0001, |
| "loss": 1.473, |
| "loss/crossentropy": 2.1882760524749756, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.2224160134792328, |
| "loss/reg": 6.0103353462181985e-05, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 2.7303977012634277, |
| "grad_norm_var": 0.13818442385569654, |
| "learning_rate": 0.0001, |
| "loss": 1.4029, |
| "loss/crossentropy": 2.361660957336426, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.19139324128627777, |
| "loss/reg": 6.008424679748714e-05, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.039625, |
| "grad_norm": 1.7651097774505615, |
| "grad_norm_var": 0.16520987140884788, |
| "learning_rate": 0.0001, |
| "loss": 1.0765, |
| "loss/crossentropy": 2.435858964920044, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.1227254569530487, |
| "loss/reg": 6.007165211485699e-05, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 2.128772258758545, |
| "grad_norm_var": 0.17279926669385734, |
| "learning_rate": 0.0001, |
| "loss": 1.1848, |
| "loss/crossentropy": 2.334495782852173, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.12953956425189972, |
| "loss/reg": 6.005321120028384e-05, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.039875, |
| "grad_norm": 2.1308538913726807, |
| "grad_norm_var": 0.1742483958439737, |
| "learning_rate": 0.0001, |
| "loss": 1.3191, |
| "loss/crossentropy": 2.3873021602630615, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.19348952174186707, |
| "loss/reg": 6.0041034885216504e-05, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 2.706742286682129, |
| "grad_norm_var": 0.17935140917835876, |
| "learning_rate": 0.0001, |
| "loss": 1.4123, |
| "loss/crossentropy": 2.5321033000946045, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.20852993428707123, |
| "loss/reg": 6.002993177389726e-05, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.040125, |
| "grad_norm": 6.118154525756836, |
| "grad_norm_var": 1.0228689502418715, |
| "learning_rate": 0.0001, |
| "loss": 1.7298, |
| "loss/crossentropy": 2.457045316696167, |
| "loss/hidden": 1.515625, |
| "loss/logits": 0.2136228382587433, |
| "loss/reg": 6.001694418955594e-05, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 3.091947317123413, |
| "grad_norm_var": 1.0084811477178388, |
| "learning_rate": 0.0001, |
| "loss": 1.6635, |
| "loss/crossentropy": 2.6943020820617676, |
| "loss/hidden": 1.40625, |
| "loss/logits": 0.25662127137184143, |
| "loss/reg": 6.0004946135450155e-05, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.040375, |
| "grad_norm": 2.488391637802124, |
| "grad_norm_var": 1.0122566583255546, |
| "learning_rate": 0.0001, |
| "loss": 1.2065, |
| "loss/crossentropy": 2.646897792816162, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.15123483538627625, |
| "loss/reg": 5.9991711168549955e-05, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 3.0675456523895264, |
| "grad_norm_var": 1.0035307165437406, |
| "learning_rate": 0.0001, |
| "loss": 1.4832, |
| "loss/crossentropy": 2.4176406860351562, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.24040505290031433, |
| "loss/reg": 5.9981128288200125e-05, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.040625, |
| "grad_norm": 2.424546957015991, |
| "grad_norm_var": 0.9926314451715664, |
| "learning_rate": 0.0001, |
| "loss": 1.07, |
| "loss/crossentropy": 2.703134059906006, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.12803316116333008, |
| "loss/reg": 5.997138941893354e-05, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 2.9345507621765137, |
| "grad_norm_var": 0.9582126623175621, |
| "learning_rate": 0.0001, |
| "loss": 1.4247, |
| "loss/crossentropy": 2.8940789699554443, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.20539763569831848, |
| "loss/reg": 5.996019172016531e-05, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.040875, |
| "grad_norm": 3.069572925567627, |
| "grad_norm_var": 0.9195850402401864, |
| "learning_rate": 0.0001, |
| "loss": 1.3896, |
| "loss/crossentropy": 2.4416871070861816, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20154833793640137, |
| "loss/reg": 5.9947429690510035e-05, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 2.323606491088867, |
| "grad_norm_var": 0.9275566292830253, |
| "learning_rate": 0.0001, |
| "loss": 1.2888, |
| "loss/crossentropy": 2.811528444290161, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.18662354350090027, |
| "loss/reg": 5.9936231991741806e-05, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.041125, |
| "grad_norm": 3.1679723262786865, |
| "grad_norm_var": 0.9322370885273564, |
| "learning_rate": 0.0001, |
| "loss": 1.5559, |
| "loss/crossentropy": 2.3170981407165527, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.2506353557109833, |
| "loss/reg": 5.991987563902512e-05, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 2.7683303356170654, |
| "grad_norm_var": 0.9228798875820224, |
| "learning_rate": 0.0001, |
| "loss": 1.3127, |
| "loss/crossentropy": 2.51680850982666, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1949077993631363, |
| "loss/reg": 5.990756835672073e-05, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.041375, |
| "grad_norm": 2.4825031757354736, |
| "grad_norm_var": 0.9280253827718864, |
| "learning_rate": 0.0001, |
| "loss": 1.3408, |
| "loss/crossentropy": 2.605055332183838, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19955970346927643, |
| "loss/reg": 5.989522469462827e-05, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 3.2399041652679443, |
| "grad_norm_var": 0.9369785308922095, |
| "learning_rate": 0.0001, |
| "loss": 1.5753, |
| "loss/crossentropy": 2.7269279956817627, |
| "loss/hidden": 1.3515625, |
| "loss/logits": 0.22315430641174316, |
| "loss/reg": 5.988113844068721e-05, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.041625, |
| "grad_norm": 2.8936927318573, |
| "grad_norm_var": 0.8504314928241191, |
| "learning_rate": 0.0001, |
| "loss": 1.3222, |
| "loss/crossentropy": 2.812412738800049, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1809367835521698, |
| "loss/reg": 5.9867059462703764e-05, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 2.432213068008423, |
| "grad_norm_var": 0.8233723477256942, |
| "learning_rate": 0.0001, |
| "loss": 1.4094, |
| "loss/crossentropy": 2.6377694606781006, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.20563456416130066, |
| "loss/reg": 5.9853711718460545e-05, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.041875, |
| "grad_norm": 2.422299861907959, |
| "grad_norm_var": 0.7965082638815336, |
| "learning_rate": 0.0001, |
| "loss": 1.2328, |
| "loss/crossentropy": 2.5352189540863037, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.15405428409576416, |
| "loss/reg": 5.984482049825601e-05, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 2.703420877456665, |
| "grad_norm_var": 0.7966286375145801, |
| "learning_rate": 0.0001, |
| "loss": 1.2981, |
| "loss/crossentropy": 2.525949716567993, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.1959662139415741, |
| "loss/reg": 5.983649680274539e-05, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.042125, |
| "grad_norm": 3.625760078430176, |
| "grad_norm_var": 0.14094485019601447, |
| "learning_rate": 0.0001, |
| "loss": 1.6517, |
| "loss/crossentropy": 1.9824917316436768, |
| "loss/hidden": 1.3828125, |
| "loss/logits": 0.2682979702949524, |
| "loss/reg": 5.9825455537065864e-05, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 2.2066762447357178, |
| "grad_norm_var": 0.1579467344221198, |
| "learning_rate": 0.0001, |
| "loss": 1.1768, |
| "loss/crossentropy": 2.5151102542877197, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.16843904554843903, |
| "loss/reg": 5.981199865345843e-05, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.042375, |
| "grad_norm": 2.961968421936035, |
| "grad_norm_var": 0.15445451920782696, |
| "learning_rate": 0.0001, |
| "loss": 1.5446, |
| "loss/crossentropy": 2.397102117538452, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.23934724926948547, |
| "loss/reg": 5.979971319902688e-05, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 2.4696779251098633, |
| "grad_norm_var": 0.15509145555751214, |
| "learning_rate": 0.0001, |
| "loss": 1.2907, |
| "loss/crossentropy": 2.518648624420166, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.17289261519908905, |
| "loss/reg": 5.979237175779417e-05, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.042625, |
| "grad_norm": 2.2886741161346436, |
| "grad_norm_var": 0.16228478040589658, |
| "learning_rate": 0.0001, |
| "loss": 1.2915, |
| "loss/crossentropy": 2.4755570888519287, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18152545392513275, |
| "loss/reg": 5.978640547255054e-05, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 2.4154622554779053, |
| "grad_norm_var": 0.16631279956205466, |
| "learning_rate": 0.0001, |
| "loss": 1.1361, |
| "loss/crossentropy": 2.620903730392456, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.1432739496231079, |
| "loss/reg": 5.977362161502242e-05, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.042875, |
| "grad_norm": 3.9107778072357178, |
| "grad_norm_var": 0.25008606934497735, |
| "learning_rate": 0.0001, |
| "loss": 1.6206, |
| "loss/crossentropy": 3.3820858001708984, |
| "loss/hidden": 1.40625, |
| "loss/logits": 0.21375682950019836, |
| "loss/reg": 5.976331885904074e-05, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 2.2201833724975586, |
| "grad_norm_var": 0.25690416036597197, |
| "learning_rate": 0.0001, |
| "loss": 1.2446, |
| "loss/crossentropy": 2.467216730117798, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18146604299545288, |
| "loss/reg": 5.975304884486832e-05, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.043125, |
| "grad_norm": 2.1915907859802246, |
| "grad_norm_var": 0.26377805805320803, |
| "learning_rate": 0.0001, |
| "loss": 1.4337, |
| "loss/crossentropy": 2.3638522624969482, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.22996577620506287, |
| "loss/reg": 5.974585292278789e-05, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 2.2508416175842285, |
| "grad_norm_var": 0.27594342104869135, |
| "learning_rate": 0.0001, |
| "loss": 1.1804, |
| "loss/crossentropy": 2.5332260131835938, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15640094876289368, |
| "loss/reg": 5.973771112621762e-05, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.043375, |
| "grad_norm": 2.0090150833129883, |
| "grad_norm_var": 0.30177518099136, |
| "learning_rate": 0.0001, |
| "loss": 1.2994, |
| "loss/crossentropy": 2.511950731277466, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.17384442687034607, |
| "loss/reg": 5.9728798078140244e-05, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 2.7306134700775146, |
| "grad_norm_var": 0.277258656834267, |
| "learning_rate": 0.0001, |
| "loss": 1.4199, |
| "loss/crossentropy": 2.6389760971069336, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.2318291962146759, |
| "loss/reg": 5.9719615819631144e-05, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.043625, |
| "grad_norm": 2.270148515701294, |
| "grad_norm_var": 0.27783213891549774, |
| "learning_rate": 0.0001, |
| "loss": 1.5484, |
| "loss/crossentropy": 2.312831163406372, |
| "loss/hidden": 1.2890625, |
| "loss/logits": 0.2587454915046692, |
| "loss/reg": 5.970869824523106e-05, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 2.0988070964813232, |
| "grad_norm_var": 0.2908751450016543, |
| "learning_rate": 0.0001, |
| "loss": 1.2681, |
| "loss/crossentropy": 2.378908634185791, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1581004559993744, |
| "loss/reg": 5.970033089397475e-05, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.043875, |
| "grad_norm": 2.045546770095825, |
| "grad_norm_var": 0.30608582246859417, |
| "learning_rate": 0.0001, |
| "loss": 1.1063, |
| "loss/crossentropy": 2.4011952877044678, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13691341876983643, |
| "loss/reg": 5.969877020106651e-05, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 2.9582409858703613, |
| "grad_norm_var": 0.316207957574548, |
| "learning_rate": 0.0001, |
| "loss": 1.2072, |
| "loss/crossentropy": 2.643101215362549, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.15975871682167053, |
| "loss/reg": 5.9694295487133786e-05, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.044125, |
| "grad_norm": 2.125020742416382, |
| "grad_norm_var": 0.23988746234485703, |
| "learning_rate": 0.0001, |
| "loss": 1.2268, |
| "loss/crossentropy": 2.5923550128936768, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.18714120984077454, |
| "loss/reg": 5.968381810816936e-05, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 2.2348685264587402, |
| "grad_norm_var": 0.2390334750954897, |
| "learning_rate": 0.0001, |
| "loss": 1.3948, |
| "loss/crossentropy": 2.549100637435913, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.198894202709198, |
| "loss/reg": 5.9669990150723606e-05, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.044375, |
| "grad_norm": 2.6807351112365723, |
| "grad_norm_var": 0.2247355561703434, |
| "learning_rate": 0.0001, |
| "loss": 1.5721, |
| "loss/crossentropy": 2.2256884574890137, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.26683151721954346, |
| "loss/reg": 5.966486787656322e-05, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 3.1524059772491455, |
| "grad_norm_var": 0.2573648537337417, |
| "learning_rate": 0.0001, |
| "loss": 1.5458, |
| "loss/crossentropy": 2.4026124477386475, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.22484509646892548, |
| "loss/reg": 5.965128730167635e-05, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.044625, |
| "grad_norm": 3.806107759475708, |
| "grad_norm_var": 0.3637951956534662, |
| "learning_rate": 0.0001, |
| "loss": 1.2257, |
| "loss/crossentropy": 2.534790277481079, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.12353114783763885, |
| "loss/reg": 5.9637932281475514e-05, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 2.6499619483947754, |
| "grad_norm_var": 0.36243857175732047, |
| "learning_rate": 0.0001, |
| "loss": 1.2577, |
| "loss/crossentropy": 2.786536931991577, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.17896610498428345, |
| "loss/reg": 5.962959403404966e-05, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.044875, |
| "grad_norm": 2.750371217727661, |
| "grad_norm_var": 0.24122897908522703, |
| "learning_rate": 0.0001, |
| "loss": 1.3213, |
| "loss/crossentropy": 2.5112698078155518, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.19569161534309387, |
| "loss/reg": 5.961711940472014e-05, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 2.4145219326019287, |
| "grad_norm_var": 0.23605635737508593, |
| "learning_rate": 0.0001, |
| "loss": 1.4067, |
| "loss/crossentropy": 2.4327914714813232, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.23425719141960144, |
| "loss/reg": 5.960506314295344e-05, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.045125, |
| "grad_norm": 2.7820589542388916, |
| "grad_norm_var": 0.2317516785903725, |
| "learning_rate": 0.0001, |
| "loss": 1.5833, |
| "loss/crossentropy": 2.6201419830322266, |
| "loss/hidden": 1.3359375, |
| "loss/logits": 0.2468121349811554, |
| "loss/reg": 5.959635382168926e-05, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 3.0179331302642822, |
| "grad_norm_var": 0.23691283979908515, |
| "learning_rate": 0.0001, |
| "loss": 1.3921, |
| "loss/crossentropy": 2.728665351867676, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.19617268443107605, |
| "loss/reg": 5.958346446277574e-05, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.045375, |
| "grad_norm": 2.577760934829712, |
| "grad_norm_var": 0.21171492452191767, |
| "learning_rate": 0.0001, |
| "loss": 1.3343, |
| "loss/crossentropy": 2.4396915435791016, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19306717813014984, |
| "loss/reg": 5.957194298389368e-05, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 2.2478973865509033, |
| "grad_norm_var": 0.22066793284785244, |
| "learning_rate": 0.0001, |
| "loss": 1.2107, |
| "loss/crossentropy": 2.5703125, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.16320618987083435, |
| "loss/reg": 5.955886445008218e-05, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.045625, |
| "grad_norm": 2.8303184509277344, |
| "grad_norm_var": 0.21465200545435412, |
| "learning_rate": 0.0001, |
| "loss": 1.3184, |
| "loss/crossentropy": 2.5793418884277344, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.20061752200126648, |
| "loss/reg": 5.954650623607449e-05, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 2.3407225608825684, |
| "grad_norm_var": 0.20058607793752117, |
| "learning_rate": 0.0001, |
| "loss": 1.2154, |
| "loss/crossentropy": 2.5118396282196045, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16011780500411987, |
| "loss/reg": 5.9531517763389274e-05, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.045875, |
| "grad_norm": 2.9164462089538574, |
| "grad_norm_var": 0.17624459628143327, |
| "learning_rate": 0.0001, |
| "loss": 2.3079, |
| "loss/crossentropy": 2.530949831008911, |
| "loss/hidden": 1.7890625, |
| "loss/logits": 0.5182523727416992, |
| "loss/reg": 5.9519883507164195e-05, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 2.6031134128570557, |
| "grad_norm_var": 0.17274354994838556, |
| "learning_rate": 0.0001, |
| "loss": 1.3529, |
| "loss/crossentropy": 2.5211331844329834, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.21172133088111877, |
| "loss/reg": 5.950441482127644e-05, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.046125, |
| "grad_norm": 2.2432241439819336, |
| "grad_norm_var": 0.16462358021652007, |
| "learning_rate": 0.0001, |
| "loss": 1.2433, |
| "loss/crossentropy": 2.469212055206299, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18018998205661774, |
| "loss/reg": 5.9490499552339315e-05, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 3.287365674972534, |
| "grad_norm_var": 0.16815977224474163, |
| "learning_rate": 0.0001, |
| "loss": 1.3311, |
| "loss/crossentropy": 2.7330899238586426, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18986304104328156, |
| "loss/reg": 5.9471924032550305e-05, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.046375, |
| "grad_norm": 2.6555063724517822, |
| "grad_norm_var": 0.16849581874392333, |
| "learning_rate": 0.0001, |
| "loss": 1.3069, |
| "loss/crossentropy": 2.4908649921417236, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1890988051891327, |
| "loss/reg": 5.9457710449351e-05, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 2.2832915782928467, |
| "grad_norm_var": 0.1710711381355336, |
| "learning_rate": 0.0001, |
| "loss": 1.251, |
| "loss/crossentropy": 2.6485414505004883, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16444087028503418, |
| "loss/reg": 5.9437123127281666e-05, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.046625, |
| "grad_norm": 1.9312299489974976, |
| "grad_norm_var": 0.11748808484953574, |
| "learning_rate": 0.0001, |
| "loss": 1.3104, |
| "loss/crossentropy": 2.4345285892486572, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.1848057061433792, |
| "loss/reg": 5.941649214946665e-05, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 2.2687668800354004, |
| "grad_norm_var": 0.12381368567199799, |
| "learning_rate": 0.0001, |
| "loss": 1.2697, |
| "loss/crossentropy": 2.514896869659424, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.18321493268013, |
| "loss/reg": 5.939120819675736e-05, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.046875, |
| "grad_norm": 2.1616384983062744, |
| "grad_norm_var": 0.131467626574521, |
| "learning_rate": 0.0001, |
| "loss": 1.2405, |
| "loss/crossentropy": 2.698112964630127, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1773754358291626, |
| "loss/reg": 5.936667002970353e-05, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 2.6922011375427246, |
| "grad_norm_var": 0.13182201209023336, |
| "learning_rate": 0.0001, |
| "loss": 1.3426, |
| "loss/crossentropy": 2.538865327835083, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1857489049434662, |
| "loss/reg": 5.9345431509427726e-05, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.047125, |
| "grad_norm": 2.2630982398986816, |
| "grad_norm_var": 0.13276797957838743, |
| "learning_rate": 0.0001, |
| "loss": 1.2869, |
| "loss/crossentropy": 2.4644358158111572, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.17694343626499176, |
| "loss/reg": 5.933275315328501e-05, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 2.479646682739258, |
| "grad_norm_var": 0.11514238570119009, |
| "learning_rate": 0.0001, |
| "loss": 1.1618, |
| "loss/crossentropy": 2.5582141876220703, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.14554372429847717, |
| "loss/reg": 5.931046689511277e-05, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.047375, |
| "grad_norm": 2.466947317123413, |
| "grad_norm_var": 0.114559834161389, |
| "learning_rate": 0.0001, |
| "loss": 1.48, |
| "loss/crossentropy": 2.4128925800323486, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.23724211752414703, |
| "loss/reg": 5.929026156081818e-05, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 2.538424015045166, |
| "grad_norm_var": 0.11086504579424972, |
| "learning_rate": 0.0001, |
| "loss": 1.4136, |
| "loss/crossentropy": 2.0768887996673584, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.17867109179496765, |
| "loss/reg": 5.92764736211393e-05, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.047625, |
| "grad_norm": 2.654524564743042, |
| "grad_norm_var": 0.1049983644074643, |
| "learning_rate": 0.0001, |
| "loss": 1.3221, |
| "loss/crossentropy": 2.1216413974761963, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.16521546244621277, |
| "loss/reg": 5.926107769482769e-05, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 2.237818717956543, |
| "grad_norm_var": 0.10766217194697697, |
| "learning_rate": 0.0001, |
| "loss": 1.2236, |
| "loss/crossentropy": 2.6475207805633545, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16833502054214478, |
| "loss/reg": 5.924178913119249e-05, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.047875, |
| "grad_norm": 2.7116799354553223, |
| "grad_norm_var": 0.09837235459497572, |
| "learning_rate": 0.0001, |
| "loss": 1.3102, |
| "loss/crossentropy": 2.6615209579467773, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1924624741077423, |
| "loss/reg": 5.921960837440565e-05, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 2.5439391136169434, |
| "grad_norm_var": 0.09752047633307553, |
| "learning_rate": 0.0001, |
| "loss": 1.3258, |
| "loss/crossentropy": 2.10198974609375, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1689702719449997, |
| "loss/reg": 5.919525210629217e-05, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.048125, |
| "grad_norm": 2.617921829223633, |
| "grad_norm_var": 0.09528014676361156, |
| "learning_rate": 0.0001, |
| "loss": 1.61, |
| "loss/crossentropy": 2.445833206176758, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.28133296966552734, |
| "loss/reg": 5.917950693401508e-05, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 2.514899730682373, |
| "grad_norm_var": 0.05015297139615639, |
| "learning_rate": 0.0001, |
| "loss": 1.1964, |
| "loss/crossentropy": 2.4887778759002686, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.1567072868347168, |
| "loss/reg": 5.916162990615703e-05, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.048375, |
| "grad_norm": 2.1075565814971924, |
| "grad_norm_var": 0.053089324895933446, |
| "learning_rate": 0.0001, |
| "loss": 1.0537, |
| "loss/crossentropy": 2.4045815467834473, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.1312153935432434, |
| "loss/reg": 5.914089342695661e-05, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 2.475404739379883, |
| "grad_norm_var": 0.05228874002812057, |
| "learning_rate": 0.0001, |
| "loss": 1.3003, |
| "loss/crossentropy": 2.591153383255005, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.19037862122058868, |
| "loss/reg": 5.9116682677995414e-05, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.048625, |
| "grad_norm": 4.638079643249512, |
| "grad_norm_var": 0.33504973194641535, |
| "learning_rate": 0.0001, |
| "loss": 1.7407, |
| "loss/crossentropy": 2.992236852645874, |
| "loss/hidden": 1.4609375, |
| "loss/logits": 0.2792096734046936, |
| "loss/reg": 5.9097284974996e-05, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 2.4662392139434814, |
| "grad_norm_var": 0.32913998556907487, |
| "learning_rate": 0.0001, |
| "loss": 1.1454, |
| "loss/crossentropy": 2.9239540100097656, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.15260137617588043, |
| "loss/reg": 5.907983722863719e-05, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.048875, |
| "grad_norm": 2.439119338989258, |
| "grad_norm_var": 0.31780327994806234, |
| "learning_rate": 0.0001, |
| "loss": 1.3638, |
| "loss/crossentropy": 2.450254440307617, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.19915927946567535, |
| "loss/reg": 5.9063841035822406e-05, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 2.3475067615509033, |
| "grad_norm_var": 0.3217026075593497, |
| "learning_rate": 0.0001, |
| "loss": 1.533, |
| "loss/crossentropy": 2.617830753326416, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.26678475737571716, |
| "loss/reg": 5.90429590374697e-05, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.049125, |
| "grad_norm": 4.364901065826416, |
| "grad_norm_var": 0.5050899240629005, |
| "learning_rate": 0.0001, |
| "loss": 1.4632, |
| "loss/crossentropy": 2.4607560634613037, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.22039487957954407, |
| "loss/reg": 5.902666089241393e-05, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 2.338758707046509, |
| "grad_norm_var": 0.5109449021123245, |
| "learning_rate": 0.0001, |
| "loss": 1.2995, |
| "loss/crossentropy": 2.6618576049804688, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1817541867494583, |
| "loss/reg": 5.9010566474171355e-05, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.049375, |
| "grad_norm": 3.5642833709716797, |
| "grad_norm_var": 0.549694181009107, |
| "learning_rate": 0.0001, |
| "loss": 1.3152, |
| "loss/crossentropy": 2.300379753112793, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1974020004272461, |
| "loss/reg": 5.8987676311517134e-05, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 2.1328978538513184, |
| "grad_norm_var": 0.573308372527261, |
| "learning_rate": 0.0001, |
| "loss": 1.244, |
| "loss/crossentropy": 2.4386301040649414, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18090221285820007, |
| "loss/reg": 5.8964946219930425e-05, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.049625, |
| "grad_norm": 3.0894107818603516, |
| "grad_norm_var": 0.5790289690992334, |
| "learning_rate": 0.0001, |
| "loss": 1.5661, |
| "loss/crossentropy": 2.365107297897339, |
| "loss/hidden": 1.3671875, |
| "loss/logits": 0.1983477920293808, |
| "loss/reg": 5.8950212405761704e-05, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 3.194427967071533, |
| "grad_norm_var": 0.566188494588774, |
| "learning_rate": 0.0001, |
| "loss": 1.4269, |
| "loss/crossentropy": 2.384216547012329, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.20751546323299408, |
| "loss/reg": 5.8928319049300626e-05, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.049875, |
| "grad_norm": 2.5108933448791504, |
| "grad_norm_var": 0.5723226037333423, |
| "learning_rate": 0.0001, |
| "loss": 1.2127, |
| "loss/crossentropy": 2.5466771125793457, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1574660688638687, |
| "loss/reg": 5.891324326512404e-05, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 2.9769773483276367, |
| "grad_norm_var": 0.5672869916808385, |
| "learning_rate": 0.0001, |
| "loss": 1.3045, |
| "loss/crossentropy": 2.7223000526428223, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.18677057325839996, |
| "loss/reg": 5.889027670491487e-05, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.050125, |
| "grad_norm": 3.31915283203125, |
| "grad_norm_var": 0.5752734489563172, |
| "learning_rate": 0.0001, |
| "loss": 1.2639, |
| "loss/crossentropy": 2.4886364936828613, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.1851940155029297, |
| "loss/reg": 5.887265797355212e-05, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 1.8946937322616577, |
| "grad_norm_var": 0.6315760522485537, |
| "learning_rate": 0.0001, |
| "loss": 1.2326, |
| "loss/crossentropy": 2.414213180541992, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.16165336966514587, |
| "loss/reg": 5.885552673134953e-05, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.050375, |
| "grad_norm": 2.5370404720306396, |
| "grad_norm_var": 0.5996572790739425, |
| "learning_rate": 0.0001, |
| "loss": 1.5079, |
| "loss/crossentropy": 2.3421835899353027, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.22602099180221558, |
| "loss/reg": 5.88419679843355e-05, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 2.4215445518493652, |
| "grad_norm_var": 0.6028382899137373, |
| "learning_rate": 0.0001, |
| "loss": 1.4975, |
| "loss/crossentropy": 2.7361152172088623, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.25468122959136963, |
| "loss/reg": 5.88247858104296e-05, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.050625, |
| "grad_norm": 2.049978733062744, |
| "grad_norm_var": 0.4181645547932513, |
| "learning_rate": 0.0001, |
| "loss": 1.1088, |
| "loss/crossentropy": 2.350353717803955, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.15509989857673645, |
| "loss/reg": 5.880888784304261e-05, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 2.7967936992645264, |
| "grad_norm_var": 0.41345734870287976, |
| "learning_rate": 0.0001, |
| "loss": 1.3869, |
| "loss/crossentropy": 2.5875766277313232, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.21445012092590332, |
| "loss/reg": 5.8793633797904477e-05, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.050875, |
| "grad_norm": 2.169900894165039, |
| "grad_norm_var": 0.429098064205416, |
| "learning_rate": 0.0001, |
| "loss": 1.0776, |
| "loss/crossentropy": 2.398125410079956, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.14346018433570862, |
| "loss/reg": 5.877741932636127e-05, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 2.5045695304870605, |
| "grad_norm_var": 0.4225916301522199, |
| "learning_rate": 0.0001, |
| "loss": 1.5355, |
| "loss/crossentropy": 2.1697590351104736, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.20677754282951355, |
| "loss/reg": 5.876670911675319e-05, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.051125, |
| "grad_norm": 18.23008918762207, |
| "grad_norm_var": 15.43871781968465, |
| "learning_rate": 0.0001, |
| "loss": 1.4882, |
| "loss/crossentropy": 2.602886438369751, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.18289120495319366, |
| "loss/reg": 5.874884300283156e-05, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 2.8436660766601562, |
| "grad_norm_var": 15.369190103974788, |
| "learning_rate": 0.0001, |
| "loss": 1.3294, |
| "loss/crossentropy": 2.4684174060821533, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18818634748458862, |
| "loss/reg": 5.873553891433403e-05, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.051375, |
| "grad_norm": 2.2729334831237793, |
| "grad_norm_var": 15.486411427985377, |
| "learning_rate": 0.0001, |
| "loss": 1.2331, |
| "loss/crossentropy": 2.550140857696533, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.19348369538784027, |
| "loss/reg": 5.8720732340589166e-05, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 2.5612359046936035, |
| "grad_norm_var": 15.416427881560285, |
| "learning_rate": 0.0001, |
| "loss": 1.3333, |
| "loss/crossentropy": 2.4774818420410156, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.2077203392982483, |
| "loss/reg": 5.8710702433018014e-05, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.051625, |
| "grad_norm": 4.02579927444458, |
| "grad_norm_var": 15.409250289477422, |
| "learning_rate": 0.0001, |
| "loss": 1.507, |
| "loss/crossentropy": 2.555722713470459, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.20171231031417847, |
| "loss/reg": 5.870195309398696e-05, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 2.443574905395508, |
| "grad_norm_var": 15.489530544756628, |
| "learning_rate": 0.0001, |
| "loss": 1.2774, |
| "loss/crossentropy": 2.706422805786133, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1831112802028656, |
| "loss/reg": 5.8690613514045253e-05, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.051875, |
| "grad_norm": 2.079418897628784, |
| "grad_norm_var": 15.563674426313279, |
| "learning_rate": 0.0001, |
| "loss": 1.1798, |
| "loss/crossentropy": 2.6763839721679688, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.14800235629081726, |
| "loss/reg": 5.8675475884228945e-05, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 2.7786471843719482, |
| "grad_norm_var": 15.581826938638233, |
| "learning_rate": 0.0001, |
| "loss": 1.2465, |
| "loss/crossentropy": 2.6709306240081787, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.1678304374217987, |
| "loss/reg": 5.866462379344739e-05, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.052125, |
| "grad_norm": 2.770376443862915, |
| "grad_norm_var": 15.618130403520784, |
| "learning_rate": 0.0001, |
| "loss": 1.3111, |
| "loss/crossentropy": 2.646826982498169, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.1777157187461853, |
| "loss/reg": 5.865520142833702e-05, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 2.092414617538452, |
| "grad_norm_var": 15.57762685735369, |
| "learning_rate": 0.0001, |
| "loss": 1.3353, |
| "loss/crossentropy": 2.62361741065979, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.17848479747772217, |
| "loss/reg": 5.8638761402107775e-05, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.052375, |
| "grad_norm": 2.05226731300354, |
| "grad_norm_var": 15.656891853986265, |
| "learning_rate": 0.0001, |
| "loss": 1.14, |
| "loss/crossentropy": 2.697723865509033, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.14726917445659637, |
| "loss/reg": 5.862316902494058e-05, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 2.6924796104431152, |
| "grad_norm_var": 15.622310414474152, |
| "learning_rate": 0.0001, |
| "loss": 1.404, |
| "loss/crossentropy": 2.601827383041382, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.19246245920658112, |
| "loss/reg": 5.860950841451995e-05, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.052625, |
| "grad_norm": 5.301983833312988, |
| "grad_norm_var": 15.644682914862404, |
| "learning_rate": 0.0001, |
| "loss": 1.4862, |
| "loss/crossentropy": 2.6217854022979736, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.18871337175369263, |
| "loss/reg": 5.8600846386980265e-05, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 2.114091634750366, |
| "grad_norm_var": 15.758396712898662, |
| "learning_rate": 0.0001, |
| "loss": 1.2033, |
| "loss/crossentropy": 2.5663623809814453, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16362521052360535, |
| "loss/reg": 5.8592915593180805e-05, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.052875, |
| "grad_norm": 2.757091999053955, |
| "grad_norm_var": 15.661455859551703, |
| "learning_rate": 0.0001, |
| "loss": 1.1223, |
| "loss/crossentropy": 2.4681971073150635, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.14905983209609985, |
| "loss/reg": 5.858425720361993e-05, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 2.4524407386779785, |
| "grad_norm_var": 15.670073831964206, |
| "learning_rate": 0.0001, |
| "loss": 1.2938, |
| "loss/crossentropy": 2.4758145809173584, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.19164547324180603, |
| "loss/reg": 5.857350697624497e-05, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.053125, |
| "grad_norm": 2.3052892684936523, |
| "grad_norm_var": 0.7038252417895506, |
| "learning_rate": 0.0001, |
| "loss": 1.2565, |
| "loss/crossentropy": 2.597487211227417, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.18559187650680542, |
| "loss/reg": 5.855830750078894e-05, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 2.7276995182037354, |
| "grad_norm_var": 0.7027765205874381, |
| "learning_rate": 0.0001, |
| "loss": 1.4141, |
| "loss/crossentropy": 2.6818253993988037, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.1948131024837494, |
| "loss/reg": 5.854442133568227e-05, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.053375, |
| "grad_norm": 1.725293517112732, |
| "grad_norm_var": 0.7537440425638384, |
| "learning_rate": 0.0001, |
| "loss": 1.1664, |
| "loss/crossentropy": 2.4244258403778076, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1502000093460083, |
| "loss/reg": 5.85384841542691e-05, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 2.6642932891845703, |
| "grad_norm_var": 0.7527758186064119, |
| "learning_rate": 0.0001, |
| "loss": 1.5211, |
| "loss/crossentropy": 2.1209182739257812, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.192403644323349, |
| "loss/reg": 5.852692629559897e-05, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.053625, |
| "grad_norm": 2.7787868976593018, |
| "grad_norm_var": 0.6272740663233074, |
| "learning_rate": 0.0001, |
| "loss": 1.3046, |
| "loss/crossentropy": 2.3020565509796143, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.179016575217247, |
| "loss/reg": 5.851646346854977e-05, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 2.891101360321045, |
| "grad_norm_var": 0.6299498912530666, |
| "learning_rate": 0.0001, |
| "loss": 1.4198, |
| "loss/crossentropy": 2.33249568939209, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.21604114770889282, |
| "loss/reg": 5.850956222275272e-05, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.053875, |
| "grad_norm": 2.7940289974212646, |
| "grad_norm_var": 0.608789107013446, |
| "learning_rate": 0.0001, |
| "loss": 1.1825, |
| "loss/crossentropy": 2.6553549766540527, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15064392983913422, |
| "loss/reg": 5.8500536397332326e-05, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 25.06597328186035, |
| "grad_norm_var": 31.943843646690855, |
| "learning_rate": 0.0001, |
| "loss": 2.4055, |
| "loss/crossentropy": 2.7126245498657227, |
| "loss/hidden": 2.03125, |
| "loss/logits": 0.3736712336540222, |
| "loss/reg": 5.849341687280685e-05, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.054125, |
| "grad_norm": 2.4612748622894287, |
| "grad_norm_var": 32.003546233579016, |
| "learning_rate": 0.0001, |
| "loss": 1.4832, |
| "loss/crossentropy": 2.6244633197784424, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.23266229033470154, |
| "loss/reg": 5.847978172823787e-05, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 2.413149356842041, |
| "grad_norm_var": 31.926055741483236, |
| "learning_rate": 0.0001, |
| "loss": 1.405, |
| "loss/crossentropy": 2.513383626937866, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.2091376930475235, |
| "loss/reg": 5.847239663125947e-05, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.054375, |
| "grad_norm": 2.1266605854034424, |
| "grad_norm_var": 31.906339652731415, |
| "learning_rate": 0.0001, |
| "loss": 1.2307, |
| "loss/crossentropy": 2.645113706588745, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.17538747191429138, |
| "loss/reg": 5.8466725022299215e-05, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 2.693485975265503, |
| "grad_norm_var": 31.906153605922054, |
| "learning_rate": 0.0001, |
| "loss": 1.3491, |
| "loss/crossentropy": 2.5616350173950195, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1766662299633026, |
| "loss/reg": 5.845691339345649e-05, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.054625, |
| "grad_norm": 3.594322681427002, |
| "grad_norm_var": 31.81007436255887, |
| "learning_rate": 0.0001, |
| "loss": 1.4456, |
| "loss/crossentropy": 2.320868492126465, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.2730950713157654, |
| "loss/reg": 5.845166742801666e-05, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 2.725066900253296, |
| "grad_norm_var": 31.681987454427826, |
| "learning_rate": 0.0001, |
| "loss": 1.4368, |
| "loss/crossentropy": 2.4526007175445557, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.21745863556861877, |
| "loss/reg": 5.844476982019842e-05, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.054875, |
| "grad_norm": 2.615208625793457, |
| "grad_norm_var": 31.706966746538818, |
| "learning_rate": 0.0001, |
| "loss": 1.2902, |
| "loss/crossentropy": 2.5873489379882812, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18027284741401672, |
| "loss/reg": 5.843998587806709e-05, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 2.679504632949829, |
| "grad_norm_var": 31.66327199965654, |
| "learning_rate": 0.0001, |
| "loss": 1.4142, |
| "loss/crossentropy": 2.171384811401367, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.1948787271976471, |
| "loss/reg": 5.8425270253792405e-05, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.055125, |
| "grad_norm": 2.781118869781494, |
| "grad_norm_var": 31.56886824166385, |
| "learning_rate": 0.0001, |
| "loss": 1.2261, |
| "loss/crossentropy": 2.616610050201416, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16300562024116516, |
| "loss/reg": 5.841004167450592e-05, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 2.8343710899353027, |
| "grad_norm_var": 31.550828531904, |
| "learning_rate": 0.0001, |
| "loss": 1.6654, |
| "loss/crossentropy": 2.254971504211426, |
| "loss/hidden": 1.390625, |
| "loss/logits": 0.27416497468948364, |
| "loss/reg": 5.840086305397563e-05, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.055375, |
| "grad_norm": 2.943516254425049, |
| "grad_norm_var": 31.26553828771242, |
| "learning_rate": 0.0001, |
| "loss": 1.3037, |
| "loss/crossentropy": 2.607365131378174, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.16250211000442505, |
| "loss/reg": 5.8392772189108655e-05, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 4.3494696617126465, |
| "grad_norm_var": 31.11395178262311, |
| "learning_rate": 0.0001, |
| "loss": 1.4874, |
| "loss/crossentropy": 2.803809642791748, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.22114460170269012, |
| "loss/reg": 5.8383415307616815e-05, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.055625, |
| "grad_norm": 2.3149962425231934, |
| "grad_norm_var": 31.21739595793184, |
| "learning_rate": 0.0001, |
| "loss": 1.1723, |
| "loss/crossentropy": 2.7661781311035156, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1560768485069275, |
| "loss/reg": 5.8376208471599966e-05, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 2.5312862396240234, |
| "grad_norm_var": 31.288532129977195, |
| "learning_rate": 0.0001, |
| "loss": 1.4583, |
| "loss/crossentropy": 2.3608808517456055, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.22338923811912537, |
| "loss/reg": 5.83621695113834e-05, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.055875, |
| "grad_norm": 2.0245697498321533, |
| "grad_norm_var": 31.468007952235922, |
| "learning_rate": 0.0001, |
| "loss": 1.2537, |
| "loss/crossentropy": 2.6646907329559326, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1671399027109146, |
| "loss/reg": 5.835363481310196e-05, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 4.180586338043213, |
| "grad_norm_var": 0.4425575902395887, |
| "learning_rate": 0.0001, |
| "loss": 1.4287, |
| "loss/crossentropy": 2.478865623474121, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.2484455555677414, |
| "loss/reg": 5.833926479681395e-05, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.056125, |
| "grad_norm": 2.2291383743286133, |
| "grad_norm_var": 0.4573160813014281, |
| "learning_rate": 0.0001, |
| "loss": 1.2997, |
| "loss/crossentropy": 2.244389295578003, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.14287710189819336, |
| "loss/reg": 5.833054456161335e-05, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 2.204925060272217, |
| "grad_norm_var": 0.47117643459253195, |
| "learning_rate": 0.0001, |
| "loss": 1.2876, |
| "loss/crossentropy": 2.3469107151031494, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.1854255050420761, |
| "loss/reg": 5.832717943121679e-05, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.056375, |
| "grad_norm": 2.5266880989074707, |
| "grad_norm_var": 0.4451698073358396, |
| "learning_rate": 0.0001, |
| "loss": 1.4392, |
| "loss/crossentropy": 2.440885305404663, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.22769977152347565, |
| "loss/reg": 5.8323836128693074e-05, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 2.410515785217285, |
| "grad_norm_var": 0.455202882380185, |
| "learning_rate": 0.0001, |
| "loss": 1.4083, |
| "loss/crossentropy": 2.4578142166137695, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.20461352169513702, |
| "loss/reg": 5.830869122291915e-05, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.056625, |
| "grad_norm": 2.0389811992645264, |
| "grad_norm_var": 0.4435531519851603, |
| "learning_rate": 0.0001, |
| "loss": 1.1318, |
| "loss/crossentropy": 2.139033317565918, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.1390083134174347, |
| "loss/reg": 5.829246947541833e-05, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 1.979454517364502, |
| "grad_norm_var": 0.47698744011981165, |
| "learning_rate": 0.0001, |
| "loss": 1.3115, |
| "loss/crossentropy": 2.546844005584717, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18587306141853333, |
| "loss/reg": 5.8282243116991594e-05, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.056875, |
| "grad_norm": 2.0210747718811035, |
| "grad_norm_var": 0.5030154373593951, |
| "learning_rate": 0.0001, |
| "loss": 1.21, |
| "loss/crossentropy": 2.6095550060272217, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.16256017982959747, |
| "loss/reg": 5.8266243286198005e-05, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 2.0944671630859375, |
| "grad_norm_var": 0.520400331750174, |
| "learning_rate": 0.0001, |
| "loss": 1.1407, |
| "loss/crossentropy": 2.450681447982788, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.15184549987316132, |
| "loss/reg": 5.8250909205526114e-05, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.057125, |
| "grad_norm": 2.5854806900024414, |
| "grad_norm_var": 0.5178481401493921, |
| "learning_rate": 0.0001, |
| "loss": 1.1308, |
| "loss/crossentropy": 2.8090949058532715, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.15754011273384094, |
| "loss/reg": 5.8233421441400424e-05, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 6.832178592681885, |
| "grad_norm_var": 1.6526915128701443, |
| "learning_rate": 0.0001, |
| "loss": 1.7544, |
| "loss/crossentropy": 2.4325008392333984, |
| "loss/hidden": 1.5625, |
| "loss/logits": 0.1913643479347229, |
| "loss/reg": 5.821782906423323e-05, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.057375, |
| "grad_norm": 2.4911727905273438, |
| "grad_norm_var": 1.6585857165051416, |
| "learning_rate": 0.0001, |
| "loss": 1.277, |
| "loss/crossentropy": 2.5682671070098877, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.18270117044448853, |
| "loss/reg": 5.820325532113202e-05, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 2.2592287063598633, |
| "grad_norm_var": 1.5000806172221008, |
| "learning_rate": 0.0001, |
| "loss": 1.149, |
| "loss/crossentropy": 2.3300366401672363, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.16016384959220886, |
| "loss/reg": 5.8191151765640825e-05, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.057625, |
| "grad_norm": 2.6110737323760986, |
| "grad_norm_var": 1.4915332961489087, |
| "learning_rate": 0.0001, |
| "loss": 1.4344, |
| "loss/crossentropy": 2.560197591781616, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.21507461369037628, |
| "loss/reg": 5.817634882987477e-05, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 2.6446752548217773, |
| "grad_norm_var": 1.48995546498276, |
| "learning_rate": 0.0001, |
| "loss": 1.2381, |
| "loss/crossentropy": 2.5068211555480957, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1828281581401825, |
| "loss/reg": 5.816355405841023e-05, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.057875, |
| "grad_norm": 2.498300075531006, |
| "grad_norm_var": 1.4615785550667995, |
| "learning_rate": 0.0001, |
| "loss": 1.3019, |
| "loss/crossentropy": 2.3765523433685303, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.16848215460777283, |
| "loss/reg": 5.814860560349189e-05, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 2.4674289226531982, |
| "grad_norm_var": 1.3126372255276026, |
| "learning_rate": 0.0001, |
| "loss": 1.3472, |
| "loss/crossentropy": 2.714657783508301, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.18256625533103943, |
| "loss/reg": 5.81321437493898e-05, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.058125, |
| "grad_norm": 3.7482964992523193, |
| "grad_norm_var": 1.3780257940909062, |
| "learning_rate": 0.0001, |
| "loss": 1.4579, |
| "loss/crossentropy": 2.7645256519317627, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.24636635184288025, |
| "loss/reg": 5.811548908241093e-05, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 3.1881492137908936, |
| "grad_norm_var": 1.3717908440858895, |
| "learning_rate": 0.0001, |
| "loss": 1.2469, |
| "loss/crossentropy": 2.6280384063720703, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.16818463802337646, |
| "loss/reg": 5.8095396525459364e-05, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.058375, |
| "grad_norm": 3.4882731437683105, |
| "grad_norm_var": 1.3977675144088226, |
| "learning_rate": 0.0001, |
| "loss": 1.5403, |
| "loss/crossentropy": 1.8358429670333862, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.21941694617271423, |
| "loss/reg": 5.807522757095285e-05, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 2.530682325363159, |
| "grad_norm_var": 1.391870091660969, |
| "learning_rate": 0.0001, |
| "loss": 1.1578, |
| "loss/crossentropy": 2.3950142860412598, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1611400693655014, |
| "loss/reg": 5.80518099013716e-05, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.058625, |
| "grad_norm": 3.4676575660705566, |
| "grad_norm_var": 1.366390295617852, |
| "learning_rate": 0.0001, |
| "loss": 1.5162, |
| "loss/crossentropy": 2.851280689239502, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.28122612833976746, |
| "loss/reg": 5.8030982472701e-05, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 2.9446208477020264, |
| "grad_norm_var": 1.302065384350945, |
| "learning_rate": 0.0001, |
| "loss": 1.3015, |
| "loss/crossentropy": 2.740093469619751, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.17590749263763428, |
| "loss/reg": 5.800585859105922e-05, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.058875, |
| "grad_norm": 2.7597243785858154, |
| "grad_norm_var": 1.2405377686230998, |
| "learning_rate": 0.0001, |
| "loss": 1.1651, |
| "loss/crossentropy": 2.440762996673584, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.14888577163219452, |
| "loss/reg": 5.7990357163362205e-05, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 2.8147523403167725, |
| "grad_norm_var": 1.182327943249795, |
| "learning_rate": 0.0001, |
| "loss": 1.3195, |
| "loss/crossentropy": 2.5801327228546143, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.17824885249137878, |
| "loss/reg": 5.7975972595158964e-05, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.059125, |
| "grad_norm": 2.4511027336120605, |
| "grad_norm_var": 1.1923747545104257, |
| "learning_rate": 0.0001, |
| "loss": 1.4217, |
| "loss/crossentropy": 2.5711913108825684, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.2180328667163849, |
| "loss/reg": 5.796052937512286e-05, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 2.9213221073150635, |
| "grad_norm_var": 0.1890407192544025, |
| "learning_rate": 0.0001, |
| "loss": 1.2735, |
| "loss/crossentropy": 2.5805675983428955, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.17132875323295593, |
| "loss/reg": 5.794024036731571e-05, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.059375, |
| "grad_norm": 2.6587464809417725, |
| "grad_norm_var": 0.1832162860499608, |
| "learning_rate": 0.0001, |
| "loss": 1.6569, |
| "loss/crossentropy": 2.356299638748169, |
| "loss/hidden": 1.40625, |
| "loss/logits": 0.25005391240119934, |
| "loss/reg": 5.791860894532874e-05, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 3.5978729724884033, |
| "grad_norm_var": 0.19139826910290647, |
| "learning_rate": 0.0001, |
| "loss": 1.7357, |
| "loss/crossentropy": 2.0626883506774902, |
| "loss/hidden": 1.4765625, |
| "loss/logits": 0.2585859000682831, |
| "loss/reg": 5.790415525552817e-05, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.059625, |
| "grad_norm": 2.8491876125335693, |
| "grad_norm_var": 0.18498974202791843, |
| "learning_rate": 0.0001, |
| "loss": 1.5276, |
| "loss/crossentropy": 2.5583596229553223, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.25358158349990845, |
| "loss/reg": 5.788617272628471e-05, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 2.5821259021759033, |
| "grad_norm_var": 0.1876924518839881, |
| "learning_rate": 0.0001, |
| "loss": 1.3568, |
| "loss/crossentropy": 2.486640453338623, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.19216927886009216, |
| "loss/reg": 5.786680776509456e-05, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.059875, |
| "grad_norm": 2.877934217453003, |
| "grad_norm_var": 0.17456917708907038, |
| "learning_rate": 0.0001, |
| "loss": 1.5607, |
| "loss/crossentropy": 2.3836066722869873, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.23981472849845886, |
| "loss/reg": 5.785070243291557e-05, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 2.3281009197235107, |
| "grad_norm_var": 0.1849188959934999, |
| "learning_rate": 0.0001, |
| "loss": 1.2716, |
| "loss/crossentropy": 2.508988380432129, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.19294525682926178, |
| "loss/reg": 5.783725646324456e-05, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.060125, |
| "grad_norm": 2.8099567890167236, |
| "grad_norm_var": 0.14013939438571937, |
| "learning_rate": 0.0001, |
| "loss": 1.5081, |
| "loss/crossentropy": 2.3855881690979004, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.2575419545173645, |
| "loss/reg": 5.782474545412697e-05, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 2.9827277660369873, |
| "grad_norm_var": 0.134662315913679, |
| "learning_rate": 0.0001, |
| "loss": 1.4593, |
| "loss/crossentropy": 2.5487606525421143, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.2087090015411377, |
| "loss/reg": 5.7816720072878525e-05, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.060375, |
| "grad_norm": 2.306149959564209, |
| "grad_norm_var": 0.1259770764512929, |
| "learning_rate": 0.0001, |
| "loss": 1.2076, |
| "loss/crossentropy": 2.4755747318267822, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.16014963388442993, |
| "loss/reg": 5.781082290923223e-05, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 2.4719114303588867, |
| "grad_norm_var": 0.12834384378027816, |
| "learning_rate": 0.0001, |
| "loss": 1.3745, |
| "loss/crossentropy": 2.8203346729278564, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.20208273828029633, |
| "loss/reg": 5.7795077736955136e-05, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.060625, |
| "grad_norm": 2.300952911376953, |
| "grad_norm_var": 0.10978991346620433, |
| "learning_rate": 0.0001, |
| "loss": 1.464, |
| "loss/crossentropy": 2.610508680343628, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2368427813053131, |
| "loss/reg": 5.778546983492561e-05, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 3.3388009071350098, |
| "grad_norm_var": 0.13085586368501342, |
| "learning_rate": 0.0001, |
| "loss": 1.5116, |
| "loss/crossentropy": 2.763427972793579, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.21419215202331543, |
| "loss/reg": 5.7770797866396606e-05, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.060875, |
| "grad_norm": 2.102293014526367, |
| "grad_norm_var": 0.1572983810037916, |
| "learning_rate": 0.0001, |
| "loss": 1.1595, |
| "loss/crossentropy": 2.204011917114258, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.158901646733284, |
| "loss/reg": 5.7755187299335375e-05, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 2.766934633255005, |
| "grad_norm_var": 0.15678694409689248, |
| "learning_rate": 0.0001, |
| "loss": 1.4246, |
| "loss/crossentropy": 2.537151575088501, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.2130882441997528, |
| "loss/reg": 5.774224700871855e-05, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.061125, |
| "grad_norm": 2.0001540184020996, |
| "grad_norm_var": 0.18501104247654798, |
| "learning_rate": 0.0001, |
| "loss": 1.103, |
| "loss/crossentropy": 2.3592050075531006, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.13754940032958984, |
| "loss/reg": 5.77289865759667e-05, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 2.3166351318359375, |
| "grad_norm_var": 0.18848381138351228, |
| "learning_rate": 0.0001, |
| "loss": 1.3329, |
| "loss/crossentropy": 2.7236411571502686, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1761033535003662, |
| "loss/reg": 5.771181167801842e-05, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.061375, |
| "grad_norm": 2.357775926589966, |
| "grad_norm_var": 0.19351960086170053, |
| "learning_rate": 0.0001, |
| "loss": 1.1437, |
| "loss/crossentropy": 2.866445779800415, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.15484049916267395, |
| "loss/reg": 5.769642666564323e-05, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 3.680264949798584, |
| "grad_norm_var": 0.20463866822373877, |
| "learning_rate": 0.0001, |
| "loss": 1.2002, |
| "loss/crossentropy": 3.115431308746338, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16054463386535645, |
| "loss/reg": 5.7679084420669824e-05, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.061625, |
| "grad_norm": 2.3650856018066406, |
| "grad_norm_var": 0.2051052996774897, |
| "learning_rate": 0.0001, |
| "loss": 1.1996, |
| "loss/crossentropy": 2.6519298553466797, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.17554257810115814, |
| "loss/reg": 5.766074173152447e-05, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 2.7080323696136475, |
| "grad_norm_var": 0.2058088113620099, |
| "learning_rate": 0.0001, |
| "loss": 1.365, |
| "loss/crossentropy": 2.329538106918335, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20815491676330566, |
| "loss/reg": 5.764625166193582e-05, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.061875, |
| "grad_norm": 2.2859530448913574, |
| "grad_norm_var": 0.2063347958167308, |
| "learning_rate": 0.0001, |
| "loss": 1.2994, |
| "loss/crossentropy": 2.6445348262786865, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.1738019585609436, |
| "loss/reg": 5.763155422755517e-05, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 2.771320343017578, |
| "grad_norm_var": 0.20431087500909348, |
| "learning_rate": 0.0001, |
| "loss": 1.4714, |
| "loss/crossentropy": 2.340728282928467, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.21303007006645203, |
| "loss/reg": 5.761897409684025e-05, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.062125, |
| "grad_norm": 3.022183656692505, |
| "grad_norm_var": 0.21312900983479016, |
| "learning_rate": 0.0001, |
| "loss": 1.4858, |
| "loss/crossentropy": 2.6772336959838867, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.2196260541677475, |
| "loss/reg": 5.761081411037594e-05, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 13.948429107666016, |
| "grad_norm_var": 8.27193520122967, |
| "learning_rate": 0.0001, |
| "loss": 1.3633, |
| "loss/crossentropy": 2.862323760986328, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.19083081185817719, |
| "loss/reg": 5.7596374972490594e-05, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.062375, |
| "grad_norm": 2.6107678413391113, |
| "grad_norm_var": 8.237513777759569, |
| "learning_rate": 0.0001, |
| "loss": 1.6771, |
| "loss/crossentropy": 2.1725099086761475, |
| "loss/hidden": 1.40625, |
| "loss/logits": 0.2702314555644989, |
| "loss/reg": 5.7586628827266395e-05, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 2.5658040046691895, |
| "grad_norm_var": 8.22750426778598, |
| "learning_rate": 0.0001, |
| "loss": 1.4381, |
| "loss/crossentropy": 2.246595859527588, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.18755751848220825, |
| "loss/reg": 5.756897371611558e-05, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 8000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.2202930782208e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|