{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1875, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 2.377527952194214, "learning_rate": 1.0000000000000002e-06, "loss": 1.2768, "loss/crossentropy": 2.697097063064575, "loss/hidden": 1.1171875, "loss/logits": 0.15893849730491638, "loss/reg": 6.247002602322027e-05, "step": 1 }, { "epoch": 0.00025, "grad_norm": 4.216994762420654, "learning_rate": 2.0000000000000003e-06, "loss": 1.3752, "loss/crossentropy": 3.101844310760498, "loss/hidden": 1.1796875, "loss/logits": 0.1949012577533722, "loss/reg": 6.247002602322027e-05, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.3287529945373535, "learning_rate": 3e-06, "loss": 1.2785, "loss/crossentropy": 2.63712477684021, "loss/hidden": 1.09375, "loss/logits": 0.18410107493400574, "loss/reg": 6.246996053960174e-05, "step": 3 }, { "epoch": 0.0005, "grad_norm": 5.415231227874756, "learning_rate": 4.000000000000001e-06, "loss": 1.4285, "loss/crossentropy": 2.5702285766601562, "loss/hidden": 1.265625, "loss/logits": 0.16228657960891724, "loss/reg": 6.246980774449185e-05, "step": 4 }, { "epoch": 0.000625, "grad_norm": 4.888370513916016, "learning_rate": 5e-06, "loss": 1.5121, "loss/crossentropy": 2.439383029937744, "loss/hidden": 1.3125, "loss/logits": 0.19899356365203857, "loss/reg": 6.24695821898058e-05, "step": 5 }, { "epoch": 0.00075, "grad_norm": 2.608705997467041, "learning_rate": 6e-06, "loss": 1.293, "loss/crossentropy": 2.668699026107788, "loss/hidden": 1.109375, "loss/logits": 0.18298496305942535, "loss/reg": 6.246933480724692e-05, "step": 6 }, { "epoch": 0.000875, "grad_norm": 2.8447623252868652, "learning_rate": 7.000000000000001e-06, "loss": 1.5339, "loss/crossentropy": 2.5219366550445557, "loss/hidden": 1.296875, "loss/logits": 0.2364223599433899, "loss/reg": 6.246914563234895e-05, "step": 7 }, { "epoch": 0.001, "grad_norm": 3.7877628803253174, "learning_rate": 8.000000000000001e-06, "loss": 1.8218, "loss/crossentropy": 2.1927688121795654, "loss/hidden": 1.5546875, "loss/logits": 0.2664879262447357, "loss/reg": 6.246889097383246e-05, "step": 8 }, { "epoch": 0.001125, "grad_norm": 2.988516330718994, "learning_rate": 9e-06, "loss": 1.7373, "loss/crossentropy": 2.3826897144317627, "loss/hidden": 1.421875, "loss/logits": 0.314752995967865, "loss/reg": 6.246858538361266e-05, "step": 9 }, { "epoch": 0.00125, "grad_norm": 2.143723726272583, "learning_rate": 1e-05, "loss": 1.405, "loss/crossentropy": 2.2246415615081787, "loss/hidden": 1.234375, "loss/logits": 0.16997714340686798, "loss/reg": 6.246842531254515e-05, "step": 10 }, { "epoch": 0.001375, "grad_norm": 2.4413657188415527, "learning_rate": 1.1000000000000001e-05, "loss": 1.4206, "loss/crossentropy": 2.4612021446228027, "loss/hidden": 1.1796875, "loss/logits": 0.24033024907112122, "loss/reg": 6.246819975785911e-05, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.483156204223633, "learning_rate": 1.2e-05, "loss": 1.6449, "loss/crossentropy": 2.2882771492004395, "loss/hidden": 1.4140625, "loss/logits": 0.23023059964179993, "loss/reg": 6.246790871955454e-05, "step": 12 }, { "epoch": 0.001625, "grad_norm": 2.7368147373199463, "learning_rate": 1.3000000000000001e-05, "loss": 1.4981, "loss/crossentropy": 2.6942052841186523, "loss/hidden": 1.265625, "loss/logits": 0.23185348510742188, "loss/reg": 6.24675813014619e-05, "step": 13 }, { "epoch": 0.00175, "grad_norm": 5.189184665679932, "learning_rate": 1.4000000000000001e-05, "loss": 1.946, "loss/crossentropy": 2.3771214485168457, "loss/hidden": 1.625, "loss/logits": 0.320385217666626, "loss/reg": 6.246678822208196e-05, "step": 14 }, { "epoch": 0.001875, "grad_norm": 2.305589437484741, "learning_rate": 1.5e-05, "loss": 1.4982, "loss/crossentropy": 2.7562549114227295, "loss/hidden": 1.25, "loss/logits": 0.2476150244474411, "loss/reg": 6.246620614547282e-05, "step": 15 }, { "epoch": 0.002, "grad_norm": 2.3378520011901855, "grad_norm_var": 1.2675163586822178, "learning_rate": 1.6000000000000003e-05, "loss": 1.3302, "loss/crossentropy": 2.445441961288452, "loss/hidden": 1.125, "loss/logits": 0.20453599095344543, "loss/reg": 6.246585689950734e-05, "step": 16 }, { "epoch": 0.002125, "grad_norm": 1.7903435230255127, "grad_norm_var": 1.3529406709866008, "learning_rate": 1.7000000000000003e-05, "loss": 1.1333, "loss/crossentropy": 2.323503017425537, "loss/hidden": 0.984375, "loss/logits": 0.14828170835971832, "loss/reg": 6.246510747587308e-05, "step": 17 }, { "epoch": 0.00225, "grad_norm": 3.363795518875122, "grad_norm_var": 1.277817936381435, "learning_rate": 1.8e-05, "loss": 1.7292, "loss/crossentropy": 2.6075525283813477, "loss/hidden": 1.46875, "loss/logits": 0.25987327098846436, "loss/reg": 6.24642925686203e-05, "step": 18 }, { "epoch": 0.002375, "grad_norm": 2.162050724029541, "grad_norm_var": 1.2967721886362786, "learning_rate": 1.9e-05, "loss": 1.3146, "loss/crossentropy": 2.570558786392212, "loss/hidden": 1.125, "loss/logits": 0.18898281455039978, "loss/reg": 6.246323027880862e-05, "step": 19 }, { "epoch": 0.0025, "grad_norm": 2.147024393081665, "grad_norm_var": 0.9523869945360727, "learning_rate": 2e-05, "loss": 1.3484, "loss/crossentropy": 2.6676244735717773, "loss/hidden": 1.1484375, "loss/logits": 0.19929195940494537, "loss/reg": 6.246233533602208e-05, "step": 20 }, { "epoch": 0.002625, "grad_norm": 2.0668728351593018, "grad_norm_var": 0.6976603751830339, "learning_rate": 2.1e-05, "loss": 1.1929, "loss/crossentropy": 2.401143789291382, "loss/hidden": 1.03125, "loss/logits": 0.1610003113746643, "loss/reg": 6.246144039323553e-05, "step": 21 }, { "epoch": 0.00275, "grad_norm": 2.8019566535949707, "grad_norm_var": 0.6973240463492516, "learning_rate": 2.2000000000000003e-05, "loss": 1.419, "loss/crossentropy": 2.627523183822632, "loss/hidden": 1.203125, "loss/logits": 0.2152642011642456, "loss/reg": 6.246032717172056e-05, "step": 22 }, { "epoch": 0.002875, "grad_norm": 3.8118937015533447, "grad_norm_var": 0.7713008187193999, "learning_rate": 2.3000000000000003e-05, "loss": 1.4284, "loss/crossentropy": 2.7227890491485596, "loss/hidden": 1.1640625, "loss/logits": 0.2637593150138855, "loss/reg": 6.245896656764671e-05, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.1418018341064453, "grad_norm_var": 0.7205284729945551, "learning_rate": 2.4e-05, "loss": 1.3002, "loss/crossentropy": 2.545552968978882, "loss/hidden": 1.1328125, "loss/logits": 0.16680249571800232, "loss/reg": 6.245774420676753e-05, "step": 24 }, { "epoch": 0.003125, "grad_norm": 3.5331156253814697, "grad_norm_var": 0.7613226543465996, "learning_rate": 2.5e-05, "loss": 1.3224, "loss/crossentropy": 2.2371270656585693, "loss/hidden": 1.15625, "loss/logits": 0.16548338532447815, "loss/reg": 6.245705299079418e-05, "step": 25 }, { "epoch": 0.00325, "grad_norm": 1.9795947074890137, "grad_norm_var": 0.7755306597344306, "learning_rate": 2.6000000000000002e-05, "loss": 1.3209, "loss/crossentropy": 2.7113037109375, "loss/hidden": 1.1328125, "loss/logits": 0.18742361664772034, "loss/reg": 6.245569966267794e-05, "step": 26 }, { "epoch": 0.003375, "grad_norm": 2.6044108867645264, "grad_norm_var": 0.7714440385524235, "learning_rate": 2.7000000000000002e-05, "loss": 1.4566, "loss/crossentropy": 2.6034419536590576, "loss/hidden": 1.2265625, "loss/logits": 0.22937631607055664, "loss/reg": 6.245376425795257e-05, "step": 27 }, { "epoch": 0.0035, "grad_norm": 2.48085355758667, "grad_norm_var": 0.7715158471256792, "learning_rate": 2.8000000000000003e-05, "loss": 1.4579, "loss/crossentropy": 2.5794363021850586, "loss/hidden": 1.2421875, "loss/logits": 0.21509718894958496, "loss/reg": 6.245166878215969e-05, "step": 28 }, { "epoch": 0.003625, "grad_norm": 3.0413854122161865, "grad_norm_var": 0.7781660489700184, "learning_rate": 2.9e-05, "loss": 1.6102, "loss/crossentropy": 2.4173922538757324, "loss/hidden": 1.375, "loss/logits": 0.23455965518951416, "loss/reg": 6.244902033358812e-05, "step": 29 }, { "epoch": 0.00375, "grad_norm": 2.1076390743255615, "grad_norm_var": 0.36324525064493024, "learning_rate": 3e-05, "loss": 1.0735, "loss/crossentropy": 2.4064886569976807, "loss/hidden": 0.9453125, "loss/logits": 0.12752822041511536, "loss/reg": 6.244838004931808e-05, "step": 30 }, { "epoch": 0.003875, "grad_norm": 2.5296630859375, "grad_norm_var": 0.359312391151574, "learning_rate": 3.1e-05, "loss": 1.3467, "loss/crossentropy": 2.61391544342041, "loss/hidden": 1.15625, "loss/logits": 0.18978667259216309, "loss/reg": 6.244736141525209e-05, "step": 31 }, { "epoch": 0.004, "grad_norm": 2.123671054840088, "grad_norm_var": 0.3684168280400947, "learning_rate": 3.2000000000000005e-05, "loss": 1.2191, "loss/crossentropy": 2.6056668758392334, "loss/hidden": 1.0546875, "loss/logits": 0.16381201148033142, "loss/reg": 6.244605174288154e-05, "step": 32 }, { "epoch": 0.004125, "grad_norm": 3.685770034790039, "grad_norm_var": 0.4027733703548923, "learning_rate": 3.3e-05, "loss": 1.6794, "loss/crossentropy": 2.519561290740967, "loss/hidden": 1.3828125, "loss/logits": 0.29592496156692505, "loss/reg": 6.24443418928422e-05, "step": 33 }, { "epoch": 0.00425, "grad_norm": 1.9660468101501465, "grad_norm_var": 0.393966226946808, "learning_rate": 3.4000000000000007e-05, "loss": 1.3395, "loss/crossentropy": 2.638051986694336, "loss/hidden": 1.15625, "loss/logits": 0.18261724710464478, "loss/reg": 6.244314135983586e-05, "step": 34 }, { "epoch": 0.004375, "grad_norm": 2.3111677169799805, "grad_norm_var": 0.38716579449971367, "learning_rate": 3.5e-05, "loss": 1.3501, "loss/crossentropy": 2.599940776824951, "loss/hidden": 1.15625, "loss/logits": 0.19327056407928467, "loss/reg": 6.244215182960033e-05, "step": 35 }, { "epoch": 0.0045, "grad_norm": 2.5357542037963867, "grad_norm_var": 0.3739975607775089, "learning_rate": 3.6e-05, "loss": 1.287, "loss/crossentropy": 2.9884798526763916, "loss/hidden": 1.1171875, "loss/logits": 0.16922441124916077, "loss/reg": 6.244022370083258e-05, "step": 36 }, { "epoch": 0.004625, "grad_norm": 1.7781621217727661, "grad_norm_var": 0.40002233468076764, "learning_rate": 3.7e-05, "loss": 1.074, "loss/crossentropy": 2.669071674346924, "loss/hidden": 0.93359375, "loss/logits": 0.13981276750564575, "loss/reg": 6.243858661036938e-05, "step": 37 }, { "epoch": 0.00475, "grad_norm": 24.6973819732666, "grad_norm_var": 30.983207545217457, "learning_rate": 3.8e-05, "loss": 1.3637, "loss/crossentropy": 2.482579469680786, "loss/hidden": 1.1953125, "loss/logits": 0.16777344048023224, "loss/reg": 6.243725511012599e-05, "step": 38 }, { "epoch": 0.004875, "grad_norm": 2.5728342533111572, "grad_norm_var": 31.103302953089262, "learning_rate": 3.9000000000000006e-05, "loss": 1.3424, "loss/crossentropy": 2.2785422801971436, "loss/hidden": 1.171875, "loss/logits": 0.16988611221313477, "loss/reg": 6.243555981200188e-05, "step": 39 }, { "epoch": 0.005, "grad_norm": 1.7385622262954712, "grad_norm_var": 31.206951393275006, "learning_rate": 4e-05, "loss": 1.077, "loss/crossentropy": 2.7017714977264404, "loss/hidden": 0.9453125, "loss/logits": 0.13102804124355316, "loss/reg": 6.243350071599707e-05, "step": 40 }, { "epoch": 0.005125, "grad_norm": 2.455116033554077, "grad_norm_var": 31.325901099338942, "learning_rate": 4.1e-05, "loss": 1.178, "loss/crossentropy": 2.6521873474121094, "loss/hidden": 1.015625, "loss/logits": 0.16170336306095123, "loss/reg": 6.243147072382271e-05, "step": 41 }, { "epoch": 0.00525, "grad_norm": 3.0441935062408447, "grad_norm_var": 31.14003983168487, "learning_rate": 4.2e-05, "loss": 1.488, "loss/crossentropy": 2.5000290870666504, "loss/hidden": 1.265625, "loss/logits": 0.2217317819595337, "loss/reg": 6.24291569693014e-05, "step": 42 }, { "epoch": 0.005375, "grad_norm": 2.6227200031280518, "grad_norm_var": 31.137008952861066, "learning_rate": 4.3e-05, "loss": 1.3106, "loss/crossentropy": 2.6832528114318848, "loss/hidden": 1.1171875, "loss/logits": 0.19276997447013855, "loss/reg": 6.242711242521182e-05, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.9194633960723877, "grad_norm_var": 31.06863081080745, "learning_rate": 4.4000000000000006e-05, "loss": 1.5396, "loss/crossentropy": 2.483938455581665, "loss/hidden": 1.3046875, "loss/logits": 0.23424991965293884, "loss/reg": 6.242513336474076e-05, "step": 44 }, { "epoch": 0.005625, "grad_norm": 2.2491037845611572, "grad_norm_var": 31.196778907875057, "learning_rate": 4.5e-05, "loss": 1.2321, "loss/crossentropy": 2.9735186100006104, "loss/hidden": 1.0625, "loss/logits": 0.1689363420009613, "loss/reg": 6.242344534257427e-05, "step": 45 }, { "epoch": 0.00575, "grad_norm": 2.687225103378296, "grad_norm_var": 31.084396554405373, "learning_rate": 4.600000000000001e-05, "loss": 1.2443, "loss/crossentropy": 2.913846254348755, "loss/hidden": 1.0625, "loss/logits": 0.18112678825855255, "loss/reg": 6.242193921934813e-05, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.3648312091827393, "grad_norm_var": 31.1155476706496, "learning_rate": 4.7e-05, "loss": 1.2044, "loss/crossentropy": 2.374119520187378, "loss/hidden": 1.046875, "loss/logits": 0.15688437223434448, "loss/reg": 6.242006929824129e-05, "step": 47 }, { "epoch": 0.006, "grad_norm": 1.896540880203247, "grad_norm_var": 31.171339818602494, "learning_rate": 4.8e-05, "loss": 1.238, "loss/crossentropy": 2.613962173461914, "loss/hidden": 1.0546875, "loss/logits": 0.1826920211315155, "loss/reg": 6.24187450739555e-05, "step": 48 }, { "epoch": 0.006125, "grad_norm": 1.7585434913635254, "grad_norm_var": 31.44447201393312, "learning_rate": 4.9e-05, "loss": 1.1411, "loss/crossentropy": 2.5672757625579834, "loss/hidden": 1.0, "loss/logits": 0.14043202996253967, "loss/reg": 6.241785740712658e-05, "step": 49 }, { "epoch": 0.00625, "grad_norm": 1.8257592916488647, "grad_norm_var": 31.47860052328912, "learning_rate": 5e-05, "loss": 1.2643, "loss/crossentropy": 2.4829366207122803, "loss/hidden": 1.0859375, "loss/logits": 0.1777852475643158, "loss/reg": 6.2416227592621e-05, "step": 50 }, { "epoch": 0.006375, "grad_norm": 1.9530550241470337, "grad_norm_var": 31.553698309541367, "learning_rate": 5.1000000000000006e-05, "loss": 1.1787, "loss/crossentropy": 2.501922369003296, "loss/hidden": 1.015625, "loss/logits": 0.16241338849067688, "loss/reg": 6.241373193915933e-05, "step": 51 }, { "epoch": 0.0065, "grad_norm": 2.366898536682129, "grad_norm_var": 31.58155048439878, "learning_rate": 5.2000000000000004e-05, "loss": 1.476, "loss/crossentropy": 2.557314872741699, "loss/hidden": 1.234375, "loss/logits": 0.24098029732704163, "loss/reg": 6.241213122848421e-05, "step": 52 }, { "epoch": 0.006625, "grad_norm": 2.139944553375244, "grad_norm_var": 31.497838767117898, "learning_rate": 5.300000000000001e-05, "loss": 1.3057, "loss/crossentropy": 2.5664379596710205, "loss/hidden": 1.125, "loss/logits": 0.18005570769309998, "loss/reg": 6.241026130737737e-05, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.2614963054656982, "grad_norm_var": 0.16298419379227144, "learning_rate": 5.4000000000000005e-05, "loss": 1.2081, "loss/crossentropy": 2.5651533603668213, "loss/hidden": 1.046875, "loss/logits": 0.1606135070323944, "loss/reg": 6.240784568944946e-05, "step": 54 }, { "epoch": 0.006875, "grad_norm": 1.88372802734375, "grad_norm_var": 0.16791840248250048, "learning_rate": 5.500000000000001e-05, "loss": 1.2037, "loss/crossentropy": 2.0431623458862305, "loss/hidden": 1.0703125, "loss/logits": 0.13271506130695343, "loss/reg": 6.240410584723577e-05, "step": 55 }, { "epoch": 0.007, "grad_norm": 1.7579172849655151, "grad_norm_var": 0.16659499666655736, "learning_rate": 5.6000000000000006e-05, "loss": 1.0787, "loss/crossentropy": 2.5805883407592773, "loss/hidden": 0.94140625, "loss/logits": 0.13670633733272552, "loss/reg": 6.240163202164695e-05, "step": 56 }, { "epoch": 0.007125, "grad_norm": 2.740758180618286, "grad_norm_var": 0.17906241043444873, "learning_rate": 5.6999999999999996e-05, "loss": 1.2499, "loss/crossentropy": 2.821078062057495, "loss/hidden": 1.0859375, "loss/logits": 0.16337308287620544, "loss/reg": 6.239958747755736e-05, "step": 57 }, { "epoch": 0.00725, "grad_norm": 3.3393216133117676, "grad_norm_var": 0.21459676497742203, "learning_rate": 5.8e-05, "loss": 1.5094, "loss/crossentropy": 2.6574273109436035, "loss/hidden": 1.2265625, "loss/logits": 0.2822623550891876, "loss/reg": 6.239775393623859e-05, "step": 58 }, { "epoch": 0.007375, "grad_norm": 2.1151742935180664, "grad_norm_var": 0.20871929877454623, "learning_rate": 5.9e-05, "loss": 1.31, "loss/crossentropy": 2.28176212310791, "loss/hidden": 1.125, "loss/logits": 0.18433833122253418, "loss/reg": 6.239649519557133e-05, "step": 59 }, { "epoch": 0.0075, "grad_norm": 1.9203850030899048, "grad_norm_var": 0.18408730894700795, "learning_rate": 6e-05, "loss": 1.2862, "loss/crossentropy": 2.319091558456421, "loss/hidden": 1.09375, "loss/logits": 0.1918697953224182, "loss/reg": 6.239335925783962e-05, "step": 60 }, { "epoch": 0.007625, "grad_norm": 2.689425230026245, "grad_norm_var": 0.1988651894699956, "learning_rate": 6.1e-05, "loss": 1.2077, "loss/crossentropy": 2.396440029144287, "loss/hidden": 1.0546875, "loss/logits": 0.1523526906967163, "loss/reg": 6.239157664822415e-05, "step": 61 }, { "epoch": 0.00775, "grad_norm": 2.0848548412323, "grad_norm_var": 0.184926237897677, "learning_rate": 6.2e-05, "loss": 1.1889, "loss/crossentropy": 2.375331401824951, "loss/hidden": 1.03125, "loss/logits": 0.15707406401634216, "loss/reg": 6.238814967218786e-05, "step": 62 }, { "epoch": 0.007875, "grad_norm": 1.9770179986953735, "grad_norm_var": 0.18547542502594508, "learning_rate": 6.3e-05, "loss": 1.1255, "loss/crossentropy": 2.5883288383483887, "loss/hidden": 0.984375, "loss/logits": 0.14046350121498108, "loss/reg": 6.238514470169321e-05, "step": 63 }, { "epoch": 0.008, "grad_norm": 1.9654349088668823, "grad_norm_var": 0.1832653842408547, "learning_rate": 6.400000000000001e-05, "loss": 1.1315, "loss/crossentropy": 2.6122260093688965, "loss/hidden": 0.9765625, "loss/logits": 0.1543133556842804, "loss/reg": 6.238299101823941e-05, "step": 64 }, { "epoch": 0.008125, "grad_norm": 2.110621690750122, "grad_norm_var": 0.1715223081433841, "learning_rate": 6.500000000000001e-05, "loss": 1.1513, "loss/crossentropy": 2.3829517364501953, "loss/hidden": 1.0, "loss/logits": 0.15063607692718506, "loss/reg": 6.237896013772115e-05, "step": 65 }, { "epoch": 0.00825, "grad_norm": 3.1477179527282715, "grad_norm_var": 0.21553302023151552, "learning_rate": 6.6e-05, "loss": 1.4659, "loss/crossentropy": 2.2805211544036865, "loss/hidden": 1.2421875, "loss/logits": 0.22310970723628998, "loss/reg": 6.237393972696736e-05, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.482203722000122, "grad_norm_var": 0.21008166056666275, "learning_rate": 6.7e-05, "loss": 1.0839, "loss/crossentropy": 2.982119560241699, "loss/hidden": 0.94140625, "loss/logits": 0.14186254143714905, "loss/reg": 6.236990884644911e-05, "step": 67 }, { "epoch": 0.0085, "grad_norm": 2.198028087615967, "grad_norm_var": 0.21061508280485744, "learning_rate": 6.800000000000001e-05, "loss": 1.2007, "loss/crossentropy": 2.725332498550415, "loss/hidden": 1.0390625, "loss/logits": 0.1610267162322998, "loss/reg": 6.236397166503593e-05, "step": 68 }, { "epoch": 0.008625, "grad_norm": 1.9412530660629272, "grad_norm_var": 0.21734592747188602, "learning_rate": 6.9e-05, "loss": 1.1269, "loss/crossentropy": 2.682379722595215, "loss/hidden": 0.984375, "loss/logits": 0.14185243844985962, "loss/reg": 6.235777982510626e-05, "step": 69 }, { "epoch": 0.00875, "grad_norm": 2.223443031311035, "grad_norm_var": 0.21757323137186588, "learning_rate": 7e-05, "loss": 1.3663, "loss/crossentropy": 2.6186935901641846, "loss/hidden": 1.1640625, "loss/logits": 0.2016535997390747, "loss/reg": 6.23530286247842e-05, "step": 70 }, { "epoch": 0.008875, "grad_norm": 3.4456241130828857, "grad_norm_var": 0.28625219910078287, "learning_rate": 7.1e-05, "loss": 1.6214, "loss/crossentropy": 2.054266929626465, "loss/hidden": 1.421875, "loss/logits": 0.19887767732143402, "loss/reg": 6.234741158550605e-05, "step": 71 }, { "epoch": 0.009, "grad_norm": 1.9013352394104004, "grad_norm_var": 0.27557130255187207, "learning_rate": 7.2e-05, "loss": 1.1365, "loss/crossentropy": 2.422841787338257, "loss/hidden": 0.9765625, "loss/logits": 0.15926527976989746, "loss/reg": 6.234211468836293e-05, "step": 72 }, { "epoch": 0.009125, "grad_norm": 2.4032697677612305, "grad_norm_var": 0.267026183625853, "learning_rate": 7.3e-05, "loss": 1.4414, "loss/crossentropy": 2.4159440994262695, "loss/hidden": 1.21875, "loss/logits": 0.22204136848449707, "loss/reg": 6.233662861632183e-05, "step": 73 }, { "epoch": 0.00925, "grad_norm": 1.915128231048584, "grad_norm_var": 0.21002777018266153, "learning_rate": 7.4e-05, "loss": 1.2439, "loss/crossentropy": 2.587275505065918, "loss/hidden": 1.0625, "loss/logits": 0.1807810664176941, "loss/reg": 6.232755549717695e-05, "step": 74 }, { "epoch": 0.009375, "grad_norm": 3.4048879146575928, "grad_norm_var": 0.28520435687560547, "learning_rate": 7.500000000000001e-05, "loss": 1.2774, "loss/crossentropy": 2.6182703971862793, "loss/hidden": 1.125, "loss/logits": 0.15172982215881348, "loss/reg": 6.231923180166632e-05, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.3605074882507324, "grad_norm_var": 0.27132747056331724, "learning_rate": 7.6e-05, "loss": 1.1409, "loss/crossentropy": 2.6013262271881104, "loss/hidden": 0.98828125, "loss/logits": 0.151985764503479, "loss/reg": 6.231063889572397e-05, "step": 76 }, { "epoch": 0.009625, "grad_norm": 2.6056039333343506, "grad_norm_var": 0.2684276793201585, "learning_rate": 7.7e-05, "loss": 1.1, "loss/crossentropy": 2.534158945083618, "loss/hidden": 0.94921875, "loss/logits": 0.1501779407262802, "loss/reg": 6.230256258277223e-05, "step": 77 }, { "epoch": 0.00975, "grad_norm": 1.7923972606658936, "grad_norm_var": 0.285494251958092, "learning_rate": 7.800000000000001e-05, "loss": 1.1471, "loss/crossentropy": 2.3036601543426514, "loss/hidden": 0.98828125, "loss/logits": 0.15817409753799438, "loss/reg": 6.229766586329788e-05, "step": 78 }, { "epoch": 0.009875, "grad_norm": 2.0376312732696533, "grad_norm_var": 0.2825708803585835, "learning_rate": 7.900000000000001e-05, "loss": 1.2985, "loss/crossentropy": 2.5548579692840576, "loss/hidden": 1.140625, "loss/logits": 0.1572834551334381, "loss/reg": 6.229063728824258e-05, "step": 79 }, { "epoch": 0.01, "grad_norm": 2.998662233352661, "grad_norm_var": 0.29342903010298654, "learning_rate": 8e-05, "loss": 1.5504, "loss/crossentropy": 2.4098215103149414, "loss/hidden": 1.3046875, "loss/logits": 0.24512597918510437, "loss/reg": 6.22822335571982e-05, "step": 80 }, { "epoch": 0.010125, "grad_norm": 2.103449583053589, "grad_norm_var": 0.29374293883859787, "learning_rate": 8.1e-05, "loss": 1.2985, "loss/crossentropy": 2.380378484725952, "loss/hidden": 1.125, "loss/logits": 0.17282900214195251, "loss/reg": 6.227292760740966e-05, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.6376256942749023, "grad_norm_var": 0.2615363410208279, "learning_rate": 8.2e-05, "loss": 1.266, "loss/crossentropy": 2.4291374683380127, "loss/hidden": 1.1015625, "loss/logits": 0.16384728252887726, "loss/reg": 6.226752884685993e-05, "step": 82 }, { "epoch": 0.010375, "grad_norm": 2.0763561725616455, "grad_norm_var": 0.2675552215302521, "learning_rate": 8.3e-05, "loss": 1.1733, "loss/crossentropy": 2.423896312713623, "loss/hidden": 1.015625, "loss/logits": 0.15705125033855438, "loss/reg": 6.225931429071352e-05, "step": 83 }, { "epoch": 0.0105, "grad_norm": 4.398110866546631, "grad_norm_var": 0.5173355174320988, "learning_rate": 8.4e-05, "loss": 1.5654, "loss/crossentropy": 2.230816602706909, "loss/hidden": 1.296875, "loss/logits": 0.26791903376579285, "loss/reg": 6.225006654858589e-05, "step": 84 }, { "epoch": 0.010625, "grad_norm": 2.7163784503936768, "grad_norm_var": 0.4955558090734691, "learning_rate": 8.5e-05, "loss": 1.2008, "loss/crossentropy": 2.1671087741851807, "loss/hidden": 1.0546875, "loss/logits": 0.145525261759758, "loss/reg": 6.224414391908795e-05, "step": 85 }, { "epoch": 0.01075, "grad_norm": 1.9465394020080566, "grad_norm_var": 0.5129132822581631, "learning_rate": 8.6e-05, "loss": 1.0109, "loss/crossentropy": 2.218550443649292, "loss/hidden": 0.90234375, "loss/logits": 0.10795612633228302, "loss/reg": 6.22385778115131e-05, "step": 86 }, { "epoch": 0.010875, "grad_norm": 5.668015956878662, "grad_norm_var": 1.0880389746416426, "learning_rate": 8.7e-05, "loss": 1.2925, "loss/crossentropy": 2.360995292663574, "loss/hidden": 1.1484375, "loss/logits": 0.1434704214334488, "loss/reg": 6.223141826922074e-05, "step": 87 }, { "epoch": 0.011, "grad_norm": 3.4049394130706787, "grad_norm_var": 1.0721571012465496, "learning_rate": 8.800000000000001e-05, "loss": 1.6353, "loss/crossentropy": 1.9898579120635986, "loss/hidden": 1.3828125, "loss/logits": 0.25186440348625183, "loss/reg": 6.222462252480909e-05, "step": 88 }, { "epoch": 0.011125, "grad_norm": 1.885895013809204, "grad_norm_var": 1.1148297312339375, "learning_rate": 8.900000000000001e-05, "loss": 1.0561, "loss/crossentropy": 2.670912027359009, "loss/hidden": 0.92578125, "loss/logits": 0.12972213327884674, "loss/reg": 6.221828516572714e-05, "step": 89 }, { "epoch": 0.01125, "grad_norm": 1.886960506439209, "grad_norm_var": 1.118003608268531, "learning_rate": 9e-05, "loss": 1.1335, "loss/crossentropy": 2.5691866874694824, "loss/hidden": 0.97265625, "loss/logits": 0.16021151840686798, "loss/reg": 6.221193325472996e-05, "step": 90 }, { "epoch": 0.011375, "grad_norm": 3.117880344390869, "grad_norm_var": 1.0979090394478965, "learning_rate": 9.1e-05, "loss": 1.3175, "loss/crossentropy": 2.7383711338043213, "loss/hidden": 1.140625, "loss/logits": 0.1762513369321823, "loss/reg": 6.220516661414877e-05, "step": 91 }, { "epoch": 0.0115, "grad_norm": 2.5928220748901367, "grad_norm_var": 1.0899203711980436, "learning_rate": 9.200000000000001e-05, "loss": 1.3898, "loss/crossentropy": 2.255321741104126, "loss/hidden": 1.171875, "loss/logits": 0.21727776527404785, "loss/reg": 6.2199542298913e-05, "step": 92 }, { "epoch": 0.011625, "grad_norm": 2.5842387676239014, "grad_norm_var": 1.09033696415262, "learning_rate": 9.300000000000001e-05, "loss": 1.3599, "loss/crossentropy": 2.7780256271362305, "loss/hidden": 1.15625, "loss/logits": 0.203078031539917, "loss/reg": 6.219152419362217e-05, "step": 93 }, { "epoch": 0.01175, "grad_norm": 2.497912645339966, "grad_norm_var": 1.032260222561935, "learning_rate": 9.4e-05, "loss": 1.2791, "loss/crossentropy": 2.0482513904571533, "loss/hidden": 1.109375, "loss/logits": 0.16910339891910553, "loss/reg": 6.218066846486181e-05, "step": 94 }, { "epoch": 0.011875, "grad_norm": 2.1033713817596436, "grad_norm_var": 1.0259829914817806, "learning_rate": 9.5e-05, "loss": 1.0875, "loss/crossentropy": 2.427816152572632, "loss/hidden": 0.94921875, "loss/logits": 0.13770164549350739, "loss/reg": 6.21745057287626e-05, "step": 95 }, { "epoch": 0.012, "grad_norm": 2.063559055328369, "grad_norm_var": 1.0544556100156115, "learning_rate": 9.6e-05, "loss": 1.217, "loss/crossentropy": 2.498270034790039, "loss/hidden": 1.046875, "loss/logits": 0.16950619220733643, "loss/reg": 6.216309702722356e-05, "step": 96 }, { "epoch": 0.012125, "grad_norm": 2.3693654537200928, "grad_norm_var": 1.036651450071012, "learning_rate": 9.7e-05, "loss": 1.2016, "loss/crossentropy": 2.8368701934814453, "loss/hidden": 1.0390625, "loss/logits": 0.16189493238925934, "loss/reg": 6.215785833774135e-05, "step": 97 }, { "epoch": 0.01225, "grad_norm": 2.2980258464813232, "grad_norm_var": 1.0488061784492646, "learning_rate": 9.8e-05, "loss": 1.5249, "loss/crossentropy": 2.194488525390625, "loss/hidden": 1.2421875, "loss/logits": 0.2820858359336853, "loss/reg": 6.215048051672056e-05, "step": 98 }, { "epoch": 0.012375, "grad_norm": 3.147524833679199, "grad_norm_var": 1.0277853179901806, "learning_rate": 9.900000000000001e-05, "loss": 1.7374, "loss/crossentropy": 2.7856016159057617, "loss/hidden": 1.4609375, "loss/logits": 0.27581536769866943, "loss/reg": 6.214459426701069e-05, "step": 99 }, { "epoch": 0.0125, "grad_norm": 2.1317031383514404, "grad_norm_var": 0.8636563030021608, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.282402753829956, "loss/hidden": 1.1484375, "loss/logits": 0.2142634242773056, "loss/reg": 6.213640881469473e-05, "step": 100 }, { "epoch": 0.012625, "grad_norm": 2.2720911502838135, "grad_norm_var": 0.8721171319962743, "learning_rate": 0.0001, "loss": 1.2405, "loss/crossentropy": 2.8501064777374268, "loss/hidden": 1.0625, "loss/logits": 0.17741592228412628, "loss/reg": 6.21288490947336e-05, "step": 101 }, { "epoch": 0.01275, "grad_norm": 2.879110097885132, "grad_norm_var": 0.8423375514351165, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.4649596214294434, "loss/hidden": 1.171875, "loss/logits": 0.1761254221200943, "loss/reg": 6.211963773239404e-05, "step": 102 }, { "epoch": 0.012875, "grad_norm": 2.2214345932006836, "grad_norm_var": 0.2123174305005847, "learning_rate": 0.0001, "loss": 1.1049, "loss/crossentropy": 2.513540029525757, "loss/hidden": 0.96484375, "loss/logits": 0.13943374156951904, "loss/reg": 6.21131548541598e-05, "step": 103 }, { "epoch": 0.013, "grad_norm": 1.9674383401870728, "grad_norm_var": 0.16151448650877043, "learning_rate": 0.0001, "loss": 1.2055, "loss/crossentropy": 2.4960575103759766, "loss/hidden": 1.03125, "loss/logits": 0.17365112900733948, "loss/reg": 6.210394349182025e-05, "step": 104 }, { "epoch": 0.013125, "grad_norm": 2.152989387512207, "grad_norm_var": 0.1485118756217919, "learning_rate": 0.0001, "loss": 1.3728, "loss/crossentropy": 2.651463508605957, "loss/hidden": 1.1796875, "loss/logits": 0.1924474835395813, "loss/reg": 6.209702405612916e-05, "step": 105 }, { "epoch": 0.01325, "grad_norm": 2.591555118560791, "grad_norm_var": 0.13200909593287988, "learning_rate": 0.0001, "loss": 1.5933, "loss/crossentropy": 2.1848952770233154, "loss/hidden": 1.375, "loss/logits": 0.21770122647285461, "loss/reg": 6.208720878930762e-05, "step": 106 }, { "epoch": 0.013375, "grad_norm": 2.205780029296875, "grad_norm_var": 0.10119294371901374, "learning_rate": 0.0001, "loss": 0.9785, "loss/crossentropy": 2.4988999366760254, "loss/hidden": 0.8671875, "loss/logits": 0.1106652021408081, "loss/reg": 6.207643309608102e-05, "step": 107 }, { "epoch": 0.0135, "grad_norm": 2.427882671356201, "grad_norm_var": 0.09821140867718908, "learning_rate": 0.0001, "loss": 1.2968, "loss/crossentropy": 2.5072600841522217, "loss/hidden": 1.09375, "loss/logits": 0.20241403579711914, "loss/reg": 6.206895341165364e-05, "step": 108 }, { "epoch": 0.013625, "grad_norm": 2.4435040950775146, "grad_norm_var": 0.09542213222792188, "learning_rate": 0.0001, "loss": 1.2803, "loss/crossentropy": 2.2629339694976807, "loss/hidden": 1.1015625, "loss/logits": 0.17810457944869995, "loss/reg": 6.205752288224176e-05, "step": 109 }, { "epoch": 0.01375, "grad_norm": 2.9938735961914062, "grad_norm_var": 0.11986086275213564, "learning_rate": 0.0001, "loss": 1.2708, "loss/crossentropy": 2.5084388256073, "loss/hidden": 1.09375, "loss/logits": 0.1764756739139557, "loss/reg": 6.204319652169943e-05, "step": 110 }, { "epoch": 0.013875, "grad_norm": 2.499802827835083, "grad_norm_var": 0.11443625726480532, "learning_rate": 0.0001, "loss": 1.3281, "loss/crossentropy": 2.342087507247925, "loss/hidden": 1.15625, "loss/logits": 0.17120838165283203, "loss/reg": 6.20328210061416e-05, "step": 111 }, { "epoch": 0.014, "grad_norm": 3.28193736076355, "grad_norm_var": 0.149862047644675, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.396040916442871, "loss/hidden": 1.1953125, "loss/logits": 0.193180650472641, "loss/reg": 6.202506483532488e-05, "step": 112 }, { "epoch": 0.014125, "grad_norm": 2.2074780464172363, "grad_norm_var": 0.15416329735346365, "learning_rate": 0.0001, "loss": 1.2137, "loss/crossentropy": 2.501718759536743, "loss/hidden": 1.0546875, "loss/logits": 0.1583903729915619, "loss/reg": 6.201667565619573e-05, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.888498306274414, "grad_norm_var": 0.1614203311265588, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 3.097370147705078, "loss/hidden": 1.15625, "loss/logits": 0.19293376803398132, "loss/reg": 6.200573989190161e-05, "step": 114 }, { "epoch": 0.014375, "grad_norm": 2.385442018508911, "grad_norm_var": 0.1339080451651928, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.4950473308563232, "loss/hidden": 1.15625, "loss/logits": 0.18464481830596924, "loss/reg": 6.199457857292145e-05, "step": 115 }, { "epoch": 0.0145, "grad_norm": 3.3269190788269043, "grad_norm_var": 0.16897616880053803, "learning_rate": 0.0001, "loss": 1.6405, "loss/crossentropy": 2.19484806060791, "loss/hidden": 1.3828125, "loss/logits": 0.2570968270301819, "loss/reg": 6.198590563144535e-05, "step": 116 }, { "epoch": 0.014625, "grad_norm": 2.2415361404418945, "grad_norm_var": 0.17015290356553733, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.540816068649292, "loss/hidden": 1.0625, "loss/logits": 0.1749531626701355, "loss/reg": 6.197726906975731e-05, "step": 117 }, { "epoch": 0.01475, "grad_norm": 2.397615671157837, "grad_norm_var": 0.1631737555736056, "learning_rate": 0.0001, "loss": 1.2192, "loss/crossentropy": 2.6213266849517822, "loss/hidden": 1.0546875, "loss/logits": 0.16386428475379944, "loss/reg": 6.197066250024363e-05, "step": 118 }, { "epoch": 0.014875, "grad_norm": 2.75325345993042, "grad_norm_var": 0.16006220619054398, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.3850035667419434, "loss/hidden": 1.34375, "loss/logits": 0.2249460369348526, "loss/reg": 6.196285539772362e-05, "step": 119 }, { "epoch": 0.015, "grad_norm": 2.675480842590332, "grad_norm_var": 0.13660137165245084, "learning_rate": 0.0001, "loss": 1.299, "loss/crossentropy": 2.380896806716919, "loss/hidden": 1.125, "loss/logits": 0.17339974641799927, "loss/reg": 6.195474998094141e-05, "step": 120 }, { "epoch": 0.015125, "grad_norm": 2.611541509628296, "grad_norm_var": 0.12289609882195597, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.7064404487609863, "loss/hidden": 1.109375, "loss/logits": 0.18236055970191956, "loss/reg": 6.194705929374322e-05, "step": 121 }, { "epoch": 0.01525, "grad_norm": 2.3449323177337646, "grad_norm_var": 0.12765774775469155, "learning_rate": 0.0001, "loss": 1.2957, "loss/crossentropy": 2.5846447944641113, "loss/hidden": 1.1171875, "loss/logits": 0.17786133289337158, "loss/reg": 6.193818262545392e-05, "step": 122 }, { "epoch": 0.015375, "grad_norm": 2.1001734733581543, "grad_norm_var": 0.13398098136615483, "learning_rate": 0.0001, "loss": 1.1704, "loss/crossentropy": 2.504185676574707, "loss/hidden": 1.015625, "loss/logits": 0.15416675806045532, "loss/reg": 6.192670116433874e-05, "step": 123 }, { "epoch": 0.0155, "grad_norm": 2.365839719772339, "grad_norm_var": 0.13563497966163046, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.3259832859039307, "loss/hidden": 1.171875, "loss/logits": 0.20480972528457642, "loss/reg": 6.19165730313398e-05, "step": 124 }, { "epoch": 0.015625, "grad_norm": 2.1480026245117188, "grad_norm_var": 0.1470561705316013, "learning_rate": 0.0001, "loss": 1.2768, "loss/crossentropy": 2.288093090057373, "loss/hidden": 1.109375, "loss/logits": 0.16683252155780792, "loss/reg": 6.19063139311038e-05, "step": 125 }, { "epoch": 0.01575, "grad_norm": 2.2346343994140625, "grad_norm_var": 0.14082182611320845, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.6062135696411133, "loss/hidden": 1.0, "loss/logits": 0.14351129531860352, "loss/reg": 6.189729174366221e-05, "step": 126 }, { "epoch": 0.015875, "grad_norm": 3.187627077102661, "grad_norm_var": 0.16771827237098264, "learning_rate": 0.0001, "loss": 1.4505, "loss/crossentropy": 2.3607077598571777, "loss/hidden": 1.2265625, "loss/logits": 0.22327345609664917, "loss/reg": 6.189044506754726e-05, "step": 127 }, { "epoch": 0.016, "grad_norm": 2.1208789348602295, "grad_norm_var": 0.1420574537193353, "learning_rate": 0.0001, "loss": 1.1414, "loss/crossentropy": 2.408287286758423, "loss/hidden": 1.0, "loss/logits": 0.14076298475265503, "loss/reg": 6.188445695443079e-05, "step": 128 }, { "epoch": 0.016125, "grad_norm": 2.4475457668304443, "grad_norm_var": 0.13631644029428572, "learning_rate": 0.0001, "loss": 1.2863, "loss/crossentropy": 2.4705042839050293, "loss/hidden": 1.1171875, "loss/logits": 0.16846278309822083, "loss/reg": 6.187462713569403e-05, "step": 129 }, { "epoch": 0.01625, "grad_norm": 2.3132476806640625, "grad_norm_var": 0.128302854564951, "learning_rate": 0.0001, "loss": 1.2265, "loss/crossentropy": 2.323221445083618, "loss/hidden": 1.0625, "loss/logits": 0.16340406239032745, "loss/reg": 6.18634803686291e-05, "step": 130 }, { "epoch": 0.016375, "grad_norm": 2.6015546321868896, "grad_norm_var": 0.12854282273958592, "learning_rate": 0.0001, "loss": 1.0946, "loss/crossentropy": 2.554730176925659, "loss/hidden": 0.9609375, "loss/logits": 0.13307343423366547, "loss/reg": 6.185180245665833e-05, "step": 131 }, { "epoch": 0.0165, "grad_norm": 2.040545701980591, "grad_norm_var": 0.08874970269449302, "learning_rate": 0.0001, "loss": 1.1715, "loss/crossentropy": 2.6177141666412354, "loss/hidden": 1.0078125, "loss/logits": 0.163020521402359, "loss/reg": 6.184292578836903e-05, "step": 132 }, { "epoch": 0.016625, "grad_norm": 2.4451427459716797, "grad_norm_var": 0.08672588329890019, "learning_rate": 0.0001, "loss": 1.2794, "loss/crossentropy": 2.6671459674835205, "loss/hidden": 1.109375, "loss/logits": 0.16941678524017334, "loss/reg": 6.18349076830782e-05, "step": 133 }, { "epoch": 0.01675, "grad_norm": 2.5730879306793213, "grad_norm_var": 0.08802712142174655, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.483858585357666, "loss/hidden": 1.171875, "loss/logits": 0.1835438758134842, "loss/reg": 6.182605284266174e-05, "step": 134 }, { "epoch": 0.016875, "grad_norm": 2.996643543243408, "grad_norm_var": 0.10205043083370029, "learning_rate": 0.0001, "loss": 1.5067, "loss/crossentropy": 2.267930507659912, "loss/hidden": 1.3046875, "loss/logits": 0.20140591263771057, "loss/reg": 6.181577919051051e-05, "step": 135 }, { "epoch": 0.017, "grad_norm": 2.2333881855010986, "grad_norm_var": 0.10100001995976887, "learning_rate": 0.0001, "loss": 1.23, "loss/crossentropy": 2.552584648132324, "loss/hidden": 1.0546875, "loss/logits": 0.17466390132904053, "loss/reg": 6.180404307087883e-05, "step": 136 }, { "epoch": 0.017125, "grad_norm": 2.476086378097534, "grad_norm_var": 0.09873795942098601, "learning_rate": 0.0001, "loss": 1.2347, "loss/crossentropy": 2.2955551147460938, "loss/hidden": 1.09375, "loss/logits": 0.1402929574251175, "loss/reg": 6.179526099003851e-05, "step": 137 }, { "epoch": 0.01725, "grad_norm": 2.9701859951019287, "grad_norm_var": 0.11738609069977789, "learning_rate": 0.0001, "loss": 1.1041, "loss/crossentropy": 2.4560158252716064, "loss/hidden": 0.97265625, "loss/logits": 0.1307787150144577, "loss/reg": 6.178120383992791e-05, "step": 138 }, { "epoch": 0.017375, "grad_norm": 2.151567220687866, "grad_norm_var": 0.11513060923898569, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.6192235946655273, "loss/hidden": 0.98828125, "loss/logits": 0.15172292292118073, "loss/reg": 6.176753231557086e-05, "step": 139 }, { "epoch": 0.0175, "grad_norm": 2.0209085941314697, "grad_norm_var": 0.1267419293205286, "learning_rate": 0.0001, "loss": 1.0928, "loss/crossentropy": 2.6628799438476562, "loss/hidden": 0.94921875, "loss/logits": 0.14296585321426392, "loss/reg": 6.175567978061736e-05, "step": 140 }, { "epoch": 0.017625, "grad_norm": 3.458299398422241, "grad_norm_var": 0.18389511336323494, "learning_rate": 0.0001, "loss": 1.3966, "loss/crossentropy": 2.885798692703247, "loss/hidden": 1.171875, "loss/logits": 0.22411209344863892, "loss/reg": 6.174653390189633e-05, "step": 141 }, { "epoch": 0.01775, "grad_norm": 2.608558177947998, "grad_norm_var": 0.17855808227350187, "learning_rate": 0.0001, "loss": 1.1734, "loss/crossentropy": 2.2689590454101562, "loss/hidden": 1.0234375, "loss/logits": 0.1493585705757141, "loss/reg": 6.1732207541354e-05, "step": 142 }, { "epoch": 0.017875, "grad_norm": 2.7264318466186523, "grad_norm_var": 0.1520478077633771, "learning_rate": 0.0001, "loss": 1.2868, "loss/crossentropy": 2.3888814449310303, "loss/hidden": 1.1171875, "loss/logits": 0.16896918416023254, "loss/reg": 6.172260327730328e-05, "step": 143 }, { "epoch": 0.018, "grad_norm": 2.4999561309814453, "grad_norm_var": 0.14128539295791806, "learning_rate": 0.0001, "loss": 1.3804, "loss/crossentropy": 2.442732572555542, "loss/hidden": 1.1875, "loss/logits": 0.19230639934539795, "loss/reg": 6.171311542857438e-05, "step": 144 }, { "epoch": 0.018125, "grad_norm": 3.084848642349243, "grad_norm_var": 0.1592220375940921, "learning_rate": 0.0001, "loss": 1.5124, "loss/crossentropy": 2.6801810264587402, "loss/hidden": 1.2421875, "loss/logits": 0.2696050703525543, "loss/reg": 6.170615233713761e-05, "step": 145 }, { "epoch": 0.01825, "grad_norm": 3.0833539962768555, "grad_norm_var": 0.16940866671487811, "learning_rate": 0.0001, "loss": 1.294, "loss/crossentropy": 2.434020519256592, "loss/hidden": 1.140625, "loss/logits": 0.15272179245948792, "loss/reg": 6.170049164211378e-05, "step": 146 }, { "epoch": 0.018375, "grad_norm": 2.2046446800231934, "grad_norm_var": 0.18039814292173043, "learning_rate": 0.0001, "loss": 1.1769, "loss/crossentropy": 2.5624289512634277, "loss/hidden": 1.015625, "loss/logits": 0.160653755068779, "loss/reg": 6.169131665956229e-05, "step": 147 }, { "epoch": 0.0185, "grad_norm": 1.9920902252197266, "grad_norm_var": 0.18414873169562326, "learning_rate": 0.0001, "loss": 1.1186, "loss/crossentropy": 2.709728479385376, "loss/hidden": 0.96875, "loss/logits": 0.1492651402950287, "loss/reg": 6.168704567244276e-05, "step": 148 }, { "epoch": 0.018625, "grad_norm": 2.7053756713867188, "grad_norm_var": 0.18317033653553666, "learning_rate": 0.0001, "loss": 1.2849, "loss/crossentropy": 2.594032049179077, "loss/hidden": 1.09375, "loss/logits": 0.1905450075864792, "loss/reg": 6.168089748825878e-05, "step": 149 }, { "epoch": 0.01875, "grad_norm": 2.1234872341156006, "grad_norm_var": 0.1981121598309187, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.5880792140960693, "loss/hidden": 1.0703125, "loss/logits": 0.18171370029449463, "loss/reg": 6.167205719975755e-05, "step": 150 }, { "epoch": 0.018875, "grad_norm": 2.4820902347564697, "grad_norm_var": 0.18631464898325945, "learning_rate": 0.0001, "loss": 1.1869, "loss/crossentropy": 2.2422618865966797, "loss/hidden": 1.0234375, "loss/logits": 0.16288068890571594, "loss/reg": 6.166584353195503e-05, "step": 151 }, { "epoch": 0.019, "grad_norm": 2.5669338703155518, "grad_norm_var": 0.17912821539433874, "learning_rate": 0.0001, "loss": 1.0968, "loss/crossentropy": 2.5655312538146973, "loss/hidden": 0.953125, "loss/logits": 0.1430792212486267, "loss/reg": 6.165904778754339e-05, "step": 152 }, { "epoch": 0.019125, "grad_norm": 2.191638469696045, "grad_norm_var": 0.18782946638749062, "learning_rate": 0.0001, "loss": 1.297, "loss/crossentropy": 2.3935883045196533, "loss/hidden": 1.109375, "loss/logits": 0.18698745965957642, "loss/reg": 6.165434024296701e-05, "step": 153 }, { "epoch": 0.01925, "grad_norm": 1.9139376878738403, "grad_norm_var": 0.19900155234911943, "learning_rate": 0.0001, "loss": 1.1497, "loss/crossentropy": 2.5978732109069824, "loss/hidden": 0.99609375, "loss/logits": 0.1530168354511261, "loss/reg": 6.164138176245615e-05, "step": 154 }, { "epoch": 0.019375, "grad_norm": 2.061805486679077, "grad_norm_var": 0.20353621009625153, "learning_rate": 0.0001, "loss": 1.034, "loss/crossentropy": 2.29733943939209, "loss/hidden": 0.91015625, "loss/logits": 0.12318030744791031, "loss/reg": 6.162770296214148e-05, "step": 155 }, { "epoch": 0.0195, "grad_norm": 2.686328649520874, "grad_norm_var": 0.19023239802865194, "learning_rate": 0.0001, "loss": 1.4235, "loss/crossentropy": 2.2928433418273926, "loss/hidden": 1.2265625, "loss/logits": 0.19631928205490112, "loss/reg": 6.16170436842367e-05, "step": 156 }, { "epoch": 0.019625, "grad_norm": 2.6863300800323486, "grad_norm_var": 0.13134889378527811, "learning_rate": 0.0001, "loss": 1.4147, "loss/crossentropy": 2.289113759994507, "loss/hidden": 1.21875, "loss/logits": 0.19536322355270386, "loss/reg": 6.160605698823929e-05, "step": 157 }, { "epoch": 0.01975, "grad_norm": 3.7774782180786133, "grad_norm_var": 0.2373896188726722, "learning_rate": 0.0001, "loss": 1.3606, "loss/crossentropy": 2.4960098266601562, "loss/hidden": 1.171875, "loss/logits": 0.18812544643878937, "loss/reg": 6.159812619443983e-05, "step": 158 }, { "epoch": 0.019875, "grad_norm": 2.5556654930114746, "grad_norm_var": 0.23517615853210802, "learning_rate": 0.0001, "loss": 1.1015, "loss/crossentropy": 2.4794013500213623, "loss/hidden": 0.9609375, "loss/logits": 0.1399209052324295, "loss/reg": 6.158895121188834e-05, "step": 159 }, { "epoch": 0.02, "grad_norm": 2.3351266384124756, "grad_norm_var": 0.23772124659223212, "learning_rate": 0.0001, "loss": 1.1072, "loss/crossentropy": 2.402188301086426, "loss/hidden": 0.96484375, "loss/logits": 0.14173097908496857, "loss/reg": 6.158249016152695e-05, "step": 160 }, { "epoch": 0.020125, "grad_norm": 2.319366455078125, "grad_norm_var": 0.21752957054554395, "learning_rate": 0.0001, "loss": 1.1774, "loss/crossentropy": 2.1729917526245117, "loss/hidden": 1.0234375, "loss/logits": 0.15335121750831604, "loss/reg": 6.157202733447775e-05, "step": 161 }, { "epoch": 0.02025, "grad_norm": 2.0917341709136963, "grad_norm_var": 0.19926011430610652, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.276581048965454, "loss/hidden": 1.0859375, "loss/logits": 0.1577274203300476, "loss/reg": 6.156737799756229e-05, "step": 162 }, { "epoch": 0.020375, "grad_norm": 4.31035041809082, "grad_norm_var": 0.41637723338655513, "learning_rate": 0.0001, "loss": 1.8974, "loss/crossentropy": 2.6449058055877686, "loss/hidden": 1.5625, "loss/logits": 0.33430173993110657, "loss/reg": 6.156211748020723e-05, "step": 163 }, { "epoch": 0.0205, "grad_norm": 2.145301342010498, "grad_norm_var": 0.4064476055559296, "learning_rate": 0.0001, "loss": 1.2636, "loss/crossentropy": 2.613586664199829, "loss/hidden": 1.078125, "loss/logits": 0.1848127692937851, "loss/reg": 6.155785376904532e-05, "step": 164 }, { "epoch": 0.020625, "grad_norm": 3.6308248043060303, "grad_norm_var": 0.47796885273955964, "learning_rate": 0.0001, "loss": 1.2327, "loss/crossentropy": 2.599729537963867, "loss/hidden": 1.046875, "loss/logits": 0.1852511763572693, "loss/reg": 6.154972652439028e-05, "step": 165 }, { "epoch": 0.02075, "grad_norm": 2.812910556793213, "grad_norm_var": 0.4622733920417279, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.7171225547790527, "loss/hidden": 1.1875, "loss/logits": 0.20167264342308044, "loss/reg": 6.154461152618751e-05, "step": 166 }, { "epoch": 0.020875, "grad_norm": 2.4922893047332764, "grad_norm_var": 0.46203729327833537, "learning_rate": 0.0001, "loss": 1.3528, "loss/crossentropy": 2.648606777191162, "loss/hidden": 1.140625, "loss/logits": 0.21159711480140686, "loss/reg": 6.153558933874592e-05, "step": 167 }, { "epoch": 0.021, "grad_norm": 2.2380781173706055, "grad_norm_var": 0.47292652355391496, "learning_rate": 0.0001, "loss": 1.3863, "loss/crossentropy": 2.5556812286376953, "loss/hidden": 1.1796875, "loss/logits": 0.20603393018245697, "loss/reg": 6.152570131234825e-05, "step": 168 }, { "epoch": 0.021125, "grad_norm": 2.8179726600646973, "grad_norm_var": 0.4599538691877346, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.285341262817383, "loss/hidden": 1.140625, "loss/logits": 0.19030849635601044, "loss/reg": 6.151832349132746e-05, "step": 169 }, { "epoch": 0.02125, "grad_norm": 2.933023691177368, "grad_norm_var": 0.42080948451517297, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.254920482635498, "loss/hidden": 1.3828125, "loss/logits": 0.20900759100914001, "loss/reg": 6.151078559923917e-05, "step": 170 }, { "epoch": 0.021375, "grad_norm": 2.9309163093566895, "grad_norm_var": 0.38903358238886365, "learning_rate": 0.0001, "loss": 1.2104, "loss/crossentropy": 2.771516799926758, "loss/hidden": 1.0546875, "loss/logits": 0.15512725710868835, "loss/reg": 6.14999225945212e-05, "step": 171 }, { "epoch": 0.0215, "grad_norm": 2.7658286094665527, "grad_norm_var": 0.3882477326935183, "learning_rate": 0.0001, "loss": 1.2183, "loss/crossentropy": 2.565211296081543, "loss/hidden": 1.0546875, "loss/logits": 0.16297924518585205, "loss/reg": 6.149257387733087e-05, "step": 172 }, { "epoch": 0.021625, "grad_norm": 3.39176344871521, "grad_norm_var": 0.40840451933244426, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.4181013107299805, "loss/hidden": 1.1875, "loss/logits": 0.2049458771944046, "loss/reg": 6.148203829070553e-05, "step": 173 }, { "epoch": 0.02175, "grad_norm": 2.7971994876861572, "grad_norm_var": 0.3468190736041642, "learning_rate": 0.0001, "loss": 1.2467, "loss/crossentropy": 2.644824981689453, "loss/hidden": 1.0703125, "loss/logits": 0.17579111456871033, "loss/reg": 6.147275416878983e-05, "step": 174 }, { "epoch": 0.021875, "grad_norm": 7.143955707550049, "grad_norm_var": 1.5219747541806836, "learning_rate": 0.0001, "loss": 1.3279, "loss/crossentropy": 2.6274638175964355, "loss/hidden": 1.171875, "loss/logits": 0.15536972880363464, "loss/reg": 6.146173836896196e-05, "step": 175 }, { "epoch": 0.022, "grad_norm": 8.911324501037598, "grad_norm_var": 3.578509022301667, "learning_rate": 0.0001, "loss": 1.8863, "loss/crossentropy": 1.8980119228363037, "loss/hidden": 1.765625, "loss/logits": 0.12003660202026367, "loss/reg": 6.145203224150464e-05, "step": 176 }, { "epoch": 0.022125, "grad_norm": 2.14353609085083, "grad_norm_var": 3.6077286646662734, "learning_rate": 0.0001, "loss": 1.1573, "loss/crossentropy": 2.1538591384887695, "loss/hidden": 1.015625, "loss/logits": 0.1410439908504486, "loss/reg": 6.144325743662193e-05, "step": 177 }, { "epoch": 0.02225, "grad_norm": 4.625613212585449, "grad_norm_var": 3.542583274880191, "learning_rate": 0.0001, "loss": 1.6226, "loss/crossentropy": 2.7923362255096436, "loss/hidden": 1.375, "loss/logits": 0.24694563448429108, "loss/reg": 6.143252539914101e-05, "step": 178 }, { "epoch": 0.022375, "grad_norm": 2.543745517730713, "grad_norm_var": 3.5775446556342367, "learning_rate": 0.0001, "loss": 1.4192, "loss/crossentropy": 2.3237483501434326, "loss/hidden": 1.203125, "loss/logits": 0.21549411118030548, "loss/reg": 6.14215387031436e-05, "step": 179 }, { "epoch": 0.0225, "grad_norm": 2.3068995475769043, "grad_norm_var": 3.5495511663474453, "learning_rate": 0.0001, "loss": 1.2428, "loss/crossentropy": 2.7135560512542725, "loss/hidden": 1.0859375, "loss/logits": 0.1562565714120865, "loss/reg": 6.141421181382611e-05, "step": 180 }, { "epoch": 0.022625, "grad_norm": 3.465264081954956, "grad_norm_var": 3.5490467443763025, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 3.3183774948120117, "loss/hidden": 1.234375, "loss/logits": 0.2421126663684845, "loss/reg": 6.140418554423377e-05, "step": 181 }, { "epoch": 0.02275, "grad_norm": 2.696394205093384, "grad_norm_var": 3.5608805573030993, "learning_rate": 0.0001, "loss": 1.2269, "loss/crossentropy": 2.609964370727539, "loss/hidden": 1.0546875, "loss/logits": 0.17162814736366272, "loss/reg": 6.139430479379371e-05, "step": 182 }, { "epoch": 0.022875, "grad_norm": 2.3278727531433105, "grad_norm_var": 3.5849405900569513, "learning_rate": 0.0001, "loss": 1.0795, "loss/crossentropy": 2.753383159637451, "loss/hidden": 0.9453125, "loss/logits": 0.1335984170436859, "loss/reg": 6.138216122053564e-05, "step": 183 }, { "epoch": 0.023, "grad_norm": 2.4336531162261963, "grad_norm_var": 3.554360278579671, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.4162991046905518, "loss/hidden": 1.171875, "loss/logits": 0.22235547006130219, "loss/reg": 6.137174204923213e-05, "step": 184 }, { "epoch": 0.023125, "grad_norm": 2.420710802078247, "grad_norm_var": 3.601127481620784, "learning_rate": 0.0001, "loss": 1.4926, "loss/crossentropy": 2.30292010307312, "loss/hidden": 1.296875, "loss/logits": 0.19511133432388306, "loss/reg": 6.136245065135881e-05, "step": 185 }, { "epoch": 0.02325, "grad_norm": 2.727184534072876, "grad_norm_var": 3.6190579859970224, "learning_rate": 0.0001, "loss": 1.2816, "loss/crossentropy": 2.4605464935302734, "loss/hidden": 1.0703125, "loss/logits": 0.2107134908437729, "loss/reg": 6.135714647825807e-05, "step": 186 }, { "epoch": 0.023375, "grad_norm": 1.9292963743209839, "grad_norm_var": 3.754688597499932, "learning_rate": 0.0001, "loss": 1.1628, "loss/crossentropy": 2.5925047397613525, "loss/hidden": 1.0, "loss/logits": 0.16220712661743164, "loss/reg": 6.134893919806927e-05, "step": 187 }, { "epoch": 0.0235, "grad_norm": 2.1395771503448486, "grad_norm_var": 3.833355540800866, "learning_rate": 0.0001, "loss": 1.2712, "loss/crossentropy": 2.227994441986084, "loss/hidden": 1.0859375, "loss/logits": 0.18463259935379028, "loss/reg": 6.134230352472514e-05, "step": 188 }, { "epoch": 0.023625, "grad_norm": 3.552602529525757, "grad_norm_var": 3.8353265135005175, "learning_rate": 0.0001, "loss": 1.2518, "loss/crossentropy": 2.562777280807495, "loss/hidden": 1.0859375, "loss/logits": 0.16521015763282776, "loss/reg": 6.13337178947404e-05, "step": 189 }, { "epoch": 0.02375, "grad_norm": 2.766602039337158, "grad_norm_var": 3.8377842837978386, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.4200425148010254, "loss/hidden": 1.203125, "loss/logits": 0.1694013774394989, "loss/reg": 6.132431008154526e-05, "step": 190 }, { "epoch": 0.023875, "grad_norm": 2.403444528579712, "grad_norm_var": 2.8653780273055327, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.6963400840759277, "loss/hidden": 1.0078125, "loss/logits": 0.1566968709230423, "loss/reg": 6.132054841145873e-05, "step": 191 }, { "epoch": 0.024, "grad_norm": 2.0356028079986572, "grad_norm_var": 0.4806738598539164, "learning_rate": 0.0001, "loss": 1.4298, "loss/crossentropy": 2.174285650253296, "loss/hidden": 1.21875, "loss/logits": 0.21048110723495483, "loss/reg": 6.13146330579184e-05, "step": 192 }, { "epoch": 0.024125, "grad_norm": 2.501723051071167, "grad_norm_var": 0.4641524277019669, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.6477620601654053, "loss/hidden": 1.09375, "loss/logits": 0.17256709933280945, "loss/reg": 6.130609108367935e-05, "step": 193 }, { "epoch": 0.02425, "grad_norm": 2.8256325721740723, "grad_norm_var": 0.19964871735684203, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.4205310344696045, "loss/hidden": 1.1875, "loss/logits": 0.17588719725608826, "loss/reg": 6.129377288743854e-05, "step": 194 }, { "epoch": 0.024375, "grad_norm": 3.715850353240967, "grad_norm_var": 0.28183777248683595, "learning_rate": 0.0001, "loss": 1.4108, "loss/crossentropy": 2.5872642993927, "loss/hidden": 1.234375, "loss/logits": 0.1758473813533783, "loss/reg": 6.128078530309722e-05, "step": 195 }, { "epoch": 0.0245, "grad_norm": 3.3498318195343018, "grad_norm_var": 0.3034271167360647, "learning_rate": 0.0001, "loss": 1.3691, "loss/crossentropy": 2.6444506645202637, "loss/hidden": 1.171875, "loss/logits": 0.19665929675102234, "loss/reg": 6.126934749772772e-05, "step": 196 }, { "epoch": 0.024625, "grad_norm": 2.0526957511901855, "grad_norm_var": 0.2850787945150557, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.592327117919922, "loss/hidden": 1.0390625, "loss/logits": 0.16540399193763733, "loss/reg": 6.125810614321381e-05, "step": 197 }, { "epoch": 0.02475, "grad_norm": 2.4300317764282227, "grad_norm_var": 0.28670823409057716, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.36305570602417, "loss/hidden": 1.2890625, "loss/logits": 0.2389371693134308, "loss/reg": 6.124811625340953e-05, "step": 198 }, { "epoch": 0.024875, "grad_norm": 2.3255856037139893, "grad_norm_var": 0.28679178178242776, "learning_rate": 0.0001, "loss": 1.1743, "loss/crossentropy": 2.0803394317626953, "loss/hidden": 1.03125, "loss/logits": 0.1424179971218109, "loss/reg": 6.124229548731819e-05, "step": 199 }, { "epoch": 0.025, "grad_norm": 2.2634005546569824, "grad_norm_var": 0.2923937566916393, "learning_rate": 0.0001, "loss": 1.2619, "loss/crossentropy": 2.427354574203491, "loss/hidden": 1.0859375, "loss/logits": 0.1753256618976593, "loss/reg": 6.123317871242762e-05, "step": 200 }, { "epoch": 0.025125, "grad_norm": 2.789698839187622, "grad_norm_var": 0.292575209213462, "learning_rate": 0.0001, "loss": 1.2794, "loss/crossentropy": 2.4137160778045654, "loss/hidden": 1.1328125, "loss/logits": 0.14599566161632538, "loss/reg": 6.122920603957027e-05, "step": 201 }, { "epoch": 0.02525, "grad_norm": 2.23150897026062, "grad_norm_var": 0.3003877767651639, "learning_rate": 0.0001, "loss": 1.2906, "loss/crossentropy": 2.502619743347168, "loss/hidden": 1.09375, "loss/logits": 0.19620737433433533, "loss/reg": 6.122409831732512e-05, "step": 202 }, { "epoch": 0.025375, "grad_norm": 3.3167238235473633, "grad_norm_var": 0.2999410613935005, "learning_rate": 0.0001, "loss": 1.4511, "loss/crossentropy": 2.5889461040496826, "loss/hidden": 1.2265625, "loss/logits": 0.2239363044500351, "loss/reg": 6.122187187429518e-05, "step": 203 }, { "epoch": 0.0255, "grad_norm": 2.5847971439361572, "grad_norm_var": 0.28091485279191464, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.4720451831817627, "loss/hidden": 1.078125, "loss/logits": 0.16930653154850006, "loss/reg": 6.120974285295233e-05, "step": 204 }, { "epoch": 0.025625, "grad_norm": 2.071563243865967, "grad_norm_var": 0.24897236933793085, "learning_rate": 0.0001, "loss": 1.1016, "loss/crossentropy": 2.5648884773254395, "loss/hidden": 0.96875, "loss/logits": 0.13218875229358673, "loss/reg": 6.120166654000059e-05, "step": 205 }, { "epoch": 0.02575, "grad_norm": 2.9454479217529297, "grad_norm_var": 0.2548478796483238, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 2.607356309890747, "loss/hidden": 1.15625, "loss/logits": 0.20053817331790924, "loss/reg": 6.119644967839122e-05, "step": 206 }, { "epoch": 0.025875, "grad_norm": 3.396070718765259, "grad_norm_var": 0.28840087929906133, "learning_rate": 0.0001, "loss": 1.1743, "loss/crossentropy": 2.682058334350586, "loss/hidden": 1.0078125, "loss/logits": 0.16590501368045807, "loss/reg": 6.11838695476763e-05, "step": 207 }, { "epoch": 0.026, "grad_norm": 2.4477601051330566, "grad_norm_var": 0.26375613878289506, "learning_rate": 0.0001, "loss": 1.3022, "loss/crossentropy": 2.819031000137329, "loss/hidden": 1.109375, "loss/logits": 0.19222432374954224, "loss/reg": 6.117635348346084e-05, "step": 208 }, { "epoch": 0.026125, "grad_norm": 2.5916216373443604, "grad_norm_var": 0.2618484053528464, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.529510259628296, "loss/hidden": 1.15625, "loss/logits": 0.19612029194831848, "loss/reg": 6.116151052992791e-05, "step": 209 }, { "epoch": 0.02625, "grad_norm": 2.108261823654175, "grad_norm_var": 0.28282181699858694, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.3222012519836426, "loss/hidden": 1.09375, "loss/logits": 0.18379396200180054, "loss/reg": 6.114997813710943e-05, "step": 210 }, { "epoch": 0.026375, "grad_norm": 2.48710560798645, "grad_norm_var": 0.20482550381518247, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.6183624267578125, "loss/hidden": 1.0859375, "loss/logits": 0.18522073328495026, "loss/reg": 6.114102870924398e-05, "step": 211 }, { "epoch": 0.0265, "grad_norm": 2.63779616355896, "grad_norm_var": 0.1640915083279668, "learning_rate": 0.0001, "loss": 1.3499, "loss/crossentropy": 2.391116142272949, "loss/hidden": 1.1640625, "loss/logits": 0.18524512648582458, "loss/reg": 6.112866685725749e-05, "step": 212 }, { "epoch": 0.026625, "grad_norm": 2.7476329803466797, "grad_norm_var": 0.14889028663519804, "learning_rate": 0.0001, "loss": 1.2842, "loss/crossentropy": 2.5770251750946045, "loss/hidden": 1.1171875, "loss/logits": 0.16641706228256226, "loss/reg": 6.111864786362275e-05, "step": 213 }, { "epoch": 0.02675, "grad_norm": 2.565723419189453, "grad_norm_var": 0.14722036218699916, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.80257248878479, "loss/hidden": 1.0546875, "loss/logits": 0.18279102444648743, "loss/reg": 6.110716640250757e-05, "step": 214 }, { "epoch": 0.026875, "grad_norm": 4.107775688171387, "grad_norm_var": 0.2818514081658729, "learning_rate": 0.0001, "loss": 1.5243, "loss/crossentropy": 2.4806065559387207, "loss/hidden": 1.3046875, "loss/logits": 0.2190462350845337, "loss/reg": 6.109999230829999e-05, "step": 215 }, { "epoch": 0.027, "grad_norm": 2.3829445838928223, "grad_norm_var": 0.27569299833046823, "learning_rate": 0.0001, "loss": 1.2079, "loss/crossentropy": 2.466684579849243, "loss/hidden": 1.046875, "loss/logits": 0.16046380996704102, "loss/reg": 6.108790694270283e-05, "step": 216 }, { "epoch": 0.027125, "grad_norm": 2.554863929748535, "grad_norm_var": 0.2767468455530223, "learning_rate": 0.0001, "loss": 1.1988, "loss/crossentropy": 2.582035541534424, "loss/hidden": 1.046875, "loss/logits": 0.15130122005939484, "loss/reg": 6.1076192650944e-05, "step": 217 }, { "epoch": 0.02725, "grad_norm": 2.7898809909820557, "grad_norm_var": 0.26145832144768877, "learning_rate": 0.0001, "loss": 1.6592, "loss/crossentropy": 2.655186414718628, "loss/hidden": 1.3984375, "loss/logits": 0.26013702154159546, "loss/reg": 6.107001536292955e-05, "step": 218 }, { "epoch": 0.027375, "grad_norm": 2.7881548404693604, "grad_norm_var": 0.2378165583524293, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.4413743019104004, "loss/hidden": 1.3203125, "loss/logits": 0.2241469919681549, "loss/reg": 6.106249202275649e-05, "step": 219 }, { "epoch": 0.0275, "grad_norm": 2.2896728515625, "grad_norm_var": 0.24781162791184835, "learning_rate": 0.0001, "loss": 1.2198, "loss/crossentropy": 2.4421772956848145, "loss/hidden": 1.0703125, "loss/logits": 0.14890027046203613, "loss/reg": 6.105640932219103e-05, "step": 220 }, { "epoch": 0.027625, "grad_norm": 2.324869155883789, "grad_norm_var": 0.23120432182346703, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.526216745376587, "loss/hidden": 1.140625, "loss/logits": 0.19898337125778198, "loss/reg": 6.10438291914761e-05, "step": 221 }, { "epoch": 0.02775, "grad_norm": 2.88158917427063, "grad_norm_var": 0.22935101127255847, "learning_rate": 0.0001, "loss": 1.372, "loss/crossentropy": 2.361729621887207, "loss/hidden": 1.15625, "loss/logits": 0.21510916948318481, "loss/reg": 6.10318202234339e-05, "step": 222 }, { "epoch": 0.027875, "grad_norm": 2.9760019779205322, "grad_norm_var": 0.20104925696453316, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.5573909282684326, "loss/hidden": 1.1171875, "loss/logits": 0.1747477501630783, "loss/reg": 6.1027145420666784e-05, "step": 223 }, { "epoch": 0.028, "grad_norm": 2.702091932296753, "grad_norm_var": 0.19763696198550798, "learning_rate": 0.0001, "loss": 1.3524, "loss/crossentropy": 2.717195510864258, "loss/hidden": 1.15625, "loss/logits": 0.19553202390670776, "loss/reg": 6.1014961829641834e-05, "step": 224 }, { "epoch": 0.028125, "grad_norm": 2.1232945919036865, "grad_norm_var": 0.21708226542899425, "learning_rate": 0.0001, "loss": 1.2661, "loss/crossentropy": 2.4481968879699707, "loss/hidden": 1.0859375, "loss/logits": 0.1795472800731659, "loss/reg": 6.100164682720788e-05, "step": 225 }, { "epoch": 0.02825, "grad_norm": 2.191066026687622, "grad_norm_var": 0.2114830183011783, "learning_rate": 0.0001, "loss": 1.1895, "loss/crossentropy": 2.34470534324646, "loss/hidden": 1.03125, "loss/logits": 0.15763415396213531, "loss/reg": 6.099118763813749e-05, "step": 226 }, { "epoch": 0.028375, "grad_norm": 2.3068013191223145, "grad_norm_var": 0.21765702233228598, "learning_rate": 0.0001, "loss": 1.539, "loss/crossentropy": 2.5549845695495605, "loss/hidden": 1.328125, "loss/logits": 0.21025767922401428, "loss/reg": 6.09817034273874e-05, "step": 227 }, { "epoch": 0.0285, "grad_norm": 2.890655279159546, "grad_norm_var": 0.221304562186567, "learning_rate": 0.0001, "loss": 1.5638, "loss/crossentropy": 2.2339606285095215, "loss/hidden": 1.34375, "loss/logits": 0.21939440071582794, "loss/reg": 6.096933429944329e-05, "step": 228 }, { "epoch": 0.028625, "grad_norm": 2.182521343231201, "grad_norm_var": 0.2349577927735633, "learning_rate": 0.0001, "loss": 1.2085, "loss/crossentropy": 2.641230583190918, "loss/hidden": 1.046875, "loss/logits": 0.161014586687088, "loss/reg": 6.095720891607925e-05, "step": 229 }, { "epoch": 0.02875, "grad_norm": 2.704406976699829, "grad_norm_var": 0.23499684870281476, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.6833486557006836, "loss/hidden": 1.15625, "loss/logits": 0.18876385688781738, "loss/reg": 6.094613127061166e-05, "step": 230 }, { "epoch": 0.028875, "grad_norm": 3.4925310611724854, "grad_norm_var": 0.13802667852219105, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.1604089736938477, "loss/hidden": 1.1953125, "loss/logits": 0.17500904202461243, "loss/reg": 6.093499541748315e-05, "step": 231 }, { "epoch": 0.029, "grad_norm": 2.344773530960083, "grad_norm_var": 0.13921650701028032, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.493307113647461, "loss/hidden": 1.25, "loss/logits": 0.22193682193756104, "loss/reg": 6.092391777201556e-05, "step": 232 }, { "epoch": 0.029125, "grad_norm": 1.8828089237213135, "grad_norm_var": 0.17117140448626647, "learning_rate": 0.0001, "loss": 1.1104, "loss/crossentropy": 2.5302743911743164, "loss/hidden": 0.9765625, "loss/logits": 0.1331850290298462, "loss/reg": 6.0912472690688446e-05, "step": 233 }, { "epoch": 0.02925, "grad_norm": 2.747770071029663, "grad_norm_var": 0.16996031408720758, "learning_rate": 0.0001, "loss": 1.1371, "loss/crossentropy": 2.4189980030059814, "loss/hidden": 0.99609375, "loss/logits": 0.14035619795322418, "loss/reg": 6.089695307309739e-05, "step": 234 }, { "epoch": 0.029375, "grad_norm": 1.8742481470108032, "grad_norm_var": 0.1933626604088189, "learning_rate": 0.0001, "loss": 1.1601, "loss/crossentropy": 2.2694003582000732, "loss/hidden": 1.015625, "loss/logits": 0.14385350048542023, "loss/reg": 6.088387090130709e-05, "step": 235 }, { "epoch": 0.0295, "grad_norm": 2.0313689708709717, "grad_norm_var": 0.20459374724346724, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.4902865886688232, "loss/hidden": 1.0703125, "loss/logits": 0.17369529604911804, "loss/reg": 6.086897337809205e-05, "step": 236 }, { "epoch": 0.029625, "grad_norm": 2.3882880210876465, "grad_norm_var": 0.20354561810974156, "learning_rate": 0.0001, "loss": 1.3947, "loss/crossentropy": 2.4032340049743652, "loss/hidden": 1.1875, "loss/logits": 0.20656049251556396, "loss/reg": 6.085408676881343e-05, "step": 237 }, { "epoch": 0.02975, "grad_norm": 1.7327938079833984, "grad_norm_var": 0.22490130088653987, "learning_rate": 0.0001, "loss": 1.1777, "loss/crossentropy": 2.4949777126312256, "loss/hidden": 1.015625, "loss/logits": 0.1614799201488495, "loss/reg": 6.084307824494317e-05, "step": 238 }, { "epoch": 0.029875, "grad_norm": 2.2483370304107666, "grad_norm_var": 0.20314943964483845, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.5907418727874756, "loss/hidden": 1.1328125, "loss/logits": 0.19753864407539368, "loss/reg": 6.0828475398011506e-05, "step": 239 }, { "epoch": 0.03, "grad_norm": 2.5151193141937256, "grad_norm_var": 0.19693662117647784, "learning_rate": 0.0001, "loss": 1.2278, "loss/crossentropy": 2.6233856678009033, "loss/hidden": 1.0546875, "loss/logits": 0.1725194901227951, "loss/reg": 6.0820282669737935e-05, "step": 240 }, { "epoch": 0.030125, "grad_norm": 2.198249101638794, "grad_norm_var": 0.19498660957211478, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.368884563446045, "loss/hidden": 0.99609375, "loss/logits": 0.1473642736673355, "loss/reg": 6.0812566516688094e-05, "step": 241 }, { "epoch": 0.03025, "grad_norm": 2.195218563079834, "grad_norm_var": 0.1948951313244331, "learning_rate": 0.0001, "loss": 1.2993, "loss/crossentropy": 2.352041721343994, "loss/hidden": 1.1171875, "loss/logits": 0.1815069168806076, "loss/reg": 6.080829552956857e-05, "step": 242 }, { "epoch": 0.030375, "grad_norm": 2.6142425537109375, "grad_norm_var": 0.19868367561009795, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.497286558151245, "loss/hidden": 1.1875, "loss/logits": 0.17629210650920868, "loss/reg": 6.0799306083936244e-05, "step": 243 }, { "epoch": 0.0305, "grad_norm": 2.342033624649048, "grad_norm_var": 0.1799734399041227, "learning_rate": 0.0001, "loss": 1.1311, "loss/crossentropy": 2.5182478427886963, "loss/hidden": 0.984375, "loss/logits": 0.1461625099182129, "loss/reg": 6.078776277718134e-05, "step": 244 }, { "epoch": 0.030625, "grad_norm": 2.3943874835968018, "grad_norm_var": 0.17823371257387344, "learning_rate": 0.0001, "loss": 1.1773, "loss/crossentropy": 2.575707197189331, "loss/hidden": 1.015625, "loss/logits": 0.1610667109489441, "loss/reg": 6.078143633203581e-05, "step": 245 }, { "epoch": 0.03075, "grad_norm": 2.2752902507781982, "grad_norm_var": 0.16984605758260846, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.228628635406494, "loss/hidden": 1.1484375, "loss/logits": 0.18314987421035767, "loss/reg": 6.077219222788699e-05, "step": 246 }, { "epoch": 0.030875, "grad_norm": 2.1779940128326416, "grad_norm_var": 0.07406002979102144, "learning_rate": 0.0001, "loss": 1.179, "loss/crossentropy": 2.4325718879699707, "loss/hidden": 1.0078125, "loss/logits": 0.17062756419181824, "loss/reg": 6.076457793824375e-05, "step": 247 }, { "epoch": 0.031, "grad_norm": 2.031386613845825, "grad_norm_var": 0.07614130749575872, "learning_rate": 0.0001, "loss": 1.3177, "loss/crossentropy": 2.3050920963287354, "loss/hidden": 1.1328125, "loss/logits": 0.18426315486431122, "loss/reg": 6.075216515455395e-05, "step": 248 }, { "epoch": 0.031125, "grad_norm": 2.4880683422088623, "grad_norm_var": 0.07117238958467732, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.690160036087036, "loss/hidden": 1.0625, "loss/logits": 0.1985635757446289, "loss/reg": 6.0742688219761476e-05, "step": 249 }, { "epoch": 0.03125, "grad_norm": 2.631229877471924, "grad_norm_var": 0.06453399427719399, "learning_rate": 0.0001, "loss": 1.3072, "loss/crossentropy": 2.4459030628204346, "loss/hidden": 1.109375, "loss/logits": 0.1971898078918457, "loss/reg": 6.0733007558155805e-05, "step": 250 }, { "epoch": 0.031375, "grad_norm": 2.7028048038482666, "grad_norm_var": 0.06497512863382227, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.7830824851989746, "loss/hidden": 1.1796875, "loss/logits": 0.18533006310462952, "loss/reg": 6.0722686612280086e-05, "step": 251 }, { "epoch": 0.0315, "grad_norm": 3.7025880813598633, "grad_norm_var": 0.17735395269518506, "learning_rate": 0.0001, "loss": 1.2542, "loss/crossentropy": 2.4722542762756348, "loss/hidden": 1.078125, "loss/logits": 0.17551761865615845, "loss/reg": 6.0708127421094105e-05, "step": 252 }, { "epoch": 0.031625, "grad_norm": 2.1496498584747314, "grad_norm_var": 0.18175923180052275, "learning_rate": 0.0001, "loss": 1.0403, "loss/crossentropy": 2.4383487701416016, "loss/hidden": 0.91015625, "loss/logits": 0.12949630618095398, "loss/reg": 6.069323717383668e-05, "step": 253 }, { "epoch": 0.03175, "grad_norm": 3.212991237640381, "grad_norm_var": 0.18702365671043306, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.1896352767944336, "loss/hidden": 1.1953125, "loss/logits": 0.1595323085784912, "loss/reg": 6.067836147849448e-05, "step": 254 }, { "epoch": 0.031875, "grad_norm": 2.53044056892395, "grad_norm_var": 0.18281462084492142, "learning_rate": 0.0001, "loss": 1.2462, "loss/crossentropy": 2.8005239963531494, "loss/hidden": 1.0625, "loss/logits": 0.18304814398288727, "loss/reg": 6.0668298829114065e-05, "step": 255 }, { "epoch": 0.032, "grad_norm": 5.920226573944092, "grad_norm_var": 0.9097630014084027, "learning_rate": 0.0001, "loss": 1.9011, "loss/crossentropy": 2.2827932834625244, "loss/hidden": 1.59375, "loss/logits": 0.3067648708820343, "loss/reg": 6.0657377616735175e-05, "step": 256 }, { "epoch": 0.032125, "grad_norm": 3.144649028778076, "grad_norm_var": 0.8995354429829506, "learning_rate": 0.0001, "loss": 1.2361, "loss/crossentropy": 2.9163215160369873, "loss/hidden": 1.078125, "loss/logits": 0.15732741355895996, "loss/reg": 6.064687840989791e-05, "step": 257 }, { "epoch": 0.03225, "grad_norm": 2.677065849304199, "grad_norm_var": 0.8763431299745091, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.9036660194396973, "loss/hidden": 1.125, "loss/logits": 0.18664765357971191, "loss/reg": 6.0635462432401255e-05, "step": 258 }, { "epoch": 0.032375, "grad_norm": 1.9815617799758911, "grad_norm_var": 0.9180593253885627, "learning_rate": 0.0001, "loss": 1.2567, "loss/crossentropy": 2.6647751331329346, "loss/hidden": 1.0703125, "loss/logits": 0.18578888475894928, "loss/reg": 6.062128159101121e-05, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.6094260215759277, "grad_norm_var": 0.9071755924568459, "learning_rate": 0.0001, "loss": 1.4176, "loss/crossentropy": 2.9915220737457275, "loss/hidden": 1.21875, "loss/logits": 0.19824379682540894, "loss/reg": 6.060625673853792e-05, "step": 260 }, { "epoch": 0.032625, "grad_norm": 2.4859585762023926, "grad_norm_var": 0.9028772625757899, "learning_rate": 0.0001, "loss": 1.2047, "loss/crossentropy": 2.325611114501953, "loss/hidden": 1.03125, "loss/logits": 0.17281952500343323, "loss/reg": 6.0591693909373134e-05, "step": 261 }, { "epoch": 0.03275, "grad_norm": 4.910043716430664, "grad_norm_var": 1.154144117287072, "learning_rate": 0.0001, "loss": 1.2858, "loss/crossentropy": 2.568098306655884, "loss/hidden": 1.109375, "loss/logits": 0.17582398653030396, "loss/reg": 6.057979408069514e-05, "step": 262 }, { "epoch": 0.032875, "grad_norm": 2.2592694759368896, "grad_norm_var": 1.1460852387432343, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.5264766216278076, "loss/hidden": 1.1171875, "loss/logits": 0.19776055216789246, "loss/reg": 6.056776692275889e-05, "step": 263 }, { "epoch": 0.033, "grad_norm": 2.6964571475982666, "grad_norm_var": 1.0909556269012999, "learning_rate": 0.0001, "loss": 1.0468, "loss/crossentropy": 2.740647792816162, "loss/hidden": 0.91796875, "loss/logits": 0.12825211882591248, "loss/reg": 6.0556718381121755e-05, "step": 264 }, { "epoch": 0.033125, "grad_norm": 2.112201690673828, "grad_norm_var": 1.125761935491216, "learning_rate": 0.0001, "loss": 1.2175, "loss/crossentropy": 2.475130081176758, "loss/hidden": 1.0390625, "loss/logits": 0.1778050661087036, "loss/reg": 6.0543683503055945e-05, "step": 265 }, { "epoch": 0.03325, "grad_norm": 1.8527328968048096, "grad_norm_var": 1.2001448152569836, "learning_rate": 0.0001, "loss": 1.1913, "loss/crossentropy": 2.2017788887023926, "loss/hidden": 1.0234375, "loss/logits": 0.16727614402770996, "loss/reg": 6.053145989426412e-05, "step": 266 }, { "epoch": 0.033375, "grad_norm": 2.2294929027557373, "grad_norm_var": 1.2287526925730277, "learning_rate": 0.0001, "loss": 1.3521, "loss/crossentropy": 2.268073558807373, "loss/hidden": 1.1640625, "loss/logits": 0.18739831447601318, "loss/reg": 6.052442768123001e-05, "step": 267 }, { "epoch": 0.0335, "grad_norm": 2.185410499572754, "grad_norm_var": 1.2112062552861744, "learning_rate": 0.0001, "loss": 1.44, "loss/crossentropy": 2.390622138977051, "loss/hidden": 1.234375, "loss/logits": 0.20500804483890533, "loss/reg": 6.051711898180656e-05, "step": 268 }, { "epoch": 0.033625, "grad_norm": 2.616452693939209, "grad_norm_var": 1.1837342905938153, "learning_rate": 0.0001, "loss": 1.3338, "loss/crossentropy": 2.3374340534210205, "loss/hidden": 1.15625, "loss/logits": 0.17693625390529633, "loss/reg": 6.0506343288579956e-05, "step": 269 }, { "epoch": 0.03375, "grad_norm": 2.5214874744415283, "grad_norm_var": 1.1791403953024882, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.6334807872772217, "loss/hidden": 1.25, "loss/logits": 0.20655225217342377, "loss/reg": 6.0493421187857166e-05, "step": 270 }, { "epoch": 0.033875, "grad_norm": 2.3426766395568848, "grad_norm_var": 1.18798729537596, "learning_rate": 0.0001, "loss": 1.2858, "loss/crossentropy": 2.362666130065918, "loss/hidden": 1.1171875, "loss/logits": 0.16799038648605347, "loss/reg": 6.047951683285646e-05, "step": 271 }, { "epoch": 0.034, "grad_norm": 2.483227491378784, "grad_norm_var": 0.4891016266434789, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.6330323219299316, "loss/hidden": 1.203125, "loss/logits": 0.20882482826709747, "loss/reg": 6.046749331289902e-05, "step": 272 }, { "epoch": 0.034125, "grad_norm": 3.3453869819641113, "grad_norm_var": 0.5070205087741229, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.6637308597564697, "loss/hidden": 1.171875, "loss/logits": 0.20059773325920105, "loss/reg": 6.0458773077698424e-05, "step": 273 }, { "epoch": 0.03425, "grad_norm": 2.2971482276916504, "grad_norm_var": 0.5112160036914843, "learning_rate": 0.0001, "loss": 1.3516, "loss/crossentropy": 2.400428533554077, "loss/hidden": 1.1640625, "loss/logits": 0.18688717484474182, "loss/reg": 6.0452930483734235e-05, "step": 274 }, { "epoch": 0.034375, "grad_norm": 11.117164611816406, "grad_norm_var": 5.025199240890341, "learning_rate": 0.0001, "loss": 2.1956, "loss/crossentropy": 2.7653286457061768, "loss/hidden": 1.8984375, "loss/logits": 0.2965186834335327, "loss/reg": 6.045090049155988e-05, "step": 275 }, { "epoch": 0.0345, "grad_norm": 3.6517550945281982, "grad_norm_var": 5.020888752799834, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.8897998332977295, "loss/hidden": 1.1484375, "loss/logits": 0.26139265298843384, "loss/reg": 6.0451366152847186e-05, "step": 276 }, { "epoch": 0.034625, "grad_norm": 2.6342201232910156, "grad_norm_var": 5.008262345647254, "learning_rate": 0.0001, "loss": 1.272, "loss/crossentropy": 2.662801504135132, "loss/hidden": 1.09375, "loss/logits": 0.17764705419540405, "loss/reg": 6.0443973779911175e-05, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.613866090774536, "grad_norm_var": 4.815302301096653, "learning_rate": 0.0001, "loss": 1.3, "loss/crossentropy": 2.2599401473999023, "loss/hidden": 1.125, "loss/logits": 0.1744215488433838, "loss/reg": 6.04407032369636e-05, "step": 278 }, { "epoch": 0.034875, "grad_norm": 2.4121639728546143, "grad_norm_var": 4.800441045565859, "learning_rate": 0.0001, "loss": 1.2736, "loss/crossentropy": 2.3868885040283203, "loss/hidden": 1.109375, "loss/logits": 0.16360533237457275, "loss/reg": 6.0438182117650285e-05, "step": 279 }, { "epoch": 0.035, "grad_norm": 2.257427930831909, "grad_norm_var": 4.834324037466968, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.452359914779663, "loss/hidden": 1.1328125, "loss/logits": 0.19017404317855835, "loss/reg": 6.043619578122161e-05, "step": 280 }, { "epoch": 0.035125, "grad_norm": 2.3916571140289307, "grad_norm_var": 4.8045581397439525, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.4834201335906982, "loss/hidden": 1.109375, "loss/logits": 0.20611721277236938, "loss/reg": 6.043669054633938e-05, "step": 281 }, { "epoch": 0.03525, "grad_norm": 2.815398931503296, "grad_norm_var": 4.707581175884913, "learning_rate": 0.0001, "loss": 1.1312, "loss/crossentropy": 3.0801713466644287, "loss/hidden": 0.98828125, "loss/logits": 0.14229975640773773, "loss/reg": 6.044648034730926e-05, "step": 282 }, { "epoch": 0.035375, "grad_norm": 3.1715469360351562, "grad_norm_var": 4.651233430019207, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.354785919189453, "loss/hidden": 1.1953125, "loss/logits": 0.21305763721466064, "loss/reg": 6.0437832871684805e-05, "step": 283 }, { "epoch": 0.0355, "grad_norm": 2.5010037422180176, "grad_norm_var": 4.615667456235268, "learning_rate": 0.0001, "loss": 1.3572, "loss/crossentropy": 2.492047071456909, "loss/hidden": 1.1640625, "loss/logits": 0.1925477683544159, "loss/reg": 6.044709516572766e-05, "step": 284 }, { "epoch": 0.035625, "grad_norm": 1.964429259300232, "grad_norm_var": 4.6928209367171645, "learning_rate": 0.0001, "loss": 1.1671, "loss/crossentropy": 2.3351125717163086, "loss/hidden": 0.99609375, "loss/logits": 0.1704423427581787, "loss/reg": 6.0453679907368496e-05, "step": 285 }, { "epoch": 0.03575, "grad_norm": 2.3656678199768066, "grad_norm_var": 4.707552916907375, "learning_rate": 0.0001, "loss": 1.5385, "loss/crossentropy": 2.4216158390045166, "loss/hidden": 1.28125, "loss/logits": 0.2566841244697571, "loss/reg": 6.0443537222454324e-05, "step": 286 }, { "epoch": 0.035875, "grad_norm": 3.140928030014038, "grad_norm_var": 4.661686527481659, "learning_rate": 0.0001, "loss": 1.3637, "loss/crossentropy": 2.8347983360290527, "loss/hidden": 1.15625, "loss/logits": 0.20682096481323242, "loss/reg": 6.043089888407849e-05, "step": 287 }, { "epoch": 0.036, "grad_norm": 2.6460797786712646, "grad_norm_var": 4.647830565858565, "learning_rate": 0.0001, "loss": 1.3928, "loss/crossentropy": 2.108215093612671, "loss/hidden": 1.2109375, "loss/logits": 0.18129181861877441, "loss/reg": 6.042820677976124e-05, "step": 288 }, { "epoch": 0.036125, "grad_norm": 2.879531145095825, "grad_norm_var": 4.652852381956769, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.90163516998291, "loss/hidden": 1.25, "loss/logits": 0.1853410005569458, "loss/reg": 6.042792301741429e-05, "step": 289 }, { "epoch": 0.03625, "grad_norm": 2.5701370239257812, "grad_norm_var": 4.625421100051376, "learning_rate": 0.0001, "loss": 1.3639, "loss/crossentropy": 2.6896326541900635, "loss/hidden": 1.15625, "loss/logits": 0.2070741057395935, "loss/reg": 6.0414979088818654e-05, "step": 290 }, { "epoch": 0.036375, "grad_norm": 2.988196849822998, "grad_norm_var": 0.16977142791367086, "learning_rate": 0.0001, "loss": 1.103, "loss/crossentropy": 2.8485705852508545, "loss/hidden": 0.96875, "loss/logits": 0.13362044095993042, "loss/reg": 6.0413527535274625e-05, "step": 291 }, { "epoch": 0.0365, "grad_norm": 5.9153923988342285, "grad_norm_var": 0.7809789933836029, "learning_rate": 0.0001, "loss": 1.6292, "loss/crossentropy": 2.607590436935425, "loss/hidden": 1.4375, "loss/logits": 0.19109681248664856, "loss/reg": 6.041422238922678e-05, "step": 292 }, { "epoch": 0.036625, "grad_norm": 1.932381510734558, "grad_norm_var": 0.8300136192923785, "learning_rate": 0.0001, "loss": 1.1314, "loss/crossentropy": 2.2319207191467285, "loss/hidden": 0.9921875, "loss/logits": 0.13856041431427002, "loss/reg": 6.041810775059275e-05, "step": 293 }, { "epoch": 0.03675, "grad_norm": 2.1218042373657227, "grad_norm_var": 0.8563980373093443, "learning_rate": 0.0001, "loss": 1.1898, "loss/crossentropy": 2.7033910751342773, "loss/hidden": 1.03125, "loss/logits": 0.15791726112365723, "loss/reg": 6.0404745454434305e-05, "step": 294 }, { "epoch": 0.036875, "grad_norm": 3.239748954772949, "grad_norm_var": 0.8614170936653748, "learning_rate": 0.0001, "loss": 1.6186, "loss/crossentropy": 2.3478281497955322, "loss/hidden": 1.3671875, "loss/logits": 0.2507687509059906, "loss/reg": 6.039286745362915e-05, "step": 295 }, { "epoch": 0.037, "grad_norm": 2.361431121826172, "grad_norm_var": 0.8544814148079373, "learning_rate": 0.0001, "loss": 1.2822, "loss/crossentropy": 2.4396111965179443, "loss/hidden": 1.0859375, "loss/logits": 0.19569119811058044, "loss/reg": 6.0390335420379415e-05, "step": 296 }, { "epoch": 0.037125, "grad_norm": 2.6921112537384033, "grad_norm_var": 0.8432509023111928, "learning_rate": 0.0001, "loss": 1.3584, "loss/crossentropy": 2.3235762119293213, "loss/hidden": 1.15625, "loss/logits": 0.20157676935195923, "loss/reg": 6.037576531525701e-05, "step": 297 }, { "epoch": 0.03725, "grad_norm": 2.2376601696014404, "grad_norm_var": 0.8653611900667765, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.441978693008423, "loss/hidden": 1.1875, "loss/logits": 0.1821848303079605, "loss/reg": 6.036146805854514e-05, "step": 298 }, { "epoch": 0.037375, "grad_norm": 2.5022082328796387, "grad_norm_var": 0.8598019948407729, "learning_rate": 0.0001, "loss": 1.2909, "loss/crossentropy": 2.4099972248077393, "loss/hidden": 1.09375, "loss/logits": 0.1965959370136261, "loss/reg": 6.035445403540507e-05, "step": 299 }, { "epoch": 0.0375, "grad_norm": 2.323599338531494, "grad_norm_var": 0.8677455500426021, "learning_rate": 0.0001, "loss": 1.2301, "loss/crossentropy": 2.714334011077881, "loss/hidden": 1.0625, "loss/logits": 0.16703477501869202, "loss/reg": 6.034153193468228e-05, "step": 300 }, { "epoch": 0.037625, "grad_norm": 2.902794361114502, "grad_norm_var": 0.8254198045813945, "learning_rate": 0.0001, "loss": 1.287, "loss/crossentropy": 2.5897319316864014, "loss/hidden": 1.1171875, "loss/logits": 0.1692187488079071, "loss/reg": 6.032464443705976e-05, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.455423355102539, "grad_norm_var": 0.8207107650276014, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.2553625106811523, "loss/hidden": 1.15625, "loss/logits": 0.15494795143604279, "loss/reg": 6.031416342011653e-05, "step": 302 }, { "epoch": 0.037875, "grad_norm": 2.70770001411438, "grad_norm_var": 0.8131429553718594, "learning_rate": 0.0001, "loss": 1.3645, "loss/crossentropy": 2.298628807067871, "loss/hidden": 1.1875, "loss/logits": 0.17642799019813538, "loss/reg": 6.029937867424451e-05, "step": 303 }, { "epoch": 0.038, "grad_norm": 2.4096872806549072, "grad_norm_var": 0.8208490888498592, "learning_rate": 0.0001, "loss": 1.2573, "loss/crossentropy": 2.6787161827087402, "loss/hidden": 1.078125, "loss/logits": 0.17861339449882507, "loss/reg": 6.027881318004802e-05, "step": 304 }, { "epoch": 0.038125, "grad_norm": 2.364800214767456, "grad_norm_var": 0.8295471446711137, "learning_rate": 0.0001, "loss": 1.3251, "loss/crossentropy": 2.351970911026001, "loss/hidden": 1.140625, "loss/logits": 0.18391045928001404, "loss/reg": 6.026409027981572e-05, "step": 305 }, { "epoch": 0.03825, "grad_norm": 2.0991923809051514, "grad_norm_var": 0.8536240669336511, "learning_rate": 0.0001, "loss": 1.078, "loss/crossentropy": 2.7187068462371826, "loss/hidden": 0.9453125, "loss/logits": 0.13205038011074066, "loss/reg": 6.0248257796047255e-05, "step": 306 }, { "epoch": 0.038375, "grad_norm": 2.7471582889556885, "grad_norm_var": 0.8481018158238611, "learning_rate": 0.0001, "loss": 1.4035, "loss/crossentropy": 2.1265523433685303, "loss/hidden": 1.21875, "loss/logits": 0.18416792154312134, "loss/reg": 6.0230733652133495e-05, "step": 307 }, { "epoch": 0.0385, "grad_norm": 2.2592687606811523, "grad_norm_var": 0.11041007633642194, "learning_rate": 0.0001, "loss": 1.271, "loss/crossentropy": 2.66719651222229, "loss/hidden": 1.0859375, "loss/logits": 0.184452086687088, "loss/reg": 6.0217109421500936e-05, "step": 308 }, { "epoch": 0.038625, "grad_norm": 2.2400615215301514, "grad_norm_var": 0.09468951175299385, "learning_rate": 0.0001, "loss": 1.2348, "loss/crossentropy": 2.3710193634033203, "loss/hidden": 1.0625, "loss/logits": 0.1717246174812317, "loss/reg": 6.020214277668856e-05, "step": 309 }, { "epoch": 0.03875, "grad_norm": 2.0783209800720215, "grad_norm_var": 0.09687885973874776, "learning_rate": 0.0001, "loss": 1.2085, "loss/crossentropy": 2.2699692249298096, "loss/hidden": 1.03125, "loss/logits": 0.17665645480155945, "loss/reg": 6.018438944010995e-05, "step": 310 }, { "epoch": 0.038875, "grad_norm": 2.077648162841797, "grad_norm_var": 0.06299334570375853, "learning_rate": 0.0001, "loss": 1.2169, "loss/crossentropy": 2.334127426147461, "loss/hidden": 1.0625, "loss/logits": 0.15378312766551971, "loss/reg": 6.0161146393511444e-05, "step": 311 }, { "epoch": 0.039, "grad_norm": 2.440629482269287, "grad_norm_var": 0.06293910816862744, "learning_rate": 0.0001, "loss": 1.2956, "loss/crossentropy": 2.791874408721924, "loss/hidden": 1.1171875, "loss/logits": 0.1777758002281189, "loss/reg": 6.014638711349107e-05, "step": 312 }, { "epoch": 0.039125, "grad_norm": 2.853940963745117, "grad_norm_var": 0.07069242228717272, "learning_rate": 0.0001, "loss": 1.2688, "loss/crossentropy": 2.5036516189575195, "loss/hidden": 1.0859375, "loss/logits": 0.18226328492164612, "loss/reg": 6.013087840983644e-05, "step": 313 }, { "epoch": 0.03925, "grad_norm": 3.287529230117798, "grad_norm_var": 0.11423125477930943, "learning_rate": 0.0001, "loss": 1.2435, "loss/crossentropy": 2.696265697479248, "loss/hidden": 1.0703125, "loss/logits": 0.17254707217216492, "loss/reg": 6.011854929965921e-05, "step": 314 }, { "epoch": 0.039375, "grad_norm": 3.1080963611602783, "grad_norm_var": 0.1386158794861321, "learning_rate": 0.0001, "loss": 1.473, "loss/crossentropy": 2.1882760524749756, "loss/hidden": 1.25, "loss/logits": 0.2224160134792328, "loss/reg": 6.0103353462181985e-05, "step": 315 }, { "epoch": 0.0395, "grad_norm": 2.7303977012634277, "grad_norm_var": 0.13818442385569654, "learning_rate": 0.0001, "loss": 1.4029, "loss/crossentropy": 2.361660957336426, "loss/hidden": 1.2109375, "loss/logits": 0.19139324128627777, "loss/reg": 6.008424679748714e-05, "step": 316 }, { "epoch": 0.039625, "grad_norm": 1.7651097774505615, "grad_norm_var": 0.16520987140884788, "learning_rate": 0.0001, "loss": 1.0765, "loss/crossentropy": 2.435858964920044, "loss/hidden": 0.953125, "loss/logits": 0.1227254569530487, "loss/reg": 6.007165211485699e-05, "step": 317 }, { "epoch": 0.03975, "grad_norm": 2.128772258758545, "grad_norm_var": 0.17279926669385734, "learning_rate": 0.0001, "loss": 1.1848, "loss/crossentropy": 2.334495782852173, "loss/hidden": 1.0546875, "loss/logits": 0.12953956425189972, "loss/reg": 6.005321120028384e-05, "step": 318 }, { "epoch": 0.039875, "grad_norm": 2.1308538913726807, "grad_norm_var": 0.1742483958439737, "learning_rate": 0.0001, "loss": 1.3191, "loss/crossentropy": 2.3873021602630615, "loss/hidden": 1.125, "loss/logits": 0.19348952174186707, "loss/reg": 6.0041034885216504e-05, "step": 319 }, { "epoch": 0.04, "grad_norm": 2.706742286682129, "grad_norm_var": 0.17935140917835876, "learning_rate": 0.0001, "loss": 1.4123, "loss/crossentropy": 2.5321033000946045, "loss/hidden": 1.203125, "loss/logits": 0.20852993428707123, "loss/reg": 6.002993177389726e-05, "step": 320 }, { "epoch": 0.040125, "grad_norm": 6.118154525756836, "grad_norm_var": 1.0228689502418715, "learning_rate": 0.0001, "loss": 1.7298, "loss/crossentropy": 2.457045316696167, "loss/hidden": 1.515625, "loss/logits": 0.2136228382587433, "loss/reg": 6.001694418955594e-05, "step": 321 }, { "epoch": 0.04025, "grad_norm": 3.091947317123413, "grad_norm_var": 1.0084811477178388, "learning_rate": 0.0001, "loss": 1.6635, "loss/crossentropy": 2.6943020820617676, "loss/hidden": 1.40625, "loss/logits": 0.25662127137184143, "loss/reg": 6.0004946135450155e-05, "step": 322 }, { "epoch": 0.040375, "grad_norm": 2.488391637802124, "grad_norm_var": 1.0122566583255546, "learning_rate": 0.0001, "loss": 1.2065, "loss/crossentropy": 2.646897792816162, "loss/hidden": 1.0546875, "loss/logits": 0.15123483538627625, "loss/reg": 5.9991711168549955e-05, "step": 323 }, { "epoch": 0.0405, "grad_norm": 3.0675456523895264, "grad_norm_var": 1.0035307165437406, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.4176406860351562, "loss/hidden": 1.2421875, "loss/logits": 0.24040505290031433, "loss/reg": 5.9981128288200125e-05, "step": 324 }, { "epoch": 0.040625, "grad_norm": 2.424546957015991, "grad_norm_var": 0.9926314451715664, "learning_rate": 0.0001, "loss": 1.07, "loss/crossentropy": 2.703134059906006, "loss/hidden": 0.94140625, "loss/logits": 0.12803316116333008, "loss/reg": 5.997138941893354e-05, "step": 325 }, { "epoch": 0.04075, "grad_norm": 2.9345507621765137, "grad_norm_var": 0.9582126623175621, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.8940789699554443, "loss/hidden": 1.21875, "loss/logits": 0.20539763569831848, "loss/reg": 5.996019172016531e-05, "step": 326 }, { "epoch": 0.040875, "grad_norm": 3.069572925567627, "grad_norm_var": 0.9195850402401864, "learning_rate": 0.0001, "loss": 1.3896, "loss/crossentropy": 2.4416871070861816, "loss/hidden": 1.1875, "loss/logits": 0.20154833793640137, "loss/reg": 5.9947429690510035e-05, "step": 327 }, { "epoch": 0.041, "grad_norm": 2.323606491088867, "grad_norm_var": 0.9275566292830253, "learning_rate": 0.0001, "loss": 1.2888, "loss/crossentropy": 2.811528444290161, "loss/hidden": 1.1015625, "loss/logits": 0.18662354350090027, "loss/reg": 5.9936231991741806e-05, "step": 328 }, { "epoch": 0.041125, "grad_norm": 3.1679723262786865, "grad_norm_var": 0.9322370885273564, "learning_rate": 0.0001, "loss": 1.5559, "loss/crossentropy": 2.3170981407165527, "loss/hidden": 1.3046875, "loss/logits": 0.2506353557109833, "loss/reg": 5.991987563902512e-05, "step": 329 }, { "epoch": 0.04125, "grad_norm": 2.7683303356170654, "grad_norm_var": 0.9228798875820224, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.51680850982666, "loss/hidden": 1.1171875, "loss/logits": 0.1949077993631363, "loss/reg": 5.990756835672073e-05, "step": 330 }, { "epoch": 0.041375, "grad_norm": 2.4825031757354736, "grad_norm_var": 0.9280253827718864, "learning_rate": 0.0001, "loss": 1.3408, "loss/crossentropy": 2.605055332183838, "loss/hidden": 1.140625, "loss/logits": 0.19955970346927643, "loss/reg": 5.989522469462827e-05, "step": 331 }, { "epoch": 0.0415, "grad_norm": 3.2399041652679443, "grad_norm_var": 0.9369785308922095, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.7269279956817627, "loss/hidden": 1.3515625, "loss/logits": 0.22315430641174316, "loss/reg": 5.988113844068721e-05, "step": 332 }, { "epoch": 0.041625, "grad_norm": 2.8936927318573, "grad_norm_var": 0.8504314928241191, "learning_rate": 0.0001, "loss": 1.3222, "loss/crossentropy": 2.812412738800049, "loss/hidden": 1.140625, "loss/logits": 0.1809367835521698, "loss/reg": 5.9867059462703764e-05, "step": 333 }, { "epoch": 0.04175, "grad_norm": 2.432213068008423, "grad_norm_var": 0.8233723477256942, "learning_rate": 0.0001, "loss": 1.4094, "loss/crossentropy": 2.6377694606781006, "loss/hidden": 1.203125, "loss/logits": 0.20563456416130066, "loss/reg": 5.9853711718460545e-05, "step": 334 }, { "epoch": 0.041875, "grad_norm": 2.422299861907959, "grad_norm_var": 0.7965082638815336, "learning_rate": 0.0001, "loss": 1.2328, "loss/crossentropy": 2.5352189540863037, "loss/hidden": 1.078125, "loss/logits": 0.15405428409576416, "loss/reg": 5.984482049825601e-05, "step": 335 }, { "epoch": 0.042, "grad_norm": 2.703420877456665, "grad_norm_var": 0.7966286375145801, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.525949716567993, "loss/hidden": 1.1015625, "loss/logits": 0.1959662139415741, "loss/reg": 5.983649680274539e-05, "step": 336 }, { "epoch": 0.042125, "grad_norm": 3.625760078430176, "grad_norm_var": 0.14094485019601447, "learning_rate": 0.0001, "loss": 1.6517, "loss/crossentropy": 1.9824917316436768, "loss/hidden": 1.3828125, "loss/logits": 0.2682979702949524, "loss/reg": 5.9825455537065864e-05, "step": 337 }, { "epoch": 0.04225, "grad_norm": 2.2066762447357178, "grad_norm_var": 0.1579467344221198, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.5151102542877197, "loss/hidden": 1.0078125, "loss/logits": 0.16843904554843903, "loss/reg": 5.981199865345843e-05, "step": 338 }, { "epoch": 0.042375, "grad_norm": 2.961968421936035, "grad_norm_var": 0.15445451920782696, "learning_rate": 0.0001, "loss": 1.5446, "loss/crossentropy": 2.397102117538452, "loss/hidden": 1.3046875, "loss/logits": 0.23934724926948547, "loss/reg": 5.979971319902688e-05, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.4696779251098633, "grad_norm_var": 0.15509145555751214, "learning_rate": 0.0001, "loss": 1.2907, "loss/crossentropy": 2.518648624420166, "loss/hidden": 1.1171875, "loss/logits": 0.17289261519908905, "loss/reg": 5.979237175779417e-05, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.2886741161346436, "grad_norm_var": 0.16228478040589658, "learning_rate": 0.0001, "loss": 1.2915, "loss/crossentropy": 2.4755570888519287, "loss/hidden": 1.109375, "loss/logits": 0.18152545392513275, "loss/reg": 5.978640547255054e-05, "step": 341 }, { "epoch": 0.04275, "grad_norm": 2.4154622554779053, "grad_norm_var": 0.16631279956205466, "learning_rate": 0.0001, "loss": 1.1361, "loss/crossentropy": 2.620903730392456, "loss/hidden": 0.9921875, "loss/logits": 0.1432739496231079, "loss/reg": 5.977362161502242e-05, "step": 342 }, { "epoch": 0.042875, "grad_norm": 3.9107778072357178, "grad_norm_var": 0.25008606934497735, "learning_rate": 0.0001, "loss": 1.6206, "loss/crossentropy": 3.3820858001708984, "loss/hidden": 1.40625, "loss/logits": 0.21375682950019836, "loss/reg": 5.976331885904074e-05, "step": 343 }, { "epoch": 0.043, "grad_norm": 2.2201833724975586, "grad_norm_var": 0.25690416036597197, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.467216730117798, "loss/hidden": 1.0625, "loss/logits": 0.18146604299545288, "loss/reg": 5.975304884486832e-05, "step": 344 }, { "epoch": 0.043125, "grad_norm": 2.1915907859802246, "grad_norm_var": 0.26377805805320803, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.3638522624969482, "loss/hidden": 1.203125, "loss/logits": 0.22996577620506287, "loss/reg": 5.974585292278789e-05, "step": 345 }, { "epoch": 0.04325, "grad_norm": 2.2508416175842285, "grad_norm_var": 0.27594342104869135, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.5332260131835938, "loss/hidden": 1.0234375, "loss/logits": 0.15640094876289368, "loss/reg": 5.973771112621762e-05, "step": 346 }, { "epoch": 0.043375, "grad_norm": 2.0090150833129883, "grad_norm_var": 0.30177518099136, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.511950731277466, "loss/hidden": 1.125, "loss/logits": 0.17384442687034607, "loss/reg": 5.9728798078140244e-05, "step": 347 }, { "epoch": 0.0435, "grad_norm": 2.7306134700775146, "grad_norm_var": 0.277258656834267, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.6389760971069336, "loss/hidden": 1.1875, "loss/logits": 0.2318291962146759, "loss/reg": 5.9719615819631144e-05, "step": 348 }, { "epoch": 0.043625, "grad_norm": 2.270148515701294, "grad_norm_var": 0.27783213891549774, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.312831163406372, "loss/hidden": 1.2890625, "loss/logits": 0.2587454915046692, "loss/reg": 5.970869824523106e-05, "step": 349 }, { "epoch": 0.04375, "grad_norm": 2.0988070964813232, "grad_norm_var": 0.2908751450016543, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 2.378908634185791, "loss/hidden": 1.109375, "loss/logits": 0.1581004559993744, "loss/reg": 5.970033089397475e-05, "step": 350 }, { "epoch": 0.043875, "grad_norm": 2.045546770095825, "grad_norm_var": 0.30608582246859417, "learning_rate": 0.0001, "loss": 1.1063, "loss/crossentropy": 2.4011952877044678, "loss/hidden": 0.96875, "loss/logits": 0.13691341876983643, "loss/reg": 5.969877020106651e-05, "step": 351 }, { "epoch": 0.044, "grad_norm": 2.9582409858703613, "grad_norm_var": 0.316207957574548, "learning_rate": 0.0001, "loss": 1.2072, "loss/crossentropy": 2.643101215362549, "loss/hidden": 1.046875, "loss/logits": 0.15975871682167053, "loss/reg": 5.9694295487133786e-05, "step": 352 }, { "epoch": 0.044125, "grad_norm": 2.125020742416382, "grad_norm_var": 0.23988746234485703, "learning_rate": 0.0001, "loss": 1.2268, "loss/crossentropy": 2.5923550128936768, "loss/hidden": 1.0390625, "loss/logits": 0.18714120984077454, "loss/reg": 5.968381810816936e-05, "step": 353 }, { "epoch": 0.04425, "grad_norm": 2.2348685264587402, "grad_norm_var": 0.2390334750954897, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 2.549100637435913, "loss/hidden": 1.1953125, "loss/logits": 0.198894202709198, "loss/reg": 5.9669990150723606e-05, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.6807351112365723, "grad_norm_var": 0.2247355561703434, "learning_rate": 0.0001, "loss": 1.5721, "loss/crossentropy": 2.2256884574890137, "loss/hidden": 1.3046875, "loss/logits": 0.26683151721954346, "loss/reg": 5.966486787656322e-05, "step": 355 }, { "epoch": 0.0445, "grad_norm": 3.1524059772491455, "grad_norm_var": 0.2573648537337417, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.4026124477386475, "loss/hidden": 1.3203125, "loss/logits": 0.22484509646892548, "loss/reg": 5.965128730167635e-05, "step": 356 }, { "epoch": 0.044625, "grad_norm": 3.806107759475708, "grad_norm_var": 0.3637951956534662, "learning_rate": 0.0001, "loss": 1.2257, "loss/crossentropy": 2.534790277481079, "loss/hidden": 1.1015625, "loss/logits": 0.12353114783763885, "loss/reg": 5.9637932281475514e-05, "step": 357 }, { "epoch": 0.04475, "grad_norm": 2.6499619483947754, "grad_norm_var": 0.36243857175732047, "learning_rate": 0.0001, "loss": 1.2577, "loss/crossentropy": 2.786536931991577, "loss/hidden": 1.078125, "loss/logits": 0.17896610498428345, "loss/reg": 5.962959403404966e-05, "step": 358 }, { "epoch": 0.044875, "grad_norm": 2.750371217727661, "grad_norm_var": 0.24122897908522703, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.5112698078155518, "loss/hidden": 1.125, "loss/logits": 0.19569161534309387, "loss/reg": 5.961711940472014e-05, "step": 359 }, { "epoch": 0.045, "grad_norm": 2.4145219326019287, "grad_norm_var": 0.23605635737508593, "learning_rate": 0.0001, "loss": 1.4067, "loss/crossentropy": 2.4327914714813232, "loss/hidden": 1.171875, "loss/logits": 0.23425719141960144, "loss/reg": 5.960506314295344e-05, "step": 360 }, { "epoch": 0.045125, "grad_norm": 2.7820589542388916, "grad_norm_var": 0.2317516785903725, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.6201419830322266, "loss/hidden": 1.3359375, "loss/logits": 0.2468121349811554, "loss/reg": 5.959635382168926e-05, "step": 361 }, { "epoch": 0.04525, "grad_norm": 3.0179331302642822, "grad_norm_var": 0.23691283979908515, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.728665351867676, "loss/hidden": 1.1953125, "loss/logits": 0.19617268443107605, "loss/reg": 5.958346446277574e-05, "step": 362 }, { "epoch": 0.045375, "grad_norm": 2.577760934829712, "grad_norm_var": 0.21171492452191767, "learning_rate": 0.0001, "loss": 1.3343, "loss/crossentropy": 2.4396915435791016, "loss/hidden": 1.140625, "loss/logits": 0.19306717813014984, "loss/reg": 5.957194298389368e-05, "step": 363 }, { "epoch": 0.0455, "grad_norm": 2.2478973865509033, "grad_norm_var": 0.22066793284785244, "learning_rate": 0.0001, "loss": 1.2107, "loss/crossentropy": 2.5703125, "loss/hidden": 1.046875, "loss/logits": 0.16320618987083435, "loss/reg": 5.955886445008218e-05, "step": 364 }, { "epoch": 0.045625, "grad_norm": 2.8303184509277344, "grad_norm_var": 0.21465200545435412, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.5793418884277344, "loss/hidden": 1.1171875, "loss/logits": 0.20061752200126648, "loss/reg": 5.954650623607449e-05, "step": 365 }, { "epoch": 0.04575, "grad_norm": 2.3407225608825684, "grad_norm_var": 0.20058607793752117, "learning_rate": 0.0001, "loss": 1.2154, "loss/crossentropy": 2.5118396282196045, "loss/hidden": 1.0546875, "loss/logits": 0.16011780500411987, "loss/reg": 5.9531517763389274e-05, "step": 366 }, { "epoch": 0.045875, "grad_norm": 2.9164462089538574, "grad_norm_var": 0.17624459628143327, "learning_rate": 0.0001, "loss": 2.3079, "loss/crossentropy": 2.530949831008911, "loss/hidden": 1.7890625, "loss/logits": 0.5182523727416992, "loss/reg": 5.9519883507164195e-05, "step": 367 }, { "epoch": 0.046, "grad_norm": 2.6031134128570557, "grad_norm_var": 0.17274354994838556, "learning_rate": 0.0001, "loss": 1.3529, "loss/crossentropy": 2.5211331844329834, "loss/hidden": 1.140625, "loss/logits": 0.21172133088111877, "loss/reg": 5.950441482127644e-05, "step": 368 }, { "epoch": 0.046125, "grad_norm": 2.2432241439819336, "grad_norm_var": 0.16462358021652007, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.469212055206299, "loss/hidden": 1.0625, "loss/logits": 0.18018998205661774, "loss/reg": 5.9490499552339315e-05, "step": 369 }, { "epoch": 0.04625, "grad_norm": 3.287365674972534, "grad_norm_var": 0.16815977224474163, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.7330899238586426, "loss/hidden": 1.140625, "loss/logits": 0.18986304104328156, "loss/reg": 5.9471924032550305e-05, "step": 370 }, { "epoch": 0.046375, "grad_norm": 2.6555063724517822, "grad_norm_var": 0.16849581874392333, "learning_rate": 0.0001, "loss": 1.3069, "loss/crossentropy": 2.4908649921417236, "loss/hidden": 1.1171875, "loss/logits": 0.1890988051891327, "loss/reg": 5.9457710449351e-05, "step": 371 }, { "epoch": 0.0465, "grad_norm": 2.2832915782928467, "grad_norm_var": 0.1710711381355336, "learning_rate": 0.0001, "loss": 1.251, "loss/crossentropy": 2.6485414505004883, "loss/hidden": 1.0859375, "loss/logits": 0.16444087028503418, "loss/reg": 5.9437123127281666e-05, "step": 372 }, { "epoch": 0.046625, "grad_norm": 1.9312299489974976, "grad_norm_var": 0.11748808484953574, "learning_rate": 0.0001, "loss": 1.3104, "loss/crossentropy": 2.4345285892486572, "loss/hidden": 1.125, "loss/logits": 0.1848057061433792, "loss/reg": 5.941649214946665e-05, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.2687668800354004, "grad_norm_var": 0.12381368567199799, "learning_rate": 0.0001, "loss": 1.2697, "loss/crossentropy": 2.514896869659424, "loss/hidden": 1.0859375, "loss/logits": 0.18321493268013, "loss/reg": 5.939120819675736e-05, "step": 374 }, { "epoch": 0.046875, "grad_norm": 2.1616384983062744, "grad_norm_var": 0.131467626574521, "learning_rate": 0.0001, "loss": 1.2405, "loss/crossentropy": 2.698112964630127, "loss/hidden": 1.0625, "loss/logits": 0.1773754358291626, "loss/reg": 5.936667002970353e-05, "step": 375 }, { "epoch": 0.047, "grad_norm": 2.6922011375427246, "grad_norm_var": 0.13182201209023336, "learning_rate": 0.0001, "loss": 1.3426, "loss/crossentropy": 2.538865327835083, "loss/hidden": 1.15625, "loss/logits": 0.1857489049434662, "loss/reg": 5.9345431509427726e-05, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.2630982398986816, "grad_norm_var": 0.13276797957838743, "learning_rate": 0.0001, "loss": 1.2869, "loss/crossentropy": 2.4644358158111572, "loss/hidden": 1.109375, "loss/logits": 0.17694343626499176, "loss/reg": 5.933275315328501e-05, "step": 377 }, { "epoch": 0.04725, "grad_norm": 2.479646682739258, "grad_norm_var": 0.11514238570119009, "learning_rate": 0.0001, "loss": 1.1618, "loss/crossentropy": 2.5582141876220703, "loss/hidden": 1.015625, "loss/logits": 0.14554372429847717, "loss/reg": 5.931046689511277e-05, "step": 378 }, { "epoch": 0.047375, "grad_norm": 2.466947317123413, "grad_norm_var": 0.114559834161389, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.4128925800323486, "loss/hidden": 1.2421875, "loss/logits": 0.23724211752414703, "loss/reg": 5.929026156081818e-05, "step": 379 }, { "epoch": 0.0475, "grad_norm": 2.538424015045166, "grad_norm_var": 0.11086504579424972, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.0768887996673584, "loss/hidden": 1.234375, "loss/logits": 0.17867109179496765, "loss/reg": 5.92764736211393e-05, "step": 380 }, { "epoch": 0.047625, "grad_norm": 2.654524564743042, "grad_norm_var": 0.1049983644074643, "learning_rate": 0.0001, "loss": 1.3221, "loss/crossentropy": 2.1216413974761963, "loss/hidden": 1.15625, "loss/logits": 0.16521546244621277, "loss/reg": 5.926107769482769e-05, "step": 381 }, { "epoch": 0.04775, "grad_norm": 2.237818717956543, "grad_norm_var": 0.10766217194697697, "learning_rate": 0.0001, "loss": 1.2236, "loss/crossentropy": 2.6475207805633545, "loss/hidden": 1.0546875, "loss/logits": 0.16833502054214478, "loss/reg": 5.924178913119249e-05, "step": 382 }, { "epoch": 0.047875, "grad_norm": 2.7116799354553223, "grad_norm_var": 0.09837235459497572, "learning_rate": 0.0001, "loss": 1.3102, "loss/crossentropy": 2.6615209579467773, "loss/hidden": 1.1171875, "loss/logits": 0.1924624741077423, "loss/reg": 5.921960837440565e-05, "step": 383 }, { "epoch": 0.048, "grad_norm": 2.5439391136169434, "grad_norm_var": 0.09752047633307553, "learning_rate": 0.0001, "loss": 1.3258, "loss/crossentropy": 2.10198974609375, "loss/hidden": 1.15625, "loss/logits": 0.1689702719449997, "loss/reg": 5.919525210629217e-05, "step": 384 }, { "epoch": 0.048125, "grad_norm": 2.617921829223633, "grad_norm_var": 0.09528014676361156, "learning_rate": 0.0001, "loss": 1.61, "loss/crossentropy": 2.445833206176758, "loss/hidden": 1.328125, "loss/logits": 0.28133296966552734, "loss/reg": 5.917950693401508e-05, "step": 385 }, { "epoch": 0.04825, "grad_norm": 2.514899730682373, "grad_norm_var": 0.05015297139615639, "learning_rate": 0.0001, "loss": 1.1964, "loss/crossentropy": 2.4887778759002686, "loss/hidden": 1.0390625, "loss/logits": 0.1567072868347168, "loss/reg": 5.916162990615703e-05, "step": 386 }, { "epoch": 0.048375, "grad_norm": 2.1075565814971924, "grad_norm_var": 0.053089324895933446, "learning_rate": 0.0001, "loss": 1.0537, "loss/crossentropy": 2.4045815467834473, "loss/hidden": 0.921875, "loss/logits": 0.1312153935432434, "loss/reg": 5.914089342695661e-05, "step": 387 }, { "epoch": 0.0485, "grad_norm": 2.475404739379883, "grad_norm_var": 0.05228874002812057, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 2.591153383255005, "loss/hidden": 1.109375, "loss/logits": 0.19037862122058868, "loss/reg": 5.9116682677995414e-05, "step": 388 }, { "epoch": 0.048625, "grad_norm": 4.638079643249512, "grad_norm_var": 0.33504973194641535, "learning_rate": 0.0001, "loss": 1.7407, "loss/crossentropy": 2.992236852645874, "loss/hidden": 1.4609375, "loss/logits": 0.2792096734046936, "loss/reg": 5.9097284974996e-05, "step": 389 }, { "epoch": 0.04875, "grad_norm": 2.4662392139434814, "grad_norm_var": 0.32913998556907487, "learning_rate": 0.0001, "loss": 1.1454, "loss/crossentropy": 2.9239540100097656, "loss/hidden": 0.9921875, "loss/logits": 0.15260137617588043, "loss/reg": 5.907983722863719e-05, "step": 390 }, { "epoch": 0.048875, "grad_norm": 2.439119338989258, "grad_norm_var": 0.31780327994806234, "learning_rate": 0.0001, "loss": 1.3638, "loss/crossentropy": 2.450254440307617, "loss/hidden": 1.1640625, "loss/logits": 0.19915927946567535, "loss/reg": 5.9063841035822406e-05, "step": 391 }, { "epoch": 0.049, "grad_norm": 2.3475067615509033, "grad_norm_var": 0.3217026075593497, "learning_rate": 0.0001, "loss": 1.533, "loss/crossentropy": 2.617830753326416, "loss/hidden": 1.265625, "loss/logits": 0.26678475737571716, "loss/reg": 5.90429590374697e-05, "step": 392 }, { "epoch": 0.049125, "grad_norm": 4.364901065826416, "grad_norm_var": 0.5050899240629005, "learning_rate": 0.0001, "loss": 1.4632, "loss/crossentropy": 2.4607560634613037, "loss/hidden": 1.2421875, "loss/logits": 0.22039487957954407, "loss/reg": 5.902666089241393e-05, "step": 393 }, { "epoch": 0.04925, "grad_norm": 2.338758707046509, "grad_norm_var": 0.5109449021123245, "learning_rate": 0.0001, "loss": 1.2995, "loss/crossentropy": 2.6618576049804688, "loss/hidden": 1.1171875, "loss/logits": 0.1817541867494583, "loss/reg": 5.9010566474171355e-05, "step": 394 }, { "epoch": 0.049375, "grad_norm": 3.5642833709716797, "grad_norm_var": 0.549694181009107, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.300379753112793, "loss/hidden": 1.1171875, "loss/logits": 0.1974020004272461, "loss/reg": 5.8987676311517134e-05, "step": 395 }, { "epoch": 0.0495, "grad_norm": 2.1328978538513184, "grad_norm_var": 0.573308372527261, "learning_rate": 0.0001, "loss": 1.244, "loss/crossentropy": 2.4386301040649414, "loss/hidden": 1.0625, "loss/logits": 0.18090221285820007, "loss/reg": 5.8964946219930425e-05, "step": 396 }, { "epoch": 0.049625, "grad_norm": 3.0894107818603516, "grad_norm_var": 0.5790289690992334, "learning_rate": 0.0001, "loss": 1.5661, "loss/crossentropy": 2.365107297897339, "loss/hidden": 1.3671875, "loss/logits": 0.1983477920293808, "loss/reg": 5.8950212405761704e-05, "step": 397 }, { "epoch": 0.04975, "grad_norm": 3.194427967071533, "grad_norm_var": 0.566188494588774, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.384216547012329, "loss/hidden": 1.21875, "loss/logits": 0.20751546323299408, "loss/reg": 5.8928319049300626e-05, "step": 398 }, { "epoch": 0.049875, "grad_norm": 2.5108933448791504, "grad_norm_var": 0.5723226037333423, "learning_rate": 0.0001, "loss": 1.2127, "loss/crossentropy": 2.5466771125793457, "loss/hidden": 1.0546875, "loss/logits": 0.1574660688638687, "loss/reg": 5.891324326512404e-05, "step": 399 }, { "epoch": 0.05, "grad_norm": 2.9769773483276367, "grad_norm_var": 0.5672869916808385, "learning_rate": 0.0001, "loss": 1.3045, "loss/crossentropy": 2.7223000526428223, "loss/hidden": 1.1171875, "loss/logits": 0.18677057325839996, "loss/reg": 5.889027670491487e-05, "step": 400 }, { "epoch": 0.050125, "grad_norm": 3.31915283203125, "grad_norm_var": 0.5752734489563172, "learning_rate": 0.0001, "loss": 1.2639, "loss/crossentropy": 2.4886364936828613, "loss/hidden": 1.078125, "loss/logits": 0.1851940155029297, "loss/reg": 5.887265797355212e-05, "step": 401 }, { "epoch": 0.05025, "grad_norm": 1.8946937322616577, "grad_norm_var": 0.6315760522485537, "learning_rate": 0.0001, "loss": 1.2326, "loss/crossentropy": 2.414213180541992, "loss/hidden": 1.0703125, "loss/logits": 0.16165336966514587, "loss/reg": 5.885552673134953e-05, "step": 402 }, { "epoch": 0.050375, "grad_norm": 2.5370404720306396, "grad_norm_var": 0.5996572790739425, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.3421835899353027, "loss/hidden": 1.28125, "loss/logits": 0.22602099180221558, "loss/reg": 5.88419679843355e-05, "step": 403 }, { "epoch": 0.0505, "grad_norm": 2.4215445518493652, "grad_norm_var": 0.6028382899137373, "learning_rate": 0.0001, "loss": 1.4975, "loss/crossentropy": 2.7361152172088623, "loss/hidden": 1.2421875, "loss/logits": 0.25468122959136963, "loss/reg": 5.88247858104296e-05, "step": 404 }, { "epoch": 0.050625, "grad_norm": 2.049978733062744, "grad_norm_var": 0.4181645547932513, "learning_rate": 0.0001, "loss": 1.1088, "loss/crossentropy": 2.350353717803955, "loss/hidden": 0.953125, "loss/logits": 0.15509989857673645, "loss/reg": 5.880888784304261e-05, "step": 405 }, { "epoch": 0.05075, "grad_norm": 2.7967936992645264, "grad_norm_var": 0.41345734870287976, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.5875766277313232, "loss/hidden": 1.171875, "loss/logits": 0.21445012092590332, "loss/reg": 5.8793633797904477e-05, "step": 406 }, { "epoch": 0.050875, "grad_norm": 2.169900894165039, "grad_norm_var": 0.429098064205416, "learning_rate": 0.0001, "loss": 1.0776, "loss/crossentropy": 2.398125410079956, "loss/hidden": 0.93359375, "loss/logits": 0.14346018433570862, "loss/reg": 5.877741932636127e-05, "step": 407 }, { "epoch": 0.051, "grad_norm": 2.5045695304870605, "grad_norm_var": 0.4225916301522199, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.1697590351104736, "loss/hidden": 1.328125, "loss/logits": 0.20677754282951355, "loss/reg": 5.876670911675319e-05, "step": 408 }, { "epoch": 0.051125, "grad_norm": 18.23008918762207, "grad_norm_var": 15.43871781968465, "learning_rate": 0.0001, "loss": 1.4882, "loss/crossentropy": 2.602886438369751, "loss/hidden": 1.3046875, "loss/logits": 0.18289120495319366, "loss/reg": 5.874884300283156e-05, "step": 409 }, { "epoch": 0.05125, "grad_norm": 2.8436660766601562, "grad_norm_var": 15.369190103974788, "learning_rate": 0.0001, "loss": 1.3294, "loss/crossentropy": 2.4684174060821533, "loss/hidden": 1.140625, "loss/logits": 0.18818634748458862, "loss/reg": 5.873553891433403e-05, "step": 410 }, { "epoch": 0.051375, "grad_norm": 2.2729334831237793, "grad_norm_var": 15.486411427985377, "learning_rate": 0.0001, "loss": 1.2331, "loss/crossentropy": 2.550140857696533, "loss/hidden": 1.0390625, "loss/logits": 0.19348369538784027, "loss/reg": 5.8720732340589166e-05, "step": 411 }, { "epoch": 0.0515, "grad_norm": 2.5612359046936035, "grad_norm_var": 15.416427881560285, "learning_rate": 0.0001, "loss": 1.3333, "loss/crossentropy": 2.4774818420410156, "loss/hidden": 1.125, "loss/logits": 0.2077203392982483, "loss/reg": 5.8710702433018014e-05, "step": 412 }, { "epoch": 0.051625, "grad_norm": 4.02579927444458, "grad_norm_var": 15.409250289477422, "learning_rate": 0.0001, "loss": 1.507, "loss/crossentropy": 2.555722713470459, "loss/hidden": 1.3046875, "loss/logits": 0.20171231031417847, "loss/reg": 5.870195309398696e-05, "step": 413 }, { "epoch": 0.05175, "grad_norm": 2.443574905395508, "grad_norm_var": 15.489530544756628, "learning_rate": 0.0001, "loss": 1.2774, "loss/crossentropy": 2.706422805786133, "loss/hidden": 1.09375, "loss/logits": 0.1831112802028656, "loss/reg": 5.8690613514045253e-05, "step": 414 }, { "epoch": 0.051875, "grad_norm": 2.079418897628784, "grad_norm_var": 15.563674426313279, "learning_rate": 0.0001, "loss": 1.1798, "loss/crossentropy": 2.6763839721679688, "loss/hidden": 1.03125, "loss/logits": 0.14800235629081726, "loss/reg": 5.8675475884228945e-05, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.7786471843719482, "grad_norm_var": 15.581826938638233, "learning_rate": 0.0001, "loss": 1.2465, "loss/crossentropy": 2.6709306240081787, "loss/hidden": 1.078125, "loss/logits": 0.1678304374217987, "loss/reg": 5.866462379344739e-05, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.770376443862915, "grad_norm_var": 15.618130403520784, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.646826982498169, "loss/hidden": 1.1328125, "loss/logits": 0.1777157187461853, "loss/reg": 5.865520142833702e-05, "step": 417 }, { "epoch": 0.05225, "grad_norm": 2.092414617538452, "grad_norm_var": 15.57762685735369, "learning_rate": 0.0001, "loss": 1.3353, "loss/crossentropy": 2.62361741065979, "loss/hidden": 1.15625, "loss/logits": 0.17848479747772217, "loss/reg": 5.8638761402107775e-05, "step": 418 }, { "epoch": 0.052375, "grad_norm": 2.05226731300354, "grad_norm_var": 15.656891853986265, "learning_rate": 0.0001, "loss": 1.14, "loss/crossentropy": 2.697723865509033, "loss/hidden": 0.9921875, "loss/logits": 0.14726917445659637, "loss/reg": 5.862316902494058e-05, "step": 419 }, { "epoch": 0.0525, "grad_norm": 2.6924796104431152, "grad_norm_var": 15.622310414474152, "learning_rate": 0.0001, "loss": 1.404, "loss/crossentropy": 2.601827383041382, "loss/hidden": 1.2109375, "loss/logits": 0.19246245920658112, "loss/reg": 5.860950841451995e-05, "step": 420 }, { "epoch": 0.052625, "grad_norm": 5.301983833312988, "grad_norm_var": 15.644682914862404, "learning_rate": 0.0001, "loss": 1.4862, "loss/crossentropy": 2.6217854022979736, "loss/hidden": 1.296875, "loss/logits": 0.18871337175369263, "loss/reg": 5.8600846386980265e-05, "step": 421 }, { "epoch": 0.05275, "grad_norm": 2.114091634750366, "grad_norm_var": 15.758396712898662, "learning_rate": 0.0001, "loss": 1.2033, "loss/crossentropy": 2.5663623809814453, "loss/hidden": 1.0390625, "loss/logits": 0.16362521052360535, "loss/reg": 5.8592915593180805e-05, "step": 422 }, { "epoch": 0.052875, "grad_norm": 2.757091999053955, "grad_norm_var": 15.661455859551703, "learning_rate": 0.0001, "loss": 1.1223, "loss/crossentropy": 2.4681971073150635, "loss/hidden": 0.97265625, "loss/logits": 0.14905983209609985, "loss/reg": 5.858425720361993e-05, "step": 423 }, { "epoch": 0.053, "grad_norm": 2.4524407386779785, "grad_norm_var": 15.670073831964206, "learning_rate": 0.0001, "loss": 1.2938, "loss/crossentropy": 2.4758145809173584, "loss/hidden": 1.1015625, "loss/logits": 0.19164547324180603, "loss/reg": 5.857350697624497e-05, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.3052892684936523, "grad_norm_var": 0.7038252417895506, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.597487211227417, "loss/hidden": 1.0703125, "loss/logits": 0.18559187650680542, "loss/reg": 5.855830750078894e-05, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.7276995182037354, "grad_norm_var": 0.7027765205874381, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.6818253993988037, "loss/hidden": 1.21875, "loss/logits": 0.1948131024837494, "loss/reg": 5.854442133568227e-05, "step": 426 }, { "epoch": 0.053375, "grad_norm": 1.725293517112732, "grad_norm_var": 0.7537440425638384, "learning_rate": 0.0001, "loss": 1.1664, "loss/crossentropy": 2.4244258403778076, "loss/hidden": 1.015625, "loss/logits": 0.1502000093460083, "loss/reg": 5.85384841542691e-05, "step": 427 }, { "epoch": 0.0535, "grad_norm": 2.6642932891845703, "grad_norm_var": 0.7527758186064119, "learning_rate": 0.0001, "loss": 1.5211, "loss/crossentropy": 2.1209182739257812, "loss/hidden": 1.328125, "loss/logits": 0.192403644323349, "loss/reg": 5.852692629559897e-05, "step": 428 }, { "epoch": 0.053625, "grad_norm": 2.7787868976593018, "grad_norm_var": 0.6272740663233074, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.3020565509796143, "loss/hidden": 1.125, "loss/logits": 0.179016575217247, "loss/reg": 5.851646346854977e-05, "step": 429 }, { "epoch": 0.05375, "grad_norm": 2.891101360321045, "grad_norm_var": 0.6299498912530666, "learning_rate": 0.0001, "loss": 1.4198, "loss/crossentropy": 2.33249568939209, "loss/hidden": 1.203125, "loss/logits": 0.21604114770889282, "loss/reg": 5.850956222275272e-05, "step": 430 }, { "epoch": 0.053875, "grad_norm": 2.7940289974212646, "grad_norm_var": 0.608789107013446, "learning_rate": 0.0001, "loss": 1.1825, "loss/crossentropy": 2.6553549766540527, "loss/hidden": 1.03125, "loss/logits": 0.15064392983913422, "loss/reg": 5.8500536397332326e-05, "step": 431 }, { "epoch": 0.054, "grad_norm": 25.06597328186035, "grad_norm_var": 31.943843646690855, "learning_rate": 0.0001, "loss": 2.4055, "loss/crossentropy": 2.7126245498657227, "loss/hidden": 2.03125, "loss/logits": 0.3736712336540222, "loss/reg": 5.849341687280685e-05, "step": 432 }, { "epoch": 0.054125, "grad_norm": 2.4612748622894287, "grad_norm_var": 32.003546233579016, "learning_rate": 0.0001, "loss": 1.4832, "loss/crossentropy": 2.6244633197784424, "loss/hidden": 1.25, "loss/logits": 0.23266229033470154, "loss/reg": 5.847978172823787e-05, "step": 433 }, { "epoch": 0.05425, "grad_norm": 2.413149356842041, "grad_norm_var": 31.926055741483236, "learning_rate": 0.0001, "loss": 1.405, "loss/crossentropy": 2.513383626937866, "loss/hidden": 1.1953125, "loss/logits": 0.2091376930475235, "loss/reg": 5.847239663125947e-05, "step": 434 }, { "epoch": 0.054375, "grad_norm": 2.1266605854034424, "grad_norm_var": 31.906339652731415, "learning_rate": 0.0001, "loss": 1.2307, "loss/crossentropy": 2.645113706588745, "loss/hidden": 1.0546875, "loss/logits": 0.17538747191429138, "loss/reg": 5.8466725022299215e-05, "step": 435 }, { "epoch": 0.0545, "grad_norm": 2.693485975265503, "grad_norm_var": 31.906153605922054, "learning_rate": 0.0001, "loss": 1.3491, "loss/crossentropy": 2.5616350173950195, "loss/hidden": 1.171875, "loss/logits": 0.1766662299633026, "loss/reg": 5.845691339345649e-05, "step": 436 }, { "epoch": 0.054625, "grad_norm": 3.594322681427002, "grad_norm_var": 31.81007436255887, "learning_rate": 0.0001, "loss": 1.4456, "loss/crossentropy": 2.320868492126465, "loss/hidden": 1.171875, "loss/logits": 0.2730950713157654, "loss/reg": 5.845166742801666e-05, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.725066900253296, "grad_norm_var": 31.681987454427826, "learning_rate": 0.0001, "loss": 1.4368, "loss/crossentropy": 2.4526007175445557, "loss/hidden": 1.21875, "loss/logits": 0.21745863556861877, "loss/reg": 5.844476982019842e-05, "step": 438 }, { "epoch": 0.054875, "grad_norm": 2.615208625793457, "grad_norm_var": 31.706966746538818, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.5873489379882812, "loss/hidden": 1.109375, "loss/logits": 0.18027284741401672, "loss/reg": 5.843998587806709e-05, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.679504632949829, "grad_norm_var": 31.66327199965654, "learning_rate": 0.0001, "loss": 1.4142, "loss/crossentropy": 2.171384811401367, "loss/hidden": 1.21875, "loss/logits": 0.1948787271976471, "loss/reg": 5.8425270253792405e-05, "step": 440 }, { "epoch": 0.055125, "grad_norm": 2.781118869781494, "grad_norm_var": 31.56886824166385, "learning_rate": 0.0001, "loss": 1.2261, "loss/crossentropy": 2.616610050201416, "loss/hidden": 1.0625, "loss/logits": 0.16300562024116516, "loss/reg": 5.841004167450592e-05, "step": 441 }, { "epoch": 0.05525, "grad_norm": 2.8343710899353027, "grad_norm_var": 31.550828531904, "learning_rate": 0.0001, "loss": 1.6654, "loss/crossentropy": 2.254971504211426, "loss/hidden": 1.390625, "loss/logits": 0.27416497468948364, "loss/reg": 5.840086305397563e-05, "step": 442 }, { "epoch": 0.055375, "grad_norm": 2.943516254425049, "grad_norm_var": 31.26553828771242, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.607365131378174, "loss/hidden": 1.140625, "loss/logits": 0.16250211000442505, "loss/reg": 5.8392772189108655e-05, "step": 443 }, { "epoch": 0.0555, "grad_norm": 4.3494696617126465, "grad_norm_var": 31.11395178262311, "learning_rate": 0.0001, "loss": 1.4874, "loss/crossentropy": 2.803809642791748, "loss/hidden": 1.265625, "loss/logits": 0.22114460170269012, "loss/reg": 5.8383415307616815e-05, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.3149962425231934, "grad_norm_var": 31.21739595793184, "learning_rate": 0.0001, "loss": 1.1723, "loss/crossentropy": 2.7661781311035156, "loss/hidden": 1.015625, "loss/logits": 0.1560768485069275, "loss/reg": 5.8376208471599966e-05, "step": 445 }, { "epoch": 0.05575, "grad_norm": 2.5312862396240234, "grad_norm_var": 31.288532129977195, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.3608808517456055, "loss/hidden": 1.234375, "loss/logits": 0.22338923811912537, "loss/reg": 5.83621695113834e-05, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.0245697498321533, "grad_norm_var": 31.468007952235922, "learning_rate": 0.0001, "loss": 1.2537, "loss/crossentropy": 2.6646907329559326, "loss/hidden": 1.0859375, "loss/logits": 0.1671399027109146, "loss/reg": 5.835363481310196e-05, "step": 447 }, { "epoch": 0.056, "grad_norm": 4.180586338043213, "grad_norm_var": 0.4425575902395887, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.478865623474121, "loss/hidden": 1.1796875, "loss/logits": 0.2484455555677414, "loss/reg": 5.833926479681395e-05, "step": 448 }, { "epoch": 0.056125, "grad_norm": 2.2291383743286133, "grad_norm_var": 0.4573160813014281, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.244389295578003, "loss/hidden": 1.15625, "loss/logits": 0.14287710189819336, "loss/reg": 5.833054456161335e-05, "step": 449 }, { "epoch": 0.05625, "grad_norm": 2.204925060272217, "grad_norm_var": 0.47117643459253195, "learning_rate": 0.0001, "loss": 1.2876, "loss/crossentropy": 2.3469107151031494, "loss/hidden": 1.1015625, "loss/logits": 0.1854255050420761, "loss/reg": 5.832717943121679e-05, "step": 450 }, { "epoch": 0.056375, "grad_norm": 2.5266880989074707, "grad_norm_var": 0.4451698073358396, "learning_rate": 0.0001, "loss": 1.4392, "loss/crossentropy": 2.440885305404663, "loss/hidden": 1.2109375, "loss/logits": 0.22769977152347565, "loss/reg": 5.8323836128693074e-05, "step": 451 }, { "epoch": 0.0565, "grad_norm": 2.410515785217285, "grad_norm_var": 0.455202882380185, "learning_rate": 0.0001, "loss": 1.4083, "loss/crossentropy": 2.4578142166137695, "loss/hidden": 1.203125, "loss/logits": 0.20461352169513702, "loss/reg": 5.830869122291915e-05, "step": 452 }, { "epoch": 0.056625, "grad_norm": 2.0389811992645264, "grad_norm_var": 0.4435531519851603, "learning_rate": 0.0001, "loss": 1.1318, "loss/crossentropy": 2.139033317565918, "loss/hidden": 0.9921875, "loss/logits": 0.1390083134174347, "loss/reg": 5.829246947541833e-05, "step": 453 }, { "epoch": 0.05675, "grad_norm": 1.979454517364502, "grad_norm_var": 0.47698744011981165, "learning_rate": 0.0001, "loss": 1.3115, "loss/crossentropy": 2.546844005584717, "loss/hidden": 1.125, "loss/logits": 0.18587306141853333, "loss/reg": 5.8282243116991594e-05, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.0210747718811035, "grad_norm_var": 0.5030154373593951, "learning_rate": 0.0001, "loss": 1.21, "loss/crossentropy": 2.6095550060272217, "loss/hidden": 1.046875, "loss/logits": 0.16256017982959747, "loss/reg": 5.8266243286198005e-05, "step": 455 }, { "epoch": 0.057, "grad_norm": 2.0944671630859375, "grad_norm_var": 0.520400331750174, "learning_rate": 0.0001, "loss": 1.1407, "loss/crossentropy": 2.450681447982788, "loss/hidden": 0.98828125, "loss/logits": 0.15184549987316132, "loss/reg": 5.8250909205526114e-05, "step": 456 }, { "epoch": 0.057125, "grad_norm": 2.5854806900024414, "grad_norm_var": 0.5178481401493921, "learning_rate": 0.0001, "loss": 1.1308, "loss/crossentropy": 2.8090949058532715, "loss/hidden": 0.97265625, "loss/logits": 0.15754011273384094, "loss/reg": 5.8233421441400424e-05, "step": 457 }, { "epoch": 0.05725, "grad_norm": 6.832178592681885, "grad_norm_var": 1.6526915128701443, "learning_rate": 0.0001, "loss": 1.7544, "loss/crossentropy": 2.4325008392333984, "loss/hidden": 1.5625, "loss/logits": 0.1913643479347229, "loss/reg": 5.821782906423323e-05, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.4911727905273438, "grad_norm_var": 1.6585857165051416, "learning_rate": 0.0001, "loss": 1.277, "loss/crossentropy": 2.5682671070098877, "loss/hidden": 1.09375, "loss/logits": 0.18270117044448853, "loss/reg": 5.820325532113202e-05, "step": 459 }, { "epoch": 0.0575, "grad_norm": 2.2592287063598633, "grad_norm_var": 1.5000806172221008, "learning_rate": 0.0001, "loss": 1.149, "loss/crossentropy": 2.3300366401672363, "loss/hidden": 0.98828125, "loss/logits": 0.16016384959220886, "loss/reg": 5.8191151765640825e-05, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.6110737323760986, "grad_norm_var": 1.4915332961489087, "learning_rate": 0.0001, "loss": 1.4344, "loss/crossentropy": 2.560197591781616, "loss/hidden": 1.21875, "loss/logits": 0.21507461369037628, "loss/reg": 5.817634882987477e-05, "step": 461 }, { "epoch": 0.05775, "grad_norm": 2.6446752548217773, "grad_norm_var": 1.48995546498276, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.5068211555480957, "loss/hidden": 1.0546875, "loss/logits": 0.1828281581401825, "loss/reg": 5.816355405841023e-05, "step": 462 }, { "epoch": 0.057875, "grad_norm": 2.498300075531006, "grad_norm_var": 1.4615785550667995, "learning_rate": 0.0001, "loss": 1.3019, "loss/crossentropy": 2.3765523433685303, "loss/hidden": 1.1328125, "loss/logits": 0.16848215460777283, "loss/reg": 5.814860560349189e-05, "step": 463 }, { "epoch": 0.058, "grad_norm": 2.4674289226531982, "grad_norm_var": 1.3126372255276026, "learning_rate": 0.0001, "loss": 1.3472, "loss/crossentropy": 2.714657783508301, "loss/hidden": 1.1640625, "loss/logits": 0.18256625533103943, "loss/reg": 5.81321437493898e-05, "step": 464 }, { "epoch": 0.058125, "grad_norm": 3.7482964992523193, "grad_norm_var": 1.3780257940909062, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.7645256519317627, "loss/hidden": 1.2109375, "loss/logits": 0.24636635184288025, "loss/reg": 5.811548908241093e-05, "step": 465 }, { "epoch": 0.05825, "grad_norm": 3.1881492137908936, "grad_norm_var": 1.3717908440858895, "learning_rate": 0.0001, "loss": 1.2469, "loss/crossentropy": 2.6280384063720703, "loss/hidden": 1.078125, "loss/logits": 0.16818463802337646, "loss/reg": 5.8095396525459364e-05, "step": 466 }, { "epoch": 0.058375, "grad_norm": 3.4882731437683105, "grad_norm_var": 1.3977675144088226, "learning_rate": 0.0001, "loss": 1.5403, "loss/crossentropy": 1.8358429670333862, "loss/hidden": 1.3203125, "loss/logits": 0.21941694617271423, "loss/reg": 5.807522757095285e-05, "step": 467 }, { "epoch": 0.0585, "grad_norm": 2.530682325363159, "grad_norm_var": 1.391870091660969, "learning_rate": 0.0001, "loss": 1.1578, "loss/crossentropy": 2.3950142860412598, "loss/hidden": 0.99609375, "loss/logits": 0.1611400693655014, "loss/reg": 5.80518099013716e-05, "step": 468 }, { "epoch": 0.058625, "grad_norm": 3.4676575660705566, "grad_norm_var": 1.366390295617852, "learning_rate": 0.0001, "loss": 1.5162, "loss/crossentropy": 2.851280689239502, "loss/hidden": 1.234375, "loss/logits": 0.28122612833976746, "loss/reg": 5.8030982472701e-05, "step": 469 }, { "epoch": 0.05875, "grad_norm": 2.9446208477020264, "grad_norm_var": 1.302065384350945, "learning_rate": 0.0001, "loss": 1.3015, "loss/crossentropy": 2.740093469619751, "loss/hidden": 1.125, "loss/logits": 0.17590749263763428, "loss/reg": 5.800585859105922e-05, "step": 470 }, { "epoch": 0.058875, "grad_norm": 2.7597243785858154, "grad_norm_var": 1.2405377686230998, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.440762996673584, "loss/hidden": 1.015625, "loss/logits": 0.14888577163219452, "loss/reg": 5.7990357163362205e-05, "step": 471 }, { "epoch": 0.059, "grad_norm": 2.8147523403167725, "grad_norm_var": 1.182327943249795, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.5801327228546143, "loss/hidden": 1.140625, "loss/logits": 0.17824885249137878, "loss/reg": 5.7975972595158964e-05, "step": 472 }, { "epoch": 0.059125, "grad_norm": 2.4511027336120605, "grad_norm_var": 1.1923747545104257, "learning_rate": 0.0001, "loss": 1.4217, "loss/crossentropy": 2.5711913108825684, "loss/hidden": 1.203125, "loss/logits": 0.2180328667163849, "loss/reg": 5.796052937512286e-05, "step": 473 }, { "epoch": 0.05925, "grad_norm": 2.9213221073150635, "grad_norm_var": 0.1890407192544025, "learning_rate": 0.0001, "loss": 1.2735, "loss/crossentropy": 2.5805675983428955, "loss/hidden": 1.1015625, "loss/logits": 0.17132875323295593, "loss/reg": 5.794024036731571e-05, "step": 474 }, { "epoch": 0.059375, "grad_norm": 2.6587464809417725, "grad_norm_var": 0.1832162860499608, "learning_rate": 0.0001, "loss": 1.6569, "loss/crossentropy": 2.356299638748169, "loss/hidden": 1.40625, "loss/logits": 0.25005391240119934, "loss/reg": 5.791860894532874e-05, "step": 475 }, { "epoch": 0.0595, "grad_norm": 3.5978729724884033, "grad_norm_var": 0.19139826910290647, "learning_rate": 0.0001, "loss": 1.7357, "loss/crossentropy": 2.0626883506774902, "loss/hidden": 1.4765625, "loss/logits": 0.2585859000682831, "loss/reg": 5.790415525552817e-05, "step": 476 }, { "epoch": 0.059625, "grad_norm": 2.8491876125335693, "grad_norm_var": 0.18498974202791843, "learning_rate": 0.0001, "loss": 1.5276, "loss/crossentropy": 2.5583596229553223, "loss/hidden": 1.2734375, "loss/logits": 0.25358158349990845, "loss/reg": 5.788617272628471e-05, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.5821259021759033, "grad_norm_var": 0.1876924518839881, "learning_rate": 0.0001, "loss": 1.3568, "loss/crossentropy": 2.486640453338623, "loss/hidden": 1.1640625, "loss/logits": 0.19216927886009216, "loss/reg": 5.786680776509456e-05, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.877934217453003, "grad_norm_var": 0.17456917708907038, "learning_rate": 0.0001, "loss": 1.5607, "loss/crossentropy": 2.3836066722869873, "loss/hidden": 1.3203125, "loss/logits": 0.23981472849845886, "loss/reg": 5.785070243291557e-05, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.3281009197235107, "grad_norm_var": 0.1849188959934999, "learning_rate": 0.0001, "loss": 1.2716, "loss/crossentropy": 2.508988380432129, "loss/hidden": 1.078125, "loss/logits": 0.19294525682926178, "loss/reg": 5.783725646324456e-05, "step": 480 }, { "epoch": 0.060125, "grad_norm": 2.8099567890167236, "grad_norm_var": 0.14013939438571937, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.3855881690979004, "loss/hidden": 1.25, "loss/logits": 0.2575419545173645, "loss/reg": 5.782474545412697e-05, "step": 481 }, { "epoch": 0.06025, "grad_norm": 2.9827277660369873, "grad_norm_var": 0.134662315913679, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.5487606525421143, "loss/hidden": 1.25, "loss/logits": 0.2087090015411377, "loss/reg": 5.7816720072878525e-05, "step": 482 }, { "epoch": 0.060375, "grad_norm": 2.306149959564209, "grad_norm_var": 0.1259770764512929, "learning_rate": 0.0001, "loss": 1.2076, "loss/crossentropy": 2.4755747318267822, "loss/hidden": 1.046875, "loss/logits": 0.16014963388442993, "loss/reg": 5.781082290923223e-05, "step": 483 }, { "epoch": 0.0605, "grad_norm": 2.4719114303588867, "grad_norm_var": 0.12834384378027816, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.8203346729278564, "loss/hidden": 1.171875, "loss/logits": 0.20208273828029633, "loss/reg": 5.7795077736955136e-05, "step": 484 }, { "epoch": 0.060625, "grad_norm": 2.300952911376953, "grad_norm_var": 0.10978991346620433, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.610508680343628, "loss/hidden": 1.2265625, "loss/logits": 0.2368427813053131, "loss/reg": 5.778546983492561e-05, "step": 485 }, { "epoch": 0.06075, "grad_norm": 3.3388009071350098, "grad_norm_var": 0.13085586368501342, "learning_rate": 0.0001, "loss": 1.5116, "loss/crossentropy": 2.763427972793579, "loss/hidden": 1.296875, "loss/logits": 0.21419215202331543, "loss/reg": 5.7770797866396606e-05, "step": 486 }, { "epoch": 0.060875, "grad_norm": 2.102293014526367, "grad_norm_var": 0.1572983810037916, "learning_rate": 0.0001, "loss": 1.1595, "loss/crossentropy": 2.204011917114258, "loss/hidden": 1.0, "loss/logits": 0.158901646733284, "loss/reg": 5.7755187299335375e-05, "step": 487 }, { "epoch": 0.061, "grad_norm": 2.766934633255005, "grad_norm_var": 0.15678694409689248, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.537151575088501, "loss/hidden": 1.2109375, "loss/logits": 0.2130882441997528, "loss/reg": 5.774224700871855e-05, "step": 488 }, { "epoch": 0.061125, "grad_norm": 2.0001540184020996, "grad_norm_var": 0.18501104247654798, "learning_rate": 0.0001, "loss": 1.103, "loss/crossentropy": 2.3592050075531006, "loss/hidden": 0.96484375, "loss/logits": 0.13754940032958984, "loss/reg": 5.77289865759667e-05, "step": 489 }, { "epoch": 0.06125, "grad_norm": 2.3166351318359375, "grad_norm_var": 0.18848381138351228, "learning_rate": 0.0001, "loss": 1.3329, "loss/crossentropy": 2.7236411571502686, "loss/hidden": 1.15625, "loss/logits": 0.1761033535003662, "loss/reg": 5.771181167801842e-05, "step": 490 }, { "epoch": 0.061375, "grad_norm": 2.357775926589966, "grad_norm_var": 0.19351960086170053, "learning_rate": 0.0001, "loss": 1.1437, "loss/crossentropy": 2.866445779800415, "loss/hidden": 0.98828125, "loss/logits": 0.15484049916267395, "loss/reg": 5.769642666564323e-05, "step": 491 }, { "epoch": 0.0615, "grad_norm": 3.680264949798584, "grad_norm_var": 0.20463866822373877, "learning_rate": 0.0001, "loss": 1.2002, "loss/crossentropy": 3.115431308746338, "loss/hidden": 1.0390625, "loss/logits": 0.16054463386535645, "loss/reg": 5.7679084420669824e-05, "step": 492 }, { "epoch": 0.061625, "grad_norm": 2.3650856018066406, "grad_norm_var": 0.2051052996774897, "learning_rate": 0.0001, "loss": 1.1996, "loss/crossentropy": 2.6519298553466797, "loss/hidden": 1.0234375, "loss/logits": 0.17554257810115814, "loss/reg": 5.766074173152447e-05, "step": 493 }, { "epoch": 0.06175, "grad_norm": 2.7080323696136475, "grad_norm_var": 0.2058088113620099, "learning_rate": 0.0001, "loss": 1.365, "loss/crossentropy": 2.329538106918335, "loss/hidden": 1.15625, "loss/logits": 0.20815491676330566, "loss/reg": 5.764625166193582e-05, "step": 494 }, { "epoch": 0.061875, "grad_norm": 2.2859530448913574, "grad_norm_var": 0.2063347958167308, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.6445348262786865, "loss/hidden": 1.125, "loss/logits": 0.1738019585609436, "loss/reg": 5.763155422755517e-05, "step": 495 }, { "epoch": 0.062, "grad_norm": 2.771320343017578, "grad_norm_var": 0.20431087500909348, "learning_rate": 0.0001, "loss": 1.4714, "loss/crossentropy": 2.340728282928467, "loss/hidden": 1.2578125, "loss/logits": 0.21303007006645203, "loss/reg": 5.761897409684025e-05, "step": 496 }, { "epoch": 0.062125, "grad_norm": 3.022183656692505, "grad_norm_var": 0.21312900983479016, "learning_rate": 0.0001, "loss": 1.4858, "loss/crossentropy": 2.6772336959838867, "loss/hidden": 1.265625, "loss/logits": 0.2196260541677475, "loss/reg": 5.761081411037594e-05, "step": 497 }, { "epoch": 0.06225, "grad_norm": 13.948429107666016, "grad_norm_var": 8.27193520122967, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.862323760986328, "loss/hidden": 1.171875, "loss/logits": 0.19083081185817719, "loss/reg": 5.7596374972490594e-05, "step": 498 }, { "epoch": 0.062375, "grad_norm": 2.6107678413391113, "grad_norm_var": 8.237513777759569, "learning_rate": 0.0001, "loss": 1.6771, "loss/crossentropy": 2.1725099086761475, "loss/hidden": 1.40625, "loss/logits": 0.2702314555644989, "loss/reg": 5.7586628827266395e-05, "step": 499 }, { "epoch": 0.0625, "grad_norm": 2.5658040046691895, "grad_norm_var": 8.22750426778598, "learning_rate": 0.0001, "loss": 1.4381, "loss/crossentropy": 2.246595859527588, "loss/hidden": 1.25, "loss/logits": 0.18755751848220825, "loss/reg": 5.756897371611558e-05, "step": 500 }, { "epoch": 0.062625, "grad_norm": 2.179478168487549, "grad_norm_var": 8.244953306240525, "learning_rate": 0.0001, "loss": 1.291, "loss/crossentropy": 2.488560199737549, "loss/hidden": 1.1015625, "loss/logits": 0.1888759732246399, "loss/reg": 5.755467645940371e-05, "step": 501 }, { "epoch": 0.06275, "grad_norm": 2.0030248165130615, "grad_norm_var": 8.352009291243755, "learning_rate": 0.0001, "loss": 1.2442, "loss/crossentropy": 2.5138843059539795, "loss/hidden": 1.0546875, "loss/logits": 0.18891112506389618, "loss/reg": 5.754067751695402e-05, "step": 502 }, { "epoch": 0.062875, "grad_norm": 2.3097050189971924, "grad_norm_var": 8.323504212905837, "learning_rate": 0.0001, "loss": 1.0807, "loss/crossentropy": 2.534362554550171, "loss/hidden": 0.93359375, "loss/logits": 0.1465301215648651, "loss/reg": 5.752982178819366e-05, "step": 503 }, { "epoch": 0.063, "grad_norm": 2.1386523246765137, "grad_norm_var": 8.388074418328216, "learning_rate": 0.0001, "loss": 1.267, "loss/crossentropy": 2.5574960708618164, "loss/hidden": 1.0859375, "loss/logits": 0.18048033118247986, "loss/reg": 5.7512213970767334e-05, "step": 504 }, { "epoch": 0.063125, "grad_norm": 2.0644736289978027, "grad_norm_var": 8.378009254211024, "learning_rate": 0.0001, "loss": 1.1193, "loss/crossentropy": 2.4307045936584473, "loss/hidden": 0.96484375, "loss/logits": 0.15389274060726166, "loss/reg": 5.749760748585686e-05, "step": 505 }, { "epoch": 0.06325, "grad_norm": 3.5246083736419678, "grad_norm_var": 8.32564739400077, "learning_rate": 0.0001, "loss": 1.904, "loss/crossentropy": 2.737135171890259, "loss/hidden": 1.5078125, "loss/logits": 0.395632803440094, "loss/reg": 5.74878795305267e-05, "step": 506 }, { "epoch": 0.063375, "grad_norm": 2.487663745880127, "grad_norm_var": 8.310670261508536, "learning_rate": 0.0001, "loss": 1.3095, "loss/crossentropy": 2.5345706939697266, "loss/hidden": 1.1171875, "loss/logits": 0.19169974327087402, "loss/reg": 5.747407703893259e-05, "step": 507 }, { "epoch": 0.0635, "grad_norm": 2.319613218307495, "grad_norm_var": 8.355867662618861, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.439621925354004, "loss/hidden": 1.1875, "loss/logits": 0.24147875607013702, "loss/reg": 5.746008537244052e-05, "step": 508 }, { "epoch": 0.063625, "grad_norm": 2.239403009414673, "grad_norm_var": 8.370955905049472, "learning_rate": 0.0001, "loss": 1.307, "loss/crossentropy": 2.4812705516815186, "loss/hidden": 1.1171875, "loss/logits": 0.18924759328365326, "loss/reg": 5.744830923504196e-05, "step": 509 }, { "epoch": 0.06375, "grad_norm": 2.440845489501953, "grad_norm_var": 8.39289750619492, "learning_rate": 0.0001, "loss": 1.1623, "loss/crossentropy": 2.5835447311401367, "loss/hidden": 0.984375, "loss/logits": 0.17732426524162292, "loss/reg": 5.744034206145443e-05, "step": 510 }, { "epoch": 0.063875, "grad_norm": 3.7700071334838867, "grad_norm_var": 8.353245171235498, "learning_rate": 0.0001, "loss": 1.5529, "loss/crossentropy": 2.9319827556610107, "loss/hidden": 1.2734375, "loss/logits": 0.27884694933891296, "loss/reg": 5.742744542658329e-05, "step": 511 }, { "epoch": 0.064, "grad_norm": 3.187791347503662, "grad_norm_var": 8.3361305665004, "learning_rate": 0.0001, "loss": 1.6948, "loss/crossentropy": 2.616928815841675, "loss/hidden": 1.40625, "loss/logits": 0.2879628539085388, "loss/reg": 5.7413530157646164e-05, "step": 512 }, { "epoch": 0.064125, "grad_norm": 2.2276997566223145, "grad_norm_var": 8.405092706710946, "learning_rate": 0.0001, "loss": 1.2498, "loss/crossentropy": 2.5594594478607178, "loss/hidden": 1.0625, "loss/logits": 0.18669450283050537, "loss/reg": 5.740176129620522e-05, "step": 513 }, { "epoch": 0.06425, "grad_norm": 2.571033239364624, "grad_norm_var": 0.26774881553773466, "learning_rate": 0.0001, "loss": 1.5606, "loss/crossentropy": 2.3838601112365723, "loss/hidden": 1.296875, "loss/logits": 0.2631247639656067, "loss/reg": 5.7387296692468226e-05, "step": 514 }, { "epoch": 0.064375, "grad_norm": 2.2190117835998535, "grad_norm_var": 0.2736462331703469, "learning_rate": 0.0001, "loss": 1.2127, "loss/crossentropy": 2.3376893997192383, "loss/hidden": 1.046875, "loss/logits": 0.16529923677444458, "loss/reg": 5.7369947171537206e-05, "step": 515 }, { "epoch": 0.0645, "grad_norm": 3.6509628295898438, "grad_norm_var": 0.3545153452463372, "learning_rate": 0.0001, "loss": 1.4324, "loss/crossentropy": 2.669734001159668, "loss/hidden": 1.2421875, "loss/logits": 0.18968772888183594, "loss/reg": 5.7353197917109355e-05, "step": 516 }, { "epoch": 0.064625, "grad_norm": 2.248657703399658, "grad_norm_var": 0.351088953977406, "learning_rate": 0.0001, "loss": 1.2818, "loss/crossentropy": 2.6037304401397705, "loss/hidden": 1.09375, "loss/logits": 0.1875147819519043, "loss/reg": 5.7342589570907876e-05, "step": 517 }, { "epoch": 0.06475, "grad_norm": 2.4198105335235596, "grad_norm_var": 0.32945477622143454, "learning_rate": 0.0001, "loss": 1.1107, "loss/crossentropy": 2.6085071563720703, "loss/hidden": 0.96875, "loss/logits": 0.1413734257221222, "loss/reg": 5.7321107306052e-05, "step": 518 }, { "epoch": 0.064875, "grad_norm": 2.1648852825164795, "grad_norm_var": 0.33663639522773486, "learning_rate": 0.0001, "loss": 1.4, "loss/crossentropy": 2.177536964416504, "loss/hidden": 1.1796875, "loss/logits": 0.2197086662054062, "loss/reg": 5.7304925576318055e-05, "step": 519 }, { "epoch": 0.065, "grad_norm": 3.105713129043579, "grad_norm_var": 0.33499459859275643, "learning_rate": 0.0001, "loss": 1.3224, "loss/crossentropy": 3.021796941757202, "loss/hidden": 1.1328125, "loss/logits": 0.1890622079372406, "loss/reg": 5.7283985370304435e-05, "step": 520 }, { "epoch": 0.065125, "grad_norm": 2.680781364440918, "grad_norm_var": 0.3093752297956028, "learning_rate": 0.0001, "loss": 1.4408, "loss/crossentropy": 2.4209651947021484, "loss/hidden": 1.203125, "loss/logits": 0.23711565136909485, "loss/reg": 5.726591916754842e-05, "step": 521 }, { "epoch": 0.06525, "grad_norm": 2.415611505508423, "grad_norm_var": 0.2648511354846446, "learning_rate": 0.0001, "loss": 1.2277, "loss/crossentropy": 2.623185873031616, "loss/hidden": 1.0546875, "loss/logits": 0.172480508685112, "loss/reg": 5.724430957343429e-05, "step": 522 }, { "epoch": 0.065375, "grad_norm": 2.7129733562469482, "grad_norm_var": 0.2636174732540553, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.6856141090393066, "loss/hidden": 1.328125, "loss/logits": 0.22241194546222687, "loss/reg": 5.722355126636103e-05, "step": 523 }, { "epoch": 0.0655, "grad_norm": 2.5169126987457275, "grad_norm_var": 0.25740049578529633, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.5400278568267822, "loss/hidden": 1.2421875, "loss/logits": 0.23280034959316254, "loss/reg": 5.720969784306362e-05, "step": 524 }, { "epoch": 0.065625, "grad_norm": 3.141322612762451, "grad_norm_var": 0.25757144722960346, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.6088011264801025, "loss/hidden": 1.203125, "loss/logits": 0.1916118562221527, "loss/reg": 5.71877826587297e-05, "step": 525 }, { "epoch": 0.06575, "grad_norm": 2.1077466011047363, "grad_norm_var": 0.276776634481363, "learning_rate": 0.0001, "loss": 1.2543, "loss/crossentropy": 2.3197500705718994, "loss/hidden": 1.078125, "loss/logits": 0.1755562126636505, "loss/reg": 5.716781743103638e-05, "step": 526 }, { "epoch": 0.065875, "grad_norm": 3.1689445972442627, "grad_norm_var": 0.2133083163409907, "learning_rate": 0.0001, "loss": 1.4275, "loss/crossentropy": 2.9355862140655518, "loss/hidden": 1.21875, "loss/logits": 0.20817086100578308, "loss/reg": 5.714903454645537e-05, "step": 527 }, { "epoch": 0.066, "grad_norm": 2.285956859588623, "grad_norm_var": 0.20052447759750577, "learning_rate": 0.0001, "loss": 1.1824, "loss/crossentropy": 2.760286331176758, "loss/hidden": 1.0234375, "loss/logits": 0.1584203690290451, "loss/reg": 5.712690472137183e-05, "step": 528 }, { "epoch": 0.066125, "grad_norm": 3.018244981765747, "grad_norm_var": 0.2000914000663156, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.4221365451812744, "loss/hidden": 1.3359375, "loss/logits": 0.26882410049438477, "loss/reg": 5.7109886256512254e-05, "step": 529 }, { "epoch": 0.06625, "grad_norm": 2.6761245727539062, "grad_norm_var": 0.19965014586136837, "learning_rate": 0.0001, "loss": 1.4026, "loss/crossentropy": 2.5661122798919678, "loss/hidden": 1.203125, "loss/logits": 0.19894230365753174, "loss/reg": 5.709614561055787e-05, "step": 530 }, { "epoch": 0.066375, "grad_norm": 3.7308688163757324, "grad_norm_var": 0.25394415558544003, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.674487352371216, "loss/hidden": 1.1875, "loss/logits": 0.18333487212657928, "loss/reg": 5.708081880584359e-05, "step": 531 }, { "epoch": 0.0665, "grad_norm": 5.148390293121338, "grad_norm_var": 0.5734027576420241, "learning_rate": 0.0001, "loss": 2.0224, "loss/crossentropy": 2.5511114597320557, "loss/hidden": 1.6875, "loss/logits": 0.3343617022037506, "loss/reg": 5.706860974896699e-05, "step": 532 }, { "epoch": 0.066625, "grad_norm": 4.639659881591797, "grad_norm_var": 0.7401371960890089, "learning_rate": 0.0001, "loss": 2.0703, "loss/crossentropy": 2.8264873027801514, "loss/hidden": 1.609375, "loss/logits": 0.4603120684623718, "loss/reg": 5.705539297196083e-05, "step": 533 }, { "epoch": 0.06675, "grad_norm": 2.6590423583984375, "grad_norm_var": 0.7253392327299117, "learning_rate": 0.0001, "loss": 1.1927, "loss/crossentropy": 2.692796230316162, "loss/hidden": 1.03125, "loss/logits": 0.1608980894088745, "loss/reg": 5.7040437241084874e-05, "step": 534 }, { "epoch": 0.066875, "grad_norm": 2.4473471641540527, "grad_norm_var": 0.6984663971172343, "learning_rate": 0.0001, "loss": 1.2469, "loss/crossentropy": 2.6443886756896973, "loss/hidden": 1.0703125, "loss/logits": 0.1760546863079071, "loss/reg": 5.702269481844269e-05, "step": 535 }, { "epoch": 0.067, "grad_norm": 2.676015853881836, "grad_norm_var": 0.7055813256444667, "learning_rate": 0.0001, "loss": 1.5843, "loss/crossentropy": 2.431217908859253, "loss/hidden": 1.3359375, "loss/logits": 0.24780328571796417, "loss/reg": 5.7007055147551e-05, "step": 536 }, { "epoch": 0.067125, "grad_norm": 2.392594575881958, "grad_norm_var": 0.723100302829251, "learning_rate": 0.0001, "loss": 1.4829, "loss/crossentropy": 2.6035330295562744, "loss/hidden": 1.25, "loss/logits": 0.23232683539390564, "loss/reg": 5.699544635717757e-05, "step": 537 }, { "epoch": 0.06725, "grad_norm": 3.0852084159851074, "grad_norm_var": 0.7004121508792098, "learning_rate": 0.0001, "loss": 1.662, "loss/crossentropy": 2.6200292110443115, "loss/hidden": 1.359375, "loss/logits": 0.30209141969680786, "loss/reg": 5.697782398783602e-05, "step": 538 }, { "epoch": 0.067375, "grad_norm": 3.3582072257995605, "grad_norm_var": 0.6995490112188842, "learning_rate": 0.0001, "loss": 1.4034, "loss/crossentropy": 2.590505599975586, "loss/hidden": 1.1875, "loss/logits": 0.2153635025024414, "loss/reg": 5.69607327634003e-05, "step": 539 }, { "epoch": 0.0675, "grad_norm": 2.4375123977661133, "grad_norm_var": 0.705753805030601, "learning_rate": 0.0001, "loss": 1.2346, "loss/crossentropy": 2.815932035446167, "loss/hidden": 1.0703125, "loss/logits": 0.16367268562316895, "loss/reg": 5.6944831158034503e-05, "step": 540 }, { "epoch": 0.067625, "grad_norm": 3.832122564315796, "grad_norm_var": 0.742993530751691, "learning_rate": 0.0001, "loss": 1.3468, "loss/crossentropy": 2.4388320446014404, "loss/hidden": 1.15625, "loss/logits": 0.1899527907371521, "loss/reg": 5.6931155995698646e-05, "step": 541 }, { "epoch": 0.06775, "grad_norm": 2.635655641555786, "grad_norm_var": 0.6902874276451028, "learning_rate": 0.0001, "loss": 1.3026, "loss/crossentropy": 2.51385760307312, "loss/hidden": 1.125, "loss/logits": 0.17700721323490143, "loss/reg": 5.6917735491879284e-05, "step": 542 }, { "epoch": 0.067875, "grad_norm": 2.1058261394500732, "grad_norm_var": 0.7563971927113601, "learning_rate": 0.0001, "loss": 1.112, "loss/crossentropy": 2.427570343017578, "loss/hidden": 0.9765625, "loss/logits": 0.13487987220287323, "loss/reg": 5.690442776540294e-05, "step": 543 }, { "epoch": 0.068, "grad_norm": 2.943103551864624, "grad_norm_var": 0.7146417206132497, "learning_rate": 0.0001, "loss": 1.3356, "loss/crossentropy": 2.8370819091796875, "loss/hidden": 1.1484375, "loss/logits": 0.186607226729393, "loss/reg": 5.68877840123605e-05, "step": 544 }, { "epoch": 0.068125, "grad_norm": 3.8723862171173096, "grad_norm_var": 0.7496049567117694, "learning_rate": 0.0001, "loss": 1.5018, "loss/crossentropy": 2.479180097579956, "loss/hidden": 1.25, "loss/logits": 0.2512153685092926, "loss/reg": 5.6872839195420966e-05, "step": 545 }, { "epoch": 0.06825, "grad_norm": 2.1730518341064453, "grad_norm_var": 0.7982148549638083, "learning_rate": 0.0001, "loss": 1.3274, "loss/crossentropy": 2.7552671432495117, "loss/hidden": 1.140625, "loss/logits": 0.18620190024375916, "loss/reg": 5.685817450284958e-05, "step": 546 }, { "epoch": 0.068375, "grad_norm": 2.5830624103546143, "grad_norm_var": 0.7891437401193322, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.976935386657715, "loss/hidden": 1.1171875, "loss/logits": 0.20067663490772247, "loss/reg": 5.684147254214622e-05, "step": 547 }, { "epoch": 0.0685, "grad_norm": 2.8029427528381348, "grad_norm_var": 0.48043981243007633, "learning_rate": 0.0001, "loss": 1.509, "loss/crossentropy": 2.4375596046447754, "loss/hidden": 1.2578125, "loss/logits": 0.25059816241264343, "loss/reg": 5.682710980181582e-05, "step": 548 }, { "epoch": 0.068625, "grad_norm": 2.4281182289123535, "grad_norm_var": 0.27763671155671144, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.510115623474121, "loss/hidden": 1.21875, "loss/logits": 0.21264401078224182, "loss/reg": 5.681176844518632e-05, "step": 549 }, { "epoch": 0.06875, "grad_norm": 2.535102605819702, "grad_norm_var": 0.2805462672149124, "learning_rate": 0.0001, "loss": 1.6199, "loss/crossentropy": 2.3414230346679688, "loss/hidden": 1.3828125, "loss/logits": 0.2365313321352005, "loss/reg": 5.6794018746586516e-05, "step": 550 }, { "epoch": 0.068875, "grad_norm": 2.8906142711639404, "grad_norm_var": 0.2738004819147721, "learning_rate": 0.0001, "loss": 1.3797, "loss/crossentropy": 2.3271195888519287, "loss/hidden": 1.1796875, "loss/logits": 0.19946351647377014, "loss/reg": 5.6774406402837485e-05, "step": 551 }, { "epoch": 0.069, "grad_norm": 3.370306968688965, "grad_norm_var": 0.2927309791110661, "learning_rate": 0.0001, "loss": 1.3856, "loss/crossentropy": 2.3809690475463867, "loss/hidden": 1.171875, "loss/logits": 0.2131776064634323, "loss/reg": 5.675842476193793e-05, "step": 552 }, { "epoch": 0.069125, "grad_norm": 4.104588508605957, "grad_norm_var": 0.37370332603284834, "learning_rate": 0.0001, "loss": 1.4949, "loss/crossentropy": 2.32270884513855, "loss/hidden": 1.296875, "loss/logits": 0.19749879837036133, "loss/reg": 5.6740394938969985e-05, "step": 553 }, { "epoch": 0.06925, "grad_norm": 2.1966323852539062, "grad_norm_var": 0.40671981467929375, "learning_rate": 0.0001, "loss": 1.2987, "loss/crossentropy": 2.56730580329895, "loss/hidden": 1.1171875, "loss/logits": 0.18095816671848297, "loss/reg": 5.6720054999459535e-05, "step": 554 }, { "epoch": 0.069375, "grad_norm": 3.2739336490631104, "grad_norm_var": 0.40192322247623313, "learning_rate": 0.0001, "loss": 1.7776, "loss/crossentropy": 2.051370143890381, "loss/hidden": 1.515625, "loss/logits": 0.26144689321517944, "loss/reg": 5.670300015481189e-05, "step": 555 }, { "epoch": 0.0695, "grad_norm": 2.814973831176758, "grad_norm_var": 0.3882282893863246, "learning_rate": 0.0001, "loss": 1.3358, "loss/crossentropy": 2.2743093967437744, "loss/hidden": 1.171875, "loss/logits": 0.16339904069900513, "loss/reg": 5.66886410524603e-05, "step": 556 }, { "epoch": 0.069625, "grad_norm": 2.559269428253174, "grad_norm_var": 0.3330167895556078, "learning_rate": 0.0001, "loss": 1.4237, "loss/crossentropy": 2.19891357421875, "loss/hidden": 1.2109375, "loss/logits": 0.212164968252182, "loss/reg": 5.667324876412749e-05, "step": 557 }, { "epoch": 0.06975, "grad_norm": 2.4474921226501465, "grad_norm_var": 0.34012043993938973, "learning_rate": 0.0001, "loss": 1.3187, "loss/crossentropy": 2.409031867980957, "loss/hidden": 1.140625, "loss/logits": 0.17748260498046875, "loss/reg": 5.6656310334801674e-05, "step": 558 }, { "epoch": 0.069875, "grad_norm": 2.838435173034668, "grad_norm_var": 0.3040173512426804, "learning_rate": 0.0001, "loss": 1.2111, "loss/crossentropy": 2.6684110164642334, "loss/hidden": 1.0390625, "loss/logits": 0.17149032652378082, "loss/reg": 5.664560740115121e-05, "step": 559 }, { "epoch": 0.07, "grad_norm": 2.46317458152771, "grad_norm_var": 0.3133912614469295, "learning_rate": 0.0001, "loss": 1.414, "loss/crossentropy": 2.6221396923065186, "loss/hidden": 1.203125, "loss/logits": 0.21026402711868286, "loss/reg": 5.663983756676316e-05, "step": 560 }, { "epoch": 0.070125, "grad_norm": 2.4146885871887207, "grad_norm_var": 0.24449850015313715, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.4179940223693848, "loss/hidden": 1.09375, "loss/logits": 0.17744633555412292, "loss/reg": 5.66331364098005e-05, "step": 561 }, { "epoch": 0.07025, "grad_norm": 2.1373863220214844, "grad_norm_var": 0.24729082719934822, "learning_rate": 0.0001, "loss": 1.3358, "loss/crossentropy": 2.3687002658843994, "loss/hidden": 1.15625, "loss/logits": 0.17901673913002014, "loss/reg": 5.6618908274685964e-05, "step": 562 }, { "epoch": 0.070375, "grad_norm": 2.2190897464752197, "grad_norm_var": 0.26324956728146254, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.2975175380706787, "loss/hidden": 1.1953125, "loss/logits": 0.20690736174583435, "loss/reg": 5.660299211740494e-05, "step": 563 }, { "epoch": 0.0705, "grad_norm": 2.7386221885681152, "grad_norm_var": 0.26278435237145437, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 2.603506565093994, "loss/hidden": 1.1015625, "loss/logits": 0.17539449036121368, "loss/reg": 5.658900772687048e-05, "step": 564 }, { "epoch": 0.070625, "grad_norm": 2.46307635307312, "grad_norm_var": 0.2615257576478134, "learning_rate": 0.0001, "loss": 1.2791, "loss/crossentropy": 2.4369289875030518, "loss/hidden": 1.1171875, "loss/logits": 0.16132491827011108, "loss/reg": 5.6575612688902766e-05, "step": 565 }, { "epoch": 0.07075, "grad_norm": 2.45520281791687, "grad_norm_var": 0.26385949291743505, "learning_rate": 0.0001, "loss": 1.4484, "loss/crossentropy": 2.494572877883911, "loss/hidden": 1.234375, "loss/logits": 0.21350392699241638, "loss/reg": 5.656494977301918e-05, "step": 566 }, { "epoch": 0.070875, "grad_norm": 2.569112539291382, "grad_norm_var": 0.26265097215405053, "learning_rate": 0.0001, "loss": 1.2421, "loss/crossentropy": 2.3923959732055664, "loss/hidden": 1.0625, "loss/logits": 0.17903804779052734, "loss/reg": 5.65506998100318e-05, "step": 567 }, { "epoch": 0.071, "grad_norm": 2.2171080112457275, "grad_norm_var": 0.24141352450480985, "learning_rate": 0.0001, "loss": 1.1494, "loss/crossentropy": 2.3929827213287354, "loss/hidden": 0.9921875, "loss/logits": 0.15663662552833557, "loss/reg": 5.653856715071015e-05, "step": 568 }, { "epoch": 0.071125, "grad_norm": 3.1210575103759766, "grad_norm_var": 0.10712755072980235, "learning_rate": 0.0001, "loss": 1.6912, "loss/crossentropy": 2.2765614986419678, "loss/hidden": 1.453125, "loss/logits": 0.23751139640808105, "loss/reg": 5.652818435919471e-05, "step": 569 }, { "epoch": 0.07125, "grad_norm": 3.839294672012329, "grad_norm_var": 0.19660925262190102, "learning_rate": 0.0001, "loss": 1.8286, "loss/crossentropy": 2.0632450580596924, "loss/hidden": 1.546875, "loss/logits": 0.28118443489074707, "loss/reg": 5.651290121022612e-05, "step": 570 }, { "epoch": 0.071375, "grad_norm": 3.240445613861084, "grad_norm_var": 0.19394141138966817, "learning_rate": 0.0001, "loss": 1.4813, "loss/crossentropy": 2.642064094543457, "loss/hidden": 1.2265625, "loss/logits": 0.2541462779045105, "loss/reg": 5.650040111504495e-05, "step": 571 }, { "epoch": 0.0715, "grad_norm": 2.4979491233825684, "grad_norm_var": 0.19361522865255718, "learning_rate": 0.0001, "loss": 1.4798, "loss/crossentropy": 2.3248634338378906, "loss/hidden": 1.234375, "loss/logits": 0.2448451817035675, "loss/reg": 5.648669321089983e-05, "step": 572 }, { "epoch": 0.071625, "grad_norm": 2.3329668045043945, "grad_norm_var": 0.19921690431924213, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.8348701000213623, "loss/hidden": 1.1484375, "loss/logits": 0.21398252248764038, "loss/reg": 5.6470726121915504e-05, "step": 573 }, { "epoch": 0.07175, "grad_norm": 2.494943857192993, "grad_norm_var": 0.19823649604586155, "learning_rate": 0.0001, "loss": 1.4279, "loss/crossentropy": 2.077019214630127, "loss/hidden": 1.21875, "loss/logits": 0.20862454175949097, "loss/reg": 5.645084456773475e-05, "step": 574 }, { "epoch": 0.071875, "grad_norm": 2.4467947483062744, "grad_norm_var": 0.1968164545969214, "learning_rate": 0.0001, "loss": 1.2799, "loss/crossentropy": 2.764270067214966, "loss/hidden": 1.109375, "loss/logits": 0.16992923617362976, "loss/reg": 5.643080658046529e-05, "step": 575 }, { "epoch": 0.072, "grad_norm": 4.124429702758789, "grad_norm_var": 0.33829023147095444, "learning_rate": 0.0001, "loss": 1.8319, "loss/crossentropy": 2.614333152770996, "loss/hidden": 1.5859375, "loss/logits": 0.245355024933815, "loss/reg": 5.64096771995537e-05, "step": 576 }, { "epoch": 0.072125, "grad_norm": 2.9513115882873535, "grad_norm_var": 0.3353724391757993, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.291598320007324, "loss/hidden": 1.234375, "loss/logits": 0.24986042082309723, "loss/reg": 5.639591472572647e-05, "step": 577 }, { "epoch": 0.07225, "grad_norm": 3.004474639892578, "grad_norm_var": 0.31262981045543464, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.2871763706207275, "loss/hidden": 1.203125, "loss/logits": 0.23880186676979065, "loss/reg": 5.638147922581993e-05, "step": 578 }, { "epoch": 0.072375, "grad_norm": 3.2634410858154297, "grad_norm_var": 0.3006388387902421, "learning_rate": 0.0001, "loss": 1.5696, "loss/crossentropy": 2.5439798831939697, "loss/hidden": 1.3125, "loss/logits": 0.2565382122993469, "loss/reg": 5.637051799567416e-05, "step": 579 }, { "epoch": 0.0725, "grad_norm": 2.787332534790039, "grad_norm_var": 0.2999987245038954, "learning_rate": 0.0001, "loss": 1.3747, "loss/crossentropy": 2.5118601322174072, "loss/hidden": 1.1796875, "loss/logits": 0.19444304704666138, "loss/reg": 5.635723573504947e-05, "step": 580 }, { "epoch": 0.072625, "grad_norm": 2.7229959964752197, "grad_norm_var": 0.2903593389163989, "learning_rate": 0.0001, "loss": 1.5141, "loss/crossentropy": 2.7853024005889893, "loss/hidden": 1.3046875, "loss/logits": 0.20882655680179596, "loss/reg": 5.634501940221526e-05, "step": 581 }, { "epoch": 0.07275, "grad_norm": 2.594968318939209, "grad_norm_var": 0.28367694660223985, "learning_rate": 0.0001, "loss": 1.093, "loss/crossentropy": 2.6438136100769043, "loss/hidden": 0.94921875, "loss/logits": 0.1432032436132431, "loss/reg": 5.633091495838016e-05, "step": 582 }, { "epoch": 0.072875, "grad_norm": 2.3869121074676514, "grad_norm_var": 0.2934995682895912, "learning_rate": 0.0001, "loss": 1.3018, "loss/crossentropy": 2.4067459106445312, "loss/hidden": 1.1171875, "loss/logits": 0.18405942618846893, "loss/reg": 5.631797466776334e-05, "step": 583 }, { "epoch": 0.073, "grad_norm": 3.3360087871551514, "grad_norm_var": 0.2733505680051763, "learning_rate": 0.0001, "loss": 1.1895, "loss/crossentropy": 2.7274060249328613, "loss/hidden": 1.015625, "loss/logits": 0.17334823310375214, "loss/reg": 5.630190935335122e-05, "step": 584 }, { "epoch": 0.073125, "grad_norm": 2.636382818222046, "grad_norm_var": 0.27675729437985763, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.4599273204803467, "loss/hidden": 1.0859375, "loss/logits": 0.17271864414215088, "loss/reg": 5.6286880862899125e-05, "step": 585 }, { "epoch": 0.07325, "grad_norm": 2.3085126876831055, "grad_norm_var": 0.2348241054879698, "learning_rate": 0.0001, "loss": 1.1438, "loss/crossentropy": 2.4817559719085693, "loss/hidden": 0.98046875, "loss/logits": 0.16277402639389038, "loss/reg": 5.6276072427863255e-05, "step": 586 }, { "epoch": 0.073375, "grad_norm": 2.8122873306274414, "grad_norm_var": 0.22231448974219556, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.3126115798950195, "loss/hidden": 1.3515625, "loss/logits": 0.25852900743484497, "loss/reg": 5.625975609291345e-05, "step": 587 }, { "epoch": 0.0735, "grad_norm": 2.5446043014526367, "grad_norm_var": 0.22060978250349308, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.3330647945404053, "loss/hidden": 1.171875, "loss/logits": 0.1998036503791809, "loss/reg": 5.6243337894557044e-05, "step": 588 }, { "epoch": 0.073625, "grad_norm": 3.0254905223846436, "grad_norm_var": 0.20775786644308383, "learning_rate": 0.0001, "loss": 1.4257, "loss/crossentropy": 2.4669790267944336, "loss/hidden": 1.1953125, "loss/logits": 0.22982466220855713, "loss/reg": 5.6233355280710384e-05, "step": 589 }, { "epoch": 0.07375, "grad_norm": 2.742598295211792, "grad_norm_var": 0.20019536457655604, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.350428581237793, "loss/hidden": 1.125, "loss/logits": 0.1831551194190979, "loss/reg": 5.621850868919864e-05, "step": 590 }, { "epoch": 0.073875, "grad_norm": 2.2741334438323975, "grad_norm_var": 0.21146840571566727, "learning_rate": 0.0001, "loss": 1.3572, "loss/crossentropy": 2.237168550491333, "loss/hidden": 1.1484375, "loss/logits": 0.20822051167488098, "loss/reg": 5.6203607528004795e-05, "step": 591 }, { "epoch": 0.074, "grad_norm": 4.630364894866943, "grad_norm_var": 0.3137917114378768, "learning_rate": 0.0001, "loss": 1.8838, "loss/crossentropy": 2.4884331226348877, "loss/hidden": 1.5859375, "loss/logits": 0.2973060607910156, "loss/reg": 5.619114017463289e-05, "step": 592 }, { "epoch": 0.074125, "grad_norm": 2.7753021717071533, "grad_norm_var": 0.3139690476205639, "learning_rate": 0.0001, "loss": 1.465, "loss/crossentropy": 2.409294843673706, "loss/hidden": 1.2421875, "loss/logits": 0.22228561341762543, "loss/reg": 5.617353235720657e-05, "step": 593 }, { "epoch": 0.07425, "grad_norm": 2.694066286087036, "grad_norm_var": 0.3142336147439567, "learning_rate": 0.0001, "loss": 1.6625, "loss/crossentropy": 2.588970899581909, "loss/hidden": 1.3828125, "loss/logits": 0.27915188670158386, "loss/reg": 5.6154247431550175e-05, "step": 594 }, { "epoch": 0.074375, "grad_norm": 7.720087051391602, "grad_norm_var": 1.8036632855610812, "learning_rate": 0.0001, "loss": 2.0674, "loss/crossentropy": 2.8772268295288086, "loss/hidden": 1.71875, "loss/logits": 0.3481142520904541, "loss/reg": 5.61414854018949e-05, "step": 595 }, { "epoch": 0.0745, "grad_norm": 2.24833083152771, "grad_norm_var": 1.8460523547946992, "learning_rate": 0.0001, "loss": 1.3332, "loss/crossentropy": 2.568021297454834, "loss/hidden": 1.140625, "loss/logits": 0.19198307394981384, "loss/reg": 5.612680615740828e-05, "step": 596 }, { "epoch": 0.074625, "grad_norm": 2.1800825595855713, "grad_norm_var": 1.8911004193173881, "learning_rate": 0.0001, "loss": 1.1532, "loss/crossentropy": 2.43203067779541, "loss/hidden": 1.0, "loss/logits": 0.1526886522769928, "loss/reg": 5.6111755839083344e-05, "step": 597 }, { "epoch": 0.07475, "grad_norm": 2.212115526199341, "grad_norm_var": 1.9238408264416513, "learning_rate": 0.0001, "loss": 1.2472, "loss/crossentropy": 2.493605613708496, "loss/hidden": 1.0703125, "loss/logits": 0.17635077238082886, "loss/reg": 5.609134313999675e-05, "step": 598 }, { "epoch": 0.074875, "grad_norm": 3.4206509590148926, "grad_norm_var": 1.9015840455427668, "learning_rate": 0.0001, "loss": 1.6227, "loss/crossentropy": 2.383911371231079, "loss/hidden": 1.3515625, "loss/logits": 0.2705824673175812, "loss/reg": 5.607017010333948e-05, "step": 599 }, { "epoch": 0.075, "grad_norm": 2.6518476009368896, "grad_norm_var": 1.9090875079187366, "learning_rate": 0.0001, "loss": 1.311, "loss/crossentropy": 2.764665126800537, "loss/hidden": 1.1328125, "loss/logits": 0.17762941122055054, "loss/reg": 5.605430851574056e-05, "step": 600 }, { "epoch": 0.075125, "grad_norm": 2.1637043952941895, "grad_norm_var": 1.9494220257467947, "learning_rate": 0.0001, "loss": 1.2314, "loss/crossentropy": 2.510344982147217, "loss/hidden": 1.0625, "loss/logits": 0.16831059753894806, "loss/reg": 5.603917452390306e-05, "step": 601 }, { "epoch": 0.07525, "grad_norm": 3.601780891418457, "grad_norm_var": 1.930362870052075, "learning_rate": 0.0001, "loss": 1.6586, "loss/crossentropy": 2.4858670234680176, "loss/hidden": 1.375, "loss/logits": 0.2830356955528259, "loss/reg": 5.602244709734805e-05, "step": 602 }, { "epoch": 0.075375, "grad_norm": 2.3047375679016113, "grad_norm_var": 1.9663459192058854, "learning_rate": 0.0001, "loss": 1.2404, "loss/crossentropy": 2.6084353923797607, "loss/hidden": 1.078125, "loss/logits": 0.16176369786262512, "loss/reg": 5.6005988881224766e-05, "step": 603 }, { "epoch": 0.0755, "grad_norm": 2.468519449234009, "grad_norm_var": 1.972081997343082, "learning_rate": 0.0001, "loss": 1.32, "loss/crossentropy": 2.4244751930236816, "loss/hidden": 1.140625, "loss/logits": 0.17885854840278625, "loss/reg": 5.598864299827255e-05, "step": 604 }, { "epoch": 0.075625, "grad_norm": 3.1042160987854004, "grad_norm_var": 1.9720062093686368, "learning_rate": 0.0001, "loss": 1.1799, "loss/crossentropy": 2.981356143951416, "loss/hidden": 1.0, "loss/logits": 0.17934700846672058, "loss/reg": 5.5970504035940394e-05, "step": 605 }, { "epoch": 0.07575, "grad_norm": 2.198490858078003, "grad_norm_var": 2.0145906467974495, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.4534380435943604, "loss/hidden": 1.125, "loss/logits": 0.18310286104679108, "loss/reg": 5.595075344899669e-05, "step": 606 }, { "epoch": 0.075875, "grad_norm": 3.01007080078125, "grad_norm_var": 1.973238539473501, "learning_rate": 0.0001, "loss": 1.3278, "loss/crossentropy": 2.601656913757324, "loss/hidden": 1.1328125, "loss/logits": 0.19443120062351227, "loss/reg": 5.5936940043466166e-05, "step": 607 }, { "epoch": 0.076, "grad_norm": 2.328502655029297, "grad_norm_var": 1.8305709674660562, "learning_rate": 0.0001, "loss": 1.2808, "loss/crossentropy": 2.5165281295776367, "loss/hidden": 1.1015625, "loss/logits": 0.17871087789535522, "loss/reg": 5.592526576947421e-05, "step": 608 }, { "epoch": 0.076125, "grad_norm": 2.4415786266326904, "grad_norm_var": 1.844978362281832, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.3549885749816895, "loss/hidden": 1.234375, "loss/logits": 0.23492830991744995, "loss/reg": 5.591001536231488e-05, "step": 609 }, { "epoch": 0.07625, "grad_norm": 2.1607983112335205, "grad_norm_var": 1.8789441166626564, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.310319185256958, "loss/hidden": 1.3203125, "loss/logits": 0.20807045698165894, "loss/reg": 5.5894925026223063e-05, "step": 610 }, { "epoch": 0.076375, "grad_norm": 2.4163475036621094, "grad_norm_var": 0.22029539262357123, "learning_rate": 0.0001, "loss": 1.326, "loss/crossentropy": 2.5177974700927734, "loss/hidden": 1.1484375, "loss/logits": 0.1770188808441162, "loss/reg": 5.587563646258786e-05, "step": 611 }, { "epoch": 0.0765, "grad_norm": 5.376523494720459, "grad_norm_var": 0.7031570506973231, "learning_rate": 0.0001, "loss": 1.7307, "loss/crossentropy": 2.593522548675537, "loss/hidden": 1.46875, "loss/logits": 0.26142174005508423, "loss/reg": 5.5857744882814586e-05, "step": 612 }, { "epoch": 0.076625, "grad_norm": 2.5275163650512695, "grad_norm_var": 0.6841845799993801, "learning_rate": 0.0001, "loss": 1.322, "loss/crossentropy": 2.5519967079162598, "loss/hidden": 1.125, "loss/logits": 0.1964191198348999, "loss/reg": 5.583597521763295e-05, "step": 613 }, { "epoch": 0.07675, "grad_norm": 2.870879888534546, "grad_norm_var": 0.6619358019877306, "learning_rate": 0.0001, "loss": 1.4565, "loss/crossentropy": 2.4661271572113037, "loss/hidden": 1.2265625, "loss/logits": 0.22941797971725464, "loss/reg": 5.581411096500233e-05, "step": 614 }, { "epoch": 0.076875, "grad_norm": 2.530247926712036, "grad_norm_var": 0.6396295206762201, "learning_rate": 0.0001, "loss": 1.3657, "loss/crossentropy": 2.298511028289795, "loss/hidden": 1.1875, "loss/logits": 0.17764486372470856, "loss/reg": 5.579264689004049e-05, "step": 615 }, { "epoch": 0.077, "grad_norm": 2.1187071800231934, "grad_norm_var": 0.665063668545568, "learning_rate": 0.0001, "loss": 1.1568, "loss/crossentropy": 2.4262094497680664, "loss/hidden": 1.015625, "loss/logits": 0.14061546325683594, "loss/reg": 5.5783228162908927e-05, "step": 616 }, { "epoch": 0.077125, "grad_norm": 2.3431754112243652, "grad_norm_var": 0.6536114449405801, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.3483335971832275, "loss/hidden": 1.2109375, "loss/logits": 0.21630419790744781, "loss/reg": 5.57744933757931e-05, "step": 617 }, { "epoch": 0.07725, "grad_norm": 2.5985348224639893, "grad_norm_var": 0.6009238397412027, "learning_rate": 0.0001, "loss": 1.7645, "loss/crossentropy": 2.279369831085205, "loss/hidden": 1.546875, "loss/logits": 0.2171006053686142, "loss/reg": 5.575196701101959e-05, "step": 618 }, { "epoch": 0.077375, "grad_norm": 4.004838943481445, "grad_norm_var": 0.697655562382554, "learning_rate": 0.0001, "loss": 1.6666, "loss/crossentropy": 2.558466911315918, "loss/hidden": 1.3984375, "loss/logits": 0.26759639382362366, "loss/reg": 5.5740612879162654e-05, "step": 619 }, { "epoch": 0.0775, "grad_norm": 2.264273166656494, "grad_norm_var": 0.7087775967170871, "learning_rate": 0.0001, "loss": 1.1433, "loss/crossentropy": 2.5713441371917725, "loss/hidden": 0.9921875, "loss/logits": 0.1505858451128006, "loss/reg": 5.5717180657666177e-05, "step": 620 }, { "epoch": 0.077625, "grad_norm": 2.5551795959472656, "grad_norm_var": 0.7030356734291255, "learning_rate": 0.0001, "loss": 1.1309, "loss/crossentropy": 2.5923280715942383, "loss/hidden": 0.98046875, "loss/logits": 0.14986979961395264, "loss/reg": 5.57043167646043e-05, "step": 621 }, { "epoch": 0.07775, "grad_norm": 2.652561902999878, "grad_norm_var": 0.6834944271410867, "learning_rate": 0.0001, "loss": 1.3254, "loss/crossentropy": 2.8501980304718018, "loss/hidden": 1.140625, "loss/logits": 0.18422411382198334, "loss/reg": 5.5684457038296387e-05, "step": 622 }, { "epoch": 0.077875, "grad_norm": 2.6352500915527344, "grad_norm_var": 0.6799016428975733, "learning_rate": 0.0001, "loss": 1.1878, "loss/crossentropy": 2.7150673866271973, "loss/hidden": 1.03125, "loss/logits": 0.15596626698970795, "loss/reg": 5.5671138397883624e-05, "step": 623 }, { "epoch": 0.078, "grad_norm": 2.724209785461426, "grad_norm_var": 0.6680269008360408, "learning_rate": 0.0001, "loss": 1.5681, "loss/crossentropy": 2.3070638179779053, "loss/hidden": 1.34375, "loss/logits": 0.22377389669418335, "loss/reg": 5.56498380319681e-05, "step": 624 }, { "epoch": 0.078125, "grad_norm": 9.59151840209961, "grad_norm_var": 3.5559580820469003, "learning_rate": 0.0001, "loss": 1.568, "loss/crossentropy": 2.6425082683563232, "loss/hidden": 1.375, "loss/logits": 0.19247561693191528, "loss/reg": 5.562700243899599e-05, "step": 625 }, { "epoch": 0.07825, "grad_norm": 2.7146146297454834, "grad_norm_var": 3.4976035299386314, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.7501344680786133, "loss/hidden": 1.0703125, "loss/logits": 0.17199140787124634, "loss/reg": 5.560599674936384e-05, "step": 626 }, { "epoch": 0.078375, "grad_norm": 2.4837982654571533, "grad_norm_var": 3.4904329865295534, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 2.4862208366394043, "loss/hidden": 1.21875, "loss/logits": 0.17582716047763824, "loss/reg": 5.558757402468473e-05, "step": 627 }, { "epoch": 0.0785, "grad_norm": 2.1883833408355713, "grad_norm_var": 3.2215267842479833, "learning_rate": 0.0001, "loss": 1.0922, "loss/crossentropy": 2.6592166423797607, "loss/hidden": 0.953125, "loss/logits": 0.1385241448879242, "loss/reg": 5.5569151300005615e-05, "step": 628 }, { "epoch": 0.078625, "grad_norm": 2.8857290744781494, "grad_norm_var": 3.2045808378220917, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.581509828567505, "loss/hidden": 1.1796875, "loss/logits": 0.19029666483402252, "loss/reg": 5.555087045649998e-05, "step": 629 }, { "epoch": 0.07875, "grad_norm": 1.9136121273040771, "grad_norm_var": 3.287602536208832, "learning_rate": 0.0001, "loss": 1.1828, "loss/crossentropy": 2.4255590438842773, "loss/hidden": 1.03125, "loss/logits": 0.15101952850818634, "loss/reg": 5.5534914281452075e-05, "step": 630 }, { "epoch": 0.078875, "grad_norm": 2.1227009296417236, "grad_norm_var": 3.3242045708283325, "learning_rate": 0.0001, "loss": 1.3325, "loss/crossentropy": 2.5583293437957764, "loss/hidden": 1.140625, "loss/logits": 0.19136284291744232, "loss/reg": 5.551309368456714e-05, "step": 631 }, { "epoch": 0.079, "grad_norm": 2.6222591400146484, "grad_norm_var": 3.2817336007922213, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.2433364391326904, "loss/hidden": 1.3515625, "loss/logits": 0.23112601041793823, "loss/reg": 5.549259003601037e-05, "step": 632 }, { "epoch": 0.079125, "grad_norm": 2.6109414100646973, "grad_norm_var": 3.2620938839767395, "learning_rate": 0.0001, "loss": 1.2217, "loss/crossentropy": 2.715541124343872, "loss/hidden": 1.046875, "loss/logits": 0.17423760890960693, "loss/reg": 5.547174077946693e-05, "step": 633 }, { "epoch": 0.07925, "grad_norm": 2.377688407897949, "grad_norm_var": 3.2780099106994003, "learning_rate": 0.0001, "loss": 1.3809, "loss/crossentropy": 2.317793130874634, "loss/hidden": 1.1875, "loss/logits": 0.19284963607788086, "loss/reg": 5.545308886212297e-05, "step": 634 }, { "epoch": 0.079375, "grad_norm": 2.2899274826049805, "grad_norm_var": 3.2370231277032433, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.6336915493011475, "loss/hidden": 1.09375, "loss/logits": 0.17255498468875885, "loss/reg": 5.543839870369993e-05, "step": 635 }, { "epoch": 0.0795, "grad_norm": 2.190656900405884, "grad_norm_var": 3.2437445376370118, "learning_rate": 0.0001, "loss": 1.2508, "loss/crossentropy": 2.450021505355835, "loss/hidden": 1.09375, "loss/logits": 0.15647029876708984, "loss/reg": 5.542321741813794e-05, "step": 636 }, { "epoch": 0.079625, "grad_norm": 1.986946940422058, "grad_norm_var": 3.2908032121113298, "learning_rate": 0.0001, "loss": 1.2266, "loss/crossentropy": 2.7081422805786133, "loss/hidden": 1.0546875, "loss/logits": 0.17138829827308655, "loss/reg": 5.5409673223039135e-05, "step": 637 }, { "epoch": 0.07975, "grad_norm": 2.3423492908477783, "grad_norm_var": 3.3059943314168536, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.384462833404541, "loss/hidden": 1.0546875, "loss/logits": 0.17684702575206757, "loss/reg": 5.5391312343999743e-05, "step": 638 }, { "epoch": 0.079875, "grad_norm": 2.3519933223724365, "grad_norm_var": 3.31930978488029, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 2.6151647567749023, "loss/hidden": 1.1640625, "loss/logits": 0.2097240835428238, "loss/reg": 5.537479228223674e-05, "step": 639 }, { "epoch": 0.08, "grad_norm": 2.447504997253418, "grad_norm_var": 3.3282686991442922, "learning_rate": 0.0001, "loss": 1.4717, "loss/crossentropy": 2.5143349170684814, "loss/hidden": 1.2109375, "loss/logits": 0.2602579593658447, "loss/reg": 5.535819582291879e-05, "step": 640 }, { "epoch": 0.080125, "grad_norm": 2.7342166900634766, "grad_norm_var": 0.07597010378907397, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.4136970043182373, "loss/hidden": 1.3046875, "loss/logits": 0.24319079518318176, "loss/reg": 5.5341512052109465e-05, "step": 641 }, { "epoch": 0.08025, "grad_norm": 7.95797061920166, "grad_norm_var": 2.020192568164274, "learning_rate": 0.0001, "loss": 1.4043, "loss/crossentropy": 2.4566597938537598, "loss/hidden": 1.2265625, "loss/logits": 0.1772194355726242, "loss/reg": 5.532177601708099e-05, "step": 642 }, { "epoch": 0.080375, "grad_norm": 2.706076145172119, "grad_norm_var": 2.0163048861974153, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.313260078430176, "loss/hidden": 1.3203125, "loss/logits": 0.23800881206989288, "loss/reg": 5.5301832617260516e-05, "step": 643 }, { "epoch": 0.0805, "grad_norm": 2.8865914344787598, "grad_norm_var": 1.9960669600053733, "learning_rate": 0.0001, "loss": 1.4925, "loss/crossentropy": 2.48056697845459, "loss/hidden": 1.28125, "loss/logits": 0.2106875777244568, "loss/reg": 5.528131077880971e-05, "step": 644 }, { "epoch": 0.080625, "grad_norm": 2.7675015926361084, "grad_norm_var": 1.995221836304215, "learning_rate": 0.0001, "loss": 1.4228, "loss/crossentropy": 2.414571523666382, "loss/hidden": 1.21875, "loss/logits": 0.20353971421718597, "loss/reg": 5.5263531976379454e-05, "step": 645 }, { "epoch": 0.08075, "grad_norm": 2.2879977226257324, "grad_norm_var": 1.9612673982614381, "learning_rate": 0.0001, "loss": 1.4466, "loss/crossentropy": 2.5126829147338867, "loss/hidden": 1.234375, "loss/logits": 0.21164974570274353, "loss/reg": 5.523953586816788e-05, "step": 646 }, { "epoch": 0.080875, "grad_norm": 2.0180585384368896, "grad_norm_var": 1.9712999230632604, "learning_rate": 0.0001, "loss": 1.2852, "loss/crossentropy": 2.544811248779297, "loss/hidden": 1.1015625, "loss/logits": 0.18305200338363647, "loss/reg": 5.5218009947566316e-05, "step": 647 }, { "epoch": 0.081, "grad_norm": 2.460339069366455, "grad_norm_var": 1.9764772295131516, "learning_rate": 0.0001, "loss": 1.4663, "loss/crossentropy": 2.384995937347412, "loss/hidden": 1.2578125, "loss/logits": 0.20790857076644897, "loss/reg": 5.520256672753021e-05, "step": 648 }, { "epoch": 0.081125, "grad_norm": 2.6601645946502686, "grad_norm_var": 1.975545055561849, "learning_rate": 0.0001, "loss": 1.2009, "loss/crossentropy": 2.5382332801818848, "loss/hidden": 1.0625, "loss/logits": 0.13785940408706665, "loss/reg": 5.518757097888738e-05, "step": 649 }, { "epoch": 0.08125, "grad_norm": 2.0407180786132812, "grad_norm_var": 2.0006781186858693, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.314056158065796, "loss/hidden": 1.171875, "loss/logits": 0.1643456667661667, "loss/reg": 5.5170850828289986e-05, "step": 650 }, { "epoch": 0.081375, "grad_norm": 2.672747850418091, "grad_norm_var": 1.9859426578962365, "learning_rate": 0.0001, "loss": 1.4244, "loss/crossentropy": 2.495609760284424, "loss/hidden": 1.21875, "loss/logits": 0.2050689160823822, "loss/reg": 5.5157281167339534e-05, "step": 651 }, { "epoch": 0.0815, "grad_norm": 2.3654656410217285, "grad_norm_var": 1.9740698553443072, "learning_rate": 0.0001, "loss": 1.5368, "loss/crossentropy": 2.4721033573150635, "loss/hidden": 1.328125, "loss/logits": 0.20810337364673615, "loss/reg": 5.5145894293673337e-05, "step": 652 }, { "epoch": 0.081625, "grad_norm": 2.0404245853424072, "grad_norm_var": 1.9685017588802374, "learning_rate": 0.0001, "loss": 1.2802, "loss/crossentropy": 2.340548515319824, "loss/hidden": 1.1015625, "loss/logits": 0.17806307971477509, "loss/reg": 5.5128544772742316e-05, "step": 653 }, { "epoch": 0.08175, "grad_norm": 2.6265711784362793, "grad_norm_var": 1.9563492188252354, "learning_rate": 0.0001, "loss": 1.1856, "loss/crossentropy": 2.6735007762908936, "loss/hidden": 1.0, "loss/logits": 0.18507899343967438, "loss/reg": 5.5116473959060386e-05, "step": 654 }, { "epoch": 0.081875, "grad_norm": 3.466085433959961, "grad_norm_var": 1.9652920541676069, "learning_rate": 0.0001, "loss": 1.5681, "loss/crossentropy": 2.62325119972229, "loss/hidden": 1.3515625, "loss/logits": 0.21598073840141296, "loss/reg": 5.510494884219952e-05, "step": 655 }, { "epoch": 0.082, "grad_norm": 1.9770357608795166, "grad_norm_var": 2.0064850603907494, "learning_rate": 0.0001, "loss": 1.2097, "loss/crossentropy": 2.2057158946990967, "loss/hidden": 1.0625, "loss/logits": 0.14666706323623657, "loss/reg": 5.509403126779944e-05, "step": 656 }, { "epoch": 0.082125, "grad_norm": 2.3105995655059814, "grad_norm_var": 2.024480408785202, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.493821144104004, "loss/hidden": 1.1015625, "loss/logits": 0.1881340891122818, "loss/reg": 5.5078246077755466e-05, "step": 657 }, { "epoch": 0.08225, "grad_norm": 2.718162775039673, "grad_norm_var": 0.15628703716138168, "learning_rate": 0.0001, "loss": 1.3117, "loss/crossentropy": 2.4584882259368896, "loss/hidden": 1.1171875, "loss/logits": 0.19399182498455048, "loss/reg": 5.506265733856708e-05, "step": 658 }, { "epoch": 0.082375, "grad_norm": 3.373929977416992, "grad_norm_var": 0.2024890656434612, "learning_rate": 0.0001, "loss": 1.657, "loss/crossentropy": 2.3616418838500977, "loss/hidden": 1.375, "loss/logits": 0.2814520597457886, "loss/reg": 5.505214721779339e-05, "step": 659 }, { "epoch": 0.0825, "grad_norm": 3.0150086879730225, "grad_norm_var": 0.20941952923840457, "learning_rate": 0.0001, "loss": 1.4792, "loss/crossentropy": 2.703028440475464, "loss/hidden": 1.2578125, "loss/logits": 0.2208247035741806, "loss/reg": 5.504006549017504e-05, "step": 660 }, { "epoch": 0.082625, "grad_norm": 3.6257362365722656, "grad_norm_var": 0.2803381345532055, "learning_rate": 0.0001, "loss": 1.3149, "loss/crossentropy": 2.6234936714172363, "loss/hidden": 1.1328125, "loss/logits": 0.18155357241630554, "loss/reg": 5.502764179254882e-05, "step": 661 }, { "epoch": 0.08275, "grad_norm": 8.147649765014648, "grad_norm_var": 2.179661731968463, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.5337109565734863, "loss/hidden": 1.296875, "loss/logits": 0.17923393845558167, "loss/reg": 5.501080886460841e-05, "step": 662 }, { "epoch": 0.082875, "grad_norm": 2.901944398880005, "grad_norm_var": 2.1163120327356095, "learning_rate": 0.0001, "loss": 1.2957, "loss/crossentropy": 2.814661979675293, "loss/hidden": 1.09375, "loss/logits": 0.20143108069896698, "loss/reg": 5.4996402468532324e-05, "step": 663 }, { "epoch": 0.083, "grad_norm": 2.2565462589263916, "grad_norm_var": 2.1342553181424, "learning_rate": 0.0001, "loss": 1.1747, "loss/crossentropy": 2.4640450477600098, "loss/hidden": 1.015625, "loss/logits": 0.158490851521492, "loss/reg": 5.4978750995360315e-05, "step": 664 }, { "epoch": 0.083125, "grad_norm": 2.5376529693603516, "grad_norm_var": 2.140947510021911, "learning_rate": 0.0001, "loss": 1.2111, "loss/crossentropy": 2.433427095413208, "loss/hidden": 1.0546875, "loss/logits": 0.1558808833360672, "loss/reg": 5.4963678849162534e-05, "step": 665 }, { "epoch": 0.08325, "grad_norm": 2.327512741088867, "grad_norm_var": 2.1092236468841206, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.400848627090454, "loss/hidden": 1.171875, "loss/logits": 0.19855274260044098, "loss/reg": 5.494604556588456e-05, "step": 666 }, { "epoch": 0.083375, "grad_norm": 2.6282968521118164, "grad_norm_var": 2.1114211896703217, "learning_rate": 0.0001, "loss": 1.3032, "loss/crossentropy": 2.4893503189086914, "loss/hidden": 1.125, "loss/logits": 0.17762264609336853, "loss/reg": 5.492940545082092e-05, "step": 667 }, { "epoch": 0.0835, "grad_norm": 15.134915351867676, "grad_norm_var": 11.188339796523456, "learning_rate": 0.0001, "loss": 2.056, "loss/crossentropy": 1.2935031652450562, "loss/hidden": 1.9921875, "loss/logits": 0.0632929727435112, "loss/reg": 5.491078627528623e-05, "step": 668 }, { "epoch": 0.083625, "grad_norm": 2.7197494506835938, "grad_norm_var": 11.056175204029886, "learning_rate": 0.0001, "loss": 1.2911, "loss/crossentropy": 2.4398157596588135, "loss/hidden": 1.1171875, "loss/logits": 0.17337752878665924, "loss/reg": 5.489288014359772e-05, "step": 669 }, { "epoch": 0.08375, "grad_norm": 4.744369029998779, "grad_norm_var": 10.98807433162251, "learning_rate": 0.0001, "loss": 1.3603, "loss/crossentropy": 2.813530683517456, "loss/hidden": 1.1875, "loss/logits": 0.1722554862499237, "loss/reg": 5.487642920343205e-05, "step": 670 }, { "epoch": 0.083875, "grad_norm": 2.2052688598632812, "grad_norm_var": 11.1759775305449, "learning_rate": 0.0001, "loss": 1.2101, "loss/crossentropy": 2.712545394897461, "loss/hidden": 1.046875, "loss/logits": 0.1627039760351181, "loss/reg": 5.4856664064573124e-05, "step": 671 }, { "epoch": 0.084, "grad_norm": 2.127044677734375, "grad_norm_var": 11.138641886695007, "learning_rate": 0.0001, "loss": 1.1681, "loss/crossentropy": 2.7978134155273438, "loss/hidden": 1.0, "loss/logits": 0.16759660840034485, "loss/reg": 5.4841766541358083e-05, "step": 672 }, { "epoch": 0.084125, "grad_norm": 2.57954478263855, "grad_norm_var": 11.0853286400312, "learning_rate": 0.0001, "loss": 1.2535, "loss/crossentropy": 2.8341479301452637, "loss/hidden": 1.0859375, "loss/logits": 0.16698572039604187, "loss/reg": 5.482636333908886e-05, "step": 673 }, { "epoch": 0.08425, "grad_norm": 14.73951244354248, "grad_norm_var": 18.158630087103635, "learning_rate": 0.0001, "loss": 1.745, "loss/crossentropy": 2.6920087337493896, "loss/hidden": 1.5625, "loss/logits": 0.18198290467262268, "loss/reg": 5.481092011905275e-05, "step": 674 }, { "epoch": 0.084375, "grad_norm": 2.964245319366455, "grad_norm_var": 18.241094275290784, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.4664485454559326, "loss/hidden": 1.203125, "loss/logits": 0.1731320172548294, "loss/reg": 5.479659739648923e-05, "step": 675 }, { "epoch": 0.0845, "grad_norm": 2.0473501682281494, "grad_norm_var": 18.512621656660862, "learning_rate": 0.0001, "loss": 1.2978, "loss/crossentropy": 2.4481780529022217, "loss/hidden": 1.1171875, "loss/logits": 0.18001943826675415, "loss/reg": 5.47790368727874e-05, "step": 676 }, { "epoch": 0.084625, "grad_norm": 2.1904749870300293, "grad_norm_var": 18.82885777793808, "learning_rate": 0.0001, "loss": 1.3854, "loss/crossentropy": 2.406938076019287, "loss/hidden": 1.1484375, "loss/logits": 0.23644110560417175, "loss/reg": 5.476039950735867e-05, "step": 677 }, { "epoch": 0.08475, "grad_norm": 2.6025397777557373, "grad_norm_var": 18.06538886174356, "learning_rate": 0.0001, "loss": 1.1031, "loss/crossentropy": 2.539059638977051, "loss/hidden": 0.96484375, "loss/logits": 0.13775205612182617, "loss/reg": 5.474198769661598e-05, "step": 678 }, { "epoch": 0.084875, "grad_norm": 3.0288338661193848, "grad_norm_var": 18.044955230468542, "learning_rate": 0.0001, "loss": 1.44, "loss/crossentropy": 2.4128031730651855, "loss/hidden": 1.25, "loss/logits": 0.18944835662841797, "loss/reg": 5.472711563925259e-05, "step": 679 }, { "epoch": 0.085, "grad_norm": 6.3264336585998535, "grad_norm_var": 18.038003798487622, "learning_rate": 0.0001, "loss": 1.6715, "loss/crossentropy": 2.146503210067749, "loss/hidden": 1.4140625, "loss/logits": 0.2568877935409546, "loss/reg": 5.471197800943628e-05, "step": 680 }, { "epoch": 0.085125, "grad_norm": 2.3084750175476074, "grad_norm_var": 18.099156367515864, "learning_rate": 0.0001, "loss": 1.2205, "loss/crossentropy": 2.5457520484924316, "loss/hidden": 1.046875, "loss/logits": 0.173065185546875, "loss/reg": 5.469706593430601e-05, "step": 681 }, { "epoch": 0.08525, "grad_norm": 2.4326412677764893, "grad_norm_var": 18.07055624015896, "learning_rate": 0.0001, "loss": 1.3589, "loss/crossentropy": 2.703038215637207, "loss/hidden": 1.15625, "loss/logits": 0.20215150713920593, "loss/reg": 5.468199015012942e-05, "step": 682 }, { "epoch": 0.085375, "grad_norm": 2.473970890045166, "grad_norm_var": 18.10898905123375, "learning_rate": 0.0001, "loss": 1.1869, "loss/crossentropy": 2.4774723052978516, "loss/hidden": 1.0078125, "loss/logits": 0.17857202887535095, "loss/reg": 5.466764559969306e-05, "step": 683 }, { "epoch": 0.0855, "grad_norm": 3.152622699737549, "grad_norm_var": 9.95443167979618, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.385828971862793, "loss/hidden": 1.1328125, "loss/logits": 0.17848467826843262, "loss/reg": 5.465377034852281e-05, "step": 684 }, { "epoch": 0.085625, "grad_norm": 3.110018014907837, "grad_norm_var": 9.914754143381142, "learning_rate": 0.0001, "loss": 1.3367, "loss/crossentropy": 2.453490972518921, "loss/hidden": 1.15625, "loss/logits": 0.17990395426750183, "loss/reg": 5.463728302856907e-05, "step": 685 }, { "epoch": 0.08575, "grad_norm": 2.377671003341675, "grad_norm_var": 9.93198520749056, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.7564713954925537, "loss/hidden": 1.1171875, "loss/logits": 0.1909516155719757, "loss/reg": 5.4623284086119384e-05, "step": 686 }, { "epoch": 0.085875, "grad_norm": 2.422139883041382, "grad_norm_var": 9.896281345994009, "learning_rate": 0.0001, "loss": 1.3778, "loss/crossentropy": 2.1622204780578613, "loss/hidden": 1.1796875, "loss/logits": 0.19756248593330383, "loss/reg": 5.46108276466839e-05, "step": 687 }, { "epoch": 0.086, "grad_norm": 2.239145517349243, "grad_norm_var": 9.875720139459439, "learning_rate": 0.0001, "loss": 1.2286, "loss/crossentropy": 2.7086918354034424, "loss/hidden": 1.0546875, "loss/logits": 0.17338880896568298, "loss/reg": 5.45964103366714e-05, "step": 688 }, { "epoch": 0.086125, "grad_norm": 2.24678111076355, "grad_norm_var": 9.92624095879092, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.4056406021118164, "loss/hidden": 1.1875, "loss/logits": 0.23862136900424957, "loss/reg": 5.457905717776157e-05, "step": 689 }, { "epoch": 0.08625, "grad_norm": 3.519033432006836, "grad_norm_var": 1.0418889707029915, "learning_rate": 0.0001, "loss": 1.5521, "loss/crossentropy": 2.2903122901916504, "loss/hidden": 1.3515625, "loss/logits": 0.19997593760490417, "loss/reg": 5.456155486172065e-05, "step": 690 }, { "epoch": 0.086375, "grad_norm": 2.253674268722534, "grad_norm_var": 1.061688644486465, "learning_rate": 0.0001, "loss": 1.3563, "loss/crossentropy": 2.185561418533325, "loss/hidden": 1.15625, "loss/logits": 0.1994805932044983, "loss/reg": 5.454723577713594e-05, "step": 691 }, { "epoch": 0.0865, "grad_norm": 2.2427175045013428, "grad_norm_var": 1.0445794349169109, "learning_rate": 0.0001, "loss": 1.2501, "loss/crossentropy": 2.4699208736419678, "loss/hidden": 1.0859375, "loss/logits": 0.16366711258888245, "loss/reg": 5.452951154438779e-05, "step": 692 }, { "epoch": 0.086625, "grad_norm": 2.7868127822875977, "grad_norm_var": 1.0177092507570797, "learning_rate": 0.0001, "loss": 1.3042, "loss/crossentropy": 2.8428211212158203, "loss/hidden": 1.125, "loss/logits": 0.17867949604988098, "loss/reg": 5.450921526062302e-05, "step": 693 }, { "epoch": 0.08675, "grad_norm": 2.493699789047241, "grad_norm_var": 1.0219714012832202, "learning_rate": 0.0001, "loss": 1.4317, "loss/crossentropy": 2.2749669551849365, "loss/hidden": 1.2265625, "loss/logits": 0.20456844568252563, "loss/reg": 5.449183299788274e-05, "step": 694 }, { "epoch": 0.086875, "grad_norm": 2.647190570831299, "grad_norm_var": 1.0213851131010148, "learning_rate": 0.0001, "loss": 1.2026, "loss/crossentropy": 2.4777019023895264, "loss/hidden": 1.0390625, "loss/logits": 0.16303668916225433, "loss/reg": 5.447701914818026e-05, "step": 695 }, { "epoch": 0.087, "grad_norm": 2.7858493328094482, "grad_norm_var": 0.14699271023528884, "learning_rate": 0.0001, "loss": 1.6292, "loss/crossentropy": 2.0768496990203857, "loss/hidden": 1.3671875, "loss/logits": 0.26145654916763306, "loss/reg": 5.4463806009152904e-05, "step": 696 }, { "epoch": 0.087125, "grad_norm": 2.994112968444824, "grad_norm_var": 0.15033771969499318, "learning_rate": 0.0001, "loss": 1.3258, "loss/crossentropy": 2.7410616874694824, "loss/hidden": 1.125, "loss/logits": 0.20025156438350677, "loss/reg": 5.4450483730761334e-05, "step": 697 }, { "epoch": 0.08725, "grad_norm": 5.041564464569092, "grad_norm_var": 0.5049578494311615, "learning_rate": 0.0001, "loss": 1.3599, "loss/crossentropy": 2.6277389526367188, "loss/hidden": 1.171875, "loss/logits": 0.1875256896018982, "loss/reg": 5.443237751023844e-05, "step": 698 }, { "epoch": 0.087375, "grad_norm": 2.892210006713867, "grad_norm_var": 0.49775480774286673, "learning_rate": 0.0001, "loss": 1.5111, "loss/crossentropy": 2.3542768955230713, "loss/hidden": 1.28125, "loss/logits": 0.22926336526870728, "loss/reg": 5.441823668661527e-05, "step": 699 }, { "epoch": 0.0875, "grad_norm": 2.094099283218384, "grad_norm_var": 0.52159104183775, "learning_rate": 0.0001, "loss": 1.2078, "loss/crossentropy": 2.344701051712036, "loss/hidden": 1.0390625, "loss/logits": 0.16822829842567444, "loss/reg": 5.440011591417715e-05, "step": 700 }, { "epoch": 0.087625, "grad_norm": 3.5102314949035645, "grad_norm_var": 0.5503235995769403, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 1.9948920011520386, "loss/hidden": 1.3203125, "loss/logits": 0.21885889768600464, "loss/reg": 5.4386586270993575e-05, "step": 701 }, { "epoch": 0.08775, "grad_norm": 2.873744249343872, "grad_norm_var": 0.5388161553597188, "learning_rate": 0.0001, "loss": 1.3158, "loss/crossentropy": 2.527198076248169, "loss/hidden": 1.140625, "loss/logits": 0.17459964752197266, "loss/reg": 5.4375817853724584e-05, "step": 702 }, { "epoch": 0.087875, "grad_norm": 2.0903828144073486, "grad_norm_var": 0.5630812725031336, "learning_rate": 0.0001, "loss": 1.3679, "loss/crossentropy": 2.401954412460327, "loss/hidden": 1.1875, "loss/logits": 0.17989476025104523, "loss/reg": 5.4366686526918784e-05, "step": 703 }, { "epoch": 0.088, "grad_norm": 2.791881799697876, "grad_norm_var": 0.5412509567410152, "learning_rate": 0.0001, "loss": 1.4039, "loss/crossentropy": 2.5304641723632812, "loss/hidden": 1.1875, "loss/logits": 0.21581397950649261, "loss/reg": 5.434821287053637e-05, "step": 704 }, { "epoch": 0.088125, "grad_norm": 2.373342514038086, "grad_norm_var": 0.532427224823193, "learning_rate": 0.0001, "loss": 1.1941, "loss/crossentropy": 2.376497268676758, "loss/hidden": 1.03125, "loss/logits": 0.16228163242340088, "loss/reg": 5.4336887842509896e-05, "step": 705 }, { "epoch": 0.08825, "grad_norm": 3.0839779376983643, "grad_norm_var": 0.504688552634502, "learning_rate": 0.0001, "loss": 1.3383, "loss/crossentropy": 2.65484881401062, "loss/hidden": 1.140625, "loss/logits": 0.1971454918384552, "loss/reg": 5.43265778105706e-05, "step": 706 }, { "epoch": 0.088375, "grad_norm": 2.2295589447021484, "grad_norm_var": 0.5065127901642389, "learning_rate": 0.0001, "loss": 1.3799, "loss/crossentropy": 2.480161666870117, "loss/hidden": 1.171875, "loss/logits": 0.20748105645179749, "loss/reg": 5.4318323236657307e-05, "step": 707 }, { "epoch": 0.0885, "grad_norm": 3.5498087406158447, "grad_norm_var": 0.5147397082983834, "learning_rate": 0.0001, "loss": 1.5722, "loss/crossentropy": 1.904441475868225, "loss/hidden": 1.375, "loss/logits": 0.19664257764816284, "loss/reg": 5.4309970437316224e-05, "step": 708 }, { "epoch": 0.088625, "grad_norm": 4.781530857086182, "grad_norm_var": 0.7360025205661221, "learning_rate": 0.0001, "loss": 1.5755, "loss/crossentropy": 2.7621335983276367, "loss/hidden": 1.3046875, "loss/logits": 0.27026501297950745, "loss/reg": 5.429290467873216e-05, "step": 709 }, { "epoch": 0.08875, "grad_norm": 2.2828309535980225, "grad_norm_var": 0.7534264462205973, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 2.4517247676849365, "loss/hidden": 1.09375, "loss/logits": 0.1738508641719818, "loss/reg": 5.427583528216928e-05, "step": 710 }, { "epoch": 0.088875, "grad_norm": 2.650787115097046, "grad_norm_var": 0.7532573998370441, "learning_rate": 0.0001, "loss": 1.3073, "loss/crossentropy": 2.8212263584136963, "loss/hidden": 1.1171875, "loss/logits": 0.18959403038024902, "loss/reg": 5.4265576181933284e-05, "step": 711 }, { "epoch": 0.089, "grad_norm": 2.8599400520324707, "grad_norm_var": 0.7514689463424626, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.485013246536255, "loss/hidden": 1.2421875, "loss/logits": 0.24233944714069366, "loss/reg": 5.425211202236824e-05, "step": 712 }, { "epoch": 0.089125, "grad_norm": 3.3207690715789795, "grad_norm_var": 0.7576093299695834, "learning_rate": 0.0001, "loss": 1.4396, "loss/crossentropy": 2.5560240745544434, "loss/hidden": 1.2421875, "loss/logits": 0.19684939086437225, "loss/reg": 5.4234227718552575e-05, "step": 713 }, { "epoch": 0.08925, "grad_norm": 2.121156692504883, "grad_norm_var": 0.5060815970166267, "learning_rate": 0.0001, "loss": 1.2187, "loss/crossentropy": 2.7032556533813477, "loss/hidden": 1.046875, "loss/logits": 0.1712983250617981, "loss/reg": 5.4215375712374225e-05, "step": 714 }, { "epoch": 0.089375, "grad_norm": 2.606315851211548, "grad_norm_var": 0.509357702424964, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.3998241424560547, "loss/hidden": 1.1796875, "loss/logits": 0.21911802887916565, "loss/reg": 5.4198477300815284e-05, "step": 715 }, { "epoch": 0.0895, "grad_norm": 2.922720193862915, "grad_norm_var": 0.4713784636638735, "learning_rate": 0.0001, "loss": 1.4542, "loss/crossentropy": 2.6829943656921387, "loss/hidden": 1.234375, "loss/logits": 0.21930286288261414, "loss/reg": 5.4178999562282115e-05, "step": 716 }, { "epoch": 0.089625, "grad_norm": 2.1098432540893555, "grad_norm_var": 0.47590856989839947, "learning_rate": 0.0001, "loss": 1.233, "loss/crossentropy": 2.6072769165039062, "loss/hidden": 1.0625, "loss/logits": 0.16997693479061127, "loss/reg": 5.41587796760723e-05, "step": 717 }, { "epoch": 0.08975, "grad_norm": 3.364995241165161, "grad_norm_var": 0.4964416307179249, "learning_rate": 0.0001, "loss": 1.3734, "loss/crossentropy": 2.735403060913086, "loss/hidden": 1.1640625, "loss/logits": 0.20879969000816345, "loss/reg": 5.4137595725478604e-05, "step": 718 }, { "epoch": 0.089875, "grad_norm": 2.06780743598938, "grad_norm_var": 0.49867340108694785, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.5375735759735107, "loss/hidden": 1.15625, "loss/logits": 0.21360768377780914, "loss/reg": 5.411420715972781e-05, "step": 719 }, { "epoch": 0.09, "grad_norm": 2.5839052200317383, "grad_norm_var": 0.5021517785446536, "learning_rate": 0.0001, "loss": 1.201, "loss/crossentropy": 2.7017717361450195, "loss/hidden": 1.015625, "loss/logits": 0.18482069671154022, "loss/reg": 5.409633740782738e-05, "step": 720 }, { "epoch": 0.090125, "grad_norm": 2.7051029205322266, "grad_norm_var": 0.48985561320671017, "learning_rate": 0.0001, "loss": 1.3077, "loss/crossentropy": 2.4120161533355713, "loss/hidden": 1.1328125, "loss/logits": 0.17430010437965393, "loss/reg": 5.407804201240651e-05, "step": 721 }, { "epoch": 0.09025, "grad_norm": 3.305095911026001, "grad_norm_var": 0.5004710841579763, "learning_rate": 0.0001, "loss": 1.232, "loss/crossentropy": 2.5663864612579346, "loss/hidden": 1.046875, "loss/logits": 0.18463225662708282, "loss/reg": 5.405680713010952e-05, "step": 722 }, { "epoch": 0.090375, "grad_norm": 2.5852112770080566, "grad_norm_var": 0.47936361363680907, "learning_rate": 0.0001, "loss": 1.2172, "loss/crossentropy": 2.4206771850585938, "loss/hidden": 1.046875, "loss/logits": 0.16979727149009705, "loss/reg": 5.403965406003408e-05, "step": 723 }, { "epoch": 0.0905, "grad_norm": 5.01464319229126, "grad_norm_var": 0.7474939605767976, "learning_rate": 0.0001, "loss": 1.8636, "loss/crossentropy": 2.4939754009246826, "loss/hidden": 1.53125, "loss/logits": 0.33179470896720886, "loss/reg": 5.4016720241634175e-05, "step": 724 }, { "epoch": 0.090625, "grad_norm": 2.617537021636963, "grad_norm_var": 0.5132076404113444, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.3498785495758057, "loss/hidden": 1.1953125, "loss/logits": 0.20461852848529816, "loss/reg": 5.3998457588022575e-05, "step": 725 }, { "epoch": 0.09075, "grad_norm": 2.8471455574035645, "grad_norm_var": 0.49269947606489545, "learning_rate": 0.0001, "loss": 1.4197, "loss/crossentropy": 2.580679178237915, "loss/hidden": 1.2109375, "loss/logits": 0.20819300413131714, "loss/reg": 5.397558561526239e-05, "step": 726 }, { "epoch": 0.090875, "grad_norm": 21.460710525512695, "grad_norm_var": 22.0933953279752, "learning_rate": 0.0001, "loss": 1.8327, "loss/crossentropy": 2.223879814147949, "loss/hidden": 1.546875, "loss/logits": 0.28528282046318054, "loss/reg": 5.395858170231804e-05, "step": 727 }, { "epoch": 0.091, "grad_norm": 2.9006693363189697, "grad_norm_var": 22.08714053553672, "learning_rate": 0.0001, "loss": 1.5806, "loss/crossentropy": 2.5028350353240967, "loss/hidden": 1.3203125, "loss/logits": 0.25977060198783875, "loss/reg": 5.394085019361228e-05, "step": 728 }, { "epoch": 0.091125, "grad_norm": 3.1578407287597656, "grad_norm_var": 22.10427962795151, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.9056754112243652, "loss/hidden": 1.1796875, "loss/logits": 0.2028241753578186, "loss/reg": 5.392428647610359e-05, "step": 729 }, { "epoch": 0.09125, "grad_norm": 2.310551643371582, "grad_norm_var": 22.0584906663241, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 2.683490753173828, "loss/hidden": 1.1015625, "loss/logits": 0.17539778351783752, "loss/reg": 5.390584919950925e-05, "step": 730 }, { "epoch": 0.091375, "grad_norm": 3.0188450813293457, "grad_norm_var": 21.990543415264224, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.2403600215911865, "loss/hidden": 1.296875, "loss/logits": 0.19139915704727173, "loss/reg": 5.389019497670233e-05, "step": 731 }, { "epoch": 0.0915, "grad_norm": 2.768749713897705, "grad_norm_var": 22.015388964459888, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 2.5775599479675293, "loss/hidden": 1.1953125, "loss/logits": 0.17840096354484558, "loss/reg": 5.38753520231694e-05, "step": 732 }, { "epoch": 0.091625, "grad_norm": 2.750666856765747, "grad_norm_var": 21.87518218062818, "learning_rate": 0.0001, "loss": 1.2523, "loss/crossentropy": 2.4601376056671143, "loss/hidden": 1.09375, "loss/logits": 0.15799552202224731, "loss/reg": 5.3856590966461226e-05, "step": 733 }, { "epoch": 0.09175, "grad_norm": 2.343596935272217, "grad_norm_var": 22.03928719159194, "learning_rate": 0.0001, "loss": 1.395, "loss/crossentropy": 2.3660390377044678, "loss/hidden": 1.21875, "loss/logits": 0.17570561170578003, "loss/reg": 5.383471216191538e-05, "step": 734 }, { "epoch": 0.091875, "grad_norm": 2.5348923206329346, "grad_norm_var": 21.930884482184023, "learning_rate": 0.0001, "loss": 1.2834, "loss/crossentropy": 2.6442513465881348, "loss/hidden": 1.1015625, "loss/logits": 0.18133598566055298, "loss/reg": 5.381777373258956e-05, "step": 735 }, { "epoch": 0.092, "grad_norm": 2.759902000427246, "grad_norm_var": 21.89826244514976, "learning_rate": 0.0001, "loss": 1.461, "loss/crossentropy": 2.3467421531677246, "loss/hidden": 1.2578125, "loss/logits": 0.20265132188796997, "loss/reg": 5.3800951718585566e-05, "step": 736 }, { "epoch": 0.092125, "grad_norm": 3.2894835472106934, "grad_norm_var": 21.813446124750236, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.2499518394470215, "loss/hidden": 1.203125, "loss/logits": 0.18771812319755554, "loss/reg": 5.378201967687346e-05, "step": 737 }, { "epoch": 0.09225, "grad_norm": 2.6464269161224365, "grad_norm_var": 21.910731669963607, "learning_rate": 0.0001, "loss": 1.5077, "loss/crossentropy": 2.420416831970215, "loss/hidden": 1.28125, "loss/logits": 0.2258935570716858, "loss/reg": 5.376638000598177e-05, "step": 738 }, { "epoch": 0.092375, "grad_norm": 2.278536796569824, "grad_norm_var": 21.97703354471374, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.5701427459716797, "loss/hidden": 1.0390625, "loss/logits": 0.1718250811100006, "loss/reg": 5.3752868552692235e-05, "step": 739 }, { "epoch": 0.0925, "grad_norm": 2.3287525177001953, "grad_norm_var": 22.080218462231635, "learning_rate": 0.0001, "loss": 1.2548, "loss/crossentropy": 2.382229804992676, "loss/hidden": 1.1015625, "loss/logits": 0.15274423360824585, "loss/reg": 5.3735657274955884e-05, "step": 740 }, { "epoch": 0.092625, "grad_norm": 2.697570562362671, "grad_norm_var": 22.067190693445564, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.500366687774658, "loss/hidden": 1.234375, "loss/logits": 0.18973666429519653, "loss/reg": 5.372005034587346e-05, "step": 741 }, { "epoch": 0.09275, "grad_norm": 2.773134469985962, "grad_norm_var": 22.077734248370113, "learning_rate": 0.0001, "loss": 1.3596, "loss/crossentropy": 2.606679916381836, "loss/hidden": 1.1640625, "loss/logits": 0.1950472891330719, "loss/reg": 5.370312646846287e-05, "step": 742 }, { "epoch": 0.092875, "grad_norm": 2.5400867462158203, "grad_norm_var": 0.09096660622796264, "learning_rate": 0.0001, "loss": 1.3041, "loss/crossentropy": 2.1913156509399414, "loss/hidden": 1.125, "loss/logits": 0.1785746067762375, "loss/reg": 5.3685631428379565e-05, "step": 743 }, { "epoch": 0.093, "grad_norm": 2.3412983417510986, "grad_norm_var": 0.09508860759598577, "learning_rate": 0.0001, "loss": 1.4625, "loss/crossentropy": 2.1983559131622314, "loss/hidden": 1.2421875, "loss/logits": 0.21976345777511597, "loss/reg": 5.366921686800197e-05, "step": 744 }, { "epoch": 0.093125, "grad_norm": 2.2313711643218994, "grad_norm_var": 0.08708549521983074, "learning_rate": 0.0001, "loss": 1.4097, "loss/crossentropy": 2.288938522338867, "loss/hidden": 1.203125, "loss/logits": 0.20603150129318237, "loss/reg": 5.365387551137246e-05, "step": 745 }, { "epoch": 0.09325, "grad_norm": 2.251176118850708, "grad_norm_var": 0.08960418307721056, "learning_rate": 0.0001, "loss": 1.2055, "loss/crossentropy": 2.539421796798706, "loss/hidden": 1.046875, "loss/logits": 0.15807852149009705, "loss/reg": 5.3636584198102355e-05, "step": 746 }, { "epoch": 0.093375, "grad_norm": 2.9313182830810547, "grad_norm_var": 0.0851617748558122, "learning_rate": 0.0001, "loss": 1.4474, "loss/crossentropy": 2.4747750759124756, "loss/hidden": 1.25, "loss/logits": 0.1968374252319336, "loss/reg": 5.362145748222247e-05, "step": 747 }, { "epoch": 0.0935, "grad_norm": 2.4297850131988525, "grad_norm_var": 0.0843403592222972, "learning_rate": 0.0001, "loss": 1.1399, "loss/crossentropy": 2.758875846862793, "loss/hidden": 0.99609375, "loss/logits": 0.14325925707817078, "loss/reg": 5.360745853977278e-05, "step": 748 }, { "epoch": 0.093625, "grad_norm": 4.064235687255859, "grad_norm_var": 0.223736692323439, "learning_rate": 0.0001, "loss": 1.3673, "loss/crossentropy": 2.235731840133667, "loss/hidden": 1.171875, "loss/logits": 0.19489188492298126, "loss/reg": 5.359313581720926e-05, "step": 749 }, { "epoch": 0.09375, "grad_norm": 2.2421138286590576, "grad_norm_var": 0.22856148654111977, "learning_rate": 0.0001, "loss": 1.2226, "loss/crossentropy": 2.5793533325195312, "loss/hidden": 1.0546875, "loss/logits": 0.16737329959869385, "loss/reg": 5.35770996066276e-05, "step": 750 }, { "epoch": 0.093875, "grad_norm": 2.058133602142334, "grad_norm_var": 0.2498467671842178, "learning_rate": 0.0001, "loss": 1.1218, "loss/crossentropy": 2.627385377883911, "loss/hidden": 0.97265625, "loss/logits": 0.14862656593322754, "loss/reg": 5.355522807803936e-05, "step": 751 }, { "epoch": 0.094, "grad_norm": 3.1811017990112305, "grad_norm_var": 0.26899066622994783, "learning_rate": 0.0001, "loss": 1.2385, "loss/crossentropy": 2.3661398887634277, "loss/hidden": 1.09375, "loss/logits": 0.14422640204429626, "loss/reg": 5.3536150517174974e-05, "step": 752 }, { "epoch": 0.094125, "grad_norm": 2.043060302734375, "grad_norm_var": 0.25861380812167456, "learning_rate": 0.0001, "loss": 1.3259, "loss/crossentropy": 2.3880302906036377, "loss/hidden": 1.140625, "loss/logits": 0.184707909822464, "loss/reg": 5.35179533471819e-05, "step": 753 }, { "epoch": 0.09425, "grad_norm": 16.624187469482422, "grad_norm_var": 12.621702210829204, "learning_rate": 0.0001, "loss": 2.518, "loss/crossentropy": 2.336055040359497, "loss/hidden": 2.09375, "loss/logits": 0.42366719245910645, "loss/reg": 5.34973805770278e-05, "step": 754 }, { "epoch": 0.094375, "grad_norm": 2.8955469131469727, "grad_norm_var": 12.550068888672113, "learning_rate": 0.0001, "loss": 1.663, "loss/crossentropy": 2.07586932182312, "loss/hidden": 1.453125, "loss/logits": 0.20937134325504303, "loss/reg": 5.347675323719159e-05, "step": 755 }, { "epoch": 0.0945, "grad_norm": 2.708143711090088, "grad_norm_var": 12.500977569673056, "learning_rate": 0.0001, "loss": 1.3643, "loss/crossentropy": 2.9162800312042236, "loss/hidden": 1.140625, "loss/logits": 0.22310392558574677, "loss/reg": 5.345011595636606e-05, "step": 756 }, { "epoch": 0.094625, "grad_norm": 2.578479051589966, "grad_norm_var": 12.51461783628701, "learning_rate": 0.0001, "loss": 1.3496, "loss/crossentropy": 2.7447614669799805, "loss/hidden": 1.15625, "loss/logits": 0.1928335428237915, "loss/reg": 5.3432562708621845e-05, "step": 757 }, { "epoch": 0.09475, "grad_norm": 2.6751816272735596, "grad_norm_var": 12.524623447598344, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.4300589561462402, "loss/hidden": 1.0859375, "loss/logits": 0.18121860921382904, "loss/reg": 5.34126374986954e-05, "step": 758 }, { "epoch": 0.094875, "grad_norm": 2.838064432144165, "grad_norm_var": 12.492543668855477, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.4878199100494385, "loss/hidden": 1.1484375, "loss/logits": 0.1833486557006836, "loss/reg": 5.339576819096692e-05, "step": 759 }, { "epoch": 0.095, "grad_norm": 1.874202013015747, "grad_norm_var": 12.578705995031372, "learning_rate": 0.0001, "loss": 1.2521, "loss/crossentropy": 2.6370134353637695, "loss/hidden": 1.078125, "loss/logits": 0.17343951761722565, "loss/reg": 5.3375784773379564e-05, "step": 760 }, { "epoch": 0.095125, "grad_norm": 2.471583843231201, "grad_norm_var": 12.54242874137393, "learning_rate": 0.0001, "loss": 1.3698, "loss/crossentropy": 2.6770365238189697, "loss/hidden": 1.1953125, "loss/logits": 0.1740024983882904, "loss/reg": 5.3358369768830016e-05, "step": 761 }, { "epoch": 0.09525, "grad_norm": 2.321331739425659, "grad_norm_var": 12.531132909698341, "learning_rate": 0.0001, "loss": 1.3088, "loss/crossentropy": 2.7517244815826416, "loss/hidden": 1.1328125, "loss/logits": 0.17546439170837402, "loss/reg": 5.334003799362108e-05, "step": 762 }, { "epoch": 0.095375, "grad_norm": 2.227292776107788, "grad_norm_var": 12.615120618713426, "learning_rate": 0.0001, "loss": 1.1134, "loss/crossentropy": 2.4684908390045166, "loss/hidden": 0.9609375, "loss/logits": 0.15190817415714264, "loss/reg": 5.331678767106496e-05, "step": 763 }, { "epoch": 0.0955, "grad_norm": 3.2276973724365234, "grad_norm_var": 12.546157446449454, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.317460775375366, "loss/hidden": 1.1328125, "loss/logits": 0.19315344095230103, "loss/reg": 5.328991392161697e-05, "step": 764 }, { "epoch": 0.095625, "grad_norm": 2.525618553161621, "grad_norm_var": 12.578753225816566, "learning_rate": 0.0001, "loss": 1.2785, "loss/crossentropy": 2.763885021209717, "loss/hidden": 1.109375, "loss/logits": 0.1685691773891449, "loss/reg": 5.3272808145266026e-05, "step": 765 }, { "epoch": 0.09575, "grad_norm": 2.8182990550994873, "grad_norm_var": 12.510107821183945, "learning_rate": 0.0001, "loss": 1.4041, "loss/crossentropy": 2.609581232070923, "loss/hidden": 1.2109375, "loss/logits": 0.1925983875989914, "loss/reg": 5.325373786035925e-05, "step": 766 }, { "epoch": 0.095875, "grad_norm": 3.1149539947509766, "grad_norm_var": 12.384948285453223, "learning_rate": 0.0001, "loss": 1.4725, "loss/crossentropy": 2.8335139751434326, "loss/hidden": 1.2578125, "loss/logits": 0.21420395374298096, "loss/reg": 5.3225699957692996e-05, "step": 767 }, { "epoch": 0.096, "grad_norm": 2.8965165615081787, "grad_norm_var": 12.402406416217548, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.5122146606445312, "loss/hidden": 1.265625, "loss/logits": 0.2197187840938568, "loss/reg": 5.320890340954065e-05, "step": 768 }, { "epoch": 0.096125, "grad_norm": 2.1168019771575928, "grad_norm_var": 12.388519548771136, "learning_rate": 0.0001, "loss": 1.1191, "loss/crossentropy": 2.4875917434692383, "loss/hidden": 0.98046875, "loss/logits": 0.13811561465263367, "loss/reg": 5.319330739439465e-05, "step": 769 }, { "epoch": 0.09625, "grad_norm": 3.6164140701293945, "grad_norm_var": 0.1921279196482864, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.443065643310547, "loss/hidden": 1.2734375, "loss/logits": 0.23207004368305206, "loss/reg": 5.3170962928561494e-05, "step": 770 }, { "epoch": 0.096375, "grad_norm": 2.91166090965271, "grad_norm_var": 0.19260374956815268, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.334723472595215, "loss/hidden": 1.1484375, "loss/logits": 0.17818781733512878, "loss/reg": 5.315555972629227e-05, "step": 771 }, { "epoch": 0.0965, "grad_norm": 2.8536345958709717, "grad_norm_var": 0.1944214633678918, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.735614538192749, "loss/hidden": 1.15625, "loss/logits": 0.19926324486732483, "loss/reg": 5.313804649631493e-05, "step": 772 }, { "epoch": 0.096625, "grad_norm": 2.7003872394561768, "grad_norm_var": 0.19350943129851217, "learning_rate": 0.0001, "loss": 1.2365, "loss/crossentropy": 2.695556879043579, "loss/hidden": 1.078125, "loss/logits": 0.15788228809833527, "loss/reg": 5.3116473281988874e-05, "step": 773 }, { "epoch": 0.09675, "grad_norm": 2.7257320880889893, "grad_norm_var": 0.193506227128938, "learning_rate": 0.0001, "loss": 1.2951, "loss/crossentropy": 2.7179250717163086, "loss/hidden": 1.1171875, "loss/logits": 0.17742162942886353, "loss/reg": 5.30926845385693e-05, "step": 774 }, { "epoch": 0.096875, "grad_norm": 2.576854944229126, "grad_norm_var": 0.19304961436835294, "learning_rate": 0.0001, "loss": 1.3194, "loss/crossentropy": 2.3818206787109375, "loss/hidden": 1.140625, "loss/logits": 0.17822806537151337, "loss/reg": 5.3073516028234735e-05, "step": 775 }, { "epoch": 0.097, "grad_norm": 2.4969394207000732, "grad_norm_var": 0.1498668282970544, "learning_rate": 0.0001, "loss": 1.3081, "loss/crossentropy": 2.0176634788513184, "loss/hidden": 1.15625, "loss/logits": 0.15133124589920044, "loss/reg": 5.305654849507846e-05, "step": 776 }, { "epoch": 0.097125, "grad_norm": 3.22451114654541, "grad_norm_var": 0.15984673617916367, "learning_rate": 0.0001, "loss": 1.5053, "loss/crossentropy": 2.820338010787964, "loss/hidden": 1.25, "loss/logits": 0.25477665662765503, "loss/reg": 5.3042218496557325e-05, "step": 777 }, { "epoch": 0.09725, "grad_norm": 24.615375518798828, "grad_norm_var": 29.883750264758483, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 1.9187242984771729, "loss/hidden": 1.1484375, "loss/logits": 0.1846262812614441, "loss/reg": 5.302110002958216e-05, "step": 778 }, { "epoch": 0.097375, "grad_norm": 2.6715848445892334, "grad_norm_var": 29.781267578163263, "learning_rate": 0.0001, "loss": 1.4791, "loss/crossentropy": 2.532146692276001, "loss/hidden": 1.265625, "loss/logits": 0.2129930853843689, "loss/reg": 5.30023971805349e-05, "step": 779 }, { "epoch": 0.0975, "grad_norm": 2.747297763824463, "grad_norm_var": 29.85754231101701, "learning_rate": 0.0001, "loss": 1.3347, "loss/crossentropy": 2.687166929244995, "loss/hidden": 1.140625, "loss/logits": 0.19358965754508972, "loss/reg": 5.2986379159847274e-05, "step": 780 }, { "epoch": 0.097625, "grad_norm": 2.6517791748046875, "grad_norm_var": 29.83098919964196, "learning_rate": 0.0001, "loss": 1.4492, "loss/crossentropy": 2.927424907684326, "loss/hidden": 1.234375, "loss/logits": 0.21428784728050232, "loss/reg": 5.296709787216969e-05, "step": 781 }, { "epoch": 0.09775, "grad_norm": 2.4107885360717773, "grad_norm_var": 29.914876215687343, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.3049545288085938, "loss/hidden": 1.2265625, "loss/logits": 0.20118646323680878, "loss/reg": 5.2940118621336296e-05, "step": 782 }, { "epoch": 0.097875, "grad_norm": 4.856544017791748, "grad_norm_var": 29.865095133338084, "learning_rate": 0.0001, "loss": 2.0985, "loss/crossentropy": 2.8381922245025635, "loss/hidden": 1.65625, "loss/logits": 0.4416726231575012, "loss/reg": 5.291615889291279e-05, "step": 783 }, { "epoch": 0.098, "grad_norm": 3.376915216445923, "grad_norm_var": 29.792532646292965, "learning_rate": 0.0001, "loss": 1.4062, "loss/crossentropy": 3.5869369506835938, "loss/hidden": 1.234375, "loss/logits": 0.17128312587738037, "loss/reg": 5.289655382512137e-05, "step": 784 }, { "epoch": 0.098125, "grad_norm": 2.383514881134033, "grad_norm_var": 29.719888845997076, "learning_rate": 0.0001, "loss": 1.1903, "loss/crossentropy": 2.9117846488952637, "loss/hidden": 1.03125, "loss/logits": 0.15850681066513062, "loss/reg": 5.287636668072082e-05, "step": 785 }, { "epoch": 0.09825, "grad_norm": 2.088331937789917, "grad_norm_var": 30.00535910434075, "learning_rate": 0.0001, "loss": 1.2191, "loss/crossentropy": 2.4488675594329834, "loss/hidden": 1.0546875, "loss/logits": 0.1639135628938675, "loss/reg": 5.285735096549615e-05, "step": 786 }, { "epoch": 0.098375, "grad_norm": 3.4909255504608154, "grad_norm_var": 29.926382197605413, "learning_rate": 0.0001, "loss": 1.7858, "loss/crossentropy": 2.534886121749878, "loss/hidden": 1.484375, "loss/logits": 0.3009305000305176, "loss/reg": 5.283685095491819e-05, "step": 787 }, { "epoch": 0.0985, "grad_norm": 2.4427733421325684, "grad_norm_var": 30.01298634962116, "learning_rate": 0.0001, "loss": 1.1909, "loss/crossentropy": 2.2383270263671875, "loss/hidden": 1.0390625, "loss/logits": 0.1513344794511795, "loss/reg": 5.281960329739377e-05, "step": 788 }, { "epoch": 0.098625, "grad_norm": 2.3890647888183594, "grad_norm_var": 30.08196756498999, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 2.4775261878967285, "loss/hidden": 1.171875, "loss/logits": 0.20193596184253693, "loss/reg": 5.280092591419816e-05, "step": 789 }, { "epoch": 0.09875, "grad_norm": 5.748532295227051, "grad_norm_var": 30.06014752680334, "learning_rate": 0.0001, "loss": 1.9235, "loss/crossentropy": 2.484543561935425, "loss/hidden": 1.6640625, "loss/logits": 0.2588757276535034, "loss/reg": 5.27824777236674e-05, "step": 790 }, { "epoch": 0.098875, "grad_norm": 2.519845485687256, "grad_norm_var": 30.074100413727034, "learning_rate": 0.0001, "loss": 1.3943, "loss/crossentropy": 2.480121374130249, "loss/hidden": 1.203125, "loss/logits": 0.19063332676887512, "loss/reg": 5.276537558529526e-05, "step": 791 }, { "epoch": 0.099, "grad_norm": 3.150681495666504, "grad_norm_var": 29.936484287726426, "learning_rate": 0.0001, "loss": 1.7175, "loss/crossentropy": 2.2747299671173096, "loss/hidden": 1.4921875, "loss/logits": 0.22481489181518555, "loss/reg": 5.27442607562989e-05, "step": 792 }, { "epoch": 0.099125, "grad_norm": 3.151745557785034, "grad_norm_var": 29.948443330167883, "learning_rate": 0.0001, "loss": 1.5235, "loss/crossentropy": 2.7657337188720703, "loss/hidden": 1.2734375, "loss/logits": 0.24951426684856415, "loss/reg": 5.272776979836635e-05, "step": 793 }, { "epoch": 0.09925, "grad_norm": 2.513519525527954, "grad_norm_var": 0.960682649799162, "learning_rate": 0.0001, "loss": 1.3274, "loss/crossentropy": 2.569284200668335, "loss/hidden": 1.1484375, "loss/logits": 0.17844460904598236, "loss/reg": 5.271006011753343e-05, "step": 794 }, { "epoch": 0.099375, "grad_norm": 2.5686886310577393, "grad_norm_var": 0.966359269696044, "learning_rate": 0.0001, "loss": 1.3463, "loss/crossentropy": 2.590020179748535, "loss/hidden": 1.140625, "loss/logits": 0.20516559481620789, "loss/reg": 5.269322718959302e-05, "step": 795 }, { "epoch": 0.0995, "grad_norm": 3.2609806060791016, "grad_norm_var": 0.9634417109839205, "learning_rate": 0.0001, "loss": 2.0341, "loss/crossentropy": 2.2520804405212402, "loss/hidden": 1.5625, "loss/logits": 0.4710923433303833, "loss/reg": 5.267909000394866e-05, "step": 796 }, { "epoch": 0.099625, "grad_norm": 4.763014793395996, "grad_norm_var": 1.1263253492342187, "learning_rate": 0.0001, "loss": 1.9775, "loss/crossentropy": 2.288787364959717, "loss/hidden": 1.640625, "loss/logits": 0.33635812997817993, "loss/reg": 5.266653897706419e-05, "step": 797 }, { "epoch": 0.09975, "grad_norm": 3.378089189529419, "grad_norm_var": 1.0836956421815953, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.697234869003296, "loss/hidden": 1.0546875, "loss/logits": 0.16561515629291534, "loss/reg": 5.264836363494396e-05, "step": 798 }, { "epoch": 0.099875, "grad_norm": 2.9043984413146973, "grad_norm_var": 0.905067080343201, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.6060903072357178, "loss/hidden": 1.171875, "loss/logits": 0.21010896563529968, "loss/reg": 5.2631796279456466e-05, "step": 799 }, { "epoch": 0.1, "grad_norm": 4.528866291046143, "grad_norm_var": 1.0254388138747639, "learning_rate": 0.0001, "loss": 1.598, "loss/crossentropy": 2.841477632522583, "loss/hidden": 1.359375, "loss/logits": 0.23809757828712463, "loss/reg": 5.261685873847455e-05, "step": 800 }, { "epoch": 0.100125, "grad_norm": 3.34147047996521, "grad_norm_var": 0.9778438459070872, "learning_rate": 0.0001, "loss": 1.7202, "loss/crossentropy": 2.2488768100738525, "loss/hidden": 1.4453125, "loss/logits": 0.2743300199508667, "loss/reg": 5.2600626077037305e-05, "step": 801 }, { "epoch": 0.10025, "grad_norm": 2.4430441856384277, "grad_norm_var": 0.9300544238136648, "learning_rate": 0.0001, "loss": 1.2715, "loss/crossentropy": 3.037824869155884, "loss/hidden": 1.09375, "loss/logits": 0.17723414301872253, "loss/reg": 5.258737655822188e-05, "step": 802 }, { "epoch": 0.100375, "grad_norm": 2.2626993656158447, "grad_norm_var": 0.9909798492162054, "learning_rate": 0.0001, "loss": 1.5159, "loss/crossentropy": 2.764657497406006, "loss/hidden": 1.28125, "loss/logits": 0.23410022258758545, "loss/reg": 5.2564399084076285e-05, "step": 803 }, { "epoch": 0.1005, "grad_norm": 2.310553789138794, "grad_norm_var": 1.005606293107261, "learning_rate": 0.0001, "loss": 1.3199, "loss/crossentropy": 2.452549457550049, "loss/hidden": 1.140625, "loss/logits": 0.1787756383419037, "loss/reg": 5.254061034065671e-05, "step": 804 }, { "epoch": 0.100625, "grad_norm": 2.7235846519470215, "grad_norm_var": 0.9763322945012208, "learning_rate": 0.0001, "loss": 1.3767, "loss/crossentropy": 2.6927273273468018, "loss/hidden": 1.1875, "loss/logits": 0.18868786096572876, "loss/reg": 5.252664050203748e-05, "step": 805 }, { "epoch": 0.10075, "grad_norm": 3.140831232070923, "grad_norm_var": 0.5232650102161784, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.5507137775421143, "loss/hidden": 1.1171875, "loss/logits": 0.16028936207294464, "loss/reg": 5.2507621148834005e-05, "step": 806 }, { "epoch": 0.100875, "grad_norm": 2.3224105834960938, "grad_norm_var": 0.5399239876549131, "learning_rate": 0.0001, "loss": 1.4063, "loss/crossentropy": 2.2676374912261963, "loss/hidden": 1.21875, "loss/logits": 0.18703754246234894, "loss/reg": 5.249512832961045e-05, "step": 807 }, { "epoch": 0.101, "grad_norm": 2.5600266456604004, "grad_norm_var": 0.5536251437135727, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.1395981311798096, "loss/hidden": 1.1171875, "loss/logits": 0.1677340865135193, "loss/reg": 5.2473347750492394e-05, "step": 808 }, { "epoch": 0.101125, "grad_norm": 3.5100274085998535, "grad_norm_var": 0.5683777537285927, "learning_rate": 0.0001, "loss": 1.3423, "loss/crossentropy": 2.3729093074798584, "loss/hidden": 1.15625, "loss/logits": 0.18547970056533813, "loss/reg": 5.2468130888883024e-05, "step": 809 }, { "epoch": 0.10125, "grad_norm": 2.435258626937866, "grad_norm_var": 0.5741839625022178, "learning_rate": 0.0001, "loss": 1.284, "loss/crossentropy": 2.6526637077331543, "loss/hidden": 1.109375, "loss/logits": 0.17410364747047424, "loss/reg": 5.2465042244875804e-05, "step": 810 }, { "epoch": 0.101375, "grad_norm": 2.5132720470428467, "grad_norm_var": 0.5777724408661865, "learning_rate": 0.0001, "loss": 1.1777, "loss/crossentropy": 2.680278778076172, "loss/hidden": 1.015625, "loss/logits": 0.16152815520763397, "loss/reg": 5.245738066150807e-05, "step": 811 }, { "epoch": 0.1015, "grad_norm": 2.755100965499878, "grad_norm_var": 0.5778438371123987, "learning_rate": 0.0001, "loss": 1.3688, "loss/crossentropy": 2.2540314197540283, "loss/hidden": 1.171875, "loss/logits": 0.19639912247657776, "loss/reg": 5.2445950132096186e-05, "step": 812 }, { "epoch": 0.101625, "grad_norm": 3.0264639854431152, "grad_norm_var": 0.35655723794493405, "learning_rate": 0.0001, "loss": 1.4901, "loss/crossentropy": 2.477457284927368, "loss/hidden": 1.28125, "loss/logits": 0.20836040377616882, "loss/reg": 5.24301067343913e-05, "step": 813 }, { "epoch": 0.10175, "grad_norm": 2.25610089302063, "grad_norm_var": 0.3614339888761369, "learning_rate": 0.0001, "loss": 1.2316, "loss/crossentropy": 2.53898549079895, "loss/hidden": 1.0546875, "loss/logits": 0.17634719610214233, "loss/reg": 5.240976679488085e-05, "step": 814 }, { "epoch": 0.101875, "grad_norm": 2.8859362602233887, "grad_norm_var": 0.3612343205244988, "learning_rate": 0.0001, "loss": 1.2416, "loss/crossentropy": 2.5830461978912354, "loss/hidden": 1.0859375, "loss/logits": 0.1551593840122223, "loss/reg": 5.2394090744201094e-05, "step": 815 }, { "epoch": 0.102, "grad_norm": 2.317923069000244, "grad_norm_var": 0.16106769833787193, "learning_rate": 0.0001, "loss": 1.3219, "loss/crossentropy": 2.636214256286621, "loss/hidden": 1.1484375, "loss/logits": 0.17291927337646484, "loss/reg": 5.2378440159372985e-05, "step": 816 }, { "epoch": 0.102125, "grad_norm": 2.4304862022399902, "grad_norm_var": 0.132019131991211, "learning_rate": 0.0001, "loss": 1.2323, "loss/crossentropy": 2.396676778793335, "loss/hidden": 1.0546875, "loss/logits": 0.17712949216365814, "loss/reg": 5.236340803094208e-05, "step": 817 }, { "epoch": 0.10225, "grad_norm": 3.4825210571289062, "grad_norm_var": 0.17525325841580727, "learning_rate": 0.0001, "loss": 1.394, "loss/crossentropy": 2.8964662551879883, "loss/hidden": 1.203125, "loss/logits": 0.1903287172317505, "loss/reg": 5.2351682825246826e-05, "step": 818 }, { "epoch": 0.102375, "grad_norm": 3.8344733715057373, "grad_norm_var": 0.24150743745623965, "learning_rate": 0.0001, "loss": 1.4858, "loss/crossentropy": 2.266521453857422, "loss/hidden": 1.2578125, "loss/logits": 0.22744080424308777, "loss/reg": 5.2338960813358426e-05, "step": 819 }, { "epoch": 0.1025, "grad_norm": 16.969898223876953, "grad_norm_var": 12.751910852860423, "learning_rate": 0.0001, "loss": 1.3032, "loss/crossentropy": 2.4637362957000732, "loss/hidden": 1.125, "loss/logits": 0.17766262590885162, "loss/reg": 5.232635885477066e-05, "step": 820 }, { "epoch": 0.102625, "grad_norm": 2.360180139541626, "grad_norm_var": 12.807367879393507, "learning_rate": 0.0001, "loss": 1.1473, "loss/crossentropy": 2.466048240661621, "loss/hidden": 1.0, "loss/logits": 0.14673739671707153, "loss/reg": 5.230958413449116e-05, "step": 821 }, { "epoch": 0.10275, "grad_norm": 2.3272242546081543, "grad_norm_var": 12.906693448577307, "learning_rate": 0.0001, "loss": 1.2033, "loss/crossentropy": 2.551889419555664, "loss/hidden": 1.0390625, "loss/logits": 0.1636783480644226, "loss/reg": 5.229478847468272e-05, "step": 822 }, { "epoch": 0.102875, "grad_norm": 2.406409502029419, "grad_norm_var": 12.892554510856643, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.602328062057495, "loss/hidden": 1.09375, "loss/logits": 0.18373973667621613, "loss/reg": 5.227828660281375e-05, "step": 823 }, { "epoch": 0.103, "grad_norm": 2.1223392486572266, "grad_norm_var": 12.966937776264512, "learning_rate": 0.0001, "loss": 1.2087, "loss/crossentropy": 2.498873472213745, "loss/hidden": 1.046875, "loss/logits": 0.16126872599124908, "loss/reg": 5.226361099630594e-05, "step": 824 }, { "epoch": 0.103125, "grad_norm": 3.208017110824585, "grad_norm_var": 12.976346036172204, "learning_rate": 0.0001, "loss": 1.6108, "loss/crossentropy": 2.687594175338745, "loss/hidden": 1.390625, "loss/logits": 0.21968355774879456, "loss/reg": 5.2248811698518693e-05, "step": 825 }, { "epoch": 0.10325, "grad_norm": 5.490570545196533, "grad_norm_var": 13.092126380129832, "learning_rate": 0.0001, "loss": 1.7116, "loss/crossentropy": 2.1563971042633057, "loss/hidden": 1.453125, "loss/logits": 0.257944792509079, "loss/reg": 5.22348600497935e-05, "step": 826 }, { "epoch": 0.103375, "grad_norm": 2.088407278060913, "grad_norm_var": 13.174837105670752, "learning_rate": 0.0001, "loss": 1.1401, "loss/crossentropy": 2.368215560913086, "loss/hidden": 0.99609375, "loss/logits": 0.1434709131717682, "loss/reg": 5.222256004344672e-05, "step": 827 }, { "epoch": 0.1035, "grad_norm": 2.4680469036102295, "grad_norm_var": 13.217974973219603, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.192209482192993, "loss/hidden": 1.234375, "loss/logits": 0.19567325711250305, "loss/reg": 5.220507227932103e-05, "step": 828 }, { "epoch": 0.103625, "grad_norm": 2.515068531036377, "grad_norm_var": 13.282270337982398, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.653474807739258, "loss/hidden": 0.96484375, "loss/logits": 0.13500887155532837, "loss/reg": 5.2188064728397876e-05, "step": 829 }, { "epoch": 0.10375, "grad_norm": 2.164379358291626, "grad_norm_var": 13.30042653920423, "learning_rate": 0.0001, "loss": 1.1057, "loss/crossentropy": 2.469616651535034, "loss/hidden": 0.97265625, "loss/logits": 0.13253287971019745, "loss/reg": 5.217095895204693e-05, "step": 830 }, { "epoch": 0.103875, "grad_norm": 2.072486400604248, "grad_norm_var": 13.429207683172454, "learning_rate": 0.0001, "loss": 1.2587, "loss/crossentropy": 2.185154914855957, "loss/hidden": 1.1015625, "loss/logits": 0.15661650896072388, "loss/reg": 5.2154366130707785e-05, "step": 831 }, { "epoch": 0.104, "grad_norm": 2.822169542312622, "grad_norm_var": 13.356134748586637, "learning_rate": 0.0001, "loss": 1.4411, "loss/crossentropy": 2.312222719192505, "loss/hidden": 1.25, "loss/logits": 0.19062718749046326, "loss/reg": 5.213710755924694e-05, "step": 832 }, { "epoch": 0.104125, "grad_norm": 3.2380270957946777, "grad_norm_var": 13.263144115005607, "learning_rate": 0.0001, "loss": 1.3954, "loss/crossentropy": 2.565952777862549, "loss/hidden": 1.1875, "loss/logits": 0.20734888315200806, "loss/reg": 5.212176256463863e-05, "step": 833 }, { "epoch": 0.10425, "grad_norm": 2.9629056453704834, "grad_norm_var": 13.29668960799979, "learning_rate": 0.0001, "loss": 1.3472, "loss/crossentropy": 2.4239981174468994, "loss/hidden": 1.1875, "loss/logits": 0.15917940437793732, "loss/reg": 5.2105944632785395e-05, "step": 834 }, { "epoch": 0.104375, "grad_norm": 2.65922474861145, "grad_norm_var": 13.360480084554695, "learning_rate": 0.0001, "loss": 1.4703, "loss/crossentropy": 2.5808305740356445, "loss/hidden": 1.2265625, "loss/logits": 0.243194580078125, "loss/reg": 5.209133814787492e-05, "step": 835 }, { "epoch": 0.1045, "grad_norm": 2.3345870971679688, "grad_norm_var": 0.691400615292184, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.2131404876708984, "loss/hidden": 1.171875, "loss/logits": 0.20489053428173065, "loss/reg": 5.207399954088032e-05, "step": 836 }, { "epoch": 0.104625, "grad_norm": 2.3921146392822266, "grad_norm_var": 0.6900067668765199, "learning_rate": 0.0001, "loss": 1.1512, "loss/crossentropy": 3.2045135498046875, "loss/hidden": 0.99609375, "loss/logits": 0.15453840792179108, "loss/reg": 5.205388879403472e-05, "step": 837 }, { "epoch": 0.10475, "grad_norm": 2.016071319580078, "grad_norm_var": 0.7117097796198171, "learning_rate": 0.0001, "loss": 1.3153, "loss/crossentropy": 2.6639490127563477, "loss/hidden": 1.1328125, "loss/logits": 0.18196584284305573, "loss/reg": 5.2032042731298134e-05, "step": 838 }, { "epoch": 0.104875, "grad_norm": 2.5177884101867676, "grad_norm_var": 0.7083471286799463, "learning_rate": 0.0001, "loss": 1.3959, "loss/crossentropy": 2.569981336593628, "loss/hidden": 1.1953125, "loss/logits": 0.20005394518375397, "loss/reg": 5.200940722716041e-05, "step": 839 }, { "epoch": 0.105, "grad_norm": 1.9999221563339233, "grad_norm_var": 0.7185821198972163, "learning_rate": 0.0001, "loss": 1.1947, "loss/crossentropy": 2.6757583618164062, "loss/hidden": 1.0234375, "loss/logits": 0.17078331112861633, "loss/reg": 5.199360748520121e-05, "step": 840 }, { "epoch": 0.105125, "grad_norm": 2.270017147064209, "grad_norm_var": 0.708080528199305, "learning_rate": 0.0001, "loss": 1.2045, "loss/crossentropy": 2.5654280185699463, "loss/hidden": 1.0390625, "loss/logits": 0.16487887501716614, "loss/reg": 5.197878272156231e-05, "step": 841 }, { "epoch": 0.10525, "grad_norm": 2.2202858924865723, "grad_norm_var": 0.12732683712733267, "learning_rate": 0.0001, "loss": 1.3008, "loss/crossentropy": 2.4787936210632324, "loss/hidden": 1.109375, "loss/logits": 0.19092029333114624, "loss/reg": 5.196038546273485e-05, "step": 842 }, { "epoch": 0.105375, "grad_norm": 2.4694182872772217, "grad_norm_var": 0.11948625558178154, "learning_rate": 0.0001, "loss": 1.2657, "loss/crossentropy": 2.716890335083008, "loss/hidden": 1.09375, "loss/logits": 0.17146353423595428, "loss/reg": 5.1946241001132876e-05, "step": 843 }, { "epoch": 0.1055, "grad_norm": 2.3803436756134033, "grad_norm_var": 0.11969932832842947, "learning_rate": 0.0001, "loss": 1.1049, "loss/crossentropy": 2.3049373626708984, "loss/hidden": 0.9609375, "loss/logits": 0.1434895098209381, "loss/reg": 5.193064862396568e-05, "step": 844 }, { "epoch": 0.105625, "grad_norm": 2.839191198348999, "grad_norm_var": 0.1295235040782898, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.3535118103027344, "loss/hidden": 1.171875, "loss/logits": 0.1908886432647705, "loss/reg": 5.1912767958128825e-05, "step": 845 }, { "epoch": 0.10575, "grad_norm": 2.440533399581909, "grad_norm_var": 0.12340736502353161, "learning_rate": 0.0001, "loss": 1.3406, "loss/crossentropy": 2.587473154067993, "loss/hidden": 1.125, "loss/logits": 0.21512514352798462, "loss/reg": 5.18912602274213e-05, "step": 846 }, { "epoch": 0.105875, "grad_norm": 2.274799108505249, "grad_norm_var": 0.11504854753899843, "learning_rate": 0.0001, "loss": 1.3655, "loss/crossentropy": 2.5222935676574707, "loss/hidden": 1.171875, "loss/logits": 0.19314169883728027, "loss/reg": 5.187144415685907e-05, "step": 847 }, { "epoch": 0.106, "grad_norm": 3.504936456680298, "grad_norm_var": 0.17443826044681046, "learning_rate": 0.0001, "loss": 1.5789, "loss/crossentropy": 2.4715349674224854, "loss/hidden": 1.3359375, "loss/logits": 0.24247828125953674, "loss/reg": 5.1850674935849383e-05, "step": 848 }, { "epoch": 0.106125, "grad_norm": 2.178030490875244, "grad_norm_var": 0.14495010255316926, "learning_rate": 0.0001, "loss": 1.1547, "loss/crossentropy": 2.576503038406372, "loss/hidden": 1.0078125, "loss/logits": 0.1463319957256317, "loss/reg": 5.182998575037345e-05, "step": 849 }, { "epoch": 0.10625, "grad_norm": 4.153827667236328, "grad_norm_var": 0.3124556252586463, "learning_rate": 0.0001, "loss": 1.5262, "loss/crossentropy": 2.7432754039764404, "loss/hidden": 1.28125, "loss/logits": 0.24444353580474854, "loss/reg": 5.180889638722874e-05, "step": 850 }, { "epoch": 0.106375, "grad_norm": 6.0633087158203125, "grad_norm_var": 1.09049118560782, "learning_rate": 0.0001, "loss": 1.7417, "loss/crossentropy": 3.1306426525115967, "loss/hidden": 1.484375, "loss/logits": 0.25679004192352295, "loss/reg": 5.178620995138772e-05, "step": 851 }, { "epoch": 0.1065, "grad_norm": 2.4886646270751953, "grad_norm_var": 1.0833699781585693, "learning_rate": 0.0001, "loss": 1.265, "loss/crossentropy": 2.5716023445129395, "loss/hidden": 1.078125, "loss/logits": 0.1863655000925064, "loss/reg": 5.177418643143028e-05, "step": 852 }, { "epoch": 0.106625, "grad_norm": 4.484250068664551, "grad_norm_var": 1.2534535582414965, "learning_rate": 0.0001, "loss": 1.9001, "loss/crossentropy": 3.1852643489837646, "loss/hidden": 1.515625, "loss/logits": 0.3839457631111145, "loss/reg": 5.1762908697128296e-05, "step": 853 }, { "epoch": 0.10675, "grad_norm": 2.6472911834716797, "grad_norm_var": 1.2044808988564493, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.6136245727539062, "loss/hidden": 1.1015625, "loss/logits": 0.1696871519088745, "loss/reg": 5.174713805899955e-05, "step": 854 }, { "epoch": 0.106875, "grad_norm": 4.49519681930542, "grad_norm_var": 1.3393165741714852, "learning_rate": 0.0001, "loss": 1.6395, "loss/crossentropy": 2.5587494373321533, "loss/hidden": 1.3828125, "loss/logits": 0.2561890184879303, "loss/reg": 5.173370664124377e-05, "step": 855 }, { "epoch": 0.107, "grad_norm": 2.924689531326294, "grad_norm_var": 1.2624413783622968, "learning_rate": 0.0001, "loss": 1.4652, "loss/crossentropy": 2.696464776992798, "loss/hidden": 1.265625, "loss/logits": 0.1990174502134323, "loss/reg": 5.172126475372352e-05, "step": 856 }, { "epoch": 0.107125, "grad_norm": 2.4132745265960693, "grad_norm_var": 1.2475902683594655, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.5237464904785156, "loss/hidden": 1.140625, "loss/logits": 0.17832911014556885, "loss/reg": 5.170765507500619e-05, "step": 857 }, { "epoch": 0.10725, "grad_norm": 2.395402193069458, "grad_norm_var": 1.228414894644508, "learning_rate": 0.0001, "loss": 1.1678, "loss/crossentropy": 2.910306453704834, "loss/hidden": 1.0078125, "loss/logits": 0.15949246287345886, "loss/reg": 5.169427822693251e-05, "step": 858 }, { "epoch": 0.107375, "grad_norm": 2.0667219161987305, "grad_norm_var": 1.274264185741049, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 2.495528221130371, "loss/hidden": 1.125, "loss/logits": 0.2009069323539734, "loss/reg": 5.168099596630782e-05, "step": 859 }, { "epoch": 0.1075, "grad_norm": 2.2085459232330322, "grad_norm_var": 1.29280895985072, "learning_rate": 0.0001, "loss": 1.3652, "loss/crossentropy": 2.452284574508667, "loss/hidden": 1.171875, "loss/logits": 0.19278863072395325, "loss/reg": 5.1672195695573464e-05, "step": 860 }, { "epoch": 0.107625, "grad_norm": 2.144191026687622, "grad_norm_var": 1.347042753481233, "learning_rate": 0.0001, "loss": 1.3302, "loss/crossentropy": 2.5444531440734863, "loss/hidden": 1.15625, "loss/logits": 0.17347240447998047, "loss/reg": 5.165613038116135e-05, "step": 861 }, { "epoch": 0.10775, "grad_norm": 2.207613229751587, "grad_norm_var": 1.3695234911406715, "learning_rate": 0.0001, "loss": 1.1685, "loss/crossentropy": 2.5881083011627197, "loss/hidden": 1.015625, "loss/logits": 0.15239441394805908, "loss/reg": 5.1647028158186004e-05, "step": 862 }, { "epoch": 0.107875, "grad_norm": 3.2510950565338135, "grad_norm_var": 1.3293998581318258, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.6068148612976074, "loss/hidden": 1.1953125, "loss/logits": 0.189855694770813, "loss/reg": 5.1638893637573346e-05, "step": 863 }, { "epoch": 0.108, "grad_norm": 2.065004587173462, "grad_norm_var": 1.3815679315585072, "learning_rate": 0.0001, "loss": 1.2268, "loss/crossentropy": 2.1115543842315674, "loss/hidden": 1.0625, "loss/logits": 0.16378697752952576, "loss/reg": 5.16266591148451e-05, "step": 864 }, { "epoch": 0.108125, "grad_norm": 2.9799695014953613, "grad_norm_var": 1.3326224051682771, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.548758029937744, "loss/hidden": 1.1484375, "loss/logits": 0.14914320409297943, "loss/reg": 5.1615705160656944e-05, "step": 865 }, { "epoch": 0.10825, "grad_norm": 2.810962438583374, "grad_norm_var": 1.2498044722821606, "learning_rate": 0.0001, "loss": 1.5109, "loss/crossentropy": 2.707660436630249, "loss/hidden": 1.28125, "loss/logits": 0.2291330099105835, "loss/reg": 5.1602582971099764e-05, "step": 866 }, { "epoch": 0.108375, "grad_norm": 2.15248966217041, "grad_norm_var": 0.5968405914629725, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.542229413986206, "loss/hidden": 1.0703125, "loss/logits": 0.17721006274223328, "loss/reg": 5.158934072824195e-05, "step": 867 }, { "epoch": 0.1085, "grad_norm": 2.480729818344116, "grad_norm_var": 0.5971035139504882, "learning_rate": 0.0001, "loss": 1.2536, "loss/crossentropy": 2.53440523147583, "loss/hidden": 1.0859375, "loss/logits": 0.16716773808002472, "loss/reg": 5.157275154488161e-05, "step": 868 }, { "epoch": 0.108625, "grad_norm": 7.901394367218018, "grad_norm_var": 2.1248277393899104, "learning_rate": 0.0001, "loss": 1.5415, "loss/crossentropy": 2.556912660598755, "loss/hidden": 1.3125, "loss/logits": 0.228495255112648, "loss/reg": 5.155721737537533e-05, "step": 869 }, { "epoch": 0.10875, "grad_norm": 2.86877179145813, "grad_norm_var": 2.1190566777217703, "learning_rate": 0.0001, "loss": 1.4311, "loss/crossentropy": 2.6490020751953125, "loss/hidden": 1.1953125, "loss/logits": 0.23524877429008484, "loss/reg": 5.154574682819657e-05, "step": 870 }, { "epoch": 0.108875, "grad_norm": 2.6678075790405273, "grad_norm_var": 1.953804689788177, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.4952635765075684, "loss/hidden": 1.140625, "loss/logits": 0.19286967813968658, "loss/reg": 5.152893209015019e-05, "step": 871 }, { "epoch": 0.109, "grad_norm": 3.3251094818115234, "grad_norm_var": 1.9680179929503008, "learning_rate": 0.0001, "loss": 1.6556, "loss/crossentropy": 2.4640188217163086, "loss/hidden": 1.3828125, "loss/logits": 0.2722957730293274, "loss/reg": 5.151686491444707e-05, "step": 872 }, { "epoch": 0.109125, "grad_norm": 2.9334840774536133, "grad_norm_var": 1.9531698292946447, "learning_rate": 0.0001, "loss": 1.6151, "loss/crossentropy": 2.1477441787719727, "loss/hidden": 1.4140625, "loss/logits": 0.2005080133676529, "loss/reg": 5.15064675710164e-05, "step": 873 }, { "epoch": 0.10925, "grad_norm": 2.6843268871307373, "grad_norm_var": 1.9388056435329775, "learning_rate": 0.0001, "loss": 1.3321, "loss/crossentropy": 2.6752922534942627, "loss/hidden": 1.1484375, "loss/logits": 0.18317534029483795, "loss/reg": 5.148894706508145e-05, "step": 874 }, { "epoch": 0.109375, "grad_norm": 2.666318416595459, "grad_norm_var": 1.8929180590094554, "learning_rate": 0.0001, "loss": 1.181, "loss/crossentropy": 2.3448598384857178, "loss/hidden": 1.015625, "loss/logits": 0.16483411192893982, "loss/reg": 5.147013871464878e-05, "step": 875 }, { "epoch": 0.1095, "grad_norm": 2.9108211994171143, "grad_norm_var": 1.8534501036204496, "learning_rate": 0.0001, "loss": 1.6481, "loss/crossentropy": 2.2253129482269287, "loss/hidden": 1.390625, "loss/logits": 0.2569289207458496, "loss/reg": 5.1448183512547985e-05, "step": 876 }, { "epoch": 0.109625, "grad_norm": 2.372954845428467, "grad_norm_var": 1.8305216702505192, "learning_rate": 0.0001, "loss": 1.0989, "loss/crossentropy": 2.879788398742676, "loss/hidden": 0.9609375, "loss/logits": 0.13748227059841156, "loss/reg": 5.143091766512953e-05, "step": 877 }, { "epoch": 0.10975, "grad_norm": 2.7331626415252686, "grad_norm_var": 1.7910379283106483, "learning_rate": 0.0001, "loss": 1.34, "loss/crossentropy": 2.64849853515625, "loss/hidden": 1.15625, "loss/logits": 0.18327152729034424, "loss/reg": 5.141158544574864e-05, "step": 878 }, { "epoch": 0.109875, "grad_norm": 2.3987855911254883, "grad_norm_var": 1.8136184643926978, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.5128934383392334, "loss/hidden": 1.109375, "loss/logits": 0.16826963424682617, "loss/reg": 5.139862696523778e-05, "step": 879 }, { "epoch": 0.11, "grad_norm": 2.94789981842041, "grad_norm_var": 1.7526228729189588, "learning_rate": 0.0001, "loss": 1.323, "loss/crossentropy": 2.762059211730957, "loss/hidden": 1.140625, "loss/logits": 0.18185189366340637, "loss/reg": 5.138712003827095e-05, "step": 880 }, { "epoch": 0.110125, "grad_norm": 2.8312692642211914, "grad_norm_var": 1.7554366876979393, "learning_rate": 0.0001, "loss": 1.5558, "loss/crossentropy": 2.3510918617248535, "loss/hidden": 1.3046875, "loss/logits": 0.2505726218223572, "loss/reg": 5.1374005124671385e-05, "step": 881 }, { "epoch": 0.11025, "grad_norm": 2.362635612487793, "grad_norm_var": 1.781863088516642, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.6087443828582764, "loss/hidden": 1.1640625, "loss/logits": 0.207294762134552, "loss/reg": 5.1361905207159e-05, "step": 882 }, { "epoch": 0.110375, "grad_norm": 3.219357967376709, "grad_norm_var": 1.730327889053624, "learning_rate": 0.0001, "loss": 1.625, "loss/crossentropy": 2.5749869346618652, "loss/hidden": 1.34375, "loss/logits": 0.2807803452014923, "loss/reg": 5.134905586601235e-05, "step": 883 }, { "epoch": 0.1105, "grad_norm": 3.389406442642212, "grad_norm_var": 1.709139991612549, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.8171286582946777, "loss/hidden": 1.140625, "loss/logits": 0.18924608826637268, "loss/reg": 5.133213926455937e-05, "step": 884 }, { "epoch": 0.110625, "grad_norm": 12.887139320373535, "grad_norm_var": 6.4290571159925625, "learning_rate": 0.0001, "loss": 1.6923, "loss/crossentropy": 2.3453848361968994, "loss/hidden": 1.453125, "loss/logits": 0.23862439393997192, "loss/reg": 5.1321330829523504e-05, "step": 885 }, { "epoch": 0.11075, "grad_norm": 2.346778154373169, "grad_norm_var": 6.486536682635502, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.5474321842193604, "loss/hidden": 1.2109375, "loss/logits": 0.23435597121715546, "loss/reg": 5.130986392032355e-05, "step": 886 }, { "epoch": 0.110875, "grad_norm": 2.487971067428589, "grad_norm_var": 6.506530171472066, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.5673558712005615, "loss/hidden": 1.1484375, "loss/logits": 0.20802843570709229, "loss/reg": 5.1305341912666336e-05, "step": 887 }, { "epoch": 0.111, "grad_norm": 2.816526174545288, "grad_norm_var": 6.528187529959202, "learning_rate": 0.0001, "loss": 1.4391, "loss/crossentropy": 2.6126768589019775, "loss/hidden": 1.2109375, "loss/logits": 0.22769951820373535, "loss/reg": 5.12950646225363e-05, "step": 888 }, { "epoch": 0.111125, "grad_norm": 2.141483783721924, "grad_norm_var": 6.613941985095446, "learning_rate": 0.0001, "loss": 1.2903, "loss/crossentropy": 2.3378610610961914, "loss/hidden": 1.109375, "loss/logits": 0.18040552735328674, "loss/reg": 5.1290844567120075e-05, "step": 889 }, { "epoch": 0.11125, "grad_norm": 2.6674747467041016, "grad_norm_var": 6.615398852360909, "learning_rate": 0.0001, "loss": 1.2673, "loss/crossentropy": 2.5276970863342285, "loss/hidden": 1.09375, "loss/logits": 0.17306920886039734, "loss/reg": 5.129120108904317e-05, "step": 890 }, { "epoch": 0.111375, "grad_norm": 3.5922231674194336, "grad_norm_var": 6.5878176563605235, "learning_rate": 0.0001, "loss": 1.5823, "loss/crossentropy": 2.5318856239318848, "loss/hidden": 1.3359375, "loss/logits": 0.2458970844745636, "loss/reg": 5.128348493599333e-05, "step": 891 }, { "epoch": 0.1115, "grad_norm": 4.822789192199707, "grad_norm_var": 6.696274189555324, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.564779281616211, "loss/hidden": 1.234375, "loss/logits": 0.1956566870212555, "loss/reg": 5.1267714297864586e-05, "step": 892 }, { "epoch": 0.111625, "grad_norm": 2.7208211421966553, "grad_norm_var": 6.651510803659902, "learning_rate": 0.0001, "loss": 1.2779, "loss/crossentropy": 2.423957109451294, "loss/hidden": 1.1015625, "loss/logits": 0.17579975724220276, "loss/reg": 5.1254595746286213e-05, "step": 893 }, { "epoch": 0.11175, "grad_norm": 3.0301880836486816, "grad_norm_var": 6.62575020535945, "learning_rate": 0.0001, "loss": 1.5594, "loss/crossentropy": 2.4482266902923584, "loss/hidden": 1.296875, "loss/logits": 0.262008935213089, "loss/reg": 5.123936352902092e-05, "step": 894 }, { "epoch": 0.111875, "grad_norm": 3.360795021057129, "grad_norm_var": 6.537028009081854, "learning_rate": 0.0001, "loss": 1.3794, "loss/crossentropy": 2.7706689834594727, "loss/hidden": 1.203125, "loss/logits": 0.17581212520599365, "loss/reg": 5.1222959882579744e-05, "step": 895 }, { "epoch": 0.112, "grad_norm": 2.2740559577941895, "grad_norm_var": 6.6241346303160356, "learning_rate": 0.0001, "loss": 1.1855, "loss/crossentropy": 2.6317555904388428, "loss/hidden": 1.046875, "loss/logits": 0.13808242976665497, "loss/reg": 5.120660716784187e-05, "step": 896 }, { "epoch": 0.112125, "grad_norm": 2.4496805667877197, "grad_norm_var": 6.670283083692273, "learning_rate": 0.0001, "loss": 1.42, "loss/crossentropy": 2.499284505844116, "loss/hidden": 1.1875, "loss/logits": 0.2319883406162262, "loss/reg": 5.1194838306400925e-05, "step": 897 }, { "epoch": 0.11225, "grad_norm": 2.3721413612365723, "grad_norm_var": 6.668802098851164, "learning_rate": 0.0001, "loss": 1.2307, "loss/crossentropy": 2.375321388244629, "loss/hidden": 1.0703125, "loss/logits": 0.15983566641807556, "loss/reg": 5.118764966027811e-05, "step": 898 }, { "epoch": 0.112375, "grad_norm": 1.9777494668960571, "grad_norm_var": 6.817600273546391, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.382485866546631, "loss/hidden": 1.0625, "loss/logits": 0.18131142854690552, "loss/reg": 5.117098771734163e-05, "step": 899 }, { "epoch": 0.1125, "grad_norm": 3.215449571609497, "grad_norm_var": 6.821095932665104, "learning_rate": 0.0001, "loss": 1.7545, "loss/crossentropy": 2.471181869506836, "loss/hidden": 1.4921875, "loss/logits": 0.26175931096076965, "loss/reg": 5.11566577188205e-05, "step": 900 }, { "epoch": 0.112625, "grad_norm": 2.2360520362854004, "grad_norm_var": 0.5060833487590211, "learning_rate": 0.0001, "loss": 1.1463, "loss/crossentropy": 2.5280520915985107, "loss/hidden": 1.0078125, "loss/logits": 0.13795122504234314, "loss/reg": 5.114359737490304e-05, "step": 901 }, { "epoch": 0.11275, "grad_norm": 2.2849807739257812, "grad_norm_var": 0.5099081994552771, "learning_rate": 0.0001, "loss": 1.318, "loss/crossentropy": 2.583923578262329, "loss/hidden": 1.140625, "loss/logits": 0.1768278032541275, "loss/reg": 5.113161387271248e-05, "step": 902 }, { "epoch": 0.112875, "grad_norm": 2.292587995529175, "grad_norm_var": 0.5198535528811049, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.196122884750366, "loss/hidden": 1.21875, "loss/logits": 0.2038969099521637, "loss/reg": 5.111364953336306e-05, "step": 903 }, { "epoch": 0.113, "grad_norm": 2.8599159717559814, "grad_norm_var": 0.5202638913613247, "learning_rate": 0.0001, "loss": 1.2203, "loss/crossentropy": 2.5006511211395264, "loss/hidden": 1.0625, "loss/logits": 0.15732157230377197, "loss/reg": 5.110198981128633e-05, "step": 904 }, { "epoch": 0.113125, "grad_norm": 2.5395283699035645, "grad_norm_var": 0.49688104773360486, "learning_rate": 0.0001, "loss": 1.2577, "loss/crossentropy": 2.845609426498413, "loss/hidden": 1.078125, "loss/logits": 0.17907723784446716, "loss/reg": 5.108524055685848e-05, "step": 905 }, { "epoch": 0.11325, "grad_norm": 2.205470085144043, "grad_norm_var": 0.5179864695758818, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.444505214691162, "loss/hidden": 1.2109375, "loss/logits": 0.19329029321670532, "loss/reg": 5.106762910145335e-05, "step": 906 }, { "epoch": 0.113375, "grad_norm": 2.612846612930298, "grad_norm_var": 0.4698679222391608, "learning_rate": 0.0001, "loss": 1.246, "loss/crossentropy": 2.8539822101593018, "loss/hidden": 1.09375, "loss/logits": 0.15170122683048248, "loss/reg": 5.1049682951997966e-05, "step": 907 }, { "epoch": 0.1135, "grad_norm": 2.0672478675842285, "grad_norm_var": 0.1657706313496551, "learning_rate": 0.0001, "loss": 1.0483, "loss/crossentropy": 2.453460216522217, "loss/hidden": 0.921875, "loss/logits": 0.12594252824783325, "loss/reg": 5.103057628730312e-05, "step": 908 }, { "epoch": 0.113625, "grad_norm": 2.428112030029297, "grad_norm_var": 0.1637257922027207, "learning_rate": 0.0001, "loss": 1.465, "loss/crossentropy": 2.6168901920318604, "loss/hidden": 1.2421875, "loss/logits": 0.22226470708847046, "loss/reg": 5.1011342293350026e-05, "step": 909 }, { "epoch": 0.11375, "grad_norm": 2.6597177982330322, "grad_norm_var": 0.14675306523261794, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.513742685317993, "loss/hidden": 1.171875, "loss/logits": 0.18382194638252258, "loss/reg": 5.0994767661904916e-05, "step": 910 }, { "epoch": 0.113875, "grad_norm": 2.3672919273376465, "grad_norm_var": 0.09306154474314253, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.329597234725952, "loss/hidden": 1.1875, "loss/logits": 0.20819973945617676, "loss/reg": 5.097627581562847e-05, "step": 911 }, { "epoch": 0.114, "grad_norm": 2.389256000518799, "grad_norm_var": 0.091531368737714, "learning_rate": 0.0001, "loss": 1.3408, "loss/crossentropy": 2.712867021560669, "loss/hidden": 1.125, "loss/logits": 0.21526110172271729, "loss/reg": 5.0954702601302415e-05, "step": 912 }, { "epoch": 0.114125, "grad_norm": 2.9275593757629395, "grad_norm_var": 0.10674763413478458, "learning_rate": 0.0001, "loss": 1.5311, "loss/crossentropy": 2.5449111461639404, "loss/hidden": 1.3125, "loss/logits": 0.21804079413414001, "loss/reg": 5.093521031085402e-05, "step": 913 }, { "epoch": 0.11425, "grad_norm": 2.181626558303833, "grad_norm_var": 0.11136842221632476, "learning_rate": 0.0001, "loss": 1.2699, "loss/crossentropy": 2.8895819187164307, "loss/hidden": 1.109375, "loss/logits": 0.16004905104637146, "loss/reg": 5.091511411592364e-05, "step": 914 }, { "epoch": 0.114375, "grad_norm": 2.1453068256378174, "grad_norm_var": 0.10250921674971565, "learning_rate": 0.0001, "loss": 1.36, "loss/crossentropy": 2.5813710689544678, "loss/hidden": 1.15625, "loss/logits": 0.20324170589447021, "loss/reg": 5.089937985758297e-05, "step": 915 }, { "epoch": 0.1145, "grad_norm": 2.3220584392547607, "grad_norm_var": 0.06279939654988856, "learning_rate": 0.0001, "loss": 1.5004, "loss/crossentropy": 2.2584786415100098, "loss/hidden": 1.296875, "loss/logits": 0.20300912857055664, "loss/reg": 5.0884518714156e-05, "step": 916 }, { "epoch": 0.114625, "grad_norm": 2.3360254764556885, "grad_norm_var": 0.0611390665759463, "learning_rate": 0.0001, "loss": 1.2846, "loss/crossentropy": 2.713193416595459, "loss/hidden": 1.1015625, "loss/logits": 0.18253561854362488, "loss/reg": 5.086654346087016e-05, "step": 917 }, { "epoch": 0.11475, "grad_norm": 2.8872170448303223, "grad_norm_var": 0.07346951449265286, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.5153021812438965, "loss/hidden": 1.3203125, "loss/logits": 0.24056395888328552, "loss/reg": 5.084551958134398e-05, "step": 918 }, { "epoch": 0.114875, "grad_norm": 2.3302507400512695, "grad_norm_var": 0.07276086174920783, "learning_rate": 0.0001, "loss": 1.2503, "loss/crossentropy": 2.6722702980041504, "loss/hidden": 1.0859375, "loss/logits": 0.16384665668010712, "loss/reg": 5.0823444325942546e-05, "step": 919 }, { "epoch": 0.115, "grad_norm": 2.578481674194336, "grad_norm_var": 0.06246865190141004, "learning_rate": 0.0001, "loss": 1.3916, "loss/crossentropy": 2.56015682220459, "loss/hidden": 1.21875, "loss/logits": 0.1723623275756836, "loss/reg": 5.080721894046292e-05, "step": 920 }, { "epoch": 0.115125, "grad_norm": 2.512157917022705, "grad_norm_var": 0.062138112924693206, "learning_rate": 0.0001, "loss": 1.1445, "loss/crossentropy": 2.7548606395721436, "loss/hidden": 0.99609375, "loss/logits": 0.14792391657829285, "loss/reg": 5.078666072222404e-05, "step": 921 }, { "epoch": 0.11525, "grad_norm": 2.1921284198760986, "grad_norm_var": 0.0625565039341834, "learning_rate": 0.0001, "loss": 1.0968, "loss/crossentropy": 2.660780668258667, "loss/hidden": 0.95703125, "loss/logits": 0.1393047422170639, "loss/reg": 5.077124296803959e-05, "step": 922 }, { "epoch": 0.115375, "grad_norm": 2.3002402782440186, "grad_norm_var": 0.06119220238923783, "learning_rate": 0.0001, "loss": 1.4776, "loss/crossentropy": 2.416215419769287, "loss/hidden": 1.265625, "loss/logits": 0.2114565074443817, "loss/reg": 5.075276203569956e-05, "step": 923 }, { "epoch": 0.1155, "grad_norm": 2.2460763454437256, "grad_norm_var": 0.05492203051156442, "learning_rate": 0.0001, "loss": 1.4112, "loss/crossentropy": 2.3629891872406006, "loss/hidden": 1.1875, "loss/logits": 0.22322767972946167, "loss/reg": 5.0733655371004716e-05, "step": 924 }, { "epoch": 0.115625, "grad_norm": 1.99420166015625, "grad_norm_var": 0.06652205345829619, "learning_rate": 0.0001, "loss": 1.3674, "loss/crossentropy": 2.4530787467956543, "loss/hidden": 1.1875, "loss/logits": 0.17934995889663696, "loss/reg": 5.071550185675733e-05, "step": 925 }, { "epoch": 0.11575, "grad_norm": 3.024604558944702, "grad_norm_var": 0.0875715770421029, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.496591091156006, "loss/hidden": 1.2890625, "loss/logits": 0.22892767190933228, "loss/reg": 5.070024053566158e-05, "step": 926 }, { "epoch": 0.115875, "grad_norm": 2.557852268218994, "grad_norm_var": 0.08847894622657862, "learning_rate": 0.0001, "loss": 1.1789, "loss/crossentropy": 2.917902708053589, "loss/hidden": 1.0234375, "loss/logits": 0.1549963504076004, "loss/reg": 5.068165046395734e-05, "step": 927 }, { "epoch": 0.116, "grad_norm": 2.644162893295288, "grad_norm_var": 0.09105956863669439, "learning_rate": 0.0001, "loss": 1.4557, "loss/crossentropy": 2.3896613121032715, "loss/hidden": 1.234375, "loss/logits": 0.22077350318431854, "loss/reg": 5.066774247097783e-05, "step": 928 }, { "epoch": 0.116125, "grad_norm": 2.3583998680114746, "grad_norm_var": 0.07496988833996525, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.5956573486328125, "loss/hidden": 1.203125, "loss/logits": 0.19093617796897888, "loss/reg": 5.065161531092599e-05, "step": 929 }, { "epoch": 0.11625, "grad_norm": 3.7659823894500732, "grad_norm_var": 0.18294245356447975, "learning_rate": 0.0001, "loss": 1.4272, "loss/crossentropy": 3.112509250640869, "loss/hidden": 1.21875, "loss/logits": 0.20792649686336517, "loss/reg": 5.063477874500677e-05, "step": 930 }, { "epoch": 0.116375, "grad_norm": 3.4702413082122803, "grad_norm_var": 0.22784416332236573, "learning_rate": 0.0001, "loss": 1.4236, "loss/crossentropy": 2.6097307205200195, "loss/hidden": 1.2265625, "loss/logits": 0.19651469588279724, "loss/reg": 5.061656338511966e-05, "step": 931 }, { "epoch": 0.1165, "grad_norm": 10.169079780578613, "grad_norm_var": 3.7907524102504455, "learning_rate": 0.0001, "loss": 1.7727, "loss/crossentropy": 2.720949649810791, "loss/hidden": 1.46875, "loss/logits": 0.3034912347793579, "loss/reg": 5.059718750999309e-05, "step": 932 }, { "epoch": 0.116625, "grad_norm": 2.715127944946289, "grad_norm_var": 3.761853977240573, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.721991777420044, "loss/hidden": 1.0703125, "loss/logits": 0.19605056941509247, "loss/reg": 5.0578131777001545e-05, "step": 933 }, { "epoch": 0.11675, "grad_norm": 2.626614570617676, "grad_norm_var": 3.7738096606882756, "learning_rate": 0.0001, "loss": 1.561, "loss/crossentropy": 2.431546449661255, "loss/hidden": 1.3125, "loss/logits": 0.2480194866657257, "loss/reg": 5.056073496234603e-05, "step": 934 }, { "epoch": 0.116875, "grad_norm": 2.2887232303619385, "grad_norm_var": 3.7781399580603714, "learning_rate": 0.0001, "loss": 1.4852, "loss/crossentropy": 2.362403631210327, "loss/hidden": 1.2578125, "loss/logits": 0.2268555462360382, "loss/reg": 5.0543880206532776e-05, "step": 935 }, { "epoch": 0.117, "grad_norm": 2.2659595012664795, "grad_norm_var": 3.8055697286814714, "learning_rate": 0.0001, "loss": 1.38, "loss/crossentropy": 2.4935097694396973, "loss/hidden": 1.1953125, "loss/logits": 0.18415585160255432, "loss/reg": 5.0521102821221575e-05, "step": 936 }, { "epoch": 0.117125, "grad_norm": 2.611288070678711, "grad_norm_var": 3.7988011630033136, "learning_rate": 0.0001, "loss": 1.2173, "loss/crossentropy": 2.497889995574951, "loss/hidden": 1.0546875, "loss/logits": 0.1621410995721817, "loss/reg": 5.049499304732308e-05, "step": 937 }, { "epoch": 0.11725, "grad_norm": 2.490511894226074, "grad_norm_var": 3.769164840295244, "learning_rate": 0.0001, "loss": 1.3398, "loss/crossentropy": 2.580732583999634, "loss/hidden": 1.1328125, "loss/logits": 0.20645791292190552, "loss/reg": 5.047738159191795e-05, "step": 938 }, { "epoch": 0.117375, "grad_norm": 2.6626391410827637, "grad_norm_var": 3.7389430985960397, "learning_rate": 0.0001, "loss": 1.3499, "loss/crossentropy": 2.3543365001678467, "loss/hidden": 1.1484375, "loss/logits": 0.20099475979804993, "loss/reg": 5.046039950684644e-05, "step": 939 }, { "epoch": 0.1175, "grad_norm": 2.3437256813049316, "grad_norm_var": 3.728183871902979, "learning_rate": 0.0001, "loss": 1.56, "loss/crossentropy": 2.385300636291504, "loss/hidden": 1.3046875, "loss/logits": 0.2547788918018341, "loss/reg": 5.044609497417696e-05, "step": 940 }, { "epoch": 0.117625, "grad_norm": 2.517521858215332, "grad_norm_var": 3.6664452294798027, "learning_rate": 0.0001, "loss": 1.3269, "loss/crossentropy": 2.5399513244628906, "loss/hidden": 1.140625, "loss/logits": 0.18573379516601562, "loss/reg": 5.042907650931738e-05, "step": 941 }, { "epoch": 0.11775, "grad_norm": 2.5876271724700928, "grad_norm_var": 3.6860949824849625, "learning_rate": 0.0001, "loss": 1.2793, "loss/crossentropy": 2.377321481704712, "loss/hidden": 1.09375, "loss/logits": 0.18503513932228088, "loss/reg": 5.0405546062393114e-05, "step": 942 }, { "epoch": 0.117875, "grad_norm": 2.671018123626709, "grad_norm_var": 3.6782666614773385, "learning_rate": 0.0001, "loss": 1.2238, "loss/crossentropy": 2.802436113357544, "loss/hidden": 1.046875, "loss/logits": 0.17646291851997375, "loss/reg": 5.039115058025345e-05, "step": 943 }, { "epoch": 0.118, "grad_norm": 2.572817325592041, "grad_norm_var": 3.683271023247499, "learning_rate": 0.0001, "loss": 1.5448, "loss/crossentropy": 2.634742498397827, "loss/hidden": 1.28125, "loss/logits": 0.26302051544189453, "loss/reg": 5.037582013756037e-05, "step": 944 }, { "epoch": 0.118125, "grad_norm": 3.284419059753418, "grad_norm_var": 3.641308957185208, "learning_rate": 0.0001, "loss": 1.4113, "loss/crossentropy": 2.7402541637420654, "loss/hidden": 1.1875, "loss/logits": 0.22326701879501343, "loss/reg": 5.035655340179801e-05, "step": 945 }, { "epoch": 0.11825, "grad_norm": 2.1808974742889404, "grad_norm_var": 3.676652595263378, "learning_rate": 0.0001, "loss": 1.345, "loss/crossentropy": 2.1766951084136963, "loss/hidden": 1.1640625, "loss/logits": 0.18044030666351318, "loss/reg": 5.033357228967361e-05, "step": 946 }, { "epoch": 0.118375, "grad_norm": 2.5026612281799316, "grad_norm_var": 3.6862574547537976, "learning_rate": 0.0001, "loss": 1.3828, "loss/crossentropy": 2.443451404571533, "loss/hidden": 1.171875, "loss/logits": 0.21045450866222382, "loss/reg": 5.0314705731580034e-05, "step": 947 }, { "epoch": 0.1185, "grad_norm": 2.595186233520508, "grad_norm_var": 0.06275260077619957, "learning_rate": 0.0001, "loss": 1.3268, "loss/crossentropy": 2.5740835666656494, "loss/hidden": 1.140625, "loss/logits": 0.18566593527793884, "loss/reg": 5.0293325330130756e-05, "step": 948 }, { "epoch": 0.118625, "grad_norm": 2.7742605209350586, "grad_norm_var": 0.06421554214945217, "learning_rate": 0.0001, "loss": 1.3436, "loss/crossentropy": 2.6714890003204346, "loss/hidden": 1.15625, "loss/logits": 0.18687836825847626, "loss/reg": 5.027998849982396e-05, "step": 949 }, { "epoch": 0.11875, "grad_norm": 2.370361328125, "grad_norm_var": 0.06607751509905029, "learning_rate": 0.0001, "loss": 1.3589, "loss/crossentropy": 2.659331798553467, "loss/hidden": 1.1640625, "loss/logits": 0.19433243572711945, "loss/reg": 5.026358849136159e-05, "step": 950 }, { "epoch": 0.118875, "grad_norm": 2.5973522663116455, "grad_norm_var": 0.06148581360481557, "learning_rate": 0.0001, "loss": 1.4772, "loss/crossentropy": 2.323859930038452, "loss/hidden": 1.25, "loss/logits": 0.2266673892736435, "loss/reg": 5.024715210311115e-05, "step": 951 }, { "epoch": 0.119, "grad_norm": 2.227506637573242, "grad_norm_var": 0.06310765648726502, "learning_rate": 0.0001, "loss": 1.1943, "loss/crossentropy": 2.683986186981201, "loss/hidden": 1.03125, "loss/logits": 0.16250261664390564, "loss/reg": 5.023390258429572e-05, "step": 952 }, { "epoch": 0.119125, "grad_norm": 2.293842315673828, "grad_norm_var": 0.06731388693757458, "learning_rate": 0.0001, "loss": 1.2523, "loss/crossentropy": 2.6023056507110596, "loss/hidden": 1.0859375, "loss/logits": 0.16582387685775757, "loss/reg": 5.021702963858843e-05, "step": 953 }, { "epoch": 0.11925, "grad_norm": 2.304372549057007, "grad_norm_var": 0.07075777977412372, "learning_rate": 0.0001, "loss": 1.3426, "loss/crossentropy": 2.63718318939209, "loss/hidden": 1.15625, "loss/logits": 0.1858091950416565, "loss/reg": 5.020539538236335e-05, "step": 954 }, { "epoch": 0.119375, "grad_norm": 2.330775737762451, "grad_norm_var": 0.07178920620747628, "learning_rate": 0.0001, "loss": 1.3944, "loss/crossentropy": 2.5988028049468994, "loss/hidden": 1.1875, "loss/logits": 0.20635762810707092, "loss/reg": 5.0196507800137624e-05, "step": 955 }, { "epoch": 0.1195, "grad_norm": 2.9979050159454346, "grad_norm_var": 0.08406384780934838, "learning_rate": 0.0001, "loss": 1.4253, "loss/crossentropy": 2.7068779468536377, "loss/hidden": 1.1953125, "loss/logits": 0.22944168746471405, "loss/reg": 5.018553929403424e-05, "step": 956 }, { "epoch": 0.119625, "grad_norm": 2.523890495300293, "grad_norm_var": 0.08403835148358946, "learning_rate": 0.0001, "loss": 1.3006, "loss/crossentropy": 2.4745264053344727, "loss/hidden": 1.1328125, "loss/logits": 0.16726145148277283, "loss/reg": 5.017763396608643e-05, "step": 957 }, { "epoch": 0.11975, "grad_norm": 2.2493395805358887, "grad_norm_var": 0.08953556901061101, "learning_rate": 0.0001, "loss": 1.3401, "loss/crossentropy": 2.6549365520477295, "loss/hidden": 1.15625, "loss/logits": 0.1833769977092743, "loss/reg": 5.017070361645892e-05, "step": 958 }, { "epoch": 0.119875, "grad_norm": 2.9584262371063232, "grad_norm_var": 0.10011037915958786, "learning_rate": 0.0001, "loss": 1.4029, "loss/crossentropy": 2.5109822750091553, "loss/hidden": 1.1953125, "loss/logits": 0.20707273483276367, "loss/reg": 5.0166461733169854e-05, "step": 959 }, { "epoch": 0.12, "grad_norm": 3.285101890563965, "grad_norm_var": 0.13420030325027574, "learning_rate": 0.0001, "loss": 1.7876, "loss/crossentropy": 2.0449328422546387, "loss/hidden": 1.5703125, "loss/logits": 0.2167491614818573, "loss/reg": 5.015368151362054e-05, "step": 960 }, { "epoch": 0.120125, "grad_norm": 2.2315921783447266, "grad_norm_var": 0.10631614140368863, "learning_rate": 0.0001, "loss": 1.2062, "loss/crossentropy": 2.7129147052764893, "loss/hidden": 1.0390625, "loss/logits": 0.16663849353790283, "loss/reg": 5.013948612031527e-05, "step": 961 }, { "epoch": 0.12025, "grad_norm": 2.6541385650634766, "grad_norm_var": 0.0985084366826707, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.565558433532715, "loss/hidden": 1.2109375, "loss/logits": 0.21438126266002655, "loss/reg": 5.0122467655455694e-05, "step": 962 }, { "epoch": 0.120375, "grad_norm": 3.077623128890991, "grad_norm_var": 0.11507731082550829, "learning_rate": 0.0001, "loss": 1.3409, "loss/crossentropy": 2.8010950088500977, "loss/hidden": 1.140625, "loss/logits": 0.1998089998960495, "loss/reg": 5.010988388676196e-05, "step": 963 }, { "epoch": 0.1205, "grad_norm": 2.849869728088379, "grad_norm_var": 0.11924017889962192, "learning_rate": 0.0001, "loss": 1.6222, "loss/crossentropy": 2.570343255996704, "loss/hidden": 1.3515625, "loss/logits": 0.27010178565979004, "loss/reg": 5.009587403037585e-05, "step": 964 }, { "epoch": 0.120625, "grad_norm": 2.347888469696045, "grad_norm_var": 0.12114457046454064, "learning_rate": 0.0001, "loss": 1.2266, "loss/crossentropy": 2.685359239578247, "loss/hidden": 1.046875, "loss/logits": 0.17924004793167114, "loss/reg": 5.00831774843391e-05, "step": 965 }, { "epoch": 0.12075, "grad_norm": 3.009894609451294, "grad_norm_var": 0.12872461062677967, "learning_rate": 0.0001, "loss": 1.1479, "loss/crossentropy": 2.1576528549194336, "loss/hidden": 1.0, "loss/logits": 0.14742408692836761, "loss/reg": 5.006848732591607e-05, "step": 966 }, { "epoch": 0.120875, "grad_norm": 2.795736789703369, "grad_norm_var": 0.1305530559419573, "learning_rate": 0.0001, "loss": 1.277, "loss/crossentropy": 2.619136333465576, "loss/hidden": 1.09375, "loss/logits": 0.18277448415756226, "loss/reg": 5.005668936064467e-05, "step": 967 }, { "epoch": 0.121, "grad_norm": 3.669609785079956, "grad_norm_var": 0.18244444432157156, "learning_rate": 0.0001, "loss": 1.4086, "loss/crossentropy": 2.494560956954956, "loss/hidden": 1.2109375, "loss/logits": 0.19715511798858643, "loss/reg": 5.0041482609231025e-05, "step": 968 }, { "epoch": 0.121125, "grad_norm": 2.406184196472168, "grad_norm_var": 0.17679367962298234, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.6060521602630615, "loss/hidden": 1.2265625, "loss/logits": 0.2196432203054428, "loss/reg": 5.002524994779378e-05, "step": 969 }, { "epoch": 0.12125, "grad_norm": 2.449737071990967, "grad_norm_var": 0.16984991405668073, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.571929931640625, "loss/hidden": 1.140625, "loss/logits": 0.18994662165641785, "loss/reg": 5.001126555725932e-05, "step": 970 }, { "epoch": 0.121375, "grad_norm": 4.657814025878906, "grad_norm_var": 0.38136771698041777, "learning_rate": 0.0001, "loss": 1.4828, "loss/crossentropy": 1.9526991844177246, "loss/hidden": 1.34375, "loss/logits": 0.1385582983493805, "loss/reg": 4.999621523893438e-05, "step": 971 }, { "epoch": 0.1215, "grad_norm": 2.4782066345214844, "grad_norm_var": 0.39044515597160984, "learning_rate": 0.0001, "loss": 1.5125, "loss/crossentropy": 2.691642999649048, "loss/hidden": 1.28125, "loss/logits": 0.23075540363788605, "loss/reg": 4.9980124458670616e-05, "step": 972 }, { "epoch": 0.121625, "grad_norm": 2.052445650100708, "grad_norm_var": 0.4250124419864655, "learning_rate": 0.0001, "loss": 1.1691, "loss/crossentropy": 2.250474452972412, "loss/hidden": 1.015625, "loss/logits": 0.15293559432029724, "loss/reg": 4.996646021027118e-05, "step": 973 }, { "epoch": 0.12175, "grad_norm": 3.020663261413574, "grad_norm_var": 0.4031631069299368, "learning_rate": 0.0001, "loss": 1.6364, "loss/crossentropy": 2.6588315963745117, "loss/hidden": 1.3515625, "loss/logits": 0.28431302309036255, "loss/reg": 4.995078415959142e-05, "step": 974 }, { "epoch": 0.121875, "grad_norm": 2.3784873485565186, "grad_norm_var": 0.41746659447213474, "learning_rate": 0.0001, "loss": 1.3054, "loss/crossentropy": 2.7177300453186035, "loss/hidden": 1.1171875, "loss/logits": 0.1876787394285202, "loss/reg": 4.993857510271482e-05, "step": 975 }, { "epoch": 0.122, "grad_norm": 2.3851540088653564, "grad_norm_var": 0.41411408010637746, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.4946720600128174, "loss/hidden": 1.1875, "loss/logits": 0.20234179496765137, "loss/reg": 4.9920308811124414e-05, "step": 976 }, { "epoch": 0.122125, "grad_norm": 3.0380489826202393, "grad_norm_var": 0.39589390524756496, "learning_rate": 0.0001, "loss": 1.4918, "loss/crossentropy": 2.525160312652588, "loss/hidden": 1.2578125, "loss/logits": 0.23351210355758667, "loss/reg": 4.9907186621567234e-05, "step": 977 }, { "epoch": 0.12225, "grad_norm": 3.0249407291412354, "grad_norm_var": 0.39581891364688315, "learning_rate": 0.0001, "loss": 1.3042, "loss/crossentropy": 2.2865822315216064, "loss/hidden": 1.125, "loss/logits": 0.17868509888648987, "loss/reg": 4.989467197447084e-05, "step": 978 }, { "epoch": 0.122375, "grad_norm": 2.5235769748687744, "grad_norm_var": 0.3983845190744196, "learning_rate": 0.0001, "loss": 1.4039, "loss/crossentropy": 2.571760416030884, "loss/hidden": 1.203125, "loss/logits": 0.20029743015766144, "loss/reg": 4.9882932216860354e-05, "step": 979 }, { "epoch": 0.1225, "grad_norm": 2.345391035079956, "grad_norm_var": 0.4121480969686348, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.435070037841797, "loss/hidden": 1.125, "loss/logits": 0.18273219466209412, "loss/reg": 4.9867430789163336e-05, "step": 980 }, { "epoch": 0.122625, "grad_norm": 2.638735294342041, "grad_norm_var": 0.40042645398898813, "learning_rate": 0.0001, "loss": 1.3063, "loss/crossentropy": 2.5130980014801025, "loss/hidden": 1.1328125, "loss/logits": 0.17303845286369324, "loss/reg": 4.985083796782419e-05, "step": 981 }, { "epoch": 0.12275, "grad_norm": 2.5330209732055664, "grad_norm_var": 0.40159028364493377, "learning_rate": 0.0001, "loss": 1.2562, "loss/crossentropy": 2.6750576496124268, "loss/hidden": 1.0859375, "loss/logits": 0.169732928276062, "loss/reg": 4.983275357517414e-05, "step": 982 }, { "epoch": 0.122875, "grad_norm": 2.729797840118408, "grad_norm_var": 0.4016784804234855, "learning_rate": 0.0001, "loss": 1.2532, "loss/crossentropy": 2.4468331336975098, "loss/hidden": 1.0859375, "loss/logits": 0.1667352169752121, "loss/reg": 4.9810380005510524e-05, "step": 983 }, { "epoch": 0.123, "grad_norm": 2.0886266231536865, "grad_norm_var": 0.368417637633604, "learning_rate": 0.0001, "loss": 1.153, "loss/crossentropy": 2.3423781394958496, "loss/hidden": 1.0078125, "loss/logits": 0.14471980929374695, "loss/reg": 4.978798096999526e-05, "step": 984 }, { "epoch": 0.123125, "grad_norm": 4.569364070892334, "grad_norm_var": 0.5842302621168443, "learning_rate": 0.0001, "loss": 2.2018, "loss/crossentropy": 3.0823965072631836, "loss/hidden": 1.859375, "loss/logits": 0.3418978154659271, "loss/reg": 4.976466516382061e-05, "step": 985 }, { "epoch": 0.12325, "grad_norm": 2.2796969413757324, "grad_norm_var": 0.5941400852345159, "learning_rate": 0.0001, "loss": 1.3454, "loss/crossentropy": 2.238121509552002, "loss/hidden": 1.171875, "loss/logits": 0.17299339175224304, "loss/reg": 4.9749010941013694e-05, "step": 986 }, { "epoch": 0.123375, "grad_norm": 2.2396790981292725, "grad_norm_var": 0.3594793940281323, "learning_rate": 0.0001, "loss": 1.2469, "loss/crossentropy": 2.4478795528411865, "loss/hidden": 1.0703125, "loss/logits": 0.1760822981595993, "loss/reg": 4.9731748731574044e-05, "step": 987 }, { "epoch": 0.1235, "grad_norm": 2.769818067550659, "grad_norm_var": 0.3582948597206647, "learning_rate": 0.0001, "loss": 1.4051, "loss/crossentropy": 2.5798401832580566, "loss/hidden": 1.1875, "loss/logits": 0.2170933485031128, "loss/reg": 4.971622547600418e-05, "step": 988 }, { "epoch": 0.123625, "grad_norm": 7.930779457092285, "grad_norm_var": 2.038968644334443, "learning_rate": 0.0001, "loss": 1.7555, "loss/crossentropy": 2.4812231063842773, "loss/hidden": 1.5234375, "loss/logits": 0.23152852058410645, "loss/reg": 4.969895235262811e-05, "step": 989 }, { "epoch": 0.12375, "grad_norm": 2.6717734336853027, "grad_norm_var": 2.047056614809465, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.765683889389038, "loss/hidden": 1.109375, "loss/logits": 0.1938176304101944, "loss/reg": 4.968183202436194e-05, "step": 990 }, { "epoch": 0.123875, "grad_norm": 2.1470675468444824, "grad_norm_var": 2.0698644668564423, "learning_rate": 0.0001, "loss": 1.1412, "loss/crossentropy": 2.3901665210723877, "loss/hidden": 1.0078125, "loss/logits": 0.13293424248695374, "loss/reg": 4.966145206708461e-05, "step": 991 }, { "epoch": 0.124, "grad_norm": 3.5537383556365967, "grad_norm_var": 2.0602370425069307, "learning_rate": 0.0001, "loss": 1.5701, "loss/crossentropy": 2.7112905979156494, "loss/hidden": 1.3125, "loss/logits": 0.2570686340332031, "loss/reg": 4.964391700923443e-05, "step": 992 }, { "epoch": 0.124125, "grad_norm": 2.8144960403442383, "grad_norm_var": 2.0642459406096196, "learning_rate": 0.0001, "loss": 1.4034, "loss/crossentropy": 2.398422956466675, "loss/hidden": 1.1796875, "loss/logits": 0.2232397496700287, "loss/reg": 4.962670573149808e-05, "step": 993 }, { "epoch": 0.12425, "grad_norm": 2.438244342803955, "grad_norm_var": 2.0880153272663686, "learning_rate": 0.0001, "loss": 1.2807, "loss/crossentropy": 2.8859853744506836, "loss/hidden": 1.09375, "loss/logits": 0.186467245221138, "loss/reg": 4.960416481480934e-05, "step": 994 }, { "epoch": 0.124375, "grad_norm": 2.4503746032714844, "grad_norm_var": 2.0931673054725968, "learning_rate": 0.0001, "loss": 1.2689, "loss/crossentropy": 2.5799286365509033, "loss/hidden": 1.0859375, "loss/logits": 0.18246138095855713, "loss/reg": 4.958092904416844e-05, "step": 995 }, { "epoch": 0.1245, "grad_norm": 2.7326478958129883, "grad_norm_var": 2.0680926796305954, "learning_rate": 0.0001, "loss": 1.2971, "loss/crossentropy": 2.5156333446502686, "loss/hidden": 1.1015625, "loss/logits": 0.19501572847366333, "loss/reg": 4.9566217057872564e-05, "step": 996 }, { "epoch": 0.124625, "grad_norm": 2.3510055541992188, "grad_norm_var": 2.0885360429345687, "learning_rate": 0.0001, "loss": 1.3411, "loss/crossentropy": 2.4484684467315674, "loss/hidden": 1.15625, "loss/logits": 0.1843767762184143, "loss/reg": 4.954791802447289e-05, "step": 997 }, { "epoch": 0.12475, "grad_norm": 2.5405001640319824, "grad_norm_var": 2.088055149578788, "learning_rate": 0.0001, "loss": 1.3355, "loss/crossentropy": 2.7601253986358643, "loss/hidden": 1.1328125, "loss/logits": 0.2021464705467224, "loss/reg": 4.952655945089646e-05, "step": 998 }, { "epoch": 0.124875, "grad_norm": 2.3194994926452637, "grad_norm_var": 2.1144102611494255, "learning_rate": 0.0001, "loss": 1.3942, "loss/crossentropy": 2.220510721206665, "loss/hidden": 1.203125, "loss/logits": 0.19055192172527313, "loss/reg": 4.9505808419780806e-05, "step": 999 }, { "epoch": 0.125, "grad_norm": 2.5603721141815186, "grad_norm_var": 2.071398101249299, "learning_rate": 0.0001, "loss": 1.36, "loss/crossentropy": 2.8782269954681396, "loss/hidden": 1.1484375, "loss/logits": 0.2110556960105896, "loss/reg": 4.9486519856145605e-05, "step": 1000 }, { "epoch": 0.125125, "grad_norm": 2.2395670413970947, "grad_norm_var": 1.9303038412412072, "learning_rate": 0.0001, "loss": 1.2975, "loss/crossentropy": 2.69596529006958, "loss/hidden": 1.125, "loss/logits": 0.1720152497291565, "loss/reg": 4.9466805648989975e-05, "step": 1001 }, { "epoch": 0.12525, "grad_norm": 2.61370849609375, "grad_norm_var": 1.9106555491120425, "learning_rate": 0.0001, "loss": 1.3189, "loss/crossentropy": 2.220857620239258, "loss/hidden": 1.1328125, "loss/logits": 0.1856231689453125, "loss/reg": 4.944577085552737e-05, "step": 1002 }, { "epoch": 0.125375, "grad_norm": 2.382514715194702, "grad_norm_var": 1.8993868437643942, "learning_rate": 0.0001, "loss": 1.3043, "loss/crossentropy": 2.4896750450134277, "loss/hidden": 1.1171875, "loss/logits": 0.18666106462478638, "loss/reg": 4.9421672883909196e-05, "step": 1003 }, { "epoch": 0.1255, "grad_norm": 2.090632200241089, "grad_norm_var": 1.9406638681660182, "learning_rate": 0.0001, "loss": 1.223, "loss/crossentropy": 2.4760797023773193, "loss/hidden": 1.0546875, "loss/logits": 0.1677936315536499, "loss/reg": 4.9405876779928803e-05, "step": 1004 }, { "epoch": 0.125625, "grad_norm": 2.207566976547241, "grad_norm_var": 0.12204364862270628, "learning_rate": 0.0001, "loss": 1.2431, "loss/crossentropy": 2.4455795288085938, "loss/hidden": 1.0703125, "loss/logits": 0.17229218780994415, "loss/reg": 4.938853089697659e-05, "step": 1005 }, { "epoch": 0.12575, "grad_norm": 2.572543144226074, "grad_norm_var": 0.12048040871569204, "learning_rate": 0.0001, "loss": 1.4118, "loss/crossentropy": 2.363635540008545, "loss/hidden": 1.1875, "loss/logits": 0.22376522421836853, "loss/reg": 4.936993354931474e-05, "step": 1006 }, { "epoch": 0.125875, "grad_norm": 1.9542391300201416, "grad_norm_var": 0.13190165361677916, "learning_rate": 0.0001, "loss": 1.3172, "loss/crossentropy": 2.2691657543182373, "loss/hidden": 1.15625, "loss/logits": 0.1604304313659668, "loss/reg": 4.934666503686458e-05, "step": 1007 }, { "epoch": 0.126, "grad_norm": 2.453639268875122, "grad_norm_var": 0.05134304514072573, "learning_rate": 0.0001, "loss": 1.1705, "loss/crossentropy": 2.5322656631469727, "loss/hidden": 1.03125, "loss/logits": 0.1387622058391571, "loss/reg": 4.932629963150248e-05, "step": 1008 }, { "epoch": 0.126125, "grad_norm": 4.550439834594727, "grad_norm_var": 0.33097413609364873, "learning_rate": 0.0001, "loss": 1.7293, "loss/crossentropy": 2.4164276123046875, "loss/hidden": 1.4609375, "loss/logits": 0.26790663599967957, "loss/reg": 4.9302150728181005e-05, "step": 1009 }, { "epoch": 0.12625, "grad_norm": 2.1454691886901855, "grad_norm_var": 0.33985839605951634, "learning_rate": 0.0001, "loss": 1.2495, "loss/crossentropy": 2.3248133659362793, "loss/hidden": 1.09375, "loss/logits": 0.15521638095378876, "loss/reg": 4.9280359235126525e-05, "step": 1010 }, { "epoch": 0.126375, "grad_norm": 3.380275249481201, "grad_norm_var": 0.38647376277448847, "learning_rate": 0.0001, "loss": 1.4021, "loss/crossentropy": 2.729059934616089, "loss/hidden": 1.203125, "loss/logits": 0.19849233329296112, "loss/reg": 4.9261008825851604e-05, "step": 1011 }, { "epoch": 0.1265, "grad_norm": 2.9117698669433594, "grad_norm_var": 0.39240144713572794, "learning_rate": 0.0001, "loss": 1.4475, "loss/crossentropy": 2.5537755489349365, "loss/hidden": 1.25, "loss/logits": 0.19704851508140564, "loss/reg": 4.923251617583446e-05, "step": 1012 }, { "epoch": 0.126625, "grad_norm": 3.525696277618408, "grad_norm_var": 0.4428399929631228, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.463677406311035, "loss/hidden": 1.171875, "loss/logits": 0.21794721484184265, "loss/reg": 4.921284562442452e-05, "step": 1013 }, { "epoch": 0.12675, "grad_norm": 2.036464214324951, "grad_norm_var": 0.46628060550236017, "learning_rate": 0.0001, "loss": 1.2705, "loss/crossentropy": 2.529911756515503, "loss/hidden": 1.09375, "loss/logits": 0.17623832821846008, "loss/reg": 4.9188893171958625e-05, "step": 1014 }, { "epoch": 0.126875, "grad_norm": 2.6179473400115967, "grad_norm_var": 0.45982904228581656, "learning_rate": 0.0001, "loss": 1.1577, "loss/crossentropy": 2.501230001449585, "loss/hidden": 1.0078125, "loss/logits": 0.1493779569864273, "loss/reg": 4.916240504826419e-05, "step": 1015 }, { "epoch": 0.127, "grad_norm": 2.0810256004333496, "grad_norm_var": 0.47929047113658096, "learning_rate": 0.0001, "loss": 1.342, "loss/crossentropy": 2.4926223754882812, "loss/hidden": 1.15625, "loss/logits": 0.1852131187915802, "loss/reg": 4.9135691369883716e-05, "step": 1016 }, { "epoch": 0.127125, "grad_norm": 7.561760425567627, "grad_norm_var": 1.9866254273241934, "learning_rate": 0.0001, "loss": 2.1473, "loss/crossentropy": 2.5160434246063232, "loss/hidden": 1.75, "loss/logits": 0.3968074321746826, "loss/reg": 4.910998904961161e-05, "step": 1017 }, { "epoch": 0.12725, "grad_norm": 2.6571788787841797, "grad_norm_var": 1.9848357777071584, "learning_rate": 0.0001, "loss": 1.4137, "loss/crossentropy": 2.503122329711914, "loss/hidden": 1.1875, "loss/logits": 0.2257142961025238, "loss/reg": 4.908502523903735e-05, "step": 1018 }, { "epoch": 0.127375, "grad_norm": 2.9156904220581055, "grad_norm_var": 1.962575207346818, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.6177690029144287, "loss/hidden": 1.1015625, "loss/logits": 0.22888478636741638, "loss/reg": 4.906098547508009e-05, "step": 1019 }, { "epoch": 0.1275, "grad_norm": 2.439221143722534, "grad_norm_var": 1.928884650271243, "learning_rate": 0.0001, "loss": 1.3874, "loss/crossentropy": 2.2906293869018555, "loss/hidden": 1.203125, "loss/logits": 0.18382297456264496, "loss/reg": 4.9045229388866574e-05, "step": 1020 }, { "epoch": 0.127625, "grad_norm": 2.555960178375244, "grad_norm_var": 1.899628603116729, "learning_rate": 0.0001, "loss": 1.4984, "loss/crossentropy": 2.429898262023926, "loss/hidden": 1.265625, "loss/logits": 0.2322796881198883, "loss/reg": 4.902849468635395e-05, "step": 1021 }, { "epoch": 0.12775, "grad_norm": 3.4673826694488525, "grad_norm_var": 1.89599455975474, "learning_rate": 0.0001, "loss": 1.3344, "loss/crossentropy": 2.338287353515625, "loss/hidden": 1.140625, "loss/logits": 0.19326989352703094, "loss/reg": 4.901745342067443e-05, "step": 1022 }, { "epoch": 0.127875, "grad_norm": 3.8274197578430176, "grad_norm_var": 1.8345311497725865, "learning_rate": 0.0001, "loss": 1.6114, "loss/crossentropy": 2.693875312805176, "loss/hidden": 1.359375, "loss/logits": 0.2515537440776825, "loss/reg": 4.899735358776525e-05, "step": 1023 }, { "epoch": 0.128, "grad_norm": 2.0832443237304688, "grad_norm_var": 1.8797411681812697, "learning_rate": 0.0001, "loss": 1.2648, "loss/crossentropy": 2.5829262733459473, "loss/hidden": 1.1015625, "loss/logits": 0.16278542578220367, "loss/reg": 4.8977166443364695e-05, "step": 1024 }, { "epoch": 0.128125, "grad_norm": 2.5200939178466797, "grad_norm_var": 1.7643075835401967, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.269489288330078, "loss/hidden": 1.328125, "loss/logits": 0.263778954744339, "loss/reg": 4.896056270808913e-05, "step": 1025 }, { "epoch": 0.12825, "grad_norm": 4.524285793304443, "grad_norm_var": 1.8325406094582772, "learning_rate": 0.0001, "loss": 1.6757, "loss/crossentropy": 2.3938968181610107, "loss/hidden": 1.4375, "loss/logits": 0.23774707317352295, "loss/reg": 4.8936548409983516e-05, "step": 1026 }, { "epoch": 0.128375, "grad_norm": 15.092655181884766, "grad_norm_var": 10.697039493485823, "learning_rate": 0.0001, "loss": 2.1874, "loss/crossentropy": 2.30786395072937, "loss/hidden": 1.96875, "loss/logits": 0.21819457411766052, "loss/reg": 4.8918100219452754e-05, "step": 1027 }, { "epoch": 0.1285, "grad_norm": 4.296483993530273, "grad_norm_var": 10.629602505750489, "learning_rate": 0.0001, "loss": 1.6954, "loss/crossentropy": 2.2013046741485596, "loss/hidden": 1.4765625, "loss/logits": 0.21837672591209412, "loss/reg": 4.8897407395998016e-05, "step": 1028 }, { "epoch": 0.128625, "grad_norm": 2.0280611515045166, "grad_norm_var": 10.867023015671123, "learning_rate": 0.0001, "loss": 1.2187, "loss/crossentropy": 2.3579764366149902, "loss/hidden": 1.046875, "loss/logits": 0.17130854725837708, "loss/reg": 4.887663453700952e-05, "step": 1029 }, { "epoch": 0.12875, "grad_norm": 3.1296451091766357, "grad_norm_var": 10.66731170329762, "learning_rate": 0.0001, "loss": 1.4486, "loss/crossentropy": 2.4010651111602783, "loss/hidden": 1.2734375, "loss/logits": 0.1747232973575592, "loss/reg": 4.8857429646886885e-05, "step": 1030 }, { "epoch": 0.128875, "grad_norm": 2.333757162094116, "grad_norm_var": 10.724249974607462, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.4793214797973633, "loss/hidden": 1.171875, "loss/logits": 0.21972517669200897, "loss/reg": 4.883726069238037e-05, "step": 1031 }, { "epoch": 0.129, "grad_norm": 2.856081962585449, "grad_norm_var": 10.566625900721201, "learning_rate": 0.0001, "loss": 1.6614, "loss/crossentropy": 2.073331594467163, "loss/hidden": 1.375, "loss/logits": 0.2858877182006836, "loss/reg": 4.881904897047207e-05, "step": 1032 }, { "epoch": 0.129125, "grad_norm": 2.5459816455841064, "grad_norm_var": 9.769079293862243, "learning_rate": 0.0001, "loss": 1.217, "loss/crossentropy": 2.49267315864563, "loss/hidden": 1.046875, "loss/logits": 0.16961437463760376, "loss/reg": 4.880000778939575e-05, "step": 1033 }, { "epoch": 0.12925, "grad_norm": 2.683567523956299, "grad_norm_var": 9.76543758025689, "learning_rate": 0.0001, "loss": 1.4644, "loss/crossentropy": 2.461768627166748, "loss/hidden": 1.2734375, "loss/logits": 0.19044238328933716, "loss/reg": 4.8780999350128695e-05, "step": 1034 }, { "epoch": 0.129375, "grad_norm": 3.0184967517852783, "grad_norm_var": 9.755261948776834, "learning_rate": 0.0001, "loss": 1.6364, "loss/crossentropy": 2.4083235263824463, "loss/hidden": 1.34375, "loss/logits": 0.292146235704422, "loss/reg": 4.8763413360575214e-05, "step": 1035 }, { "epoch": 0.1295, "grad_norm": 2.9849438667297363, "grad_norm_var": 9.681217018438769, "learning_rate": 0.0001, "loss": 1.4003, "loss/crossentropy": 2.462160348892212, "loss/hidden": 1.1875, "loss/logits": 0.21227726340293884, "loss/reg": 4.8741494538262486e-05, "step": 1036 }, { "epoch": 0.129625, "grad_norm": 2.4786312580108643, "grad_norm_var": 9.69386845836604, "learning_rate": 0.0001, "loss": 1.4985, "loss/crossentropy": 2.27028489112854, "loss/hidden": 1.2578125, "loss/logits": 0.24016925692558289, "loss/reg": 4.8723446525400504e-05, "step": 1037 }, { "epoch": 0.12975, "grad_norm": 2.5076568126678467, "grad_norm_var": 9.78656640570568, "learning_rate": 0.0001, "loss": 1.2807, "loss/crossentropy": 2.4636549949645996, "loss/hidden": 1.09375, "loss/logits": 0.18648827075958252, "loss/reg": 4.8708836402511224e-05, "step": 1038 }, { "epoch": 0.129875, "grad_norm": 5.714248180389404, "grad_norm_var": 10.04567390941182, "learning_rate": 0.0001, "loss": 1.4724, "loss/crossentropy": 2.7502284049987793, "loss/hidden": 1.296875, "loss/logits": 0.1750672310590744, "loss/reg": 4.8695965233491734e-05, "step": 1039 }, { "epoch": 0.13, "grad_norm": 5.00759220123291, "grad_norm_var": 9.910829392800613, "learning_rate": 0.0001, "loss": 1.9101, "loss/crossentropy": 2.573537826538086, "loss/hidden": 1.640625, "loss/logits": 0.2690110206604004, "loss/reg": 4.868546238867566e-05, "step": 1040 }, { "epoch": 0.130125, "grad_norm": 2.8802566528320312, "grad_norm_var": 9.848702943805838, "learning_rate": 0.0001, "loss": 1.2756, "loss/crossentropy": 2.78995680809021, "loss/hidden": 1.078125, "loss/logits": 0.19703038036823273, "loss/reg": 4.867479583481327e-05, "step": 1041 }, { "epoch": 0.13025, "grad_norm": 2.6452934741973877, "grad_norm_var": 9.939305055834435, "learning_rate": 0.0001, "loss": 1.5079, "loss/crossentropy": 2.455906867980957, "loss/hidden": 1.265625, "loss/logits": 0.24182063341140747, "loss/reg": 4.866902600042522e-05, "step": 1042 }, { "epoch": 0.130375, "grad_norm": 2.370833396911621, "grad_norm_var": 1.0482923897399208, "learning_rate": 0.0001, "loss": 1.2825, "loss/crossentropy": 2.5100162029266357, "loss/hidden": 1.1015625, "loss/logits": 0.18049922585487366, "loss/reg": 4.866467497777194e-05, "step": 1043 }, { "epoch": 0.1305, "grad_norm": 2.5938727855682373, "grad_norm_var": 0.9561722032163684, "learning_rate": 0.0001, "loss": 1.4683, "loss/crossentropy": 2.258338212966919, "loss/hidden": 1.2578125, "loss/logits": 0.2099648416042328, "loss/reg": 4.86554745293688e-05, "step": 1044 }, { "epoch": 0.130625, "grad_norm": 2.603637218475342, "grad_norm_var": 0.9033481292550277, "learning_rate": 0.0001, "loss": 1.3901, "loss/crossentropy": 2.691969871520996, "loss/hidden": 1.1875, "loss/logits": 0.202120840549469, "loss/reg": 4.865137816523202e-05, "step": 1045 }, { "epoch": 0.13075, "grad_norm": 2.4388020038604736, "grad_norm_var": 0.9232760601983566, "learning_rate": 0.0001, "loss": 1.2329, "loss/crossentropy": 2.3171486854553223, "loss/hidden": 1.0625, "loss/logits": 0.16995665431022644, "loss/reg": 4.864386573899537e-05, "step": 1046 }, { "epoch": 0.130875, "grad_norm": 2.494767904281616, "grad_norm_var": 0.9110446675235295, "learning_rate": 0.0001, "loss": 1.6476, "loss/crossentropy": 2.3494441509246826, "loss/hidden": 1.3984375, "loss/logits": 0.24871033430099487, "loss/reg": 4.862891000811942e-05, "step": 1047 }, { "epoch": 0.131, "grad_norm": 49.92427444458008, "grad_norm_var": 138.5400442659784, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.663750648498535, "loss/hidden": 1.1328125, "loss/logits": 0.16641995310783386, "loss/reg": 4.861348133999854e-05, "step": 1048 }, { "epoch": 0.131125, "grad_norm": 2.3657724857330322, "grad_norm_var": 138.62340409534175, "learning_rate": 0.0001, "loss": 1.6284, "loss/crossentropy": 2.2861435413360596, "loss/hidden": 1.359375, "loss/logits": 0.2685868740081787, "loss/reg": 4.8597343266010284e-05, "step": 1049 }, { "epoch": 0.13125, "grad_norm": 2.4345011711120605, "grad_norm_var": 138.73474415279924, "learning_rate": 0.0001, "loss": 1.4715, "loss/crossentropy": 2.8440446853637695, "loss/hidden": 1.2265625, "loss/logits": 0.24448415637016296, "loss/reg": 4.858469037571922e-05, "step": 1050 }, { "epoch": 0.131375, "grad_norm": 2.356966018676758, "grad_norm_var": 139.01660648328053, "learning_rate": 0.0001, "loss": 1.225, "loss/crossentropy": 2.7689743041992188, "loss/hidden": 1.0625, "loss/logits": 0.16196824610233307, "loss/reg": 4.856809391640127e-05, "step": 1051 }, { "epoch": 0.1315, "grad_norm": 4.138930797576904, "grad_norm_var": 138.65706217924568, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.3509294986724854, "loss/hidden": 1.2734375, "loss/logits": 0.18540024757385254, "loss/reg": 4.855730730923824e-05, "step": 1052 }, { "epoch": 0.131625, "grad_norm": 3.313295364379883, "grad_norm_var": 138.315976999055, "learning_rate": 0.0001, "loss": 1.2929, "loss/crossentropy": 2.486267566680908, "loss/hidden": 1.125, "loss/logits": 0.16744676232337952, "loss/reg": 4.854327562497929e-05, "step": 1053 }, { "epoch": 0.13175, "grad_norm": 2.193730354309082, "grad_norm_var": 138.46776734896258, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.776700496673584, "loss/hidden": 0.97265625, "loss/logits": 0.12648262083530426, "loss/reg": 4.8529618652537465e-05, "step": 1054 }, { "epoch": 0.131875, "grad_norm": 5.068596839904785, "grad_norm_var": 138.51560574772913, "learning_rate": 0.0001, "loss": 2.2526, "loss/crossentropy": 2.9333205223083496, "loss/hidden": 1.890625, "loss/logits": 0.36145755648612976, "loss/reg": 4.8516998504055664e-05, "step": 1055 }, { "epoch": 0.132, "grad_norm": 2.5417041778564453, "grad_norm_var": 139.19791301979916, "learning_rate": 0.0001, "loss": 1.4094, "loss/crossentropy": 2.6531319618225098, "loss/hidden": 1.1875, "loss/logits": 0.2214501053094864, "loss/reg": 4.8505764425499365e-05, "step": 1056 }, { "epoch": 0.132125, "grad_norm": 2.918754816055298, "grad_norm_var": 139.18315783121213, "learning_rate": 0.0001, "loss": 1.9489, "loss/crossentropy": 2.144564151763916, "loss/hidden": 1.6484375, "loss/logits": 0.29992836713790894, "loss/reg": 4.849358811043203e-05, "step": 1057 }, { "epoch": 0.13225, "grad_norm": 2.9262378215789795, "grad_norm_var": 139.07084575115195, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.6257550716400146, "loss/hidden": 1.2890625, "loss/logits": 0.20195814967155457, "loss/reg": 4.848901880905032e-05, "step": 1058 }, { "epoch": 0.132375, "grad_norm": 2.1910111904144287, "grad_norm_var": 139.15491264737338, "learning_rate": 0.0001, "loss": 1.3347, "loss/crossentropy": 2.4378821849823, "loss/hidden": 1.140625, "loss/logits": 0.19360756874084473, "loss/reg": 4.8472938942722976e-05, "step": 1059 }, { "epoch": 0.1325, "grad_norm": 2.1801326274871826, "grad_norm_var": 139.34146098904594, "learning_rate": 0.0001, "loss": 1.346, "loss/crossentropy": 2.6777944564819336, "loss/hidden": 1.1328125, "loss/logits": 0.21269740164279938, "loss/reg": 4.8464396968483925e-05, "step": 1060 }, { "epoch": 0.132625, "grad_norm": 2.5178987979888916, "grad_norm_var": 139.37795408866836, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.4651262760162354, "loss/hidden": 1.1171875, "loss/logits": 0.18079692125320435, "loss/reg": 4.844791328650899e-05, "step": 1061 }, { "epoch": 0.13275, "grad_norm": 2.6272640228271484, "grad_norm_var": 139.29696084046898, "learning_rate": 0.0001, "loss": 1.2467, "loss/crossentropy": 2.454970121383667, "loss/hidden": 1.078125, "loss/logits": 0.16807857155799866, "loss/reg": 4.843333954340778e-05, "step": 1062 }, { "epoch": 0.132875, "grad_norm": 3.2516655921936035, "grad_norm_var": 139.00302706804425, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.5497400760650635, "loss/hidden": 1.34375, "loss/logits": 0.2356692999601364, "loss/reg": 4.84242700622417e-05, "step": 1063 }, { "epoch": 0.133, "grad_norm": 3.2384908199310303, "grad_norm_var": 0.6208098056257557, "learning_rate": 0.0001, "loss": 1.5082, "loss/crossentropy": 2.2672436237335205, "loss/hidden": 1.296875, "loss/logits": 0.2108006477355957, "loss/reg": 4.841904592467472e-05, "step": 1064 }, { "epoch": 0.133125, "grad_norm": 5.589666366577148, "grad_norm_var": 1.0443921444605961, "learning_rate": 0.0001, "loss": 1.442, "loss/crossentropy": 2.660480499267578, "loss/hidden": 1.265625, "loss/logits": 0.17584452033042908, "loss/reg": 4.8413534386781976e-05, "step": 1065 }, { "epoch": 0.13325, "grad_norm": 2.072868824005127, "grad_norm_var": 1.0843195820781981, "learning_rate": 0.0001, "loss": 1.3041, "loss/crossentropy": 2.5538711547851562, "loss/hidden": 1.1171875, "loss/logits": 0.18640504777431488, "loss/reg": 4.8400739615317434e-05, "step": 1066 }, { "epoch": 0.133375, "grad_norm": 3.5696866512298584, "grad_norm_var": 1.0608700784999008, "learning_rate": 0.0001, "loss": 1.2464, "loss/crossentropy": 2.4085769653320312, "loss/hidden": 1.0859375, "loss/logits": 0.1599891036748886, "loss/reg": 4.838715904043056e-05, "step": 1067 }, { "epoch": 0.1335, "grad_norm": 2.6294519901275635, "grad_norm_var": 1.0034864033576762, "learning_rate": 0.0001, "loss": 1.2444, "loss/crossentropy": 2.6341211795806885, "loss/hidden": 1.0703125, "loss/logits": 0.17361275851726532, "loss/reg": 4.837429878534749e-05, "step": 1068 }, { "epoch": 0.133625, "grad_norm": 2.570727825164795, "grad_norm_var": 1.0120691658735572, "learning_rate": 0.0001, "loss": 1.1159, "loss/crossentropy": 2.428523540496826, "loss/hidden": 0.96484375, "loss/logits": 0.15061385929584503, "loss/reg": 4.8361835069954395e-05, "step": 1069 }, { "epoch": 0.13375, "grad_norm": 2.9712700843811035, "grad_norm_var": 0.9656976354251573, "learning_rate": 0.0001, "loss": 1.7233, "loss/crossentropy": 2.2465953826904297, "loss/hidden": 1.4375, "loss/logits": 0.28532207012176514, "loss/reg": 4.834888386540115e-05, "step": 1070 }, { "epoch": 0.133875, "grad_norm": 2.2101681232452393, "grad_norm_var": 0.7085842206157051, "learning_rate": 0.0001, "loss": 1.2741, "loss/crossentropy": 2.525843381881714, "loss/hidden": 1.109375, "loss/logits": 0.16421258449554443, "loss/reg": 4.834130231756717e-05, "step": 1071 }, { "epoch": 0.134, "grad_norm": 2.3994717597961426, "grad_norm_var": 0.7161776254130849, "learning_rate": 0.0001, "loss": 1.4061, "loss/crossentropy": 2.706874370574951, "loss/hidden": 1.1796875, "loss/logits": 0.2258806824684143, "loss/reg": 4.8336529289372265e-05, "step": 1072 }, { "epoch": 0.134125, "grad_norm": 3.553044080734253, "grad_norm_var": 0.7457380382287337, "learning_rate": 0.0001, "loss": 1.5817, "loss/crossentropy": 2.382352590560913, "loss/hidden": 1.3203125, "loss/logits": 0.2608606517314911, "loss/reg": 4.83308540424332e-05, "step": 1073 }, { "epoch": 0.13425, "grad_norm": 2.454843282699585, "grad_norm_var": 0.7583663462414731, "learning_rate": 0.0001, "loss": 1.3007, "loss/crossentropy": 2.62082839012146, "loss/hidden": 1.1328125, "loss/logits": 0.1674119234085083, "loss/reg": 4.8327397962566465e-05, "step": 1074 }, { "epoch": 0.134375, "grad_norm": 2.4312965869903564, "grad_norm_var": 0.74000585371445, "learning_rate": 0.0001, "loss": 1.633, "loss/crossentropy": 2.220055103302002, "loss/hidden": 1.375, "loss/logits": 0.2574748992919922, "loss/reg": 4.832783088204451e-05, "step": 1075 }, { "epoch": 0.1345, "grad_norm": 2.182128667831421, "grad_norm_var": 0.7398167146684991, "learning_rate": 0.0001, "loss": 1.2387, "loss/crossentropy": 2.543452501296997, "loss/hidden": 1.0625, "loss/logits": 0.17573592066764832, "loss/reg": 4.831590558751486e-05, "step": 1076 }, { "epoch": 0.134625, "grad_norm": 5.24340295791626, "grad_norm_var": 1.068188147100082, "learning_rate": 0.0001, "loss": 1.9491, "loss/crossentropy": 2.9112777709960938, "loss/hidden": 1.6484375, "loss/logits": 0.3002144396305084, "loss/reg": 4.831279147765599e-05, "step": 1077 }, { "epoch": 0.13475, "grad_norm": 7.397027015686035, "grad_norm_var": 2.2134877049840185, "learning_rate": 0.0001, "loss": 2.1408, "loss/crossentropy": 1.7057814598083496, "loss/hidden": 1.8984375, "loss/logits": 0.2418610006570816, "loss/reg": 4.831252226722427e-05, "step": 1078 }, { "epoch": 0.134875, "grad_norm": 2.7105135917663574, "grad_norm_var": 2.2396307633405352, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.462306261062622, "loss/hidden": 1.125, "loss/logits": 0.18726205825805664, "loss/reg": 4.830381294596009e-05, "step": 1079 }, { "epoch": 0.135, "grad_norm": 2.3571834564208984, "grad_norm_var": 2.298516862523117, "learning_rate": 0.0001, "loss": 1.2161, "loss/crossentropy": 2.7559218406677246, "loss/hidden": 1.0546875, "loss/logits": 0.16088250279426575, "loss/reg": 4.828806049772538e-05, "step": 1080 }, { "epoch": 0.135125, "grad_norm": 2.9629805088043213, "grad_norm_var": 1.9178276329659905, "learning_rate": 0.0001, "loss": 1.2992, "loss/crossentropy": 2.526937484741211, "loss/hidden": 1.1171875, "loss/logits": 0.18156106770038605, "loss/reg": 4.828528471989557e-05, "step": 1081 }, { "epoch": 0.13525, "grad_norm": 2.0817148685455322, "grad_norm_var": 1.9166124946652812, "learning_rate": 0.0001, "loss": 1.1721, "loss/crossentropy": 2.451180934906006, "loss/hidden": 1.015625, "loss/logits": 0.1559874713420868, "loss/reg": 4.828141754842363e-05, "step": 1082 }, { "epoch": 0.135375, "grad_norm": 2.3504202365875244, "grad_norm_var": 1.9344384047775915, "learning_rate": 0.0001, "loss": 1.1707, "loss/crossentropy": 2.8747832775115967, "loss/hidden": 1.015625, "loss/logits": 0.1546313613653183, "loss/reg": 4.827776865568012e-05, "step": 1083 }, { "epoch": 0.1355, "grad_norm": 2.17263126373291, "grad_norm_var": 1.971976005860794, "learning_rate": 0.0001, "loss": 1.3889, "loss/crossentropy": 2.723773717880249, "loss/hidden": 1.1796875, "loss/logits": 0.20874163508415222, "loss/reg": 4.827152588404715e-05, "step": 1084 }, { "epoch": 0.135625, "grad_norm": 2.8484668731689453, "grad_norm_var": 1.9607874358662902, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.3394646644592285, "loss/hidden": 1.2578125, "loss/logits": 0.18679557740688324, "loss/reg": 4.826381700695492e-05, "step": 1085 }, { "epoch": 0.13575, "grad_norm": 2.0447754859924316, "grad_norm_var": 2.0205073590326026, "learning_rate": 0.0001, "loss": 1.2954, "loss/crossentropy": 2.6775341033935547, "loss/hidden": 1.1171875, "loss/logits": 0.17772985994815826, "loss/reg": 4.8260139010380954e-05, "step": 1086 }, { "epoch": 0.135875, "grad_norm": 2.259357452392578, "grad_norm_var": 2.015724328520027, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.410541534423828, "loss/hidden": 1.1875, "loss/logits": 0.2242109179496765, "loss/reg": 4.8245739890262485e-05, "step": 1087 }, { "epoch": 0.136, "grad_norm": 2.8727777004241943, "grad_norm_var": 1.9939999196141713, "learning_rate": 0.0001, "loss": 1.313, "loss/crossentropy": 2.701556921005249, "loss/hidden": 1.125, "loss/logits": 0.1874905824661255, "loss/reg": 4.823635390494019e-05, "step": 1088 }, { "epoch": 0.136125, "grad_norm": 2.3779656887054443, "grad_norm_var": 1.9928928653171767, "learning_rate": 0.0001, "loss": 1.2705, "loss/crossentropy": 2.656157970428467, "loss/hidden": 1.1015625, "loss/logits": 0.16845184564590454, "loss/reg": 4.823903873329982e-05, "step": 1089 }, { "epoch": 0.13625, "grad_norm": 3.045264720916748, "grad_norm_var": 1.9779265068353404, "learning_rate": 0.0001, "loss": 1.4187, "loss/crossentropy": 2.417032241821289, "loss/hidden": 1.2265625, "loss/logits": 0.19169865548610687, "loss/reg": 4.823317431146279e-05, "step": 1090 }, { "epoch": 0.136375, "grad_norm": 2.2823352813720703, "grad_norm_var": 1.9897867705807812, "learning_rate": 0.0001, "loss": 1.1742, "loss/crossentropy": 2.6444296836853027, "loss/hidden": 1.03125, "loss/logits": 0.14250019192695618, "loss/reg": 4.8217982111964375e-05, "step": 1091 }, { "epoch": 0.1365, "grad_norm": 3.824678897857666, "grad_norm_var": 1.9903923191744959, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.6071150302886963, "loss/hidden": 1.171875, "loss/logits": 0.19199731945991516, "loss/reg": 4.8215104470727965e-05, "step": 1092 }, { "epoch": 0.136625, "grad_norm": 2.549513578414917, "grad_norm_var": 1.656826383552368, "learning_rate": 0.0001, "loss": 1.3147, "loss/crossentropy": 2.4254698753356934, "loss/hidden": 1.1484375, "loss/logits": 0.16581851243972778, "loss/reg": 4.82136856589932e-05, "step": 1093 }, { "epoch": 0.13675, "grad_norm": 6.654994964599609, "grad_norm_var": 1.244691979844697, "learning_rate": 0.0001, "loss": 1.2894, "loss/crossentropy": 2.5099565982818604, "loss/hidden": 1.125, "loss/logits": 0.1638849973678589, "loss/reg": 4.8204721679212525e-05, "step": 1094 }, { "epoch": 0.136875, "grad_norm": 2.3230693340301514, "grad_norm_var": 1.2606197778757762, "learning_rate": 0.0001, "loss": 1.3462, "loss/crossentropy": 2.512430429458618, "loss/hidden": 1.1484375, "loss/logits": 0.19730091094970703, "loss/reg": 4.8203397454926744e-05, "step": 1095 }, { "epoch": 0.137, "grad_norm": 2.261275053024292, "grad_norm_var": 1.267023668315869, "learning_rate": 0.0001, "loss": 1.2825, "loss/crossentropy": 2.6225202083587646, "loss/hidden": 1.109375, "loss/logits": 0.17268520593643188, "loss/reg": 4.8197605792665854e-05, "step": 1096 }, { "epoch": 0.137125, "grad_norm": 2.195023775100708, "grad_norm_var": 1.2879134307282092, "learning_rate": 0.0001, "loss": 1.3726, "loss/crossentropy": 2.742640256881714, "loss/hidden": 1.15625, "loss/logits": 0.2158210277557373, "loss/reg": 4.818299203179777e-05, "step": 1097 }, { "epoch": 0.13725, "grad_norm": 3.191065549850464, "grad_norm_var": 1.26464759974198, "learning_rate": 0.0001, "loss": 1.3751, "loss/crossentropy": 2.2995545864105225, "loss/hidden": 1.1875, "loss/logits": 0.18707019090652466, "loss/reg": 4.817119406652637e-05, "step": 1098 }, { "epoch": 0.137375, "grad_norm": 2.409144401550293, "grad_norm_var": 1.2611209881187542, "learning_rate": 0.0001, "loss": 1.2264, "loss/crossentropy": 2.800138473510742, "loss/hidden": 1.0703125, "loss/logits": 0.15558959543704987, "loss/reg": 4.815995634999126e-05, "step": 1099 }, { "epoch": 0.1375, "grad_norm": 2.1808769702911377, "grad_norm_var": 1.2604002860858665, "learning_rate": 0.0001, "loss": 1.2945, "loss/crossentropy": 2.440044641494751, "loss/hidden": 1.109375, "loss/logits": 0.18468278646469116, "loss/reg": 4.814787462237291e-05, "step": 1100 }, { "epoch": 0.137625, "grad_norm": 2.423313617706299, "grad_norm_var": 1.2707944512029181, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.3712844848632812, "loss/hidden": 1.203125, "loss/logits": 0.19688689708709717, "loss/reg": 4.81365823361557e-05, "step": 1101 }, { "epoch": 0.13775, "grad_norm": 2.9854326248168945, "grad_norm_var": 1.230627637633081, "learning_rate": 0.0001, "loss": 1.3837, "loss/crossentropy": 2.38476300239563, "loss/hidden": 1.203125, "loss/logits": 0.18007422983646393, "loss/reg": 4.8126166802830994e-05, "step": 1102 }, { "epoch": 0.137875, "grad_norm": 2.0251779556274414, "grad_norm_var": 1.2529580510886253, "learning_rate": 0.0001, "loss": 1.2151, "loss/crossentropy": 2.37416410446167, "loss/hidden": 1.0546875, "loss/logits": 0.15991759300231934, "loss/reg": 4.811226244783029e-05, "step": 1103 }, { "epoch": 0.138, "grad_norm": 2.8228797912597656, "grad_norm_var": 1.2529629166446565, "learning_rate": 0.0001, "loss": 1.3447, "loss/crossentropy": 2.926377296447754, "loss/hidden": 1.1484375, "loss/logits": 0.19579367339611053, "loss/reg": 4.809819074580446e-05, "step": 1104 }, { "epoch": 0.138125, "grad_norm": 2.234987735748291, "grad_norm_var": 1.263182141719084, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.458136558532715, "loss/hidden": 1.0390625, "loss/logits": 0.16551420092582703, "loss/reg": 4.8088575567817315e-05, "step": 1105 }, { "epoch": 0.13825, "grad_norm": 2.385833740234375, "grad_norm_var": 1.272142330921156, "learning_rate": 0.0001, "loss": 1.3504, "loss/crossentropy": 2.558030843734741, "loss/hidden": 1.1484375, "loss/logits": 0.2014528512954712, "loss/reg": 4.807797813555226e-05, "step": 1106 }, { "epoch": 0.138375, "grad_norm": 3.3867454528808594, "grad_norm_var": 1.272610209843083, "learning_rate": 0.0001, "loss": 1.5406, "loss/crossentropy": 2.503751516342163, "loss/hidden": 1.3203125, "loss/logits": 0.21979767084121704, "loss/reg": 4.8066383897094056e-05, "step": 1107 }, { "epoch": 0.1385, "grad_norm": 2.790325880050659, "grad_norm_var": 1.2072459836922649, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.3122177124023438, "loss/hidden": 1.21875, "loss/logits": 0.19064576923847198, "loss/reg": 4.805479693459347e-05, "step": 1108 }, { "epoch": 0.138625, "grad_norm": 2.8147926330566406, "grad_norm_var": 1.202740992103356, "learning_rate": 0.0001, "loss": 1.3872, "loss/crossentropy": 2.715533494949341, "loss/hidden": 1.203125, "loss/logits": 0.18357136845588684, "loss/reg": 4.8043169954326004e-05, "step": 1109 }, { "epoch": 0.13875, "grad_norm": 3.2109782695770264, "grad_norm_var": 0.18202471306677784, "learning_rate": 0.0001, "loss": 1.4888, "loss/crossentropy": 2.2935917377471924, "loss/hidden": 1.296875, "loss/logits": 0.19139942526817322, "loss/reg": 4.802411058335565e-05, "step": 1110 }, { "epoch": 0.138875, "grad_norm": 2.9282126426696777, "grad_norm_var": 0.18236138139209668, "learning_rate": 0.0001, "loss": 1.7108, "loss/crossentropy": 2.1880927085876465, "loss/hidden": 1.4609375, "loss/logits": 0.2493777573108673, "loss/reg": 4.800450551556423e-05, "step": 1111 }, { "epoch": 0.139, "grad_norm": 2.2871787548065186, "grad_norm_var": 0.1810939591389835, "learning_rate": 0.0001, "loss": 1.2958, "loss/crossentropy": 2.5252413749694824, "loss/hidden": 1.109375, "loss/logits": 0.1859796941280365, "loss/reg": 4.798595546162687e-05, "step": 1112 }, { "epoch": 0.139125, "grad_norm": 1.9749583005905151, "grad_norm_var": 0.19723590923434664, "learning_rate": 0.0001, "loss": 1.3081, "loss/crossentropy": 2.4636902809143066, "loss/hidden": 1.1171875, "loss/logits": 0.19039222598075867, "loss/reg": 4.796446955879219e-05, "step": 1113 }, { "epoch": 0.13925, "grad_norm": 4.308104515075684, "grad_norm_var": 0.35904772291688963, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.9549102783203125, "loss/hidden": 1.2109375, "loss/logits": 0.1673247218132019, "loss/reg": 4.794850246980786e-05, "step": 1114 }, { "epoch": 0.139375, "grad_norm": 2.7217111587524414, "grad_norm_var": 0.3531131684974496, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.6249146461486816, "loss/hidden": 1.1953125, "loss/logits": 0.1961226463317871, "loss/reg": 4.7932728193700314e-05, "step": 1115 }, { "epoch": 0.1395, "grad_norm": 2.2082791328430176, "grad_norm_var": 0.35119913605219605, "learning_rate": 0.0001, "loss": 1.3636, "loss/crossentropy": 2.314013719558716, "loss/hidden": 1.1875, "loss/logits": 0.17566710710525513, "loss/reg": 4.791447281604633e-05, "step": 1116 }, { "epoch": 0.139625, "grad_norm": 3.60855770111084, "grad_norm_var": 0.39222276775296944, "learning_rate": 0.0001, "loss": 1.6117, "loss/crossentropy": 2.6529369354248047, "loss/hidden": 1.375, "loss/logits": 0.23623248934745789, "loss/reg": 4.789578088093549e-05, "step": 1117 }, { "epoch": 0.13975, "grad_norm": 2.7757325172424316, "grad_norm_var": 0.3896014903684961, "learning_rate": 0.0001, "loss": 1.2422, "loss/crossentropy": 2.7011404037475586, "loss/hidden": 1.078125, "loss/logits": 0.1636306345462799, "loss/reg": 4.787558282259852e-05, "step": 1118 }, { "epoch": 0.139875, "grad_norm": 2.764423370361328, "grad_norm_var": 0.34932944368847996, "learning_rate": 0.0001, "loss": 1.3844, "loss/crossentropy": 2.8270928859710693, "loss/hidden": 1.1796875, "loss/logits": 0.2042662799358368, "loss/reg": 4.785611236002296e-05, "step": 1119 }, { "epoch": 0.14, "grad_norm": 2.6311185359954834, "grad_norm_var": 0.3517198026279449, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.745588541030884, "loss/hidden": 1.234375, "loss/logits": 0.16527175903320312, "loss/reg": 4.7835168516030535e-05, "step": 1120 }, { "epoch": 0.140125, "grad_norm": 2.0787642002105713, "grad_norm_var": 0.36531621433395045, "learning_rate": 0.0001, "loss": 1.2194, "loss/crossentropy": 2.6111185550689697, "loss/hidden": 1.0546875, "loss/logits": 0.16425329446792603, "loss/reg": 4.781952884513885e-05, "step": 1121 }, { "epoch": 0.14025, "grad_norm": 2.299438238143921, "grad_norm_var": 0.37060818594784395, "learning_rate": 0.0001, "loss": 1.2904, "loss/crossentropy": 2.3531951904296875, "loss/hidden": 1.109375, "loss/logits": 0.18054065108299255, "loss/reg": 4.780311792274006e-05, "step": 1122 }, { "epoch": 0.140375, "grad_norm": 2.123501777648926, "grad_norm_var": 0.3714053097766362, "learning_rate": 0.0001, "loss": 1.2384, "loss/crossentropy": 2.5495216846466064, "loss/hidden": 1.0546875, "loss/logits": 0.1832682490348816, "loss/reg": 4.778530274052173e-05, "step": 1123 }, { "epoch": 0.1405, "grad_norm": 2.496851921081543, "grad_norm_var": 0.3740512666854229, "learning_rate": 0.0001, "loss": 1.3948, "loss/crossentropy": 3.104501962661743, "loss/hidden": 1.203125, "loss/logits": 0.19115296006202698, "loss/reg": 4.776769856107421e-05, "step": 1124 }, { "epoch": 0.140625, "grad_norm": 3.203948497772217, "grad_norm_var": 0.38936697390070374, "learning_rate": 0.0001, "loss": 1.4557, "loss/crossentropy": 2.750394105911255, "loss/hidden": 1.25, "loss/logits": 0.20525826513767242, "loss/reg": 4.7750996600370854e-05, "step": 1125 }, { "epoch": 0.14075, "grad_norm": 2.309300184249878, "grad_norm_var": 0.3819183078721844, "learning_rate": 0.0001, "loss": 1.308, "loss/crossentropy": 2.725071430206299, "loss/hidden": 1.109375, "loss/logits": 0.19812864065170288, "loss/reg": 4.773392356582917e-05, "step": 1126 }, { "epoch": 0.140875, "grad_norm": 3.0331461429595947, "grad_norm_var": 0.3862191141394532, "learning_rate": 0.0001, "loss": 1.1116, "loss/crossentropy": 2.7448618412017822, "loss/hidden": 0.9609375, "loss/logits": 0.1502147614955902, "loss/reg": 4.771806197823025e-05, "step": 1127 }, { "epoch": 0.141, "grad_norm": 6.447192668914795, "grad_norm_var": 1.2518469248685036, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.4020206928253174, "loss/hidden": 1.390625, "loss/logits": 0.19297294318675995, "loss/reg": 4.770241503138095e-05, "step": 1128 }, { "epoch": 0.141125, "grad_norm": 3.1637861728668213, "grad_norm_var": 1.1877543708808314, "learning_rate": 0.0001, "loss": 1.7336, "loss/crossentropy": 2.453598737716675, "loss/hidden": 1.4375, "loss/logits": 0.2955983281135559, "loss/reg": 4.768367944052443e-05, "step": 1129 }, { "epoch": 0.14125, "grad_norm": 2.668591022491455, "grad_norm_var": 1.0721759885566438, "learning_rate": 0.0001, "loss": 1.2714, "loss/crossentropy": 2.3829445838928223, "loss/hidden": 1.1015625, "loss/logits": 0.16937227547168732, "loss/reg": 4.766715210280381e-05, "step": 1130 }, { "epoch": 0.141375, "grad_norm": 2.74442195892334, "grad_norm_var": 1.0716429218470986, "learning_rate": 0.0001, "loss": 1.1947, "loss/crossentropy": 2.5616579055786133, "loss/hidden": 1.03125, "loss/logits": 0.16293516755104065, "loss/reg": 4.76537061331328e-05, "step": 1131 }, { "epoch": 0.1415, "grad_norm": 2.376505136489868, "grad_norm_var": 1.057676108050079, "learning_rate": 0.0001, "loss": 1.2084, "loss/crossentropy": 2.4909427165985107, "loss/hidden": 1.03125, "loss/logits": 0.17672239243984222, "loss/reg": 4.763679316965863e-05, "step": 1132 }, { "epoch": 0.141625, "grad_norm": 3.554159641265869, "grad_norm_var": 1.0528692879887234, "learning_rate": 0.0001, "loss": 1.5053, "loss/crossentropy": 2.6319496631622314, "loss/hidden": 1.2734375, "loss/logits": 0.2313893735408783, "loss/reg": 4.7620640543755144e-05, "step": 1133 }, { "epoch": 0.14175, "grad_norm": 2.9882657527923584, "grad_norm_var": 1.0516912119352735, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.4339728355407715, "loss/hidden": 1.2421875, "loss/logits": 0.22522477805614471, "loss/reg": 4.760462979902513e-05, "step": 1134 }, { "epoch": 0.141875, "grad_norm": 2.1122543811798096, "grad_norm_var": 1.0926904062794485, "learning_rate": 0.0001, "loss": 1.3286, "loss/crossentropy": 2.5192081928253174, "loss/hidden": 1.1328125, "loss/logits": 0.1953265368938446, "loss/reg": 4.758898649015464e-05, "step": 1135 }, { "epoch": 0.142, "grad_norm": 3.5696256160736084, "grad_norm_var": 1.115413644842679, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.285820245742798, "loss/hidden": 1.2109375, "loss/logits": 0.18508857488632202, "loss/reg": 4.756827183882706e-05, "step": 1136 }, { "epoch": 0.142125, "grad_norm": 3.564345359802246, "grad_norm_var": 1.081150356805938, "learning_rate": 0.0001, "loss": 1.6697, "loss/crossentropy": 2.237314462661743, "loss/hidden": 1.4765625, "loss/logits": 0.19265148043632507, "loss/reg": 4.755084592034109e-05, "step": 1137 }, { "epoch": 0.14225, "grad_norm": 2.2548444271087646, "grad_norm_var": 1.085683606162062, "learning_rate": 0.0001, "loss": 1.1842, "loss/crossentropy": 2.4809651374816895, "loss/hidden": 1.03125, "loss/logits": 0.15246474742889404, "loss/reg": 4.753081884700805e-05, "step": 1138 }, { "epoch": 0.142375, "grad_norm": 2.6867403984069824, "grad_norm_var": 1.036820672443695, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.91271710395813, "loss/hidden": 1.125, "loss/logits": 0.17396080493927002, "loss/reg": 4.7511359298368916e-05, "step": 1139 }, { "epoch": 0.1425, "grad_norm": 2.4465537071228027, "grad_norm_var": 1.0408451939787444, "learning_rate": 0.0001, "loss": 1.2231, "loss/crossentropy": 2.5878028869628906, "loss/hidden": 1.0234375, "loss/logits": 0.1992160677909851, "loss/reg": 4.749055369757116e-05, "step": 1140 }, { "epoch": 0.142625, "grad_norm": 4.528571128845215, "grad_norm_var": 1.1741261249691728, "learning_rate": 0.0001, "loss": 1.9586, "loss/crossentropy": 2.373525619506836, "loss/hidden": 1.6953125, "loss/logits": 0.2627871334552765, "loss/reg": 4.7472818550886586e-05, "step": 1141 }, { "epoch": 0.14275, "grad_norm": 2.7354013919830322, "grad_norm_var": 1.1375391518044278, "learning_rate": 0.0001, "loss": 1.3563, "loss/crossentropy": 2.648790121078491, "loss/hidden": 1.1640625, "loss/logits": 0.19176684319972992, "loss/reg": 4.745826663565822e-05, "step": 1142 }, { "epoch": 0.142875, "grad_norm": 2.713024854660034, "grad_norm_var": 1.1501972178076196, "learning_rate": 0.0001, "loss": 1.3524, "loss/crossentropy": 2.4191818237304688, "loss/hidden": 1.171875, "loss/logits": 0.1800951361656189, "loss/reg": 4.743799945572391e-05, "step": 1143 }, { "epoch": 0.143, "grad_norm": 2.346231460571289, "grad_norm_var": 0.40369959007739076, "learning_rate": 0.0001, "loss": 1.2126, "loss/crossentropy": 2.2274231910705566, "loss/hidden": 1.046875, "loss/logits": 0.16525804996490479, "loss/reg": 4.7417455789400265e-05, "step": 1144 }, { "epoch": 0.143125, "grad_norm": 2.744516372680664, "grad_norm_var": 0.40012624841660693, "learning_rate": 0.0001, "loss": 1.603, "loss/crossentropy": 2.5323281288146973, "loss/hidden": 1.34375, "loss/logits": 0.2587292492389679, "loss/reg": 4.739963696920313e-05, "step": 1145 }, { "epoch": 0.14325, "grad_norm": 2.8817338943481445, "grad_norm_var": 0.3970391852633552, "learning_rate": 0.0001, "loss": 1.3588, "loss/crossentropy": 2.4774110317230225, "loss/hidden": 1.15625, "loss/logits": 0.20203325152397156, "loss/reg": 4.737896233564243e-05, "step": 1146 }, { "epoch": 0.143375, "grad_norm": 2.2984557151794434, "grad_norm_var": 0.41815268022830854, "learning_rate": 0.0001, "loss": 1.4007, "loss/crossentropy": 2.5607006549835205, "loss/hidden": 1.1875, "loss/logits": 0.21274057030677795, "loss/reg": 4.7362878831336275e-05, "step": 1147 }, { "epoch": 0.1435, "grad_norm": 2.0969078540802, "grad_norm_var": 0.441159171760544, "learning_rate": 0.0001, "loss": 1.1426, "loss/crossentropy": 2.3691599369049072, "loss/hidden": 0.9921875, "loss/logits": 0.1498948335647583, "loss/reg": 4.734646063297987e-05, "step": 1148 }, { "epoch": 0.143625, "grad_norm": 2.029167652130127, "grad_norm_var": 0.4423349102668773, "learning_rate": 0.0001, "loss": 1.2621, "loss/crossentropy": 2.385744571685791, "loss/hidden": 1.09375, "loss/logits": 0.1678312122821808, "loss/reg": 4.732477100333199e-05, "step": 1149 }, { "epoch": 0.14375, "grad_norm": 1.9239691495895386, "grad_norm_var": 0.47928917254578013, "learning_rate": 0.0001, "loss": 1.0533, "loss/crossentropy": 2.613713502883911, "loss/hidden": 0.91796875, "loss/logits": 0.1348608434200287, "loss/reg": 4.730945511255413e-05, "step": 1150 }, { "epoch": 0.143875, "grad_norm": 2.0164811611175537, "grad_norm_var": 0.4871542069837379, "learning_rate": 0.0001, "loss": 1.342, "loss/crossentropy": 2.0422286987304688, "loss/hidden": 1.1640625, "loss/logits": 0.17750707268714905, "loss/reg": 4.7292855015257373e-05, "step": 1151 }, { "epoch": 0.144, "grad_norm": 2.2749154567718506, "grad_norm_var": 0.437878471389033, "learning_rate": 0.0001, "loss": 1.217, "loss/crossentropy": 2.351731300354004, "loss/hidden": 1.0390625, "loss/logits": 0.17749223113059998, "loss/reg": 4.7281155275413767e-05, "step": 1152 }, { "epoch": 0.144125, "grad_norm": 2.5271859169006348, "grad_norm_var": 0.3712498798941301, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.5493454933166504, "loss/hidden": 1.203125, "loss/logits": 0.23008278012275696, "loss/reg": 4.7269335482269526e-05, "step": 1153 }, { "epoch": 0.14425, "grad_norm": 2.2880935668945312, "grad_norm_var": 0.3700923052297336, "learning_rate": 0.0001, "loss": 1.2332, "loss/crossentropy": 2.7758634090423584, "loss/hidden": 1.0625, "loss/logits": 0.1702326536178589, "loss/reg": 4.725826875073835e-05, "step": 1154 }, { "epoch": 0.144375, "grad_norm": 4.250509262084961, "grad_norm_var": 0.5548537228186133, "learning_rate": 0.0001, "loss": 1.744, "loss/crossentropy": 2.3507156372070312, "loss/hidden": 1.4609375, "loss/logits": 0.2826390266418457, "loss/reg": 4.7243731387425214e-05, "step": 1155 }, { "epoch": 0.1445, "grad_norm": 2.425056219100952, "grad_norm_var": 0.555412315408905, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.7652292251586914, "loss/hidden": 1.0703125, "loss/logits": 0.17255419492721558, "loss/reg": 4.7228229959728196e-05, "step": 1156 }, { "epoch": 0.144625, "grad_norm": 2.7023041248321533, "grad_norm_var": 0.3015625034546047, "learning_rate": 0.0001, "loss": 1.4738, "loss/crossentropy": 2.423006772994995, "loss/hidden": 1.2734375, "loss/logits": 0.1999123990535736, "loss/reg": 4.721076038549654e-05, "step": 1157 }, { "epoch": 0.14475, "grad_norm": 2.7887837886810303, "grad_norm_var": 0.30330314157064175, "learning_rate": 0.0001, "loss": 1.4482, "loss/crossentropy": 2.6713080406188965, "loss/hidden": 1.21875, "loss/logits": 0.2290123999118805, "loss/reg": 4.7194982471410185e-05, "step": 1158 }, { "epoch": 0.144875, "grad_norm": 2.315005302429199, "grad_norm_var": 0.30291867264976885, "learning_rate": 0.0001, "loss": 1.2302, "loss/crossentropy": 2.9456934928894043, "loss/hidden": 1.0625, "loss/logits": 0.16724364459514618, "loss/reg": 4.717716728919186e-05, "step": 1159 }, { "epoch": 0.145, "grad_norm": 20.321826934814453, "grad_norm_var": 20.14308559505051, "learning_rate": 0.0001, "loss": 1.5487, "loss/crossentropy": 2.416504383087158, "loss/hidden": 1.3359375, "loss/logits": 0.21224364638328552, "loss/reg": 4.7157529479591176e-05, "step": 1160 }, { "epoch": 0.145125, "grad_norm": 2.4590742588043213, "grad_norm_var": 20.181414443983293, "learning_rate": 0.0001, "loss": 1.3333, "loss/crossentropy": 2.6246085166931152, "loss/hidden": 1.140625, "loss/logits": 0.1921812891960144, "loss/reg": 4.713963426183909e-05, "step": 1161 }, { "epoch": 0.14525, "grad_norm": 6.681092739105225, "grad_norm_var": 20.719766602422965, "learning_rate": 0.0001, "loss": 1.8658, "loss/crossentropy": 2.900252342224121, "loss/hidden": 1.59375, "loss/logits": 0.2715913951396942, "loss/reg": 4.712406007456593e-05, "step": 1162 }, { "epoch": 0.145375, "grad_norm": 2.4327285289764404, "grad_norm_var": 20.6933411626458, "learning_rate": 0.0001, "loss": 1.4701, "loss/crossentropy": 2.718841314315796, "loss/hidden": 1.25, "loss/logits": 0.21964167058467865, "loss/reg": 4.7109646402532235e-05, "step": 1163 }, { "epoch": 0.1455, "grad_norm": 3.1247708797454834, "grad_norm_var": 20.51968710018897, "learning_rate": 0.0001, "loss": 1.6086, "loss/crossentropy": 2.7490997314453125, "loss/hidden": 1.3515625, "loss/logits": 0.2565191686153412, "loss/reg": 4.709674612968229e-05, "step": 1164 }, { "epoch": 0.145625, "grad_norm": 3.131579875946045, "grad_norm_var": 20.319174937109523, "learning_rate": 0.0001, "loss": 1.4654, "loss/crossentropy": 2.797558546066284, "loss/hidden": 1.2265625, "loss/logits": 0.23839589953422546, "loss/reg": 4.7083391109481454e-05, "step": 1165 }, { "epoch": 0.14575, "grad_norm": 3.672037124633789, "grad_norm_var": 20.031190047272485, "learning_rate": 0.0001, "loss": 1.5083, "loss/crossentropy": 2.388136386871338, "loss/hidden": 1.296875, "loss/logits": 0.21094849705696106, "loss/reg": 4.707129119196907e-05, "step": 1166 }, { "epoch": 0.145875, "grad_norm": 2.3486030101776123, "grad_norm_var": 19.946341680930896, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.45896577835083, "loss/hidden": 1.2109375, "loss/logits": 0.19156183302402496, "loss/reg": 4.7055495087988675e-05, "step": 1167 }, { "epoch": 0.146, "grad_norm": 2.5459513664245605, "grad_norm_var": 19.884653568287856, "learning_rate": 0.0001, "loss": 1.3022, "loss/crossentropy": 2.4945719242095947, "loss/hidden": 1.1328125, "loss/logits": 0.16890111565589905, "loss/reg": 4.704431557911448e-05, "step": 1168 }, { "epoch": 0.146125, "grad_norm": 2.185032606124878, "grad_norm_var": 19.964904994517646, "learning_rate": 0.0001, "loss": 1.3203, "loss/crossentropy": 2.267711877822876, "loss/hidden": 1.125, "loss/logits": 0.19480320811271667, "loss/reg": 4.703476224676706e-05, "step": 1169 }, { "epoch": 0.14625, "grad_norm": 2.453258514404297, "grad_norm_var": 19.926608452200593, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.6074061393737793, "loss/hidden": 1.3046875, "loss/logits": 0.2527133822441101, "loss/reg": 4.7016856115078554e-05, "step": 1170 }, { "epoch": 0.146375, "grad_norm": 2.9324119091033936, "grad_norm_var": 20.011353286130436, "learning_rate": 0.0001, "loss": 1.3058, "loss/crossentropy": 2.5551066398620605, "loss/hidden": 1.140625, "loss/logits": 0.16465714573860168, "loss/reg": 4.6998691686894745e-05, "step": 1171 }, { "epoch": 0.1465, "grad_norm": 2.0241332054138184, "grad_norm_var": 20.107326037621984, "learning_rate": 0.0001, "loss": 1.2586, "loss/crossentropy": 2.7565040588378906, "loss/hidden": 1.0703125, "loss/logits": 0.18783336877822876, "loss/reg": 4.698050543083809e-05, "step": 1172 }, { "epoch": 0.146625, "grad_norm": 2.8346903324127197, "grad_norm_var": 20.08538431269884, "learning_rate": 0.0001, "loss": 1.5072, "loss/crossentropy": 2.2326302528381348, "loss/hidden": 1.28125, "loss/logits": 0.2255048155784607, "loss/reg": 4.696170799434185e-05, "step": 1173 }, { "epoch": 0.14675, "grad_norm": 2.174600601196289, "grad_norm_var": 20.209433008289647, "learning_rate": 0.0001, "loss": 1.182, "loss/crossentropy": 2.47452449798584, "loss/hidden": 1.03125, "loss/logits": 0.15031439065933228, "loss/reg": 4.6942925109760836e-05, "step": 1174 }, { "epoch": 0.146875, "grad_norm": 2.85469651222229, "grad_norm_var": 20.108020405367256, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.4068410396575928, "loss/hidden": 1.1875, "loss/logits": 0.18392156064510345, "loss/reg": 4.692598668043502e-05, "step": 1175 }, { "epoch": 0.147, "grad_norm": 3.0080974102020264, "grad_norm_var": 1.1899183007710952, "learning_rate": 0.0001, "loss": 1.3194, "loss/crossentropy": 2.6427528858184814, "loss/hidden": 1.125, "loss/logits": 0.1939159333705902, "loss/reg": 4.690660352935083e-05, "step": 1176 }, { "epoch": 0.147125, "grad_norm": 1.8658075332641602, "grad_norm_var": 1.2490821768597584, "learning_rate": 0.0001, "loss": 1.3452, "loss/crossentropy": 2.4428977966308594, "loss/hidden": 1.15625, "loss/logits": 0.18848666548728943, "loss/reg": 4.68892467324622e-05, "step": 1177 }, { "epoch": 0.14725, "grad_norm": 2.0562570095062256, "grad_norm_var": 0.2492804212330249, "learning_rate": 0.0001, "loss": 1.26, "loss/crossentropy": 2.4440927505493164, "loss/hidden": 1.078125, "loss/logits": 0.18139450252056122, "loss/reg": 4.6868422941770405e-05, "step": 1178 }, { "epoch": 0.147375, "grad_norm": 2.4248452186584473, "grad_norm_var": 0.24946305945295155, "learning_rate": 0.0001, "loss": 1.3291, "loss/crossentropy": 2.495669364929199, "loss/hidden": 1.140625, "loss/logits": 0.18804886937141418, "loss/reg": 4.6847046178299934e-05, "step": 1179 }, { "epoch": 0.1475, "grad_norm": 2.193629503250122, "grad_norm_var": 0.23878596668150653, "learning_rate": 0.0001, "loss": 1.2206, "loss/crossentropy": 2.5296146869659424, "loss/hidden": 1.0546875, "loss/logits": 0.16542188823223114, "loss/reg": 4.682805956690572e-05, "step": 1180 }, { "epoch": 0.147625, "grad_norm": 2.19338059425354, "grad_norm_var": 0.22031007335769434, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.6205079555511475, "loss/hidden": 1.1796875, "loss/logits": 0.17287296056747437, "loss/reg": 4.68104517494794e-05, "step": 1181 }, { "epoch": 0.14775, "grad_norm": 2.7266340255737305, "grad_norm_var": 0.1265998407663306, "learning_rate": 0.0001, "loss": 1.7818, "loss/crossentropy": 2.054272413253784, "loss/hidden": 1.515625, "loss/logits": 0.26566898822784424, "loss/reg": 4.679444100474939e-05, "step": 1182 }, { "epoch": 0.147875, "grad_norm": 10.878344535827637, "grad_norm_var": 4.585428414128689, "learning_rate": 0.0001, "loss": 2.6308, "loss/crossentropy": 2.638014554977417, "loss/hidden": 2.171875, "loss/logits": 0.4585029184818268, "loss/reg": 4.677903052652255e-05, "step": 1183 }, { "epoch": 0.148, "grad_norm": 2.57718825340271, "grad_norm_var": 4.583767061458208, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.631078004837036, "loss/hidden": 1.203125, "loss/logits": 0.2224457859992981, "loss/reg": 4.676307435147464e-05, "step": 1184 }, { "epoch": 0.148125, "grad_norm": 2.099421262741089, "grad_norm_var": 4.593087690510012, "learning_rate": 0.0001, "loss": 1.1609, "loss/crossentropy": 2.6211211681365967, "loss/hidden": 1.0, "loss/logits": 0.16039219498634338, "loss/reg": 4.674717638408765e-05, "step": 1185 }, { "epoch": 0.14825, "grad_norm": 2.1765975952148438, "grad_norm_var": 4.616419928519187, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.382640838623047, "loss/hidden": 1.09375, "loss/logits": 0.18227428197860718, "loss/reg": 4.673180228564888e-05, "step": 1186 }, { "epoch": 0.148375, "grad_norm": 5.076205253601074, "grad_norm_var": 4.901835733529481, "learning_rate": 0.0001, "loss": 1.5855, "loss/crossentropy": 2.359898328781128, "loss/hidden": 1.28125, "loss/logits": 0.3037688732147217, "loss/reg": 4.6718851081095636e-05, "step": 1187 }, { "epoch": 0.1485, "grad_norm": 3.1118228435516357, "grad_norm_var": 4.823696787247346, "learning_rate": 0.0001, "loss": 1.2903, "loss/crossentropy": 2.5006351470947266, "loss/hidden": 1.109375, "loss/logits": 0.1804269552230835, "loss/reg": 4.67036988993641e-05, "step": 1188 }, { "epoch": 0.148625, "grad_norm": 2.433318614959717, "grad_norm_var": 4.850145380757316, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.6935362815856934, "loss/hidden": 1.140625, "loss/logits": 0.2159041315317154, "loss/reg": 4.669180998462252e-05, "step": 1189 }, { "epoch": 0.14875, "grad_norm": 2.3695571422576904, "grad_norm_var": 4.828058326793973, "learning_rate": 0.0001, "loss": 1.4454, "loss/crossentropy": 2.7090983390808105, "loss/hidden": 1.234375, "loss/logits": 0.2105472981929779, "loss/reg": 4.6679560909979045e-05, "step": 1190 }, { "epoch": 0.148875, "grad_norm": 2.809579372406006, "grad_norm_var": 4.829828812715443, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.592134714126587, "loss/hidden": 1.140625, "loss/logits": 0.18935655057430267, "loss/reg": 4.6665063564432785e-05, "step": 1191 }, { "epoch": 0.149, "grad_norm": 3.6269266605377197, "grad_norm_var": 4.844113927837752, "learning_rate": 0.0001, "loss": 1.5038, "loss/crossentropy": 2.5124716758728027, "loss/hidden": 1.28125, "loss/logits": 0.2220713496208191, "loss/reg": 4.664724110625684e-05, "step": 1192 }, { "epoch": 0.149125, "grad_norm": 2.287263870239258, "grad_norm_var": 4.78228040964938, "learning_rate": 0.0001, "loss": 1.2169, "loss/crossentropy": 2.4629602432250977, "loss/hidden": 1.0546875, "loss/logits": 0.16175468266010284, "loss/reg": 4.6629724238300696e-05, "step": 1193 }, { "epoch": 0.14925, "grad_norm": 2.4541614055633545, "grad_norm_var": 4.732023172385815, "learning_rate": 0.0001, "loss": 1.4253, "loss/crossentropy": 2.29724383354187, "loss/hidden": 1.2421875, "loss/logits": 0.1826433539390564, "loss/reg": 4.6614863094873726e-05, "step": 1194 }, { "epoch": 0.149375, "grad_norm": 2.073838710784912, "grad_norm_var": 4.776700162502281, "learning_rate": 0.0001, "loss": 1.2284, "loss/crossentropy": 2.670344352722168, "loss/hidden": 1.0546875, "loss/logits": 0.17323313653469086, "loss/reg": 4.660046033677645e-05, "step": 1195 }, { "epoch": 0.1495, "grad_norm": 2.8066813945770264, "grad_norm_var": 4.718501570878416, "learning_rate": 0.0001, "loss": 1.3393, "loss/crossentropy": 2.8886072635650635, "loss/hidden": 1.15625, "loss/logits": 0.1826225221157074, "loss/reg": 4.658956459024921e-05, "step": 1196 }, { "epoch": 0.149625, "grad_norm": 3.8294754028320312, "grad_norm_var": 4.6593823625693265, "learning_rate": 0.0001, "loss": 1.5133, "loss/crossentropy": 2.2249755859375, "loss/hidden": 1.265625, "loss/logits": 0.24719995260238647, "loss/reg": 4.657540921471082e-05, "step": 1197 }, { "epoch": 0.14975, "grad_norm": 2.4204254150390625, "grad_norm_var": 4.690022199661033, "learning_rate": 0.0001, "loss": 1.3452, "loss/crossentropy": 2.817233085632324, "loss/hidden": 1.140625, "loss/logits": 0.20412875711917877, "loss/reg": 4.6560286136809736e-05, "step": 1198 }, { "epoch": 0.149875, "grad_norm": 1.9155974388122559, "grad_norm_var": 0.6715669493519023, "learning_rate": 0.0001, "loss": 1.1469, "loss/crossentropy": 2.4378252029418945, "loss/hidden": 0.9921875, "loss/logits": 0.15422964096069336, "loss/reg": 4.654593067243695e-05, "step": 1199 }, { "epoch": 0.15, "grad_norm": 2.4836931228637695, "grad_norm_var": 0.6743205851249285, "learning_rate": 0.0001, "loss": 1.398, "loss/crossentropy": 2.3904969692230225, "loss/hidden": 1.2109375, "loss/logits": 0.1865827888250351, "loss/reg": 4.653136420529336e-05, "step": 1200 }, { "epoch": 0.150125, "grad_norm": 3.4455342292785645, "grad_norm_var": 0.6710901601970398, "learning_rate": 0.0001, "loss": 1.8156, "loss/crossentropy": 2.0689449310302734, "loss/hidden": 1.5234375, "loss/logits": 0.2916872501373291, "loss/reg": 4.651547351386398e-05, "step": 1201 }, { "epoch": 0.15025, "grad_norm": 2.7757627964019775, "grad_norm_var": 0.6411250200226505, "learning_rate": 0.0001, "loss": 1.5134, "loss/crossentropy": 2.8068742752075195, "loss/hidden": 1.28125, "loss/logits": 0.23169484734535217, "loss/reg": 4.650075425161049e-05, "step": 1202 }, { "epoch": 0.150375, "grad_norm": 2.3012502193450928, "grad_norm_var": 0.3061121534919548, "learning_rate": 0.0001, "loss": 1.3758, "loss/crossentropy": 2.697503089904785, "loss/hidden": 1.15625, "loss/logits": 0.2190503478050232, "loss/reg": 4.648489993996918e-05, "step": 1203 }, { "epoch": 0.1505, "grad_norm": 2.1723225116729736, "grad_norm_var": 0.3092592888203074, "learning_rate": 0.0001, "loss": 1.3397, "loss/crossentropy": 2.3358473777770996, "loss/hidden": 1.15625, "loss/logits": 0.18293632566928864, "loss/reg": 4.6468485379591584e-05, "step": 1204 }, { "epoch": 0.150625, "grad_norm": 3.029073715209961, "grad_norm_var": 0.31519634973794436, "learning_rate": 0.0001, "loss": 1.6785, "loss/crossentropy": 2.385075330734253, "loss/hidden": 1.4453125, "loss/logits": 0.23269888758659363, "loss/reg": 4.645344597520307e-05, "step": 1205 }, { "epoch": 0.15075, "grad_norm": 3.408116102218628, "grad_norm_var": 0.3403031929612437, "learning_rate": 0.0001, "loss": 1.6782, "loss/crossentropy": 2.7597949504852295, "loss/hidden": 1.4296875, "loss/logits": 0.24803972244262695, "loss/reg": 4.6434746764134616e-05, "step": 1206 }, { "epoch": 0.150875, "grad_norm": 2.469802141189575, "grad_norm_var": 0.3443656874500211, "learning_rate": 0.0001, "loss": 1.3171, "loss/crossentropy": 2.701486349105835, "loss/hidden": 1.125, "loss/logits": 0.19166097044944763, "loss/reg": 4.641970372176729e-05, "step": 1207 }, { "epoch": 0.151, "grad_norm": 1.992879867553711, "grad_norm_var": 0.31337938768562573, "learning_rate": 0.0001, "loss": 1.2277, "loss/crossentropy": 2.489752769470215, "loss/hidden": 1.0546875, "loss/logits": 0.17256753146648407, "loss/reg": 4.6402536099776626e-05, "step": 1208 }, { "epoch": 0.151125, "grad_norm": 2.535855293273926, "grad_norm_var": 0.30632514875860556, "learning_rate": 0.0001, "loss": 1.3068, "loss/crossentropy": 2.897026777267456, "loss/hidden": 1.1328125, "loss/logits": 0.17352746427059174, "loss/reg": 4.638026439351961e-05, "step": 1209 }, { "epoch": 0.15125, "grad_norm": 2.456472158432007, "grad_norm_var": 0.3062706427848125, "learning_rate": 0.0001, "loss": 1.3139, "loss/crossentropy": 2.676337957382202, "loss/hidden": 1.125, "loss/logits": 0.18842476606369019, "loss/reg": 4.6367600589292124e-05, "step": 1210 }, { "epoch": 0.151375, "grad_norm": 3.8645594120025635, "grad_norm_var": 0.37334871398992014, "learning_rate": 0.0001, "loss": 1.6388, "loss/crossentropy": 2.2264482975006104, "loss/hidden": 1.3984375, "loss/logits": 0.2399100363254547, "loss/reg": 4.6349210606422275e-05, "step": 1211 }, { "epoch": 0.1515, "grad_norm": 3.0219388008117676, "grad_norm_var": 0.37803743581498545, "learning_rate": 0.0001, "loss": 1.5009, "loss/crossentropy": 3.0072405338287354, "loss/hidden": 1.296875, "loss/logits": 0.20359638333320618, "loss/reg": 4.6335251681739464e-05, "step": 1212 }, { "epoch": 0.151625, "grad_norm": 3.7400689125061035, "grad_norm_var": 0.3657602117088911, "learning_rate": 0.0001, "loss": 1.8148, "loss/crossentropy": 2.6893434524536133, "loss/hidden": 1.5390625, "loss/logits": 0.275297075510025, "loss/reg": 4.631928095477633e-05, "step": 1213 }, { "epoch": 0.15175, "grad_norm": 2.2333054542541504, "grad_norm_var": 0.3762232507342539, "learning_rate": 0.0001, "loss": 1.3913, "loss/crossentropy": 2.447011947631836, "loss/hidden": 1.203125, "loss/logits": 0.18775393068790436, "loss/reg": 4.630190960597247e-05, "step": 1214 }, { "epoch": 0.151875, "grad_norm": 2.458369493484497, "grad_norm_var": 0.33494596633352763, "learning_rate": 0.0001, "loss": 1.3824, "loss/crossentropy": 2.6085517406463623, "loss/hidden": 1.171875, "loss/logits": 0.21003778278827667, "loss/reg": 4.628559690900147e-05, "step": 1215 }, { "epoch": 0.152, "grad_norm": 2.192143201828003, "grad_norm_var": 0.35155590225111055, "learning_rate": 0.0001, "loss": 1.4439, "loss/crossentropy": 2.356393575668335, "loss/hidden": 1.234375, "loss/logits": 0.20902931690216064, "loss/reg": 4.626429654308595e-05, "step": 1216 }, { "epoch": 0.152125, "grad_norm": 2.258812665939331, "grad_norm_var": 0.3304848535876232, "learning_rate": 0.0001, "loss": 1.1278, "loss/crossentropy": 2.5807323455810547, "loss/hidden": 0.9765625, "loss/logits": 0.15082184970378876, "loss/reg": 4.624574285116978e-05, "step": 1217 }, { "epoch": 0.15225, "grad_norm": 2.9527230262756348, "grad_norm_var": 0.3346562098525602, "learning_rate": 0.0001, "loss": 1.6524, "loss/crossentropy": 2.108830213546753, "loss/hidden": 1.3671875, "loss/logits": 0.2847442626953125, "loss/reg": 4.6231474698288366e-05, "step": 1218 }, { "epoch": 0.152375, "grad_norm": 2.7327449321746826, "grad_norm_var": 0.32375564974886756, "learning_rate": 0.0001, "loss": 1.5971, "loss/crossentropy": 2.2259297370910645, "loss/hidden": 1.359375, "loss/logits": 0.2372676432132721, "loss/reg": 4.621966218110174e-05, "step": 1219 }, { "epoch": 0.1525, "grad_norm": 2.4477243423461914, "grad_norm_var": 0.3083870484826993, "learning_rate": 0.0001, "loss": 1.3617, "loss/crossentropy": 2.6080846786499023, "loss/hidden": 1.171875, "loss/logits": 0.1893840879201889, "loss/reg": 4.6210447180783376e-05, "step": 1220 }, { "epoch": 0.152625, "grad_norm": 3.3036723136901855, "grad_norm_var": 0.3237876349353845, "learning_rate": 0.0001, "loss": 1.5448, "loss/crossentropy": 2.5666184425354004, "loss/hidden": 1.3046875, "loss/logits": 0.2396336793899536, "loss/reg": 4.619457831722684e-05, "step": 1221 }, { "epoch": 0.15275, "grad_norm": 2.905449867248535, "grad_norm_var": 0.2957611742412664, "learning_rate": 0.0001, "loss": 1.228, "loss/crossentropy": 2.7263970375061035, "loss/hidden": 1.0625, "loss/logits": 0.16505266726016998, "loss/reg": 4.618103776010685e-05, "step": 1222 }, { "epoch": 0.152875, "grad_norm": 2.9071547985076904, "grad_norm_var": 0.29295649472309315, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.2898879051208496, "loss/hidden": 1.1875, "loss/logits": 0.1842312216758728, "loss/reg": 4.6172623115126044e-05, "step": 1223 }, { "epoch": 0.153, "grad_norm": 3.5174379348754883, "grad_norm_var": 0.2842714538848438, "learning_rate": 0.0001, "loss": 1.5973, "loss/crossentropy": 2.2778470516204834, "loss/hidden": 1.390625, "loss/logits": 0.2062428593635559, "loss/reg": 4.616468140739016e-05, "step": 1224 }, { "epoch": 0.153125, "grad_norm": 3.030709981918335, "grad_norm_var": 0.2791441912567497, "learning_rate": 0.0001, "loss": 1.5098, "loss/crossentropy": 2.477696180343628, "loss/hidden": 1.3359375, "loss/logits": 0.17343661189079285, "loss/reg": 4.615149737219326e-05, "step": 1225 }, { "epoch": 0.15325, "grad_norm": 2.864184856414795, "grad_norm_var": 0.2667025408972182, "learning_rate": 0.0001, "loss": 1.232, "loss/crossentropy": 2.1104869842529297, "loss/hidden": 1.0546875, "loss/logits": 0.176896333694458, "loss/reg": 4.613979399437085e-05, "step": 1226 }, { "epoch": 0.153375, "grad_norm": 2.8904950618743896, "grad_norm_var": 0.20098186745316274, "learning_rate": 0.0001, "loss": 1.2044, "loss/crossentropy": 2.806056022644043, "loss/hidden": 1.0546875, "loss/logits": 0.14926595985889435, "loss/reg": 4.612850534613244e-05, "step": 1227 }, { "epoch": 0.1535, "grad_norm": 3.037811040878296, "grad_norm_var": 0.20138040974156488, "learning_rate": 0.0001, "loss": 1.2008, "loss/crossentropy": 2.2585337162017822, "loss/hidden": 1.046875, "loss/logits": 0.15345463156700134, "loss/reg": 4.6119672333588824e-05, "step": 1228 }, { "epoch": 0.153625, "grad_norm": 2.884638786315918, "grad_norm_var": 0.14468985219164002, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.2203421592712402, "loss/hidden": 1.234375, "loss/logits": 0.24218516051769257, "loss/reg": 4.6111101255519316e-05, "step": 1229 }, { "epoch": 0.15375, "grad_norm": 3.115166425704956, "grad_norm_var": 0.12800406371613893, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.7177720069885254, "loss/hidden": 1.1328125, "loss/logits": 0.1902787983417511, "loss/reg": 4.609820825862698e-05, "step": 1230 }, { "epoch": 0.153875, "grad_norm": 2.872023344039917, "grad_norm_var": 0.11744581476523853, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.6862807273864746, "loss/hidden": 1.2421875, "loss/logits": 0.22726118564605713, "loss/reg": 4.608508970704861e-05, "step": 1231 }, { "epoch": 0.154, "grad_norm": 2.3777549266815186, "grad_norm_var": 0.10283428435944278, "learning_rate": 0.0001, "loss": 1.2628, "loss/crossentropy": 2.3350508213043213, "loss/hidden": 1.1015625, "loss/logits": 0.16080161929130554, "loss/reg": 4.6070923417573795e-05, "step": 1232 }, { "epoch": 0.154125, "grad_norm": 2.02388858795166, "grad_norm_var": 0.12577742446186732, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.5232856273651123, "loss/hidden": 1.1328125, "loss/logits": 0.17940716445446014, "loss/reg": 4.606059519574046e-05, "step": 1233 }, { "epoch": 0.15425, "grad_norm": 2.5482373237609863, "grad_norm_var": 0.13135142140041864, "learning_rate": 0.0001, "loss": 1.433, "loss/crossentropy": 2.3807077407836914, "loss/hidden": 1.234375, "loss/logits": 0.19818373024463654, "loss/reg": 4.605152935255319e-05, "step": 1234 }, { "epoch": 0.154375, "grad_norm": 2.203364133834839, "grad_norm_var": 0.15652141199913389, "learning_rate": 0.0001, "loss": 1.2164, "loss/crossentropy": 2.735185146331787, "loss/hidden": 1.0546875, "loss/logits": 0.16127313673496246, "loss/reg": 4.6041928726481274e-05, "step": 1235 }, { "epoch": 0.1545, "grad_norm": 2.136706590652466, "grad_norm_var": 0.17751188961389416, "learning_rate": 0.0001, "loss": 1.0936, "loss/crossentropy": 2.520498037338257, "loss/hidden": 0.953125, "loss/logits": 0.1400274932384491, "loss/reg": 4.6028115320950747e-05, "step": 1236 }, { "epoch": 0.154625, "grad_norm": 54.65066909790039, "grad_norm_var": 168.48549504499928, "learning_rate": 0.0001, "loss": 2.3815, "loss/crossentropy": 2.986631393432617, "loss/hidden": 2.125, "loss/logits": 0.2560478150844574, "loss/reg": 4.601667387760244e-05, "step": 1237 }, { "epoch": 0.15475, "grad_norm": 2.784250497817993, "grad_norm_var": 168.5363861452807, "learning_rate": 0.0001, "loss": 1.4229, "loss/crossentropy": 2.738603115081787, "loss/hidden": 1.1953125, "loss/logits": 0.22707870602607727, "loss/reg": 4.6003900934010744e-05, "step": 1238 }, { "epoch": 0.154875, "grad_norm": 2.7723941802978516, "grad_norm_var": 168.5929190345867, "learning_rate": 0.0001, "loss": 1.4481, "loss/crossentropy": 2.760061740875244, "loss/hidden": 1.2109375, "loss/logits": 0.23672153055667877, "loss/reg": 4.599518433678895e-05, "step": 1239 }, { "epoch": 0.155, "grad_norm": 2.8237342834472656, "grad_norm_var": 168.85093923579436, "learning_rate": 0.0001, "loss": 1.8431, "loss/crossentropy": 2.110563278198242, "loss/hidden": 1.5625, "loss/logits": 0.2801324427127838, "loss/reg": 4.5983535528648645e-05, "step": 1240 }, { "epoch": 0.155125, "grad_norm": 2.586320161819458, "grad_norm_var": 169.03557429254678, "learning_rate": 0.0001, "loss": 1.2401, "loss/crossentropy": 2.4837148189544678, "loss/hidden": 1.078125, "loss/logits": 0.16152258217334747, "loss/reg": 4.59690963907633e-05, "step": 1241 }, { "epoch": 0.15525, "grad_norm": 2.43038272857666, "grad_norm_var": 169.2235486987413, "learning_rate": 0.0001, "loss": 1.3523, "loss/crossentropy": 2.467975378036499, "loss/hidden": 1.171875, "loss/logits": 0.17995983362197876, "loss/reg": 4.59553484688513e-05, "step": 1242 }, { "epoch": 0.155375, "grad_norm": 2.66363525390625, "grad_norm_var": 169.31730109442543, "learning_rate": 0.0001, "loss": 1.3432, "loss/crossentropy": 2.583726167678833, "loss/hidden": 1.1328125, "loss/logits": 0.2098957747220993, "loss/reg": 4.59427283203695e-05, "step": 1243 }, { "epoch": 0.1555, "grad_norm": 2.068486213684082, "grad_norm_var": 169.74199410245245, "learning_rate": 0.0001, "loss": 1.2218, "loss/crossentropy": 2.5289151668548584, "loss/hidden": 1.0546875, "loss/logits": 0.16663600504398346, "loss/reg": 4.592623736243695e-05, "step": 1244 }, { "epoch": 0.155625, "grad_norm": 2.142481565475464, "grad_norm_var": 170.06578252348226, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.624850273132324, "loss/hidden": 1.09375, "loss/logits": 0.1881803572177887, "loss/reg": 4.5915359805803746e-05, "step": 1245 }, { "epoch": 0.15575, "grad_norm": 3.677988052368164, "grad_norm_var": 169.88691935686097, "learning_rate": 0.0001, "loss": 1.4276, "loss/crossentropy": 2.7901053428649902, "loss/hidden": 1.2109375, "loss/logits": 0.2161804437637329, "loss/reg": 4.5899174438090995e-05, "step": 1246 }, { "epoch": 0.155875, "grad_norm": 2.5624334812164307, "grad_norm_var": 170.01367542428878, "learning_rate": 0.0001, "loss": 1.3775, "loss/crossentropy": 2.4989280700683594, "loss/hidden": 1.171875, "loss/logits": 0.20520368218421936, "loss/reg": 4.588349111145362e-05, "step": 1247 }, { "epoch": 0.156, "grad_norm": 2.208247661590576, "grad_norm_var": 170.0923267285343, "learning_rate": 0.0001, "loss": 1.6388, "loss/crossentropy": 2.471464157104492, "loss/hidden": 1.3671875, "loss/logits": 0.27117836475372314, "loss/reg": 4.587349030771293e-05, "step": 1248 }, { "epoch": 0.156125, "grad_norm": 2.627300500869751, "grad_norm_var": 169.81387519584032, "learning_rate": 0.0001, "loss": 1.3158, "loss/crossentropy": 2.702277898788452, "loss/hidden": 1.1171875, "loss/logits": 0.198166161775589, "loss/reg": 4.586328941513784e-05, "step": 1249 }, { "epoch": 0.15625, "grad_norm": 2.6705386638641357, "grad_norm_var": 169.76169576274927, "learning_rate": 0.0001, "loss": 1.5095, "loss/crossentropy": 2.6403021812438965, "loss/hidden": 1.296875, "loss/logits": 0.21217921376228333, "loss/reg": 4.5855969801777974e-05, "step": 1250 }, { "epoch": 0.156375, "grad_norm": 2.4441046714782715, "grad_norm_var": 169.6494513840449, "learning_rate": 0.0001, "loss": 1.4097, "loss/crossentropy": 2.6433634757995605, "loss/hidden": 1.203125, "loss/logits": 0.20615389943122864, "loss/reg": 4.58406975667458e-05, "step": 1251 }, { "epoch": 0.1565, "grad_norm": 2.4537088871002197, "grad_norm_var": 169.49970781805342, "learning_rate": 0.0001, "loss": 1.4022, "loss/crossentropy": 2.6877176761627197, "loss/hidden": 1.1875, "loss/logits": 0.21425047516822815, "loss/reg": 4.58277172583621e-05, "step": 1252 }, { "epoch": 0.156625, "grad_norm": 2.9511280059814453, "grad_norm_var": 0.1417171540344287, "learning_rate": 0.0001, "loss": 1.2724, "loss/crossentropy": 2.56616473197937, "loss/hidden": 1.1015625, "loss/logits": 0.17037302255630493, "loss/reg": 4.5814376790076494e-05, "step": 1253 }, { "epoch": 0.15675, "grad_norm": 2.333775520324707, "grad_norm_var": 0.1443362499984147, "learning_rate": 0.0001, "loss": 1.2437, "loss/crossentropy": 2.5075337886810303, "loss/hidden": 1.078125, "loss/logits": 0.16508585214614868, "loss/reg": 4.5802942622685805e-05, "step": 1254 }, { "epoch": 0.156875, "grad_norm": 2.4434492588043213, "grad_norm_var": 0.14303538209416022, "learning_rate": 0.0001, "loss": 1.2319, "loss/crossentropy": 2.753591537475586, "loss/hidden": 1.0703125, "loss/logits": 0.16108646988868713, "loss/reg": 4.579335654852912e-05, "step": 1255 }, { "epoch": 0.157, "grad_norm": 2.68575119972229, "grad_norm_var": 0.13952007848767303, "learning_rate": 0.0001, "loss": 1.2704, "loss/crossentropy": 2.274879217147827, "loss/hidden": 1.109375, "loss/logits": 0.1605929583311081, "loss/reg": 4.578304287861101e-05, "step": 1256 }, { "epoch": 0.157125, "grad_norm": 1.9222602844238281, "grad_norm_var": 0.16469380439607614, "learning_rate": 0.0001, "loss": 1.1729, "loss/crossentropy": 2.367164134979248, "loss/hidden": 1.03125, "loss/logits": 0.14119181036949158, "loss/reg": 4.576797437039204e-05, "step": 1257 }, { "epoch": 0.15725, "grad_norm": 2.551128387451172, "grad_norm_var": 0.16419677919077813, "learning_rate": 0.0001, "loss": 1.4752, "loss/crossentropy": 2.5899417400360107, "loss/hidden": 1.2109375, "loss/logits": 0.26381391286849976, "loss/reg": 4.5751668949378654e-05, "step": 1258 }, { "epoch": 0.157375, "grad_norm": 2.118729829788208, "grad_norm_var": 0.17271112727045193, "learning_rate": 0.0001, "loss": 1.288, "loss/crossentropy": 2.8331987857818604, "loss/hidden": 1.1015625, "loss/logits": 0.18601545691490173, "loss/reg": 4.5734541345154867e-05, "step": 1259 }, { "epoch": 0.1575, "grad_norm": 2.093118667602539, "grad_norm_var": 0.17136024462738533, "learning_rate": 0.0001, "loss": 1.2783, "loss/crossentropy": 2.581794261932373, "loss/hidden": 1.1015625, "loss/logits": 0.17623090744018555, "loss/reg": 4.572012767312117e-05, "step": 1260 }, { "epoch": 0.157625, "grad_norm": 2.452829122543335, "grad_norm_var": 0.16288042975729317, "learning_rate": 0.0001, "loss": 1.2639, "loss/crossentropy": 2.4306862354278564, "loss/hidden": 1.09375, "loss/logits": 0.16968411207199097, "loss/reg": 4.5699918700847775e-05, "step": 1261 }, { "epoch": 0.15775, "grad_norm": 2.0749716758728027, "grad_norm_var": 0.07433122353475691, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.5065548419952393, "loss/hidden": 1.1328125, "loss/logits": 0.17850646376609802, "loss/reg": 4.568080112221651e-05, "step": 1262 }, { "epoch": 0.157875, "grad_norm": 1.962320327758789, "grad_norm_var": 0.08481014322264452, "learning_rate": 0.0001, "loss": 1.2214, "loss/crossentropy": 2.2913718223571777, "loss/hidden": 1.0625, "loss/logits": 0.15845662355422974, "loss/reg": 4.566472489386797e-05, "step": 1263 }, { "epoch": 0.158, "grad_norm": 2.977077007293701, "grad_norm_var": 0.10470244938228059, "learning_rate": 0.0001, "loss": 1.238, "loss/crossentropy": 2.6589906215667725, "loss/hidden": 1.0859375, "loss/logits": 0.15159624814987183, "loss/reg": 4.56501220469363e-05, "step": 1264 }, { "epoch": 0.158125, "grad_norm": 2.3853960037231445, "grad_norm_var": 0.101758608177992, "learning_rate": 0.0001, "loss": 1.4115, "loss/crossentropy": 2.3630943298339844, "loss/hidden": 1.21875, "loss/logits": 0.1922955960035324, "loss/reg": 4.5634922571480274e-05, "step": 1265 }, { "epoch": 0.15825, "grad_norm": 2.4945428371429443, "grad_norm_var": 0.09752244376290188, "learning_rate": 0.0001, "loss": 1.2384, "loss/crossentropy": 2.578355073928833, "loss/hidden": 1.078125, "loss/logits": 0.1598268449306488, "loss/reg": 4.5620046876138076e-05, "step": 1266 }, { "epoch": 0.158375, "grad_norm": 2.6602394580841064, "grad_norm_var": 0.10181342884065975, "learning_rate": 0.0001, "loss": 1.4403, "loss/crossentropy": 2.4018986225128174, "loss/hidden": 1.25, "loss/logits": 0.18987661600112915, "loss/reg": 4.5601722376886755e-05, "step": 1267 }, { "epoch": 0.1585, "grad_norm": 2.1531805992126465, "grad_norm_var": 0.10570789087357374, "learning_rate": 0.0001, "loss": 1.2901, "loss/crossentropy": 2.259855031967163, "loss/hidden": 1.1171875, "loss/logits": 0.17248710989952087, "loss/reg": 4.558489308692515e-05, "step": 1268 }, { "epoch": 0.158625, "grad_norm": 6.556189060211182, "grad_norm_var": 1.18710927748879, "learning_rate": 0.0001, "loss": 2.2171, "loss/crossentropy": 3.0120301246643066, "loss/hidden": 1.6796875, "loss/logits": 0.5369682312011719, "loss/reg": 4.5569711801363155e-05, "step": 1269 }, { "epoch": 0.15875, "grad_norm": 1.949015736579895, "grad_norm_var": 1.210868993450559, "learning_rate": 0.0001, "loss": 1.1905, "loss/crossentropy": 2.266023635864258, "loss/hidden": 1.0390625, "loss/logits": 0.15093478560447693, "loss/reg": 4.555568375508301e-05, "step": 1270 }, { "epoch": 0.158875, "grad_norm": 2.2778539657592773, "grad_norm_var": 1.2158740780819797, "learning_rate": 0.0001, "loss": 1.3343, "loss/crossentropy": 2.4026403427124023, "loss/hidden": 1.15625, "loss/logits": 0.1775573194026947, "loss/reg": 4.554086262942292e-05, "step": 1271 }, { "epoch": 0.159, "grad_norm": 2.177086591720581, "grad_norm_var": 1.2250197385653658, "learning_rate": 0.0001, "loss": 1.217, "loss/crossentropy": 2.674293279647827, "loss/hidden": 1.0546875, "loss/logits": 0.16184410452842712, "loss/reg": 4.552335667540319e-05, "step": 1272 }, { "epoch": 0.159125, "grad_norm": 8.303696632385254, "grad_norm_var": 3.235757025826163, "learning_rate": 0.0001, "loss": 1.8709, "loss/crossentropy": 2.4282426834106445, "loss/hidden": 1.6328125, "loss/logits": 0.237609401345253, "loss/reg": 4.551160236587748e-05, "step": 1273 }, { "epoch": 0.15925, "grad_norm": 2.404670000076294, "grad_norm_var": 3.244871326073413, "learning_rate": 0.0001, "loss": 1.2833, "loss/crossentropy": 2.5967812538146973, "loss/hidden": 1.09375, "loss/logits": 0.18913458287715912, "loss/reg": 4.549379809759557e-05, "step": 1274 }, { "epoch": 0.159375, "grad_norm": 5.456787109375, "grad_norm_var": 3.575733704160466, "learning_rate": 0.0001, "loss": 1.5074, "loss/crossentropy": 2.9396886825561523, "loss/hidden": 1.3125, "loss/logits": 0.19442874193191528, "loss/reg": 4.547737626126036e-05, "step": 1275 }, { "epoch": 0.1595, "grad_norm": 4.377321720123291, "grad_norm_var": 3.580348684788946, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.5567522048950195, "loss/hidden": 1.15625, "loss/logits": 0.1698286086320877, "loss/reg": 4.5460172259481624e-05, "step": 1276 }, { "epoch": 0.159625, "grad_norm": 2.861487865447998, "grad_norm_var": 3.545091749430006, "learning_rate": 0.0001, "loss": 1.2936, "loss/crossentropy": 2.5974984169006348, "loss/hidden": 1.1328125, "loss/logits": 0.16036538779735565, "loss/reg": 4.5443983253790066e-05, "step": 1277 }, { "epoch": 0.15975, "grad_norm": 2.5587146282196045, "grad_norm_var": 3.4796082011222356, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.2575130462646484, "loss/hidden": 1.1875, "loss/logits": 0.18820002675056458, "loss/reg": 4.5428696466842666e-05, "step": 1278 }, { "epoch": 0.159875, "grad_norm": 2.16434907913208, "grad_norm_var": 3.4448538033595457, "learning_rate": 0.0001, "loss": 1.4059, "loss/crossentropy": 2.222012758255005, "loss/hidden": 1.2109375, "loss/logits": 0.1944960057735443, "loss/reg": 4.5414264604914933e-05, "step": 1279 }, { "epoch": 0.16, "grad_norm": 3.1996350288391113, "grad_norm_var": 3.436590982541448, "learning_rate": 0.0001, "loss": 1.509, "loss/crossentropy": 2.7933011054992676, "loss/hidden": 1.265625, "loss/logits": 0.24294595420360565, "loss/reg": 4.5400753151625395e-05, "step": 1280 }, { "epoch": 0.160125, "grad_norm": 2.10770583152771, "grad_norm_var": 3.4780050157497047, "learning_rate": 0.0001, "loss": 1.2331, "loss/crossentropy": 2.5146090984344482, "loss/hidden": 1.0703125, "loss/logits": 0.16236549615859985, "loss/reg": 4.538971552392468e-05, "step": 1281 }, { "epoch": 0.16025, "grad_norm": 2.3782973289489746, "grad_norm_var": 3.4922079229987424, "learning_rate": 0.0001, "loss": 1.2909, "loss/crossentropy": 2.5038681030273438, "loss/hidden": 1.0859375, "loss/logits": 0.20447075366973877, "loss/reg": 4.53733628091868e-05, "step": 1282 }, { "epoch": 0.160375, "grad_norm": 3.447486162185669, "grad_norm_var": 3.4586315294422163, "learning_rate": 0.0001, "loss": 1.5448, "loss/crossentropy": 2.573171854019165, "loss/hidden": 1.296875, "loss/logits": 0.247447669506073, "loss/reg": 4.53554603154771e-05, "step": 1283 }, { "epoch": 0.1605, "grad_norm": 3.238802194595337, "grad_norm_var": 3.3520558241327163, "learning_rate": 0.0001, "loss": 1.5793, "loss/crossentropy": 3.2341203689575195, "loss/hidden": 1.359375, "loss/logits": 0.2195206880569458, "loss/reg": 4.533507308224216e-05, "step": 1284 }, { "epoch": 0.160625, "grad_norm": 2.7456347942352295, "grad_norm_var": 2.6896300538670217, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.585313081741333, "loss/hidden": 1.296875, "loss/logits": 0.24851781129837036, "loss/reg": 4.5315591705730185e-05, "step": 1285 }, { "epoch": 0.16075, "grad_norm": 2.555786609649658, "grad_norm_var": 2.609164594143196, "learning_rate": 0.0001, "loss": 1.354, "loss/crossentropy": 2.4461801052093506, "loss/hidden": 1.1484375, "loss/logits": 0.20508697628974915, "loss/reg": 4.529834768618457e-05, "step": 1286 }, { "epoch": 0.160875, "grad_norm": 2.5963664054870605, "grad_norm_var": 2.5735421395473925, "learning_rate": 0.0001, "loss": 1.318, "loss/crossentropy": 2.4623584747314453, "loss/hidden": 1.125, "loss/logits": 0.19257178902626038, "loss/reg": 4.528291538008489e-05, "step": 1287 }, { "epoch": 0.161, "grad_norm": 2.6578991413116455, "grad_norm_var": 2.5169090388190605, "learning_rate": 0.0001, "loss": 1.3076, "loss/crossentropy": 2.818201780319214, "loss/hidden": 1.125, "loss/logits": 0.18214286863803864, "loss/reg": 4.5269247493706644e-05, "step": 1288 }, { "epoch": 0.161125, "grad_norm": 2.2922048568725586, "grad_norm_var": 0.7776683827363757, "learning_rate": 0.0001, "loss": 1.2893, "loss/crossentropy": 2.677027940750122, "loss/hidden": 1.125, "loss/logits": 0.1638355702161789, "loss/reg": 4.525210533756763e-05, "step": 1289 }, { "epoch": 0.16125, "grad_norm": 2.2950809001922607, "grad_norm_var": 0.7862440467010436, "learning_rate": 0.0001, "loss": 1.5369, "loss/crossentropy": 2.3946428298950195, "loss/hidden": 1.3046875, "loss/logits": 0.2318038046360016, "loss/reg": 4.523458119365387e-05, "step": 1290 }, { "epoch": 0.161375, "grad_norm": 2.176048517227173, "grad_norm_var": 0.35511413265154773, "learning_rate": 0.0001, "loss": 1.2962, "loss/crossentropy": 2.3253607749938965, "loss/hidden": 1.125, "loss/logits": 0.17075207829475403, "loss/reg": 4.521861774264835e-05, "step": 1291 }, { "epoch": 0.1615, "grad_norm": 13.866178512573242, "grad_norm_var": 8.06882346208631, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.5671613216400146, "loss/hidden": 1.1953125, "loss/logits": 0.1832769513130188, "loss/reg": 4.5201886678114533e-05, "step": 1292 }, { "epoch": 0.161625, "grad_norm": 2.2611589431762695, "grad_norm_var": 8.128157666131413, "learning_rate": 0.0001, "loss": 1.3256, "loss/crossentropy": 2.3536336421966553, "loss/hidden": 1.1484375, "loss/logits": 0.176661878824234, "loss/reg": 4.518694549915381e-05, "step": 1293 }, { "epoch": 0.16175, "grad_norm": 2.574432134628296, "grad_norm_var": 8.126653496369219, "learning_rate": 0.0001, "loss": 1.6454, "loss/crossentropy": 2.236233711242676, "loss/hidden": 1.3828125, "loss/logits": 0.2621437907218933, "loss/reg": 4.517089473665692e-05, "step": 1294 }, { "epoch": 0.161875, "grad_norm": 2.8371546268463135, "grad_norm_var": 8.054430963011198, "learning_rate": 0.0001, "loss": 1.1682, "loss/crossentropy": 2.739943027496338, "loss/hidden": 1.015625, "loss/logits": 0.15212970972061157, "loss/reg": 4.515535692917183e-05, "step": 1295 }, { "epoch": 0.162, "grad_norm": 2.6954469680786133, "grad_norm_var": 8.078871991774545, "learning_rate": 0.0001, "loss": 1.3803, "loss/crossentropy": 2.3508806228637695, "loss/hidden": 1.1953125, "loss/logits": 0.18451997637748718, "loss/reg": 4.513954991125502e-05, "step": 1296 }, { "epoch": 0.162125, "grad_norm": 3.7373878955841064, "grad_norm_var": 7.986798008871676, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.8382225036621094, "loss/hidden": 1.25, "loss/logits": 0.21551957726478577, "loss/reg": 4.512203304329887e-05, "step": 1297 }, { "epoch": 0.16225, "grad_norm": 4.052486419677734, "grad_norm_var": 7.934532747645363, "learning_rate": 0.0001, "loss": 1.2291, "loss/crossentropy": 2.3494935035705566, "loss/hidden": 1.078125, "loss/logits": 0.1504923701286316, "loss/reg": 4.510927101364359e-05, "step": 1298 }, { "epoch": 0.162375, "grad_norm": 2.686509132385254, "grad_norm_var": 7.976241291204222, "learning_rate": 0.0001, "loss": 1.2428, "loss/crossentropy": 2.5574803352355957, "loss/hidden": 1.078125, "loss/logits": 0.1642056405544281, "loss/reg": 4.508991332841106e-05, "step": 1299 }, { "epoch": 0.1625, "grad_norm": 2.7092995643615723, "grad_norm_var": 8.008977847206319, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.4917750358581543, "loss/hidden": 1.140625, "loss/logits": 0.19108353555202484, "loss/reg": 4.507151243160479e-05, "step": 1300 }, { "epoch": 0.162625, "grad_norm": 3.4720537662506104, "grad_norm_var": 7.976526433044462, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.49690318107605, "loss/hidden": 1.1796875, "loss/logits": 0.17998597025871277, "loss/reg": 4.505674951360561e-05, "step": 1301 }, { "epoch": 0.16275, "grad_norm": 2.996471881866455, "grad_norm_var": 7.935146933941526, "learning_rate": 0.0001, "loss": 1.2871, "loss/crossentropy": 2.723876714706421, "loss/hidden": 1.109375, "loss/logits": 0.1772668957710266, "loss/reg": 4.5041644625598565e-05, "step": 1302 }, { "epoch": 0.162875, "grad_norm": 3.104691505432129, "grad_norm_var": 7.890448726347673, "learning_rate": 0.0001, "loss": 1.6074, "loss/crossentropy": 2.483220100402832, "loss/hidden": 1.3515625, "loss/logits": 0.2554011046886444, "loss/reg": 4.502179945120588e-05, "step": 1303 }, { "epoch": 0.163, "grad_norm": 3.673701286315918, "grad_norm_var": 7.837376429810394, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.6128814220428467, "loss/hidden": 1.171875, "loss/logits": 0.192288339138031, "loss/reg": 4.500124487094581e-05, "step": 1304 }, { "epoch": 0.163125, "grad_norm": 3.023348331451416, "grad_norm_var": 7.7443295688751315, "learning_rate": 0.0001, "loss": 1.3105, "loss/crossentropy": 2.4459805488586426, "loss/hidden": 1.125, "loss/logits": 0.1850828230381012, "loss/reg": 4.497506597544998e-05, "step": 1305 }, { "epoch": 0.16325, "grad_norm": 2.8581490516662598, "grad_norm_var": 7.663542686186688, "learning_rate": 0.0001, "loss": 1.8724, "loss/crossentropy": 2.4504129886627197, "loss/hidden": 1.5703125, "loss/logits": 0.30162930488586426, "loss/reg": 4.495858229347505e-05, "step": 1306 }, { "epoch": 0.163375, "grad_norm": 2.220228672027588, "grad_norm_var": 7.65486261444942, "learning_rate": 0.0001, "loss": 1.3188, "loss/crossentropy": 2.48659348487854, "loss/hidden": 1.15625, "loss/logits": 0.16205593943595886, "loss/reg": 4.493745291256346e-05, "step": 1307 }, { "epoch": 0.1635, "grad_norm": 2.5835015773773193, "grad_norm_var": 0.27692455359477297, "learning_rate": 0.0001, "loss": 1.4295, "loss/crossentropy": 2.087364435195923, "loss/hidden": 1.25, "loss/logits": 0.1790969967842102, "loss/reg": 4.491967774811201e-05, "step": 1308 }, { "epoch": 0.163625, "grad_norm": 3.2623960971832275, "grad_norm_var": 0.24523372884796876, "learning_rate": 0.0001, "loss": 1.413, "loss/crossentropy": 2.5019094944000244, "loss/hidden": 1.2265625, "loss/logits": 0.18602579832077026, "loss/reg": 4.489835919230245e-05, "step": 1309 }, { "epoch": 0.16375, "grad_norm": 3.2299067974090576, "grad_norm_var": 0.23223192578486382, "learning_rate": 0.0001, "loss": 1.281, "loss/crossentropy": 2.6735575199127197, "loss/hidden": 1.109375, "loss/logits": 0.17119300365447998, "loss/reg": 4.4880165660288185e-05, "step": 1310 }, { "epoch": 0.163875, "grad_norm": 3.0332422256469727, "grad_norm_var": 0.22851017898726292, "learning_rate": 0.0001, "loss": 1.6158, "loss/crossentropy": 2.3394935131073, "loss/hidden": 1.3828125, "loss/logits": 0.23254308104515076, "loss/reg": 4.485783938434906e-05, "step": 1311 }, { "epoch": 0.164, "grad_norm": 2.899348735809326, "grad_norm_var": 0.2205539210923611, "learning_rate": 0.0001, "loss": 1.4468, "loss/crossentropy": 2.635579824447632, "loss/hidden": 1.234375, "loss/logits": 0.2120014727115631, "loss/reg": 4.483287193579599e-05, "step": 1312 }, { "epoch": 0.164125, "grad_norm": 2.1863510608673096, "grad_norm_var": 0.2383558542244522, "learning_rate": 0.0001, "loss": 1.2466, "loss/crossentropy": 2.3679277896881104, "loss/hidden": 1.0859375, "loss/logits": 0.16022589802742004, "loss/reg": 4.4809763494413346e-05, "step": 1313 }, { "epoch": 0.16425, "grad_norm": 1.8936806917190552, "grad_norm_var": 0.22653542770815271, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.203962564468384, "loss/hidden": 1.15625, "loss/logits": 0.1615590900182724, "loss/reg": 4.479134804569185e-05, "step": 1314 }, { "epoch": 0.164375, "grad_norm": 2.3853209018707275, "grad_norm_var": 0.23935511818615438, "learning_rate": 0.0001, "loss": 1.2934, "loss/crossentropy": 2.761049509048462, "loss/hidden": 1.1328125, "loss/logits": 0.16009873151779175, "loss/reg": 4.476767935557291e-05, "step": 1315 }, { "epoch": 0.1645, "grad_norm": 2.4226620197296143, "grad_norm_var": 0.24970435950411762, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.514742612838745, "loss/hidden": 1.09375, "loss/logits": 0.16496333479881287, "loss/reg": 4.4750799133908004e-05, "step": 1316 }, { "epoch": 0.164625, "grad_norm": 2.6166725158691406, "grad_norm_var": 0.22195831312392228, "learning_rate": 0.0001, "loss": 1.4806, "loss/crossentropy": 2.658413887023926, "loss/hidden": 1.2578125, "loss/logits": 0.22230298817157745, "loss/reg": 4.47302772954572e-05, "step": 1317 }, { "epoch": 0.16475, "grad_norm": 2.779916763305664, "grad_norm_var": 0.21847590222987534, "learning_rate": 0.0001, "loss": 1.1961, "loss/crossentropy": 2.6549179553985596, "loss/hidden": 1.0390625, "loss/logits": 0.15657024085521698, "loss/reg": 4.4711912778439e-05, "step": 1318 }, { "epoch": 0.164875, "grad_norm": 2.44938325881958, "grad_norm_var": 0.21526962094278013, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.5260696411132812, "loss/hidden": 1.1171875, "loss/logits": 0.17487144470214844, "loss/reg": 4.468999759410508e-05, "step": 1319 }, { "epoch": 0.165, "grad_norm": 2.3545005321502686, "grad_norm_var": 0.15626391559457595, "learning_rate": 0.0001, "loss": 1.3639, "loss/crossentropy": 2.4437973499298096, "loss/hidden": 1.171875, "loss/logits": 0.19156785309314728, "loss/reg": 4.466849713935517e-05, "step": 1320 }, { "epoch": 0.165125, "grad_norm": 2.07038950920105, "grad_norm_var": 0.16398468550203796, "learning_rate": 0.0001, "loss": 1.4093, "loss/crossentropy": 2.3855857849121094, "loss/hidden": 1.21875, "loss/logits": 0.19014930725097656, "loss/reg": 4.465408710530028e-05, "step": 1321 }, { "epoch": 0.16525, "grad_norm": 3.916027545928955, "grad_norm_var": 0.2734647347174624, "learning_rate": 0.0001, "loss": 1.6608, "loss/crossentropy": 2.469409942626953, "loss/hidden": 1.4140625, "loss/logits": 0.24630755186080933, "loss/reg": 4.4640266423812136e-05, "step": 1322 }, { "epoch": 0.165375, "grad_norm": 2.7967050075531006, "grad_norm_var": 0.261664755882631, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.4856441020965576, "loss/hidden": 1.1875, "loss/logits": 0.17921873927116394, "loss/reg": 4.462606375454925e-05, "step": 1323 }, { "epoch": 0.1655, "grad_norm": 2.528658151626587, "grad_norm_var": 0.2625583864054059, "learning_rate": 0.0001, "loss": 1.2424, "loss/crossentropy": 2.6862435340881348, "loss/hidden": 1.0703125, "loss/logits": 0.17163142561912537, "loss/reg": 4.4612395868171006e-05, "step": 1324 }, { "epoch": 0.165625, "grad_norm": 2.49149751663208, "grad_norm_var": 0.23948644297048335, "learning_rate": 0.0001, "loss": 1.3718, "loss/crossentropy": 2.6498684883117676, "loss/hidden": 1.1875, "loss/logits": 0.183807834982872, "loss/reg": 4.460258787730709e-05, "step": 1325 }, { "epoch": 0.16575, "grad_norm": 2.6475727558135986, "grad_norm_var": 0.21397661985775135, "learning_rate": 0.0001, "loss": 1.274, "loss/crossentropy": 2.4302077293395996, "loss/hidden": 1.1015625, "loss/logits": 0.1719643473625183, "loss/reg": 4.458642069948837e-05, "step": 1326 }, { "epoch": 0.165875, "grad_norm": 2.360180377960205, "grad_norm_var": 0.20269171402897346, "learning_rate": 0.0001, "loss": 1.2778, "loss/crossentropy": 2.512448787689209, "loss/hidden": 1.1015625, "loss/logits": 0.17575357854366302, "loss/reg": 4.456798706087284e-05, "step": 1327 }, { "epoch": 0.166, "grad_norm": 3.0969111919403076, "grad_norm_var": 0.21433543744030548, "learning_rate": 0.0001, "loss": 1.3271, "loss/crossentropy": 2.6267106533050537, "loss/hidden": 1.140625, "loss/logits": 0.186046302318573, "loss/reg": 4.455425005289726e-05, "step": 1328 }, { "epoch": 0.166125, "grad_norm": 2.580076217651367, "grad_norm_var": 0.20428929677158855, "learning_rate": 0.0001, "loss": 1.3796, "loss/crossentropy": 2.5159518718719482, "loss/hidden": 1.1953125, "loss/logits": 0.18387523293495178, "loss/reg": 4.453653309610672e-05, "step": 1329 }, { "epoch": 0.16625, "grad_norm": 2.2171709537506104, "grad_norm_var": 0.1809303697723879, "learning_rate": 0.0001, "loss": 1.2144, "loss/crossentropy": 2.364938735961914, "loss/hidden": 1.0546875, "loss/logits": 0.159266397356987, "loss/reg": 4.452219945960678e-05, "step": 1330 }, { "epoch": 0.166375, "grad_norm": 2.7640013694763184, "grad_norm_var": 0.17869486976307958, "learning_rate": 0.0001, "loss": 1.3287, "loss/crossentropy": 2.547267436981201, "loss/hidden": 1.140625, "loss/logits": 0.18760260939598083, "loss/reg": 4.4505995901999995e-05, "step": 1331 }, { "epoch": 0.1665, "grad_norm": 2.513836145401001, "grad_norm_var": 0.17668453543018642, "learning_rate": 0.0001, "loss": 1.3596, "loss/crossentropy": 2.403233289718628, "loss/hidden": 1.1796875, "loss/logits": 0.17943502962589264, "loss/reg": 4.449459811439738e-05, "step": 1332 }, { "epoch": 0.166625, "grad_norm": 2.718043804168701, "grad_norm_var": 0.17705922491783213, "learning_rate": 0.0001, "loss": 1.5178, "loss/crossentropy": 2.420300006866455, "loss/hidden": 1.2734375, "loss/logits": 0.24389630556106567, "loss/reg": 4.448366235010326e-05, "step": 1333 }, { "epoch": 0.16675, "grad_norm": 2.5478909015655518, "grad_norm_var": 0.17618216107275256, "learning_rate": 0.0001, "loss": 1.4965, "loss/crossentropy": 2.492011785507202, "loss/hidden": 1.265625, "loss/logits": 0.2303885817527771, "loss/reg": 4.4475800677901134e-05, "step": 1334 }, { "epoch": 0.166875, "grad_norm": 2.264066457748413, "grad_norm_var": 0.1827494628185671, "learning_rate": 0.0001, "loss": 1.3425, "loss/crossentropy": 2.515028715133667, "loss/hidden": 1.140625, "loss/logits": 0.20141394436359406, "loss/reg": 4.446614548214711e-05, "step": 1335 }, { "epoch": 0.167, "grad_norm": 2.9766416549682617, "grad_norm_var": 0.18518897405885634, "learning_rate": 0.0001, "loss": 1.4471, "loss/crossentropy": 2.330453872680664, "loss/hidden": 1.2421875, "loss/logits": 0.2045084834098816, "loss/reg": 4.4459426135290414e-05, "step": 1336 }, { "epoch": 0.167125, "grad_norm": 3.2450225353240967, "grad_norm_var": 0.1797691221482142, "learning_rate": 0.0001, "loss": 1.2603, "loss/crossentropy": 2.5466201305389404, "loss/hidden": 1.078125, "loss/logits": 0.18173721432685852, "loss/reg": 4.445154991117306e-05, "step": 1337 }, { "epoch": 0.16725, "grad_norm": 2.0674796104431152, "grad_norm_var": 0.1007740659870798, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.6964364051818848, "loss/hidden": 1.0625, "loss/logits": 0.18164029717445374, "loss/reg": 4.444210571818985e-05, "step": 1338 }, { "epoch": 0.167375, "grad_norm": 2.396139621734619, "grad_norm_var": 0.10101679166968154, "learning_rate": 0.0001, "loss": 1.3855, "loss/crossentropy": 2.9879798889160156, "loss/hidden": 1.171875, "loss/logits": 0.21313458681106567, "loss/reg": 4.443850411917083e-05, "step": 1339 }, { "epoch": 0.1675, "grad_norm": 1.7887595891952515, "grad_norm_var": 0.14113099684256217, "learning_rate": 0.0001, "loss": 1.1985, "loss/crossentropy": 2.2846755981445312, "loss/hidden": 1.0390625, "loss/logits": 0.15899279713630676, "loss/reg": 4.4424450607039034e-05, "step": 1340 }, { "epoch": 0.167625, "grad_norm": 3.145375967025757, "grad_norm_var": 0.163432382007852, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.4436898231506348, "loss/hidden": 1.1875, "loss/logits": 0.1991751343011856, "loss/reg": 4.4411804992705584e-05, "step": 1341 }, { "epoch": 0.16775, "grad_norm": 2.3595798015594482, "grad_norm_var": 0.16613940110398026, "learning_rate": 0.0001, "loss": 1.389, "loss/crossentropy": 2.584183692932129, "loss/hidden": 1.171875, "loss/logits": 0.21666912734508514, "loss/reg": 4.439669646671973e-05, "step": 1342 }, { "epoch": 0.167875, "grad_norm": 2.5877060890197754, "grad_norm_var": 0.16315910377839907, "learning_rate": 0.0001, "loss": 1.3234, "loss/crossentropy": 2.7484211921691895, "loss/hidden": 1.1484375, "loss/logits": 0.17448198795318604, "loss/reg": 4.438200267031789e-05, "step": 1343 }, { "epoch": 0.168, "grad_norm": 2.4293344020843506, "grad_norm_var": 0.144939535521361, "learning_rate": 0.0001, "loss": 1.2442, "loss/crossentropy": 2.207484245300293, "loss/hidden": 1.078125, "loss/logits": 0.16565534472465515, "loss/reg": 4.437361712916754e-05, "step": 1344 }, { "epoch": 0.168125, "grad_norm": 2.433609962463379, "grad_norm_var": 0.14545021764670352, "learning_rate": 0.0001, "loss": 1.2619, "loss/crossentropy": 2.1073293685913086, "loss/hidden": 1.078125, "loss/logits": 0.18332740664482117, "loss/reg": 4.4370663090376183e-05, "step": 1345 }, { "epoch": 0.16825, "grad_norm": 2.080322027206421, "grad_norm_var": 0.15229983777140596, "learning_rate": 0.0001, "loss": 1.2279, "loss/crossentropy": 2.59745717048645, "loss/hidden": 1.0625, "loss/logits": 0.16493362188339233, "loss/reg": 4.436418385012075e-05, "step": 1346 }, { "epoch": 0.168375, "grad_norm": 2.801445245742798, "grad_norm_var": 0.1536063298279311, "learning_rate": 0.0001, "loss": 1.3217, "loss/crossentropy": 2.6507155895233154, "loss/hidden": 1.125, "loss/logits": 0.1962617039680481, "loss/reg": 4.43481003458146e-05, "step": 1347 }, { "epoch": 0.1685, "grad_norm": 2.6921567916870117, "grad_norm_var": 0.15539478093565381, "learning_rate": 0.0001, "loss": 1.5269, "loss/crossentropy": 2.371269702911377, "loss/hidden": 1.3203125, "loss/logits": 0.2061552107334137, "loss/reg": 4.434393486008048e-05, "step": 1348 }, { "epoch": 0.168625, "grad_norm": 2.6421940326690674, "grad_norm_var": 0.15388647465415672, "learning_rate": 0.0001, "loss": 1.259, "loss/crossentropy": 2.309507369995117, "loss/hidden": 1.078125, "loss/logits": 0.18043813109397888, "loss/reg": 4.433002322912216e-05, "step": 1349 }, { "epoch": 0.16875, "grad_norm": 2.754995584487915, "grad_norm_var": 0.1570997294501703, "learning_rate": 0.0001, "loss": 1.3353, "loss/crossentropy": 2.3387720584869385, "loss/hidden": 1.140625, "loss/logits": 0.19423820078372955, "loss/reg": 4.432429341250099e-05, "step": 1350 }, { "epoch": 0.168875, "grad_norm": 2.669968366622925, "grad_norm_var": 0.15237942264023818, "learning_rate": 0.0001, "loss": 1.4666, "loss/crossentropy": 2.56152606010437, "loss/hidden": 1.234375, "loss/logits": 0.23174414038658142, "loss/reg": 4.4314343540463597e-05, "step": 1351 }, { "epoch": 0.169, "grad_norm": 3.638333559036255, "grad_norm_var": 0.21589205502504516, "learning_rate": 0.0001, "loss": 1.3802, "loss/crossentropy": 2.711392879486084, "loss/hidden": 1.1875, "loss/logits": 0.19228197634220123, "loss/reg": 4.430532862897962e-05, "step": 1352 }, { "epoch": 0.169125, "grad_norm": 2.1712546348571777, "grad_norm_var": 0.19679081461784917, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.6300976276397705, "loss/hidden": 1.1640625, "loss/logits": 0.19107992947101593, "loss/reg": 4.4293406972428784e-05, "step": 1353 }, { "epoch": 0.16925, "grad_norm": 2.3168716430664062, "grad_norm_var": 0.18492694202072874, "learning_rate": 0.0001, "loss": 1.3021, "loss/crossentropy": 2.3884122371673584, "loss/hidden": 1.1171875, "loss/logits": 0.18451938033103943, "loss/reg": 4.427470048540272e-05, "step": 1354 }, { "epoch": 0.169375, "grad_norm": 2.2110610008239746, "grad_norm_var": 0.19103130230434662, "learning_rate": 0.0001, "loss": 1.3532, "loss/crossentropy": 2.588346242904663, "loss/hidden": 1.140625, "loss/logits": 0.21214817464351654, "loss/reg": 4.4263808376854286e-05, "step": 1355 }, { "epoch": 0.1695, "grad_norm": 2.515874147415161, "grad_norm_var": 0.15074033294804298, "learning_rate": 0.0001, "loss": 1.3287, "loss/crossentropy": 2.5218629837036133, "loss/hidden": 1.15625, "loss/logits": 0.17196393013000488, "loss/reg": 4.4249900383874774e-05, "step": 1356 }, { "epoch": 0.169625, "grad_norm": 2.6412770748138428, "grad_norm_var": 0.12933633378730272, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 3.1147713661193848, "loss/hidden": 1.0703125, "loss/logits": 0.16838082671165466, "loss/reg": 4.4234649976715446e-05, "step": 1357 }, { "epoch": 0.16975, "grad_norm": 2.0776495933532715, "grad_norm_var": 0.14180512977350357, "learning_rate": 0.0001, "loss": 1.1746, "loss/crossentropy": 2.6043543815612793, "loss/hidden": 1.015625, "loss/logits": 0.15855082869529724, "loss/reg": 4.422090933076106e-05, "step": 1358 }, { "epoch": 0.169875, "grad_norm": 3.2153241634368896, "grad_norm_var": 0.17029051137356627, "learning_rate": 0.0001, "loss": 1.2748, "loss/crossentropy": 2.0572850704193115, "loss/hidden": 1.125, "loss/logits": 0.14936628937721252, "loss/reg": 4.420821642270312e-05, "step": 1359 }, { "epoch": 0.17, "grad_norm": 2.1766624450683594, "grad_norm_var": 0.17938114614681974, "learning_rate": 0.0001, "loss": 1.3318, "loss/crossentropy": 2.3410725593566895, "loss/hidden": 1.140625, "loss/logits": 0.19073010981082916, "loss/reg": 4.4193744543008506e-05, "step": 1360 }, { "epoch": 0.170125, "grad_norm": 3.1066315174102783, "grad_norm_var": 0.1959061853200069, "learning_rate": 0.0001, "loss": 1.3065, "loss/crossentropy": 2.4431891441345215, "loss/hidden": 1.1328125, "loss/logits": 0.1732902228832245, "loss/reg": 4.4178697862662375e-05, "step": 1361 }, { "epoch": 0.17025, "grad_norm": 2.798218250274658, "grad_norm_var": 0.17770364110440345, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.451411008834839, "loss/hidden": 1.15625, "loss/logits": 0.2097121775150299, "loss/reg": 4.416131559992209e-05, "step": 1362 }, { "epoch": 0.170375, "grad_norm": 3.6879491806030273, "grad_norm_var": 0.2445016046832573, "learning_rate": 0.0001, "loss": 2.0203, "loss/crossentropy": 2.753589630126953, "loss/hidden": 1.6953125, "loss/logits": 0.3245903253555298, "loss/reg": 4.414744034875184e-05, "step": 1363 }, { "epoch": 0.1705, "grad_norm": 3.903742551803589, "grad_norm_var": 0.33380536863192345, "learning_rate": 0.0001, "loss": 1.6308, "loss/crossentropy": 2.876974582672119, "loss/hidden": 1.3828125, "loss/logits": 0.24756835401058197, "loss/reg": 4.413632632349618e-05, "step": 1364 }, { "epoch": 0.170625, "grad_norm": 2.2541909217834473, "grad_norm_var": 0.3504989650026715, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.5404322147369385, "loss/hidden": 1.09375, "loss/logits": 0.19123107194900513, "loss/reg": 4.412614362081513e-05, "step": 1365 }, { "epoch": 0.17075, "grad_norm": 2.7338268756866455, "grad_norm_var": 0.3505375697769665, "learning_rate": 0.0001, "loss": 1.3595, "loss/crossentropy": 2.5987484455108643, "loss/hidden": 1.1640625, "loss/logits": 0.19501641392707825, "loss/reg": 4.4113399781053886e-05, "step": 1366 }, { "epoch": 0.170875, "grad_norm": 2.3233795166015625, "grad_norm_var": 0.3620869455068522, "learning_rate": 0.0001, "loss": 1.3391, "loss/crossentropy": 2.3241047859191895, "loss/hidden": 1.15625, "loss/logits": 0.18244224786758423, "loss/reg": 4.4100765080656856e-05, "step": 1367 }, { "epoch": 0.171, "grad_norm": 2.498671293258667, "grad_norm_var": 0.3061141155101474, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.457289218902588, "loss/hidden": 1.234375, "loss/logits": 0.2258085310459137, "loss/reg": 4.408616223372519e-05, "step": 1368 }, { "epoch": 0.171125, "grad_norm": 4.238934516906738, "grad_norm_var": 0.4373271589653179, "learning_rate": 0.0001, "loss": 1.5951, "loss/crossentropy": 2.760684013366699, "loss/hidden": 1.3671875, "loss/logits": 0.2275162637233734, "loss/reg": 4.407550295582041e-05, "step": 1369 }, { "epoch": 0.17125, "grad_norm": 4.71045446395874, "grad_norm_var": 0.6432062535952201, "learning_rate": 0.0001, "loss": 1.6466, "loss/crossentropy": 2.5298707485198975, "loss/hidden": 1.4296875, "loss/logits": 0.21650245785713196, "loss/reg": 4.406468724482693e-05, "step": 1370 }, { "epoch": 0.171375, "grad_norm": 2.916440486907959, "grad_norm_var": 0.6054300939970363, "learning_rate": 0.0001, "loss": 1.3805, "loss/crossentropy": 2.6951613426208496, "loss/hidden": 1.171875, "loss/logits": 0.20820212364196777, "loss/reg": 4.4051706936443225e-05, "step": 1371 }, { "epoch": 0.1715, "grad_norm": 2.497441530227661, "grad_norm_var": 0.6066103168523642, "learning_rate": 0.0001, "loss": 1.1551, "loss/crossentropy": 2.323773145675659, "loss/hidden": 1.015625, "loss/logits": 0.13906535506248474, "loss/reg": 4.403495404403657e-05, "step": 1372 }, { "epoch": 0.171625, "grad_norm": 2.6485416889190674, "grad_norm_var": 0.6062794211515701, "learning_rate": 0.0001, "loss": 1.4911, "loss/crossentropy": 2.572211980819702, "loss/hidden": 1.2578125, "loss/logits": 0.23286481201648712, "loss/reg": 4.402002741699107e-05, "step": 1373 }, { "epoch": 0.17175, "grad_norm": 2.163757801055908, "grad_norm_var": 0.596305325230626, "learning_rate": 0.0001, "loss": 1.2345, "loss/crossentropy": 2.4492759704589844, "loss/hidden": 1.0546875, "loss/logits": 0.17937690019607544, "loss/reg": 4.40061412518844e-05, "step": 1374 }, { "epoch": 0.171875, "grad_norm": 2.115994930267334, "grad_norm_var": 0.6391237393217495, "learning_rate": 0.0001, "loss": 1.3079, "loss/crossentropy": 2.676708459854126, "loss/hidden": 1.1328125, "loss/logits": 0.17461000382900238, "loss/reg": 4.399328099680133e-05, "step": 1375 }, { "epoch": 0.172, "grad_norm": 2.555100440979004, "grad_norm_var": 0.6103941335775583, "learning_rate": 0.0001, "loss": 1.2241, "loss/crossentropy": 2.4660518169403076, "loss/hidden": 1.0703125, "loss/logits": 0.15334627032279968, "loss/reg": 4.398010059958324e-05, "step": 1376 }, { "epoch": 0.172125, "grad_norm": 2.0078580379486084, "grad_norm_var": 0.6624757473026042, "learning_rate": 0.0001, "loss": 1.307, "loss/crossentropy": 2.5937724113464355, "loss/hidden": 1.125, "loss/logits": 0.18153205513954163, "loss/reg": 4.3968833779217675e-05, "step": 1377 }, { "epoch": 0.17225, "grad_norm": 2.7480297088623047, "grad_norm_var": 0.6631697814477386, "learning_rate": 0.0001, "loss": 1.2397, "loss/crossentropy": 2.588857412338257, "loss/hidden": 1.0703125, "loss/logits": 0.16894888877868652, "loss/reg": 4.395194991957396e-05, "step": 1378 }, { "epoch": 0.172375, "grad_norm": 3.1688647270202637, "grad_norm_var": 0.6237637466773701, "learning_rate": 0.0001, "loss": 1.4138, "loss/crossentropy": 2.757561206817627, "loss/hidden": 1.21875, "loss/logits": 0.19462066888809204, "loss/reg": 4.3936484871665016e-05, "step": 1379 }, { "epoch": 0.1725, "grad_norm": 15.428227424621582, "grad_norm_var": 10.554824158588978, "learning_rate": 0.0001, "loss": 1.6149, "loss/crossentropy": 2.402078151702881, "loss/hidden": 1.3984375, "loss/logits": 0.21606966853141785, "loss/reg": 4.392436676425859e-05, "step": 1380 }, { "epoch": 0.172625, "grad_norm": 2.6017940044403076, "grad_norm_var": 10.501711460516692, "learning_rate": 0.0001, "loss": 1.2484, "loss/crossentropy": 2.660430431365967, "loss/hidden": 1.0703125, "loss/logits": 0.17760685086250305, "loss/reg": 4.391232141642831e-05, "step": 1381 }, { "epoch": 0.17275, "grad_norm": 2.525033950805664, "grad_norm_var": 10.528127305203704, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.454327344894409, "loss/hidden": 1.203125, "loss/logits": 0.210074782371521, "loss/reg": 4.389778769109398e-05, "step": 1382 }, { "epoch": 0.172875, "grad_norm": 2.833066463470459, "grad_norm_var": 10.459524290972332, "learning_rate": 0.0001, "loss": 1.4644, "loss/crossentropy": 2.8289997577667236, "loss/hidden": 1.234375, "loss/logits": 0.22955310344696045, "loss/reg": 4.3889413063880056e-05, "step": 1383 }, { "epoch": 0.173, "grad_norm": 2.6454131603240967, "grad_norm_var": 10.439250793189025, "learning_rate": 0.0001, "loss": 1.211, "loss/crossentropy": 2.5164341926574707, "loss/hidden": 1.0625, "loss/logits": 0.1480443775653839, "loss/reg": 4.388386514619924e-05, "step": 1384 }, { "epoch": 0.173125, "grad_norm": 2.4730100631713867, "grad_norm_var": 10.48673112258549, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.454392433166504, "loss/hidden": 1.1171875, "loss/logits": 0.17824172973632812, "loss/reg": 4.3870826630154625e-05, "step": 1385 }, { "epoch": 0.17325, "grad_norm": 4.211810111999512, "grad_norm_var": 10.42195551797722, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.650273323059082, "loss/hidden": 1.28125, "loss/logits": 0.1870349645614624, "loss/reg": 4.385815191199072e-05, "step": 1386 }, { "epoch": 0.173375, "grad_norm": 2.2195355892181396, "grad_norm_var": 10.50386579069449, "learning_rate": 0.0001, "loss": 1.1958, "loss/crossentropy": 2.6540579795837402, "loss/hidden": 1.03125, "loss/logits": 0.16409122943878174, "loss/reg": 4.384495332487859e-05, "step": 1387 }, { "epoch": 0.1735, "grad_norm": 2.2435309886932373, "grad_norm_var": 10.53938945014739, "learning_rate": 0.0001, "loss": 1.2039, "loss/crossentropy": 2.4984354972839355, "loss/hidden": 1.0546875, "loss/logits": 0.1487729251384735, "loss/reg": 4.3836476834258065e-05, "step": 1388 }, { "epoch": 0.173625, "grad_norm": 6.076218128204346, "grad_norm_var": 10.924850838611649, "learning_rate": 0.0001, "loss": 1.775, "loss/crossentropy": 2.60779070854187, "loss/hidden": 1.4375, "loss/logits": 0.3370318114757538, "loss/reg": 4.383291889098473e-05, "step": 1389 }, { "epoch": 0.17375, "grad_norm": 2.962207317352295, "grad_norm_var": 10.809017442849843, "learning_rate": 0.0001, "loss": 1.3568, "loss/crossentropy": 2.5193846225738525, "loss/hidden": 1.171875, "loss/logits": 0.1845148503780365, "loss/reg": 4.3830696085933596e-05, "step": 1390 }, { "epoch": 0.173875, "grad_norm": 2.499361991882324, "grad_norm_var": 10.738463453127084, "learning_rate": 0.0001, "loss": 1.2684, "loss/crossentropy": 2.727522373199463, "loss/hidden": 1.078125, "loss/logits": 0.1898011714220047, "loss/reg": 4.38306997239124e-05, "step": 1391 }, { "epoch": 0.174, "grad_norm": 2.902444839477539, "grad_norm_var": 10.692983416262415, "learning_rate": 0.0001, "loss": 1.2714, "loss/crossentropy": 2.5181994438171387, "loss/hidden": 1.09375, "loss/logits": 0.17718875408172607, "loss/reg": 4.3814598029712215e-05, "step": 1392 }, { "epoch": 0.174125, "grad_norm": 2.5764389038085938, "grad_norm_var": 10.583264738967724, "learning_rate": 0.0001, "loss": 1.2864, "loss/crossentropy": 2.609466552734375, "loss/hidden": 1.1171875, "loss/logits": 0.16876953840255737, "loss/reg": 4.38142305938527e-05, "step": 1393 }, { "epoch": 0.17425, "grad_norm": 1.9676804542541504, "grad_norm_var": 10.726323120818574, "learning_rate": 0.0001, "loss": 1.2885, "loss/crossentropy": 2.610501527786255, "loss/hidden": 1.109375, "loss/logits": 0.178666889667511, "loss/reg": 4.380764585221186e-05, "step": 1394 }, { "epoch": 0.174375, "grad_norm": 3.298060178756714, "grad_norm_var": 10.71807201389054, "learning_rate": 0.0001, "loss": 1.5811, "loss/crossentropy": 2.571620464324951, "loss/hidden": 1.328125, "loss/logits": 0.2525365352630615, "loss/reg": 4.379446909297258e-05, "step": 1395 }, { "epoch": 0.1745, "grad_norm": 2.5956883430480957, "grad_norm_var": 0.9713562693941827, "learning_rate": 0.0001, "loss": 1.2787, "loss/crossentropy": 2.4515395164489746, "loss/hidden": 1.109375, "loss/logits": 0.16887177526950836, "loss/reg": 4.378123412607238e-05, "step": 1396 }, { "epoch": 0.174625, "grad_norm": 6.457098484039307, "grad_norm_var": 1.7395961483986715, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.481762170791626, "loss/hidden": 1.234375, "loss/logits": 0.17910394072532654, "loss/reg": 4.376547803985886e-05, "step": 1397 }, { "epoch": 0.17475, "grad_norm": 2.6334445476531982, "grad_norm_var": 1.7312187409571094, "learning_rate": 0.0001, "loss": 1.1642, "loss/crossentropy": 2.4399666786193848, "loss/hidden": 1.0078125, "loss/logits": 0.15591077506542206, "loss/reg": 4.375265780254267e-05, "step": 1398 }, { "epoch": 0.174875, "grad_norm": 4.455918788909912, "grad_norm_var": 1.824606404052923, "learning_rate": 0.0001, "loss": 1.4613, "loss/crossentropy": 2.47601580619812, "loss/hidden": 1.2265625, "loss/logits": 0.23428377509117126, "loss/reg": 4.3734627979574725e-05, "step": 1399 }, { "epoch": 0.175, "grad_norm": 12.483902931213379, "grad_norm_var": 7.063390839893884, "learning_rate": 0.0001, "loss": 1.2458, "loss/crossentropy": 2.957716941833496, "loss/hidden": 1.09375, "loss/logits": 0.151652991771698, "loss/reg": 4.3715503124985844e-05, "step": 1400 }, { "epoch": 0.175125, "grad_norm": 2.1250927448272705, "grad_norm_var": 7.136156501883171, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.3386826515197754, "loss/hidden": 1.1640625, "loss/logits": 0.19654320180416107, "loss/reg": 4.36952177551575e-05, "step": 1401 }, { "epoch": 0.17525, "grad_norm": 2.5386850833892822, "grad_norm_var": 7.231913773217872, "learning_rate": 0.0001, "loss": 1.4102, "loss/crossentropy": 2.5731849670410156, "loss/hidden": 1.2109375, "loss/logits": 0.19881173968315125, "loss/reg": 4.368073132354766e-05, "step": 1402 }, { "epoch": 0.175375, "grad_norm": 2.7075517177581787, "grad_norm_var": 7.147069652233706, "learning_rate": 0.0001, "loss": 1.2989, "loss/crossentropy": 2.923799753189087, "loss/hidden": 1.1171875, "loss/logits": 0.1812412142753601, "loss/reg": 4.366214852780104e-05, "step": 1403 }, { "epoch": 0.1755, "grad_norm": 2.7373690605163574, "grad_norm_var": 7.06096468766825, "learning_rate": 0.0001, "loss": 1.2479, "loss/crossentropy": 2.924968719482422, "loss/hidden": 1.0859375, "loss/logits": 0.1615295708179474, "loss/reg": 4.3649502913467586e-05, "step": 1404 }, { "epoch": 0.175625, "grad_norm": 2.0970749855041504, "grad_norm_var": 6.850111452164137, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.4840149879455566, "loss/hidden": 1.125, "loss/logits": 0.18273839354515076, "loss/reg": 4.363973130239174e-05, "step": 1405 }, { "epoch": 0.17575, "grad_norm": 3.3448736667633057, "grad_norm_var": 6.828514064197718, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.663756847381592, "loss/hidden": 1.09375, "loss/logits": 0.16753330826759338, "loss/reg": 4.362704567029141e-05, "step": 1406 }, { "epoch": 0.175875, "grad_norm": 2.696303606033325, "grad_norm_var": 6.802330951091016, "learning_rate": 0.0001, "loss": 1.4086, "loss/crossentropy": 2.449850559234619, "loss/hidden": 1.1875, "loss/logits": 0.2206811010837555, "loss/reg": 4.361617538961582e-05, "step": 1407 }, { "epoch": 0.176, "grad_norm": 2.2869067192077637, "grad_norm_var": 6.88335139626389, "learning_rate": 0.0001, "loss": 1.33, "loss/crossentropy": 2.6695072650909424, "loss/hidden": 1.15625, "loss/logits": 0.1733335256576538, "loss/reg": 4.3601157813100144e-05, "step": 1408 }, { "epoch": 0.176125, "grad_norm": 2.6789941787719727, "grad_norm_var": 6.870523523354868, "learning_rate": 0.0001, "loss": 1.3967, "loss/crossentropy": 2.765038251876831, "loss/hidden": 1.1796875, "loss/logits": 0.21654057502746582, "loss/reg": 4.358700243756175e-05, "step": 1409 }, { "epoch": 0.17625, "grad_norm": 2.1456291675567627, "grad_norm_var": 6.834507974821424, "learning_rate": 0.0001, "loss": 1.1905, "loss/crossentropy": 2.0626509189605713, "loss/hidden": 1.03125, "loss/logits": 0.1588020920753479, "loss/reg": 4.357034413260408e-05, "step": 1410 }, { "epoch": 0.176375, "grad_norm": 5.724753379821777, "grad_norm_var": 7.111283813958933, "learning_rate": 0.0001, "loss": 1.7126, "loss/crossentropy": 2.2581207752227783, "loss/hidden": 1.484375, "loss/logits": 0.22774820029735565, "loss/reg": 4.355575583758764e-05, "step": 1411 }, { "epoch": 0.1765, "grad_norm": 2.65604305267334, "grad_norm_var": 7.102368611780612, "learning_rate": 0.0001, "loss": 1.23, "loss/crossentropy": 2.4369256496429443, "loss/hidden": 1.0546875, "loss/logits": 0.17484912276268005, "loss/reg": 4.353715121396817e-05, "step": 1412 }, { "epoch": 0.176625, "grad_norm": 2.5243303775787354, "grad_norm_var": 6.641966894564712, "learning_rate": 0.0001, "loss": 1.256, "loss/crossentropy": 2.585545539855957, "loss/hidden": 1.0859375, "loss/logits": 0.169620543718338, "loss/reg": 4.352208998170681e-05, "step": 1413 }, { "epoch": 0.17675, "grad_norm": 3.367011070251465, "grad_norm_var": 6.591839773502852, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.5050852298736572, "loss/hidden": 1.2890625, "loss/logits": 0.22704654932022095, "loss/reg": 4.350413291831501e-05, "step": 1414 }, { "epoch": 0.176875, "grad_norm": 4.210506439208984, "grad_norm_var": 6.565491347616695, "learning_rate": 0.0001, "loss": 1.3093, "loss/crossentropy": 2.5862507820129395, "loss/hidden": 1.125, "loss/logits": 0.18390105664730072, "loss/reg": 4.3487238144734874e-05, "step": 1415 }, { "epoch": 0.177, "grad_norm": 3.892646312713623, "grad_norm_var": 0.9107982589900016, "learning_rate": 0.0001, "loss": 1.8682, "loss/crossentropy": 2.8675537109375, "loss/hidden": 1.5, "loss/logits": 0.3677327632904053, "loss/reg": 4.347108188085258e-05, "step": 1416 }, { "epoch": 0.177125, "grad_norm": 3.503170967102051, "grad_norm_var": 0.8717905952754526, "learning_rate": 0.0001, "loss": 1.6121, "loss/crossentropy": 2.618744134902954, "loss/hidden": 1.3671875, "loss/logits": 0.24444371461868286, "loss/reg": 4.3455151171656325e-05, "step": 1417 }, { "epoch": 0.17725, "grad_norm": 3.0146777629852295, "grad_norm_var": 0.8522632202887498, "learning_rate": 0.0001, "loss": 1.3245, "loss/crossentropy": 2.8389289379119873, "loss/hidden": 1.125, "loss/logits": 0.19902637600898743, "loss/reg": 4.343886030255817e-05, "step": 1418 }, { "epoch": 0.177375, "grad_norm": 2.4084014892578125, "grad_norm_var": 0.8734795570176286, "learning_rate": 0.0001, "loss": 1.4294, "loss/crossentropy": 2.539510726928711, "loss/hidden": 1.2265625, "loss/logits": 0.20238348841667175, "loss/reg": 4.3425516196293756e-05, "step": 1419 }, { "epoch": 0.1775, "grad_norm": 2.334496021270752, "grad_norm_var": 0.9020578094969264, "learning_rate": 0.0001, "loss": 1.2769, "loss/crossentropy": 2.2953195571899414, "loss/hidden": 1.109375, "loss/logits": 0.16713181138038635, "loss/reg": 4.341412568464875e-05, "step": 1420 }, { "epoch": 0.177625, "grad_norm": 2.5287718772888184, "grad_norm_var": 0.8585467461433038, "learning_rate": 0.0001, "loss": 1.5263, "loss/crossentropy": 2.703735589981079, "loss/hidden": 1.3046875, "loss/logits": 0.2211885154247284, "loss/reg": 4.339984297985211e-05, "step": 1421 }, { "epoch": 0.17775, "grad_norm": 3.0021073818206787, "grad_norm_var": 0.8538916502450261, "learning_rate": 0.0001, "loss": 1.5981, "loss/crossentropy": 2.4952101707458496, "loss/hidden": 1.3828125, "loss/logits": 0.2148495614528656, "loss/reg": 4.338718645158224e-05, "step": 1422 }, { "epoch": 0.177875, "grad_norm": 2.6841554641723633, "grad_norm_var": 0.8544914650704224, "learning_rate": 0.0001, "loss": 1.3463, "loss/crossentropy": 2.640184164047241, "loss/hidden": 1.1328125, "loss/logits": 0.21307799220085144, "loss/reg": 4.337489735917188e-05, "step": 1423 }, { "epoch": 0.178, "grad_norm": 2.777501344680786, "grad_norm_var": 0.8189534671629093, "learning_rate": 0.0001, "loss": 1.5327, "loss/crossentropy": 2.3427610397338867, "loss/hidden": 1.2890625, "loss/logits": 0.243157759308815, "loss/reg": 4.3363947042962536e-05, "step": 1424 }, { "epoch": 0.178125, "grad_norm": 2.5543341636657715, "grad_norm_var": 0.8267698989523389, "learning_rate": 0.0001, "loss": 1.3129, "loss/crossentropy": 2.744206666946411, "loss/hidden": 1.1484375, "loss/logits": 0.1639942228794098, "loss/reg": 4.334727782406844e-05, "step": 1425 }, { "epoch": 0.17825, "grad_norm": 2.4885315895080566, "grad_norm_var": 0.7912603488188845, "learning_rate": 0.0001, "loss": 1.307, "loss/crossentropy": 2.6813228130340576, "loss/hidden": 1.140625, "loss/logits": 0.16597452759742737, "loss/reg": 4.333686229074374e-05, "step": 1426 }, { "epoch": 0.178375, "grad_norm": 3.5424156188964844, "grad_norm_var": 0.3264754697171298, "learning_rate": 0.0001, "loss": 1.3473, "loss/crossentropy": 2.712770938873291, "loss/hidden": 1.15625, "loss/logits": 0.19061842560768127, "loss/reg": 4.332158641773276e-05, "step": 1427 }, { "epoch": 0.1785, "grad_norm": 2.0955185890197754, "grad_norm_var": 0.3694319419413773, "learning_rate": 0.0001, "loss": 1.3663, "loss/crossentropy": 2.5209343433380127, "loss/hidden": 1.171875, "loss/logits": 0.19394898414611816, "loss/reg": 4.3307081796228886e-05, "step": 1428 }, { "epoch": 0.178625, "grad_norm": 2.6297364234924316, "grad_norm_var": 0.3643823378726265, "learning_rate": 0.0001, "loss": 1.2339, "loss/crossentropy": 2.5593748092651367, "loss/hidden": 1.0546875, "loss/logits": 0.17882663011550903, "loss/reg": 4.329224975663237e-05, "step": 1429 }, { "epoch": 0.17875, "grad_norm": 2.8635098934173584, "grad_norm_var": 0.3515349356239161, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.8081912994384766, "loss/hidden": 1.2109375, "loss/logits": 0.19186900556087494, "loss/reg": 4.327530041337013e-05, "step": 1430 }, { "epoch": 0.178875, "grad_norm": 2.7731873989105225, "grad_norm_var": 0.23106689203328087, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.60203218460083, "loss/hidden": 1.1953125, "loss/logits": 0.24671795964241028, "loss/reg": 4.325600093579851e-05, "step": 1431 }, { "epoch": 0.179, "grad_norm": 2.6906487941741943, "grad_norm_var": 0.1491888512825502, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.4658117294311523, "loss/hidden": 1.2265625, "loss/logits": 0.18089887499809265, "loss/reg": 4.324036490288563e-05, "step": 1432 }, { "epoch": 0.179125, "grad_norm": 2.9456405639648438, "grad_norm_var": 0.1121219410924103, "learning_rate": 0.0001, "loss": 1.2971, "loss/crossentropy": 2.6626834869384766, "loss/hidden": 1.125, "loss/logits": 0.17169824242591858, "loss/reg": 4.322533277445473e-05, "step": 1433 }, { "epoch": 0.17925, "grad_norm": 3.171668767929077, "grad_norm_var": 0.12007437587654597, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.836962938308716, "loss/hidden": 1.203125, "loss/logits": 0.21418796479701996, "loss/reg": 4.3210424337303266e-05, "step": 1434 }, { "epoch": 0.179375, "grad_norm": 3.561671018600464, "grad_norm_var": 0.15556932022715816, "learning_rate": 0.0001, "loss": 1.4663, "loss/crossentropy": 2.6809699535369873, "loss/hidden": 1.2109375, "loss/logits": 0.2549501657485962, "loss/reg": 4.319531217333861e-05, "step": 1435 }, { "epoch": 0.1795, "grad_norm": 2.716172695159912, "grad_norm_var": 0.14148105049478066, "learning_rate": 0.0001, "loss": 1.3285, "loss/crossentropy": 2.245272397994995, "loss/hidden": 1.15625, "loss/logits": 0.1718417853116989, "loss/reg": 4.317987259128131e-05, "step": 1436 }, { "epoch": 0.179625, "grad_norm": 17.232587814331055, "grad_norm_var": 13.09473393360591, "learning_rate": 0.0001, "loss": 1.4851, "loss/crossentropy": 2.410290241241455, "loss/hidden": 1.3046875, "loss/logits": 0.17997747659683228, "loss/reg": 4.316571357776411e-05, "step": 1437 }, { "epoch": 0.17975, "grad_norm": 2.3736934661865234, "grad_norm_var": 13.180663115120598, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.472769260406494, "loss/hidden": 1.0, "loss/logits": 0.14021781086921692, "loss/reg": 4.3151158024556935e-05, "step": 1438 }, { "epoch": 0.179875, "grad_norm": 2.5714287757873535, "grad_norm_var": 13.19663266950907, "learning_rate": 0.0001, "loss": 1.2952, "loss/crossentropy": 2.3950321674346924, "loss/hidden": 1.109375, "loss/logits": 0.18539366126060486, "loss/reg": 4.3139560148119926e-05, "step": 1439 }, { "epoch": 0.18, "grad_norm": 3.797583818435669, "grad_norm_var": 13.137998270690733, "learning_rate": 0.0001, "loss": 1.5306, "loss/crossentropy": 2.4296462535858154, "loss/hidden": 1.3125, "loss/logits": 0.21766766905784607, "loss/reg": 4.312453165766783e-05, "step": 1440 }, { "epoch": 0.180125, "grad_norm": 3.5081300735473633, "grad_norm_var": 13.042733823147271, "learning_rate": 0.0001, "loss": 2.1133, "loss/crossentropy": 2.5396132469177246, "loss/hidden": 1.7109375, "loss/logits": 0.40195000171661377, "loss/reg": 4.311450902605429e-05, "step": 1441 }, { "epoch": 0.18025, "grad_norm": 2.7153725624084473, "grad_norm_var": 13.005977433302935, "learning_rate": 0.0001, "loss": 1.2424, "loss/crossentropy": 2.7652928829193115, "loss/hidden": 1.078125, "loss/logits": 0.16388410329818726, "loss/reg": 4.3100644688820466e-05, "step": 1442 }, { "epoch": 0.180375, "grad_norm": 2.1137332916259766, "grad_norm_var": 13.187246668576858, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.5129823684692383, "loss/hidden": 1.078125, "loss/logits": 0.17393165826797485, "loss/reg": 4.308501229388639e-05, "step": 1443 }, { "epoch": 0.1805, "grad_norm": 2.8767898082733154, "grad_norm_var": 13.054609912670541, "learning_rate": 0.0001, "loss": 1.4468, "loss/crossentropy": 2.0135140419006348, "loss/hidden": 1.2578125, "loss/logits": 0.1885446310043335, "loss/reg": 4.307483322918415e-05, "step": 1444 }, { "epoch": 0.180625, "grad_norm": 2.403043508529663, "grad_norm_var": 13.09270559894486, "learning_rate": 0.0001, "loss": 1.5386, "loss/crossentropy": 2.0192453861236572, "loss/hidden": 1.265625, "loss/logits": 0.27254170179367065, "loss/reg": 4.306132541387342e-05, "step": 1445 }, { "epoch": 0.18075, "grad_norm": 1.994828701019287, "grad_norm_var": 13.244824799331905, "learning_rate": 0.0001, "loss": 1.188, "loss/crossentropy": 2.5380778312683105, "loss/hidden": 1.015625, "loss/logits": 0.17196664214134216, "loss/reg": 4.305263428250328e-05, "step": 1446 }, { "epoch": 0.180875, "grad_norm": 2.4696993827819824, "grad_norm_var": 13.28870750435451, "learning_rate": 0.0001, "loss": 1.2176, "loss/crossentropy": 2.686527967453003, "loss/hidden": 1.0625, "loss/logits": 0.154626727104187, "loss/reg": 4.3038289732066914e-05, "step": 1447 }, { "epoch": 0.181, "grad_norm": 2.959864377975464, "grad_norm_var": 13.2571348082626, "learning_rate": 0.0001, "loss": 1.4701, "loss/crossentropy": 2.482746124267578, "loss/hidden": 1.2421875, "loss/logits": 0.22751757502555847, "loss/reg": 4.3027317587984726e-05, "step": 1448 }, { "epoch": 0.181125, "grad_norm": 2.3715078830718994, "grad_norm_var": 13.336497430498211, "learning_rate": 0.0001, "loss": 1.2266, "loss/crossentropy": 2.3657546043395996, "loss/hidden": 1.078125, "loss/logits": 0.14807792007923126, "loss/reg": 4.3015385017497465e-05, "step": 1449 }, { "epoch": 0.18125, "grad_norm": 2.3996551036834717, "grad_norm_var": 13.425801257168224, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.3912100791931152, "loss/hidden": 1.1796875, "loss/logits": 0.21327857673168182, "loss/reg": 4.300205546314828e-05, "step": 1450 }, { "epoch": 0.181375, "grad_norm": 8.881511688232422, "grad_norm_var": 15.146759918124577, "learning_rate": 0.0001, "loss": 1.5182, "loss/crossentropy": 2.411609411239624, "loss/hidden": 1.3359375, "loss/logits": 0.18182581663131714, "loss/reg": 4.298591738916002e-05, "step": 1451 }, { "epoch": 0.1815, "grad_norm": 2.5281503200531006, "grad_norm_var": 15.180191875245258, "learning_rate": 0.0001, "loss": 1.343, "loss/crossentropy": 2.5658814907073975, "loss/hidden": 1.140625, "loss/logits": 0.20197615027427673, "loss/reg": 4.2972216760972515e-05, "step": 1452 }, { "epoch": 0.181625, "grad_norm": 2.931814432144165, "grad_norm_var": 2.6350739014278046, "learning_rate": 0.0001, "loss": 1.5714, "loss/crossentropy": 2.623166561126709, "loss/hidden": 1.3046875, "loss/logits": 0.2662585973739624, "loss/reg": 4.295963663025759e-05, "step": 1453 }, { "epoch": 0.18175, "grad_norm": 2.307128667831421, "grad_norm_var": 2.6414069582859265, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.6455140113830566, "loss/hidden": 1.1171875, "loss/logits": 0.19229310750961304, "loss/reg": 4.29393767262809e-05, "step": 1454 }, { "epoch": 0.181875, "grad_norm": 2.719261884689331, "grad_norm_var": 2.6333024593927794, "learning_rate": 0.0001, "loss": 1.2597, "loss/crossentropy": 2.699171543121338, "loss/hidden": 1.0859375, "loss/logits": 0.17330314218997955, "loss/reg": 4.29198844358325e-05, "step": 1455 }, { "epoch": 0.182, "grad_norm": 2.2648873329162598, "grad_norm_var": 2.629623452031683, "learning_rate": 0.0001, "loss": 1.3001, "loss/crossentropy": 2.819680690765381, "loss/hidden": 1.1171875, "loss/logits": 0.1824684888124466, "loss/reg": 4.289634307497181e-05, "step": 1456 }, { "epoch": 0.182125, "grad_norm": 3.51459002494812, "grad_norm_var": 2.6300935831663605, "learning_rate": 0.0001, "loss": 1.8678, "loss/crossentropy": 2.649803638458252, "loss/hidden": 1.546875, "loss/logits": 0.32052403688430786, "loss/reg": 4.287390038371086e-05, "step": 1457 }, { "epoch": 0.18225, "grad_norm": 2.2351443767547607, "grad_norm_var": 2.660538406812168, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.6567535400390625, "loss/hidden": 1.203125, "loss/logits": 0.20430999994277954, "loss/reg": 4.285141403670423e-05, "step": 1458 }, { "epoch": 0.182375, "grad_norm": 2.742172956466675, "grad_norm_var": 2.6163455836101916, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.60951828956604, "loss/hidden": 1.1796875, "loss/logits": 0.21288588643074036, "loss/reg": 4.283638554625213e-05, "step": 1459 }, { "epoch": 0.1825, "grad_norm": 2.7413218021392822, "grad_norm_var": 2.6192665262027277, "learning_rate": 0.0001, "loss": 1.1963, "loss/crossentropy": 2.773202657699585, "loss/hidden": 1.03125, "loss/logits": 0.16459418833255768, "loss/reg": 4.281382280169055e-05, "step": 1460 }, { "epoch": 0.182625, "grad_norm": 3.7331223487854004, "grad_norm_var": 2.6299038870939264, "learning_rate": 0.0001, "loss": 1.4449, "loss/crossentropy": 2.539788246154785, "loss/hidden": 1.21875, "loss/logits": 0.22573140263557434, "loss/reg": 4.279471977497451e-05, "step": 1461 }, { "epoch": 0.18275, "grad_norm": 28.74688720703125, "grad_norm_var": 43.596899801988485, "learning_rate": 0.0001, "loss": 1.6068, "loss/crossentropy": 2.5083324909210205, "loss/hidden": 1.3984375, "loss/logits": 0.20789732038974762, "loss/reg": 4.277807965991087e-05, "step": 1462 }, { "epoch": 0.182875, "grad_norm": 3.0596749782562256, "grad_norm_var": 43.441506559109, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.5228803157806396, "loss/hidden": 1.2734375, "loss/logits": 0.231749027967453, "loss/reg": 4.275907122064382e-05, "step": 1463 }, { "epoch": 0.183, "grad_norm": 2.790465831756592, "grad_norm_var": 43.48392586707514, "learning_rate": 0.0001, "loss": 1.2947, "loss/crossentropy": 2.5042221546173096, "loss/hidden": 1.1171875, "loss/logits": 0.17708972096443176, "loss/reg": 4.274360981071368e-05, "step": 1464 }, { "epoch": 0.183125, "grad_norm": 2.082412004470825, "grad_norm_var": 43.58075224329326, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.425107479095459, "loss/hidden": 1.1171875, "loss/logits": 0.17995816469192505, "loss/reg": 4.272775549907237e-05, "step": 1465 }, { "epoch": 0.18325, "grad_norm": 2.225287675857544, "grad_norm_var": 43.636828045239, "learning_rate": 0.0001, "loss": 1.2741, "loss/crossentropy": 2.5390968322753906, "loss/hidden": 1.0859375, "loss/logits": 0.18771487474441528, "loss/reg": 4.2712516005849466e-05, "step": 1466 }, { "epoch": 0.183375, "grad_norm": 2.2095344066619873, "grad_norm_var": 42.716066053442496, "learning_rate": 0.0001, "loss": 1.3697, "loss/crossentropy": 2.1581828594207764, "loss/hidden": 1.1484375, "loss/logits": 0.2208089530467987, "loss/reg": 4.269627606845461e-05, "step": 1467 }, { "epoch": 0.1835, "grad_norm": 3.168578863143921, "grad_norm_var": 42.590231253385056, "learning_rate": 0.0001, "loss": 1.4461, "loss/crossentropy": 2.5799078941345215, "loss/hidden": 1.25, "loss/logits": 0.19567829370498657, "loss/reg": 4.268271732144058e-05, "step": 1468 }, { "epoch": 0.183625, "grad_norm": 2.506560802459717, "grad_norm_var": 42.681493007397286, "learning_rate": 0.0001, "loss": 1.4977, "loss/crossentropy": 2.5367000102996826, "loss/hidden": 1.2734375, "loss/logits": 0.2238171398639679, "loss/reg": 4.266422183718532e-05, "step": 1469 }, { "epoch": 0.18375, "grad_norm": 7.600844383239746, "grad_norm_var": 43.01543362550186, "learning_rate": 0.0001, "loss": 1.7192, "loss/crossentropy": 3.0554118156433105, "loss/hidden": 1.5390625, "loss/logits": 0.17968618869781494, "loss/reg": 4.265028110239655e-05, "step": 1470 }, { "epoch": 0.183875, "grad_norm": 2.2054150104522705, "grad_norm_var": 43.16396281278411, "learning_rate": 0.0001, "loss": 1.2623, "loss/crossentropy": 2.5536673069000244, "loss/hidden": 1.09375, "loss/logits": 0.16808277368545532, "loss/reg": 4.263762821210548e-05, "step": 1471 }, { "epoch": 0.184, "grad_norm": 2.2131237983703613, "grad_norm_var": 43.18034464683377, "learning_rate": 0.0001, "loss": 1.3421, "loss/crossentropy": 2.7410731315612793, "loss/hidden": 1.15625, "loss/logits": 0.185453400015831, "loss/reg": 4.262782385922037e-05, "step": 1472 }, { "epoch": 0.184125, "grad_norm": 2.219083070755005, "grad_norm_var": 43.4746190323492, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.3957958221435547, "loss/hidden": 1.0234375, "loss/logits": 0.13811297714710236, "loss/reg": 4.262103539076634e-05, "step": 1473 }, { "epoch": 0.18425, "grad_norm": 2.394564628601074, "grad_norm_var": 43.42742842239302, "learning_rate": 0.0001, "loss": 1.2506, "loss/crossentropy": 2.809195041656494, "loss/hidden": 1.09375, "loss/logits": 0.15642398595809937, "loss/reg": 4.260565037839115e-05, "step": 1474 }, { "epoch": 0.184375, "grad_norm": 9.86483097076416, "grad_norm_var": 44.89087660481017, "learning_rate": 0.0001, "loss": 1.9438, "loss/crossentropy": 2.620823383331299, "loss/hidden": 1.765625, "loss/logits": 0.1777106523513794, "loss/reg": 4.258997432771139e-05, "step": 1475 }, { "epoch": 0.1845, "grad_norm": 2.3290772438049316, "grad_norm_var": 45.0248299538665, "learning_rate": 0.0001, "loss": 1.3243, "loss/crossentropy": 2.333923816680908, "loss/hidden": 1.1328125, "loss/logits": 0.19102182984352112, "loss/reg": 4.2582487367326394e-05, "step": 1476 }, { "epoch": 0.184625, "grad_norm": 2.4037561416625977, "grad_norm_var": 45.35262675926802, "learning_rate": 0.0001, "loss": 1.2645, "loss/crossentropy": 2.9200727939605713, "loss/hidden": 1.0859375, "loss/logits": 0.17812412977218628, "loss/reg": 4.257016189512797e-05, "step": 1477 }, { "epoch": 0.18475, "grad_norm": 2.5915751457214355, "grad_norm_var": 4.863057685649219, "learning_rate": 0.0001, "loss": 1.3253, "loss/crossentropy": 2.2665810585021973, "loss/hidden": 1.15625, "loss/logits": 0.16864070296287537, "loss/reg": 4.2560521251289174e-05, "step": 1478 }, { "epoch": 0.184875, "grad_norm": 2.981412649154663, "grad_norm_var": 4.865338349555102, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.818727970123291, "loss/hidden": 1.171875, "loss/logits": 0.19310520589351654, "loss/reg": 4.2553452658466995e-05, "step": 1479 }, { "epoch": 0.185, "grad_norm": 2.439756393432617, "grad_norm_var": 4.893890160529382, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.6037240028381348, "loss/hidden": 1.140625, "loss/logits": 0.18612533807754517, "loss/reg": 4.254809027770534e-05, "step": 1480 }, { "epoch": 0.185125, "grad_norm": 2.447357654571533, "grad_norm_var": 4.847115901511771, "learning_rate": 0.0001, "loss": 1.4924, "loss/crossentropy": 2.919430732727051, "loss/hidden": 1.265625, "loss/logits": 0.22634942829608917, "loss/reg": 4.254366285749711e-05, "step": 1481 }, { "epoch": 0.18525, "grad_norm": 2.3883841037750244, "grad_norm_var": 4.826765636031229, "learning_rate": 0.0001, "loss": 1.4694, "loss/crossentropy": 2.2129786014556885, "loss/hidden": 1.2421875, "loss/logits": 0.22677311301231384, "loss/reg": 4.253921360941604e-05, "step": 1482 }, { "epoch": 0.185375, "grad_norm": 18.632322311401367, "grad_norm_var": 19.410147172166884, "learning_rate": 0.0001, "loss": 1.9694, "loss/crossentropy": 2.3374440670013428, "loss/hidden": 1.71875, "loss/logits": 0.25020337104797363, "loss/reg": 4.2520878196228296e-05, "step": 1483 }, { "epoch": 0.1855, "grad_norm": 2.408111095428467, "grad_norm_var": 19.558393326740877, "learning_rate": 0.0001, "loss": 1.2343, "loss/crossentropy": 2.4988977909088135, "loss/hidden": 1.046875, "loss/logits": 0.18700845539569855, "loss/reg": 4.251228892826475e-05, "step": 1484 }, { "epoch": 0.185625, "grad_norm": 2.6058828830718994, "grad_norm_var": 19.536231022308367, "learning_rate": 0.0001, "loss": 1.3138, "loss/crossentropy": 2.569941759109497, "loss/hidden": 1.125, "loss/logits": 0.188359797000885, "loss/reg": 4.249310586601496e-05, "step": 1485 }, { "epoch": 0.18575, "grad_norm": 2.337801694869995, "grad_norm_var": 18.903999577234515, "learning_rate": 0.0001, "loss": 1.2814, "loss/crossentropy": 2.5755937099456787, "loss/hidden": 1.1015625, "loss/logits": 0.17937231063842773, "loss/reg": 4.2484189179958776e-05, "step": 1486 }, { "epoch": 0.185875, "grad_norm": 2.2206108570098877, "grad_norm_var": 18.900572680100932, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.497291326522827, "loss/hidden": 1.1875, "loss/logits": 0.20290207862854004, "loss/reg": 4.24723511969205e-05, "step": 1487 }, { "epoch": 0.186, "grad_norm": 2.805840253829956, "grad_norm_var": 18.78883428537014, "learning_rate": 0.0001, "loss": 1.2726, "loss/crossentropy": 2.637183666229248, "loss/hidden": 1.1015625, "loss/logits": 0.17063820362091064, "loss/reg": 4.2465177102712914e-05, "step": 1488 }, { "epoch": 0.186125, "grad_norm": 2.482452630996704, "grad_norm_var": 18.73267123963991, "learning_rate": 0.0001, "loss": 1.4955, "loss/crossentropy": 2.3506133556365967, "loss/hidden": 1.25, "loss/logits": 0.2450503706932068, "loss/reg": 4.244835872668773e-05, "step": 1489 }, { "epoch": 0.18625, "grad_norm": 2.404538154602051, "grad_norm_var": 18.730597918024976, "learning_rate": 0.0001, "loss": 1.3089, "loss/crossentropy": 2.5639541149139404, "loss/hidden": 1.125, "loss/logits": 0.18347838521003723, "loss/reg": 4.243505100021139e-05, "step": 1490 }, { "epoch": 0.186375, "grad_norm": 2.1632773876190186, "grad_norm_var": 16.373156635802495, "learning_rate": 0.0001, "loss": 1.2372, "loss/crossentropy": 2.5014357566833496, "loss/hidden": 1.0625, "loss/logits": 0.17423637211322784, "loss/reg": 4.242025897838175e-05, "step": 1491 }, { "epoch": 0.1865, "grad_norm": 2.3475615978240967, "grad_norm_var": 16.370347277694776, "learning_rate": 0.0001, "loss": 1.3586, "loss/crossentropy": 2.4183082580566406, "loss/hidden": 1.1875, "loss/logits": 0.17070849239826202, "loss/reg": 4.240818088874221e-05, "step": 1492 }, { "epoch": 0.186625, "grad_norm": 2.2167327404022217, "grad_norm_var": 16.39934092054266, "learning_rate": 0.0001, "loss": 1.2318, "loss/crossentropy": 2.737880229949951, "loss/hidden": 1.0703125, "loss/logits": 0.161026269197464, "loss/reg": 4.2394876800244674e-05, "step": 1493 }, { "epoch": 0.18675, "grad_norm": 2.2914958000183105, "grad_norm_var": 16.43999919701839, "learning_rate": 0.0001, "loss": 1.3837, "loss/crossentropy": 2.581719398498535, "loss/hidden": 1.171875, "loss/logits": 0.21139121055603027, "loss/reg": 4.2386356653878465e-05, "step": 1494 }, { "epoch": 0.186875, "grad_norm": 2.403869390487671, "grad_norm_var": 16.496803032325875, "learning_rate": 0.0001, "loss": 1.3391, "loss/crossentropy": 2.3859739303588867, "loss/hidden": 1.1484375, "loss/logits": 0.19020606577396393, "loss/reg": 4.237349276081659e-05, "step": 1495 }, { "epoch": 0.187, "grad_norm": 2.8205666542053223, "grad_norm_var": 16.456488504250864, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.8649418354034424, "loss/hidden": 1.2734375, "loss/logits": 0.23426686227321625, "loss/reg": 4.2357834900030866e-05, "step": 1496 }, { "epoch": 0.187125, "grad_norm": 2.3977134227752686, "grad_norm_var": 16.463186923695265, "learning_rate": 0.0001, "loss": 1.3612, "loss/crossentropy": 2.7040863037109375, "loss/hidden": 1.1796875, "loss/logits": 0.18106544017791748, "loss/reg": 4.234022708260454e-05, "step": 1497 }, { "epoch": 0.18725, "grad_norm": 9.282831192016602, "grad_norm_var": 18.473799466194198, "learning_rate": 0.0001, "loss": 2.1135, "loss/crossentropy": 2.3761186599731445, "loss/hidden": 1.828125, "loss/logits": 0.2849646806716919, "loss/reg": 4.23242527176626e-05, "step": 1498 }, { "epoch": 0.187375, "grad_norm": 2.448495864868164, "grad_norm_var": 2.9755130882248815, "learning_rate": 0.0001, "loss": 1.6106, "loss/crossentropy": 2.541566848754883, "loss/hidden": 1.3671875, "loss/logits": 0.24301382899284363, "loss/reg": 4.2308161937398836e-05, "step": 1499 }, { "epoch": 0.1875, "grad_norm": 2.0536553859710693, "grad_norm_var": 3.004361121628405, "learning_rate": 0.0001, "loss": 1.1144, "loss/crossentropy": 2.4763259887695312, "loss/hidden": 0.9765625, "loss/logits": 0.1373748481273651, "loss/reg": 4.2289339035050943e-05, "step": 1500 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.6608792346624e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }