{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.125, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 2.8797903060913086, "learning_rate": 1.0000000000000002e-06, "loss": 1.3346, "loss/crossentropy": 2.6933815479278564, "loss/hidden": 1.171875, "loss/logits": 0.16231727600097656, "loss/reg": 3.5815275623463094e-05, "step": 1 }, { "epoch": 0.00025, "grad_norm": 3.151318073272705, "learning_rate": 2.0000000000000003e-06, "loss": 1.334, "loss/crossentropy": 3.0975701808929443, "loss/hidden": 1.1484375, "loss/logits": 0.18517285585403442, "loss/reg": 3.5815275623463094e-05, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.3074228763580322, "learning_rate": 3e-06, "loss": 1.2917, "loss/crossentropy": 2.613313674926758, "loss/hidden": 1.1171875, "loss/logits": 0.17419689893722534, "loss/reg": 3.581521741580218e-05, "step": 3 }, { "epoch": 0.0005, "grad_norm": 2.994593381881714, "learning_rate": 4.000000000000001e-06, "loss": 1.3622, "loss/crossentropy": 2.562746047973633, "loss/hidden": 1.2109375, "loss/logits": 0.150880828499794, "loss/reg": 3.5815086448565125e-05, "step": 4 }, { "epoch": 0.000625, "grad_norm": 4.555283069610596, "learning_rate": 5e-06, "loss": 1.435, "loss/crossentropy": 2.4253523349761963, "loss/hidden": 1.25, "loss/logits": 0.18461981415748596, "loss/reg": 3.581498458515853e-05, "step": 5 }, { "epoch": 0.00075, "grad_norm": 2.5826594829559326, "learning_rate": 6e-06, "loss": 1.2796, "loss/crossentropy": 2.666372060775757, "loss/hidden": 1.1015625, "loss/logits": 0.17770954966545105, "loss/reg": 3.5814849979942665e-05, "step": 6 }, { "epoch": 0.000875, "grad_norm": 2.9724032878875732, "learning_rate": 7.000000000000001e-06, "loss": 1.5032, "loss/crossentropy": 2.488424062728882, "loss/hidden": 1.3046875, "loss/logits": 0.1981831043958664, "loss/reg": 3.581465352908708e-05, "step": 7 }, { "epoch": 0.001, "grad_norm": 4.469974517822266, "learning_rate": 8.000000000000001e-06, "loss": 1.7569, "loss/crossentropy": 2.152468204498291, "loss/hidden": 1.515625, "loss/logits": 0.24093276262283325, "loss/reg": 3.581443888833746e-05, "step": 8 }, { "epoch": 0.001125, "grad_norm": 2.529066801071167, "learning_rate": 9e-06, "loss": 1.7045, "loss/crossentropy": 2.3210883140563965, "loss/hidden": 1.4453125, "loss/logits": 0.2588244378566742, "loss/reg": 3.581414057407528e-05, "step": 9 }, { "epoch": 0.00125, "grad_norm": 2.1863760948181152, "learning_rate": 1e-05, "loss": 1.4129, "loss/crossentropy": 2.213552236557007, "loss/hidden": 1.25, "loss/logits": 0.1625480353832245, "loss/reg": 3.5813736758427694e-05, "step": 10 }, { "epoch": 0.001375, "grad_norm": 2.182722330093384, "learning_rate": 1.1000000000000001e-05, "loss": 1.3913, "loss/crossentropy": 2.4366066455841064, "loss/hidden": 1.1796875, "loss/logits": 0.21124377846717834, "loss/reg": 3.5813347494695336e-05, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.28460431098938, "learning_rate": 1.2e-05, "loss": 1.6315, "loss/crossentropy": 2.2548444271087646, "loss/hidden": 1.4296875, "loss/logits": 0.2014051228761673, "loss/reg": 3.581297642085701e-05, "step": 12 }, { "epoch": 0.001625, "grad_norm": 3.58573579788208, "learning_rate": 1.3000000000000001e-05, "loss": 1.5091, "loss/crossentropy": 2.6865081787109375, "loss/hidden": 1.28125, "loss/logits": 0.22751325368881226, "loss/reg": 3.581246710382402e-05, "step": 13 }, { "epoch": 0.00175, "grad_norm": 3.04477596282959, "learning_rate": 1.4000000000000001e-05, "loss": 1.7733, "loss/crossentropy": 2.2971208095550537, "loss/hidden": 1.5234375, "loss/logits": 0.2495010793209076, "loss/reg": 3.5811823181575164e-05, "step": 14 }, { "epoch": 0.001875, "grad_norm": 3.0177462100982666, "learning_rate": 1.5e-05, "loss": 1.5114, "loss/crossentropy": 2.726813554763794, "loss/hidden": 1.28125, "loss/logits": 0.2297666072845459, "loss/reg": 3.581113196560182e-05, "step": 15 }, { "epoch": 0.002, "grad_norm": 2.147826671600342, "grad_norm_var": 0.5553412532630915, "learning_rate": 1.6000000000000003e-05, "loss": 1.2521, "loss/crossentropy": 2.413343667984009, "loss/hidden": 1.078125, "loss/logits": 0.17357708513736725, "loss/reg": 3.581081909942441e-05, "step": 16 }, { "epoch": 0.002125, "grad_norm": 1.957945704460144, "grad_norm_var": 0.6147194825502799, "learning_rate": 1.7000000000000003e-05, "loss": 1.1395, "loss/crossentropy": 2.327432632446289, "loss/hidden": 1.0, "loss/logits": 0.1391144096851349, "loss/reg": 3.581016790121794e-05, "step": 17 }, { "epoch": 0.00225, "grad_norm": 2.9085915088653564, "grad_norm_var": 0.6093993504039028, "learning_rate": 1.8e-05, "loss": 1.7224, "loss/crossentropy": 2.5963706970214844, "loss/hidden": 1.4765625, "loss/logits": 0.24546313285827637, "loss/reg": 3.580931297619827e-05, "step": 18 }, { "epoch": 0.002375, "grad_norm": 1.6794862747192383, "grad_norm_var": 0.6801389543370343, "learning_rate": 1.9e-05, "loss": 1.2792, "loss/crossentropy": 2.542264938354492, "loss/hidden": 1.109375, "loss/logits": 0.1695137917995453, "loss/reg": 3.580832708394155e-05, "step": 19 }, { "epoch": 0.0025, "grad_norm": 2.006974935531616, "grad_norm_var": 0.7179436357972528, "learning_rate": 2e-05, "loss": 1.2741, "loss/crossentropy": 2.6418614387512207, "loss/hidden": 1.1015625, "loss/logits": 0.1721784472465515, "loss/reg": 3.580794873414561e-05, "step": 20 }, { "epoch": 0.002625, "grad_norm": 1.895347237586975, "grad_norm_var": 0.5223754576868543, "learning_rate": 2.1e-05, "loss": 1.1787, "loss/crossentropy": 2.38079571723938, "loss/hidden": 1.03125, "loss/logits": 0.14714078605175018, "loss/reg": 3.580757766030729e-05, "step": 21 }, { "epoch": 0.00275, "grad_norm": 2.195387125015259, "grad_norm_var": 0.5321677299000015, "learning_rate": 2.2000000000000003e-05, "loss": 1.3881, "loss/crossentropy": 2.613879919052124, "loss/hidden": 1.1875, "loss/logits": 0.20026516914367676, "loss/reg": 3.58072757080663e-05, "step": 22 }, { "epoch": 0.002875, "grad_norm": 2.943157911300659, "grad_norm_var": 0.530638648177441, "learning_rate": 2.3000000000000003e-05, "loss": 1.391, "loss/crossentropy": 2.653855562210083, "loss/hidden": 1.1875, "loss/logits": 0.20310327410697937, "loss/reg": 3.580666452762671e-05, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.057532787322998, "grad_norm_var": 0.2815427832165767, "learning_rate": 2.4e-05, "loss": 1.2261, "loss/crossentropy": 2.5107123851776123, "loss/hidden": 1.0859375, "loss/logits": 0.13977402448654175, "loss/reg": 3.580632255761884e-05, "step": 24 }, { "epoch": 0.003125, "grad_norm": 2.4862008094787598, "grad_norm_var": 0.28099970817646425, "learning_rate": 2.5e-05, "loss": 1.2713, "loss/crossentropy": 2.234706163406372, "loss/hidden": 1.1171875, "loss/logits": 0.1537853181362152, "loss/reg": 3.580575867090374e-05, "step": 25 }, { "epoch": 0.00325, "grad_norm": 1.8715572357177734, "grad_norm_var": 0.29663449315952994, "learning_rate": 2.6000000000000002e-05, "loss": 1.2613, "loss/crossentropy": 2.693939447402954, "loss/hidden": 1.1015625, "loss/logits": 0.1593395173549652, "loss/reg": 3.58048637281172e-05, "step": 26 }, { "epoch": 0.003375, "grad_norm": 1.8992433547973633, "grad_norm_var": 0.3095519871493233, "learning_rate": 2.7000000000000002e-05, "loss": 1.3686, "loss/crossentropy": 2.5521254539489746, "loss/hidden": 1.1796875, "loss/logits": 0.18858906626701355, "loss/reg": 3.580458724172786e-05, "step": 27 }, { "epoch": 0.0035, "grad_norm": 2.2363150119781494, "grad_norm_var": 0.31027254984992586, "learning_rate": 2.8000000000000003e-05, "loss": 1.4412, "loss/crossentropy": 2.5556640625, "loss/hidden": 1.25, "loss/logits": 0.19085438549518585, "loss/reg": 3.580446355044842e-05, "step": 28 }, { "epoch": 0.003625, "grad_norm": 1.916678786277771, "grad_norm_var": 0.21402330843771242, "learning_rate": 2.9e-05, "loss": 1.4761, "loss/crossentropy": 2.3765358924865723, "loss/hidden": 1.28125, "loss/logits": 0.19447964429855347, "loss/reg": 3.5804154322249815e-05, "step": 29 }, { "epoch": 0.00375, "grad_norm": 1.638509750366211, "grad_norm_var": 0.19170291887504137, "learning_rate": 3e-05, "loss": 1.0217, "loss/crossentropy": 2.39214825630188, "loss/hidden": 0.91015625, "loss/logits": 0.11122289299964905, "loss/reg": 3.580400880309753e-05, "step": 30 }, { "epoch": 0.003875, "grad_norm": 1.9309443235397339, "grad_norm_var": 0.14393413685788706, "learning_rate": 3.1e-05, "loss": 1.2692, "loss/crossentropy": 2.5890893936157227, "loss/hidden": 1.109375, "loss/logits": 0.15942150354385376, "loss/reg": 3.580367410904728e-05, "step": 31 }, { "epoch": 0.004, "grad_norm": 1.8290704488754272, "grad_norm_var": 0.14870789473936974, "learning_rate": 3.2000000000000005e-05, "loss": 1.2562, "loss/crossentropy": 2.614006757736206, "loss/hidden": 1.09375, "loss/logits": 0.1620863825082779, "loss/reg": 3.580304473871365e-05, "step": 32 }, { "epoch": 0.004125, "grad_norm": 2.8197247982025146, "grad_norm_var": 0.17985784278712322, "learning_rate": 3.3e-05, "loss": 1.6221, "loss/crossentropy": 2.458843231201172, "loss/hidden": 1.390625, "loss/logits": 0.23112158477306366, "loss/reg": 3.5803102946374565e-05, "step": 33 }, { "epoch": 0.00425, "grad_norm": 2.04148530960083, "grad_norm_var": 0.1385297884752911, "learning_rate": 3.4000000000000007e-05, "loss": 1.2995, "loss/crossentropy": 2.6012392044067383, "loss/hidden": 1.140625, "loss/logits": 0.15854208171367645, "loss/reg": 3.580349584808573e-05, "step": 34 }, { "epoch": 0.004375, "grad_norm": 2.370253086090088, "grad_norm_var": 0.13049913719012618, "learning_rate": 3.5e-05, "loss": 1.2372, "loss/crossentropy": 2.554248571395874, "loss/hidden": 1.0859375, "loss/logits": 0.1508997678756714, "loss/reg": 3.580387055990286e-05, "step": 35 }, { "epoch": 0.0045, "grad_norm": 2.1753602027893066, "grad_norm_var": 0.12942723244658866, "learning_rate": 3.6e-05, "loss": 1.2043, "loss/crossentropy": 2.950917959213257, "loss/hidden": 1.0625, "loss/logits": 0.14146575331687927, "loss/reg": 3.5805252991849557e-05, "step": 36 }, { "epoch": 0.004625, "grad_norm": 1.8446277379989624, "grad_norm_var": 0.13127072083684937, "learning_rate": 3.7e-05, "loss": 1.0664, "loss/crossentropy": 2.6589579582214355, "loss/hidden": 0.93359375, "loss/logits": 0.1324453502893448, "loss/reg": 3.58061988663394e-05, "step": 37 }, { "epoch": 0.00475, "grad_norm": 3.153823137283325, "grad_norm_var": 0.1956330169497003, "learning_rate": 3.8e-05, "loss": 1.3224, "loss/crossentropy": 2.4948697090148926, "loss/hidden": 1.1484375, "loss/logits": 0.1736428141593933, "loss/reg": 3.5806617233902216e-05, "step": 38 }, { "epoch": 0.004875, "grad_norm": 2.105498790740967, "grad_norm_var": 0.1565869437188399, "learning_rate": 3.9000000000000006e-05, "loss": 1.403, "loss/crossentropy": 2.2583742141723633, "loss/hidden": 1.25, "loss/logits": 0.15260592103004456, "loss/reg": 3.580802876967937e-05, "step": 39 }, { "epoch": 0.005, "grad_norm": 1.635926365852356, "grad_norm_var": 0.17281299081781085, "learning_rate": 4e-05, "loss": 1.0375, "loss/crossentropy": 2.6808717250823975, "loss/hidden": 0.921875, "loss/logits": 0.11526834964752197, "loss/reg": 3.580863995011896e-05, "step": 40 }, { "epoch": 0.005125, "grad_norm": 1.715374231338501, "grad_norm_var": 0.17253809821943988, "learning_rate": 4.1e-05, "loss": 1.13, "loss/crossentropy": 2.643165349960327, "loss/hidden": 0.984375, "loss/logits": 0.1453102082014084, "loss/reg": 3.580814882298e-05, "step": 41 }, { "epoch": 0.00525, "grad_norm": 2.1999247074127197, "grad_norm_var": 0.17041268294515716, "learning_rate": 4.2e-05, "loss": 1.291, "loss/crossentropy": 2.4450502395629883, "loss/hidden": 1.125, "loss/logits": 0.165659099817276, "loss/reg": 3.5807905078399926e-05, "step": 42 }, { "epoch": 0.005375, "grad_norm": 6.767260551452637, "grad_norm_var": 1.5247462870548845, "learning_rate": 4.3e-05, "loss": 1.3462, "loss/crossentropy": 2.6365652084350586, "loss/hidden": 1.1953125, "loss/logits": 0.15048328042030334, "loss/reg": 3.580807242542505e-05, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.4290215969085693, "grad_norm_var": 1.5228923892282233, "learning_rate": 4.4000000000000006e-05, "loss": 1.444, "loss/crossentropy": 2.445629596710205, "loss/hidden": 1.2578125, "loss/logits": 0.1858382523059845, "loss/reg": 3.5807508538709953e-05, "step": 44 }, { "epoch": 0.005625, "grad_norm": 1.8787518739700317, "grad_norm_var": 1.525481240782518, "learning_rate": 4.5e-05, "loss": 1.1501, "loss/crossentropy": 2.932614326477051, "loss/hidden": 1.015625, "loss/logits": 0.13410566747188568, "loss/reg": 3.5807570384349674e-05, "step": 45 }, { "epoch": 0.00575, "grad_norm": 2.0517191886901855, "grad_norm_var": 1.4937318455351132, "learning_rate": 4.600000000000001e-05, "loss": 1.1484, "loss/crossentropy": 2.8540468215942383, "loss/hidden": 1.0078125, "loss/logits": 0.14026299118995667, "loss/reg": 3.5806355299428105e-05, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.3056695461273193, "grad_norm_var": 1.4773587952527152, "learning_rate": 4.7e-05, "loss": 1.1201, "loss/crossentropy": 2.3501625061035156, "loss/hidden": 0.9921875, "loss/logits": 0.127536341547966, "loss/reg": 3.580517659429461e-05, "step": 47 }, { "epoch": 0.006, "grad_norm": 1.8737199306488037, "grad_norm_var": 1.473740887453625, "learning_rate": 4.8e-05, "loss": 1.1727, "loss/crossentropy": 2.56876540184021, "loss/hidden": 1.0234375, "loss/logits": 0.14892947673797607, "loss/reg": 3.5804278013529256e-05, "step": 48 }, { "epoch": 0.006125, "grad_norm": 1.6498337984085083, "grad_norm_var": 1.503248724299241, "learning_rate": 4.9e-05, "loss": 1.0887, "loss/crossentropy": 2.5359740257263184, "loss/hidden": 0.97265625, "loss/logits": 0.11568085849285126, "loss/reg": 3.580292104743421e-05, "step": 49 }, { "epoch": 0.00625, "grad_norm": 1.8893804550170898, "grad_norm_var": 1.5117099009867367, "learning_rate": 5e-05, "loss": 1.197, "loss/crossentropy": 2.4427387714385986, "loss/hidden": 1.046875, "loss/logits": 0.14971715211868286, "loss/reg": 3.58012730430346e-05, "step": 50 }, { "epoch": 0.006375, "grad_norm": 1.6203227043151855, "grad_norm_var": 1.5476226526424812, "learning_rate": 5.1000000000000006e-05, "loss": 1.1026, "loss/crossentropy": 2.4663264751434326, "loss/hidden": 0.96875, "loss/logits": 0.13351351022720337, "loss/reg": 3.579998519853689e-05, "step": 51 }, { "epoch": 0.0065, "grad_norm": 1.86211097240448, "grad_norm_var": 1.5602565704882587, "learning_rate": 5.2000000000000004e-05, "loss": 1.3754, "loss/crossentropy": 2.4876317977905273, "loss/hidden": 1.1875, "loss/logits": 0.1875210702419281, "loss/reg": 3.5798137105302885e-05, "step": 52 }, { "epoch": 0.006625, "grad_norm": 1.8947949409484863, "grad_norm_var": 1.5572914096308217, "learning_rate": 5.300000000000001e-05, "loss": 1.2427, "loss/crossentropy": 2.5425305366516113, "loss/hidden": 1.09375, "loss/logits": 0.14857983589172363, "loss/reg": 3.5795987059827894e-05, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.155927896499634, "grad_norm_var": 1.5078638031084188, "learning_rate": 5.4000000000000005e-05, "loss": 1.1465, "loss/crossentropy": 2.525212287902832, "loss/hidden": 1.0234375, "loss/logits": 0.12268626689910889, "loss/reg": 3.579404437914491e-05, "step": 54 }, { "epoch": 0.006875, "grad_norm": 2.088404893875122, "grad_norm_var": 1.5082164304181902, "learning_rate": 5.500000000000001e-05, "loss": 1.2035, "loss/crossentropy": 2.0337164402008057, "loss/hidden": 1.078125, "loss/logits": 0.12503597140312195, "loss/reg": 3.579181066015735e-05, "step": 55 }, { "epoch": 0.007, "grad_norm": 1.5657821893692017, "grad_norm_var": 1.5142777074410627, "learning_rate": 5.6000000000000006e-05, "loss": 1.0258, "loss/crossentropy": 2.5575084686279297, "loss/hidden": 0.90625, "loss/logits": 0.11920958012342453, "loss/reg": 3.5789642424788326e-05, "step": 56 }, { "epoch": 0.007125, "grad_norm": 1.8427784442901611, "grad_norm_var": 1.5062655960432332, "learning_rate": 5.6999999999999996e-05, "loss": 1.1217, "loss/crossentropy": 2.771742820739746, "loss/hidden": 0.9921875, "loss/logits": 0.1291133463382721, "loss/reg": 3.5787568776868284e-05, "step": 57 }, { "epoch": 0.00725, "grad_norm": 2.1850311756134033, "grad_norm_var": 1.5063882579126575, "learning_rate": 5.8e-05, "loss": 1.1639, "loss/crossentropy": 2.5320937633514404, "loss/hidden": 1.03125, "loss/logits": 0.13231654465198517, "loss/reg": 3.5786692023975775e-05, "step": 58 }, { "epoch": 0.007375, "grad_norm": 1.9679864645004272, "grad_norm_var": 0.057763248837537105, "learning_rate": 5.9e-05, "loss": 1.2283, "loss/crossentropy": 2.2511909008026123, "loss/hidden": 1.0703125, "loss/logits": 0.15765681862831116, "loss/reg": 3.578452378860675e-05, "step": 59 }, { "epoch": 0.0075, "grad_norm": 1.5962857007980347, "grad_norm_var": 0.04834229766963934, "learning_rate": 6e-05, "loss": 1.1863, "loss/crossentropy": 2.2581472396850586, "loss/hidden": 1.046875, "loss/logits": 0.13910087943077087, "loss/reg": 3.578297037165612e-05, "step": 60 }, { "epoch": 0.007625, "grad_norm": 1.9367214441299438, "grad_norm_var": 0.04837432662246878, "learning_rate": 6.1e-05, "loss": 1.1308, "loss/crossentropy": 2.375943660736084, "loss/hidden": 1.0078125, "loss/logits": 0.12259182333946228, "loss/reg": 3.578166069928557e-05, "step": 61 }, { "epoch": 0.00775, "grad_norm": 1.74626624584198, "grad_norm_var": 0.048246697686887254, "learning_rate": 6.2e-05, "loss": 1.0925, "loss/crossentropy": 2.33935284614563, "loss/hidden": 0.97265625, "loss/logits": 0.11948312819004059, "loss/reg": 3.577923052944243e-05, "step": 62 }, { "epoch": 0.007875, "grad_norm": 1.7242004871368408, "grad_norm_var": 0.036866001167244575, "learning_rate": 6.3e-05, "loss": 1.0529, "loss/crossentropy": 2.56264066696167, "loss/hidden": 0.9375, "loss/logits": 0.11508607119321823, "loss/reg": 3.5776785807684064e-05, "step": 63 }, { "epoch": 0.008, "grad_norm": 1.759158730506897, "grad_norm_var": 0.03732351836526746, "learning_rate": 6.400000000000001e-05, "loss": 1.0359, "loss/crossentropy": 2.5598065853118896, "loss/hidden": 0.921875, "loss/logits": 0.11371441185474396, "loss/reg": 3.577530151233077e-05, "step": 64 }, { "epoch": 0.008125, "grad_norm": 1.8450080156326294, "grad_norm_var": 0.03468242225663947, "learning_rate": 6.500000000000001e-05, "loss": 1.1044, "loss/crossentropy": 2.358488082885742, "loss/hidden": 0.9765625, "loss/logits": 0.1274409294128418, "loss/reg": 3.577340248739347e-05, "step": 65 }, { "epoch": 0.00825, "grad_norm": 1.8574451208114624, "grad_norm_var": 0.03459981312827119, "learning_rate": 6.6e-05, "loss": 1.3393, "loss/crossentropy": 2.2067902088165283, "loss/hidden": 1.1796875, "loss/logits": 0.15927882492542267, "loss/reg": 3.5770081012742594e-05, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.0372934341430664, "grad_norm_var": 0.032529617098579836, "learning_rate": 6.7e-05, "loss": 1.0537, "loss/crossentropy": 2.948381185531616, "loss/hidden": 0.92578125, "loss/logits": 0.1275252103805542, "loss/reg": 3.576773087843321e-05, "step": 67 }, { "epoch": 0.0085, "grad_norm": 1.6391419172286987, "grad_norm_var": 0.03614113702393708, "learning_rate": 6.800000000000001e-05, "loss": 1.1128, "loss/crossentropy": 2.691239833831787, "loss/hidden": 0.98046875, "loss/logits": 0.13195790350437164, "loss/reg": 3.576446033548564e-05, "step": 68 }, { "epoch": 0.008625, "grad_norm": 1.6962119340896606, "grad_norm_var": 0.03782062069620693, "learning_rate": 6.9e-05, "loss": 1.0633, "loss/crossentropy": 2.6416876316070557, "loss/hidden": 0.94921875, "loss/logits": 0.11374930292367935, "loss/reg": 3.576194285415113e-05, "step": 69 }, { "epoch": 0.00875, "grad_norm": 2.015970468521118, "grad_norm_var": 0.03338686088704298, "learning_rate": 7e-05, "loss": 1.2654, "loss/crossentropy": 2.5686511993408203, "loss/hidden": 1.109375, "loss/logits": 0.1556204855442047, "loss/reg": 3.576026938389987e-05, "step": 70 }, { "epoch": 0.008875, "grad_norm": 3.6860287189483643, "grad_norm_var": 0.24497842788792332, "learning_rate": 7.1e-05, "loss": 1.4119, "loss/crossentropy": 2.0071253776550293, "loss/hidden": 1.2578125, "loss/logits": 0.15377236902713776, "loss/reg": 3.575763912522234e-05, "step": 71 }, { "epoch": 0.009, "grad_norm": 2.0074028968811035, "grad_norm_var": 0.2349071198746244, "learning_rate": 7.2e-05, "loss": 1.0776, "loss/crossentropy": 2.3829903602600098, "loss/hidden": 0.953125, "loss/logits": 0.12408198416233063, "loss/reg": 3.575549635570496e-05, "step": 72 }, { "epoch": 0.009125, "grad_norm": 1.9751619100570679, "grad_norm_var": 0.23373155459140638, "learning_rate": 7.3e-05, "loss": 1.3348, "loss/crossentropy": 2.3497986793518066, "loss/hidden": 1.171875, "loss/logits": 0.16253460943698883, "loss/reg": 3.575363371055573e-05, "step": 73 }, { "epoch": 0.00925, "grad_norm": 1.5991307497024536, "grad_norm_var": 0.2391465881612707, "learning_rate": 7.4e-05, "loss": 1.1257, "loss/crossentropy": 2.5422017574310303, "loss/hidden": 0.9921875, "loss/logits": 0.13318461179733276, "loss/reg": 3.575047594495118e-05, "step": 74 }, { "epoch": 0.009375, "grad_norm": 6.278711318969727, "grad_norm_var": 1.4148538861937499, "learning_rate": 7.500000000000001e-05, "loss": 1.2142, "loss/crossentropy": 2.6029913425445557, "loss/hidden": 1.0859375, "loss/logits": 0.1279023289680481, "loss/reg": 3.5748576920013875e-05, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.744645833969116, "grad_norm_var": 1.4029217843730426, "learning_rate": 7.6e-05, "loss": 1.1281, "loss/crossentropy": 2.5766711235046387, "loss/hidden": 0.99609375, "loss/logits": 0.13161128759384155, "loss/reg": 3.574538277462125e-05, "step": 76 }, { "epoch": 0.009625, "grad_norm": 1.6794898509979248, "grad_norm_var": 1.418977736839713, "learning_rate": 7.7e-05, "loss": 0.9818, "loss/crossentropy": 2.4829020500183105, "loss/hidden": 0.87109375, "loss/logits": 0.11033609509468079, "loss/reg": 3.574356742319651e-05, "step": 77 }, { "epoch": 0.00975, "grad_norm": 1.4505054950714111, "grad_norm_var": 1.4450273907543434, "learning_rate": 7.800000000000001e-05, "loss": 1.0562, "loss/crossentropy": 2.2392332553863525, "loss/hidden": 0.9375, "loss/logits": 0.11831367015838623, "loss/reg": 3.5740758903557435e-05, "step": 78 }, { "epoch": 0.009875, "grad_norm": 2.267005205154419, "grad_norm_var": 1.425408330742154, "learning_rate": 7.900000000000001e-05, "loss": 1.2224, "loss/crossentropy": 2.524975299835205, "loss/hidden": 1.09375, "loss/logits": 0.12832751870155334, "loss/reg": 3.5736080462811515e-05, "step": 79 }, { "epoch": 0.01, "grad_norm": 1.9941935539245605, "grad_norm_var": 1.4124245943422287, "learning_rate": 8e-05, "loss": 1.3792, "loss/crossentropy": 2.3303298950195312, "loss/hidden": 1.2109375, "loss/logits": 0.16792933642864227, "loss/reg": 3.573206049622968e-05, "step": 80 }, { "epoch": 0.010125, "grad_norm": 1.9384936094284058, "grad_norm_var": 1.407320221541647, "learning_rate": 8.1e-05, "loss": 1.2054, "loss/crossentropy": 2.339118003845215, "loss/hidden": 1.0703125, "loss/logits": 0.13472682237625122, "loss/reg": 3.5727785871131346e-05, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.2007267475128174, "grad_norm_var": 1.3942380508674062, "learning_rate": 8.2e-05, "loss": 1.2089, "loss/crossentropy": 2.3879401683807373, "loss/hidden": 1.078125, "loss/logits": 0.130388081073761, "loss/reg": 3.5725021007237956e-05, "step": 82 }, { "epoch": 0.010375, "grad_norm": 1.5669373273849487, "grad_norm_var": 1.426148143880052, "learning_rate": 8.3e-05, "loss": 1.0953, "loss/crossentropy": 2.386194944381714, "loss/hidden": 0.97265625, "loss/logits": 0.12224260717630386, "loss/reg": 3.572153946151957e-05, "step": 83 }, { "epoch": 0.0105, "grad_norm": 3.444465160369873, "grad_norm_var": 1.4716789596545155, "learning_rate": 8.4e-05, "loss": 1.4601, "loss/crossentropy": 2.160597801208496, "loss/hidden": 1.2578125, "loss/logits": 0.20196697115898132, "loss/reg": 3.571900742826983e-05, "step": 84 }, { "epoch": 0.010625, "grad_norm": 2.17124080657959, "grad_norm_var": 1.440631969989453, "learning_rate": 8.5e-05, "loss": 1.1224, "loss/crossentropy": 2.1496167182922363, "loss/hidden": 1.0, "loss/logits": 0.12202942371368408, "loss/reg": 3.571617344277911e-05, "step": 85 }, { "epoch": 0.01075, "grad_norm": 1.842886209487915, "grad_norm_var": 1.4522613774542446, "learning_rate": 8.6e-05, "loss": 1.0156, "loss/crossentropy": 2.226436138153076, "loss/hidden": 0.90625, "loss/logits": 0.10898593068122864, "loss/reg": 3.57139615516644e-05, "step": 86 }, { "epoch": 0.010875, "grad_norm": 2.05486798286438, "grad_norm_var": 1.344934690323482, "learning_rate": 8.7e-05, "loss": 1.0828, "loss/crossentropy": 2.322002410888672, "loss/hidden": 0.9609375, "loss/logits": 0.12146371603012085, "loss/reg": 3.5710025258595124e-05, "step": 87 }, { "epoch": 0.011, "grad_norm": 3.25955867767334, "grad_norm_var": 1.3897383898525164, "learning_rate": 8.800000000000001e-05, "loss": 1.5155, "loss/crossentropy": 1.937675952911377, "loss/hidden": 1.3125, "loss/logits": 0.2026323676109314, "loss/reg": 3.570731496438384e-05, "step": 88 }, { "epoch": 0.011125, "grad_norm": 1.7932565212249756, "grad_norm_var": 1.4022136437703991, "learning_rate": 8.900000000000001e-05, "loss": 1.0228, "loss/crossentropy": 2.649721145629883, "loss/hidden": 0.91015625, "loss/logits": 0.11223678290843964, "loss/reg": 3.570578701328486e-05, "step": 89 }, { "epoch": 0.01125, "grad_norm": 1.6545979976654053, "grad_norm_var": 1.3965356378457594, "learning_rate": 9e-05, "loss": 1.0415, "loss/crossentropy": 2.546100616455078, "loss/hidden": 0.9140625, "loss/logits": 0.12706589698791504, "loss/reg": 3.570249828044325e-05, "step": 90 }, { "epoch": 0.011375, "grad_norm": 2.2377841472625732, "grad_norm_var": 0.3253247379631695, "learning_rate": 9.1e-05, "loss": 1.2172, "loss/crossentropy": 2.677785873413086, "loss/hidden": 1.078125, "loss/logits": 0.13871382176876068, "loss/reg": 3.5701228625839576e-05, "step": 91 }, { "epoch": 0.0115, "grad_norm": 2.016057014465332, "grad_norm_var": 0.3001321883475782, "learning_rate": 9.200000000000001e-05, "loss": 1.2553, "loss/crossentropy": 2.216202974319458, "loss/hidden": 1.09375, "loss/logits": 0.16114352643489838, "loss/reg": 3.569832188077271e-05, "step": 92 }, { "epoch": 0.011625, "grad_norm": 1.9337667226791382, "grad_norm_var": 0.28997562388856146, "learning_rate": 9.300000000000001e-05, "loss": 1.2073, "loss/crossentropy": 2.7274839878082275, "loss/hidden": 1.0546875, "loss/logits": 0.1522083729505539, "loss/reg": 3.5696477425517514e-05, "step": 93 }, { "epoch": 0.01175, "grad_norm": 2.1966376304626465, "grad_norm_var": 0.25874835102596966, "learning_rate": 9.4e-05, "loss": 1.2497, "loss/crossentropy": 2.0065174102783203, "loss/hidden": 1.109375, "loss/logits": 0.13999012112617493, "loss/reg": 3.56943673978094e-05, "step": 94 }, { "epoch": 0.011875, "grad_norm": 1.7162114381790161, "grad_norm_var": 0.2699080995908368, "learning_rate": 9.5e-05, "loss": 1.0104, "loss/crossentropy": 2.4001636505126953, "loss/hidden": 0.90234375, "loss/logits": 0.10766053199768066, "loss/reg": 3.5692111850949004e-05, "step": 95 }, { "epoch": 0.012, "grad_norm": 1.9596549272537231, "grad_norm_var": 0.2705912806447509, "learning_rate": 9.6e-05, "loss": 1.1288, "loss/crossentropy": 2.4599721431732178, "loss/hidden": 0.98828125, "loss/logits": 0.14019131660461426, "loss/reg": 3.5689983633346856e-05, "step": 96 }, { "epoch": 0.012125, "grad_norm": 1.9994059801101685, "grad_norm_var": 0.26931496222480145, "learning_rate": 9.7e-05, "loss": 1.1051, "loss/crossentropy": 2.8105618953704834, "loss/hidden": 0.9765625, "loss/logits": 0.12819884717464447, "loss/reg": 3.568677857401781e-05, "step": 97 }, { "epoch": 0.01225, "grad_norm": 2.0389626026153564, "grad_norm_var": 0.26938190348710545, "learning_rate": 9.8e-05, "loss": 1.4044, "loss/crossentropy": 2.10030198097229, "loss/hidden": 1.203125, "loss/logits": 0.20087596774101257, "loss/reg": 3.5685956390807405e-05, "step": 98 }, { "epoch": 0.012375, "grad_norm": 3.1797940731048584, "grad_norm_var": 0.31348186491538, "learning_rate": 9.900000000000001e-05, "loss": 1.5773, "loss/crossentropy": 2.7054786682128906, "loss/hidden": 1.359375, "loss/logits": 0.2175736129283905, "loss/reg": 3.5683315218193457e-05, "step": 99 }, { "epoch": 0.0125, "grad_norm": 2.2074787616729736, "grad_norm_var": 0.20694747633483127, "learning_rate": 0.0001, "loss": 1.2851, "loss/crossentropy": 2.2195346355438232, "loss/hidden": 1.125, "loss/logits": 0.15975427627563477, "loss/reg": 3.568131796782836e-05, "step": 100 }, { "epoch": 0.012625, "grad_norm": 2.0853495597839355, "grad_norm_var": 0.20706664538577282, "learning_rate": 0.0001, "loss": 1.1671, "loss/crossentropy": 2.8007214069366455, "loss/hidden": 1.015625, "loss/logits": 0.15108169615268707, "loss/reg": 3.5677723644766957e-05, "step": 101 }, { "epoch": 0.01275, "grad_norm": 2.261103630065918, "grad_norm_var": 0.20165261092997658, "learning_rate": 0.0001, "loss": 1.2524, "loss/crossentropy": 2.4267494678497314, "loss/hidden": 1.109375, "loss/logits": 0.1426696479320526, "loss/reg": 3.567594103515148e-05, "step": 102 }, { "epoch": 0.012875, "grad_norm": 1.7995065450668335, "grad_norm_var": 0.20938114766728447, "learning_rate": 0.0001, "loss": 1.0294, "loss/crossentropy": 2.477445602416992, "loss/hidden": 0.9140625, "loss/logits": 0.11502823233604431, "loss/reg": 3.5673674574354663e-05, "step": 103 }, { "epoch": 0.013, "grad_norm": 1.9462140798568726, "grad_norm_var": 0.12222182001865463, "learning_rate": 0.0001, "loss": 1.175, "loss/crossentropy": 2.4584763050079346, "loss/hidden": 1.0234375, "loss/logits": 0.15122900903224945, "loss/reg": 3.567052772268653e-05, "step": 104 }, { "epoch": 0.013125, "grad_norm": 5.390810489654541, "grad_norm_var": 0.8011994969266916, "learning_rate": 0.0001, "loss": 1.3528, "loss/crossentropy": 2.61253023147583, "loss/hidden": 1.1953125, "loss/logits": 0.15716172754764557, "loss/reg": 3.566941450117156e-05, "step": 105 }, { "epoch": 0.01325, "grad_norm": 2.104395866394043, "grad_norm_var": 0.7757998475018496, "learning_rate": 0.0001, "loss": 1.4737, "loss/crossentropy": 2.1339404582977295, "loss/hidden": 1.296875, "loss/logits": 0.17651526629924774, "loss/reg": 3.566693339962512e-05, "step": 106 }, { "epoch": 0.013375, "grad_norm": 1.8610461950302124, "grad_norm_var": 0.788653272883981, "learning_rate": 0.0001, "loss": 0.9407, "loss/crossentropy": 2.4770216941833496, "loss/hidden": 0.8359375, "loss/logits": 0.10441947728395462, "loss/reg": 3.566368104657158e-05, "step": 107 }, { "epoch": 0.0135, "grad_norm": 2.2065460681915283, "grad_norm_var": 0.7838738781084629, "learning_rate": 0.0001, "loss": 1.2355, "loss/crossentropy": 2.461944818496704, "loss/hidden": 1.0703125, "loss/logits": 0.16484174132347107, "loss/reg": 3.566055602277629e-05, "step": 108 }, { "epoch": 0.013625, "grad_norm": 2.1468284130096436, "grad_norm_var": 0.7761527810904186, "learning_rate": 0.0001, "loss": 1.1623, "loss/crossentropy": 2.2069454193115234, "loss/hidden": 1.0234375, "loss/logits": 0.13850779831409454, "loss/reg": 3.565785300452262e-05, "step": 109 }, { "epoch": 0.01375, "grad_norm": 2.383087635040283, "grad_norm_var": 0.7752898762699504, "learning_rate": 0.0001, "loss": 1.1793, "loss/crossentropy": 2.4479548931121826, "loss/hidden": 1.0390625, "loss/logits": 0.139842689037323, "loss/reg": 3.565509177860804e-05, "step": 110 }, { "epoch": 0.013875, "grad_norm": 2.9489004611968994, "grad_norm_var": 0.7693129207579057, "learning_rate": 0.0001, "loss": 1.2705, "loss/crossentropy": 2.3081014156341553, "loss/hidden": 1.1171875, "loss/logits": 0.15295040607452393, "loss/reg": 3.5651082725962624e-05, "step": 111 }, { "epoch": 0.014, "grad_norm": 3.968780755996704, "grad_norm_var": 0.9016446173616401, "learning_rate": 0.0001, "loss": 1.4396, "loss/crossentropy": 2.420243740081787, "loss/hidden": 1.203125, "loss/logits": 0.23613759875297546, "loss/reg": 3.56451710104011e-05, "step": 112 }, { "epoch": 0.014125, "grad_norm": 1.9860399961471558, "grad_norm_var": 0.9026067410203076, "learning_rate": 0.0001, "loss": 1.1773, "loss/crossentropy": 2.477583169937134, "loss/hidden": 1.03125, "loss/logits": 0.14566099643707275, "loss/reg": 3.56405544152949e-05, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.120425224304199, "grad_norm_var": 0.8976643536437109, "learning_rate": 0.0001, "loss": 1.2152, "loss/crossentropy": 3.030984878540039, "loss/hidden": 1.0625, "loss/logits": 0.15236616134643555, "loss/reg": 3.563678910722956e-05, "step": 114 }, { "epoch": 0.014375, "grad_norm": 2.0870068073272705, "grad_norm_var": 0.8786817926388901, "learning_rate": 0.0001, "loss": 1.2595, "loss/crossentropy": 2.4538302421569824, "loss/hidden": 1.109375, "loss/logits": 0.1497730016708374, "loss/reg": 3.5631266655400395e-05, "step": 115 }, { "epoch": 0.0145, "grad_norm": 3.1647448539733887, "grad_norm_var": 0.9025786275056549, "learning_rate": 0.0001, "loss": 1.6928, "loss/crossentropy": 2.205573320388794, "loss/hidden": 1.3984375, "loss/logits": 0.29398053884506226, "loss/reg": 3.562564597814344e-05, "step": 116 }, { "epoch": 0.014625, "grad_norm": 1.8475593328475952, "grad_norm_var": 0.9201723703583595, "learning_rate": 0.0001, "loss": 1.1518, "loss/crossentropy": 2.5012142658233643, "loss/hidden": 1.0078125, "loss/logits": 0.14361616969108582, "loss/reg": 3.562155688996427e-05, "step": 117 }, { "epoch": 0.01475, "grad_norm": 1.858892798423767, "grad_norm_var": 0.9438422080188066, "learning_rate": 0.0001, "loss": 1.1388, "loss/crossentropy": 2.5725672245025635, "loss/hidden": 1.0078125, "loss/logits": 0.1306220442056656, "loss/reg": 3.561788616934791e-05, "step": 118 }, { "epoch": 0.014875, "grad_norm": 2.2440059185028076, "grad_norm_var": 0.9153389246133348, "learning_rate": 0.0001, "loss": 1.4255, "loss/crossentropy": 2.341083288192749, "loss/hidden": 1.2421875, "loss/logits": 0.1829112321138382, "loss/reg": 3.561256380635314e-05, "step": 119 }, { "epoch": 0.015, "grad_norm": 2.1901657581329346, "grad_norm_var": 0.9005062112002877, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 2.362614631652832, "loss/hidden": 1.0703125, "loss/logits": 0.16847620904445648, "loss/reg": 3.56065938831307e-05, "step": 120 }, { "epoch": 0.015125, "grad_norm": 5.200242519378662, "grad_norm_var": 0.830131887163558, "learning_rate": 0.0001, "loss": 1.2103, "loss/crossentropy": 2.6675992012023926, "loss/hidden": 1.0546875, "loss/logits": 0.15525725483894348, "loss/reg": 3.5602170100901276e-05, "step": 121 }, { "epoch": 0.01525, "grad_norm": 2.407500982284546, "grad_norm_var": 0.8190810626824183, "learning_rate": 0.0001, "loss": 1.2234, "loss/crossentropy": 2.56121563911438, "loss/hidden": 1.0703125, "loss/logits": 0.15273353457450867, "loss/reg": 3.559728429536335e-05, "step": 122 }, { "epoch": 0.015375, "grad_norm": 1.8797663450241089, "grad_norm_var": 0.8174111264801723, "learning_rate": 0.0001, "loss": 1.1099, "loss/crossentropy": 2.4745869636535645, "loss/hidden": 0.98046875, "loss/logits": 0.12910515069961548, "loss/reg": 3.559128526831046e-05, "step": 123 }, { "epoch": 0.0155, "grad_norm": 2.1494781970977783, "grad_norm_var": 0.8201521751832492, "learning_rate": 0.0001, "loss": 1.3179, "loss/crossentropy": 2.279508590698242, "loss/hidden": 1.1484375, "loss/logits": 0.1690721958875656, "loss/reg": 3.558437674655579e-05, "step": 124 }, { "epoch": 0.015625, "grad_norm": 9.331904411315918, "grad_norm_var": 3.67345953379431, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.252732038497925, "loss/hidden": 1.2578125, "loss/logits": 0.14117392897605896, "loss/reg": 3.5579581890488043e-05, "step": 125 }, { "epoch": 0.01575, "grad_norm": 4.699957847595215, "grad_norm_var": 3.8228479802693203, "learning_rate": 0.0001, "loss": 1.2831, "loss/crossentropy": 2.5877878665924072, "loss/hidden": 1.15625, "loss/logits": 0.12651070952415466, "loss/reg": 3.557529635145329e-05, "step": 126 }, { "epoch": 0.015875, "grad_norm": 1.8446160554885864, "grad_norm_var": 3.9257773899168873, "learning_rate": 0.0001, "loss": 1.2999, "loss/crossentropy": 2.2898948192596436, "loss/hidden": 1.140625, "loss/logits": 0.15888020396232605, "loss/reg": 3.557029049261473e-05, "step": 127 }, { "epoch": 0.016, "grad_norm": 1.873570203781128, "grad_norm_var": 3.9466365178432232, "learning_rate": 0.0001, "loss": 1.1097, "loss/crossentropy": 2.392472743988037, "loss/hidden": 0.9765625, "loss/logits": 0.13277310132980347, "loss/reg": 3.556452429620549e-05, "step": 128 }, { "epoch": 0.016125, "grad_norm": 2.215426445007324, "grad_norm_var": 3.92104303267346, "learning_rate": 0.0001, "loss": 1.2203, "loss/crossentropy": 2.457443952560425, "loss/hidden": 1.0546875, "loss/logits": 0.16524553298950195, "loss/reg": 3.555676812538877e-05, "step": 129 }, { "epoch": 0.01625, "grad_norm": 1.7194857597351074, "grad_norm_var": 3.9751548455277104, "learning_rate": 0.0001, "loss": 1.1788, "loss/crossentropy": 2.3001158237457275, "loss/hidden": 1.03125, "loss/logits": 0.1472093164920807, "loss/reg": 3.5550358006730676e-05, "step": 130 }, { "epoch": 0.016375, "grad_norm": 1.6397395133972168, "grad_norm_var": 4.037312774164259, "learning_rate": 0.0001, "loss": 1.0336, "loss/crossentropy": 2.54146146774292, "loss/hidden": 0.91015625, "loss/logits": 0.12307839095592499, "loss/reg": 3.5546618164516985e-05, "step": 131 }, { "epoch": 0.0165, "grad_norm": 1.9748913049697876, "grad_norm_var": 4.082478037296671, "learning_rate": 0.0001, "loss": 1.104, "loss/crossentropy": 2.562748670578003, "loss/hidden": 0.97265625, "loss/logits": 0.130945086479187, "loss/reg": 3.5543002013582736e-05, "step": 132 }, { "epoch": 0.016625, "grad_norm": 1.8674203157424927, "grad_norm_var": 4.079934623823218, "learning_rate": 0.0001, "loss": 1.2082, "loss/crossentropy": 2.6365339756011963, "loss/hidden": 1.0625, "loss/logits": 0.14538231492042542, "loss/reg": 3.5538523661671206e-05, "step": 133 }, { "epoch": 0.01675, "grad_norm": 2.3201568126678467, "grad_norm_var": 4.03421067719521, "learning_rate": 0.0001, "loss": 1.2356, "loss/crossentropy": 2.435370445251465, "loss/hidden": 1.09375, "loss/logits": 0.14145305752754211, "loss/reg": 3.5533634218154475e-05, "step": 134 }, { "epoch": 0.016875, "grad_norm": 2.4132328033447266, "grad_norm_var": 4.022385903408254, "learning_rate": 0.0001, "loss": 1.3273, "loss/crossentropy": 2.206634998321533, "loss/hidden": 1.1796875, "loss/logits": 0.14724516868591309, "loss/reg": 3.552551061147824e-05, "step": 135 }, { "epoch": 0.017, "grad_norm": 2.419842481613159, "grad_norm_var": 4.005232252864962, "learning_rate": 0.0001, "loss": 1.1667, "loss/crossentropy": 2.517561912536621, "loss/hidden": 1.015625, "loss/logits": 0.15071895718574524, "loss/reg": 3.5521599784260616e-05, "step": 136 }, { "epoch": 0.017125, "grad_norm": 2.716203212738037, "grad_norm_var": 3.6198676372845124, "learning_rate": 0.0001, "loss": 1.1423, "loss/crossentropy": 2.2819504737854004, "loss/hidden": 1.015625, "loss/logits": 0.12632890045642853, "loss/reg": 3.551522604539059e-05, "step": 137 }, { "epoch": 0.01725, "grad_norm": 2.166456460952759, "grad_norm_var": 3.6334485092224402, "learning_rate": 0.0001, "loss": 1.0296, "loss/crossentropy": 2.4271674156188965, "loss/hidden": 0.91015625, "loss/logits": 0.11908704042434692, "loss/reg": 3.550978362909518e-05, "step": 138 }, { "epoch": 0.017375, "grad_norm": 2.0737109184265137, "grad_norm_var": 3.6145368084521117, "learning_rate": 0.0001, "loss": 1.0938, "loss/crossentropy": 2.595165967941284, "loss/hidden": 0.95703125, "loss/logits": 0.13645675778388977, "loss/reg": 3.550490873749368e-05, "step": 139 }, { "epoch": 0.0175, "grad_norm": 2.0036461353302, "grad_norm_var": 3.6268452557090196, "learning_rate": 0.0001, "loss": 1.0352, "loss/crossentropy": 2.6407668590545654, "loss/hidden": 0.91015625, "loss/logits": 0.12472639232873917, "loss/reg": 3.5499935620464385e-05, "step": 140 }, { "epoch": 0.017625, "grad_norm": 3.4189512729644775, "grad_norm_var": 0.5874364952131912, "learning_rate": 0.0001, "loss": 1.3364, "loss/crossentropy": 2.8673250675201416, "loss/hidden": 1.140625, "loss/logits": 0.19542476534843445, "loss/reg": 3.549545363057405e-05, "step": 141 }, { "epoch": 0.01775, "grad_norm": 2.0884158611297607, "grad_norm_var": 0.19036343785417903, "learning_rate": 0.0001, "loss": 1.1356, "loss/crossentropy": 2.2495715618133545, "loss/hidden": 1.0078125, "loss/logits": 0.12745517492294312, "loss/reg": 3.5490164009388536e-05, "step": 142 }, { "epoch": 0.017875, "grad_norm": 2.4939138889312744, "grad_norm_var": 0.1883496681179942, "learning_rate": 0.0001, "loss": 1.2224, "loss/crossentropy": 2.3506898880004883, "loss/hidden": 1.078125, "loss/logits": 0.14388948678970337, "loss/reg": 3.548476524883881e-05, "step": 143 }, { "epoch": 0.018, "grad_norm": 2.634059190750122, "grad_norm_var": 0.19009706439956606, "learning_rate": 0.0001, "loss": 1.308, "loss/crossentropy": 2.422675371170044, "loss/hidden": 1.1484375, "loss/logits": 0.15922774374485016, "loss/reg": 3.5479293728712946e-05, "step": 144 }, { "epoch": 0.018125, "grad_norm": 2.9936301708221436, "grad_norm_var": 0.22328614777820613, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.5935611724853516, "loss/hidden": 1.2109375, "loss/logits": 0.19912970066070557, "loss/reg": 3.547423330019228e-05, "step": 145 }, { "epoch": 0.01825, "grad_norm": 3.1390833854675293, "grad_norm_var": 0.23765955297996205, "learning_rate": 0.0001, "loss": 1.2747, "loss/crossentropy": 2.4289345741271973, "loss/hidden": 1.125, "loss/logits": 0.14936049282550812, "loss/reg": 3.54700350726489e-05, "step": 146 }, { "epoch": 0.018375, "grad_norm": 2.4484870433807373, "grad_norm_var": 0.19680489618343674, "learning_rate": 0.0001, "loss": 1.1176, "loss/crossentropy": 2.519469976425171, "loss/hidden": 0.98046875, "loss/logits": 0.13677741587162018, "loss/reg": 3.546685184119269e-05, "step": 147 }, { "epoch": 0.0185, "grad_norm": 1.876994252204895, "grad_norm_var": 0.20358269116957192, "learning_rate": 0.0001, "loss": 1.0701, "loss/crossentropy": 2.669678211212158, "loss/hidden": 0.9453125, "loss/logits": 0.12444234639406204, "loss/reg": 3.5462882806314155e-05, "step": 148 }, { "epoch": 0.018625, "grad_norm": 2.704676628112793, "grad_norm_var": 0.1832369663952557, "learning_rate": 0.0001, "loss": 1.2437, "loss/crossentropy": 2.595327854156494, "loss/hidden": 1.0625, "loss/logits": 0.18086357414722443, "loss/reg": 3.5459666833048686e-05, "step": 149 }, { "epoch": 0.01875, "grad_norm": 1.9055484533309937, "grad_norm_var": 0.20361674389210194, "learning_rate": 0.0001, "loss": 1.161, "loss/crossentropy": 2.5460128784179688, "loss/hidden": 1.015625, "loss/logits": 0.1449938714504242, "loss/reg": 3.545805884641595e-05, "step": 150 }, { "epoch": 0.018875, "grad_norm": 2.7920961380004883, "grad_norm_var": 0.20979331401592252, "learning_rate": 0.0001, "loss": 1.1624, "loss/crossentropy": 2.2290139198303223, "loss/hidden": 1.0078125, "loss/logits": 0.15427884459495544, "loss/reg": 3.545627259882167e-05, "step": 151 }, { "epoch": 0.019, "grad_norm": 2.299669027328491, "grad_norm_var": 0.21185582767360506, "learning_rate": 0.0001, "loss": 1.1007, "loss/crossentropy": 2.5660064220428467, "loss/hidden": 0.9609375, "loss/logits": 0.13937082886695862, "loss/reg": 3.5451499570626765e-05, "step": 152 }, { "epoch": 0.019125, "grad_norm": 1.9452663660049438, "grad_norm_var": 0.2252079205413636, "learning_rate": 0.0001, "loss": 1.1951, "loss/crossentropy": 2.3395628929138184, "loss/hidden": 1.0390625, "loss/logits": 0.155635803937912, "loss/reg": 3.544955688994378e-05, "step": 153 }, { "epoch": 0.01925, "grad_norm": 1.8410539627075195, "grad_norm_var": 0.24354386471799874, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.5739612579345703, "loss/hidden": 0.95703125, "loss/logits": 0.13335567712783813, "loss/reg": 3.544604260241613e-05, "step": 154 }, { "epoch": 0.019375, "grad_norm": 2.2286977767944336, "grad_norm_var": 0.23796766155861107, "learning_rate": 0.0001, "loss": 1.0203, "loss/crossentropy": 2.2887346744537354, "loss/hidden": 0.90625, "loss/logits": 0.11365014314651489, "loss/reg": 3.544157516444102e-05, "step": 155 }, { "epoch": 0.0195, "grad_norm": 2.3557684421539307, "grad_norm_var": 0.22589299419968203, "learning_rate": 0.0001, "loss": 1.3643, "loss/crossentropy": 2.274764060974121, "loss/hidden": 1.1875, "loss/logits": 0.1764501929283142, "loss/reg": 3.543913771864027e-05, "step": 156 }, { "epoch": 0.019625, "grad_norm": 2.247559070587158, "grad_norm_var": 0.15998786264861256, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.2784736156463623, "loss/hidden": 1.171875, "loss/logits": 0.19315966963768005, "loss/reg": 3.543505954439752e-05, "step": 157 }, { "epoch": 0.01975, "grad_norm": 2.834188222885132, "grad_norm_var": 0.16628359109999918, "learning_rate": 0.0001, "loss": 1.3386, "loss/crossentropy": 2.509218454360962, "loss/hidden": 1.15625, "loss/logits": 0.18198764324188232, "loss/reg": 3.54316653101705e-05, "step": 158 }, { "epoch": 0.019875, "grad_norm": 2.1036031246185303, "grad_norm_var": 0.17202571468130015, "learning_rate": 0.0001, "loss": 1.0503, "loss/crossentropy": 2.4518606662750244, "loss/hidden": 0.93359375, "loss/logits": 0.11635103076696396, "loss/reg": 3.542845297488384e-05, "step": 159 }, { "epoch": 0.02, "grad_norm": 2.062805652618408, "grad_norm_var": 0.17435755134033895, "learning_rate": 0.0001, "loss": 1.0312, "loss/crossentropy": 2.361372470855713, "loss/hidden": 0.91796875, "loss/logits": 0.1128474771976471, "loss/reg": 3.5423294320935383e-05, "step": 160 }, { "epoch": 0.020125, "grad_norm": 2.021106004714966, "grad_norm_var": 0.15146251895304388, "learning_rate": 0.0001, "loss": 1.1121, "loss/crossentropy": 2.134568214416504, "loss/hidden": 0.984375, "loss/logits": 0.1273871660232544, "loss/reg": 3.541701880749315e-05, "step": 161 }, { "epoch": 0.02025, "grad_norm": 1.8616065979003906, "grad_norm_var": 0.11060822886564707, "learning_rate": 0.0001, "loss": 1.1566, "loss/crossentropy": 2.252749443054199, "loss/hidden": 1.0234375, "loss/logits": 0.1327974796295166, "loss/reg": 3.541166370268911e-05, "step": 162 }, { "epoch": 0.020375, "grad_norm": 3.580717086791992, "grad_norm_var": 0.2251369893581473, "learning_rate": 0.0001, "loss": 1.7797, "loss/crossentropy": 2.554025888442993, "loss/hidden": 1.4921875, "loss/logits": 0.2871723771095276, "loss/reg": 3.54056573996786e-05, "step": 163 }, { "epoch": 0.0205, "grad_norm": 1.9080392122268677, "grad_norm_var": 0.22348213477058507, "learning_rate": 0.0001, "loss": 1.1281, "loss/crossentropy": 2.561861038208008, "loss/hidden": 0.984375, "loss/logits": 0.14337776601314545, "loss/reg": 3.53991927113384e-05, "step": 164 }, { "epoch": 0.020625, "grad_norm": 2.122875213623047, "grad_norm_var": 0.2127240754841674, "learning_rate": 0.0001, "loss": 1.1555, "loss/crossentropy": 2.5679633617401123, "loss/hidden": 1.0, "loss/logits": 0.15509843826293945, "loss/reg": 3.539249883033335e-05, "step": 165 }, { "epoch": 0.02075, "grad_norm": 2.3634352684020996, "grad_norm_var": 0.2043765165354652, "learning_rate": 0.0001, "loss": 1.2497, "loss/crossentropy": 2.691157579421997, "loss/hidden": 1.09375, "loss/logits": 0.15564362704753876, "loss/reg": 3.5388431570027024e-05, "step": 166 }, { "epoch": 0.020875, "grad_norm": 1.8694658279418945, "grad_norm_var": 0.1952630533114321, "learning_rate": 0.0001, "loss": 1.193, "loss/crossentropy": 2.585261106491089, "loss/hidden": 1.046875, "loss/logits": 0.14572536945343018, "loss/reg": 3.5384666261961684e-05, "step": 167 }, { "epoch": 0.021, "grad_norm": 1.9730387926101685, "grad_norm_var": 0.19880394057846942, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.5292489528656006, "loss/hidden": 1.125, "loss/logits": 0.16588369011878967, "loss/reg": 3.538179225870408e-05, "step": 168 }, { "epoch": 0.021125, "grad_norm": 2.0155906677246094, "grad_norm_var": 0.1966546350588845, "learning_rate": 0.0001, "loss": 1.2574, "loss/crossentropy": 2.252187728881836, "loss/hidden": 1.1015625, "loss/logits": 0.1554747223854065, "loss/reg": 3.537629891070537e-05, "step": 169 }, { "epoch": 0.02125, "grad_norm": 2.432105302810669, "grad_norm_var": 0.18926746622633459, "learning_rate": 0.0001, "loss": 1.4755, "loss/crossentropy": 2.1912407875061035, "loss/hidden": 1.3125, "loss/logits": 0.1626225709915161, "loss/reg": 3.5371955164009705e-05, "step": 170 }, { "epoch": 0.021375, "grad_norm": 2.286074161529541, "grad_norm_var": 0.18931952814725433, "learning_rate": 0.0001, "loss": 1.1264, "loss/crossentropy": 2.7250216007232666, "loss/hidden": 0.9921875, "loss/logits": 0.13382771611213684, "loss/reg": 3.53686700691469e-05, "step": 171 }, { "epoch": 0.0215, "grad_norm": 2.3155455589294434, "grad_norm_var": 0.18886613419825055, "learning_rate": 0.0001, "loss": 1.1029, "loss/crossentropy": 2.54166316986084, "loss/hidden": 0.96875, "loss/logits": 0.13384276628494263, "loss/reg": 3.536650910973549e-05, "step": 172 }, { "epoch": 0.021625, "grad_norm": 2.488759994506836, "grad_norm_var": 0.19242826239165894, "learning_rate": 0.0001, "loss": 1.2388, "loss/crossentropy": 2.368736743927002, "loss/hidden": 1.0859375, "loss/logits": 0.1524895578622818, "loss/reg": 3.5365450457902625e-05, "step": 173 }, { "epoch": 0.02175, "grad_norm": 2.2716481685638428, "grad_norm_var": 0.16950942206230835, "learning_rate": 0.0001, "loss": 1.1404, "loss/crossentropy": 2.602968454360962, "loss/hidden": 1.0, "loss/logits": 0.14000558853149414, "loss/reg": 3.5365239455131814e-05, "step": 174 }, { "epoch": 0.021875, "grad_norm": 2.346731424331665, "grad_norm_var": 0.16911372185244672, "learning_rate": 0.0001, "loss": 1.1505, "loss/crossentropy": 2.6104869842529297, "loss/hidden": 1.015625, "loss/logits": 0.1345091462135315, "loss/reg": 3.536231452017091e-05, "step": 175 }, { "epoch": 0.022, "grad_norm": 9.636815071105957, "grad_norm_var": 3.5705013839433035, "learning_rate": 0.0001, "loss": 1.9711, "loss/crossentropy": 1.9007188081741333, "loss/hidden": 1.8359375, "loss/logits": 0.13476577401161194, "loss/reg": 3.535941868904047e-05, "step": 176 }, { "epoch": 0.022125, "grad_norm": 1.9420382976531982, "grad_norm_var": 3.578242683123464, "learning_rate": 0.0001, "loss": 1.0553, "loss/crossentropy": 2.1399552822113037, "loss/hidden": 0.93359375, "loss/logits": 0.12134355306625366, "loss/reg": 3.535431460477412e-05, "step": 177 }, { "epoch": 0.02225, "grad_norm": 3.67820405960083, "grad_norm_var": 3.5781775866024454, "learning_rate": 0.0001, "loss": 1.6061, "loss/crossentropy": 2.7716376781463623, "loss/hidden": 1.3671875, "loss/logits": 0.2385806441307068, "loss/reg": 3.5349476092960685e-05, "step": 178 }, { "epoch": 0.022375, "grad_norm": 2.345334768295288, "grad_norm_var": 3.5494032480633924, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.2584104537963867, "loss/hidden": 1.1484375, "loss/logits": 0.16456623375415802, "loss/reg": 3.534728966769762e-05, "step": 179 }, { "epoch": 0.0225, "grad_norm": 2.019059181213379, "grad_norm_var": 3.5377143028114526, "learning_rate": 0.0001, "loss": 1.1633, "loss/crossentropy": 2.688572645187378, "loss/hidden": 1.03125, "loss/logits": 0.13166731595993042, "loss/reg": 3.5341858165338635e-05, "step": 180 }, { "epoch": 0.022625, "grad_norm": 2.5575642585754395, "grad_norm_var": 3.5127901367513408, "learning_rate": 0.0001, "loss": 1.3238, "loss/crossentropy": 3.2461724281311035, "loss/hidden": 1.1328125, "loss/logits": 0.19062137603759766, "loss/reg": 3.534007555572316e-05, "step": 181 }, { "epoch": 0.02275, "grad_norm": 2.1583099365234375, "grad_norm_var": 3.52691794996746, "learning_rate": 0.0001, "loss": 1.105, "loss/crossentropy": 2.570775270462036, "loss/hidden": 0.97265625, "loss/logits": 0.13197766244411469, "loss/reg": 3.5337754525244236e-05, "step": 182 }, { "epoch": 0.022875, "grad_norm": 2.1373021602630615, "grad_norm_var": 3.499205684128989, "learning_rate": 0.0001, "loss": 1.0368, "loss/crossentropy": 2.7369937896728516, "loss/hidden": 0.91015625, "loss/logits": 0.12632793188095093, "loss/reg": 3.5334065614733845e-05, "step": 183 }, { "epoch": 0.023, "grad_norm": 1.9534015655517578, "grad_norm_var": 3.501362961216583, "learning_rate": 0.0001, "loss": 1.2631, "loss/crossentropy": 2.348998546600342, "loss/hidden": 1.09375, "loss/logits": 0.16902770102024078, "loss/reg": 3.533027120283805e-05, "step": 184 }, { "epoch": 0.023125, "grad_norm": 3.3518424034118652, "grad_norm_var": 3.47560508461983, "learning_rate": 0.0001, "loss": 1.4757, "loss/crossentropy": 2.2752151489257812, "loss/hidden": 1.3125, "loss/logits": 0.16288352012634277, "loss/reg": 3.532712798914872e-05, "step": 185 }, { "epoch": 0.02325, "grad_norm": 1.9095062017440796, "grad_norm_var": 3.5231901050491348, "learning_rate": 0.0001, "loss": 1.1564, "loss/crossentropy": 2.3877570629119873, "loss/hidden": 0.9921875, "loss/logits": 0.16389842331409454, "loss/reg": 3.531980837578885e-05, "step": 186 }, { "epoch": 0.023375, "grad_norm": 1.7745263576507568, "grad_norm_var": 3.5771479932902293, "learning_rate": 0.0001, "loss": 1.0993, "loss/crossentropy": 2.5461585521698, "loss/hidden": 0.96484375, "loss/logits": 0.13407567143440247, "loss/reg": 3.531064066919498e-05, "step": 187 }, { "epoch": 0.0235, "grad_norm": 1.932446002960205, "grad_norm_var": 3.611343163184297, "learning_rate": 0.0001, "loss": 1.1479, "loss/crossentropy": 2.177215099334717, "loss/hidden": 1.0078125, "loss/logits": 0.13969676196575165, "loss/reg": 3.530231333570555e-05, "step": 188 }, { "epoch": 0.023625, "grad_norm": 2.2318572998046875, "grad_norm_var": 3.6254944343577464, "learning_rate": 0.0001, "loss": 1.1476, "loss/crossentropy": 2.539461374282837, "loss/hidden": 1.0078125, "loss/logits": 0.13939380645751953, "loss/reg": 3.5298002330819145e-05, "step": 189 }, { "epoch": 0.02375, "grad_norm": 1.8733116388320923, "grad_norm_var": 3.661635973864308, "learning_rate": 0.0001, "loss": 1.2894, "loss/crossentropy": 2.3773810863494873, "loss/hidden": 1.1484375, "loss/logits": 0.14065586030483246, "loss/reg": 3.529394234647043e-05, "step": 190 }, { "epoch": 0.023875, "grad_norm": 1.9819684028625488, "grad_norm_var": 3.689103451615234, "learning_rate": 0.0001, "loss": 1.0824, "loss/crossentropy": 2.652743101119995, "loss/hidden": 0.95703125, "loss/logits": 0.12501415610313416, "loss/reg": 3.528552406351082e-05, "step": 191 }, { "epoch": 0.024, "grad_norm": 1.781873345375061, "grad_norm_var": 0.29881303206394033, "learning_rate": 0.0001, "loss": 1.3252, "loss/crossentropy": 2.130110740661621, "loss/hidden": 1.1640625, "loss/logits": 0.16076770424842834, "loss/reg": 3.527875742292963e-05, "step": 192 }, { "epoch": 0.024125, "grad_norm": 1.968166470527649, "grad_norm_var": 0.29786371458498306, "learning_rate": 0.0001, "loss": 1.1787, "loss/crossentropy": 2.607984781265259, "loss/hidden": 1.0234375, "loss/logits": 0.15494059026241302, "loss/reg": 3.5275123082101345e-05, "step": 193 }, { "epoch": 0.02425, "grad_norm": 2.250447988510132, "grad_norm_var": 0.14927689793780866, "learning_rate": 0.0001, "loss": 1.2501, "loss/crossentropy": 2.381725549697876, "loss/hidden": 1.109375, "loss/logits": 0.1403769701719284, "loss/reg": 3.5266541090095416e-05, "step": 194 }, { "epoch": 0.024375, "grad_norm": 2.3107409477233887, "grad_norm_var": 0.14840081385512557, "learning_rate": 0.0001, "loss": 1.308, "loss/crossentropy": 2.5593056678771973, "loss/hidden": 1.15625, "loss/logits": 0.15144123136997223, "loss/reg": 3.526056025293656e-05, "step": 195 }, { "epoch": 0.0245, "grad_norm": 2.0219268798828125, "grad_norm_var": 0.14835622425891018, "learning_rate": 0.0001, "loss": 1.2059, "loss/crossentropy": 2.591111421585083, "loss/hidden": 1.0546875, "loss/logits": 0.1508275270462036, "loss/reg": 3.525464853737503e-05, "step": 196 }, { "epoch": 0.024625, "grad_norm": 1.7184540033340454, "grad_norm_var": 0.14533186557784786, "learning_rate": 0.0001, "loss": 1.1103, "loss/crossentropy": 2.5513174533843994, "loss/hidden": 0.98046875, "loss/logits": 0.12951934337615967, "loss/reg": 3.5250719520263374e-05, "step": 197 }, { "epoch": 0.02475, "grad_norm": 2.099649429321289, "grad_norm_var": 0.14497162965532903, "learning_rate": 0.0001, "loss": 1.3992, "loss/crossentropy": 2.3214898109436035, "loss/hidden": 1.2109375, "loss/logits": 0.1879514902830124, "loss/reg": 3.524802013998851e-05, "step": 198 }, { "epoch": 0.024875, "grad_norm": 2.551090717315674, "grad_norm_var": 0.15877433194770507, "learning_rate": 0.0001, "loss": 1.1497, "loss/crossentropy": 2.0819451808929443, "loss/hidden": 1.0, "loss/logits": 0.14934971928596497, "loss/reg": 3.524927524267696e-05, "step": 199 }, { "epoch": 0.025, "grad_norm": 2.41422438621521, "grad_norm_var": 0.1626121663513837, "learning_rate": 0.0001, "loss": 1.1887, "loss/crossentropy": 2.4138712882995605, "loss/hidden": 1.03125, "loss/logits": 0.15712395310401917, "loss/reg": 3.5251006920589134e-05, "step": 200 }, { "epoch": 0.025125, "grad_norm": 3.0029654502868652, "grad_norm_var": 0.11365057463783608, "learning_rate": 0.0001, "loss": 1.2376, "loss/crossentropy": 2.3929851055145264, "loss/hidden": 1.109375, "loss/logits": 0.12789341807365417, "loss/reg": 3.525133433868177e-05, "step": 201 }, { "epoch": 0.02525, "grad_norm": 2.318742513656616, "grad_norm_var": 0.1129624302912769, "learning_rate": 0.0001, "loss": 1.1832, "loss/crossentropy": 2.4472897052764893, "loss/hidden": 1.0234375, "loss/logits": 0.15940618515014648, "loss/reg": 3.525337888277136e-05, "step": 202 }, { "epoch": 0.025375, "grad_norm": 2.432077169418335, "grad_norm_var": 0.1079851047719283, "learning_rate": 0.0001, "loss": 1.2983, "loss/crossentropy": 2.5350871086120605, "loss/hidden": 1.125, "loss/logits": 0.17297999560832977, "loss/reg": 3.5248252970632166e-05, "step": 203 }, { "epoch": 0.0255, "grad_norm": 2.019660711288452, "grad_norm_var": 0.10557456561180795, "learning_rate": 0.0001, "loss": 1.1381, "loss/crossentropy": 2.443376064300537, "loss/hidden": 1.0078125, "loss/logits": 0.1299603134393692, "loss/reg": 3.5241089790361e-05, "step": 204 }, { "epoch": 0.025625, "grad_norm": 1.928032636642456, "grad_norm_var": 0.10948915785116071, "learning_rate": 0.0001, "loss": 1.039, "loss/crossentropy": 2.552450656890869, "loss/hidden": 0.9140625, "loss/logits": 0.12461342662572861, "loss/reg": 3.523936538840644e-05, "step": 205 }, { "epoch": 0.02575, "grad_norm": 1.872836947441101, "grad_norm_var": 0.10950776538443824, "learning_rate": 0.0001, "loss": 1.203, "loss/crossentropy": 2.563770294189453, "loss/hidden": 1.046875, "loss/logits": 0.1558125615119934, "loss/reg": 3.523280975059606e-05, "step": 206 }, { "epoch": 0.025875, "grad_norm": 2.722188711166382, "grad_norm_var": 0.12548596824484168, "learning_rate": 0.0001, "loss": 1.1048, "loss/crossentropy": 2.673701286315918, "loss/hidden": 0.95703125, "loss/logits": 0.14743700623512268, "loss/reg": 3.523009945638478e-05, "step": 207 }, { "epoch": 0.026, "grad_norm": 2.174853563308716, "grad_norm_var": 0.1125315287945383, "learning_rate": 0.0001, "loss": 1.2005, "loss/crossentropy": 2.776305913925171, "loss/hidden": 1.0390625, "loss/logits": 0.16110967099666595, "loss/reg": 3.5228087654104456e-05, "step": 208 }, { "epoch": 0.026125, "grad_norm": 2.026176691055298, "grad_norm_var": 0.11065571110427162, "learning_rate": 0.0001, "loss": 1.2427, "loss/crossentropy": 2.4790477752685547, "loss/hidden": 1.0859375, "loss/logits": 0.15644526481628418, "loss/reg": 3.5227625630795956e-05, "step": 209 }, { "epoch": 0.02625, "grad_norm": 1.8512555360794067, "grad_norm_var": 0.12013934057968918, "learning_rate": 0.0001, "loss": 1.1542, "loss/crossentropy": 2.2648251056671143, "loss/hidden": 1.015625, "loss/logits": 0.1382524073123932, "loss/reg": 3.5229088098276407e-05, "step": 210 }, { "epoch": 0.026375, "grad_norm": 2.1204733848571777, "grad_norm_var": 0.12001253969897234, "learning_rate": 0.0001, "loss": 1.1644, "loss/crossentropy": 2.5878171920776367, "loss/hidden": 1.0078125, "loss/logits": 0.15627792477607727, "loss/reg": 3.523128543747589e-05, "step": 211 }, { "epoch": 0.0265, "grad_norm": 2.3862364292144775, "grad_norm_var": 0.11943129282008957, "learning_rate": 0.0001, "loss": 1.2387, "loss/crossentropy": 2.3441061973571777, "loss/hidden": 1.0859375, "loss/logits": 0.1524021029472351, "loss/reg": 3.522722909110598e-05, "step": 212 }, { "epoch": 0.026625, "grad_norm": 2.4904723167419434, "grad_norm_var": 0.10428997507238075, "learning_rate": 0.0001, "loss": 1.2116, "loss/crossentropy": 2.5467946529388428, "loss/hidden": 1.0546875, "loss/logits": 0.15654009580612183, "loss/reg": 3.522184488247149e-05, "step": 213 }, { "epoch": 0.02675, "grad_norm": 2.1831283569335938, "grad_norm_var": 0.1027661689763948, "learning_rate": 0.0001, "loss": 1.1544, "loss/crossentropy": 2.782853126525879, "loss/hidden": 0.98828125, "loss/logits": 0.16573229432106018, "loss/reg": 3.521531107253395e-05, "step": 214 }, { "epoch": 0.026875, "grad_norm": 3.787935972213745, "grad_norm_var": 0.24293552641352203, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.463303804397583, "loss/hidden": 1.28125, "loss/logits": 0.22189679741859436, "loss/reg": 3.5209486668463796e-05, "step": 215 }, { "epoch": 0.027, "grad_norm": 1.9294849634170532, "grad_norm_var": 0.25400057735268855, "learning_rate": 0.0001, "loss": 1.1094, "loss/crossentropy": 2.4277851581573486, "loss/hidden": 0.97265625, "loss/logits": 0.13635680079460144, "loss/reg": 3.5205699532525614e-05, "step": 216 }, { "epoch": 0.027125, "grad_norm": 1.9984577894210815, "grad_norm_var": 0.22665186521847977, "learning_rate": 0.0001, "loss": 1.0918, "loss/crossentropy": 2.5422215461730957, "loss/hidden": 0.96484375, "loss/logits": 0.12658366560935974, "loss/reg": 3.519668462104164e-05, "step": 217 }, { "epoch": 0.02725, "grad_norm": 2.2303996086120605, "grad_norm_var": 0.2265080910144917, "learning_rate": 0.0001, "loss": 1.4173, "loss/crossentropy": 2.6119143962860107, "loss/hidden": 1.2109375, "loss/logits": 0.2059965431690216, "loss/reg": 3.519029269227758e-05, "step": 218 }, { "epoch": 0.027375, "grad_norm": 2.595283031463623, "grad_norm_var": 0.23192599234322203, "learning_rate": 0.0001, "loss": 1.4022, "loss/crossentropy": 2.388324499130249, "loss/hidden": 1.2265625, "loss/logits": 0.17526502907276154, "loss/reg": 3.5182933061150834e-05, "step": 219 }, { "epoch": 0.0275, "grad_norm": 2.3104512691497803, "grad_norm_var": 0.2275123342772699, "learning_rate": 0.0001, "loss": 1.1324, "loss/crossentropy": 2.400674819946289, "loss/hidden": 1.0078125, "loss/logits": 0.1242409497499466, "loss/reg": 3.517704681144096e-05, "step": 220 }, { "epoch": 0.027625, "grad_norm": 2.0627379417419434, "grad_norm_var": 0.2221815343350992, "learning_rate": 0.0001, "loss": 1.2552, "loss/crossentropy": 2.4785444736480713, "loss/hidden": 1.09375, "loss/logits": 0.1611141413450241, "loss/reg": 3.517186632961966e-05, "step": 221 }, { "epoch": 0.02775, "grad_norm": 2.6437251567840576, "grad_norm_var": 0.215787531953678, "learning_rate": 0.0001, "loss": 1.3035, "loss/crossentropy": 2.35196590423584, "loss/hidden": 1.0859375, "loss/logits": 0.21720939874649048, "loss/reg": 3.5167005989933386e-05, "step": 222 }, { "epoch": 0.027875, "grad_norm": 2.0463922023773193, "grad_norm_var": 0.21030634447597923, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.5125770568847656, "loss/hidden": 1.03125, "loss/logits": 0.14638057351112366, "loss/reg": 3.516068682074547e-05, "step": 223 }, { "epoch": 0.028, "grad_norm": 2.099517822265625, "grad_norm_var": 0.2119416481519659, "learning_rate": 0.0001, "loss": 1.2246, "loss/crossentropy": 2.6609790325164795, "loss/hidden": 1.0703125, "loss/logits": 0.1539691686630249, "loss/reg": 3.51545459125191e-05, "step": 224 }, { "epoch": 0.028125, "grad_norm": 2.2835323810577393, "grad_norm_var": 0.20676636732833859, "learning_rate": 0.0001, "loss": 1.1949, "loss/crossentropy": 2.408785581588745, "loss/hidden": 1.046875, "loss/logits": 0.14770297706127167, "loss/reg": 3.514792115311138e-05, "step": 225 }, { "epoch": 0.02825, "grad_norm": 2.2726855278015137, "grad_norm_var": 0.19188050953051553, "learning_rate": 0.0001, "loss": 1.125, "loss/crossentropy": 2.3164994716644287, "loss/hidden": 0.98828125, "loss/logits": 0.13638192415237427, "loss/reg": 3.5143304558005184e-05, "step": 226 }, { "epoch": 0.028375, "grad_norm": 2.239753484725952, "grad_norm_var": 0.18927748053925839, "learning_rate": 0.0001, "loss": 1.4585, "loss/crossentropy": 2.5264809131622314, "loss/hidden": 1.2734375, "loss/logits": 0.18467766046524048, "loss/reg": 3.5138236853526905e-05, "step": 227 }, { "epoch": 0.0285, "grad_norm": 2.5043678283691406, "grad_norm_var": 0.1907596103376837, "learning_rate": 0.0001, "loss": 1.465, "loss/crossentropy": 2.1979873180389404, "loss/hidden": 1.2734375, "loss/logits": 0.191162109375, "loss/reg": 3.513501360430382e-05, "step": 228 }, { "epoch": 0.028625, "grad_norm": 2.2842559814453125, "grad_norm_var": 0.18968967595688752, "learning_rate": 0.0001, "loss": 1.1184, "loss/crossentropy": 2.6084938049316406, "loss/hidden": 0.984375, "loss/logits": 0.1337008774280548, "loss/reg": 3.5130418837070465e-05, "step": 229 }, { "epoch": 0.02875, "grad_norm": 78.28353881835938, "grad_norm_var": 360.5321235749958, "learning_rate": 0.0001, "loss": 1.287, "loss/crossentropy": 2.642012119293213, "loss/hidden": 1.1328125, "loss/logits": 0.15383732318878174, "loss/reg": 3.512646071612835e-05, "step": 230 }, { "epoch": 0.028875, "grad_norm": 2.2934622764587402, "grad_norm_var": 361.3313444069006, "learning_rate": 0.0001, "loss": 1.2146, "loss/crossentropy": 2.125366687774658, "loss/hidden": 1.078125, "loss/logits": 0.13609249889850616, "loss/reg": 3.51221788150724e-05, "step": 231 }, { "epoch": 0.029, "grad_norm": 2.2334835529327393, "grad_norm_var": 361.13139871490966, "learning_rate": 0.0001, "loss": 1.3334, "loss/crossentropy": 2.4540159702301025, "loss/hidden": 1.1640625, "loss/logits": 0.16899245977401733, "loss/reg": 3.511944305500947e-05, "step": 232 }, { "epoch": 0.029125, "grad_norm": 2.025312900543213, "grad_norm_var": 361.11344936137425, "learning_rate": 0.0001, "loss": 1.0377, "loss/crossentropy": 2.4971330165863037, "loss/hidden": 0.921875, "loss/logits": 0.11546964198350906, "loss/reg": 3.5114706406602636e-05, "step": 233 }, { "epoch": 0.02925, "grad_norm": 1.8107097148895264, "grad_norm_var": 361.39278859021084, "learning_rate": 0.0001, "loss": 1.0426, "loss/crossentropy": 2.3869125843048096, "loss/hidden": 0.92578125, "loss/logits": 0.11641789972782135, "loss/reg": 3.510946407914162e-05, "step": 234 }, { "epoch": 0.029375, "grad_norm": 1.5672167539596558, "grad_norm_var": 362.06253246288617, "learning_rate": 0.0001, "loss": 1.1107, "loss/crossentropy": 2.239819049835205, "loss/hidden": 0.984375, "loss/logits": 0.12596073746681213, "loss/reg": 3.510485475999303e-05, "step": 235 }, { "epoch": 0.0295, "grad_norm": 1.660971999168396, "grad_norm_var": 362.48937574795417, "learning_rate": 0.0001, "loss": 1.1466, "loss/crossentropy": 2.445607900619507, "loss/hidden": 1.0078125, "loss/logits": 0.13841402530670166, "loss/reg": 3.510040551191196e-05, "step": 236 }, { "epoch": 0.029625, "grad_norm": 1.9987144470214844, "grad_norm_var": 362.5308779292139, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.3451719284057617, "loss/hidden": 1.125, "loss/logits": 0.15705101191997528, "loss/reg": 3.50964764948003e-05, "step": 237 }, { "epoch": 0.02975, "grad_norm": 1.6107144355773926, "grad_norm_var": 363.18249781017846, "learning_rate": 0.0001, "loss": 1.094, "loss/crossentropy": 2.4538111686706543, "loss/hidden": 0.96484375, "loss/logits": 0.12884217500686646, "loss/reg": 3.5092118196189404e-05, "step": 238 }, { "epoch": 0.029875, "grad_norm": 2.075155735015869, "grad_norm_var": 363.1642193933474, "learning_rate": 0.0001, "loss": 1.215, "loss/crossentropy": 2.531987190246582, "loss/hidden": 1.0625, "loss/logits": 0.1521448791027069, "loss/reg": 3.5086912248516455e-05, "step": 239 }, { "epoch": 0.03, "grad_norm": 2.3201041221618652, "grad_norm_var": 363.0281972205138, "learning_rate": 0.0001, "loss": 1.1417, "loss/crossentropy": 2.5911645889282227, "loss/hidden": 0.98828125, "loss/logits": 0.15311402082443237, "loss/reg": 3.5083008697256446e-05, "step": 240 }, { "epoch": 0.030125, "grad_norm": 2.102585554122925, "grad_norm_var": 363.1402101869869, "learning_rate": 0.0001, "loss": 1.0786, "loss/crossentropy": 2.350059747695923, "loss/hidden": 0.9453125, "loss/logits": 0.13292624056339264, "loss/reg": 3.507927613100037e-05, "step": 241 }, { "epoch": 0.03025, "grad_norm": 2.0235884189605713, "grad_norm_var": 363.295456416674, "learning_rate": 0.0001, "loss": 1.191, "loss/crossentropy": 2.2975988388061523, "loss/hidden": 1.046875, "loss/logits": 0.1438218355178833, "loss/reg": 3.5071363527094945e-05, "step": 242 }, { "epoch": 0.030375, "grad_norm": 2.6130146980285645, "grad_norm_var": 363.0764814158417, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.477764368057251, "loss/hidden": 1.140625, "loss/logits": 0.15148332715034485, "loss/reg": 3.5061231756117195e-05, "step": 243 }, { "epoch": 0.0305, "grad_norm": 2.840742826461792, "grad_norm_var": 362.889192023, "learning_rate": 0.0001, "loss": 1.0848, "loss/crossentropy": 2.4943950176239014, "loss/hidden": 0.95703125, "loss/logits": 0.12739571928977966, "loss/reg": 3.505099084577523e-05, "step": 244 }, { "epoch": 0.030625, "grad_norm": 2.412440776824951, "grad_norm_var": 362.8120310886774, "learning_rate": 0.0001, "loss": 1.1284, "loss/crossentropy": 2.5384373664855957, "loss/hidden": 0.9921875, "loss/logits": 0.1358700394630432, "loss/reg": 3.504483902361244e-05, "step": 245 }, { "epoch": 0.03075, "grad_norm": 2.105398654937744, "grad_norm_var": 0.12231122414377561, "learning_rate": 0.0001, "loss": 1.2701, "loss/crossentropy": 2.192657232284546, "loss/hidden": 1.109375, "loss/logits": 0.16039346158504486, "loss/reg": 3.5036639019381255e-05, "step": 246 }, { "epoch": 0.030875, "grad_norm": 1.9865885972976685, "grad_norm_var": 0.12052054727502123, "learning_rate": 0.0001, "loss": 1.1144, "loss/crossentropy": 2.396636486053467, "loss/hidden": 0.97265625, "loss/logits": 0.14134395122528076, "loss/reg": 3.502915205899626e-05, "step": 247 }, { "epoch": 0.031, "grad_norm": 2.064413547515869, "grad_norm_var": 0.11899755252362822, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.2669105529785156, "loss/hidden": 1.109375, "loss/logits": 0.1659543216228485, "loss/reg": 3.502297113300301e-05, "step": 248 }, { "epoch": 0.031125, "grad_norm": 2.341909170150757, "grad_norm_var": 0.12311806681906787, "learning_rate": 0.0001, "loss": 1.1985, "loss/crossentropy": 2.6345901489257812, "loss/hidden": 1.03125, "loss/logits": 0.16693958640098572, "loss/reg": 3.501290120766498e-05, "step": 249 }, { "epoch": 0.03125, "grad_norm": 2.0425026416778564, "grad_norm_var": 0.11766230442624745, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.4224119186401367, "loss/hidden": 1.0390625, "loss/logits": 0.15737830102443695, "loss/reg": 3.500358798191883e-05, "step": 250 }, { "epoch": 0.031375, "grad_norm": 2.139225482940674, "grad_norm_var": 0.09668613014885802, "learning_rate": 0.0001, "loss": 1.2377, "loss/crossentropy": 2.7258267402648926, "loss/hidden": 1.09375, "loss/logits": 0.14362195134162903, "loss/reg": 3.499682497931644e-05, "step": 251 }, { "epoch": 0.0315, "grad_norm": 2.102008581161499, "grad_norm_var": 0.08031358514106011, "learning_rate": 0.0001, "loss": 1.1117, "loss/crossentropy": 2.432748317718506, "loss/hidden": 0.96875, "loss/logits": 0.1425955444574356, "loss/reg": 3.498911246424541e-05, "step": 252 }, { "epoch": 0.031625, "grad_norm": 2.2371959686279297, "grad_norm_var": 0.0783042488946918, "learning_rate": 0.0001, "loss": 1.0361, "loss/crossentropy": 2.437335968017578, "loss/hidden": 0.90234375, "loss/logits": 0.13340801000595093, "loss/reg": 3.498331716400571e-05, "step": 253 }, { "epoch": 0.03175, "grad_norm": 2.230013608932495, "grad_norm_var": 0.054557147559412954, "learning_rate": 0.0001, "loss": 1.2139, "loss/crossentropy": 2.159069299697876, "loss/hidden": 1.0859375, "loss/logits": 0.12760058045387268, "loss/reg": 3.497749275993556e-05, "step": 254 }, { "epoch": 0.031875, "grad_norm": 2.2526209354400635, "grad_norm_var": 0.05292534377042599, "learning_rate": 0.0001, "loss": 1.1517, "loss/crossentropy": 2.744022846221924, "loss/hidden": 1.0, "loss/logits": 0.15138062834739685, "loss/reg": 3.496619319776073e-05, "step": 255 }, { "epoch": 0.032, "grad_norm": 2.243044376373291, "grad_norm_var": 0.05245697188967089, "learning_rate": 0.0001, "loss": 1.6025, "loss/crossentropy": 2.2367136478424072, "loss/hidden": 1.3515625, "loss/logits": 0.25056183338165283, "loss/reg": 3.495947385090403e-05, "step": 256 }, { "epoch": 0.032125, "grad_norm": 2.2301154136657715, "grad_norm_var": 0.05124602164451577, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.8784143924713135, "loss/hidden": 0.99609375, "loss/logits": 0.1337929368019104, "loss/reg": 3.495061901048757e-05, "step": 257 }, { "epoch": 0.03225, "grad_norm": 1.9009541273117065, "grad_norm_var": 0.0557499358364358, "learning_rate": 0.0001, "loss": 1.1815, "loss/crossentropy": 2.852105140686035, "loss/hidden": 1.03125, "loss/logits": 0.1498585343360901, "loss/reg": 3.494451448204927e-05, "step": 258 }, { "epoch": 0.032375, "grad_norm": 1.6674587726593018, "grad_norm_var": 0.06383147372835059, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.6262168884277344, "loss/hidden": 0.9921875, "loss/logits": 0.1480187475681305, "loss/reg": 3.493377516861074e-05, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.1328914165496826, "grad_norm_var": 0.03229453348446872, "learning_rate": 0.0001, "loss": 1.2748, "loss/crossentropy": 2.944517135620117, "loss/hidden": 1.1171875, "loss/logits": 0.15721508860588074, "loss/reg": 3.492645555525087e-05, "step": 260 }, { "epoch": 0.032625, "grad_norm": 1.8784502744674683, "grad_norm_var": 0.030045803407693878, "learning_rate": 0.0001, "loss": 1.0683, "loss/crossentropy": 2.2746896743774414, "loss/hidden": 0.9375, "loss/logits": 0.1304292529821396, "loss/reg": 3.4919979952974245e-05, "step": 261 }, { "epoch": 0.03275, "grad_norm": 2.3806915283203125, "grad_norm_var": 0.03508431327747174, "learning_rate": 0.0001, "loss": 1.0997, "loss/crossentropy": 2.5303072929382324, "loss/hidden": 0.96484375, "loss/logits": 0.13454417884349823, "loss/reg": 3.4913624403998256e-05, "step": 262 }, { "epoch": 0.032875, "grad_norm": 1.8924614191055298, "grad_norm_var": 0.03724188133506529, "learning_rate": 0.0001, "loss": 1.198, "loss/crossentropy": 2.496983766555786, "loss/hidden": 1.0390625, "loss/logits": 0.15854007005691528, "loss/reg": 3.490634844638407e-05, "step": 263 }, { "epoch": 0.033, "grad_norm": 2.439052104949951, "grad_norm_var": 0.04381194480349577, "learning_rate": 0.0001, "loss": 0.953, "loss/crossentropy": 2.7149696350097656, "loss/hidden": 0.83984375, "loss/logits": 0.1127798855304718, "loss/reg": 3.490111339488067e-05, "step": 264 }, { "epoch": 0.033125, "grad_norm": 2.3024566173553467, "grad_norm_var": 0.042804570962997876, "learning_rate": 0.0001, "loss": 1.1402, "loss/crossentropy": 2.447719097137451, "loss/hidden": 0.9765625, "loss/logits": 0.1632998287677765, "loss/reg": 3.489398295641877e-05, "step": 265 }, { "epoch": 0.03325, "grad_norm": 2.287111520767212, "grad_norm_var": 0.04370853447134184, "learning_rate": 0.0001, "loss": 1.2209, "loss/crossentropy": 2.1921133995056152, "loss/hidden": 1.0625, "loss/logits": 0.15808838605880737, "loss/reg": 3.48894864146132e-05, "step": 266 }, { "epoch": 0.033375, "grad_norm": 2.336646318435669, "grad_norm_var": 0.04599945823637951, "learning_rate": 0.0001, "loss": 1.2784, "loss/crossentropy": 2.231534719467163, "loss/hidden": 1.125, "loss/logits": 0.15302959084510803, "loss/reg": 3.488411311991513e-05, "step": 267 }, { "epoch": 0.0335, "grad_norm": 1.9491949081420898, "grad_norm_var": 0.04858091189580491, "learning_rate": 0.0001, "loss": 1.3204, "loss/crossentropy": 2.3477442264556885, "loss/hidden": 1.1484375, "loss/logits": 0.17159268260002136, "loss/reg": 3.488002403173596e-05, "step": 268 }, { "epoch": 0.033625, "grad_norm": 1.8812212944030762, "grad_norm_var": 0.05224458505779281, "learning_rate": 0.0001, "loss": 1.1176, "loss/crossentropy": 2.3098113536834717, "loss/hidden": 0.98046875, "loss/logits": 0.13675528764724731, "loss/reg": 3.487144203973003e-05, "step": 269 }, { "epoch": 0.03375, "grad_norm": 2.1917343139648438, "grad_norm_var": 0.05180158566938828, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.6006360054016113, "loss/hidden": 1.171875, "loss/logits": 0.16665683686733246, "loss/reg": 3.4867065551225096e-05, "step": 270 }, { "epoch": 0.033875, "grad_norm": 2.1218273639678955, "grad_norm_var": 0.050608227478554056, "learning_rate": 0.0001, "loss": 1.2115, "loss/crossentropy": 2.3429975509643555, "loss/hidden": 1.0546875, "loss/logits": 0.15650612115859985, "loss/reg": 3.486104469629936e-05, "step": 271 }, { "epoch": 0.034, "grad_norm": 2.216865062713623, "grad_norm_var": 0.05020309095007551, "learning_rate": 0.0001, "loss": 1.309, "loss/crossentropy": 2.6172518730163574, "loss/hidden": 1.125, "loss/logits": 0.18367114663124084, "loss/reg": 3.485478737275116e-05, "step": 272 }, { "epoch": 0.034125, "grad_norm": 2.660276174545288, "grad_norm_var": 0.06848105136911228, "learning_rate": 0.0001, "loss": 1.2656, "loss/crossentropy": 2.6182620525360107, "loss/hidden": 1.1015625, "loss/logits": 0.16370661556720734, "loss/reg": 3.4849643270717934e-05, "step": 273 }, { "epoch": 0.03425, "grad_norm": 1.9411466121673584, "grad_norm_var": 0.06730120648781887, "learning_rate": 0.0001, "loss": 1.2427, "loss/crossentropy": 2.360733985900879, "loss/hidden": 1.09375, "loss/logits": 0.14862678945064545, "loss/reg": 3.4842636523535475e-05, "step": 274 }, { "epoch": 0.034375, "grad_norm": 7.836575031280518, "grad_norm_var": 2.0552077515562965, "learning_rate": 0.0001, "loss": 2.1182, "loss/crossentropy": 2.7246999740600586, "loss/hidden": 1.8671875, "loss/logits": 0.25068169832229614, "loss/reg": 3.483570981188677e-05, "step": 275 }, { "epoch": 0.0345, "grad_norm": 15.36082649230957, "grad_norm_var": 12.29442028509769, "learning_rate": 0.0001, "loss": 1.2297, "loss/crossentropy": 2.7898216247558594, "loss/hidden": 1.0625, "loss/logits": 0.16689400374889374, "loss/reg": 3.482873580651358e-05, "step": 276 }, { "epoch": 0.034625, "grad_norm": 2.4146132469177246, "grad_norm_var": 12.206846506541835, "learning_rate": 0.0001, "loss": 1.1546, "loss/crossentropy": 2.6433522701263428, "loss/hidden": 1.0, "loss/logits": 0.15420594811439514, "loss/reg": 3.482148167677224e-05, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.0731208324432373, "grad_norm_var": 12.254080178741175, "learning_rate": 0.0001, "loss": 1.1779, "loss/crossentropy": 2.2190020084381104, "loss/hidden": 1.03125, "loss/logits": 0.14627079665660858, "loss/reg": 3.481503881630488e-05, "step": 278 }, { "epoch": 0.034875, "grad_norm": 2.2058444023132324, "grad_norm_var": 12.19851901002264, "learning_rate": 0.0001, "loss": 1.1722, "loss/crossentropy": 2.3826961517333984, "loss/hidden": 1.015625, "loss/logits": 0.15618480741977692, "loss/reg": 3.48087414749898e-05, "step": 279 }, { "epoch": 0.035, "grad_norm": 1.8618927001953125, "grad_norm_var": 12.292415025402823, "learning_rate": 0.0001, "loss": 1.2047, "loss/crossentropy": 2.424729824066162, "loss/hidden": 1.0390625, "loss/logits": 0.16533657908439636, "loss/reg": 3.479952283669263e-05, "step": 280 }, { "epoch": 0.035125, "grad_norm": 1.9102482795715332, "grad_norm_var": 12.356945094423846, "learning_rate": 0.0001, "loss": 1.2226, "loss/crossentropy": 2.4433910846710205, "loss/hidden": 1.0546875, "loss/logits": 0.1675223708152771, "loss/reg": 3.4793913073372096e-05, "step": 281 }, { "epoch": 0.03525, "grad_norm": 2.706441640853882, "grad_norm_var": 12.309734168758586, "learning_rate": 0.0001, "loss": 1.0941, "loss/crossentropy": 3.0568315982818604, "loss/hidden": 0.95703125, "loss/logits": 0.13671937584877014, "loss/reg": 3.478690632618964e-05, "step": 282 }, { "epoch": 0.035375, "grad_norm": 2.22886061668396, "grad_norm_var": 12.325085121884593, "learning_rate": 0.0001, "loss": 1.2228, "loss/crossentropy": 2.31925892829895, "loss/hidden": 1.0546875, "loss/logits": 0.16773122549057007, "loss/reg": 3.4781944123096764e-05, "step": 283 }, { "epoch": 0.0355, "grad_norm": 1.9104151725769043, "grad_norm_var": 12.332409456506062, "learning_rate": 0.0001, "loss": 1.2103, "loss/crossentropy": 2.433910369873047, "loss/hidden": 1.0625, "loss/logits": 0.14740660786628723, "loss/reg": 3.47744207829237e-05, "step": 284 }, { "epoch": 0.035625, "grad_norm": 1.8624837398529053, "grad_norm_var": 12.336088715902626, "learning_rate": 0.0001, "loss": 1.1048, "loss/crossentropy": 2.308101177215576, "loss/hidden": 0.95703125, "loss/logits": 0.14743317663669586, "loss/reg": 3.476895290077664e-05, "step": 285 }, { "epoch": 0.03575, "grad_norm": 1.7985292673110962, "grad_norm_var": 12.406159364169602, "learning_rate": 0.0001, "loss": 1.391, "loss/crossentropy": 2.338219165802002, "loss/hidden": 1.1875, "loss/logits": 0.20311352610588074, "loss/reg": 3.476293932180852e-05, "step": 286 }, { "epoch": 0.035875, "grad_norm": 2.0390207767486572, "grad_norm_var": 12.419809877029808, "learning_rate": 0.0001, "loss": 1.2142, "loss/crossentropy": 2.798340320587158, "loss/hidden": 1.0390625, "loss/logits": 0.1748366355895996, "loss/reg": 3.4757238609017804e-05, "step": 287 }, { "epoch": 0.036, "grad_norm": 2.6159002780914307, "grad_norm_var": 12.371378457752575, "learning_rate": 0.0001, "loss": 1.3686, "loss/crossentropy": 2.0943591594696045, "loss/hidden": 1.1953125, "loss/logits": 0.1729019582271576, "loss/reg": 3.475058838375844e-05, "step": 288 }, { "epoch": 0.036125, "grad_norm": 2.64959716796875, "grad_norm_var": 12.372352193512818, "learning_rate": 0.0001, "loss": 1.3369, "loss/crossentropy": 2.875190019607544, "loss/hidden": 1.1640625, "loss/logits": 0.17247627675533295, "loss/reg": 3.474393815849908e-05, "step": 289 }, { "epoch": 0.03625, "grad_norm": 2.2474873065948486, "grad_norm_var": 12.321143222954635, "learning_rate": 0.0001, "loss": 1.252, "loss/crossentropy": 2.6608498096466064, "loss/hidden": 1.078125, "loss/logits": 0.17357373237609863, "loss/reg": 3.473682954791002e-05, "step": 290 }, { "epoch": 0.036375, "grad_norm": 2.4677608013153076, "grad_norm_var": 10.916427124267383, "learning_rate": 0.0001, "loss": 1.0142, "loss/crossentropy": 2.8202872276306152, "loss/hidden": 0.89453125, "loss/logits": 0.11931537836790085, "loss/reg": 3.472894968581386e-05, "step": 291 }, { "epoch": 0.0365, "grad_norm": 2.4355132579803467, "grad_norm_var": 0.09359576037076062, "learning_rate": 0.0001, "loss": 1.344, "loss/crossentropy": 2.588075876235962, "loss/hidden": 1.171875, "loss/logits": 0.17175744473934174, "loss/reg": 3.4721750125754625e-05, "step": 292 }, { "epoch": 0.036625, "grad_norm": 1.8848857879638672, "grad_norm_var": 0.09698104319835413, "learning_rate": 0.0001, "loss": 1.0617, "loss/crossentropy": 2.2032036781311035, "loss/hidden": 0.93359375, "loss/logits": 0.12777957320213318, "loss/reg": 3.471899981377646e-05, "step": 293 }, { "epoch": 0.03675, "grad_norm": 2.0544004440307617, "grad_norm_var": 0.09727253081927305, "learning_rate": 0.0001, "loss": 1.1195, "loss/crossentropy": 2.68400502204895, "loss/hidden": 0.97265625, "loss/logits": 0.14646826684474945, "loss/reg": 3.471321178949438e-05, "step": 294 }, { "epoch": 0.036875, "grad_norm": 2.784888744354248, "grad_norm_var": 0.12022710970729276, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 2.2672512531280518, "loss/hidden": 1.265625, "loss/logits": 0.19084087014198303, "loss/reg": 3.4705888538155705e-05, "step": 295 }, { "epoch": 0.037, "grad_norm": 2.107628345489502, "grad_norm_var": 0.11239423391909416, "learning_rate": 0.0001, "loss": 1.1654, "loss/crossentropy": 2.3829853534698486, "loss/hidden": 1.015625, "loss/logits": 0.14940626919269562, "loss/reg": 3.470026422291994e-05, "step": 296 }, { "epoch": 0.037125, "grad_norm": 2.666799783706665, "grad_norm_var": 0.11576118522773501, "learning_rate": 0.0001, "loss": 1.2658, "loss/crossentropy": 2.3234565258026123, "loss/hidden": 1.0859375, "loss/logits": 0.17954039573669434, "loss/reg": 3.469527655397542e-05, "step": 297 }, { "epoch": 0.03725, "grad_norm": 1.8957864046096802, "grad_norm_var": 0.11060988429572352, "learning_rate": 0.0001, "loss": 1.305, "loss/crossentropy": 2.4239273071289062, "loss/hidden": 1.125, "loss/logits": 0.17967185378074646, "loss/reg": 3.469140938250348e-05, "step": 298 }, { "epoch": 0.037375, "grad_norm": 2.1476354598999023, "grad_norm_var": 0.1110142344328826, "learning_rate": 0.0001, "loss": 1.18, "loss/crossentropy": 2.3623642921447754, "loss/hidden": 1.0234375, "loss/logits": 0.1561676412820816, "loss/reg": 3.468482827884145e-05, "step": 299 }, { "epoch": 0.0375, "grad_norm": 1.9932529926300049, "grad_norm_var": 0.10799009738127907, "learning_rate": 0.0001, "loss": 1.1559, "loss/crossentropy": 2.69016170501709, "loss/hidden": 1.0078125, "loss/logits": 0.14770537614822388, "loss/reg": 3.468065187917091e-05, "step": 300 }, { "epoch": 0.037625, "grad_norm": 2.2224810123443604, "grad_norm_var": 0.0985346154888075, "learning_rate": 0.0001, "loss": 1.1653, "loss/crossentropy": 2.553476572036743, "loss/hidden": 1.0234375, "loss/logits": 0.14147460460662842, "loss/reg": 3.467235364951193e-05, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.135714530944824, "grad_norm_var": 0.08531074310993644, "learning_rate": 0.0001, "loss": 1.2046, "loss/crossentropy": 2.243950128555298, "loss/hidden": 1.0625, "loss/logits": 0.1418030858039856, "loss/reg": 3.466893394943327e-05, "step": 302 }, { "epoch": 0.037875, "grad_norm": 2.3680195808410645, "grad_norm_var": 0.08186467355099622, "learning_rate": 0.0001, "loss": 1.2752, "loss/crossentropy": 2.2647523880004883, "loss/hidden": 1.1328125, "loss/logits": 0.1420516073703766, "loss/reg": 3.4659868106245995e-05, "step": 303 }, { "epoch": 0.038, "grad_norm": 2.08476185798645, "grad_norm_var": 0.07658376607289051, "learning_rate": 0.0001, "loss": 1.1305, "loss/crossentropy": 2.627105712890625, "loss/hidden": 0.984375, "loss/logits": 0.14580708742141724, "loss/reg": 3.465437112026848e-05, "step": 304 }, { "epoch": 0.038125, "grad_norm": 2.110761880874634, "grad_norm_var": 0.06667962973879416, "learning_rate": 0.0001, "loss": 1.2576, "loss/crossentropy": 2.3144562244415283, "loss/hidden": 1.09375, "loss/logits": 0.16346824169158936, "loss/reg": 3.4645545383682474e-05, "step": 305 }, { "epoch": 0.03825, "grad_norm": 1.900791049003601, "grad_norm_var": 0.07317499342195574, "learning_rate": 0.0001, "loss": 1.0492, "loss/crossentropy": 2.716005325317383, "loss/hidden": 0.9140625, "loss/logits": 0.13480325043201447, "loss/reg": 3.463495886535384e-05, "step": 306 }, { "epoch": 0.038375, "grad_norm": 6.467423439025879, "grad_norm_var": 1.2137641430294792, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.1065866947174072, "loss/hidden": 1.2890625, "loss/logits": 0.1689138412475586, "loss/reg": 3.462713721091859e-05, "step": 307 }, { "epoch": 0.0385, "grad_norm": 2.0642800331115723, "grad_norm_var": 1.2232825060870915, "learning_rate": 0.0001, "loss": 1.185, "loss/crossentropy": 2.624023675918579, "loss/hidden": 1.03125, "loss/logits": 0.1534431129693985, "loss/reg": 3.4622189559740946e-05, "step": 308 }, { "epoch": 0.038625, "grad_norm": 1.680816650390625, "grad_norm_var": 1.2407335757806945, "learning_rate": 0.0001, "loss": 1.153, "loss/crossentropy": 2.327143430709839, "loss/hidden": 1.0078125, "loss/logits": 0.14481112360954285, "loss/reg": 3.461851520114578e-05, "step": 309 }, { "epoch": 0.03875, "grad_norm": 1.7195234298706055, "grad_norm_var": 1.2639701691366267, "learning_rate": 0.0001, "loss": 1.1146, "loss/crossentropy": 2.234018087387085, "loss/hidden": 0.97265625, "loss/logits": 0.141597181558609, "loss/reg": 3.461261803749949e-05, "step": 310 }, { "epoch": 0.038875, "grad_norm": 1.9253605604171753, "grad_norm_var": 1.265680677961886, "learning_rate": 0.0001, "loss": 1.1557, "loss/crossentropy": 2.310030460357666, "loss/hidden": 1.015625, "loss/logits": 0.13973672688007355, "loss/reg": 3.4606102417455986e-05, "step": 311 }, { "epoch": 0.039, "grad_norm": 1.8293761014938354, "grad_norm_var": 1.2792590983492156, "learning_rate": 0.0001, "loss": 1.1528, "loss/crossentropy": 2.735854148864746, "loss/hidden": 1.015625, "loss/logits": 0.1367899477481842, "loss/reg": 3.460055449977517e-05, "step": 312 }, { "epoch": 0.039125, "grad_norm": 2.47273325920105, "grad_norm_var": 1.2727893848260379, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.4735970497131348, "loss/hidden": 1.03125, "loss/logits": 0.16670575737953186, "loss/reg": 3.4597334888530895e-05, "step": 313 }, { "epoch": 0.03925, "grad_norm": 2.4255082607269287, "grad_norm_var": 1.2608122772144856, "learning_rate": 0.0001, "loss": 1.1323, "loss/crossentropy": 2.6550486087799072, "loss/hidden": 0.9765625, "loss/logits": 0.1554185003042221, "loss/reg": 3.4591066651046276e-05, "step": 314 }, { "epoch": 0.039375, "grad_norm": 2.7170803546905518, "grad_norm_var": 1.2659589390154238, "learning_rate": 0.0001, "loss": 1.439, "loss/crossentropy": 2.187866687774658, "loss/hidden": 1.21875, "loss/logits": 0.21992294490337372, "loss/reg": 3.458749415585771e-05, "step": 315 }, { "epoch": 0.0395, "grad_norm": 2.1601176261901855, "grad_norm_var": 1.259041909984486, "learning_rate": 0.0001, "loss": 1.3232, "loss/crossentropy": 2.364377737045288, "loss/hidden": 1.1484375, "loss/logits": 0.1744486689567566, "loss/reg": 3.458080755081028e-05, "step": 316 }, { "epoch": 0.039625, "grad_norm": 1.5947940349578857, "grad_norm_var": 1.2979203484203092, "learning_rate": 0.0001, "loss": 1.028, "loss/crossentropy": 2.4233779907226562, "loss/hidden": 0.91015625, "loss/logits": 0.11754067242145538, "loss/reg": 3.4571639844216406e-05, "step": 317 }, { "epoch": 0.03975, "grad_norm": 1.9653059244155884, "grad_norm_var": 1.3046851365567058, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.31756854057312, "loss/hidden": 1.0, "loss/logits": 0.12566252052783966, "loss/reg": 3.456034028204158e-05, "step": 318 }, { "epoch": 0.039875, "grad_norm": 2.036482572555542, "grad_norm_var": 1.310445228246628, "learning_rate": 0.0001, "loss": 1.1971, "loss/crossentropy": 2.3461310863494873, "loss/hidden": 1.0234375, "loss/logits": 0.17331476509571075, "loss/reg": 3.455525802564807e-05, "step": 319 }, { "epoch": 0.04, "grad_norm": 2.075100898742676, "grad_norm_var": 1.3107569056456743, "learning_rate": 0.0001, "loss": 1.327, "loss/crossentropy": 2.506412982940674, "loss/hidden": 1.140625, "loss/logits": 0.1859946846961975, "loss/reg": 3.45489097526297e-05, "step": 320 }, { "epoch": 0.040125, "grad_norm": 2.719728946685791, "grad_norm_var": 1.3168160620395004, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.4216549396514893, "loss/hidden": 1.15625, "loss/logits": 0.1822904348373413, "loss/reg": 3.45378830388654e-05, "step": 321 }, { "epoch": 0.04025, "grad_norm": 2.6060094833374023, "grad_norm_var": 1.3047531355820505, "learning_rate": 0.0001, "loss": 1.5674, "loss/crossentropy": 2.658222198486328, "loss/hidden": 1.34375, "loss/logits": 0.22325485944747925, "loss/reg": 3.453323370194994e-05, "step": 322 }, { "epoch": 0.040375, "grad_norm": 2.5935699939727783, "grad_norm_var": 0.14371946682283335, "learning_rate": 0.0001, "loss": 1.1519, "loss/crossentropy": 2.6234920024871826, "loss/hidden": 1.015625, "loss/logits": 0.13588842749595642, "loss/reg": 3.452138480497524e-05, "step": 323 }, { "epoch": 0.0405, "grad_norm": 2.4168694019317627, "grad_norm_var": 0.1469136698932582, "learning_rate": 0.0001, "loss": 1.2903, "loss/crossentropy": 2.3443264961242676, "loss/hidden": 1.109375, "loss/logits": 0.18053767085075378, "loss/reg": 3.4516375308157876e-05, "step": 324 }, { "epoch": 0.040625, "grad_norm": 2.598829746246338, "grad_norm_var": 0.13803791478749894, "learning_rate": 0.0001, "loss": 1.0512, "loss/crossentropy": 2.6923201084136963, "loss/hidden": 0.93359375, "loss/logits": 0.11729900538921356, "loss/reg": 3.450660369708203e-05, "step": 325 }, { "epoch": 0.04075, "grad_norm": 2.2705416679382324, "grad_norm_var": 0.11870002646295455, "learning_rate": 0.0001, "loss": 1.3007, "loss/crossentropy": 2.8326780796051025, "loss/hidden": 1.1328125, "loss/logits": 0.16750125586986542, "loss/reg": 3.449714495218359e-05, "step": 326 }, { "epoch": 0.040875, "grad_norm": 2.108572006225586, "grad_norm_var": 0.11224555742265707, "learning_rate": 0.0001, "loss": 1.2583, "loss/crossentropy": 2.4080569744110107, "loss/hidden": 1.1015625, "loss/logits": 0.15636944770812988, "loss/reg": 3.449182622716762e-05, "step": 327 }, { "epoch": 0.041, "grad_norm": 1.9010335206985474, "grad_norm_var": 0.10819501908635042, "learning_rate": 0.0001, "loss": 1.1679, "loss/crossentropy": 2.7714359760284424, "loss/hidden": 1.015625, "loss/logits": 0.15192916989326477, "loss/reg": 3.448529605520889e-05, "step": 328 }, { "epoch": 0.041125, "grad_norm": 2.413954973220825, "grad_norm_var": 0.1069897618565714, "learning_rate": 0.0001, "loss": 1.4022, "loss/crossentropy": 2.225947618484497, "loss/hidden": 1.2109375, "loss/logits": 0.19096189737319946, "loss/reg": 3.4474134736228734e-05, "step": 329 }, { "epoch": 0.04125, "grad_norm": 2.385904312133789, "grad_norm_var": 0.1063601900492659, "learning_rate": 0.0001, "loss": 1.2387, "loss/crossentropy": 2.4583194255828857, "loss/hidden": 1.078125, "loss/logits": 0.16027729213237762, "loss/reg": 3.44689360645134e-05, "step": 330 }, { "epoch": 0.041375, "grad_norm": 2.4138543605804443, "grad_norm_var": 0.09464759263946097, "learning_rate": 0.0001, "loss": 1.2305, "loss/crossentropy": 2.5432093143463135, "loss/hidden": 1.0703125, "loss/logits": 0.15986011922359467, "loss/reg": 3.446204573265277e-05, "step": 331 }, { "epoch": 0.0415, "grad_norm": 3.4917404651641846, "grad_norm_var": 0.18662260281899326, "learning_rate": 0.0001, "loss": 1.4531, "loss/crossentropy": 2.667515516281128, "loss/hidden": 1.2578125, "loss/logits": 0.19496265053749084, "loss/reg": 3.445533729973249e-05, "step": 332 }, { "epoch": 0.041625, "grad_norm": 2.6574389934539795, "grad_norm_var": 0.15026464336703782, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.8141376972198486, "loss/hidden": 1.0859375, "loss/logits": 0.16617870330810547, "loss/reg": 3.444873073021881e-05, "step": 333 }, { "epoch": 0.04175, "grad_norm": 2.011918783187866, "grad_norm_var": 0.14759976834883393, "learning_rate": 0.0001, "loss": 1.2704, "loss/crossentropy": 2.5743820667266846, "loss/hidden": 1.109375, "loss/logits": 0.16070207953453064, "loss/reg": 3.444178946665488e-05, "step": 334 }, { "epoch": 0.041875, "grad_norm": 2.0087714195251465, "grad_norm_var": 0.14906053005454342, "learning_rate": 0.0001, "loss": 1.1449, "loss/crossentropy": 2.510054111480713, "loss/hidden": 1.0078125, "loss/logits": 0.13675576448440552, "loss/reg": 3.4435932320775464e-05, "step": 335 }, { "epoch": 0.042, "grad_norm": 2.2362313270568848, "grad_norm_var": 0.1433353693831899, "learning_rate": 0.0001, "loss": 1.1899, "loss/crossentropy": 2.4818272590637207, "loss/hidden": 1.03125, "loss/logits": 0.15829679369926453, "loss/reg": 3.443356399657205e-05, "step": 336 }, { "epoch": 0.042125, "grad_norm": 2.767322301864624, "grad_norm_var": 0.14533335584858278, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 1.9489027261734009, "loss/hidden": 1.2578125, "loss/logits": 0.22079172730445862, "loss/reg": 3.442970410105772e-05, "step": 337 }, { "epoch": 0.04225, "grad_norm": 1.9009366035461426, "grad_norm_var": 0.15987229719162357, "learning_rate": 0.0001, "loss": 1.0737, "loss/crossentropy": 2.4470298290252686, "loss/hidden": 0.9375, "loss/logits": 0.13584166765213013, "loss/reg": 3.44260515703354e-05, "step": 338 }, { "epoch": 0.042375, "grad_norm": 2.4537954330444336, "grad_norm_var": 0.1572266899389352, "learning_rate": 0.0001, "loss": 1.4332, "loss/crossentropy": 2.3663222789764404, "loss/hidden": 1.2421875, "loss/logits": 0.1906408667564392, "loss/reg": 3.442170054768212e-05, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.032825469970703, "grad_norm_var": 0.1644215429789795, "learning_rate": 0.0001, "loss": 1.1493, "loss/crossentropy": 2.4772043228149414, "loss/hidden": 1.0078125, "loss/logits": 0.1411634385585785, "loss/reg": 3.4417531423969194e-05, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.3325209617614746, "grad_norm_var": 0.1601377693951102, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.45560622215271, "loss/hidden": 1.0546875, "loss/logits": 0.15634939074516296, "loss/reg": 3.44164072885178e-05, "step": 341 }, { "epoch": 0.04275, "grad_norm": 1.9114069938659668, "grad_norm_var": 0.1713673299562344, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.6129817962646484, "loss/hidden": 0.9453125, "loss/logits": 0.14053717255592346, "loss/reg": 3.441024091443978e-05, "step": 342 }, { "epoch": 0.042875, "grad_norm": 3.5132644176483154, "grad_norm_var": 0.256165301144145, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 3.366391658782959, "loss/hidden": 1.265625, "loss/logits": 0.203495591878891, "loss/reg": 3.440545333432965e-05, "step": 343 }, { "epoch": 0.043, "grad_norm": 1.859586238861084, "grad_norm_var": 0.2590414795273373, "learning_rate": 0.0001, "loss": 1.1472, "loss/crossentropy": 2.428865909576416, "loss/hidden": 0.984375, "loss/logits": 0.1624484360218048, "loss/reg": 3.440186628722586e-05, "step": 344 }, { "epoch": 0.043125, "grad_norm": 1.98198401927948, "grad_norm_var": 0.2698694637418498, "learning_rate": 0.0001, "loss": 1.3233, "loss/crossentropy": 2.302736282348633, "loss/hidden": 1.140625, "loss/logits": 0.18230174481868744, "loss/reg": 3.439932697801851e-05, "step": 345 }, { "epoch": 0.04325, "grad_norm": 2.3659989833831787, "grad_norm_var": 0.2698585694015619, "learning_rate": 0.0001, "loss": 1.14, "loss/crossentropy": 2.5037076473236084, "loss/hidden": 1.0, "loss/logits": 0.13970160484313965, "loss/reg": 3.439432111917995e-05, "step": 346 }, { "epoch": 0.043375, "grad_norm": 2.281691074371338, "grad_norm_var": 0.2701990568843252, "learning_rate": 0.0001, "loss": 1.2372, "loss/crossentropy": 2.4770443439483643, "loss/hidden": 1.0859375, "loss/logits": 0.15086981654167175, "loss/reg": 3.439074498601258e-05, "step": 347 }, { "epoch": 0.0435, "grad_norm": 2.166990041732788, "grad_norm_var": 0.18050477852144595, "learning_rate": 0.0001, "loss": 1.32, "loss/crossentropy": 2.5852904319763184, "loss/hidden": 1.125, "loss/logits": 0.19465678930282593, "loss/reg": 3.438742714934051e-05, "step": 348 }, { "epoch": 0.043625, "grad_norm": 1.9888639450073242, "grad_norm_var": 0.17481059186203815, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.2656476497650146, "loss/hidden": 1.21875, "loss/logits": 0.21130971610546112, "loss/reg": 3.437901978031732e-05, "step": 349 }, { "epoch": 0.04375, "grad_norm": 1.7951990365982056, "grad_norm_var": 0.1842899236598953, "learning_rate": 0.0001, "loss": 1.1657, "loss/crossentropy": 2.3542745113372803, "loss/hidden": 1.0234375, "loss/logits": 0.141954243183136, "loss/reg": 3.437545819906518e-05, "step": 350 }, { "epoch": 0.043875, "grad_norm": 2.0519394874572754, "grad_norm_var": 0.18316277481239354, "learning_rate": 0.0001, "loss": 1.0528, "loss/crossentropy": 2.401294231414795, "loss/hidden": 0.91796875, "loss/logits": 0.13452798128128052, "loss/reg": 3.4367119951639324e-05, "step": 351 }, { "epoch": 0.044, "grad_norm": 2.12829327583313, "grad_norm_var": 0.18376578016818687, "learning_rate": 0.0001, "loss": 1.1034, "loss/crossentropy": 2.637117624282837, "loss/hidden": 0.9609375, "loss/logits": 0.14211076498031616, "loss/reg": 3.436086262809113e-05, "step": 352 }, { "epoch": 0.044125, "grad_norm": 1.8479722738265991, "grad_norm_var": 0.16959696182082518, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.534714698791504, "loss/hidden": 0.99609375, "loss/logits": 0.14980709552764893, "loss/reg": 3.435524195083417e-05, "step": 353 }, { "epoch": 0.04425, "grad_norm": 1.7758541107177734, "grad_norm_var": 0.17495091080606068, "learning_rate": 0.0001, "loss": 1.2699, "loss/crossentropy": 2.484570026397705, "loss/hidden": 1.109375, "loss/logits": 0.16018345952033997, "loss/reg": 3.434780228417367e-05, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.274486541748047, "grad_norm_var": 0.1698290651703018, "learning_rate": 0.0001, "loss": 1.4533, "loss/crossentropy": 2.1426889896392822, "loss/hidden": 1.2421875, "loss/logits": 0.21076492965221405, "loss/reg": 3.433872916502878e-05, "step": 355 }, { "epoch": 0.0445, "grad_norm": 2.4361371994018555, "grad_norm_var": 0.17400054735299186, "learning_rate": 0.0001, "loss": 1.4138, "loss/crossentropy": 2.34684419631958, "loss/hidden": 1.234375, "loss/logits": 0.17912375926971436, "loss/reg": 3.432907396927476e-05, "step": 356 }, { "epoch": 0.044625, "grad_norm": 2.6452977657318115, "grad_norm_var": 0.1869129455570799, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 2.5198328495025635, "loss/hidden": 1.015625, "loss/logits": 0.11500594019889832, "loss/reg": 3.431808727327734e-05, "step": 357 }, { "epoch": 0.04475, "grad_norm": 1.905686855316162, "grad_norm_var": 0.18712675263565845, "learning_rate": 0.0001, "loss": 1.1227, "loss/crossentropy": 2.7459700107574463, "loss/hidden": 0.984375, "loss/logits": 0.1379736065864563, "loss/reg": 3.431065852055326e-05, "step": 358 }, { "epoch": 0.044875, "grad_norm": 2.1955368518829346, "grad_norm_var": 0.06293061471083418, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.514496326446533, "loss/hidden": 1.09375, "loss/logits": 0.19134163856506348, "loss/reg": 3.430093784118071e-05, "step": 359 }, { "epoch": 0.045, "grad_norm": 2.074857711791992, "grad_norm_var": 0.0587442988467273, "learning_rate": 0.0001, "loss": 1.2615, "loss/crossentropy": 2.3883326053619385, "loss/hidden": 1.078125, "loss/logits": 0.18299797177314758, "loss/reg": 3.428904165048152e-05, "step": 360 }, { "epoch": 0.045125, "grad_norm": 3.084925651550293, "grad_norm_var": 0.11450734924812096, "learning_rate": 0.0001, "loss": 1.4694, "loss/crossentropy": 2.5634469985961914, "loss/hidden": 1.265625, "loss/logits": 0.20340915024280548, "loss/reg": 3.4281070838915184e-05, "step": 361 }, { "epoch": 0.04525, "grad_norm": 2.3508076667785645, "grad_norm_var": 0.11416271928607671, "learning_rate": 0.0001, "loss": 1.2864, "loss/crossentropy": 2.713895559310913, "loss/hidden": 1.1171875, "loss/logits": 0.1689026653766632, "loss/reg": 3.4273252822458744e-05, "step": 362 }, { "epoch": 0.045375, "grad_norm": 7.681649208068848, "grad_norm_var": 2.004247231943063, "learning_rate": 0.0001, "loss": 1.3636, "loss/crossentropy": 2.4041330814361572, "loss/hidden": 1.203125, "loss/logits": 0.1601409614086151, "loss/reg": 3.426634430070408e-05, "step": 363 }, { "epoch": 0.0455, "grad_norm": 1.973664402961731, "grad_norm_var": 2.0158187368377836, "learning_rate": 0.0001, "loss": 1.1228, "loss/crossentropy": 2.5436434745788574, "loss/hidden": 0.98046875, "loss/logits": 0.14196962118148804, "loss/reg": 3.425794784561731e-05, "step": 364 }, { "epoch": 0.045625, "grad_norm": 2.2556684017181396, "grad_norm_var": 2.001615144920621, "learning_rate": 0.0001, "loss": 1.1705, "loss/crossentropy": 2.5252158641815186, "loss/hidden": 1.0078125, "loss/logits": 0.162298783659935, "loss/reg": 3.425147588131949e-05, "step": 365 }, { "epoch": 0.04575, "grad_norm": 2.2237462997436523, "grad_norm_var": 1.9711144098953564, "learning_rate": 0.0001, "loss": 1.1598, "loss/crossentropy": 2.4940881729125977, "loss/hidden": 1.015625, "loss/logits": 0.14381377398967743, "loss/reg": 3.4247070288984105e-05, "step": 366 }, { "epoch": 0.045875, "grad_norm": 6.294942378997803, "grad_norm_var": 2.8107703767931165, "learning_rate": 0.0001, "loss": 1.9233, "loss/crossentropy": 2.4311485290527344, "loss/hidden": 1.515625, "loss/logits": 0.40729784965515137, "loss/reg": 3.424140595598146e-05, "step": 367 }, { "epoch": 0.046, "grad_norm": 2.1130430698394775, "grad_norm_var": 2.8121951540684127, "learning_rate": 0.0001, "loss": 1.243, "loss/crossentropy": 2.475876808166504, "loss/hidden": 1.0703125, "loss/logits": 0.17232248187065125, "loss/reg": 3.424075111979619e-05, "step": 368 }, { "epoch": 0.046125, "grad_norm": 1.9393197298049927, "grad_norm_var": 2.80086684083605, "learning_rate": 0.0001, "loss": 1.1504, "loss/crossentropy": 2.431164503097534, "loss/hidden": 0.99609375, "loss/logits": 0.1539495587348938, "loss/reg": 3.4231870813528076e-05, "step": 369 }, { "epoch": 0.04625, "grad_norm": 2.072326183319092, "grad_norm_var": 2.764824687660132, "learning_rate": 0.0001, "loss": 1.209, "loss/crossentropy": 2.686079502105713, "loss/hidden": 1.0546875, "loss/logits": 0.15395890176296234, "loss/reg": 3.422388545004651e-05, "step": 370 }, { "epoch": 0.046375, "grad_norm": 1.8920451402664185, "grad_norm_var": 2.8030644353470513, "learning_rate": 0.0001, "loss": 1.1611, "loss/crossentropy": 2.4512131214141846, "loss/hidden": 1.0, "loss/logits": 0.16072264313697815, "loss/reg": 3.421401197556406e-05, "step": 371 }, { "epoch": 0.0465, "grad_norm": 1.8197071552276611, "grad_norm_var": 2.858464465681775, "learning_rate": 0.0001, "loss": 1.1394, "loss/crossentropy": 2.6290316581726074, "loss/hidden": 0.99609375, "loss/logits": 0.1429995894432068, "loss/reg": 3.4208966098958626e-05, "step": 372 }, { "epoch": 0.046625, "grad_norm": 1.6114336252212524, "grad_norm_var": 2.9442100668891373, "learning_rate": 0.0001, "loss": 1.1743, "loss/crossentropy": 2.412721872329712, "loss/hidden": 1.0234375, "loss/logits": 0.15048110485076904, "loss/reg": 3.419875429244712e-05, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.3423984050750732, "grad_norm_var": 2.908825389746772, "learning_rate": 0.0001, "loss": 1.1547, "loss/crossentropy": 2.479085683822632, "loss/hidden": 1.0078125, "loss/logits": 0.14653661847114563, "loss/reg": 3.419152199057862e-05, "step": 374 }, { "epoch": 0.046875, "grad_norm": 2.2863998413085938, "grad_norm_var": 2.9026800154510086, "learning_rate": 0.0001, "loss": 1.1562, "loss/crossentropy": 2.6520957946777344, "loss/hidden": 1.015625, "loss/logits": 0.1402827501296997, "loss/reg": 3.418369669816457e-05, "step": 375 }, { "epoch": 0.047, "grad_norm": 3.5551681518554688, "grad_norm_var": 2.906172521956549, "learning_rate": 0.0001, "loss": 1.3527, "loss/crossentropy": 2.5694146156311035, "loss/hidden": 1.140625, "loss/logits": 0.21178269386291504, "loss/reg": 3.417985135456547e-05, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.4192416667938232, "grad_norm_var": 2.912446952830265, "learning_rate": 0.0001, "loss": 1.1754, "loss/crossentropy": 2.4164340496063232, "loss/hidden": 1.03125, "loss/logits": 0.14376118779182434, "loss/reg": 3.4170752769568935e-05, "step": 377 }, { "epoch": 0.04725, "grad_norm": 2.242144823074341, "grad_norm_var": 2.91972157704962, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.544785261154175, "loss/hidden": 0.984375, "loss/logits": 0.13062497973442078, "loss/reg": 3.416725303395651e-05, "step": 378 }, { "epoch": 0.047375, "grad_norm": 2.2893593311309814, "grad_norm_var": 1.2237873306323004, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.364673614501953, "loss/hidden": 1.1328125, "loss/logits": 0.17860937118530273, "loss/reg": 3.416024992475286e-05, "step": 379 }, { "epoch": 0.0475, "grad_norm": 2.2065534591674805, "grad_norm_var": 1.2121325720205287, "learning_rate": 0.0001, "loss": 1.3261, "loss/crossentropy": 2.0714166164398193, "loss/hidden": 1.1484375, "loss/logits": 0.17733046412467957, "loss/reg": 3.415203173062764e-05, "step": 380 }, { "epoch": 0.047625, "grad_norm": 12.950845718383789, "grad_norm_var": 8.05178996682972, "learning_rate": 0.0001, "loss": 1.4789, "loss/crossentropy": 2.1035971641540527, "loss/hidden": 1.328125, "loss/logits": 0.15040796995162964, "loss/reg": 3.414938328205608e-05, "step": 381 }, { "epoch": 0.04775, "grad_norm": 1.9148967266082764, "grad_norm_var": 8.095531060395368, "learning_rate": 0.0001, "loss": 1.136, "loss/crossentropy": 2.589541435241699, "loss/hidden": 0.9921875, "loss/logits": 0.14346718788146973, "loss/reg": 3.41419035976287e-05, "step": 382 }, { "epoch": 0.047875, "grad_norm": 2.309288263320923, "grad_norm_var": 7.402131974293955, "learning_rate": 0.0001, "loss": 1.1656, "loss/crossentropy": 2.601487398147583, "loss/hidden": 1.015625, "loss/logits": 0.1496451497077942, "loss/reg": 3.413420563447289e-05, "step": 383 }, { "epoch": 0.048, "grad_norm": 2.129903554916382, "grad_norm_var": 7.400441847159771, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.0785951614379883, "loss/hidden": 1.1015625, "loss/logits": 0.14143729209899902, "loss/reg": 3.413164085941389e-05, "step": 384 }, { "epoch": 0.048125, "grad_norm": 2.513585329055786, "grad_norm_var": 7.349500066162391, "learning_rate": 0.0001, "loss": 1.46, "loss/crossentropy": 2.378612756729126, "loss/hidden": 1.25, "loss/logits": 0.20967382192611694, "loss/reg": 3.413192825973965e-05, "step": 385 }, { "epoch": 0.04825, "grad_norm": 1.8059685230255127, "grad_norm_var": 7.3836732232467055, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.4599719047546387, "loss/hidden": 0.9609375, "loss/logits": 0.12968455255031586, "loss/reg": 3.412771911825985e-05, "step": 386 }, { "epoch": 0.048375, "grad_norm": 1.9648966789245605, "grad_norm_var": 7.374281548362966, "learning_rate": 0.0001, "loss": 1.0183, "loss/crossentropy": 2.405507802963257, "loss/hidden": 0.890625, "loss/logits": 0.12728646397590637, "loss/reg": 3.412525984458625e-05, "step": 387 }, { "epoch": 0.0485, "grad_norm": 2.7891759872436523, "grad_norm_var": 7.29369073112806, "learning_rate": 0.0001, "loss": 1.2625, "loss/crossentropy": 2.564371347427368, "loss/hidden": 1.078125, "loss/logits": 0.18406376242637634, "loss/reg": 3.4118053008569404e-05, "step": 388 }, { "epoch": 0.048625, "grad_norm": 3.2247207164764404, "grad_norm_var": 7.166662268117063, "learning_rate": 0.0001, "loss": 1.5145, "loss/crossentropy": 2.9137074947357178, "loss/hidden": 1.3125, "loss/logits": 0.20163431763648987, "loss/reg": 3.411182842683047e-05, "step": 389 }, { "epoch": 0.04875, "grad_norm": 2.1028342247009277, "grad_norm_var": 7.193139907597329, "learning_rate": 0.0001, "loss": 1.0431, "loss/crossentropy": 2.8769431114196777, "loss/hidden": 0.9140625, "loss/logits": 0.1287393867969513, "loss/reg": 3.410813951632008e-05, "step": 390 }, { "epoch": 0.048875, "grad_norm": 2.3499820232391357, "grad_norm_var": 7.186969405638869, "learning_rate": 0.0001, "loss": 1.2858, "loss/crossentropy": 2.426356315612793, "loss/hidden": 1.109375, "loss/logits": 0.17607171833515167, "loss/reg": 3.410330828046426e-05, "step": 391 }, { "epoch": 0.049, "grad_norm": 1.912327527999878, "grad_norm_var": 7.244567116261923, "learning_rate": 0.0001, "loss": 1.362, "loss/crossentropy": 2.551398277282715, "loss/hidden": 1.15625, "loss/logits": 0.20536215603351593, "loss/reg": 3.409969940548763e-05, "step": 392 }, { "epoch": 0.049125, "grad_norm": 1.8643162250518799, "grad_norm_var": 7.302740869176493, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.4072647094726562, "loss/hidden": 1.171875, "loss/logits": 0.17286883294582367, "loss/reg": 3.40941951435525e-05, "step": 393 }, { "epoch": 0.04925, "grad_norm": 2.126138925552368, "grad_norm_var": 7.313922412927239, "learning_rate": 0.0001, "loss": 1.1946, "loss/crossentropy": 2.6316819190979004, "loss/hidden": 1.0390625, "loss/logits": 0.15518739819526672, "loss/reg": 3.409163764445111e-05, "step": 394 }, { "epoch": 0.049375, "grad_norm": 5.341352939605713, "grad_norm_var": 7.646205880923245, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.2951955795288086, "loss/hidden": 1.171875, "loss/logits": 0.19052964448928833, "loss/reg": 3.408612610655837e-05, "step": 395 }, { "epoch": 0.0495, "grad_norm": 1.841691017150879, "grad_norm_var": 7.697707430188741, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.412738561630249, "loss/hidden": 0.9765625, "loss/logits": 0.14905960857868195, "loss/reg": 3.408074189792387e-05, "step": 396 }, { "epoch": 0.049625, "grad_norm": 2.309384346008301, "grad_norm_var": 0.7576436792480905, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.3188250064849854, "loss/hidden": 1.2421875, "loss/logits": 0.16045960783958435, "loss/reg": 3.407212716410868e-05, "step": 397 }, { "epoch": 0.04975, "grad_norm": 2.7863593101501465, "grad_norm_var": 0.7480129573726607, "learning_rate": 0.0001, "loss": 1.2891, "loss/crossentropy": 2.3670787811279297, "loss/hidden": 1.109375, "loss/logits": 0.17938411235809326, "loss/reg": 3.406599716981873e-05, "step": 398 }, { "epoch": 0.049875, "grad_norm": 1.8358136415481567, "grad_norm_var": 0.7715855741782158, "learning_rate": 0.0001, "loss": 1.1237, "loss/crossentropy": 2.514230966567993, "loss/hidden": 0.9921875, "loss/logits": 0.13119591772556305, "loss/reg": 3.4062337363138795e-05, "step": 399 }, { "epoch": 0.05, "grad_norm": 2.350162982940674, "grad_norm_var": 0.7657706364738013, "learning_rate": 0.0001, "loss": 1.2326, "loss/crossentropy": 2.713247299194336, "loss/hidden": 1.0625, "loss/logits": 0.1697346568107605, "loss/reg": 3.4054195566568524e-05, "step": 400 }, { "epoch": 0.050125, "grad_norm": 2.63322377204895, "grad_norm_var": 0.7677605659354261, "learning_rate": 0.0001, "loss": 1.1099, "loss/crossentropy": 2.440999746322632, "loss/hidden": 0.96484375, "loss/logits": 0.14475254714488983, "loss/reg": 3.404737435630523e-05, "step": 401 }, { "epoch": 0.05025, "grad_norm": 1.7843867540359497, "grad_norm_var": 0.769649818838896, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.3848907947540283, "loss/hidden": 1.0234375, "loss/logits": 0.13809266686439514, "loss/reg": 3.403963637538254e-05, "step": 402 }, { "epoch": 0.050375, "grad_norm": 2.4891116619110107, "grad_norm_var": 0.7528451996298959, "learning_rate": 0.0001, "loss": 1.4016, "loss/crossentropy": 2.2915351390838623, "loss/hidden": 1.2265625, "loss/logits": 0.17465338110923767, "loss/reg": 3.4031403629342094e-05, "step": 403 }, { "epoch": 0.0505, "grad_norm": 2.6278252601623535, "grad_norm_var": 0.7479028879806237, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.657773494720459, "loss/hidden": 1.140625, "loss/logits": 0.19131596386432648, "loss/reg": 3.402295624255203e-05, "step": 404 }, { "epoch": 0.050625, "grad_norm": 2.6351633071899414, "grad_norm_var": 0.7105926512095813, "learning_rate": 0.0001, "loss": 1.09, "loss/crossentropy": 2.341869592666626, "loss/hidden": 0.91796875, "loss/logits": 0.1717243790626526, "loss/reg": 3.401270805625245e-05, "step": 405 }, { "epoch": 0.05075, "grad_norm": 1.9917317628860474, "grad_norm_var": 0.7163125714595162, "learning_rate": 0.0001, "loss": 1.2231, "loss/crossentropy": 2.5361015796661377, "loss/hidden": 1.0625, "loss/logits": 0.16022570431232452, "loss/reg": 3.400079003768042e-05, "step": 406 }, { "epoch": 0.050875, "grad_norm": 2.01811146736145, "grad_norm_var": 0.7267341041079096, "learning_rate": 0.0001, "loss": 1.0135, "loss/crossentropy": 2.3607399463653564, "loss/hidden": 0.890625, "loss/logits": 0.12249953299760818, "loss/reg": 3.399254273972474e-05, "step": 407 }, { "epoch": 0.051, "grad_norm": 2.469839096069336, "grad_norm_var": 0.7092258078292549, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.1284053325653076, "loss/hidden": 1.1953125, "loss/logits": 0.1626756489276886, "loss/reg": 3.398398621357046e-05, "step": 408 }, { "epoch": 0.051125, "grad_norm": 2.0157532691955566, "grad_norm_var": 0.6989536122316408, "learning_rate": 0.0001, "loss": 1.2599, "loss/crossentropy": 2.5868444442749023, "loss/hidden": 1.09375, "loss/logits": 0.16585640609264374, "loss/reg": 3.3978514693444595e-05, "step": 409 }, { "epoch": 0.05125, "grad_norm": 2.468764543533325, "grad_norm_var": 0.6913355184321047, "learning_rate": 0.0001, "loss": 1.2353, "loss/crossentropy": 2.4670286178588867, "loss/hidden": 1.0703125, "loss/logits": 0.16462820768356323, "loss/reg": 3.397303225938231e-05, "step": 410 }, { "epoch": 0.051375, "grad_norm": 1.657066822052002, "grad_norm_var": 0.13160569161618738, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.463649034500122, "loss/hidden": 0.9609375, "loss/logits": 0.13913306593894958, "loss/reg": 3.397110413061455e-05, "step": 411 }, { "epoch": 0.0515, "grad_norm": 2.3177919387817383, "grad_norm_var": 0.12019285492734794, "learning_rate": 0.0001, "loss": 1.2397, "loss/crossentropy": 2.44539737701416, "loss/hidden": 1.0625, "loss/logits": 0.17688173055648804, "loss/reg": 3.396526153665036e-05, "step": 412 }, { "epoch": 0.051625, "grad_norm": 2.2815206050872803, "grad_norm_var": 0.12011142743010049, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.4922685623168945, "loss/hidden": 1.1796875, "loss/logits": 0.15103884041309357, "loss/reg": 3.396152169443667e-05, "step": 413 }, { "epoch": 0.05175, "grad_norm": 2.062899351119995, "grad_norm_var": 0.1032718534450779, "learning_rate": 0.0001, "loss": 1.1936, "loss/crossentropy": 2.671872138977051, "loss/hidden": 1.0390625, "loss/logits": 0.15416079759597778, "loss/reg": 3.3957923733396456e-05, "step": 414 }, { "epoch": 0.051875, "grad_norm": 1.9539638757705688, "grad_norm_var": 0.09797476372330317, "learning_rate": 0.0001, "loss": 1.1046, "loss/crossentropy": 2.6380488872528076, "loss/hidden": 0.97265625, "loss/logits": 0.1315636783838272, "loss/reg": 3.395261592231691e-05, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.2393951416015625, "grad_norm_var": 0.09703828398074225, "learning_rate": 0.0001, "loss": 1.1526, "loss/crossentropy": 2.6465938091278076, "loss/hidden": 0.99609375, "loss/logits": 0.1561371386051178, "loss/reg": 3.394690065761097e-05, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.3431386947631836, "grad_norm_var": 0.08662086074431645, "learning_rate": 0.0001, "loss": 1.1849, "loss/crossentropy": 2.598094940185547, "loss/hidden": 1.0390625, "loss/logits": 0.14550316333770752, "loss/reg": 3.393869337742217e-05, "step": 417 }, { "epoch": 0.05225, "grad_norm": 1.8472511768341064, "grad_norm_var": 0.08330225189024129, "learning_rate": 0.0001, "loss": 1.1985, "loss/crossentropy": 2.5920615196228027, "loss/hidden": 1.0546875, "loss/logits": 0.14346075057983398, "loss/reg": 3.392928192624822e-05, "step": 418 }, { "epoch": 0.052375, "grad_norm": 1.7260291576385498, "grad_norm_var": 0.09167492136177748, "learning_rate": 0.0001, "loss": 1.0539, "loss/crossentropy": 2.678154468536377, "loss/hidden": 0.921875, "loss/logits": 0.13166582584381104, "loss/reg": 3.3915493986569345e-05, "step": 419 }, { "epoch": 0.0525, "grad_norm": 2.1379270553588867, "grad_norm_var": 0.0765096237299017, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.554790735244751, "loss/hidden": 1.140625, "loss/logits": 0.16276058554649353, "loss/reg": 3.390039273654111e-05, "step": 420 }, { "epoch": 0.052625, "grad_norm": 2.773489475250244, "grad_norm_var": 0.08692294666244584, "learning_rate": 0.0001, "loss": 1.286, "loss/crossentropy": 2.5890371799468994, "loss/hidden": 1.125, "loss/logits": 0.16067233681678772, "loss/reg": 3.3892716601258144e-05, "step": 421 }, { "epoch": 0.05275, "grad_norm": 1.9473000764846802, "grad_norm_var": 0.0879486532075814, "learning_rate": 0.0001, "loss": 1.1138, "loss/crossentropy": 2.544093370437622, "loss/hidden": 0.9765625, "loss/logits": 0.13691496849060059, "loss/reg": 3.38886420649942e-05, "step": 422 }, { "epoch": 0.052875, "grad_norm": 2.065152406692505, "grad_norm_var": 0.08731452126513635, "learning_rate": 0.0001, "loss": 1.0308, "loss/crossentropy": 2.4369661808013916, "loss/hidden": 0.90234375, "loss/logits": 0.12809085845947266, "loss/reg": 3.38791505782865e-05, "step": 423 }, { "epoch": 0.053, "grad_norm": 1.76610267162323, "grad_norm_var": 0.08771260345232476, "learning_rate": 0.0001, "loss": 1.1559, "loss/crossentropy": 2.437058210372925, "loss/hidden": 1.015625, "loss/logits": 0.13998199999332428, "loss/reg": 3.387559627299197e-05, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.1202235221862793, "grad_norm_var": 0.08721813960099964, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.5524604320526123, "loss/hidden": 0.98046875, "loss/logits": 0.15982511639595032, "loss/reg": 3.3869648177642375e-05, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.3238697052001953, "grad_norm_var": 0.08153644484325746, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.6345760822296143, "loss/hidden": 1.1171875, "loss/logits": 0.15670067071914673, "loss/reg": 3.386356183909811e-05, "step": 426 }, { "epoch": 0.053375, "grad_norm": 1.8629034757614136, "grad_norm_var": 0.07209149684443308, "learning_rate": 0.0001, "loss": 1.1243, "loss/crossentropy": 2.4134674072265625, "loss/hidden": 0.96875, "loss/logits": 0.1552024483680725, "loss/reg": 3.385494346730411e-05, "step": 427 }, { "epoch": 0.0535, "grad_norm": 2.185732841491699, "grad_norm_var": 0.06953255529498938, "learning_rate": 0.0001, "loss": 1.3593, "loss/crossentropy": 2.088414430618286, "loss/hidden": 1.203125, "loss/logits": 0.15583863854408264, "loss/reg": 3.384939191164449e-05, "step": 428 }, { "epoch": 0.053625, "grad_norm": 2.0822196006774902, "grad_norm_var": 0.06725276287184746, "learning_rate": 0.0001, "loss": 1.1775, "loss/crossentropy": 2.2571120262145996, "loss/hidden": 1.03125, "loss/logits": 0.1458958387374878, "loss/reg": 3.3843141864053905e-05, "step": 429 }, { "epoch": 0.05375, "grad_norm": 1.9582529067993164, "grad_norm_var": 0.06831322983159846, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.273467779159546, "loss/hidden": 1.09375, "loss/logits": 0.16964468359947205, "loss/reg": 3.38399586325977e-05, "step": 430 }, { "epoch": 0.053875, "grad_norm": 2.5428881645202637, "grad_norm_var": 0.07983358220816962, "learning_rate": 0.0001, "loss": 1.08, "loss/crossentropy": 2.6513051986694336, "loss/hidden": 0.94921875, "loss/logits": 0.13046178221702576, "loss/reg": 3.3836400689324364e-05, "step": 431 }, { "epoch": 0.054, "grad_norm": 5.832214832305908, "grad_norm_var": 0.9437448574938159, "learning_rate": 0.0001, "loss": 1.9424, "loss/crossentropy": 2.6202034950256348, "loss/hidden": 1.640625, "loss/logits": 0.30144432187080383, "loss/reg": 3.383049988769926e-05, "step": 432 }, { "epoch": 0.054125, "grad_norm": 17.29625129699707, "grad_norm_var": 14.915418371233736, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.5406692028045654, "loss/hidden": 1.1953125, "loss/logits": 0.17335116863250732, "loss/reg": 3.382577415322885e-05, "step": 433 }, { "epoch": 0.05425, "grad_norm": 2.1661648750305176, "grad_norm_var": 14.860884296803356, "learning_rate": 0.0001, "loss": 1.3231, "loss/crossentropy": 2.486368417739868, "loss/hidden": 1.1328125, "loss/logits": 0.18998079001903534, "loss/reg": 3.381881833774969e-05, "step": 434 }, { "epoch": 0.054375, "grad_norm": 3.2340309619903564, "grad_norm_var": 14.686707047148603, "learning_rate": 0.0001, "loss": 1.1883, "loss/crossentropy": 2.6120805740356445, "loss/hidden": 1.03125, "loss/logits": 0.15667462348937988, "loss/reg": 3.381211718078703e-05, "step": 435 }, { "epoch": 0.0545, "grad_norm": 2.4656379222869873, "grad_norm_var": 14.638560696511908, "learning_rate": 0.0001, "loss": 1.2838, "loss/crossentropy": 2.5453333854675293, "loss/hidden": 1.1171875, "loss/logits": 0.1663120836019516, "loss/reg": 3.380520502105355e-05, "step": 436 }, { "epoch": 0.054625, "grad_norm": 2.4808294773101807, "grad_norm_var": 14.668903570755694, "learning_rate": 0.0001, "loss": 1.3034, "loss/crossentropy": 2.254472255706787, "loss/hidden": 1.09375, "loss/logits": 0.20927491784095764, "loss/reg": 3.3801999961724505e-05, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.0809855461120605, "grad_norm_var": 14.644204809831463, "learning_rate": 0.0001, "loss": 1.2904, "loss/crossentropy": 2.387235403060913, "loss/hidden": 1.125, "loss/logits": 0.16507935523986816, "loss/reg": 3.380004272912629e-05, "step": 438 }, { "epoch": 0.054875, "grad_norm": 2.5827701091766357, "grad_norm_var": 14.568551148225374, "learning_rate": 0.0001, "loss": 1.1913, "loss/crossentropy": 2.5564305782318115, "loss/hidden": 1.0390625, "loss/logits": 0.15191948413848877, "loss/reg": 3.379736881470308e-05, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.4748318195343018, "grad_norm_var": 14.44211406577169, "learning_rate": 0.0001, "loss": 1.3339, "loss/crossentropy": 2.169847249984741, "loss/hidden": 1.140625, "loss/logits": 0.1929139941930771, "loss/reg": 3.379437475814484e-05, "step": 440 }, { "epoch": 0.055125, "grad_norm": 2.17909836769104, "grad_norm_var": 14.43165167732106, "learning_rate": 0.0001, "loss": 1.1514, "loss/crossentropy": 2.6119625568389893, "loss/hidden": 1.0, "loss/logits": 0.1510833203792572, "loss/reg": 3.3788579457905143e-05, "step": 441 }, { "epoch": 0.05525, "grad_norm": 2.328896999359131, "grad_norm_var": 14.430875418614992, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.1644153594970703, "loss/hidden": 1.2578125, "loss/logits": 0.20078104734420776, "loss/reg": 3.37848650815431e-05, "step": 442 }, { "epoch": 0.055375, "grad_norm": 2.473328113555908, "grad_norm_var": 14.322173701255808, "learning_rate": 0.0001, "loss": 1.1936, "loss/crossentropy": 2.5964508056640625, "loss/hidden": 1.046875, "loss/logits": 0.1464037448167801, "loss/reg": 3.3778171200538054e-05, "step": 443 }, { "epoch": 0.0555, "grad_norm": 4.068549633026123, "grad_norm_var": 14.208086262392497, "learning_rate": 0.0001, "loss": 1.3952, "loss/crossentropy": 2.8060688972473145, "loss/hidden": 1.1875, "loss/logits": 0.20741160213947296, "loss/reg": 3.37726560246665e-05, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.1623542308807373, "grad_norm_var": 14.191838680780066, "learning_rate": 0.0001, "loss": 1.0824, "loss/crossentropy": 2.724346160888672, "loss/hidden": 0.94921875, "loss/logits": 0.13284045457839966, "loss/reg": 3.3767199056455866e-05, "step": 445 }, { "epoch": 0.05575, "grad_norm": 2.2896392345428467, "grad_norm_var": 14.12415401393583, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.3175323009490967, "loss/hidden": 1.1796875, "loss/logits": 0.18897001445293427, "loss/reg": 3.375912274350412e-05, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.046797037124634, "grad_norm_var": 14.213834657666071, "learning_rate": 0.0001, "loss": 1.1878, "loss/crossentropy": 2.6415698528289795, "loss/hidden": 1.0390625, "loss/logits": 0.14838215708732605, "loss/reg": 3.3750762668205425e-05, "step": 447 }, { "epoch": 0.056, "grad_norm": 3.2464237213134766, "grad_norm_var": 13.874242204082186, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.4582679271698, "loss/hidden": 1.1875, "loss/logits": 0.2301977574825287, "loss/reg": 3.3743133826646954e-05, "step": 448 }, { "epoch": 0.056125, "grad_norm": 2.189342737197876, "grad_norm_var": 0.29544563519738554, "learning_rate": 0.0001, "loss": 1.2354, "loss/crossentropy": 2.224080801010132, "loss/hidden": 1.109375, "loss/logits": 0.12568463385105133, "loss/reg": 3.3736727345967665e-05, "step": 449 }, { "epoch": 0.05625, "grad_norm": 1.8410553932189941, "grad_norm_var": 0.3177951887186661, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.3079254627227783, "loss/hidden": 1.015625, "loss/logits": 0.15645131468772888, "loss/reg": 3.372762876097113e-05, "step": 450 }, { "epoch": 0.056375, "grad_norm": 2.075913667678833, "grad_norm_var": 0.28967181210960763, "learning_rate": 0.0001, "loss": 1.3254, "loss/crossentropy": 2.3725907802581787, "loss/hidden": 1.140625, "loss/logits": 0.18442153930664062, "loss/reg": 3.3720290957717225e-05, "step": 451 }, { "epoch": 0.0565, "grad_norm": 2.2170984745025635, "grad_norm_var": 0.29257204608246923, "learning_rate": 0.0001, "loss": 1.3107, "loss/crossentropy": 2.410374164581299, "loss/hidden": 1.1328125, "loss/logits": 0.17750215530395508, "loss/reg": 3.371315688127652e-05, "step": 452 }, { "epoch": 0.056625, "grad_norm": 2.1257264614105225, "grad_norm_var": 0.2976260957554473, "learning_rate": 0.0001, "loss": 1.1145, "loss/crossentropy": 2.1573164463043213, "loss/hidden": 0.9609375, "loss/logits": 0.1532134860754013, "loss/reg": 3.3702854125294834e-05, "step": 453 }, { "epoch": 0.05675, "grad_norm": 2.020737409591675, "grad_norm_var": 0.3004070010410295, "learning_rate": 0.0001, "loss": 1.2252, "loss/crossentropy": 2.5048506259918213, "loss/hidden": 1.0703125, "loss/logits": 0.15453127026557922, "loss/reg": 3.3693660952849314e-05, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.0882959365844727, "grad_norm_var": 0.30331944550090667, "learning_rate": 0.0001, "loss": 1.148, "loss/crossentropy": 2.6023924350738525, "loss/hidden": 1.0, "loss/logits": 0.1476426124572754, "loss/reg": 3.368509715073742e-05, "step": 455 }, { "epoch": 0.057, "grad_norm": 1.9702045917510986, "grad_norm_var": 0.31179501443108676, "learning_rate": 0.0001, "loss": 1.0904, "loss/crossentropy": 2.414108991622925, "loss/hidden": 0.953125, "loss/logits": 0.13695700466632843, "loss/reg": 3.367620593053289e-05, "step": 456 }, { "epoch": 0.057125, "grad_norm": 2.7802071571350098, "grad_norm_var": 0.32206609917582013, "learning_rate": 0.0001, "loss": 1.1101, "loss/crossentropy": 2.802133560180664, "loss/hidden": 0.96484375, "loss/logits": 0.14492589235305786, "loss/reg": 3.3667642128420994e-05, "step": 457 }, { "epoch": 0.05725, "grad_norm": 2.838331699371338, "grad_norm_var": 0.3354750209379326, "learning_rate": 0.0001, "loss": 1.3563, "loss/crossentropy": 2.409419536590576, "loss/hidden": 1.1875, "loss/logits": 0.16849547624588013, "loss/reg": 3.365922748344019e-05, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.1942574977874756, "grad_norm_var": 0.33769313303004084, "learning_rate": 0.0001, "loss": 1.1997, "loss/crossentropy": 2.5466361045837402, "loss/hidden": 1.0390625, "loss/logits": 0.16028670966625214, "loss/reg": 3.3653053833404556e-05, "step": 459 }, { "epoch": 0.0575, "grad_norm": 1.9543126821517944, "grad_norm_var": 0.14238904796044755, "learning_rate": 0.0001, "loss": 1.082, "loss/crossentropy": 2.3072826862335205, "loss/hidden": 0.9453125, "loss/logits": 0.13639651238918304, "loss/reg": 3.36485099978745e-05, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.3599038124084473, "grad_norm_var": 0.14245257928573613, "learning_rate": 0.0001, "loss": 1.3462, "loss/crossentropy": 2.5248610973358154, "loss/hidden": 1.15625, "loss/logits": 0.18964824080467224, "loss/reg": 3.3640484616626054e-05, "step": 461 }, { "epoch": 0.05775, "grad_norm": 2.160914421081543, "grad_norm_var": 0.14306343844920466, "learning_rate": 0.0001, "loss": 1.1187, "loss/crossentropy": 2.485192060470581, "loss/hidden": 0.95703125, "loss/logits": 0.16129833459854126, "loss/reg": 3.363731593708508e-05, "step": 462 }, { "epoch": 0.057875, "grad_norm": 1.96443510055542, "grad_norm_var": 0.14579406927241975, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.3434600830078125, "loss/hidden": 1.03125, "loss/logits": 0.14036868512630463, "loss/reg": 3.362847928656265e-05, "step": 463 }, { "epoch": 0.058, "grad_norm": 2.240238666534424, "grad_norm_var": 0.07561911079075631, "learning_rate": 0.0001, "loss": 1.2581, "loss/crossentropy": 2.66363263130188, "loss/hidden": 1.1015625, "loss/logits": 0.1561916470527649, "loss/reg": 3.361971539561637e-05, "step": 464 }, { "epoch": 0.058125, "grad_norm": 2.9686269760131836, "grad_norm_var": 0.11362960790722566, "learning_rate": 0.0001, "loss": 1.3006, "loss/crossentropy": 2.733015775680542, "loss/hidden": 1.078125, "loss/logits": 0.22209219634532928, "loss/reg": 3.361287963343784e-05, "step": 465 }, { "epoch": 0.05825, "grad_norm": 2.5101568698883057, "grad_norm_var": 0.10624098469997983, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.588346242904663, "loss/hidden": 0.99609375, "loss/logits": 0.1446218490600586, "loss/reg": 3.360513073857874e-05, "step": 466 }, { "epoch": 0.058375, "grad_norm": 2.770623207092285, "grad_norm_var": 0.11756231178518603, "learning_rate": 0.0001, "loss": 1.3929, "loss/crossentropy": 1.7947008609771729, "loss/hidden": 1.2109375, "loss/logits": 0.18167641758918762, "loss/reg": 3.359945912961848e-05, "step": 467 }, { "epoch": 0.0585, "grad_norm": 1.9178507328033447, "grad_norm_var": 0.1273747784869385, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.3694558143615723, "loss/hidden": 0.94140625, "loss/logits": 0.14460425078868866, "loss/reg": 3.359114271006547e-05, "step": 468 }, { "epoch": 0.058625, "grad_norm": 2.9377102851867676, "grad_norm_var": 0.14927586898533457, "learning_rate": 0.0001, "loss": 1.3075, "loss/crossentropy": 2.737233877182007, "loss/hidden": 1.1328125, "loss/logits": 0.17436236143112183, "loss/reg": 3.35832592099905e-05, "step": 469 }, { "epoch": 0.05875, "grad_norm": 2.4766845703125, "grad_norm_var": 0.14196017860283514, "learning_rate": 0.0001, "loss": 1.2198, "loss/crossentropy": 2.725649118423462, "loss/hidden": 1.046875, "loss/logits": 0.17263534665107727, "loss/reg": 3.3575062843738124e-05, "step": 470 }, { "epoch": 0.058875, "grad_norm": 2.97731876373291, "grad_norm_var": 0.15638940419960357, "learning_rate": 0.0001, "loss": 1.1337, "loss/crossentropy": 2.4248714447021484, "loss/hidden": 0.98828125, "loss/logits": 0.1451077163219452, "loss/reg": 3.3564418117748573e-05, "step": 471 }, { "epoch": 0.059, "grad_norm": 2.4938745498657227, "grad_norm_var": 0.14080595119560016, "learning_rate": 0.0001, "loss": 1.2354, "loss/crossentropy": 2.5639638900756836, "loss/hidden": 1.0625, "loss/logits": 0.17252308130264282, "loss/reg": 3.355655644554645e-05, "step": 472 }, { "epoch": 0.059125, "grad_norm": 2.453796148300171, "grad_norm_var": 0.13403350770174421, "learning_rate": 0.0001, "loss": 1.3133, "loss/crossentropy": 2.535618543624878, "loss/hidden": 1.140625, "loss/logits": 0.1723370999097824, "loss/reg": 3.354718137416057e-05, "step": 473 }, { "epoch": 0.05925, "grad_norm": 2.3145835399627686, "grad_norm_var": 0.12414269824475065, "learning_rate": 0.0001, "loss": 1.1404, "loss/crossentropy": 2.5441014766693115, "loss/hidden": 0.9921875, "loss/logits": 0.14785614609718323, "loss/reg": 3.353881766088307e-05, "step": 474 }, { "epoch": 0.059375, "grad_norm": 2.3766791820526123, "grad_norm_var": 0.12076940932042811, "learning_rate": 0.0001, "loss": 1.546, "loss/crossentropy": 2.321577310562134, "loss/hidden": 1.3203125, "loss/logits": 0.22531206905841827, "loss/reg": 3.352982457727194e-05, "step": 475 }, { "epoch": 0.0595, "grad_norm": 2.9967336654663086, "grad_norm_var": 0.1225888750658117, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.00260066986084, "loss/hidden": 1.375, "loss/logits": 0.21702903509140015, "loss/reg": 3.3521097066113725e-05, "step": 476 }, { "epoch": 0.059625, "grad_norm": 2.6383934020996094, "grad_norm_var": 0.12241946620474262, "learning_rate": 0.0001, "loss": 1.4419, "loss/crossentropy": 2.5109200477600098, "loss/hidden": 1.2265625, "loss/logits": 0.21502982079982758, "loss/reg": 3.3512478694319725e-05, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.120065450668335, "grad_norm_var": 0.12443820755625362, "learning_rate": 0.0001, "loss": 1.2943, "loss/crossentropy": 2.4915192127227783, "loss/hidden": 1.1171875, "loss/logits": 0.17678330838680267, "loss/reg": 3.3503984013805166e-05, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.858367681503296, "grad_norm_var": 0.10937309591752348, "learning_rate": 0.0001, "loss": 1.4656, "loss/crossentropy": 2.3548312187194824, "loss/hidden": 1.265625, "loss/logits": 0.19961079955101013, "loss/reg": 3.349495091242716e-05, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.285705089569092, "grad_norm_var": 0.1075290964460765, "learning_rate": 0.0001, "loss": 1.2262, "loss/crossentropy": 2.489813804626465, "loss/hidden": 1.0390625, "loss/logits": 0.18675509095191956, "loss/reg": 3.3486459869891405e-05, "step": 480 }, { "epoch": 0.060125, "grad_norm": 2.094900131225586, "grad_norm_var": 0.10863647120417165, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.3183302879333496, "loss/hidden": 1.171875, "loss/logits": 0.1980261653661728, "loss/reg": 3.3478718250989914e-05, "step": 481 }, { "epoch": 0.06025, "grad_norm": 2.406928777694702, "grad_norm_var": 0.10935489058969262, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 2.5121381282806396, "loss/hidden": 1.15625, "loss/logits": 0.1697903275489807, "loss/reg": 3.3472137147327885e-05, "step": 482 }, { "epoch": 0.060375, "grad_norm": 1.7551078796386719, "grad_norm_var": 0.1381837528506061, "learning_rate": 0.0001, "loss": 1.0999, "loss/crossentropy": 2.433582067489624, "loss/hidden": 0.9609375, "loss/logits": 0.13861994445323944, "loss/reg": 3.34642463712953e-05, "step": 483 }, { "epoch": 0.0605, "grad_norm": 2.15712308883667, "grad_norm_var": 0.12497483119508387, "learning_rate": 0.0001, "loss": 1.264, "loss/crossentropy": 2.77188777923584, "loss/hidden": 1.1015625, "loss/logits": 0.1621045470237732, "loss/reg": 3.345516961417161e-05, "step": 484 }, { "epoch": 0.060625, "grad_norm": 2.0167171955108643, "grad_norm_var": 0.11920370288203964, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 2.546163558959961, "loss/hidden": 1.1484375, "loss/logits": 0.19433115422725677, "loss/reg": 3.344708966324106e-05, "step": 485 }, { "epoch": 0.06075, "grad_norm": 2.6408584117889404, "grad_norm_var": 0.12253544383796963, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.7440547943115234, "loss/hidden": 1.1953125, "loss/logits": 0.1834029257297516, "loss/reg": 3.343883508932777e-05, "step": 486 }, { "epoch": 0.060875, "grad_norm": 2.1073925495147705, "grad_norm_var": 0.10422711697162654, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.180604934692383, "loss/hidden": 0.953125, "loss/logits": 0.14150169491767883, "loss/reg": 3.343077696627006e-05, "step": 487 }, { "epoch": 0.061, "grad_norm": 2.980268716812134, "grad_norm_var": 0.1278688011981179, "learning_rate": 0.0001, "loss": 1.3713, "loss/crossentropy": 2.511422634124756, "loss/hidden": 1.1796875, "loss/logits": 0.19132453203201294, "loss/reg": 3.3423166314605623e-05, "step": 488 }, { "epoch": 0.061125, "grad_norm": 2.237560272216797, "grad_norm_var": 0.1288862839917743, "learning_rate": 0.0001, "loss": 1.0922, "loss/crossentropy": 2.361391305923462, "loss/hidden": 0.94921875, "loss/logits": 0.14263351261615753, "loss/reg": 3.341743286000565e-05, "step": 489 }, { "epoch": 0.06125, "grad_norm": 2.2243480682373047, "grad_norm_var": 0.1301125949056683, "learning_rate": 0.0001, "loss": 1.2353, "loss/crossentropy": 2.6845271587371826, "loss/hidden": 1.0703125, "loss/logits": 0.16469591856002808, "loss/reg": 3.340901093906723e-05, "step": 490 }, { "epoch": 0.061375, "grad_norm": 1.9749258756637573, "grad_norm_var": 0.13976616590314225, "learning_rate": 0.0001, "loss": 1.0788, "loss/crossentropy": 2.8269782066345215, "loss/hidden": 0.9453125, "loss/logits": 0.13316524028778076, "loss/reg": 3.340181865496561e-05, "step": 491 }, { "epoch": 0.0615, "grad_norm": 3.8082363605499268, "grad_norm_var": 0.2516089050798465, "learning_rate": 0.0001, "loss": 1.1953, "loss/crossentropy": 3.110802412033081, "loss/hidden": 1.03125, "loss/logits": 0.16368849575519562, "loss/reg": 3.33938623953145e-05, "step": 492 }, { "epoch": 0.061625, "grad_norm": 7.89940881729126, "grad_norm_var": 2.152808837213317, "learning_rate": 0.0001, "loss": 1.5515, "loss/crossentropy": 2.6210036277770996, "loss/hidden": 1.40625, "loss/logits": 0.1449393779039383, "loss/reg": 3.338697206345387e-05, "step": 493 }, { "epoch": 0.06175, "grad_norm": 2.5107972621917725, "grad_norm_var": 2.1309396475018327, "learning_rate": 0.0001, "loss": 1.2771, "loss/crossentropy": 2.296051263809204, "loss/hidden": 1.09375, "loss/logits": 0.18302392959594727, "loss/reg": 3.337907401146367e-05, "step": 494 }, { "epoch": 0.061875, "grad_norm": 2.3817129135131836, "grad_norm_var": 2.138088174245088, "learning_rate": 0.0001, "loss": 1.2106, "loss/crossentropy": 2.6225788593292236, "loss/hidden": 1.0546875, "loss/logits": 0.1555565595626831, "loss/reg": 3.3369677112204954e-05, "step": 495 }, { "epoch": 0.062, "grad_norm": 2.297321081161499, "grad_norm_var": 2.137427651207279, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.326659679412842, "loss/hidden": 1.1796875, "loss/logits": 0.20210911333560944, "loss/reg": 3.3361084206262603e-05, "step": 496 }, { "epoch": 0.062125, "grad_norm": 2.1879894733428955, "grad_norm_var": 2.1302310419826815, "learning_rate": 0.0001, "loss": 1.3433, "loss/crossentropy": 2.6444246768951416, "loss/hidden": 1.15625, "loss/logits": 0.18667887151241302, "loss/reg": 3.3350897865602747e-05, "step": 497 }, { "epoch": 0.06225, "grad_norm": 2.7556395530700684, "grad_norm_var": 2.1230810021853803, "learning_rate": 0.0001, "loss": 1.1761, "loss/crossentropy": 2.8364853858947754, "loss/hidden": 1.015625, "loss/logits": 0.16018438339233398, "loss/reg": 3.334263601573184e-05, "step": 498 }, { "epoch": 0.062375, "grad_norm": 2.0885095596313477, "grad_norm_var": 2.085981261133649, "learning_rate": 0.0001, "loss": 1.4855, "loss/crossentropy": 2.0875301361083984, "loss/hidden": 1.2890625, "loss/logits": 0.19610214233398438, "loss/reg": 3.3336276828777045e-05, "step": 499 }, { "epoch": 0.0625, "grad_norm": 2.195967674255371, "grad_norm_var": 2.0829178782721693, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.2123799324035645, "loss/hidden": 1.1953125, "loss/logits": 0.16132420301437378, "loss/reg": 3.3326996344840154e-05, "step": 500 }, { "epoch": 0.062625, "grad_norm": 1.9233721494674683, "grad_norm_var": 2.0928282179657134, "learning_rate": 0.0001, "loss": 1.2098, "loss/crossentropy": 2.4555163383483887, "loss/hidden": 1.0546875, "loss/logits": 0.15475736558437347, "loss/reg": 3.331886910018511e-05, "step": 501 }, { "epoch": 0.06275, "grad_norm": 1.7558976411819458, "grad_norm_var": 2.1562340342327277, "learning_rate": 0.0001, "loss": 1.1455, "loss/crossentropy": 2.4747304916381836, "loss/hidden": 0.984375, "loss/logits": 0.16081759333610535, "loss/reg": 3.331211337354034e-05, "step": 502 }, { "epoch": 0.062875, "grad_norm": 2.020193099975586, "grad_norm_var": 2.163693266292887, "learning_rate": 0.0001, "loss": 1.0334, "loss/crossentropy": 2.5233755111694336, "loss/hidden": 0.890625, "loss/logits": 0.14245402812957764, "loss/reg": 3.330542676849291e-05, "step": 503 }, { "epoch": 0.063, "grad_norm": 1.9017070531845093, "grad_norm_var": 2.196473105856813, "learning_rate": 0.0001, "loss": 1.187, "loss/crossentropy": 2.513927459716797, "loss/hidden": 1.03125, "loss/logits": 0.15539123117923737, "loss/reg": 3.329779065097682e-05, "step": 504 }, { "epoch": 0.063125, "grad_norm": 1.7002081871032715, "grad_norm_var": 2.243011213708456, "learning_rate": 0.0001, "loss": 1.0332, "loss/crossentropy": 2.3853862285614014, "loss/hidden": 0.90234375, "loss/logits": 0.13057225942611694, "loss/reg": 3.329246101202443e-05, "step": 505 }, { "epoch": 0.06325, "grad_norm": 3.703674793243408, "grad_norm_var": 2.305368345603883, "learning_rate": 0.0001, "loss": 1.6924, "loss/crossentropy": 2.689699172973633, "loss/hidden": 1.3359375, "loss/logits": 0.3561299741268158, "loss/reg": 3.328541060909629e-05, "step": 506 }, { "epoch": 0.063375, "grad_norm": 2.304819345474243, "grad_norm_var": 2.28053686149454, "learning_rate": 0.0001, "loss": 1.2551, "loss/crossentropy": 2.5181477069854736, "loss/hidden": 1.078125, "loss/logits": 0.17664968967437744, "loss/reg": 3.3279109629802406e-05, "step": 507 }, { "epoch": 0.0635, "grad_norm": 1.981182336807251, "grad_norm_var": 2.222780309447023, "learning_rate": 0.0001, "loss": 1.2923, "loss/crossentropy": 2.379751443862915, "loss/hidden": 1.109375, "loss/logits": 0.18254666030406952, "loss/reg": 3.327402373543009e-05, "step": 508 }, { "epoch": 0.063625, "grad_norm": 1.9515694379806519, "grad_norm_var": 0.23157529156244816, "learning_rate": 0.0001, "loss": 1.2068, "loss/crossentropy": 2.4413554668426514, "loss/hidden": 1.046875, "loss/logits": 0.15955983102321625, "loss/reg": 3.327105878270231e-05, "step": 509 }, { "epoch": 0.06375, "grad_norm": 2.283177614212036, "grad_norm_var": 0.2262545926208522, "learning_rate": 0.0001, "loss": 1.121, "loss/crossentropy": 2.586780309677124, "loss/hidden": 0.9453125, "loss/logits": 0.17534837126731873, "loss/reg": 3.3264575904468074e-05, "step": 510 }, { "epoch": 0.063875, "grad_norm": 2.5725600719451904, "grad_norm_var": 0.23278445739527623, "learning_rate": 0.0001, "loss": 1.372, "loss/crossentropy": 2.8506243228912354, "loss/hidden": 1.15625, "loss/logits": 0.21537676453590393, "loss/reg": 3.325942452647723e-05, "step": 511 }, { "epoch": 0.064, "grad_norm": 2.17802357673645, "grad_norm_var": 0.23254723734647367, "learning_rate": 0.0001, "loss": 1.4721, "loss/crossentropy": 2.531144618988037, "loss/hidden": 1.265625, "loss/logits": 0.20617413520812988, "loss/reg": 3.325657962705009e-05, "step": 512 }, { "epoch": 0.064125, "grad_norm": 2.052086353302002, "grad_norm_var": 0.23426407133045282, "learning_rate": 0.0001, "loss": 1.1786, "loss/crossentropy": 2.5373010635375977, "loss/hidden": 1.0, "loss/logits": 0.17822688817977905, "loss/reg": 3.3255280868615955e-05, "step": 513 }, { "epoch": 0.06425, "grad_norm": 2.104679584503174, "grad_norm_var": 0.21343636499370103, "learning_rate": 0.0001, "loss": 1.3482, "loss/crossentropy": 2.2992570400238037, "loss/hidden": 1.15625, "loss/logits": 0.19161126017570496, "loss/reg": 3.3254742447752506e-05, "step": 514 }, { "epoch": 0.064375, "grad_norm": 1.9803693294525146, "grad_norm_var": 0.2153401081871055, "learning_rate": 0.0001, "loss": 1.1247, "loss/crossentropy": 2.305745840072632, "loss/hidden": 0.98828125, "loss/logits": 0.13605481386184692, "loss/reg": 3.325332727399655e-05, "step": 515 }, { "epoch": 0.0645, "grad_norm": 2.516315221786499, "grad_norm_var": 0.22315819314323923, "learning_rate": 0.0001, "loss": 1.286, "loss/crossentropy": 2.6059908866882324, "loss/hidden": 1.1328125, "loss/logits": 0.15280824899673462, "loss/reg": 3.324857243569568e-05, "step": 516 }, { "epoch": 0.064625, "grad_norm": 1.9717873334884644, "grad_norm_var": 0.22162796366275472, "learning_rate": 0.0001, "loss": 1.1563, "loss/crossentropy": 2.566389322280884, "loss/hidden": 1.0, "loss/logits": 0.15599536895751953, "loss/reg": 3.324487624922767e-05, "step": 517 }, { "epoch": 0.06475, "grad_norm": 2.5053038597106934, "grad_norm_var": 0.21373832688979721, "learning_rate": 0.0001, "loss": 1.0753, "loss/crossentropy": 2.5956661701202393, "loss/hidden": 0.93359375, "loss/logits": 0.14140459895133972, "loss/reg": 3.324192584841512e-05, "step": 518 }, { "epoch": 0.064875, "grad_norm": 1.617448329925537, "grad_norm_var": 0.2353024678766184, "learning_rate": 0.0001, "loss": 1.2648, "loss/crossentropy": 2.1230597496032715, "loss/hidden": 1.09375, "loss/logits": 0.1706867814064026, "loss/reg": 3.323415876366198e-05, "step": 519 }, { "epoch": 0.065, "grad_norm": 2.6171135902404785, "grad_norm_var": 0.23809225200622847, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 3.001396894454956, "loss/hidden": 1.0859375, "loss/logits": 0.1818409115076065, "loss/reg": 3.32270392391365e-05, "step": 520 }, { "epoch": 0.065125, "grad_norm": 2.2164251804351807, "grad_norm_var": 0.21673222300943445, "learning_rate": 0.0001, "loss": 1.3872, "loss/crossentropy": 2.394097089767456, "loss/hidden": 1.171875, "loss/logits": 0.2150021493434906, "loss/reg": 3.322528209537268e-05, "step": 521 }, { "epoch": 0.06525, "grad_norm": 1.6968417167663574, "grad_norm_var": 0.08877967906966712, "learning_rate": 0.0001, "loss": 1.1035, "loss/crossentropy": 2.572463274002075, "loss/hidden": 0.96484375, "loss/logits": 0.13827435672283173, "loss/reg": 3.322424890939146e-05, "step": 522 }, { "epoch": 0.065375, "grad_norm": 2.6645708084106445, "grad_norm_var": 0.10384589830682482, "learning_rate": 0.0001, "loss": 1.4294, "loss/crossentropy": 2.662943124771118, "loss/hidden": 1.234375, "loss/logits": 0.19470733404159546, "loss/reg": 3.3224798244191334e-05, "step": 523 }, { "epoch": 0.0655, "grad_norm": 3.8316986560821533, "grad_norm_var": 0.26836197186599786, "learning_rate": 0.0001, "loss": 1.3972, "loss/crossentropy": 2.487124443054199, "loss/hidden": 1.203125, "loss/logits": 0.1937580108642578, "loss/reg": 3.32186245941557e-05, "step": 524 }, { "epoch": 0.065625, "grad_norm": 2.3071606159210205, "grad_norm_var": 0.2598635625197322, "learning_rate": 0.0001, "loss": 1.2605, "loss/crossentropy": 2.560915470123291, "loss/hidden": 1.109375, "loss/logits": 0.1507827639579773, "loss/reg": 3.322085103718564e-05, "step": 525 }, { "epoch": 0.06575, "grad_norm": 2.080793857574463, "grad_norm_var": 0.2634096601901807, "learning_rate": 0.0001, "loss": 1.1865, "loss/crossentropy": 2.3294875621795654, "loss/hidden": 1.0234375, "loss/logits": 0.16270865499973297, "loss/reg": 3.322162956465036e-05, "step": 526 }, { "epoch": 0.065875, "grad_norm": 2.533914566040039, "grad_norm_var": 0.26213502133962924, "learning_rate": 0.0001, "loss": 1.3337, "loss/crossentropy": 2.919621229171753, "loss/hidden": 1.140625, "loss/logits": 0.1927223801612854, "loss/reg": 3.3224547223653644e-05, "step": 527 }, { "epoch": 0.066, "grad_norm": 2.0774967670440674, "grad_norm_var": 0.2644639815857895, "learning_rate": 0.0001, "loss": 1.1001, "loss/crossentropy": 2.7280097007751465, "loss/hidden": 0.96484375, "loss/logits": 0.1349409520626068, "loss/reg": 3.3227763196919113e-05, "step": 528 }, { "epoch": 0.066125, "grad_norm": 2.5009524822235107, "grad_norm_var": 0.26231642591397886, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.3399899005889893, "loss/hidden": 1.2109375, "loss/logits": 0.21527621150016785, "loss/reg": 3.322759221191518e-05, "step": 529 }, { "epoch": 0.06625, "grad_norm": 2.312666177749634, "grad_norm_var": 0.2588706095933777, "learning_rate": 0.0001, "loss": 1.2942, "loss/crossentropy": 2.527575969696045, "loss/hidden": 1.1171875, "loss/logits": 0.17671984434127808, "loss/reg": 3.322830525576137e-05, "step": 530 }, { "epoch": 0.066375, "grad_norm": 2.528494358062744, "grad_norm_var": 0.2514069212263559, "learning_rate": 0.0001, "loss": 1.2719, "loss/crossentropy": 2.648963451385498, "loss/hidden": 1.1015625, "loss/logits": 0.17003270983695984, "loss/reg": 3.322178599773906e-05, "step": 531 }, { "epoch": 0.0665, "grad_norm": 3.481004476547241, "grad_norm_var": 0.3279166626744005, "learning_rate": 0.0001, "loss": 1.8273, "loss/crossentropy": 2.473355531692505, "loss/hidden": 1.546875, "loss/logits": 0.2800578474998474, "loss/reg": 3.321353142382577e-05, "step": 532 }, { "epoch": 0.066625, "grad_norm": 4.112295150756836, "grad_norm_var": 0.48236737999875434, "learning_rate": 0.0001, "loss": 1.7412, "loss/crossentropy": 2.67095685005188, "loss/hidden": 1.421875, "loss/logits": 0.31896963715553284, "loss/reg": 3.3209085813723505e-05, "step": 533 }, { "epoch": 0.06675, "grad_norm": 2.737959384918213, "grad_norm_var": 0.48381294167741634, "learning_rate": 0.0001, "loss": 1.1166, "loss/crossentropy": 2.6570234298706055, "loss/hidden": 0.98046875, "loss/logits": 0.13577935099601746, "loss/reg": 3.3210537367267534e-05, "step": 534 }, { "epoch": 0.066875, "grad_norm": 2.4249866008758545, "grad_norm_var": 0.42068279072435266, "learning_rate": 0.0001, "loss": 1.1667, "loss/crossentropy": 2.6176538467407227, "loss/hidden": 1.015625, "loss/logits": 0.15073555707931519, "loss/reg": 3.3202206395799294e-05, "step": 535 }, { "epoch": 0.067, "grad_norm": 2.17911696434021, "grad_norm_var": 0.43358738180024237, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.3476545810699463, "loss/hidden": 1.234375, "loss/logits": 0.18518668413162231, "loss/reg": 3.3198077289853245e-05, "step": 536 }, { "epoch": 0.067125, "grad_norm": 1.7613617181777954, "grad_norm_var": 0.4701310667265805, "learning_rate": 0.0001, "loss": 1.3332, "loss/crossentropy": 2.5358726978302, "loss/hidden": 1.15625, "loss/logits": 0.17660382390022278, "loss/reg": 3.319012466818094e-05, "step": 537 }, { "epoch": 0.06725, "grad_norm": 2.5631303787231445, "grad_norm_var": 0.415376700832607, "learning_rate": 0.0001, "loss": 1.4518, "loss/crossentropy": 2.551100015640259, "loss/hidden": 1.21875, "loss/logits": 0.23273751139640808, "loss/reg": 3.3181466278620064e-05, "step": 538 }, { "epoch": 0.067375, "grad_norm": 2.444988250732422, "grad_norm_var": 0.4174102900534516, "learning_rate": 0.0001, "loss": 1.2783, "loss/crossentropy": 2.5376105308532715, "loss/hidden": 1.1015625, "loss/logits": 0.17640447616577148, "loss/reg": 3.317417576909065e-05, "step": 539 }, { "epoch": 0.0675, "grad_norm": 1.9729820489883423, "grad_norm_var": 0.3323928474246306, "learning_rate": 0.0001, "loss": 1.098, "loss/crossentropy": 2.779029369354248, "loss/hidden": 0.9609375, "loss/logits": 0.13674432039260864, "loss/reg": 3.316561924293637e-05, "step": 540 }, { "epoch": 0.067625, "grad_norm": 2.1116485595703125, "grad_norm_var": 0.3398403486674641, "learning_rate": 0.0001, "loss": 1.2947, "loss/crossentropy": 2.4113619327545166, "loss/hidden": 1.125, "loss/logits": 0.16934365034103394, "loss/reg": 3.315923095215112e-05, "step": 541 }, { "epoch": 0.06775, "grad_norm": 2.0958714485168457, "grad_norm_var": 0.3390339478288655, "learning_rate": 0.0001, "loss": 1.1846, "loss/crossentropy": 2.4695186614990234, "loss/hidden": 1.03125, "loss/logits": 0.15305987000465393, "loss/reg": 3.315501817269251e-05, "step": 542 }, { "epoch": 0.067875, "grad_norm": 1.9753048419952393, "grad_norm_var": 0.35526067215531704, "learning_rate": 0.0001, "loss": 1.0784, "loss/crossentropy": 2.416532039642334, "loss/hidden": 0.94921875, "loss/logits": 0.12887820601463318, "loss/reg": 3.314777131890878e-05, "step": 543 }, { "epoch": 0.068, "grad_norm": 2.2497429847717285, "grad_norm_var": 0.3484447964453047, "learning_rate": 0.0001, "loss": 1.2219, "loss/crossentropy": 2.8293488025665283, "loss/hidden": 1.0390625, "loss/logits": 0.18253688514232635, "loss/reg": 3.314365312689915e-05, "step": 544 }, { "epoch": 0.068125, "grad_norm": 2.1856963634490967, "grad_norm_var": 0.3531780702082564, "learning_rate": 0.0001, "loss": 1.342, "loss/crossentropy": 2.4071121215820312, "loss/hidden": 1.1484375, "loss/logits": 0.19320359826087952, "loss/reg": 3.313948764116503e-05, "step": 545 }, { "epoch": 0.06825, "grad_norm": 1.9663119316101074, "grad_norm_var": 0.3668366876101023, "learning_rate": 0.0001, "loss": 1.1952, "loss/crossentropy": 2.734199047088623, "loss/hidden": 1.03125, "loss/logits": 0.16357938945293427, "loss/reg": 3.3137544960482046e-05, "step": 546 }, { "epoch": 0.068375, "grad_norm": 1.899553894996643, "grad_norm_var": 0.38283294553956176, "learning_rate": 0.0001, "loss": 1.1598, "loss/crossentropy": 2.943643808364868, "loss/hidden": 0.9921875, "loss/logits": 0.16727682948112488, "loss/reg": 3.312825720058754e-05, "step": 547 }, { "epoch": 0.0685, "grad_norm": 2.248959541320801, "grad_norm_var": 0.29768036917005575, "learning_rate": 0.0001, "loss": 1.3018, "loss/crossentropy": 2.3630926609039307, "loss/hidden": 1.125, "loss/logits": 0.17643454670906067, "loss/reg": 3.311951149953529e-05, "step": 548 }, { "epoch": 0.068625, "grad_norm": 1.9839047193527222, "grad_norm_var": 0.06880950688917717, "learning_rate": 0.0001, "loss": 1.3062, "loss/crossentropy": 2.461066484451294, "loss/hidden": 1.1328125, "loss/logits": 0.17305001616477966, "loss/reg": 3.3115993574028835e-05, "step": 549 }, { "epoch": 0.06875, "grad_norm": 2.138064384460449, "grad_norm_var": 0.046280360048162335, "learning_rate": 0.0001, "loss": 1.456, "loss/crossentropy": 2.2832694053649902, "loss/hidden": 1.2578125, "loss/logits": 0.19788944721221924, "loss/reg": 3.3114942198153585e-05, "step": 550 }, { "epoch": 0.068875, "grad_norm": 1.9238662719726562, "grad_norm_var": 0.042773526186366935, "learning_rate": 0.0001, "loss": 1.2408, "loss/crossentropy": 2.2895045280456543, "loss/hidden": 1.0859375, "loss/logits": 0.15455299615859985, "loss/reg": 3.3106487535405904e-05, "step": 551 }, { "epoch": 0.069, "grad_norm": 2.023206949234009, "grad_norm_var": 0.042778668601256224, "learning_rate": 0.0001, "loss": 1.271, "loss/crossentropy": 2.322998285293579, "loss/hidden": 1.109375, "loss/logits": 0.1612919569015503, "loss/reg": 3.310632018838078e-05, "step": 552 }, { "epoch": 0.069125, "grad_norm": 3.662583589553833, "grad_norm_var": 0.18372824324306117, "learning_rate": 0.0001, "loss": 1.4513, "loss/crossentropy": 2.347020149230957, "loss/hidden": 1.234375, "loss/logits": 0.21659526228904724, "loss/reg": 3.3105799957411364e-05, "step": 553 }, { "epoch": 0.06925, "grad_norm": 2.078821897506714, "grad_norm_var": 0.17593105310000295, "learning_rate": 0.0001, "loss": 1.1689, "loss/crossentropy": 2.5081067085266113, "loss/hidden": 1.0234375, "loss/logits": 0.14510974287986755, "loss/reg": 3.3105472539318725e-05, "step": 554 }, { "epoch": 0.069375, "grad_norm": 3.5003957748413086, "grad_norm_var": 0.2821214155658367, "learning_rate": 0.0001, "loss": 1.5964, "loss/crossentropy": 2.0227768421173096, "loss/hidden": 1.359375, "loss/logits": 0.23666173219680786, "loss/reg": 3.309710882604122e-05, "step": 555 }, { "epoch": 0.0695, "grad_norm": 2.685995101928711, "grad_norm_var": 0.2874594797577854, "learning_rate": 0.0001, "loss": 1.1887, "loss/crossentropy": 2.2398436069488525, "loss/hidden": 1.0546875, "loss/logits": 0.13363878428936005, "loss/reg": 3.308998930151574e-05, "step": 556 }, { "epoch": 0.069625, "grad_norm": 2.505387306213379, "grad_norm_var": 0.2874906156265238, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.1469314098358154, "loss/hidden": 1.203125, "loss/logits": 0.16182279586791992, "loss/reg": 3.308698069304228e-05, "step": 557 }, { "epoch": 0.06975, "grad_norm": 2.418957233428955, "grad_norm_var": 0.28434973598758473, "learning_rate": 0.0001, "loss": 1.2753, "loss/crossentropy": 2.3960018157958984, "loss/hidden": 1.109375, "loss/logits": 0.16554874181747437, "loss/reg": 3.308444865979254e-05, "step": 558 }, { "epoch": 0.069875, "grad_norm": 1.9992293119430542, "grad_norm_var": 0.28322081166726315, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.6128995418548584, "loss/hidden": 0.95703125, "loss/logits": 0.13762758672237396, "loss/reg": 3.307777296868153e-05, "step": 559 }, { "epoch": 0.07, "grad_norm": 1.9864604473114014, "grad_norm_var": 0.2907888869241692, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.561873197555542, "loss/hidden": 1.1015625, "loss/logits": 0.16459921002388, "loss/reg": 3.307408405817114e-05, "step": 560 }, { "epoch": 0.070125, "grad_norm": 2.078775644302368, "grad_norm_var": 0.29349590417462296, "learning_rate": 0.0001, "loss": 1.1511, "loss/crossentropy": 2.37235951423645, "loss/hidden": 1.0078125, "loss/logits": 0.14295557141304016, "loss/reg": 3.3065680327126756e-05, "step": 561 }, { "epoch": 0.07025, "grad_norm": 2.110450506210327, "grad_norm_var": 0.2880205075952574, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.3763866424560547, "loss/hidden": 1.1484375, "loss/logits": 0.18167179822921753, "loss/reg": 3.3057913242373616e-05, "step": 562 }, { "epoch": 0.070375, "grad_norm": 1.9325919151306152, "grad_norm_var": 0.28620232482811025, "learning_rate": 0.0001, "loss": 1.3137, "loss/crossentropy": 2.2546942234039307, "loss/hidden": 1.140625, "loss/logits": 0.17276525497436523, "loss/reg": 3.30470284097828e-05, "step": 563 }, { "epoch": 0.0705, "grad_norm": 2.6625826358795166, "grad_norm_var": 0.2924338162120667, "learning_rate": 0.0001, "loss": 1.2107, "loss/crossentropy": 2.6117820739746094, "loss/hidden": 1.03125, "loss/logits": 0.17915129661560059, "loss/reg": 3.3036336390068755e-05, "step": 564 }, { "epoch": 0.070625, "grad_norm": 1.9594511985778809, "grad_norm_var": 0.29368343179504985, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.4371213912963867, "loss/hidden": 1.03125, "loss/logits": 0.14522477984428406, "loss/reg": 3.3024989534169436e-05, "step": 565 }, { "epoch": 0.07075, "grad_norm": 2.5697479248046875, "grad_norm_var": 0.29289142392980183, "learning_rate": 0.0001, "loss": 1.3313, "loss/crossentropy": 2.450010299682617, "loss/hidden": 1.15625, "loss/logits": 0.17474365234375, "loss/reg": 3.3016185625456274e-05, "step": 566 }, { "epoch": 0.070875, "grad_norm": 2.129605531692505, "grad_norm_var": 0.28299262421701216, "learning_rate": 0.0001, "loss": 1.1703, "loss/crossentropy": 2.386748790740967, "loss/hidden": 1.0078125, "loss/logits": 0.16217753291130066, "loss/reg": 3.3007927413564175e-05, "step": 567 }, { "epoch": 0.071, "grad_norm": 2.708440065383911, "grad_norm_var": 0.27846047291652093, "learning_rate": 0.0001, "loss": 1.1157, "loss/crossentropy": 2.3872506618499756, "loss/hidden": 0.9609375, "loss/logits": 0.15438461303710938, "loss/reg": 3.299961463198997e-05, "step": 568 }, { "epoch": 0.071125, "grad_norm": 3.5758209228515625, "grad_norm_var": 0.2647511463576776, "learning_rate": 0.0001, "loss": 1.639, "loss/crossentropy": 2.284062147140503, "loss/hidden": 1.3984375, "loss/logits": 0.24023738503456116, "loss/reg": 3.2989250030368567e-05, "step": 569 }, { "epoch": 0.07125, "grad_norm": 3.0785133838653564, "grad_norm_var": 0.2802140667784868, "learning_rate": 0.0001, "loss": 1.6418, "loss/crossentropy": 1.961942434310913, "loss/hidden": 1.4609375, "loss/logits": 0.18052928149700165, "loss/reg": 3.297963485238142e-05, "step": 570 }, { "epoch": 0.071375, "grad_norm": 2.4593677520751953, "grad_norm_var": 0.20824244414912407, "learning_rate": 0.0001, "loss": 1.3159, "loss/crossentropy": 2.573103666305542, "loss/hidden": 1.1328125, "loss/logits": 0.18279321491718292, "loss/reg": 3.297093280707486e-05, "step": 571 }, { "epoch": 0.0715, "grad_norm": 1.9554235935211182, "grad_norm_var": 0.21655112167287166, "learning_rate": 0.0001, "loss": 1.2922, "loss/crossentropy": 2.270695686340332, "loss/hidden": 1.109375, "loss/logits": 0.18250404298305511, "loss/reg": 3.296155409771018e-05, "step": 572 }, { "epoch": 0.071625, "grad_norm": 2.0329935550689697, "grad_norm_var": 0.2228007398634886, "learning_rate": 0.0001, "loss": 1.2539, "loss/crossentropy": 2.752925395965576, "loss/hidden": 1.078125, "loss/logits": 0.17549508810043335, "loss/reg": 3.295526767033152e-05, "step": 573 }, { "epoch": 0.07175, "grad_norm": 2.126927614212036, "grad_norm_var": 0.2255879631015389, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.0483946800231934, "loss/hidden": 1.171875, "loss/logits": 0.17648756504058838, "loss/reg": 3.2946434657787904e-05, "step": 574 }, { "epoch": 0.071875, "grad_norm": 1.9886771440505981, "grad_norm_var": 0.22606789805653443, "learning_rate": 0.0001, "loss": 1.1781, "loss/crossentropy": 2.7617955207824707, "loss/hidden": 1.0234375, "loss/logits": 0.15430483222007751, "loss/reg": 3.2934098271653056e-05, "step": 575 }, { "epoch": 0.072, "grad_norm": 2.9631826877593994, "grad_norm_var": 0.24033580872959698, "learning_rate": 0.0001, "loss": 1.5852, "loss/crossentropy": 2.5309393405914307, "loss/hidden": 1.390625, "loss/logits": 0.19425997138023376, "loss/reg": 3.292256951681338e-05, "step": 576 }, { "epoch": 0.072125, "grad_norm": 1.988107681274414, "grad_norm_var": 0.24468194088425713, "learning_rate": 0.0001, "loss": 1.3095, "loss/crossentropy": 2.2598788738250732, "loss/hidden": 1.1171875, "loss/logits": 0.1919427067041397, "loss/reg": 3.2912772439885885e-05, "step": 577 }, { "epoch": 0.07225, "grad_norm": 2.3229868412017822, "grad_norm_var": 0.23957991140601814, "learning_rate": 0.0001, "loss": 1.3089, "loss/crossentropy": 2.2162625789642334, "loss/hidden": 1.125, "loss/logits": 0.1835722178220749, "loss/reg": 3.289993037469685e-05, "step": 578 }, { "epoch": 0.072375, "grad_norm": 3.223689079284668, "grad_norm_var": 0.2627150699340527, "learning_rate": 0.0001, "loss": 1.5039, "loss/crossentropy": 2.5137839317321777, "loss/hidden": 1.2734375, "loss/logits": 0.230127215385437, "loss/reg": 3.2886728149605915e-05, "step": 579 }, { "epoch": 0.0725, "grad_norm": 2.5606164932250977, "grad_norm_var": 0.2609382601960414, "learning_rate": 0.0001, "loss": 1.2698, "loss/crossentropy": 2.4530560970306396, "loss/hidden": 1.109375, "loss/logits": 0.1600678414106369, "loss/reg": 3.287781146354973e-05, "step": 580 }, { "epoch": 0.072625, "grad_norm": 2.2816648483276367, "grad_norm_var": 0.24516125701289765, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.7529642581939697, "loss/hidden": 1.203125, "loss/logits": 0.18805408477783203, "loss/reg": 3.2862662919797e-05, "step": 581 }, { "epoch": 0.07275, "grad_norm": 2.407241106033325, "grad_norm_var": 0.24525415601641257, "learning_rate": 0.0001, "loss": 1.0467, "loss/crossentropy": 2.632035255432129, "loss/hidden": 0.90625, "loss/logits": 0.14010252058506012, "loss/reg": 3.284540434833616e-05, "step": 582 }, { "epoch": 0.072875, "grad_norm": 2.066805362701416, "grad_norm_var": 0.24849913026991757, "learning_rate": 0.0001, "loss": 1.2104, "loss/crossentropy": 2.3920540809631348, "loss/hidden": 1.046875, "loss/logits": 0.16320452094078064, "loss/reg": 3.283292244304903e-05, "step": 583 }, { "epoch": 0.073, "grad_norm": 2.0391831398010254, "grad_norm_var": 0.2564456863753709, "learning_rate": 0.0001, "loss": 1.0556, "loss/crossentropy": 2.683037519454956, "loss/hidden": 0.91796875, "loss/logits": 0.13727085292339325, "loss/reg": 3.282453690189868e-05, "step": 584 }, { "epoch": 0.073125, "grad_norm": 2.0682592391967773, "grad_norm_var": 0.17057470989331774, "learning_rate": 0.0001, "loss": 1.1549, "loss/crossentropy": 2.4454257488250732, "loss/hidden": 0.99609375, "loss/logits": 0.15847988426685333, "loss/reg": 3.2813150028232485e-05, "step": 585 }, { "epoch": 0.07325, "grad_norm": 2.8501038551330566, "grad_norm_var": 0.15157959645531768, "learning_rate": 0.0001, "loss": 1.0946, "loss/crossentropy": 2.457731008529663, "loss/hidden": 0.9375, "loss/logits": 0.15681084990501404, "loss/reg": 3.279821976320818e-05, "step": 586 }, { "epoch": 0.073375, "grad_norm": 2.148087739944458, "grad_norm_var": 0.15240953654303122, "learning_rate": 0.0001, "loss": 1.4257, "loss/crossentropy": 2.22470760345459, "loss/hidden": 1.2421875, "loss/logits": 0.18313491344451904, "loss/reg": 3.2786967494757846e-05, "step": 587 }, { "epoch": 0.0735, "grad_norm": 2.5976343154907227, "grad_norm_var": 0.1474827523957486, "learning_rate": 0.0001, "loss": 1.3191, "loss/crossentropy": 2.3259403705596924, "loss/hidden": 1.1328125, "loss/logits": 0.18595190346240997, "loss/reg": 3.277562063885853e-05, "step": 588 }, { "epoch": 0.073625, "grad_norm": 2.4023032188415527, "grad_norm_var": 0.14019368342773214, "learning_rate": 0.0001, "loss": 1.2833, "loss/crossentropy": 2.4084057807922363, "loss/hidden": 1.1015625, "loss/logits": 0.18139836192131042, "loss/reg": 3.276380448369309e-05, "step": 589 }, { "epoch": 0.07375, "grad_norm": 2.2520782947540283, "grad_norm_var": 0.13699608517203637, "learning_rate": 0.0001, "loss": 1.1876, "loss/crossentropy": 2.3067727088928223, "loss/hidden": 1.046875, "loss/logits": 0.14043688774108887, "loss/reg": 3.27551897498779e-05, "step": 590 }, { "epoch": 0.073875, "grad_norm": 1.9270496368408203, "grad_norm_var": 0.14049036125966244, "learning_rate": 0.0001, "loss": 1.2389, "loss/crossentropy": 2.1966562271118164, "loss/hidden": 1.0703125, "loss/logits": 0.168295219540596, "loss/reg": 3.274507253081538e-05, "step": 591 }, { "epoch": 0.074, "grad_norm": 4.388664722442627, "grad_norm_var": 0.37810686870707194, "learning_rate": 0.0001, "loss": 1.69, "loss/crossentropy": 2.4276816844940186, "loss/hidden": 1.4375, "loss/logits": 0.25220978260040283, "loss/reg": 3.2734093110775575e-05, "step": 592 }, { "epoch": 0.074125, "grad_norm": 2.117884635925293, "grad_norm_var": 0.37081618809672723, "learning_rate": 0.0001, "loss": 1.3117, "loss/crossentropy": 2.3757622241973877, "loss/hidden": 1.125, "loss/logits": 0.18635454773902893, "loss/reg": 3.272575850132853e-05, "step": 593 }, { "epoch": 0.07425, "grad_norm": 2.390171766281128, "grad_norm_var": 0.3697061945227233, "learning_rate": 0.0001, "loss": 1.4421, "loss/crossentropy": 2.5182714462280273, "loss/hidden": 1.2421875, "loss/logits": 0.1995772272348404, "loss/reg": 3.2716481655370444e-05, "step": 594 }, { "epoch": 0.074375, "grad_norm": 5.211070537567139, "grad_norm_var": 0.8129410955026809, "learning_rate": 0.0001, "loss": 1.8681, "loss/crossentropy": 2.823439598083496, "loss/hidden": 1.5703125, "loss/logits": 0.297451913356781, "loss/reg": 3.270539309596643e-05, "step": 595 }, { "epoch": 0.0745, "grad_norm": 2.154491424560547, "grad_norm_var": 0.8257505950367497, "learning_rate": 0.0001, "loss": 1.2172, "loss/crossentropy": 2.5359957218170166, "loss/hidden": 1.0546875, "loss/logits": 0.16215971112251282, "loss/reg": 3.269331136834808e-05, "step": 596 }, { "epoch": 0.074625, "grad_norm": 2.0445237159729004, "grad_norm_var": 0.8387431916180466, "learning_rate": 0.0001, "loss": 1.0679, "loss/crossentropy": 2.402836322784424, "loss/hidden": 0.93359375, "loss/logits": 0.13397127389907837, "loss/reg": 3.268320142524317e-05, "step": 597 }, { "epoch": 0.07475, "grad_norm": 1.9361920356750488, "grad_norm_var": 0.8626197388399758, "learning_rate": 0.0001, "loss": 1.1555, "loss/crossentropy": 2.455265522003174, "loss/hidden": 1.0, "loss/logits": 0.15518441796302795, "loss/reg": 3.267572174081579e-05, "step": 598 }, { "epoch": 0.074875, "grad_norm": 2.7497634887695312, "grad_norm_var": 0.848941044328622, "learning_rate": 0.0001, "loss": 1.4715, "loss/crossentropy": 2.318593978881836, "loss/hidden": 1.265625, "loss/logits": 0.20550090074539185, "loss/reg": 3.266748899477534e-05, "step": 599 }, { "epoch": 0.075, "grad_norm": 2.400951862335205, "grad_norm_var": 0.8310417345248119, "learning_rate": 0.0001, "loss": 1.2447, "loss/crossentropy": 2.722557306289673, "loss/hidden": 1.0859375, "loss/logits": 0.15843895077705383, "loss/reg": 3.265763007220812e-05, "step": 600 }, { "epoch": 0.075125, "grad_norm": 1.875627875328064, "grad_norm_var": 0.8470812137580309, "learning_rate": 0.0001, "loss": 1.177, "loss/crossentropy": 2.498196840286255, "loss/hidden": 1.0234375, "loss/logits": 0.15324150025844574, "loss/reg": 3.2648506021359935e-05, "step": 601 }, { "epoch": 0.07525, "grad_norm": 2.43035626411438, "grad_norm_var": 0.8435589871140308, "learning_rate": 0.0001, "loss": 1.4324, "loss/crossentropy": 2.3864665031433105, "loss/hidden": 1.2265625, "loss/logits": 0.20555616915225983, "loss/reg": 3.264078623033129e-05, "step": 602 }, { "epoch": 0.075375, "grad_norm": 1.9471216201782227, "grad_norm_var": 0.8572325437028823, "learning_rate": 0.0001, "loss": 1.1456, "loss/crossentropy": 2.574090003967285, "loss/hidden": 1.0078125, "loss/logits": 0.13747350871562958, "loss/reg": 3.263230973971076e-05, "step": 603 }, { "epoch": 0.0755, "grad_norm": 2.211796522140503, "grad_norm_var": 0.8641696494148537, "learning_rate": 0.0001, "loss": 1.215, "loss/crossentropy": 2.3869664669036865, "loss/hidden": 1.0625, "loss/logits": 0.15220922231674194, "loss/reg": 3.2624688174109906e-05, "step": 604 }, { "epoch": 0.075625, "grad_norm": 2.5104334354400635, "grad_norm_var": 0.8630953581455959, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.945605516433716, "loss/hidden": 0.93359375, "loss/logits": 0.1568160206079483, "loss/reg": 3.2617448596283793e-05, "step": 605 }, { "epoch": 0.07575, "grad_norm": 2.0008065700531006, "grad_norm_var": 0.8764953924757828, "learning_rate": 0.0001, "loss": 1.1952, "loss/crossentropy": 2.3963301181793213, "loss/hidden": 1.0390625, "loss/logits": 0.15582264959812164, "loss/reg": 3.2607302273390815e-05, "step": 606 }, { "epoch": 0.075875, "grad_norm": 2.4198153018951416, "grad_norm_var": 0.8528082724629522, "learning_rate": 0.0001, "loss": 1.1929, "loss/crossentropy": 2.565782308578491, "loss/hidden": 1.03125, "loss/logits": 0.1613207310438156, "loss/reg": 3.26011395372916e-05, "step": 607 }, { "epoch": 0.076, "grad_norm": 2.041506052017212, "grad_norm_var": 0.6215099906491213, "learning_rate": 0.0001, "loss": 1.1144, "loss/crossentropy": 2.477787733078003, "loss/hidden": 0.97265625, "loss/logits": 0.14146365225315094, "loss/reg": 3.2594605727354065e-05, "step": 608 }, { "epoch": 0.076125, "grad_norm": 2.013791561126709, "grad_norm_var": 0.6261395795901743, "learning_rate": 0.0001, "loss": 1.3659, "loss/crossentropy": 2.2931771278381348, "loss/hidden": 1.1796875, "loss/logits": 0.1858958601951599, "loss/reg": 3.258982178522274e-05, "step": 609 }, { "epoch": 0.07625, "grad_norm": 2.0193915367126465, "grad_norm_var": 0.6350275632712709, "learning_rate": 0.0001, "loss": 1.4135, "loss/crossentropy": 2.249884605407715, "loss/hidden": 1.2421875, "loss/logits": 0.17098332941532135, "loss/reg": 3.2585812732577324e-05, "step": 610 }, { "epoch": 0.076375, "grad_norm": 1.9450736045837402, "grad_norm_var": 0.06580480166266979, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.4982762336730957, "loss/hidden": 1.0859375, "loss/logits": 0.1580805480480194, "loss/reg": 3.2582185667706653e-05, "step": 611 }, { "epoch": 0.0765, "grad_norm": 2.3560380935668945, "grad_norm_var": 0.06795768948846134, "learning_rate": 0.0001, "loss": 1.3639, "loss/crossentropy": 2.5245325565338135, "loss/hidden": 1.1796875, "loss/logits": 0.1839311122894287, "loss/reg": 3.257774005760439e-05, "step": 612 }, { "epoch": 0.076625, "grad_norm": 2.068222761154175, "grad_norm_var": 0.06756012472509756, "learning_rate": 0.0001, "loss": 1.2086, "loss/crossentropy": 2.5093209743499756, "loss/hidden": 1.0390625, "loss/logits": 0.1692183017730713, "loss/reg": 3.257165371906012e-05, "step": 613 }, { "epoch": 0.07675, "grad_norm": 3.0041604042053223, "grad_norm_var": 0.1037103800860999, "learning_rate": 0.0001, "loss": 1.3476, "loss/crossentropy": 2.4445579051971436, "loss/hidden": 1.140625, "loss/logits": 0.20659969747066498, "loss/reg": 3.2569387258263305e-05, "step": 614 }, { "epoch": 0.076875, "grad_norm": 2.224217176437378, "grad_norm_var": 0.08593044093617071, "learning_rate": 0.0001, "loss": 1.2956, "loss/crossentropy": 2.265068769454956, "loss/hidden": 1.1484375, "loss/logits": 0.14682193100452423, "loss/reg": 3.256817581132054e-05, "step": 615 }, { "epoch": 0.077, "grad_norm": 1.9176480770111084, "grad_norm_var": 0.0886645679147117, "learning_rate": 0.0001, "loss": 1.0694, "loss/crossentropy": 2.406165599822998, "loss/hidden": 0.94140625, "loss/logits": 0.12762577831745148, "loss/reg": 3.256642958149314e-05, "step": 616 }, { "epoch": 0.077125, "grad_norm": 1.9381603002548218, "grad_norm_var": 0.08631597110569747, "learning_rate": 0.0001, "loss": 1.2835, "loss/crossentropy": 2.292922258377075, "loss/hidden": 1.1015625, "loss/logits": 0.18164017796516418, "loss/reg": 3.255980118410662e-05, "step": 617 }, { "epoch": 0.07725, "grad_norm": 3.8307087421417236, "grad_norm_var": 0.25365581117415176, "learning_rate": 0.0001, "loss": 1.5716, "loss/crossentropy": 2.24588680267334, "loss/hidden": 1.3828125, "loss/logits": 0.1884302794933319, "loss/reg": 3.255438059568405e-05, "step": 618 }, { "epoch": 0.077375, "grad_norm": 2.7307581901550293, "grad_norm_var": 0.25745859334372617, "learning_rate": 0.0001, "loss": 1.4418, "loss/crossentropy": 2.5056464672088623, "loss/hidden": 1.2265625, "loss/logits": 0.2148783802986145, "loss/reg": 3.2550698961131275e-05, "step": 619 }, { "epoch": 0.0775, "grad_norm": 1.9064363241195679, "grad_norm_var": 0.2679782151655727, "learning_rate": 0.0001, "loss": 1.0563, "loss/crossentropy": 2.5244431495666504, "loss/hidden": 0.9296875, "loss/logits": 0.1262451708316803, "loss/reg": 3.25437868013978e-05, "step": 620 }, { "epoch": 0.077625, "grad_norm": 2.1985690593719482, "grad_norm_var": 0.2656371947904515, "learning_rate": 0.0001, "loss": 1.058, "loss/crossentropy": 2.575101613998413, "loss/hidden": 0.921875, "loss/logits": 0.1357976496219635, "loss/reg": 3.2535335776628926e-05, "step": 621 }, { "epoch": 0.07775, "grad_norm": 2.120142936706543, "grad_norm_var": 0.26195032172526944, "learning_rate": 0.0001, "loss": 1.1902, "loss/crossentropy": 2.801929235458374, "loss/hidden": 1.046875, "loss/logits": 0.14296142756938934, "loss/reg": 3.2529584132134914e-05, "step": 622 }, { "epoch": 0.077875, "grad_norm": 2.0709078311920166, "grad_norm_var": 0.2637948830624718, "learning_rate": 0.0001, "loss": 1.0827, "loss/crossentropy": 2.676384687423706, "loss/hidden": 0.953125, "loss/logits": 0.12926620244979858, "loss/reg": 3.2523679692531005e-05, "step": 623 }, { "epoch": 0.078, "grad_norm": 2.508070707321167, "grad_norm_var": 0.2629301797210142, "learning_rate": 0.0001, "loss": 1.4165, "loss/crossentropy": 2.2480056285858154, "loss/hidden": 1.2421875, "loss/logits": 0.17401380836963654, "loss/reg": 3.251908492529765e-05, "step": 624 }, { "epoch": 0.078125, "grad_norm": 3.1828787326812744, "grad_norm_var": 0.30322979782217746, "learning_rate": 0.0001, "loss": 1.2913, "loss/crossentropy": 2.6146721839904785, "loss/hidden": 1.125, "loss/logits": 0.16597937047481537, "loss/reg": 3.251036469009705e-05, "step": 625 }, { "epoch": 0.07825, "grad_norm": 2.092921733856201, "grad_norm_var": 0.3000682178451103, "learning_rate": 0.0001, "loss": 1.1381, "loss/crossentropy": 2.7154953479766846, "loss/hidden": 0.9921875, "loss/logits": 0.14560630917549133, "loss/reg": 3.2501688110642135e-05, "step": 626 }, { "epoch": 0.078375, "grad_norm": 2.392030954360962, "grad_norm_var": 0.2865792056426225, "learning_rate": 0.0001, "loss": 1.3192, "loss/crossentropy": 2.4488956928253174, "loss/hidden": 1.171875, "loss/logits": 0.14700947701931, "loss/reg": 3.249543806305155e-05, "step": 627 }, { "epoch": 0.0785, "grad_norm": 1.8949799537658691, "grad_norm_var": 0.30311274506456226, "learning_rate": 0.0001, "loss": 1.0355, "loss/crossentropy": 2.656559705734253, "loss/hidden": 0.90234375, "loss/logits": 0.1328512728214264, "loss/reg": 3.248927168897353e-05, "step": 628 }, { "epoch": 0.078625, "grad_norm": 1.8167521953582764, "grad_norm_var": 0.317520497460355, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.555947780609131, "loss/hidden": 1.0703125, "loss/logits": 0.1667390763759613, "loss/reg": 3.248196662752889e-05, "step": 629 }, { "epoch": 0.07875, "grad_norm": 1.9900246858596802, "grad_norm_var": 0.29528383715011153, "learning_rate": 0.0001, "loss": 1.128, "loss/crossentropy": 2.405784845352173, "loss/hidden": 0.9765625, "loss/logits": 0.1510818898677826, "loss/reg": 3.247513450332917e-05, "step": 630 }, { "epoch": 0.078875, "grad_norm": 1.9893391132354736, "grad_norm_var": 0.30113488116037995, "learning_rate": 0.0001, "loss": 1.2307, "loss/crossentropy": 2.5071752071380615, "loss/hidden": 1.0625, "loss/logits": 0.16787654161453247, "loss/reg": 3.2470503356307745e-05, "step": 631 }, { "epoch": 0.079, "grad_norm": 2.4550352096557617, "grad_norm_var": 0.29277153949887513, "learning_rate": 0.0001, "loss": 1.5059, "loss/crossentropy": 2.2291836738586426, "loss/hidden": 1.296875, "loss/logits": 0.20867902040481567, "loss/reg": 3.24644279316999e-05, "step": 632 }, { "epoch": 0.079125, "grad_norm": 2.5132346153259277, "grad_norm_var": 0.2841737256035174, "learning_rate": 0.0001, "loss": 1.1279, "loss/crossentropy": 2.668013572692871, "loss/hidden": 0.98046875, "loss/logits": 0.14705964922904968, "loss/reg": 3.2460746297147125e-05, "step": 633 }, { "epoch": 0.07925, "grad_norm": 2.484381675720215, "grad_norm_var": 0.1326996354001511, "learning_rate": 0.0001, "loss": 1.3101, "loss/crossentropy": 2.3046844005584717, "loss/hidden": 1.1328125, "loss/logits": 0.17695116996765137, "loss/reg": 3.245668631279841e-05, "step": 634 }, { "epoch": 0.079375, "grad_norm": 2.414663076400757, "grad_norm_var": 0.11959498058305182, "learning_rate": 0.0001, "loss": 1.2178, "loss/crossentropy": 2.610231399536133, "loss/hidden": 1.0625, "loss/logits": 0.15494966506958008, "loss/reg": 3.2452466257382184e-05, "step": 635 }, { "epoch": 0.0795, "grad_norm": 2.3784496784210205, "grad_norm_var": 0.11177809540983971, "learning_rate": 0.0001, "loss": 1.194, "loss/crossentropy": 2.3995094299316406, "loss/hidden": 1.0546875, "loss/logits": 0.13898079097270966, "loss/reg": 3.244556864956394e-05, "step": 636 }, { "epoch": 0.079625, "grad_norm": 1.608014464378357, "grad_norm_var": 0.14009733722727352, "learning_rate": 0.0001, "loss": 1.114, "loss/crossentropy": 2.6790590286254883, "loss/hidden": 0.9765625, "loss/logits": 0.13714221119880676, "loss/reg": 3.244182880735025e-05, "step": 637 }, { "epoch": 0.07975, "grad_norm": 4.00265645980835, "grad_norm_var": 0.3303772680116007, "learning_rate": 0.0001, "loss": 1.2438, "loss/crossentropy": 2.4462854862213135, "loss/hidden": 1.0, "loss/logits": 0.2435140162706375, "loss/reg": 3.243668834329583e-05, "step": 638 }, { "epoch": 0.079875, "grad_norm": 1.6296852827072144, "grad_norm_var": 0.3596780665720629, "learning_rate": 0.0001, "loss": 1.234, "loss/crossentropy": 2.5636279582977295, "loss/hidden": 1.0703125, "loss/logits": 0.16334283351898193, "loss/reg": 3.2429612474516034e-05, "step": 639 }, { "epoch": 0.08, "grad_norm": 1.871125340461731, "grad_norm_var": 0.3702995417825491, "learning_rate": 0.0001, "loss": 1.293, "loss/crossentropy": 2.4291610717773438, "loss/hidden": 1.109375, "loss/logits": 0.18330498039722443, "loss/reg": 3.242455204599537e-05, "step": 640 }, { "epoch": 0.080125, "grad_norm": 2.3847568035125732, "grad_norm_var": 0.315601771252336, "learning_rate": 0.0001, "loss": 1.4179, "loss/crossentropy": 2.3636293411254883, "loss/hidden": 1.2265625, "loss/logits": 0.1910477578639984, "loss/reg": 3.241969898226671e-05, "step": 641 }, { "epoch": 0.08025, "grad_norm": 2.253438949584961, "grad_norm_var": 0.31395991504814447, "learning_rate": 0.0001, "loss": 1.2483, "loss/crossentropy": 2.436042547225952, "loss/hidden": 1.0859375, "loss/logits": 0.16208630800247192, "loss/reg": 3.241678132326342e-05, "step": 642 }, { "epoch": 0.080375, "grad_norm": 2.3757476806640625, "grad_norm_var": 0.3136787840213039, "learning_rate": 0.0001, "loss": 1.4148, "loss/crossentropy": 2.2471892833709717, "loss/hidden": 1.2265625, "loss/logits": 0.18789851665496826, "loss/reg": 3.240973092033528e-05, "step": 643 }, { "epoch": 0.0805, "grad_norm": 2.440065622329712, "grad_norm_var": 0.30616358156997026, "learning_rate": 0.0001, "loss": 1.4075, "loss/crossentropy": 2.4688096046447754, "loss/hidden": 1.203125, "loss/logits": 0.20404627919197083, "loss/reg": 3.240336445742287e-05, "step": 644 }, { "epoch": 0.080625, "grad_norm": 2.2615809440612793, "grad_norm_var": 0.2905830094011717, "learning_rate": 0.0001, "loss": 1.3328, "loss/crossentropy": 2.3882384300231934, "loss/hidden": 1.140625, "loss/logits": 0.19187776744365692, "loss/reg": 3.239453144487925e-05, "step": 645 }, { "epoch": 0.08075, "grad_norm": 1.8651819229125977, "grad_norm_var": 0.2969792506986578, "learning_rate": 0.0001, "loss": 1.3054, "loss/crossentropy": 2.470728635787964, "loss/hidden": 1.1328125, "loss/logits": 0.17228686809539795, "loss/reg": 3.238815043005161e-05, "step": 646 }, { "epoch": 0.080875, "grad_norm": 1.9243041276931763, "grad_norm_var": 0.3000064631215499, "learning_rate": 0.0001, "loss": 1.199, "loss/crossentropy": 2.514539957046509, "loss/hidden": 1.03125, "loss/logits": 0.16743598878383636, "loss/reg": 3.237748023821041e-05, "step": 647 }, { "epoch": 0.081, "grad_norm": 1.8536611795425415, "grad_norm_var": 0.31049073640116415, "learning_rate": 0.0001, "loss": 1.3365, "loss/crossentropy": 2.3504388332366943, "loss/hidden": 1.171875, "loss/logits": 0.16425597667694092, "loss/reg": 3.2367766834795475e-05, "step": 648 }, { "epoch": 0.081125, "grad_norm": 2.169321060180664, "grad_norm_var": 0.30656022186771464, "learning_rate": 0.0001, "loss": 1.1024, "loss/crossentropy": 2.530703544616699, "loss/hidden": 0.96875, "loss/logits": 0.1333715319633484, "loss/reg": 3.235774420318194e-05, "step": 649 }, { "epoch": 0.08125, "grad_norm": 2.1230084896087646, "grad_norm_var": 0.3031790527826331, "learning_rate": 0.0001, "loss": 1.3157, "loss/crossentropy": 2.310661554336548, "loss/hidden": 1.140625, "loss/logits": 0.1747591644525528, "loss/reg": 3.2350304536521435e-05, "step": 650 }, { "epoch": 0.081375, "grad_norm": 2.6993911266326904, "grad_norm_var": 0.3155514558670091, "learning_rate": 0.0001, "loss": 1.347, "loss/crossentropy": 2.4968340396881104, "loss/hidden": 1.1484375, "loss/logits": 0.19822362065315247, "loss/reg": 3.2342599297408015e-05, "step": 651 }, { "epoch": 0.0815, "grad_norm": 2.080300807952881, "grad_norm_var": 0.3156044041405048, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.424464702606201, "loss/hidden": 1.234375, "loss/logits": 0.17433631420135498, "loss/reg": 3.2335545256501064e-05, "step": 652 }, { "epoch": 0.081625, "grad_norm": 1.9338946342468262, "grad_norm_var": 0.29559018141630206, "learning_rate": 0.0001, "loss": 1.2105, "loss/crossentropy": 2.3058454990386963, "loss/hidden": 1.0625, "loss/logits": 0.14769870042800903, "loss/reg": 3.2326461223419756e-05, "step": 653 }, { "epoch": 0.08175, "grad_norm": 2.011249542236328, "grad_norm_var": 0.07589101708228417, "learning_rate": 0.0001, "loss": 1.1236, "loss/crossentropy": 2.682971239089966, "loss/hidden": 0.9453125, "loss/logits": 0.1779399812221527, "loss/reg": 3.23207896144595e-05, "step": 654 }, { "epoch": 0.081875, "grad_norm": 2.7516329288482666, "grad_norm_var": 0.08162097532279228, "learning_rate": 0.0001, "loss": 1.46, "loss/crossentropy": 2.5869898796081543, "loss/hidden": 1.265625, "loss/logits": 0.19408713281154633, "loss/reg": 3.231072332710028e-05, "step": 655 }, { "epoch": 0.082, "grad_norm": 2.366501569747925, "grad_norm_var": 0.07606725464947293, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.1895010471343994, "loss/hidden": 1.0390625, "loss/logits": 0.13050538301467896, "loss/reg": 3.230313814128749e-05, "step": 656 }, { "epoch": 0.082125, "grad_norm": 2.0115721225738525, "grad_norm_var": 0.07649272760625057, "learning_rate": 0.0001, "loss": 1.2011, "loss/crossentropy": 2.481351375579834, "loss/hidden": 1.0390625, "loss/logits": 0.16167044639587402, "loss/reg": 3.2294527045451105e-05, "step": 657 }, { "epoch": 0.08225, "grad_norm": 2.1796374320983887, "grad_norm_var": 0.07625861744395455, "learning_rate": 0.0001, "loss": 1.1522, "loss/crossentropy": 2.40244722366333, "loss/hidden": 1.0078125, "loss/logits": 0.14410331845283508, "loss/reg": 3.2286981877405196e-05, "step": 658 }, { "epoch": 0.082375, "grad_norm": 3.0727481842041016, "grad_norm_var": 0.1238429317095663, "learning_rate": 0.0001, "loss": 1.4898, "loss/crossentropy": 2.295776128768921, "loss/hidden": 1.2734375, "loss/logits": 0.21601419150829315, "loss/reg": 3.2281091989716515e-05, "step": 659 }, { "epoch": 0.0825, "grad_norm": 3.2960402965545654, "grad_norm_var": 0.19315411367156585, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.6390655040740967, "loss/hidden": 1.140625, "loss/logits": 0.1673002392053604, "loss/reg": 3.227585693821311e-05, "step": 660 }, { "epoch": 0.082625, "grad_norm": 3.4391167163848877, "grad_norm_var": 0.27574634545379506, "learning_rate": 0.0001, "loss": 1.1884, "loss/crossentropy": 2.5897035598754883, "loss/hidden": 1.03125, "loss/logits": 0.15684694051742554, "loss/reg": 3.2269697840092704e-05, "step": 661 }, { "epoch": 0.08275, "grad_norm": 4.276190280914307, "grad_norm_var": 0.4796355036633154, "learning_rate": 0.0001, "loss": 1.1999, "loss/crossentropy": 2.5243704319000244, "loss/hidden": 1.046875, "loss/logits": 0.15274158120155334, "loss/reg": 3.226202534278855e-05, "step": 662 }, { "epoch": 0.082875, "grad_norm": 2.588935136795044, "grad_norm_var": 0.4551827768206384, "learning_rate": 0.0001, "loss": 1.1808, "loss/crossentropy": 2.7738707065582275, "loss/hidden": 1.015625, "loss/logits": 0.16485591232776642, "loss/reg": 3.2259602448903024e-05, "step": 663 }, { "epoch": 0.083, "grad_norm": 2.2615346908569336, "grad_norm_var": 0.42753040987829916, "learning_rate": 0.0001, "loss": 1.1047, "loss/crossentropy": 2.4449167251586914, "loss/hidden": 0.96484375, "loss/logits": 0.13952355086803436, "loss/reg": 3.225212640245445e-05, "step": 664 }, { "epoch": 0.083125, "grad_norm": 2.022779941558838, "grad_norm_var": 0.4368736230271139, "learning_rate": 0.0001, "loss": 1.134, "loss/crossentropy": 2.4114463329315186, "loss/hidden": 0.99609375, "loss/logits": 0.13763144612312317, "loss/reg": 3.224598913220689e-05, "step": 665 }, { "epoch": 0.08325, "grad_norm": 2.180579900741577, "grad_norm_var": 0.433652208727842, "learning_rate": 0.0001, "loss": 1.3077, "loss/crossentropy": 2.3560714721679688, "loss/hidden": 1.140625, "loss/logits": 0.16675901412963867, "loss/reg": 3.2240248401649296e-05, "step": 666 }, { "epoch": 0.083375, "grad_norm": 2.29941725730896, "grad_norm_var": 0.43692416598824574, "learning_rate": 0.0001, "loss": 1.2171, "loss/crossentropy": 2.4641499519348145, "loss/hidden": 1.0390625, "loss/logits": 0.17773011326789856, "loss/reg": 3.223533713025972e-05, "step": 667 }, { "epoch": 0.0835, "grad_norm": 13.708366394042969, "grad_norm_var": 8.162143239260631, "learning_rate": 0.0001, "loss": 1.9561, "loss/crossentropy": 1.3007676601409912, "loss/hidden": 1.890625, "loss/logits": 0.06512448191642761, "loss/reg": 3.222926170565188e-05, "step": 668 }, { "epoch": 0.083625, "grad_norm": 2.223507881164551, "grad_norm_var": 8.1155980860334, "learning_rate": 0.0001, "loss": 1.1802, "loss/crossentropy": 2.3888051509857178, "loss/hidden": 1.0390625, "loss/logits": 0.14084848761558533, "loss/reg": 3.22245032293722e-05, "step": 669 }, { "epoch": 0.08375, "grad_norm": 4.826849460601807, "grad_norm_var": 8.129844594294168, "learning_rate": 0.0001, "loss": 1.2754, "loss/crossentropy": 2.787766456604004, "loss/hidden": 1.125, "loss/logits": 0.15011203289031982, "loss/reg": 3.2221811125054955e-05, "step": 670 }, { "epoch": 0.083875, "grad_norm": 2.107593059539795, "grad_norm_var": 8.217378105018076, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.673910617828369, "loss/hidden": 1.0, "loss/logits": 0.1403258740901947, "loss/reg": 3.222101804567501e-05, "step": 671 }, { "epoch": 0.084, "grad_norm": 2.0006186962127686, "grad_norm_var": 8.27757030990291, "learning_rate": 0.0001, "loss": 1.0909, "loss/crossentropy": 2.7820777893066406, "loss/hidden": 0.93359375, "loss/logits": 0.15698012709617615, "loss/reg": 3.221366569050588e-05, "step": 672 }, { "epoch": 0.084125, "grad_norm": 2.0933070182800293, "grad_norm_var": 8.262791740468037, "learning_rate": 0.0001, "loss": 1.1451, "loss/crossentropy": 2.8087332248687744, "loss/hidden": 0.99609375, "loss/logits": 0.14868998527526855, "loss/reg": 3.220442158635706e-05, "step": 673 }, { "epoch": 0.08425, "grad_norm": 2.358794927597046, "grad_norm_var": 8.235381625712368, "learning_rate": 0.0001, "loss": 1.2276, "loss/crossentropy": 2.664926528930664, "loss/hidden": 1.0703125, "loss/logits": 0.1569160521030426, "loss/reg": 3.2195319363381714e-05, "step": 674 }, { "epoch": 0.084375, "grad_norm": 1.998713731765747, "grad_norm_var": 8.357532166242084, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.4228875637054443, "loss/hidden": 1.1015625, "loss/logits": 0.135471910238266, "loss/reg": 3.218696656404063e-05, "step": 675 }, { "epoch": 0.0845, "grad_norm": 1.7373796701431274, "grad_norm_var": 8.521654653515995, "learning_rate": 0.0001, "loss": 1.1959, "loss/crossentropy": 2.411160945892334, "loss/hidden": 1.046875, "loss/logits": 0.1486646831035614, "loss/reg": 3.217814810341224e-05, "step": 676 }, { "epoch": 0.084625, "grad_norm": 2.1248972415924072, "grad_norm_var": 8.597818746749187, "learning_rate": 0.0001, "loss": 1.2671, "loss/crossentropy": 2.322160005569458, "loss/hidden": 1.09375, "loss/logits": 0.17302247881889343, "loss/reg": 3.216690311091952e-05, "step": 677 }, { "epoch": 0.08475, "grad_norm": 2.1823222637176514, "grad_norm_var": 8.564568662216503, "learning_rate": 0.0001, "loss": 1.0307, "loss/crossentropy": 2.5252137184143066, "loss/hidden": 0.90625, "loss/logits": 0.1240834966301918, "loss/reg": 3.215742253814824e-05, "step": 678 }, { "epoch": 0.084875, "grad_norm": 2.385857343673706, "grad_norm_var": 8.579487634417987, "learning_rate": 0.0001, "loss": 1.301, "loss/crossentropy": 2.356689929962158, "loss/hidden": 1.1484375, "loss/logits": 0.1522800326347351, "loss/reg": 3.214823664166033e-05, "step": 679 }, { "epoch": 0.085, "grad_norm": 4.639679908752441, "grad_norm_var": 8.688646971389552, "learning_rate": 0.0001, "loss": 1.5426, "loss/crossentropy": 2.088296413421631, "loss/hidden": 1.3359375, "loss/logits": 0.20634829998016357, "loss/reg": 3.213853415218182e-05, "step": 680 }, { "epoch": 0.085125, "grad_norm": 2.2640221118927, "grad_norm_var": 8.6550401893545, "learning_rate": 0.0001, "loss": 1.118, "loss/crossentropy": 2.519866466522217, "loss/hidden": 0.9765625, "loss/logits": 0.1411462128162384, "loss/reg": 3.212389492546208e-05, "step": 681 }, { "epoch": 0.08525, "grad_norm": 1.9441190958023071, "grad_norm_var": 8.690541004695175, "learning_rate": 0.0001, "loss": 1.2132, "loss/crossentropy": 2.6540563106536865, "loss/hidden": 1.046875, "loss/logits": 0.1660272479057312, "loss/reg": 3.211110379197635e-05, "step": 682 }, { "epoch": 0.085375, "grad_norm": 2.1394095420837402, "grad_norm_var": 8.7109484257759, "learning_rate": 0.0001, "loss": 1.1292, "loss/crossentropy": 2.454695463180542, "loss/hidden": 0.96484375, "loss/logits": 0.1640835851430893, "loss/reg": 3.209971691831015e-05, "step": 683 }, { "epoch": 0.0855, "grad_norm": 2.5914788246154785, "grad_norm_var": 0.8159417233832197, "learning_rate": 0.0001, "loss": 1.1781, "loss/crossentropy": 2.354567766189575, "loss/hidden": 1.03125, "loss/logits": 0.14655154943466187, "loss/reg": 3.209043643437326e-05, "step": 684 }, { "epoch": 0.085625, "grad_norm": 2.1125876903533936, "grad_norm_var": 0.8204472332347561, "learning_rate": 0.0001, "loss": 1.1683, "loss/crossentropy": 2.4232585430145264, "loss/hidden": 1.0234375, "loss/logits": 0.14451487362384796, "loss/reg": 3.208300768164918e-05, "step": 685 }, { "epoch": 0.08575, "grad_norm": 2.005004405975342, "grad_norm_var": 0.431076757035284, "learning_rate": 0.0001, "loss": 1.2143, "loss/crossentropy": 2.7095413208007812, "loss/hidden": 1.0546875, "loss/logits": 0.159327432513237, "loss/reg": 3.20776853186544e-05, "step": 686 }, { "epoch": 0.085875, "grad_norm": 2.0700883865356445, "grad_norm_var": 0.43209112768215785, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.146639347076416, "loss/hidden": 1.1484375, "loss/logits": 0.17476913332939148, "loss/reg": 3.2072603062260896e-05, "step": 687 }, { "epoch": 0.086, "grad_norm": 1.8895165920257568, "grad_norm_var": 0.4371570572715728, "learning_rate": 0.0001, "loss": 1.1324, "loss/crossentropy": 2.6808338165283203, "loss/hidden": 0.9765625, "loss/logits": 0.15553486347198486, "loss/reg": 3.206895780749619e-05, "step": 688 }, { "epoch": 0.086125, "grad_norm": 2.0443785190582275, "grad_norm_var": 0.43854794372576955, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.3666627407073975, "loss/hidden": 1.1484375, "loss/logits": 0.2042878270149231, "loss/reg": 3.2062020181911066e-05, "step": 689 }, { "epoch": 0.08625, "grad_norm": 2.71120285987854, "grad_norm_var": 0.4499880686852729, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.225734233856201, "loss/hidden": 1.1640625, "loss/logits": 0.1479206681251526, "loss/reg": 3.205486427759752e-05, "step": 690 }, { "epoch": 0.086375, "grad_norm": 1.9704585075378418, "grad_norm_var": 0.45118259423517365, "learning_rate": 0.0001, "loss": 1.2783, "loss/crossentropy": 2.1413638591766357, "loss/hidden": 1.1171875, "loss/logits": 0.16075628995895386, "loss/reg": 3.204666791134514e-05, "step": 691 }, { "epoch": 0.0865, "grad_norm": 2.0193846225738525, "grad_norm_var": 0.43496897541908663, "learning_rate": 0.0001, "loss": 1.1574, "loss/crossentropy": 2.461449384689331, "loss/hidden": 1.0, "loss/logits": 0.15706798434257507, "loss/reg": 3.203826054232195e-05, "step": 692 }, { "epoch": 0.086625, "grad_norm": 2.3389272689819336, "grad_norm_var": 0.43230996116488185, "learning_rate": 0.0001, "loss": 1.2097, "loss/crossentropy": 2.8094615936279297, "loss/hidden": 1.0546875, "loss/logits": 0.15468040108680725, "loss/reg": 3.203040250809863e-05, "step": 693 }, { "epoch": 0.08675, "grad_norm": 2.1352357864379883, "grad_norm_var": 0.4333868407910055, "learning_rate": 0.0001, "loss": 1.3619, "loss/crossentropy": 2.2299346923828125, "loss/hidden": 1.1875, "loss/logits": 0.17409831285476685, "loss/reg": 3.2022617233451456e-05, "step": 694 }, { "epoch": 0.086875, "grad_norm": 1.8997348546981812, "grad_norm_var": 0.44446051921212015, "learning_rate": 0.0001, "loss": 1.1474, "loss/crossentropy": 2.464475154876709, "loss/hidden": 0.99609375, "loss/logits": 0.15099835395812988, "loss/reg": 3.201406798325479e-05, "step": 695 }, { "epoch": 0.087, "grad_norm": 2.0923757553100586, "grad_norm_var": 0.05483191469694191, "learning_rate": 0.0001, "loss": 1.4524, "loss/crossentropy": 2.0002782344818115, "loss/hidden": 1.2578125, "loss/logits": 0.19427113234996796, "loss/reg": 3.2007144909584895e-05, "step": 696 }, { "epoch": 0.087125, "grad_norm": 2.2059760093688965, "grad_norm_var": 0.054076791402477654, "learning_rate": 0.0001, "loss": 1.1609, "loss/crossentropy": 2.6744987964630127, "loss/hidden": 1.0078125, "loss/logits": 0.1527547836303711, "loss/reg": 3.199988714186475e-05, "step": 697 }, { "epoch": 0.08725, "grad_norm": 2.7213289737701416, "grad_norm_var": 0.07198565582104041, "learning_rate": 0.0001, "loss": 1.2713, "loss/crossentropy": 2.643150806427002, "loss/hidden": 1.0625, "loss/logits": 0.20850570499897003, "loss/reg": 3.199481943738647e-05, "step": 698 }, { "epoch": 0.087375, "grad_norm": 2.3728480339050293, "grad_norm_var": 0.07399760919694666, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.309610605239868, "loss/hidden": 1.203125, "loss/logits": 0.18681778013706207, "loss/reg": 3.198991544195451e-05, "step": 699 }, { "epoch": 0.0875, "grad_norm": 2.271571397781372, "grad_norm_var": 0.06364372961186285, "learning_rate": 0.0001, "loss": 1.1185, "loss/crossentropy": 2.3085150718688965, "loss/hidden": 0.9765625, "loss/logits": 0.1415865421295166, "loss/reg": 3.198152262484655e-05, "step": 700 }, { "epoch": 0.087625, "grad_norm": 1.9595108032226562, "grad_norm_var": 0.06645944280407286, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 1.939794659614563, "loss/hidden": 1.2265625, "loss/logits": 0.1723695695400238, "loss/reg": 3.1972482247510925e-05, "step": 701 }, { "epoch": 0.08775, "grad_norm": 2.3391191959381104, "grad_norm_var": 0.06612084152980054, "learning_rate": 0.0001, "loss": 1.2046, "loss/crossentropy": 2.515406847000122, "loss/hidden": 1.046875, "loss/logits": 0.1574229598045349, "loss/reg": 3.1967716495273635e-05, "step": 702 }, { "epoch": 0.087875, "grad_norm": 1.834547758102417, "grad_norm_var": 0.07335743103287004, "learning_rate": 0.0001, "loss": 1.2564, "loss/crossentropy": 2.3931076526641846, "loss/hidden": 1.09375, "loss/logits": 0.1623239815235138, "loss/reg": 3.195786848664284e-05, "step": 703 }, { "epoch": 0.088, "grad_norm": 2.1778762340545654, "grad_norm_var": 0.06756343480080407, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.487224578857422, "loss/hidden": 1.125, "loss/logits": 0.18461742997169495, "loss/reg": 3.1952691642800346e-05, "step": 704 }, { "epoch": 0.088125, "grad_norm": 2.150066375732422, "grad_norm_var": 0.06616151942176168, "learning_rate": 0.0001, "loss": 1.0666, "loss/crossentropy": 2.3318979740142822, "loss/hidden": 0.9375, "loss/logits": 0.1287609487771988, "loss/reg": 3.1944622605806217e-05, "step": 705 }, { "epoch": 0.08825, "grad_norm": 2.367643117904663, "grad_norm_var": 0.050121908206295925, "learning_rate": 0.0001, "loss": 1.2344, "loss/crossentropy": 2.623997688293457, "loss/hidden": 1.0625, "loss/logits": 0.17162081599235535, "loss/reg": 3.19385617331136e-05, "step": 706 }, { "epoch": 0.088375, "grad_norm": 2.1544952392578125, "grad_norm_var": 0.047132855557610695, "learning_rate": 0.0001, "loss": 1.2948, "loss/crossentropy": 2.4254250526428223, "loss/hidden": 1.125, "loss/logits": 0.16950462758541107, "loss/reg": 3.1930052500683814e-05, "step": 707 }, { "epoch": 0.0885, "grad_norm": 2.867445230484009, "grad_norm_var": 0.07278645639077812, "learning_rate": 0.0001, "loss": 1.4275, "loss/crossentropy": 1.8883780241012573, "loss/hidden": 1.265625, "loss/logits": 0.16154590249061584, "loss/reg": 3.19233258778695e-05, "step": 708 }, { "epoch": 0.088625, "grad_norm": 2.6383581161499023, "grad_norm_var": 0.08221819277021461, "learning_rate": 0.0001, "loss": 1.3537, "loss/crossentropy": 2.7024612426757812, "loss/hidden": 1.140625, "loss/logits": 0.21271824836730957, "loss/reg": 3.191478390363045e-05, "step": 709 }, { "epoch": 0.08875, "grad_norm": 1.9694470167160034, "grad_norm_var": 0.08673286422611473, "learning_rate": 0.0001, "loss": 1.2019, "loss/crossentropy": 2.422839403152466, "loss/hidden": 1.046875, "loss/logits": 0.1547282487154007, "loss/reg": 3.190719507983886e-05, "step": 710 }, { "epoch": 0.088875, "grad_norm": 2.1444804668426514, "grad_norm_var": 0.07900095396042553, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.7688839435577393, "loss/hidden": 1.0234375, "loss/logits": 0.16705238819122314, "loss/reg": 3.1896463042357937e-05, "step": 711 }, { "epoch": 0.089, "grad_norm": 2.683474540710449, "grad_norm_var": 0.08709981146559749, "learning_rate": 0.0001, "loss": 1.363, "loss/crossentropy": 2.40535044670105, "loss/hidden": 1.1640625, "loss/logits": 0.1986573487520218, "loss/reg": 3.1885796488495544e-05, "step": 712 }, { "epoch": 0.089125, "grad_norm": 2.5018036365509033, "grad_norm_var": 0.08871733491526891, "learning_rate": 0.0001, "loss": 1.2907, "loss/crossentropy": 2.5045652389526367, "loss/hidden": 1.1328125, "loss/logits": 0.15754517912864685, "loss/reg": 3.18759230140131e-05, "step": 713 }, { "epoch": 0.08925, "grad_norm": 2.2823939323425293, "grad_norm_var": 0.07739561040159319, "learning_rate": 0.0001, "loss": 1.1379, "loss/crossentropy": 2.6719865798950195, "loss/hidden": 0.9765625, "loss/logits": 0.16101884841918945, "loss/reg": 3.1869571103015915e-05, "step": 714 }, { "epoch": 0.089375, "grad_norm": 1.9277560710906982, "grad_norm_var": 0.08513910626034443, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.351673126220703, "loss/hidden": 1.140625, "loss/logits": 0.17247360944747925, "loss/reg": 3.1860403396422043e-05, "step": 715 }, { "epoch": 0.0895, "grad_norm": 2.0939505100250244, "grad_norm_var": 0.08699969013148531, "learning_rate": 0.0001, "loss": 1.3346, "loss/crossentropy": 2.618321418762207, "loss/hidden": 1.15625, "loss/logits": 0.178018718957901, "loss/reg": 3.184879824402742e-05, "step": 716 }, { "epoch": 0.089625, "grad_norm": 1.9107413291931152, "grad_norm_var": 0.08907481761581591, "learning_rate": 0.0001, "loss": 1.1277, "loss/crossentropy": 2.5781402587890625, "loss/hidden": 0.98046875, "loss/logits": 0.1469302475452423, "loss/reg": 3.183981971233152e-05, "step": 717 }, { "epoch": 0.08975, "grad_norm": 2.362642526626587, "grad_norm_var": 0.08938037261504844, "learning_rate": 0.0001, "loss": 1.2477, "loss/crossentropy": 2.685119390487671, "loss/hidden": 1.078125, "loss/logits": 0.1692849099636078, "loss/reg": 3.182946966262534e-05, "step": 718 }, { "epoch": 0.089875, "grad_norm": 1.6800652742385864, "grad_norm_var": 0.09951568078791032, "learning_rate": 0.0001, "loss": 1.2464, "loss/crossentropy": 2.4787790775299072, "loss/hidden": 1.078125, "loss/logits": 0.168003648519516, "loss/reg": 3.181990177836269e-05, "step": 719 }, { "epoch": 0.09, "grad_norm": 2.121812105178833, "grad_norm_var": 0.1002104558972718, "learning_rate": 0.0001, "loss": 1.1216, "loss/crossentropy": 2.671614646911621, "loss/hidden": 0.96875, "loss/logits": 0.15253598988056183, "loss/reg": 3.180657222401351e-05, "step": 720 }, { "epoch": 0.090125, "grad_norm": 1.9008033275604248, "grad_norm_var": 0.10711709114638453, "learning_rate": 0.0001, "loss": 1.1998, "loss/crossentropy": 2.382974624633789, "loss/hidden": 1.046875, "loss/logits": 0.15262249112129211, "loss/reg": 3.179501072736457e-05, "step": 721 }, { "epoch": 0.09025, "grad_norm": 2.5321600437164307, "grad_norm_var": 0.11192764062330766, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.551715850830078, "loss/hidden": 0.97265625, "loss/logits": 0.16764254868030548, "loss/reg": 3.178184852004051e-05, "step": 722 }, { "epoch": 0.090375, "grad_norm": 2.9801716804504395, "grad_norm_var": 0.14559231156155736, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.439439296722412, "loss/hidden": 0.97265625, "loss/logits": 0.1681419312953949, "loss/reg": 3.176909376634285e-05, "step": 723 }, { "epoch": 0.0905, "grad_norm": 3.572810411453247, "grad_norm_var": 0.23124631459862688, "learning_rate": 0.0001, "loss": 1.7208, "loss/crossentropy": 2.434739351272583, "loss/hidden": 1.4375, "loss/logits": 0.283025860786438, "loss/reg": 3.17596313834656e-05, "step": 724 }, { "epoch": 0.090625, "grad_norm": 2.4641849994659424, "grad_norm_var": 0.2260145018020476, "learning_rate": 0.0001, "loss": 1.3361, "loss/crossentropy": 2.359750747680664, "loss/hidden": 1.140625, "loss/logits": 0.19520103931427002, "loss/reg": 3.1749059417052194e-05, "step": 725 }, { "epoch": 0.09075, "grad_norm": 2.363614320755005, "grad_norm_var": 0.21727288655602492, "learning_rate": 0.0001, "loss": 1.2934, "loss/crossentropy": 2.556663751602173, "loss/hidden": 1.1171875, "loss/logits": 0.1759195625782013, "loss/reg": 3.1738876714371145e-05, "step": 726 }, { "epoch": 0.090875, "grad_norm": 2.7360336780548096, "grad_norm_var": 0.22331398262713684, "learning_rate": 0.0001, "loss": 1.6679, "loss/crossentropy": 2.204488515853882, "loss/hidden": 1.3984375, "loss/logits": 0.26912397146224976, "loss/reg": 3.172622018610127e-05, "step": 727 }, { "epoch": 0.091, "grad_norm": 1.9641075134277344, "grad_norm_var": 0.2267554251378797, "learning_rate": 0.0001, "loss": 1.381, "loss/crossentropy": 2.422118663787842, "loss/hidden": 1.1953125, "loss/logits": 0.18534547090530396, "loss/reg": 3.1712734198663384e-05, "step": 728 }, { "epoch": 0.091125, "grad_norm": 2.564943790435791, "grad_norm_var": 0.22839041731253118, "learning_rate": 0.0001, "loss": 1.2944, "loss/crossentropy": 2.87058687210083, "loss/hidden": 1.1171875, "loss/logits": 0.17691189050674438, "loss/reg": 3.1703999411547557e-05, "step": 729 }, { "epoch": 0.09125, "grad_norm": 1.7801804542541504, "grad_norm_var": 0.2480876052532271, "learning_rate": 0.0001, "loss": 1.147, "loss/crossentropy": 2.6544079780578613, "loss/hidden": 1.0, "loss/logits": 0.14668874442577362, "loss/reg": 3.1691190088167787e-05, "step": 730 }, { "epoch": 0.091375, "grad_norm": 2.378593921661377, "grad_norm_var": 0.23782880116232022, "learning_rate": 0.0001, "loss": 1.3614, "loss/crossentropy": 2.2079715728759766, "loss/hidden": 1.1875, "loss/logits": 0.1736200451850891, "loss/reg": 3.1681309337727726e-05, "step": 731 }, { "epoch": 0.0915, "grad_norm": 2.1592485904693604, "grad_norm_var": 0.23597114035816222, "learning_rate": 0.0001, "loss": 1.1879, "loss/crossentropy": 2.5136733055114746, "loss/hidden": 1.0546875, "loss/logits": 0.1328616440296173, "loss/reg": 3.1669525924371555e-05, "step": 732 }, { "epoch": 0.091625, "grad_norm": 1.9308961629867554, "grad_norm_var": 0.23483758355516002, "learning_rate": 0.0001, "loss": 1.113, "loss/crossentropy": 2.443802833557129, "loss/hidden": 0.97265625, "loss/logits": 0.14004755020141602, "loss/reg": 3.165780799463391e-05, "step": 733 }, { "epoch": 0.09175, "grad_norm": 2.5313358306884766, "grad_norm_var": 0.23705198036043196, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.374687910079956, "loss/hidden": 1.140625, "loss/logits": 0.17510093748569489, "loss/reg": 3.1643921829527244e-05, "step": 734 }, { "epoch": 0.091875, "grad_norm": 2.2036988735198975, "grad_norm_var": 0.2071495968864624, "learning_rate": 0.0001, "loss": 1.1801, "loss/crossentropy": 2.5990524291992188, "loss/hidden": 1.0234375, "loss/logits": 0.15631292760372162, "loss/reg": 3.162867506034672e-05, "step": 735 }, { "epoch": 0.092, "grad_norm": 2.2874338626861572, "grad_norm_var": 0.2030181085393209, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.3098976612091064, "loss/hidden": 1.171875, "loss/logits": 0.1794787347316742, "loss/reg": 3.16194818879012e-05, "step": 736 }, { "epoch": 0.092125, "grad_norm": 2.161745309829712, "grad_norm_var": 0.19001384880688754, "learning_rate": 0.0001, "loss": 1.2909, "loss/crossentropy": 2.206538200378418, "loss/hidden": 1.1328125, "loss/logits": 0.15781772136688232, "loss/reg": 3.160776032018475e-05, "step": 737 }, { "epoch": 0.09225, "grad_norm": 2.825727939605713, "grad_norm_var": 0.2000567098307892, "learning_rate": 0.0001, "loss": 1.4565, "loss/crossentropy": 2.395667314529419, "loss/hidden": 1.2578125, "loss/logits": 0.19834987819194794, "loss/reg": 3.159312836942263e-05, "step": 738 }, { "epoch": 0.092375, "grad_norm": 2.25457763671875, "grad_norm_var": 0.1798848071044782, "learning_rate": 0.0001, "loss": 1.1574, "loss/crossentropy": 2.5495216846466064, "loss/hidden": 0.9921875, "loss/logits": 0.16488471627235413, "loss/reg": 3.15783909172751e-05, "step": 739 }, { "epoch": 0.0925, "grad_norm": 2.0684523582458496, "grad_norm_var": 0.083315702432507, "learning_rate": 0.0001, "loss": 1.1796, "loss/crossentropy": 2.3564293384552, "loss/hidden": 1.046875, "loss/logits": 0.1323927640914917, "loss/reg": 3.156912134727463e-05, "step": 740 }, { "epoch": 0.092625, "grad_norm": 2.656198501586914, "grad_norm_var": 0.0900238317620723, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.487755060195923, "loss/hidden": 1.1875, "loss/logits": 0.19630743563175201, "loss/reg": 3.155725062242709e-05, "step": 741 }, { "epoch": 0.09275, "grad_norm": 1.9687042236328125, "grad_norm_var": 0.09664116038215548, "learning_rate": 0.0001, "loss": 1.2491, "loss/crossentropy": 2.573404550552368, "loss/hidden": 1.078125, "loss/logits": 0.17063230276107788, "loss/reg": 3.1548213883070275e-05, "step": 742 }, { "epoch": 0.092875, "grad_norm": 2.102797269821167, "grad_norm_var": 0.0831564589342415, "learning_rate": 0.0001, "loss": 1.1905, "loss/crossentropy": 2.153092384338379, "loss/hidden": 1.0390625, "loss/logits": 0.15110386908054352, "loss/reg": 3.153769648633897e-05, "step": 743 }, { "epoch": 0.093, "grad_norm": 2.355156898498535, "grad_norm_var": 0.07833334824755027, "learning_rate": 0.0001, "loss": 1.3599, "loss/crossentropy": 2.17565655708313, "loss/hidden": 1.1640625, "loss/logits": 0.19551442563533783, "loss/reg": 3.152925637550652e-05, "step": 744 }, { "epoch": 0.093125, "grad_norm": 2.443155527114868, "grad_norm_var": 0.07437929229497317, "learning_rate": 0.0001, "loss": 1.3143, "loss/crossentropy": 2.252290725708008, "loss/hidden": 1.15625, "loss/logits": 0.15772585570812225, "loss/reg": 3.1521783967036754e-05, "step": 745 }, { "epoch": 0.09325, "grad_norm": 2.0343992710113525, "grad_norm_var": 0.06226497131138201, "learning_rate": 0.0001, "loss": 1.0988, "loss/crossentropy": 2.4974706172943115, "loss/hidden": 0.96875, "loss/logits": 0.12977877259254456, "loss/reg": 3.151370765408501e-05, "step": 746 }, { "epoch": 0.093375, "grad_norm": 2.5981836318969727, "grad_norm_var": 0.0683810999287743, "learning_rate": 0.0001, "loss": 1.3081, "loss/crossentropy": 2.437986373901367, "loss/hidden": 1.140625, "loss/logits": 0.16720974445343018, "loss/reg": 3.150551856379025e-05, "step": 747 }, { "epoch": 0.0935, "grad_norm": 2.1747589111328125, "grad_norm_var": 0.06813326994570724, "learning_rate": 0.0001, "loss": 1.0686, "loss/crossentropy": 2.7409119606018066, "loss/hidden": 0.9375, "loss/logits": 0.13082191348075867, "loss/reg": 3.1495314033236355e-05, "step": 748 }, { "epoch": 0.093625, "grad_norm": 2.183622360229492, "grad_norm_var": 0.06011461073695254, "learning_rate": 0.0001, "loss": 1.2254, "loss/crossentropy": 2.191103458404541, "loss/hidden": 1.0625, "loss/logits": 0.1625903844833374, "loss/reg": 3.1484429200645536e-05, "step": 749 }, { "epoch": 0.09375, "grad_norm": 1.729537844657898, "grad_norm_var": 0.07589706873069174, "learning_rate": 0.0001, "loss": 1.1576, "loss/crossentropy": 2.559917449951172, "loss/hidden": 1.0, "loss/logits": 0.15729467570781708, "loss/reg": 3.1475185096496716e-05, "step": 750 }, { "epoch": 0.093875, "grad_norm": 1.9405204057693481, "grad_norm_var": 0.08195632956667372, "learning_rate": 0.0001, "loss": 1.0518, "loss/crossentropy": 2.624401330947876, "loss/hidden": 0.90625, "loss/logits": 0.1452496498823166, "loss/reg": 3.1468345696339384e-05, "step": 751 }, { "epoch": 0.094, "grad_norm": 2.1372480392456055, "grad_norm_var": 0.0823473431455047, "learning_rate": 0.0001, "loss": 1.1548, "loss/crossentropy": 2.3568766117095947, "loss/hidden": 1.0078125, "loss/logits": 0.14671632647514343, "loss/reg": 3.1459076126338914e-05, "step": 752 }, { "epoch": 0.094125, "grad_norm": 1.9586896896362305, "grad_norm_var": 0.08669574257193607, "learning_rate": 0.0001, "loss": 1.2604, "loss/crossentropy": 2.3817858695983887, "loss/hidden": 1.0859375, "loss/logits": 0.1741340607404709, "loss/reg": 3.145124719594605e-05, "step": 753 }, { "epoch": 0.09425, "grad_norm": 5.136489391326904, "grad_norm_var": 0.6087473488841433, "learning_rate": 0.0001, "loss": 2.2121, "loss/crossentropy": 2.298271656036377, "loss/hidden": 1.8203125, "loss/logits": 0.39150530099868774, "loss/reg": 3.144397123833187e-05, "step": 754 }, { "epoch": 0.094375, "grad_norm": 3.463529109954834, "grad_norm_var": 0.683278061488306, "learning_rate": 0.0001, "loss": 1.5672, "loss/crossentropy": 2.0770769119262695, "loss/hidden": 1.3515625, "loss/logits": 0.21530470252037048, "loss/reg": 3.1436022254638374e-05, "step": 755 }, { "epoch": 0.0945, "grad_norm": 2.3703503608703613, "grad_norm_var": 0.67424132170143, "learning_rate": 0.0001, "loss": 1.2125, "loss/crossentropy": 2.8401291370391846, "loss/hidden": 1.046875, "loss/logits": 0.16533055901527405, "loss/reg": 3.142944842693396e-05, "step": 756 }, { "epoch": 0.094625, "grad_norm": 2.6325223445892334, "grad_norm_var": 0.6736359493160847, "learning_rate": 0.0001, "loss": 1.2971, "loss/crossentropy": 2.7272450923919678, "loss/hidden": 1.1171875, "loss/logits": 0.1796242892742157, "loss/reg": 3.1421946914633736e-05, "step": 757 }, { "epoch": 0.09475, "grad_norm": 2.570432424545288, "grad_norm_var": 0.6575024318759268, "learning_rate": 0.0001, "loss": 1.1965, "loss/crossentropy": 2.4085519313812256, "loss/hidden": 1.0234375, "loss/logits": 0.17274212837219238, "loss/reg": 3.141486013191752e-05, "step": 758 }, { "epoch": 0.094875, "grad_norm": 2.4089767932891846, "grad_norm_var": 0.6475763705088162, "learning_rate": 0.0001, "loss": 1.2483, "loss/crossentropy": 2.472520589828491, "loss/hidden": 1.0859375, "loss/logits": 0.1620466709136963, "loss/reg": 3.1408424547407776e-05, "step": 759 }, { "epoch": 0.095, "grad_norm": 1.9809985160827637, "grad_norm_var": 0.6639808786341005, "learning_rate": 0.0001, "loss": 1.1507, "loss/crossentropy": 2.600600481033325, "loss/hidden": 1.0, "loss/logits": 0.1503715068101883, "loss/reg": 3.1402007152792066e-05, "step": 760 }, { "epoch": 0.095125, "grad_norm": 2.333096742630005, "grad_norm_var": 0.665355115788793, "learning_rate": 0.0001, "loss": 1.279, "loss/crossentropy": 2.6416707038879395, "loss/hidden": 1.125, "loss/logits": 0.1537056565284729, "loss/reg": 3.139731416013092e-05, "step": 761 }, { "epoch": 0.09525, "grad_norm": 1.9397908449172974, "grad_norm_var": 0.6715145427304359, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.7050859928131104, "loss/hidden": 1.0234375, "loss/logits": 0.14873114228248596, "loss/reg": 3.13945856760256e-05, "step": 762 }, { "epoch": 0.095375, "grad_norm": 1.8747289180755615, "grad_norm_var": 0.6920951391921969, "learning_rate": 0.0001, "loss": 1.05, "loss/crossentropy": 2.4486820697784424, "loss/hidden": 0.91015625, "loss/logits": 0.13953766226768494, "loss/reg": 3.138865577057004e-05, "step": 763 }, { "epoch": 0.0955, "grad_norm": 1.8983080387115479, "grad_norm_var": 0.7061769284476885, "learning_rate": 0.0001, "loss": 1.1905, "loss/crossentropy": 2.2771549224853516, "loss/hidden": 1.03125, "loss/logits": 0.1589195728302002, "loss/reg": 3.138252577628009e-05, "step": 764 }, { "epoch": 0.095625, "grad_norm": 2.072046995162964, "grad_norm_var": 0.7103216736695799, "learning_rate": 0.0001, "loss": 1.1747, "loss/crossentropy": 2.7360568046569824, "loss/hidden": 1.0234375, "loss/logits": 0.15097260475158691, "loss/reg": 3.1379106076201424e-05, "step": 765 }, { "epoch": 0.09575, "grad_norm": 2.3827669620513916, "grad_norm_var": 0.6783382556637773, "learning_rate": 0.0001, "loss": 1.277, "loss/crossentropy": 2.583840847015381, "loss/hidden": 1.109375, "loss/logits": 0.16732946038246155, "loss/reg": 3.137430394417606e-05, "step": 766 }, { "epoch": 0.095875, "grad_norm": 2.367859363555908, "grad_norm_var": 0.6610768710121395, "learning_rate": 0.0001, "loss": 1.3393, "loss/crossentropy": 2.7965312004089355, "loss/hidden": 1.15625, "loss/logits": 0.18269799649715424, "loss/reg": 3.1365445465780795e-05, "step": 767 }, { "epoch": 0.096, "grad_norm": 2.1879124641418457, "grad_norm_var": 0.658986168594122, "learning_rate": 0.0001, "loss": 1.3228, "loss/crossentropy": 2.4425103664398193, "loss/hidden": 1.15625, "loss/logits": 0.16622185707092285, "loss/reg": 3.1357700208900496e-05, "step": 768 }, { "epoch": 0.096125, "grad_norm": 2.281101703643799, "grad_norm_var": 0.6433454947799935, "learning_rate": 0.0001, "loss": 1.053, "loss/crossentropy": 2.447190284729004, "loss/hidden": 0.92578125, "loss/logits": 0.12689539790153503, "loss/reg": 3.135461884085089e-05, "step": 769 }, { "epoch": 0.09625, "grad_norm": 2.780257225036621, "grad_norm_var": 0.1600984168688796, "learning_rate": 0.0001, "loss": 1.3624, "loss/crossentropy": 2.413775682449341, "loss/hidden": 1.15625, "loss/logits": 0.20579016208648682, "loss/reg": 3.13528798869811e-05, "step": 770 }, { "epoch": 0.096375, "grad_norm": 2.1634106636047363, "grad_norm_var": 0.07211399956462869, "learning_rate": 0.0001, "loss": 1.2305, "loss/crossentropy": 2.276298761367798, "loss/hidden": 1.09375, "loss/logits": 0.13639463484287262, "loss/reg": 3.135051156277768e-05, "step": 771 }, { "epoch": 0.0965, "grad_norm": 3.03767728805542, "grad_norm_var": 0.10929521688149742, "learning_rate": 0.0001, "loss": 1.2595, "loss/crossentropy": 2.69195294380188, "loss/hidden": 1.0703125, "loss/logits": 0.1888556033372879, "loss/reg": 3.134880535071716e-05, "step": 772 }, { "epoch": 0.096625, "grad_norm": 2.160676956176758, "grad_norm_var": 0.1027301574876498, "learning_rate": 0.0001, "loss": 1.099, "loss/crossentropy": 2.6800880432128906, "loss/hidden": 0.96484375, "loss/logits": 0.13380314409732819, "loss/reg": 3.13417476718314e-05, "step": 773 }, { "epoch": 0.09675, "grad_norm": 1.9055464267730713, "grad_norm_var": 0.10439108753585717, "learning_rate": 0.0001, "loss": 1.1548, "loss/crossentropy": 2.693740129470825, "loss/hidden": 1.0078125, "loss/logits": 0.14665429294109344, "loss/reg": 3.1334358936874196e-05, "step": 774 }, { "epoch": 0.096875, "grad_norm": 2.2329320907592773, "grad_norm_var": 0.102266613042209, "learning_rate": 0.0001, "loss": 1.2392, "loss/crossentropy": 2.3540096282958984, "loss/hidden": 1.078125, "loss/logits": 0.16073733568191528, "loss/reg": 3.132629717583768e-05, "step": 775 }, { "epoch": 0.097, "grad_norm": 2.4246928691864014, "grad_norm_var": 0.10013899770161926, "learning_rate": 0.0001, "loss": 1.2505, "loss/crossentropy": 1.9924331903457642, "loss/hidden": 1.109375, "loss/logits": 0.14083942770957947, "loss/reg": 3.1318559194915e-05, "step": 776 }, { "epoch": 0.097125, "grad_norm": 2.389742136001587, "grad_norm_var": 0.10094694170040738, "learning_rate": 0.0001, "loss": 1.324, "loss/crossentropy": 2.7448296546936035, "loss/hidden": 1.140625, "loss/logits": 0.18304391205310822, "loss/reg": 3.130955883534625e-05, "step": 777 }, { "epoch": 0.09725, "grad_norm": 2.5808043479919434, "grad_norm_var": 0.09958374019438997, "learning_rate": 0.0001, "loss": 1.265, "loss/crossentropy": 1.8842538595199585, "loss/hidden": 1.109375, "loss/logits": 0.1553521603345871, "loss/reg": 3.1302373827202246e-05, "step": 778 }, { "epoch": 0.097375, "grad_norm": 2.4130847454071045, "grad_norm_var": 0.08743873306624413, "learning_rate": 0.0001, "loss": 1.3718, "loss/crossentropy": 2.4912071228027344, "loss/hidden": 1.1875, "loss/logits": 0.18399140238761902, "loss/reg": 3.129445394733921e-05, "step": 779 }, { "epoch": 0.0975, "grad_norm": 2.1240077018737793, "grad_norm_var": 0.07763369234828493, "learning_rate": 0.0001, "loss": 1.175, "loss/crossentropy": 2.626298666000366, "loss/hidden": 1.015625, "loss/logits": 0.15901657938957214, "loss/reg": 3.128518073935993e-05, "step": 780 }, { "epoch": 0.097625, "grad_norm": 2.241236686706543, "grad_norm_var": 0.07328714526078836, "learning_rate": 0.0001, "loss": 1.3103, "loss/crossentropy": 2.8781237602233887, "loss/hidden": 1.140625, "loss/logits": 0.1693672388792038, "loss/reg": 3.127881063846871e-05, "step": 781 }, { "epoch": 0.09775, "grad_norm": 2.1519434452056885, "grad_norm_var": 0.07575044500278688, "learning_rate": 0.0001, "loss": 1.2974, "loss/crossentropy": 2.2590219974517822, "loss/hidden": 1.125, "loss/logits": 0.1720578521490097, "loss/reg": 3.1273764761863276e-05, "step": 782 }, { "epoch": 0.097875, "grad_norm": 4.857376575469971, "grad_norm_var": 0.472294081867043, "learning_rate": 0.0001, "loss": 1.9927, "loss/crossentropy": 2.8379626274108887, "loss/hidden": 1.5625, "loss/logits": 0.4299107789993286, "loss/reg": 3.126606316072866e-05, "step": 783 }, { "epoch": 0.098, "grad_norm": 3.427262783050537, "grad_norm_var": 0.5174201023944398, "learning_rate": 0.0001, "loss": 1.3025, "loss/crossentropy": 3.541722059249878, "loss/hidden": 1.140625, "loss/logits": 0.16157390177249908, "loss/reg": 3.125540752080269e-05, "step": 784 }, { "epoch": 0.098125, "grad_norm": 2.264031171798706, "grad_norm_var": 0.5181032302799584, "learning_rate": 0.0001, "loss": 1.1002, "loss/crossentropy": 2.884652614593506, "loss/hidden": 0.95703125, "loss/logits": 0.14289110898971558, "loss/reg": 3.124582508462481e-05, "step": 785 }, { "epoch": 0.09825, "grad_norm": 1.806728482246399, "grad_norm_var": 0.5503273000636368, "learning_rate": 0.0001, "loss": 1.1306, "loss/crossentropy": 2.4572439193725586, "loss/hidden": 0.97265625, "loss/logits": 0.1576440930366516, "loss/reg": 3.1235387723427266e-05, "step": 786 }, { "epoch": 0.098375, "grad_norm": 2.684609889984131, "grad_norm_var": 0.5431278467835586, "learning_rate": 0.0001, "loss": 1.5656, "loss/crossentropy": 2.4877264499664307, "loss/hidden": 1.3203125, "loss/logits": 0.2449798732995987, "loss/reg": 3.122756606899202e-05, "step": 787 }, { "epoch": 0.0985, "grad_norm": 2.632183313369751, "grad_norm_var": 0.5267077798480964, "learning_rate": 0.0001, "loss": 1.1392, "loss/crossentropy": 2.2014877796173096, "loss/hidden": 1.0, "loss/logits": 0.13884884119033813, "loss/reg": 3.121886038570665e-05, "step": 788 }, { "epoch": 0.098625, "grad_norm": 1.87946355342865, "grad_norm_var": 0.54506897354085, "learning_rate": 0.0001, "loss": 1.2559, "loss/crossentropy": 2.4326982498168945, "loss/hidden": 1.09375, "loss/logits": 0.16185970604419708, "loss/reg": 3.121058762189932e-05, "step": 789 }, { "epoch": 0.09875, "grad_norm": 4.394942283630371, "grad_norm_var": 0.7347519248832649, "learning_rate": 0.0001, "loss": 1.7373, "loss/crossentropy": 2.4330739974975586, "loss/hidden": 1.515625, "loss/logits": 0.22132834792137146, "loss/reg": 3.1201776437228546e-05, "step": 790 }, { "epoch": 0.098875, "grad_norm": 1.9128646850585938, "grad_norm_var": 0.7592334384300727, "learning_rate": 0.0001, "loss": 1.2412, "loss/crossentropy": 2.433431625366211, "loss/hidden": 1.0859375, "loss/logits": 0.15490993857383728, "loss/reg": 3.119331449852325e-05, "step": 791 }, { "epoch": 0.099, "grad_norm": 2.9175164699554443, "grad_norm_var": 0.7604913223839332, "learning_rate": 0.0001, "loss": 1.5882, "loss/crossentropy": 2.2526400089263916, "loss/hidden": 1.3828125, "loss/logits": 0.20504862070083618, "loss/reg": 3.118627500953153e-05, "step": 792 }, { "epoch": 0.099125, "grad_norm": 2.331383228302002, "grad_norm_var": 0.7628643978346566, "learning_rate": 0.0001, "loss": 1.4146, "loss/crossentropy": 2.7012693881988525, "loss/hidden": 1.2109375, "loss/logits": 0.2033015489578247, "loss/reg": 3.117845699307509e-05, "step": 793 }, { "epoch": 0.09925, "grad_norm": 2.187777042388916, "grad_norm_var": 0.7768636197061916, "learning_rate": 0.0001, "loss": 1.234, "loss/crossentropy": 2.525279998779297, "loss/hidden": 1.0859375, "loss/logits": 0.14775747060775757, "loss/reg": 3.116917287115939e-05, "step": 794 }, { "epoch": 0.099375, "grad_norm": 2.1538753509521484, "grad_norm_var": 0.7888760885047162, "learning_rate": 0.0001, "loss": 1.2094, "loss/crossentropy": 2.5589966773986816, "loss/hidden": 1.0390625, "loss/logits": 0.17005379498004913, "loss/reg": 3.11611256620381e-05, "step": 795 }, { "epoch": 0.0995, "grad_norm": 5.747286796569824, "grad_norm_var": 1.3683445106961756, "learning_rate": 0.0001, "loss": 1.8437, "loss/crossentropy": 2.058424234390259, "loss/hidden": 1.53125, "loss/logits": 0.3121880888938904, "loss/reg": 3.115229628747329e-05, "step": 796 }, { "epoch": 0.099625, "grad_norm": 2.5717875957489014, "grad_norm_var": 1.3483694213120883, "learning_rate": 0.0001, "loss": 1.7157, "loss/crossentropy": 2.185353994369507, "loss/hidden": 1.4765625, "loss/logits": 0.23884126543998718, "loss/reg": 3.114379069302231e-05, "step": 797 }, { "epoch": 0.09975, "grad_norm": 2.291684150695801, "grad_norm_var": 1.3362097880401214, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.6714465618133545, "loss/hidden": 0.9765625, "loss/logits": 0.14549368619918823, "loss/reg": 3.1131978175835684e-05, "step": 798 }, { "epoch": 0.099875, "grad_norm": 2.000739097595215, "grad_norm_var": 1.0926226260566427, "learning_rate": 0.0001, "loss": 1.2438, "loss/crossentropy": 2.5698318481445312, "loss/hidden": 1.0703125, "loss/logits": 0.17316791415214539, "loss/reg": 3.1121257052291185e-05, "step": 799 }, { "epoch": 0.1, "grad_norm": 3.3989579677581787, "grad_norm_var": 1.0899290024325692, "learning_rate": 0.0001, "loss": 1.5563, "loss/crossentropy": 2.8038387298583984, "loss/hidden": 1.328125, "loss/logits": 0.2278510481119156, "loss/reg": 3.1109084375202656e-05, "step": 800 }, { "epoch": 0.100125, "grad_norm": 2.534914255142212, "grad_norm_var": 1.0788234524427875, "learning_rate": 0.0001, "loss": 1.5327, "loss/crossentropy": 2.1927835941314697, "loss/hidden": 1.3046875, "loss/logits": 0.2276715636253357, "loss/reg": 3.110256511718035e-05, "step": 801 }, { "epoch": 0.10025, "grad_norm": 2.1616692543029785, "grad_norm_var": 1.0436931816711053, "learning_rate": 0.0001, "loss": 1.1964, "loss/crossentropy": 3.0142710208892822, "loss/hidden": 1.03125, "loss/logits": 0.1648503690958023, "loss/reg": 3.1094125006347895e-05, "step": 802 }, { "epoch": 0.100375, "grad_norm": 2.1722917556762695, "grad_norm_var": 1.0637174890335515, "learning_rate": 0.0001, "loss": 1.3883, "loss/crossentropy": 2.711493730545044, "loss/hidden": 1.1953125, "loss/logits": 0.19265002012252808, "loss/reg": 3.108750388491899e-05, "step": 803 }, { "epoch": 0.1005, "grad_norm": 1.8611395359039307, "grad_norm_var": 1.1084202434727317, "learning_rate": 0.0001, "loss": 1.168, "loss/crossentropy": 2.396332263946533, "loss/hidden": 1.0234375, "loss/logits": 0.1442836970090866, "loss/reg": 3.108082091785036e-05, "step": 804 }, { "epoch": 0.100625, "grad_norm": 2.0508856773376465, "grad_norm_var": 1.0924762571014581, "learning_rate": 0.0001, "loss": 1.2781, "loss/crossentropy": 2.658599376678467, "loss/hidden": 1.109375, "loss/logits": 0.16839157044887543, "loss/reg": 3.107260272372514e-05, "step": 805 }, { "epoch": 0.10075, "grad_norm": 2.4619462490081787, "grad_norm_var": 0.8809438114007198, "learning_rate": 0.0001, "loss": 1.1897, "loss/crossentropy": 2.5231716632843018, "loss/hidden": 1.046875, "loss/logits": 0.14253319799900055, "loss/reg": 3.106672738795169e-05, "step": 806 }, { "epoch": 0.100875, "grad_norm": 2.3893401622772217, "grad_norm_var": 0.8548277216729955, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.2468364238739014, "loss/hidden": 1.15625, "loss/logits": 0.17571108043193817, "loss/reg": 3.105968062300235e-05, "step": 807 }, { "epoch": 0.101, "grad_norm": 1.9697636365890503, "grad_norm_var": 0.8679468111481312, "learning_rate": 0.0001, "loss": 1.1459, "loss/crossentropy": 2.113001585006714, "loss/hidden": 1.0078125, "loss/logits": 0.13779830932617188, "loss/reg": 3.1051207770360634e-05, "step": 808 }, { "epoch": 0.101125, "grad_norm": 2.3979098796844482, "grad_norm_var": 0.8665695097636771, "learning_rate": 0.0001, "loss": 1.2127, "loss/crossentropy": 2.347339391708374, "loss/hidden": 1.0546875, "loss/logits": 0.1576911211013794, "loss/reg": 3.103955532424152e-05, "step": 809 }, { "epoch": 0.10125, "grad_norm": 4.104284763336182, "grad_norm_var": 1.0107271790963963, "learning_rate": 0.0001, "loss": 1.2255, "loss/crossentropy": 2.645543098449707, "loss/hidden": 1.0546875, "loss/logits": 0.17050105333328247, "loss/reg": 3.102962000411935e-05, "step": 810 }, { "epoch": 0.101375, "grad_norm": 2.593677520751953, "grad_norm_var": 0.9942054452960448, "learning_rate": 0.0001, "loss": 1.1147, "loss/crossentropy": 2.655158281326294, "loss/hidden": 0.96875, "loss/logits": 0.14562083780765533, "loss/reg": 3.101498805335723e-05, "step": 811 }, { "epoch": 0.1015, "grad_norm": 2.747690439224243, "grad_norm_var": 0.32551198430473954, "learning_rate": 0.0001, "loss": 1.2766, "loss/crossentropy": 2.234384298324585, "loss/hidden": 1.1015625, "loss/logits": 0.1747133731842041, "loss/reg": 3.100006506429054e-05, "step": 812 }, { "epoch": 0.101625, "grad_norm": 2.240682601928711, "grad_norm_var": 0.3283908535525057, "learning_rate": 0.0001, "loss": 1.3393, "loss/crossentropy": 2.416161298751831, "loss/hidden": 1.1796875, "loss/logits": 0.15925709903240204, "loss/reg": 3.099101013503969e-05, "step": 813 }, { "epoch": 0.10175, "grad_norm": 2.198143482208252, "grad_norm_var": 0.33105067119688786, "learning_rate": 0.0001, "loss": 1.1581, "loss/crossentropy": 2.5200369358062744, "loss/hidden": 0.99609375, "loss/logits": 0.16167645156383514, "loss/reg": 3.0979368602856994e-05, "step": 814 }, { "epoch": 0.101875, "grad_norm": 2.7281014919281006, "grad_norm_var": 0.3200372361620191, "learning_rate": 0.0001, "loss": 1.1707, "loss/crossentropy": 2.55796217918396, "loss/hidden": 1.0234375, "loss/logits": 0.14694485068321228, "loss/reg": 3.096580257988535e-05, "step": 815 }, { "epoch": 0.102, "grad_norm": 2.093496322631836, "grad_norm_var": 0.2702016025984974, "learning_rate": 0.0001, "loss": 1.2431, "loss/crossentropy": 2.6173605918884277, "loss/hidden": 1.0859375, "loss/logits": 0.15686647593975067, "loss/reg": 3.095622741966508e-05, "step": 816 }, { "epoch": 0.102125, "grad_norm": 2.214099407196045, "grad_norm_var": 0.2716811480241621, "learning_rate": 0.0001, "loss": 1.1334, "loss/crossentropy": 2.370023727416992, "loss/hidden": 0.9765625, "loss/logits": 0.15652622282505035, "loss/reg": 3.094406929449178e-05, "step": 817 }, { "epoch": 0.10225, "grad_norm": 2.3707094192504883, "grad_norm_var": 0.267795417331483, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.85809588432312, "loss/hidden": 1.1015625, "loss/logits": 0.16179285943508148, "loss/reg": 3.0934257665649056e-05, "step": 818 }, { "epoch": 0.102375, "grad_norm": 2.4590866565704346, "grad_norm_var": 0.2637646763277468, "learning_rate": 0.0001, "loss": 1.3356, "loss/crossentropy": 2.219639301300049, "loss/hidden": 1.1484375, "loss/logits": 0.1868869662284851, "loss/reg": 3.09246352117043e-05, "step": 819 }, { "epoch": 0.1025, "grad_norm": 2.2254512310028076, "grad_norm_var": 0.24442462240150267, "learning_rate": 0.0001, "loss": 1.2099, "loss/crossentropy": 2.446502447128296, "loss/hidden": 1.0390625, "loss/logits": 0.17052100598812103, "loss/reg": 3.091415055678226e-05, "step": 820 }, { "epoch": 0.102625, "grad_norm": 1.9032593965530396, "grad_norm_var": 0.2536983764450135, "learning_rate": 0.0001, "loss": 1.0735, "loss/crossentropy": 2.448707342147827, "loss/hidden": 0.94140625, "loss/logits": 0.13180768489837646, "loss/reg": 3.090177779085934e-05, "step": 821 }, { "epoch": 0.10275, "grad_norm": 1.844329595565796, "grad_norm_var": 0.27602844848279134, "learning_rate": 0.0001, "loss": 1.1176, "loss/crossentropy": 2.5376136302948, "loss/hidden": 0.96875, "loss/logits": 0.14854903519153595, "loss/reg": 3.089279562118463e-05, "step": 822 }, { "epoch": 0.102875, "grad_norm": 1.6877379417419434, "grad_norm_var": 0.308258885532959, "learning_rate": 0.0001, "loss": 1.1547, "loss/crossentropy": 2.5561609268188477, "loss/hidden": 1.0, "loss/logits": 0.1543552428483963, "loss/reg": 3.0882885766914114e-05, "step": 823 }, { "epoch": 0.103, "grad_norm": 7.1836419105529785, "grad_norm_var": 1.7352053204418108, "learning_rate": 0.0001, "loss": 1.3146, "loss/crossentropy": 2.6273272037506104, "loss/hidden": 1.0234375, "loss/logits": 0.2908306121826172, "loss/reg": 3.0875191441737115e-05, "step": 824 }, { "epoch": 0.103125, "grad_norm": 2.999370574951172, "grad_norm_var": 1.734629979326649, "learning_rate": 0.0001, "loss": 1.5094, "loss/crossentropy": 2.6536483764648438, "loss/hidden": 1.3203125, "loss/logits": 0.18879318237304688, "loss/reg": 3.0866562156006694e-05, "step": 825 }, { "epoch": 0.10325, "grad_norm": 3.1951546669006348, "grad_norm_var": 1.619046832548184, "learning_rate": 0.0001, "loss": 1.5738, "loss/crossentropy": 2.104147434234619, "loss/hidden": 1.3515625, "loss/logits": 0.22189953923225403, "loss/reg": 3.0859606340527534e-05, "step": 826 }, { "epoch": 0.103375, "grad_norm": 1.9920458793640137, "grad_norm_var": 1.6476144569097508, "learning_rate": 0.0001, "loss": 1.0835, "loss/crossentropy": 2.3575265407562256, "loss/hidden": 0.953125, "loss/logits": 0.1301153153181076, "loss/reg": 3.0850660550640896e-05, "step": 827 }, { "epoch": 0.1035, "grad_norm": 2.1523900032043457, "grad_norm_var": 1.6604367682342531, "learning_rate": 0.0001, "loss": 1.3061, "loss/crossentropy": 2.1625425815582275, "loss/hidden": 1.140625, "loss/logits": 0.16514690220355988, "loss/reg": 3.084292620769702e-05, "step": 828 }, { "epoch": 0.103625, "grad_norm": 2.931431531906128, "grad_norm_var": 1.6578109899282356, "learning_rate": 0.0001, "loss": 1.1044, "loss/crossentropy": 2.6592483520507812, "loss/hidden": 0.9609375, "loss/logits": 0.14313727617263794, "loss/reg": 3.083515548496507e-05, "step": 829 }, { "epoch": 0.10375, "grad_norm": 2.6759235858917236, "grad_norm_var": 1.6441751337506705, "learning_rate": 0.0001, "loss": 1.0772, "loss/crossentropy": 2.441382884979248, "loss/hidden": 0.953125, "loss/logits": 0.12375655770301819, "loss/reg": 3.082441253354773e-05, "step": 830 }, { "epoch": 0.103875, "grad_norm": 2.1944730281829834, "grad_norm_var": 1.6575550635786949, "learning_rate": 0.0001, "loss": 1.2211, "loss/crossentropy": 2.1641762256622314, "loss/hidden": 1.078125, "loss/logits": 0.14268890023231506, "loss/reg": 3.08187554765027e-05, "step": 831 }, { "epoch": 0.104, "grad_norm": 2.6154282093048096, "grad_norm_var": 1.637059795107976, "learning_rate": 0.0001, "loss": 1.372, "loss/crossentropy": 2.3172926902770996, "loss/hidden": 1.1875, "loss/logits": 0.18420815467834473, "loss/reg": 3.0813283956376836e-05, "step": 832 }, { "epoch": 0.104125, "grad_norm": 2.7839176654815674, "grad_norm_var": 1.6230740542825255, "learning_rate": 0.0001, "loss": 1.266, "loss/crossentropy": 2.526543378829956, "loss/hidden": 1.09375, "loss/logits": 0.17195840179920197, "loss/reg": 3.080438546021469e-05, "step": 833 }, { "epoch": 0.10425, "grad_norm": 2.2361974716186523, "grad_norm_var": 1.630126784940075, "learning_rate": 0.0001, "loss": 1.1865, "loss/crossentropy": 2.38508677482605, "loss/hidden": 1.046875, "loss/logits": 0.13929709792137146, "loss/reg": 3.079506132053211e-05, "step": 834 }, { "epoch": 0.104375, "grad_norm": 2.0524253845214844, "grad_norm_var": 1.6531180996917048, "learning_rate": 0.0001, "loss": 1.3726, "loss/crossentropy": 2.530698776245117, "loss/hidden": 1.1640625, "loss/logits": 0.20820161700248718, "loss/reg": 3.078530426137149e-05, "step": 835 }, { "epoch": 0.1045, "grad_norm": 2.179396629333496, "grad_norm_var": 1.6559624963262625, "learning_rate": 0.0001, "loss": 1.3062, "loss/crossentropy": 2.200507402420044, "loss/hidden": 1.1171875, "loss/logits": 0.18874022364616394, "loss/reg": 3.07746377075091e-05, "step": 836 }, { "epoch": 0.104625, "grad_norm": 1.8860087394714355, "grad_norm_var": 1.657731314453099, "learning_rate": 0.0001, "loss": 1.052, "loss/crossentropy": 3.1556520462036133, "loss/hidden": 0.92578125, "loss/logits": 0.12589877843856812, "loss/reg": 3.0765488190809265e-05, "step": 837 }, { "epoch": 0.10475, "grad_norm": 1.7952905893325806, "grad_norm_var": 1.663235285712947, "learning_rate": 0.0001, "loss": 1.205, "loss/crossentropy": 2.6172409057617188, "loss/hidden": 1.046875, "loss/logits": 0.15777094662189484, "loss/reg": 3.075488348258659e-05, "step": 838 }, { "epoch": 0.104875, "grad_norm": 1.9205029010772705, "grad_norm_var": 1.63644541696118, "learning_rate": 0.0001, "loss": 1.2576, "loss/crossentropy": 2.506983518600464, "loss/hidden": 1.09375, "loss/logits": 0.16354331374168396, "loss/reg": 3.074315463891253e-05, "step": 839 }, { "epoch": 0.105, "grad_norm": 2.0928657054901123, "grad_norm_var": 0.19559241083775677, "learning_rate": 0.0001, "loss": 1.1347, "loss/crossentropy": 2.6664047241210938, "loss/hidden": 0.97265625, "loss/logits": 0.16169464588165283, "loss/reg": 3.073291736654937e-05, "step": 840 }, { "epoch": 0.105125, "grad_norm": 2.0819296836853027, "grad_norm_var": 0.1695500869973637, "learning_rate": 0.0001, "loss": 1.1457, "loss/crossentropy": 2.5542869567871094, "loss/hidden": 0.9921875, "loss/logits": 0.15322107076644897, "loss/reg": 3.072019899263978e-05, "step": 841 }, { "epoch": 0.10525, "grad_norm": 2.0458364486694336, "grad_norm_var": 0.11479267511667969, "learning_rate": 0.0001, "loss": 1.1969, "loss/crossentropy": 2.4433228969573975, "loss/hidden": 1.0234375, "loss/logits": 0.17314405739307404, "loss/reg": 3.071278115385212e-05, "step": 842 }, { "epoch": 0.105375, "grad_norm": 2.0129029750823975, "grad_norm_var": 0.11416576275897222, "learning_rate": 0.0001, "loss": 1.1662, "loss/crossentropy": 2.686662435531616, "loss/hidden": 1.015625, "loss/logits": 0.15030167996883392, "loss/reg": 3.0703693482792005e-05, "step": 843 }, { "epoch": 0.1055, "grad_norm": 2.0519752502441406, "grad_norm_var": 0.1158157371009238, "learning_rate": 0.0001, "loss": 1.0377, "loss/crossentropy": 2.2896480560302734, "loss/hidden": 0.91015625, "loss/logits": 0.12721426784992218, "loss/reg": 3.0695973691763356e-05, "step": 844 }, { "epoch": 0.105625, "grad_norm": 2.1257541179656982, "grad_norm_var": 0.08020601663278004, "learning_rate": 0.0001, "loss": 1.2512, "loss/crossentropy": 2.3321328163146973, "loss/hidden": 1.0859375, "loss/logits": 0.16499708592891693, "loss/reg": 3.06832225760445e-05, "step": 845 }, { "epoch": 0.10575, "grad_norm": 2.699514865875244, "grad_norm_var": 0.08182612489990308, "learning_rate": 0.0001, "loss": 1.2479, "loss/crossentropy": 2.5453882217407227, "loss/hidden": 1.0625, "loss/logits": 0.18509814143180847, "loss/reg": 3.067553188884631e-05, "step": 846 }, { "epoch": 0.105875, "grad_norm": 2.1057229042053223, "grad_norm_var": 0.08206906146053014, "learning_rate": 0.0001, "loss": 1.2905, "loss/crossentropy": 2.517914295196533, "loss/hidden": 1.109375, "loss/logits": 0.18079468607902527, "loss/reg": 3.0666917155031115e-05, "step": 847 }, { "epoch": 0.106, "grad_norm": 2.237938404083252, "grad_norm_var": 0.06844794497861711, "learning_rate": 0.0001, "loss": 1.3847, "loss/crossentropy": 2.418829917907715, "loss/hidden": 1.1875, "loss/logits": 0.19684617221355438, "loss/reg": 3.065919372602366e-05, "step": 848 }, { "epoch": 0.106125, "grad_norm": 1.911841630935669, "grad_norm_var": 0.0416030271498783, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.5736911296844482, "loss/hidden": 0.9609375, "loss/logits": 0.1391741931438446, "loss/reg": 3.065243436140008e-05, "step": 849 }, { "epoch": 0.10625, "grad_norm": 2.6131272315979004, "grad_norm_var": 0.057842508872359075, "learning_rate": 0.0001, "loss": 1.4266, "loss/crossentropy": 2.7167627811431885, "loss/hidden": 1.1953125, "loss/logits": 0.23097163438796997, "loss/reg": 3.064147676923312e-05, "step": 850 }, { "epoch": 0.106375, "grad_norm": 3.0576887130737305, "grad_norm_var": 0.11284086479193854, "learning_rate": 0.0001, "loss": 1.6205, "loss/crossentropy": 3.076043128967285, "loss/hidden": 1.390625, "loss/logits": 0.2295425832271576, "loss/reg": 3.063471012865193e-05, "step": 851 }, { "epoch": 0.1065, "grad_norm": 2.0406875610351562, "grad_norm_var": 0.1139832134184904, "learning_rate": 0.0001, "loss": 1.1822, "loss/crossentropy": 2.5482842922210693, "loss/hidden": 1.0078125, "loss/logits": 0.17407500743865967, "loss/reg": 3.0626764782937244e-05, "step": 852 }, { "epoch": 0.106625, "grad_norm": 3.4559848308563232, "grad_norm_var": 0.2091155587506681, "learning_rate": 0.0001, "loss": 1.629, "loss/crossentropy": 3.0823347568511963, "loss/hidden": 1.3359375, "loss/logits": 0.29273706674575806, "loss/reg": 3.0620882171206176e-05, "step": 853 }, { "epoch": 0.10675, "grad_norm": 2.512244939804077, "grad_norm_var": 0.1962835291714666, "learning_rate": 0.0001, "loss": 1.1673, "loss/crossentropy": 2.5817465782165527, "loss/hidden": 1.015625, "loss/logits": 0.15131962299346924, "loss/reg": 3.061717143282294e-05, "step": 854 }, { "epoch": 0.106875, "grad_norm": 2.2254841327667236, "grad_norm_var": 0.18624173617588796, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.4730138778686523, "loss/hidden": 1.171875, "loss/logits": 0.18697890639305115, "loss/reg": 3.061169627471827e-05, "step": 855 }, { "epoch": 0.107, "grad_norm": 2.1242430210113525, "grad_norm_var": 0.18531340737878446, "learning_rate": 0.0001, "loss": 1.3297, "loss/crossentropy": 2.643832206726074, "loss/hidden": 1.1640625, "loss/logits": 0.1653291881084442, "loss/reg": 3.060722156078555e-05, "step": 856 }, { "epoch": 0.107125, "grad_norm": 1.8313719034194946, "grad_norm_var": 0.19757233331361063, "learning_rate": 0.0001, "loss": 1.1884, "loss/crossentropy": 2.4841344356536865, "loss/hidden": 1.046875, "loss/logits": 0.1411784291267395, "loss/reg": 3.060205199290067e-05, "step": 857 }, { "epoch": 0.10725, "grad_norm": 2.642951250076294, "grad_norm_var": 0.19836562649402692, "learning_rate": 0.0001, "loss": 1.1128, "loss/crossentropy": 2.895691394805908, "loss/hidden": 0.9609375, "loss/logits": 0.15156272053718567, "loss/reg": 3.0597617296734825e-05, "step": 858 }, { "epoch": 0.107375, "grad_norm": 1.8541446924209595, "grad_norm_var": 0.2071418812691462, "learning_rate": 0.0001, "loss": 1.2063, "loss/crossentropy": 2.4588773250579834, "loss/hidden": 1.046875, "loss/logits": 0.15909163653850555, "loss/reg": 3.059022856177762e-05, "step": 859 }, { "epoch": 0.1075, "grad_norm": 2.2945339679718018, "grad_norm_var": 0.20140156536063855, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.463376760482788, "loss/hidden": 1.140625, "loss/logits": 0.19798508286476135, "loss/reg": 3.0582417821278796e-05, "step": 860 }, { "epoch": 0.107625, "grad_norm": 2.0763418674468994, "grad_norm_var": 0.2030864243441099, "learning_rate": 0.0001, "loss": 1.2307, "loss/crossentropy": 2.511608600616455, "loss/hidden": 1.078125, "loss/logits": 0.1522822380065918, "loss/reg": 3.057560752495192e-05, "step": 861 }, { "epoch": 0.10775, "grad_norm": 2.028618812561035, "grad_norm_var": 0.2004213147208908, "learning_rate": 0.0001, "loss": 1.0788, "loss/crossentropy": 2.5496108531951904, "loss/hidden": 0.9453125, "loss/logits": 0.13319332897663116, "loss/reg": 3.056845525861718e-05, "step": 862 }, { "epoch": 0.107875, "grad_norm": 2.4464786052703857, "grad_norm_var": 0.19824703313002487, "learning_rate": 0.0001, "loss": 1.2798, "loss/crossentropy": 2.569153070449829, "loss/hidden": 1.1171875, "loss/logits": 0.1623011976480484, "loss/reg": 3.05598478007596e-05, "step": 863 }, { "epoch": 0.108, "grad_norm": 1.840403437614441, "grad_norm_var": 0.2132479466723514, "learning_rate": 0.0001, "loss": 1.1379, "loss/crossentropy": 2.0996623039245605, "loss/hidden": 0.9921875, "loss/logits": 0.1454375684261322, "loss/reg": 3.055387787753716e-05, "step": 864 }, { "epoch": 0.108125, "grad_norm": 2.1220955848693848, "grad_norm_var": 0.20485570241751858, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.507676124572754, "loss/hidden": 1.046875, "loss/logits": 0.12477228045463562, "loss/reg": 3.054780245292932e-05, "step": 865 }, { "epoch": 0.10825, "grad_norm": 2.0436394214630127, "grad_norm_var": 0.20308802849589866, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.6431305408477783, "loss/hidden": 1.109375, "loss/logits": 0.17272983491420746, "loss/reg": 3.054209446418099e-05, "step": 866 }, { "epoch": 0.108375, "grad_norm": 1.9790053367614746, "grad_norm_var": 0.16501067300080757, "learning_rate": 0.0001, "loss": 1.1319, "loss/crossentropy": 2.4894466400146484, "loss/hidden": 0.9921875, "loss/logits": 0.1393980085849762, "loss/reg": 3.053849286516197e-05, "step": 867 }, { "epoch": 0.1085, "grad_norm": 1.9582802057266235, "grad_norm_var": 0.1674041146687453, "learning_rate": 0.0001, "loss": 1.1628, "loss/crossentropy": 2.497908592224121, "loss/hidden": 1.015625, "loss/logits": 0.14682599902153015, "loss/reg": 3.0536324629792944e-05, "step": 868 }, { "epoch": 0.108625, "grad_norm": 2.071611166000366, "grad_norm_var": 0.0580716724783836, "learning_rate": 0.0001, "loss": 1.1909, "loss/crossentropy": 2.4994702339172363, "loss/hidden": 1.03125, "loss/logits": 0.15929651260375977, "loss/reg": 3.053318869206123e-05, "step": 869 }, { "epoch": 0.10875, "grad_norm": 2.1442458629608154, "grad_norm_var": 0.047692633827984804, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.6097989082336426, "loss/hidden": 1.125, "loss/logits": 0.20085518062114716, "loss/reg": 3.053041291423142e-05, "step": 870 }, { "epoch": 0.108875, "grad_norm": 1.9006311893463135, "grad_norm_var": 0.04907894435885491, "learning_rate": 0.0001, "loss": 1.1552, "loss/crossentropy": 2.437607765197754, "loss/hidden": 1.0078125, "loss/logits": 0.14705568552017212, "loss/reg": 3.052354077226482e-05, "step": 871 }, { "epoch": 0.109, "grad_norm": 2.8589375019073486, "grad_norm_var": 0.08666775452125629, "learning_rate": 0.0001, "loss": 1.5007, "loss/crossentropy": 2.3860392570495605, "loss/hidden": 1.2734375, "loss/logits": 0.22699284553527832, "loss/reg": 3.0516899641952477e-05, "step": 872 }, { "epoch": 0.109125, "grad_norm": 2.0803627967834473, "grad_norm_var": 0.08060086596212314, "learning_rate": 0.0001, "loss": 1.4761, "loss/crossentropy": 2.097466230392456, "loss/hidden": 1.3046875, "loss/logits": 0.17110571265220642, "loss/reg": 3.0513721867464483e-05, "step": 873 }, { "epoch": 0.10925, "grad_norm": 2.0202836990356445, "grad_norm_var": 0.06360758527622175, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.6354477405548096, "loss/hidden": 1.03125, "loss/logits": 0.14520543813705444, "loss/reg": 3.050914529012516e-05, "step": 874 }, { "epoch": 0.109375, "grad_norm": 1.677229404449463, "grad_norm_var": 0.07153952873858506, "learning_rate": 0.0001, "loss": 1.0757, "loss/crossentropy": 2.3119208812713623, "loss/hidden": 0.9375, "loss/logits": 0.13793236017227173, "loss/reg": 3.050183477171231e-05, "step": 875 }, { "epoch": 0.1095, "grad_norm": 2.254667282104492, "grad_norm_var": 0.07058576994531313, "learning_rate": 0.0001, "loss": 1.427, "loss/crossentropy": 2.143519639968872, "loss/hidden": 1.2421875, "loss/logits": 0.18450552225112915, "loss/reg": 3.0492194127873518e-05, "step": 876 }, { "epoch": 0.109625, "grad_norm": 3.199824810028076, "grad_norm_var": 0.14683992559318046, "learning_rate": 0.0001, "loss": 1.0668, "loss/crossentropy": 2.8740196228027344, "loss/hidden": 0.9296875, "loss/logits": 0.13684490323066711, "loss/reg": 3.04836175928358e-05, "step": 877 }, { "epoch": 0.10975, "grad_norm": 2.2499160766601562, "grad_norm_var": 0.14590183794275807, "learning_rate": 0.0001, "loss": 1.2143, "loss/crossentropy": 2.618100881576538, "loss/hidden": 1.0625, "loss/logits": 0.15145710110664368, "loss/reg": 3.048092003155034e-05, "step": 878 }, { "epoch": 0.109875, "grad_norm": 2.1639883518218994, "grad_norm_var": 0.1407761266771947, "learning_rate": 0.0001, "loss": 1.1532, "loss/crossentropy": 2.4676895141601562, "loss/hidden": 1.015625, "loss/logits": 0.13726571202278137, "loss/reg": 3.0473505830741487e-05, "step": 879 }, { "epoch": 0.11, "grad_norm": 2.9332988262176514, "grad_norm_var": 0.16880933318345934, "learning_rate": 0.0001, "loss": 1.2218, "loss/crossentropy": 2.739016056060791, "loss/hidden": 1.0546875, "loss/logits": 0.16677409410476685, "loss/reg": 3.04626373690553e-05, "step": 880 }, { "epoch": 0.110125, "grad_norm": 2.377056121826172, "grad_norm_var": 0.16925066109580858, "learning_rate": 0.0001, "loss": 1.3437, "loss/crossentropy": 2.279279947280884, "loss/hidden": 1.15625, "loss/logits": 0.18712583184242249, "loss/reg": 3.0454097213805653e-05, "step": 881 }, { "epoch": 0.11025, "grad_norm": 1.7906708717346191, "grad_norm_var": 0.18002714541507406, "learning_rate": 0.0001, "loss": 1.2342, "loss/crossentropy": 2.567777395248413, "loss/hidden": 1.0703125, "loss/logits": 0.16357748210430145, "loss/reg": 3.0445706215687096e-05, "step": 882 }, { "epoch": 0.110375, "grad_norm": 2.8536040782928467, "grad_norm_var": 0.19871124531315582, "learning_rate": 0.0001, "loss": 1.4335, "loss/crossentropy": 2.493900775909424, "loss/hidden": 1.234375, "loss/logits": 0.19878800213336945, "loss/reg": 3.043895776499994e-05, "step": 883 }, { "epoch": 0.1105, "grad_norm": 5.356804847717285, "grad_norm_var": 0.7732547721650782, "learning_rate": 0.0001, "loss": 1.3051, "loss/crossentropy": 2.786006212234497, "loss/hidden": 1.140625, "loss/logits": 0.1641331911087036, "loss/reg": 3.043297510885168e-05, "step": 884 }, { "epoch": 0.110625, "grad_norm": 2.8684051036834717, "grad_norm_var": 0.7678671191599996, "learning_rate": 0.0001, "loss": 1.2754, "loss/crossentropy": 2.2656214237213135, "loss/hidden": 1.1171875, "loss/logits": 0.15792518854141235, "loss/reg": 3.0427321689785458e-05, "step": 885 }, { "epoch": 0.11075, "grad_norm": 1.9989514350891113, "grad_norm_var": 0.7769621885934072, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.489605188369751, "loss/hidden": 1.109375, "loss/logits": 0.16912804543972015, "loss/reg": 3.0422537747654133e-05, "step": 886 }, { "epoch": 0.110875, "grad_norm": 2.3265321254730225, "grad_norm_var": 0.7521879700078027, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.5821127891540527, "loss/hidden": 1.109375, "loss/logits": 0.21401891112327576, "loss/reg": 3.0414203138207085e-05, "step": 887 }, { "epoch": 0.111, "grad_norm": 2.1429810523986816, "grad_norm_var": 0.7559897385715504, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.565568208694458, "loss/hidden": 1.0859375, "loss/logits": 0.19918876886367798, "loss/reg": 3.040812589460984e-05, "step": 888 }, { "epoch": 0.111125, "grad_norm": 1.6851451396942139, "grad_norm_var": 0.788835305036081, "learning_rate": 0.0001, "loss": 1.1598, "loss/crossentropy": 2.294679641723633, "loss/hidden": 1.015625, "loss/logits": 0.14390423893928528, "loss/reg": 3.0401906769839115e-05, "step": 889 }, { "epoch": 0.11125, "grad_norm": 19.322298049926758, "grad_norm_var": 18.406652883554056, "learning_rate": 0.0001, "loss": 1.2189, "loss/crossentropy": 2.5112252235412598, "loss/hidden": 1.0625, "loss/logits": 0.156059131026268, "loss/reg": 3.0395181966014206e-05, "step": 890 }, { "epoch": 0.111375, "grad_norm": 2.8927786350250244, "grad_norm_var": 18.191408653915317, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.4821994304656982, "loss/hidden": 1.1875, "loss/logits": 0.21269716322422028, "loss/reg": 3.0386423532036133e-05, "step": 891 }, { "epoch": 0.1115, "grad_norm": 4.272779941558838, "grad_norm_var": 18.070214239000837, "learning_rate": 0.0001, "loss": 1.4083, "loss/crossentropy": 2.565413475036621, "loss/hidden": 1.21875, "loss/logits": 0.18922537565231323, "loss/reg": 3.0379227609955706e-05, "step": 892 }, { "epoch": 0.111625, "grad_norm": 2.1861484050750732, "grad_norm_var": 18.21247030426496, "learning_rate": 0.0001, "loss": 1.1391, "loss/crossentropy": 2.397334575653076, "loss/hidden": 0.98828125, "loss/logits": 0.15054136514663696, "loss/reg": 3.0370387321454473e-05, "step": 893 }, { "epoch": 0.11175, "grad_norm": 2.010301113128662, "grad_norm_var": 18.262829011154675, "learning_rate": 0.0001, "loss": 1.367, "loss/crossentropy": 2.3572635650634766, "loss/hidden": 1.1796875, "loss/logits": 0.1869942843914032, "loss/reg": 3.036250564036891e-05, "step": 894 }, { "epoch": 0.111875, "grad_norm": 2.95393967628479, "grad_norm_var": 18.140167373756306, "learning_rate": 0.0001, "loss": 1.2857, "loss/crossentropy": 2.755155086517334, "loss/hidden": 1.1171875, "loss/logits": 0.16822287440299988, "loss/reg": 3.035445297427941e-05, "step": 895 }, { "epoch": 0.112, "grad_norm": 1.8677095174789429, "grad_norm_var": 18.32691930612881, "learning_rate": 0.0001, "loss": 1.0605, "loss/crossentropy": 2.6163880825042725, "loss/hidden": 0.9375, "loss/logits": 0.12269198894500732, "loss/reg": 3.034656765521504e-05, "step": 896 }, { "epoch": 0.112125, "grad_norm": 1.8834550380706787, "grad_norm_var": 18.42800558442811, "learning_rate": 0.0001, "loss": 1.251, "loss/crossentropy": 2.4362659454345703, "loss/hidden": 1.0703125, "loss/logits": 0.18038895726203918, "loss/reg": 3.0335075280163437e-05, "step": 897 }, { "epoch": 0.11225, "grad_norm": 1.9402846097946167, "grad_norm_var": 18.39229818615, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.34395170211792, "loss/hidden": 0.99609375, "loss/logits": 0.14177373051643372, "loss/reg": 3.0326225896715187e-05, "step": 898 }, { "epoch": 0.112375, "grad_norm": 1.5321539640426636, "grad_norm_var": 18.643542516203766, "learning_rate": 0.0001, "loss": 1.1133, "loss/crossentropy": 2.346829652786255, "loss/hidden": 0.96484375, "loss/logits": 0.14812731742858887, "loss/reg": 3.031741471204441e-05, "step": 899 }, { "epoch": 0.1125, "grad_norm": 2.4379284381866455, "grad_norm_var": 18.483572622867825, "learning_rate": 0.0001, "loss": 1.5778, "loss/crossentropy": 2.4154257774353027, "loss/hidden": 1.359375, "loss/logits": 0.21813495457172394, "loss/reg": 3.0310697184177116e-05, "step": 900 }, { "epoch": 0.112625, "grad_norm": 1.8157447576522827, "grad_norm_var": 18.62675428293736, "learning_rate": 0.0001, "loss": 1.0782, "loss/crossentropy": 2.5257809162139893, "loss/hidden": 0.9453125, "loss/logits": 0.1325758695602417, "loss/reg": 3.030002881132532e-05, "step": 901 }, { "epoch": 0.11275, "grad_norm": 1.805254578590393, "grad_norm_var": 18.66345763452514, "learning_rate": 0.0001, "loss": 1.1999, "loss/crossentropy": 2.5594637393951416, "loss/hidden": 1.046875, "loss/logits": 0.1527424305677414, "loss/reg": 3.0291475923149846e-05, "step": 902 }, { "epoch": 0.112875, "grad_norm": 2.0050342082977295, "grad_norm_var": 18.712384675596915, "learning_rate": 0.0001, "loss": 1.2333, "loss/crossentropy": 2.142164945602417, "loss/hidden": 1.078125, "loss/logits": 0.1548900008201599, "loss/reg": 3.028149512829259e-05, "step": 903 }, { "epoch": 0.113, "grad_norm": 2.051323175430298, "grad_norm_var": 18.72701455166655, "learning_rate": 0.0001, "loss": 1.1036, "loss/crossentropy": 2.466301918029785, "loss/hidden": 0.96875, "loss/logits": 0.13455404341220856, "loss/reg": 3.027167076652404e-05, "step": 904 }, { "epoch": 0.113125, "grad_norm": 1.9347882270812988, "grad_norm_var": 18.67744451765512, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.8280982971191406, "loss/hidden": 1.0, "loss/logits": 0.16173742711544037, "loss/reg": 3.0260214771260507e-05, "step": 905 }, { "epoch": 0.11325, "grad_norm": 1.9693732261657715, "grad_norm_var": 0.44271487091433853, "learning_rate": 0.0001, "loss": 1.2495, "loss/crossentropy": 2.420048713684082, "loss/hidden": 1.078125, "loss/logits": 0.17107552289962769, "loss/reg": 3.0245597372413613e-05, "step": 906 }, { "epoch": 0.113375, "grad_norm": 2.4886035919189453, "grad_norm_var": 0.41680001650657716, "learning_rate": 0.0001, "loss": 1.1317, "loss/crossentropy": 2.844113826751709, "loss/hidden": 1.0, "loss/logits": 0.131430983543396, "loss/reg": 3.0234865334932692e-05, "step": 907 }, { "epoch": 0.1135, "grad_norm": 1.840441346168518, "grad_norm_var": 0.11342421101602417, "learning_rate": 0.0001, "loss": 0.9626, "loss/crossentropy": 2.434068202972412, "loss/hidden": 0.8515625, "loss/logits": 0.11073873192071915, "loss/reg": 3.0223776775528677e-05, "step": 908 }, { "epoch": 0.113625, "grad_norm": 1.8804395198822021, "grad_norm_var": 0.11351828281440793, "learning_rate": 0.0001, "loss": 1.2627, "loss/crossentropy": 2.553520441055298, "loss/hidden": 1.09375, "loss/logits": 0.16862741112709045, "loss/reg": 3.0212599085643888e-05, "step": 909 }, { "epoch": 0.11375, "grad_norm": 3.197678804397583, "grad_norm_var": 0.19914182473502062, "learning_rate": 0.0001, "loss": 1.2905, "loss/crossentropy": 2.499753475189209, "loss/hidden": 1.1171875, "loss/logits": 0.1730211228132248, "loss/reg": 3.0199351385817863e-05, "step": 910 }, { "epoch": 0.113875, "grad_norm": 1.9219839572906494, "grad_norm_var": 0.14823876643624204, "learning_rate": 0.0001, "loss": 1.2543, "loss/crossentropy": 2.252265453338623, "loss/hidden": 1.1015625, "loss/logits": 0.1524173766374588, "loss/reg": 3.0187717129592784e-05, "step": 911 }, { "epoch": 0.114, "grad_norm": 2.2684502601623535, "grad_norm_var": 0.14929642441132382, "learning_rate": 0.0001, "loss": 1.2484, "loss/crossentropy": 2.674544095993042, "loss/hidden": 1.0625, "loss/logits": 0.18555624783039093, "loss/reg": 3.017616290890146e-05, "step": 912 }, { "epoch": 0.114125, "grad_norm": 2.682770252227783, "grad_norm_var": 0.17032645440362532, "learning_rate": 0.0001, "loss": 1.4042, "loss/crossentropy": 2.5020034313201904, "loss/hidden": 1.2109375, "loss/logits": 0.19293466210365295, "loss/reg": 3.0163550036377273e-05, "step": 913 }, { "epoch": 0.11425, "grad_norm": 2.211789131164551, "grad_norm_var": 0.16876210134861747, "learning_rate": 0.0001, "loss": 1.1638, "loss/crossentropy": 2.8326942920684814, "loss/hidden": 1.0234375, "loss/logits": 0.14011076092720032, "loss/reg": 3.015105721715372e-05, "step": 914 }, { "epoch": 0.114375, "grad_norm": 1.9461342096328735, "grad_norm_var": 0.14659883344723781, "learning_rate": 0.0001, "loss": 1.242, "loss/crossentropy": 2.554008960723877, "loss/hidden": 1.0625, "loss/logits": 0.179220050573349, "loss/reg": 3.014074536622502e-05, "step": 915 }, { "epoch": 0.1145, "grad_norm": 2.0787339210510254, "grad_norm_var": 0.14104581058875282, "learning_rate": 0.0001, "loss": 1.3882, "loss/crossentropy": 2.234307050704956, "loss/hidden": 1.2109375, "loss/logits": 0.17698973417282104, "loss/reg": 3.0129771403153427e-05, "step": 916 }, { "epoch": 0.114625, "grad_norm": 1.7083910703659058, "grad_norm_var": 0.1462808949880042, "learning_rate": 0.0001, "loss": 1.1463, "loss/crossentropy": 2.667232036590576, "loss/hidden": 1.0, "loss/logits": 0.14599871635437012, "loss/reg": 3.0115046683931723e-05, "step": 917 }, { "epoch": 0.11475, "grad_norm": 2.0105397701263428, "grad_norm_var": 0.14017797617193484, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.4737157821655273, "loss/hidden": 1.1875, "loss/logits": 0.19115224480628967, "loss/reg": 3.0103803510428406e-05, "step": 918 }, { "epoch": 0.114875, "grad_norm": 2.173673391342163, "grad_norm_var": 0.13898185573586228, "learning_rate": 0.0001, "loss": 1.1508, "loss/crossentropy": 2.6369824409484863, "loss/hidden": 1.015625, "loss/logits": 0.134853333234787, "loss/reg": 3.0090330255916342e-05, "step": 919 }, { "epoch": 0.115, "grad_norm": 1.7742769718170166, "grad_norm_var": 0.14734354783532472, "learning_rate": 0.0001, "loss": 1.2682, "loss/crossentropy": 2.511277914047241, "loss/hidden": 1.125, "loss/logits": 0.1428862065076828, "loss/reg": 3.008243402291555e-05, "step": 920 }, { "epoch": 0.115125, "grad_norm": 2.2685179710388184, "grad_norm_var": 0.14559568575267523, "learning_rate": 0.0001, "loss": 1.0555, "loss/crossentropy": 2.741913318634033, "loss/hidden": 0.921875, "loss/logits": 0.1333094835281372, "loss/reg": 3.006882434419822e-05, "step": 921 }, { "epoch": 0.11525, "grad_norm": 2.0138773918151855, "grad_norm_var": 0.14463957141116968, "learning_rate": 0.0001, "loss": 1.0304, "loss/crossentropy": 2.6428472995758057, "loss/hidden": 0.8984375, "loss/logits": 0.13170146942138672, "loss/reg": 3.0055622119107284e-05, "step": 922 }, { "epoch": 0.115375, "grad_norm": 2.0200250148773193, "grad_norm_var": 0.13746634960929824, "learning_rate": 0.0001, "loss": 1.3501, "loss/crossentropy": 2.382972478866577, "loss/hidden": 1.171875, "loss/logits": 0.17797118425369263, "loss/reg": 3.0043020160519518e-05, "step": 923 }, { "epoch": 0.1155, "grad_norm": 1.8085798025131226, "grad_norm_var": 0.13873805613438883, "learning_rate": 0.0001, "loss": 1.2652, "loss/crossentropy": 2.305152416229248, "loss/hidden": 1.09375, "loss/logits": 0.17112156748771667, "loss/reg": 3.0029701520106755e-05, "step": 924 }, { "epoch": 0.115625, "grad_norm": 2.2086219787597656, "grad_norm_var": 0.13486150837332828, "learning_rate": 0.0001, "loss": 1.2893, "loss/crossentropy": 2.4080841541290283, "loss/hidden": 1.140625, "loss/logits": 0.14834384620189667, "loss/reg": 3.0015635275049135e-05, "step": 925 }, { "epoch": 0.11575, "grad_norm": 2.3216629028320312, "grad_norm_var": 0.059679650378250376, "learning_rate": 0.0001, "loss": 1.3637, "loss/crossentropy": 2.4338295459747314, "loss/hidden": 1.1796875, "loss/logits": 0.18370847404003143, "loss/reg": 3.0002041967236437e-05, "step": 926 }, { "epoch": 0.115875, "grad_norm": 2.669752597808838, "grad_norm_var": 0.07801232102321709, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.89715576171875, "loss/hidden": 0.953125, "loss/logits": 0.1469748616218567, "loss/reg": 2.998723357450217e-05, "step": 927 }, { "epoch": 0.116, "grad_norm": 1.8070098161697388, "grad_norm_var": 0.08313198661767274, "learning_rate": 0.0001, "loss": 1.3002, "loss/crossentropy": 2.311203718185425, "loss/hidden": 1.140625, "loss/logits": 0.15929476916790009, "loss/reg": 2.997562478412874e-05, "step": 928 }, { "epoch": 0.116125, "grad_norm": 1.9325246810913086, "grad_norm_var": 0.06066759568447632, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.5550014972686768, "loss/hidden": 1.1171875, "loss/logits": 0.15711072087287903, "loss/reg": 2.9964614441269077e-05, "step": 929 }, { "epoch": 0.11625, "grad_norm": 5.985466957092285, "grad_norm_var": 1.027266842132435, "learning_rate": 0.0001, "loss": 1.3972, "loss/crossentropy": 3.0680112838745117, "loss/hidden": 1.234375, "loss/logits": 0.16257420182228088, "loss/reg": 2.9954886485938914e-05, "step": 930 }, { "epoch": 0.116375, "grad_norm": 2.5261380672454834, "grad_norm_var": 1.0212753434993587, "learning_rate": 0.0001, "loss": 1.252, "loss/crossentropy": 2.5753066539764404, "loss/hidden": 1.078125, "loss/logits": 0.17356222867965698, "loss/reg": 2.9943923436803743e-05, "step": 931 }, { "epoch": 0.1165, "grad_norm": 6.641204833984375, "grad_norm_var": 2.1683749086823068, "learning_rate": 0.0001, "loss": 1.721, "loss/crossentropy": 2.6875956058502197, "loss/hidden": 1.4453125, "loss/logits": 0.275409460067749, "loss/reg": 2.9933024052297696e-05, "step": 932 }, { "epoch": 0.116625, "grad_norm": 2.314908027648926, "grad_norm_var": 2.1178968833443887, "learning_rate": 0.0001, "loss": 1.1737, "loss/crossentropy": 2.6610100269317627, "loss/hidden": 1.0078125, "loss/logits": 0.1656096875667572, "loss/reg": 2.9921167879365385e-05, "step": 933 }, { "epoch": 0.11675, "grad_norm": 2.40584135055542, "grad_norm_var": 2.0937064624222272, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.402122735977173, "loss/hidden": 1.2109375, "loss/logits": 0.21065916121006012, "loss/reg": 2.9912616810179316e-05, "step": 934 }, { "epoch": 0.116875, "grad_norm": 1.8654179573059082, "grad_norm_var": 2.120435350833001, "learning_rate": 0.0001, "loss": 1.329, "loss/crossentropy": 2.288451671600342, "loss/hidden": 1.1484375, "loss/logits": 0.18031169474124908, "loss/reg": 2.9901168090873398e-05, "step": 935 }, { "epoch": 0.117, "grad_norm": 2.2229185104370117, "grad_norm_var": 2.0800180450379466, "learning_rate": 0.0001, "loss": 1.3147, "loss/crossentropy": 2.4958393573760986, "loss/hidden": 1.1484375, "loss/logits": 0.16593661904335022, "loss/reg": 2.9892928068875335e-05, "step": 936 }, { "epoch": 0.117125, "grad_norm": 2.3262715339660645, "grad_norm_var": 2.0769941509731638, "learning_rate": 0.0001, "loss": 1.1184, "loss/crossentropy": 2.465583562850952, "loss/hidden": 0.9765625, "loss/logits": 0.14153215289115906, "loss/reg": 2.9885473850299604e-05, "step": 937 }, { "epoch": 0.11725, "grad_norm": 1.9139127731323242, "grad_norm_var": 2.0866556628890844, "learning_rate": 0.0001, "loss": 1.1842, "loss/crossentropy": 2.4913489818573, "loss/hidden": 1.03125, "loss/logits": 0.15265440940856934, "loss/reg": 2.987419611599762e-05, "step": 938 }, { "epoch": 0.117375, "grad_norm": 10.830428123474121, "grad_norm_var": 6.156193101325849, "learning_rate": 0.0001, "loss": 1.2625, "loss/crossentropy": 2.3177073001861572, "loss/hidden": 1.09375, "loss/logits": 0.16842305660247803, "loss/reg": 2.986286926898174e-05, "step": 939 }, { "epoch": 0.1175, "grad_norm": 2.227233648300171, "grad_norm_var": 6.087451956699234, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 2.335895538330078, "loss/hidden": 1.25, "loss/logits": 0.20651838183403015, "loss/reg": 2.9853996238671243e-05, "step": 940 }, { "epoch": 0.117625, "grad_norm": 2.052086114883423, "grad_norm_var": 6.110978489678372, "learning_rate": 0.0001, "loss": 1.2067, "loss/crossentropy": 2.5038609504699707, "loss/hidden": 1.046875, "loss/logits": 0.15951794385910034, "loss/reg": 2.9841248760931194e-05, "step": 941 }, { "epoch": 0.11775, "grad_norm": 2.391615390777588, "grad_norm_var": 6.102600788640365, "learning_rate": 0.0001, "loss": 1.1734, "loss/crossentropy": 2.3519561290740967, "loss/hidden": 1.0078125, "loss/logits": 0.1652698963880539, "loss/reg": 2.9828568585799076e-05, "step": 942 }, { "epoch": 0.117875, "grad_norm": 2.6826672554016113, "grad_norm_var": 6.101599921092475, "learning_rate": 0.0001, "loss": 1.1231, "loss/crossentropy": 2.7894511222839355, "loss/hidden": 0.9609375, "loss/logits": 0.16190429031848907, "loss/reg": 2.9821638236171566e-05, "step": 943 }, { "epoch": 0.118, "grad_norm": 2.0484187602996826, "grad_norm_var": 6.058542783290856, "learning_rate": 0.0001, "loss": 1.3326, "loss/crossentropy": 2.5720252990722656, "loss/hidden": 1.1328125, "loss/logits": 0.1994791030883789, "loss/reg": 2.981125726364553e-05, "step": 944 }, { "epoch": 0.118125, "grad_norm": 2.304121971130371, "grad_norm_var": 6.000760397434108, "learning_rate": 0.0001, "loss": 1.2185, "loss/crossentropy": 2.6887125968933105, "loss/hidden": 1.046875, "loss/logits": 0.17130357027053833, "loss/reg": 2.9803662982885726e-05, "step": 945 }, { "epoch": 0.11825, "grad_norm": 2.098452568054199, "grad_norm_var": 5.551285095967417, "learning_rate": 0.0001, "loss": 1.2843, "loss/crossentropy": 2.1777865886688232, "loss/hidden": 1.109375, "loss/logits": 0.17466512322425842, "loss/reg": 2.979521559609566e-05, "step": 946 }, { "epoch": 0.118375, "grad_norm": 2.4448280334472656, "grad_norm_var": 5.557412656069853, "learning_rate": 0.0001, "loss": 1.2825, "loss/crossentropy": 2.4249322414398193, "loss/hidden": 1.09375, "loss/logits": 0.18843106925487518, "loss/reg": 2.9785163860651664e-05, "step": 947 }, { "epoch": 0.1185, "grad_norm": 2.038378953933716, "grad_norm_var": 4.6764411267263375, "learning_rate": 0.0001, "loss": 1.2371, "loss/crossentropy": 2.5682852268218994, "loss/hidden": 1.046875, "loss/logits": 0.18992964923381805, "loss/reg": 2.9779299438814633e-05, "step": 948 }, { "epoch": 0.118625, "grad_norm": 2.4998066425323486, "grad_norm_var": 4.667593369117897, "learning_rate": 0.0001, "loss": 1.2371, "loss/crossentropy": 2.66314697265625, "loss/hidden": 1.0625, "loss/logits": 0.17428331077098846, "loss/reg": 2.9770533728878945e-05, "step": 949 }, { "epoch": 0.11875, "grad_norm": 2.7099368572235107, "grad_norm_var": 4.658525692998186, "learning_rate": 0.0001, "loss": 1.2726, "loss/crossentropy": 2.6387088298797607, "loss/hidden": 1.09375, "loss/logits": 0.1785287708044052, "loss/reg": 2.9763001293758862e-05, "step": 950 }, { "epoch": 0.118875, "grad_norm": 2.2414186000823975, "grad_norm_var": 4.620957579511203, "learning_rate": 0.0001, "loss": 1.3926, "loss/crossentropy": 2.2660908699035645, "loss/hidden": 1.203125, "loss/logits": 0.18920645117759705, "loss/reg": 2.975258212245535e-05, "step": 951 }, { "epoch": 0.119, "grad_norm": 1.984645962715149, "grad_norm_var": 4.643301277280139, "learning_rate": 0.0001, "loss": 1.1178, "loss/crossentropy": 2.6594138145446777, "loss/hidden": 0.9765625, "loss/logits": 0.14097043871879578, "loss/reg": 2.9740975151071325e-05, "step": 952 }, { "epoch": 0.119125, "grad_norm": 2.6357920169830322, "grad_norm_var": 4.629753372228165, "learning_rate": 0.0001, "loss": 1.2005, "loss/crossentropy": 2.621708631515503, "loss/hidden": 1.015625, "loss/logits": 0.1845404952764511, "loss/reg": 2.973414484586101e-05, "step": 953 }, { "epoch": 0.11925, "grad_norm": 1.8917043209075928, "grad_norm_var": 4.632464228940502, "learning_rate": 0.0001, "loss": 1.2065, "loss/crossentropy": 2.599364995956421, "loss/hidden": 1.046875, "loss/logits": 0.15931665897369385, "loss/reg": 2.972304901049938e-05, "step": 954 }, { "epoch": 0.119375, "grad_norm": 3.248734712600708, "grad_norm_var": 0.12498233063500468, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.567194938659668, "loss/hidden": 1.15625, "loss/logits": 0.18617364764213562, "loss/reg": 2.9710497983614914e-05, "step": 955 }, { "epoch": 0.1195, "grad_norm": 2.8562896251678467, "grad_norm_var": 0.13994241611030506, "learning_rate": 0.0001, "loss": 1.3554, "loss/crossentropy": 2.6916966438293457, "loss/hidden": 1.140625, "loss/logits": 0.21445384621620178, "loss/reg": 2.9702167012146674e-05, "step": 956 }, { "epoch": 0.119625, "grad_norm": 2.176276445388794, "grad_norm_var": 0.13542593205940623, "learning_rate": 0.0001, "loss": 1.2327, "loss/crossentropy": 2.4443368911743164, "loss/hidden": 1.078125, "loss/logits": 0.15425175428390503, "loss/reg": 2.969575689348858e-05, "step": 957 }, { "epoch": 0.11975, "grad_norm": 1.8663557767868042, "grad_norm_var": 0.1526136914943218, "learning_rate": 0.0001, "loss": 1.2129, "loss/crossentropy": 2.619982957839966, "loss/hidden": 1.0546875, "loss/logits": 0.1578969955444336, "loss/reg": 2.9686147172469646e-05, "step": 958 }, { "epoch": 0.119875, "grad_norm": 2.1611175537109375, "grad_norm_var": 0.1470364788056391, "learning_rate": 0.0001, "loss": 1.2601, "loss/crossentropy": 2.4998703002929688, "loss/hidden": 1.0703125, "loss/logits": 0.18950100243091583, "loss/reg": 2.9678791179321706e-05, "step": 959 }, { "epoch": 0.12, "grad_norm": 2.9389288425445557, "grad_norm_var": 0.16371311291879256, "learning_rate": 0.0001, "loss": 1.7144, "loss/crossentropy": 2.059462785720825, "loss/hidden": 1.4765625, "loss/logits": 0.2375560998916626, "loss/reg": 2.9669770810869522e-05, "step": 960 }, { "epoch": 0.120125, "grad_norm": 1.8231537342071533, "grad_norm_var": 0.18310454629205022, "learning_rate": 0.0001, "loss": 1.1061, "loss/crossentropy": 2.686415195465088, "loss/hidden": 0.96875, "loss/logits": 0.1370917558670044, "loss/reg": 2.9663093300769106e-05, "step": 961 }, { "epoch": 0.12025, "grad_norm": 2.095581293106079, "grad_norm_var": 0.1832017416796153, "learning_rate": 0.0001, "loss": 1.2927, "loss/crossentropy": 2.542818307876587, "loss/hidden": 1.109375, "loss/logits": 0.18301713466644287, "loss/reg": 2.9656326660187915e-05, "step": 962 }, { "epoch": 0.120375, "grad_norm": 1.9731796979904175, "grad_norm_var": 0.1911924995621158, "learning_rate": 0.0001, "loss": 1.2607, "loss/crossentropy": 2.7729532718658447, "loss/hidden": 1.0703125, "loss/logits": 0.19010211527347565, "loss/reg": 2.9653167075593956e-05, "step": 963 }, { "epoch": 0.1205, "grad_norm": 2.3905394077301025, "grad_norm_var": 0.18565761023355182, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.4821271896362305, "loss/hidden": 1.203125, "loss/logits": 0.18802939355373383, "loss/reg": 2.9650735086761415e-05, "step": 964 }, { "epoch": 0.120625, "grad_norm": 2.089430809020996, "grad_norm_var": 0.18762185350676308, "learning_rate": 0.0001, "loss": 1.1282, "loss/crossentropy": 2.6699776649475098, "loss/hidden": 0.96875, "loss/logits": 0.15917512774467468, "loss/reg": 2.9652708690264262e-05, "step": 965 }, { "epoch": 0.12075, "grad_norm": 2.2462618350982666, "grad_norm_var": 0.1768091784440002, "learning_rate": 0.0001, "loss": 1.0844, "loss/crossentropy": 2.1657514572143555, "loss/hidden": 0.94140625, "loss/logits": 0.14271298050880432, "loss/reg": 2.9650564101757482e-05, "step": 966 }, { "epoch": 0.120875, "grad_norm": 2.1140072345733643, "grad_norm_var": 0.17862723062111877, "learning_rate": 0.0001, "loss": 1.1495, "loss/crossentropy": 2.588456392288208, "loss/hidden": 1.0, "loss/logits": 0.1492307484149933, "loss/reg": 2.9643733796547167e-05, "step": 967 }, { "epoch": 0.121, "grad_norm": 4.149561405181885, "grad_norm_var": 0.38608389632931334, "learning_rate": 0.0001, "loss": 1.4457, "loss/crossentropy": 2.559540033340454, "loss/hidden": 1.203125, "loss/logits": 0.24225984513759613, "loss/reg": 2.9640305001521483e-05, "step": 968 }, { "epoch": 0.121125, "grad_norm": 2.4170870780944824, "grad_norm_var": 0.3826657741188169, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.5627620220184326, "loss/hidden": 1.140625, "loss/logits": 0.1958288550376892, "loss/reg": 2.9633007216034457e-05, "step": 969 }, { "epoch": 0.12125, "grad_norm": 2.025369882583618, "grad_norm_var": 0.3746809845195796, "learning_rate": 0.0001, "loss": 1.1745, "loss/crossentropy": 2.51656174659729, "loss/hidden": 1.0234375, "loss/logits": 0.15072372555732727, "loss/reg": 2.9625756724271923e-05, "step": 970 }, { "epoch": 0.121375, "grad_norm": 3.451981544494629, "grad_norm_var": 0.3999720570717547, "learning_rate": 0.0001, "loss": 1.4399, "loss/crossentropy": 1.9385815858840942, "loss/hidden": 1.3125, "loss/logits": 0.12711143493652344, "loss/reg": 2.9617278414661996e-05, "step": 971 }, { "epoch": 0.1215, "grad_norm": 1.8940742015838623, "grad_norm_var": 0.4023062621100176, "learning_rate": 0.0001, "loss": 1.348, "loss/crossentropy": 2.650892734527588, "loss/hidden": 1.15625, "loss/logits": 0.19150036573410034, "loss/reg": 2.961178142868448e-05, "step": 972 }, { "epoch": 0.121625, "grad_norm": 1.8390206098556519, "grad_norm_var": 0.41782537712418366, "learning_rate": 0.0001, "loss": 1.1005, "loss/crossentropy": 2.235429525375366, "loss/hidden": 0.95703125, "loss/logits": 0.14319097995758057, "loss/reg": 2.9602917493321e-05, "step": 973 }, { "epoch": 0.12175, "grad_norm": 3.1122865676879883, "grad_norm_var": 0.4357929705368214, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.6123275756835938, "loss/hidden": 1.234375, "loss/logits": 0.23446011543273926, "loss/reg": 2.9595235901069827e-05, "step": 974 }, { "epoch": 0.121875, "grad_norm": 2.0248794555664062, "grad_norm_var": 0.44165743776062505, "learning_rate": 0.0001, "loss": 1.2073, "loss/crossentropy": 2.6745986938476562, "loss/hidden": 1.03125, "loss/logits": 0.17576925456523895, "loss/reg": 2.9589489713544026e-05, "step": 975 }, { "epoch": 0.122, "grad_norm": 1.9854148626327515, "grad_norm_var": 0.4314376508313742, "learning_rate": 0.0001, "loss": 1.2895, "loss/crossentropy": 2.4620821475982666, "loss/hidden": 1.1171875, "loss/logits": 0.17201289534568787, "loss/reg": 2.9581056878669187e-05, "step": 976 }, { "epoch": 0.122125, "grad_norm": 15.087516784667969, "grad_norm_var": 10.49260658100263, "learning_rate": 0.0001, "loss": 1.3515, "loss/crossentropy": 2.4697861671447754, "loss/hidden": 1.1484375, "loss/logits": 0.2027597725391388, "loss/reg": 2.9572154744528234e-05, "step": 977 }, { "epoch": 0.12225, "grad_norm": 2.448667049407959, "grad_norm_var": 10.449298409713819, "learning_rate": 0.0001, "loss": 1.1618, "loss/crossentropy": 2.2382848262786865, "loss/hidden": 1.0078125, "loss/logits": 0.15371698141098022, "loss/reg": 2.956432945211418e-05, "step": 978 }, { "epoch": 0.122375, "grad_norm": 1.7963190078735352, "grad_norm_var": 10.480256191034487, "learning_rate": 0.0001, "loss": 1.2832, "loss/crossentropy": 2.5349695682525635, "loss/hidden": 1.109375, "loss/logits": 0.17357102036476135, "loss/reg": 2.9555221772170626e-05, "step": 979 }, { "epoch": 0.1225, "grad_norm": 2.065178155899048, "grad_norm_var": 10.521642133051824, "learning_rate": 0.0001, "loss": 1.2602, "loss/crossentropy": 2.4281973838806152, "loss/hidden": 1.0625, "loss/logits": 0.1973562091588974, "loss/reg": 2.954368210339453e-05, "step": 980 }, { "epoch": 0.122625, "grad_norm": 2.2039473056793213, "grad_norm_var": 10.505936873267668, "learning_rate": 0.0001, "loss": 1.1894, "loss/crossentropy": 2.483654737472534, "loss/hidden": 1.0390625, "loss/logits": 0.1500002145767212, "loss/reg": 2.953330840682611e-05, "step": 981 }, { "epoch": 0.12275, "grad_norm": 2.1155283451080322, "grad_norm_var": 10.523261114072442, "learning_rate": 0.0001, "loss": 1.1604, "loss/crossentropy": 2.6215789318084717, "loss/hidden": 1.015625, "loss/logits": 0.14448747038841248, "loss/reg": 2.9524298952310346e-05, "step": 982 }, { "epoch": 0.122875, "grad_norm": 2.112273931503296, "grad_norm_var": 10.523505505811658, "learning_rate": 0.0001, "loss": 1.1496, "loss/crossentropy": 2.4145495891571045, "loss/hidden": 1.0, "loss/logits": 0.14932119846343994, "loss/reg": 2.9514942070818506e-05, "step": 983 }, { "epoch": 0.123, "grad_norm": 2.0390849113464355, "grad_norm_var": 10.526402089225046, "learning_rate": 0.0001, "loss": 1.0821, "loss/crossentropy": 2.318721294403076, "loss/hidden": 0.94140625, "loss/logits": 0.14042669534683228, "loss/reg": 2.9506210921681486e-05, "step": 984 }, { "epoch": 0.123125, "grad_norm": 4.910560607910156, "grad_norm_var": 10.708338697617826, "learning_rate": 0.0001, "loss": 2.0722, "loss/crossentropy": 3.077894926071167, "loss/hidden": 1.734375, "loss/logits": 0.3375716805458069, "loss/reg": 2.9494345653802156e-05, "step": 985 }, { "epoch": 0.12325, "grad_norm": 2.085634469985962, "grad_norm_var": 10.699171348673152, "learning_rate": 0.0001, "loss": 1.2434, "loss/crossentropy": 2.197237730026245, "loss/hidden": 1.09375, "loss/logits": 0.14937615394592285, "loss/reg": 2.9486029234249145e-05, "step": 986 }, { "epoch": 0.123375, "grad_norm": 1.863654375076294, "grad_norm_var": 10.803115672749165, "learning_rate": 0.0001, "loss": 1.1457, "loss/crossentropy": 2.4037911891937256, "loss/hidden": 0.9921875, "loss/logits": 0.1532135307788849, "loss/reg": 2.9479617296601646e-05, "step": 987 }, { "epoch": 0.1235, "grad_norm": 2.7112700939178467, "grad_norm_var": 10.713565411311919, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.576564073562622, "loss/hidden": 1.140625, "loss/logits": 0.21411700546741486, "loss/reg": 2.9471080779330805e-05, "step": 988 }, { "epoch": 0.123625, "grad_norm": 4.951111316680908, "grad_norm_var": 10.774867724807326, "learning_rate": 0.0001, "loss": 1.5529, "loss/crossentropy": 2.4628682136535645, "loss/hidden": 1.34375, "loss/logits": 0.208875373005867, "loss/reg": 2.9464130420819856e-05, "step": 989 }, { "epoch": 0.12375, "grad_norm": 2.1607253551483154, "grad_norm_var": 10.860932085087141, "learning_rate": 0.0001, "loss": 1.158, "loss/crossentropy": 2.727191209793091, "loss/hidden": 0.9921875, "loss/logits": 0.16549015045166016, "loss/reg": 2.9459288271027617e-05, "step": 990 }, { "epoch": 0.123875, "grad_norm": 2.0294203758239746, "grad_norm_var": 10.860170359418529, "learning_rate": 0.0001, "loss": 1.0676, "loss/crossentropy": 2.377748489379883, "loss/hidden": 0.9453125, "loss/logits": 0.12201999127864838, "loss/reg": 2.944932202808559e-05, "step": 991 }, { "epoch": 0.124, "grad_norm": 2.561225414276123, "grad_norm_var": 10.781087146669456, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.6533565521240234, "loss/hidden": 1.2109375, "loss/logits": 0.2147250473499298, "loss/reg": 2.9440669095492922e-05, "step": 992 }, { "epoch": 0.124125, "grad_norm": 2.614508867263794, "grad_norm_var": 0.9367041482529769, "learning_rate": 0.0001, "loss": 1.2862, "loss/crossentropy": 2.3770902156829834, "loss/hidden": 1.0859375, "loss/logits": 0.20000842213630676, "loss/reg": 2.943119397968985e-05, "step": 993 }, { "epoch": 0.12425, "grad_norm": 4.801121711730957, "grad_norm_var": 1.2533636237198265, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.862474203109741, "loss/hidden": 1.125, "loss/logits": 0.1907142549753189, "loss/reg": 2.9420858481898904e-05, "step": 994 }, { "epoch": 0.124375, "grad_norm": 1.8006936311721802, "grad_norm_var": 1.252844222856614, "learning_rate": 0.0001, "loss": 1.1567, "loss/crossentropy": 2.538184881210327, "loss/hidden": 1.0078125, "loss/logits": 0.14863096177577972, "loss/reg": 2.941078128060326e-05, "step": 995 }, { "epoch": 0.1245, "grad_norm": 1.8835642337799072, "grad_norm_var": 1.2700145975215755, "learning_rate": 0.0001, "loss": 1.1197, "loss/crossentropy": 2.46339750289917, "loss/hidden": 0.97265625, "loss/logits": 0.14673538506031036, "loss/reg": 2.939947444247082e-05, "step": 996 }, { "epoch": 0.124625, "grad_norm": 1.8348726034164429, "grad_norm_var": 1.3018449172030708, "learning_rate": 0.0001, "loss": 1.2213, "loss/crossentropy": 2.3999040126800537, "loss/hidden": 1.0625, "loss/logits": 0.15847530961036682, "loss/reg": 2.9386575988610275e-05, "step": 997 }, { "epoch": 0.12475, "grad_norm": 2.0075278282165527, "grad_norm_var": 1.3103380783906078, "learning_rate": 0.0001, "loss": 1.1907, "loss/crossentropy": 2.6882195472717285, "loss/hidden": 1.0390625, "loss/logits": 0.15131571888923645, "loss/reg": 2.9374945370364003e-05, "step": 998 }, { "epoch": 0.124875, "grad_norm": 2.0520100593566895, "grad_norm_var": 1.3148693419703117, "learning_rate": 0.0001, "loss": 1.2862, "loss/crossentropy": 2.186708450317383, "loss/hidden": 1.125, "loss/logits": 0.16095511615276337, "loss/reg": 2.936382225016132e-05, "step": 999 }, { "epoch": 0.125, "grad_norm": 2.2306151390075684, "grad_norm_var": 1.3017093789284313, "learning_rate": 0.0001, "loss": 1.2571, "loss/crossentropy": 2.8570730686187744, "loss/hidden": 1.0703125, "loss/logits": 0.18647560477256775, "loss/reg": 2.935159318440128e-05, "step": 1000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.4405861564416e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }