diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1875, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000125, + "grad_norm": 2.8797903060913086, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3346, + "loss/crossentropy": 2.6933815479278564, + "loss/hidden": 1.171875, + "loss/logits": 0.16231727600097656, + "loss/reg": 3.5815275623463094e-05, + "step": 1 + }, + { + "epoch": 0.00025, + "grad_norm": 3.151318073272705, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.334, + "loss/crossentropy": 3.0975701808929443, + "loss/hidden": 1.1484375, + "loss/logits": 0.18517285585403442, + "loss/reg": 3.5815275623463094e-05, + "step": 2 + }, + { + "epoch": 0.000375, + "grad_norm": 2.3074228763580322, + "learning_rate": 3e-06, + "loss": 1.2917, + "loss/crossentropy": 2.613313674926758, + "loss/hidden": 1.1171875, + "loss/logits": 0.17419689893722534, + "loss/reg": 3.581521741580218e-05, + "step": 3 + }, + { + "epoch": 0.0005, + "grad_norm": 2.994593381881714, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3622, + "loss/crossentropy": 2.562746047973633, + "loss/hidden": 1.2109375, + "loss/logits": 0.150880828499794, + "loss/reg": 3.5815086448565125e-05, + "step": 4 + }, + { + "epoch": 0.000625, + "grad_norm": 4.555283069610596, + "learning_rate": 5e-06, + "loss": 1.435, + "loss/crossentropy": 2.4253523349761963, + "loss/hidden": 1.25, + "loss/logits": 0.18461981415748596, + "loss/reg": 3.581498458515853e-05, + "step": 5 + }, + { + "epoch": 0.00075, + "grad_norm": 2.5826594829559326, + "learning_rate": 6e-06, + "loss": 1.2796, + "loss/crossentropy": 2.666372060775757, + "loss/hidden": 1.1015625, + "loss/logits": 0.17770954966545105, + "loss/reg": 3.5814849979942665e-05, + "step": 6 + }, + { + "epoch": 0.000875, + "grad_norm": 2.9724032878875732, + "learning_rate": 7.000000000000001e-06, + "loss": 1.5032, + "loss/crossentropy": 2.488424062728882, + "loss/hidden": 1.3046875, + "loss/logits": 0.1981831043958664, + "loss/reg": 3.581465352908708e-05, + "step": 7 + }, + { + "epoch": 0.001, + "grad_norm": 4.469974517822266, + "learning_rate": 8.000000000000001e-06, + "loss": 1.7569, + "loss/crossentropy": 2.152468204498291, + "loss/hidden": 1.515625, + "loss/logits": 0.24093276262283325, + "loss/reg": 3.581443888833746e-05, + "step": 8 + }, + { + "epoch": 0.001125, + "grad_norm": 2.529066801071167, + "learning_rate": 9e-06, + "loss": 1.7045, + "loss/crossentropy": 2.3210883140563965, + "loss/hidden": 1.4453125, + "loss/logits": 0.2588244378566742, + "loss/reg": 3.581414057407528e-05, + "step": 9 + }, + { + "epoch": 0.00125, + "grad_norm": 2.1863760948181152, + "learning_rate": 1e-05, + "loss": 1.4129, + "loss/crossentropy": 2.213552236557007, + "loss/hidden": 1.25, + "loss/logits": 0.1625480353832245, + "loss/reg": 3.5813736758427694e-05, + "step": 10 + }, + { + "epoch": 0.001375, + "grad_norm": 2.182722330093384, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3913, + "loss/crossentropy": 2.4366066455841064, + "loss/hidden": 1.1796875, + "loss/logits": 0.21124377846717834, + "loss/reg": 3.5813347494695336e-05, + "step": 11 + }, + { + "epoch": 0.0015, + "grad_norm": 2.28460431098938, + "learning_rate": 1.2e-05, + "loss": 1.6315, + "loss/crossentropy": 2.2548444271087646, + "loss/hidden": 1.4296875, + "loss/logits": 0.2014051228761673, + "loss/reg": 3.581297642085701e-05, + "step": 12 + }, + { + "epoch": 0.001625, + "grad_norm": 3.58573579788208, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.5091, + "loss/crossentropy": 2.6865081787109375, + "loss/hidden": 1.28125, + "loss/logits": 0.22751325368881226, + "loss/reg": 3.581246710382402e-05, + "step": 13 + }, + { + "epoch": 0.00175, + "grad_norm": 3.04477596282959, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.7733, + "loss/crossentropy": 2.2971208095550537, + "loss/hidden": 1.5234375, + "loss/logits": 0.2495010793209076, + "loss/reg": 3.5811823181575164e-05, + "step": 14 + }, + { + "epoch": 0.001875, + "grad_norm": 3.0177462100982666, + "learning_rate": 1.5e-05, + "loss": 1.5114, + "loss/crossentropy": 2.726813554763794, + "loss/hidden": 1.28125, + "loss/logits": 0.2297666072845459, + "loss/reg": 3.581113196560182e-05, + "step": 15 + }, + { + "epoch": 0.002, + "grad_norm": 2.147826671600342, + "grad_norm_var": 0.5553412532630915, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.2521, + "loss/crossentropy": 2.413343667984009, + "loss/hidden": 1.078125, + "loss/logits": 0.17357708513736725, + "loss/reg": 3.581081909942441e-05, + "step": 16 + }, + { + "epoch": 0.002125, + "grad_norm": 1.957945704460144, + "grad_norm_var": 0.6147194825502799, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.1395, + "loss/crossentropy": 2.327432632446289, + "loss/hidden": 1.0, + "loss/logits": 0.1391144096851349, + "loss/reg": 3.581016790121794e-05, + "step": 17 + }, + { + "epoch": 0.00225, + "grad_norm": 2.9085915088653564, + "grad_norm_var": 0.6093993504039028, + "learning_rate": 1.8e-05, + "loss": 1.7224, + "loss/crossentropy": 2.5963706970214844, + "loss/hidden": 1.4765625, + "loss/logits": 0.24546313285827637, + "loss/reg": 3.580931297619827e-05, + "step": 18 + }, + { + "epoch": 0.002375, + "grad_norm": 1.6794862747192383, + "grad_norm_var": 0.6801389543370343, + "learning_rate": 1.9e-05, + "loss": 1.2792, + "loss/crossentropy": 2.542264938354492, + "loss/hidden": 1.109375, + "loss/logits": 0.1695137917995453, + "loss/reg": 3.580832708394155e-05, + "step": 19 + }, + { + "epoch": 0.0025, + "grad_norm": 2.006974935531616, + "grad_norm_var": 0.7179436357972528, + "learning_rate": 2e-05, + "loss": 1.2741, + "loss/crossentropy": 2.6418614387512207, + "loss/hidden": 1.1015625, + "loss/logits": 0.1721784472465515, + "loss/reg": 3.580794873414561e-05, + "step": 20 + }, + { + "epoch": 0.002625, + "grad_norm": 1.895347237586975, + "grad_norm_var": 0.5223754576868543, + "learning_rate": 2.1e-05, + "loss": 1.1787, + "loss/crossentropy": 2.38079571723938, + "loss/hidden": 1.03125, + "loss/logits": 0.14714078605175018, + "loss/reg": 3.580757766030729e-05, + "step": 21 + }, + { + "epoch": 0.00275, + "grad_norm": 2.195387125015259, + "grad_norm_var": 0.5321677299000015, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.3881, + "loss/crossentropy": 2.613879919052124, + "loss/hidden": 1.1875, + "loss/logits": 0.20026516914367676, + "loss/reg": 3.58072757080663e-05, + "step": 22 + }, + { + "epoch": 0.002875, + "grad_norm": 2.943157911300659, + "grad_norm_var": 0.530638648177441, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.391, + "loss/crossentropy": 2.653855562210083, + "loss/hidden": 1.1875, + "loss/logits": 0.20310327410697937, + "loss/reg": 3.580666452762671e-05, + "step": 23 + }, + { + "epoch": 0.003, + "grad_norm": 2.057532787322998, + "grad_norm_var": 0.2815427832165767, + "learning_rate": 2.4e-05, + "loss": 1.2261, + "loss/crossentropy": 2.5107123851776123, + "loss/hidden": 1.0859375, + "loss/logits": 0.13977402448654175, + "loss/reg": 3.580632255761884e-05, + "step": 24 + }, + { + "epoch": 0.003125, + "grad_norm": 2.4862008094787598, + "grad_norm_var": 0.28099970817646425, + "learning_rate": 2.5e-05, + "loss": 1.2713, + "loss/crossentropy": 2.234706163406372, + "loss/hidden": 1.1171875, + "loss/logits": 0.1537853181362152, + "loss/reg": 3.580575867090374e-05, + "step": 25 + }, + { + "epoch": 0.00325, + "grad_norm": 1.8715572357177734, + "grad_norm_var": 0.29663449315952994, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2613, + "loss/crossentropy": 2.693939447402954, + "loss/hidden": 1.1015625, + "loss/logits": 0.1593395173549652, + "loss/reg": 3.58048637281172e-05, + "step": 26 + }, + { + "epoch": 0.003375, + "grad_norm": 1.8992433547973633, + "grad_norm_var": 0.3095519871493233, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.3686, + "loss/crossentropy": 2.5521254539489746, + "loss/hidden": 1.1796875, + "loss/logits": 0.18858906626701355, + "loss/reg": 3.580458724172786e-05, + "step": 27 + }, + { + "epoch": 0.0035, + "grad_norm": 2.2363150119781494, + "grad_norm_var": 0.31027254984992586, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.4412, + "loss/crossentropy": 2.5556640625, + "loss/hidden": 1.25, + "loss/logits": 0.19085438549518585, + "loss/reg": 3.580446355044842e-05, + "step": 28 + }, + { + "epoch": 0.003625, + "grad_norm": 1.916678786277771, + "grad_norm_var": 0.21402330843771242, + "learning_rate": 2.9e-05, + "loss": 1.4761, + "loss/crossentropy": 2.3765358924865723, + "loss/hidden": 1.28125, + "loss/logits": 0.19447964429855347, + "loss/reg": 3.5804154322249815e-05, + "step": 29 + }, + { + "epoch": 0.00375, + "grad_norm": 1.638509750366211, + "grad_norm_var": 0.19170291887504137, + "learning_rate": 3e-05, + "loss": 1.0217, + "loss/crossentropy": 2.39214825630188, + "loss/hidden": 0.91015625, + "loss/logits": 0.11122289299964905, + "loss/reg": 3.580400880309753e-05, + "step": 30 + }, + { + "epoch": 0.003875, + "grad_norm": 1.9309443235397339, + "grad_norm_var": 0.14393413685788706, + "learning_rate": 3.1e-05, + "loss": 1.2692, + "loss/crossentropy": 2.5890893936157227, + "loss/hidden": 1.109375, + "loss/logits": 0.15942150354385376, + "loss/reg": 3.580367410904728e-05, + "step": 31 + }, + { + "epoch": 0.004, + "grad_norm": 1.8290704488754272, + "grad_norm_var": 0.14870789473936974, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.2562, + "loss/crossentropy": 2.614006757736206, + "loss/hidden": 1.09375, + "loss/logits": 0.1620863825082779, + "loss/reg": 3.580304473871365e-05, + "step": 32 + }, + { + "epoch": 0.004125, + "grad_norm": 2.8197247982025146, + "grad_norm_var": 0.17985784278712322, + "learning_rate": 3.3e-05, + "loss": 1.6221, + "loss/crossentropy": 2.458843231201172, + "loss/hidden": 1.390625, + "loss/logits": 0.23112158477306366, + "loss/reg": 3.5803102946374565e-05, + "step": 33 + }, + { + "epoch": 0.00425, + "grad_norm": 2.04148530960083, + "grad_norm_var": 0.1385297884752911, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.2995, + "loss/crossentropy": 2.6012392044067383, + "loss/hidden": 1.140625, + "loss/logits": 0.15854208171367645, + "loss/reg": 3.580349584808573e-05, + "step": 34 + }, + { + "epoch": 0.004375, + "grad_norm": 2.370253086090088, + "grad_norm_var": 0.13049913719012618, + "learning_rate": 3.5e-05, + "loss": 1.2372, + "loss/crossentropy": 2.554248571395874, + "loss/hidden": 1.0859375, + "loss/logits": 0.1508997678756714, + "loss/reg": 3.580387055990286e-05, + "step": 35 + }, + { + "epoch": 0.0045, + "grad_norm": 2.1753602027893066, + "grad_norm_var": 0.12942723244658866, + "learning_rate": 3.6e-05, + "loss": 1.2043, + "loss/crossentropy": 2.950917959213257, + "loss/hidden": 1.0625, + "loss/logits": 0.14146575331687927, + "loss/reg": 3.5805252991849557e-05, + "step": 36 + }, + { + "epoch": 0.004625, + "grad_norm": 1.8446277379989624, + "grad_norm_var": 0.13127072083684937, + "learning_rate": 3.7e-05, + "loss": 1.0664, + "loss/crossentropy": 2.6589579582214355, + "loss/hidden": 0.93359375, + "loss/logits": 0.1324453502893448, + "loss/reg": 3.58061988663394e-05, + "step": 37 + }, + { + "epoch": 0.00475, + "grad_norm": 3.153823137283325, + "grad_norm_var": 0.1956330169497003, + "learning_rate": 3.8e-05, + "loss": 1.3224, + "loss/crossentropy": 2.4948697090148926, + "loss/hidden": 1.1484375, + "loss/logits": 0.1736428141593933, + "loss/reg": 3.5806617233902216e-05, + "step": 38 + }, + { + "epoch": 0.004875, + "grad_norm": 2.105498790740967, + "grad_norm_var": 0.1565869437188399, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.403, + "loss/crossentropy": 2.2583742141723633, + "loss/hidden": 1.25, + "loss/logits": 0.15260592103004456, + "loss/reg": 3.580802876967937e-05, + "step": 39 + }, + { + "epoch": 0.005, + "grad_norm": 1.635926365852356, + "grad_norm_var": 0.17281299081781085, + "learning_rate": 4e-05, + "loss": 1.0375, + "loss/crossentropy": 2.6808717250823975, + "loss/hidden": 0.921875, + "loss/logits": 0.11526834964752197, + "loss/reg": 3.580863995011896e-05, + "step": 40 + }, + { + "epoch": 0.005125, + "grad_norm": 1.715374231338501, + "grad_norm_var": 0.17253809821943988, + "learning_rate": 4.1e-05, + "loss": 1.13, + "loss/crossentropy": 2.643165349960327, + "loss/hidden": 0.984375, + "loss/logits": 0.1453102082014084, + "loss/reg": 3.580814882298e-05, + "step": 41 + }, + { + "epoch": 0.00525, + "grad_norm": 2.1999247074127197, + "grad_norm_var": 0.17041268294515716, + "learning_rate": 4.2e-05, + "loss": 1.291, + "loss/crossentropy": 2.4450502395629883, + "loss/hidden": 1.125, + "loss/logits": 0.165659099817276, + "loss/reg": 3.5807905078399926e-05, + "step": 42 + }, + { + "epoch": 0.005375, + "grad_norm": 6.767260551452637, + "grad_norm_var": 1.5247462870548845, + "learning_rate": 4.3e-05, + "loss": 1.3462, + "loss/crossentropy": 2.6365652084350586, + "loss/hidden": 1.1953125, + "loss/logits": 0.15048328042030334, + "loss/reg": 3.580807242542505e-05, + "step": 43 + }, + { + "epoch": 0.0055, + "grad_norm": 2.4290215969085693, + "grad_norm_var": 1.5228923892282233, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.444, + "loss/crossentropy": 2.445629596710205, + "loss/hidden": 1.2578125, + "loss/logits": 0.1858382523059845, + "loss/reg": 3.5807508538709953e-05, + "step": 44 + }, + { + "epoch": 0.005625, + "grad_norm": 1.8787518739700317, + "grad_norm_var": 1.525481240782518, + "learning_rate": 4.5e-05, + "loss": 1.1501, + "loss/crossentropy": 2.932614326477051, + "loss/hidden": 1.015625, + "loss/logits": 0.13410566747188568, + "loss/reg": 3.5807570384349674e-05, + "step": 45 + }, + { + "epoch": 0.00575, + "grad_norm": 2.0517191886901855, + "grad_norm_var": 1.4937318455351132, + "learning_rate": 4.600000000000001e-05, + "loss": 1.1484, + "loss/crossentropy": 2.8540468215942383, + "loss/hidden": 1.0078125, + "loss/logits": 0.14026299118995667, + "loss/reg": 3.5806355299428105e-05, + "step": 46 + }, + { + "epoch": 0.005875, + "grad_norm": 2.3056695461273193, + "grad_norm_var": 1.4773587952527152, + "learning_rate": 4.7e-05, + "loss": 1.1201, + "loss/crossentropy": 2.3501625061035156, + "loss/hidden": 0.9921875, + "loss/logits": 0.127536341547966, + "loss/reg": 3.580517659429461e-05, + "step": 47 + }, + { + "epoch": 0.006, + "grad_norm": 1.8737199306488037, + "grad_norm_var": 1.473740887453625, + "learning_rate": 4.8e-05, + "loss": 1.1727, + "loss/crossentropy": 2.56876540184021, + "loss/hidden": 1.0234375, + "loss/logits": 0.14892947673797607, + "loss/reg": 3.5804278013529256e-05, + "step": 48 + }, + { + "epoch": 0.006125, + "grad_norm": 1.6498337984085083, + "grad_norm_var": 1.503248724299241, + "learning_rate": 4.9e-05, + "loss": 1.0887, + "loss/crossentropy": 2.5359740257263184, + "loss/hidden": 0.97265625, + "loss/logits": 0.11568085849285126, + "loss/reg": 3.580292104743421e-05, + "step": 49 + }, + { + "epoch": 0.00625, + "grad_norm": 1.8893804550170898, + "grad_norm_var": 1.5117099009867367, + "learning_rate": 5e-05, + "loss": 1.197, + "loss/crossentropy": 2.4427387714385986, + "loss/hidden": 1.046875, + "loss/logits": 0.14971715211868286, + "loss/reg": 3.58012730430346e-05, + "step": 50 + }, + { + "epoch": 0.006375, + "grad_norm": 1.6203227043151855, + "grad_norm_var": 1.5476226526424812, + "learning_rate": 5.1000000000000006e-05, + "loss": 1.1026, + "loss/crossentropy": 2.4663264751434326, + "loss/hidden": 0.96875, + "loss/logits": 0.13351351022720337, + "loss/reg": 3.579998519853689e-05, + "step": 51 + }, + { + "epoch": 0.0065, + "grad_norm": 1.86211097240448, + "grad_norm_var": 1.5602565704882587, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.3754, + "loss/crossentropy": 2.4876317977905273, + "loss/hidden": 1.1875, + "loss/logits": 0.1875210702419281, + "loss/reg": 3.5798137105302885e-05, + "step": 52 + }, + { + "epoch": 0.006625, + "grad_norm": 1.8947949409484863, + "grad_norm_var": 1.5572914096308217, + "learning_rate": 5.300000000000001e-05, + "loss": 1.2427, + "loss/crossentropy": 2.5425305366516113, + "loss/hidden": 1.09375, + "loss/logits": 0.14857983589172363, + "loss/reg": 3.5795987059827894e-05, + "step": 53 + }, + { + "epoch": 0.00675, + "grad_norm": 2.155927896499634, + "grad_norm_var": 1.5078638031084188, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.1465, + "loss/crossentropy": 2.525212287902832, + "loss/hidden": 1.0234375, + "loss/logits": 0.12268626689910889, + "loss/reg": 3.579404437914491e-05, + "step": 54 + }, + { + "epoch": 0.006875, + "grad_norm": 2.088404893875122, + "grad_norm_var": 1.5082164304181902, + "learning_rate": 5.500000000000001e-05, + "loss": 1.2035, + "loss/crossentropy": 2.0337164402008057, + "loss/hidden": 1.078125, + "loss/logits": 0.12503597140312195, + "loss/reg": 3.579181066015735e-05, + "step": 55 + }, + { + "epoch": 0.007, + "grad_norm": 1.5657821893692017, + "grad_norm_var": 1.5142777074410627, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.0258, + "loss/crossentropy": 2.5575084686279297, + "loss/hidden": 0.90625, + "loss/logits": 0.11920958012342453, + "loss/reg": 3.5789642424788326e-05, + "step": 56 + }, + { + "epoch": 0.007125, + "grad_norm": 1.8427784442901611, + "grad_norm_var": 1.5062655960432332, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.1217, + "loss/crossentropy": 2.771742820739746, + "loss/hidden": 0.9921875, + "loss/logits": 0.1291133463382721, + "loss/reg": 3.5787568776868284e-05, + "step": 57 + }, + { + "epoch": 0.00725, + "grad_norm": 2.1850311756134033, + "grad_norm_var": 1.5063882579126575, + "learning_rate": 5.8e-05, + "loss": 1.1639, + "loss/crossentropy": 2.5320937633514404, + "loss/hidden": 1.03125, + "loss/logits": 0.13231654465198517, + "loss/reg": 3.5786692023975775e-05, + "step": 58 + }, + { + "epoch": 0.007375, + "grad_norm": 1.9679864645004272, + "grad_norm_var": 0.057763248837537105, + "learning_rate": 5.9e-05, + "loss": 1.2283, + "loss/crossentropy": 2.2511909008026123, + "loss/hidden": 1.0703125, + "loss/logits": 0.15765681862831116, + "loss/reg": 3.578452378860675e-05, + "step": 59 + }, + { + "epoch": 0.0075, + "grad_norm": 1.5962857007980347, + "grad_norm_var": 0.04834229766963934, + "learning_rate": 6e-05, + "loss": 1.1863, + "loss/crossentropy": 2.2581472396850586, + "loss/hidden": 1.046875, + "loss/logits": 0.13910087943077087, + "loss/reg": 3.578297037165612e-05, + "step": 60 + }, + { + "epoch": 0.007625, + "grad_norm": 1.9367214441299438, + "grad_norm_var": 0.04837432662246878, + "learning_rate": 6.1e-05, + "loss": 1.1308, + "loss/crossentropy": 2.375943660736084, + "loss/hidden": 1.0078125, + "loss/logits": 0.12259182333946228, + "loss/reg": 3.578166069928557e-05, + "step": 61 + }, + { + "epoch": 0.00775, + "grad_norm": 1.74626624584198, + "grad_norm_var": 0.048246697686887254, + "learning_rate": 6.2e-05, + "loss": 1.0925, + "loss/crossentropy": 2.33935284614563, + "loss/hidden": 0.97265625, + "loss/logits": 0.11948312819004059, + "loss/reg": 3.577923052944243e-05, + "step": 62 + }, + { + "epoch": 0.007875, + "grad_norm": 1.7242004871368408, + "grad_norm_var": 0.036866001167244575, + "learning_rate": 6.3e-05, + "loss": 1.0529, + "loss/crossentropy": 2.56264066696167, + "loss/hidden": 0.9375, + "loss/logits": 0.11508607119321823, + "loss/reg": 3.5776785807684064e-05, + "step": 63 + }, + { + "epoch": 0.008, + "grad_norm": 1.759158730506897, + "grad_norm_var": 0.03732351836526746, + "learning_rate": 6.400000000000001e-05, + "loss": 1.0359, + "loss/crossentropy": 2.5598065853118896, + "loss/hidden": 0.921875, + "loss/logits": 0.11371441185474396, + "loss/reg": 3.577530151233077e-05, + "step": 64 + }, + { + "epoch": 0.008125, + "grad_norm": 1.8450080156326294, + "grad_norm_var": 0.03468242225663947, + "learning_rate": 6.500000000000001e-05, + "loss": 1.1044, + "loss/crossentropy": 2.358488082885742, + "loss/hidden": 0.9765625, + "loss/logits": 0.1274409294128418, + "loss/reg": 3.577340248739347e-05, + "step": 65 + }, + { + "epoch": 0.00825, + "grad_norm": 1.8574451208114624, + "grad_norm_var": 0.03459981312827119, + "learning_rate": 6.6e-05, + "loss": 1.3393, + "loss/crossentropy": 2.2067902088165283, + "loss/hidden": 1.1796875, + "loss/logits": 0.15927882492542267, + "loss/reg": 3.5770081012742594e-05, + "step": 66 + }, + { + "epoch": 0.008375, + "grad_norm": 2.0372934341430664, + "grad_norm_var": 0.032529617098579836, + "learning_rate": 6.7e-05, + "loss": 1.0537, + "loss/crossentropy": 2.948381185531616, + "loss/hidden": 0.92578125, + "loss/logits": 0.1275252103805542, + "loss/reg": 3.576773087843321e-05, + "step": 67 + }, + { + "epoch": 0.0085, + "grad_norm": 1.6391419172286987, + "grad_norm_var": 0.03614113702393708, + "learning_rate": 6.800000000000001e-05, + "loss": 1.1128, + "loss/crossentropy": 2.691239833831787, + "loss/hidden": 0.98046875, + "loss/logits": 0.13195790350437164, + "loss/reg": 3.576446033548564e-05, + "step": 68 + }, + { + "epoch": 0.008625, + "grad_norm": 1.6962119340896606, + "grad_norm_var": 0.03782062069620693, + "learning_rate": 6.9e-05, + "loss": 1.0633, + "loss/crossentropy": 2.6416876316070557, + "loss/hidden": 0.94921875, + "loss/logits": 0.11374930292367935, + "loss/reg": 3.576194285415113e-05, + "step": 69 + }, + { + "epoch": 0.00875, + "grad_norm": 2.015970468521118, + "grad_norm_var": 0.03338686088704298, + "learning_rate": 7e-05, + "loss": 1.2654, + "loss/crossentropy": 2.5686511993408203, + "loss/hidden": 1.109375, + "loss/logits": 0.1556204855442047, + "loss/reg": 3.576026938389987e-05, + "step": 70 + }, + { + "epoch": 0.008875, + "grad_norm": 3.6860287189483643, + "grad_norm_var": 0.24497842788792332, + "learning_rate": 7.1e-05, + "loss": 1.4119, + "loss/crossentropy": 2.0071253776550293, + "loss/hidden": 1.2578125, + "loss/logits": 0.15377236902713776, + "loss/reg": 3.575763912522234e-05, + "step": 71 + }, + { + "epoch": 0.009, + "grad_norm": 2.0074028968811035, + "grad_norm_var": 0.2349071198746244, + "learning_rate": 7.2e-05, + "loss": 1.0776, + "loss/crossentropy": 2.3829903602600098, + "loss/hidden": 0.953125, + "loss/logits": 0.12408198416233063, + "loss/reg": 3.575549635570496e-05, + "step": 72 + }, + { + "epoch": 0.009125, + "grad_norm": 1.9751619100570679, + "grad_norm_var": 0.23373155459140638, + "learning_rate": 7.3e-05, + "loss": 1.3348, + "loss/crossentropy": 2.3497986793518066, + "loss/hidden": 1.171875, + "loss/logits": 0.16253460943698883, + "loss/reg": 3.575363371055573e-05, + "step": 73 + }, + { + "epoch": 0.00925, + "grad_norm": 1.5991307497024536, + "grad_norm_var": 0.2391465881612707, + "learning_rate": 7.4e-05, + "loss": 1.1257, + "loss/crossentropy": 2.5422017574310303, + "loss/hidden": 0.9921875, + "loss/logits": 0.13318461179733276, + "loss/reg": 3.575047594495118e-05, + "step": 74 + }, + { + "epoch": 0.009375, + "grad_norm": 6.278711318969727, + "grad_norm_var": 1.4148538861937499, + "learning_rate": 7.500000000000001e-05, + "loss": 1.2142, + "loss/crossentropy": 2.6029913425445557, + "loss/hidden": 1.0859375, + "loss/logits": 0.1279023289680481, + "loss/reg": 3.5748576920013875e-05, + "step": 75 + }, + { + "epoch": 0.0095, + "grad_norm": 2.744645833969116, + "grad_norm_var": 1.4029217843730426, + "learning_rate": 7.6e-05, + "loss": 1.1281, + "loss/crossentropy": 2.5766711235046387, + "loss/hidden": 0.99609375, + "loss/logits": 0.13161128759384155, + "loss/reg": 3.574538277462125e-05, + "step": 76 + }, + { + "epoch": 0.009625, + "grad_norm": 1.6794898509979248, + "grad_norm_var": 1.418977736839713, + "learning_rate": 7.7e-05, + "loss": 0.9818, + "loss/crossentropy": 2.4829020500183105, + "loss/hidden": 0.87109375, + "loss/logits": 0.11033609509468079, + "loss/reg": 3.574356742319651e-05, + "step": 77 + }, + { + "epoch": 0.00975, + "grad_norm": 1.4505054950714111, + "grad_norm_var": 1.4450273907543434, + "learning_rate": 7.800000000000001e-05, + "loss": 1.0562, + "loss/crossentropy": 2.2392332553863525, + "loss/hidden": 0.9375, + "loss/logits": 0.11831367015838623, + "loss/reg": 3.5740758903557435e-05, + "step": 78 + }, + { + "epoch": 0.009875, + "grad_norm": 2.267005205154419, + "grad_norm_var": 1.425408330742154, + "learning_rate": 7.900000000000001e-05, + "loss": 1.2224, + "loss/crossentropy": 2.524975299835205, + "loss/hidden": 1.09375, + "loss/logits": 0.12832751870155334, + "loss/reg": 3.5736080462811515e-05, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 1.9941935539245605, + "grad_norm_var": 1.4124245943422287, + "learning_rate": 8e-05, + "loss": 1.3792, + "loss/crossentropy": 2.3303298950195312, + "loss/hidden": 1.2109375, + "loss/logits": 0.16792933642864227, + "loss/reg": 3.573206049622968e-05, + "step": 80 + }, + { + "epoch": 0.010125, + "grad_norm": 1.9384936094284058, + "grad_norm_var": 1.407320221541647, + "learning_rate": 8.1e-05, + "loss": 1.2054, + "loss/crossentropy": 2.339118003845215, + "loss/hidden": 1.0703125, + "loss/logits": 0.13472682237625122, + "loss/reg": 3.5727785871131346e-05, + "step": 81 + }, + { + "epoch": 0.01025, + "grad_norm": 2.2007267475128174, + "grad_norm_var": 1.3942380508674062, + "learning_rate": 8.2e-05, + "loss": 1.2089, + "loss/crossentropy": 2.3879401683807373, + "loss/hidden": 1.078125, + "loss/logits": 0.130388081073761, + "loss/reg": 3.5725021007237956e-05, + "step": 82 + }, + { + "epoch": 0.010375, + "grad_norm": 1.5669373273849487, + "grad_norm_var": 1.426148143880052, + "learning_rate": 8.3e-05, + "loss": 1.0953, + "loss/crossentropy": 2.386194944381714, + "loss/hidden": 0.97265625, + "loss/logits": 0.12224260717630386, + "loss/reg": 3.572153946151957e-05, + "step": 83 + }, + { + "epoch": 0.0105, + "grad_norm": 3.444465160369873, + "grad_norm_var": 1.4716789596545155, + "learning_rate": 8.4e-05, + "loss": 1.4601, + "loss/crossentropy": 2.160597801208496, + "loss/hidden": 1.2578125, + "loss/logits": 0.20196697115898132, + "loss/reg": 3.571900742826983e-05, + "step": 84 + }, + { + "epoch": 0.010625, + "grad_norm": 2.17124080657959, + "grad_norm_var": 1.440631969989453, + "learning_rate": 8.5e-05, + "loss": 1.1224, + "loss/crossentropy": 2.1496167182922363, + "loss/hidden": 1.0, + "loss/logits": 0.12202942371368408, + "loss/reg": 3.571617344277911e-05, + "step": 85 + }, + { + "epoch": 0.01075, + "grad_norm": 1.842886209487915, + "grad_norm_var": 1.4522613774542446, + "learning_rate": 8.6e-05, + "loss": 1.0156, + "loss/crossentropy": 2.226436138153076, + "loss/hidden": 0.90625, + "loss/logits": 0.10898593068122864, + "loss/reg": 3.57139615516644e-05, + "step": 86 + }, + { + "epoch": 0.010875, + "grad_norm": 2.05486798286438, + "grad_norm_var": 1.344934690323482, + "learning_rate": 8.7e-05, + "loss": 1.0828, + "loss/crossentropy": 2.322002410888672, + "loss/hidden": 0.9609375, + "loss/logits": 0.12146371603012085, + "loss/reg": 3.5710025258595124e-05, + "step": 87 + }, + { + "epoch": 0.011, + "grad_norm": 3.25955867767334, + "grad_norm_var": 1.3897383898525164, + "learning_rate": 8.800000000000001e-05, + "loss": 1.5155, + "loss/crossentropy": 1.937675952911377, + "loss/hidden": 1.3125, + "loss/logits": 0.2026323676109314, + "loss/reg": 3.570731496438384e-05, + "step": 88 + }, + { + "epoch": 0.011125, + "grad_norm": 1.7932565212249756, + "grad_norm_var": 1.4022136437703991, + "learning_rate": 8.900000000000001e-05, + "loss": 1.0228, + "loss/crossentropy": 2.649721145629883, + "loss/hidden": 0.91015625, + "loss/logits": 0.11223678290843964, + "loss/reg": 3.570578701328486e-05, + "step": 89 + }, + { + "epoch": 0.01125, + "grad_norm": 1.6545979976654053, + "grad_norm_var": 1.3965356378457594, + "learning_rate": 9e-05, + "loss": 1.0415, + "loss/crossentropy": 2.546100616455078, + "loss/hidden": 0.9140625, + "loss/logits": 0.12706589698791504, + "loss/reg": 3.570249828044325e-05, + "step": 90 + }, + { + "epoch": 0.011375, + "grad_norm": 2.2377841472625732, + "grad_norm_var": 0.3253247379631695, + "learning_rate": 9.1e-05, + "loss": 1.2172, + "loss/crossentropy": 2.677785873413086, + "loss/hidden": 1.078125, + "loss/logits": 0.13871382176876068, + "loss/reg": 3.5701228625839576e-05, + "step": 91 + }, + { + "epoch": 0.0115, + "grad_norm": 2.016057014465332, + "grad_norm_var": 0.3001321883475782, + "learning_rate": 9.200000000000001e-05, + "loss": 1.2553, + "loss/crossentropy": 2.216202974319458, + "loss/hidden": 1.09375, + "loss/logits": 0.16114352643489838, + "loss/reg": 3.569832188077271e-05, + "step": 92 + }, + { + "epoch": 0.011625, + "grad_norm": 1.9337667226791382, + "grad_norm_var": 0.28997562388856146, + "learning_rate": 9.300000000000001e-05, + "loss": 1.2073, + "loss/crossentropy": 2.7274839878082275, + "loss/hidden": 1.0546875, + "loss/logits": 0.1522083729505539, + "loss/reg": 3.5696477425517514e-05, + "step": 93 + }, + { + "epoch": 0.01175, + "grad_norm": 2.1966376304626465, + "grad_norm_var": 0.25874835102596966, + "learning_rate": 9.4e-05, + "loss": 1.2497, + "loss/crossentropy": 2.0065174102783203, + "loss/hidden": 1.109375, + "loss/logits": 0.13999012112617493, + "loss/reg": 3.56943673978094e-05, + "step": 94 + }, + { + "epoch": 0.011875, + "grad_norm": 1.7162114381790161, + "grad_norm_var": 0.2699080995908368, + "learning_rate": 9.5e-05, + "loss": 1.0104, + "loss/crossentropy": 2.4001636505126953, + "loss/hidden": 0.90234375, + "loss/logits": 0.10766053199768066, + "loss/reg": 3.5692111850949004e-05, + "step": 95 + }, + { + "epoch": 0.012, + "grad_norm": 1.9596549272537231, + "grad_norm_var": 0.2705912806447509, + "learning_rate": 9.6e-05, + "loss": 1.1288, + "loss/crossentropy": 2.4599721431732178, + "loss/hidden": 0.98828125, + "loss/logits": 0.14019131660461426, + "loss/reg": 3.5689983633346856e-05, + "step": 96 + }, + { + "epoch": 0.012125, + "grad_norm": 1.9994059801101685, + "grad_norm_var": 0.26931496222480145, + "learning_rate": 9.7e-05, + "loss": 1.1051, + "loss/crossentropy": 2.8105618953704834, + "loss/hidden": 0.9765625, + "loss/logits": 0.12819884717464447, + "loss/reg": 3.568677857401781e-05, + "step": 97 + }, + { + "epoch": 0.01225, + "grad_norm": 2.0389626026153564, + "grad_norm_var": 0.26938190348710545, + "learning_rate": 9.8e-05, + "loss": 1.4044, + "loss/crossentropy": 2.10030198097229, + "loss/hidden": 1.203125, + "loss/logits": 0.20087596774101257, + "loss/reg": 3.5685956390807405e-05, + "step": 98 + }, + { + "epoch": 0.012375, + "grad_norm": 3.1797940731048584, + "grad_norm_var": 0.31348186491538, + "learning_rate": 9.900000000000001e-05, + "loss": 1.5773, + "loss/crossentropy": 2.7054786682128906, + "loss/hidden": 1.359375, + "loss/logits": 0.2175736129283905, + "loss/reg": 3.5683315218193457e-05, + "step": 99 + }, + { + "epoch": 0.0125, + "grad_norm": 2.2074787616729736, + "grad_norm_var": 0.20694747633483127, + "learning_rate": 0.0001, + "loss": 1.2851, + "loss/crossentropy": 2.2195346355438232, + "loss/hidden": 1.125, + "loss/logits": 0.15975427627563477, + "loss/reg": 3.568131796782836e-05, + "step": 100 + }, + { + "epoch": 0.012625, + "grad_norm": 2.0853495597839355, + "grad_norm_var": 0.20706664538577282, + "learning_rate": 0.0001, + "loss": 1.1671, + "loss/crossentropy": 2.8007214069366455, + "loss/hidden": 1.015625, + "loss/logits": 0.15108169615268707, + "loss/reg": 3.5677723644766957e-05, + "step": 101 + }, + { + "epoch": 0.01275, + "grad_norm": 2.261103630065918, + "grad_norm_var": 0.20165261092997658, + "learning_rate": 0.0001, + "loss": 1.2524, + "loss/crossentropy": 2.4267494678497314, + "loss/hidden": 1.109375, + "loss/logits": 0.1426696479320526, + "loss/reg": 3.567594103515148e-05, + "step": 102 + }, + { + "epoch": 0.012875, + "grad_norm": 1.7995065450668335, + "grad_norm_var": 0.20938114766728447, + "learning_rate": 0.0001, + "loss": 1.0294, + "loss/crossentropy": 2.477445602416992, + "loss/hidden": 0.9140625, + "loss/logits": 0.11502823233604431, + "loss/reg": 3.5673674574354663e-05, + "step": 103 + }, + { + "epoch": 0.013, + "grad_norm": 1.9462140798568726, + "grad_norm_var": 0.12222182001865463, + "learning_rate": 0.0001, + "loss": 1.175, + "loss/crossentropy": 2.4584763050079346, + "loss/hidden": 1.0234375, + "loss/logits": 0.15122900903224945, + "loss/reg": 3.567052772268653e-05, + "step": 104 + }, + { + "epoch": 0.013125, + "grad_norm": 5.390810489654541, + "grad_norm_var": 0.8011994969266916, + "learning_rate": 0.0001, + "loss": 1.3528, + "loss/crossentropy": 2.61253023147583, + "loss/hidden": 1.1953125, + "loss/logits": 0.15716172754764557, + "loss/reg": 3.566941450117156e-05, + "step": 105 + }, + { + "epoch": 0.01325, + "grad_norm": 2.104395866394043, + "grad_norm_var": 0.7757998475018496, + "learning_rate": 0.0001, + "loss": 1.4737, + "loss/crossentropy": 2.1339404582977295, + "loss/hidden": 1.296875, + "loss/logits": 0.17651526629924774, + "loss/reg": 3.566693339962512e-05, + "step": 106 + }, + { + "epoch": 0.013375, + "grad_norm": 1.8610461950302124, + "grad_norm_var": 0.788653272883981, + "learning_rate": 0.0001, + "loss": 0.9407, + "loss/crossentropy": 2.4770216941833496, + "loss/hidden": 0.8359375, + "loss/logits": 0.10441947728395462, + "loss/reg": 3.566368104657158e-05, + "step": 107 + }, + { + "epoch": 0.0135, + "grad_norm": 2.2065460681915283, + "grad_norm_var": 0.7838738781084629, + "learning_rate": 0.0001, + "loss": 1.2355, + "loss/crossentropy": 2.461944818496704, + "loss/hidden": 1.0703125, + "loss/logits": 0.16484174132347107, + "loss/reg": 3.566055602277629e-05, + "step": 108 + }, + { + "epoch": 0.013625, + "grad_norm": 2.1468284130096436, + "grad_norm_var": 0.7761527810904186, + "learning_rate": 0.0001, + "loss": 1.1623, + "loss/crossentropy": 2.2069454193115234, + "loss/hidden": 1.0234375, + "loss/logits": 0.13850779831409454, + "loss/reg": 3.565785300452262e-05, + "step": 109 + }, + { + "epoch": 0.01375, + "grad_norm": 2.383087635040283, + "grad_norm_var": 0.7752898762699504, + "learning_rate": 0.0001, + "loss": 1.1793, + "loss/crossentropy": 2.4479548931121826, + "loss/hidden": 1.0390625, + "loss/logits": 0.139842689037323, + "loss/reg": 3.565509177860804e-05, + "step": 110 + }, + { + "epoch": 0.013875, + "grad_norm": 2.9489004611968994, + "grad_norm_var": 0.7693129207579057, + "learning_rate": 0.0001, + "loss": 1.2705, + "loss/crossentropy": 2.3081014156341553, + "loss/hidden": 1.1171875, + "loss/logits": 0.15295040607452393, + "loss/reg": 3.5651082725962624e-05, + "step": 111 + }, + { + "epoch": 0.014, + "grad_norm": 3.968780755996704, + "grad_norm_var": 0.9016446173616401, + "learning_rate": 0.0001, + "loss": 1.4396, + "loss/crossentropy": 2.420243740081787, + "loss/hidden": 1.203125, + "loss/logits": 0.23613759875297546, + "loss/reg": 3.56451710104011e-05, + "step": 112 + }, + { + "epoch": 0.014125, + "grad_norm": 1.9860399961471558, + "grad_norm_var": 0.9026067410203076, + "learning_rate": 0.0001, + "loss": 1.1773, + "loss/crossentropy": 2.477583169937134, + "loss/hidden": 1.03125, + "loss/logits": 0.14566099643707275, + "loss/reg": 3.56405544152949e-05, + "step": 113 + }, + { + "epoch": 0.01425, + "grad_norm": 2.120425224304199, + "grad_norm_var": 0.8976643536437109, + "learning_rate": 0.0001, + "loss": 1.2152, + "loss/crossentropy": 3.030984878540039, + "loss/hidden": 1.0625, + "loss/logits": 0.15236616134643555, + "loss/reg": 3.563678910722956e-05, + "step": 114 + }, + { + "epoch": 0.014375, + "grad_norm": 2.0870068073272705, + "grad_norm_var": 0.8786817926388901, + "learning_rate": 0.0001, + "loss": 1.2595, + "loss/crossentropy": 2.4538302421569824, + "loss/hidden": 1.109375, + "loss/logits": 0.1497730016708374, + "loss/reg": 3.5631266655400395e-05, + "step": 115 + }, + { + "epoch": 0.0145, + "grad_norm": 3.1647448539733887, + "grad_norm_var": 0.9025786275056549, + "learning_rate": 0.0001, + "loss": 1.6928, + "loss/crossentropy": 2.205573320388794, + "loss/hidden": 1.3984375, + "loss/logits": 0.29398053884506226, + "loss/reg": 3.562564597814344e-05, + "step": 116 + }, + { + "epoch": 0.014625, + "grad_norm": 1.8475593328475952, + "grad_norm_var": 0.9201723703583595, + "learning_rate": 0.0001, + "loss": 1.1518, + "loss/crossentropy": 2.5012142658233643, + "loss/hidden": 1.0078125, + "loss/logits": 0.14361616969108582, + "loss/reg": 3.562155688996427e-05, + "step": 117 + }, + { + "epoch": 0.01475, + "grad_norm": 1.858892798423767, + "grad_norm_var": 0.9438422080188066, + "learning_rate": 0.0001, + "loss": 1.1388, + "loss/crossentropy": 2.5725672245025635, + "loss/hidden": 1.0078125, + "loss/logits": 0.1306220442056656, + "loss/reg": 3.561788616934791e-05, + "step": 118 + }, + { + "epoch": 0.014875, + "grad_norm": 2.2440059185028076, + "grad_norm_var": 0.9153389246133348, + "learning_rate": 0.0001, + "loss": 1.4255, + "loss/crossentropy": 2.341083288192749, + "loss/hidden": 1.2421875, + "loss/logits": 0.1829112321138382, + "loss/reg": 3.561256380635314e-05, + "step": 119 + }, + { + "epoch": 0.015, + "grad_norm": 2.1901657581329346, + "grad_norm_var": 0.9005062112002877, + "learning_rate": 0.0001, + "loss": 1.2391, + "loss/crossentropy": 2.362614631652832, + "loss/hidden": 1.0703125, + "loss/logits": 0.16847620904445648, + "loss/reg": 3.56065938831307e-05, + "step": 120 + }, + { + "epoch": 0.015125, + "grad_norm": 5.200242519378662, + "grad_norm_var": 0.830131887163558, + "learning_rate": 0.0001, + "loss": 1.2103, + "loss/crossentropy": 2.6675992012023926, + "loss/hidden": 1.0546875, + "loss/logits": 0.15525725483894348, + "loss/reg": 3.5602170100901276e-05, + "step": 121 + }, + { + "epoch": 0.01525, + "grad_norm": 2.407500982284546, + "grad_norm_var": 0.8190810626824183, + "learning_rate": 0.0001, + "loss": 1.2234, + "loss/crossentropy": 2.56121563911438, + "loss/hidden": 1.0703125, + "loss/logits": 0.15273353457450867, + "loss/reg": 3.559728429536335e-05, + "step": 122 + }, + { + "epoch": 0.015375, + "grad_norm": 1.8797663450241089, + "grad_norm_var": 0.8174111264801723, + "learning_rate": 0.0001, + "loss": 1.1099, + "loss/crossentropy": 2.4745869636535645, + "loss/hidden": 0.98046875, + "loss/logits": 0.12910515069961548, + "loss/reg": 3.559128526831046e-05, + "step": 123 + }, + { + "epoch": 0.0155, + "grad_norm": 2.1494781970977783, + "grad_norm_var": 0.8201521751832492, + "learning_rate": 0.0001, + "loss": 1.3179, + "loss/crossentropy": 2.279508590698242, + "loss/hidden": 1.1484375, + "loss/logits": 0.1690721958875656, + "loss/reg": 3.558437674655579e-05, + "step": 124 + }, + { + "epoch": 0.015625, + "grad_norm": 9.331904411315918, + "grad_norm_var": 3.67345953379431, + "learning_rate": 0.0001, + "loss": 1.3993, + "loss/crossentropy": 2.252732038497925, + "loss/hidden": 1.2578125, + "loss/logits": 0.14117392897605896, + "loss/reg": 3.5579581890488043e-05, + "step": 125 + }, + { + "epoch": 0.01575, + "grad_norm": 4.699957847595215, + "grad_norm_var": 3.8228479802693203, + "learning_rate": 0.0001, + "loss": 1.2831, + "loss/crossentropy": 2.5877878665924072, + "loss/hidden": 1.15625, + "loss/logits": 0.12651070952415466, + "loss/reg": 3.557529635145329e-05, + "step": 126 + }, + { + "epoch": 0.015875, + "grad_norm": 1.8446160554885864, + "grad_norm_var": 3.9257773899168873, + "learning_rate": 0.0001, + "loss": 1.2999, + "loss/crossentropy": 2.2898948192596436, + "loss/hidden": 1.140625, + "loss/logits": 0.15888020396232605, + "loss/reg": 3.557029049261473e-05, + "step": 127 + }, + { + "epoch": 0.016, + "grad_norm": 1.873570203781128, + "grad_norm_var": 3.9466365178432232, + "learning_rate": 0.0001, + "loss": 1.1097, + "loss/crossentropy": 2.392472743988037, + "loss/hidden": 0.9765625, + "loss/logits": 0.13277310132980347, + "loss/reg": 3.556452429620549e-05, + "step": 128 + }, + { + "epoch": 0.016125, + "grad_norm": 2.215426445007324, + "grad_norm_var": 3.92104303267346, + "learning_rate": 0.0001, + "loss": 1.2203, + "loss/crossentropy": 2.457443952560425, + "loss/hidden": 1.0546875, + "loss/logits": 0.16524553298950195, + "loss/reg": 3.555676812538877e-05, + "step": 129 + }, + { + "epoch": 0.01625, + "grad_norm": 1.7194857597351074, + "grad_norm_var": 3.9751548455277104, + "learning_rate": 0.0001, + "loss": 1.1788, + "loss/crossentropy": 2.3001158237457275, + "loss/hidden": 1.03125, + "loss/logits": 0.1472093164920807, + "loss/reg": 3.5550358006730676e-05, + "step": 130 + }, + { + "epoch": 0.016375, + "grad_norm": 1.6397395133972168, + "grad_norm_var": 4.037312774164259, + "learning_rate": 0.0001, + "loss": 1.0336, + "loss/crossentropy": 2.54146146774292, + "loss/hidden": 0.91015625, + "loss/logits": 0.12307839095592499, + "loss/reg": 3.5546618164516985e-05, + "step": 131 + }, + { + "epoch": 0.0165, + "grad_norm": 1.9748913049697876, + "grad_norm_var": 4.082478037296671, + "learning_rate": 0.0001, + "loss": 1.104, + "loss/crossentropy": 2.562748670578003, + "loss/hidden": 0.97265625, + "loss/logits": 0.130945086479187, + "loss/reg": 3.5543002013582736e-05, + "step": 132 + }, + { + "epoch": 0.016625, + "grad_norm": 1.8674203157424927, + "grad_norm_var": 4.079934623823218, + "learning_rate": 0.0001, + "loss": 1.2082, + "loss/crossentropy": 2.6365339756011963, + "loss/hidden": 1.0625, + "loss/logits": 0.14538231492042542, + "loss/reg": 3.5538523661671206e-05, + "step": 133 + }, + { + "epoch": 0.01675, + "grad_norm": 2.3201568126678467, + "grad_norm_var": 4.03421067719521, + "learning_rate": 0.0001, + "loss": 1.2356, + "loss/crossentropy": 2.435370445251465, + "loss/hidden": 1.09375, + "loss/logits": 0.14145305752754211, + "loss/reg": 3.5533634218154475e-05, + "step": 134 + }, + { + "epoch": 0.016875, + "grad_norm": 2.4132328033447266, + "grad_norm_var": 4.022385903408254, + "learning_rate": 0.0001, + "loss": 1.3273, + "loss/crossentropy": 2.206634998321533, + "loss/hidden": 1.1796875, + "loss/logits": 0.14724516868591309, + "loss/reg": 3.552551061147824e-05, + "step": 135 + }, + { + "epoch": 0.017, + "grad_norm": 2.419842481613159, + "grad_norm_var": 4.005232252864962, + "learning_rate": 0.0001, + "loss": 1.1667, + "loss/crossentropy": 2.517561912536621, + "loss/hidden": 1.015625, + "loss/logits": 0.15071895718574524, + "loss/reg": 3.5521599784260616e-05, + "step": 136 + }, + { + "epoch": 0.017125, + "grad_norm": 2.716203212738037, + "grad_norm_var": 3.6198676372845124, + "learning_rate": 0.0001, + "loss": 1.1423, + "loss/crossentropy": 2.2819504737854004, + "loss/hidden": 1.015625, + "loss/logits": 0.12632890045642853, + "loss/reg": 3.551522604539059e-05, + "step": 137 + }, + { + "epoch": 0.01725, + "grad_norm": 2.166456460952759, + "grad_norm_var": 3.6334485092224402, + "learning_rate": 0.0001, + "loss": 1.0296, + "loss/crossentropy": 2.4271674156188965, + "loss/hidden": 0.91015625, + "loss/logits": 0.11908704042434692, + "loss/reg": 3.550978362909518e-05, + "step": 138 + }, + { + "epoch": 0.017375, + "grad_norm": 2.0737109184265137, + "grad_norm_var": 3.6145368084521117, + "learning_rate": 0.0001, + "loss": 1.0938, + "loss/crossentropy": 2.595165967941284, + "loss/hidden": 0.95703125, + "loss/logits": 0.13645675778388977, + "loss/reg": 3.550490873749368e-05, + "step": 139 + }, + { + "epoch": 0.0175, + "grad_norm": 2.0036461353302, + "grad_norm_var": 3.6268452557090196, + "learning_rate": 0.0001, + "loss": 1.0352, + "loss/crossentropy": 2.6407668590545654, + "loss/hidden": 0.91015625, + "loss/logits": 0.12472639232873917, + "loss/reg": 3.5499935620464385e-05, + "step": 140 + }, + { + "epoch": 0.017625, + "grad_norm": 3.4189512729644775, + "grad_norm_var": 0.5874364952131912, + "learning_rate": 0.0001, + "loss": 1.3364, + "loss/crossentropy": 2.8673250675201416, + "loss/hidden": 1.140625, + "loss/logits": 0.19542476534843445, + "loss/reg": 3.549545363057405e-05, + "step": 141 + }, + { + "epoch": 0.01775, + "grad_norm": 2.0884158611297607, + "grad_norm_var": 0.19036343785417903, + "learning_rate": 0.0001, + "loss": 1.1356, + "loss/crossentropy": 2.2495715618133545, + "loss/hidden": 1.0078125, + "loss/logits": 0.12745517492294312, + "loss/reg": 3.5490164009388536e-05, + "step": 142 + }, + { + "epoch": 0.017875, + "grad_norm": 2.4939138889312744, + "grad_norm_var": 0.1883496681179942, + "learning_rate": 0.0001, + "loss": 1.2224, + "loss/crossentropy": 2.3506898880004883, + "loss/hidden": 1.078125, + "loss/logits": 0.14388948678970337, + "loss/reg": 3.548476524883881e-05, + "step": 143 + }, + { + "epoch": 0.018, + "grad_norm": 2.634059190750122, + "grad_norm_var": 0.19009706439956606, + "learning_rate": 0.0001, + "loss": 1.308, + "loss/crossentropy": 2.422675371170044, + "loss/hidden": 1.1484375, + "loss/logits": 0.15922774374485016, + "loss/reg": 3.5479293728712946e-05, + "step": 144 + }, + { + "epoch": 0.018125, + "grad_norm": 2.9936301708221436, + "grad_norm_var": 0.22328614777820613, + "learning_rate": 0.0001, + "loss": 1.4104, + "loss/crossentropy": 2.5935611724853516, + "loss/hidden": 1.2109375, + "loss/logits": 0.19912970066070557, + "loss/reg": 3.547423330019228e-05, + "step": 145 + }, + { + "epoch": 0.01825, + "grad_norm": 3.1390833854675293, + "grad_norm_var": 0.23765955297996205, + "learning_rate": 0.0001, + "loss": 1.2747, + "loss/crossentropy": 2.4289345741271973, + "loss/hidden": 1.125, + "loss/logits": 0.14936049282550812, + "loss/reg": 3.54700350726489e-05, + "step": 146 + }, + { + "epoch": 0.018375, + "grad_norm": 2.4484870433807373, + "grad_norm_var": 0.19680489618343674, + "learning_rate": 0.0001, + "loss": 1.1176, + "loss/crossentropy": 2.519469976425171, + "loss/hidden": 0.98046875, + "loss/logits": 0.13677741587162018, + "loss/reg": 3.546685184119269e-05, + "step": 147 + }, + { + "epoch": 0.0185, + "grad_norm": 1.876994252204895, + "grad_norm_var": 0.20358269116957192, + "learning_rate": 0.0001, + "loss": 1.0701, + "loss/crossentropy": 2.669678211212158, + "loss/hidden": 0.9453125, + "loss/logits": 0.12444234639406204, + "loss/reg": 3.5462882806314155e-05, + "step": 148 + }, + { + "epoch": 0.018625, + "grad_norm": 2.704676628112793, + "grad_norm_var": 0.1832369663952557, + "learning_rate": 0.0001, + "loss": 1.2437, + "loss/crossentropy": 2.595327854156494, + "loss/hidden": 1.0625, + "loss/logits": 0.18086357414722443, + "loss/reg": 3.5459666833048686e-05, + "step": 149 + }, + { + "epoch": 0.01875, + "grad_norm": 1.9055484533309937, + "grad_norm_var": 0.20361674389210194, + "learning_rate": 0.0001, + "loss": 1.161, + "loss/crossentropy": 2.5460128784179688, + "loss/hidden": 1.015625, + "loss/logits": 0.1449938714504242, + "loss/reg": 3.545805884641595e-05, + "step": 150 + }, + { + "epoch": 0.018875, + "grad_norm": 2.7920961380004883, + "grad_norm_var": 0.20979331401592252, + "learning_rate": 0.0001, + "loss": 1.1624, + "loss/crossentropy": 2.2290139198303223, + "loss/hidden": 1.0078125, + "loss/logits": 0.15427884459495544, + "loss/reg": 3.545627259882167e-05, + "step": 151 + }, + { + "epoch": 0.019, + "grad_norm": 2.299669027328491, + "grad_norm_var": 0.21185582767360506, + "learning_rate": 0.0001, + "loss": 1.1007, + "loss/crossentropy": 2.5660064220428467, + "loss/hidden": 0.9609375, + "loss/logits": 0.13937082886695862, + "loss/reg": 3.5451499570626765e-05, + "step": 152 + }, + { + "epoch": 0.019125, + "grad_norm": 1.9452663660049438, + "grad_norm_var": 0.2252079205413636, + "learning_rate": 0.0001, + "loss": 1.1951, + "loss/crossentropy": 2.3395628929138184, + "loss/hidden": 1.0390625, + "loss/logits": 0.155635803937912, + "loss/reg": 3.544955688994378e-05, + "step": 153 + }, + { + "epoch": 0.01925, + "grad_norm": 1.8410539627075195, + "grad_norm_var": 0.24354386471799874, + "learning_rate": 0.0001, + "loss": 1.0907, + "loss/crossentropy": 2.5739612579345703, + "loss/hidden": 0.95703125, + "loss/logits": 0.13335567712783813, + "loss/reg": 3.544604260241613e-05, + "step": 154 + }, + { + "epoch": 0.019375, + "grad_norm": 2.2286977767944336, + "grad_norm_var": 0.23796766155861107, + "learning_rate": 0.0001, + "loss": 1.0203, + "loss/crossentropy": 2.2887346744537354, + "loss/hidden": 0.90625, + "loss/logits": 0.11365014314651489, + "loss/reg": 3.544157516444102e-05, + "step": 155 + }, + { + "epoch": 0.0195, + "grad_norm": 2.3557684421539307, + "grad_norm_var": 0.22589299419968203, + "learning_rate": 0.0001, + "loss": 1.3643, + "loss/crossentropy": 2.274764060974121, + "loss/hidden": 1.1875, + "loss/logits": 0.1764501929283142, + "loss/reg": 3.543913771864027e-05, + "step": 156 + }, + { + "epoch": 0.019625, + "grad_norm": 2.247559070587158, + "grad_norm_var": 0.15998786264861256, + "learning_rate": 0.0001, + "loss": 1.3654, + "loss/crossentropy": 2.2784736156463623, + "loss/hidden": 1.171875, + "loss/logits": 0.19315966963768005, + "loss/reg": 3.543505954439752e-05, + "step": 157 + }, + { + "epoch": 0.01975, + "grad_norm": 2.834188222885132, + "grad_norm_var": 0.16628359109999918, + "learning_rate": 0.0001, + "loss": 1.3386, + "loss/crossentropy": 2.509218454360962, + "loss/hidden": 1.15625, + "loss/logits": 0.18198764324188232, + "loss/reg": 3.54316653101705e-05, + "step": 158 + }, + { + "epoch": 0.019875, + "grad_norm": 2.1036031246185303, + "grad_norm_var": 0.17202571468130015, + "learning_rate": 0.0001, + "loss": 1.0503, + "loss/crossentropy": 2.4518606662750244, + "loss/hidden": 0.93359375, + "loss/logits": 0.11635103076696396, + "loss/reg": 3.542845297488384e-05, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 2.062805652618408, + "grad_norm_var": 0.17435755134033895, + "learning_rate": 0.0001, + "loss": 1.0312, + "loss/crossentropy": 2.361372470855713, + "loss/hidden": 0.91796875, + "loss/logits": 0.1128474771976471, + "loss/reg": 3.5423294320935383e-05, + "step": 160 + }, + { + "epoch": 0.020125, + "grad_norm": 2.021106004714966, + "grad_norm_var": 0.15146251895304388, + "learning_rate": 0.0001, + "loss": 1.1121, + "loss/crossentropy": 2.134568214416504, + "loss/hidden": 0.984375, + "loss/logits": 0.1273871660232544, + "loss/reg": 3.541701880749315e-05, + "step": 161 + }, + { + "epoch": 0.02025, + "grad_norm": 1.8616065979003906, + "grad_norm_var": 0.11060822886564707, + "learning_rate": 0.0001, + "loss": 1.1566, + "loss/crossentropy": 2.252749443054199, + "loss/hidden": 1.0234375, + "loss/logits": 0.1327974796295166, + "loss/reg": 3.541166370268911e-05, + "step": 162 + }, + { + "epoch": 0.020375, + "grad_norm": 3.580717086791992, + "grad_norm_var": 0.2251369893581473, + "learning_rate": 0.0001, + "loss": 1.7797, + "loss/crossentropy": 2.554025888442993, + "loss/hidden": 1.4921875, + "loss/logits": 0.2871723771095276, + "loss/reg": 3.54056573996786e-05, + "step": 163 + }, + { + "epoch": 0.0205, + "grad_norm": 1.9080392122268677, + "grad_norm_var": 0.22348213477058507, + "learning_rate": 0.0001, + "loss": 1.1281, + "loss/crossentropy": 2.561861038208008, + "loss/hidden": 0.984375, + "loss/logits": 0.14337776601314545, + "loss/reg": 3.53991927113384e-05, + "step": 164 + }, + { + "epoch": 0.020625, + "grad_norm": 2.122875213623047, + "grad_norm_var": 0.2127240754841674, + "learning_rate": 0.0001, + "loss": 1.1555, + "loss/crossentropy": 2.5679633617401123, + "loss/hidden": 1.0, + "loss/logits": 0.15509843826293945, + "loss/reg": 3.539249883033335e-05, + "step": 165 + }, + { + "epoch": 0.02075, + "grad_norm": 2.3634352684020996, + "grad_norm_var": 0.2043765165354652, + "learning_rate": 0.0001, + "loss": 1.2497, + "loss/crossentropy": 2.691157579421997, + "loss/hidden": 1.09375, + "loss/logits": 0.15564362704753876, + "loss/reg": 3.5388431570027024e-05, + "step": 166 + }, + { + "epoch": 0.020875, + "grad_norm": 1.8694658279418945, + "grad_norm_var": 0.1952630533114321, + "learning_rate": 0.0001, + "loss": 1.193, + "loss/crossentropy": 2.585261106491089, + "loss/hidden": 1.046875, + "loss/logits": 0.14572536945343018, + "loss/reg": 3.5384666261961684e-05, + "step": 167 + }, + { + "epoch": 0.021, + "grad_norm": 1.9730387926101685, + "grad_norm_var": 0.19880394057846942, + "learning_rate": 0.0001, + "loss": 1.2912, + "loss/crossentropy": 2.5292489528656006, + "loss/hidden": 1.125, + "loss/logits": 0.16588369011878967, + "loss/reg": 3.538179225870408e-05, + "step": 168 + }, + { + "epoch": 0.021125, + "grad_norm": 2.0155906677246094, + "grad_norm_var": 0.1966546350588845, + "learning_rate": 0.0001, + "loss": 1.2574, + "loss/crossentropy": 2.252187728881836, + "loss/hidden": 1.1015625, + "loss/logits": 0.1554747223854065, + "loss/reg": 3.537629891070537e-05, + "step": 169 + }, + { + "epoch": 0.02125, + "grad_norm": 2.432105302810669, + "grad_norm_var": 0.18926746622633459, + "learning_rate": 0.0001, + "loss": 1.4755, + "loss/crossentropy": 2.1912407875061035, + "loss/hidden": 1.3125, + "loss/logits": 0.1626225709915161, + "loss/reg": 3.5371955164009705e-05, + "step": 170 + }, + { + "epoch": 0.021375, + "grad_norm": 2.286074161529541, + "grad_norm_var": 0.18931952814725433, + "learning_rate": 0.0001, + "loss": 1.1264, + "loss/crossentropy": 2.7250216007232666, + "loss/hidden": 0.9921875, + "loss/logits": 0.13382771611213684, + "loss/reg": 3.53686700691469e-05, + "step": 171 + }, + { + "epoch": 0.0215, + "grad_norm": 2.3155455589294434, + "grad_norm_var": 0.18886613419825055, + "learning_rate": 0.0001, + "loss": 1.1029, + "loss/crossentropy": 2.54166316986084, + "loss/hidden": 0.96875, + "loss/logits": 0.13384276628494263, + "loss/reg": 3.536650910973549e-05, + "step": 172 + }, + { + "epoch": 0.021625, + "grad_norm": 2.488759994506836, + "grad_norm_var": 0.19242826239165894, + "learning_rate": 0.0001, + "loss": 1.2388, + "loss/crossentropy": 2.368736743927002, + "loss/hidden": 1.0859375, + "loss/logits": 0.1524895578622818, + "loss/reg": 3.5365450457902625e-05, + "step": 173 + }, + { + "epoch": 0.02175, + "grad_norm": 2.2716481685638428, + "grad_norm_var": 0.16950942206230835, + "learning_rate": 0.0001, + "loss": 1.1404, + "loss/crossentropy": 2.602968454360962, + "loss/hidden": 1.0, + "loss/logits": 0.14000558853149414, + "loss/reg": 3.5365239455131814e-05, + "step": 174 + }, + { + "epoch": 0.021875, + "grad_norm": 2.346731424331665, + "grad_norm_var": 0.16911372185244672, + "learning_rate": 0.0001, + "loss": 1.1505, + "loss/crossentropy": 2.6104869842529297, + "loss/hidden": 1.015625, + "loss/logits": 0.1345091462135315, + "loss/reg": 3.536231452017091e-05, + "step": 175 + }, + { + "epoch": 0.022, + "grad_norm": 9.636815071105957, + "grad_norm_var": 3.5705013839433035, + "learning_rate": 0.0001, + "loss": 1.9711, + "loss/crossentropy": 1.9007188081741333, + "loss/hidden": 1.8359375, + "loss/logits": 0.13476577401161194, + "loss/reg": 3.535941868904047e-05, + "step": 176 + }, + { + "epoch": 0.022125, + "grad_norm": 1.9420382976531982, + "grad_norm_var": 3.578242683123464, + "learning_rate": 0.0001, + "loss": 1.0553, + "loss/crossentropy": 2.1399552822113037, + "loss/hidden": 0.93359375, + "loss/logits": 0.12134355306625366, + "loss/reg": 3.535431460477412e-05, + "step": 177 + }, + { + "epoch": 0.02225, + "grad_norm": 3.67820405960083, + "grad_norm_var": 3.5781775866024454, + "learning_rate": 0.0001, + "loss": 1.6061, + "loss/crossentropy": 2.7716376781463623, + "loss/hidden": 1.3671875, + "loss/logits": 0.2385806441307068, + "loss/reg": 3.5349476092960685e-05, + "step": 178 + }, + { + "epoch": 0.022375, + "grad_norm": 2.345334768295288, + "grad_norm_var": 3.5494032480633924, + "learning_rate": 0.0001, + "loss": 1.3134, + "loss/crossentropy": 2.2584104537963867, + "loss/hidden": 1.1484375, + "loss/logits": 0.16456623375415802, + "loss/reg": 3.534728966769762e-05, + "step": 179 + }, + { + "epoch": 0.0225, + "grad_norm": 2.019059181213379, + "grad_norm_var": 3.5377143028114526, + "learning_rate": 0.0001, + "loss": 1.1633, + "loss/crossentropy": 2.688572645187378, + "loss/hidden": 1.03125, + "loss/logits": 0.13166731595993042, + "loss/reg": 3.5341858165338635e-05, + "step": 180 + }, + { + "epoch": 0.022625, + "grad_norm": 2.5575642585754395, + "grad_norm_var": 3.5127901367513408, + "learning_rate": 0.0001, + "loss": 1.3238, + "loss/crossentropy": 3.2461724281311035, + "loss/hidden": 1.1328125, + "loss/logits": 0.19062137603759766, + "loss/reg": 3.534007555572316e-05, + "step": 181 + }, + { + "epoch": 0.02275, + "grad_norm": 2.1583099365234375, + "grad_norm_var": 3.52691794996746, + "learning_rate": 0.0001, + "loss": 1.105, + "loss/crossentropy": 2.570775270462036, + "loss/hidden": 0.97265625, + "loss/logits": 0.13197766244411469, + "loss/reg": 3.5337754525244236e-05, + "step": 182 + }, + { + "epoch": 0.022875, + "grad_norm": 2.1373021602630615, + "grad_norm_var": 3.499205684128989, + "learning_rate": 0.0001, + "loss": 1.0368, + "loss/crossentropy": 2.7369937896728516, + "loss/hidden": 0.91015625, + "loss/logits": 0.12632793188095093, + "loss/reg": 3.5334065614733845e-05, + "step": 183 + }, + { + "epoch": 0.023, + "grad_norm": 1.9534015655517578, + "grad_norm_var": 3.501362961216583, + "learning_rate": 0.0001, + "loss": 1.2631, + "loss/crossentropy": 2.348998546600342, + "loss/hidden": 1.09375, + "loss/logits": 0.16902770102024078, + "loss/reg": 3.533027120283805e-05, + "step": 184 + }, + { + "epoch": 0.023125, + "grad_norm": 3.3518424034118652, + "grad_norm_var": 3.47560508461983, + "learning_rate": 0.0001, + "loss": 1.4757, + "loss/crossentropy": 2.2752151489257812, + "loss/hidden": 1.3125, + "loss/logits": 0.16288352012634277, + "loss/reg": 3.532712798914872e-05, + "step": 185 + }, + { + "epoch": 0.02325, + "grad_norm": 1.9095062017440796, + "grad_norm_var": 3.5231901050491348, + "learning_rate": 0.0001, + "loss": 1.1564, + "loss/crossentropy": 2.3877570629119873, + "loss/hidden": 0.9921875, + "loss/logits": 0.16389842331409454, + "loss/reg": 3.531980837578885e-05, + "step": 186 + }, + { + "epoch": 0.023375, + "grad_norm": 1.7745263576507568, + "grad_norm_var": 3.5771479932902293, + "learning_rate": 0.0001, + "loss": 1.0993, + "loss/crossentropy": 2.5461585521698, + "loss/hidden": 0.96484375, + "loss/logits": 0.13407567143440247, + "loss/reg": 3.531064066919498e-05, + "step": 187 + }, + { + "epoch": 0.0235, + "grad_norm": 1.932446002960205, + "grad_norm_var": 3.611343163184297, + "learning_rate": 0.0001, + "loss": 1.1479, + "loss/crossentropy": 2.177215099334717, + "loss/hidden": 1.0078125, + "loss/logits": 0.13969676196575165, + "loss/reg": 3.530231333570555e-05, + "step": 188 + }, + { + "epoch": 0.023625, + "grad_norm": 2.2318572998046875, + "grad_norm_var": 3.6254944343577464, + "learning_rate": 0.0001, + "loss": 1.1476, + "loss/crossentropy": 2.539461374282837, + "loss/hidden": 1.0078125, + "loss/logits": 0.13939380645751953, + "loss/reg": 3.5298002330819145e-05, + "step": 189 + }, + { + "epoch": 0.02375, + "grad_norm": 1.8733116388320923, + "grad_norm_var": 3.661635973864308, + "learning_rate": 0.0001, + "loss": 1.2894, + "loss/crossentropy": 2.3773810863494873, + "loss/hidden": 1.1484375, + "loss/logits": 0.14065586030483246, + "loss/reg": 3.529394234647043e-05, + "step": 190 + }, + { + "epoch": 0.023875, + "grad_norm": 1.9819684028625488, + "grad_norm_var": 3.689103451615234, + "learning_rate": 0.0001, + "loss": 1.0824, + "loss/crossentropy": 2.652743101119995, + "loss/hidden": 0.95703125, + "loss/logits": 0.12501415610313416, + "loss/reg": 3.528552406351082e-05, + "step": 191 + }, + { + "epoch": 0.024, + "grad_norm": 1.781873345375061, + "grad_norm_var": 0.29881303206394033, + "learning_rate": 0.0001, + "loss": 1.3252, + "loss/crossentropy": 2.130110740661621, + "loss/hidden": 1.1640625, + "loss/logits": 0.16076770424842834, + "loss/reg": 3.527875742292963e-05, + "step": 192 + }, + { + "epoch": 0.024125, + "grad_norm": 1.968166470527649, + "grad_norm_var": 0.29786371458498306, + "learning_rate": 0.0001, + "loss": 1.1787, + "loss/crossentropy": 2.607984781265259, + "loss/hidden": 1.0234375, + "loss/logits": 0.15494059026241302, + "loss/reg": 3.5275123082101345e-05, + "step": 193 + }, + { + "epoch": 0.02425, + "grad_norm": 2.250447988510132, + "grad_norm_var": 0.14927689793780866, + "learning_rate": 0.0001, + "loss": 1.2501, + "loss/crossentropy": 2.381725549697876, + "loss/hidden": 1.109375, + "loss/logits": 0.1403769701719284, + "loss/reg": 3.5266541090095416e-05, + "step": 194 + }, + { + "epoch": 0.024375, + "grad_norm": 2.3107409477233887, + "grad_norm_var": 0.14840081385512557, + "learning_rate": 0.0001, + "loss": 1.308, + "loss/crossentropy": 2.5593056678771973, + "loss/hidden": 1.15625, + "loss/logits": 0.15144123136997223, + "loss/reg": 3.526056025293656e-05, + "step": 195 + }, + { + "epoch": 0.0245, + "grad_norm": 2.0219268798828125, + "grad_norm_var": 0.14835622425891018, + "learning_rate": 0.0001, + "loss": 1.2059, + "loss/crossentropy": 2.591111421585083, + "loss/hidden": 1.0546875, + "loss/logits": 0.1508275270462036, + "loss/reg": 3.525464853737503e-05, + "step": 196 + }, + { + "epoch": 0.024625, + "grad_norm": 1.7184540033340454, + "grad_norm_var": 0.14533186557784786, + "learning_rate": 0.0001, + "loss": 1.1103, + "loss/crossentropy": 2.5513174533843994, + "loss/hidden": 0.98046875, + "loss/logits": 0.12951934337615967, + "loss/reg": 3.5250719520263374e-05, + "step": 197 + }, + { + "epoch": 0.02475, + "grad_norm": 2.099649429321289, + "grad_norm_var": 0.14497162965532903, + "learning_rate": 0.0001, + "loss": 1.3992, + "loss/crossentropy": 2.3214898109436035, + "loss/hidden": 1.2109375, + "loss/logits": 0.1879514902830124, + "loss/reg": 3.524802013998851e-05, + "step": 198 + }, + { + "epoch": 0.024875, + "grad_norm": 2.551090717315674, + "grad_norm_var": 0.15877433194770507, + "learning_rate": 0.0001, + "loss": 1.1497, + "loss/crossentropy": 2.0819451808929443, + "loss/hidden": 1.0, + "loss/logits": 0.14934971928596497, + "loss/reg": 3.524927524267696e-05, + "step": 199 + }, + { + "epoch": 0.025, + "grad_norm": 2.41422438621521, + "grad_norm_var": 0.1626121663513837, + "learning_rate": 0.0001, + "loss": 1.1887, + "loss/crossentropy": 2.4138712882995605, + "loss/hidden": 1.03125, + "loss/logits": 0.15712395310401917, + "loss/reg": 3.5251006920589134e-05, + "step": 200 + }, + { + "epoch": 0.025125, + "grad_norm": 3.0029654502868652, + "grad_norm_var": 0.11365057463783608, + "learning_rate": 0.0001, + "loss": 1.2376, + "loss/crossentropy": 2.3929851055145264, + "loss/hidden": 1.109375, + "loss/logits": 0.12789341807365417, + "loss/reg": 3.525133433868177e-05, + "step": 201 + }, + { + "epoch": 0.02525, + "grad_norm": 2.318742513656616, + "grad_norm_var": 0.1129624302912769, + "learning_rate": 0.0001, + "loss": 1.1832, + "loss/crossentropy": 2.4472897052764893, + "loss/hidden": 1.0234375, + "loss/logits": 0.15940618515014648, + "loss/reg": 3.525337888277136e-05, + "step": 202 + }, + { + "epoch": 0.025375, + "grad_norm": 2.432077169418335, + "grad_norm_var": 0.1079851047719283, + "learning_rate": 0.0001, + "loss": 1.2983, + "loss/crossentropy": 2.5350871086120605, + "loss/hidden": 1.125, + "loss/logits": 0.17297999560832977, + "loss/reg": 3.5248252970632166e-05, + "step": 203 + }, + { + "epoch": 0.0255, + "grad_norm": 2.019660711288452, + "grad_norm_var": 0.10557456561180795, + "learning_rate": 0.0001, + "loss": 1.1381, + "loss/crossentropy": 2.443376064300537, + "loss/hidden": 1.0078125, + "loss/logits": 0.1299603134393692, + "loss/reg": 3.5241089790361e-05, + "step": 204 + }, + { + "epoch": 0.025625, + "grad_norm": 1.928032636642456, + "grad_norm_var": 0.10948915785116071, + "learning_rate": 0.0001, + "loss": 1.039, + "loss/crossentropy": 2.552450656890869, + "loss/hidden": 0.9140625, + "loss/logits": 0.12461342662572861, + "loss/reg": 3.523936538840644e-05, + "step": 205 + }, + { + "epoch": 0.02575, + "grad_norm": 1.872836947441101, + "grad_norm_var": 0.10950776538443824, + "learning_rate": 0.0001, + "loss": 1.203, + "loss/crossentropy": 2.563770294189453, + "loss/hidden": 1.046875, + "loss/logits": 0.1558125615119934, + "loss/reg": 3.523280975059606e-05, + "step": 206 + }, + { + "epoch": 0.025875, + "grad_norm": 2.722188711166382, + "grad_norm_var": 0.12548596824484168, + "learning_rate": 0.0001, + "loss": 1.1048, + "loss/crossentropy": 2.673701286315918, + "loss/hidden": 0.95703125, + "loss/logits": 0.14743700623512268, + "loss/reg": 3.523009945638478e-05, + "step": 207 + }, + { + "epoch": 0.026, + "grad_norm": 2.174853563308716, + "grad_norm_var": 0.1125315287945383, + "learning_rate": 0.0001, + "loss": 1.2005, + "loss/crossentropy": 2.776305913925171, + "loss/hidden": 1.0390625, + "loss/logits": 0.16110967099666595, + "loss/reg": 3.5228087654104456e-05, + "step": 208 + }, + { + "epoch": 0.026125, + "grad_norm": 2.026176691055298, + "grad_norm_var": 0.11065571110427162, + "learning_rate": 0.0001, + "loss": 1.2427, + "loss/crossentropy": 2.4790477752685547, + "loss/hidden": 1.0859375, + "loss/logits": 0.15644526481628418, + "loss/reg": 3.5227625630795956e-05, + "step": 209 + }, + { + "epoch": 0.02625, + "grad_norm": 1.8512555360794067, + "grad_norm_var": 0.12013934057968918, + "learning_rate": 0.0001, + "loss": 1.1542, + "loss/crossentropy": 2.2648251056671143, + "loss/hidden": 1.015625, + "loss/logits": 0.1382524073123932, + "loss/reg": 3.5229088098276407e-05, + "step": 210 + }, + { + "epoch": 0.026375, + "grad_norm": 2.1204733848571777, + "grad_norm_var": 0.12001253969897234, + "learning_rate": 0.0001, + "loss": 1.1644, + "loss/crossentropy": 2.5878171920776367, + "loss/hidden": 1.0078125, + "loss/logits": 0.15627792477607727, + "loss/reg": 3.523128543747589e-05, + "step": 211 + }, + { + "epoch": 0.0265, + "grad_norm": 2.3862364292144775, + "grad_norm_var": 0.11943129282008957, + "learning_rate": 0.0001, + "loss": 1.2387, + "loss/crossentropy": 2.3441061973571777, + "loss/hidden": 1.0859375, + "loss/logits": 0.1524021029472351, + "loss/reg": 3.522722909110598e-05, + "step": 212 + }, + { + "epoch": 0.026625, + "grad_norm": 2.4904723167419434, + "grad_norm_var": 0.10428997507238075, + "learning_rate": 0.0001, + "loss": 1.2116, + "loss/crossentropy": 2.5467946529388428, + "loss/hidden": 1.0546875, + "loss/logits": 0.15654009580612183, + "loss/reg": 3.522184488247149e-05, + "step": 213 + }, + { + "epoch": 0.02675, + "grad_norm": 2.1831283569335938, + "grad_norm_var": 0.1027661689763948, + "learning_rate": 0.0001, + "loss": 1.1544, + "loss/crossentropy": 2.782853126525879, + "loss/hidden": 0.98828125, + "loss/logits": 0.16573229432106018, + "loss/reg": 3.521531107253395e-05, + "step": 214 + }, + { + "epoch": 0.026875, + "grad_norm": 3.787935972213745, + "grad_norm_var": 0.24293552641352203, + "learning_rate": 0.0001, + "loss": 1.5035, + "loss/crossentropy": 2.463303804397583, + "loss/hidden": 1.28125, + "loss/logits": 0.22189679741859436, + "loss/reg": 3.5209486668463796e-05, + "step": 215 + }, + { + "epoch": 0.027, + "grad_norm": 1.9294849634170532, + "grad_norm_var": 0.25400057735268855, + "learning_rate": 0.0001, + "loss": 1.1094, + "loss/crossentropy": 2.4277851581573486, + "loss/hidden": 0.97265625, + "loss/logits": 0.13635680079460144, + "loss/reg": 3.5205699532525614e-05, + "step": 216 + }, + { + "epoch": 0.027125, + "grad_norm": 1.9984577894210815, + "grad_norm_var": 0.22665186521847977, + "learning_rate": 0.0001, + "loss": 1.0918, + "loss/crossentropy": 2.5422215461730957, + "loss/hidden": 0.96484375, + "loss/logits": 0.12658366560935974, + "loss/reg": 3.519668462104164e-05, + "step": 217 + }, + { + "epoch": 0.02725, + "grad_norm": 2.2303996086120605, + "grad_norm_var": 0.2265080910144917, + "learning_rate": 0.0001, + "loss": 1.4173, + "loss/crossentropy": 2.6119143962860107, + "loss/hidden": 1.2109375, + "loss/logits": 0.2059965431690216, + "loss/reg": 3.519029269227758e-05, + "step": 218 + }, + { + "epoch": 0.027375, + "grad_norm": 2.595283031463623, + "grad_norm_var": 0.23192599234322203, + "learning_rate": 0.0001, + "loss": 1.4022, + "loss/crossentropy": 2.388324499130249, + "loss/hidden": 1.2265625, + "loss/logits": 0.17526502907276154, + "loss/reg": 3.5182933061150834e-05, + "step": 219 + }, + { + "epoch": 0.0275, + "grad_norm": 2.3104512691497803, + "grad_norm_var": 0.2275123342772699, + "learning_rate": 0.0001, + "loss": 1.1324, + "loss/crossentropy": 2.400674819946289, + "loss/hidden": 1.0078125, + "loss/logits": 0.1242409497499466, + "loss/reg": 3.517704681144096e-05, + "step": 220 + }, + { + "epoch": 0.027625, + "grad_norm": 2.0627379417419434, + "grad_norm_var": 0.2221815343350992, + "learning_rate": 0.0001, + "loss": 1.2552, + "loss/crossentropy": 2.4785444736480713, + "loss/hidden": 1.09375, + "loss/logits": 0.1611141413450241, + "loss/reg": 3.517186632961966e-05, + "step": 221 + }, + { + "epoch": 0.02775, + "grad_norm": 2.6437251567840576, + "grad_norm_var": 0.215787531953678, + "learning_rate": 0.0001, + "loss": 1.3035, + "loss/crossentropy": 2.35196590423584, + "loss/hidden": 1.0859375, + "loss/logits": 0.21720939874649048, + "loss/reg": 3.5167005989933386e-05, + "step": 222 + }, + { + "epoch": 0.027875, + "grad_norm": 2.0463922023773193, + "grad_norm_var": 0.21030634447597923, + "learning_rate": 0.0001, + "loss": 1.178, + "loss/crossentropy": 2.5125770568847656, + "loss/hidden": 1.03125, + "loss/logits": 0.14638057351112366, + "loss/reg": 3.516068682074547e-05, + "step": 223 + }, + { + "epoch": 0.028, + "grad_norm": 2.099517822265625, + "grad_norm_var": 0.2119416481519659, + "learning_rate": 0.0001, + "loss": 1.2246, + "loss/crossentropy": 2.6609790325164795, + "loss/hidden": 1.0703125, + "loss/logits": 0.1539691686630249, + "loss/reg": 3.51545459125191e-05, + "step": 224 + }, + { + "epoch": 0.028125, + "grad_norm": 2.2835323810577393, + "grad_norm_var": 0.20676636732833859, + "learning_rate": 0.0001, + "loss": 1.1949, + "loss/crossentropy": 2.408785581588745, + "loss/hidden": 1.046875, + "loss/logits": 0.14770297706127167, + "loss/reg": 3.514792115311138e-05, + "step": 225 + }, + { + "epoch": 0.02825, + "grad_norm": 2.2726855278015137, + "grad_norm_var": 0.19188050953051553, + "learning_rate": 0.0001, + "loss": 1.125, + "loss/crossentropy": 2.3164994716644287, + "loss/hidden": 0.98828125, + "loss/logits": 0.13638192415237427, + "loss/reg": 3.5143304558005184e-05, + "step": 226 + }, + { + "epoch": 0.028375, + "grad_norm": 2.239753484725952, + "grad_norm_var": 0.18927748053925839, + "learning_rate": 0.0001, + "loss": 1.4585, + "loss/crossentropy": 2.5264809131622314, + "loss/hidden": 1.2734375, + "loss/logits": 0.18467766046524048, + "loss/reg": 3.5138236853526905e-05, + "step": 227 + }, + { + "epoch": 0.0285, + "grad_norm": 2.5043678283691406, + "grad_norm_var": 0.1907596103376837, + "learning_rate": 0.0001, + "loss": 1.465, + "loss/crossentropy": 2.1979873180389404, + "loss/hidden": 1.2734375, + "loss/logits": 0.191162109375, + "loss/reg": 3.513501360430382e-05, + "step": 228 + }, + { + "epoch": 0.028625, + "grad_norm": 2.2842559814453125, + "grad_norm_var": 0.18968967595688752, + "learning_rate": 0.0001, + "loss": 1.1184, + "loss/crossentropy": 2.6084938049316406, + "loss/hidden": 0.984375, + "loss/logits": 0.1337008774280548, + "loss/reg": 3.5130418837070465e-05, + "step": 229 + }, + { + "epoch": 0.02875, + "grad_norm": 78.28353881835938, + "grad_norm_var": 360.5321235749958, + "learning_rate": 0.0001, + "loss": 1.287, + "loss/crossentropy": 2.642012119293213, + "loss/hidden": 1.1328125, + "loss/logits": 0.15383732318878174, + "loss/reg": 3.512646071612835e-05, + "step": 230 + }, + { + "epoch": 0.028875, + "grad_norm": 2.2934622764587402, + "grad_norm_var": 361.3313444069006, + "learning_rate": 0.0001, + "loss": 1.2146, + "loss/crossentropy": 2.125366687774658, + "loss/hidden": 1.078125, + "loss/logits": 0.13609249889850616, + "loss/reg": 3.51221788150724e-05, + "step": 231 + }, + { + "epoch": 0.029, + "grad_norm": 2.2334835529327393, + "grad_norm_var": 361.13139871490966, + "learning_rate": 0.0001, + "loss": 1.3334, + "loss/crossentropy": 2.4540159702301025, + "loss/hidden": 1.1640625, + "loss/logits": 0.16899245977401733, + "loss/reg": 3.511944305500947e-05, + "step": 232 + }, + { + "epoch": 0.029125, + "grad_norm": 2.025312900543213, + "grad_norm_var": 361.11344936137425, + "learning_rate": 0.0001, + "loss": 1.0377, + "loss/crossentropy": 2.4971330165863037, + "loss/hidden": 0.921875, + "loss/logits": 0.11546964198350906, + "loss/reg": 3.5114706406602636e-05, + "step": 233 + }, + { + "epoch": 0.02925, + "grad_norm": 1.8107097148895264, + "grad_norm_var": 361.39278859021084, + "learning_rate": 0.0001, + "loss": 1.0426, + "loss/crossentropy": 2.3869125843048096, + "loss/hidden": 0.92578125, + "loss/logits": 0.11641789972782135, + "loss/reg": 3.510946407914162e-05, + "step": 234 + }, + { + "epoch": 0.029375, + "grad_norm": 1.5672167539596558, + "grad_norm_var": 362.06253246288617, + "learning_rate": 0.0001, + "loss": 1.1107, + "loss/crossentropy": 2.239819049835205, + "loss/hidden": 0.984375, + "loss/logits": 0.12596073746681213, + "loss/reg": 3.510485475999303e-05, + "step": 235 + }, + { + "epoch": 0.0295, + "grad_norm": 1.660971999168396, + "grad_norm_var": 362.48937574795417, + "learning_rate": 0.0001, + "loss": 1.1466, + "loss/crossentropy": 2.445607900619507, + "loss/hidden": 1.0078125, + "loss/logits": 0.13841402530670166, + "loss/reg": 3.510040551191196e-05, + "step": 236 + }, + { + "epoch": 0.029625, + "grad_norm": 1.9987144470214844, + "grad_norm_var": 362.5308779292139, + "learning_rate": 0.0001, + "loss": 1.2824, + "loss/crossentropy": 2.3451719284057617, + "loss/hidden": 1.125, + "loss/logits": 0.15705101191997528, + "loss/reg": 3.50964764948003e-05, + "step": 237 + }, + { + "epoch": 0.02975, + "grad_norm": 1.6107144355773926, + "grad_norm_var": 363.18249781017846, + "learning_rate": 0.0001, + "loss": 1.094, + "loss/crossentropy": 2.4538111686706543, + "loss/hidden": 0.96484375, + "loss/logits": 0.12884217500686646, + "loss/reg": 3.5092118196189404e-05, + "step": 238 + }, + { + "epoch": 0.029875, + "grad_norm": 2.075155735015869, + "grad_norm_var": 363.1642193933474, + "learning_rate": 0.0001, + "loss": 1.215, + "loss/crossentropy": 2.531987190246582, + "loss/hidden": 1.0625, + "loss/logits": 0.1521448791027069, + "loss/reg": 3.5086912248516455e-05, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 2.3201041221618652, + "grad_norm_var": 363.0281972205138, + "learning_rate": 0.0001, + "loss": 1.1417, + "loss/crossentropy": 2.5911645889282227, + "loss/hidden": 0.98828125, + "loss/logits": 0.15311402082443237, + "loss/reg": 3.5083008697256446e-05, + "step": 240 + }, + { + "epoch": 0.030125, + "grad_norm": 2.102585554122925, + "grad_norm_var": 363.1402101869869, + "learning_rate": 0.0001, + "loss": 1.0786, + "loss/crossentropy": 2.350059747695923, + "loss/hidden": 0.9453125, + "loss/logits": 0.13292624056339264, + "loss/reg": 3.507927613100037e-05, + "step": 241 + }, + { + "epoch": 0.03025, + "grad_norm": 2.0235884189605713, + "grad_norm_var": 363.295456416674, + "learning_rate": 0.0001, + "loss": 1.191, + "loss/crossentropy": 2.2975988388061523, + "loss/hidden": 1.046875, + "loss/logits": 0.1438218355178833, + "loss/reg": 3.5071363527094945e-05, + "step": 242 + }, + { + "epoch": 0.030375, + "grad_norm": 2.6130146980285645, + "grad_norm_var": 363.0764814158417, + "learning_rate": 0.0001, + "loss": 1.2925, + "loss/crossentropy": 2.477764368057251, + "loss/hidden": 1.140625, + "loss/logits": 0.15148332715034485, + "loss/reg": 3.5061231756117195e-05, + "step": 243 + }, + { + "epoch": 0.0305, + "grad_norm": 2.840742826461792, + "grad_norm_var": 362.889192023, + "learning_rate": 0.0001, + "loss": 1.0848, + "loss/crossentropy": 2.4943950176239014, + "loss/hidden": 0.95703125, + "loss/logits": 0.12739571928977966, + "loss/reg": 3.505099084577523e-05, + "step": 244 + }, + { + "epoch": 0.030625, + "grad_norm": 2.412440776824951, + "grad_norm_var": 362.8120310886774, + "learning_rate": 0.0001, + "loss": 1.1284, + "loss/crossentropy": 2.5384373664855957, + "loss/hidden": 0.9921875, + "loss/logits": 0.1358700394630432, + "loss/reg": 3.504483902361244e-05, + "step": 245 + }, + { + "epoch": 0.03075, + "grad_norm": 2.105398654937744, + "grad_norm_var": 0.12231122414377561, + "learning_rate": 0.0001, + "loss": 1.2701, + "loss/crossentropy": 2.192657232284546, + "loss/hidden": 1.109375, + "loss/logits": 0.16039346158504486, + "loss/reg": 3.5036639019381255e-05, + "step": 246 + }, + { + "epoch": 0.030875, + "grad_norm": 1.9865885972976685, + "grad_norm_var": 0.12052054727502123, + "learning_rate": 0.0001, + "loss": 1.1144, + "loss/crossentropy": 2.396636486053467, + "loss/hidden": 0.97265625, + "loss/logits": 0.14134395122528076, + "loss/reg": 3.502915205899626e-05, + "step": 247 + }, + { + "epoch": 0.031, + "grad_norm": 2.064413547515869, + "grad_norm_var": 0.11899755252362822, + "learning_rate": 0.0001, + "loss": 1.2757, + "loss/crossentropy": 2.2669105529785156, + "loss/hidden": 1.109375, + "loss/logits": 0.1659543216228485, + "loss/reg": 3.502297113300301e-05, + "step": 248 + }, + { + "epoch": 0.031125, + "grad_norm": 2.341909170150757, + "grad_norm_var": 0.12311806681906787, + "learning_rate": 0.0001, + "loss": 1.1985, + "loss/crossentropy": 2.6345901489257812, + "loss/hidden": 1.03125, + "loss/logits": 0.16693958640098572, + "loss/reg": 3.501290120766498e-05, + "step": 249 + }, + { + "epoch": 0.03125, + "grad_norm": 2.0425026416778564, + "grad_norm_var": 0.11766230442624745, + "learning_rate": 0.0001, + "loss": 1.1968, + "loss/crossentropy": 2.4224119186401367, + "loss/hidden": 1.0390625, + "loss/logits": 0.15737830102443695, + "loss/reg": 3.500358798191883e-05, + "step": 250 + }, + { + "epoch": 0.031375, + "grad_norm": 2.139225482940674, + "grad_norm_var": 0.09668613014885802, + "learning_rate": 0.0001, + "loss": 1.2377, + "loss/crossentropy": 2.7258267402648926, + "loss/hidden": 1.09375, + "loss/logits": 0.14362195134162903, + "loss/reg": 3.499682497931644e-05, + "step": 251 + }, + { + "epoch": 0.0315, + "grad_norm": 2.102008581161499, + "grad_norm_var": 0.08031358514106011, + "learning_rate": 0.0001, + "loss": 1.1117, + "loss/crossentropy": 2.432748317718506, + "loss/hidden": 0.96875, + "loss/logits": 0.1425955444574356, + "loss/reg": 3.498911246424541e-05, + "step": 252 + }, + { + "epoch": 0.031625, + "grad_norm": 2.2371959686279297, + "grad_norm_var": 0.0783042488946918, + "learning_rate": 0.0001, + "loss": 1.0361, + "loss/crossentropy": 2.437335968017578, + "loss/hidden": 0.90234375, + "loss/logits": 0.13340801000595093, + "loss/reg": 3.498331716400571e-05, + "step": 253 + }, + { + "epoch": 0.03175, + "grad_norm": 2.230013608932495, + "grad_norm_var": 0.054557147559412954, + "learning_rate": 0.0001, + "loss": 1.2139, + "loss/crossentropy": 2.159069299697876, + "loss/hidden": 1.0859375, + "loss/logits": 0.12760058045387268, + "loss/reg": 3.497749275993556e-05, + "step": 254 + }, + { + "epoch": 0.031875, + "grad_norm": 2.2526209354400635, + "grad_norm_var": 0.05292534377042599, + "learning_rate": 0.0001, + "loss": 1.1517, + "loss/crossentropy": 2.744022846221924, + "loss/hidden": 1.0, + "loss/logits": 0.15138062834739685, + "loss/reg": 3.496619319776073e-05, + "step": 255 + }, + { + "epoch": 0.032, + "grad_norm": 2.243044376373291, + "grad_norm_var": 0.05245697188967089, + "learning_rate": 0.0001, + "loss": 1.6025, + "loss/crossentropy": 2.2367136478424072, + "loss/hidden": 1.3515625, + "loss/logits": 0.25056183338165283, + "loss/reg": 3.495947385090403e-05, + "step": 256 + }, + { + "epoch": 0.032125, + "grad_norm": 2.2301154136657715, + "grad_norm_var": 0.05124602164451577, + "learning_rate": 0.0001, + "loss": 1.1302, + "loss/crossentropy": 2.8784143924713135, + "loss/hidden": 0.99609375, + "loss/logits": 0.1337929368019104, + "loss/reg": 3.495061901048757e-05, + "step": 257 + }, + { + "epoch": 0.03225, + "grad_norm": 1.9009541273117065, + "grad_norm_var": 0.0557499358364358, + "learning_rate": 0.0001, + "loss": 1.1815, + "loss/crossentropy": 2.852105140686035, + "loss/hidden": 1.03125, + "loss/logits": 0.1498585343360901, + "loss/reg": 3.494451448204927e-05, + "step": 258 + }, + { + "epoch": 0.032375, + "grad_norm": 1.6674587726593018, + "grad_norm_var": 0.06383147372835059, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.6262168884277344, + "loss/hidden": 0.9921875, + "loss/logits": 0.1480187475681305, + "loss/reg": 3.493377516861074e-05, + "step": 259 + }, + { + "epoch": 0.0325, + "grad_norm": 2.1328914165496826, + "grad_norm_var": 0.03229453348446872, + "learning_rate": 0.0001, + "loss": 1.2748, + "loss/crossentropy": 2.944517135620117, + "loss/hidden": 1.1171875, + "loss/logits": 0.15721508860588074, + "loss/reg": 3.492645555525087e-05, + "step": 260 + }, + { + "epoch": 0.032625, + "grad_norm": 1.8784502744674683, + "grad_norm_var": 0.030045803407693878, + "learning_rate": 0.0001, + "loss": 1.0683, + "loss/crossentropy": 2.2746896743774414, + "loss/hidden": 0.9375, + "loss/logits": 0.1304292529821396, + "loss/reg": 3.4919979952974245e-05, + "step": 261 + }, + { + "epoch": 0.03275, + "grad_norm": 2.3806915283203125, + "grad_norm_var": 0.03508431327747174, + "learning_rate": 0.0001, + "loss": 1.0997, + "loss/crossentropy": 2.5303072929382324, + "loss/hidden": 0.96484375, + "loss/logits": 0.13454417884349823, + "loss/reg": 3.4913624403998256e-05, + "step": 262 + }, + { + "epoch": 0.032875, + "grad_norm": 1.8924614191055298, + "grad_norm_var": 0.03724188133506529, + "learning_rate": 0.0001, + "loss": 1.198, + "loss/crossentropy": 2.496983766555786, + "loss/hidden": 1.0390625, + "loss/logits": 0.15854007005691528, + "loss/reg": 3.490634844638407e-05, + "step": 263 + }, + { + "epoch": 0.033, + "grad_norm": 2.439052104949951, + "grad_norm_var": 0.04381194480349577, + "learning_rate": 0.0001, + "loss": 0.953, + "loss/crossentropy": 2.7149696350097656, + "loss/hidden": 0.83984375, + "loss/logits": 0.1127798855304718, + "loss/reg": 3.490111339488067e-05, + "step": 264 + }, + { + "epoch": 0.033125, + "grad_norm": 2.3024566173553467, + "grad_norm_var": 0.042804570962997876, + "learning_rate": 0.0001, + "loss": 1.1402, + "loss/crossentropy": 2.447719097137451, + "loss/hidden": 0.9765625, + "loss/logits": 0.1632998287677765, + "loss/reg": 3.489398295641877e-05, + "step": 265 + }, + { + "epoch": 0.03325, + "grad_norm": 2.287111520767212, + "grad_norm_var": 0.04370853447134184, + "learning_rate": 0.0001, + "loss": 1.2209, + "loss/crossentropy": 2.1921133995056152, + "loss/hidden": 1.0625, + "loss/logits": 0.15808838605880737, + "loss/reg": 3.48894864146132e-05, + "step": 266 + }, + { + "epoch": 0.033375, + "grad_norm": 2.336646318435669, + "grad_norm_var": 0.04599945823637951, + "learning_rate": 0.0001, + "loss": 1.2784, + "loss/crossentropy": 2.231534719467163, + "loss/hidden": 1.125, + "loss/logits": 0.15302959084510803, + "loss/reg": 3.488411311991513e-05, + "step": 267 + }, + { + "epoch": 0.0335, + "grad_norm": 1.9491949081420898, + "grad_norm_var": 0.04858091189580491, + "learning_rate": 0.0001, + "loss": 1.3204, + "loss/crossentropy": 2.3477442264556885, + "loss/hidden": 1.1484375, + "loss/logits": 0.17159268260002136, + "loss/reg": 3.488002403173596e-05, + "step": 268 + }, + { + "epoch": 0.033625, + "grad_norm": 1.8812212944030762, + "grad_norm_var": 0.05224458505779281, + "learning_rate": 0.0001, + "loss": 1.1176, + "loss/crossentropy": 2.3098113536834717, + "loss/hidden": 0.98046875, + "loss/logits": 0.13675528764724731, + "loss/reg": 3.487144203973003e-05, + "step": 269 + }, + { + "epoch": 0.03375, + "grad_norm": 2.1917343139648438, + "grad_norm_var": 0.05180158566938828, + "learning_rate": 0.0001, + "loss": 1.3389, + "loss/crossentropy": 2.6006360054016113, + "loss/hidden": 1.171875, + "loss/logits": 0.16665683686733246, + "loss/reg": 3.4867065551225096e-05, + "step": 270 + }, + { + "epoch": 0.033875, + "grad_norm": 2.1218273639678955, + "grad_norm_var": 0.050608227478554056, + "learning_rate": 0.0001, + "loss": 1.2115, + "loss/crossentropy": 2.3429975509643555, + "loss/hidden": 1.0546875, + "loss/logits": 0.15650612115859985, + "loss/reg": 3.486104469629936e-05, + "step": 271 + }, + { + "epoch": 0.034, + "grad_norm": 2.216865062713623, + "grad_norm_var": 0.05020309095007551, + "learning_rate": 0.0001, + "loss": 1.309, + "loss/crossentropy": 2.6172518730163574, + "loss/hidden": 1.125, + "loss/logits": 0.18367114663124084, + "loss/reg": 3.485478737275116e-05, + "step": 272 + }, + { + "epoch": 0.034125, + "grad_norm": 2.660276174545288, + "grad_norm_var": 0.06848105136911228, + "learning_rate": 0.0001, + "loss": 1.2656, + "loss/crossentropy": 2.6182620525360107, + "loss/hidden": 1.1015625, + "loss/logits": 0.16370661556720734, + "loss/reg": 3.4849643270717934e-05, + "step": 273 + }, + { + "epoch": 0.03425, + "grad_norm": 1.9411466121673584, + "grad_norm_var": 0.06730120648781887, + "learning_rate": 0.0001, + "loss": 1.2427, + "loss/crossentropy": 2.360733985900879, + "loss/hidden": 1.09375, + "loss/logits": 0.14862678945064545, + "loss/reg": 3.4842636523535475e-05, + "step": 274 + }, + { + "epoch": 0.034375, + "grad_norm": 7.836575031280518, + "grad_norm_var": 2.0552077515562965, + "learning_rate": 0.0001, + "loss": 2.1182, + "loss/crossentropy": 2.7246999740600586, + "loss/hidden": 1.8671875, + "loss/logits": 0.25068169832229614, + "loss/reg": 3.483570981188677e-05, + "step": 275 + }, + { + "epoch": 0.0345, + "grad_norm": 15.36082649230957, + "grad_norm_var": 12.29442028509769, + "learning_rate": 0.0001, + "loss": 1.2297, + "loss/crossentropy": 2.7898216247558594, + "loss/hidden": 1.0625, + "loss/logits": 0.16689400374889374, + "loss/reg": 3.482873580651358e-05, + "step": 276 + }, + { + "epoch": 0.034625, + "grad_norm": 2.4146132469177246, + "grad_norm_var": 12.206846506541835, + "learning_rate": 0.0001, + "loss": 1.1546, + "loss/crossentropy": 2.6433522701263428, + "loss/hidden": 1.0, + "loss/logits": 0.15420594811439514, + "loss/reg": 3.482148167677224e-05, + "step": 277 + }, + { + "epoch": 0.03475, + "grad_norm": 2.0731208324432373, + "grad_norm_var": 12.254080178741175, + "learning_rate": 0.0001, + "loss": 1.1779, + "loss/crossentropy": 2.2190020084381104, + "loss/hidden": 1.03125, + "loss/logits": 0.14627079665660858, + "loss/reg": 3.481503881630488e-05, + "step": 278 + }, + { + "epoch": 0.034875, + "grad_norm": 2.2058444023132324, + "grad_norm_var": 12.19851901002264, + "learning_rate": 0.0001, + "loss": 1.1722, + "loss/crossentropy": 2.3826961517333984, + "loss/hidden": 1.015625, + "loss/logits": 0.15618480741977692, + "loss/reg": 3.48087414749898e-05, + "step": 279 + }, + { + "epoch": 0.035, + "grad_norm": 1.8618927001953125, + "grad_norm_var": 12.292415025402823, + "learning_rate": 0.0001, + "loss": 1.2047, + "loss/crossentropy": 2.424729824066162, + "loss/hidden": 1.0390625, + "loss/logits": 0.16533657908439636, + "loss/reg": 3.479952283669263e-05, + "step": 280 + }, + { + "epoch": 0.035125, + "grad_norm": 1.9102482795715332, + "grad_norm_var": 12.356945094423846, + "learning_rate": 0.0001, + "loss": 1.2226, + "loss/crossentropy": 2.4433910846710205, + "loss/hidden": 1.0546875, + "loss/logits": 0.1675223708152771, + "loss/reg": 3.4793913073372096e-05, + "step": 281 + }, + { + "epoch": 0.03525, + "grad_norm": 2.706441640853882, + "grad_norm_var": 12.309734168758586, + "learning_rate": 0.0001, + "loss": 1.0941, + "loss/crossentropy": 3.0568315982818604, + "loss/hidden": 0.95703125, + "loss/logits": 0.13671937584877014, + "loss/reg": 3.478690632618964e-05, + "step": 282 + }, + { + "epoch": 0.035375, + "grad_norm": 2.22886061668396, + "grad_norm_var": 12.325085121884593, + "learning_rate": 0.0001, + "loss": 1.2228, + "loss/crossentropy": 2.31925892829895, + "loss/hidden": 1.0546875, + "loss/logits": 0.16773122549057007, + "loss/reg": 3.4781944123096764e-05, + "step": 283 + }, + { + "epoch": 0.0355, + "grad_norm": 1.9104151725769043, + "grad_norm_var": 12.332409456506062, + "learning_rate": 0.0001, + "loss": 1.2103, + "loss/crossentropy": 2.433910369873047, + "loss/hidden": 1.0625, + "loss/logits": 0.14740660786628723, + "loss/reg": 3.47744207829237e-05, + "step": 284 + }, + { + "epoch": 0.035625, + "grad_norm": 1.8624837398529053, + "grad_norm_var": 12.336088715902626, + "learning_rate": 0.0001, + "loss": 1.1048, + "loss/crossentropy": 2.308101177215576, + "loss/hidden": 0.95703125, + "loss/logits": 0.14743317663669586, + "loss/reg": 3.476895290077664e-05, + "step": 285 + }, + { + "epoch": 0.03575, + "grad_norm": 1.7985292673110962, + "grad_norm_var": 12.406159364169602, + "learning_rate": 0.0001, + "loss": 1.391, + "loss/crossentropy": 2.338219165802002, + "loss/hidden": 1.1875, + "loss/logits": 0.20311352610588074, + "loss/reg": 3.476293932180852e-05, + "step": 286 + }, + { + "epoch": 0.035875, + "grad_norm": 2.0390207767486572, + "grad_norm_var": 12.419809877029808, + "learning_rate": 0.0001, + "loss": 1.2142, + "loss/crossentropy": 2.798340320587158, + "loss/hidden": 1.0390625, + "loss/logits": 0.1748366355895996, + "loss/reg": 3.4757238609017804e-05, + "step": 287 + }, + { + "epoch": 0.036, + "grad_norm": 2.6159002780914307, + "grad_norm_var": 12.371378457752575, + "learning_rate": 0.0001, + "loss": 1.3686, + "loss/crossentropy": 2.0943591594696045, + "loss/hidden": 1.1953125, + "loss/logits": 0.1729019582271576, + "loss/reg": 3.475058838375844e-05, + "step": 288 + }, + { + "epoch": 0.036125, + "grad_norm": 2.64959716796875, + "grad_norm_var": 12.372352193512818, + "learning_rate": 0.0001, + "loss": 1.3369, + "loss/crossentropy": 2.875190019607544, + "loss/hidden": 1.1640625, + "loss/logits": 0.17247627675533295, + "loss/reg": 3.474393815849908e-05, + "step": 289 + }, + { + "epoch": 0.03625, + "grad_norm": 2.2474873065948486, + "grad_norm_var": 12.321143222954635, + "learning_rate": 0.0001, + "loss": 1.252, + "loss/crossentropy": 2.6608498096466064, + "loss/hidden": 1.078125, + "loss/logits": 0.17357373237609863, + "loss/reg": 3.473682954791002e-05, + "step": 290 + }, + { + "epoch": 0.036375, + "grad_norm": 2.4677608013153076, + "grad_norm_var": 10.916427124267383, + "learning_rate": 0.0001, + "loss": 1.0142, + "loss/crossentropy": 2.8202872276306152, + "loss/hidden": 0.89453125, + "loss/logits": 0.11931537836790085, + "loss/reg": 3.472894968581386e-05, + "step": 291 + }, + { + "epoch": 0.0365, + "grad_norm": 2.4355132579803467, + "grad_norm_var": 0.09359576037076062, + "learning_rate": 0.0001, + "loss": 1.344, + "loss/crossentropy": 2.588075876235962, + "loss/hidden": 1.171875, + "loss/logits": 0.17175744473934174, + "loss/reg": 3.4721750125754625e-05, + "step": 292 + }, + { + "epoch": 0.036625, + "grad_norm": 1.8848857879638672, + "grad_norm_var": 0.09698104319835413, + "learning_rate": 0.0001, + "loss": 1.0617, + "loss/crossentropy": 2.2032036781311035, + "loss/hidden": 0.93359375, + "loss/logits": 0.12777957320213318, + "loss/reg": 3.471899981377646e-05, + "step": 293 + }, + { + "epoch": 0.03675, + "grad_norm": 2.0544004440307617, + "grad_norm_var": 0.09727253081927305, + "learning_rate": 0.0001, + "loss": 1.1195, + "loss/crossentropy": 2.68400502204895, + "loss/hidden": 0.97265625, + "loss/logits": 0.14646826684474945, + "loss/reg": 3.471321178949438e-05, + "step": 294 + }, + { + "epoch": 0.036875, + "grad_norm": 2.784888744354248, + "grad_norm_var": 0.12022710970729276, + "learning_rate": 0.0001, + "loss": 1.4568, + "loss/crossentropy": 2.2672512531280518, + "loss/hidden": 1.265625, + "loss/logits": 0.19084087014198303, + "loss/reg": 3.4705888538155705e-05, + "step": 295 + }, + { + "epoch": 0.037, + "grad_norm": 2.107628345489502, + "grad_norm_var": 0.11239423391909416, + "learning_rate": 0.0001, + "loss": 1.1654, + "loss/crossentropy": 2.3829853534698486, + "loss/hidden": 1.015625, + "loss/logits": 0.14940626919269562, + "loss/reg": 3.470026422291994e-05, + "step": 296 + }, + { + "epoch": 0.037125, + "grad_norm": 2.666799783706665, + "grad_norm_var": 0.11576118522773501, + "learning_rate": 0.0001, + "loss": 1.2658, + "loss/crossentropy": 2.3234565258026123, + "loss/hidden": 1.0859375, + "loss/logits": 0.17954039573669434, + "loss/reg": 3.469527655397542e-05, + "step": 297 + }, + { + "epoch": 0.03725, + "grad_norm": 1.8957864046096802, + "grad_norm_var": 0.11060988429572352, + "learning_rate": 0.0001, + "loss": 1.305, + "loss/crossentropy": 2.4239273071289062, + "loss/hidden": 1.125, + "loss/logits": 0.17967185378074646, + "loss/reg": 3.469140938250348e-05, + "step": 298 + }, + { + "epoch": 0.037375, + "grad_norm": 2.1476354598999023, + "grad_norm_var": 0.1110142344328826, + "learning_rate": 0.0001, + "loss": 1.18, + "loss/crossentropy": 2.3623642921447754, + "loss/hidden": 1.0234375, + "loss/logits": 0.1561676412820816, + "loss/reg": 3.468482827884145e-05, + "step": 299 + }, + { + "epoch": 0.0375, + "grad_norm": 1.9932529926300049, + "grad_norm_var": 0.10799009738127907, + "learning_rate": 0.0001, + "loss": 1.1559, + "loss/crossentropy": 2.69016170501709, + "loss/hidden": 1.0078125, + "loss/logits": 0.14770537614822388, + "loss/reg": 3.468065187917091e-05, + "step": 300 + }, + { + "epoch": 0.037625, + "grad_norm": 2.2224810123443604, + "grad_norm_var": 0.0985346154888075, + "learning_rate": 0.0001, + "loss": 1.1653, + "loss/crossentropy": 2.553476572036743, + "loss/hidden": 1.0234375, + "loss/logits": 0.14147460460662842, + "loss/reg": 3.467235364951193e-05, + "step": 301 + }, + { + "epoch": 0.03775, + "grad_norm": 2.135714530944824, + "grad_norm_var": 0.08531074310993644, + "learning_rate": 0.0001, + "loss": 1.2046, + "loss/crossentropy": 2.243950128555298, + "loss/hidden": 1.0625, + "loss/logits": 0.1418030858039856, + "loss/reg": 3.466893394943327e-05, + "step": 302 + }, + { + "epoch": 0.037875, + "grad_norm": 2.3680195808410645, + "grad_norm_var": 0.08186467355099622, + "learning_rate": 0.0001, + "loss": 1.2752, + "loss/crossentropy": 2.2647523880004883, + "loss/hidden": 1.1328125, + "loss/logits": 0.1420516073703766, + "loss/reg": 3.4659868106245995e-05, + "step": 303 + }, + { + "epoch": 0.038, + "grad_norm": 2.08476185798645, + "grad_norm_var": 0.07658376607289051, + "learning_rate": 0.0001, + "loss": 1.1305, + "loss/crossentropy": 2.627105712890625, + "loss/hidden": 0.984375, + "loss/logits": 0.14580708742141724, + "loss/reg": 3.465437112026848e-05, + "step": 304 + }, + { + "epoch": 0.038125, + "grad_norm": 2.110761880874634, + "grad_norm_var": 0.06667962973879416, + "learning_rate": 0.0001, + "loss": 1.2576, + "loss/crossentropy": 2.3144562244415283, + "loss/hidden": 1.09375, + "loss/logits": 0.16346824169158936, + "loss/reg": 3.4645545383682474e-05, + "step": 305 + }, + { + "epoch": 0.03825, + "grad_norm": 1.900791049003601, + "grad_norm_var": 0.07317499342195574, + "learning_rate": 0.0001, + "loss": 1.0492, + "loss/crossentropy": 2.716005325317383, + "loss/hidden": 0.9140625, + "loss/logits": 0.13480325043201447, + "loss/reg": 3.463495886535384e-05, + "step": 306 + }, + { + "epoch": 0.038375, + "grad_norm": 6.467423439025879, + "grad_norm_var": 1.2137641430294792, + "learning_rate": 0.0001, + "loss": 1.4583, + "loss/crossentropy": 2.1065866947174072, + "loss/hidden": 1.2890625, + "loss/logits": 0.1689138412475586, + "loss/reg": 3.462713721091859e-05, + "step": 307 + }, + { + "epoch": 0.0385, + "grad_norm": 2.0642800331115723, + "grad_norm_var": 1.2232825060870915, + "learning_rate": 0.0001, + "loss": 1.185, + "loss/crossentropy": 2.624023675918579, + "loss/hidden": 1.03125, + "loss/logits": 0.1534431129693985, + "loss/reg": 3.4622189559740946e-05, + "step": 308 + }, + { + "epoch": 0.038625, + "grad_norm": 1.680816650390625, + "grad_norm_var": 1.2407335757806945, + "learning_rate": 0.0001, + "loss": 1.153, + "loss/crossentropy": 2.327143430709839, + "loss/hidden": 1.0078125, + "loss/logits": 0.14481112360954285, + "loss/reg": 3.461851520114578e-05, + "step": 309 + }, + { + "epoch": 0.03875, + "grad_norm": 1.7195234298706055, + "grad_norm_var": 1.2639701691366267, + "learning_rate": 0.0001, + "loss": 1.1146, + "loss/crossentropy": 2.234018087387085, + "loss/hidden": 0.97265625, + "loss/logits": 0.141597181558609, + "loss/reg": 3.461261803749949e-05, + "step": 310 + }, + { + "epoch": 0.038875, + "grad_norm": 1.9253605604171753, + "grad_norm_var": 1.265680677961886, + "learning_rate": 0.0001, + "loss": 1.1557, + "loss/crossentropy": 2.310030460357666, + "loss/hidden": 1.015625, + "loss/logits": 0.13973672688007355, + "loss/reg": 3.4606102417455986e-05, + "step": 311 + }, + { + "epoch": 0.039, + "grad_norm": 1.8293761014938354, + "grad_norm_var": 1.2792590983492156, + "learning_rate": 0.0001, + "loss": 1.1528, + "loss/crossentropy": 2.735854148864746, + "loss/hidden": 1.015625, + "loss/logits": 0.1367899477481842, + "loss/reg": 3.460055449977517e-05, + "step": 312 + }, + { + "epoch": 0.039125, + "grad_norm": 2.47273325920105, + "grad_norm_var": 1.2727893848260379, + "learning_rate": 0.0001, + "loss": 1.1983, + "loss/crossentropy": 2.4735970497131348, + "loss/hidden": 1.03125, + "loss/logits": 0.16670575737953186, + "loss/reg": 3.4597334888530895e-05, + "step": 313 + }, + { + "epoch": 0.03925, + "grad_norm": 2.4255082607269287, + "grad_norm_var": 1.2608122772144856, + "learning_rate": 0.0001, + "loss": 1.1323, + "loss/crossentropy": 2.6550486087799072, + "loss/hidden": 0.9765625, + "loss/logits": 0.1554185003042221, + "loss/reg": 3.4591066651046276e-05, + "step": 314 + }, + { + "epoch": 0.039375, + "grad_norm": 2.7170803546905518, + "grad_norm_var": 1.2659589390154238, + "learning_rate": 0.0001, + "loss": 1.439, + "loss/crossentropy": 2.187866687774658, + "loss/hidden": 1.21875, + "loss/logits": 0.21992294490337372, + "loss/reg": 3.458749415585771e-05, + "step": 315 + }, + { + "epoch": 0.0395, + "grad_norm": 2.1601176261901855, + "grad_norm_var": 1.259041909984486, + "learning_rate": 0.0001, + "loss": 1.3232, + "loss/crossentropy": 2.364377737045288, + "loss/hidden": 1.1484375, + "loss/logits": 0.1744486689567566, + "loss/reg": 3.458080755081028e-05, + "step": 316 + }, + { + "epoch": 0.039625, + "grad_norm": 1.5947940349578857, + "grad_norm_var": 1.2979203484203092, + "learning_rate": 0.0001, + "loss": 1.028, + "loss/crossentropy": 2.4233779907226562, + "loss/hidden": 0.91015625, + "loss/logits": 0.11754067242145538, + "loss/reg": 3.4571639844216406e-05, + "step": 317 + }, + { + "epoch": 0.03975, + "grad_norm": 1.9653059244155884, + "grad_norm_var": 1.3046851365567058, + "learning_rate": 0.0001, + "loss": 1.126, + "loss/crossentropy": 2.31756854057312, + "loss/hidden": 1.0, + "loss/logits": 0.12566252052783966, + "loss/reg": 3.456034028204158e-05, + "step": 318 + }, + { + "epoch": 0.039875, + "grad_norm": 2.036482572555542, + "grad_norm_var": 1.310445228246628, + "learning_rate": 0.0001, + "loss": 1.1971, + "loss/crossentropy": 2.3461310863494873, + "loss/hidden": 1.0234375, + "loss/logits": 0.17331476509571075, + "loss/reg": 3.455525802564807e-05, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 2.075100898742676, + "grad_norm_var": 1.3107569056456743, + "learning_rate": 0.0001, + "loss": 1.327, + "loss/crossentropy": 2.506412982940674, + "loss/hidden": 1.140625, + "loss/logits": 0.1859946846961975, + "loss/reg": 3.45489097526297e-05, + "step": 320 + }, + { + "epoch": 0.040125, + "grad_norm": 2.719728946685791, + "grad_norm_var": 1.3168160620395004, + "learning_rate": 0.0001, + "loss": 1.3389, + "loss/crossentropy": 2.4216549396514893, + "loss/hidden": 1.15625, + "loss/logits": 0.1822904348373413, + "loss/reg": 3.45378830388654e-05, + "step": 321 + }, + { + "epoch": 0.04025, + "grad_norm": 2.6060094833374023, + "grad_norm_var": 1.3047531355820505, + "learning_rate": 0.0001, + "loss": 1.5674, + "loss/crossentropy": 2.658222198486328, + "loss/hidden": 1.34375, + "loss/logits": 0.22325485944747925, + "loss/reg": 3.453323370194994e-05, + "step": 322 + }, + { + "epoch": 0.040375, + "grad_norm": 2.5935699939727783, + "grad_norm_var": 0.14371946682283335, + "learning_rate": 0.0001, + "loss": 1.1519, + "loss/crossentropy": 2.6234920024871826, + "loss/hidden": 1.015625, + "loss/logits": 0.13588842749595642, + "loss/reg": 3.452138480497524e-05, + "step": 323 + }, + { + "epoch": 0.0405, + "grad_norm": 2.4168694019317627, + "grad_norm_var": 0.1469136698932582, + "learning_rate": 0.0001, + "loss": 1.2903, + "loss/crossentropy": 2.3443264961242676, + "loss/hidden": 1.109375, + "loss/logits": 0.18053767085075378, + "loss/reg": 3.4516375308157876e-05, + "step": 324 + }, + { + "epoch": 0.040625, + "grad_norm": 2.598829746246338, + "grad_norm_var": 0.13803791478749894, + "learning_rate": 0.0001, + "loss": 1.0512, + "loss/crossentropy": 2.6923201084136963, + "loss/hidden": 0.93359375, + "loss/logits": 0.11729900538921356, + "loss/reg": 3.450660369708203e-05, + "step": 325 + }, + { + "epoch": 0.04075, + "grad_norm": 2.2705416679382324, + "grad_norm_var": 0.11870002646295455, + "learning_rate": 0.0001, + "loss": 1.3007, + "loss/crossentropy": 2.8326780796051025, + "loss/hidden": 1.1328125, + "loss/logits": 0.16750125586986542, + "loss/reg": 3.449714495218359e-05, + "step": 326 + }, + { + "epoch": 0.040875, + "grad_norm": 2.108572006225586, + "grad_norm_var": 0.11224555742265707, + "learning_rate": 0.0001, + "loss": 1.2583, + "loss/crossentropy": 2.4080569744110107, + "loss/hidden": 1.1015625, + "loss/logits": 0.15636944770812988, + "loss/reg": 3.449182622716762e-05, + "step": 327 + }, + { + "epoch": 0.041, + "grad_norm": 1.9010335206985474, + "grad_norm_var": 0.10819501908635042, + "learning_rate": 0.0001, + "loss": 1.1679, + "loss/crossentropy": 2.7714359760284424, + "loss/hidden": 1.015625, + "loss/logits": 0.15192916989326477, + "loss/reg": 3.448529605520889e-05, + "step": 328 + }, + { + "epoch": 0.041125, + "grad_norm": 2.413954973220825, + "grad_norm_var": 0.1069897618565714, + "learning_rate": 0.0001, + "loss": 1.4022, + "loss/crossentropy": 2.225947618484497, + "loss/hidden": 1.2109375, + "loss/logits": 0.19096189737319946, + "loss/reg": 3.4474134736228734e-05, + "step": 329 + }, + { + "epoch": 0.04125, + "grad_norm": 2.385904312133789, + "grad_norm_var": 0.1063601900492659, + "learning_rate": 0.0001, + "loss": 1.2387, + "loss/crossentropy": 2.4583194255828857, + "loss/hidden": 1.078125, + "loss/logits": 0.16027729213237762, + "loss/reg": 3.44689360645134e-05, + "step": 330 + }, + { + "epoch": 0.041375, + "grad_norm": 2.4138543605804443, + "grad_norm_var": 0.09464759263946097, + "learning_rate": 0.0001, + "loss": 1.2305, + "loss/crossentropy": 2.5432093143463135, + "loss/hidden": 1.0703125, + "loss/logits": 0.15986011922359467, + "loss/reg": 3.446204573265277e-05, + "step": 331 + }, + { + "epoch": 0.0415, + "grad_norm": 3.4917404651641846, + "grad_norm_var": 0.18662260281899326, + "learning_rate": 0.0001, + "loss": 1.4531, + "loss/crossentropy": 2.667515516281128, + "loss/hidden": 1.2578125, + "loss/logits": 0.19496265053749084, + "loss/reg": 3.445533729973249e-05, + "step": 332 + }, + { + "epoch": 0.041625, + "grad_norm": 2.6574389934539795, + "grad_norm_var": 0.15026464336703782, + "learning_rate": 0.0001, + "loss": 1.2525, + "loss/crossentropy": 2.8141376972198486, + "loss/hidden": 1.0859375, + "loss/logits": 0.16617870330810547, + "loss/reg": 3.444873073021881e-05, + "step": 333 + }, + { + "epoch": 0.04175, + "grad_norm": 2.011918783187866, + "grad_norm_var": 0.14759976834883393, + "learning_rate": 0.0001, + "loss": 1.2704, + "loss/crossentropy": 2.5743820667266846, + "loss/hidden": 1.109375, + "loss/logits": 0.16070207953453064, + "loss/reg": 3.444178946665488e-05, + "step": 334 + }, + { + "epoch": 0.041875, + "grad_norm": 2.0087714195251465, + "grad_norm_var": 0.14906053005454342, + "learning_rate": 0.0001, + "loss": 1.1449, + "loss/crossentropy": 2.510054111480713, + "loss/hidden": 1.0078125, + "loss/logits": 0.13675576448440552, + "loss/reg": 3.4435932320775464e-05, + "step": 335 + }, + { + "epoch": 0.042, + "grad_norm": 2.2362313270568848, + "grad_norm_var": 0.1433353693831899, + "learning_rate": 0.0001, + "loss": 1.1899, + "loss/crossentropy": 2.4818272590637207, + "loss/hidden": 1.03125, + "loss/logits": 0.15829679369926453, + "loss/reg": 3.443356399657205e-05, + "step": 336 + }, + { + "epoch": 0.042125, + "grad_norm": 2.767322301864624, + "grad_norm_var": 0.14533335584858278, + "learning_rate": 0.0001, + "loss": 1.4789, + "loss/crossentropy": 1.9489027261734009, + "loss/hidden": 1.2578125, + "loss/logits": 0.22079172730445862, + "loss/reg": 3.442970410105772e-05, + "step": 337 + }, + { + "epoch": 0.04225, + "grad_norm": 1.9009366035461426, + "grad_norm_var": 0.15987229719162357, + "learning_rate": 0.0001, + "loss": 1.0737, + "loss/crossentropy": 2.4470298290252686, + "loss/hidden": 0.9375, + "loss/logits": 0.13584166765213013, + "loss/reg": 3.44260515703354e-05, + "step": 338 + }, + { + "epoch": 0.042375, + "grad_norm": 2.4537954330444336, + "grad_norm_var": 0.1572266899389352, + "learning_rate": 0.0001, + "loss": 1.4332, + "loss/crossentropy": 2.3663222789764404, + "loss/hidden": 1.2421875, + "loss/logits": 0.1906408667564392, + "loss/reg": 3.442170054768212e-05, + "step": 339 + }, + { + "epoch": 0.0425, + "grad_norm": 2.032825469970703, + "grad_norm_var": 0.1644215429789795, + "learning_rate": 0.0001, + "loss": 1.1493, + "loss/crossentropy": 2.4772043228149414, + "loss/hidden": 1.0078125, + "loss/logits": 0.1411634385585785, + "loss/reg": 3.4417531423969194e-05, + "step": 340 + }, + { + "epoch": 0.042625, + "grad_norm": 2.3325209617614746, + "grad_norm_var": 0.1601377693951102, + "learning_rate": 0.0001, + "loss": 1.2114, + "loss/crossentropy": 2.45560622215271, + "loss/hidden": 1.0546875, + "loss/logits": 0.15634939074516296, + "loss/reg": 3.44164072885178e-05, + "step": 341 + }, + { + "epoch": 0.04275, + "grad_norm": 1.9114069938659668, + "grad_norm_var": 0.1713673299562344, + "learning_rate": 0.0001, + "loss": 1.0862, + "loss/crossentropy": 2.6129817962646484, + "loss/hidden": 0.9453125, + "loss/logits": 0.14053717255592346, + "loss/reg": 3.441024091443978e-05, + "step": 342 + }, + { + "epoch": 0.042875, + "grad_norm": 3.5132644176483154, + "grad_norm_var": 0.256165301144145, + "learning_rate": 0.0001, + "loss": 1.4695, + "loss/crossentropy": 3.366391658782959, + "loss/hidden": 1.265625, + "loss/logits": 0.203495591878891, + "loss/reg": 3.440545333432965e-05, + "step": 343 + }, + { + "epoch": 0.043, + "grad_norm": 1.859586238861084, + "grad_norm_var": 0.2590414795273373, + "learning_rate": 0.0001, + "loss": 1.1472, + "loss/crossentropy": 2.428865909576416, + "loss/hidden": 0.984375, + "loss/logits": 0.1624484360218048, + "loss/reg": 3.440186628722586e-05, + "step": 344 + }, + { + "epoch": 0.043125, + "grad_norm": 1.98198401927948, + "grad_norm_var": 0.2698694637418498, + "learning_rate": 0.0001, + "loss": 1.3233, + "loss/crossentropy": 2.302736282348633, + "loss/hidden": 1.140625, + "loss/logits": 0.18230174481868744, + "loss/reg": 3.439932697801851e-05, + "step": 345 + }, + { + "epoch": 0.04325, + "grad_norm": 2.3659989833831787, + "grad_norm_var": 0.2698585694015619, + "learning_rate": 0.0001, + "loss": 1.14, + "loss/crossentropy": 2.5037076473236084, + "loss/hidden": 1.0, + "loss/logits": 0.13970160484313965, + "loss/reg": 3.439432111917995e-05, + "step": 346 + }, + { + "epoch": 0.043375, + "grad_norm": 2.281691074371338, + "grad_norm_var": 0.2701990568843252, + "learning_rate": 0.0001, + "loss": 1.2372, + "loss/crossentropy": 2.4770443439483643, + "loss/hidden": 1.0859375, + "loss/logits": 0.15086981654167175, + "loss/reg": 3.439074498601258e-05, + "step": 347 + }, + { + "epoch": 0.0435, + "grad_norm": 2.166990041732788, + "grad_norm_var": 0.18050477852144595, + "learning_rate": 0.0001, + "loss": 1.32, + "loss/crossentropy": 2.5852904319763184, + "loss/hidden": 1.125, + "loss/logits": 0.19465678930282593, + "loss/reg": 3.438742714934051e-05, + "step": 348 + }, + { + "epoch": 0.043625, + "grad_norm": 1.9888639450073242, + "grad_norm_var": 0.17481059186203815, + "learning_rate": 0.0001, + "loss": 1.4304, + "loss/crossentropy": 2.2656476497650146, + "loss/hidden": 1.21875, + "loss/logits": 0.21130971610546112, + "loss/reg": 3.437901978031732e-05, + "step": 349 + }, + { + "epoch": 0.04375, + "grad_norm": 1.7951990365982056, + "grad_norm_var": 0.1842899236598953, + "learning_rate": 0.0001, + "loss": 1.1657, + "loss/crossentropy": 2.3542745113372803, + "loss/hidden": 1.0234375, + "loss/logits": 0.141954243183136, + "loss/reg": 3.437545819906518e-05, + "step": 350 + }, + { + "epoch": 0.043875, + "grad_norm": 2.0519394874572754, + "grad_norm_var": 0.18316277481239354, + "learning_rate": 0.0001, + "loss": 1.0528, + "loss/crossentropy": 2.401294231414795, + "loss/hidden": 0.91796875, + "loss/logits": 0.13452798128128052, + "loss/reg": 3.4367119951639324e-05, + "step": 351 + }, + { + "epoch": 0.044, + "grad_norm": 2.12829327583313, + "grad_norm_var": 0.18376578016818687, + "learning_rate": 0.0001, + "loss": 1.1034, + "loss/crossentropy": 2.637117624282837, + "loss/hidden": 0.9609375, + "loss/logits": 0.14211076498031616, + "loss/reg": 3.436086262809113e-05, + "step": 352 + }, + { + "epoch": 0.044125, + "grad_norm": 1.8479722738265991, + "grad_norm_var": 0.16959696182082518, + "learning_rate": 0.0001, + "loss": 1.1462, + "loss/crossentropy": 2.534714698791504, + "loss/hidden": 0.99609375, + "loss/logits": 0.14980709552764893, + "loss/reg": 3.435524195083417e-05, + "step": 353 + }, + { + "epoch": 0.04425, + "grad_norm": 1.7758541107177734, + "grad_norm_var": 0.17495091080606068, + "learning_rate": 0.0001, + "loss": 1.2699, + "loss/crossentropy": 2.484570026397705, + "loss/hidden": 1.109375, + "loss/logits": 0.16018345952033997, + "loss/reg": 3.434780228417367e-05, + "step": 354 + }, + { + "epoch": 0.044375, + "grad_norm": 2.274486541748047, + "grad_norm_var": 0.1698290651703018, + "learning_rate": 0.0001, + "loss": 1.4533, + "loss/crossentropy": 2.1426889896392822, + "loss/hidden": 1.2421875, + "loss/logits": 0.21076492965221405, + "loss/reg": 3.433872916502878e-05, + "step": 355 + }, + { + "epoch": 0.0445, + "grad_norm": 2.4361371994018555, + "grad_norm_var": 0.17400054735299186, + "learning_rate": 0.0001, + "loss": 1.4138, + "loss/crossentropy": 2.34684419631958, + "loss/hidden": 1.234375, + "loss/logits": 0.17912375926971436, + "loss/reg": 3.432907396927476e-05, + "step": 356 + }, + { + "epoch": 0.044625, + "grad_norm": 2.6452977657318115, + "grad_norm_var": 0.1869129455570799, + "learning_rate": 0.0001, + "loss": 1.131, + "loss/crossentropy": 2.5198328495025635, + "loss/hidden": 1.015625, + "loss/logits": 0.11500594019889832, + "loss/reg": 3.431808727327734e-05, + "step": 357 + }, + { + "epoch": 0.04475, + "grad_norm": 1.905686855316162, + "grad_norm_var": 0.18712675263565845, + "learning_rate": 0.0001, + "loss": 1.1227, + "loss/crossentropy": 2.7459700107574463, + "loss/hidden": 0.984375, + "loss/logits": 0.1379736065864563, + "loss/reg": 3.431065852055326e-05, + "step": 358 + }, + { + "epoch": 0.044875, + "grad_norm": 2.1955368518829346, + "grad_norm_var": 0.06293061471083418, + "learning_rate": 0.0001, + "loss": 1.2854, + "loss/crossentropy": 2.514496326446533, + "loss/hidden": 1.09375, + "loss/logits": 0.19134163856506348, + "loss/reg": 3.430093784118071e-05, + "step": 359 + }, + { + "epoch": 0.045, + "grad_norm": 2.074857711791992, + "grad_norm_var": 0.0587442988467273, + "learning_rate": 0.0001, + "loss": 1.2615, + "loss/crossentropy": 2.3883326053619385, + "loss/hidden": 1.078125, + "loss/logits": 0.18299797177314758, + "loss/reg": 3.428904165048152e-05, + "step": 360 + }, + { + "epoch": 0.045125, + "grad_norm": 3.084925651550293, + "grad_norm_var": 0.11450734924812096, + "learning_rate": 0.0001, + "loss": 1.4694, + "loss/crossentropy": 2.5634469985961914, + "loss/hidden": 1.265625, + "loss/logits": 0.20340915024280548, + "loss/reg": 3.4281070838915184e-05, + "step": 361 + }, + { + "epoch": 0.04525, + "grad_norm": 2.3508076667785645, + "grad_norm_var": 0.11416271928607671, + "learning_rate": 0.0001, + "loss": 1.2864, + "loss/crossentropy": 2.713895559310913, + "loss/hidden": 1.1171875, + "loss/logits": 0.1689026653766632, + "loss/reg": 3.4273252822458744e-05, + "step": 362 + }, + { + "epoch": 0.045375, + "grad_norm": 7.681649208068848, + "grad_norm_var": 2.004247231943063, + "learning_rate": 0.0001, + "loss": 1.3636, + "loss/crossentropy": 2.4041330814361572, + "loss/hidden": 1.203125, + "loss/logits": 0.1601409614086151, + "loss/reg": 3.426634430070408e-05, + "step": 363 + }, + { + "epoch": 0.0455, + "grad_norm": 1.973664402961731, + "grad_norm_var": 2.0158187368377836, + "learning_rate": 0.0001, + "loss": 1.1228, + "loss/crossentropy": 2.5436434745788574, + "loss/hidden": 0.98046875, + "loss/logits": 0.14196962118148804, + "loss/reg": 3.425794784561731e-05, + "step": 364 + }, + { + "epoch": 0.045625, + "grad_norm": 2.2556684017181396, + "grad_norm_var": 2.001615144920621, + "learning_rate": 0.0001, + "loss": 1.1705, + "loss/crossentropy": 2.5252158641815186, + "loss/hidden": 1.0078125, + "loss/logits": 0.162298783659935, + "loss/reg": 3.425147588131949e-05, + "step": 365 + }, + { + "epoch": 0.04575, + "grad_norm": 2.2237462997436523, + "grad_norm_var": 1.9711144098953564, + "learning_rate": 0.0001, + "loss": 1.1598, + "loss/crossentropy": 2.4940881729125977, + "loss/hidden": 1.015625, + "loss/logits": 0.14381377398967743, + "loss/reg": 3.4247070288984105e-05, + "step": 366 + }, + { + "epoch": 0.045875, + "grad_norm": 6.294942378997803, + "grad_norm_var": 2.8107703767931165, + "learning_rate": 0.0001, + "loss": 1.9233, + "loss/crossentropy": 2.4311485290527344, + "loss/hidden": 1.515625, + "loss/logits": 0.40729784965515137, + "loss/reg": 3.424140595598146e-05, + "step": 367 + }, + { + "epoch": 0.046, + "grad_norm": 2.1130430698394775, + "grad_norm_var": 2.8121951540684127, + "learning_rate": 0.0001, + "loss": 1.243, + "loss/crossentropy": 2.475876808166504, + "loss/hidden": 1.0703125, + "loss/logits": 0.17232248187065125, + "loss/reg": 3.424075111979619e-05, + "step": 368 + }, + { + "epoch": 0.046125, + "grad_norm": 1.9393197298049927, + "grad_norm_var": 2.80086684083605, + "learning_rate": 0.0001, + "loss": 1.1504, + "loss/crossentropy": 2.431164503097534, + "loss/hidden": 0.99609375, + "loss/logits": 0.1539495587348938, + "loss/reg": 3.4231870813528076e-05, + "step": 369 + }, + { + "epoch": 0.04625, + "grad_norm": 2.072326183319092, + "grad_norm_var": 2.764824687660132, + "learning_rate": 0.0001, + "loss": 1.209, + "loss/crossentropy": 2.686079502105713, + "loss/hidden": 1.0546875, + "loss/logits": 0.15395890176296234, + "loss/reg": 3.422388545004651e-05, + "step": 370 + }, + { + "epoch": 0.046375, + "grad_norm": 1.8920451402664185, + "grad_norm_var": 2.8030644353470513, + "learning_rate": 0.0001, + "loss": 1.1611, + "loss/crossentropy": 2.4512131214141846, + "loss/hidden": 1.0, + "loss/logits": 0.16072264313697815, + "loss/reg": 3.421401197556406e-05, + "step": 371 + }, + { + "epoch": 0.0465, + "grad_norm": 1.8197071552276611, + "grad_norm_var": 2.858464465681775, + "learning_rate": 0.0001, + "loss": 1.1394, + "loss/crossentropy": 2.6290316581726074, + "loss/hidden": 0.99609375, + "loss/logits": 0.1429995894432068, + "loss/reg": 3.4208966098958626e-05, + "step": 372 + }, + { + "epoch": 0.046625, + "grad_norm": 1.6114336252212524, + "grad_norm_var": 2.9442100668891373, + "learning_rate": 0.0001, + "loss": 1.1743, + "loss/crossentropy": 2.412721872329712, + "loss/hidden": 1.0234375, + "loss/logits": 0.15048110485076904, + "loss/reg": 3.419875429244712e-05, + "step": 373 + }, + { + "epoch": 0.04675, + "grad_norm": 2.3423984050750732, + "grad_norm_var": 2.908825389746772, + "learning_rate": 0.0001, + "loss": 1.1547, + "loss/crossentropy": 2.479085683822632, + "loss/hidden": 1.0078125, + "loss/logits": 0.14653661847114563, + "loss/reg": 3.419152199057862e-05, + "step": 374 + }, + { + "epoch": 0.046875, + "grad_norm": 2.2863998413085938, + "grad_norm_var": 2.9026800154510086, + "learning_rate": 0.0001, + "loss": 1.1562, + "loss/crossentropy": 2.6520957946777344, + "loss/hidden": 1.015625, + "loss/logits": 0.1402827501296997, + "loss/reg": 3.418369669816457e-05, + "step": 375 + }, + { + "epoch": 0.047, + "grad_norm": 3.5551681518554688, + "grad_norm_var": 2.906172521956549, + "learning_rate": 0.0001, + "loss": 1.3527, + "loss/crossentropy": 2.5694146156311035, + "loss/hidden": 1.140625, + "loss/logits": 0.21178269386291504, + "loss/reg": 3.417985135456547e-05, + "step": 376 + }, + { + "epoch": 0.047125, + "grad_norm": 2.4192416667938232, + "grad_norm_var": 2.912446952830265, + "learning_rate": 0.0001, + "loss": 1.1754, + "loss/crossentropy": 2.4164340496063232, + "loss/hidden": 1.03125, + "loss/logits": 0.14376118779182434, + "loss/reg": 3.4170752769568935e-05, + "step": 377 + }, + { + "epoch": 0.04725, + "grad_norm": 2.242144823074341, + "grad_norm_var": 2.91972157704962, + "learning_rate": 0.0001, + "loss": 1.1153, + "loss/crossentropy": 2.544785261154175, + "loss/hidden": 0.984375, + "loss/logits": 0.13062497973442078, + "loss/reg": 3.416725303395651e-05, + "step": 378 + }, + { + "epoch": 0.047375, + "grad_norm": 2.2893593311309814, + "grad_norm_var": 1.2237873306323004, + "learning_rate": 0.0001, + "loss": 1.3118, + "loss/crossentropy": 2.364673614501953, + "loss/hidden": 1.1328125, + "loss/logits": 0.17860937118530273, + "loss/reg": 3.416024992475286e-05, + "step": 379 + }, + { + "epoch": 0.0475, + "grad_norm": 2.2065534591674805, + "grad_norm_var": 1.2121325720205287, + "learning_rate": 0.0001, + "loss": 1.3261, + "loss/crossentropy": 2.0714166164398193, + "loss/hidden": 1.1484375, + "loss/logits": 0.17733046412467957, + "loss/reg": 3.415203173062764e-05, + "step": 380 + }, + { + "epoch": 0.047625, + "grad_norm": 12.950845718383789, + "grad_norm_var": 8.05178996682972, + "learning_rate": 0.0001, + "loss": 1.4789, + "loss/crossentropy": 2.1035971641540527, + "loss/hidden": 1.328125, + "loss/logits": 0.15040796995162964, + "loss/reg": 3.414938328205608e-05, + "step": 381 + }, + { + "epoch": 0.04775, + "grad_norm": 1.9148967266082764, + "grad_norm_var": 8.095531060395368, + "learning_rate": 0.0001, + "loss": 1.136, + "loss/crossentropy": 2.589541435241699, + "loss/hidden": 0.9921875, + "loss/logits": 0.14346718788146973, + "loss/reg": 3.41419035976287e-05, + "step": 382 + }, + { + "epoch": 0.047875, + "grad_norm": 2.309288263320923, + "grad_norm_var": 7.402131974293955, + "learning_rate": 0.0001, + "loss": 1.1656, + "loss/crossentropy": 2.601487398147583, + "loss/hidden": 1.015625, + "loss/logits": 0.1496451497077942, + "loss/reg": 3.413420563447289e-05, + "step": 383 + }, + { + "epoch": 0.048, + "grad_norm": 2.129903554916382, + "grad_norm_var": 7.400441847159771, + "learning_rate": 0.0001, + "loss": 1.2433, + "loss/crossentropy": 2.0785951614379883, + "loss/hidden": 1.1015625, + "loss/logits": 0.14143729209899902, + "loss/reg": 3.413164085941389e-05, + "step": 384 + }, + { + "epoch": 0.048125, + "grad_norm": 2.513585329055786, + "grad_norm_var": 7.349500066162391, + "learning_rate": 0.0001, + "loss": 1.46, + "loss/crossentropy": 2.378612756729126, + "loss/hidden": 1.25, + "loss/logits": 0.20967382192611694, + "loss/reg": 3.413192825973965e-05, + "step": 385 + }, + { + "epoch": 0.04825, + "grad_norm": 1.8059685230255127, + "grad_norm_var": 7.3836732232467055, + "learning_rate": 0.0001, + "loss": 1.091, + "loss/crossentropy": 2.4599719047546387, + "loss/hidden": 0.9609375, + "loss/logits": 0.12968455255031586, + "loss/reg": 3.412771911825985e-05, + "step": 386 + }, + { + "epoch": 0.048375, + "grad_norm": 1.9648966789245605, + "grad_norm_var": 7.374281548362966, + "learning_rate": 0.0001, + "loss": 1.0183, + "loss/crossentropy": 2.405507802963257, + "loss/hidden": 0.890625, + "loss/logits": 0.12728646397590637, + "loss/reg": 3.412525984458625e-05, + "step": 387 + }, + { + "epoch": 0.0485, + "grad_norm": 2.7891759872436523, + "grad_norm_var": 7.29369073112806, + "learning_rate": 0.0001, + "loss": 1.2625, + "loss/crossentropy": 2.564371347427368, + "loss/hidden": 1.078125, + "loss/logits": 0.18406376242637634, + "loss/reg": 3.4118053008569404e-05, + "step": 388 + }, + { + "epoch": 0.048625, + "grad_norm": 3.2247207164764404, + "grad_norm_var": 7.166662268117063, + "learning_rate": 0.0001, + "loss": 1.5145, + "loss/crossentropy": 2.9137074947357178, + "loss/hidden": 1.3125, + "loss/logits": 0.20163431763648987, + "loss/reg": 3.411182842683047e-05, + "step": 389 + }, + { + "epoch": 0.04875, + "grad_norm": 2.1028342247009277, + "grad_norm_var": 7.193139907597329, + "learning_rate": 0.0001, + "loss": 1.0431, + "loss/crossentropy": 2.8769431114196777, + "loss/hidden": 0.9140625, + "loss/logits": 0.1287393867969513, + "loss/reg": 3.410813951632008e-05, + "step": 390 + }, + { + "epoch": 0.048875, + "grad_norm": 2.3499820232391357, + "grad_norm_var": 7.186969405638869, + "learning_rate": 0.0001, + "loss": 1.2858, + "loss/crossentropy": 2.426356315612793, + "loss/hidden": 1.109375, + "loss/logits": 0.17607171833515167, + "loss/reg": 3.410330828046426e-05, + "step": 391 + }, + { + "epoch": 0.049, + "grad_norm": 1.912327527999878, + "grad_norm_var": 7.244567116261923, + "learning_rate": 0.0001, + "loss": 1.362, + "loss/crossentropy": 2.551398277282715, + "loss/hidden": 1.15625, + "loss/logits": 0.20536215603351593, + "loss/reg": 3.409969940548763e-05, + "step": 392 + }, + { + "epoch": 0.049125, + "grad_norm": 1.8643162250518799, + "grad_norm_var": 7.302740869176493, + "learning_rate": 0.0001, + "loss": 1.3451, + "loss/crossentropy": 2.4072647094726562, + "loss/hidden": 1.171875, + "loss/logits": 0.17286883294582367, + "loss/reg": 3.40941951435525e-05, + "step": 393 + }, + { + "epoch": 0.04925, + "grad_norm": 2.126138925552368, + "grad_norm_var": 7.313922412927239, + "learning_rate": 0.0001, + "loss": 1.1946, + "loss/crossentropy": 2.6316819190979004, + "loss/hidden": 1.0390625, + "loss/logits": 0.15518739819526672, + "loss/reg": 3.409163764445111e-05, + "step": 394 + }, + { + "epoch": 0.049375, + "grad_norm": 5.341352939605713, + "grad_norm_var": 7.646205880923245, + "learning_rate": 0.0001, + "loss": 1.3627, + "loss/crossentropy": 2.2951955795288086, + "loss/hidden": 1.171875, + "loss/logits": 0.19052964448928833, + "loss/reg": 3.408612610655837e-05, + "step": 395 + }, + { + "epoch": 0.0495, + "grad_norm": 1.841691017150879, + "grad_norm_var": 7.697707430188741, + "learning_rate": 0.0001, + "loss": 1.126, + "loss/crossentropy": 2.412738561630249, + "loss/hidden": 0.9765625, + "loss/logits": 0.14905960857868195, + "loss/reg": 3.408074189792387e-05, + "step": 396 + }, + { + "epoch": 0.049625, + "grad_norm": 2.309384346008301, + "grad_norm_var": 0.7576436792480905, + "learning_rate": 0.0001, + "loss": 1.403, + "loss/crossentropy": 2.3188250064849854, + "loss/hidden": 1.2421875, + "loss/logits": 0.16045960783958435, + "loss/reg": 3.407212716410868e-05, + "step": 397 + }, + { + "epoch": 0.04975, + "grad_norm": 2.7863593101501465, + "grad_norm_var": 0.7480129573726607, + "learning_rate": 0.0001, + "loss": 1.2891, + "loss/crossentropy": 2.3670787811279297, + "loss/hidden": 1.109375, + "loss/logits": 0.17938411235809326, + "loss/reg": 3.406599716981873e-05, + "step": 398 + }, + { + "epoch": 0.049875, + "grad_norm": 1.8358136415481567, + "grad_norm_var": 0.7715855741782158, + "learning_rate": 0.0001, + "loss": 1.1237, + "loss/crossentropy": 2.514230966567993, + "loss/hidden": 0.9921875, + "loss/logits": 0.13119591772556305, + "loss/reg": 3.4062337363138795e-05, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 2.350162982940674, + "grad_norm_var": 0.7657706364738013, + "learning_rate": 0.0001, + "loss": 1.2326, + "loss/crossentropy": 2.713247299194336, + "loss/hidden": 1.0625, + "loss/logits": 0.1697346568107605, + "loss/reg": 3.4054195566568524e-05, + "step": 400 + }, + { + "epoch": 0.050125, + "grad_norm": 2.63322377204895, + "grad_norm_var": 0.7677605659354261, + "learning_rate": 0.0001, + "loss": 1.1099, + "loss/crossentropy": 2.440999746322632, + "loss/hidden": 0.96484375, + "loss/logits": 0.14475254714488983, + "loss/reg": 3.404737435630523e-05, + "step": 401 + }, + { + "epoch": 0.05025, + "grad_norm": 1.7843867540359497, + "grad_norm_var": 0.769649818838896, + "learning_rate": 0.0001, + "loss": 1.1619, + "loss/crossentropy": 2.3848907947540283, + "loss/hidden": 1.0234375, + "loss/logits": 0.13809266686439514, + "loss/reg": 3.403963637538254e-05, + "step": 402 + }, + { + "epoch": 0.050375, + "grad_norm": 2.4891116619110107, + "grad_norm_var": 0.7528451996298959, + "learning_rate": 0.0001, + "loss": 1.4016, + "loss/crossentropy": 2.2915351390838623, + "loss/hidden": 1.2265625, + "loss/logits": 0.17465338110923767, + "loss/reg": 3.4031403629342094e-05, + "step": 403 + }, + { + "epoch": 0.0505, + "grad_norm": 2.6278252601623535, + "grad_norm_var": 0.7479028879806237, + "learning_rate": 0.0001, + "loss": 1.3323, + "loss/crossentropy": 2.657773494720459, + "loss/hidden": 1.140625, + "loss/logits": 0.19131596386432648, + "loss/reg": 3.402295624255203e-05, + "step": 404 + }, + { + "epoch": 0.050625, + "grad_norm": 2.6351633071899414, + "grad_norm_var": 0.7105926512095813, + "learning_rate": 0.0001, + "loss": 1.09, + "loss/crossentropy": 2.341869592666626, + "loss/hidden": 0.91796875, + "loss/logits": 0.1717243790626526, + "loss/reg": 3.401270805625245e-05, + "step": 405 + }, + { + "epoch": 0.05075, + "grad_norm": 1.9917317628860474, + "grad_norm_var": 0.7163125714595162, + "learning_rate": 0.0001, + "loss": 1.2231, + "loss/crossentropy": 2.5361015796661377, + "loss/hidden": 1.0625, + "loss/logits": 0.16022570431232452, + "loss/reg": 3.400079003768042e-05, + "step": 406 + }, + { + "epoch": 0.050875, + "grad_norm": 2.01811146736145, + "grad_norm_var": 0.7267341041079096, + "learning_rate": 0.0001, + "loss": 1.0135, + "loss/crossentropy": 2.3607399463653564, + "loss/hidden": 0.890625, + "loss/logits": 0.12249953299760818, + "loss/reg": 3.399254273972474e-05, + "step": 407 + }, + { + "epoch": 0.051, + "grad_norm": 2.469839096069336, + "grad_norm_var": 0.7092258078292549, + "learning_rate": 0.0001, + "loss": 1.3583, + "loss/crossentropy": 2.1284053325653076, + "loss/hidden": 1.1953125, + "loss/logits": 0.1626756489276886, + "loss/reg": 3.398398621357046e-05, + "step": 408 + }, + { + "epoch": 0.051125, + "grad_norm": 2.0157532691955566, + "grad_norm_var": 0.6989536122316408, + "learning_rate": 0.0001, + "loss": 1.2599, + "loss/crossentropy": 2.5868444442749023, + "loss/hidden": 1.09375, + "loss/logits": 0.16585640609264374, + "loss/reg": 3.3978514693444595e-05, + "step": 409 + }, + { + "epoch": 0.05125, + "grad_norm": 2.468764543533325, + "grad_norm_var": 0.6913355184321047, + "learning_rate": 0.0001, + "loss": 1.2353, + "loss/crossentropy": 2.4670286178588867, + "loss/hidden": 1.0703125, + "loss/logits": 0.16462820768356323, + "loss/reg": 3.397303225938231e-05, + "step": 410 + }, + { + "epoch": 0.051375, + "grad_norm": 1.657066822052002, + "grad_norm_var": 0.13160569161618738, + "learning_rate": 0.0001, + "loss": 1.1004, + "loss/crossentropy": 2.463649034500122, + "loss/hidden": 0.9609375, + "loss/logits": 0.13913306593894958, + "loss/reg": 3.397110413061455e-05, + "step": 411 + }, + { + "epoch": 0.0515, + "grad_norm": 2.3177919387817383, + "grad_norm_var": 0.12019285492734794, + "learning_rate": 0.0001, + "loss": 1.2397, + "loss/crossentropy": 2.44539737701416, + "loss/hidden": 1.0625, + "loss/logits": 0.17688173055648804, + "loss/reg": 3.396526153665036e-05, + "step": 412 + }, + { + "epoch": 0.051625, + "grad_norm": 2.2815206050872803, + "grad_norm_var": 0.12011142743010049, + "learning_rate": 0.0001, + "loss": 1.3311, + "loss/crossentropy": 2.4922685623168945, + "loss/hidden": 1.1796875, + "loss/logits": 0.15103884041309357, + "loss/reg": 3.396152169443667e-05, + "step": 413 + }, + { + "epoch": 0.05175, + "grad_norm": 2.062899351119995, + "grad_norm_var": 0.1032718534450779, + "learning_rate": 0.0001, + "loss": 1.1936, + "loss/crossentropy": 2.671872138977051, + "loss/hidden": 1.0390625, + "loss/logits": 0.15416079759597778, + "loss/reg": 3.3957923733396456e-05, + "step": 414 + }, + { + "epoch": 0.051875, + "grad_norm": 1.9539638757705688, + "grad_norm_var": 0.09797476372330317, + "learning_rate": 0.0001, + "loss": 1.1046, + "loss/crossentropy": 2.6380488872528076, + "loss/hidden": 0.97265625, + "loss/logits": 0.1315636783838272, + "loss/reg": 3.395261592231691e-05, + "step": 415 + }, + { + "epoch": 0.052, + "grad_norm": 2.2393951416015625, + "grad_norm_var": 0.09703828398074225, + "learning_rate": 0.0001, + "loss": 1.1526, + "loss/crossentropy": 2.6465938091278076, + "loss/hidden": 0.99609375, + "loss/logits": 0.1561371386051178, + "loss/reg": 3.394690065761097e-05, + "step": 416 + }, + { + "epoch": 0.052125, + "grad_norm": 2.3431386947631836, + "grad_norm_var": 0.08662086074431645, + "learning_rate": 0.0001, + "loss": 1.1849, + "loss/crossentropy": 2.598094940185547, + "loss/hidden": 1.0390625, + "loss/logits": 0.14550316333770752, + "loss/reg": 3.393869337742217e-05, + "step": 417 + }, + { + "epoch": 0.05225, + "grad_norm": 1.8472511768341064, + "grad_norm_var": 0.08330225189024129, + "learning_rate": 0.0001, + "loss": 1.1985, + "loss/crossentropy": 2.5920615196228027, + "loss/hidden": 1.0546875, + "loss/logits": 0.14346075057983398, + "loss/reg": 3.392928192624822e-05, + "step": 418 + }, + { + "epoch": 0.052375, + "grad_norm": 1.7260291576385498, + "grad_norm_var": 0.09167492136177748, + "learning_rate": 0.0001, + "loss": 1.0539, + "loss/crossentropy": 2.678154468536377, + "loss/hidden": 0.921875, + "loss/logits": 0.13166582584381104, + "loss/reg": 3.3915493986569345e-05, + "step": 419 + }, + { + "epoch": 0.0525, + "grad_norm": 2.1379270553588867, + "grad_norm_var": 0.0765096237299017, + "learning_rate": 0.0001, + "loss": 1.3037, + "loss/crossentropy": 2.554790735244751, + "loss/hidden": 1.140625, + "loss/logits": 0.16276058554649353, + "loss/reg": 3.390039273654111e-05, + "step": 420 + }, + { + "epoch": 0.052625, + "grad_norm": 2.773489475250244, + "grad_norm_var": 0.08692294666244584, + "learning_rate": 0.0001, + "loss": 1.286, + "loss/crossentropy": 2.5890371799468994, + "loss/hidden": 1.125, + "loss/logits": 0.16067233681678772, + "loss/reg": 3.3892716601258144e-05, + "step": 421 + }, + { + "epoch": 0.05275, + "grad_norm": 1.9473000764846802, + "grad_norm_var": 0.0879486532075814, + "learning_rate": 0.0001, + "loss": 1.1138, + "loss/crossentropy": 2.544093370437622, + "loss/hidden": 0.9765625, + "loss/logits": 0.13691496849060059, + "loss/reg": 3.38886420649942e-05, + "step": 422 + }, + { + "epoch": 0.052875, + "grad_norm": 2.065152406692505, + "grad_norm_var": 0.08731452126513635, + "learning_rate": 0.0001, + "loss": 1.0308, + "loss/crossentropy": 2.4369661808013916, + "loss/hidden": 0.90234375, + "loss/logits": 0.12809085845947266, + "loss/reg": 3.38791505782865e-05, + "step": 423 + }, + { + "epoch": 0.053, + "grad_norm": 1.76610267162323, + "grad_norm_var": 0.08771260345232476, + "learning_rate": 0.0001, + "loss": 1.1559, + "loss/crossentropy": 2.437058210372925, + "loss/hidden": 1.015625, + "loss/logits": 0.13998199999332428, + "loss/reg": 3.387559627299197e-05, + "step": 424 + }, + { + "epoch": 0.053125, + "grad_norm": 2.1202235221862793, + "grad_norm_var": 0.08721813960099964, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.5524604320526123, + "loss/hidden": 0.98046875, + "loss/logits": 0.15982511639595032, + "loss/reg": 3.3869648177642375e-05, + "step": 425 + }, + { + "epoch": 0.05325, + "grad_norm": 2.3238697052001953, + "grad_norm_var": 0.08153644484325746, + "learning_rate": 0.0001, + "loss": 1.2742, + "loss/crossentropy": 2.6345760822296143, + "loss/hidden": 1.1171875, + "loss/logits": 0.15670067071914673, + "loss/reg": 3.386356183909811e-05, + "step": 426 + }, + { + "epoch": 0.053375, + "grad_norm": 1.8629034757614136, + "grad_norm_var": 0.07209149684443308, + "learning_rate": 0.0001, + "loss": 1.1243, + "loss/crossentropy": 2.4134674072265625, + "loss/hidden": 0.96875, + "loss/logits": 0.1552024483680725, + "loss/reg": 3.385494346730411e-05, + "step": 427 + }, + { + "epoch": 0.0535, + "grad_norm": 2.185732841491699, + "grad_norm_var": 0.06953255529498938, + "learning_rate": 0.0001, + "loss": 1.3593, + "loss/crossentropy": 2.088414430618286, + "loss/hidden": 1.203125, + "loss/logits": 0.15583863854408264, + "loss/reg": 3.384939191164449e-05, + "step": 428 + }, + { + "epoch": 0.053625, + "grad_norm": 2.0822196006774902, + "grad_norm_var": 0.06725276287184746, + "learning_rate": 0.0001, + "loss": 1.1775, + "loss/crossentropy": 2.2571120262145996, + "loss/hidden": 1.03125, + "loss/logits": 0.1458958387374878, + "loss/reg": 3.3843141864053905e-05, + "step": 429 + }, + { + "epoch": 0.05375, + "grad_norm": 1.9582529067993164, + "grad_norm_var": 0.06831322983159846, + "learning_rate": 0.0001, + "loss": 1.2637, + "loss/crossentropy": 2.273467779159546, + "loss/hidden": 1.09375, + "loss/logits": 0.16964468359947205, + "loss/reg": 3.38399586325977e-05, + "step": 430 + }, + { + "epoch": 0.053875, + "grad_norm": 2.5428881645202637, + "grad_norm_var": 0.07983358220816962, + "learning_rate": 0.0001, + "loss": 1.08, + "loss/crossentropy": 2.6513051986694336, + "loss/hidden": 0.94921875, + "loss/logits": 0.13046178221702576, + "loss/reg": 3.3836400689324364e-05, + "step": 431 + }, + { + "epoch": 0.054, + "grad_norm": 5.832214832305908, + "grad_norm_var": 0.9437448574938159, + "learning_rate": 0.0001, + "loss": 1.9424, + "loss/crossentropy": 2.6202034950256348, + "loss/hidden": 1.640625, + "loss/logits": 0.30144432187080383, + "loss/reg": 3.383049988769926e-05, + "step": 432 + }, + { + "epoch": 0.054125, + "grad_norm": 17.29625129699707, + "grad_norm_var": 14.915418371233736, + "learning_rate": 0.0001, + "loss": 1.369, + "loss/crossentropy": 2.5406692028045654, + "loss/hidden": 1.1953125, + "loss/logits": 0.17335116863250732, + "loss/reg": 3.382577415322885e-05, + "step": 433 + }, + { + "epoch": 0.05425, + "grad_norm": 2.1661648750305176, + "grad_norm_var": 14.860884296803356, + "learning_rate": 0.0001, + "loss": 1.3231, + "loss/crossentropy": 2.486368417739868, + "loss/hidden": 1.1328125, + "loss/logits": 0.18998079001903534, + "loss/reg": 3.381881833774969e-05, + "step": 434 + }, + { + "epoch": 0.054375, + "grad_norm": 3.2340309619903564, + "grad_norm_var": 14.686707047148603, + "learning_rate": 0.0001, + "loss": 1.1883, + "loss/crossentropy": 2.6120805740356445, + "loss/hidden": 1.03125, + "loss/logits": 0.15667462348937988, + "loss/reg": 3.381211718078703e-05, + "step": 435 + }, + { + "epoch": 0.0545, + "grad_norm": 2.4656379222869873, + "grad_norm_var": 14.638560696511908, + "learning_rate": 0.0001, + "loss": 1.2838, + "loss/crossentropy": 2.5453333854675293, + "loss/hidden": 1.1171875, + "loss/logits": 0.1663120836019516, + "loss/reg": 3.380520502105355e-05, + "step": 436 + }, + { + "epoch": 0.054625, + "grad_norm": 2.4808294773101807, + "grad_norm_var": 14.668903570755694, + "learning_rate": 0.0001, + "loss": 1.3034, + "loss/crossentropy": 2.254472255706787, + "loss/hidden": 1.09375, + "loss/logits": 0.20927491784095764, + "loss/reg": 3.3801999961724505e-05, + "step": 437 + }, + { + "epoch": 0.05475, + "grad_norm": 2.0809855461120605, + "grad_norm_var": 14.644204809831463, + "learning_rate": 0.0001, + "loss": 1.2904, + "loss/crossentropy": 2.387235403060913, + "loss/hidden": 1.125, + "loss/logits": 0.16507935523986816, + "loss/reg": 3.380004272912629e-05, + "step": 438 + }, + { + "epoch": 0.054875, + "grad_norm": 2.5827701091766357, + "grad_norm_var": 14.568551148225374, + "learning_rate": 0.0001, + "loss": 1.1913, + "loss/crossentropy": 2.5564305782318115, + "loss/hidden": 1.0390625, + "loss/logits": 0.15191948413848877, + "loss/reg": 3.379736881470308e-05, + "step": 439 + }, + { + "epoch": 0.055, + "grad_norm": 2.4748318195343018, + "grad_norm_var": 14.44211406577169, + "learning_rate": 0.0001, + "loss": 1.3339, + "loss/crossentropy": 2.169847249984741, + "loss/hidden": 1.140625, + "loss/logits": 0.1929139941930771, + "loss/reg": 3.379437475814484e-05, + "step": 440 + }, + { + "epoch": 0.055125, + "grad_norm": 2.17909836769104, + "grad_norm_var": 14.43165167732106, + "learning_rate": 0.0001, + "loss": 1.1514, + "loss/crossentropy": 2.6119625568389893, + "loss/hidden": 1.0, + "loss/logits": 0.1510833203792572, + "loss/reg": 3.3788579457905143e-05, + "step": 441 + }, + { + "epoch": 0.05525, + "grad_norm": 2.328896999359131, + "grad_norm_var": 14.430875418614992, + "learning_rate": 0.0001, + "loss": 1.4589, + "loss/crossentropy": 2.1644153594970703, + "loss/hidden": 1.2578125, + "loss/logits": 0.20078104734420776, + "loss/reg": 3.37848650815431e-05, + "step": 442 + }, + { + "epoch": 0.055375, + "grad_norm": 2.473328113555908, + "grad_norm_var": 14.322173701255808, + "learning_rate": 0.0001, + "loss": 1.1936, + "loss/crossentropy": 2.5964508056640625, + "loss/hidden": 1.046875, + "loss/logits": 0.1464037448167801, + "loss/reg": 3.3778171200538054e-05, + "step": 443 + }, + { + "epoch": 0.0555, + "grad_norm": 4.068549633026123, + "grad_norm_var": 14.208086262392497, + "learning_rate": 0.0001, + "loss": 1.3952, + "loss/crossentropy": 2.8060688972473145, + "loss/hidden": 1.1875, + "loss/logits": 0.20741160213947296, + "loss/reg": 3.37726560246665e-05, + "step": 444 + }, + { + "epoch": 0.055625, + "grad_norm": 2.1623542308807373, + "grad_norm_var": 14.191838680780066, + "learning_rate": 0.0001, + "loss": 1.0824, + "loss/crossentropy": 2.724346160888672, + "loss/hidden": 0.94921875, + "loss/logits": 0.13284045457839966, + "loss/reg": 3.3767199056455866e-05, + "step": 445 + }, + { + "epoch": 0.05575, + "grad_norm": 2.2896392345428467, + "grad_norm_var": 14.12415401393583, + "learning_rate": 0.0001, + "loss": 1.369, + "loss/crossentropy": 2.3175323009490967, + "loss/hidden": 1.1796875, + "loss/logits": 0.18897001445293427, + "loss/reg": 3.375912274350412e-05, + "step": 446 + }, + { + "epoch": 0.055875, + "grad_norm": 2.046797037124634, + "grad_norm_var": 14.213834657666071, + "learning_rate": 0.0001, + "loss": 1.1878, + "loss/crossentropy": 2.6415698528289795, + "loss/hidden": 1.0390625, + "loss/logits": 0.14838215708732605, + "loss/reg": 3.3750762668205425e-05, + "step": 447 + }, + { + "epoch": 0.056, + "grad_norm": 3.2464237213134766, + "grad_norm_var": 13.874242204082186, + "learning_rate": 0.0001, + "loss": 1.418, + "loss/crossentropy": 2.4582679271698, + "loss/hidden": 1.1875, + "loss/logits": 0.2301977574825287, + "loss/reg": 3.3743133826646954e-05, + "step": 448 + }, + { + "epoch": 0.056125, + "grad_norm": 2.189342737197876, + "grad_norm_var": 0.29544563519738554, + "learning_rate": 0.0001, + "loss": 1.2354, + "loss/crossentropy": 2.224080801010132, + "loss/hidden": 1.109375, + "loss/logits": 0.12568463385105133, + "loss/reg": 3.3736727345967665e-05, + "step": 449 + }, + { + "epoch": 0.05625, + "grad_norm": 1.8410553932189941, + "grad_norm_var": 0.3177951887186661, + "learning_rate": 0.0001, + "loss": 1.1724, + "loss/crossentropy": 2.3079254627227783, + "loss/hidden": 1.015625, + "loss/logits": 0.15645131468772888, + "loss/reg": 3.372762876097113e-05, + "step": 450 + }, + { + "epoch": 0.056375, + "grad_norm": 2.075913667678833, + "grad_norm_var": 0.28967181210960763, + "learning_rate": 0.0001, + "loss": 1.3254, + "loss/crossentropy": 2.3725907802581787, + "loss/hidden": 1.140625, + "loss/logits": 0.18442153930664062, + "loss/reg": 3.3720290957717225e-05, + "step": 451 + }, + { + "epoch": 0.0565, + "grad_norm": 2.2170984745025635, + "grad_norm_var": 0.29257204608246923, + "learning_rate": 0.0001, + "loss": 1.3107, + "loss/crossentropy": 2.410374164581299, + "loss/hidden": 1.1328125, + "loss/logits": 0.17750215530395508, + "loss/reg": 3.371315688127652e-05, + "step": 452 + }, + { + "epoch": 0.056625, + "grad_norm": 2.1257264614105225, + "grad_norm_var": 0.2976260957554473, + "learning_rate": 0.0001, + "loss": 1.1145, + "loss/crossentropy": 2.1573164463043213, + "loss/hidden": 0.9609375, + "loss/logits": 0.1532134860754013, + "loss/reg": 3.3702854125294834e-05, + "step": 453 + }, + { + "epoch": 0.05675, + "grad_norm": 2.020737409591675, + "grad_norm_var": 0.3004070010410295, + "learning_rate": 0.0001, + "loss": 1.2252, + "loss/crossentropy": 2.5048506259918213, + "loss/hidden": 1.0703125, + "loss/logits": 0.15453127026557922, + "loss/reg": 3.3693660952849314e-05, + "step": 454 + }, + { + "epoch": 0.056875, + "grad_norm": 2.0882959365844727, + "grad_norm_var": 0.30331944550090667, + "learning_rate": 0.0001, + "loss": 1.148, + "loss/crossentropy": 2.6023924350738525, + "loss/hidden": 1.0, + "loss/logits": 0.1476426124572754, + "loss/reg": 3.368509715073742e-05, + "step": 455 + }, + { + "epoch": 0.057, + "grad_norm": 1.9702045917510986, + "grad_norm_var": 0.31179501443108676, + "learning_rate": 0.0001, + "loss": 1.0904, + "loss/crossentropy": 2.414108991622925, + "loss/hidden": 0.953125, + "loss/logits": 0.13695700466632843, + "loss/reg": 3.367620593053289e-05, + "step": 456 + }, + { + "epoch": 0.057125, + "grad_norm": 2.7802071571350098, + "grad_norm_var": 0.32206609917582013, + "learning_rate": 0.0001, + "loss": 1.1101, + "loss/crossentropy": 2.802133560180664, + "loss/hidden": 0.96484375, + "loss/logits": 0.14492589235305786, + "loss/reg": 3.3667642128420994e-05, + "step": 457 + }, + { + "epoch": 0.05725, + "grad_norm": 2.838331699371338, + "grad_norm_var": 0.3354750209379326, + "learning_rate": 0.0001, + "loss": 1.3563, + "loss/crossentropy": 2.409419536590576, + "loss/hidden": 1.1875, + "loss/logits": 0.16849547624588013, + "loss/reg": 3.365922748344019e-05, + "step": 458 + }, + { + "epoch": 0.057375, + "grad_norm": 2.1942574977874756, + "grad_norm_var": 0.33769313303004084, + "learning_rate": 0.0001, + "loss": 1.1997, + "loss/crossentropy": 2.5466361045837402, + "loss/hidden": 1.0390625, + "loss/logits": 0.16028670966625214, + "loss/reg": 3.3653053833404556e-05, + "step": 459 + }, + { + "epoch": 0.0575, + "grad_norm": 1.9543126821517944, + "grad_norm_var": 0.14238904796044755, + "learning_rate": 0.0001, + "loss": 1.082, + "loss/crossentropy": 2.3072826862335205, + "loss/hidden": 0.9453125, + "loss/logits": 0.13639651238918304, + "loss/reg": 3.36485099978745e-05, + "step": 460 + }, + { + "epoch": 0.057625, + "grad_norm": 2.3599038124084473, + "grad_norm_var": 0.14245257928573613, + "learning_rate": 0.0001, + "loss": 1.3462, + "loss/crossentropy": 2.5248610973358154, + "loss/hidden": 1.15625, + "loss/logits": 0.18964824080467224, + "loss/reg": 3.3640484616626054e-05, + "step": 461 + }, + { + "epoch": 0.05775, + "grad_norm": 2.160914421081543, + "grad_norm_var": 0.14306343844920466, + "learning_rate": 0.0001, + "loss": 1.1187, + "loss/crossentropy": 2.485192060470581, + "loss/hidden": 0.95703125, + "loss/logits": 0.16129833459854126, + "loss/reg": 3.363731593708508e-05, + "step": 462 + }, + { + "epoch": 0.057875, + "grad_norm": 1.96443510055542, + "grad_norm_var": 0.14579406927241975, + "learning_rate": 0.0001, + "loss": 1.172, + "loss/crossentropy": 2.3434600830078125, + "loss/hidden": 1.03125, + "loss/logits": 0.14036868512630463, + "loss/reg": 3.362847928656265e-05, + "step": 463 + }, + { + "epoch": 0.058, + "grad_norm": 2.240238666534424, + "grad_norm_var": 0.07561911079075631, + "learning_rate": 0.0001, + "loss": 1.2581, + "loss/crossentropy": 2.66363263130188, + "loss/hidden": 1.1015625, + "loss/logits": 0.1561916470527649, + "loss/reg": 3.361971539561637e-05, + "step": 464 + }, + { + "epoch": 0.058125, + "grad_norm": 2.9686269760131836, + "grad_norm_var": 0.11362960790722566, + "learning_rate": 0.0001, + "loss": 1.3006, + "loss/crossentropy": 2.733015775680542, + "loss/hidden": 1.078125, + "loss/logits": 0.22209219634532928, + "loss/reg": 3.361287963343784e-05, + "step": 465 + }, + { + "epoch": 0.05825, + "grad_norm": 2.5101568698883057, + "grad_norm_var": 0.10624098469997983, + "learning_rate": 0.0001, + "loss": 1.1411, + "loss/crossentropy": 2.588346242904663, + "loss/hidden": 0.99609375, + "loss/logits": 0.1446218490600586, + "loss/reg": 3.360513073857874e-05, + "step": 466 + }, + { + "epoch": 0.058375, + "grad_norm": 2.770623207092285, + "grad_norm_var": 0.11756231178518603, + "learning_rate": 0.0001, + "loss": 1.3929, + "loss/crossentropy": 1.7947008609771729, + "loss/hidden": 1.2109375, + "loss/logits": 0.18167641758918762, + "loss/reg": 3.359945912961848e-05, + "step": 467 + }, + { + "epoch": 0.0585, + "grad_norm": 1.9178507328033447, + "grad_norm_var": 0.1273747784869385, + "learning_rate": 0.0001, + "loss": 1.0863, + "loss/crossentropy": 2.3694558143615723, + "loss/hidden": 0.94140625, + "loss/logits": 0.14460425078868866, + "loss/reg": 3.359114271006547e-05, + "step": 468 + }, + { + "epoch": 0.058625, + "grad_norm": 2.9377102851867676, + "grad_norm_var": 0.14927586898533457, + "learning_rate": 0.0001, + "loss": 1.3075, + "loss/crossentropy": 2.737233877182007, + "loss/hidden": 1.1328125, + "loss/logits": 0.17436236143112183, + "loss/reg": 3.35832592099905e-05, + "step": 469 + }, + { + "epoch": 0.05875, + "grad_norm": 2.4766845703125, + "grad_norm_var": 0.14196017860283514, + "learning_rate": 0.0001, + "loss": 1.2198, + "loss/crossentropy": 2.725649118423462, + "loss/hidden": 1.046875, + "loss/logits": 0.17263534665107727, + "loss/reg": 3.3575062843738124e-05, + "step": 470 + }, + { + "epoch": 0.058875, + "grad_norm": 2.97731876373291, + "grad_norm_var": 0.15638940419960357, + "learning_rate": 0.0001, + "loss": 1.1337, + "loss/crossentropy": 2.4248714447021484, + "loss/hidden": 0.98828125, + "loss/logits": 0.1451077163219452, + "loss/reg": 3.3564418117748573e-05, + "step": 471 + }, + { + "epoch": 0.059, + "grad_norm": 2.4938745498657227, + "grad_norm_var": 0.14080595119560016, + "learning_rate": 0.0001, + "loss": 1.2354, + "loss/crossentropy": 2.5639638900756836, + "loss/hidden": 1.0625, + "loss/logits": 0.17252308130264282, + "loss/reg": 3.355655644554645e-05, + "step": 472 + }, + { + "epoch": 0.059125, + "grad_norm": 2.453796148300171, + "grad_norm_var": 0.13403350770174421, + "learning_rate": 0.0001, + "loss": 1.3133, + "loss/crossentropy": 2.535618543624878, + "loss/hidden": 1.140625, + "loss/logits": 0.1723370999097824, + "loss/reg": 3.354718137416057e-05, + "step": 473 + }, + { + "epoch": 0.05925, + "grad_norm": 2.3145835399627686, + "grad_norm_var": 0.12414269824475065, + "learning_rate": 0.0001, + "loss": 1.1404, + "loss/crossentropy": 2.5441014766693115, + "loss/hidden": 0.9921875, + "loss/logits": 0.14785614609718323, + "loss/reg": 3.353881766088307e-05, + "step": 474 + }, + { + "epoch": 0.059375, + "grad_norm": 2.3766791820526123, + "grad_norm_var": 0.12076940932042811, + "learning_rate": 0.0001, + "loss": 1.546, + "loss/crossentropy": 2.321577310562134, + "loss/hidden": 1.3203125, + "loss/logits": 0.22531206905841827, + "loss/reg": 3.352982457727194e-05, + "step": 475 + }, + { + "epoch": 0.0595, + "grad_norm": 2.9967336654663086, + "grad_norm_var": 0.1225888750658117, + "learning_rate": 0.0001, + "loss": 1.5924, + "loss/crossentropy": 2.00260066986084, + "loss/hidden": 1.375, + "loss/logits": 0.21702903509140015, + "loss/reg": 3.3521097066113725e-05, + "step": 476 + }, + { + "epoch": 0.059625, + "grad_norm": 2.6383934020996094, + "grad_norm_var": 0.12241946620474262, + "learning_rate": 0.0001, + "loss": 1.4419, + "loss/crossentropy": 2.5109200477600098, + "loss/hidden": 1.2265625, + "loss/logits": 0.21502982079982758, + "loss/reg": 3.3512478694319725e-05, + "step": 477 + }, + { + "epoch": 0.05975, + "grad_norm": 2.120065450668335, + "grad_norm_var": 0.12443820755625362, + "learning_rate": 0.0001, + "loss": 1.2943, + "loss/crossentropy": 2.4915192127227783, + "loss/hidden": 1.1171875, + "loss/logits": 0.17678330838680267, + "loss/reg": 3.3503984013805166e-05, + "step": 478 + }, + { + "epoch": 0.059875, + "grad_norm": 2.858367681503296, + "grad_norm_var": 0.10937309591752348, + "learning_rate": 0.0001, + "loss": 1.4656, + "loss/crossentropy": 2.3548312187194824, + "loss/hidden": 1.265625, + "loss/logits": 0.19961079955101013, + "loss/reg": 3.349495091242716e-05, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 2.285705089569092, + "grad_norm_var": 0.1075290964460765, + "learning_rate": 0.0001, + "loss": 1.2262, + "loss/crossentropy": 2.489813804626465, + "loss/hidden": 1.0390625, + "loss/logits": 0.18675509095191956, + "loss/reg": 3.3486459869891405e-05, + "step": 480 + }, + { + "epoch": 0.060125, + "grad_norm": 2.094900131225586, + "grad_norm_var": 0.10863647120417165, + "learning_rate": 0.0001, + "loss": 1.3702, + "loss/crossentropy": 2.3183302879333496, + "loss/hidden": 1.171875, + "loss/logits": 0.1980261653661728, + "loss/reg": 3.3478718250989914e-05, + "step": 481 + }, + { + "epoch": 0.06025, + "grad_norm": 2.406928777694702, + "grad_norm_var": 0.10935489058969262, + "learning_rate": 0.0001, + "loss": 1.3264, + "loss/crossentropy": 2.5121381282806396, + "loss/hidden": 1.15625, + "loss/logits": 0.1697903275489807, + "loss/reg": 3.3472137147327885e-05, + "step": 482 + }, + { + "epoch": 0.060375, + "grad_norm": 1.7551078796386719, + "grad_norm_var": 0.1381837528506061, + "learning_rate": 0.0001, + "loss": 1.0999, + "loss/crossentropy": 2.433582067489624, + "loss/hidden": 0.9609375, + "loss/logits": 0.13861994445323944, + "loss/reg": 3.34642463712953e-05, + "step": 483 + }, + { + "epoch": 0.0605, + "grad_norm": 2.15712308883667, + "grad_norm_var": 0.12497483119508387, + "learning_rate": 0.0001, + "loss": 1.264, + "loss/crossentropy": 2.77188777923584, + "loss/hidden": 1.1015625, + "loss/logits": 0.1621045470237732, + "loss/reg": 3.345516961417161e-05, + "step": 484 + }, + { + "epoch": 0.060625, + "grad_norm": 2.0167171955108643, + "grad_norm_var": 0.11920370288203964, + "learning_rate": 0.0001, + "loss": 1.3431, + "loss/crossentropy": 2.546163558959961, + "loss/hidden": 1.1484375, + "loss/logits": 0.19433115422725677, + "loss/reg": 3.344708966324106e-05, + "step": 485 + }, + { + "epoch": 0.06075, + "grad_norm": 2.6408584117889404, + "grad_norm_var": 0.12253544383796963, + "learning_rate": 0.0001, + "loss": 1.379, + "loss/crossentropy": 2.7440547943115234, + "loss/hidden": 1.1953125, + "loss/logits": 0.1834029257297516, + "loss/reg": 3.343883508932777e-05, + "step": 486 + }, + { + "epoch": 0.060875, + "grad_norm": 2.1073925495147705, + "grad_norm_var": 0.10422711697162654, + "learning_rate": 0.0001, + "loss": 1.095, + "loss/crossentropy": 2.180604934692383, + "loss/hidden": 0.953125, + "loss/logits": 0.14150169491767883, + "loss/reg": 3.343077696627006e-05, + "step": 487 + }, + { + "epoch": 0.061, + "grad_norm": 2.980268716812134, + "grad_norm_var": 0.1278688011981179, + "learning_rate": 0.0001, + "loss": 1.3713, + "loss/crossentropy": 2.511422634124756, + "loss/hidden": 1.1796875, + "loss/logits": 0.19132453203201294, + "loss/reg": 3.3423166314605623e-05, + "step": 488 + }, + { + "epoch": 0.061125, + "grad_norm": 2.237560272216797, + "grad_norm_var": 0.1288862839917743, + "learning_rate": 0.0001, + "loss": 1.0922, + "loss/crossentropy": 2.361391305923462, + "loss/hidden": 0.94921875, + "loss/logits": 0.14263351261615753, + "loss/reg": 3.341743286000565e-05, + "step": 489 + }, + { + "epoch": 0.06125, + "grad_norm": 2.2243480682373047, + "grad_norm_var": 0.1301125949056683, + "learning_rate": 0.0001, + "loss": 1.2353, + "loss/crossentropy": 2.6845271587371826, + "loss/hidden": 1.0703125, + "loss/logits": 0.16469591856002808, + "loss/reg": 3.340901093906723e-05, + "step": 490 + }, + { + "epoch": 0.061375, + "grad_norm": 1.9749258756637573, + "grad_norm_var": 0.13976616590314225, + "learning_rate": 0.0001, + "loss": 1.0788, + "loss/crossentropy": 2.8269782066345215, + "loss/hidden": 0.9453125, + "loss/logits": 0.13316524028778076, + "loss/reg": 3.340181865496561e-05, + "step": 491 + }, + { + "epoch": 0.0615, + "grad_norm": 3.8082363605499268, + "grad_norm_var": 0.2516089050798465, + "learning_rate": 0.0001, + "loss": 1.1953, + "loss/crossentropy": 3.110802412033081, + "loss/hidden": 1.03125, + "loss/logits": 0.16368849575519562, + "loss/reg": 3.33938623953145e-05, + "step": 492 + }, + { + "epoch": 0.061625, + "grad_norm": 7.89940881729126, + "grad_norm_var": 2.152808837213317, + "learning_rate": 0.0001, + "loss": 1.5515, + "loss/crossentropy": 2.6210036277770996, + "loss/hidden": 1.40625, + "loss/logits": 0.1449393779039383, + "loss/reg": 3.338697206345387e-05, + "step": 493 + }, + { + "epoch": 0.06175, + "grad_norm": 2.5107972621917725, + "grad_norm_var": 2.1309396475018327, + "learning_rate": 0.0001, + "loss": 1.2771, + "loss/crossentropy": 2.296051263809204, + "loss/hidden": 1.09375, + "loss/logits": 0.18302392959594727, + "loss/reg": 3.337907401146367e-05, + "step": 494 + }, + { + "epoch": 0.061875, + "grad_norm": 2.3817129135131836, + "grad_norm_var": 2.138088174245088, + "learning_rate": 0.0001, + "loss": 1.2106, + "loss/crossentropy": 2.6225788593292236, + "loss/hidden": 1.0546875, + "loss/logits": 0.1555565595626831, + "loss/reg": 3.3369677112204954e-05, + "step": 495 + }, + { + "epoch": 0.062, + "grad_norm": 2.297321081161499, + "grad_norm_var": 2.137427651207279, + "learning_rate": 0.0001, + "loss": 1.3821, + "loss/crossentropy": 2.326659679412842, + "loss/hidden": 1.1796875, + "loss/logits": 0.20210911333560944, + "loss/reg": 3.3361084206262603e-05, + "step": 496 + }, + { + "epoch": 0.062125, + "grad_norm": 2.1879894733428955, + "grad_norm_var": 2.1302310419826815, + "learning_rate": 0.0001, + "loss": 1.3433, + "loss/crossentropy": 2.6444246768951416, + "loss/hidden": 1.15625, + "loss/logits": 0.18667887151241302, + "loss/reg": 3.3350897865602747e-05, + "step": 497 + }, + { + "epoch": 0.06225, + "grad_norm": 2.7556395530700684, + "grad_norm_var": 2.1230810021853803, + "learning_rate": 0.0001, + "loss": 1.1761, + "loss/crossentropy": 2.8364853858947754, + "loss/hidden": 1.015625, + "loss/logits": 0.16018438339233398, + "loss/reg": 3.334263601573184e-05, + "step": 498 + }, + { + "epoch": 0.062375, + "grad_norm": 2.0885095596313477, + "grad_norm_var": 2.085981261133649, + "learning_rate": 0.0001, + "loss": 1.4855, + "loss/crossentropy": 2.0875301361083984, + "loss/hidden": 1.2890625, + "loss/logits": 0.19610214233398438, + "loss/reg": 3.3336276828777045e-05, + "step": 499 + }, + { + "epoch": 0.0625, + "grad_norm": 2.195967674255371, + "grad_norm_var": 2.0829178782721693, + "learning_rate": 0.0001, + "loss": 1.357, + "loss/crossentropy": 2.2123799324035645, + "loss/hidden": 1.1953125, + "loss/logits": 0.16132420301437378, + "loss/reg": 3.3326996344840154e-05, + "step": 500 + }, + { + "epoch": 0.062625, + "grad_norm": 1.9233721494674683, + "grad_norm_var": 2.0928282179657134, + "learning_rate": 0.0001, + "loss": 1.2098, + "loss/crossentropy": 2.4555163383483887, + "loss/hidden": 1.0546875, + "loss/logits": 0.15475736558437347, + "loss/reg": 3.331886910018511e-05, + "step": 501 + }, + { + "epoch": 0.06275, + "grad_norm": 1.7558976411819458, + "grad_norm_var": 2.1562340342327277, + "learning_rate": 0.0001, + "loss": 1.1455, + "loss/crossentropy": 2.4747304916381836, + "loss/hidden": 0.984375, + "loss/logits": 0.16081759333610535, + "loss/reg": 3.331211337354034e-05, + "step": 502 + }, + { + "epoch": 0.062875, + "grad_norm": 2.020193099975586, + "grad_norm_var": 2.163693266292887, + "learning_rate": 0.0001, + "loss": 1.0334, + "loss/crossentropy": 2.5233755111694336, + "loss/hidden": 0.890625, + "loss/logits": 0.14245402812957764, + "loss/reg": 3.330542676849291e-05, + "step": 503 + }, + { + "epoch": 0.063, + "grad_norm": 1.9017070531845093, + "grad_norm_var": 2.196473105856813, + "learning_rate": 0.0001, + "loss": 1.187, + "loss/crossentropy": 2.513927459716797, + "loss/hidden": 1.03125, + "loss/logits": 0.15539123117923737, + "loss/reg": 3.329779065097682e-05, + "step": 504 + }, + { + "epoch": 0.063125, + "grad_norm": 1.7002081871032715, + "grad_norm_var": 2.243011213708456, + "learning_rate": 0.0001, + "loss": 1.0332, + "loss/crossentropy": 2.3853862285614014, + "loss/hidden": 0.90234375, + "loss/logits": 0.13057225942611694, + "loss/reg": 3.329246101202443e-05, + "step": 505 + }, + { + "epoch": 0.06325, + "grad_norm": 3.703674793243408, + "grad_norm_var": 2.305368345603883, + "learning_rate": 0.0001, + "loss": 1.6924, + "loss/crossentropy": 2.689699172973633, + "loss/hidden": 1.3359375, + "loss/logits": 0.3561299741268158, + "loss/reg": 3.328541060909629e-05, + "step": 506 + }, + { + "epoch": 0.063375, + "grad_norm": 2.304819345474243, + "grad_norm_var": 2.28053686149454, + "learning_rate": 0.0001, + "loss": 1.2551, + "loss/crossentropy": 2.5181477069854736, + "loss/hidden": 1.078125, + "loss/logits": 0.17664968967437744, + "loss/reg": 3.3279109629802406e-05, + "step": 507 + }, + { + "epoch": 0.0635, + "grad_norm": 1.981182336807251, + "grad_norm_var": 2.222780309447023, + "learning_rate": 0.0001, + "loss": 1.2923, + "loss/crossentropy": 2.379751443862915, + "loss/hidden": 1.109375, + "loss/logits": 0.18254666030406952, + "loss/reg": 3.327402373543009e-05, + "step": 508 + }, + { + "epoch": 0.063625, + "grad_norm": 1.9515694379806519, + "grad_norm_var": 0.23157529156244816, + "learning_rate": 0.0001, + "loss": 1.2068, + "loss/crossentropy": 2.4413554668426514, + "loss/hidden": 1.046875, + "loss/logits": 0.15955983102321625, + "loss/reg": 3.327105878270231e-05, + "step": 509 + }, + { + "epoch": 0.06375, + "grad_norm": 2.283177614212036, + "grad_norm_var": 0.2262545926208522, + "learning_rate": 0.0001, + "loss": 1.121, + "loss/crossentropy": 2.586780309677124, + "loss/hidden": 0.9453125, + "loss/logits": 0.17534837126731873, + "loss/reg": 3.3264575904468074e-05, + "step": 510 + }, + { + "epoch": 0.063875, + "grad_norm": 2.5725600719451904, + "grad_norm_var": 0.23278445739527623, + "learning_rate": 0.0001, + "loss": 1.372, + "loss/crossentropy": 2.8506243228912354, + "loss/hidden": 1.15625, + "loss/logits": 0.21537676453590393, + "loss/reg": 3.325942452647723e-05, + "step": 511 + }, + { + "epoch": 0.064, + "grad_norm": 2.17802357673645, + "grad_norm_var": 0.23254723734647367, + "learning_rate": 0.0001, + "loss": 1.4721, + "loss/crossentropy": 2.531144618988037, + "loss/hidden": 1.265625, + "loss/logits": 0.20617413520812988, + "loss/reg": 3.325657962705009e-05, + "step": 512 + }, + { + "epoch": 0.064125, + "grad_norm": 2.052086353302002, + "grad_norm_var": 0.23426407133045282, + "learning_rate": 0.0001, + "loss": 1.1786, + "loss/crossentropy": 2.5373010635375977, + "loss/hidden": 1.0, + "loss/logits": 0.17822688817977905, + "loss/reg": 3.3255280868615955e-05, + "step": 513 + }, + { + "epoch": 0.06425, + "grad_norm": 2.104679584503174, + "grad_norm_var": 0.21343636499370103, + "learning_rate": 0.0001, + "loss": 1.3482, + "loss/crossentropy": 2.2992570400238037, + "loss/hidden": 1.15625, + "loss/logits": 0.19161126017570496, + "loss/reg": 3.3254742447752506e-05, + "step": 514 + }, + { + "epoch": 0.064375, + "grad_norm": 1.9803693294525146, + "grad_norm_var": 0.2153401081871055, + "learning_rate": 0.0001, + "loss": 1.1247, + "loss/crossentropy": 2.305745840072632, + "loss/hidden": 0.98828125, + "loss/logits": 0.13605481386184692, + "loss/reg": 3.325332727399655e-05, + "step": 515 + }, + { + "epoch": 0.0645, + "grad_norm": 2.516315221786499, + "grad_norm_var": 0.22315819314323923, + "learning_rate": 0.0001, + "loss": 1.286, + "loss/crossentropy": 2.6059908866882324, + "loss/hidden": 1.1328125, + "loss/logits": 0.15280824899673462, + "loss/reg": 3.324857243569568e-05, + "step": 516 + }, + { + "epoch": 0.064625, + "grad_norm": 1.9717873334884644, + "grad_norm_var": 0.22162796366275472, + "learning_rate": 0.0001, + "loss": 1.1563, + "loss/crossentropy": 2.566389322280884, + "loss/hidden": 1.0, + "loss/logits": 0.15599536895751953, + "loss/reg": 3.324487624922767e-05, + "step": 517 + }, + { + "epoch": 0.06475, + "grad_norm": 2.5053038597106934, + "grad_norm_var": 0.21373832688979721, + "learning_rate": 0.0001, + "loss": 1.0753, + "loss/crossentropy": 2.5956661701202393, + "loss/hidden": 0.93359375, + "loss/logits": 0.14140459895133972, + "loss/reg": 3.324192584841512e-05, + "step": 518 + }, + { + "epoch": 0.064875, + "grad_norm": 1.617448329925537, + "grad_norm_var": 0.2353024678766184, + "learning_rate": 0.0001, + "loss": 1.2648, + "loss/crossentropy": 2.1230597496032715, + "loss/hidden": 1.09375, + "loss/logits": 0.1706867814064026, + "loss/reg": 3.323415876366198e-05, + "step": 519 + }, + { + "epoch": 0.065, + "grad_norm": 2.6171135902404785, + "grad_norm_var": 0.23809225200622847, + "learning_rate": 0.0001, + "loss": 1.2681, + "loss/crossentropy": 3.001396894454956, + "loss/hidden": 1.0859375, + "loss/logits": 0.1818409115076065, + "loss/reg": 3.32270392391365e-05, + "step": 520 + }, + { + "epoch": 0.065125, + "grad_norm": 2.2164251804351807, + "grad_norm_var": 0.21673222300943445, + "learning_rate": 0.0001, + "loss": 1.3872, + "loss/crossentropy": 2.394097089767456, + "loss/hidden": 1.171875, + "loss/logits": 0.2150021493434906, + "loss/reg": 3.322528209537268e-05, + "step": 521 + }, + { + "epoch": 0.06525, + "grad_norm": 1.6968417167663574, + "grad_norm_var": 0.08877967906966712, + "learning_rate": 0.0001, + "loss": 1.1035, + "loss/crossentropy": 2.572463274002075, + "loss/hidden": 0.96484375, + "loss/logits": 0.13827435672283173, + "loss/reg": 3.322424890939146e-05, + "step": 522 + }, + { + "epoch": 0.065375, + "grad_norm": 2.6645708084106445, + "grad_norm_var": 0.10384589830682482, + "learning_rate": 0.0001, + "loss": 1.4294, + "loss/crossentropy": 2.662943124771118, + "loss/hidden": 1.234375, + "loss/logits": 0.19470733404159546, + "loss/reg": 3.3224798244191334e-05, + "step": 523 + }, + { + "epoch": 0.0655, + "grad_norm": 3.8316986560821533, + "grad_norm_var": 0.26836197186599786, + "learning_rate": 0.0001, + "loss": 1.3972, + "loss/crossentropy": 2.487124443054199, + "loss/hidden": 1.203125, + "loss/logits": 0.1937580108642578, + "loss/reg": 3.32186245941557e-05, + "step": 524 + }, + { + "epoch": 0.065625, + "grad_norm": 2.3071606159210205, + "grad_norm_var": 0.2598635625197322, + "learning_rate": 0.0001, + "loss": 1.2605, + "loss/crossentropy": 2.560915470123291, + "loss/hidden": 1.109375, + "loss/logits": 0.1507827639579773, + "loss/reg": 3.322085103718564e-05, + "step": 525 + }, + { + "epoch": 0.06575, + "grad_norm": 2.080793857574463, + "grad_norm_var": 0.2634096601901807, + "learning_rate": 0.0001, + "loss": 1.1865, + "loss/crossentropy": 2.3294875621795654, + "loss/hidden": 1.0234375, + "loss/logits": 0.16270865499973297, + "loss/reg": 3.322162956465036e-05, + "step": 526 + }, + { + "epoch": 0.065875, + "grad_norm": 2.533914566040039, + "grad_norm_var": 0.26213502133962924, + "learning_rate": 0.0001, + "loss": 1.3337, + "loss/crossentropy": 2.919621229171753, + "loss/hidden": 1.140625, + "loss/logits": 0.1927223801612854, + "loss/reg": 3.3224547223653644e-05, + "step": 527 + }, + { + "epoch": 0.066, + "grad_norm": 2.0774967670440674, + "grad_norm_var": 0.2644639815857895, + "learning_rate": 0.0001, + "loss": 1.1001, + "loss/crossentropy": 2.7280097007751465, + "loss/hidden": 0.96484375, + "loss/logits": 0.1349409520626068, + "loss/reg": 3.3227763196919113e-05, + "step": 528 + }, + { + "epoch": 0.066125, + "grad_norm": 2.5009524822235107, + "grad_norm_var": 0.26231642591397886, + "learning_rate": 0.0001, + "loss": 1.4265, + "loss/crossentropy": 2.3399899005889893, + "loss/hidden": 1.2109375, + "loss/logits": 0.21527621150016785, + "loss/reg": 3.322759221191518e-05, + "step": 529 + }, + { + "epoch": 0.06625, + "grad_norm": 2.312666177749634, + "grad_norm_var": 0.2588706095933777, + "learning_rate": 0.0001, + "loss": 1.2942, + "loss/crossentropy": 2.527575969696045, + "loss/hidden": 1.1171875, + "loss/logits": 0.17671984434127808, + "loss/reg": 3.322830525576137e-05, + "step": 530 + }, + { + "epoch": 0.066375, + "grad_norm": 2.528494358062744, + "grad_norm_var": 0.2514069212263559, + "learning_rate": 0.0001, + "loss": 1.2719, + "loss/crossentropy": 2.648963451385498, + "loss/hidden": 1.1015625, + "loss/logits": 0.17003270983695984, + "loss/reg": 3.322178599773906e-05, + "step": 531 + }, + { + "epoch": 0.0665, + "grad_norm": 3.481004476547241, + "grad_norm_var": 0.3279166626744005, + "learning_rate": 0.0001, + "loss": 1.8273, + "loss/crossentropy": 2.473355531692505, + "loss/hidden": 1.546875, + "loss/logits": 0.2800578474998474, + "loss/reg": 3.321353142382577e-05, + "step": 532 + }, + { + "epoch": 0.066625, + "grad_norm": 4.112295150756836, + "grad_norm_var": 0.48236737999875434, + "learning_rate": 0.0001, + "loss": 1.7412, + "loss/crossentropy": 2.67095685005188, + "loss/hidden": 1.421875, + "loss/logits": 0.31896963715553284, + "loss/reg": 3.3209085813723505e-05, + "step": 533 + }, + { + "epoch": 0.06675, + "grad_norm": 2.737959384918213, + "grad_norm_var": 0.48381294167741634, + "learning_rate": 0.0001, + "loss": 1.1166, + "loss/crossentropy": 2.6570234298706055, + "loss/hidden": 0.98046875, + "loss/logits": 0.13577935099601746, + "loss/reg": 3.3210537367267534e-05, + "step": 534 + }, + { + "epoch": 0.066875, + "grad_norm": 2.4249866008758545, + "grad_norm_var": 0.42068279072435266, + "learning_rate": 0.0001, + "loss": 1.1667, + "loss/crossentropy": 2.6176538467407227, + "loss/hidden": 1.015625, + "loss/logits": 0.15073555707931519, + "loss/reg": 3.3202206395799294e-05, + "step": 535 + }, + { + "epoch": 0.067, + "grad_norm": 2.17911696434021, + "grad_norm_var": 0.43358738180024237, + "learning_rate": 0.0001, + "loss": 1.4199, + "loss/crossentropy": 2.3476545810699463, + "loss/hidden": 1.234375, + "loss/logits": 0.18518668413162231, + "loss/reg": 3.3198077289853245e-05, + "step": 536 + }, + { + "epoch": 0.067125, + "grad_norm": 1.7613617181777954, + "grad_norm_var": 0.4701310667265805, + "learning_rate": 0.0001, + "loss": 1.3332, + "loss/crossentropy": 2.5358726978302, + "loss/hidden": 1.15625, + "loss/logits": 0.17660382390022278, + "loss/reg": 3.319012466818094e-05, + "step": 537 + }, + { + "epoch": 0.06725, + "grad_norm": 2.5631303787231445, + "grad_norm_var": 0.415376700832607, + "learning_rate": 0.0001, + "loss": 1.4518, + "loss/crossentropy": 2.551100015640259, + "loss/hidden": 1.21875, + "loss/logits": 0.23273751139640808, + "loss/reg": 3.3181466278620064e-05, + "step": 538 + }, + { + "epoch": 0.067375, + "grad_norm": 2.444988250732422, + "grad_norm_var": 0.4174102900534516, + "learning_rate": 0.0001, + "loss": 1.2783, + "loss/crossentropy": 2.5376105308532715, + "loss/hidden": 1.1015625, + "loss/logits": 0.17640447616577148, + "loss/reg": 3.317417576909065e-05, + "step": 539 + }, + { + "epoch": 0.0675, + "grad_norm": 1.9729820489883423, + "grad_norm_var": 0.3323928474246306, + "learning_rate": 0.0001, + "loss": 1.098, + "loss/crossentropy": 2.779029369354248, + "loss/hidden": 0.9609375, + "loss/logits": 0.13674432039260864, + "loss/reg": 3.316561924293637e-05, + "step": 540 + }, + { + "epoch": 0.067625, + "grad_norm": 2.1116485595703125, + "grad_norm_var": 0.3398403486674641, + "learning_rate": 0.0001, + "loss": 1.2947, + "loss/crossentropy": 2.4113619327545166, + "loss/hidden": 1.125, + "loss/logits": 0.16934365034103394, + "loss/reg": 3.315923095215112e-05, + "step": 541 + }, + { + "epoch": 0.06775, + "grad_norm": 2.0958714485168457, + "grad_norm_var": 0.3390339478288655, + "learning_rate": 0.0001, + "loss": 1.1846, + "loss/crossentropy": 2.4695186614990234, + "loss/hidden": 1.03125, + "loss/logits": 0.15305987000465393, + "loss/reg": 3.315501817269251e-05, + "step": 542 + }, + { + "epoch": 0.067875, + "grad_norm": 1.9753048419952393, + "grad_norm_var": 0.35526067215531704, + "learning_rate": 0.0001, + "loss": 1.0784, + "loss/crossentropy": 2.416532039642334, + "loss/hidden": 0.94921875, + "loss/logits": 0.12887820601463318, + "loss/reg": 3.314777131890878e-05, + "step": 543 + }, + { + "epoch": 0.068, + "grad_norm": 2.2497429847717285, + "grad_norm_var": 0.3484447964453047, + "learning_rate": 0.0001, + "loss": 1.2219, + "loss/crossentropy": 2.8293488025665283, + "loss/hidden": 1.0390625, + "loss/logits": 0.18253688514232635, + "loss/reg": 3.314365312689915e-05, + "step": 544 + }, + { + "epoch": 0.068125, + "grad_norm": 2.1856963634490967, + "grad_norm_var": 0.3531780702082564, + "learning_rate": 0.0001, + "loss": 1.342, + "loss/crossentropy": 2.4071121215820312, + "loss/hidden": 1.1484375, + "loss/logits": 0.19320359826087952, + "loss/reg": 3.313948764116503e-05, + "step": 545 + }, + { + "epoch": 0.06825, + "grad_norm": 1.9663119316101074, + "grad_norm_var": 0.3668366876101023, + "learning_rate": 0.0001, + "loss": 1.1952, + "loss/crossentropy": 2.734199047088623, + "loss/hidden": 1.03125, + "loss/logits": 0.16357938945293427, + "loss/reg": 3.3137544960482046e-05, + "step": 546 + }, + { + "epoch": 0.068375, + "grad_norm": 1.899553894996643, + "grad_norm_var": 0.38283294553956176, + "learning_rate": 0.0001, + "loss": 1.1598, + "loss/crossentropy": 2.943643808364868, + "loss/hidden": 0.9921875, + "loss/logits": 0.16727682948112488, + "loss/reg": 3.312825720058754e-05, + "step": 547 + }, + { + "epoch": 0.0685, + "grad_norm": 2.248959541320801, + "grad_norm_var": 0.29768036917005575, + "learning_rate": 0.0001, + "loss": 1.3018, + "loss/crossentropy": 2.3630926609039307, + "loss/hidden": 1.125, + "loss/logits": 0.17643454670906067, + "loss/reg": 3.311951149953529e-05, + "step": 548 + }, + { + "epoch": 0.068625, + "grad_norm": 1.9839047193527222, + "grad_norm_var": 0.06880950688917717, + "learning_rate": 0.0001, + "loss": 1.3062, + "loss/crossentropy": 2.461066484451294, + "loss/hidden": 1.1328125, + "loss/logits": 0.17305001616477966, + "loss/reg": 3.3115993574028835e-05, + "step": 549 + }, + { + "epoch": 0.06875, + "grad_norm": 2.138064384460449, + "grad_norm_var": 0.046280360048162335, + "learning_rate": 0.0001, + "loss": 1.456, + "loss/crossentropy": 2.2832694053649902, + "loss/hidden": 1.2578125, + "loss/logits": 0.19788944721221924, + "loss/reg": 3.3114942198153585e-05, + "step": 550 + }, + { + "epoch": 0.068875, + "grad_norm": 1.9238662719726562, + "grad_norm_var": 0.042773526186366935, + "learning_rate": 0.0001, + "loss": 1.2408, + "loss/crossentropy": 2.2895045280456543, + "loss/hidden": 1.0859375, + "loss/logits": 0.15455299615859985, + "loss/reg": 3.3106487535405904e-05, + "step": 551 + }, + { + "epoch": 0.069, + "grad_norm": 2.023206949234009, + "grad_norm_var": 0.042778668601256224, + "learning_rate": 0.0001, + "loss": 1.271, + "loss/crossentropy": 2.322998285293579, + "loss/hidden": 1.109375, + "loss/logits": 0.1612919569015503, + "loss/reg": 3.310632018838078e-05, + "step": 552 + }, + { + "epoch": 0.069125, + "grad_norm": 3.662583589553833, + "grad_norm_var": 0.18372824324306117, + "learning_rate": 0.0001, + "loss": 1.4513, + "loss/crossentropy": 2.347020149230957, + "loss/hidden": 1.234375, + "loss/logits": 0.21659526228904724, + "loss/reg": 3.3105799957411364e-05, + "step": 553 + }, + { + "epoch": 0.06925, + "grad_norm": 2.078821897506714, + "grad_norm_var": 0.17593105310000295, + "learning_rate": 0.0001, + "loss": 1.1689, + "loss/crossentropy": 2.5081067085266113, + "loss/hidden": 1.0234375, + "loss/logits": 0.14510974287986755, + "loss/reg": 3.3105472539318725e-05, + "step": 554 + }, + { + "epoch": 0.069375, + "grad_norm": 3.5003957748413086, + "grad_norm_var": 0.2821214155658367, + "learning_rate": 0.0001, + "loss": 1.5964, + "loss/crossentropy": 2.0227768421173096, + "loss/hidden": 1.359375, + "loss/logits": 0.23666173219680786, + "loss/reg": 3.309710882604122e-05, + "step": 555 + }, + { + "epoch": 0.0695, + "grad_norm": 2.685995101928711, + "grad_norm_var": 0.2874594797577854, + "learning_rate": 0.0001, + "loss": 1.1887, + "loss/crossentropy": 2.2398436069488525, + "loss/hidden": 1.0546875, + "loss/logits": 0.13363878428936005, + "loss/reg": 3.308998930151574e-05, + "step": 556 + }, + { + "epoch": 0.069625, + "grad_norm": 2.505387306213379, + "grad_norm_var": 0.2874906156265238, + "learning_rate": 0.0001, + "loss": 1.3653, + "loss/crossentropy": 2.1469314098358154, + "loss/hidden": 1.203125, + "loss/logits": 0.16182279586791992, + "loss/reg": 3.308698069304228e-05, + "step": 557 + }, + { + "epoch": 0.06975, + "grad_norm": 2.418957233428955, + "grad_norm_var": 0.28434973598758473, + "learning_rate": 0.0001, + "loss": 1.2753, + "loss/crossentropy": 2.3960018157958984, + "loss/hidden": 1.109375, + "loss/logits": 0.16554874181747437, + "loss/reg": 3.308444865979254e-05, + "step": 558 + }, + { + "epoch": 0.069875, + "grad_norm": 1.9992293119430542, + "grad_norm_var": 0.28322081166726315, + "learning_rate": 0.0001, + "loss": 1.095, + "loss/crossentropy": 2.6128995418548584, + "loss/hidden": 0.95703125, + "loss/logits": 0.13762758672237396, + "loss/reg": 3.307777296868153e-05, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 1.9864604473114014, + "grad_norm_var": 0.2907888869241692, + "learning_rate": 0.0001, + "loss": 1.2665, + "loss/crossentropy": 2.561873197555542, + "loss/hidden": 1.1015625, + "loss/logits": 0.16459921002388, + "loss/reg": 3.307408405817114e-05, + "step": 560 + }, + { + "epoch": 0.070125, + "grad_norm": 2.078775644302368, + "grad_norm_var": 0.29349590417462296, + "learning_rate": 0.0001, + "loss": 1.1511, + "loss/crossentropy": 2.37235951423645, + "loss/hidden": 1.0078125, + "loss/logits": 0.14295557141304016, + "loss/reg": 3.3065680327126756e-05, + "step": 561 + }, + { + "epoch": 0.07025, + "grad_norm": 2.110450506210327, + "grad_norm_var": 0.2880205075952574, + "learning_rate": 0.0001, + "loss": 1.3304, + "loss/crossentropy": 2.3763866424560547, + "loss/hidden": 1.1484375, + "loss/logits": 0.18167179822921753, + "loss/reg": 3.3057913242373616e-05, + "step": 562 + }, + { + "epoch": 0.070375, + "grad_norm": 1.9325919151306152, + "grad_norm_var": 0.28620232482811025, + "learning_rate": 0.0001, + "loss": 1.3137, + "loss/crossentropy": 2.2546942234039307, + "loss/hidden": 1.140625, + "loss/logits": 0.17276525497436523, + "loss/reg": 3.30470284097828e-05, + "step": 563 + }, + { + "epoch": 0.0705, + "grad_norm": 2.6625826358795166, + "grad_norm_var": 0.2924338162120667, + "learning_rate": 0.0001, + "loss": 1.2107, + "loss/crossentropy": 2.6117820739746094, + "loss/hidden": 1.03125, + "loss/logits": 0.17915129661560059, + "loss/reg": 3.3036336390068755e-05, + "step": 564 + }, + { + "epoch": 0.070625, + "grad_norm": 1.9594511985778809, + "grad_norm_var": 0.29368343179504985, + "learning_rate": 0.0001, + "loss": 1.1768, + "loss/crossentropy": 2.4371213912963867, + "loss/hidden": 1.03125, + "loss/logits": 0.14522477984428406, + "loss/reg": 3.3024989534169436e-05, + "step": 565 + }, + { + "epoch": 0.07075, + "grad_norm": 2.5697479248046875, + "grad_norm_var": 0.29289142392980183, + "learning_rate": 0.0001, + "loss": 1.3313, + "loss/crossentropy": 2.450010299682617, + "loss/hidden": 1.15625, + "loss/logits": 0.17474365234375, + "loss/reg": 3.3016185625456274e-05, + "step": 566 + }, + { + "epoch": 0.070875, + "grad_norm": 2.129605531692505, + "grad_norm_var": 0.28299262421701216, + "learning_rate": 0.0001, + "loss": 1.1703, + "loss/crossentropy": 2.386748790740967, + "loss/hidden": 1.0078125, + "loss/logits": 0.16217753291130066, + "loss/reg": 3.3007927413564175e-05, + "step": 567 + }, + { + "epoch": 0.071, + "grad_norm": 2.708440065383911, + "grad_norm_var": 0.27846047291652093, + "learning_rate": 0.0001, + "loss": 1.1157, + "loss/crossentropy": 2.3872506618499756, + "loss/hidden": 0.9609375, + "loss/logits": 0.15438461303710938, + "loss/reg": 3.299961463198997e-05, + "step": 568 + }, + { + "epoch": 0.071125, + "grad_norm": 3.5758209228515625, + "grad_norm_var": 0.2647511463576776, + "learning_rate": 0.0001, + "loss": 1.639, + "loss/crossentropy": 2.284062147140503, + "loss/hidden": 1.3984375, + "loss/logits": 0.24023738503456116, + "loss/reg": 3.2989250030368567e-05, + "step": 569 + }, + { + "epoch": 0.07125, + "grad_norm": 3.0785133838653564, + "grad_norm_var": 0.2802140667784868, + "learning_rate": 0.0001, + "loss": 1.6418, + "loss/crossentropy": 1.961942434310913, + "loss/hidden": 1.4609375, + "loss/logits": 0.18052928149700165, + "loss/reg": 3.297963485238142e-05, + "step": 570 + }, + { + "epoch": 0.071375, + "grad_norm": 2.4593677520751953, + "grad_norm_var": 0.20824244414912407, + "learning_rate": 0.0001, + "loss": 1.3159, + "loss/crossentropy": 2.573103666305542, + "loss/hidden": 1.1328125, + "loss/logits": 0.18279321491718292, + "loss/reg": 3.297093280707486e-05, + "step": 571 + }, + { + "epoch": 0.0715, + "grad_norm": 1.9554235935211182, + "grad_norm_var": 0.21655112167287166, + "learning_rate": 0.0001, + "loss": 1.2922, + "loss/crossentropy": 2.270695686340332, + "loss/hidden": 1.109375, + "loss/logits": 0.18250404298305511, + "loss/reg": 3.296155409771018e-05, + "step": 572 + }, + { + "epoch": 0.071625, + "grad_norm": 2.0329935550689697, + "grad_norm_var": 0.2228007398634886, + "learning_rate": 0.0001, + "loss": 1.2539, + "loss/crossentropy": 2.752925395965576, + "loss/hidden": 1.078125, + "loss/logits": 0.17549508810043335, + "loss/reg": 3.295526767033152e-05, + "step": 573 + }, + { + "epoch": 0.07175, + "grad_norm": 2.126927614212036, + "grad_norm_var": 0.2255879631015389, + "learning_rate": 0.0001, + "loss": 1.3487, + "loss/crossentropy": 2.0483946800231934, + "loss/hidden": 1.171875, + "loss/logits": 0.17648756504058838, + "loss/reg": 3.2946434657787904e-05, + "step": 574 + }, + { + "epoch": 0.071875, + "grad_norm": 1.9886771440505981, + "grad_norm_var": 0.22606789805653443, + "learning_rate": 0.0001, + "loss": 1.1781, + "loss/crossentropy": 2.7617955207824707, + "loss/hidden": 1.0234375, + "loss/logits": 0.15430483222007751, + "loss/reg": 3.2934098271653056e-05, + "step": 575 + }, + { + "epoch": 0.072, + "grad_norm": 2.9631826877593994, + "grad_norm_var": 0.24033580872959698, + "learning_rate": 0.0001, + "loss": 1.5852, + "loss/crossentropy": 2.5309393405914307, + "loss/hidden": 1.390625, + "loss/logits": 0.19425997138023376, + "loss/reg": 3.292256951681338e-05, + "step": 576 + }, + { + "epoch": 0.072125, + "grad_norm": 1.988107681274414, + "grad_norm_var": 0.24468194088425713, + "learning_rate": 0.0001, + "loss": 1.3095, + "loss/crossentropy": 2.2598788738250732, + "loss/hidden": 1.1171875, + "loss/logits": 0.1919427067041397, + "loss/reg": 3.2912772439885885e-05, + "step": 577 + }, + { + "epoch": 0.07225, + "grad_norm": 2.3229868412017822, + "grad_norm_var": 0.23957991140601814, + "learning_rate": 0.0001, + "loss": 1.3089, + "loss/crossentropy": 2.2162625789642334, + "loss/hidden": 1.125, + "loss/logits": 0.1835722178220749, + "loss/reg": 3.289993037469685e-05, + "step": 578 + }, + { + "epoch": 0.072375, + "grad_norm": 3.223689079284668, + "grad_norm_var": 0.2627150699340527, + "learning_rate": 0.0001, + "loss": 1.5039, + "loss/crossentropy": 2.5137839317321777, + "loss/hidden": 1.2734375, + "loss/logits": 0.230127215385437, + "loss/reg": 3.2886728149605915e-05, + "step": 579 + }, + { + "epoch": 0.0725, + "grad_norm": 2.5606164932250977, + "grad_norm_var": 0.2609382601960414, + "learning_rate": 0.0001, + "loss": 1.2698, + "loss/crossentropy": 2.4530560970306396, + "loss/hidden": 1.109375, + "loss/logits": 0.1600678414106369, + "loss/reg": 3.287781146354973e-05, + "step": 580 + }, + { + "epoch": 0.072625, + "grad_norm": 2.2816648483276367, + "grad_norm_var": 0.24516125701289765, + "learning_rate": 0.0001, + "loss": 1.3915, + "loss/crossentropy": 2.7529642581939697, + "loss/hidden": 1.203125, + "loss/logits": 0.18805408477783203, + "loss/reg": 3.2862662919797e-05, + "step": 581 + }, + { + "epoch": 0.07275, + "grad_norm": 2.407241106033325, + "grad_norm_var": 0.24525415601641257, + "learning_rate": 0.0001, + "loss": 1.0467, + "loss/crossentropy": 2.632035255432129, + "loss/hidden": 0.90625, + "loss/logits": 0.14010252058506012, + "loss/reg": 3.284540434833616e-05, + "step": 582 + }, + { + "epoch": 0.072875, + "grad_norm": 2.066805362701416, + "grad_norm_var": 0.24849913026991757, + "learning_rate": 0.0001, + "loss": 1.2104, + "loss/crossentropy": 2.3920540809631348, + "loss/hidden": 1.046875, + "loss/logits": 0.16320452094078064, + "loss/reg": 3.283292244304903e-05, + "step": 583 + }, + { + "epoch": 0.073, + "grad_norm": 2.0391831398010254, + "grad_norm_var": 0.2564456863753709, + "learning_rate": 0.0001, + "loss": 1.0556, + "loss/crossentropy": 2.683037519454956, + "loss/hidden": 0.91796875, + "loss/logits": 0.13727085292339325, + "loss/reg": 3.282453690189868e-05, + "step": 584 + }, + { + "epoch": 0.073125, + "grad_norm": 2.0682592391967773, + "grad_norm_var": 0.17057470989331774, + "learning_rate": 0.0001, + "loss": 1.1549, + "loss/crossentropy": 2.4454257488250732, + "loss/hidden": 0.99609375, + "loss/logits": 0.15847988426685333, + "loss/reg": 3.2813150028232485e-05, + "step": 585 + }, + { + "epoch": 0.07325, + "grad_norm": 2.8501038551330566, + "grad_norm_var": 0.15157959645531768, + "learning_rate": 0.0001, + "loss": 1.0946, + "loss/crossentropy": 2.457731008529663, + "loss/hidden": 0.9375, + "loss/logits": 0.15681084990501404, + "loss/reg": 3.279821976320818e-05, + "step": 586 + }, + { + "epoch": 0.073375, + "grad_norm": 2.148087739944458, + "grad_norm_var": 0.15240953654303122, + "learning_rate": 0.0001, + "loss": 1.4257, + "loss/crossentropy": 2.22470760345459, + "loss/hidden": 1.2421875, + "loss/logits": 0.18313491344451904, + "loss/reg": 3.2786967494757846e-05, + "step": 587 + }, + { + "epoch": 0.0735, + "grad_norm": 2.5976343154907227, + "grad_norm_var": 0.1474827523957486, + "learning_rate": 0.0001, + "loss": 1.3191, + "loss/crossentropy": 2.3259403705596924, + "loss/hidden": 1.1328125, + "loss/logits": 0.18595190346240997, + "loss/reg": 3.277562063885853e-05, + "step": 588 + }, + { + "epoch": 0.073625, + "grad_norm": 2.4023032188415527, + "grad_norm_var": 0.14019368342773214, + "learning_rate": 0.0001, + "loss": 1.2833, + "loss/crossentropy": 2.4084057807922363, + "loss/hidden": 1.1015625, + "loss/logits": 0.18139836192131042, + "loss/reg": 3.276380448369309e-05, + "step": 589 + }, + { + "epoch": 0.07375, + "grad_norm": 2.2520782947540283, + "grad_norm_var": 0.13699608517203637, + "learning_rate": 0.0001, + "loss": 1.1876, + "loss/crossentropy": 2.3067727088928223, + "loss/hidden": 1.046875, + "loss/logits": 0.14043688774108887, + "loss/reg": 3.27551897498779e-05, + "step": 590 + }, + { + "epoch": 0.073875, + "grad_norm": 1.9270496368408203, + "grad_norm_var": 0.14049036125966244, + "learning_rate": 0.0001, + "loss": 1.2389, + "loss/crossentropy": 2.1966562271118164, + "loss/hidden": 1.0703125, + "loss/logits": 0.168295219540596, + "loss/reg": 3.274507253081538e-05, + "step": 591 + }, + { + "epoch": 0.074, + "grad_norm": 4.388664722442627, + "grad_norm_var": 0.37810686870707194, + "learning_rate": 0.0001, + "loss": 1.69, + "loss/crossentropy": 2.4276816844940186, + "loss/hidden": 1.4375, + "loss/logits": 0.25220978260040283, + "loss/reg": 3.2734093110775575e-05, + "step": 592 + }, + { + "epoch": 0.074125, + "grad_norm": 2.117884635925293, + "grad_norm_var": 0.37081618809672723, + "learning_rate": 0.0001, + "loss": 1.3117, + "loss/crossentropy": 2.3757622241973877, + "loss/hidden": 1.125, + "loss/logits": 0.18635454773902893, + "loss/reg": 3.272575850132853e-05, + "step": 593 + }, + { + "epoch": 0.07425, + "grad_norm": 2.390171766281128, + "grad_norm_var": 0.3697061945227233, + "learning_rate": 0.0001, + "loss": 1.4421, + "loss/crossentropy": 2.5182714462280273, + "loss/hidden": 1.2421875, + "loss/logits": 0.1995772272348404, + "loss/reg": 3.2716481655370444e-05, + "step": 594 + }, + { + "epoch": 0.074375, + "grad_norm": 5.211070537567139, + "grad_norm_var": 0.8129410955026809, + "learning_rate": 0.0001, + "loss": 1.8681, + "loss/crossentropy": 2.823439598083496, + "loss/hidden": 1.5703125, + "loss/logits": 0.297451913356781, + "loss/reg": 3.270539309596643e-05, + "step": 595 + }, + { + "epoch": 0.0745, + "grad_norm": 2.154491424560547, + "grad_norm_var": 0.8257505950367497, + "learning_rate": 0.0001, + "loss": 1.2172, + "loss/crossentropy": 2.5359957218170166, + "loss/hidden": 1.0546875, + "loss/logits": 0.16215971112251282, + "loss/reg": 3.269331136834808e-05, + "step": 596 + }, + { + "epoch": 0.074625, + "grad_norm": 2.0445237159729004, + "grad_norm_var": 0.8387431916180466, + "learning_rate": 0.0001, + "loss": 1.0679, + "loss/crossentropy": 2.402836322784424, + "loss/hidden": 0.93359375, + "loss/logits": 0.13397127389907837, + "loss/reg": 3.268320142524317e-05, + "step": 597 + }, + { + "epoch": 0.07475, + "grad_norm": 1.9361920356750488, + "grad_norm_var": 0.8626197388399758, + "learning_rate": 0.0001, + "loss": 1.1555, + "loss/crossentropy": 2.455265522003174, + "loss/hidden": 1.0, + "loss/logits": 0.15518441796302795, + "loss/reg": 3.267572174081579e-05, + "step": 598 + }, + { + "epoch": 0.074875, + "grad_norm": 2.7497634887695312, + "grad_norm_var": 0.848941044328622, + "learning_rate": 0.0001, + "loss": 1.4715, + "loss/crossentropy": 2.318593978881836, + "loss/hidden": 1.265625, + "loss/logits": 0.20550090074539185, + "loss/reg": 3.266748899477534e-05, + "step": 599 + }, + { + "epoch": 0.075, + "grad_norm": 2.400951862335205, + "grad_norm_var": 0.8310417345248119, + "learning_rate": 0.0001, + "loss": 1.2447, + "loss/crossentropy": 2.722557306289673, + "loss/hidden": 1.0859375, + "loss/logits": 0.15843895077705383, + "loss/reg": 3.265763007220812e-05, + "step": 600 + }, + { + "epoch": 0.075125, + "grad_norm": 1.875627875328064, + "grad_norm_var": 0.8470812137580309, + "learning_rate": 0.0001, + "loss": 1.177, + "loss/crossentropy": 2.498196840286255, + "loss/hidden": 1.0234375, + "loss/logits": 0.15324150025844574, + "loss/reg": 3.2648506021359935e-05, + "step": 601 + }, + { + "epoch": 0.07525, + "grad_norm": 2.43035626411438, + "grad_norm_var": 0.8435589871140308, + "learning_rate": 0.0001, + "loss": 1.4324, + "loss/crossentropy": 2.3864665031433105, + "loss/hidden": 1.2265625, + "loss/logits": 0.20555616915225983, + "loss/reg": 3.264078623033129e-05, + "step": 602 + }, + { + "epoch": 0.075375, + "grad_norm": 1.9471216201782227, + "grad_norm_var": 0.8572325437028823, + "learning_rate": 0.0001, + "loss": 1.1456, + "loss/crossentropy": 2.574090003967285, + "loss/hidden": 1.0078125, + "loss/logits": 0.13747350871562958, + "loss/reg": 3.263230973971076e-05, + "step": 603 + }, + { + "epoch": 0.0755, + "grad_norm": 2.211796522140503, + "grad_norm_var": 0.8641696494148537, + "learning_rate": 0.0001, + "loss": 1.215, + "loss/crossentropy": 2.3869664669036865, + "loss/hidden": 1.0625, + "loss/logits": 0.15220922231674194, + "loss/reg": 3.2624688174109906e-05, + "step": 604 + }, + { + "epoch": 0.075625, + "grad_norm": 2.5104334354400635, + "grad_norm_var": 0.8630953581455959, + "learning_rate": 0.0001, + "loss": 1.0907, + "loss/crossentropy": 2.945605516433716, + "loss/hidden": 0.93359375, + "loss/logits": 0.1568160206079483, + "loss/reg": 3.2617448596283793e-05, + "step": 605 + }, + { + "epoch": 0.07575, + "grad_norm": 2.0008065700531006, + "grad_norm_var": 0.8764953924757828, + "learning_rate": 0.0001, + "loss": 1.1952, + "loss/crossentropy": 2.3963301181793213, + "loss/hidden": 1.0390625, + "loss/logits": 0.15582264959812164, + "loss/reg": 3.2607302273390815e-05, + "step": 606 + }, + { + "epoch": 0.075875, + "grad_norm": 2.4198153018951416, + "grad_norm_var": 0.8528082724629522, + "learning_rate": 0.0001, + "loss": 1.1929, + "loss/crossentropy": 2.565782308578491, + "loss/hidden": 1.03125, + "loss/logits": 0.1613207310438156, + "loss/reg": 3.26011395372916e-05, + "step": 607 + }, + { + "epoch": 0.076, + "grad_norm": 2.041506052017212, + "grad_norm_var": 0.6215099906491213, + "learning_rate": 0.0001, + "loss": 1.1144, + "loss/crossentropy": 2.477787733078003, + "loss/hidden": 0.97265625, + "loss/logits": 0.14146365225315094, + "loss/reg": 3.2594605727354065e-05, + "step": 608 + }, + { + "epoch": 0.076125, + "grad_norm": 2.013791561126709, + "grad_norm_var": 0.6261395795901743, + "learning_rate": 0.0001, + "loss": 1.3659, + "loss/crossentropy": 2.2931771278381348, + "loss/hidden": 1.1796875, + "loss/logits": 0.1858958601951599, + "loss/reg": 3.258982178522274e-05, + "step": 609 + }, + { + "epoch": 0.07625, + "grad_norm": 2.0193915367126465, + "grad_norm_var": 0.6350275632712709, + "learning_rate": 0.0001, + "loss": 1.4135, + "loss/crossentropy": 2.249884605407715, + "loss/hidden": 1.2421875, + "loss/logits": 0.17098332941532135, + "loss/reg": 3.2585812732577324e-05, + "step": 610 + }, + { + "epoch": 0.076375, + "grad_norm": 1.9450736045837402, + "grad_norm_var": 0.06580480166266979, + "learning_rate": 0.0001, + "loss": 1.2443, + "loss/crossentropy": 2.4982762336730957, + "loss/hidden": 1.0859375, + "loss/logits": 0.1580805480480194, + "loss/reg": 3.2582185667706653e-05, + "step": 611 + }, + { + "epoch": 0.0765, + "grad_norm": 2.3560380935668945, + "grad_norm_var": 0.06795768948846134, + "learning_rate": 0.0001, + "loss": 1.3639, + "loss/crossentropy": 2.5245325565338135, + "loss/hidden": 1.1796875, + "loss/logits": 0.1839311122894287, + "loss/reg": 3.257774005760439e-05, + "step": 612 + }, + { + "epoch": 0.076625, + "grad_norm": 2.068222761154175, + "grad_norm_var": 0.06756012472509756, + "learning_rate": 0.0001, + "loss": 1.2086, + "loss/crossentropy": 2.5093209743499756, + "loss/hidden": 1.0390625, + "loss/logits": 0.1692183017730713, + "loss/reg": 3.257165371906012e-05, + "step": 613 + }, + { + "epoch": 0.07675, + "grad_norm": 3.0041604042053223, + "grad_norm_var": 0.1037103800860999, + "learning_rate": 0.0001, + "loss": 1.3476, + "loss/crossentropy": 2.4445579051971436, + "loss/hidden": 1.140625, + "loss/logits": 0.20659969747066498, + "loss/reg": 3.2569387258263305e-05, + "step": 614 + }, + { + "epoch": 0.076875, + "grad_norm": 2.224217176437378, + "grad_norm_var": 0.08593044093617071, + "learning_rate": 0.0001, + "loss": 1.2956, + "loss/crossentropy": 2.265068769454956, + "loss/hidden": 1.1484375, + "loss/logits": 0.14682193100452423, + "loss/reg": 3.256817581132054e-05, + "step": 615 + }, + { + "epoch": 0.077, + "grad_norm": 1.9176480770111084, + "grad_norm_var": 0.0886645679147117, + "learning_rate": 0.0001, + "loss": 1.0694, + "loss/crossentropy": 2.406165599822998, + "loss/hidden": 0.94140625, + "loss/logits": 0.12762577831745148, + "loss/reg": 3.256642958149314e-05, + "step": 616 + }, + { + "epoch": 0.077125, + "grad_norm": 1.9381603002548218, + "grad_norm_var": 0.08631597110569747, + "learning_rate": 0.0001, + "loss": 1.2835, + "loss/crossentropy": 2.292922258377075, + "loss/hidden": 1.1015625, + "loss/logits": 0.18164017796516418, + "loss/reg": 3.255980118410662e-05, + "step": 617 + }, + { + "epoch": 0.07725, + "grad_norm": 3.8307087421417236, + "grad_norm_var": 0.25365581117415176, + "learning_rate": 0.0001, + "loss": 1.5716, + "loss/crossentropy": 2.24588680267334, + "loss/hidden": 1.3828125, + "loss/logits": 0.1884302794933319, + "loss/reg": 3.255438059568405e-05, + "step": 618 + }, + { + "epoch": 0.077375, + "grad_norm": 2.7307581901550293, + "grad_norm_var": 0.25745859334372617, + "learning_rate": 0.0001, + "loss": 1.4418, + "loss/crossentropy": 2.5056464672088623, + "loss/hidden": 1.2265625, + "loss/logits": 0.2148783802986145, + "loss/reg": 3.2550698961131275e-05, + "step": 619 + }, + { + "epoch": 0.0775, + "grad_norm": 1.9064363241195679, + "grad_norm_var": 0.2679782151655727, + "learning_rate": 0.0001, + "loss": 1.0563, + "loss/crossentropy": 2.5244431495666504, + "loss/hidden": 0.9296875, + "loss/logits": 0.1262451708316803, + "loss/reg": 3.25437868013978e-05, + "step": 620 + }, + { + "epoch": 0.077625, + "grad_norm": 2.1985690593719482, + "grad_norm_var": 0.2656371947904515, + "learning_rate": 0.0001, + "loss": 1.058, + "loss/crossentropy": 2.575101613998413, + "loss/hidden": 0.921875, + "loss/logits": 0.1357976496219635, + "loss/reg": 3.2535335776628926e-05, + "step": 621 + }, + { + "epoch": 0.07775, + "grad_norm": 2.120142936706543, + "grad_norm_var": 0.26195032172526944, + "learning_rate": 0.0001, + "loss": 1.1902, + "loss/crossentropy": 2.801929235458374, + "loss/hidden": 1.046875, + "loss/logits": 0.14296142756938934, + "loss/reg": 3.2529584132134914e-05, + "step": 622 + }, + { + "epoch": 0.077875, + "grad_norm": 2.0709078311920166, + "grad_norm_var": 0.2637948830624718, + "learning_rate": 0.0001, + "loss": 1.0827, + "loss/crossentropy": 2.676384687423706, + "loss/hidden": 0.953125, + "loss/logits": 0.12926620244979858, + "loss/reg": 3.2523679692531005e-05, + "step": 623 + }, + { + "epoch": 0.078, + "grad_norm": 2.508070707321167, + "grad_norm_var": 0.2629301797210142, + "learning_rate": 0.0001, + "loss": 1.4165, + "loss/crossentropy": 2.2480056285858154, + "loss/hidden": 1.2421875, + "loss/logits": 0.17401380836963654, + "loss/reg": 3.251908492529765e-05, + "step": 624 + }, + { + "epoch": 0.078125, + "grad_norm": 3.1828787326812744, + "grad_norm_var": 0.30322979782217746, + "learning_rate": 0.0001, + "loss": 1.2913, + "loss/crossentropy": 2.6146721839904785, + "loss/hidden": 1.125, + "loss/logits": 0.16597937047481537, + "loss/reg": 3.251036469009705e-05, + "step": 625 + }, + { + "epoch": 0.07825, + "grad_norm": 2.092921733856201, + "grad_norm_var": 0.3000682178451103, + "learning_rate": 0.0001, + "loss": 1.1381, + "loss/crossentropy": 2.7154953479766846, + "loss/hidden": 0.9921875, + "loss/logits": 0.14560630917549133, + "loss/reg": 3.2501688110642135e-05, + "step": 626 + }, + { + "epoch": 0.078375, + "grad_norm": 2.392030954360962, + "grad_norm_var": 0.2865792056426225, + "learning_rate": 0.0001, + "loss": 1.3192, + "loss/crossentropy": 2.4488956928253174, + "loss/hidden": 1.171875, + "loss/logits": 0.14700947701931, + "loss/reg": 3.249543806305155e-05, + "step": 627 + }, + { + "epoch": 0.0785, + "grad_norm": 1.8949799537658691, + "grad_norm_var": 0.30311274506456226, + "learning_rate": 0.0001, + "loss": 1.0355, + "loss/crossentropy": 2.656559705734253, + "loss/hidden": 0.90234375, + "loss/logits": 0.1328512728214264, + "loss/reg": 3.248927168897353e-05, + "step": 628 + }, + { + "epoch": 0.078625, + "grad_norm": 1.8167521953582764, + "grad_norm_var": 0.317520497460355, + "learning_rate": 0.0001, + "loss": 1.2374, + "loss/crossentropy": 2.555947780609131, + "loss/hidden": 1.0703125, + "loss/logits": 0.1667390763759613, + "loss/reg": 3.248196662752889e-05, + "step": 629 + }, + { + "epoch": 0.07875, + "grad_norm": 1.9900246858596802, + "grad_norm_var": 0.29528383715011153, + "learning_rate": 0.0001, + "loss": 1.128, + "loss/crossentropy": 2.405784845352173, + "loss/hidden": 0.9765625, + "loss/logits": 0.1510818898677826, + "loss/reg": 3.247513450332917e-05, + "step": 630 + }, + { + "epoch": 0.078875, + "grad_norm": 1.9893391132354736, + "grad_norm_var": 0.30113488116037995, + "learning_rate": 0.0001, + "loss": 1.2307, + "loss/crossentropy": 2.5071752071380615, + "loss/hidden": 1.0625, + "loss/logits": 0.16787654161453247, + "loss/reg": 3.2470503356307745e-05, + "step": 631 + }, + { + "epoch": 0.079, + "grad_norm": 2.4550352096557617, + "grad_norm_var": 0.29277153949887513, + "learning_rate": 0.0001, + "loss": 1.5059, + "loss/crossentropy": 2.2291836738586426, + "loss/hidden": 1.296875, + "loss/logits": 0.20867902040481567, + "loss/reg": 3.24644279316999e-05, + "step": 632 + }, + { + "epoch": 0.079125, + "grad_norm": 2.5132346153259277, + "grad_norm_var": 0.2841737256035174, + "learning_rate": 0.0001, + "loss": 1.1279, + "loss/crossentropy": 2.668013572692871, + "loss/hidden": 0.98046875, + "loss/logits": 0.14705964922904968, + "loss/reg": 3.2460746297147125e-05, + "step": 633 + }, + { + "epoch": 0.07925, + "grad_norm": 2.484381675720215, + "grad_norm_var": 0.1326996354001511, + "learning_rate": 0.0001, + "loss": 1.3101, + "loss/crossentropy": 2.3046844005584717, + "loss/hidden": 1.1328125, + "loss/logits": 0.17695116996765137, + "loss/reg": 3.245668631279841e-05, + "step": 634 + }, + { + "epoch": 0.079375, + "grad_norm": 2.414663076400757, + "grad_norm_var": 0.11959498058305182, + "learning_rate": 0.0001, + "loss": 1.2178, + "loss/crossentropy": 2.610231399536133, + "loss/hidden": 1.0625, + "loss/logits": 0.15494966506958008, + "loss/reg": 3.2452466257382184e-05, + "step": 635 + }, + { + "epoch": 0.0795, + "grad_norm": 2.3784496784210205, + "grad_norm_var": 0.11177809540983971, + "learning_rate": 0.0001, + "loss": 1.194, + "loss/crossentropy": 2.3995094299316406, + "loss/hidden": 1.0546875, + "loss/logits": 0.13898079097270966, + "loss/reg": 3.244556864956394e-05, + "step": 636 + }, + { + "epoch": 0.079625, + "grad_norm": 1.608014464378357, + "grad_norm_var": 0.14009733722727352, + "learning_rate": 0.0001, + "loss": 1.114, + "loss/crossentropy": 2.6790590286254883, + "loss/hidden": 0.9765625, + "loss/logits": 0.13714221119880676, + "loss/reg": 3.244182880735025e-05, + "step": 637 + }, + { + "epoch": 0.07975, + "grad_norm": 4.00265645980835, + "grad_norm_var": 0.3303772680116007, + "learning_rate": 0.0001, + "loss": 1.2438, + "loss/crossentropy": 2.4462854862213135, + "loss/hidden": 1.0, + "loss/logits": 0.2435140162706375, + "loss/reg": 3.243668834329583e-05, + "step": 638 + }, + { + "epoch": 0.079875, + "grad_norm": 1.6296852827072144, + "grad_norm_var": 0.3596780665720629, + "learning_rate": 0.0001, + "loss": 1.234, + "loss/crossentropy": 2.5636279582977295, + "loss/hidden": 1.0703125, + "loss/logits": 0.16334283351898193, + "loss/reg": 3.2429612474516034e-05, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 1.871125340461731, + "grad_norm_var": 0.3702995417825491, + "learning_rate": 0.0001, + "loss": 1.293, + "loss/crossentropy": 2.4291610717773438, + "loss/hidden": 1.109375, + "loss/logits": 0.18330498039722443, + "loss/reg": 3.242455204599537e-05, + "step": 640 + }, + { + "epoch": 0.080125, + "grad_norm": 2.3847568035125732, + "grad_norm_var": 0.315601771252336, + "learning_rate": 0.0001, + "loss": 1.4179, + "loss/crossentropy": 2.3636293411254883, + "loss/hidden": 1.2265625, + "loss/logits": 0.1910477578639984, + "loss/reg": 3.241969898226671e-05, + "step": 641 + }, + { + "epoch": 0.08025, + "grad_norm": 2.253438949584961, + "grad_norm_var": 0.31395991504814447, + "learning_rate": 0.0001, + "loss": 1.2483, + "loss/crossentropy": 2.436042547225952, + "loss/hidden": 1.0859375, + "loss/logits": 0.16208630800247192, + "loss/reg": 3.241678132326342e-05, + "step": 642 + }, + { + "epoch": 0.080375, + "grad_norm": 2.3757476806640625, + "grad_norm_var": 0.3136787840213039, + "learning_rate": 0.0001, + "loss": 1.4148, + "loss/crossentropy": 2.2471892833709717, + "loss/hidden": 1.2265625, + "loss/logits": 0.18789851665496826, + "loss/reg": 3.240973092033528e-05, + "step": 643 + }, + { + "epoch": 0.0805, + "grad_norm": 2.440065622329712, + "grad_norm_var": 0.30616358156997026, + "learning_rate": 0.0001, + "loss": 1.4075, + "loss/crossentropy": 2.4688096046447754, + "loss/hidden": 1.203125, + "loss/logits": 0.20404627919197083, + "loss/reg": 3.240336445742287e-05, + "step": 644 + }, + { + "epoch": 0.080625, + "grad_norm": 2.2615809440612793, + "grad_norm_var": 0.2905830094011717, + "learning_rate": 0.0001, + "loss": 1.3328, + "loss/crossentropy": 2.3882384300231934, + "loss/hidden": 1.140625, + "loss/logits": 0.19187776744365692, + "loss/reg": 3.239453144487925e-05, + "step": 645 + }, + { + "epoch": 0.08075, + "grad_norm": 1.8651819229125977, + "grad_norm_var": 0.2969792506986578, + "learning_rate": 0.0001, + "loss": 1.3054, + "loss/crossentropy": 2.470728635787964, + "loss/hidden": 1.1328125, + "loss/logits": 0.17228686809539795, + "loss/reg": 3.238815043005161e-05, + "step": 646 + }, + { + "epoch": 0.080875, + "grad_norm": 1.9243041276931763, + "grad_norm_var": 0.3000064631215499, + "learning_rate": 0.0001, + "loss": 1.199, + "loss/crossentropy": 2.514539957046509, + "loss/hidden": 1.03125, + "loss/logits": 0.16743598878383636, + "loss/reg": 3.237748023821041e-05, + "step": 647 + }, + { + "epoch": 0.081, + "grad_norm": 1.8536611795425415, + "grad_norm_var": 0.31049073640116415, + "learning_rate": 0.0001, + "loss": 1.3365, + "loss/crossentropy": 2.3504388332366943, + "loss/hidden": 1.171875, + "loss/logits": 0.16425597667694092, + "loss/reg": 3.2367766834795475e-05, + "step": 648 + }, + { + "epoch": 0.081125, + "grad_norm": 2.169321060180664, + "grad_norm_var": 0.30656022186771464, + "learning_rate": 0.0001, + "loss": 1.1024, + "loss/crossentropy": 2.530703544616699, + "loss/hidden": 0.96875, + "loss/logits": 0.1333715319633484, + "loss/reg": 3.235774420318194e-05, + "step": 649 + }, + { + "epoch": 0.08125, + "grad_norm": 2.1230084896087646, + "grad_norm_var": 0.3031790527826331, + "learning_rate": 0.0001, + "loss": 1.3157, + "loss/crossentropy": 2.310661554336548, + "loss/hidden": 1.140625, + "loss/logits": 0.1747591644525528, + "loss/reg": 3.2350304536521435e-05, + "step": 650 + }, + { + "epoch": 0.081375, + "grad_norm": 2.6993911266326904, + "grad_norm_var": 0.3155514558670091, + "learning_rate": 0.0001, + "loss": 1.347, + "loss/crossentropy": 2.4968340396881104, + "loss/hidden": 1.1484375, + "loss/logits": 0.19822362065315247, + "loss/reg": 3.2342599297408015e-05, + "step": 651 + }, + { + "epoch": 0.0815, + "grad_norm": 2.080300807952881, + "grad_norm_var": 0.3156044041405048, + "learning_rate": 0.0001, + "loss": 1.409, + "loss/crossentropy": 2.424464702606201, + "loss/hidden": 1.234375, + "loss/logits": 0.17433631420135498, + "loss/reg": 3.2335545256501064e-05, + "step": 652 + }, + { + "epoch": 0.081625, + "grad_norm": 1.9338946342468262, + "grad_norm_var": 0.29559018141630206, + "learning_rate": 0.0001, + "loss": 1.2105, + "loss/crossentropy": 2.3058454990386963, + "loss/hidden": 1.0625, + "loss/logits": 0.14769870042800903, + "loss/reg": 3.2326461223419756e-05, + "step": 653 + }, + { + "epoch": 0.08175, + "grad_norm": 2.011249542236328, + "grad_norm_var": 0.07589101708228417, + "learning_rate": 0.0001, + "loss": 1.1236, + "loss/crossentropy": 2.682971239089966, + "loss/hidden": 0.9453125, + "loss/logits": 0.1779399812221527, + "loss/reg": 3.23207896144595e-05, + "step": 654 + }, + { + "epoch": 0.081875, + "grad_norm": 2.7516329288482666, + "grad_norm_var": 0.08162097532279228, + "learning_rate": 0.0001, + "loss": 1.46, + "loss/crossentropy": 2.5869898796081543, + "loss/hidden": 1.265625, + "loss/logits": 0.19408713281154633, + "loss/reg": 3.231072332710028e-05, + "step": 655 + }, + { + "epoch": 0.082, + "grad_norm": 2.366501569747925, + "grad_norm_var": 0.07606725464947293, + "learning_rate": 0.0001, + "loss": 1.1699, + "loss/crossentropy": 2.1895010471343994, + "loss/hidden": 1.0390625, + "loss/logits": 0.13050538301467896, + "loss/reg": 3.230313814128749e-05, + "step": 656 + }, + { + "epoch": 0.082125, + "grad_norm": 2.0115721225738525, + "grad_norm_var": 0.07649272760625057, + "learning_rate": 0.0001, + "loss": 1.2011, + "loss/crossentropy": 2.481351375579834, + "loss/hidden": 1.0390625, + "loss/logits": 0.16167044639587402, + "loss/reg": 3.2294527045451105e-05, + "step": 657 + }, + { + "epoch": 0.08225, + "grad_norm": 2.1796374320983887, + "grad_norm_var": 0.07625861744395455, + "learning_rate": 0.0001, + "loss": 1.1522, + "loss/crossentropy": 2.40244722366333, + "loss/hidden": 1.0078125, + "loss/logits": 0.14410331845283508, + "loss/reg": 3.2286981877405196e-05, + "step": 658 + }, + { + "epoch": 0.082375, + "grad_norm": 3.0727481842041016, + "grad_norm_var": 0.1238429317095663, + "learning_rate": 0.0001, + "loss": 1.4898, + "loss/crossentropy": 2.295776128768921, + "loss/hidden": 1.2734375, + "loss/logits": 0.21601419150829315, + "loss/reg": 3.2281091989716515e-05, + "step": 659 + }, + { + "epoch": 0.0825, + "grad_norm": 3.2960402965545654, + "grad_norm_var": 0.19315411367156585, + "learning_rate": 0.0001, + "loss": 1.3082, + "loss/crossentropy": 2.6390655040740967, + "loss/hidden": 1.140625, + "loss/logits": 0.1673002392053604, + "loss/reg": 3.227585693821311e-05, + "step": 660 + }, + { + "epoch": 0.082625, + "grad_norm": 3.4391167163848877, + "grad_norm_var": 0.27574634545379506, + "learning_rate": 0.0001, + "loss": 1.1884, + "loss/crossentropy": 2.5897035598754883, + "loss/hidden": 1.03125, + "loss/logits": 0.15684694051742554, + "loss/reg": 3.2269697840092704e-05, + "step": 661 + }, + { + "epoch": 0.08275, + "grad_norm": 4.276190280914307, + "grad_norm_var": 0.4796355036633154, + "learning_rate": 0.0001, + "loss": 1.1999, + "loss/crossentropy": 2.5243704319000244, + "loss/hidden": 1.046875, + "loss/logits": 0.15274158120155334, + "loss/reg": 3.226202534278855e-05, + "step": 662 + }, + { + "epoch": 0.082875, + "grad_norm": 2.588935136795044, + "grad_norm_var": 0.4551827768206384, + "learning_rate": 0.0001, + "loss": 1.1808, + "loss/crossentropy": 2.7738707065582275, + "loss/hidden": 1.015625, + "loss/logits": 0.16485591232776642, + "loss/reg": 3.2259602448903024e-05, + "step": 663 + }, + { + "epoch": 0.083, + "grad_norm": 2.2615346908569336, + "grad_norm_var": 0.42753040987829916, + "learning_rate": 0.0001, + "loss": 1.1047, + "loss/crossentropy": 2.4449167251586914, + "loss/hidden": 0.96484375, + "loss/logits": 0.13952355086803436, + "loss/reg": 3.225212640245445e-05, + "step": 664 + }, + { + "epoch": 0.083125, + "grad_norm": 2.022779941558838, + "grad_norm_var": 0.4368736230271139, + "learning_rate": 0.0001, + "loss": 1.134, + "loss/crossentropy": 2.4114463329315186, + "loss/hidden": 0.99609375, + "loss/logits": 0.13763144612312317, + "loss/reg": 3.224598913220689e-05, + "step": 665 + }, + { + "epoch": 0.08325, + "grad_norm": 2.180579900741577, + "grad_norm_var": 0.433652208727842, + "learning_rate": 0.0001, + "loss": 1.3077, + "loss/crossentropy": 2.3560714721679688, + "loss/hidden": 1.140625, + "loss/logits": 0.16675901412963867, + "loss/reg": 3.2240248401649296e-05, + "step": 666 + }, + { + "epoch": 0.083375, + "grad_norm": 2.29941725730896, + "grad_norm_var": 0.43692416598824574, + "learning_rate": 0.0001, + "loss": 1.2171, + "loss/crossentropy": 2.4641499519348145, + "loss/hidden": 1.0390625, + "loss/logits": 0.17773011326789856, + "loss/reg": 3.223533713025972e-05, + "step": 667 + }, + { + "epoch": 0.0835, + "grad_norm": 13.708366394042969, + "grad_norm_var": 8.162143239260631, + "learning_rate": 0.0001, + "loss": 1.9561, + "loss/crossentropy": 1.3007676601409912, + "loss/hidden": 1.890625, + "loss/logits": 0.06512448191642761, + "loss/reg": 3.222926170565188e-05, + "step": 668 + }, + { + "epoch": 0.083625, + "grad_norm": 2.223507881164551, + "grad_norm_var": 8.1155980860334, + "learning_rate": 0.0001, + "loss": 1.1802, + "loss/crossentropy": 2.3888051509857178, + "loss/hidden": 1.0390625, + "loss/logits": 0.14084848761558533, + "loss/reg": 3.22245032293722e-05, + "step": 669 + }, + { + "epoch": 0.08375, + "grad_norm": 4.826849460601807, + "grad_norm_var": 8.129844594294168, + "learning_rate": 0.0001, + "loss": 1.2754, + "loss/crossentropy": 2.787766456604004, + "loss/hidden": 1.125, + "loss/logits": 0.15011203289031982, + "loss/reg": 3.2221811125054955e-05, + "step": 670 + }, + { + "epoch": 0.083875, + "grad_norm": 2.107593059539795, + "grad_norm_var": 8.217378105018076, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.673910617828369, + "loss/hidden": 1.0, + "loss/logits": 0.1403258740901947, + "loss/reg": 3.222101804567501e-05, + "step": 671 + }, + { + "epoch": 0.084, + "grad_norm": 2.0006186962127686, + "grad_norm_var": 8.27757030990291, + "learning_rate": 0.0001, + "loss": 1.0909, + "loss/crossentropy": 2.7820777893066406, + "loss/hidden": 0.93359375, + "loss/logits": 0.15698012709617615, + "loss/reg": 3.221366569050588e-05, + "step": 672 + }, + { + "epoch": 0.084125, + "grad_norm": 2.0933070182800293, + "grad_norm_var": 8.262791740468037, + "learning_rate": 0.0001, + "loss": 1.1451, + "loss/crossentropy": 2.8087332248687744, + "loss/hidden": 0.99609375, + "loss/logits": 0.14868998527526855, + "loss/reg": 3.220442158635706e-05, + "step": 673 + }, + { + "epoch": 0.08425, + "grad_norm": 2.358794927597046, + "grad_norm_var": 8.235381625712368, + "learning_rate": 0.0001, + "loss": 1.2276, + "loss/crossentropy": 2.664926528930664, + "loss/hidden": 1.0703125, + "loss/logits": 0.1569160521030426, + "loss/reg": 3.2195319363381714e-05, + "step": 674 + }, + { + "epoch": 0.084375, + "grad_norm": 1.998713731765747, + "grad_norm_var": 8.357532166242084, + "learning_rate": 0.0001, + "loss": 1.2374, + "loss/crossentropy": 2.4228875637054443, + "loss/hidden": 1.1015625, + "loss/logits": 0.135471910238266, + "loss/reg": 3.218696656404063e-05, + "step": 675 + }, + { + "epoch": 0.0845, + "grad_norm": 1.7373796701431274, + "grad_norm_var": 8.521654653515995, + "learning_rate": 0.0001, + "loss": 1.1959, + "loss/crossentropy": 2.411160945892334, + "loss/hidden": 1.046875, + "loss/logits": 0.1486646831035614, + "loss/reg": 3.217814810341224e-05, + "step": 676 + }, + { + "epoch": 0.084625, + "grad_norm": 2.1248972415924072, + "grad_norm_var": 8.597818746749187, + "learning_rate": 0.0001, + "loss": 1.2671, + "loss/crossentropy": 2.322160005569458, + "loss/hidden": 1.09375, + "loss/logits": 0.17302247881889343, + "loss/reg": 3.216690311091952e-05, + "step": 677 + }, + { + "epoch": 0.08475, + "grad_norm": 2.1823222637176514, + "grad_norm_var": 8.564568662216503, + "learning_rate": 0.0001, + "loss": 1.0307, + "loss/crossentropy": 2.5252137184143066, + "loss/hidden": 0.90625, + "loss/logits": 0.1240834966301918, + "loss/reg": 3.215742253814824e-05, + "step": 678 + }, + { + "epoch": 0.084875, + "grad_norm": 2.385857343673706, + "grad_norm_var": 8.579487634417987, + "learning_rate": 0.0001, + "loss": 1.301, + "loss/crossentropy": 2.356689929962158, + "loss/hidden": 1.1484375, + "loss/logits": 0.1522800326347351, + "loss/reg": 3.214823664166033e-05, + "step": 679 + }, + { + "epoch": 0.085, + "grad_norm": 4.639679908752441, + "grad_norm_var": 8.688646971389552, + "learning_rate": 0.0001, + "loss": 1.5426, + "loss/crossentropy": 2.088296413421631, + "loss/hidden": 1.3359375, + "loss/logits": 0.20634829998016357, + "loss/reg": 3.213853415218182e-05, + "step": 680 + }, + { + "epoch": 0.085125, + "grad_norm": 2.2640221118927, + "grad_norm_var": 8.6550401893545, + "learning_rate": 0.0001, + "loss": 1.118, + "loss/crossentropy": 2.519866466522217, + "loss/hidden": 0.9765625, + "loss/logits": 0.1411462128162384, + "loss/reg": 3.212389492546208e-05, + "step": 681 + }, + { + "epoch": 0.08525, + "grad_norm": 1.9441190958023071, + "grad_norm_var": 8.690541004695175, + "learning_rate": 0.0001, + "loss": 1.2132, + "loss/crossentropy": 2.6540563106536865, + "loss/hidden": 1.046875, + "loss/logits": 0.1660272479057312, + "loss/reg": 3.211110379197635e-05, + "step": 682 + }, + { + "epoch": 0.085375, + "grad_norm": 2.1394095420837402, + "grad_norm_var": 8.7109484257759, + "learning_rate": 0.0001, + "loss": 1.1292, + "loss/crossentropy": 2.454695463180542, + "loss/hidden": 0.96484375, + "loss/logits": 0.1640835851430893, + "loss/reg": 3.209971691831015e-05, + "step": 683 + }, + { + "epoch": 0.0855, + "grad_norm": 2.5914788246154785, + "grad_norm_var": 0.8159417233832197, + "learning_rate": 0.0001, + "loss": 1.1781, + "loss/crossentropy": 2.354567766189575, + "loss/hidden": 1.03125, + "loss/logits": 0.14655154943466187, + "loss/reg": 3.209043643437326e-05, + "step": 684 + }, + { + "epoch": 0.085625, + "grad_norm": 2.1125876903533936, + "grad_norm_var": 0.8204472332347561, + "learning_rate": 0.0001, + "loss": 1.1683, + "loss/crossentropy": 2.4232585430145264, + "loss/hidden": 1.0234375, + "loss/logits": 0.14451487362384796, + "loss/reg": 3.208300768164918e-05, + "step": 685 + }, + { + "epoch": 0.08575, + "grad_norm": 2.005004405975342, + "grad_norm_var": 0.431076757035284, + "learning_rate": 0.0001, + "loss": 1.2143, + "loss/crossentropy": 2.7095413208007812, + "loss/hidden": 1.0546875, + "loss/logits": 0.159327432513237, + "loss/reg": 3.20776853186544e-05, + "step": 686 + }, + { + "epoch": 0.085875, + "grad_norm": 2.0700883865356445, + "grad_norm_var": 0.43209112768215785, + "learning_rate": 0.0001, + "loss": 1.3235, + "loss/crossentropy": 2.146639347076416, + "loss/hidden": 1.1484375, + "loss/logits": 0.17476913332939148, + "loss/reg": 3.2072603062260896e-05, + "step": 687 + }, + { + "epoch": 0.086, + "grad_norm": 1.8895165920257568, + "grad_norm_var": 0.4371570572715728, + "learning_rate": 0.0001, + "loss": 1.1324, + "loss/crossentropy": 2.6808338165283203, + "loss/hidden": 0.9765625, + "loss/logits": 0.15553486347198486, + "loss/reg": 3.206895780749619e-05, + "step": 688 + }, + { + "epoch": 0.086125, + "grad_norm": 2.0443785190582275, + "grad_norm_var": 0.43854794372576955, + "learning_rate": 0.0001, + "loss": 1.353, + "loss/crossentropy": 2.3666627407073975, + "loss/hidden": 1.1484375, + "loss/logits": 0.2042878270149231, + "loss/reg": 3.2062020181911066e-05, + "step": 689 + }, + { + "epoch": 0.08625, + "grad_norm": 2.71120285987854, + "grad_norm_var": 0.4499880686852729, + "learning_rate": 0.0001, + "loss": 1.3123, + "loss/crossentropy": 2.225734233856201, + "loss/hidden": 1.1640625, + "loss/logits": 0.1479206681251526, + "loss/reg": 3.205486427759752e-05, + "step": 690 + }, + { + "epoch": 0.086375, + "grad_norm": 1.9704585075378418, + "grad_norm_var": 0.45118259423517365, + "learning_rate": 0.0001, + "loss": 1.2783, + "loss/crossentropy": 2.1413638591766357, + "loss/hidden": 1.1171875, + "loss/logits": 0.16075628995895386, + "loss/reg": 3.204666791134514e-05, + "step": 691 + }, + { + "epoch": 0.0865, + "grad_norm": 2.0193846225738525, + "grad_norm_var": 0.43496897541908663, + "learning_rate": 0.0001, + "loss": 1.1574, + "loss/crossentropy": 2.461449384689331, + "loss/hidden": 1.0, + "loss/logits": 0.15706798434257507, + "loss/reg": 3.203826054232195e-05, + "step": 692 + }, + { + "epoch": 0.086625, + "grad_norm": 2.3389272689819336, + "grad_norm_var": 0.43230996116488185, + "learning_rate": 0.0001, + "loss": 1.2097, + "loss/crossentropy": 2.8094615936279297, + "loss/hidden": 1.0546875, + "loss/logits": 0.15468040108680725, + "loss/reg": 3.203040250809863e-05, + "step": 693 + }, + { + "epoch": 0.08675, + "grad_norm": 2.1352357864379883, + "grad_norm_var": 0.4333868407910055, + "learning_rate": 0.0001, + "loss": 1.3619, + "loss/crossentropy": 2.2299346923828125, + "loss/hidden": 1.1875, + "loss/logits": 0.17409831285476685, + "loss/reg": 3.2022617233451456e-05, + "step": 694 + }, + { + "epoch": 0.086875, + "grad_norm": 1.8997348546981812, + "grad_norm_var": 0.44446051921212015, + "learning_rate": 0.0001, + "loss": 1.1474, + "loss/crossentropy": 2.464475154876709, + "loss/hidden": 0.99609375, + "loss/logits": 0.15099835395812988, + "loss/reg": 3.201406798325479e-05, + "step": 695 + }, + { + "epoch": 0.087, + "grad_norm": 2.0923757553100586, + "grad_norm_var": 0.05483191469694191, + "learning_rate": 0.0001, + "loss": 1.4524, + "loss/crossentropy": 2.0002782344818115, + "loss/hidden": 1.2578125, + "loss/logits": 0.19427113234996796, + "loss/reg": 3.2007144909584895e-05, + "step": 696 + }, + { + "epoch": 0.087125, + "grad_norm": 2.2059760093688965, + "grad_norm_var": 0.054076791402477654, + "learning_rate": 0.0001, + "loss": 1.1609, + "loss/crossentropy": 2.6744987964630127, + "loss/hidden": 1.0078125, + "loss/logits": 0.1527547836303711, + "loss/reg": 3.199988714186475e-05, + "step": 697 + }, + { + "epoch": 0.08725, + "grad_norm": 2.7213289737701416, + "grad_norm_var": 0.07198565582104041, + "learning_rate": 0.0001, + "loss": 1.2713, + "loss/crossentropy": 2.643150806427002, + "loss/hidden": 1.0625, + "loss/logits": 0.20850570499897003, + "loss/reg": 3.199481943738647e-05, + "step": 698 + }, + { + "epoch": 0.087375, + "grad_norm": 2.3728480339050293, + "grad_norm_var": 0.07399760919694666, + "learning_rate": 0.0001, + "loss": 1.3903, + "loss/crossentropy": 2.309610605239868, + "loss/hidden": 1.203125, + "loss/logits": 0.18681778013706207, + "loss/reg": 3.198991544195451e-05, + "step": 699 + }, + { + "epoch": 0.0875, + "grad_norm": 2.271571397781372, + "grad_norm_var": 0.06364372961186285, + "learning_rate": 0.0001, + "loss": 1.1185, + "loss/crossentropy": 2.3085150718688965, + "loss/hidden": 0.9765625, + "loss/logits": 0.1415865421295166, + "loss/reg": 3.198152262484655e-05, + "step": 700 + }, + { + "epoch": 0.087625, + "grad_norm": 1.9595108032226562, + "grad_norm_var": 0.06645944280407286, + "learning_rate": 0.0001, + "loss": 1.3993, + "loss/crossentropy": 1.939794659614563, + "loss/hidden": 1.2265625, + "loss/logits": 0.1723695695400238, + "loss/reg": 3.1972482247510925e-05, + "step": 701 + }, + { + "epoch": 0.08775, + "grad_norm": 2.3391191959381104, + "grad_norm_var": 0.06612084152980054, + "learning_rate": 0.0001, + "loss": 1.2046, + "loss/crossentropy": 2.515406847000122, + "loss/hidden": 1.046875, + "loss/logits": 0.1574229598045349, + "loss/reg": 3.1967716495273635e-05, + "step": 702 + }, + { + "epoch": 0.087875, + "grad_norm": 1.834547758102417, + "grad_norm_var": 0.07335743103287004, + "learning_rate": 0.0001, + "loss": 1.2564, + "loss/crossentropy": 2.3931076526641846, + "loss/hidden": 1.09375, + "loss/logits": 0.1623239815235138, + "loss/reg": 3.195786848664284e-05, + "step": 703 + }, + { + "epoch": 0.088, + "grad_norm": 2.1778762340545654, + "grad_norm_var": 0.06756343480080407, + "learning_rate": 0.0001, + "loss": 1.3099, + "loss/crossentropy": 2.487224578857422, + "loss/hidden": 1.125, + "loss/logits": 0.18461742997169495, + "loss/reg": 3.1952691642800346e-05, + "step": 704 + }, + { + "epoch": 0.088125, + "grad_norm": 2.150066375732422, + "grad_norm_var": 0.06616151942176168, + "learning_rate": 0.0001, + "loss": 1.0666, + "loss/crossentropy": 2.3318979740142822, + "loss/hidden": 0.9375, + "loss/logits": 0.1287609487771988, + "loss/reg": 3.1944622605806217e-05, + "step": 705 + }, + { + "epoch": 0.08825, + "grad_norm": 2.367643117904663, + "grad_norm_var": 0.050121908206295925, + "learning_rate": 0.0001, + "loss": 1.2344, + "loss/crossentropy": 2.623997688293457, + "loss/hidden": 1.0625, + "loss/logits": 0.17162081599235535, + "loss/reg": 3.19385617331136e-05, + "step": 706 + }, + { + "epoch": 0.088375, + "grad_norm": 2.1544952392578125, + "grad_norm_var": 0.047132855557610695, + "learning_rate": 0.0001, + "loss": 1.2948, + "loss/crossentropy": 2.4254250526428223, + "loss/hidden": 1.125, + "loss/logits": 0.16950462758541107, + "loss/reg": 3.1930052500683814e-05, + "step": 707 + }, + { + "epoch": 0.0885, + "grad_norm": 2.867445230484009, + "grad_norm_var": 0.07278645639077812, + "learning_rate": 0.0001, + "loss": 1.4275, + "loss/crossentropy": 1.8883780241012573, + "loss/hidden": 1.265625, + "loss/logits": 0.16154590249061584, + "loss/reg": 3.19233258778695e-05, + "step": 708 + }, + { + "epoch": 0.088625, + "grad_norm": 2.6383581161499023, + "grad_norm_var": 0.08221819277021461, + "learning_rate": 0.0001, + "loss": 1.3537, + "loss/crossentropy": 2.7024612426757812, + "loss/hidden": 1.140625, + "loss/logits": 0.21271824836730957, + "loss/reg": 3.191478390363045e-05, + "step": 709 + }, + { + "epoch": 0.08875, + "grad_norm": 1.9694470167160034, + "grad_norm_var": 0.08673286422611473, + "learning_rate": 0.0001, + "loss": 1.2019, + "loss/crossentropy": 2.422839403152466, + "loss/hidden": 1.046875, + "loss/logits": 0.1547282487154007, + "loss/reg": 3.190719507983886e-05, + "step": 710 + }, + { + "epoch": 0.088875, + "grad_norm": 2.1444804668426514, + "grad_norm_var": 0.07900095396042553, + "learning_rate": 0.0001, + "loss": 1.1908, + "loss/crossentropy": 2.7688839435577393, + "loss/hidden": 1.0234375, + "loss/logits": 0.16705238819122314, + "loss/reg": 3.1896463042357937e-05, + "step": 711 + }, + { + "epoch": 0.089, + "grad_norm": 2.683474540710449, + "grad_norm_var": 0.08709981146559749, + "learning_rate": 0.0001, + "loss": 1.363, + "loss/crossentropy": 2.40535044670105, + "loss/hidden": 1.1640625, + "loss/logits": 0.1986573487520218, + "loss/reg": 3.1885796488495544e-05, + "step": 712 + }, + { + "epoch": 0.089125, + "grad_norm": 2.5018036365509033, + "grad_norm_var": 0.08871733491526891, + "learning_rate": 0.0001, + "loss": 1.2907, + "loss/crossentropy": 2.5045652389526367, + "loss/hidden": 1.1328125, + "loss/logits": 0.15754517912864685, + "loss/reg": 3.18759230140131e-05, + "step": 713 + }, + { + "epoch": 0.08925, + "grad_norm": 2.2823939323425293, + "grad_norm_var": 0.07739561040159319, + "learning_rate": 0.0001, + "loss": 1.1379, + "loss/crossentropy": 2.6719865798950195, + "loss/hidden": 0.9765625, + "loss/logits": 0.16101884841918945, + "loss/reg": 3.1869571103015915e-05, + "step": 714 + }, + { + "epoch": 0.089375, + "grad_norm": 1.9277560710906982, + "grad_norm_var": 0.08513910626034443, + "learning_rate": 0.0001, + "loss": 1.3134, + "loss/crossentropy": 2.351673126220703, + "loss/hidden": 1.140625, + "loss/logits": 0.17247360944747925, + "loss/reg": 3.1860403396422043e-05, + "step": 715 + }, + { + "epoch": 0.0895, + "grad_norm": 2.0939505100250244, + "grad_norm_var": 0.08699969013148531, + "learning_rate": 0.0001, + "loss": 1.3346, + "loss/crossentropy": 2.618321418762207, + "loss/hidden": 1.15625, + "loss/logits": 0.178018718957901, + "loss/reg": 3.184879824402742e-05, + "step": 716 + }, + { + "epoch": 0.089625, + "grad_norm": 1.9107413291931152, + "grad_norm_var": 0.08907481761581591, + "learning_rate": 0.0001, + "loss": 1.1277, + "loss/crossentropy": 2.5781402587890625, + "loss/hidden": 0.98046875, + "loss/logits": 0.1469302475452423, + "loss/reg": 3.183981971233152e-05, + "step": 717 + }, + { + "epoch": 0.08975, + "grad_norm": 2.362642526626587, + "grad_norm_var": 0.08938037261504844, + "learning_rate": 0.0001, + "loss": 1.2477, + "loss/crossentropy": 2.685119390487671, + "loss/hidden": 1.078125, + "loss/logits": 0.1692849099636078, + "loss/reg": 3.182946966262534e-05, + "step": 718 + }, + { + "epoch": 0.089875, + "grad_norm": 1.6800652742385864, + "grad_norm_var": 0.09951568078791032, + "learning_rate": 0.0001, + "loss": 1.2464, + "loss/crossentropy": 2.4787790775299072, + "loss/hidden": 1.078125, + "loss/logits": 0.168003648519516, + "loss/reg": 3.181990177836269e-05, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 2.121812105178833, + "grad_norm_var": 0.1002104558972718, + "learning_rate": 0.0001, + "loss": 1.1216, + "loss/crossentropy": 2.671614646911621, + "loss/hidden": 0.96875, + "loss/logits": 0.15253598988056183, + "loss/reg": 3.180657222401351e-05, + "step": 720 + }, + { + "epoch": 0.090125, + "grad_norm": 1.9008033275604248, + "grad_norm_var": 0.10711709114638453, + "learning_rate": 0.0001, + "loss": 1.1998, + "loss/crossentropy": 2.382974624633789, + "loss/hidden": 1.046875, + "loss/logits": 0.15262249112129211, + "loss/reg": 3.179501072736457e-05, + "step": 721 + }, + { + "epoch": 0.09025, + "grad_norm": 2.5321600437164307, + "grad_norm_var": 0.11192764062330766, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.551715850830078, + "loss/hidden": 0.97265625, + "loss/logits": 0.16764254868030548, + "loss/reg": 3.178184852004051e-05, + "step": 722 + }, + { + "epoch": 0.090375, + "grad_norm": 2.9801716804504395, + "grad_norm_var": 0.14559231156155736, + "learning_rate": 0.0001, + "loss": 1.1411, + "loss/crossentropy": 2.439439296722412, + "loss/hidden": 0.97265625, + "loss/logits": 0.1681419312953949, + "loss/reg": 3.176909376634285e-05, + "step": 723 + }, + { + "epoch": 0.0905, + "grad_norm": 3.572810411453247, + "grad_norm_var": 0.23124631459862688, + "learning_rate": 0.0001, + "loss": 1.7208, + "loss/crossentropy": 2.434739351272583, + "loss/hidden": 1.4375, + "loss/logits": 0.283025860786438, + "loss/reg": 3.17596313834656e-05, + "step": 724 + }, + { + "epoch": 0.090625, + "grad_norm": 2.4641849994659424, + "grad_norm_var": 0.2260145018020476, + "learning_rate": 0.0001, + "loss": 1.3361, + "loss/crossentropy": 2.359750747680664, + "loss/hidden": 1.140625, + "loss/logits": 0.19520103931427002, + "loss/reg": 3.1749059417052194e-05, + "step": 725 + }, + { + "epoch": 0.09075, + "grad_norm": 2.363614320755005, + "grad_norm_var": 0.21727288655602492, + "learning_rate": 0.0001, + "loss": 1.2934, + "loss/crossentropy": 2.556663751602173, + "loss/hidden": 1.1171875, + "loss/logits": 0.1759195625782013, + "loss/reg": 3.1738876714371145e-05, + "step": 726 + }, + { + "epoch": 0.090875, + "grad_norm": 2.7360336780548096, + "grad_norm_var": 0.22331398262713684, + "learning_rate": 0.0001, + "loss": 1.6679, + "loss/crossentropy": 2.204488515853882, + "loss/hidden": 1.3984375, + "loss/logits": 0.26912397146224976, + "loss/reg": 3.172622018610127e-05, + "step": 727 + }, + { + "epoch": 0.091, + "grad_norm": 1.9641075134277344, + "grad_norm_var": 0.2267554251378797, + "learning_rate": 0.0001, + "loss": 1.381, + "loss/crossentropy": 2.422118663787842, + "loss/hidden": 1.1953125, + "loss/logits": 0.18534547090530396, + "loss/reg": 3.1712734198663384e-05, + "step": 728 + }, + { + "epoch": 0.091125, + "grad_norm": 2.564943790435791, + "grad_norm_var": 0.22839041731253118, + "learning_rate": 0.0001, + "loss": 1.2944, + "loss/crossentropy": 2.87058687210083, + "loss/hidden": 1.1171875, + "loss/logits": 0.17691189050674438, + "loss/reg": 3.1703999411547557e-05, + "step": 729 + }, + { + "epoch": 0.09125, + "grad_norm": 1.7801804542541504, + "grad_norm_var": 0.2480876052532271, + "learning_rate": 0.0001, + "loss": 1.147, + "loss/crossentropy": 2.6544079780578613, + "loss/hidden": 1.0, + "loss/logits": 0.14668874442577362, + "loss/reg": 3.1691190088167787e-05, + "step": 730 + }, + { + "epoch": 0.091375, + "grad_norm": 2.378593921661377, + "grad_norm_var": 0.23782880116232022, + "learning_rate": 0.0001, + "loss": 1.3614, + "loss/crossentropy": 2.2079715728759766, + "loss/hidden": 1.1875, + "loss/logits": 0.1736200451850891, + "loss/reg": 3.1681309337727726e-05, + "step": 731 + }, + { + "epoch": 0.0915, + "grad_norm": 2.1592485904693604, + "grad_norm_var": 0.23597114035816222, + "learning_rate": 0.0001, + "loss": 1.1879, + "loss/crossentropy": 2.5136733055114746, + "loss/hidden": 1.0546875, + "loss/logits": 0.1328616440296173, + "loss/reg": 3.1669525924371555e-05, + "step": 732 + }, + { + "epoch": 0.091625, + "grad_norm": 1.9308961629867554, + "grad_norm_var": 0.23483758355516002, + "learning_rate": 0.0001, + "loss": 1.113, + "loss/crossentropy": 2.443802833557129, + "loss/hidden": 0.97265625, + "loss/logits": 0.14004755020141602, + "loss/reg": 3.165780799463391e-05, + "step": 733 + }, + { + "epoch": 0.09175, + "grad_norm": 2.5313358306884766, + "grad_norm_var": 0.23705198036043196, + "learning_rate": 0.0001, + "loss": 1.316, + "loss/crossentropy": 2.374687910079956, + "loss/hidden": 1.140625, + "loss/logits": 0.17510093748569489, + "loss/reg": 3.1643921829527244e-05, + "step": 734 + }, + { + "epoch": 0.091875, + "grad_norm": 2.2036988735198975, + "grad_norm_var": 0.2071495968864624, + "learning_rate": 0.0001, + "loss": 1.1801, + "loss/crossentropy": 2.5990524291992188, + "loss/hidden": 1.0234375, + "loss/logits": 0.15631292760372162, + "loss/reg": 3.162867506034672e-05, + "step": 735 + }, + { + "epoch": 0.092, + "grad_norm": 2.2874338626861572, + "grad_norm_var": 0.2030181085393209, + "learning_rate": 0.0001, + "loss": 1.3517, + "loss/crossentropy": 2.3098976612091064, + "loss/hidden": 1.171875, + "loss/logits": 0.1794787347316742, + "loss/reg": 3.16194818879012e-05, + "step": 736 + }, + { + "epoch": 0.092125, + "grad_norm": 2.161745309829712, + "grad_norm_var": 0.19001384880688754, + "learning_rate": 0.0001, + "loss": 1.2909, + "loss/crossentropy": 2.206538200378418, + "loss/hidden": 1.1328125, + "loss/logits": 0.15781772136688232, + "loss/reg": 3.160776032018475e-05, + "step": 737 + }, + { + "epoch": 0.09225, + "grad_norm": 2.825727939605713, + "grad_norm_var": 0.2000567098307892, + "learning_rate": 0.0001, + "loss": 1.4565, + "loss/crossentropy": 2.395667314529419, + "loss/hidden": 1.2578125, + "loss/logits": 0.19834987819194794, + "loss/reg": 3.159312836942263e-05, + "step": 738 + }, + { + "epoch": 0.092375, + "grad_norm": 2.25457763671875, + "grad_norm_var": 0.1798848071044782, + "learning_rate": 0.0001, + "loss": 1.1574, + "loss/crossentropy": 2.5495216846466064, + "loss/hidden": 0.9921875, + "loss/logits": 0.16488471627235413, + "loss/reg": 3.15783909172751e-05, + "step": 739 + }, + { + "epoch": 0.0925, + "grad_norm": 2.0684523582458496, + "grad_norm_var": 0.083315702432507, + "learning_rate": 0.0001, + "loss": 1.1796, + "loss/crossentropy": 2.3564293384552, + "loss/hidden": 1.046875, + "loss/logits": 0.1323927640914917, + "loss/reg": 3.156912134727463e-05, + "step": 740 + }, + { + "epoch": 0.092625, + "grad_norm": 2.656198501586914, + "grad_norm_var": 0.0900238317620723, + "learning_rate": 0.0001, + "loss": 1.3841, + "loss/crossentropy": 2.487755060195923, + "loss/hidden": 1.1875, + "loss/logits": 0.19630743563175201, + "loss/reg": 3.155725062242709e-05, + "step": 741 + }, + { + "epoch": 0.09275, + "grad_norm": 1.9687042236328125, + "grad_norm_var": 0.09664116038215548, + "learning_rate": 0.0001, + "loss": 1.2491, + "loss/crossentropy": 2.573404550552368, + "loss/hidden": 1.078125, + "loss/logits": 0.17063230276107788, + "loss/reg": 3.1548213883070275e-05, + "step": 742 + }, + { + "epoch": 0.092875, + "grad_norm": 2.102797269821167, + "grad_norm_var": 0.0831564589342415, + "learning_rate": 0.0001, + "loss": 1.1905, + "loss/crossentropy": 2.153092384338379, + "loss/hidden": 1.0390625, + "loss/logits": 0.15110386908054352, + "loss/reg": 3.153769648633897e-05, + "step": 743 + }, + { + "epoch": 0.093, + "grad_norm": 2.355156898498535, + "grad_norm_var": 0.07833334824755027, + "learning_rate": 0.0001, + "loss": 1.3599, + "loss/crossentropy": 2.17565655708313, + "loss/hidden": 1.1640625, + "loss/logits": 0.19551442563533783, + "loss/reg": 3.152925637550652e-05, + "step": 744 + }, + { + "epoch": 0.093125, + "grad_norm": 2.443155527114868, + "grad_norm_var": 0.07437929229497317, + "learning_rate": 0.0001, + "loss": 1.3143, + "loss/crossentropy": 2.252290725708008, + "loss/hidden": 1.15625, + "loss/logits": 0.15772585570812225, + "loss/reg": 3.1521783967036754e-05, + "step": 745 + }, + { + "epoch": 0.09325, + "grad_norm": 2.0343992710113525, + "grad_norm_var": 0.06226497131138201, + "learning_rate": 0.0001, + "loss": 1.0988, + "loss/crossentropy": 2.4974706172943115, + "loss/hidden": 0.96875, + "loss/logits": 0.12977877259254456, + "loss/reg": 3.151370765408501e-05, + "step": 746 + }, + { + "epoch": 0.093375, + "grad_norm": 2.5981836318969727, + "grad_norm_var": 0.0683810999287743, + "learning_rate": 0.0001, + "loss": 1.3081, + "loss/crossentropy": 2.437986373901367, + "loss/hidden": 1.140625, + "loss/logits": 0.16720974445343018, + "loss/reg": 3.150551856379025e-05, + "step": 747 + }, + { + "epoch": 0.0935, + "grad_norm": 2.1747589111328125, + "grad_norm_var": 0.06813326994570724, + "learning_rate": 0.0001, + "loss": 1.0686, + "loss/crossentropy": 2.7409119606018066, + "loss/hidden": 0.9375, + "loss/logits": 0.13082191348075867, + "loss/reg": 3.1495314033236355e-05, + "step": 748 + }, + { + "epoch": 0.093625, + "grad_norm": 2.183622360229492, + "grad_norm_var": 0.06011461073695254, + "learning_rate": 0.0001, + "loss": 1.2254, + "loss/crossentropy": 2.191103458404541, + "loss/hidden": 1.0625, + "loss/logits": 0.1625903844833374, + "loss/reg": 3.1484429200645536e-05, + "step": 749 + }, + { + "epoch": 0.09375, + "grad_norm": 1.729537844657898, + "grad_norm_var": 0.07589706873069174, + "learning_rate": 0.0001, + "loss": 1.1576, + "loss/crossentropy": 2.559917449951172, + "loss/hidden": 1.0, + "loss/logits": 0.15729467570781708, + "loss/reg": 3.1475185096496716e-05, + "step": 750 + }, + { + "epoch": 0.093875, + "grad_norm": 1.9405204057693481, + "grad_norm_var": 0.08195632956667372, + "learning_rate": 0.0001, + "loss": 1.0518, + "loss/crossentropy": 2.624401330947876, + "loss/hidden": 0.90625, + "loss/logits": 0.1452496498823166, + "loss/reg": 3.1468345696339384e-05, + "step": 751 + }, + { + "epoch": 0.094, + "grad_norm": 2.1372480392456055, + "grad_norm_var": 0.0823473431455047, + "learning_rate": 0.0001, + "loss": 1.1548, + "loss/crossentropy": 2.3568766117095947, + "loss/hidden": 1.0078125, + "loss/logits": 0.14671632647514343, + "loss/reg": 3.1459076126338914e-05, + "step": 752 + }, + { + "epoch": 0.094125, + "grad_norm": 1.9586896896362305, + "grad_norm_var": 0.08669574257193607, + "learning_rate": 0.0001, + "loss": 1.2604, + "loss/crossentropy": 2.3817858695983887, + "loss/hidden": 1.0859375, + "loss/logits": 0.1741340607404709, + "loss/reg": 3.145124719594605e-05, + "step": 753 + }, + { + "epoch": 0.09425, + "grad_norm": 5.136489391326904, + "grad_norm_var": 0.6087473488841433, + "learning_rate": 0.0001, + "loss": 2.2121, + "loss/crossentropy": 2.298271656036377, + "loss/hidden": 1.8203125, + "loss/logits": 0.39150530099868774, + "loss/reg": 3.144397123833187e-05, + "step": 754 + }, + { + "epoch": 0.094375, + "grad_norm": 3.463529109954834, + "grad_norm_var": 0.683278061488306, + "learning_rate": 0.0001, + "loss": 1.5672, + "loss/crossentropy": 2.0770769119262695, + "loss/hidden": 1.3515625, + "loss/logits": 0.21530470252037048, + "loss/reg": 3.1436022254638374e-05, + "step": 755 + }, + { + "epoch": 0.0945, + "grad_norm": 2.3703503608703613, + "grad_norm_var": 0.67424132170143, + "learning_rate": 0.0001, + "loss": 1.2125, + "loss/crossentropy": 2.8401291370391846, + "loss/hidden": 1.046875, + "loss/logits": 0.16533055901527405, + "loss/reg": 3.142944842693396e-05, + "step": 756 + }, + { + "epoch": 0.094625, + "grad_norm": 2.6325223445892334, + "grad_norm_var": 0.6736359493160847, + "learning_rate": 0.0001, + "loss": 1.2971, + "loss/crossentropy": 2.7272450923919678, + "loss/hidden": 1.1171875, + "loss/logits": 0.1796242892742157, + "loss/reg": 3.1421946914633736e-05, + "step": 757 + }, + { + "epoch": 0.09475, + "grad_norm": 2.570432424545288, + "grad_norm_var": 0.6575024318759268, + "learning_rate": 0.0001, + "loss": 1.1965, + "loss/crossentropy": 2.4085519313812256, + "loss/hidden": 1.0234375, + "loss/logits": 0.17274212837219238, + "loss/reg": 3.141486013191752e-05, + "step": 758 + }, + { + "epoch": 0.094875, + "grad_norm": 2.4089767932891846, + "grad_norm_var": 0.6475763705088162, + "learning_rate": 0.0001, + "loss": 1.2483, + "loss/crossentropy": 2.472520589828491, + "loss/hidden": 1.0859375, + "loss/logits": 0.1620466709136963, + "loss/reg": 3.1408424547407776e-05, + "step": 759 + }, + { + "epoch": 0.095, + "grad_norm": 1.9809985160827637, + "grad_norm_var": 0.6639808786341005, + "learning_rate": 0.0001, + "loss": 1.1507, + "loss/crossentropy": 2.600600481033325, + "loss/hidden": 1.0, + "loss/logits": 0.1503715068101883, + "loss/reg": 3.1402007152792066e-05, + "step": 760 + }, + { + "epoch": 0.095125, + "grad_norm": 2.333096742630005, + "grad_norm_var": 0.665355115788793, + "learning_rate": 0.0001, + "loss": 1.279, + "loss/crossentropy": 2.6416707038879395, + "loss/hidden": 1.125, + "loss/logits": 0.1537056565284729, + "loss/reg": 3.139731416013092e-05, + "step": 761 + }, + { + "epoch": 0.09525, + "grad_norm": 1.9397908449172974, + "grad_norm_var": 0.6715145427304359, + "learning_rate": 0.0001, + "loss": 1.1725, + "loss/crossentropy": 2.7050859928131104, + "loss/hidden": 1.0234375, + "loss/logits": 0.14873114228248596, + "loss/reg": 3.13945856760256e-05, + "step": 762 + }, + { + "epoch": 0.095375, + "grad_norm": 1.8747289180755615, + "grad_norm_var": 0.6920951391921969, + "learning_rate": 0.0001, + "loss": 1.05, + "loss/crossentropy": 2.4486820697784424, + "loss/hidden": 0.91015625, + "loss/logits": 0.13953766226768494, + "loss/reg": 3.138865577057004e-05, + "step": 763 + }, + { + "epoch": 0.0955, + "grad_norm": 1.8983080387115479, + "grad_norm_var": 0.7061769284476885, + "learning_rate": 0.0001, + "loss": 1.1905, + "loss/crossentropy": 2.2771549224853516, + "loss/hidden": 1.03125, + "loss/logits": 0.1589195728302002, + "loss/reg": 3.138252577628009e-05, + "step": 764 + }, + { + "epoch": 0.095625, + "grad_norm": 2.072046995162964, + "grad_norm_var": 0.7103216736695799, + "learning_rate": 0.0001, + "loss": 1.1747, + "loss/crossentropy": 2.7360568046569824, + "loss/hidden": 1.0234375, + "loss/logits": 0.15097260475158691, + "loss/reg": 3.1379106076201424e-05, + "step": 765 + }, + { + "epoch": 0.09575, + "grad_norm": 2.3827669620513916, + "grad_norm_var": 0.6783382556637773, + "learning_rate": 0.0001, + "loss": 1.277, + "loss/crossentropy": 2.583840847015381, + "loss/hidden": 1.109375, + "loss/logits": 0.16732946038246155, + "loss/reg": 3.137430394417606e-05, + "step": 766 + }, + { + "epoch": 0.095875, + "grad_norm": 2.367859363555908, + "grad_norm_var": 0.6610768710121395, + "learning_rate": 0.0001, + "loss": 1.3393, + "loss/crossentropy": 2.7965312004089355, + "loss/hidden": 1.15625, + "loss/logits": 0.18269799649715424, + "loss/reg": 3.1365445465780795e-05, + "step": 767 + }, + { + "epoch": 0.096, + "grad_norm": 2.1879124641418457, + "grad_norm_var": 0.658986168594122, + "learning_rate": 0.0001, + "loss": 1.3228, + "loss/crossentropy": 2.4425103664398193, + "loss/hidden": 1.15625, + "loss/logits": 0.16622185707092285, + "loss/reg": 3.1357700208900496e-05, + "step": 768 + }, + { + "epoch": 0.096125, + "grad_norm": 2.281101703643799, + "grad_norm_var": 0.6433454947799935, + "learning_rate": 0.0001, + "loss": 1.053, + "loss/crossentropy": 2.447190284729004, + "loss/hidden": 0.92578125, + "loss/logits": 0.12689539790153503, + "loss/reg": 3.135461884085089e-05, + "step": 769 + }, + { + "epoch": 0.09625, + "grad_norm": 2.780257225036621, + "grad_norm_var": 0.1600984168688796, + "learning_rate": 0.0001, + "loss": 1.3624, + "loss/crossentropy": 2.413775682449341, + "loss/hidden": 1.15625, + "loss/logits": 0.20579016208648682, + "loss/reg": 3.13528798869811e-05, + "step": 770 + }, + { + "epoch": 0.096375, + "grad_norm": 2.1634106636047363, + "grad_norm_var": 0.07211399956462869, + "learning_rate": 0.0001, + "loss": 1.2305, + "loss/crossentropy": 2.276298761367798, + "loss/hidden": 1.09375, + "loss/logits": 0.13639463484287262, + "loss/reg": 3.135051156277768e-05, + "step": 771 + }, + { + "epoch": 0.0965, + "grad_norm": 3.03767728805542, + "grad_norm_var": 0.10929521688149742, + "learning_rate": 0.0001, + "loss": 1.2595, + "loss/crossentropy": 2.69195294380188, + "loss/hidden": 1.0703125, + "loss/logits": 0.1888556033372879, + "loss/reg": 3.134880535071716e-05, + "step": 772 + }, + { + "epoch": 0.096625, + "grad_norm": 2.160676956176758, + "grad_norm_var": 0.1027301574876498, + "learning_rate": 0.0001, + "loss": 1.099, + "loss/crossentropy": 2.6800880432128906, + "loss/hidden": 0.96484375, + "loss/logits": 0.13380314409732819, + "loss/reg": 3.13417476718314e-05, + "step": 773 + }, + { + "epoch": 0.09675, + "grad_norm": 1.9055464267730713, + "grad_norm_var": 0.10439108753585717, + "learning_rate": 0.0001, + "loss": 1.1548, + "loss/crossentropy": 2.693740129470825, + "loss/hidden": 1.0078125, + "loss/logits": 0.14665429294109344, + "loss/reg": 3.1334358936874196e-05, + "step": 774 + }, + { + "epoch": 0.096875, + "grad_norm": 2.2329320907592773, + "grad_norm_var": 0.102266613042209, + "learning_rate": 0.0001, + "loss": 1.2392, + "loss/crossentropy": 2.3540096282958984, + "loss/hidden": 1.078125, + "loss/logits": 0.16073733568191528, + "loss/reg": 3.132629717583768e-05, + "step": 775 + }, + { + "epoch": 0.097, + "grad_norm": 2.4246928691864014, + "grad_norm_var": 0.10013899770161926, + "learning_rate": 0.0001, + "loss": 1.2505, + "loss/crossentropy": 1.9924331903457642, + "loss/hidden": 1.109375, + "loss/logits": 0.14083942770957947, + "loss/reg": 3.1318559194915e-05, + "step": 776 + }, + { + "epoch": 0.097125, + "grad_norm": 2.389742136001587, + "grad_norm_var": 0.10094694170040738, + "learning_rate": 0.0001, + "loss": 1.324, + "loss/crossentropy": 2.7448296546936035, + "loss/hidden": 1.140625, + "loss/logits": 0.18304391205310822, + "loss/reg": 3.130955883534625e-05, + "step": 777 + }, + { + "epoch": 0.09725, + "grad_norm": 2.5808043479919434, + "grad_norm_var": 0.09958374019438997, + "learning_rate": 0.0001, + "loss": 1.265, + "loss/crossentropy": 1.8842538595199585, + "loss/hidden": 1.109375, + "loss/logits": 0.1553521603345871, + "loss/reg": 3.1302373827202246e-05, + "step": 778 + }, + { + "epoch": 0.097375, + "grad_norm": 2.4130847454071045, + "grad_norm_var": 0.08743873306624413, + "learning_rate": 0.0001, + "loss": 1.3718, + "loss/crossentropy": 2.4912071228027344, + "loss/hidden": 1.1875, + "loss/logits": 0.18399140238761902, + "loss/reg": 3.129445394733921e-05, + "step": 779 + }, + { + "epoch": 0.0975, + "grad_norm": 2.1240077018737793, + "grad_norm_var": 0.07763369234828493, + "learning_rate": 0.0001, + "loss": 1.175, + "loss/crossentropy": 2.626298666000366, + "loss/hidden": 1.015625, + "loss/logits": 0.15901657938957214, + "loss/reg": 3.128518073935993e-05, + "step": 780 + }, + { + "epoch": 0.097625, + "grad_norm": 2.241236686706543, + "grad_norm_var": 0.07328714526078836, + "learning_rate": 0.0001, + "loss": 1.3103, + "loss/crossentropy": 2.8781237602233887, + "loss/hidden": 1.140625, + "loss/logits": 0.1693672388792038, + "loss/reg": 3.127881063846871e-05, + "step": 781 + }, + { + "epoch": 0.09775, + "grad_norm": 2.1519434452056885, + "grad_norm_var": 0.07575044500278688, + "learning_rate": 0.0001, + "loss": 1.2974, + "loss/crossentropy": 2.2590219974517822, + "loss/hidden": 1.125, + "loss/logits": 0.1720578521490097, + "loss/reg": 3.1273764761863276e-05, + "step": 782 + }, + { + "epoch": 0.097875, + "grad_norm": 4.857376575469971, + "grad_norm_var": 0.472294081867043, + "learning_rate": 0.0001, + "loss": 1.9927, + "loss/crossentropy": 2.8379626274108887, + "loss/hidden": 1.5625, + "loss/logits": 0.4299107789993286, + "loss/reg": 3.126606316072866e-05, + "step": 783 + }, + { + "epoch": 0.098, + "grad_norm": 3.427262783050537, + "grad_norm_var": 0.5174201023944398, + "learning_rate": 0.0001, + "loss": 1.3025, + "loss/crossentropy": 3.541722059249878, + "loss/hidden": 1.140625, + "loss/logits": 0.16157390177249908, + "loss/reg": 3.125540752080269e-05, + "step": 784 + }, + { + "epoch": 0.098125, + "grad_norm": 2.264031171798706, + "grad_norm_var": 0.5181032302799584, + "learning_rate": 0.0001, + "loss": 1.1002, + "loss/crossentropy": 2.884652614593506, + "loss/hidden": 0.95703125, + "loss/logits": 0.14289110898971558, + "loss/reg": 3.124582508462481e-05, + "step": 785 + }, + { + "epoch": 0.09825, + "grad_norm": 1.806728482246399, + "grad_norm_var": 0.5503273000636368, + "learning_rate": 0.0001, + "loss": 1.1306, + "loss/crossentropy": 2.4572439193725586, + "loss/hidden": 0.97265625, + "loss/logits": 0.1576440930366516, + "loss/reg": 3.1235387723427266e-05, + "step": 786 + }, + { + "epoch": 0.098375, + "grad_norm": 2.684609889984131, + "grad_norm_var": 0.5431278467835586, + "learning_rate": 0.0001, + "loss": 1.5656, + "loss/crossentropy": 2.4877264499664307, + "loss/hidden": 1.3203125, + "loss/logits": 0.2449798732995987, + "loss/reg": 3.122756606899202e-05, + "step": 787 + }, + { + "epoch": 0.0985, + "grad_norm": 2.632183313369751, + "grad_norm_var": 0.5267077798480964, + "learning_rate": 0.0001, + "loss": 1.1392, + "loss/crossentropy": 2.2014877796173096, + "loss/hidden": 1.0, + "loss/logits": 0.13884884119033813, + "loss/reg": 3.121886038570665e-05, + "step": 788 + }, + { + "epoch": 0.098625, + "grad_norm": 1.87946355342865, + "grad_norm_var": 0.54506897354085, + "learning_rate": 0.0001, + "loss": 1.2559, + "loss/crossentropy": 2.4326982498168945, + "loss/hidden": 1.09375, + "loss/logits": 0.16185970604419708, + "loss/reg": 3.121058762189932e-05, + "step": 789 + }, + { + "epoch": 0.09875, + "grad_norm": 4.394942283630371, + "grad_norm_var": 0.7347519248832649, + "learning_rate": 0.0001, + "loss": 1.7373, + "loss/crossentropy": 2.4330739974975586, + "loss/hidden": 1.515625, + "loss/logits": 0.22132834792137146, + "loss/reg": 3.1201776437228546e-05, + "step": 790 + }, + { + "epoch": 0.098875, + "grad_norm": 1.9128646850585938, + "grad_norm_var": 0.7592334384300727, + "learning_rate": 0.0001, + "loss": 1.2412, + "loss/crossentropy": 2.433431625366211, + "loss/hidden": 1.0859375, + "loss/logits": 0.15490993857383728, + "loss/reg": 3.119331449852325e-05, + "step": 791 + }, + { + "epoch": 0.099, + "grad_norm": 2.9175164699554443, + "grad_norm_var": 0.7604913223839332, + "learning_rate": 0.0001, + "loss": 1.5882, + "loss/crossentropy": 2.2526400089263916, + "loss/hidden": 1.3828125, + "loss/logits": 0.20504862070083618, + "loss/reg": 3.118627500953153e-05, + "step": 792 + }, + { + "epoch": 0.099125, + "grad_norm": 2.331383228302002, + "grad_norm_var": 0.7628643978346566, + "learning_rate": 0.0001, + "loss": 1.4146, + "loss/crossentropy": 2.7012693881988525, + "loss/hidden": 1.2109375, + "loss/logits": 0.2033015489578247, + "loss/reg": 3.117845699307509e-05, + "step": 793 + }, + { + "epoch": 0.09925, + "grad_norm": 2.187777042388916, + "grad_norm_var": 0.7768636197061916, + "learning_rate": 0.0001, + "loss": 1.234, + "loss/crossentropy": 2.525279998779297, + "loss/hidden": 1.0859375, + "loss/logits": 0.14775747060775757, + "loss/reg": 3.116917287115939e-05, + "step": 794 + }, + { + "epoch": 0.099375, + "grad_norm": 2.1538753509521484, + "grad_norm_var": 0.7888760885047162, + "learning_rate": 0.0001, + "loss": 1.2094, + "loss/crossentropy": 2.5589966773986816, + "loss/hidden": 1.0390625, + "loss/logits": 0.17005379498004913, + "loss/reg": 3.11611256620381e-05, + "step": 795 + }, + { + "epoch": 0.0995, + "grad_norm": 5.747286796569824, + "grad_norm_var": 1.3683445106961756, + "learning_rate": 0.0001, + "loss": 1.8437, + "loss/crossentropy": 2.058424234390259, + "loss/hidden": 1.53125, + "loss/logits": 0.3121880888938904, + "loss/reg": 3.115229628747329e-05, + "step": 796 + }, + { + "epoch": 0.099625, + "grad_norm": 2.5717875957489014, + "grad_norm_var": 1.3483694213120883, + "learning_rate": 0.0001, + "loss": 1.7157, + "loss/crossentropy": 2.185353994369507, + "loss/hidden": 1.4765625, + "loss/logits": 0.23884126543998718, + "loss/reg": 3.114379069302231e-05, + "step": 797 + }, + { + "epoch": 0.09975, + "grad_norm": 2.291684150695801, + "grad_norm_var": 1.3362097880401214, + "learning_rate": 0.0001, + "loss": 1.1224, + "loss/crossentropy": 2.6714465618133545, + "loss/hidden": 0.9765625, + "loss/logits": 0.14549368619918823, + "loss/reg": 3.1131978175835684e-05, + "step": 798 + }, + { + "epoch": 0.099875, + "grad_norm": 2.000739097595215, + "grad_norm_var": 1.0926226260566427, + "learning_rate": 0.0001, + "loss": 1.2438, + "loss/crossentropy": 2.5698318481445312, + "loss/hidden": 1.0703125, + "loss/logits": 0.17316791415214539, + "loss/reg": 3.1121257052291185e-05, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 3.3989579677581787, + "grad_norm_var": 1.0899290024325692, + "learning_rate": 0.0001, + "loss": 1.5563, + "loss/crossentropy": 2.8038387298583984, + "loss/hidden": 1.328125, + "loss/logits": 0.2278510481119156, + "loss/reg": 3.1109084375202656e-05, + "step": 800 + }, + { + "epoch": 0.100125, + "grad_norm": 2.534914255142212, + "grad_norm_var": 1.0788234524427875, + "learning_rate": 0.0001, + "loss": 1.5327, + "loss/crossentropy": 2.1927835941314697, + "loss/hidden": 1.3046875, + "loss/logits": 0.2276715636253357, + "loss/reg": 3.110256511718035e-05, + "step": 801 + }, + { + "epoch": 0.10025, + "grad_norm": 2.1616692543029785, + "grad_norm_var": 1.0436931816711053, + "learning_rate": 0.0001, + "loss": 1.1964, + "loss/crossentropy": 3.0142710208892822, + "loss/hidden": 1.03125, + "loss/logits": 0.1648503690958023, + "loss/reg": 3.1094125006347895e-05, + "step": 802 + }, + { + "epoch": 0.100375, + "grad_norm": 2.1722917556762695, + "grad_norm_var": 1.0637174890335515, + "learning_rate": 0.0001, + "loss": 1.3883, + "loss/crossentropy": 2.711493730545044, + "loss/hidden": 1.1953125, + "loss/logits": 0.19265002012252808, + "loss/reg": 3.108750388491899e-05, + "step": 803 + }, + { + "epoch": 0.1005, + "grad_norm": 1.8611395359039307, + "grad_norm_var": 1.1084202434727317, + "learning_rate": 0.0001, + "loss": 1.168, + "loss/crossentropy": 2.396332263946533, + "loss/hidden": 1.0234375, + "loss/logits": 0.1442836970090866, + "loss/reg": 3.108082091785036e-05, + "step": 804 + }, + { + "epoch": 0.100625, + "grad_norm": 2.0508856773376465, + "grad_norm_var": 1.0924762571014581, + "learning_rate": 0.0001, + "loss": 1.2781, + "loss/crossentropy": 2.658599376678467, + "loss/hidden": 1.109375, + "loss/logits": 0.16839157044887543, + "loss/reg": 3.107260272372514e-05, + "step": 805 + }, + { + "epoch": 0.10075, + "grad_norm": 2.4619462490081787, + "grad_norm_var": 0.8809438114007198, + "learning_rate": 0.0001, + "loss": 1.1897, + "loss/crossentropy": 2.5231716632843018, + "loss/hidden": 1.046875, + "loss/logits": 0.14253319799900055, + "loss/reg": 3.106672738795169e-05, + "step": 806 + }, + { + "epoch": 0.100875, + "grad_norm": 2.3893401622772217, + "grad_norm_var": 0.8548277216729955, + "learning_rate": 0.0001, + "loss": 1.3323, + "loss/crossentropy": 2.2468364238739014, + "loss/hidden": 1.15625, + "loss/logits": 0.17571108043193817, + "loss/reg": 3.105968062300235e-05, + "step": 807 + }, + { + "epoch": 0.101, + "grad_norm": 1.9697636365890503, + "grad_norm_var": 0.8679468111481312, + "learning_rate": 0.0001, + "loss": 1.1459, + "loss/crossentropy": 2.113001585006714, + "loss/hidden": 1.0078125, + "loss/logits": 0.13779830932617188, + "loss/reg": 3.1051207770360634e-05, + "step": 808 + }, + { + "epoch": 0.101125, + "grad_norm": 2.3979098796844482, + "grad_norm_var": 0.8665695097636771, + "learning_rate": 0.0001, + "loss": 1.2127, + "loss/crossentropy": 2.347339391708374, + "loss/hidden": 1.0546875, + "loss/logits": 0.1576911211013794, + "loss/reg": 3.103955532424152e-05, + "step": 809 + }, + { + "epoch": 0.10125, + "grad_norm": 4.104284763336182, + "grad_norm_var": 1.0107271790963963, + "learning_rate": 0.0001, + "loss": 1.2255, + "loss/crossentropy": 2.645543098449707, + "loss/hidden": 1.0546875, + "loss/logits": 0.17050105333328247, + "loss/reg": 3.102962000411935e-05, + "step": 810 + }, + { + "epoch": 0.101375, + "grad_norm": 2.593677520751953, + "grad_norm_var": 0.9942054452960448, + "learning_rate": 0.0001, + "loss": 1.1147, + "loss/crossentropy": 2.655158281326294, + "loss/hidden": 0.96875, + "loss/logits": 0.14562083780765533, + "loss/reg": 3.101498805335723e-05, + "step": 811 + }, + { + "epoch": 0.1015, + "grad_norm": 2.747690439224243, + "grad_norm_var": 0.32551198430473954, + "learning_rate": 0.0001, + "loss": 1.2766, + "loss/crossentropy": 2.234384298324585, + "loss/hidden": 1.1015625, + "loss/logits": 0.1747133731842041, + "loss/reg": 3.100006506429054e-05, + "step": 812 + }, + { + "epoch": 0.101625, + "grad_norm": 2.240682601928711, + "grad_norm_var": 0.3283908535525057, + "learning_rate": 0.0001, + "loss": 1.3393, + "loss/crossentropy": 2.416161298751831, + "loss/hidden": 1.1796875, + "loss/logits": 0.15925709903240204, + "loss/reg": 3.099101013503969e-05, + "step": 813 + }, + { + "epoch": 0.10175, + "grad_norm": 2.198143482208252, + "grad_norm_var": 0.33105067119688786, + "learning_rate": 0.0001, + "loss": 1.1581, + "loss/crossentropy": 2.5200369358062744, + "loss/hidden": 0.99609375, + "loss/logits": 0.16167645156383514, + "loss/reg": 3.0979368602856994e-05, + "step": 814 + }, + { + "epoch": 0.101875, + "grad_norm": 2.7281014919281006, + "grad_norm_var": 0.3200372361620191, + "learning_rate": 0.0001, + "loss": 1.1707, + "loss/crossentropy": 2.55796217918396, + "loss/hidden": 1.0234375, + "loss/logits": 0.14694485068321228, + "loss/reg": 3.096580257988535e-05, + "step": 815 + }, + { + "epoch": 0.102, + "grad_norm": 2.093496322631836, + "grad_norm_var": 0.2702016025984974, + "learning_rate": 0.0001, + "loss": 1.2431, + "loss/crossentropy": 2.6173605918884277, + "loss/hidden": 1.0859375, + "loss/logits": 0.15686647593975067, + "loss/reg": 3.095622741966508e-05, + "step": 816 + }, + { + "epoch": 0.102125, + "grad_norm": 2.214099407196045, + "grad_norm_var": 0.2716811480241621, + "learning_rate": 0.0001, + "loss": 1.1334, + "loss/crossentropy": 2.370023727416992, + "loss/hidden": 0.9765625, + "loss/logits": 0.15652622282505035, + "loss/reg": 3.094406929449178e-05, + "step": 817 + }, + { + "epoch": 0.10225, + "grad_norm": 2.3707094192504883, + "grad_norm_var": 0.267795417331483, + "learning_rate": 0.0001, + "loss": 1.2637, + "loss/crossentropy": 2.85809588432312, + "loss/hidden": 1.1015625, + "loss/logits": 0.16179285943508148, + "loss/reg": 3.0934257665649056e-05, + "step": 818 + }, + { + "epoch": 0.102375, + "grad_norm": 2.4590866565704346, + "grad_norm_var": 0.2637646763277468, + "learning_rate": 0.0001, + "loss": 1.3356, + "loss/crossentropy": 2.219639301300049, + "loss/hidden": 1.1484375, + "loss/logits": 0.1868869662284851, + "loss/reg": 3.09246352117043e-05, + "step": 819 + }, + { + "epoch": 0.1025, + "grad_norm": 2.2254512310028076, + "grad_norm_var": 0.24442462240150267, + "learning_rate": 0.0001, + "loss": 1.2099, + "loss/crossentropy": 2.446502447128296, + "loss/hidden": 1.0390625, + "loss/logits": 0.17052100598812103, + "loss/reg": 3.091415055678226e-05, + "step": 820 + }, + { + "epoch": 0.102625, + "grad_norm": 1.9032593965530396, + "grad_norm_var": 0.2536983764450135, + "learning_rate": 0.0001, + "loss": 1.0735, + "loss/crossentropy": 2.448707342147827, + "loss/hidden": 0.94140625, + "loss/logits": 0.13180768489837646, + "loss/reg": 3.090177779085934e-05, + "step": 821 + }, + { + "epoch": 0.10275, + "grad_norm": 1.844329595565796, + "grad_norm_var": 0.27602844848279134, + "learning_rate": 0.0001, + "loss": 1.1176, + "loss/crossentropy": 2.5376136302948, + "loss/hidden": 0.96875, + "loss/logits": 0.14854903519153595, + "loss/reg": 3.089279562118463e-05, + "step": 822 + }, + { + "epoch": 0.102875, + "grad_norm": 1.6877379417419434, + "grad_norm_var": 0.308258885532959, + "learning_rate": 0.0001, + "loss": 1.1547, + "loss/crossentropy": 2.5561609268188477, + "loss/hidden": 1.0, + "loss/logits": 0.1543552428483963, + "loss/reg": 3.0882885766914114e-05, + "step": 823 + }, + { + "epoch": 0.103, + "grad_norm": 7.1836419105529785, + "grad_norm_var": 1.7352053204418108, + "learning_rate": 0.0001, + "loss": 1.3146, + "loss/crossentropy": 2.6273272037506104, + "loss/hidden": 1.0234375, + "loss/logits": 0.2908306121826172, + "loss/reg": 3.0875191441737115e-05, + "step": 824 + }, + { + "epoch": 0.103125, + "grad_norm": 2.999370574951172, + "grad_norm_var": 1.734629979326649, + "learning_rate": 0.0001, + "loss": 1.5094, + "loss/crossentropy": 2.6536483764648438, + "loss/hidden": 1.3203125, + "loss/logits": 0.18879318237304688, + "loss/reg": 3.0866562156006694e-05, + "step": 825 + }, + { + "epoch": 0.10325, + "grad_norm": 3.1951546669006348, + "grad_norm_var": 1.619046832548184, + "learning_rate": 0.0001, + "loss": 1.5738, + "loss/crossentropy": 2.104147434234619, + "loss/hidden": 1.3515625, + "loss/logits": 0.22189953923225403, + "loss/reg": 3.0859606340527534e-05, + "step": 826 + }, + { + "epoch": 0.103375, + "grad_norm": 1.9920458793640137, + "grad_norm_var": 1.6476144569097508, + "learning_rate": 0.0001, + "loss": 1.0835, + "loss/crossentropy": 2.3575265407562256, + "loss/hidden": 0.953125, + "loss/logits": 0.1301153153181076, + "loss/reg": 3.0850660550640896e-05, + "step": 827 + }, + { + "epoch": 0.1035, + "grad_norm": 2.1523900032043457, + "grad_norm_var": 1.6604367682342531, + "learning_rate": 0.0001, + "loss": 1.3061, + "loss/crossentropy": 2.1625425815582275, + "loss/hidden": 1.140625, + "loss/logits": 0.16514690220355988, + "loss/reg": 3.084292620769702e-05, + "step": 828 + }, + { + "epoch": 0.103625, + "grad_norm": 2.931431531906128, + "grad_norm_var": 1.6578109899282356, + "learning_rate": 0.0001, + "loss": 1.1044, + "loss/crossentropy": 2.6592483520507812, + "loss/hidden": 0.9609375, + "loss/logits": 0.14313727617263794, + "loss/reg": 3.083515548496507e-05, + "step": 829 + }, + { + "epoch": 0.10375, + "grad_norm": 2.6759235858917236, + "grad_norm_var": 1.6441751337506705, + "learning_rate": 0.0001, + "loss": 1.0772, + "loss/crossentropy": 2.441382884979248, + "loss/hidden": 0.953125, + "loss/logits": 0.12375655770301819, + "loss/reg": 3.082441253354773e-05, + "step": 830 + }, + { + "epoch": 0.103875, + "grad_norm": 2.1944730281829834, + "grad_norm_var": 1.6575550635786949, + "learning_rate": 0.0001, + "loss": 1.2211, + "loss/crossentropy": 2.1641762256622314, + "loss/hidden": 1.078125, + "loss/logits": 0.14268890023231506, + "loss/reg": 3.08187554765027e-05, + "step": 831 + }, + { + "epoch": 0.104, + "grad_norm": 2.6154282093048096, + "grad_norm_var": 1.637059795107976, + "learning_rate": 0.0001, + "loss": 1.372, + "loss/crossentropy": 2.3172926902770996, + "loss/hidden": 1.1875, + "loss/logits": 0.18420815467834473, + "loss/reg": 3.0813283956376836e-05, + "step": 832 + }, + { + "epoch": 0.104125, + "grad_norm": 2.7839176654815674, + "grad_norm_var": 1.6230740542825255, + "learning_rate": 0.0001, + "loss": 1.266, + "loss/crossentropy": 2.526543378829956, + "loss/hidden": 1.09375, + "loss/logits": 0.17195840179920197, + "loss/reg": 3.080438546021469e-05, + "step": 833 + }, + { + "epoch": 0.10425, + "grad_norm": 2.2361974716186523, + "grad_norm_var": 1.630126784940075, + "learning_rate": 0.0001, + "loss": 1.1865, + "loss/crossentropy": 2.38508677482605, + "loss/hidden": 1.046875, + "loss/logits": 0.13929709792137146, + "loss/reg": 3.079506132053211e-05, + "step": 834 + }, + { + "epoch": 0.104375, + "grad_norm": 2.0524253845214844, + "grad_norm_var": 1.6531180996917048, + "learning_rate": 0.0001, + "loss": 1.3726, + "loss/crossentropy": 2.530698776245117, + "loss/hidden": 1.1640625, + "loss/logits": 0.20820161700248718, + "loss/reg": 3.078530426137149e-05, + "step": 835 + }, + { + "epoch": 0.1045, + "grad_norm": 2.179396629333496, + "grad_norm_var": 1.6559624963262625, + "learning_rate": 0.0001, + "loss": 1.3062, + "loss/crossentropy": 2.200507402420044, + "loss/hidden": 1.1171875, + "loss/logits": 0.18874022364616394, + "loss/reg": 3.07746377075091e-05, + "step": 836 + }, + { + "epoch": 0.104625, + "grad_norm": 1.8860087394714355, + "grad_norm_var": 1.657731314453099, + "learning_rate": 0.0001, + "loss": 1.052, + "loss/crossentropy": 3.1556520462036133, + "loss/hidden": 0.92578125, + "loss/logits": 0.12589877843856812, + "loss/reg": 3.0765488190809265e-05, + "step": 837 + }, + { + "epoch": 0.10475, + "grad_norm": 1.7952905893325806, + "grad_norm_var": 1.663235285712947, + "learning_rate": 0.0001, + "loss": 1.205, + "loss/crossentropy": 2.6172409057617188, + "loss/hidden": 1.046875, + "loss/logits": 0.15777094662189484, + "loss/reg": 3.075488348258659e-05, + "step": 838 + }, + { + "epoch": 0.104875, + "grad_norm": 1.9205029010772705, + "grad_norm_var": 1.63644541696118, + "learning_rate": 0.0001, + "loss": 1.2576, + "loss/crossentropy": 2.506983518600464, + "loss/hidden": 1.09375, + "loss/logits": 0.16354331374168396, + "loss/reg": 3.074315463891253e-05, + "step": 839 + }, + { + "epoch": 0.105, + "grad_norm": 2.0928657054901123, + "grad_norm_var": 0.19559241083775677, + "learning_rate": 0.0001, + "loss": 1.1347, + "loss/crossentropy": 2.6664047241210938, + "loss/hidden": 0.97265625, + "loss/logits": 0.16169464588165283, + "loss/reg": 3.073291736654937e-05, + "step": 840 + }, + { + "epoch": 0.105125, + "grad_norm": 2.0819296836853027, + "grad_norm_var": 0.1695500869973637, + "learning_rate": 0.0001, + "loss": 1.1457, + "loss/crossentropy": 2.5542869567871094, + "loss/hidden": 0.9921875, + "loss/logits": 0.15322107076644897, + "loss/reg": 3.072019899263978e-05, + "step": 841 + }, + { + "epoch": 0.10525, + "grad_norm": 2.0458364486694336, + "grad_norm_var": 0.11479267511667969, + "learning_rate": 0.0001, + "loss": 1.1969, + "loss/crossentropy": 2.4433228969573975, + "loss/hidden": 1.0234375, + "loss/logits": 0.17314405739307404, + "loss/reg": 3.071278115385212e-05, + "step": 842 + }, + { + "epoch": 0.105375, + "grad_norm": 2.0129029750823975, + "grad_norm_var": 0.11416576275897222, + "learning_rate": 0.0001, + "loss": 1.1662, + "loss/crossentropy": 2.686662435531616, + "loss/hidden": 1.015625, + "loss/logits": 0.15030167996883392, + "loss/reg": 3.0703693482792005e-05, + "step": 843 + }, + { + "epoch": 0.1055, + "grad_norm": 2.0519752502441406, + "grad_norm_var": 0.1158157371009238, + "learning_rate": 0.0001, + "loss": 1.0377, + "loss/crossentropy": 2.2896480560302734, + "loss/hidden": 0.91015625, + "loss/logits": 0.12721426784992218, + "loss/reg": 3.0695973691763356e-05, + "step": 844 + }, + { + "epoch": 0.105625, + "grad_norm": 2.1257541179656982, + "grad_norm_var": 0.08020601663278004, + "learning_rate": 0.0001, + "loss": 1.2512, + "loss/crossentropy": 2.3321328163146973, + "loss/hidden": 1.0859375, + "loss/logits": 0.16499708592891693, + "loss/reg": 3.06832225760445e-05, + "step": 845 + }, + { + "epoch": 0.10575, + "grad_norm": 2.699514865875244, + "grad_norm_var": 0.08182612489990308, + "learning_rate": 0.0001, + "loss": 1.2479, + "loss/crossentropy": 2.5453882217407227, + "loss/hidden": 1.0625, + "loss/logits": 0.18509814143180847, + "loss/reg": 3.067553188884631e-05, + "step": 846 + }, + { + "epoch": 0.105875, + "grad_norm": 2.1057229042053223, + "grad_norm_var": 0.08206906146053014, + "learning_rate": 0.0001, + "loss": 1.2905, + "loss/crossentropy": 2.517914295196533, + "loss/hidden": 1.109375, + "loss/logits": 0.18079468607902527, + "loss/reg": 3.0666917155031115e-05, + "step": 847 + }, + { + "epoch": 0.106, + "grad_norm": 2.237938404083252, + "grad_norm_var": 0.06844794497861711, + "learning_rate": 0.0001, + "loss": 1.3847, + "loss/crossentropy": 2.418829917907715, + "loss/hidden": 1.1875, + "loss/logits": 0.19684617221355438, + "loss/reg": 3.065919372602366e-05, + "step": 848 + }, + { + "epoch": 0.106125, + "grad_norm": 1.911841630935669, + "grad_norm_var": 0.0416030271498783, + "learning_rate": 0.0001, + "loss": 1.1004, + "loss/crossentropy": 2.5736911296844482, + "loss/hidden": 0.9609375, + "loss/logits": 0.1391741931438446, + "loss/reg": 3.065243436140008e-05, + "step": 849 + }, + { + "epoch": 0.10625, + "grad_norm": 2.6131272315979004, + "grad_norm_var": 0.057842508872359075, + "learning_rate": 0.0001, + "loss": 1.4266, + "loss/crossentropy": 2.7167627811431885, + "loss/hidden": 1.1953125, + "loss/logits": 0.23097163438796997, + "loss/reg": 3.064147676923312e-05, + "step": 850 + }, + { + "epoch": 0.106375, + "grad_norm": 3.0576887130737305, + "grad_norm_var": 0.11284086479193854, + "learning_rate": 0.0001, + "loss": 1.6205, + "loss/crossentropy": 3.076043128967285, + "loss/hidden": 1.390625, + "loss/logits": 0.2295425832271576, + "loss/reg": 3.063471012865193e-05, + "step": 851 + }, + { + "epoch": 0.1065, + "grad_norm": 2.0406875610351562, + "grad_norm_var": 0.1139832134184904, + "learning_rate": 0.0001, + "loss": 1.1822, + "loss/crossentropy": 2.5482842922210693, + "loss/hidden": 1.0078125, + "loss/logits": 0.17407500743865967, + "loss/reg": 3.0626764782937244e-05, + "step": 852 + }, + { + "epoch": 0.106625, + "grad_norm": 3.4559848308563232, + "grad_norm_var": 0.2091155587506681, + "learning_rate": 0.0001, + "loss": 1.629, + "loss/crossentropy": 3.0823347568511963, + "loss/hidden": 1.3359375, + "loss/logits": 0.29273706674575806, + "loss/reg": 3.0620882171206176e-05, + "step": 853 + }, + { + "epoch": 0.10675, + "grad_norm": 2.512244939804077, + "grad_norm_var": 0.1962835291714666, + "learning_rate": 0.0001, + "loss": 1.1673, + "loss/crossentropy": 2.5817465782165527, + "loss/hidden": 1.015625, + "loss/logits": 0.15131962299346924, + "loss/reg": 3.061717143282294e-05, + "step": 854 + }, + { + "epoch": 0.106875, + "grad_norm": 2.2254841327667236, + "grad_norm_var": 0.18624173617588796, + "learning_rate": 0.0001, + "loss": 1.3592, + "loss/crossentropy": 2.4730138778686523, + "loss/hidden": 1.171875, + "loss/logits": 0.18697890639305115, + "loss/reg": 3.061169627471827e-05, + "step": 855 + }, + { + "epoch": 0.107, + "grad_norm": 2.1242430210113525, + "grad_norm_var": 0.18531340737878446, + "learning_rate": 0.0001, + "loss": 1.3297, + "loss/crossentropy": 2.643832206726074, + "loss/hidden": 1.1640625, + "loss/logits": 0.1653291881084442, + "loss/reg": 3.060722156078555e-05, + "step": 856 + }, + { + "epoch": 0.107125, + "grad_norm": 1.8313719034194946, + "grad_norm_var": 0.19757233331361063, + "learning_rate": 0.0001, + "loss": 1.1884, + "loss/crossentropy": 2.4841344356536865, + "loss/hidden": 1.046875, + "loss/logits": 0.1411784291267395, + "loss/reg": 3.060205199290067e-05, + "step": 857 + }, + { + "epoch": 0.10725, + "grad_norm": 2.642951250076294, + "grad_norm_var": 0.19836562649402692, + "learning_rate": 0.0001, + "loss": 1.1128, + "loss/crossentropy": 2.895691394805908, + "loss/hidden": 0.9609375, + "loss/logits": 0.15156272053718567, + "loss/reg": 3.0597617296734825e-05, + "step": 858 + }, + { + "epoch": 0.107375, + "grad_norm": 1.8541446924209595, + "grad_norm_var": 0.2071418812691462, + "learning_rate": 0.0001, + "loss": 1.2063, + "loss/crossentropy": 2.4588773250579834, + "loss/hidden": 1.046875, + "loss/logits": 0.15909163653850555, + "loss/reg": 3.059022856177762e-05, + "step": 859 + }, + { + "epoch": 0.1075, + "grad_norm": 2.2945339679718018, + "grad_norm_var": 0.20140156536063855, + "learning_rate": 0.0001, + "loss": 1.3389, + "loss/crossentropy": 2.463376760482788, + "loss/hidden": 1.140625, + "loss/logits": 0.19798508286476135, + "loss/reg": 3.0582417821278796e-05, + "step": 860 + }, + { + "epoch": 0.107625, + "grad_norm": 2.0763418674468994, + "grad_norm_var": 0.2030864243441099, + "learning_rate": 0.0001, + "loss": 1.2307, + "loss/crossentropy": 2.511608600616455, + "loss/hidden": 1.078125, + "loss/logits": 0.1522822380065918, + "loss/reg": 3.057560752495192e-05, + "step": 861 + }, + { + "epoch": 0.10775, + "grad_norm": 2.028618812561035, + "grad_norm_var": 0.2004213147208908, + "learning_rate": 0.0001, + "loss": 1.0788, + "loss/crossentropy": 2.5496108531951904, + "loss/hidden": 0.9453125, + "loss/logits": 0.13319332897663116, + "loss/reg": 3.056845525861718e-05, + "step": 862 + }, + { + "epoch": 0.107875, + "grad_norm": 2.4464786052703857, + "grad_norm_var": 0.19824703313002487, + "learning_rate": 0.0001, + "loss": 1.2798, + "loss/crossentropy": 2.569153070449829, + "loss/hidden": 1.1171875, + "loss/logits": 0.1623011976480484, + "loss/reg": 3.05598478007596e-05, + "step": 863 + }, + { + "epoch": 0.108, + "grad_norm": 1.840403437614441, + "grad_norm_var": 0.2132479466723514, + "learning_rate": 0.0001, + "loss": 1.1379, + "loss/crossentropy": 2.0996623039245605, + "loss/hidden": 0.9921875, + "loss/logits": 0.1454375684261322, + "loss/reg": 3.055387787753716e-05, + "step": 864 + }, + { + "epoch": 0.108125, + "grad_norm": 2.1220955848693848, + "grad_norm_var": 0.20485570241751858, + "learning_rate": 0.0001, + "loss": 1.172, + "loss/crossentropy": 2.507676124572754, + "loss/hidden": 1.046875, + "loss/logits": 0.12477228045463562, + "loss/reg": 3.054780245292932e-05, + "step": 865 + }, + { + "epoch": 0.10825, + "grad_norm": 2.0436394214630127, + "grad_norm_var": 0.20308802849589866, + "learning_rate": 0.0001, + "loss": 1.2824, + "loss/crossentropy": 2.6431305408477783, + "loss/hidden": 1.109375, + "loss/logits": 0.17272983491420746, + "loss/reg": 3.054209446418099e-05, + "step": 866 + }, + { + "epoch": 0.108375, + "grad_norm": 1.9790053367614746, + "grad_norm_var": 0.16501067300080757, + "learning_rate": 0.0001, + "loss": 1.1319, + "loss/crossentropy": 2.4894466400146484, + "loss/hidden": 0.9921875, + "loss/logits": 0.1393980085849762, + "loss/reg": 3.053849286516197e-05, + "step": 867 + }, + { + "epoch": 0.1085, + "grad_norm": 1.9582802057266235, + "grad_norm_var": 0.1674041146687453, + "learning_rate": 0.0001, + "loss": 1.1628, + "loss/crossentropy": 2.497908592224121, + "loss/hidden": 1.015625, + "loss/logits": 0.14682599902153015, + "loss/reg": 3.0536324629792944e-05, + "step": 868 + }, + { + "epoch": 0.108625, + "grad_norm": 2.071611166000366, + "grad_norm_var": 0.0580716724783836, + "learning_rate": 0.0001, + "loss": 1.1909, + "loss/crossentropy": 2.4994702339172363, + "loss/hidden": 1.03125, + "loss/logits": 0.15929651260375977, + "loss/reg": 3.053318869206123e-05, + "step": 869 + }, + { + "epoch": 0.10875, + "grad_norm": 2.1442458629608154, + "grad_norm_var": 0.047692633827984804, + "learning_rate": 0.0001, + "loss": 1.3262, + "loss/crossentropy": 2.6097989082336426, + "loss/hidden": 1.125, + "loss/logits": 0.20085518062114716, + "loss/reg": 3.053041291423142e-05, + "step": 870 + }, + { + "epoch": 0.108875, + "grad_norm": 1.9006311893463135, + "grad_norm_var": 0.04907894435885491, + "learning_rate": 0.0001, + "loss": 1.1552, + "loss/crossentropy": 2.437607765197754, + "loss/hidden": 1.0078125, + "loss/logits": 0.14705568552017212, + "loss/reg": 3.052354077226482e-05, + "step": 871 + }, + { + "epoch": 0.109, + "grad_norm": 2.8589375019073486, + "grad_norm_var": 0.08666775452125629, + "learning_rate": 0.0001, + "loss": 1.5007, + "loss/crossentropy": 2.3860392570495605, + "loss/hidden": 1.2734375, + "loss/logits": 0.22699284553527832, + "loss/reg": 3.0516899641952477e-05, + "step": 872 + }, + { + "epoch": 0.109125, + "grad_norm": 2.0803627967834473, + "grad_norm_var": 0.08060086596212314, + "learning_rate": 0.0001, + "loss": 1.4761, + "loss/crossentropy": 2.097466230392456, + "loss/hidden": 1.3046875, + "loss/logits": 0.17110571265220642, + "loss/reg": 3.0513721867464483e-05, + "step": 873 + }, + { + "epoch": 0.10925, + "grad_norm": 2.0202836990356445, + "grad_norm_var": 0.06360758527622175, + "learning_rate": 0.0001, + "loss": 1.1768, + "loss/crossentropy": 2.6354477405548096, + "loss/hidden": 1.03125, + "loss/logits": 0.14520543813705444, + "loss/reg": 3.050914529012516e-05, + "step": 874 + }, + { + "epoch": 0.109375, + "grad_norm": 1.677229404449463, + "grad_norm_var": 0.07153952873858506, + "learning_rate": 0.0001, + "loss": 1.0757, + "loss/crossentropy": 2.3119208812713623, + "loss/hidden": 0.9375, + "loss/logits": 0.13793236017227173, + "loss/reg": 3.050183477171231e-05, + "step": 875 + }, + { + "epoch": 0.1095, + "grad_norm": 2.254667282104492, + "grad_norm_var": 0.07058576994531313, + "learning_rate": 0.0001, + "loss": 1.427, + "loss/crossentropy": 2.143519639968872, + "loss/hidden": 1.2421875, + "loss/logits": 0.18450552225112915, + "loss/reg": 3.0492194127873518e-05, + "step": 876 + }, + { + "epoch": 0.109625, + "grad_norm": 3.199824810028076, + "grad_norm_var": 0.14683992559318046, + "learning_rate": 0.0001, + "loss": 1.0668, + "loss/crossentropy": 2.8740196228027344, + "loss/hidden": 0.9296875, + "loss/logits": 0.13684490323066711, + "loss/reg": 3.04836175928358e-05, + "step": 877 + }, + { + "epoch": 0.10975, + "grad_norm": 2.2499160766601562, + "grad_norm_var": 0.14590183794275807, + "learning_rate": 0.0001, + "loss": 1.2143, + "loss/crossentropy": 2.618100881576538, + "loss/hidden": 1.0625, + "loss/logits": 0.15145710110664368, + "loss/reg": 3.048092003155034e-05, + "step": 878 + }, + { + "epoch": 0.109875, + "grad_norm": 2.1639883518218994, + "grad_norm_var": 0.1407761266771947, + "learning_rate": 0.0001, + "loss": 1.1532, + "loss/crossentropy": 2.4676895141601562, + "loss/hidden": 1.015625, + "loss/logits": 0.13726571202278137, + "loss/reg": 3.0473505830741487e-05, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 2.9332988262176514, + "grad_norm_var": 0.16880933318345934, + "learning_rate": 0.0001, + "loss": 1.2218, + "loss/crossentropy": 2.739016056060791, + "loss/hidden": 1.0546875, + "loss/logits": 0.16677409410476685, + "loss/reg": 3.04626373690553e-05, + "step": 880 + }, + { + "epoch": 0.110125, + "grad_norm": 2.377056121826172, + "grad_norm_var": 0.16925066109580858, + "learning_rate": 0.0001, + "loss": 1.3437, + "loss/crossentropy": 2.279279947280884, + "loss/hidden": 1.15625, + "loss/logits": 0.18712583184242249, + "loss/reg": 3.0454097213805653e-05, + "step": 881 + }, + { + "epoch": 0.11025, + "grad_norm": 1.7906708717346191, + "grad_norm_var": 0.18002714541507406, + "learning_rate": 0.0001, + "loss": 1.2342, + "loss/crossentropy": 2.567777395248413, + "loss/hidden": 1.0703125, + "loss/logits": 0.16357748210430145, + "loss/reg": 3.0445706215687096e-05, + "step": 882 + }, + { + "epoch": 0.110375, + "grad_norm": 2.8536040782928467, + "grad_norm_var": 0.19871124531315582, + "learning_rate": 0.0001, + "loss": 1.4335, + "loss/crossentropy": 2.493900775909424, + "loss/hidden": 1.234375, + "loss/logits": 0.19878800213336945, + "loss/reg": 3.043895776499994e-05, + "step": 883 + }, + { + "epoch": 0.1105, + "grad_norm": 5.356804847717285, + "grad_norm_var": 0.7732547721650782, + "learning_rate": 0.0001, + "loss": 1.3051, + "loss/crossentropy": 2.786006212234497, + "loss/hidden": 1.140625, + "loss/logits": 0.1641331911087036, + "loss/reg": 3.043297510885168e-05, + "step": 884 + }, + { + "epoch": 0.110625, + "grad_norm": 2.8684051036834717, + "grad_norm_var": 0.7678671191599996, + "learning_rate": 0.0001, + "loss": 1.2754, + "loss/crossentropy": 2.2656214237213135, + "loss/hidden": 1.1171875, + "loss/logits": 0.15792518854141235, + "loss/reg": 3.0427321689785458e-05, + "step": 885 + }, + { + "epoch": 0.11075, + "grad_norm": 1.9989514350891113, + "grad_norm_var": 0.7769621885934072, + "learning_rate": 0.0001, + "loss": 1.2788, + "loss/crossentropy": 2.489605188369751, + "loss/hidden": 1.109375, + "loss/logits": 0.16912804543972015, + "loss/reg": 3.0422537747654133e-05, + "step": 886 + }, + { + "epoch": 0.110875, + "grad_norm": 2.3265321254730225, + "grad_norm_var": 0.7521879700078027, + "learning_rate": 0.0001, + "loss": 1.3237, + "loss/crossentropy": 2.5821127891540527, + "loss/hidden": 1.109375, + "loss/logits": 0.21401891112327576, + "loss/reg": 3.0414203138207085e-05, + "step": 887 + }, + { + "epoch": 0.111, + "grad_norm": 2.1429810523986816, + "grad_norm_var": 0.7559897385715504, + "learning_rate": 0.0001, + "loss": 1.2854, + "loss/crossentropy": 2.565568208694458, + "loss/hidden": 1.0859375, + "loss/logits": 0.19918876886367798, + "loss/reg": 3.040812589460984e-05, + "step": 888 + }, + { + "epoch": 0.111125, + "grad_norm": 1.6851451396942139, + "grad_norm_var": 0.788835305036081, + "learning_rate": 0.0001, + "loss": 1.1598, + "loss/crossentropy": 2.294679641723633, + "loss/hidden": 1.015625, + "loss/logits": 0.14390423893928528, + "loss/reg": 3.0401906769839115e-05, + "step": 889 + }, + { + "epoch": 0.11125, + "grad_norm": 19.322298049926758, + "grad_norm_var": 18.406652883554056, + "learning_rate": 0.0001, + "loss": 1.2189, + "loss/crossentropy": 2.5112252235412598, + "loss/hidden": 1.0625, + "loss/logits": 0.156059131026268, + "loss/reg": 3.0395181966014206e-05, + "step": 890 + }, + { + "epoch": 0.111375, + "grad_norm": 2.8927786350250244, + "grad_norm_var": 18.191408653915317, + "learning_rate": 0.0001, + "loss": 1.4005, + "loss/crossentropy": 2.4821994304656982, + "loss/hidden": 1.1875, + "loss/logits": 0.21269716322422028, + "loss/reg": 3.0386423532036133e-05, + "step": 891 + }, + { + "epoch": 0.1115, + "grad_norm": 4.272779941558838, + "grad_norm_var": 18.070214239000837, + "learning_rate": 0.0001, + "loss": 1.4083, + "loss/crossentropy": 2.565413475036621, + "loss/hidden": 1.21875, + "loss/logits": 0.18922537565231323, + "loss/reg": 3.0379227609955706e-05, + "step": 892 + }, + { + "epoch": 0.111625, + "grad_norm": 2.1861484050750732, + "grad_norm_var": 18.21247030426496, + "learning_rate": 0.0001, + "loss": 1.1391, + "loss/crossentropy": 2.397334575653076, + "loss/hidden": 0.98828125, + "loss/logits": 0.15054136514663696, + "loss/reg": 3.0370387321454473e-05, + "step": 893 + }, + { + "epoch": 0.11175, + "grad_norm": 2.010301113128662, + "grad_norm_var": 18.262829011154675, + "learning_rate": 0.0001, + "loss": 1.367, + "loss/crossentropy": 2.3572635650634766, + "loss/hidden": 1.1796875, + "loss/logits": 0.1869942843914032, + "loss/reg": 3.036250564036891e-05, + "step": 894 + }, + { + "epoch": 0.111875, + "grad_norm": 2.95393967628479, + "grad_norm_var": 18.140167373756306, + "learning_rate": 0.0001, + "loss": 1.2857, + "loss/crossentropy": 2.755155086517334, + "loss/hidden": 1.1171875, + "loss/logits": 0.16822287440299988, + "loss/reg": 3.035445297427941e-05, + "step": 895 + }, + { + "epoch": 0.112, + "grad_norm": 1.8677095174789429, + "grad_norm_var": 18.32691930612881, + "learning_rate": 0.0001, + "loss": 1.0605, + "loss/crossentropy": 2.6163880825042725, + "loss/hidden": 0.9375, + "loss/logits": 0.12269198894500732, + "loss/reg": 3.034656765521504e-05, + "step": 896 + }, + { + "epoch": 0.112125, + "grad_norm": 1.8834550380706787, + "grad_norm_var": 18.42800558442811, + "learning_rate": 0.0001, + "loss": 1.251, + "loss/crossentropy": 2.4362659454345703, + "loss/hidden": 1.0703125, + "loss/logits": 0.18038895726203918, + "loss/reg": 3.0335075280163437e-05, + "step": 897 + }, + { + "epoch": 0.11225, + "grad_norm": 1.9402846097946167, + "grad_norm_var": 18.39229818615, + "learning_rate": 0.0001, + "loss": 1.1382, + "loss/crossentropy": 2.34395170211792, + "loss/hidden": 0.99609375, + "loss/logits": 0.14177373051643372, + "loss/reg": 3.0326225896715187e-05, + "step": 898 + }, + { + "epoch": 0.112375, + "grad_norm": 1.5321539640426636, + "grad_norm_var": 18.643542516203766, + "learning_rate": 0.0001, + "loss": 1.1133, + "loss/crossentropy": 2.346829652786255, + "loss/hidden": 0.96484375, + "loss/logits": 0.14812731742858887, + "loss/reg": 3.031741471204441e-05, + "step": 899 + }, + { + "epoch": 0.1125, + "grad_norm": 2.4379284381866455, + "grad_norm_var": 18.483572622867825, + "learning_rate": 0.0001, + "loss": 1.5778, + "loss/crossentropy": 2.4154257774353027, + "loss/hidden": 1.359375, + "loss/logits": 0.21813495457172394, + "loss/reg": 3.0310697184177116e-05, + "step": 900 + }, + { + "epoch": 0.112625, + "grad_norm": 1.8157447576522827, + "grad_norm_var": 18.62675428293736, + "learning_rate": 0.0001, + "loss": 1.0782, + "loss/crossentropy": 2.5257809162139893, + "loss/hidden": 0.9453125, + "loss/logits": 0.1325758695602417, + "loss/reg": 3.030002881132532e-05, + "step": 901 + }, + { + "epoch": 0.11275, + "grad_norm": 1.805254578590393, + "grad_norm_var": 18.66345763452514, + "learning_rate": 0.0001, + "loss": 1.1999, + "loss/crossentropy": 2.5594637393951416, + "loss/hidden": 1.046875, + "loss/logits": 0.1527424305677414, + "loss/reg": 3.0291475923149846e-05, + "step": 902 + }, + { + "epoch": 0.112875, + "grad_norm": 2.0050342082977295, + "grad_norm_var": 18.712384675596915, + "learning_rate": 0.0001, + "loss": 1.2333, + "loss/crossentropy": 2.142164945602417, + "loss/hidden": 1.078125, + "loss/logits": 0.1548900008201599, + "loss/reg": 3.028149512829259e-05, + "step": 903 + }, + { + "epoch": 0.113, + "grad_norm": 2.051323175430298, + "grad_norm_var": 18.72701455166655, + "learning_rate": 0.0001, + "loss": 1.1036, + "loss/crossentropy": 2.466301918029785, + "loss/hidden": 0.96875, + "loss/logits": 0.13455404341220856, + "loss/reg": 3.027167076652404e-05, + "step": 904 + }, + { + "epoch": 0.113125, + "grad_norm": 1.9347882270812988, + "grad_norm_var": 18.67744451765512, + "learning_rate": 0.0001, + "loss": 1.162, + "loss/crossentropy": 2.8280982971191406, + "loss/hidden": 1.0, + "loss/logits": 0.16173742711544037, + "loss/reg": 3.0260214771260507e-05, + "step": 905 + }, + { + "epoch": 0.11325, + "grad_norm": 1.9693732261657715, + "grad_norm_var": 0.44271487091433853, + "learning_rate": 0.0001, + "loss": 1.2495, + "loss/crossentropy": 2.420048713684082, + "loss/hidden": 1.078125, + "loss/logits": 0.17107552289962769, + "loss/reg": 3.0245597372413613e-05, + "step": 906 + }, + { + "epoch": 0.113375, + "grad_norm": 2.4886035919189453, + "grad_norm_var": 0.41680001650657716, + "learning_rate": 0.0001, + "loss": 1.1317, + "loss/crossentropy": 2.844113826751709, + "loss/hidden": 1.0, + "loss/logits": 0.131430983543396, + "loss/reg": 3.0234865334932692e-05, + "step": 907 + }, + { + "epoch": 0.1135, + "grad_norm": 1.840441346168518, + "grad_norm_var": 0.11342421101602417, + "learning_rate": 0.0001, + "loss": 0.9626, + "loss/crossentropy": 2.434068202972412, + "loss/hidden": 0.8515625, + "loss/logits": 0.11073873192071915, + "loss/reg": 3.0223776775528677e-05, + "step": 908 + }, + { + "epoch": 0.113625, + "grad_norm": 1.8804395198822021, + "grad_norm_var": 0.11351828281440793, + "learning_rate": 0.0001, + "loss": 1.2627, + "loss/crossentropy": 2.553520441055298, + "loss/hidden": 1.09375, + "loss/logits": 0.16862741112709045, + "loss/reg": 3.0212599085643888e-05, + "step": 909 + }, + { + "epoch": 0.11375, + "grad_norm": 3.197678804397583, + "grad_norm_var": 0.19914182473502062, + "learning_rate": 0.0001, + "loss": 1.2905, + "loss/crossentropy": 2.499753475189209, + "loss/hidden": 1.1171875, + "loss/logits": 0.1730211228132248, + "loss/reg": 3.0199351385817863e-05, + "step": 910 + }, + { + "epoch": 0.113875, + "grad_norm": 1.9219839572906494, + "grad_norm_var": 0.14823876643624204, + "learning_rate": 0.0001, + "loss": 1.2543, + "loss/crossentropy": 2.252265453338623, + "loss/hidden": 1.1015625, + "loss/logits": 0.1524173766374588, + "loss/reg": 3.0187717129592784e-05, + "step": 911 + }, + { + "epoch": 0.114, + "grad_norm": 2.2684502601623535, + "grad_norm_var": 0.14929642441132382, + "learning_rate": 0.0001, + "loss": 1.2484, + "loss/crossentropy": 2.674544095993042, + "loss/hidden": 1.0625, + "loss/logits": 0.18555624783039093, + "loss/reg": 3.017616290890146e-05, + "step": 912 + }, + { + "epoch": 0.114125, + "grad_norm": 2.682770252227783, + "grad_norm_var": 0.17032645440362532, + "learning_rate": 0.0001, + "loss": 1.4042, + "loss/crossentropy": 2.5020034313201904, + "loss/hidden": 1.2109375, + "loss/logits": 0.19293466210365295, + "loss/reg": 3.0163550036377273e-05, + "step": 913 + }, + { + "epoch": 0.11425, + "grad_norm": 2.211789131164551, + "grad_norm_var": 0.16876210134861747, + "learning_rate": 0.0001, + "loss": 1.1638, + "loss/crossentropy": 2.8326942920684814, + "loss/hidden": 1.0234375, + "loss/logits": 0.14011076092720032, + "loss/reg": 3.015105721715372e-05, + "step": 914 + }, + { + "epoch": 0.114375, + "grad_norm": 1.9461342096328735, + "grad_norm_var": 0.14659883344723781, + "learning_rate": 0.0001, + "loss": 1.242, + "loss/crossentropy": 2.554008960723877, + "loss/hidden": 1.0625, + "loss/logits": 0.179220050573349, + "loss/reg": 3.014074536622502e-05, + "step": 915 + }, + { + "epoch": 0.1145, + "grad_norm": 2.0787339210510254, + "grad_norm_var": 0.14104581058875282, + "learning_rate": 0.0001, + "loss": 1.3882, + "loss/crossentropy": 2.234307050704956, + "loss/hidden": 1.2109375, + "loss/logits": 0.17698973417282104, + "loss/reg": 3.0129771403153427e-05, + "step": 916 + }, + { + "epoch": 0.114625, + "grad_norm": 1.7083910703659058, + "grad_norm_var": 0.1462808949880042, + "learning_rate": 0.0001, + "loss": 1.1463, + "loss/crossentropy": 2.667232036590576, + "loss/hidden": 1.0, + "loss/logits": 0.14599871635437012, + "loss/reg": 3.0115046683931723e-05, + "step": 917 + }, + { + "epoch": 0.11475, + "grad_norm": 2.0105397701263428, + "grad_norm_var": 0.14017797617193484, + "learning_rate": 0.0001, + "loss": 1.379, + "loss/crossentropy": 2.4737157821655273, + "loss/hidden": 1.1875, + "loss/logits": 0.19115224480628967, + "loss/reg": 3.0103803510428406e-05, + "step": 918 + }, + { + "epoch": 0.114875, + "grad_norm": 2.173673391342163, + "grad_norm_var": 0.13898185573586228, + "learning_rate": 0.0001, + "loss": 1.1508, + "loss/crossentropy": 2.6369824409484863, + "loss/hidden": 1.015625, + "loss/logits": 0.134853333234787, + "loss/reg": 3.0090330255916342e-05, + "step": 919 + }, + { + "epoch": 0.115, + "grad_norm": 1.7742769718170166, + "grad_norm_var": 0.14734354783532472, + "learning_rate": 0.0001, + "loss": 1.2682, + "loss/crossentropy": 2.511277914047241, + "loss/hidden": 1.125, + "loss/logits": 0.1428862065076828, + "loss/reg": 3.008243402291555e-05, + "step": 920 + }, + { + "epoch": 0.115125, + "grad_norm": 2.2685179710388184, + "grad_norm_var": 0.14559568575267523, + "learning_rate": 0.0001, + "loss": 1.0555, + "loss/crossentropy": 2.741913318634033, + "loss/hidden": 0.921875, + "loss/logits": 0.1333094835281372, + "loss/reg": 3.006882434419822e-05, + "step": 921 + }, + { + "epoch": 0.11525, + "grad_norm": 2.0138773918151855, + "grad_norm_var": 0.14463957141116968, + "learning_rate": 0.0001, + "loss": 1.0304, + "loss/crossentropy": 2.6428472995758057, + "loss/hidden": 0.8984375, + "loss/logits": 0.13170146942138672, + "loss/reg": 3.0055622119107284e-05, + "step": 922 + }, + { + "epoch": 0.115375, + "grad_norm": 2.0200250148773193, + "grad_norm_var": 0.13746634960929824, + "learning_rate": 0.0001, + "loss": 1.3501, + "loss/crossentropy": 2.382972478866577, + "loss/hidden": 1.171875, + "loss/logits": 0.17797118425369263, + "loss/reg": 3.0043020160519518e-05, + "step": 923 + }, + { + "epoch": 0.1155, + "grad_norm": 1.8085798025131226, + "grad_norm_var": 0.13873805613438883, + "learning_rate": 0.0001, + "loss": 1.2652, + "loss/crossentropy": 2.305152416229248, + "loss/hidden": 1.09375, + "loss/logits": 0.17112156748771667, + "loss/reg": 3.0029701520106755e-05, + "step": 924 + }, + { + "epoch": 0.115625, + "grad_norm": 2.2086219787597656, + "grad_norm_var": 0.13486150837332828, + "learning_rate": 0.0001, + "loss": 1.2893, + "loss/crossentropy": 2.4080841541290283, + "loss/hidden": 1.140625, + "loss/logits": 0.14834384620189667, + "loss/reg": 3.0015635275049135e-05, + "step": 925 + }, + { + "epoch": 0.11575, + "grad_norm": 2.3216629028320312, + "grad_norm_var": 0.059679650378250376, + "learning_rate": 0.0001, + "loss": 1.3637, + "loss/crossentropy": 2.4338295459747314, + "loss/hidden": 1.1796875, + "loss/logits": 0.18370847404003143, + "loss/reg": 3.0002041967236437e-05, + "step": 926 + }, + { + "epoch": 0.115875, + "grad_norm": 2.669752597808838, + "grad_norm_var": 0.07801232102321709, + "learning_rate": 0.0001, + "loss": 1.1004, + "loss/crossentropy": 2.89715576171875, + "loss/hidden": 0.953125, + "loss/logits": 0.1469748616218567, + "loss/reg": 2.998723357450217e-05, + "step": 927 + }, + { + "epoch": 0.116, + "grad_norm": 1.8070098161697388, + "grad_norm_var": 0.08313198661767274, + "learning_rate": 0.0001, + "loss": 1.3002, + "loss/crossentropy": 2.311203718185425, + "loss/hidden": 1.140625, + "loss/logits": 0.15929476916790009, + "loss/reg": 2.997562478412874e-05, + "step": 928 + }, + { + "epoch": 0.116125, + "grad_norm": 1.9325246810913086, + "grad_norm_var": 0.06066759568447632, + "learning_rate": 0.0001, + "loss": 1.2746, + "loss/crossentropy": 2.5550014972686768, + "loss/hidden": 1.1171875, + "loss/logits": 0.15711072087287903, + "loss/reg": 2.9964614441269077e-05, + "step": 929 + }, + { + "epoch": 0.11625, + "grad_norm": 5.985466957092285, + "grad_norm_var": 1.027266842132435, + "learning_rate": 0.0001, + "loss": 1.3972, + "loss/crossentropy": 3.0680112838745117, + "loss/hidden": 1.234375, + "loss/logits": 0.16257420182228088, + "loss/reg": 2.9954886485938914e-05, + "step": 930 + }, + { + "epoch": 0.116375, + "grad_norm": 2.5261380672454834, + "grad_norm_var": 1.0212753434993587, + "learning_rate": 0.0001, + "loss": 1.252, + "loss/crossentropy": 2.5753066539764404, + "loss/hidden": 1.078125, + "loss/logits": 0.17356222867965698, + "loss/reg": 2.9943923436803743e-05, + "step": 931 + }, + { + "epoch": 0.1165, + "grad_norm": 6.641204833984375, + "grad_norm_var": 2.1683749086823068, + "learning_rate": 0.0001, + "loss": 1.721, + "loss/crossentropy": 2.6875956058502197, + "loss/hidden": 1.4453125, + "loss/logits": 0.275409460067749, + "loss/reg": 2.9933024052297696e-05, + "step": 932 + }, + { + "epoch": 0.116625, + "grad_norm": 2.314908027648926, + "grad_norm_var": 2.1178968833443887, + "learning_rate": 0.0001, + "loss": 1.1737, + "loss/crossentropy": 2.6610100269317627, + "loss/hidden": 1.0078125, + "loss/logits": 0.1656096875667572, + "loss/reg": 2.9921167879365385e-05, + "step": 933 + }, + { + "epoch": 0.11675, + "grad_norm": 2.40584135055542, + "grad_norm_var": 2.0937064624222272, + "learning_rate": 0.0001, + "loss": 1.4219, + "loss/crossentropy": 2.402122735977173, + "loss/hidden": 1.2109375, + "loss/logits": 0.21065916121006012, + "loss/reg": 2.9912616810179316e-05, + "step": 934 + }, + { + "epoch": 0.116875, + "grad_norm": 1.8654179573059082, + "grad_norm_var": 2.120435350833001, + "learning_rate": 0.0001, + "loss": 1.329, + "loss/crossentropy": 2.288451671600342, + "loss/hidden": 1.1484375, + "loss/logits": 0.18031169474124908, + "loss/reg": 2.9901168090873398e-05, + "step": 935 + }, + { + "epoch": 0.117, + "grad_norm": 2.2229185104370117, + "grad_norm_var": 2.0800180450379466, + "learning_rate": 0.0001, + "loss": 1.3147, + "loss/crossentropy": 2.4958393573760986, + "loss/hidden": 1.1484375, + "loss/logits": 0.16593661904335022, + "loss/reg": 2.9892928068875335e-05, + "step": 936 + }, + { + "epoch": 0.117125, + "grad_norm": 2.3262715339660645, + "grad_norm_var": 2.0769941509731638, + "learning_rate": 0.0001, + "loss": 1.1184, + "loss/crossentropy": 2.465583562850952, + "loss/hidden": 0.9765625, + "loss/logits": 0.14153215289115906, + "loss/reg": 2.9885473850299604e-05, + "step": 937 + }, + { + "epoch": 0.11725, + "grad_norm": 1.9139127731323242, + "grad_norm_var": 2.0866556628890844, + "learning_rate": 0.0001, + "loss": 1.1842, + "loss/crossentropy": 2.4913489818573, + "loss/hidden": 1.03125, + "loss/logits": 0.15265440940856934, + "loss/reg": 2.987419611599762e-05, + "step": 938 + }, + { + "epoch": 0.117375, + "grad_norm": 10.830428123474121, + "grad_norm_var": 6.156193101325849, + "learning_rate": 0.0001, + "loss": 1.2625, + "loss/crossentropy": 2.3177073001861572, + "loss/hidden": 1.09375, + "loss/logits": 0.16842305660247803, + "loss/reg": 2.986286926898174e-05, + "step": 939 + }, + { + "epoch": 0.1175, + "grad_norm": 2.227233648300171, + "grad_norm_var": 6.087451956699234, + "learning_rate": 0.0001, + "loss": 1.4568, + "loss/crossentropy": 2.335895538330078, + "loss/hidden": 1.25, + "loss/logits": 0.20651838183403015, + "loss/reg": 2.9853996238671243e-05, + "step": 940 + }, + { + "epoch": 0.117625, + "grad_norm": 2.052086114883423, + "grad_norm_var": 6.110978489678372, + "learning_rate": 0.0001, + "loss": 1.2067, + "loss/crossentropy": 2.5038609504699707, + "loss/hidden": 1.046875, + "loss/logits": 0.15951794385910034, + "loss/reg": 2.9841248760931194e-05, + "step": 941 + }, + { + "epoch": 0.11775, + "grad_norm": 2.391615390777588, + "grad_norm_var": 6.102600788640365, + "learning_rate": 0.0001, + "loss": 1.1734, + "loss/crossentropy": 2.3519561290740967, + "loss/hidden": 1.0078125, + "loss/logits": 0.1652698963880539, + "loss/reg": 2.9828568585799076e-05, + "step": 942 + }, + { + "epoch": 0.117875, + "grad_norm": 2.6826672554016113, + "grad_norm_var": 6.101599921092475, + "learning_rate": 0.0001, + "loss": 1.1231, + "loss/crossentropy": 2.7894511222839355, + "loss/hidden": 0.9609375, + "loss/logits": 0.16190429031848907, + "loss/reg": 2.9821638236171566e-05, + "step": 943 + }, + { + "epoch": 0.118, + "grad_norm": 2.0484187602996826, + "grad_norm_var": 6.058542783290856, + "learning_rate": 0.0001, + "loss": 1.3326, + "loss/crossentropy": 2.5720252990722656, + "loss/hidden": 1.1328125, + "loss/logits": 0.1994791030883789, + "loss/reg": 2.981125726364553e-05, + "step": 944 + }, + { + "epoch": 0.118125, + "grad_norm": 2.304121971130371, + "grad_norm_var": 6.000760397434108, + "learning_rate": 0.0001, + "loss": 1.2185, + "loss/crossentropy": 2.6887125968933105, + "loss/hidden": 1.046875, + "loss/logits": 0.17130357027053833, + "loss/reg": 2.9803662982885726e-05, + "step": 945 + }, + { + "epoch": 0.11825, + "grad_norm": 2.098452568054199, + "grad_norm_var": 5.551285095967417, + "learning_rate": 0.0001, + "loss": 1.2843, + "loss/crossentropy": 2.1777865886688232, + "loss/hidden": 1.109375, + "loss/logits": 0.17466512322425842, + "loss/reg": 2.979521559609566e-05, + "step": 946 + }, + { + "epoch": 0.118375, + "grad_norm": 2.4448280334472656, + "grad_norm_var": 5.557412656069853, + "learning_rate": 0.0001, + "loss": 1.2825, + "loss/crossentropy": 2.4249322414398193, + "loss/hidden": 1.09375, + "loss/logits": 0.18843106925487518, + "loss/reg": 2.9785163860651664e-05, + "step": 947 + }, + { + "epoch": 0.1185, + "grad_norm": 2.038378953933716, + "grad_norm_var": 4.6764411267263375, + "learning_rate": 0.0001, + "loss": 1.2371, + "loss/crossentropy": 2.5682852268218994, + "loss/hidden": 1.046875, + "loss/logits": 0.18992964923381805, + "loss/reg": 2.9779299438814633e-05, + "step": 948 + }, + { + "epoch": 0.118625, + "grad_norm": 2.4998066425323486, + "grad_norm_var": 4.667593369117897, + "learning_rate": 0.0001, + "loss": 1.2371, + "loss/crossentropy": 2.66314697265625, + "loss/hidden": 1.0625, + "loss/logits": 0.17428331077098846, + "loss/reg": 2.9770533728878945e-05, + "step": 949 + }, + { + "epoch": 0.11875, + "grad_norm": 2.7099368572235107, + "grad_norm_var": 4.658525692998186, + "learning_rate": 0.0001, + "loss": 1.2726, + "loss/crossentropy": 2.6387088298797607, + "loss/hidden": 1.09375, + "loss/logits": 0.1785287708044052, + "loss/reg": 2.9763001293758862e-05, + "step": 950 + }, + { + "epoch": 0.118875, + "grad_norm": 2.2414186000823975, + "grad_norm_var": 4.620957579511203, + "learning_rate": 0.0001, + "loss": 1.3926, + "loss/crossentropy": 2.2660908699035645, + "loss/hidden": 1.203125, + "loss/logits": 0.18920645117759705, + "loss/reg": 2.975258212245535e-05, + "step": 951 + }, + { + "epoch": 0.119, + "grad_norm": 1.984645962715149, + "grad_norm_var": 4.643301277280139, + "learning_rate": 0.0001, + "loss": 1.1178, + "loss/crossentropy": 2.6594138145446777, + "loss/hidden": 0.9765625, + "loss/logits": 0.14097043871879578, + "loss/reg": 2.9740975151071325e-05, + "step": 952 + }, + { + "epoch": 0.119125, + "grad_norm": 2.6357920169830322, + "grad_norm_var": 4.629753372228165, + "learning_rate": 0.0001, + "loss": 1.2005, + "loss/crossentropy": 2.621708631515503, + "loss/hidden": 1.015625, + "loss/logits": 0.1845404952764511, + "loss/reg": 2.973414484586101e-05, + "step": 953 + }, + { + "epoch": 0.11925, + "grad_norm": 1.8917043209075928, + "grad_norm_var": 4.632464228940502, + "learning_rate": 0.0001, + "loss": 1.2065, + "loss/crossentropy": 2.599364995956421, + "loss/hidden": 1.046875, + "loss/logits": 0.15931665897369385, + "loss/reg": 2.972304901049938e-05, + "step": 954 + }, + { + "epoch": 0.119375, + "grad_norm": 3.248734712600708, + "grad_norm_var": 0.12498233063500468, + "learning_rate": 0.0001, + "loss": 1.3427, + "loss/crossentropy": 2.567194938659668, + "loss/hidden": 1.15625, + "loss/logits": 0.18617364764213562, + "loss/reg": 2.9710497983614914e-05, + "step": 955 + }, + { + "epoch": 0.1195, + "grad_norm": 2.8562896251678467, + "grad_norm_var": 0.13994241611030506, + "learning_rate": 0.0001, + "loss": 1.3554, + "loss/crossentropy": 2.6916966438293457, + "loss/hidden": 1.140625, + "loss/logits": 0.21445384621620178, + "loss/reg": 2.9702167012146674e-05, + "step": 956 + }, + { + "epoch": 0.119625, + "grad_norm": 2.176276445388794, + "grad_norm_var": 0.13542593205940623, + "learning_rate": 0.0001, + "loss": 1.2327, + "loss/crossentropy": 2.4443368911743164, + "loss/hidden": 1.078125, + "loss/logits": 0.15425175428390503, + "loss/reg": 2.969575689348858e-05, + "step": 957 + }, + { + "epoch": 0.11975, + "grad_norm": 1.8663557767868042, + "grad_norm_var": 0.1526136914943218, + "learning_rate": 0.0001, + "loss": 1.2129, + "loss/crossentropy": 2.619982957839966, + "loss/hidden": 1.0546875, + "loss/logits": 0.1578969955444336, + "loss/reg": 2.9686147172469646e-05, + "step": 958 + }, + { + "epoch": 0.119875, + "grad_norm": 2.1611175537109375, + "grad_norm_var": 0.1470364788056391, + "learning_rate": 0.0001, + "loss": 1.2601, + "loss/crossentropy": 2.4998703002929688, + "loss/hidden": 1.0703125, + "loss/logits": 0.18950100243091583, + "loss/reg": 2.9678791179321706e-05, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 2.9389288425445557, + "grad_norm_var": 0.16371311291879256, + "learning_rate": 0.0001, + "loss": 1.7144, + "loss/crossentropy": 2.059462785720825, + "loss/hidden": 1.4765625, + "loss/logits": 0.2375560998916626, + "loss/reg": 2.9669770810869522e-05, + "step": 960 + }, + { + "epoch": 0.120125, + "grad_norm": 1.8231537342071533, + "grad_norm_var": 0.18310454629205022, + "learning_rate": 0.0001, + "loss": 1.1061, + "loss/crossentropy": 2.686415195465088, + "loss/hidden": 0.96875, + "loss/logits": 0.1370917558670044, + "loss/reg": 2.9663093300769106e-05, + "step": 961 + }, + { + "epoch": 0.12025, + "grad_norm": 2.095581293106079, + "grad_norm_var": 0.1832017416796153, + "learning_rate": 0.0001, + "loss": 1.2927, + "loss/crossentropy": 2.542818307876587, + "loss/hidden": 1.109375, + "loss/logits": 0.18301713466644287, + "loss/reg": 2.9656326660187915e-05, + "step": 962 + }, + { + "epoch": 0.120375, + "grad_norm": 1.9731796979904175, + "grad_norm_var": 0.1911924995621158, + "learning_rate": 0.0001, + "loss": 1.2607, + "loss/crossentropy": 2.7729532718658447, + "loss/hidden": 1.0703125, + "loss/logits": 0.19010211527347565, + "loss/reg": 2.9653167075593956e-05, + "step": 963 + }, + { + "epoch": 0.1205, + "grad_norm": 2.3905394077301025, + "grad_norm_var": 0.18565761023355182, + "learning_rate": 0.0001, + "loss": 1.3915, + "loss/crossentropy": 2.4821271896362305, + "loss/hidden": 1.203125, + "loss/logits": 0.18802939355373383, + "loss/reg": 2.9650735086761415e-05, + "step": 964 + }, + { + "epoch": 0.120625, + "grad_norm": 2.089430809020996, + "grad_norm_var": 0.18762185350676308, + "learning_rate": 0.0001, + "loss": 1.1282, + "loss/crossentropy": 2.6699776649475098, + "loss/hidden": 0.96875, + "loss/logits": 0.15917512774467468, + "loss/reg": 2.9652708690264262e-05, + "step": 965 + }, + { + "epoch": 0.12075, + "grad_norm": 2.2462618350982666, + "grad_norm_var": 0.1768091784440002, + "learning_rate": 0.0001, + "loss": 1.0844, + "loss/crossentropy": 2.1657514572143555, + "loss/hidden": 0.94140625, + "loss/logits": 0.14271298050880432, + "loss/reg": 2.9650564101757482e-05, + "step": 966 + }, + { + "epoch": 0.120875, + "grad_norm": 2.1140072345733643, + "grad_norm_var": 0.17862723062111877, + "learning_rate": 0.0001, + "loss": 1.1495, + "loss/crossentropy": 2.588456392288208, + "loss/hidden": 1.0, + "loss/logits": 0.1492307484149933, + "loss/reg": 2.9643733796547167e-05, + "step": 967 + }, + { + "epoch": 0.121, + "grad_norm": 4.149561405181885, + "grad_norm_var": 0.38608389632931334, + "learning_rate": 0.0001, + "loss": 1.4457, + "loss/crossentropy": 2.559540033340454, + "loss/hidden": 1.203125, + "loss/logits": 0.24225984513759613, + "loss/reg": 2.9640305001521483e-05, + "step": 968 + }, + { + "epoch": 0.121125, + "grad_norm": 2.4170870780944824, + "grad_norm_var": 0.3826657741188169, + "learning_rate": 0.0001, + "loss": 1.3368, + "loss/crossentropy": 2.5627620220184326, + "loss/hidden": 1.140625, + "loss/logits": 0.1958288550376892, + "loss/reg": 2.9633007216034457e-05, + "step": 969 + }, + { + "epoch": 0.12125, + "grad_norm": 2.025369882583618, + "grad_norm_var": 0.3746809845195796, + "learning_rate": 0.0001, + "loss": 1.1745, + "loss/crossentropy": 2.51656174659729, + "loss/hidden": 1.0234375, + "loss/logits": 0.15072372555732727, + "loss/reg": 2.9625756724271923e-05, + "step": 970 + }, + { + "epoch": 0.121375, + "grad_norm": 3.451981544494629, + "grad_norm_var": 0.3999720570717547, + "learning_rate": 0.0001, + "loss": 1.4399, + "loss/crossentropy": 1.9385815858840942, + "loss/hidden": 1.3125, + "loss/logits": 0.12711143493652344, + "loss/reg": 2.9617278414661996e-05, + "step": 971 + }, + { + "epoch": 0.1215, + "grad_norm": 1.8940742015838623, + "grad_norm_var": 0.4023062621100176, + "learning_rate": 0.0001, + "loss": 1.348, + "loss/crossentropy": 2.650892734527588, + "loss/hidden": 1.15625, + "loss/logits": 0.19150036573410034, + "loss/reg": 2.961178142868448e-05, + "step": 972 + }, + { + "epoch": 0.121625, + "grad_norm": 1.8390206098556519, + "grad_norm_var": 0.41782537712418366, + "learning_rate": 0.0001, + "loss": 1.1005, + "loss/crossentropy": 2.235429525375366, + "loss/hidden": 0.95703125, + "loss/logits": 0.14319097995758057, + "loss/reg": 2.9602917493321e-05, + "step": 973 + }, + { + "epoch": 0.12175, + "grad_norm": 3.1122865676879883, + "grad_norm_var": 0.4357929705368214, + "learning_rate": 0.0001, + "loss": 1.4691, + "loss/crossentropy": 2.6123275756835938, + "loss/hidden": 1.234375, + "loss/logits": 0.23446011543273926, + "loss/reg": 2.9595235901069827e-05, + "step": 974 + }, + { + "epoch": 0.121875, + "grad_norm": 2.0248794555664062, + "grad_norm_var": 0.44165743776062505, + "learning_rate": 0.0001, + "loss": 1.2073, + "loss/crossentropy": 2.6745986938476562, + "loss/hidden": 1.03125, + "loss/logits": 0.17576925456523895, + "loss/reg": 2.9589489713544026e-05, + "step": 975 + }, + { + "epoch": 0.122, + "grad_norm": 1.9854148626327515, + "grad_norm_var": 0.4314376508313742, + "learning_rate": 0.0001, + "loss": 1.2895, + "loss/crossentropy": 2.4620821475982666, + "loss/hidden": 1.1171875, + "loss/logits": 0.17201289534568787, + "loss/reg": 2.9581056878669187e-05, + "step": 976 + }, + { + "epoch": 0.122125, + "grad_norm": 15.087516784667969, + "grad_norm_var": 10.49260658100263, + "learning_rate": 0.0001, + "loss": 1.3515, + "loss/crossentropy": 2.4697861671447754, + "loss/hidden": 1.1484375, + "loss/logits": 0.2027597725391388, + "loss/reg": 2.9572154744528234e-05, + "step": 977 + }, + { + "epoch": 0.12225, + "grad_norm": 2.448667049407959, + "grad_norm_var": 10.449298409713819, + "learning_rate": 0.0001, + "loss": 1.1618, + "loss/crossentropy": 2.2382848262786865, + "loss/hidden": 1.0078125, + "loss/logits": 0.15371698141098022, + "loss/reg": 2.956432945211418e-05, + "step": 978 + }, + { + "epoch": 0.122375, + "grad_norm": 1.7963190078735352, + "grad_norm_var": 10.480256191034487, + "learning_rate": 0.0001, + "loss": 1.2832, + "loss/crossentropy": 2.5349695682525635, + "loss/hidden": 1.109375, + "loss/logits": 0.17357102036476135, + "loss/reg": 2.9555221772170626e-05, + "step": 979 + }, + { + "epoch": 0.1225, + "grad_norm": 2.065178155899048, + "grad_norm_var": 10.521642133051824, + "learning_rate": 0.0001, + "loss": 1.2602, + "loss/crossentropy": 2.4281973838806152, + "loss/hidden": 1.0625, + "loss/logits": 0.1973562091588974, + "loss/reg": 2.954368210339453e-05, + "step": 980 + }, + { + "epoch": 0.122625, + "grad_norm": 2.2039473056793213, + "grad_norm_var": 10.505936873267668, + "learning_rate": 0.0001, + "loss": 1.1894, + "loss/crossentropy": 2.483654737472534, + "loss/hidden": 1.0390625, + "loss/logits": 0.1500002145767212, + "loss/reg": 2.953330840682611e-05, + "step": 981 + }, + { + "epoch": 0.12275, + "grad_norm": 2.1155283451080322, + "grad_norm_var": 10.523261114072442, + "learning_rate": 0.0001, + "loss": 1.1604, + "loss/crossentropy": 2.6215789318084717, + "loss/hidden": 1.015625, + "loss/logits": 0.14448747038841248, + "loss/reg": 2.9524298952310346e-05, + "step": 982 + }, + { + "epoch": 0.122875, + "grad_norm": 2.112273931503296, + "grad_norm_var": 10.523505505811658, + "learning_rate": 0.0001, + "loss": 1.1496, + "loss/crossentropy": 2.4145495891571045, + "loss/hidden": 1.0, + "loss/logits": 0.14932119846343994, + "loss/reg": 2.9514942070818506e-05, + "step": 983 + }, + { + "epoch": 0.123, + "grad_norm": 2.0390849113464355, + "grad_norm_var": 10.526402089225046, + "learning_rate": 0.0001, + "loss": 1.0821, + "loss/crossentropy": 2.318721294403076, + "loss/hidden": 0.94140625, + "loss/logits": 0.14042669534683228, + "loss/reg": 2.9506210921681486e-05, + "step": 984 + }, + { + "epoch": 0.123125, + "grad_norm": 4.910560607910156, + "grad_norm_var": 10.708338697617826, + "learning_rate": 0.0001, + "loss": 2.0722, + "loss/crossentropy": 3.077894926071167, + "loss/hidden": 1.734375, + "loss/logits": 0.3375716805458069, + "loss/reg": 2.9494345653802156e-05, + "step": 985 + }, + { + "epoch": 0.12325, + "grad_norm": 2.085634469985962, + "grad_norm_var": 10.699171348673152, + "learning_rate": 0.0001, + "loss": 1.2434, + "loss/crossentropy": 2.197237730026245, + "loss/hidden": 1.09375, + "loss/logits": 0.14937615394592285, + "loss/reg": 2.9486029234249145e-05, + "step": 986 + }, + { + "epoch": 0.123375, + "grad_norm": 1.863654375076294, + "grad_norm_var": 10.803115672749165, + "learning_rate": 0.0001, + "loss": 1.1457, + "loss/crossentropy": 2.4037911891937256, + "loss/hidden": 0.9921875, + "loss/logits": 0.1532135307788849, + "loss/reg": 2.9479617296601646e-05, + "step": 987 + }, + { + "epoch": 0.1235, + "grad_norm": 2.7112700939178467, + "grad_norm_var": 10.713565411311919, + "learning_rate": 0.0001, + "loss": 1.355, + "loss/crossentropy": 2.576564073562622, + "loss/hidden": 1.140625, + "loss/logits": 0.21411700546741486, + "loss/reg": 2.9471080779330805e-05, + "step": 988 + }, + { + "epoch": 0.123625, + "grad_norm": 4.951111316680908, + "grad_norm_var": 10.774867724807326, + "learning_rate": 0.0001, + "loss": 1.5529, + "loss/crossentropy": 2.4628682136535645, + "loss/hidden": 1.34375, + "loss/logits": 0.208875373005867, + "loss/reg": 2.9464130420819856e-05, + "step": 989 + }, + { + "epoch": 0.12375, + "grad_norm": 2.1607253551483154, + "grad_norm_var": 10.860932085087141, + "learning_rate": 0.0001, + "loss": 1.158, + "loss/crossentropy": 2.727191209793091, + "loss/hidden": 0.9921875, + "loss/logits": 0.16549015045166016, + "loss/reg": 2.9459288271027617e-05, + "step": 990 + }, + { + "epoch": 0.123875, + "grad_norm": 2.0294203758239746, + "grad_norm_var": 10.860170359418529, + "learning_rate": 0.0001, + "loss": 1.0676, + "loss/crossentropy": 2.377748489379883, + "loss/hidden": 0.9453125, + "loss/logits": 0.12201999127864838, + "loss/reg": 2.944932202808559e-05, + "step": 991 + }, + { + "epoch": 0.124, + "grad_norm": 2.561225414276123, + "grad_norm_var": 10.781087146669456, + "learning_rate": 0.0001, + "loss": 1.426, + "loss/crossentropy": 2.6533565521240234, + "loss/hidden": 1.2109375, + "loss/logits": 0.2147250473499298, + "loss/reg": 2.9440669095492922e-05, + "step": 992 + }, + { + "epoch": 0.124125, + "grad_norm": 2.614508867263794, + "grad_norm_var": 0.9367041482529769, + "learning_rate": 0.0001, + "loss": 1.2862, + "loss/crossentropy": 2.3770902156829834, + "loss/hidden": 1.0859375, + "loss/logits": 0.20000842213630676, + "loss/reg": 2.943119397968985e-05, + "step": 993 + }, + { + "epoch": 0.12425, + "grad_norm": 4.801121711730957, + "grad_norm_var": 1.2533636237198265, + "learning_rate": 0.0001, + "loss": 1.316, + "loss/crossentropy": 2.862474203109741, + "loss/hidden": 1.125, + "loss/logits": 0.1907142549753189, + "loss/reg": 2.9420858481898904e-05, + "step": 994 + }, + { + "epoch": 0.124375, + "grad_norm": 1.8006936311721802, + "grad_norm_var": 1.252844222856614, + "learning_rate": 0.0001, + "loss": 1.1567, + "loss/crossentropy": 2.538184881210327, + "loss/hidden": 1.0078125, + "loss/logits": 0.14863096177577972, + "loss/reg": 2.941078128060326e-05, + "step": 995 + }, + { + "epoch": 0.1245, + "grad_norm": 1.8835642337799072, + "grad_norm_var": 1.2700145975215755, + "learning_rate": 0.0001, + "loss": 1.1197, + "loss/crossentropy": 2.46339750289917, + "loss/hidden": 0.97265625, + "loss/logits": 0.14673538506031036, + "loss/reg": 2.939947444247082e-05, + "step": 996 + }, + { + "epoch": 0.124625, + "grad_norm": 1.8348726034164429, + "grad_norm_var": 1.3018449172030708, + "learning_rate": 0.0001, + "loss": 1.2213, + "loss/crossentropy": 2.3999040126800537, + "loss/hidden": 1.0625, + "loss/logits": 0.15847530961036682, + "loss/reg": 2.9386575988610275e-05, + "step": 997 + }, + { + "epoch": 0.12475, + "grad_norm": 2.0075278282165527, + "grad_norm_var": 1.3103380783906078, + "learning_rate": 0.0001, + "loss": 1.1907, + "loss/crossentropy": 2.6882195472717285, + "loss/hidden": 1.0390625, + "loss/logits": 0.15131571888923645, + "loss/reg": 2.9374945370364003e-05, + "step": 998 + }, + { + "epoch": 0.124875, + "grad_norm": 2.0520100593566895, + "grad_norm_var": 1.3148693419703117, + "learning_rate": 0.0001, + "loss": 1.2862, + "loss/crossentropy": 2.186708450317383, + "loss/hidden": 1.125, + "loss/logits": 0.16095511615276337, + "loss/reg": 2.936382225016132e-05, + "step": 999 + }, + { + "epoch": 0.125, + "grad_norm": 2.2306151390075684, + "grad_norm_var": 1.3017093789284313, + "learning_rate": 0.0001, + "loss": 1.2571, + "loss/crossentropy": 2.8570730686187744, + "loss/hidden": 1.0703125, + "loss/logits": 0.18647560477256775, + "loss/reg": 2.935159318440128e-05, + "step": 1000 + }, + { + "epoch": 0.125125, + "grad_norm": 1.8645871877670288, + "grad_norm_var": 0.9660011499294489, + "learning_rate": 0.0001, + "loss": 1.1655, + "loss/crossentropy": 2.660804271697998, + "loss/hidden": 1.0234375, + "loss/logits": 0.1418103277683258, + "loss/reg": 2.9340237233554944e-05, + "step": 1001 + }, + { + "epoch": 0.12525, + "grad_norm": 2.1849992275238037, + "grad_norm_var": 0.9615817736098241, + "learning_rate": 0.0001, + "loss": 1.2203, + "loss/crossentropy": 2.178314208984375, + "loss/hidden": 1.0625, + "loss/logits": 0.15750828385353088, + "loss/reg": 2.932794814114459e-05, + "step": 1002 + }, + { + "epoch": 0.125375, + "grad_norm": 3.986067295074463, + "grad_norm_var": 1.0709684501795464, + "learning_rate": 0.0001, + "loss": 1.1845, + "loss/crossentropy": 2.4393160343170166, + "loss/hidden": 1.03125, + "loss/logits": 0.15295693278312683, + "loss/reg": 2.9315660867723636e-05, + "step": 1003 + }, + { + "epoch": 0.1255, + "grad_norm": 1.8600398302078247, + "grad_norm_var": 1.104153845133293, + "learning_rate": 0.0001, + "loss": 1.1494, + "loss/crossentropy": 2.4713494777679443, + "loss/hidden": 0.9921875, + "loss/logits": 0.15691892802715302, + "loss/reg": 2.930188929894939e-05, + "step": 1004 + }, + { + "epoch": 0.125625, + "grad_norm": 1.8195825815200806, + "grad_norm_var": 0.7151077078776793, + "learning_rate": 0.0001, + "loss": 1.1427, + "loss/crossentropy": 2.3956546783447266, + "loss/hidden": 1.0078125, + "loss/logits": 0.13460367918014526, + "loss/reg": 2.9290595193742774e-05, + "step": 1005 + }, + { + "epoch": 0.12575, + "grad_norm": 2.014495372772217, + "grad_norm_var": 0.7202460838702616, + "learning_rate": 0.0001, + "loss": 1.2825, + "loss/crossentropy": 2.3066282272338867, + "loss/hidden": 1.109375, + "loss/logits": 0.17283910512924194, + "loss/reg": 2.9276796340127476e-05, + "step": 1006 + }, + { + "epoch": 0.125875, + "grad_norm": 2.076793909072876, + "grad_norm_var": 0.7183829997229518, + "learning_rate": 0.0001, + "loss": 1.2332, + "loss/crossentropy": 2.2323529720306396, + "loss/hidden": 1.09375, + "loss/logits": 0.1391303986310959, + "loss/reg": 2.926570778072346e-05, + "step": 1007 + }, + { + "epoch": 0.126, + "grad_norm": 2.1991078853607178, + "grad_norm_var": 0.7163580980412428, + "learning_rate": 0.0001, + "loss": 1.0737, + "loss/crossentropy": 2.5160655975341797, + "loss/hidden": 0.94921875, + "loss/logits": 0.12417572736740112, + "loss/reg": 2.925609442172572e-05, + "step": 1008 + }, + { + "epoch": 0.126125, + "grad_norm": 2.4293153285980225, + "grad_norm_var": 0.7114001537671166, + "learning_rate": 0.0001, + "loss": 1.4833, + "loss/crossentropy": 2.3626649379730225, + "loss/hidden": 1.28125, + "loss/logits": 0.20180566608905792, + "loss/reg": 2.9248867576825432e-05, + "step": 1009 + }, + { + "epoch": 0.12625, + "grad_norm": 1.709128737449646, + "grad_norm_var": 0.2841226367766957, + "learning_rate": 0.0001, + "loss": 1.1388, + "loss/crossentropy": 2.3142037391662598, + "loss/hidden": 1.0, + "loss/logits": 0.13846877217292786, + "loss/reg": 2.9243630706332624e-05, + "step": 1010 + }, + { + "epoch": 0.126375, + "grad_norm": 3.8883016109466553, + "grad_norm_var": 0.4670450602817121, + "learning_rate": 0.0001, + "loss": 1.2973, + "loss/crossentropy": 2.6982264518737793, + "loss/hidden": 1.1171875, + "loss/logits": 0.1798182725906372, + "loss/reg": 2.9232929591671564e-05, + "step": 1011 + }, + { + "epoch": 0.1265, + "grad_norm": 2.284008502960205, + "grad_norm_var": 0.45736549113770475, + "learning_rate": 0.0001, + "loss": 1.332, + "loss/crossentropy": 2.510465621948242, + "loss/hidden": 1.1640625, + "loss/logits": 0.16764146089553833, + "loss/reg": 2.9224862373666838e-05, + "step": 1012 + }, + { + "epoch": 0.126625, + "grad_norm": 2.52846097946167, + "grad_norm_var": 0.44649014895108863, + "learning_rate": 0.0001, + "loss": 1.3265, + "loss/crossentropy": 2.448003053665161, + "loss/hidden": 1.109375, + "loss/logits": 0.21680811047554016, + "loss/reg": 2.9215812901384197e-05, + "step": 1013 + }, + { + "epoch": 0.12675, + "grad_norm": 1.8621190786361694, + "grad_norm_var": 0.45388801520081185, + "learning_rate": 0.0001, + "loss": 1.1694, + "loss/crossentropy": 2.4996485710144043, + "loss/hidden": 1.015625, + "loss/logits": 0.1535225361585617, + "loss/reg": 2.920874612755142e-05, + "step": 1014 + }, + { + "epoch": 0.126875, + "grad_norm": 2.573948860168457, + "grad_norm_var": 0.4528313902447076, + "learning_rate": 0.0001, + "loss": 1.1064, + "loss/crossentropy": 2.477339506149292, + "loss/hidden": 0.97265625, + "loss/logits": 0.13347411155700684, + "loss/reg": 2.920458427979611e-05, + "step": 1015 + }, + { + "epoch": 0.127, + "grad_norm": 1.8308422565460205, + "grad_norm_var": 0.4688890207107471, + "learning_rate": 0.0001, + "loss": 1.2671, + "loss/crossentropy": 2.4544882774353027, + "loss/hidden": 1.109375, + "loss/logits": 0.1574433147907257, + "loss/reg": 2.9197683033999056e-05, + "step": 1016 + }, + { + "epoch": 0.127125, + "grad_norm": 3.3784258365631104, + "grad_norm_var": 0.5203013305270662, + "learning_rate": 0.0001, + "loss": 1.8158, + "loss/crossentropy": 2.3868064880371094, + "loss/hidden": 1.546875, + "loss/logits": 0.2686420977115631, + "loss/reg": 2.918928839790169e-05, + "step": 1017 + }, + { + "epoch": 0.12725, + "grad_norm": 3.3114662170410156, + "grad_norm_var": 0.5651990451927712, + "learning_rate": 0.0001, + "loss": 1.3926, + "loss/crossentropy": 2.5821990966796875, + "loss/hidden": 1.1015625, + "loss/logits": 0.2907836139202118, + "loss/reg": 2.918156133091543e-05, + "step": 1018 + }, + { + "epoch": 0.127375, + "grad_norm": 2.2275679111480713, + "grad_norm_var": 0.4064032234596157, + "learning_rate": 0.0001, + "loss": 1.2251, + "loss/crossentropy": 2.5899624824523926, + "loss/hidden": 1.03125, + "loss/logits": 0.19354669749736786, + "loss/reg": 2.9175122108426876e-05, + "step": 1019 + }, + { + "epoch": 0.1275, + "grad_norm": 2.6336710453033447, + "grad_norm_var": 0.39073246252080807, + "learning_rate": 0.0001, + "loss": 1.3826, + "loss/crossentropy": 2.2695744037628174, + "loss/hidden": 1.203125, + "loss/logits": 0.17915448546409607, + "loss/reg": 2.9165765226935036e-05, + "step": 1020 + }, + { + "epoch": 0.127625, + "grad_norm": 2.174442768096924, + "grad_norm_var": 0.3700545719352322, + "learning_rate": 0.0001, + "loss": 1.405, + "loss/crossentropy": 2.3914268016815186, + "loss/hidden": 1.203125, + "loss/logits": 0.20156240463256836, + "loss/reg": 2.9154887670301832e-05, + "step": 1021 + }, + { + "epoch": 0.12775, + "grad_norm": 2.3307485580444336, + "grad_norm_var": 0.35814692412861454, + "learning_rate": 0.0001, + "loss": 1.2054, + "loss/crossentropy": 2.322477102279663, + "loss/hidden": 1.0390625, + "loss/logits": 0.16605599224567413, + "loss/reg": 2.9147728128009476e-05, + "step": 1022 + }, + { + "epoch": 0.127875, + "grad_norm": 2.6750171184539795, + "grad_norm_var": 0.34955757112951436, + "learning_rate": 0.0001, + "loss": 1.4378, + "loss/crossentropy": 2.642554759979248, + "loss/hidden": 1.21875, + "loss/logits": 0.2187577188014984, + "loss/reg": 2.913829121098388e-05, + "step": 1023 + }, + { + "epoch": 0.128, + "grad_norm": 2.119832754135132, + "grad_norm_var": 0.3531549510742932, + "learning_rate": 0.0001, + "loss": 1.1875, + "loss/crossentropy": 2.5752387046813965, + "loss/hidden": 1.0390625, + "loss/logits": 0.14814549684524536, + "loss/reg": 2.9129343602107838e-05, + "step": 1024 + }, + { + "epoch": 0.128125, + "grad_norm": 1.9459450244903564, + "grad_norm_var": 0.37214145298919427, + "learning_rate": 0.0001, + "loss": 1.4058, + "loss/crossentropy": 2.232697010040283, + "loss/hidden": 1.1953125, + "loss/logits": 0.21017488837242126, + "loss/reg": 2.9119990358594805e-05, + "step": 1025 + }, + { + "epoch": 0.12825, + "grad_norm": 2.9667158126831055, + "grad_norm_var": 0.343888036491702, + "learning_rate": 0.0001, + "loss": 1.5782, + "loss/crossentropy": 2.3418338298797607, + "loss/hidden": 1.3671875, + "loss/logits": 0.21075811982154846, + "loss/reg": 2.9111633921274915e-05, + "step": 1026 + }, + { + "epoch": 0.128375, + "grad_norm": 11.993021965026855, + "grad_norm_var": 5.900127304805949, + "learning_rate": 0.0001, + "loss": 2.0617, + "loss/crossentropy": 2.232128858566284, + "loss/hidden": 1.875, + "loss/logits": 0.18643450736999512, + "loss/reg": 2.9101418476784602e-05, + "step": 1027 + }, + { + "epoch": 0.1285, + "grad_norm": 3.834670066833496, + "grad_norm_var": 5.8915710526393115, + "learning_rate": 0.0001, + "loss": 1.5941, + "loss/crossentropy": 2.155024766921997, + "loss/hidden": 1.3984375, + "loss/logits": 0.1953962743282318, + "loss/reg": 2.9092350814607926e-05, + "step": 1028 + }, + { + "epoch": 0.128625, + "grad_norm": 1.9689583778381348, + "grad_norm_var": 5.957442180243151, + "learning_rate": 0.0001, + "loss": 1.1175, + "loss/crossentropy": 2.3315813541412354, + "loss/hidden": 0.96875, + "loss/logits": 0.1484983265399933, + "loss/reg": 2.9080758395139128e-05, + "step": 1029 + }, + { + "epoch": 0.12875, + "grad_norm": 5.438080310821533, + "grad_norm_var": 6.159669369138983, + "learning_rate": 0.0001, + "loss": 1.4224, + "loss/crossentropy": 2.360717535018921, + "loss/hidden": 1.2734375, + "loss/logits": 0.14865463972091675, + "loss/reg": 2.907273665186949e-05, + "step": 1030 + }, + { + "epoch": 0.128875, + "grad_norm": 2.202073812484741, + "grad_norm_var": 6.206182372014499, + "learning_rate": 0.0001, + "loss": 1.2922, + "loss/crossentropy": 2.4415535926818848, + "loss/hidden": 1.109375, + "loss/logits": 0.18250058591365814, + "loss/reg": 2.9064294722047634e-05, + "step": 1031 + }, + { + "epoch": 0.129, + "grad_norm": 2.438520669937134, + "grad_norm_var": 6.109053000860937, + "learning_rate": 0.0001, + "loss": 1.5477, + "loss/crossentropy": 1.995033621788025, + "loss/hidden": 1.328125, + "loss/logits": 0.2193126678466797, + "loss/reg": 2.9056420316919684e-05, + "step": 1032 + }, + { + "epoch": 0.129125, + "grad_norm": 2.0374245643615723, + "grad_norm_var": 6.216800826061167, + "learning_rate": 0.0001, + "loss": 1.1426, + "loss/crossentropy": 2.4915618896484375, + "loss/hidden": 0.98046875, + "loss/logits": 0.16179411113262177, + "loss/reg": 2.904866232711356e-05, + "step": 1033 + }, + { + "epoch": 0.12925, + "grad_norm": 2.655311346054077, + "grad_norm_var": 6.239962322224316, + "learning_rate": 0.0001, + "loss": 1.4044, + "loss/crossentropy": 2.4660041332244873, + "loss/hidden": 1.2109375, + "loss/logits": 0.1931331753730774, + "loss/reg": 2.903978202084545e-05, + "step": 1034 + }, + { + "epoch": 0.129375, + "grad_norm": 2.1644914150238037, + "grad_norm_var": 6.248621668215733, + "learning_rate": 0.0001, + "loss": 1.4525, + "loss/crossentropy": 2.332583427429199, + "loss/hidden": 1.25, + "loss/logits": 0.20222735404968262, + "loss/reg": 2.9032480597379617e-05, + "step": 1035 + }, + { + "epoch": 0.1295, + "grad_norm": 2.8199288845062256, + "grad_norm_var": 6.236137340604452, + "learning_rate": 0.0001, + "loss": 1.3379, + "loss/crossentropy": 2.45756459236145, + "loss/hidden": 1.125, + "loss/logits": 0.21257054805755615, + "loss/reg": 2.902676715166308e-05, + "step": 1036 + }, + { + "epoch": 0.129625, + "grad_norm": 2.106027841567993, + "grad_norm_var": 6.2461072261308175, + "learning_rate": 0.0001, + "loss": 1.3743, + "loss/crossentropy": 2.203122854232788, + "loss/hidden": 1.1796875, + "loss/logits": 0.19433873891830444, + "loss/reg": 2.90215448330855e-05, + "step": 1037 + }, + { + "epoch": 0.12975, + "grad_norm": 2.1614882946014404, + "grad_norm_var": 6.268215781096, + "learning_rate": 0.0001, + "loss": 1.1925, + "loss/crossentropy": 2.441249132156372, + "loss/hidden": 1.015625, + "loss/logits": 0.17661580443382263, + "loss/reg": 2.9012806407990865e-05, + "step": 1038 + }, + { + "epoch": 0.129875, + "grad_norm": 2.8542771339416504, + "grad_norm_var": 6.257187130941577, + "learning_rate": 0.0001, + "loss": 1.3209, + "loss/crossentropy": 2.7331464290618896, + "loss/hidden": 1.15625, + "loss/logits": 0.16440153121948242, + "loss/reg": 2.9005204851273447e-05, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 4.274694442749023, + "grad_norm_var": 6.2279531721253525, + "learning_rate": 0.0001, + "loss": 1.8022, + "loss/crossentropy": 2.583268404006958, + "loss/hidden": 1.5234375, + "loss/logits": 0.27850720286369324, + "loss/reg": 2.899775790865533e-05, + "step": 1040 + }, + { + "epoch": 0.130125, + "grad_norm": 1.9724745750427246, + "grad_norm_var": 6.222972793319928, + "learning_rate": 0.0001, + "loss": 1.1135, + "loss/crossentropy": 2.7154476642608643, + "loss/hidden": 0.96875, + "loss/logits": 0.14446626603603363, + "loss/reg": 2.8989099519094452e-05, + "step": 1041 + }, + { + "epoch": 0.13025, + "grad_norm": 2.058563232421875, + "grad_norm_var": 6.323110622506711, + "learning_rate": 0.0001, + "loss": 1.4176, + "loss/crossentropy": 2.4074690341949463, + "loss/hidden": 1.2109375, + "loss/logits": 0.2063521146774292, + "loss/reg": 2.8983529773540795e-05, + "step": 1042 + }, + { + "epoch": 0.130375, + "grad_norm": 2.1934494972229004, + "grad_norm_var": 0.9814003371475885, + "learning_rate": 0.0001, + "loss": 1.1936, + "loss/crossentropy": 2.492440938949585, + "loss/hidden": 1.015625, + "loss/logits": 0.17771762609481812, + "loss/reg": 2.897422746173106e-05, + "step": 1043 + }, + { + "epoch": 0.1305, + "grad_norm": 2.223344564437866, + "grad_norm_var": 0.8996343585415313, + "learning_rate": 0.0001, + "loss": 1.3285, + "loss/crossentropy": 2.2236666679382324, + "loss/hidden": 1.15625, + "loss/logits": 0.17200037837028503, + "loss/reg": 2.8966707759536803e-05, + "step": 1044 + }, + { + "epoch": 0.130625, + "grad_norm": 2.1789419651031494, + "grad_norm_var": 0.8847764483424041, + "learning_rate": 0.0001, + "loss": 1.2946, + "loss/crossentropy": 2.6670303344726562, + "loss/hidden": 1.109375, + "loss/logits": 0.18490077555179596, + "loss/reg": 2.8957747417734936e-05, + "step": 1045 + }, + { + "epoch": 0.13075, + "grad_norm": 1.9736850261688232, + "grad_norm_var": 0.3291097384150286, + "learning_rate": 0.0001, + "loss": 1.1273, + "loss/crossentropy": 2.2852139472961426, + "loss/hidden": 0.98828125, + "loss/logits": 0.1387089043855667, + "loss/reg": 2.895041870942805e-05, + "step": 1046 + }, + { + "epoch": 0.130875, + "grad_norm": 2.1519150733947754, + "grad_norm_var": 0.3305550235290655, + "learning_rate": 0.0001, + "loss": 1.5386, + "loss/crossentropy": 2.289642333984375, + "loss/hidden": 1.328125, + "loss/logits": 0.2101442813873291, + "loss/reg": 2.8941822165506892e-05, + "step": 1047 + }, + { + "epoch": 0.131, + "grad_norm": 2.6084046363830566, + "grad_norm_var": 0.33342312065417823, + "learning_rate": 0.0001, + "loss": 1.1892, + "loss/crossentropy": 2.647247552871704, + "loss/hidden": 1.03125, + "loss/logits": 0.15762609243392944, + "loss/reg": 2.8930307962582447e-05, + "step": 1048 + }, + { + "epoch": 0.131125, + "grad_norm": 2.165187120437622, + "grad_norm_var": 0.32823020060403274, + "learning_rate": 0.0001, + "loss": 1.4571, + "loss/crossentropy": 2.1876065731048584, + "loss/hidden": 1.265625, + "loss/logits": 0.1911635398864746, + "loss/reg": 2.8921131161041558e-05, + "step": 1049 + }, + { + "epoch": 0.13125, + "grad_norm": 2.4668209552764893, + "grad_norm_var": 0.32428899445976805, + "learning_rate": 0.0001, + "loss": 1.377, + "loss/crossentropy": 2.7762222290039062, + "loss/hidden": 1.1796875, + "loss/logits": 0.1969912201166153, + "loss/reg": 2.8910844775964506e-05, + "step": 1050 + }, + { + "epoch": 0.131375, + "grad_norm": 1.8908017873764038, + "grad_norm_var": 0.33750479199152694, + "learning_rate": 0.0001, + "loss": 1.1263, + "loss/crossentropy": 2.7445945739746094, + "loss/hidden": 0.97265625, + "loss/logits": 0.15338945388793945, + "loss/reg": 2.890328505600337e-05, + "step": 1051 + }, + { + "epoch": 0.1315, + "grad_norm": 2.43595552444458, + "grad_norm_var": 0.32426072871282036, + "learning_rate": 0.0001, + "loss": 1.3214, + "loss/crossentropy": 2.334587812423706, + "loss/hidden": 1.15625, + "loss/logits": 0.16486158967018127, + "loss/reg": 2.8892452974105254e-05, + "step": 1052 + }, + { + "epoch": 0.131625, + "grad_norm": 2.55059552192688, + "grad_norm_var": 0.3217217708192677, + "learning_rate": 0.0001, + "loss": 1.1742, + "loss/crossentropy": 2.477536201477051, + "loss/hidden": 1.0234375, + "loss/logits": 0.15046975016593933, + "loss/reg": 2.8882859623990953e-05, + "step": 1053 + }, + { + "epoch": 0.13175, + "grad_norm": 2.001939296722412, + "grad_norm_var": 0.3280683695196079, + "learning_rate": 0.0001, + "loss": 1.0487, + "loss/crossentropy": 2.767658233642578, + "loss/hidden": 0.91796875, + "loss/logits": 0.13048313558101654, + "loss/reg": 2.8873668270534836e-05, + "step": 1054 + }, + { + "epoch": 0.131875, + "grad_norm": 5.6443190574646, + "grad_norm_var": 0.992858592273797, + "learning_rate": 0.0001, + "loss": 2.1121, + "loss/crossentropy": 2.9565234184265137, + "loss/hidden": 1.7265625, + "loss/logits": 0.38526785373687744, + "loss/reg": 2.8864071282441728e-05, + "step": 1055 + }, + { + "epoch": 0.132, + "grad_norm": 2.5285122394561768, + "grad_norm_var": 0.7817502237101511, + "learning_rate": 0.0001, + "loss": 1.3205, + "loss/crossentropy": 2.5985231399536133, + "loss/hidden": 1.1328125, + "loss/logits": 0.1874181032180786, + "loss/reg": 2.8854963602498174e-05, + "step": 1056 + }, + { + "epoch": 0.132125, + "grad_norm": 2.8304665088653564, + "grad_norm_var": 0.7742400961267412, + "learning_rate": 0.0001, + "loss": 1.8562, + "loss/crossentropy": 2.10994815826416, + "loss/hidden": 1.59375, + "loss/logits": 0.26213082671165466, + "loss/reg": 2.884282366721891e-05, + "step": 1057 + }, + { + "epoch": 0.13225, + "grad_norm": 2.0675032138824463, + "grad_norm_var": 0.7737261335807379, + "learning_rate": 0.0001, + "loss": 1.281, + "loss/crossentropy": 2.5855510234832764, + "loss/hidden": 1.125, + "loss/logits": 0.15567795932292938, + "loss/reg": 2.882570151996333e-05, + "step": 1058 + }, + { + "epoch": 0.132375, + "grad_norm": 1.884971261024475, + "grad_norm_var": 0.7920554908567773, + "learning_rate": 0.0001, + "loss": 1.2255, + "loss/crossentropy": 2.3967983722686768, + "loss/hidden": 1.0625, + "loss/logits": 0.1627260446548462, + "loss/reg": 2.881682667066343e-05, + "step": 1059 + }, + { + "epoch": 0.1325, + "grad_norm": 2.0898094177246094, + "grad_norm_var": 0.7976543572973375, + "learning_rate": 0.0001, + "loss": 1.2322, + "loss/crossentropy": 2.636255979537964, + "loss/hidden": 1.0546875, + "loss/logits": 0.17724928259849548, + "loss/reg": 2.8808293791371398e-05, + "step": 1060 + }, + { + "epoch": 0.132625, + "grad_norm": 1.6610822677612305, + "grad_norm_var": 0.8342959728528415, + "learning_rate": 0.0001, + "loss": 1.215, + "loss/crossentropy": 2.4474031925201416, + "loss/hidden": 1.0546875, + "loss/logits": 0.16007071733474731, + "loss/reg": 2.8794902391382493e-05, + "step": 1061 + }, + { + "epoch": 0.13275, + "grad_norm": 1.970723271369934, + "grad_norm_var": 0.8344784964629458, + "learning_rate": 0.0001, + "loss": 1.1225, + "loss/crossentropy": 2.422860860824585, + "loss/hidden": 0.98046875, + "loss/logits": 0.14171727001667023, + "loss/reg": 2.8774575184797868e-05, + "step": 1062 + }, + { + "epoch": 0.132875, + "grad_norm": 17.373838424682617, + "grad_norm_var": 14.743011065090014, + "learning_rate": 0.0001, + "loss": 1.4518, + "loss/crossentropy": 2.5156593322753906, + "loss/hidden": 1.25, + "loss/logits": 0.20146852731704712, + "loss/reg": 2.8766620744136162e-05, + "step": 1063 + }, + { + "epoch": 0.133, + "grad_norm": 2.5942296981811523, + "grad_norm_var": 14.744492673143116, + "learning_rate": 0.0001, + "loss": 1.4178, + "loss/crossentropy": 2.2148756980895996, + "loss/hidden": 1.2421875, + "loss/logits": 0.17528752982616425, + "loss/reg": 2.875737118301913e-05, + "step": 1064 + }, + { + "epoch": 0.133125, + "grad_norm": 2.08968186378479, + "grad_norm_var": 14.757127251553335, + "learning_rate": 0.0001, + "loss": 1.2161, + "loss/crossentropy": 2.6527700424194336, + "loss/hidden": 1.046875, + "loss/logits": 0.16889378428459167, + "loss/reg": 2.8740398192894645e-05, + "step": 1065 + }, + { + "epoch": 0.13325, + "grad_norm": 1.6698960065841675, + "grad_norm_var": 14.893859995893845, + "learning_rate": 0.0001, + "loss": 1.1578, + "loss/crossentropy": 2.4963927268981934, + "loss/hidden": 1.0, + "loss/logits": 0.15749511122703552, + "loss/reg": 2.8730215490213595e-05, + "step": 1066 + }, + { + "epoch": 0.133375, + "grad_norm": 1.7506073713302612, + "grad_norm_var": 14.921995794198581, + "learning_rate": 0.0001, + "loss": 1.1045, + "loss/crossentropy": 2.3753435611724854, + "loss/hidden": 0.96875, + "loss/logits": 0.13541364669799805, + "loss/reg": 2.8719266992993653e-05, + "step": 1067 + }, + { + "epoch": 0.1335, + "grad_norm": 2.1705684661865234, + "grad_norm_var": 14.957732916069695, + "learning_rate": 0.0001, + "loss": 1.1569, + "loss/crossentropy": 2.615133285522461, + "loss/hidden": 1.0, + "loss/logits": 0.15659929811954498, + "loss/reg": 2.8703889256576076e-05, + "step": 1068 + }, + { + "epoch": 0.133625, + "grad_norm": 2.645045280456543, + "grad_norm_var": 14.948791009266516, + "learning_rate": 0.0001, + "loss": 1.0382, + "loss/crossentropy": 2.4030604362487793, + "loss/hidden": 0.91015625, + "loss/logits": 0.12774257361888885, + "loss/reg": 2.868651608878281e-05, + "step": 1069 + }, + { + "epoch": 0.13375, + "grad_norm": 2.5146496295928955, + "grad_norm_var": 14.875743299307356, + "learning_rate": 0.0001, + "loss": 1.6545, + "loss/crossentropy": 2.2613282203674316, + "loss/hidden": 1.359375, + "loss/logits": 0.29488736391067505, + "loss/reg": 2.8676033252850175e-05, + "step": 1070 + }, + { + "epoch": 0.133875, + "grad_norm": 1.915973424911499, + "grad_norm_var": 14.600448201499113, + "learning_rate": 0.0001, + "loss": 1.183, + "loss/crossentropy": 2.5148019790649414, + "loss/hidden": 1.03125, + "loss/logits": 0.1514246165752411, + "loss/reg": 2.866520117095206e-05, + "step": 1071 + }, + { + "epoch": 0.134, + "grad_norm": 12.971419334411621, + "grad_norm_var": 20.60689861698786, + "learning_rate": 0.0001, + "loss": 1.3227, + "loss/crossentropy": 2.654324531555176, + "loss/hidden": 1.125, + "loss/logits": 0.19739076495170593, + "loss/reg": 2.8656122594838962e-05, + "step": 1072 + }, + { + "epoch": 0.134125, + "grad_norm": 2.641289472579956, + "grad_norm_var": 20.632645342161023, + "learning_rate": 0.0001, + "loss": 1.4723, + "loss/crossentropy": 2.3359360694885254, + "loss/hidden": 1.2578125, + "loss/logits": 0.21416810154914856, + "loss/reg": 2.8642458346439525e-05, + "step": 1073 + }, + { + "epoch": 0.13425, + "grad_norm": 1.900654911994934, + "grad_norm_var": 20.67183049905315, + "learning_rate": 0.0001, + "loss": 1.1773, + "loss/crossentropy": 2.5847532749176025, + "loss/hidden": 1.03125, + "loss/logits": 0.1457844078540802, + "loss/reg": 2.8628437576116994e-05, + "step": 1074 + }, + { + "epoch": 0.134375, + "grad_norm": 2.386420249938965, + "grad_norm_var": 20.56350066663325, + "learning_rate": 0.0001, + "loss": 1.5607, + "loss/crossentropy": 2.190727472305298, + "loss/hidden": 1.3359375, + "loss/logits": 0.22446081042289734, + "loss/reg": 2.861370194295887e-05, + "step": 1075 + }, + { + "epoch": 0.1345, + "grad_norm": 2.2469475269317627, + "grad_norm_var": 20.529807109867, + "learning_rate": 0.0001, + "loss": 1.1523, + "loss/crossentropy": 2.511223554611206, + "loss/hidden": 1.0, + "loss/logits": 0.1519821584224701, + "loss/reg": 2.8598515200428665e-05, + "step": 1076 + }, + { + "epoch": 0.134625, + "grad_norm": 6.970561504364014, + "grad_norm_var": 20.79065223929303, + "learning_rate": 0.0001, + "loss": 2.5131, + "loss/crossentropy": 3.561710834503174, + "loss/hidden": 1.6953125, + "loss/logits": 0.817465603351593, + "loss/reg": 2.858954576367978e-05, + "step": 1077 + }, + { + "epoch": 0.13475, + "grad_norm": 5.319186687469482, + "grad_norm_var": 20.53484474495194, + "learning_rate": 0.0001, + "loss": 1.9724, + "loss/crossentropy": 1.6715443134307861, + "loss/hidden": 1.765625, + "loss/logits": 0.20650218427181244, + "loss/reg": 2.8575825126608834e-05, + "step": 1078 + }, + { + "epoch": 0.134875, + "grad_norm": 2.3126540184020996, + "grad_norm_var": 8.503272568569564, + "learning_rate": 0.0001, + "loss": 1.2038, + "loss/crossentropy": 2.4389309883117676, + "loss/hidden": 1.046875, + "loss/logits": 0.1566343903541565, + "loss/reg": 2.856591163435951e-05, + "step": 1079 + }, + { + "epoch": 0.135, + "grad_norm": 2.344682216644287, + "grad_norm_var": 8.533350768554783, + "learning_rate": 0.0001, + "loss": 1.1453, + "loss/crossentropy": 2.7388672828674316, + "loss/hidden": 1.0, + "loss/logits": 0.14501038193702698, + "loss/reg": 2.8555594326462597e-05, + "step": 1080 + }, + { + "epoch": 0.135125, + "grad_norm": 2.275749444961548, + "grad_norm_var": 8.503859334045302, + "learning_rate": 0.0001, + "loss": 1.1832, + "loss/crossentropy": 2.4810714721679688, + "loss/hidden": 1.03125, + "loss/logits": 0.1516847312450409, + "loss/reg": 2.8541464416775852e-05, + "step": 1081 + }, + { + "epoch": 0.13525, + "grad_norm": 2.24882173538208, + "grad_norm_var": 8.393014226777234, + "learning_rate": 0.0001, + "loss": 1.138, + "loss/crossentropy": 2.4267804622650146, + "loss/hidden": 0.98828125, + "loss/logits": 0.14944884181022644, + "loss/reg": 2.8530525014502928e-05, + "step": 1082 + }, + { + "epoch": 0.135375, + "grad_norm": 1.8094111680984497, + "grad_norm_var": 8.380192801359197, + "learning_rate": 0.0001, + "loss": 1.0467, + "loss/crossentropy": 2.8205809593200684, + "loss/hidden": 0.921875, + "loss/logits": 0.12454194575548172, + "loss/reg": 2.8521597414510325e-05, + "step": 1083 + }, + { + "epoch": 0.1355, + "grad_norm": 1.8599380254745483, + "grad_norm_var": 8.437852717294973, + "learning_rate": 0.0001, + "loss": 1.2943, + "loss/crossentropy": 2.7064578533172607, + "loss/hidden": 1.1171875, + "loss/logits": 0.17686372995376587, + "loss/reg": 2.851022145478055e-05, + "step": 1084 + }, + { + "epoch": 0.135625, + "grad_norm": 3.0408518314361572, + "grad_norm_var": 8.407922713136788, + "learning_rate": 0.0001, + "loss": 1.4152, + "loss/crossentropy": 2.336047410964966, + "loss/hidden": 1.234375, + "loss/logits": 0.18051210045814514, + "loss/reg": 2.850222881534137e-05, + "step": 1085 + }, + { + "epoch": 0.13575, + "grad_norm": 2.252040386199951, + "grad_norm_var": 8.444019199318532, + "learning_rate": 0.0001, + "loss": 1.2386, + "loss/crossentropy": 2.657888650894165, + "loss/hidden": 1.0703125, + "loss/logits": 0.1680040955543518, + "loss/reg": 2.8494112484622747e-05, + "step": 1086 + }, + { + "epoch": 0.135875, + "grad_norm": 2.2364859580993652, + "grad_norm_var": 8.38676181704219, + "learning_rate": 0.0001, + "loss": 1.3235, + "loss/crossentropy": 2.3576953411102295, + "loss/hidden": 1.140625, + "loss/logits": 0.18254104256629944, + "loss/reg": 2.8489477699622512e-05, + "step": 1087 + }, + { + "epoch": 0.136, + "grad_norm": 2.642712354660034, + "grad_norm_var": 1.908915910764065, + "learning_rate": 0.0001, + "loss": 1.2375, + "loss/crossentropy": 2.679563283920288, + "loss/hidden": 1.0625, + "loss/logits": 0.1747627705335617, + "loss/reg": 2.8480613764259033e-05, + "step": 1088 + }, + { + "epoch": 0.136125, + "grad_norm": 2.25370454788208, + "grad_norm_var": 1.9255002267682082, + "learning_rate": 0.0001, + "loss": 1.1938, + "loss/crossentropy": 2.618626594543457, + "loss/hidden": 1.0546875, + "loss/logits": 0.13881683349609375, + "loss/reg": 2.8473928978201002e-05, + "step": 1089 + }, + { + "epoch": 0.13625, + "grad_norm": 3.0173609256744385, + "grad_norm_var": 1.876039001435261, + "learning_rate": 0.0001, + "loss": 1.3079, + "loss/crossentropy": 2.394578456878662, + "loss/hidden": 1.1484375, + "loss/logits": 0.15916720032691956, + "loss/reg": 2.8467378797358833e-05, + "step": 1090 + }, + { + "epoch": 0.136375, + "grad_norm": 2.1959891319274902, + "grad_norm_var": 1.8894692162849214, + "learning_rate": 0.0001, + "loss": 1.1257, + "loss/crossentropy": 2.6383235454559326, + "loss/hidden": 0.984375, + "loss/logits": 0.14102642238140106, + "loss/reg": 2.845912240445614e-05, + "step": 1091 + }, + { + "epoch": 0.1365, + "grad_norm": 2.0317325592041016, + "grad_norm_var": 1.9086413713940535, + "learning_rate": 0.0001, + "loss": 1.2671, + "loss/crossentropy": 2.5728659629821777, + "loss/hidden": 1.109375, + "loss/logits": 0.15746140480041504, + "loss/reg": 2.8449610908864997e-05, + "step": 1092 + }, + { + "epoch": 0.136625, + "grad_norm": 2.4898619651794434, + "grad_norm_var": 0.6722724249379259, + "learning_rate": 0.0001, + "loss": 1.2773, + "loss/crossentropy": 2.423884391784668, + "loss/hidden": 1.109375, + "loss/logits": 0.1676451712846756, + "loss/reg": 2.844280425051693e-05, + "step": 1093 + }, + { + "epoch": 0.13675, + "grad_norm": 2.463630199432373, + "grad_norm_var": 0.1164114556149201, + "learning_rate": 0.0001, + "loss": 1.2804, + "loss/crossentropy": 2.501183032989502, + "loss/hidden": 1.125, + "loss/logits": 0.15511515736579895, + "loss/reg": 2.8434869818738662e-05, + "step": 1094 + }, + { + "epoch": 0.136875, + "grad_norm": 1.963381052017212, + "grad_norm_var": 0.12541312056514825, + "learning_rate": 0.0001, + "loss": 1.2281, + "loss/crossentropy": 2.466003656387329, + "loss/hidden": 1.0703125, + "loss/logits": 0.15753915905952454, + "loss/reg": 2.8423899493645877e-05, + "step": 1095 + }, + { + "epoch": 0.137, + "grad_norm": 1.9231977462768555, + "grad_norm_var": 0.13515141937423678, + "learning_rate": 0.0001, + "loss": 1.1614, + "loss/crossentropy": 2.5952444076538086, + "loss/hidden": 1.0078125, + "loss/logits": 0.15328103303909302, + "loss/reg": 2.8416661734809168e-05, + "step": 1096 + }, + { + "epoch": 0.137125, + "grad_norm": 1.9311559200286865, + "grad_norm_var": 0.1434139948987628, + "learning_rate": 0.0001, + "loss": 1.2166, + "loss/crossentropy": 2.67840313911438, + "loss/hidden": 1.046875, + "loss/logits": 0.16948139667510986, + "loss/reg": 2.8408794605638832e-05, + "step": 1097 + }, + { + "epoch": 0.13725, + "grad_norm": 2.152937889099121, + "grad_norm_var": 0.1442915371142739, + "learning_rate": 0.0001, + "loss": 1.257, + "loss/crossentropy": 2.251335382461548, + "loss/hidden": 1.1015625, + "loss/logits": 0.15517005324363708, + "loss/reg": 2.8401800591382198e-05, + "step": 1098 + }, + { + "epoch": 0.137375, + "grad_norm": 1.897124171257019, + "grad_norm_var": 0.1394264144616623, + "learning_rate": 0.0001, + "loss": 1.1333, + "loss/crossentropy": 2.777588129043579, + "loss/hidden": 0.9921875, + "loss/logits": 0.14081111550331116, + "loss/reg": 2.83905155811226e-05, + "step": 1099 + }, + { + "epoch": 0.1375, + "grad_norm": 1.7621134519577026, + "grad_norm_var": 0.14539924098917822, + "learning_rate": 0.0001, + "loss": 1.1384, + "loss/crossentropy": 2.386300802230835, + "loss/hidden": 0.99609375, + "loss/logits": 0.14199820160865784, + "loss/reg": 2.838683758454863e-05, + "step": 1100 + }, + { + "epoch": 0.137625, + "grad_norm": 2.3598530292510986, + "grad_norm_var": 0.1040180185112168, + "learning_rate": 0.0001, + "loss": 1.3581, + "loss/crossentropy": 2.3626723289489746, + "loss/hidden": 1.1640625, + "loss/logits": 0.19375786185264587, + "loss/reg": 2.8378843126120046e-05, + "step": 1101 + }, + { + "epoch": 0.13775, + "grad_norm": 2.344046115875244, + "grad_norm_var": 0.10489928608777177, + "learning_rate": 0.0001, + "loss": 1.2875, + "loss/crossentropy": 2.378030300140381, + "loss/hidden": 1.1171875, + "loss/logits": 0.1700662076473236, + "loss/reg": 2.8371290682116523e-05, + "step": 1102 + }, + { + "epoch": 0.137875, + "grad_norm": 2.064368486404419, + "grad_norm_var": 0.1065808633529644, + "learning_rate": 0.0001, + "loss": 1.1376, + "loss/crossentropy": 2.352322578430176, + "loss/hidden": 0.98828125, + "loss/logits": 0.14905816316604614, + "loss/reg": 2.8365529942675494e-05, + "step": 1103 + }, + { + "epoch": 0.138, + "grad_norm": 2.194638729095459, + "grad_norm_var": 0.09377463559072889, + "learning_rate": 0.0001, + "loss": 1.2208, + "loss/crossentropy": 2.9058327674865723, + "loss/hidden": 1.046875, + "loss/logits": 0.17361563444137573, + "loss/reg": 2.836092608049512e-05, + "step": 1104 + }, + { + "epoch": 0.138125, + "grad_norm": 1.9892427921295166, + "grad_norm_var": 0.09591079527989615, + "learning_rate": 0.0001, + "loss": 1.0988, + "loss/crossentropy": 2.432803153991699, + "loss/hidden": 0.9609375, + "loss/logits": 0.1375415325164795, + "loss/reg": 2.835432496794965e-05, + "step": 1105 + }, + { + "epoch": 0.13825, + "grad_norm": 1.8463401794433594, + "grad_norm_var": 0.049904463609351524, + "learning_rate": 0.0001, + "loss": 1.2267, + "loss/crossentropy": 2.5338962078094482, + "loss/hidden": 1.0546875, + "loss/logits": 0.1717638075351715, + "loss/reg": 2.8345293685561046e-05, + "step": 1106 + }, + { + "epoch": 0.138375, + "grad_norm": 2.3203155994415283, + "grad_norm_var": 0.052451769122999536, + "learning_rate": 0.0001, + "loss": 1.4182, + "loss/crossentropy": 2.4577367305755615, + "loss/hidden": 1.234375, + "loss/logits": 0.18351434171199799, + "loss/reg": 2.8336622563074343e-05, + "step": 1107 + }, + { + "epoch": 0.1385, + "grad_norm": 2.083280563354492, + "grad_norm_var": 0.052091101094176175, + "learning_rate": 0.0001, + "loss": 1.2894, + "loss/crossentropy": 2.256762742996216, + "loss/hidden": 1.125, + "loss/logits": 0.16409918665885925, + "loss/reg": 2.8329204724286683e-05, + "step": 1108 + }, + { + "epoch": 0.138625, + "grad_norm": 2.6403331756591797, + "grad_norm_var": 0.06109534551608415, + "learning_rate": 0.0001, + "loss": 1.2767, + "loss/crossentropy": 2.672240734100342, + "loss/hidden": 1.1171875, + "loss/logits": 0.15924429893493652, + "loss/reg": 2.832092286553234e-05, + "step": 1109 + }, + { + "epoch": 0.13875, + "grad_norm": 2.471147060394287, + "grad_norm_var": 0.06144228000402047, + "learning_rate": 0.0001, + "loss": 1.3533, + "loss/crossentropy": 2.2716329097747803, + "loss/hidden": 1.1875, + "loss/logits": 0.16549324989318848, + "loss/reg": 2.8314194423728622e-05, + "step": 1110 + }, + { + "epoch": 0.138875, + "grad_norm": 3.3604722023010254, + "grad_norm_var": 0.15398565016317983, + "learning_rate": 0.0001, + "loss": 1.6854, + "loss/crossentropy": 2.1522159576416016, + "loss/hidden": 1.4765625, + "loss/logits": 0.2085460126399994, + "loss/reg": 2.8307435059105046e-05, + "step": 1111 + }, + { + "epoch": 0.139, + "grad_norm": 1.767682433128357, + "grad_norm_var": 0.16141898149485687, + "learning_rate": 0.0001, + "loss": 1.1712, + "loss/crossentropy": 2.491121530532837, + "loss/hidden": 1.0234375, + "loss/logits": 0.14750945568084717, + "loss/reg": 2.8299278710619546e-05, + "step": 1112 + }, + { + "epoch": 0.139125, + "grad_norm": 1.8311489820480347, + "grad_norm_var": 0.16561644695500868, + "learning_rate": 0.0001, + "loss": 1.1949, + "loss/crossentropy": 2.4164485931396484, + "loss/hidden": 1.0390625, + "loss/logits": 0.15554410219192505, + "loss/reg": 2.8293772629695013e-05, + "step": 1113 + }, + { + "epoch": 0.13925, + "grad_norm": 2.8394663333892822, + "grad_norm_var": 0.19142376457574314, + "learning_rate": 0.0001, + "loss": 1.2882, + "loss/crossentropy": 2.917569160461426, + "loss/hidden": 1.140625, + "loss/logits": 0.14732028543949127, + "loss/reg": 2.8290531190577894e-05, + "step": 1114 + }, + { + "epoch": 0.139375, + "grad_norm": 2.1642987728118896, + "grad_norm_var": 0.18382314354217985, + "learning_rate": 0.0001, + "loss": 1.2788, + "loss/crossentropy": 2.5852484703063965, + "loss/hidden": 1.1171875, + "loss/logits": 0.16129213571548462, + "loss/reg": 2.8288275643717498e-05, + "step": 1115 + }, + { + "epoch": 0.1395, + "grad_norm": 2.0506513118743896, + "grad_norm_var": 0.1701635238688501, + "learning_rate": 0.0001, + "loss": 1.2535, + "loss/crossentropy": 2.2902791500091553, + "loss/hidden": 1.1015625, + "loss/logits": 0.1516706645488739, + "loss/reg": 2.8281245249672793e-05, + "step": 1116 + }, + { + "epoch": 0.139625, + "grad_norm": 2.528313398361206, + "grad_norm_var": 0.17394520010507694, + "learning_rate": 0.0001, + "loss": 1.4604, + "loss/crossentropy": 2.606721878051758, + "loss/hidden": 1.25, + "loss/logits": 0.21016553044319153, + "loss/reg": 2.827922253345605e-05, + "step": 1117 + }, + { + "epoch": 0.13975, + "grad_norm": 1.9809324741363525, + "grad_norm_var": 0.17913276442851966, + "learning_rate": 0.0001, + "loss": 1.1, + "loss/crossentropy": 2.6824090480804443, + "loss/hidden": 0.9609375, + "loss/logits": 0.13875439763069153, + "loss/reg": 2.827651405823417e-05, + "step": 1118 + }, + { + "epoch": 0.139875, + "grad_norm": 2.0517003536224365, + "grad_norm_var": 0.17947034353318803, + "learning_rate": 0.0001, + "loss": 1.2744, + "loss/crossentropy": 2.788567066192627, + "loss/hidden": 1.1015625, + "loss/logits": 0.17254550755023956, + "loss/reg": 2.8268350433791056e-05, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 2.382298469543457, + "grad_norm_var": 0.1800985397652082, + "learning_rate": 0.0001, + "loss": 1.3013, + "loss/crossentropy": 2.7195708751678467, + "loss/hidden": 1.1484375, + "loss/logits": 0.15262669324874878, + "loss/reg": 2.8258880774956197e-05, + "step": 1120 + }, + { + "epoch": 0.140125, + "grad_norm": 2.144865036010742, + "grad_norm_var": 0.17580262325861326, + "learning_rate": 0.0001, + "loss": 1.1674, + "loss/crossentropy": 2.600773811340332, + "loss/hidden": 1.015625, + "loss/logits": 0.15153665840625763, + "loss/reg": 2.825040792231448e-05, + "step": 1121 + }, + { + "epoch": 0.14025, + "grad_norm": 2.0462546348571777, + "grad_norm_var": 0.1667690803340643, + "learning_rate": 0.0001, + "loss": 1.2006, + "loss/crossentropy": 2.3139140605926514, + "loss/hidden": 1.0390625, + "loss/logits": 0.16129767894744873, + "loss/reg": 2.8240427127457224e-05, + "step": 1122 + }, + { + "epoch": 0.140375, + "grad_norm": 1.8142099380493164, + "grad_norm_var": 0.1808299763514031, + "learning_rate": 0.0001, + "loss": 1.1118, + "loss/crossentropy": 2.491962432861328, + "loss/hidden": 0.97265625, + "loss/logits": 0.13889466226100922, + "loss/reg": 2.823096110660117e-05, + "step": 1123 + }, + { + "epoch": 0.1405, + "grad_norm": 1.8904823064804077, + "grad_norm_var": 0.18769126955470242, + "learning_rate": 0.0001, + "loss": 1.2568, + "loss/crossentropy": 3.042818784713745, + "loss/hidden": 1.1015625, + "loss/logits": 0.1549292802810669, + "loss/reg": 2.822266105795279e-05, + "step": 1124 + }, + { + "epoch": 0.140625, + "grad_norm": 2.4015777111053467, + "grad_norm_var": 0.1787570242660322, + "learning_rate": 0.0001, + "loss": 1.3311, + "loss/crossentropy": 2.7175662517547607, + "loss/hidden": 1.1484375, + "loss/logits": 0.18239489197731018, + "loss/reg": 2.8215783459017985e-05, + "step": 1125 + }, + { + "epoch": 0.14075, + "grad_norm": 2.5804173946380615, + "grad_norm_var": 0.18297520433008424, + "learning_rate": 0.0001, + "loss": 1.1927, + "loss/crossentropy": 2.6656172275543213, + "loss/hidden": 1.03125, + "loss/logits": 0.16119658946990967, + "loss/reg": 2.820966619765386e-05, + "step": 1126 + }, + { + "epoch": 0.140875, + "grad_norm": 2.679173707962036, + "grad_norm_var": 0.11017252850212879, + "learning_rate": 0.0001, + "loss": 1.0596, + "loss/crossentropy": 2.744784355163574, + "loss/hidden": 0.91015625, + "loss/logits": 0.14915892481803894, + "loss/reg": 2.820882582454942e-05, + "step": 1127 + }, + { + "epoch": 0.141, + "grad_norm": 2.318621873855591, + "grad_norm_var": 0.09759959222030616, + "learning_rate": 0.0001, + "loss": 1.3248, + "loss/crossentropy": 2.3725292682647705, + "loss/hidden": 1.15625, + "loss/logits": 0.16826823353767395, + "loss/reg": 2.819988367264159e-05, + "step": 1128 + }, + { + "epoch": 0.141125, + "grad_norm": 2.600517749786377, + "grad_norm_var": 0.09352345579319579, + "learning_rate": 0.0001, + "loss": 1.5187, + "loss/crossentropy": 2.3792738914489746, + "loss/hidden": 1.296875, + "loss/logits": 0.22157782316207886, + "loss/reg": 2.8191148885525763e-05, + "step": 1129 + }, + { + "epoch": 0.14125, + "grad_norm": 2.189312696456909, + "grad_norm_var": 0.07140995573180403, + "learning_rate": 0.0001, + "loss": 1.1643, + "loss/crossentropy": 2.3591907024383545, + "loss/hidden": 1.015625, + "loss/logits": 0.1483803540468216, + "loss/reg": 2.8188154828967527e-05, + "step": 1130 + }, + { + "epoch": 0.141375, + "grad_norm": 2.165137529373169, + "grad_norm_var": 0.07140164815008436, + "learning_rate": 0.0001, + "loss": 1.0711, + "loss/crossentropy": 2.528093099594116, + "loss/hidden": 0.921875, + "loss/logits": 0.14893028140068054, + "loss/reg": 2.817946187860798e-05, + "step": 1131 + }, + { + "epoch": 0.1415, + "grad_norm": 1.9415894746780396, + "grad_norm_var": 0.07488436467176385, + "learning_rate": 0.0001, + "loss": 1.1059, + "loss/crossentropy": 2.453845739364624, + "loss/hidden": 0.96484375, + "loss/logits": 0.14074820280075073, + "loss/reg": 2.817391577991657e-05, + "step": 1132 + }, + { + "epoch": 0.141625, + "grad_norm": 2.169292688369751, + "grad_norm_var": 0.06876619013293232, + "learning_rate": 0.0001, + "loss": 1.3332, + "loss/crossentropy": 2.592689037322998, + "loss/hidden": 1.1484375, + "loss/logits": 0.18449340760707855, + "loss/reg": 2.8168065909994766e-05, + "step": 1133 + }, + { + "epoch": 0.14175, + "grad_norm": 2.271085739135742, + "grad_norm_var": 0.06517478191027237, + "learning_rate": 0.0001, + "loss": 1.3698, + "loss/crossentropy": 2.3752005100250244, + "loss/hidden": 1.1875, + "loss/logits": 0.1820026934146881, + "loss/reg": 2.8164591640233994e-05, + "step": 1134 + }, + { + "epoch": 0.141875, + "grad_norm": 1.7783443927764893, + "grad_norm_var": 0.07626734208528395, + "learning_rate": 0.0001, + "loss": 1.1879, + "loss/crossentropy": 2.4869723320007324, + "loss/hidden": 1.0234375, + "loss/logits": 0.16420593857765198, + "loss/reg": 2.8161663067294285e-05, + "step": 1135 + }, + { + "epoch": 0.142, + "grad_norm": 2.0181262493133545, + "grad_norm_var": 0.07623000679149428, + "learning_rate": 0.0001, + "loss": 1.2679, + "loss/crossentropy": 2.268914222717285, + "loss/hidden": 1.109375, + "loss/logits": 0.1581989973783493, + "loss/reg": 2.8158268833067268e-05, + "step": 1136 + }, + { + "epoch": 0.142125, + "grad_norm": 2.3905601501464844, + "grad_norm_var": 0.07858774790285732, + "learning_rate": 0.0001, + "loss": 1.5254, + "loss/crossentropy": 2.212190628051758, + "loss/hidden": 1.34375, + "loss/logits": 0.18133576214313507, + "loss/reg": 2.815179141180124e-05, + "step": 1137 + }, + { + "epoch": 0.14225, + "grad_norm": 1.7961739301681519, + "grad_norm_var": 0.08773702418359992, + "learning_rate": 0.0001, + "loss": 1.0594, + "loss/crossentropy": 2.431272506713867, + "loss/hidden": 0.9375, + "loss/logits": 0.1216229721903801, + "loss/reg": 2.8146629119873978e-05, + "step": 1138 + }, + { + "epoch": 0.142375, + "grad_norm": 2.0319154262542725, + "grad_norm_var": 0.07985522673142587, + "learning_rate": 0.0001, + "loss": 1.17, + "loss/crossentropy": 2.8600165843963623, + "loss/hidden": 1.03125, + "loss/logits": 0.13844284415245056, + "loss/reg": 2.8138236302766018e-05, + "step": 1139 + }, + { + "epoch": 0.1425, + "grad_norm": 1.6292545795440674, + "grad_norm_var": 0.09494944386115639, + "learning_rate": 0.0001, + "loss": 1.0668, + "loss/crossentropy": 2.5235483646392822, + "loss/hidden": 0.921875, + "loss/logits": 0.1446669101715088, + "loss/reg": 2.813064020301681e-05, + "step": 1140 + }, + { + "epoch": 0.142625, + "grad_norm": 4.410617828369141, + "grad_norm_var": 0.40521125106713307, + "learning_rate": 0.0001, + "loss": 1.6405, + "loss/crossentropy": 2.308990001678467, + "loss/hidden": 1.40625, + "loss/logits": 0.23399674892425537, + "loss/reg": 2.812209822877776e-05, + "step": 1141 + }, + { + "epoch": 0.14275, + "grad_norm": 1.9667885303497314, + "grad_norm_var": 0.40667209469611504, + "learning_rate": 0.0001, + "loss": 1.1949, + "loss/crossentropy": 2.6161997318267822, + "loss/hidden": 1.0390625, + "loss/logits": 0.15554136037826538, + "loss/reg": 2.811234116961714e-05, + "step": 1142 + }, + { + "epoch": 0.142875, + "grad_norm": 1.7699799537658691, + "grad_norm_var": 0.4090108924020927, + "learning_rate": 0.0001, + "loss": 1.1786, + "loss/crossentropy": 2.3731281757354736, + "loss/hidden": 1.0390625, + "loss/logits": 0.13922549784183502, + "loss/reg": 2.8100412237108685e-05, + "step": 1143 + }, + { + "epoch": 0.143, + "grad_norm": 1.8722507953643799, + "grad_norm_var": 0.41532388843268275, + "learning_rate": 0.0001, + "loss": 1.1185, + "loss/crossentropy": 2.1923420429229736, + "loss/hidden": 0.98046875, + "loss/logits": 0.13772645592689514, + "loss/reg": 2.8088381441193633e-05, + "step": 1144 + }, + { + "epoch": 0.143125, + "grad_norm": 2.238243341445923, + "grad_norm_var": 0.4035793197618972, + "learning_rate": 0.0001, + "loss": 1.4408, + "loss/crossentropy": 2.4919800758361816, + "loss/hidden": 1.2265625, + "loss/logits": 0.2139793038368225, + "loss/reg": 2.8076270609744824e-05, + "step": 1145 + }, + { + "epoch": 0.14325, + "grad_norm": 2.429471492767334, + "grad_norm_var": 0.40796526142730893, + "learning_rate": 0.0001, + "loss": 1.2013, + "loss/crossentropy": 2.4356842041015625, + "loss/hidden": 1.03125, + "loss/logits": 0.16974246501922607, + "loss/reg": 2.8063353965990245e-05, + "step": 1146 + }, + { + "epoch": 0.143375, + "grad_norm": 2.0629639625549316, + "grad_norm_var": 0.40881920543581696, + "learning_rate": 0.0001, + "loss": 1.2628, + "loss/crossentropy": 2.5349087715148926, + "loss/hidden": 1.0859375, + "loss/logits": 0.17658013105392456, + "loss/reg": 2.8049853426637128e-05, + "step": 1147 + }, + { + "epoch": 0.1435, + "grad_norm": 1.706183910369873, + "grad_norm_var": 0.4195630539352104, + "learning_rate": 0.0001, + "loss": 1.0592, + "loss/crossentropy": 2.3464481830596924, + "loss/hidden": 0.91796875, + "loss/logits": 0.14094504714012146, + "loss/reg": 2.804088762786705e-05, + "step": 1148 + }, + { + "epoch": 0.143625, + "grad_norm": 1.6815345287322998, + "grad_norm_var": 0.433751760580607, + "learning_rate": 0.0001, + "loss": 1.1477, + "loss/crossentropy": 2.368020534515381, + "loss/hidden": 0.99609375, + "loss/logits": 0.1513562798500061, + "loss/reg": 2.8033473427058198e-05, + "step": 1149 + }, + { + "epoch": 0.14375, + "grad_norm": 1.9398962259292603, + "grad_norm_var": 0.43430386546040345, + "learning_rate": 0.0001, + "loss": 0.9971, + "loss/crossentropy": 2.5943963527679443, + "loss/hidden": 0.87109375, + "loss/logits": 0.12571600079536438, + "loss/reg": 2.802498420351185e-05, + "step": 1150 + }, + { + "epoch": 0.143875, + "grad_norm": 1.7635091543197632, + "grad_norm_var": 0.43496898598252276, + "learning_rate": 0.0001, + "loss": 1.2479, + "loss/crossentropy": 1.9986343383789062, + "loss/hidden": 1.1015625, + "loss/logits": 0.14603732526302338, + "loss/reg": 2.8015836505801417e-05, + "step": 1151 + }, + { + "epoch": 0.144, + "grad_norm": 2.449038028717041, + "grad_norm_var": 0.44148432998396075, + "learning_rate": 0.0001, + "loss": 1.1202, + "loss/crossentropy": 2.3182108402252197, + "loss/hidden": 0.96875, + "loss/logits": 0.1511959433555603, + "loss/reg": 2.800196125463117e-05, + "step": 1152 + }, + { + "epoch": 0.144125, + "grad_norm": 2.4035236835479736, + "grad_norm_var": 0.4419388970458101, + "learning_rate": 0.0001, + "loss": 1.3259, + "loss/crossentropy": 2.5049266815185547, + "loss/hidden": 1.1328125, + "loss/logits": 0.19285129010677338, + "loss/reg": 2.7993848561891355e-05, + "step": 1153 + }, + { + "epoch": 0.14425, + "grad_norm": 2.36033296585083, + "grad_norm_var": 0.436384893686761, + "learning_rate": 0.0001, + "loss": 1.1683, + "loss/crossentropy": 2.759612560272217, + "loss/hidden": 1.0078125, + "loss/logits": 0.16024839878082275, + "loss/reg": 2.7986376153421588e-05, + "step": 1154 + }, + { + "epoch": 0.144375, + "grad_norm": 2.450425148010254, + "grad_norm_var": 0.4396421734237966, + "learning_rate": 0.0001, + "loss": 1.5243, + "loss/crossentropy": 2.2864620685577393, + "loss/hidden": 1.3046875, + "loss/logits": 0.21933995187282562, + "loss/reg": 2.7975396733381785e-05, + "step": 1155 + }, + { + "epoch": 0.1445, + "grad_norm": 2.2934749126434326, + "grad_norm_var": 0.41703494530644797, + "learning_rate": 0.0001, + "loss": 1.151, + "loss/crossentropy": 2.744310140609741, + "loss/hidden": 0.99609375, + "loss/logits": 0.15459004044532776, + "loss/reg": 2.796579065034166e-05, + "step": 1156 + }, + { + "epoch": 0.144625, + "grad_norm": 2.370851755142212, + "grad_norm_var": 0.08602501745009643, + "learning_rate": 0.0001, + "loss": 1.3772, + "loss/crossentropy": 2.3964056968688965, + "loss/hidden": 1.1953125, + "loss/logits": 0.1816563755273819, + "loss/reg": 2.795742875605356e-05, + "step": 1157 + }, + { + "epoch": 0.14475, + "grad_norm": 2.4725124835968018, + "grad_norm_var": 0.09235953761226649, + "learning_rate": 0.0001, + "loss": 1.3472, + "loss/crossentropy": 2.6245546340942383, + "loss/hidden": 1.15625, + "loss/logits": 0.1907157599925995, + "loss/reg": 2.794588181131985e-05, + "step": 1158 + }, + { + "epoch": 0.144875, + "grad_norm": 1.9545862674713135, + "grad_norm_var": 0.08534455041238687, + "learning_rate": 0.0001, + "loss": 1.1268, + "loss/crossentropy": 2.918393135070801, + "loss/hidden": 0.98828125, + "loss/logits": 0.1382652223110199, + "loss/reg": 2.7934829631703906e-05, + "step": 1159 + }, + { + "epoch": 0.145, + "grad_norm": 7.729360580444336, + "grad_norm_var": 2.0101634864291533, + "learning_rate": 0.0001, + "loss": 1.4562, + "loss/crossentropy": 2.3912408351898193, + "loss/hidden": 1.265625, + "loss/logits": 0.19027158617973328, + "loss/reg": 2.7923553716391325e-05, + "step": 1160 + }, + { + "epoch": 0.145125, + "grad_norm": 2.148836135864258, + "grad_norm_var": 2.014011400463621, + "learning_rate": 0.0001, + "loss": 1.2079, + "loss/crossentropy": 2.58884596824646, + "loss/hidden": 1.046875, + "loss/logits": 0.16077430546283722, + "loss/reg": 2.791491715470329e-05, + "step": 1161 + }, + { + "epoch": 0.14525, + "grad_norm": 2.8709280490875244, + "grad_norm_var": 2.0212438083178994, + "learning_rate": 0.0001, + "loss": 1.5555, + "loss/crossentropy": 2.849884033203125, + "loss/hidden": 1.34375, + "loss/logits": 0.21148604154586792, + "loss/reg": 2.7907261028303765e-05, + "step": 1162 + }, + { + "epoch": 0.145375, + "grad_norm": 2.402651786804199, + "grad_norm_var": 2.006798935424683, + "learning_rate": 0.0001, + "loss": 1.3341, + "loss/crossentropy": 2.667253255844116, + "loss/hidden": 1.140625, + "loss/logits": 0.1932215392589569, + "loss/reg": 2.7902357032871805e-05, + "step": 1163 + }, + { + "epoch": 0.1455, + "grad_norm": 6.189431667327881, + "grad_norm_var": 2.7512293408590263, + "learning_rate": 0.0001, + "loss": 1.6157, + "loss/crossentropy": 2.7028310298919678, + "loss/hidden": 1.4140625, + "loss/logits": 0.20140285789966583, + "loss/reg": 2.7897443942492828e-05, + "step": 1164 + }, + { + "epoch": 0.145625, + "grad_norm": 3.0593209266662598, + "grad_norm_var": 2.656587552134933, + "learning_rate": 0.0001, + "loss": 1.3056, + "loss/crossentropy": 2.7540316581726074, + "loss/hidden": 1.1171875, + "loss/logits": 0.18814054131507874, + "loss/reg": 2.7892532671103254e-05, + "step": 1165 + }, + { + "epoch": 0.14575, + "grad_norm": 2.7008728981018066, + "grad_norm_var": 2.5924561472345826, + "learning_rate": 0.0001, + "loss": 1.3922, + "loss/crossentropy": 2.353156328201294, + "loss/hidden": 1.2109375, + "loss/logits": 0.18103182315826416, + "loss/reg": 2.7884898372576572e-05, + "step": 1166 + }, + { + "epoch": 0.145875, + "grad_norm": 2.3122024536132812, + "grad_norm_var": 2.5225512023962504, + "learning_rate": 0.0001, + "loss": 1.2837, + "loss/crossentropy": 2.418126344680786, + "loss/hidden": 1.125, + "loss/logits": 0.1583927571773529, + "loss/reg": 2.7877298634848557e-05, + "step": 1167 + }, + { + "epoch": 0.146, + "grad_norm": 2.36159086227417, + "grad_norm_var": 2.529575829656103, + "learning_rate": 0.0001, + "loss": 1.1861, + "loss/crossentropy": 2.465980052947998, + "loss/hidden": 1.0390625, + "loss/logits": 0.14675036072731018, + "loss/reg": 2.787059020192828e-05, + "step": 1168 + }, + { + "epoch": 0.146125, + "grad_norm": 2.1341376304626465, + "grad_norm_var": 2.555717319473571, + "learning_rate": 0.0001, + "loss": 1.221, + "loss/crossentropy": 2.220935583114624, + "loss/hidden": 1.0546875, + "loss/logits": 0.165988489985466, + "loss/reg": 2.786431650747545e-05, + "step": 1169 + }, + { + "epoch": 0.14625, + "grad_norm": 1.9800208806991577, + "grad_norm_var": 2.5965962088801287, + "learning_rate": 0.0001, + "loss": 1.3826, + "loss/crossentropy": 2.5578231811523438, + "loss/hidden": 1.1875, + "loss/logits": 0.194828599691391, + "loss/reg": 2.7856074666487984e-05, + "step": 1170 + }, + { + "epoch": 0.146375, + "grad_norm": 2.1109437942504883, + "grad_norm_var": 2.6270661094654026, + "learning_rate": 0.0001, + "loss": 1.1771, + "loss/crossentropy": 2.523261785507202, + "loss/hidden": 1.03125, + "loss/logits": 0.14561527967453003, + "loss/reg": 2.7851159757119603e-05, + "step": 1171 + }, + { + "epoch": 0.1465, + "grad_norm": 1.6841378211975098, + "grad_norm_var": 2.7030613756058552, + "learning_rate": 0.0001, + "loss": 1.143, + "loss/crossentropy": 2.7305710315704346, + "loss/hidden": 0.984375, + "loss/logits": 0.1583786904811859, + "loss/reg": 2.7844978831126355e-05, + "step": 1172 + }, + { + "epoch": 0.146625, + "grad_norm": 2.1665713787078857, + "grad_norm_var": 2.720222392485194, + "learning_rate": 0.0001, + "loss": 1.2944, + "loss/crossentropy": 2.139784097671509, + "loss/hidden": 1.140625, + "loss/logits": 0.15345725417137146, + "loss/reg": 2.7836540539283305e-05, + "step": 1173 + }, + { + "epoch": 0.14675, + "grad_norm": 2.2671074867248535, + "grad_norm_var": 2.734358438581994, + "learning_rate": 0.0001, + "loss": 1.1124, + "loss/crossentropy": 2.4452567100524902, + "loss/hidden": 0.9765625, + "loss/logits": 0.1355462372303009, + "loss/reg": 2.782705814752262e-05, + "step": 1174 + }, + { + "epoch": 0.146875, + "grad_norm": 2.7200121879577637, + "grad_norm_var": 2.6765775461170427, + "learning_rate": 0.0001, + "loss": 1.2912, + "loss/crossentropy": 2.3593950271606445, + "loss/hidden": 1.1328125, + "loss/logits": 0.15807639062404633, + "loss/reg": 2.7815445719170384e-05, + "step": 1175 + }, + { + "epoch": 0.147, + "grad_norm": 2.6776604652404785, + "grad_norm_var": 1.0371370201894223, + "learning_rate": 0.0001, + "loss": 1.1975, + "loss/crossentropy": 2.6101787090301514, + "loss/hidden": 1.0390625, + "loss/logits": 0.15811535716056824, + "loss/reg": 2.7805714125861414e-05, + "step": 1176 + }, + { + "epoch": 0.147125, + "grad_norm": 1.6529101133346558, + "grad_norm_var": 1.0831114016435792, + "learning_rate": 0.0001, + "loss": 1.1845, + "loss/crossentropy": 2.383350133895874, + "loss/hidden": 1.0390625, + "loss/logits": 0.14519202709197998, + "loss/reg": 2.7793756089522503e-05, + "step": 1177 + }, + { + "epoch": 0.14725, + "grad_norm": 1.8173997402191162, + "grad_norm_var": 1.111706916095401, + "learning_rate": 0.0001, + "loss": 1.1422, + "loss/crossentropy": 2.429786205291748, + "loss/hidden": 0.98828125, + "loss/logits": 0.15360063314437866, + "loss/reg": 2.7781039534602314e-05, + "step": 1178 + }, + { + "epoch": 0.147375, + "grad_norm": 2.207003593444824, + "grad_norm_var": 1.117025131189886, + "learning_rate": 0.0001, + "loss": 1.2346, + "loss/crossentropy": 2.45831561088562, + "loss/hidden": 1.078125, + "loss/logits": 0.1561683863401413, + "loss/reg": 2.7767162464442663e-05, + "step": 1179 + }, + { + "epoch": 0.1475, + "grad_norm": 1.8291656970977783, + "grad_norm_var": 0.1618511695471644, + "learning_rate": 0.0001, + "loss": 1.1021, + "loss/crossentropy": 2.486833333969116, + "loss/hidden": 0.96875, + "loss/logits": 0.13303807377815247, + "loss/reg": 2.7753247195505537e-05, + "step": 1180 + }, + { + "epoch": 0.147625, + "grad_norm": 2.027369737625122, + "grad_norm_var": 0.11430880866733423, + "learning_rate": 0.0001, + "loss": 1.2475, + "loss/crossentropy": 2.5908496379852295, + "loss/hidden": 1.09375, + "loss/logits": 0.15349674224853516, + "loss/reg": 2.7742535166908056e-05, + "step": 1181 + }, + { + "epoch": 0.14775, + "grad_norm": 2.9332141876220703, + "grad_norm_var": 0.13426580109905498, + "learning_rate": 0.0001, + "loss": 1.6945, + "loss/crossentropy": 2.0537381172180176, + "loss/hidden": 1.421875, + "loss/logits": 0.2723906934261322, + "loss/reg": 2.773108826659154e-05, + "step": 1182 + }, + { + "epoch": 0.147875, + "grad_norm": 17.93061065673828, + "grad_norm_var": 15.655300094770794, + "learning_rate": 0.0001, + "loss": 2.6856, + "loss/crossentropy": 2.8054592609405518, + "loss/hidden": 2.078125, + "loss/logits": 0.6072109937667847, + "loss/reg": 2.7719906938727945e-05, + "step": 1183 + }, + { + "epoch": 0.148, + "grad_norm": 2.2986295223236084, + "grad_norm_var": 15.662218818033963, + "learning_rate": 0.0001, + "loss": 1.3112, + "loss/crossentropy": 2.596726179122925, + "loss/hidden": 1.125, + "loss/logits": 0.18587882816791534, + "loss/reg": 2.7707143090083264e-05, + "step": 1184 + }, + { + "epoch": 0.148125, + "grad_norm": 1.880839467048645, + "grad_norm_var": 15.70061550357963, + "learning_rate": 0.0001, + "loss": 1.0707, + "loss/crossentropy": 2.592128276824951, + "loss/hidden": 0.93359375, + "loss/logits": 0.13686493039131165, + "loss/reg": 2.7695143216988072e-05, + "step": 1185 + }, + { + "epoch": 0.14825, + "grad_norm": 1.9475699663162231, + "grad_norm_var": 15.705685051542437, + "learning_rate": 0.0001, + "loss": 1.171, + "loss/crossentropy": 2.3457136154174805, + "loss/hidden": 1.0234375, + "loss/logits": 0.14732672274112701, + "loss/reg": 2.7684172891895287e-05, + "step": 1186 + }, + { + "epoch": 0.148375, + "grad_norm": 8.073831558227539, + "grad_norm_var": 17.114199298605104, + "learning_rate": 0.0001, + "loss": 1.4527, + "loss/crossentropy": 2.260563611984253, + "loss/hidden": 1.25, + "loss/logits": 0.20243975520133972, + "loss/reg": 2.7672240321408026e-05, + "step": 1187 + }, + { + "epoch": 0.1485, + "grad_norm": 2.516343355178833, + "grad_norm_var": 16.95520444142492, + "learning_rate": 0.0001, + "loss": 1.1665, + "loss/crossentropy": 2.466325521469116, + "loss/hidden": 1.015625, + "loss/logits": 0.15060952305793762, + "loss/reg": 2.7659907573251985e-05, + "step": 1188 + }, + { + "epoch": 0.148625, + "grad_norm": 2.1922333240509033, + "grad_norm_var": 16.950480797433226, + "learning_rate": 0.0001, + "loss": 1.2555, + "loss/crossentropy": 2.6469104290008545, + "loss/hidden": 1.078125, + "loss/logits": 0.177072674036026, + "loss/reg": 2.7647196475300007e-05, + "step": 1189 + }, + { + "epoch": 0.14875, + "grad_norm": 2.5205237865448, + "grad_norm_var": 16.910784065322964, + "learning_rate": 0.0001, + "loss": 1.2949, + "loss/crossentropy": 2.6733131408691406, + "loss/hidden": 1.109375, + "loss/logits": 0.18528792262077332, + "loss/reg": 2.763914380921051e-05, + "step": 1190 + }, + { + "epoch": 0.148875, + "grad_norm": 2.818329334259033, + "grad_norm_var": 16.900159468990488, + "learning_rate": 0.0001, + "loss": 1.2345, + "loss/crossentropy": 2.564368963241577, + "loss/hidden": 1.078125, + "loss/logits": 0.1561104953289032, + "loss/reg": 2.763119846349582e-05, + "step": 1191 + }, + { + "epoch": 0.149, + "grad_norm": 2.362257480621338, + "grad_norm_var": 16.94443834059001, + "learning_rate": 0.0001, + "loss": 1.371, + "loss/crossentropy": 2.4741804599761963, + "loss/hidden": 1.1796875, + "loss/logits": 0.1909944713115692, + "loss/reg": 2.762146687018685e-05, + "step": 1192 + }, + { + "epoch": 0.149125, + "grad_norm": 2.101936101913452, + "grad_norm_var": 16.84268166213339, + "learning_rate": 0.0001, + "loss": 1.1126, + "loss/crossentropy": 2.433854341506958, + "loss/hidden": 0.96875, + "loss/logits": 0.14353907108306885, + "loss/reg": 2.761345967883244e-05, + "step": 1193 + }, + { + "epoch": 0.14925, + "grad_norm": 2.2441279888153076, + "grad_norm_var": 16.753145541719693, + "learning_rate": 0.0001, + "loss": 1.3061, + "loss/crossentropy": 2.2769577503204346, + "loss/hidden": 1.15625, + "loss/logits": 0.14958356320858002, + "loss/reg": 2.7605628929450177e-05, + "step": 1194 + }, + { + "epoch": 0.149375, + "grad_norm": 1.9734419584274292, + "grad_norm_var": 16.80048778547531, + "learning_rate": 0.0001, + "loss": 1.1214, + "loss/crossentropy": 2.6236746311187744, + "loss/hidden": 0.96875, + "loss/logits": 0.15241211652755737, + "loss/reg": 2.7598134693107568e-05, + "step": 1195 + }, + { + "epoch": 0.1495, + "grad_norm": 2.894045829772949, + "grad_norm_var": 16.619483258824765, + "learning_rate": 0.0001, + "loss": 1.2443, + "loss/crossentropy": 2.845644950866699, + "loss/hidden": 1.0859375, + "loss/logits": 0.15806913375854492, + "loss/reg": 2.7588790544541553e-05, + "step": 1196 + }, + { + "epoch": 0.149625, + "grad_norm": 2.7348477840423584, + "grad_norm_var": 16.495843787559767, + "learning_rate": 0.0001, + "loss": 1.341, + "loss/crossentropy": 2.1583986282348633, + "loss/hidden": 1.15625, + "loss/logits": 0.184425950050354, + "loss/reg": 2.7581947506405413e-05, + "step": 1197 + }, + { + "epoch": 0.14975, + "grad_norm": 2.4606244564056396, + "grad_norm_var": 16.55899665546907, + "learning_rate": 0.0001, + "loss": 1.2287, + "loss/crossentropy": 2.7660982608795166, + "loss/hidden": 1.0625, + "loss/logits": 0.16596384346485138, + "loss/reg": 2.7575186322792433e-05, + "step": 1198 + }, + { + "epoch": 0.149875, + "grad_norm": 1.9735963344573975, + "grad_norm_var": 2.1628482042498813, + "learning_rate": 0.0001, + "loss": 1.0777, + "loss/crossentropy": 2.4040403366088867, + "loss/hidden": 0.9453125, + "loss/logits": 0.13210439682006836, + "loss/reg": 2.7569951271289028e-05, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 1.826606273651123, + "grad_norm_var": 2.2012208632355126, + "learning_rate": 0.0001, + "loss": 1.2289, + "loss/crossentropy": 2.336017370223999, + "loss/hidden": 1.0859375, + "loss/logits": 0.14264222979545593, + "loss/reg": 2.756535650405567e-05, + "step": 1200 + }, + { + "epoch": 0.150125, + "grad_norm": 2.256070137023926, + "grad_norm_var": 2.1711602165054207, + "learning_rate": 0.0001, + "loss": 1.5425, + "loss/crossentropy": 1.976460576057434, + "loss/hidden": 1.3515625, + "loss/logits": 0.19066616892814636, + "loss/reg": 2.7560516173252836e-05, + "step": 1201 + }, + { + "epoch": 0.15025, + "grad_norm": 2.39620041847229, + "grad_norm_var": 2.139866231790048, + "learning_rate": 0.0001, + "loss": 1.3731, + "loss/crossentropy": 2.75640869140625, + "loss/hidden": 1.1796875, + "loss/logits": 0.19317974150180817, + "loss/reg": 2.7557771318242885e-05, + "step": 1202 + }, + { + "epoch": 0.150375, + "grad_norm": 1.982822299003601, + "grad_norm_var": 0.10172726138896925, + "learning_rate": 0.0001, + "loss": 1.2335, + "loss/crossentropy": 2.6502585411071777, + "loss/hidden": 1.0625, + "loss/logits": 0.1707686334848404, + "loss/reg": 2.7554302505450323e-05, + "step": 1203 + }, + { + "epoch": 0.1505, + "grad_norm": 1.9111863374710083, + "grad_norm_var": 0.10944901860302939, + "learning_rate": 0.0001, + "loss": 1.2185, + "loss/crossentropy": 2.3117260932922363, + "loss/hidden": 1.0625, + "loss/logits": 0.15573087334632874, + "loss/reg": 2.7545747798285447e-05, + "step": 1204 + }, + { + "epoch": 0.150625, + "grad_norm": 2.7746851444244385, + "grad_norm_var": 0.12301661244117615, + "learning_rate": 0.0001, + "loss": 1.5466, + "loss/crossentropy": 2.350983142852783, + "loss/hidden": 1.3515625, + "loss/logits": 0.19476735591888428, + "loss/reg": 2.7536516427062452e-05, + "step": 1205 + }, + { + "epoch": 0.15075, + "grad_norm": 2.941901922225952, + "grad_norm_var": 0.14498942777738122, + "learning_rate": 0.0001, + "loss": 1.4761, + "loss/crossentropy": 2.69158935546875, + "loss/hidden": 1.2890625, + "loss/logits": 0.1867833435535431, + "loss/reg": 2.75287438853411e-05, + "step": 1206 + }, + { + "epoch": 0.150875, + "grad_norm": 2.1526482105255127, + "grad_norm_var": 0.13140963759173013, + "learning_rate": 0.0001, + "loss": 1.1718, + "loss/crossentropy": 2.6562676429748535, + "loss/hidden": 1.015625, + "loss/logits": 0.1558629423379898, + "loss/reg": 2.7521507945493795e-05, + "step": 1207 + }, + { + "epoch": 0.151, + "grad_norm": 1.8180968761444092, + "grad_norm_var": 0.1462474621469477, + "learning_rate": 0.0001, + "loss": 1.1164, + "loss/crossentropy": 2.464545726776123, + "loss/hidden": 0.96875, + "loss/logits": 0.14735284447669983, + "loss/reg": 2.7512320230016485e-05, + "step": 1208 + }, + { + "epoch": 0.151125, + "grad_norm": 2.0615530014038086, + "grad_norm_var": 0.14729565051282847, + "learning_rate": 0.0001, + "loss": 1.1875, + "loss/crossentropy": 2.8697397708892822, + "loss/hidden": 1.03125, + "loss/logits": 0.15592771768569946, + "loss/reg": 2.750484782154672e-05, + "step": 1209 + }, + { + "epoch": 0.15125, + "grad_norm": 1.985027551651001, + "grad_norm_var": 0.15256329287894246, + "learning_rate": 0.0001, + "loss": 1.1981, + "loss/crossentropy": 2.6676063537597656, + "loss/hidden": 1.0390625, + "loss/logits": 0.15873049199581146, + "loss/reg": 2.749882150965277e-05, + "step": 1210 + }, + { + "epoch": 0.151375, + "grad_norm": 3.9016621112823486, + "grad_norm_var": 0.3115348883237033, + "learning_rate": 0.0001, + "loss": 1.3969, + "loss/crossentropy": 2.131171226501465, + "loss/hidden": 1.234375, + "loss/logits": 0.16224710643291473, + "loss/reg": 2.7490825232234783e-05, + "step": 1211 + }, + { + "epoch": 0.1515, + "grad_norm": 2.5709502696990967, + "grad_norm_var": 0.2958918347549265, + "learning_rate": 0.0001, + "loss": 1.367, + "loss/crossentropy": 2.940727710723877, + "loss/hidden": 1.203125, + "loss/logits": 0.1635884940624237, + "loss/reg": 2.7484367819852196e-05, + "step": 1212 + }, + { + "epoch": 0.151625, + "grad_norm": 2.8296761512756348, + "grad_norm_var": 0.30120245894559994, + "learning_rate": 0.0001, + "loss": 1.5743, + "loss/crossentropy": 2.6142184734344482, + "loss/hidden": 1.34375, + "loss/logits": 0.23027721047401428, + "loss/reg": 2.7475369279272854e-05, + "step": 1213 + }, + { + "epoch": 0.15175, + "grad_norm": 1.9998527765274048, + "grad_norm_var": 0.30860976223532827, + "learning_rate": 0.0001, + "loss": 1.3208, + "loss/crossentropy": 2.4321632385253906, + "loss/hidden": 1.1484375, + "loss/logits": 0.17208465933799744, + "loss/reg": 2.746755490079522e-05, + "step": 1214 + }, + { + "epoch": 0.151875, + "grad_norm": 2.1271767616271973, + "grad_norm_var": 0.3026545003818715, + "learning_rate": 0.0001, + "loss": 1.2344, + "loss/crossentropy": 2.575026512145996, + "loss/hidden": 1.0625, + "loss/logits": 0.171661376953125, + "loss/reg": 2.746006975939963e-05, + "step": 1215 + }, + { + "epoch": 0.152, + "grad_norm": 1.9783564805984497, + "grad_norm_var": 0.2935845304871651, + "learning_rate": 0.0001, + "loss": 1.3158, + "loss/crossentropy": 2.3076202869415283, + "loss/hidden": 1.1484375, + "loss/logits": 0.16706429421901703, + "loss/reg": 2.7451042114989832e-05, + "step": 1216 + }, + { + "epoch": 0.152125, + "grad_norm": 1.744504690170288, + "grad_norm_var": 0.3167221458601451, + "learning_rate": 0.0001, + "loss": 1.0255, + "loss/crossentropy": 2.526693105697632, + "loss/hidden": 0.90234375, + "loss/logits": 0.12289679050445557, + "loss/reg": 2.744182347669266e-05, + "step": 1217 + }, + { + "epoch": 0.15225, + "grad_norm": 2.328500986099243, + "grad_norm_var": 0.3163525295539292, + "learning_rate": 0.0001, + "loss": 1.4369, + "loss/crossentropy": 2.011615753173828, + "loss/hidden": 1.234375, + "loss/logits": 0.20220641791820526, + "loss/reg": 2.7431295166024938e-05, + "step": 1218 + }, + { + "epoch": 0.152375, + "grad_norm": 3.2956676483154297, + "grad_norm_var": 0.36517829108227406, + "learning_rate": 0.0001, + "loss": 1.5316, + "loss/crossentropy": 2.2073464393615723, + "loss/hidden": 1.3203125, + "loss/logits": 0.2110351026058197, + "loss/reg": 2.7422491257311776e-05, + "step": 1219 + }, + { + "epoch": 0.1525, + "grad_norm": 2.1012229919433594, + "grad_norm_var": 0.3550157791248182, + "learning_rate": 0.0001, + "loss": 1.2098, + "loss/crossentropy": 2.576190233230591, + "loss/hidden": 1.0625, + "loss/logits": 0.1470044106245041, + "loss/reg": 2.7414396754465997e-05, + "step": 1220 + }, + { + "epoch": 0.152625, + "grad_norm": 2.730412721633911, + "grad_norm_var": 0.35300454362322664, + "learning_rate": 0.0001, + "loss": 1.3266, + "loss/crossentropy": 2.508300542831421, + "loss/hidden": 1.140625, + "loss/logits": 0.1857323944568634, + "loss/reg": 2.7405403670854867e-05, + "step": 1221 + }, + { + "epoch": 0.15275, + "grad_norm": 2.1391255855560303, + "grad_norm_var": 0.3363977966764329, + "learning_rate": 0.0001, + "loss": 1.1016, + "loss/crossentropy": 2.678865432739258, + "loss/hidden": 0.96875, + "loss/logits": 0.1326090395450592, + "loss/reg": 2.7396990844863467e-05, + "step": 1222 + }, + { + "epoch": 0.152875, + "grad_norm": 2.2695343494415283, + "grad_norm_var": 0.33401583502299936, + "learning_rate": 0.0001, + "loss": 1.2001, + "loss/crossentropy": 2.2428488731384277, + "loss/hidden": 1.0625, + "loss/logits": 0.13728411495685577, + "loss/reg": 2.73899277090095e-05, + "step": 1223 + }, + { + "epoch": 0.153, + "grad_norm": 2.9962921142578125, + "grad_norm_var": 0.3344546474653193, + "learning_rate": 0.0001, + "loss": 1.4559, + "loss/crossentropy": 2.2411420345306396, + "loss/hidden": 1.28125, + "loss/logits": 0.17434944212436676, + "loss/reg": 2.7382282496546395e-05, + "step": 1224 + }, + { + "epoch": 0.153125, + "grad_norm": 3.223398447036743, + "grad_norm_var": 0.36000723705386783, + "learning_rate": 0.0001, + "loss": 1.4231, + "loss/crossentropy": 2.4420807361602783, + "loss/hidden": 1.2734375, + "loss/logits": 0.14939409494400024, + "loss/reg": 2.737477007030975e-05, + "step": 1225 + }, + { + "epoch": 0.15325, + "grad_norm": 2.665025472640991, + "grad_norm_var": 0.340961988997149, + "learning_rate": 0.0001, + "loss": 1.11, + "loss/crossentropy": 2.084591865539551, + "loss/hidden": 0.9609375, + "loss/logits": 0.14879773557186127, + "loss/reg": 2.736686474236194e-05, + "step": 1226 + }, + { + "epoch": 0.153375, + "grad_norm": 2.74051570892334, + "grad_norm_var": 0.21694539716836303, + "learning_rate": 0.0001, + "loss": 1.1491, + "loss/crossentropy": 2.80490779876709, + "loss/hidden": 0.99609375, + "loss/logits": 0.15269222855567932, + "loss/reg": 2.7354166377335787e-05, + "step": 1227 + }, + { + "epoch": 0.1535, + "grad_norm": 2.0795443058013916, + "grad_norm_var": 0.22632532787522805, + "learning_rate": 0.0001, + "loss": 1.071, + "loss/crossentropy": 2.2139289379119873, + "loss/hidden": 0.9453125, + "loss/logits": 0.12540248036384583, + "loss/reg": 2.7347803552402183e-05, + "step": 1228 + }, + { + "epoch": 0.153625, + "grad_norm": 2.400036334991455, + "grad_norm_var": 0.216287106465726, + "learning_rate": 0.0001, + "loss": 1.2821, + "loss/crossentropy": 2.145390033721924, + "loss/hidden": 1.1171875, + "loss/logits": 0.1646306812763214, + "loss/reg": 2.733768087637145e-05, + "step": 1229 + }, + { + "epoch": 0.15375, + "grad_norm": 3.0078351497650146, + "grad_norm_var": 0.22248909473499284, + "learning_rate": 0.0001, + "loss": 1.2166, + "loss/crossentropy": 2.7007522583007812, + "loss/hidden": 1.046875, + "loss/logits": 0.16944603621959686, + "loss/reg": 2.7327603675075807e-05, + "step": 1230 + }, + { + "epoch": 0.153875, + "grad_norm": 2.3207321166992188, + "grad_norm_var": 0.21548778397920304, + "learning_rate": 0.0001, + "loss": 1.3329, + "loss/crossentropy": 2.6447603702545166, + "loss/hidden": 1.140625, + "loss/logits": 0.19204488396644592, + "loss/reg": 2.73200585070299e-05, + "step": 1231 + }, + { + "epoch": 0.154, + "grad_norm": 1.784392237663269, + "grad_norm_var": 0.23136332607497578, + "learning_rate": 0.0001, + "loss": 1.1033, + "loss/crossentropy": 2.293508291244507, + "loss/hidden": 0.97265625, + "loss/logits": 0.13034118711948395, + "loss/reg": 2.730847518250812e-05, + "step": 1232 + }, + { + "epoch": 0.154125, + "grad_norm": 1.939539909362793, + "grad_norm_var": 0.21437591829147676, + "learning_rate": 0.0001, + "loss": 1.2477, + "loss/crossentropy": 2.510955572128296, + "loss/hidden": 1.078125, + "loss/logits": 0.1693347543478012, + "loss/reg": 2.7302123271510936e-05, + "step": 1233 + }, + { + "epoch": 0.15425, + "grad_norm": 2.138371229171753, + "grad_norm_var": 0.2210173621878278, + "learning_rate": 0.0001, + "loss": 1.2836, + "loss/crossentropy": 2.3280651569366455, + "loss/hidden": 1.125, + "loss/logits": 0.15833412110805511, + "loss/reg": 2.729334846662823e-05, + "step": 1234 + }, + { + "epoch": 0.154375, + "grad_norm": 1.8322596549987793, + "grad_norm_var": 0.19756044302410575, + "learning_rate": 0.0001, + "loss": 1.0835, + "loss/crossentropy": 2.710282325744629, + "loss/hidden": 0.94921875, + "loss/logits": 0.13399583101272583, + "loss/reg": 2.728701838350389e-05, + "step": 1235 + }, + { + "epoch": 0.1545, + "grad_norm": 10.050215721130371, + "grad_norm_var": 3.832156223147056, + "learning_rate": 0.0001, + "loss": 1.1119, + "loss/crossentropy": 2.4910335540771484, + "loss/hidden": 0.98828125, + "loss/logits": 0.12333837151527405, + "loss/reg": 2.7277554181637242e-05, + "step": 1236 + }, + { + "epoch": 0.154625, + "grad_norm": 2.295006036758423, + "grad_norm_var": 3.853549849512097, + "learning_rate": 0.0001, + "loss": 1.0402, + "loss/crossentropy": 2.860826015472412, + "loss/hidden": 0.90625, + "loss/logits": 0.13368600606918335, + "loss/reg": 2.7267007681075484e-05, + "step": 1237 + }, + { + "epoch": 0.15475, + "grad_norm": 2.255885362625122, + "grad_norm_var": 3.8430608160119424, + "learning_rate": 0.0001, + "loss": 1.2777, + "loss/crossentropy": 2.6877009868621826, + "loss/hidden": 1.109375, + "loss/logits": 0.16801691055297852, + "loss/reg": 2.7256486646365374e-05, + "step": 1238 + }, + { + "epoch": 0.154875, + "grad_norm": 2.6192026138305664, + "grad_norm_var": 3.82247840201134, + "learning_rate": 0.0001, + "loss": 1.3209, + "loss/crossentropy": 2.6847801208496094, + "loss/hidden": 1.1328125, + "loss/logits": 0.18776792287826538, + "loss/reg": 2.724896876316052e-05, + "step": 1239 + }, + { + "epoch": 0.155, + "grad_norm": 2.3446271419525146, + "grad_norm_var": 3.840372393805658, + "learning_rate": 0.0001, + "loss": 1.6021, + "loss/crossentropy": 2.0214121341705322, + "loss/hidden": 1.3984375, + "loss/logits": 0.20343554019927979, + "loss/reg": 2.724030491663143e-05, + "step": 1240 + }, + { + "epoch": 0.155125, + "grad_norm": 2.2599332332611084, + "grad_norm_var": 3.8511969366753327, + "learning_rate": 0.0001, + "loss": 1.1206, + "loss/crossentropy": 2.4702773094177246, + "loss/hidden": 0.98046875, + "loss/logits": 0.13982072472572327, + "loss/reg": 2.723286161199212e-05, + "step": 1241 + }, + { + "epoch": 0.15525, + "grad_norm": 2.0794544219970703, + "grad_norm_var": 3.882839720355186, + "learning_rate": 0.0001, + "loss": 1.22, + "loss/crossentropy": 2.4464218616485596, + "loss/hidden": 1.0625, + "loss/logits": 0.1572023630142212, + "loss/reg": 2.722362660279032e-05, + "step": 1242 + }, + { + "epoch": 0.155375, + "grad_norm": 2.395498037338257, + "grad_norm_var": 3.891140076066621, + "learning_rate": 0.0001, + "loss": 1.196, + "loss/crossentropy": 2.529829740524292, + "loss/hidden": 1.03125, + "loss/logits": 0.16444090008735657, + "loss/reg": 2.7215221052756533e-05, + "step": 1243 + }, + { + "epoch": 0.1555, + "grad_norm": 1.8105294704437256, + "grad_norm_var": 3.919268796044457, + "learning_rate": 0.0001, + "loss": 1.0926, + "loss/crossentropy": 2.4872617721557617, + "loss/hidden": 0.95703125, + "loss/logits": 0.13527435064315796, + "loss/reg": 2.7208938263356686e-05, + "step": 1244 + }, + { + "epoch": 0.155625, + "grad_norm": 1.9682385921478271, + "grad_norm_var": 3.94939179959421, + "learning_rate": 0.0001, + "loss": 1.1989, + "loss/crossentropy": 2.627549648284912, + "loss/hidden": 1.03125, + "loss/logits": 0.1673639714717865, + "loss/reg": 2.7198479074286297e-05, + "step": 1245 + }, + { + "epoch": 0.15575, + "grad_norm": 2.086871862411499, + "grad_norm_var": 3.963847724301516, + "learning_rate": 0.0001, + "loss": 1.2063, + "loss/crossentropy": 2.7231545448303223, + "loss/hidden": 1.046875, + "loss/logits": 0.1591671109199524, + "loss/reg": 2.718949326663278e-05, + "step": 1246 + }, + { + "epoch": 0.155875, + "grad_norm": 3.0249781608581543, + "grad_norm_var": 3.9652139707623544, + "learning_rate": 0.0001, + "loss": 1.2943, + "loss/crossentropy": 2.472691297531128, + "loss/hidden": 1.1171875, + "loss/logits": 0.17680460214614868, + "loss/reg": 2.717867391766049e-05, + "step": 1247 + }, + { + "epoch": 0.156, + "grad_norm": 1.9492664337158203, + "grad_norm_var": 3.9472177167501226, + "learning_rate": 0.0001, + "loss": 1.382, + "loss/crossentropy": 2.378443479537964, + "loss/hidden": 1.1953125, + "loss/logits": 0.18645590543746948, + "loss/reg": 2.717053757805843e-05, + "step": 1248 + }, + { + "epoch": 0.156125, + "grad_norm": 2.779618501663208, + "grad_norm_var": 3.9071974234816214, + "learning_rate": 0.0001, + "loss": 1.2189, + "loss/crossentropy": 2.65794038772583, + "loss/hidden": 1.046875, + "loss/logits": 0.17179173231124878, + "loss/reg": 2.7163399863638915e-05, + "step": 1249 + }, + { + "epoch": 0.15625, + "grad_norm": 2.25864839553833, + "grad_norm_var": 3.898403220084043, + "learning_rate": 0.0001, + "loss": 1.3456, + "loss/crossentropy": 2.606032133102417, + "loss/hidden": 1.1796875, + "loss/logits": 0.16563929617404938, + "loss/reg": 2.7157680960954167e-05, + "step": 1250 + }, + { + "epoch": 0.156375, + "grad_norm": 1.958497166633606, + "grad_norm_var": 3.883941347842938, + "learning_rate": 0.0001, + "loss": 1.2755, + "loss/crossentropy": 2.6055822372436523, + "loss/hidden": 1.09375, + "loss/logits": 0.18147012591362, + "loss/reg": 2.7153007977176458e-05, + "step": 1251 + }, + { + "epoch": 0.1565, + "grad_norm": 2.0419399738311768, + "grad_norm_var": 0.1063767961910888, + "learning_rate": 0.0001, + "loss": 1.2348, + "loss/crossentropy": 2.623753786087036, + "loss/hidden": 1.078125, + "loss/logits": 0.1563998907804489, + "loss/reg": 2.7146954380441457e-05, + "step": 1252 + }, + { + "epoch": 0.156625, + "grad_norm": 2.27734375, + "grad_norm_var": 0.10630917406085931, + "learning_rate": 0.0001, + "loss": 1.1269, + "loss/crossentropy": 2.5552492141723633, + "loss/hidden": 0.98046875, + "loss/logits": 0.1461717188358307, + "loss/reg": 2.7141086320625618e-05, + "step": 1253 + }, + { + "epoch": 0.15675, + "grad_norm": 1.9560954570770264, + "grad_norm_var": 0.11196718791257178, + "learning_rate": 0.0001, + "loss": 1.1041, + "loss/crossentropy": 2.461141586303711, + "loss/hidden": 0.97265625, + "loss/logits": 0.13115814328193665, + "loss/reg": 2.713777394092176e-05, + "step": 1254 + }, + { + "epoch": 0.156875, + "grad_norm": 2.726699113845825, + "grad_norm_var": 0.11815067536371936, + "learning_rate": 0.0001, + "loss": 1.1927, + "loss/crossentropy": 2.7415926456451416, + "loss/hidden": 1.0390625, + "loss/logits": 0.15338170528411865, + "loss/reg": 2.713773756113369e-05, + "step": 1255 + }, + { + "epoch": 0.157, + "grad_norm": 1.9863296747207642, + "grad_norm_var": 0.12140949964824775, + "learning_rate": 0.0001, + "loss": 1.1484, + "loss/crossentropy": 2.248748302459717, + "loss/hidden": 1.0078125, + "loss/logits": 0.14029096066951752, + "loss/reg": 2.713200228754431e-05, + "step": 1256 + }, + { + "epoch": 0.157125, + "grad_norm": 1.9705520868301392, + "grad_norm_var": 0.1251988712729399, + "learning_rate": 0.0001, + "loss": 1.0735, + "loss/crossentropy": 2.328368902206421, + "loss/hidden": 0.95703125, + "loss/logits": 0.11620669066905975, + "loss/reg": 2.7130297894473188e-05, + "step": 1257 + }, + { + "epoch": 0.15725, + "grad_norm": 2.0879461765289307, + "grad_norm_var": 0.1250618991175002, + "learning_rate": 0.0001, + "loss": 1.2803, + "loss/crossentropy": 2.492924928665161, + "loss/hidden": 1.09375, + "loss/logits": 0.18631741404533386, + "loss/reg": 2.7127516659675166e-05, + "step": 1258 + }, + { + "epoch": 0.157375, + "grad_norm": 1.8996506929397583, + "grad_norm_var": 0.12783012946942543, + "learning_rate": 0.0001, + "loss": 1.1824, + "loss/crossentropy": 2.8100178241729736, + "loss/hidden": 1.0234375, + "loss/logits": 0.1586633026599884, + "loss/reg": 2.712728746701032e-05, + "step": 1259 + }, + { + "epoch": 0.1575, + "grad_norm": 1.8070100545883179, + "grad_norm_var": 0.12800144083718593, + "learning_rate": 0.0001, + "loss": 1.1339, + "loss/crossentropy": 2.556910991668701, + "loss/hidden": 0.9921875, + "loss/logits": 0.1414453387260437, + "loss/reg": 2.7120453523821197e-05, + "step": 1260 + }, + { + "epoch": 0.157625, + "grad_norm": 1.8373955488204956, + "grad_norm_var": 0.13265639084609854, + "learning_rate": 0.0001, + "loss": 1.1079, + "loss/crossentropy": 2.3946197032928467, + "loss/hidden": 0.96875, + "loss/logits": 0.13884976506233215, + "loss/reg": 2.711996239668224e-05, + "step": 1261 + }, + { + "epoch": 0.15775, + "grad_norm": 1.828991174697876, + "grad_norm_var": 0.13951816272652665, + "learning_rate": 0.0001, + "loss": 1.1848, + "loss/crossentropy": 2.4561285972595215, + "loss/hidden": 1.0390625, + "loss/logits": 0.1455036997795105, + "loss/reg": 2.7114530894323252e-05, + "step": 1262 + }, + { + "epoch": 0.157875, + "grad_norm": 1.79993736743927, + "grad_norm_var": 0.09030335081195388, + "learning_rate": 0.0001, + "loss": 1.0951, + "loss/crossentropy": 2.2786779403686523, + "loss/hidden": 0.96484375, + "loss/logits": 0.13001078367233276, + "loss/reg": 2.711398155952338e-05, + "step": 1263 + }, + { + "epoch": 0.158, + "grad_norm": 2.105189085006714, + "grad_norm_var": 0.08925316141232707, + "learning_rate": 0.0001, + "loss": 1.0958, + "loss/crossentropy": 2.617133617401123, + "loss/hidden": 0.96875, + "loss/logits": 0.12676237523555756, + "loss/reg": 2.710930311877746e-05, + "step": 1264 + }, + { + "epoch": 0.158125, + "grad_norm": 1.864131212234497, + "grad_norm_var": 0.05655579181596118, + "learning_rate": 0.0001, + "loss": 1.2529, + "loss/crossentropy": 2.3149518966674805, + "loss/hidden": 1.1015625, + "loss/logits": 0.15111187100410461, + "loss/reg": 2.7104842956759967e-05, + "step": 1265 + }, + { + "epoch": 0.15825, + "grad_norm": 1.8007097244262695, + "grad_norm_var": 0.05542057190759942, + "learning_rate": 0.0001, + "loss": 1.0775, + "loss/crossentropy": 2.546469211578369, + "loss/hidden": 0.9453125, + "loss/logits": 0.13196244835853577, + "loss/reg": 2.7098209102405235e-05, + "step": 1266 + }, + { + "epoch": 0.158375, + "grad_norm": 2.172588586807251, + "grad_norm_var": 0.05719257458181891, + "learning_rate": 0.0001, + "loss": 1.2644, + "loss/crossentropy": 2.354116201400757, + "loss/hidden": 1.1171875, + "loss/logits": 0.14690588414669037, + "loss/reg": 2.7094565666629933e-05, + "step": 1267 + }, + { + "epoch": 0.1585, + "grad_norm": 1.8879003524780273, + "grad_norm_var": 0.05802280611202851, + "learning_rate": 0.0001, + "loss": 1.1519, + "loss/crossentropy": 2.2336063385009766, + "loss/hidden": 1.0, + "loss/logits": 0.15161558985710144, + "loss/reg": 2.7088328351965174e-05, + "step": 1268 + }, + { + "epoch": 0.158625, + "grad_norm": 6.909046173095703, + "grad_norm_var": 1.5697640872212872, + "learning_rate": 0.0001, + "loss": 2.2894, + "loss/crossentropy": 3.0389015674591064, + "loss/hidden": 1.7109375, + "loss/logits": 0.5781978368759155, + "loss/reg": 2.7081203370471485e-05, + "step": 1269 + }, + { + "epoch": 0.15875, + "grad_norm": 1.7925323247909546, + "grad_norm_var": 1.5787183081816636, + "learning_rate": 0.0001, + "loss": 1.1178, + "loss/crossentropy": 2.2378323078155518, + "loss/hidden": 0.984375, + "loss/logits": 0.1331055760383606, + "loss/reg": 2.7074365789303556e-05, + "step": 1270 + }, + { + "epoch": 0.158875, + "grad_norm": 1.9227347373962402, + "grad_norm_var": 1.5712089884708818, + "learning_rate": 0.0001, + "loss": 1.2393, + "loss/crossentropy": 2.3808493614196777, + "loss/hidden": 1.078125, + "loss/logits": 0.16095364093780518, + "loss/reg": 2.7065649192081764e-05, + "step": 1271 + }, + { + "epoch": 0.159, + "grad_norm": 1.884147047996521, + "grad_norm_var": 1.5751751559317293, + "learning_rate": 0.0001, + "loss": 1.0841, + "loss/crossentropy": 2.620154857635498, + "loss/hidden": 0.953125, + "loss/logits": 0.130690336227417, + "loss/reg": 2.705651604628656e-05, + "step": 1272 + }, + { + "epoch": 0.159125, + "grad_norm": 2.060220718383789, + "grad_norm_var": 1.5726576237511618, + "learning_rate": 0.0001, + "loss": 1.2826, + "loss/crossentropy": 2.3764121532440186, + "loss/hidden": 1.1015625, + "loss/logits": 0.18076592683792114, + "loss/reg": 2.7045331080444157e-05, + "step": 1273 + }, + { + "epoch": 0.15925, + "grad_norm": 1.6765326261520386, + "grad_norm_var": 1.5909607055966526, + "learning_rate": 0.0001, + "loss": 1.1287, + "loss/crossentropy": 2.5571322441101074, + "loss/hidden": 0.984375, + "loss/logits": 0.1440846174955368, + "loss/reg": 2.7036814572056755e-05, + "step": 1274 + }, + { + "epoch": 0.159375, + "grad_norm": 2.7736411094665527, + "grad_norm_var": 1.6033467651059832, + "learning_rate": 0.0001, + "loss": 1.2805, + "loss/crossentropy": 2.9112753868103027, + "loss/hidden": 1.109375, + "loss/logits": 0.17085835337638855, + "loss/reg": 2.702650090213865e-05, + "step": 1275 + }, + { + "epoch": 0.1595, + "grad_norm": 2.1149637699127197, + "grad_norm_var": 1.59076969387115, + "learning_rate": 0.0001, + "loss": 1.0759, + "loss/crossentropy": 2.5236618518829346, + "loss/hidden": 0.94140625, + "loss/logits": 0.13423603773117065, + "loss/reg": 2.7017153115593828e-05, + "step": 1276 + }, + { + "epoch": 0.159625, + "grad_norm": 2.207482099533081, + "grad_norm_var": 1.5776418491325914, + "learning_rate": 0.0001, + "loss": 1.1506, + "loss/crossentropy": 2.554729700088501, + "loss/hidden": 1.0078125, + "loss/logits": 0.1425366997718811, + "loss/reg": 2.7008831239072606e-05, + "step": 1277 + }, + { + "epoch": 0.15975, + "grad_norm": 1.8357477188110352, + "grad_norm_var": 1.5772203412703598, + "learning_rate": 0.0001, + "loss": 1.2369, + "loss/crossentropy": 2.21962308883667, + "loss/hidden": 1.0859375, + "loss/logits": 0.15070578455924988, + "loss/reg": 2.7000798581866547e-05, + "step": 1278 + }, + { + "epoch": 0.159875, + "grad_norm": 2.054856061935425, + "grad_norm_var": 1.564269161804572, + "learning_rate": 0.0001, + "loss": 1.2829, + "loss/crossentropy": 2.1774165630340576, + "loss/hidden": 1.125, + "loss/logits": 0.1576499044895172, + "loss/reg": 2.6995241569238715e-05, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 2.48515248298645, + "grad_norm_var": 1.5625920271026104, + "learning_rate": 0.0001, + "loss": 1.3469, + "loss/crossentropy": 2.715681552886963, + "loss/hidden": 1.15625, + "loss/logits": 0.19035163521766663, + "loss/reg": 2.699112519621849e-05, + "step": 1280 + }, + { + "epoch": 0.160125, + "grad_norm": 2.170650005340576, + "grad_norm_var": 1.549009677704963, + "learning_rate": 0.0001, + "loss": 1.1166, + "loss/crossentropy": 2.4817681312561035, + "loss/hidden": 0.9765625, + "loss/logits": 0.13981525599956512, + "loss/reg": 2.698470234463457e-05, + "step": 1281 + }, + { + "epoch": 0.16025, + "grad_norm": 1.872865080833435, + "grad_norm_var": 1.5439609765714803, + "learning_rate": 0.0001, + "loss": 1.1394, + "loss/crossentropy": 2.4493248462677, + "loss/hidden": 0.98046875, + "loss/logits": 0.15864945948123932, + "loss/reg": 2.697757736314088e-05, + "step": 1282 + }, + { + "epoch": 0.160375, + "grad_norm": 2.2090535163879395, + "grad_norm_var": 1.5431143348893557, + "learning_rate": 0.0001, + "loss": 1.3036, + "loss/crossentropy": 2.4853250980377197, + "loss/hidden": 1.125, + "loss/logits": 0.17830908298492432, + "loss/reg": 2.6970161343342625e-05, + "step": 1283 + }, + { + "epoch": 0.1605, + "grad_norm": 2.777097225189209, + "grad_norm_var": 1.535836676108506, + "learning_rate": 0.0001, + "loss": 1.3596, + "loss/crossentropy": 3.1726160049438477, + "loss/hidden": 1.1875, + "loss/logits": 0.17180846631526947, + "loss/reg": 2.6960331524605863e-05, + "step": 1284 + }, + { + "epoch": 0.160625, + "grad_norm": 2.088535785675049, + "grad_norm_var": 0.10397684857714845, + "learning_rate": 0.0001, + "loss": 1.3496, + "loss/crossentropy": 2.510334014892578, + "loss/hidden": 1.1640625, + "loss/logits": 0.18524698913097382, + "loss/reg": 2.695082366699353e-05, + "step": 1285 + }, + { + "epoch": 0.16075, + "grad_norm": 2.1612887382507324, + "grad_norm_var": 0.09635581505311273, + "learning_rate": 0.0001, + "loss": 1.247, + "loss/crossentropy": 2.407069206237793, + "loss/hidden": 1.078125, + "loss/logits": 0.16862691938877106, + "loss/reg": 2.6943031116388738e-05, + "step": 1286 + }, + { + "epoch": 0.160875, + "grad_norm": 1.9132295846939087, + "grad_norm_var": 0.09664116778264974, + "learning_rate": 0.0001, + "loss": 1.1865, + "loss/crossentropy": 2.420624256134033, + "loss/hidden": 1.0390625, + "loss/logits": 0.1472083330154419, + "loss/reg": 2.693499845918268e-05, + "step": 1287 + }, + { + "epoch": 0.161, + "grad_norm": 2.416897773742676, + "grad_norm_var": 0.09600417389772525, + "learning_rate": 0.0001, + "loss": 1.2, + "loss/crossentropy": 2.801936388015747, + "loss/hidden": 1.03125, + "loss/logits": 0.16843974590301514, + "loss/reg": 2.6923460609395988e-05, + "step": 1288 + }, + { + "epoch": 0.161125, + "grad_norm": 2.03855562210083, + "grad_norm_var": 0.09636835893507362, + "learning_rate": 0.0001, + "loss": 1.1533, + "loss/crossentropy": 2.6552908420562744, + "loss/hidden": 1.015625, + "loss/logits": 0.13738903403282166, + "loss/reg": 2.6914471163763665e-05, + "step": 1289 + }, + { + "epoch": 0.16125, + "grad_norm": 2.2621166706085205, + "grad_norm_var": 0.0788977183377689, + "learning_rate": 0.0001, + "loss": 1.3871, + "loss/crossentropy": 2.332151174545288, + "loss/hidden": 1.203125, + "loss/logits": 0.1837497055530548, + "loss/reg": 2.6905630875262432e-05, + "step": 1290 + }, + { + "epoch": 0.161375, + "grad_norm": 1.8717083930969238, + "grad_norm_var": 0.06212455728727922, + "learning_rate": 0.0001, + "loss": 1.1823, + "loss/crossentropy": 2.281320571899414, + "loss/hidden": 1.0390625, + "loss/logits": 0.14292402565479279, + "loss/reg": 2.689589382498525e-05, + "step": 1291 + }, + { + "epoch": 0.1615, + "grad_norm": 2.2855136394500732, + "grad_norm_var": 0.06303180273471275, + "learning_rate": 0.0001, + "loss": 1.2458, + "loss/crossentropy": 2.552885055541992, + "loss/hidden": 1.0703125, + "loss/logits": 0.1751728355884552, + "loss/reg": 2.688968561415095e-05, + "step": 1292 + }, + { + "epoch": 0.161625, + "grad_norm": 1.9216214418411255, + "grad_norm_var": 0.06654548697256685, + "learning_rate": 0.0001, + "loss": 1.173, + "loss/crossentropy": 2.3000409603118896, + "loss/hidden": 1.03125, + "loss/logits": 0.1414794772863388, + "loss/reg": 2.688014501472935e-05, + "step": 1293 + }, + { + "epoch": 0.16175, + "grad_norm": 2.296865701675415, + "grad_norm_var": 0.06064878180208755, + "learning_rate": 0.0001, + "loss": 1.4708, + "loss/crossentropy": 2.172919988632202, + "loss/hidden": 1.2734375, + "loss/logits": 0.1971122771501541, + "loss/reg": 2.6870919100474566e-05, + "step": 1294 + }, + { + "epoch": 0.161875, + "grad_norm": 2.194643497467041, + "grad_norm_var": 0.05960048673984719, + "learning_rate": 0.0001, + "loss": 1.0693, + "loss/crossentropy": 2.6945507526397705, + "loss/hidden": 0.9296875, + "loss/logits": 0.13929899036884308, + "loss/reg": 2.6860114303417504e-05, + "step": 1295 + }, + { + "epoch": 0.162, + "grad_norm": 2.470828056335449, + "grad_norm_var": 0.05904073453734728, + "learning_rate": 0.0001, + "loss": 1.2986, + "loss/crossentropy": 2.3514182567596436, + "loss/hidden": 1.125, + "loss/logits": 0.1733003705739975, + "loss/reg": 2.6851983420783654e-05, + "step": 1296 + }, + { + "epoch": 0.162125, + "grad_norm": 2.629394292831421, + "grad_norm_var": 0.07134850548099739, + "learning_rate": 0.0001, + "loss": 1.2808, + "loss/crossentropy": 2.763662099838257, + "loss/hidden": 1.125, + "loss/logits": 0.15555787086486816, + "loss/reg": 2.6843548766919412e-05, + "step": 1297 + }, + { + "epoch": 0.16225, + "grad_norm": 2.2366325855255127, + "grad_norm_var": 0.06311487827932089, + "learning_rate": 0.0001, + "loss": 1.0763, + "loss/crossentropy": 2.331860065460205, + "loss/hidden": 0.9453125, + "loss/logits": 0.130709707736969, + "loss/reg": 2.6833902666112408e-05, + "step": 1298 + }, + { + "epoch": 0.162375, + "grad_norm": 2.3925933837890625, + "grad_norm_var": 0.06456396031760694, + "learning_rate": 0.0001, + "loss": 1.1051, + "loss/crossentropy": 2.53729510307312, + "loss/hidden": 0.953125, + "loss/logits": 0.15170174837112427, + "loss/reg": 2.6826441171579063e-05, + "step": 1299 + }, + { + "epoch": 0.1625, + "grad_norm": 2.1190478801727295, + "grad_norm_var": 0.04514786824177174, + "learning_rate": 0.0001, + "loss": 1.173, + "loss/crossentropy": 2.4471685886383057, + "loss/hidden": 1.0078125, + "loss/logits": 0.16488343477249146, + "loss/reg": 2.6819612685358152e-05, + "step": 1300 + }, + { + "epoch": 0.162625, + "grad_norm": 2.3287594318389893, + "grad_norm_var": 0.04498527060430699, + "learning_rate": 0.0001, + "loss": 1.1902, + "loss/crossentropy": 2.48089337348938, + "loss/hidden": 1.03125, + "loss/logits": 0.15867485105991364, + "loss/reg": 2.6813508156919852e-05, + "step": 1301 + }, + { + "epoch": 0.16275, + "grad_norm": 1.844240427017212, + "grad_norm_var": 0.05380169512942316, + "learning_rate": 0.0001, + "loss": 1.0863, + "loss/crossentropy": 2.671802043914795, + "loss/hidden": 0.94921875, + "loss/logits": 0.13678114116191864, + "loss/reg": 2.680667967069894e-05, + "step": 1302 + }, + { + "epoch": 0.162875, + "grad_norm": 2.3637824058532715, + "grad_norm_var": 0.04917666203077094, + "learning_rate": 0.0001, + "loss": 1.4411, + "loss/crossentropy": 2.4006457328796387, + "loss/hidden": 1.25, + "loss/logits": 0.19082440435886383, + "loss/reg": 2.6799469196703285e-05, + "step": 1303 + }, + { + "epoch": 0.163, + "grad_norm": 4.0514984130859375, + "grad_norm_var": 0.2569979888694646, + "learning_rate": 0.0001, + "loss": 1.283, + "loss/crossentropy": 2.5820600986480713, + "loss/hidden": 1.1171875, + "loss/logits": 0.16555315256118774, + "loss/reg": 2.6791109121404588e-05, + "step": 1304 + }, + { + "epoch": 0.163125, + "grad_norm": 1.996328353881836, + "grad_norm_var": 0.2587601385435561, + "learning_rate": 0.0001, + "loss": 1.1278, + "loss/crossentropy": 2.4008634090423584, + "loss/hidden": 0.98828125, + "loss/logits": 0.13921648263931274, + "loss/reg": 2.6783658540807664e-05, + "step": 1305 + }, + { + "epoch": 0.16325, + "grad_norm": 2.5234479904174805, + "grad_norm_var": 0.2606945936671744, + "learning_rate": 0.0001, + "loss": 1.6677, + "loss/crossentropy": 2.368112564086914, + "loss/hidden": 1.4296875, + "loss/logits": 0.23771673440933228, + "loss/reg": 2.67771611106582e-05, + "step": 1306 + }, + { + "epoch": 0.163375, + "grad_norm": 2.0552024841308594, + "grad_norm_var": 0.25120891874047824, + "learning_rate": 0.0001, + "loss": 1.2546, + "loss/crossentropy": 2.4685752391815186, + "loss/hidden": 1.09375, + "loss/logits": 0.16061872243881226, + "loss/reg": 2.6768024326884188e-05, + "step": 1307 + }, + { + "epoch": 0.1635, + "grad_norm": 2.6488709449768066, + "grad_norm_var": 0.25600220125298934, + "learning_rate": 0.0001, + "loss": 1.3711, + "loss/crossentropy": 2.07741117477417, + "loss/hidden": 1.203125, + "loss/logits": 0.1677468717098236, + "loss/reg": 2.6759482352645136e-05, + "step": 1308 + }, + { + "epoch": 0.163625, + "grad_norm": 5.07850980758667, + "grad_norm_var": 0.6860979486823292, + "learning_rate": 0.0001, + "loss": 1.3626, + "loss/crossentropy": 2.4804584980010986, + "loss/hidden": 1.1875, + "loss/logits": 0.17485862970352173, + "loss/reg": 2.675014729902614e-05, + "step": 1309 + }, + { + "epoch": 0.16375, + "grad_norm": 2.5337674617767334, + "grad_norm_var": 0.6807597007029661, + "learning_rate": 0.0001, + "loss": 1.1599, + "loss/crossentropy": 2.6424808502197266, + "loss/hidden": 1.015625, + "loss/logits": 0.14404962956905365, + "loss/reg": 2.6742369300336577e-05, + "step": 1310 + }, + { + "epoch": 0.163875, + "grad_norm": 2.6069765090942383, + "grad_norm_var": 0.6695553968413293, + "learning_rate": 0.0001, + "loss": 1.4102, + "loss/crossentropy": 2.269629955291748, + "loss/hidden": 1.2421875, + "loss/logits": 0.1677122414112091, + "loss/reg": 2.6734827770269476e-05, + "step": 1311 + }, + { + "epoch": 0.164, + "grad_norm": 2.2442753314971924, + "grad_norm_var": 0.6771935784672335, + "learning_rate": 0.0001, + "loss": 1.2841, + "loss/crossentropy": 2.570171594619751, + "loss/hidden": 1.1171875, + "loss/logits": 0.16669419407844543, + "loss/reg": 2.6727713702712208e-05, + "step": 1312 + }, + { + "epoch": 0.164125, + "grad_norm": 1.8854857683181763, + "grad_norm_var": 0.7091961075454435, + "learning_rate": 0.0001, + "loss": 1.1282, + "loss/crossentropy": 2.3410756587982178, + "loss/hidden": 0.984375, + "loss/logits": 0.14355753362178802, + "loss/reg": 2.6721001631813124e-05, + "step": 1313 + }, + { + "epoch": 0.16425, + "grad_norm": 1.5990004539489746, + "grad_norm_var": 0.7618301893603315, + "learning_rate": 0.0001, + "loss": 1.2331, + "loss/crossentropy": 2.189560890197754, + "loss/hidden": 1.078125, + "loss/logits": 0.15474070608615875, + "loss/reg": 2.6713809347711504e-05, + "step": 1314 + }, + { + "epoch": 0.164375, + "grad_norm": 1.9670287370681763, + "grad_norm_var": 0.7802075877918061, + "learning_rate": 0.0001, + "loss": 1.157, + "loss/crossentropy": 2.732132911682129, + "loss/hidden": 1.015625, + "loss/logits": 0.14111988246440887, + "loss/reg": 2.6707104552770033e-05, + "step": 1315 + }, + { + "epoch": 0.1645, + "grad_norm": 1.9812804460525513, + "grad_norm_var": 0.78821498934293, + "learning_rate": 0.0001, + "loss": 1.1328, + "loss/crossentropy": 2.4936752319335938, + "loss/hidden": 0.98828125, + "loss/logits": 0.14423099160194397, + "loss/reg": 2.6700867238105275e-05, + "step": 1316 + }, + { + "epoch": 0.164625, + "grad_norm": 2.170367479324341, + "grad_norm_var": 0.7930145871730757, + "learning_rate": 0.0001, + "loss": 1.2827, + "loss/crossentropy": 2.60505747795105, + "loss/hidden": 1.1015625, + "loss/logits": 0.18084633350372314, + "loss/reg": 2.6694346161093563e-05, + "step": 1317 + }, + { + "epoch": 0.16475, + "grad_norm": 3.137220621109009, + "grad_norm_var": 0.7892987266693675, + "learning_rate": 0.0001, + "loss": 1.1344, + "loss/crossentropy": 2.6313364505767822, + "loss/hidden": 0.98046875, + "loss/logits": 0.15370672941207886, + "loss/reg": 2.6686380806495436e-05, + "step": 1318 + }, + { + "epoch": 0.164875, + "grad_norm": 2.006977081298828, + "grad_norm_var": 0.8062427117439365, + "learning_rate": 0.0001, + "loss": 1.1524, + "loss/crossentropy": 2.4833168983459473, + "loss/hidden": 1.0078125, + "loss/logits": 0.14430877566337585, + "loss/reg": 2.6684570912038907e-05, + "step": 1319 + }, + { + "epoch": 0.165, + "grad_norm": 2.0003442764282227, + "grad_norm_var": 0.6531910478308915, + "learning_rate": 0.0001, + "loss": 1.1996, + "loss/crossentropy": 2.406473159790039, + "loss/hidden": 1.046875, + "loss/logits": 0.15249782800674438, + "loss/reg": 2.6676581910578534e-05, + "step": 1320 + }, + { + "epoch": 0.165125, + "grad_norm": 2.0674219131469727, + "grad_norm_var": 0.6496596954331906, + "learning_rate": 0.0001, + "loss": 1.3007, + "loss/crossentropy": 2.3661906719207764, + "loss/hidden": 1.1328125, + "loss/logits": 0.1676027476787567, + "loss/reg": 2.6668809368857183e-05, + "step": 1321 + }, + { + "epoch": 0.16525, + "grad_norm": 2.374068260192871, + "grad_norm_var": 0.6487277618980399, + "learning_rate": 0.0001, + "loss": 1.4624, + "loss/crossentropy": 2.4206950664520264, + "loss/hidden": 1.265625, + "loss/logits": 0.1965237259864807, + "loss/reg": 2.6664110919227824e-05, + "step": 1322 + }, + { + "epoch": 0.165375, + "grad_norm": 3.1102726459503174, + "grad_norm_var": 0.6701761810850214, + "learning_rate": 0.0001, + "loss": 1.243, + "loss/crossentropy": 2.4548113346099854, + "loss/hidden": 1.078125, + "loss/logits": 0.16458836197853088, + "loss/reg": 2.6658321075956337e-05, + "step": 1323 + }, + { + "epoch": 0.1655, + "grad_norm": 2.5451745986938477, + "grad_norm_var": 0.6682816965519407, + "learning_rate": 0.0001, + "loss": 1.1581, + "loss/crossentropy": 2.6601362228393555, + "loss/hidden": 1.0078125, + "loss/logits": 0.15003395080566406, + "loss/reg": 2.664869862201158e-05, + "step": 1324 + }, + { + "epoch": 0.165625, + "grad_norm": 2.617114782333374, + "grad_norm_var": 0.186514430925861, + "learning_rate": 0.0001, + "loss": 1.295, + "loss/crossentropy": 2.6460087299346924, + "loss/hidden": 1.125, + "loss/logits": 0.16977858543395996, + "loss/reg": 2.664211206138134e-05, + "step": 1325 + }, + { + "epoch": 0.16575, + "grad_norm": 2.0105721950531006, + "grad_norm_var": 0.1875192338806428, + "learning_rate": 0.0001, + "loss": 1.1514, + "loss/crossentropy": 2.3992791175842285, + "loss/hidden": 1.0078125, + "loss/logits": 0.14329934120178223, + "loss/reg": 2.6631181754055433e-05, + "step": 1326 + }, + { + "epoch": 0.165875, + "grad_norm": 2.0908360481262207, + "grad_norm_var": 0.1809944030005207, + "learning_rate": 0.0001, + "loss": 1.1341, + "loss/crossentropy": 2.4683284759521484, + "loss/hidden": 0.98828125, + "loss/logits": 0.1455761194229126, + "loss/reg": 2.66248043772066e-05, + "step": 1327 + }, + { + "epoch": 0.166, + "grad_norm": 2.010671377182007, + "grad_norm_var": 0.18420853059178005, + "learning_rate": 0.0001, + "loss": 1.1878, + "loss/crossentropy": 2.5996124744415283, + "loss/hidden": 1.03125, + "loss/logits": 0.1562933325767517, + "loss/reg": 2.6614095986587927e-05, + "step": 1328 + }, + { + "epoch": 0.166125, + "grad_norm": 2.42618465423584, + "grad_norm_var": 0.17812196097309704, + "learning_rate": 0.0001, + "loss": 1.2696, + "loss/crossentropy": 2.487701177597046, + "loss/hidden": 1.109375, + "loss/logits": 0.1599753499031067, + "loss/reg": 2.660502832441125e-05, + "step": 1329 + }, + { + "epoch": 0.16625, + "grad_norm": 2.4546120166778564, + "grad_norm_var": 0.14879272610630967, + "learning_rate": 0.0001, + "loss": 1.1312, + "loss/crossentropy": 2.33776593208313, + "loss/hidden": 0.984375, + "loss/logits": 0.14658929407596588, + "loss/reg": 2.6595171220833436e-05, + "step": 1330 + }, + { + "epoch": 0.166375, + "grad_norm": 2.357835531234741, + "grad_norm_var": 0.14043390163264313, + "learning_rate": 0.0001, + "loss": 1.167, + "loss/crossentropy": 2.520327091217041, + "loss/hidden": 1.0078125, + "loss/logits": 0.15895235538482666, + "loss/reg": 2.6584895749692805e-05, + "step": 1331 + }, + { + "epoch": 0.1665, + "grad_norm": 1.9540313482284546, + "grad_norm_var": 0.14176566382670894, + "learning_rate": 0.0001, + "loss": 1.2656, + "loss/crossentropy": 2.3803107738494873, + "loss/hidden": 1.109375, + "loss/logits": 0.155920147895813, + "loss/reg": 2.657651020854246e-05, + "step": 1332 + }, + { + "epoch": 0.166625, + "grad_norm": 2.323978900909424, + "grad_norm_var": 0.13990217871198726, + "learning_rate": 0.0001, + "loss": 1.4289, + "loss/crossentropy": 2.3884425163269043, + "loss/hidden": 1.203125, + "loss/logits": 0.22553202509880066, + "loss/reg": 2.6566140149952844e-05, + "step": 1333 + }, + { + "epoch": 0.16675, + "grad_norm": 1.8182610273361206, + "grad_norm_var": 0.10895040965308808, + "learning_rate": 0.0001, + "loss": 1.3328, + "loss/crossentropy": 2.444770097732544, + "loss/hidden": 1.140625, + "loss/logits": 0.1919519007205963, + "loss/reg": 2.6553065254120156e-05, + "step": 1334 + }, + { + "epoch": 0.166875, + "grad_norm": 1.8410413265228271, + "grad_norm_var": 0.11628095558962456, + "learning_rate": 0.0001, + "loss": 1.2273, + "loss/crossentropy": 2.4763848781585693, + "loss/hidden": 1.0625, + "loss/logits": 0.16453632712364197, + "loss/reg": 2.654367381182965e-05, + "step": 1335 + }, + { + "epoch": 0.167, + "grad_norm": 2.226025104522705, + "grad_norm_var": 0.1119473076987769, + "learning_rate": 0.0001, + "loss": 1.2745, + "loss/crossentropy": 2.293704032897949, + "loss/hidden": 1.1171875, + "loss/logits": 0.15703290700912476, + "loss/reg": 2.652865441632457e-05, + "step": 1336 + }, + { + "epoch": 0.167125, + "grad_norm": 2.1916122436523438, + "grad_norm_var": 0.10965193544846935, + "learning_rate": 0.0001, + "loss": 1.1417, + "loss/crossentropy": 2.5248830318450928, + "loss/hidden": 0.96875, + "loss/logits": 0.17263789474964142, + "loss/reg": 2.6519792299950495e-05, + "step": 1337 + }, + { + "epoch": 0.16725, + "grad_norm": 1.9012585878372192, + "grad_norm_var": 0.11719038307920654, + "learning_rate": 0.0001, + "loss": 1.1134, + "loss/crossentropy": 2.6347897052764893, + "loss/hidden": 0.96484375, + "loss/logits": 0.14825037121772766, + "loss/reg": 2.6508707378525287e-05, + "step": 1338 + }, + { + "epoch": 0.167375, + "grad_norm": 2.1343119144439697, + "grad_norm_var": 0.0637957791721267, + "learning_rate": 0.0001, + "loss": 1.226, + "loss/crossentropy": 2.9463067054748535, + "loss/hidden": 1.0546875, + "loss/logits": 0.17104627192020416, + "loss/reg": 2.6497429644223303e-05, + "step": 1339 + }, + { + "epoch": 0.1675, + "grad_norm": 1.689785361289978, + "grad_norm_var": 0.06804526279126909, + "learning_rate": 0.0001, + "loss": 1.0931, + "loss/crossentropy": 2.2576656341552734, + "loss/hidden": 0.95703125, + "loss/logits": 0.13584816455841064, + "loss/reg": 2.6484205591259524e-05, + "step": 1340 + }, + { + "epoch": 0.167625, + "grad_norm": 2.1243538856506348, + "grad_norm_var": 0.05108608605265938, + "learning_rate": 0.0001, + "loss": 1.2469, + "loss/crossentropy": 2.432766914367676, + "loss/hidden": 1.078125, + "loss/logits": 0.16849841177463531, + "loss/reg": 2.647744258865714e-05, + "step": 1341 + }, + { + "epoch": 0.16775, + "grad_norm": 2.0008251667022705, + "grad_norm_var": 0.051204619592523905, + "learning_rate": 0.0001, + "loss": 1.223, + "loss/crossentropy": 2.502549409866333, + "loss/hidden": 1.0703125, + "loss/logits": 0.15237583220005035, + "loss/reg": 2.6465522751095705e-05, + "step": 1342 + }, + { + "epoch": 0.167875, + "grad_norm": 2.073878526687622, + "grad_norm_var": 0.05123562771141129, + "learning_rate": 0.0001, + "loss": 1.1706, + "loss/crossentropy": 2.731919765472412, + "loss/hidden": 1.0234375, + "loss/logits": 0.14691473543643951, + "loss/reg": 2.6457675630808808e-05, + "step": 1343 + }, + { + "epoch": 0.168, + "grad_norm": 2.0315918922424316, + "grad_norm_var": 0.05102624454897272, + "learning_rate": 0.0001, + "loss": 1.1375, + "loss/crossentropy": 2.1843767166137695, + "loss/hidden": 0.984375, + "loss/logits": 0.15289223194122314, + "loss/reg": 2.644642154336907e-05, + "step": 1344 + }, + { + "epoch": 0.168125, + "grad_norm": 4.893985271453857, + "grad_norm_var": 0.5400182964836799, + "learning_rate": 0.0001, + "loss": 1.2603, + "loss/crossentropy": 2.0751454830169678, + "loss/hidden": 1.1015625, + "loss/logits": 0.1584956794977188, + "loss/reg": 2.6439100111019798e-05, + "step": 1345 + }, + { + "epoch": 0.16825, + "grad_norm": 1.8516738414764404, + "grad_norm_var": 0.5463774459881731, + "learning_rate": 0.0001, + "loss": 1.142, + "loss/crossentropy": 2.5779693126678467, + "loss/hidden": 0.99609375, + "loss/logits": 0.14566245675086975, + "loss/reg": 2.6431454898556694e-05, + "step": 1346 + }, + { + "epoch": 0.168375, + "grad_norm": 2.332744836807251, + "grad_norm_var": 0.5459336044917201, + "learning_rate": 0.0001, + "loss": 1.2105, + "loss/crossentropy": 2.6423795223236084, + "loss/hidden": 1.0390625, + "loss/logits": 0.17117266356945038, + "loss/reg": 2.6420617359690368e-05, + "step": 1347 + }, + { + "epoch": 0.1685, + "grad_norm": 2.911855697631836, + "grad_norm_var": 0.5703487463568786, + "learning_rate": 0.0001, + "loss": 1.4788, + "loss/crossentropy": 2.3872833251953125, + "loss/hidden": 1.234375, + "loss/logits": 0.24415868520736694, + "loss/reg": 2.6411762519273907e-05, + "step": 1348 + }, + { + "epoch": 0.168625, + "grad_norm": 2.4840517044067383, + "grad_norm_var": 0.5730660153521436, + "learning_rate": 0.0001, + "loss": 1.1625, + "loss/crossentropy": 2.269804000854492, + "loss/hidden": 1.0, + "loss/logits": 0.16221199929714203, + "loss/reg": 2.640218735905364e-05, + "step": 1349 + }, + { + "epoch": 0.16875, + "grad_norm": 2.2593142986297607, + "grad_norm_var": 0.5579703040310806, + "learning_rate": 0.0001, + "loss": 1.193, + "loss/crossentropy": 2.312058448791504, + "loss/hidden": 1.03125, + "loss/logits": 0.16144749522209167, + "loss/reg": 2.6392237487016246e-05, + "step": 1350 + }, + { + "epoch": 0.168875, + "grad_norm": 2.1701931953430176, + "grad_norm_var": 0.5441925295518265, + "learning_rate": 0.0001, + "loss": 1.3017, + "loss/crossentropy": 2.490743398666382, + "loss/hidden": 1.1328125, + "loss/logits": 0.16862213611602783, + "loss/reg": 2.6384701413917355e-05, + "step": 1351 + }, + { + "epoch": 0.169, + "grad_norm": 2.2270307540893555, + "grad_norm_var": 0.5441786723923107, + "learning_rate": 0.0001, + "loss": 1.2207, + "loss/crossentropy": 2.6737143993377686, + "loss/hidden": 1.0625, + "loss/logits": 0.15791228413581848, + "loss/reg": 2.6375717425253242e-05, + "step": 1352 + }, + { + "epoch": 0.169125, + "grad_norm": 1.760250449180603, + "grad_norm_var": 0.5637620835327354, + "learning_rate": 0.0001, + "loss": 1.2358, + "loss/crossentropy": 2.5891165733337402, + "loss/hidden": 1.0703125, + "loss/logits": 0.16525432467460632, + "loss/reg": 2.6367311875219457e-05, + "step": 1353 + }, + { + "epoch": 0.16925, + "grad_norm": 1.907132625579834, + "grad_norm_var": 0.56344963794873, + "learning_rate": 0.0001, + "loss": 1.1823, + "loss/crossentropy": 2.3526220321655273, + "loss/hidden": 1.0234375, + "loss/logits": 0.15862733125686646, + "loss/reg": 2.6355100999353454e-05, + "step": 1354 + }, + { + "epoch": 0.169375, + "grad_norm": 1.9027105569839478, + "grad_norm_var": 0.5720208162009734, + "learning_rate": 0.0001, + "loss": 1.1983, + "loss/crossentropy": 2.545768976211548, + "loss/hidden": 1.03125, + "loss/logits": 0.1668192744255066, + "loss/reg": 2.6347095626988448e-05, + "step": 1355 + }, + { + "epoch": 0.1695, + "grad_norm": 2.6173818111419678, + "grad_norm_var": 0.5517076991730401, + "learning_rate": 0.0001, + "loss": 1.2561, + "loss/crossentropy": 2.5225110054016113, + "loss/hidden": 1.09375, + "loss/logits": 0.16205820441246033, + "loss/reg": 2.633567237353418e-05, + "step": 1356 + }, + { + "epoch": 0.169625, + "grad_norm": 3.8964571952819824, + "grad_norm_var": 0.6954173397225996, + "learning_rate": 0.0001, + "loss": 1.1632, + "loss/crossentropy": 3.085906505584717, + "loss/hidden": 0.99609375, + "loss/logits": 0.16680637001991272, + "loss/reg": 2.6325124053983018e-05, + "step": 1357 + }, + { + "epoch": 0.16975, + "grad_norm": 1.6623084545135498, + "grad_norm_var": 0.7231947530914662, + "learning_rate": 0.0001, + "loss": 1.0765, + "loss/crossentropy": 2.5557963848114014, + "loss/hidden": 0.94140625, + "loss/logits": 0.13480140268802643, + "loss/reg": 2.6315523427911103e-05, + "step": 1358 + }, + { + "epoch": 0.169875, + "grad_norm": 2.897042751312256, + "grad_norm_var": 0.7257549790436217, + "learning_rate": 0.0001, + "loss": 1.1845, + "loss/crossentropy": 2.0533289909362793, + "loss/hidden": 1.0390625, + "loss/logits": 0.1451372653245926, + "loss/reg": 2.630468225106597e-05, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 1.6887840032577515, + "grad_norm_var": 0.7539546823091342, + "learning_rate": 0.0001, + "loss": 1.2244, + "loss/crossentropy": 2.343716859817505, + "loss/hidden": 1.046875, + "loss/logits": 0.17730815708637238, + "loss/reg": 2.6297102522221394e-05, + "step": 1360 + }, + { + "epoch": 0.170125, + "grad_norm": 2.051896810531616, + "grad_norm_var": 0.33888700207370487, + "learning_rate": 0.0001, + "loss": 1.1851, + "loss/crossentropy": 2.437046766281128, + "loss/hidden": 1.03125, + "loss/logits": 0.1536104381084442, + "loss/reg": 2.6288973458576947e-05, + "step": 1361 + }, + { + "epoch": 0.17025, + "grad_norm": 2.3271827697753906, + "grad_norm_var": 0.3253043646968123, + "learning_rate": 0.0001, + "loss": 1.207, + "loss/crossentropy": 2.400099039077759, + "loss/hidden": 1.0390625, + "loss/logits": 0.1677071750164032, + "loss/reg": 2.6284051273250952e-05, + "step": 1362 + }, + { + "epoch": 0.170375, + "grad_norm": 2.492544651031494, + "grad_norm_var": 0.3272034231337623, + "learning_rate": 0.0001, + "loss": 1.7338, + "loss/crossentropy": 2.6612331867218018, + "loss/hidden": 1.4921875, + "loss/logits": 0.24138575792312622, + "loss/reg": 2.627680078148842e-05, + "step": 1363 + }, + { + "epoch": 0.1705, + "grad_norm": 2.459014654159546, + "grad_norm_var": 0.30479818566546674, + "learning_rate": 0.0001, + "loss": 1.4379, + "loss/crossentropy": 2.8525729179382324, + "loss/hidden": 1.2421875, + "loss/logits": 0.19541873037815094, + "loss/reg": 2.6268575311405584e-05, + "step": 1364 + }, + { + "epoch": 0.170625, + "grad_norm": 2.0299651622772217, + "grad_norm_var": 0.30655443529906223, + "learning_rate": 0.0001, + "loss": 1.144, + "loss/crossentropy": 2.4801418781280518, + "loss/hidden": 0.98828125, + "loss/logits": 0.15544641017913818, + "loss/reg": 2.6265021006111056e-05, + "step": 1365 + }, + { + "epoch": 0.17075, + "grad_norm": 2.2856857776641846, + "grad_norm_var": 0.3065539089084618, + "learning_rate": 0.0001, + "loss": 1.2307, + "loss/crossentropy": 2.584764003753662, + "loss/hidden": 1.0625, + "loss/logits": 0.16793392598628998, + "loss/reg": 2.6261264792992733e-05, + "step": 1366 + }, + { + "epoch": 0.170875, + "grad_norm": 4.037449836730957, + "grad_norm_var": 0.4987558370866475, + "learning_rate": 0.0001, + "loss": 1.3591, + "loss/crossentropy": 2.293288230895996, + "loss/hidden": 1.1953125, + "loss/logits": 0.16355790197849274, + "loss/reg": 2.625383422127925e-05, + "step": 1367 + }, + { + "epoch": 0.171, + "grad_norm": 2.059861660003662, + "grad_norm_var": 0.5041388412892653, + "learning_rate": 0.0001, + "loss": 1.2821, + "loss/crossentropy": 2.3882124423980713, + "loss/hidden": 1.1171875, + "loss/logits": 0.16469590365886688, + "loss/reg": 2.6246241759508848e-05, + "step": 1368 + }, + { + "epoch": 0.171125, + "grad_norm": 4.463994979858398, + "grad_norm_var": 0.7377068144439828, + "learning_rate": 0.0001, + "loss": 1.5494, + "loss/crossentropy": 2.7510950565338135, + "loss/hidden": 1.328125, + "loss/logits": 0.22101837396621704, + "loss/reg": 2.6237423298880458e-05, + "step": 1369 + }, + { + "epoch": 0.17125, + "grad_norm": 3.0326664447784424, + "grad_norm_var": 0.7206006883959406, + "learning_rate": 0.0001, + "loss": 1.5292, + "loss/crossentropy": 2.500507116317749, + "loss/hidden": 1.3359375, + "loss/logits": 0.19301241636276245, + "loss/reg": 2.6228930437355302e-05, + "step": 1370 + }, + { + "epoch": 0.171375, + "grad_norm": 2.7396833896636963, + "grad_norm_var": 0.6844414926099346, + "learning_rate": 0.0001, + "loss": 1.2269, + "loss/crossentropy": 2.645461320877075, + "loss/hidden": 1.0625, + "loss/logits": 0.16411615908145905, + "loss/reg": 2.6220308427582495e-05, + "step": 1371 + }, + { + "epoch": 0.1715, + "grad_norm": 2.3114166259765625, + "grad_norm_var": 0.6924948794091439, + "learning_rate": 0.0001, + "loss": 1.0779, + "loss/crossentropy": 2.3120405673980713, + "loss/hidden": 0.9453125, + "loss/logits": 0.13236598670482635, + "loss/reg": 2.6214587705908343e-05, + "step": 1372 + }, + { + "epoch": 0.171625, + "grad_norm": 2.3938355445861816, + "grad_norm_var": 0.5843349511418201, + "learning_rate": 0.0001, + "loss": 1.4033, + "loss/crossentropy": 2.55775785446167, + "loss/hidden": 1.1875, + "loss/logits": 0.21555431187152863, + "loss/reg": 2.6206525944871828e-05, + "step": 1373 + }, + { + "epoch": 0.17175, + "grad_norm": 2.033670425415039, + "grad_norm_var": 0.5485876990022693, + "learning_rate": 0.0001, + "loss": 1.1458, + "loss/crossentropy": 2.4343767166137695, + "loss/hidden": 0.984375, + "loss/logits": 0.1611873209476471, + "loss/reg": 2.6202809749520384e-05, + "step": 1374 + }, + { + "epoch": 0.171875, + "grad_norm": 1.8415040969848633, + "grad_norm_var": 0.573819922807552, + "learning_rate": 0.0001, + "loss": 1.1715, + "loss/crossentropy": 2.6316497325897217, + "loss/hidden": 1.03125, + "loss/logits": 0.13998553156852722, + "loss/reg": 2.6196285034529865e-05, + "step": 1375 + }, + { + "epoch": 0.172, + "grad_norm": 2.325756549835205, + "grad_norm_var": 0.5289594396214928, + "learning_rate": 0.0001, + "loss": 1.1011, + "loss/crossentropy": 2.4344868659973145, + "loss/hidden": 0.96484375, + "loss/logits": 0.13596412539482117, + "loss/reg": 2.6189338313997723e-05, + "step": 1376 + }, + { + "epoch": 0.172125, + "grad_norm": 1.7061195373535156, + "grad_norm_var": 0.5596446079848683, + "learning_rate": 0.0001, + "loss": 1.1747, + "loss/crossentropy": 2.5702881813049316, + "loss/hidden": 1.0234375, + "loss/logits": 0.1509830355644226, + "loss/reg": 2.6182387955486774e-05, + "step": 1377 + }, + { + "epoch": 0.17225, + "grad_norm": 1.8262470960617065, + "grad_norm_var": 0.5891265314492993, + "learning_rate": 0.0001, + "loss": 1.0823, + "loss/crossentropy": 2.5284423828125, + "loss/hidden": 0.94921875, + "loss/logits": 0.13286352157592773, + "loss/reg": 2.6176339815719984e-05, + "step": 1378 + }, + { + "epoch": 0.172375, + "grad_norm": 1.923034429550171, + "grad_norm_var": 0.6101510865576267, + "learning_rate": 0.0001, + "loss": 1.239, + "loss/crossentropy": 2.7109146118164062, + "loss/hidden": 1.0859375, + "loss/logits": 0.15282438695430756, + "loss/reg": 2.6170004275627434e-05, + "step": 1379 + }, + { + "epoch": 0.1725, + "grad_norm": 3.0109708309173584, + "grad_norm_var": 0.6286140187444785, + "learning_rate": 0.0001, + "loss": 1.4178, + "loss/crossentropy": 2.3706412315368652, + "loss/hidden": 1.234375, + "loss/logits": 0.18320372700691223, + "loss/reg": 2.616273377498146e-05, + "step": 1380 + }, + { + "epoch": 0.172625, + "grad_norm": 2.3772876262664795, + "grad_norm_var": 0.6143231427328814, + "learning_rate": 0.0001, + "loss": 1.1551, + "loss/crossentropy": 2.638753890991211, + "loss/hidden": 0.99609375, + "loss/logits": 0.15875723958015442, + "loss/reg": 2.615546327433549e-05, + "step": 1381 + }, + { + "epoch": 0.17275, + "grad_norm": 2.1071722507476807, + "grad_norm_var": 0.6219651042979094, + "learning_rate": 0.0001, + "loss": 1.3109, + "loss/crossentropy": 2.4046380519866943, + "loss/hidden": 1.140625, + "loss/logits": 0.1700143963098526, + "loss/reg": 2.6145620722672902e-05, + "step": 1382 + }, + { + "epoch": 0.172875, + "grad_norm": 2.3622021675109863, + "grad_norm_var": 0.45661580640727945, + "learning_rate": 0.0001, + "loss": 1.3012, + "loss/crossentropy": 2.771113395690918, + "loss/hidden": 1.1171875, + "loss/logits": 0.1837317943572998, + "loss/reg": 2.6136000087717548e-05, + "step": 1383 + }, + { + "epoch": 0.173, + "grad_norm": 2.1810295581817627, + "grad_norm_var": 0.45192168341498157, + "learning_rate": 0.0001, + "loss": 1.1035, + "loss/crossentropy": 2.476903200149536, + "loss/hidden": 0.97265625, + "loss/logits": 0.13062769174575806, + "loss/reg": 2.612779280752875e-05, + "step": 1384 + }, + { + "epoch": 0.173125, + "grad_norm": 2.0933210849761963, + "grad_norm_var": 0.15544374593544788, + "learning_rate": 0.0001, + "loss": 1.1633, + "loss/crossentropy": 2.437777280807495, + "loss/hidden": 1.0078125, + "loss/logits": 0.15518274903297424, + "loss/reg": 2.6117859306395985e-05, + "step": 1385 + }, + { + "epoch": 0.17325, + "grad_norm": 2.44395112991333, + "grad_norm_var": 0.11697423888006421, + "learning_rate": 0.0001, + "loss": 1.2902, + "loss/crossentropy": 2.620086431503296, + "loss/hidden": 1.1328125, + "loss/logits": 0.1571187674999237, + "loss/reg": 2.6107893063453957e-05, + "step": 1386 + }, + { + "epoch": 0.173375, + "grad_norm": 1.9207756519317627, + "grad_norm_var": 0.10321710849889877, + "learning_rate": 0.0001, + "loss": 1.1086, + "loss/crossentropy": 2.6362812519073486, + "loss/hidden": 0.94921875, + "loss/logits": 0.159139484167099, + "loss/reg": 2.6095696739503182e-05, + "step": 1387 + }, + { + "epoch": 0.1735, + "grad_norm": 1.9902938604354858, + "grad_norm_var": 0.10397723105796122, + "learning_rate": 0.0001, + "loss": 1.1246, + "loss/crossentropy": 2.4900975227355957, + "loss/hidden": 0.984375, + "loss/logits": 0.13992765545845032, + "loss/reg": 2.6083762350026518e-05, + "step": 1388 + }, + { + "epoch": 0.173625, + "grad_norm": 2.161506414413452, + "grad_norm_var": 0.10006301105975482, + "learning_rate": 0.0001, + "loss": 1.5207, + "loss/crossentropy": 2.4960265159606934, + "loss/hidden": 1.28125, + "loss/logits": 0.23919013142585754, + "loss/reg": 2.6072422770084813e-05, + "step": 1389 + }, + { + "epoch": 0.17375, + "grad_norm": 2.4101386070251465, + "grad_norm_var": 0.10338030893808323, + "learning_rate": 0.0001, + "loss": 1.2348, + "loss/crossentropy": 2.4889702796936035, + "loss/hidden": 1.0703125, + "loss/logits": 0.16426759958267212, + "loss/reg": 2.606646012281999e-05, + "step": 1390 + }, + { + "epoch": 0.173875, + "grad_norm": 2.121083974838257, + "grad_norm_var": 0.09611030890281995, + "learning_rate": 0.0001, + "loss": 1.1486, + "loss/crossentropy": 2.6603963375091553, + "loss/hidden": 0.98828125, + "loss/logits": 0.1600644737482071, + "loss/reg": 2.6058891307911836e-05, + "step": 1391 + }, + { + "epoch": 0.174, + "grad_norm": 1.6075880527496338, + "grad_norm_var": 0.11487275833504379, + "learning_rate": 0.0001, + "loss": 1.0879, + "loss/crossentropy": 2.4910590648651123, + "loss/hidden": 0.94921875, + "loss/logits": 0.13843700289726257, + "loss/reg": 2.6046389393741265e-05, + "step": 1392 + }, + { + "epoch": 0.174125, + "grad_norm": 2.131148338317871, + "grad_norm_var": 0.101565483194927, + "learning_rate": 0.0001, + "loss": 1.1899, + "loss/crossentropy": 2.6006202697753906, + "loss/hidden": 1.03125, + "loss/logits": 0.1583990603685379, + "loss/reg": 2.6033550966531038e-05, + "step": 1393 + }, + { + "epoch": 0.17425, + "grad_norm": 1.6584205627441406, + "grad_norm_var": 0.11094488373214322, + "learning_rate": 0.0001, + "loss": 1.134, + "loss/crossentropy": 2.5571131706237793, + "loss/hidden": 1.0, + "loss/logits": 0.13371092081069946, + "loss/reg": 2.6022720703622326e-05, + "step": 1394 + }, + { + "epoch": 0.174375, + "grad_norm": 2.3416531085968018, + "grad_norm_var": 0.10888062766998932, + "learning_rate": 0.0001, + "loss": 1.3829, + "loss/crossentropy": 2.5052099227905273, + "loss/hidden": 1.1796875, + "loss/logits": 0.20295186340808868, + "loss/reg": 2.6009753128164448e-05, + "step": 1395 + }, + { + "epoch": 0.1745, + "grad_norm": 1.8771302700042725, + "grad_norm_var": 0.06396933657835806, + "learning_rate": 0.0001, + "loss": 1.1255, + "loss/crossentropy": 2.4224088191986084, + "loss/hidden": 0.98828125, + "loss/logits": 0.13692684471607208, + "loss/reg": 2.6001749574788846e-05, + "step": 1396 + }, + { + "epoch": 0.174625, + "grad_norm": 5.879499912261963, + "grad_norm_var": 0.9546546357815344, + "learning_rate": 0.0001, + "loss": 1.2573, + "loss/crossentropy": 2.4596617221832275, + "loss/hidden": 1.109375, + "loss/logits": 0.14763441681861877, + "loss/reg": 2.5993622330133803e-05, + "step": 1397 + }, + { + "epoch": 0.17475, + "grad_norm": 1.9581843614578247, + "grad_norm_var": 0.9604770428919646, + "learning_rate": 0.0001, + "loss": 1.0675, + "loss/crossentropy": 2.413883686065674, + "loss/hidden": 0.93359375, + "loss/logits": 0.13364389538764954, + "loss/reg": 2.5986768378061242e-05, + "step": 1398 + }, + { + "epoch": 0.174875, + "grad_norm": 2.021740436553955, + "grad_norm_var": 0.9658567790180522, + "learning_rate": 0.0001, + "loss": 1.2209, + "loss/crossentropy": 2.3972346782684326, + "loss/hidden": 1.0625, + "loss/logits": 0.15818513929843903, + "loss/reg": 2.5978755729738623e-05, + "step": 1399 + }, + { + "epoch": 0.175, + "grad_norm": 1.993298053741455, + "grad_norm_var": 0.9710334418943506, + "learning_rate": 0.0001, + "loss": 1.0716, + "loss/crossentropy": 2.9447786808013916, + "loss/hidden": 0.9375, + "loss/logits": 0.13379423320293427, + "loss/reg": 2.5968713089241646e-05, + "step": 1400 + }, + { + "epoch": 0.175125, + "grad_norm": 1.7662354707717896, + "grad_norm_var": 0.9862149532285541, + "learning_rate": 0.0001, + "loss": 1.2191, + "loss/crossentropy": 2.283414602279663, + "loss/hidden": 1.0625, + "loss/logits": 0.1563636213541031, + "loss/reg": 2.5960520360968076e-05, + "step": 1401 + }, + { + "epoch": 0.17525, + "grad_norm": 2.0254478454589844, + "grad_norm_var": 0.9873247010403857, + "learning_rate": 0.0001, + "loss": 1.2597, + "loss/crossentropy": 2.5163142681121826, + "loss/hidden": 1.1015625, + "loss/logits": 0.15783895552158356, + "loss/reg": 2.5953582735382952e-05, + "step": 1402 + }, + { + "epoch": 0.175375, + "grad_norm": 2.208174705505371, + "grad_norm_var": 0.9801966259089866, + "learning_rate": 0.0001, + "loss": 1.1855, + "loss/crossentropy": 2.9131150245666504, + "loss/hidden": 1.0234375, + "loss/logits": 0.16176986694335938, + "loss/reg": 2.5946499590645544e-05, + "step": 1403 + }, + { + "epoch": 0.1755, + "grad_norm": 4.003154277801514, + "grad_norm_var": 1.1611797987785821, + "learning_rate": 0.0001, + "loss": 1.2407, + "loss/crossentropy": 2.9394659996032715, + "loss/hidden": 1.0625, + "loss/logits": 0.17795130610466003, + "loss/reg": 2.593876706669107e-05, + "step": 1404 + }, + { + "epoch": 0.175625, + "grad_norm": 2.228199005126953, + "grad_norm_var": 1.1594679626319284, + "learning_rate": 0.0001, + "loss": 1.2398, + "loss/crossentropy": 2.490885019302368, + "loss/hidden": 1.0625, + "loss/logits": 0.17708484828472137, + "loss/reg": 2.5932813514373265e-05, + "step": 1405 + }, + { + "epoch": 0.17575, + "grad_norm": 3.2858388423919678, + "grad_norm_var": 1.2098124981933647, + "learning_rate": 0.0001, + "loss": 1.1244, + "loss/crossentropy": 2.6443049907684326, + "loss/hidden": 0.98046875, + "loss/logits": 0.14362701773643494, + "loss/reg": 2.592829696368426e-05, + "step": 1406 + }, + { + "epoch": 0.175875, + "grad_norm": 2.34966778755188, + "grad_norm_var": 1.2032310463387499, + "learning_rate": 0.0001, + "loss": 1.2881, + "loss/crossentropy": 2.401207208633423, + "loss/hidden": 1.1015625, + "loss/logits": 0.18629170954227448, + "loss/reg": 2.5925079171429388e-05, + "step": 1407 + }, + { + "epoch": 0.176, + "grad_norm": 1.9315677881240845, + "grad_norm_var": 1.1730357997591387, + "learning_rate": 0.0001, + "loss": 1.1709, + "loss/crossentropy": 2.637568712234497, + "loss/hidden": 1.0234375, + "loss/logits": 0.14718171954154968, + "loss/reg": 2.5920739062712528e-05, + "step": 1408 + }, + { + "epoch": 0.176125, + "grad_norm": 1.865646243095398, + "grad_norm_var": 1.1897452915347209, + "learning_rate": 0.0001, + "loss": 1.2278, + "loss/crossentropy": 2.7019155025482178, + "loss/hidden": 1.0625, + "loss/logits": 0.16503757238388062, + "loss/reg": 2.5917377570294775e-05, + "step": 1409 + }, + { + "epoch": 0.17625, + "grad_norm": 1.8569799661636353, + "grad_norm_var": 1.170931897034036, + "learning_rate": 0.0001, + "loss": 1.1045, + "loss/crossentropy": 2.042006254196167, + "loss/hidden": 0.9609375, + "loss/logits": 0.14331603050231934, + "loss/reg": 2.5916371669154614e-05, + "step": 1410 + }, + { + "epoch": 0.176375, + "grad_norm": 4.393603801727295, + "grad_norm_var": 1.3977350649506077, + "learning_rate": 0.0001, + "loss": 1.5698, + "loss/crossentropy": 2.2271413803100586, + "loss/hidden": 1.3671875, + "loss/logits": 0.20231911540031433, + "loss/reg": 2.5909093892551027e-05, + "step": 1411 + }, + { + "epoch": 0.1765, + "grad_norm": 2.2757301330566406, + "grad_norm_var": 1.3690996990368574, + "learning_rate": 0.0001, + "loss": 1.1363, + "loss/crossentropy": 2.434222936630249, + "loss/hidden": 0.96875, + "loss/logits": 0.1672743260860443, + "loss/reg": 2.590236545074731e-05, + "step": 1412 + }, + { + "epoch": 0.176625, + "grad_norm": 2.309377670288086, + "grad_norm_var": 0.6177938578866241, + "learning_rate": 0.0001, + "loss": 1.1849, + "loss/crossentropy": 2.560271739959717, + "loss/hidden": 1.03125, + "loss/logits": 0.15339678525924683, + "loss/reg": 2.589613723102957e-05, + "step": 1413 + }, + { + "epoch": 0.17675, + "grad_norm": 2.012202739715576, + "grad_norm_var": 0.6147612846916973, + "learning_rate": 0.0001, + "loss": 1.2896, + "loss/crossentropy": 2.43329119682312, + "loss/hidden": 1.1171875, + "loss/logits": 0.17218977212905884, + "loss/reg": 2.589081304904539e-05, + "step": 1414 + }, + { + "epoch": 0.176875, + "grad_norm": 1.690306305885315, + "grad_norm_var": 0.6386929660193961, + "learning_rate": 0.0001, + "loss": 1.1426, + "loss/crossentropy": 2.5350229740142822, + "loss/hidden": 0.98828125, + "loss/logits": 0.15402650833129883, + "loss/reg": 2.5883180569508113e-05, + "step": 1415 + }, + { + "epoch": 0.177, + "grad_norm": 4.526970386505127, + "grad_norm_var": 0.9068374360827087, + "learning_rate": 0.0001, + "loss": 1.775, + "loss/crossentropy": 2.8256900310516357, + "loss/hidden": 1.4765625, + "loss/logits": 0.2982058525085449, + "loss/reg": 2.5873918275465257e-05, + "step": 1416 + }, + { + "epoch": 0.177125, + "grad_norm": 5.4454426765441895, + "grad_norm_var": 1.3705622167678064, + "learning_rate": 0.0001, + "loss": 1.4805, + "loss/crossentropy": 2.6808760166168213, + "loss/hidden": 1.171875, + "loss/logits": 0.3083723187446594, + "loss/reg": 2.5865858333418146e-05, + "step": 1417 + }, + { + "epoch": 0.17725, + "grad_norm": 2.271430253982544, + "grad_norm_var": 1.3497433386371152, + "learning_rate": 0.0001, + "loss": 1.21, + "loss/crossentropy": 2.8046674728393555, + "loss/hidden": 1.0390625, + "loss/logits": 0.17070798575878143, + "loss/reg": 2.5857239961624146e-05, + "step": 1418 + }, + { + "epoch": 0.177375, + "grad_norm": 2.094118118286133, + "grad_norm_var": 1.3594181142256114, + "learning_rate": 0.0001, + "loss": 1.2936, + "loss/crossentropy": 2.497432231903076, + "loss/hidden": 1.125, + "loss/logits": 0.16831550002098083, + "loss/reg": 2.584878347988706e-05, + "step": 1419 + }, + { + "epoch": 0.1775, + "grad_norm": 2.007697582244873, + "grad_norm_var": 1.2838517117875492, + "learning_rate": 0.0001, + "loss": 1.2004, + "loss/crossentropy": 2.2927417755126953, + "loss/hidden": 1.0390625, + "loss/logits": 0.16103318333625793, + "loss/reg": 2.58402887993725e-05, + "step": 1420 + }, + { + "epoch": 0.177625, + "grad_norm": 2.122683525085449, + "grad_norm_var": 1.2906090649764783, + "learning_rate": 0.0001, + "loss": 1.3965, + "loss/crossentropy": 2.666346549987793, + "loss/hidden": 1.203125, + "loss/logits": 0.19306930899620056, + "loss/reg": 2.583224886620883e-05, + "step": 1421 + }, + { + "epoch": 0.17775, + "grad_norm": 4.172621250152588, + "grad_norm_var": 1.4146479442981934, + "learning_rate": 0.0001, + "loss": 1.5344, + "loss/crossentropy": 2.4803218841552734, + "loss/hidden": 1.3359375, + "loss/logits": 0.198216512799263, + "loss/reg": 2.5824783733696677e-05, + "step": 1422 + }, + { + "epoch": 0.177875, + "grad_norm": 2.5568783283233643, + "grad_norm_var": 1.407434802792723, + "learning_rate": 0.0001, + "loss": 1.3095, + "loss/crossentropy": 2.624493360519409, + "loss/hidden": 1.109375, + "loss/logits": 0.19990001618862152, + "loss/reg": 2.581939406809397e-05, + "step": 1423 + }, + { + "epoch": 0.178, + "grad_norm": 2.2513160705566406, + "grad_norm_var": 1.3801761017320904, + "learning_rate": 0.0001, + "loss": 1.3443, + "loss/crossentropy": 2.269963026046753, + "loss/hidden": 1.1640625, + "loss/logits": 0.17994387447834015, + "loss/reg": 2.581306398496963e-05, + "step": 1424 + }, + { + "epoch": 0.178125, + "grad_norm": 2.2689077854156494, + "grad_norm_var": 1.3432837074529382, + "learning_rate": 0.0001, + "loss": 1.2212, + "loss/crossentropy": 2.748401641845703, + "loss/hidden": 1.0625, + "loss/logits": 0.15845412015914917, + "loss/reg": 2.5804667529882863e-05, + "step": 1425 + }, + { + "epoch": 0.17825, + "grad_norm": 2.8776655197143555, + "grad_norm_var": 1.2846840618098379, + "learning_rate": 0.0001, + "loss": 1.2449, + "loss/crossentropy": 2.6518797874450684, + "loss/hidden": 1.09375, + "loss/logits": 0.15091879665851593, + "loss/reg": 2.5797799025895074e-05, + "step": 1426 + }, + { + "epoch": 0.178375, + "grad_norm": 2.3662192821502686, + "grad_norm_var": 1.1188554158070525, + "learning_rate": 0.0001, + "loss": 1.2269, + "loss/crossentropy": 2.6787312030792236, + "loss/hidden": 1.0625, + "loss/logits": 0.16410867869853973, + "loss/reg": 2.579004103608895e-05, + "step": 1427 + }, + { + "epoch": 0.1785, + "grad_norm": 1.9026354551315308, + "grad_norm_var": 1.1488152156250417, + "learning_rate": 0.0001, + "loss": 1.2462, + "loss/crossentropy": 2.4991490840911865, + "loss/hidden": 1.078125, + "loss/logits": 0.16783137619495392, + "loss/reg": 2.5781922886380926e-05, + "step": 1428 + }, + { + "epoch": 0.178625, + "grad_norm": 1.8243776559829712, + "grad_norm_var": 1.187469435192988, + "learning_rate": 0.0001, + "loss": 1.1097, + "loss/crossentropy": 2.5217294692993164, + "loss/hidden": 0.953125, + "loss/logits": 0.15634295344352722, + "loss/reg": 2.5774059395189397e-05, + "step": 1429 + }, + { + "epoch": 0.17875, + "grad_norm": 2.35154390335083, + "grad_norm_var": 1.1658331263802144, + "learning_rate": 0.0001, + "loss": 1.2323, + "loss/crossentropy": 2.7699735164642334, + "loss/hidden": 1.0703125, + "loss/logits": 0.1617283821105957, + "loss/reg": 2.576728002168238e-05, + "step": 1430 + }, + { + "epoch": 0.178875, + "grad_norm": 2.27311635017395, + "grad_norm_var": 1.1108797833646196, + "learning_rate": 0.0001, + "loss": 1.2561, + "loss/crossentropy": 2.5132880210876465, + "loss/hidden": 1.0859375, + "loss/logits": 0.1699417531490326, + "loss/reg": 2.5762588848010637e-05, + "step": 1431 + }, + { + "epoch": 0.179, + "grad_norm": 2.558253526687622, + "grad_norm_var": 0.875412624084304, + "learning_rate": 0.0001, + "loss": 1.2722, + "loss/crossentropy": 2.4209253787994385, + "loss/hidden": 1.1171875, + "loss/logits": 0.1547500491142273, + "loss/reg": 2.5753201043698937e-05, + "step": 1432 + }, + { + "epoch": 0.179125, + "grad_norm": 1.9234302043914795, + "grad_norm_var": 0.3069867544807624, + "learning_rate": 0.0001, + "loss": 1.1575, + "loss/crossentropy": 2.6482443809509277, + "loss/hidden": 1.0078125, + "loss/logits": 0.14946459233760834, + "loss/reg": 2.574502104835119e-05, + "step": 1433 + }, + { + "epoch": 0.17925, + "grad_norm": 2.313383102416992, + "grad_norm_var": 0.3065793348524276, + "learning_rate": 0.0001, + "loss": 1.2782, + "loss/crossentropy": 2.7689170837402344, + "loss/hidden": 1.1015625, + "loss/logits": 0.17633205652236938, + "loss/reg": 2.5740164346643724e-05, + "step": 1434 + }, + { + "epoch": 0.179375, + "grad_norm": 3.7249557971954346, + "grad_norm_var": 0.41356670105355775, + "learning_rate": 0.0001, + "loss": 1.4068, + "loss/crossentropy": 2.6547913551330566, + "loss/hidden": 1.1796875, + "loss/logits": 0.22690324485301971, + "loss/reg": 2.5730874767759815e-05, + "step": 1435 + }, + { + "epoch": 0.1795, + "grad_norm": 1.9937807321548462, + "grad_norm_var": 0.41443382523678446, + "learning_rate": 0.0001, + "loss": 1.1834, + "loss/crossentropy": 2.209207773208618, + "loss/hidden": 1.0390625, + "loss/logits": 0.14412438869476318, + "loss/reg": 2.572069024608936e-05, + "step": 1436 + }, + { + "epoch": 0.179625, + "grad_norm": 2.286607027053833, + "grad_norm_var": 0.4085743717048728, + "learning_rate": 0.0001, + "loss": 1.2185, + "loss/crossentropy": 2.385519504547119, + "loss/hidden": 1.0625, + "loss/logits": 0.155765101313591, + "loss/reg": 2.571181903476827e-05, + "step": 1437 + }, + { + "epoch": 0.17975, + "grad_norm": 1.7383593320846558, + "grad_norm_var": 0.22885923210213738, + "learning_rate": 0.0001, + "loss": 1.0513, + "loss/crossentropy": 2.4355742931365967, + "loss/hidden": 0.92578125, + "loss/logits": 0.1253068745136261, + "loss/reg": 2.5706225642352365e-05, + "step": 1438 + }, + { + "epoch": 0.179875, + "grad_norm": 2.2149345874786377, + "grad_norm_var": 0.22562773516875628, + "learning_rate": 0.0001, + "loss": 1.1391, + "loss/crossentropy": 2.352376699447632, + "loss/hidden": 0.984375, + "loss/logits": 0.1545073390007019, + "loss/reg": 2.570123251643963e-05, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 3.050992727279663, + "grad_norm_var": 0.25994149880425516, + "learning_rate": 0.0001, + "loss": 1.461, + "loss/crossentropy": 2.428091049194336, + "loss/hidden": 1.2578125, + "loss/logits": 0.20295487344264984, + "loss/reg": 2.569424941611942e-05, + "step": 1440 + }, + { + "epoch": 0.180125, + "grad_norm": 3.2958126068115234, + "grad_norm_var": 0.31415478011155834, + "learning_rate": 0.0001, + "loss": 1.7639, + "loss/crossentropy": 2.45920991897583, + "loss/hidden": 1.4296875, + "loss/logits": 0.333952397108078, + "loss/reg": 2.5688686946523376e-05, + "step": 1441 + }, + { + "epoch": 0.18025, + "grad_norm": 2.06813645362854, + "grad_norm_var": 0.30555277478527476, + "learning_rate": 0.0001, + "loss": 1.0717, + "loss/crossentropy": 2.707188367843628, + "loss/hidden": 0.9453125, + "loss/logits": 0.12614969909191132, + "loss/reg": 2.5683581043267623e-05, + "step": 1442 + }, + { + "epoch": 0.180375, + "grad_norm": 1.6563732624053955, + "grad_norm_var": 0.3372052546934353, + "learning_rate": 0.0001, + "loss": 1.1144, + "loss/crossentropy": 2.4765775203704834, + "loss/hidden": 0.96484375, + "loss/logits": 0.1492757499217987, + "loss/reg": 2.5674558855826035e-05, + "step": 1443 + }, + { + "epoch": 0.1805, + "grad_norm": 2.182406425476074, + "grad_norm_var": 0.3263962026192984, + "learning_rate": 0.0001, + "loss": 1.3068, + "loss/crossentropy": 1.9676281213760376, + "loss/hidden": 1.1640625, + "loss/logits": 0.14251422882080078, + "loss/reg": 2.5668443413451314e-05, + "step": 1444 + }, + { + "epoch": 0.180625, + "grad_norm": 2.164161443710327, + "grad_norm_var": 0.310205383970335, + "learning_rate": 0.0001, + "loss": 1.3421, + "loss/crossentropy": 1.9245415925979614, + "loss/hidden": 1.1484375, + "loss/logits": 0.1933591067790985, + "loss/reg": 2.5662753614597023e-05, + "step": 1445 + }, + { + "epoch": 0.18075, + "grad_norm": 1.7898859977722168, + "grad_norm_var": 0.33072452266422375, + "learning_rate": 0.0001, + "loss": 1.0813, + "loss/crossentropy": 2.5171167850494385, + "loss/hidden": 0.93359375, + "loss/logits": 0.14747771620750427, + "loss/reg": 2.5654762794147246e-05, + "step": 1446 + }, + { + "epoch": 0.180875, + "grad_norm": 2.0774154663085938, + "grad_norm_var": 0.3345284334841852, + "learning_rate": 0.0001, + "loss": 1.1149, + "loss/crossentropy": 2.6739940643310547, + "loss/hidden": 0.96484375, + "loss/logits": 0.14980116486549377, + "loss/reg": 2.564600799814798e-05, + "step": 1447 + }, + { + "epoch": 0.181, + "grad_norm": 2.466822385787964, + "grad_norm_var": 0.33208460543251755, + "learning_rate": 0.0001, + "loss": 1.3337, + "loss/crossentropy": 2.4361507892608643, + "loss/hidden": 1.140625, + "loss/logits": 0.1928482949733734, + "loss/reg": 2.563859561632853e-05, + "step": 1448 + }, + { + "epoch": 0.181125, + "grad_norm": 2.334491014480591, + "grad_norm_var": 0.32150109741056454, + "learning_rate": 0.0001, + "loss": 1.1542, + "loss/crossentropy": 2.357128143310547, + "loss/hidden": 1.015625, + "loss/logits": 0.13835731148719788, + "loss/reg": 2.562982263043523e-05, + "step": 1449 + }, + { + "epoch": 0.18125, + "grad_norm": 2.4722788333892822, + "grad_norm_var": 0.32262307242698207, + "learning_rate": 0.0001, + "loss": 1.3029, + "loss/crossentropy": 2.3831655979156494, + "loss/hidden": 1.1171875, + "loss/logits": 0.18548110127449036, + "loss/reg": 2.5621100576245226e-05, + "step": 1450 + }, + { + "epoch": 0.181375, + "grad_norm": 2.492461681365967, + "grad_norm_var": 0.19076496887820607, + "learning_rate": 0.0001, + "loss": 1.2473, + "loss/crossentropy": 2.387707233428955, + "loss/hidden": 1.0859375, + "loss/logits": 0.16109108924865723, + "loss/reg": 2.5610464945202693e-05, + "step": 1451 + }, + { + "epoch": 0.1815, + "grad_norm": 2.131697416305542, + "grad_norm_var": 0.18691473371457093, + "learning_rate": 0.0001, + "loss": 1.1926, + "loss/crossentropy": 2.512455701828003, + "loss/hidden": 1.0390625, + "loss/logits": 0.15326723456382751, + "loss/reg": 2.56019011430908e-05, + "step": 1452 + }, + { + "epoch": 0.181625, + "grad_norm": 2.5552515983581543, + "grad_norm_var": 0.19178998358541383, + "learning_rate": 0.0001, + "loss": 1.3558, + "loss/crossentropy": 2.5649163722991943, + "loss/hidden": 1.15625, + "loss/logits": 0.19930413365364075, + "loss/reg": 2.5591183657525107e-05, + "step": 1453 + }, + { + "epoch": 0.18175, + "grad_norm": 2.419811725616455, + "grad_norm_var": 0.17039897166327833, + "learning_rate": 0.0001, + "loss": 1.202, + "loss/crossentropy": 2.609708309173584, + "loss/hidden": 1.03125, + "loss/logits": 0.17048929631710052, + "loss/reg": 2.5584980903659016e-05, + "step": 1454 + }, + { + "epoch": 0.181875, + "grad_norm": 2.15309476852417, + "grad_norm_var": 0.17163462342270938, + "learning_rate": 0.0001, + "loss": 1.1573, + "loss/crossentropy": 2.696734666824341, + "loss/hidden": 0.99609375, + "loss/logits": 0.16092917323112488, + "loss/reg": 2.5579236535122618e-05, + "step": 1455 + }, + { + "epoch": 0.182, + "grad_norm": 1.8917382955551147, + "grad_norm_var": 0.144485062467919, + "learning_rate": 0.0001, + "loss": 1.1949, + "loss/crossentropy": 2.8072500228881836, + "loss/hidden": 1.0234375, + "loss/logits": 0.1711609959602356, + "loss/reg": 2.5570438083377667e-05, + "step": 1456 + }, + { + "epoch": 0.182125, + "grad_norm": 3.081989049911499, + "grad_norm_var": 0.11779723533006171, + "learning_rate": 0.0001, + "loss": 1.6155, + "loss/crossentropy": 2.616046190261841, + "loss/hidden": 1.3359375, + "loss/logits": 0.2792993187904358, + "loss/reg": 2.5565803298377432e-05, + "step": 1457 + }, + { + "epoch": 0.18225, + "grad_norm": 2.022599935531616, + "grad_norm_var": 0.11900750355950288, + "learning_rate": 0.0001, + "loss": 1.2701, + "loss/crossentropy": 2.6289854049682617, + "loss/hidden": 1.1015625, + "loss/logits": 0.16826939582824707, + "loss/reg": 2.5558520064805634e-05, + "step": 1458 + }, + { + "epoch": 0.182375, + "grad_norm": 2.773674488067627, + "grad_norm_var": 0.10959658790807646, + "learning_rate": 0.0001, + "loss": 1.267, + "loss/crossentropy": 2.5791869163513184, + "loss/hidden": 1.078125, + "loss/logits": 0.18863606452941895, + "loss/reg": 2.5548339181113988e-05, + "step": 1459 + }, + { + "epoch": 0.1825, + "grad_norm": 2.4866325855255127, + "grad_norm_var": 0.11007934027081484, + "learning_rate": 0.0001, + "loss": 1.1262, + "loss/crossentropy": 2.7668955326080322, + "loss/hidden": 0.96875, + "loss/logits": 0.15719452500343323, + "loss/reg": 2.5538258341839537e-05, + "step": 1460 + }, + { + "epoch": 0.182625, + "grad_norm": 2.6052374839782715, + "grad_norm_var": 0.11236061023356421, + "learning_rate": 0.0001, + "loss": 1.2976, + "loss/crossentropy": 2.490511655807495, + "loss/hidden": 1.1171875, + "loss/logits": 0.18015122413635254, + "loss/reg": 2.5533803636790253e-05, + "step": 1461 + }, + { + "epoch": 0.18275, + "grad_norm": 1.9814751148223877, + "grad_norm_var": 0.10009892528778573, + "learning_rate": 0.0001, + "loss": 1.3016, + "loss/crossentropy": 2.491091728210449, + "loss/hidden": 1.109375, + "loss/logits": 0.19195450842380524, + "loss/reg": 2.55266968451906e-05, + "step": 1462 + }, + { + "epoch": 0.182875, + "grad_norm": 2.4323647022247314, + "grad_norm_var": 0.09404732148199878, + "learning_rate": 0.0001, + "loss": 1.3769, + "loss/crossentropy": 2.504441976547241, + "loss/hidden": 1.15625, + "loss/logits": 0.22034478187561035, + "loss/reg": 2.5521509087411687e-05, + "step": 1463 + }, + { + "epoch": 0.183, + "grad_norm": 2.1816508769989014, + "grad_norm_var": 0.09635542653419353, + "learning_rate": 0.0001, + "loss": 1.178, + "loss/crossentropy": 2.494457483291626, + "loss/hidden": 1.0078125, + "loss/logits": 0.16996243596076965, + "loss/reg": 2.5518480470054783e-05, + "step": 1464 + }, + { + "epoch": 0.183125, + "grad_norm": 1.7302703857421875, + "grad_norm_var": 0.12251942875563812, + "learning_rate": 0.0001, + "loss": 1.1627, + "loss/crossentropy": 2.4096591472625732, + "loss/hidden": 1.0078125, + "loss/logits": 0.15461833775043488, + "loss/reg": 2.551410398154985e-05, + "step": 1465 + }, + { + "epoch": 0.18325, + "grad_norm": 1.8543156385421753, + "grad_norm_var": 0.135344696478514, + "learning_rate": 0.0001, + "loss": 1.1122, + "loss/crossentropy": 2.498844861984253, + "loss/hidden": 0.96484375, + "loss/logits": 0.1470860242843628, + "loss/reg": 2.551029683672823e-05, + "step": 1466 + }, + { + "epoch": 0.183375, + "grad_norm": 2.0110981464385986, + "grad_norm_var": 0.13745108456965316, + "learning_rate": 0.0001, + "loss": 1.2425, + "loss/crossentropy": 2.1072216033935547, + "loss/hidden": 1.0625, + "loss/logits": 0.17970889806747437, + "loss/reg": 2.5503086362732574e-05, + "step": 1467 + }, + { + "epoch": 0.1835, + "grad_norm": 21.95742416381836, + "grad_norm_var": 24.339245576907846, + "learning_rate": 0.0001, + "loss": 1.3599, + "loss/crossentropy": 2.542722702026367, + "loss/hidden": 1.1875, + "loss/logits": 0.1721649169921875, + "loss/reg": 2.5499815819785e-05, + "step": 1468 + }, + { + "epoch": 0.183625, + "grad_norm": 2.1019513607025146, + "grad_norm_var": 24.40971244050053, + "learning_rate": 0.0001, + "loss": 1.3006, + "loss/crossentropy": 2.466510057449341, + "loss/hidden": 1.1328125, + "loss/logits": 0.1675121784210205, + "loss/reg": 2.549572309362702e-05, + "step": 1469 + }, + { + "epoch": 0.18375, + "grad_norm": 5.368893146514893, + "grad_norm_var": 24.536271521216303, + "learning_rate": 0.0001, + "loss": 1.4063, + "loss/crossentropy": 3.0288920402526855, + "loss/hidden": 1.2421875, + "loss/logits": 0.16387341916561127, + "loss/reg": 2.5490704501862638e-05, + "step": 1470 + }, + { + "epoch": 0.183875, + "grad_norm": 1.9355922937393188, + "grad_norm_var": 24.583063847991777, + "learning_rate": 0.0001, + "loss": 1.153, + "loss/crossentropy": 2.525538682937622, + "loss/hidden": 1.0078125, + "loss/logits": 0.14489027857780457, + "loss/reg": 2.5484558136668056e-05, + "step": 1471 + }, + { + "epoch": 0.184, + "grad_norm": 2.061171770095825, + "grad_norm_var": 24.54511308804579, + "learning_rate": 0.0001, + "loss": 1.1959, + "loss/crossentropy": 2.6838600635528564, + "loss/hidden": 1.046875, + "loss/logits": 0.14881327748298645, + "loss/reg": 2.5476831069681793e-05, + "step": 1472 + }, + { + "epoch": 0.184125, + "grad_norm": 2.119288206100464, + "grad_norm_var": 24.67744251779941, + "learning_rate": 0.0001, + "loss": 1.0942, + "loss/crossentropy": 2.37785005569458, + "loss/hidden": 0.96875, + "loss/logits": 0.1251501739025116, + "loss/reg": 2.5469844331382774e-05, + "step": 1473 + }, + { + "epoch": 0.18425, + "grad_norm": 2.2518277168273926, + "grad_norm_var": 24.632470256405476, + "learning_rate": 0.0001, + "loss": 1.1319, + "loss/crossentropy": 2.816399097442627, + "loss/hidden": 0.984375, + "loss/logits": 0.14727868139743805, + "loss/reg": 2.546577889006585e-05, + "step": 1474 + }, + { + "epoch": 0.184375, + "grad_norm": 1.8676663637161255, + "grad_norm_var": 24.785503614573322, + "learning_rate": 0.0001, + "loss": 1.0958, + "loss/crossentropy": 2.5837602615356445, + "loss/hidden": 0.95703125, + "loss/logits": 0.13850995898246765, + "loss/reg": 2.5458113668719307e-05, + "step": 1475 + }, + { + "epoch": 0.1845, + "grad_norm": 2.119408369064331, + "grad_norm_var": 24.846447289325923, + "learning_rate": 0.0001, + "loss": 1.1609, + "loss/crossentropy": 2.2676808834075928, + "loss/hidden": 1.0078125, + "loss/logits": 0.15280012786388397, + "loss/reg": 2.5450208340771496e-05, + "step": 1476 + }, + { + "epoch": 0.184625, + "grad_norm": 2.209832191467285, + "grad_norm_var": 24.90530130998557, + "learning_rate": 0.0001, + "loss": 1.1264, + "loss/crossentropy": 2.879619836807251, + "loss/hidden": 0.98046875, + "loss/logits": 0.14570090174674988, + "loss/reg": 2.5441964680794626e-05, + "step": 1477 + }, + { + "epoch": 0.18475, + "grad_norm": 2.3415815830230713, + "grad_norm_var": 24.839942495863436, + "learning_rate": 0.0001, + "loss": 1.2596, + "loss/crossentropy": 2.2408559322357178, + "loss/hidden": 1.1015625, + "loss/logits": 0.1578315645456314, + "loss/reg": 2.5431700123590417e-05, + "step": 1478 + }, + { + "epoch": 0.184875, + "grad_norm": 3.094979763031006, + "grad_norm_var": 24.7700537867713, + "learning_rate": 0.0001, + "loss": 1.2469, + "loss/crossentropy": 2.7882633209228516, + "loss/hidden": 1.0859375, + "loss/logits": 0.16074952483177185, + "loss/reg": 2.5424664272577502e-05, + "step": 1479 + }, + { + "epoch": 0.185, + "grad_norm": 2.47113037109375, + "grad_norm_var": 24.721494948348255, + "learning_rate": 0.0001, + "loss": 1.1913, + "loss/crossentropy": 2.553076982498169, + "loss/hidden": 1.03125, + "loss/logits": 0.15983465313911438, + "loss/reg": 2.541463982197456e-05, + "step": 1480 + }, + { + "epoch": 0.185125, + "grad_norm": 2.177408218383789, + "grad_norm_var": 24.62290637885572, + "learning_rate": 0.0001, + "loss": 1.3268, + "loss/crossentropy": 2.863682270050049, + "loss/hidden": 1.1484375, + "loss/logits": 0.17812541127204895, + "loss/reg": 2.5403636755072512e-05, + "step": 1481 + }, + { + "epoch": 0.18525, + "grad_norm": 1.8120688199996948, + "grad_norm_var": 24.632972165770507, + "learning_rate": 0.0001, + "loss": 1.3315, + "loss/crossentropy": 2.166480779647827, + "loss/hidden": 1.140625, + "loss/logits": 0.19066983461380005, + "loss/reg": 2.5392428142367862e-05, + "step": 1482 + }, + { + "epoch": 0.185375, + "grad_norm": 2.0231502056121826, + "grad_norm_var": 24.63039770917057, + "learning_rate": 0.0001, + "loss": 1.4177, + "loss/crossentropy": 2.3042335510253906, + "loss/hidden": 1.2109375, + "loss/logits": 0.20655736327171326, + "loss/reg": 2.5382934836670756e-05, + "step": 1483 + }, + { + "epoch": 0.1855, + "grad_norm": 2.104567527770996, + "grad_norm_var": 0.722762645527663, + "learning_rate": 0.0001, + "loss": 1.1221, + "loss/crossentropy": 2.4812278747558594, + "loss/hidden": 0.95703125, + "loss/logits": 0.16483670473098755, + "loss/reg": 2.5371986339450814e-05, + "step": 1484 + }, + { + "epoch": 0.185625, + "grad_norm": 1.8099244832992554, + "grad_norm_var": 0.7388715725952141, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.5045602321624756, + "loss/hidden": 0.9921875, + "loss/logits": 0.14815980195999146, + "loss/reg": 2.5362240194226615e-05, + "step": 1485 + }, + { + "epoch": 0.18575, + "grad_norm": 2.288804054260254, + "grad_norm_var": 0.09633595496465712, + "learning_rate": 0.0001, + "loss": 1.133, + "loss/crossentropy": 2.5304551124572754, + "loss/hidden": 0.984375, + "loss/logits": 0.14837783575057983, + "loss/reg": 2.5353912860737182e-05, + "step": 1486 + }, + { + "epoch": 0.185875, + "grad_norm": 1.964654564857483, + "grad_norm_var": 0.09548807332710846, + "learning_rate": 0.0001, + "loss": 1.2475, + "loss/crossentropy": 2.4388115406036377, + "loss/hidden": 1.078125, + "loss/logits": 0.1691289246082306, + "loss/reg": 2.5346316760987975e-05, + "step": 1487 + }, + { + "epoch": 0.186, + "grad_norm": 1.86490797996521, + "grad_norm_var": 0.10073926528402405, + "learning_rate": 0.0001, + "loss": 1.11, + "loss/crossentropy": 2.6186814308166504, + "loss/hidden": 0.96484375, + "loss/logits": 0.14489483833312988, + "loss/reg": 2.5335386453662068e-05, + "step": 1488 + }, + { + "epoch": 0.186125, + "grad_norm": 2.4556772708892822, + "grad_norm_var": 0.1060943797869714, + "learning_rate": 0.0001, + "loss": 1.3398, + "loss/crossentropy": 2.2856523990631104, + "loss/hidden": 1.15625, + "loss/logits": 0.18330934643745422, + "loss/reg": 2.5327975890832022e-05, + "step": 1489 + }, + { + "epoch": 0.18625, + "grad_norm": 2.2611231803894043, + "grad_norm_var": 0.10619053903076457, + "learning_rate": 0.0001, + "loss": 1.1984, + "loss/crossentropy": 2.5412044525146484, + "loss/hidden": 1.0234375, + "loss/logits": 0.17472562193870544, + "loss/reg": 2.531990503484849e-05, + "step": 1490 + }, + { + "epoch": 0.186375, + "grad_norm": 2.007587194442749, + "grad_norm_var": 0.10160251528931823, + "learning_rate": 0.0001, + "loss": 1.142, + "loss/crossentropy": 2.49784779548645, + "loss/hidden": 0.9765625, + "loss/logits": 0.1652270257472992, + "loss/reg": 2.5309469492640346e-05, + "step": 1491 + }, + { + "epoch": 0.1865, + "grad_norm": 2.0241851806640625, + "grad_norm_var": 0.10303915212367887, + "learning_rate": 0.0001, + "loss": 1.2785, + "loss/crossentropy": 2.4042551517486572, + "loss/hidden": 1.1171875, + "loss/logits": 0.16109603643417358, + "loss/reg": 2.5299974367953837e-05, + "step": 1492 + }, + { + "epoch": 0.186625, + "grad_norm": 2.033846616744995, + "grad_norm_var": 0.10432114740001779, + "learning_rate": 0.0001, + "loss": 1.101, + "loss/crossentropy": 2.703930377960205, + "loss/hidden": 0.9609375, + "loss/logits": 0.13981907069683075, + "loss/reg": 2.5290129997301847e-05, + "step": 1493 + }, + { + "epoch": 0.18675, + "grad_norm": 1.9111216068267822, + "grad_norm_var": 0.10611021621218075, + "learning_rate": 0.0001, + "loss": 1.2379, + "loss/crossentropy": 2.5457046031951904, + "loss/hidden": 1.0625, + "loss/logits": 0.17511677742004395, + "loss/reg": 2.527740434743464e-05, + "step": 1494 + }, + { + "epoch": 0.186875, + "grad_norm": 1.892629861831665, + "grad_norm_var": 0.04401971595129428, + "learning_rate": 0.0001, + "loss": 1.1905, + "loss/crossentropy": 2.339268684387207, + "loss/hidden": 1.0390625, + "loss/logits": 0.15117594599723816, + "loss/reg": 2.526561002014205e-05, + "step": 1495 + }, + { + "epoch": 0.187, + "grad_norm": 2.199645757675171, + "grad_norm_var": 0.034067171016665475, + "learning_rate": 0.0001, + "loss": 1.3386, + "loss/crossentropy": 2.8161470890045166, + "loss/hidden": 1.140625, + "loss/logits": 0.19775289297103882, + "loss/reg": 2.525371019146405e-05, + "step": 1496 + }, + { + "epoch": 0.187125, + "grad_norm": 2.2716476917266846, + "grad_norm_var": 0.03619857342150486, + "learning_rate": 0.0001, + "loss": 1.2954, + "loss/crossentropy": 2.691171169281006, + "loss/hidden": 1.1171875, + "loss/logits": 0.17799657583236694, + "loss/reg": 2.5244744392693974e-05, + "step": 1497 + }, + { + "epoch": 0.18725, + "grad_norm": 5.6878790855407715, + "grad_norm_var": 0.8480560266631223, + "learning_rate": 0.0001, + "loss": 1.8704, + "loss/crossentropy": 2.3318469524383545, + "loss/hidden": 1.625, + "loss/logits": 0.24512439966201782, + "loss/reg": 2.5230590836144984e-05, + "step": 1498 + }, + { + "epoch": 0.187375, + "grad_norm": 2.5068492889404297, + "grad_norm_var": 0.8448184438941572, + "learning_rate": 0.0001, + "loss": 1.3553, + "loss/crossentropy": 2.5144317150115967, + "loss/hidden": 1.1484375, + "loss/logits": 0.20657923817634583, + "loss/reg": 2.5222305339411832e-05, + "step": 1499 + }, + { + "epoch": 0.1875, + "grad_norm": 1.735006332397461, + "grad_norm_var": 0.8644781135812669, + "learning_rate": 0.0001, + "loss": 1.0371, + "loss/crossentropy": 2.4479753971099854, + "loss/hidden": 0.9140625, + "loss/logits": 0.12281764298677444, + "loss/reg": 2.5209938030457124e-05, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 8000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.6608792346624e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}