| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0625, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.000125, |
| "grad_norm": 2.8797903060913086, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.3346, |
| "loss/crossentropy": 2.6933815479278564, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.16231727600097656, |
| "loss/reg": 3.5815275623463094e-05, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 3.151318073272705, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.334, |
| "loss/crossentropy": 3.0975701808929443, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.18517285585403442, |
| "loss/reg": 3.5815275623463094e-05, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 2.3074228763580322, |
| "learning_rate": 3e-06, |
| "loss": 1.2917, |
| "loss/crossentropy": 2.613313674926758, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.17419689893722534, |
| "loss/reg": 3.581521741580218e-05, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 2.994593381881714, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.3622, |
| "loss/crossentropy": 2.562746047973633, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.150880828499794, |
| "loss/reg": 3.5815086448565125e-05, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 4.555283069610596, |
| "learning_rate": 5e-06, |
| "loss": 1.435, |
| "loss/crossentropy": 2.4253523349761963, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.18461981415748596, |
| "loss/reg": 3.581498458515853e-05, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 2.5826594829559326, |
| "learning_rate": 6e-06, |
| "loss": 1.2796, |
| "loss/crossentropy": 2.666372060775757, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.17770954966545105, |
| "loss/reg": 3.5814849979942665e-05, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 2.9724032878875732, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 1.5032, |
| "loss/crossentropy": 2.488424062728882, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.1981831043958664, |
| "loss/reg": 3.581465352908708e-05, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 4.469974517822266, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.7569, |
| "loss/crossentropy": 2.152468204498291, |
| "loss/hidden": 1.515625, |
| "loss/logits": 0.24093276262283325, |
| "loss/reg": 3.581443888833746e-05, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 2.529066801071167, |
| "learning_rate": 9e-06, |
| "loss": 1.7045, |
| "loss/crossentropy": 2.3210883140563965, |
| "loss/hidden": 1.4453125, |
| "loss/logits": 0.2588244378566742, |
| "loss/reg": 3.581414057407528e-05, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 2.1863760948181152, |
| "learning_rate": 1e-05, |
| "loss": 1.4129, |
| "loss/crossentropy": 2.213552236557007, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.1625480353832245, |
| "loss/reg": 3.5813736758427694e-05, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 2.182722330093384, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 1.3913, |
| "loss/crossentropy": 2.4366066455841064, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.21124377846717834, |
| "loss/reg": 3.5813347494695336e-05, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 2.28460431098938, |
| "learning_rate": 1.2e-05, |
| "loss": 1.6315, |
| "loss/crossentropy": 2.2548444271087646, |
| "loss/hidden": 1.4296875, |
| "loss/logits": 0.2014051228761673, |
| "loss/reg": 3.581297642085701e-05, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 3.58573579788208, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 1.5091, |
| "loss/crossentropy": 2.6865081787109375, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.22751325368881226, |
| "loss/reg": 3.581246710382402e-05, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 3.04477596282959, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 1.7733, |
| "loss/crossentropy": 2.2971208095550537, |
| "loss/hidden": 1.5234375, |
| "loss/logits": 0.2495010793209076, |
| "loss/reg": 3.5811823181575164e-05, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 3.0177462100982666, |
| "learning_rate": 1.5e-05, |
| "loss": 1.5114, |
| "loss/crossentropy": 2.726813554763794, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.2297666072845459, |
| "loss/reg": 3.581113196560182e-05, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 2.147826671600342, |
| "grad_norm_var": 0.5553412532630915, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.2521, |
| "loss/crossentropy": 2.413343667984009, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.17357708513736725, |
| "loss/reg": 3.581081909942441e-05, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 1.957945704460144, |
| "grad_norm_var": 0.6147194825502799, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 1.1395, |
| "loss/crossentropy": 2.327432632446289, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.1391144096851349, |
| "loss/reg": 3.581016790121794e-05, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 2.9085915088653564, |
| "grad_norm_var": 0.6093993504039028, |
| "learning_rate": 1.8e-05, |
| "loss": 1.7224, |
| "loss/crossentropy": 2.5963706970214844, |
| "loss/hidden": 1.4765625, |
| "loss/logits": 0.24546313285827637, |
| "loss/reg": 3.580931297619827e-05, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 1.6794862747192383, |
| "grad_norm_var": 0.6801389543370343, |
| "learning_rate": 1.9e-05, |
| "loss": 1.2792, |
| "loss/crossentropy": 2.542264938354492, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1695137917995453, |
| "loss/reg": 3.580832708394155e-05, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 2.006974935531616, |
| "grad_norm_var": 0.7179436357972528, |
| "learning_rate": 2e-05, |
| "loss": 1.2741, |
| "loss/crossentropy": 2.6418614387512207, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.1721784472465515, |
| "loss/reg": 3.580794873414561e-05, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.002625, |
| "grad_norm": 1.895347237586975, |
| "grad_norm_var": 0.5223754576868543, |
| "learning_rate": 2.1e-05, |
| "loss": 1.1787, |
| "loss/crossentropy": 2.38079571723938, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.14714078605175018, |
| "loss/reg": 3.580757766030729e-05, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 2.195387125015259, |
| "grad_norm_var": 0.5321677299000015, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 1.3881, |
| "loss/crossentropy": 2.613879919052124, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20026516914367676, |
| "loss/reg": 3.58072757080663e-05, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.002875, |
| "grad_norm": 2.943157911300659, |
| "grad_norm_var": 0.530638648177441, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 1.391, |
| "loss/crossentropy": 2.653855562210083, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20310327410697937, |
| "loss/reg": 3.580666452762671e-05, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 2.057532787322998, |
| "grad_norm_var": 0.2815427832165767, |
| "learning_rate": 2.4e-05, |
| "loss": 1.2261, |
| "loss/crossentropy": 2.5107123851776123, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.13977402448654175, |
| "loss/reg": 3.580632255761884e-05, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.003125, |
| "grad_norm": 2.4862008094787598, |
| "grad_norm_var": 0.28099970817646425, |
| "learning_rate": 2.5e-05, |
| "loss": 1.2713, |
| "loss/crossentropy": 2.234706163406372, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1537853181362152, |
| "loss/reg": 3.580575867090374e-05, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 1.8715572357177734, |
| "grad_norm_var": 0.29663449315952994, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 1.2613, |
| "loss/crossentropy": 2.693939447402954, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.1593395173549652, |
| "loss/reg": 3.58048637281172e-05, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.003375, |
| "grad_norm": 1.8992433547973633, |
| "grad_norm_var": 0.3095519871493233, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 1.3686, |
| "loss/crossentropy": 2.5521254539489746, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.18858906626701355, |
| "loss/reg": 3.580458724172786e-05, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 2.2363150119781494, |
| "grad_norm_var": 0.31027254984992586, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.4412, |
| "loss/crossentropy": 2.5556640625, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.19085438549518585, |
| "loss/reg": 3.580446355044842e-05, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.003625, |
| "grad_norm": 1.916678786277771, |
| "grad_norm_var": 0.21402330843771242, |
| "learning_rate": 2.9e-05, |
| "loss": 1.4761, |
| "loss/crossentropy": 2.3765358924865723, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.19447964429855347, |
| "loss/reg": 3.5804154322249815e-05, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 1.638509750366211, |
| "grad_norm_var": 0.19170291887504137, |
| "learning_rate": 3e-05, |
| "loss": 1.0217, |
| "loss/crossentropy": 2.39214825630188, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.11122289299964905, |
| "loss/reg": 3.580400880309753e-05, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.003875, |
| "grad_norm": 1.9309443235397339, |
| "grad_norm_var": 0.14393413685788706, |
| "learning_rate": 3.1e-05, |
| "loss": 1.2692, |
| "loss/crossentropy": 2.5890893936157227, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.15942150354385376, |
| "loss/reg": 3.580367410904728e-05, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 1.8290704488754272, |
| "grad_norm_var": 0.14870789473936974, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.2562, |
| "loss/crossentropy": 2.614006757736206, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1620863825082779, |
| "loss/reg": 3.580304473871365e-05, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.004125, |
| "grad_norm": 2.8197247982025146, |
| "grad_norm_var": 0.17985784278712322, |
| "learning_rate": 3.3e-05, |
| "loss": 1.6221, |
| "loss/crossentropy": 2.458843231201172, |
| "loss/hidden": 1.390625, |
| "loss/logits": 0.23112158477306366, |
| "loss/reg": 3.5803102946374565e-05, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 2.04148530960083, |
| "grad_norm_var": 0.1385297884752911, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.2995, |
| "loss/crossentropy": 2.6012392044067383, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.15854208171367645, |
| "loss/reg": 3.580349584808573e-05, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.004375, |
| "grad_norm": 2.370253086090088, |
| "grad_norm_var": 0.13049913719012618, |
| "learning_rate": 3.5e-05, |
| "loss": 1.2372, |
| "loss/crossentropy": 2.554248571395874, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1508997678756714, |
| "loss/reg": 3.580387055990286e-05, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 2.1753602027893066, |
| "grad_norm_var": 0.12942723244658866, |
| "learning_rate": 3.6e-05, |
| "loss": 1.2043, |
| "loss/crossentropy": 2.950917959213257, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.14146575331687927, |
| "loss/reg": 3.5805252991849557e-05, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.004625, |
| "grad_norm": 1.8446277379989624, |
| "grad_norm_var": 0.13127072083684937, |
| "learning_rate": 3.7e-05, |
| "loss": 1.0664, |
| "loss/crossentropy": 2.6589579582214355, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.1324453502893448, |
| "loss/reg": 3.58061988663394e-05, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 3.153823137283325, |
| "grad_norm_var": 0.1956330169497003, |
| "learning_rate": 3.8e-05, |
| "loss": 1.3224, |
| "loss/crossentropy": 2.4948697090148926, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.1736428141593933, |
| "loss/reg": 3.5806617233902216e-05, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.004875, |
| "grad_norm": 2.105498790740967, |
| "grad_norm_var": 0.1565869437188399, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 1.403, |
| "loss/crossentropy": 2.2583742141723633, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.15260592103004456, |
| "loss/reg": 3.580802876967937e-05, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 1.635926365852356, |
| "grad_norm_var": 0.17281299081781085, |
| "learning_rate": 4e-05, |
| "loss": 1.0375, |
| "loss/crossentropy": 2.6808717250823975, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.11526834964752197, |
| "loss/reg": 3.580863995011896e-05, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005125, |
| "grad_norm": 1.715374231338501, |
| "grad_norm_var": 0.17253809821943988, |
| "learning_rate": 4.1e-05, |
| "loss": 1.13, |
| "loss/crossentropy": 2.643165349960327, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1453102082014084, |
| "loss/reg": 3.580814882298e-05, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 2.1999247074127197, |
| "grad_norm_var": 0.17041268294515716, |
| "learning_rate": 4.2e-05, |
| "loss": 1.291, |
| "loss/crossentropy": 2.4450502395629883, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.165659099817276, |
| "loss/reg": 3.5807905078399926e-05, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.005375, |
| "grad_norm": 6.767260551452637, |
| "grad_norm_var": 1.5247462870548845, |
| "learning_rate": 4.3e-05, |
| "loss": 1.3462, |
| "loss/crossentropy": 2.6365652084350586, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.15048328042030334, |
| "loss/reg": 3.580807242542505e-05, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 2.4290215969085693, |
| "grad_norm_var": 1.5228923892282233, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 1.444, |
| "loss/crossentropy": 2.445629596710205, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.1858382523059845, |
| "loss/reg": 3.5807508538709953e-05, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.005625, |
| "grad_norm": 1.8787518739700317, |
| "grad_norm_var": 1.525481240782518, |
| "learning_rate": 4.5e-05, |
| "loss": 1.1501, |
| "loss/crossentropy": 2.932614326477051, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.13410566747188568, |
| "loss/reg": 3.5807570384349674e-05, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 2.0517191886901855, |
| "grad_norm_var": 1.4937318455351132, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 1.1484, |
| "loss/crossentropy": 2.8540468215942383, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.14026299118995667, |
| "loss/reg": 3.5806355299428105e-05, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.005875, |
| "grad_norm": 2.3056695461273193, |
| "grad_norm_var": 1.4773587952527152, |
| "learning_rate": 4.7e-05, |
| "loss": 1.1201, |
| "loss/crossentropy": 2.3501625061035156, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.127536341547966, |
| "loss/reg": 3.580517659429461e-05, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1.8737199306488037, |
| "grad_norm_var": 1.473740887453625, |
| "learning_rate": 4.8e-05, |
| "loss": 1.1727, |
| "loss/crossentropy": 2.56876540184021, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.14892947673797607, |
| "loss/reg": 3.5804278013529256e-05, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.006125, |
| "grad_norm": 1.6498337984085083, |
| "grad_norm_var": 1.503248724299241, |
| "learning_rate": 4.9e-05, |
| "loss": 1.0887, |
| "loss/crossentropy": 2.5359740257263184, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.11568085849285126, |
| "loss/reg": 3.580292104743421e-05, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 1.8893804550170898, |
| "grad_norm_var": 1.5117099009867367, |
| "learning_rate": 5e-05, |
| "loss": 1.197, |
| "loss/crossentropy": 2.4427387714385986, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.14971715211868286, |
| "loss/reg": 3.58012730430346e-05, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006375, |
| "grad_norm": 1.6203227043151855, |
| "grad_norm_var": 1.5476226526424812, |
| "learning_rate": 5.1000000000000006e-05, |
| "loss": 1.1026, |
| "loss/crossentropy": 2.4663264751434326, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13351351022720337, |
| "loss/reg": 3.579998519853689e-05, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 1.86211097240448, |
| "grad_norm_var": 1.5602565704882587, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 1.3754, |
| "loss/crossentropy": 2.4876317977905273, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.1875210702419281, |
| "loss/reg": 3.5798137105302885e-05, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.006625, |
| "grad_norm": 1.8947949409484863, |
| "grad_norm_var": 1.5572914096308217, |
| "learning_rate": 5.300000000000001e-05, |
| "loss": 1.2427, |
| "loss/crossentropy": 2.5425305366516113, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.14857983589172363, |
| "loss/reg": 3.5795987059827894e-05, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 2.155927896499634, |
| "grad_norm_var": 1.5078638031084188, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 1.1465, |
| "loss/crossentropy": 2.525212287902832, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.12268626689910889, |
| "loss/reg": 3.579404437914491e-05, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.006875, |
| "grad_norm": 2.088404893875122, |
| "grad_norm_var": 1.5082164304181902, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 1.2035, |
| "loss/crossentropy": 2.0337164402008057, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.12503597140312195, |
| "loss/reg": 3.579181066015735e-05, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 1.5657821893692017, |
| "grad_norm_var": 1.5142777074410627, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 1.0258, |
| "loss/crossentropy": 2.5575084686279297, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.11920958012342453, |
| "loss/reg": 3.5789642424788326e-05, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.007125, |
| "grad_norm": 1.8427784442901611, |
| "grad_norm_var": 1.5062655960432332, |
| "learning_rate": 5.6999999999999996e-05, |
| "loss": 1.1217, |
| "loss/crossentropy": 2.771742820739746, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.1291133463382721, |
| "loss/reg": 3.5787568776868284e-05, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 2.1850311756134033, |
| "grad_norm_var": 1.5063882579126575, |
| "learning_rate": 5.8e-05, |
| "loss": 1.1639, |
| "loss/crossentropy": 2.5320937633514404, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.13231654465198517, |
| "loss/reg": 3.5786692023975775e-05, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.007375, |
| "grad_norm": 1.9679864645004272, |
| "grad_norm_var": 0.057763248837537105, |
| "learning_rate": 5.9e-05, |
| "loss": 1.2283, |
| "loss/crossentropy": 2.2511909008026123, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.15765681862831116, |
| "loss/reg": 3.578452378860675e-05, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 1.5962857007980347, |
| "grad_norm_var": 0.04834229766963934, |
| "learning_rate": 6e-05, |
| "loss": 1.1863, |
| "loss/crossentropy": 2.2581472396850586, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.13910087943077087, |
| "loss/reg": 3.578297037165612e-05, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007625, |
| "grad_norm": 1.9367214441299438, |
| "grad_norm_var": 0.04837432662246878, |
| "learning_rate": 6.1e-05, |
| "loss": 1.1308, |
| "loss/crossentropy": 2.375943660736084, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.12259182333946228, |
| "loss/reg": 3.578166069928557e-05, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 1.74626624584198, |
| "grad_norm_var": 0.048246697686887254, |
| "learning_rate": 6.2e-05, |
| "loss": 1.0925, |
| "loss/crossentropy": 2.33935284614563, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.11948312819004059, |
| "loss/reg": 3.577923052944243e-05, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.007875, |
| "grad_norm": 1.7242004871368408, |
| "grad_norm_var": 0.036866001167244575, |
| "learning_rate": 6.3e-05, |
| "loss": 1.0529, |
| "loss/crossentropy": 2.56264066696167, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.11508607119321823, |
| "loss/reg": 3.5776785807684064e-05, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 1.759158730506897, |
| "grad_norm_var": 0.03732351836526746, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 1.0359, |
| "loss/crossentropy": 2.5598065853118896, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.11371441185474396, |
| "loss/reg": 3.577530151233077e-05, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.008125, |
| "grad_norm": 1.8450080156326294, |
| "grad_norm_var": 0.03468242225663947, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 1.1044, |
| "loss/crossentropy": 2.358488082885742, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1274409294128418, |
| "loss/reg": 3.577340248739347e-05, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 1.8574451208114624, |
| "grad_norm_var": 0.03459981312827119, |
| "learning_rate": 6.6e-05, |
| "loss": 1.3393, |
| "loss/crossentropy": 2.2067902088165283, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.15927882492542267, |
| "loss/reg": 3.5770081012742594e-05, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.008375, |
| "grad_norm": 2.0372934341430664, |
| "grad_norm_var": 0.032529617098579836, |
| "learning_rate": 6.7e-05, |
| "loss": 1.0537, |
| "loss/crossentropy": 2.948381185531616, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.1275252103805542, |
| "loss/reg": 3.576773087843321e-05, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 1.6391419172286987, |
| "grad_norm_var": 0.03614113702393708, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 1.1128, |
| "loss/crossentropy": 2.691239833831787, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.13195790350437164, |
| "loss/reg": 3.576446033548564e-05, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.008625, |
| "grad_norm": 1.6962119340896606, |
| "grad_norm_var": 0.03782062069620693, |
| "learning_rate": 6.9e-05, |
| "loss": 1.0633, |
| "loss/crossentropy": 2.6416876316070557, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.11374930292367935, |
| "loss/reg": 3.576194285415113e-05, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 2.015970468521118, |
| "grad_norm_var": 0.03338686088704298, |
| "learning_rate": 7e-05, |
| "loss": 1.2654, |
| "loss/crossentropy": 2.5686511993408203, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1556204855442047, |
| "loss/reg": 3.576026938389987e-05, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008875, |
| "grad_norm": 3.6860287189483643, |
| "grad_norm_var": 0.24497842788792332, |
| "learning_rate": 7.1e-05, |
| "loss": 1.4119, |
| "loss/crossentropy": 2.0071253776550293, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.15377236902713776, |
| "loss/reg": 3.575763912522234e-05, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 2.0074028968811035, |
| "grad_norm_var": 0.2349071198746244, |
| "learning_rate": 7.2e-05, |
| "loss": 1.0776, |
| "loss/crossentropy": 2.3829903602600098, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.12408198416233063, |
| "loss/reg": 3.575549635570496e-05, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.009125, |
| "grad_norm": 1.9751619100570679, |
| "grad_norm_var": 0.23373155459140638, |
| "learning_rate": 7.3e-05, |
| "loss": 1.3348, |
| "loss/crossentropy": 2.3497986793518066, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.16253460943698883, |
| "loss/reg": 3.575363371055573e-05, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 1.5991307497024536, |
| "grad_norm_var": 0.2391465881612707, |
| "learning_rate": 7.4e-05, |
| "loss": 1.1257, |
| "loss/crossentropy": 2.5422017574310303, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.13318461179733276, |
| "loss/reg": 3.575047594495118e-05, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.009375, |
| "grad_norm": 6.278711318969727, |
| "grad_norm_var": 1.4148538861937499, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 1.2142, |
| "loss/crossentropy": 2.6029913425445557, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1279023289680481, |
| "loss/reg": 3.5748576920013875e-05, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 2.744645833969116, |
| "grad_norm_var": 1.4029217843730426, |
| "learning_rate": 7.6e-05, |
| "loss": 1.1281, |
| "loss/crossentropy": 2.5766711235046387, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.13161128759384155, |
| "loss/reg": 3.574538277462125e-05, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.009625, |
| "grad_norm": 1.6794898509979248, |
| "grad_norm_var": 1.418977736839713, |
| "learning_rate": 7.7e-05, |
| "loss": 0.9818, |
| "loss/crossentropy": 2.4829020500183105, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.11033609509468079, |
| "loss/reg": 3.574356742319651e-05, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 1.4505054950714111, |
| "grad_norm_var": 1.4450273907543434, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 1.0562, |
| "loss/crossentropy": 2.2392332553863525, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.11831367015838623, |
| "loss/reg": 3.5740758903557435e-05, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.009875, |
| "grad_norm": 2.267005205154419, |
| "grad_norm_var": 1.425408330742154, |
| "learning_rate": 7.900000000000001e-05, |
| "loss": 1.2224, |
| "loss/crossentropy": 2.524975299835205, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.12832751870155334, |
| "loss/reg": 3.5736080462811515e-05, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 1.9941935539245605, |
| "grad_norm_var": 1.4124245943422287, |
| "learning_rate": 8e-05, |
| "loss": 1.3792, |
| "loss/crossentropy": 2.3303298950195312, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.16792933642864227, |
| "loss/reg": 3.573206049622968e-05, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.010125, |
| "grad_norm": 1.9384936094284058, |
| "grad_norm_var": 1.407320221541647, |
| "learning_rate": 8.1e-05, |
| "loss": 1.2054, |
| "loss/crossentropy": 2.339118003845215, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.13472682237625122, |
| "loss/reg": 3.5727785871131346e-05, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 2.2007267475128174, |
| "grad_norm_var": 1.3942380508674062, |
| "learning_rate": 8.2e-05, |
| "loss": 1.2089, |
| "loss/crossentropy": 2.3879401683807373, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.130388081073761, |
| "loss/reg": 3.5725021007237956e-05, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.010375, |
| "grad_norm": 1.5669373273849487, |
| "grad_norm_var": 1.426148143880052, |
| "learning_rate": 8.3e-05, |
| "loss": 1.0953, |
| "loss/crossentropy": 2.386194944381714, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.12224260717630386, |
| "loss/reg": 3.572153946151957e-05, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 3.444465160369873, |
| "grad_norm_var": 1.4716789596545155, |
| "learning_rate": 8.4e-05, |
| "loss": 1.4601, |
| "loss/crossentropy": 2.160597801208496, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.20196697115898132, |
| "loss/reg": 3.571900742826983e-05, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.010625, |
| "grad_norm": 2.17124080657959, |
| "grad_norm_var": 1.440631969989453, |
| "learning_rate": 8.5e-05, |
| "loss": 1.1224, |
| "loss/crossentropy": 2.1496167182922363, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.12202942371368408, |
| "loss/reg": 3.571617344277911e-05, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 1.842886209487915, |
| "grad_norm_var": 1.4522613774542446, |
| "learning_rate": 8.6e-05, |
| "loss": 1.0156, |
| "loss/crossentropy": 2.226436138153076, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.10898593068122864, |
| "loss/reg": 3.57139615516644e-05, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.010875, |
| "grad_norm": 2.05486798286438, |
| "grad_norm_var": 1.344934690323482, |
| "learning_rate": 8.7e-05, |
| "loss": 1.0828, |
| "loss/crossentropy": 2.322002410888672, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.12146371603012085, |
| "loss/reg": 3.5710025258595124e-05, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 3.25955867767334, |
| "grad_norm_var": 1.3897383898525164, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 1.5155, |
| "loss/crossentropy": 1.937675952911377, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.2026323676109314, |
| "loss/reg": 3.570731496438384e-05, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.011125, |
| "grad_norm": 1.7932565212249756, |
| "grad_norm_var": 1.4022136437703991, |
| "learning_rate": 8.900000000000001e-05, |
| "loss": 1.0228, |
| "loss/crossentropy": 2.649721145629883, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.11223678290843964, |
| "loss/reg": 3.570578701328486e-05, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 1.6545979976654053, |
| "grad_norm_var": 1.3965356378457594, |
| "learning_rate": 9e-05, |
| "loss": 1.0415, |
| "loss/crossentropy": 2.546100616455078, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.12706589698791504, |
| "loss/reg": 3.570249828044325e-05, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.011375, |
| "grad_norm": 2.2377841472625732, |
| "grad_norm_var": 0.3253247379631695, |
| "learning_rate": 9.1e-05, |
| "loss": 1.2172, |
| "loss/crossentropy": 2.677785873413086, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.13871382176876068, |
| "loss/reg": 3.5701228625839576e-05, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 2.016057014465332, |
| "grad_norm_var": 0.3001321883475782, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 1.2553, |
| "loss/crossentropy": 2.216202974319458, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.16114352643489838, |
| "loss/reg": 3.569832188077271e-05, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.011625, |
| "grad_norm": 1.9337667226791382, |
| "grad_norm_var": 0.28997562388856146, |
| "learning_rate": 9.300000000000001e-05, |
| "loss": 1.2073, |
| "loss/crossentropy": 2.7274839878082275, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1522083729505539, |
| "loss/reg": 3.5696477425517514e-05, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 2.1966376304626465, |
| "grad_norm_var": 0.25874835102596966, |
| "learning_rate": 9.4e-05, |
| "loss": 1.2497, |
| "loss/crossentropy": 2.0065174102783203, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.13999012112617493, |
| "loss/reg": 3.56943673978094e-05, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.011875, |
| "grad_norm": 1.7162114381790161, |
| "grad_norm_var": 0.2699080995908368, |
| "learning_rate": 9.5e-05, |
| "loss": 1.0104, |
| "loss/crossentropy": 2.4001636505126953, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.10766053199768066, |
| "loss/reg": 3.5692111850949004e-05, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 1.9596549272537231, |
| "grad_norm_var": 0.2705912806447509, |
| "learning_rate": 9.6e-05, |
| "loss": 1.1288, |
| "loss/crossentropy": 2.4599721431732178, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.14019131660461426, |
| "loss/reg": 3.5689983633346856e-05, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.012125, |
| "grad_norm": 1.9994059801101685, |
| "grad_norm_var": 0.26931496222480145, |
| "learning_rate": 9.7e-05, |
| "loss": 1.1051, |
| "loss/crossentropy": 2.8105618953704834, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.12819884717464447, |
| "loss/reg": 3.568677857401781e-05, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 2.0389626026153564, |
| "grad_norm_var": 0.26938190348710545, |
| "learning_rate": 9.8e-05, |
| "loss": 1.4044, |
| "loss/crossentropy": 2.10030198097229, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.20087596774101257, |
| "loss/reg": 3.5685956390807405e-05, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.012375, |
| "grad_norm": 3.1797940731048584, |
| "grad_norm_var": 0.31348186491538, |
| "learning_rate": 9.900000000000001e-05, |
| "loss": 1.5773, |
| "loss/crossentropy": 2.7054786682128906, |
| "loss/hidden": 1.359375, |
| "loss/logits": 0.2175736129283905, |
| "loss/reg": 3.5683315218193457e-05, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 2.2074787616729736, |
| "grad_norm_var": 0.20694747633483127, |
| "learning_rate": 0.0001, |
| "loss": 1.2851, |
| "loss/crossentropy": 2.2195346355438232, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.15975427627563477, |
| "loss/reg": 3.568131796782836e-05, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012625, |
| "grad_norm": 2.0853495597839355, |
| "grad_norm_var": 0.20706664538577282, |
| "learning_rate": 0.0001, |
| "loss": 1.1671, |
| "loss/crossentropy": 2.8007214069366455, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15108169615268707, |
| "loss/reg": 3.5677723644766957e-05, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 2.261103630065918, |
| "grad_norm_var": 0.20165261092997658, |
| "learning_rate": 0.0001, |
| "loss": 1.2524, |
| "loss/crossentropy": 2.4267494678497314, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1426696479320526, |
| "loss/reg": 3.567594103515148e-05, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.012875, |
| "grad_norm": 1.7995065450668335, |
| "grad_norm_var": 0.20938114766728447, |
| "learning_rate": 0.0001, |
| "loss": 1.0294, |
| "loss/crossentropy": 2.477445602416992, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.11502823233604431, |
| "loss/reg": 3.5673674574354663e-05, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 1.9462140798568726, |
| "grad_norm_var": 0.12222182001865463, |
| "learning_rate": 0.0001, |
| "loss": 1.175, |
| "loss/crossentropy": 2.4584763050079346, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15122900903224945, |
| "loss/reg": 3.567052772268653e-05, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.013125, |
| "grad_norm": 5.390810489654541, |
| "grad_norm_var": 0.8011994969266916, |
| "learning_rate": 0.0001, |
| "loss": 1.3528, |
| "loss/crossentropy": 2.61253023147583, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.15716172754764557, |
| "loss/reg": 3.566941450117156e-05, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 2.104395866394043, |
| "grad_norm_var": 0.7757998475018496, |
| "learning_rate": 0.0001, |
| "loss": 1.4737, |
| "loss/crossentropy": 2.1339404582977295, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.17651526629924774, |
| "loss/reg": 3.566693339962512e-05, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.013375, |
| "grad_norm": 1.8610461950302124, |
| "grad_norm_var": 0.788653272883981, |
| "learning_rate": 0.0001, |
| "loss": 0.9407, |
| "loss/crossentropy": 2.4770216941833496, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.10441947728395462, |
| "loss/reg": 3.566368104657158e-05, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 2.2065460681915283, |
| "grad_norm_var": 0.7838738781084629, |
| "learning_rate": 0.0001, |
| "loss": 1.2355, |
| "loss/crossentropy": 2.461944818496704, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.16484174132347107, |
| "loss/reg": 3.566055602277629e-05, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.013625, |
| "grad_norm": 2.1468284130096436, |
| "grad_norm_var": 0.7761527810904186, |
| "learning_rate": 0.0001, |
| "loss": 1.1623, |
| "loss/crossentropy": 2.2069454193115234, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.13850779831409454, |
| "loss/reg": 3.565785300452262e-05, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 2.383087635040283, |
| "grad_norm_var": 0.7752898762699504, |
| "learning_rate": 0.0001, |
| "loss": 1.1793, |
| "loss/crossentropy": 2.4479548931121826, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.139842689037323, |
| "loss/reg": 3.565509177860804e-05, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.013875, |
| "grad_norm": 2.9489004611968994, |
| "grad_norm_var": 0.7693129207579057, |
| "learning_rate": 0.0001, |
| "loss": 1.2705, |
| "loss/crossentropy": 2.3081014156341553, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.15295040607452393, |
| "loss/reg": 3.5651082725962624e-05, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 3.968780755996704, |
| "grad_norm_var": 0.9016446173616401, |
| "learning_rate": 0.0001, |
| "loss": 1.4396, |
| "loss/crossentropy": 2.420243740081787, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.23613759875297546, |
| "loss/reg": 3.56451710104011e-05, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.014125, |
| "grad_norm": 1.9860399961471558, |
| "grad_norm_var": 0.9026067410203076, |
| "learning_rate": 0.0001, |
| "loss": 1.1773, |
| "loss/crossentropy": 2.477583169937134, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.14566099643707275, |
| "loss/reg": 3.56405544152949e-05, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 2.120425224304199, |
| "grad_norm_var": 0.8976643536437109, |
| "learning_rate": 0.0001, |
| "loss": 1.2152, |
| "loss/crossentropy": 3.030984878540039, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.15236616134643555, |
| "loss/reg": 3.563678910722956e-05, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.014375, |
| "grad_norm": 2.0870068073272705, |
| "grad_norm_var": 0.8786817926388901, |
| "learning_rate": 0.0001, |
| "loss": 1.2595, |
| "loss/crossentropy": 2.4538302421569824, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1497730016708374, |
| "loss/reg": 3.5631266655400395e-05, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 3.1647448539733887, |
| "grad_norm_var": 0.9025786275056549, |
| "learning_rate": 0.0001, |
| "loss": 1.6928, |
| "loss/crossentropy": 2.205573320388794, |
| "loss/hidden": 1.3984375, |
| "loss/logits": 0.29398053884506226, |
| "loss/reg": 3.562564597814344e-05, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.014625, |
| "grad_norm": 1.8475593328475952, |
| "grad_norm_var": 0.9201723703583595, |
| "learning_rate": 0.0001, |
| "loss": 1.1518, |
| "loss/crossentropy": 2.5012142658233643, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.14361616969108582, |
| "loss/reg": 3.562155688996427e-05, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 1.858892798423767, |
| "grad_norm_var": 0.9438422080188066, |
| "learning_rate": 0.0001, |
| "loss": 1.1388, |
| "loss/crossentropy": 2.5725672245025635, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.1306220442056656, |
| "loss/reg": 3.561788616934791e-05, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.014875, |
| "grad_norm": 2.2440059185028076, |
| "grad_norm_var": 0.9153389246133348, |
| "learning_rate": 0.0001, |
| "loss": 1.4255, |
| "loss/crossentropy": 2.341083288192749, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.1829112321138382, |
| "loss/reg": 3.561256380635314e-05, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 2.1901657581329346, |
| "grad_norm_var": 0.9005062112002877, |
| "learning_rate": 0.0001, |
| "loss": 1.2391, |
| "loss/crossentropy": 2.362614631652832, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.16847620904445648, |
| "loss/reg": 3.56065938831307e-05, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.015125, |
| "grad_norm": 5.200242519378662, |
| "grad_norm_var": 0.830131887163558, |
| "learning_rate": 0.0001, |
| "loss": 1.2103, |
| "loss/crossentropy": 2.6675992012023926, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.15525725483894348, |
| "loss/reg": 3.5602170100901276e-05, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 2.407500982284546, |
| "grad_norm_var": 0.8190810626824183, |
| "learning_rate": 0.0001, |
| "loss": 1.2234, |
| "loss/crossentropy": 2.56121563911438, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.15273353457450867, |
| "loss/reg": 3.559728429536335e-05, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.015375, |
| "grad_norm": 1.8797663450241089, |
| "grad_norm_var": 0.8174111264801723, |
| "learning_rate": 0.0001, |
| "loss": 1.1099, |
| "loss/crossentropy": 2.4745869636535645, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.12910515069961548, |
| "loss/reg": 3.559128526831046e-05, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 2.1494781970977783, |
| "grad_norm_var": 0.8201521751832492, |
| "learning_rate": 0.0001, |
| "loss": 1.3179, |
| "loss/crossentropy": 2.279508590698242, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.1690721958875656, |
| "loss/reg": 3.558437674655579e-05, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.015625, |
| "grad_norm": 9.331904411315918, |
| "grad_norm_var": 3.67345953379431, |
| "learning_rate": 0.0001, |
| "loss": 1.3993, |
| "loss/crossentropy": 2.252732038497925, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.14117392897605896, |
| "loss/reg": 3.5579581890488043e-05, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 4.699957847595215, |
| "grad_norm_var": 3.8228479802693203, |
| "learning_rate": 0.0001, |
| "loss": 1.2831, |
| "loss/crossentropy": 2.5877878665924072, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.12651070952415466, |
| "loss/reg": 3.557529635145329e-05, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.015875, |
| "grad_norm": 1.8446160554885864, |
| "grad_norm_var": 3.9257773899168873, |
| "learning_rate": 0.0001, |
| "loss": 1.2999, |
| "loss/crossentropy": 2.2898948192596436, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.15888020396232605, |
| "loss/reg": 3.557029049261473e-05, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 1.873570203781128, |
| "grad_norm_var": 3.9466365178432232, |
| "learning_rate": 0.0001, |
| "loss": 1.1097, |
| "loss/crossentropy": 2.392472743988037, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.13277310132980347, |
| "loss/reg": 3.556452429620549e-05, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.016125, |
| "grad_norm": 2.215426445007324, |
| "grad_norm_var": 3.92104303267346, |
| "learning_rate": 0.0001, |
| "loss": 1.2203, |
| "loss/crossentropy": 2.457443952560425, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16524553298950195, |
| "loss/reg": 3.555676812538877e-05, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 1.7194857597351074, |
| "grad_norm_var": 3.9751548455277104, |
| "learning_rate": 0.0001, |
| "loss": 1.1788, |
| "loss/crossentropy": 2.3001158237457275, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1472093164920807, |
| "loss/reg": 3.5550358006730676e-05, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.016375, |
| "grad_norm": 1.6397395133972168, |
| "grad_norm_var": 4.037312774164259, |
| "learning_rate": 0.0001, |
| "loss": 1.0336, |
| "loss/crossentropy": 2.54146146774292, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.12307839095592499, |
| "loss/reg": 3.5546618164516985e-05, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 1.9748913049697876, |
| "grad_norm_var": 4.082478037296671, |
| "learning_rate": 0.0001, |
| "loss": 1.104, |
| "loss/crossentropy": 2.562748670578003, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.130945086479187, |
| "loss/reg": 3.5543002013582736e-05, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.016625, |
| "grad_norm": 1.8674203157424927, |
| "grad_norm_var": 4.079934623823218, |
| "learning_rate": 0.0001, |
| "loss": 1.2082, |
| "loss/crossentropy": 2.6365339756011963, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.14538231492042542, |
| "loss/reg": 3.5538523661671206e-05, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 2.3201568126678467, |
| "grad_norm_var": 4.03421067719521, |
| "learning_rate": 0.0001, |
| "loss": 1.2356, |
| "loss/crossentropy": 2.435370445251465, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.14145305752754211, |
| "loss/reg": 3.5533634218154475e-05, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.016875, |
| "grad_norm": 2.4132328033447266, |
| "grad_norm_var": 4.022385903408254, |
| "learning_rate": 0.0001, |
| "loss": 1.3273, |
| "loss/crossentropy": 2.206634998321533, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.14724516868591309, |
| "loss/reg": 3.552551061147824e-05, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 2.419842481613159, |
| "grad_norm_var": 4.005232252864962, |
| "learning_rate": 0.0001, |
| "loss": 1.1667, |
| "loss/crossentropy": 2.517561912536621, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15071895718574524, |
| "loss/reg": 3.5521599784260616e-05, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.017125, |
| "grad_norm": 2.716203212738037, |
| "grad_norm_var": 3.6198676372845124, |
| "learning_rate": 0.0001, |
| "loss": 1.1423, |
| "loss/crossentropy": 2.2819504737854004, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.12632890045642853, |
| "loss/reg": 3.551522604539059e-05, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 2.166456460952759, |
| "grad_norm_var": 3.6334485092224402, |
| "learning_rate": 0.0001, |
| "loss": 1.0296, |
| "loss/crossentropy": 2.4271674156188965, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.11908704042434692, |
| "loss/reg": 3.550978362909518e-05, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.017375, |
| "grad_norm": 2.0737109184265137, |
| "grad_norm_var": 3.6145368084521117, |
| "learning_rate": 0.0001, |
| "loss": 1.0938, |
| "loss/crossentropy": 2.595165967941284, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.13645675778388977, |
| "loss/reg": 3.550490873749368e-05, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 2.0036461353302, |
| "grad_norm_var": 3.6268452557090196, |
| "learning_rate": 0.0001, |
| "loss": 1.0352, |
| "loss/crossentropy": 2.6407668590545654, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.12472639232873917, |
| "loss/reg": 3.5499935620464385e-05, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.017625, |
| "grad_norm": 3.4189512729644775, |
| "grad_norm_var": 0.5874364952131912, |
| "learning_rate": 0.0001, |
| "loss": 1.3364, |
| "loss/crossentropy": 2.8673250675201416, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19542476534843445, |
| "loss/reg": 3.549545363057405e-05, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 2.0884158611297607, |
| "grad_norm_var": 0.19036343785417903, |
| "learning_rate": 0.0001, |
| "loss": 1.1356, |
| "loss/crossentropy": 2.2495715618133545, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.12745517492294312, |
| "loss/reg": 3.5490164009388536e-05, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.017875, |
| "grad_norm": 2.4939138889312744, |
| "grad_norm_var": 0.1883496681179942, |
| "learning_rate": 0.0001, |
| "loss": 1.2224, |
| "loss/crossentropy": 2.3506898880004883, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.14388948678970337, |
| "loss/reg": 3.548476524883881e-05, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 2.634059190750122, |
| "grad_norm_var": 0.19009706439956606, |
| "learning_rate": 0.0001, |
| "loss": 1.308, |
| "loss/crossentropy": 2.422675371170044, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.15922774374485016, |
| "loss/reg": 3.5479293728712946e-05, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.018125, |
| "grad_norm": 2.9936301708221436, |
| "grad_norm_var": 0.22328614777820613, |
| "learning_rate": 0.0001, |
| "loss": 1.4104, |
| "loss/crossentropy": 2.5935611724853516, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.19912970066070557, |
| "loss/reg": 3.547423330019228e-05, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 3.1390833854675293, |
| "grad_norm_var": 0.23765955297996205, |
| "learning_rate": 0.0001, |
| "loss": 1.2747, |
| "loss/crossentropy": 2.4289345741271973, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.14936049282550812, |
| "loss/reg": 3.54700350726489e-05, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.018375, |
| "grad_norm": 2.4484870433807373, |
| "grad_norm_var": 0.19680489618343674, |
| "learning_rate": 0.0001, |
| "loss": 1.1176, |
| "loss/crossentropy": 2.519469976425171, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.13677741587162018, |
| "loss/reg": 3.546685184119269e-05, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 1.876994252204895, |
| "grad_norm_var": 0.20358269116957192, |
| "learning_rate": 0.0001, |
| "loss": 1.0701, |
| "loss/crossentropy": 2.669678211212158, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.12444234639406204, |
| "loss/reg": 3.5462882806314155e-05, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.018625, |
| "grad_norm": 2.704676628112793, |
| "grad_norm_var": 0.1832369663952557, |
| "learning_rate": 0.0001, |
| "loss": 1.2437, |
| "loss/crossentropy": 2.595327854156494, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18086357414722443, |
| "loss/reg": 3.5459666833048686e-05, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 1.9055484533309937, |
| "grad_norm_var": 0.20361674389210194, |
| "learning_rate": 0.0001, |
| "loss": 1.161, |
| "loss/crossentropy": 2.5460128784179688, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1449938714504242, |
| "loss/reg": 3.545805884641595e-05, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.018875, |
| "grad_norm": 2.7920961380004883, |
| "grad_norm_var": 0.20979331401592252, |
| "learning_rate": 0.0001, |
| "loss": 1.1624, |
| "loss/crossentropy": 2.2290139198303223, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.15427884459495544, |
| "loss/reg": 3.545627259882167e-05, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 2.299669027328491, |
| "grad_norm_var": 0.21185582767360506, |
| "learning_rate": 0.0001, |
| "loss": 1.1007, |
| "loss/crossentropy": 2.5660064220428467, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.13937082886695862, |
| "loss/reg": 3.5451499570626765e-05, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.019125, |
| "grad_norm": 1.9452663660049438, |
| "grad_norm_var": 0.2252079205413636, |
| "learning_rate": 0.0001, |
| "loss": 1.1951, |
| "loss/crossentropy": 2.3395628929138184, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.155635803937912, |
| "loss/reg": 3.544955688994378e-05, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 1.8410539627075195, |
| "grad_norm_var": 0.24354386471799874, |
| "learning_rate": 0.0001, |
| "loss": 1.0907, |
| "loss/crossentropy": 2.5739612579345703, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.13335567712783813, |
| "loss/reg": 3.544604260241613e-05, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.019375, |
| "grad_norm": 2.2286977767944336, |
| "grad_norm_var": 0.23796766155861107, |
| "learning_rate": 0.0001, |
| "loss": 1.0203, |
| "loss/crossentropy": 2.2887346744537354, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.11365014314651489, |
| "loss/reg": 3.544157516444102e-05, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 2.3557684421539307, |
| "grad_norm_var": 0.22589299419968203, |
| "learning_rate": 0.0001, |
| "loss": 1.3643, |
| "loss/crossentropy": 2.274764060974121, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.1764501929283142, |
| "loss/reg": 3.543913771864027e-05, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.019625, |
| "grad_norm": 2.247559070587158, |
| "grad_norm_var": 0.15998786264861256, |
| "learning_rate": 0.0001, |
| "loss": 1.3654, |
| "loss/crossentropy": 2.2784736156463623, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.19315966963768005, |
| "loss/reg": 3.543505954439752e-05, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 2.834188222885132, |
| "grad_norm_var": 0.16628359109999918, |
| "learning_rate": 0.0001, |
| "loss": 1.3386, |
| "loss/crossentropy": 2.509218454360962, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18198764324188232, |
| "loss/reg": 3.54316653101705e-05, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.019875, |
| "grad_norm": 2.1036031246185303, |
| "grad_norm_var": 0.17202571468130015, |
| "learning_rate": 0.0001, |
| "loss": 1.0503, |
| "loss/crossentropy": 2.4518606662750244, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.11635103076696396, |
| "loss/reg": 3.542845297488384e-05, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 2.062805652618408, |
| "grad_norm_var": 0.17435755134033895, |
| "learning_rate": 0.0001, |
| "loss": 1.0312, |
| "loss/crossentropy": 2.361372470855713, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.1128474771976471, |
| "loss/reg": 3.5423294320935383e-05, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.020125, |
| "grad_norm": 2.021106004714966, |
| "grad_norm_var": 0.15146251895304388, |
| "learning_rate": 0.0001, |
| "loss": 1.1121, |
| "loss/crossentropy": 2.134568214416504, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1273871660232544, |
| "loss/reg": 3.541701880749315e-05, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 1.8616065979003906, |
| "grad_norm_var": 0.11060822886564707, |
| "learning_rate": 0.0001, |
| "loss": 1.1566, |
| "loss/crossentropy": 2.252749443054199, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1327974796295166, |
| "loss/reg": 3.541166370268911e-05, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.020375, |
| "grad_norm": 3.580717086791992, |
| "grad_norm_var": 0.2251369893581473, |
| "learning_rate": 0.0001, |
| "loss": 1.7797, |
| "loss/crossentropy": 2.554025888442993, |
| "loss/hidden": 1.4921875, |
| "loss/logits": 0.2871723771095276, |
| "loss/reg": 3.54056573996786e-05, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 1.9080392122268677, |
| "grad_norm_var": 0.22348213477058507, |
| "learning_rate": 0.0001, |
| "loss": 1.1281, |
| "loss/crossentropy": 2.561861038208008, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.14337776601314545, |
| "loss/reg": 3.53991927113384e-05, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.020625, |
| "grad_norm": 2.122875213623047, |
| "grad_norm_var": 0.2127240754841674, |
| "learning_rate": 0.0001, |
| "loss": 1.1555, |
| "loss/crossentropy": 2.5679633617401123, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15509843826293945, |
| "loss/reg": 3.539249883033335e-05, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 2.3634352684020996, |
| "grad_norm_var": 0.2043765165354652, |
| "learning_rate": 0.0001, |
| "loss": 1.2497, |
| "loss/crossentropy": 2.691157579421997, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.15564362704753876, |
| "loss/reg": 3.5388431570027024e-05, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.020875, |
| "grad_norm": 1.8694658279418945, |
| "grad_norm_var": 0.1952630533114321, |
| "learning_rate": 0.0001, |
| "loss": 1.193, |
| "loss/crossentropy": 2.585261106491089, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.14572536945343018, |
| "loss/reg": 3.5384666261961684e-05, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 1.9730387926101685, |
| "grad_norm_var": 0.19880394057846942, |
| "learning_rate": 0.0001, |
| "loss": 1.2912, |
| "loss/crossentropy": 2.5292489528656006, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.16588369011878967, |
| "loss/reg": 3.538179225870408e-05, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.021125, |
| "grad_norm": 2.0155906677246094, |
| "grad_norm_var": 0.1966546350588845, |
| "learning_rate": 0.0001, |
| "loss": 1.2574, |
| "loss/crossentropy": 2.252187728881836, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.1554747223854065, |
| "loss/reg": 3.537629891070537e-05, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 2.432105302810669, |
| "grad_norm_var": 0.18926746622633459, |
| "learning_rate": 0.0001, |
| "loss": 1.4755, |
| "loss/crossentropy": 2.1912407875061035, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.1626225709915161, |
| "loss/reg": 3.5371955164009705e-05, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.021375, |
| "grad_norm": 2.286074161529541, |
| "grad_norm_var": 0.18931952814725433, |
| "learning_rate": 0.0001, |
| "loss": 1.1264, |
| "loss/crossentropy": 2.7250216007232666, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.13382771611213684, |
| "loss/reg": 3.53686700691469e-05, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 2.3155455589294434, |
| "grad_norm_var": 0.18886613419825055, |
| "learning_rate": 0.0001, |
| "loss": 1.1029, |
| "loss/crossentropy": 2.54166316986084, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13384276628494263, |
| "loss/reg": 3.536650910973549e-05, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.021625, |
| "grad_norm": 2.488759994506836, |
| "grad_norm_var": 0.19242826239165894, |
| "learning_rate": 0.0001, |
| "loss": 1.2388, |
| "loss/crossentropy": 2.368736743927002, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1524895578622818, |
| "loss/reg": 3.5365450457902625e-05, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 2.2716481685638428, |
| "grad_norm_var": 0.16950942206230835, |
| "learning_rate": 0.0001, |
| "loss": 1.1404, |
| "loss/crossentropy": 2.602968454360962, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.14000558853149414, |
| "loss/reg": 3.5365239455131814e-05, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.021875, |
| "grad_norm": 2.346731424331665, |
| "grad_norm_var": 0.16911372185244672, |
| "learning_rate": 0.0001, |
| "loss": 1.1505, |
| "loss/crossentropy": 2.6104869842529297, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1345091462135315, |
| "loss/reg": 3.536231452017091e-05, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 9.636815071105957, |
| "grad_norm_var": 3.5705013839433035, |
| "learning_rate": 0.0001, |
| "loss": 1.9711, |
| "loss/crossentropy": 1.9007188081741333, |
| "loss/hidden": 1.8359375, |
| "loss/logits": 0.13476577401161194, |
| "loss/reg": 3.535941868904047e-05, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.022125, |
| "grad_norm": 1.9420382976531982, |
| "grad_norm_var": 3.578242683123464, |
| "learning_rate": 0.0001, |
| "loss": 1.0553, |
| "loss/crossentropy": 2.1399552822113037, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.12134355306625366, |
| "loss/reg": 3.535431460477412e-05, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 3.67820405960083, |
| "grad_norm_var": 3.5781775866024454, |
| "learning_rate": 0.0001, |
| "loss": 1.6061, |
| "loss/crossentropy": 2.7716376781463623, |
| "loss/hidden": 1.3671875, |
| "loss/logits": 0.2385806441307068, |
| "loss/reg": 3.5349476092960685e-05, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.022375, |
| "grad_norm": 2.345334768295288, |
| "grad_norm_var": 3.5494032480633924, |
| "learning_rate": 0.0001, |
| "loss": 1.3134, |
| "loss/crossentropy": 2.2584104537963867, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.16456623375415802, |
| "loss/reg": 3.534728966769762e-05, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 2.019059181213379, |
| "grad_norm_var": 3.5377143028114526, |
| "learning_rate": 0.0001, |
| "loss": 1.1633, |
| "loss/crossentropy": 2.688572645187378, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.13166731595993042, |
| "loss/reg": 3.5341858165338635e-05, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.022625, |
| "grad_norm": 2.5575642585754395, |
| "grad_norm_var": 3.5127901367513408, |
| "learning_rate": 0.0001, |
| "loss": 1.3238, |
| "loss/crossentropy": 3.2461724281311035, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.19062137603759766, |
| "loss/reg": 3.534007555572316e-05, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 2.1583099365234375, |
| "grad_norm_var": 3.52691794996746, |
| "learning_rate": 0.0001, |
| "loss": 1.105, |
| "loss/crossentropy": 2.570775270462036, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.13197766244411469, |
| "loss/reg": 3.5337754525244236e-05, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.022875, |
| "grad_norm": 2.1373021602630615, |
| "grad_norm_var": 3.499205684128989, |
| "learning_rate": 0.0001, |
| "loss": 1.0368, |
| "loss/crossentropy": 2.7369937896728516, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.12632793188095093, |
| "loss/reg": 3.5334065614733845e-05, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 1.9534015655517578, |
| "grad_norm_var": 3.501362961216583, |
| "learning_rate": 0.0001, |
| "loss": 1.2631, |
| "loss/crossentropy": 2.348998546600342, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.16902770102024078, |
| "loss/reg": 3.533027120283805e-05, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.023125, |
| "grad_norm": 3.3518424034118652, |
| "grad_norm_var": 3.47560508461983, |
| "learning_rate": 0.0001, |
| "loss": 1.4757, |
| "loss/crossentropy": 2.2752151489257812, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.16288352012634277, |
| "loss/reg": 3.532712798914872e-05, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 1.9095062017440796, |
| "grad_norm_var": 3.5231901050491348, |
| "learning_rate": 0.0001, |
| "loss": 1.1564, |
| "loss/crossentropy": 2.3877570629119873, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.16389842331409454, |
| "loss/reg": 3.531980837578885e-05, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.023375, |
| "grad_norm": 1.7745263576507568, |
| "grad_norm_var": 3.5771479932902293, |
| "learning_rate": 0.0001, |
| "loss": 1.0993, |
| "loss/crossentropy": 2.5461585521698, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.13407567143440247, |
| "loss/reg": 3.531064066919498e-05, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 1.932446002960205, |
| "grad_norm_var": 3.611343163184297, |
| "learning_rate": 0.0001, |
| "loss": 1.1479, |
| "loss/crossentropy": 2.177215099334717, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.13969676196575165, |
| "loss/reg": 3.530231333570555e-05, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.023625, |
| "grad_norm": 2.2318572998046875, |
| "grad_norm_var": 3.6254944343577464, |
| "learning_rate": 0.0001, |
| "loss": 1.1476, |
| "loss/crossentropy": 2.539461374282837, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.13939380645751953, |
| "loss/reg": 3.5298002330819145e-05, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 1.8733116388320923, |
| "grad_norm_var": 3.661635973864308, |
| "learning_rate": 0.0001, |
| "loss": 1.2894, |
| "loss/crossentropy": 2.3773810863494873, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.14065586030483246, |
| "loss/reg": 3.529394234647043e-05, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.023875, |
| "grad_norm": 1.9819684028625488, |
| "grad_norm_var": 3.689103451615234, |
| "learning_rate": 0.0001, |
| "loss": 1.0824, |
| "loss/crossentropy": 2.652743101119995, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.12501415610313416, |
| "loss/reg": 3.528552406351082e-05, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 1.781873345375061, |
| "grad_norm_var": 0.29881303206394033, |
| "learning_rate": 0.0001, |
| "loss": 1.3252, |
| "loss/crossentropy": 2.130110740661621, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.16076770424842834, |
| "loss/reg": 3.527875742292963e-05, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.024125, |
| "grad_norm": 1.968166470527649, |
| "grad_norm_var": 0.29786371458498306, |
| "learning_rate": 0.0001, |
| "loss": 1.1787, |
| "loss/crossentropy": 2.607984781265259, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15494059026241302, |
| "loss/reg": 3.5275123082101345e-05, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 2.250447988510132, |
| "grad_norm_var": 0.14927689793780866, |
| "learning_rate": 0.0001, |
| "loss": 1.2501, |
| "loss/crossentropy": 2.381725549697876, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1403769701719284, |
| "loss/reg": 3.5266541090095416e-05, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.024375, |
| "grad_norm": 2.3107409477233887, |
| "grad_norm_var": 0.14840081385512557, |
| "learning_rate": 0.0001, |
| "loss": 1.308, |
| "loss/crossentropy": 2.5593056678771973, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.15144123136997223, |
| "loss/reg": 3.526056025293656e-05, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 2.0219268798828125, |
| "grad_norm_var": 0.14835622425891018, |
| "learning_rate": 0.0001, |
| "loss": 1.2059, |
| "loss/crossentropy": 2.591111421585083, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1508275270462036, |
| "loss/reg": 3.525464853737503e-05, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.024625, |
| "grad_norm": 1.7184540033340454, |
| "grad_norm_var": 0.14533186557784786, |
| "learning_rate": 0.0001, |
| "loss": 1.1103, |
| "loss/crossentropy": 2.5513174533843994, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.12951934337615967, |
| "loss/reg": 3.5250719520263374e-05, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 2.099649429321289, |
| "grad_norm_var": 0.14497162965532903, |
| "learning_rate": 0.0001, |
| "loss": 1.3992, |
| "loss/crossentropy": 2.3214898109436035, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.1879514902830124, |
| "loss/reg": 3.524802013998851e-05, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.024875, |
| "grad_norm": 2.551090717315674, |
| "grad_norm_var": 0.15877433194770507, |
| "learning_rate": 0.0001, |
| "loss": 1.1497, |
| "loss/crossentropy": 2.0819451808929443, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.14934971928596497, |
| "loss/reg": 3.524927524267696e-05, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 2.41422438621521, |
| "grad_norm_var": 0.1626121663513837, |
| "learning_rate": 0.0001, |
| "loss": 1.1887, |
| "loss/crossentropy": 2.4138712882995605, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15712395310401917, |
| "loss/reg": 3.5251006920589134e-05, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.025125, |
| "grad_norm": 3.0029654502868652, |
| "grad_norm_var": 0.11365057463783608, |
| "learning_rate": 0.0001, |
| "loss": 1.2376, |
| "loss/crossentropy": 2.3929851055145264, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.12789341807365417, |
| "loss/reg": 3.525133433868177e-05, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 2.318742513656616, |
| "grad_norm_var": 0.1129624302912769, |
| "learning_rate": 0.0001, |
| "loss": 1.1832, |
| "loss/crossentropy": 2.4472897052764893, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15940618515014648, |
| "loss/reg": 3.525337888277136e-05, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.025375, |
| "grad_norm": 2.432077169418335, |
| "grad_norm_var": 0.1079851047719283, |
| "learning_rate": 0.0001, |
| "loss": 1.2983, |
| "loss/crossentropy": 2.5350871086120605, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.17297999560832977, |
| "loss/reg": 3.5248252970632166e-05, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 2.019660711288452, |
| "grad_norm_var": 0.10557456561180795, |
| "learning_rate": 0.0001, |
| "loss": 1.1381, |
| "loss/crossentropy": 2.443376064300537, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.1299603134393692, |
| "loss/reg": 3.5241089790361e-05, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.025625, |
| "grad_norm": 1.928032636642456, |
| "grad_norm_var": 0.10948915785116071, |
| "learning_rate": 0.0001, |
| "loss": 1.039, |
| "loss/crossentropy": 2.552450656890869, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.12461342662572861, |
| "loss/reg": 3.523936538840644e-05, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 1.872836947441101, |
| "grad_norm_var": 0.10950776538443824, |
| "learning_rate": 0.0001, |
| "loss": 1.203, |
| "loss/crossentropy": 2.563770294189453, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1558125615119934, |
| "loss/reg": 3.523280975059606e-05, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.025875, |
| "grad_norm": 2.722188711166382, |
| "grad_norm_var": 0.12548596824484168, |
| "learning_rate": 0.0001, |
| "loss": 1.1048, |
| "loss/crossentropy": 2.673701286315918, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.14743700623512268, |
| "loss/reg": 3.523009945638478e-05, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 2.174853563308716, |
| "grad_norm_var": 0.1125315287945383, |
| "learning_rate": 0.0001, |
| "loss": 1.2005, |
| "loss/crossentropy": 2.776305913925171, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16110967099666595, |
| "loss/reg": 3.5228087654104456e-05, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.026125, |
| "grad_norm": 2.026176691055298, |
| "grad_norm_var": 0.11065571110427162, |
| "learning_rate": 0.0001, |
| "loss": 1.2427, |
| "loss/crossentropy": 2.4790477752685547, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.15644526481628418, |
| "loss/reg": 3.5227625630795956e-05, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 1.8512555360794067, |
| "grad_norm_var": 0.12013934057968918, |
| "learning_rate": 0.0001, |
| "loss": 1.1542, |
| "loss/crossentropy": 2.2648251056671143, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1382524073123932, |
| "loss/reg": 3.5229088098276407e-05, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.026375, |
| "grad_norm": 2.1204733848571777, |
| "grad_norm_var": 0.12001253969897234, |
| "learning_rate": 0.0001, |
| "loss": 1.1644, |
| "loss/crossentropy": 2.5878171920776367, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.15627792477607727, |
| "loss/reg": 3.523128543747589e-05, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 2.3862364292144775, |
| "grad_norm_var": 0.11943129282008957, |
| "learning_rate": 0.0001, |
| "loss": 1.2387, |
| "loss/crossentropy": 2.3441061973571777, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1524021029472351, |
| "loss/reg": 3.522722909110598e-05, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.026625, |
| "grad_norm": 2.4904723167419434, |
| "grad_norm_var": 0.10428997507238075, |
| "learning_rate": 0.0001, |
| "loss": 1.2116, |
| "loss/crossentropy": 2.5467946529388428, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.15654009580612183, |
| "loss/reg": 3.522184488247149e-05, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 2.1831283569335938, |
| "grad_norm_var": 0.1027661689763948, |
| "learning_rate": 0.0001, |
| "loss": 1.1544, |
| "loss/crossentropy": 2.782853126525879, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.16573229432106018, |
| "loss/reg": 3.521531107253395e-05, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.026875, |
| "grad_norm": 3.787935972213745, |
| "grad_norm_var": 0.24293552641352203, |
| "learning_rate": 0.0001, |
| "loss": 1.5035, |
| "loss/crossentropy": 2.463303804397583, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.22189679741859436, |
| "loss/reg": 3.5209486668463796e-05, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 1.9294849634170532, |
| "grad_norm_var": 0.25400057735268855, |
| "learning_rate": 0.0001, |
| "loss": 1.1094, |
| "loss/crossentropy": 2.4277851581573486, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.13635680079460144, |
| "loss/reg": 3.5205699532525614e-05, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.027125, |
| "grad_norm": 1.9984577894210815, |
| "grad_norm_var": 0.22665186521847977, |
| "learning_rate": 0.0001, |
| "loss": 1.0918, |
| "loss/crossentropy": 2.5422215461730957, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.12658366560935974, |
| "loss/reg": 3.519668462104164e-05, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 2.2303996086120605, |
| "grad_norm_var": 0.2265080910144917, |
| "learning_rate": 0.0001, |
| "loss": 1.4173, |
| "loss/crossentropy": 2.6119143962860107, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.2059965431690216, |
| "loss/reg": 3.519029269227758e-05, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.027375, |
| "grad_norm": 2.595283031463623, |
| "grad_norm_var": 0.23192599234322203, |
| "learning_rate": 0.0001, |
| "loss": 1.4022, |
| "loss/crossentropy": 2.388324499130249, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.17526502907276154, |
| "loss/reg": 3.5182933061150834e-05, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 2.3104512691497803, |
| "grad_norm_var": 0.2275123342772699, |
| "learning_rate": 0.0001, |
| "loss": 1.1324, |
| "loss/crossentropy": 2.400674819946289, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.1242409497499466, |
| "loss/reg": 3.517704681144096e-05, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.027625, |
| "grad_norm": 2.0627379417419434, |
| "grad_norm_var": 0.2221815343350992, |
| "learning_rate": 0.0001, |
| "loss": 1.2552, |
| "loss/crossentropy": 2.4785444736480713, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1611141413450241, |
| "loss/reg": 3.517186632961966e-05, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 2.6437251567840576, |
| "grad_norm_var": 0.215787531953678, |
| "learning_rate": 0.0001, |
| "loss": 1.3035, |
| "loss/crossentropy": 2.35196590423584, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.21720939874649048, |
| "loss/reg": 3.5167005989933386e-05, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.027875, |
| "grad_norm": 2.0463922023773193, |
| "grad_norm_var": 0.21030634447597923, |
| "learning_rate": 0.0001, |
| "loss": 1.178, |
| "loss/crossentropy": 2.5125770568847656, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.14638057351112366, |
| "loss/reg": 3.516068682074547e-05, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 2.099517822265625, |
| "grad_norm_var": 0.2119416481519659, |
| "learning_rate": 0.0001, |
| "loss": 1.2246, |
| "loss/crossentropy": 2.6609790325164795, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.1539691686630249, |
| "loss/reg": 3.51545459125191e-05, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.028125, |
| "grad_norm": 2.2835323810577393, |
| "grad_norm_var": 0.20676636732833859, |
| "learning_rate": 0.0001, |
| "loss": 1.1949, |
| "loss/crossentropy": 2.408785581588745, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.14770297706127167, |
| "loss/reg": 3.514792115311138e-05, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 2.2726855278015137, |
| "grad_norm_var": 0.19188050953051553, |
| "learning_rate": 0.0001, |
| "loss": 1.125, |
| "loss/crossentropy": 2.3164994716644287, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.13638192415237427, |
| "loss/reg": 3.5143304558005184e-05, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.028375, |
| "grad_norm": 2.239753484725952, |
| "grad_norm_var": 0.18927748053925839, |
| "learning_rate": 0.0001, |
| "loss": 1.4585, |
| "loss/crossentropy": 2.5264809131622314, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.18467766046524048, |
| "loss/reg": 3.5138236853526905e-05, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 2.5043678283691406, |
| "grad_norm_var": 0.1907596103376837, |
| "learning_rate": 0.0001, |
| "loss": 1.465, |
| "loss/crossentropy": 2.1979873180389404, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.191162109375, |
| "loss/reg": 3.513501360430382e-05, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.028625, |
| "grad_norm": 2.2842559814453125, |
| "grad_norm_var": 0.18968967595688752, |
| "learning_rate": 0.0001, |
| "loss": 1.1184, |
| "loss/crossentropy": 2.6084938049316406, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1337008774280548, |
| "loss/reg": 3.5130418837070465e-05, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 78.28353881835938, |
| "grad_norm_var": 360.5321235749958, |
| "learning_rate": 0.0001, |
| "loss": 1.287, |
| "loss/crossentropy": 2.642012119293213, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.15383732318878174, |
| "loss/reg": 3.512646071612835e-05, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.028875, |
| "grad_norm": 2.2934622764587402, |
| "grad_norm_var": 361.3313444069006, |
| "learning_rate": 0.0001, |
| "loss": 1.2146, |
| "loss/crossentropy": 2.125366687774658, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.13609249889850616, |
| "loss/reg": 3.51221788150724e-05, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 2.2334835529327393, |
| "grad_norm_var": 361.13139871490966, |
| "learning_rate": 0.0001, |
| "loss": 1.3334, |
| "loss/crossentropy": 2.4540159702301025, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.16899245977401733, |
| "loss/reg": 3.511944305500947e-05, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.029125, |
| "grad_norm": 2.025312900543213, |
| "grad_norm_var": 361.11344936137425, |
| "learning_rate": 0.0001, |
| "loss": 1.0377, |
| "loss/crossentropy": 2.4971330165863037, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.11546964198350906, |
| "loss/reg": 3.5114706406602636e-05, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 1.8107097148895264, |
| "grad_norm_var": 361.39278859021084, |
| "learning_rate": 0.0001, |
| "loss": 1.0426, |
| "loss/crossentropy": 2.3869125843048096, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.11641789972782135, |
| "loss/reg": 3.510946407914162e-05, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.029375, |
| "grad_norm": 1.5672167539596558, |
| "grad_norm_var": 362.06253246288617, |
| "learning_rate": 0.0001, |
| "loss": 1.1107, |
| "loss/crossentropy": 2.239819049835205, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.12596073746681213, |
| "loss/reg": 3.510485475999303e-05, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 1.660971999168396, |
| "grad_norm_var": 362.48937574795417, |
| "learning_rate": 0.0001, |
| "loss": 1.1466, |
| "loss/crossentropy": 2.445607900619507, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.13841402530670166, |
| "loss/reg": 3.510040551191196e-05, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.029625, |
| "grad_norm": 1.9987144470214844, |
| "grad_norm_var": 362.5308779292139, |
| "learning_rate": 0.0001, |
| "loss": 1.2824, |
| "loss/crossentropy": 2.3451719284057617, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.15705101191997528, |
| "loss/reg": 3.50964764948003e-05, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 1.6107144355773926, |
| "grad_norm_var": 363.18249781017846, |
| "learning_rate": 0.0001, |
| "loss": 1.094, |
| "loss/crossentropy": 2.4538111686706543, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.12884217500686646, |
| "loss/reg": 3.5092118196189404e-05, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.029875, |
| "grad_norm": 2.075155735015869, |
| "grad_norm_var": 363.1642193933474, |
| "learning_rate": 0.0001, |
| "loss": 1.215, |
| "loss/crossentropy": 2.531987190246582, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1521448791027069, |
| "loss/reg": 3.5086912248516455e-05, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 2.3201041221618652, |
| "grad_norm_var": 363.0281972205138, |
| "learning_rate": 0.0001, |
| "loss": 1.1417, |
| "loss/crossentropy": 2.5911645889282227, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.15311402082443237, |
| "loss/reg": 3.5083008697256446e-05, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.030125, |
| "grad_norm": 2.102585554122925, |
| "grad_norm_var": 363.1402101869869, |
| "learning_rate": 0.0001, |
| "loss": 1.0786, |
| "loss/crossentropy": 2.350059747695923, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.13292624056339264, |
| "loss/reg": 3.507927613100037e-05, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 2.0235884189605713, |
| "grad_norm_var": 363.295456416674, |
| "learning_rate": 0.0001, |
| "loss": 1.191, |
| "loss/crossentropy": 2.2975988388061523, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1438218355178833, |
| "loss/reg": 3.5071363527094945e-05, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.030375, |
| "grad_norm": 2.6130146980285645, |
| "grad_norm_var": 363.0764814158417, |
| "learning_rate": 0.0001, |
| "loss": 1.2925, |
| "loss/crossentropy": 2.477764368057251, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.15148332715034485, |
| "loss/reg": 3.5061231756117195e-05, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 2.840742826461792, |
| "grad_norm_var": 362.889192023, |
| "learning_rate": 0.0001, |
| "loss": 1.0848, |
| "loss/crossentropy": 2.4943950176239014, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.12739571928977966, |
| "loss/reg": 3.505099084577523e-05, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.030625, |
| "grad_norm": 2.412440776824951, |
| "grad_norm_var": 362.8120310886774, |
| "learning_rate": 0.0001, |
| "loss": 1.1284, |
| "loss/crossentropy": 2.5384373664855957, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.1358700394630432, |
| "loss/reg": 3.504483902361244e-05, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 2.105398654937744, |
| "grad_norm_var": 0.12231122414377561, |
| "learning_rate": 0.0001, |
| "loss": 1.2701, |
| "loss/crossentropy": 2.192657232284546, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.16039346158504486, |
| "loss/reg": 3.5036639019381255e-05, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.030875, |
| "grad_norm": 1.9865885972976685, |
| "grad_norm_var": 0.12052054727502123, |
| "learning_rate": 0.0001, |
| "loss": 1.1144, |
| "loss/crossentropy": 2.396636486053467, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.14134395122528076, |
| "loss/reg": 3.502915205899626e-05, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 2.064413547515869, |
| "grad_norm_var": 0.11899755252362822, |
| "learning_rate": 0.0001, |
| "loss": 1.2757, |
| "loss/crossentropy": 2.2669105529785156, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1659543216228485, |
| "loss/reg": 3.502297113300301e-05, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.031125, |
| "grad_norm": 2.341909170150757, |
| "grad_norm_var": 0.12311806681906787, |
| "learning_rate": 0.0001, |
| "loss": 1.1985, |
| "loss/crossentropy": 2.6345901489257812, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.16693958640098572, |
| "loss/reg": 3.501290120766498e-05, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 2.0425026416778564, |
| "grad_norm_var": 0.11766230442624745, |
| "learning_rate": 0.0001, |
| "loss": 1.1968, |
| "loss/crossentropy": 2.4224119186401367, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.15737830102443695, |
| "loss/reg": 3.500358798191883e-05, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.031375, |
| "grad_norm": 2.139225482940674, |
| "grad_norm_var": 0.09668613014885802, |
| "learning_rate": 0.0001, |
| "loss": 1.2377, |
| "loss/crossentropy": 2.7258267402648926, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.14362195134162903, |
| "loss/reg": 3.499682497931644e-05, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 2.102008581161499, |
| "grad_norm_var": 0.08031358514106011, |
| "learning_rate": 0.0001, |
| "loss": 1.1117, |
| "loss/crossentropy": 2.432748317718506, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.1425955444574356, |
| "loss/reg": 3.498911246424541e-05, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.031625, |
| "grad_norm": 2.2371959686279297, |
| "grad_norm_var": 0.0783042488946918, |
| "learning_rate": 0.0001, |
| "loss": 1.0361, |
| "loss/crossentropy": 2.437335968017578, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.13340801000595093, |
| "loss/reg": 3.498331716400571e-05, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 2.230013608932495, |
| "grad_norm_var": 0.054557147559412954, |
| "learning_rate": 0.0001, |
| "loss": 1.2139, |
| "loss/crossentropy": 2.159069299697876, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.12760058045387268, |
| "loss/reg": 3.497749275993556e-05, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.031875, |
| "grad_norm": 2.2526209354400635, |
| "grad_norm_var": 0.05292534377042599, |
| "learning_rate": 0.0001, |
| "loss": 1.1517, |
| "loss/crossentropy": 2.744022846221924, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15138062834739685, |
| "loss/reg": 3.496619319776073e-05, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 2.243044376373291, |
| "grad_norm_var": 0.05245697188967089, |
| "learning_rate": 0.0001, |
| "loss": 1.6025, |
| "loss/crossentropy": 2.2367136478424072, |
| "loss/hidden": 1.3515625, |
| "loss/logits": 0.25056183338165283, |
| "loss/reg": 3.495947385090403e-05, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.032125, |
| "grad_norm": 2.2301154136657715, |
| "grad_norm_var": 0.05124602164451577, |
| "learning_rate": 0.0001, |
| "loss": 1.1302, |
| "loss/crossentropy": 2.8784143924713135, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1337929368019104, |
| "loss/reg": 3.495061901048757e-05, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 1.9009541273117065, |
| "grad_norm_var": 0.0557499358364358, |
| "learning_rate": 0.0001, |
| "loss": 1.1815, |
| "loss/crossentropy": 2.852105140686035, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1498585343360901, |
| "loss/reg": 3.494451448204927e-05, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.032375, |
| "grad_norm": 1.6674587726593018, |
| "grad_norm_var": 0.06383147372835059, |
| "learning_rate": 0.0001, |
| "loss": 1.1406, |
| "loss/crossentropy": 2.6262168884277344, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.1480187475681305, |
| "loss/reg": 3.493377516861074e-05, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 2.1328914165496826, |
| "grad_norm_var": 0.03229453348446872, |
| "learning_rate": 0.0001, |
| "loss": 1.2748, |
| "loss/crossentropy": 2.944517135620117, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.15721508860588074, |
| "loss/reg": 3.492645555525087e-05, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.032625, |
| "grad_norm": 1.8784502744674683, |
| "grad_norm_var": 0.030045803407693878, |
| "learning_rate": 0.0001, |
| "loss": 1.0683, |
| "loss/crossentropy": 2.2746896743774414, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.1304292529821396, |
| "loss/reg": 3.4919979952974245e-05, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 2.3806915283203125, |
| "grad_norm_var": 0.03508431327747174, |
| "learning_rate": 0.0001, |
| "loss": 1.0997, |
| "loss/crossentropy": 2.5303072929382324, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.13454417884349823, |
| "loss/reg": 3.4913624403998256e-05, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.032875, |
| "grad_norm": 1.8924614191055298, |
| "grad_norm_var": 0.03724188133506529, |
| "learning_rate": 0.0001, |
| "loss": 1.198, |
| "loss/crossentropy": 2.496983766555786, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.15854007005691528, |
| "loss/reg": 3.490634844638407e-05, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 2.439052104949951, |
| "grad_norm_var": 0.04381194480349577, |
| "learning_rate": 0.0001, |
| "loss": 0.953, |
| "loss/crossentropy": 2.7149696350097656, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.1127798855304718, |
| "loss/reg": 3.490111339488067e-05, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.033125, |
| "grad_norm": 2.3024566173553467, |
| "grad_norm_var": 0.042804570962997876, |
| "learning_rate": 0.0001, |
| "loss": 1.1402, |
| "loss/crossentropy": 2.447719097137451, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1632998287677765, |
| "loss/reg": 3.489398295641877e-05, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 2.287111520767212, |
| "grad_norm_var": 0.04370853447134184, |
| "learning_rate": 0.0001, |
| "loss": 1.2209, |
| "loss/crossentropy": 2.1921133995056152, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.15808838605880737, |
| "loss/reg": 3.48894864146132e-05, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.033375, |
| "grad_norm": 2.336646318435669, |
| "grad_norm_var": 0.04599945823637951, |
| "learning_rate": 0.0001, |
| "loss": 1.2784, |
| "loss/crossentropy": 2.231534719467163, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.15302959084510803, |
| "loss/reg": 3.488411311991513e-05, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 1.9491949081420898, |
| "grad_norm_var": 0.04858091189580491, |
| "learning_rate": 0.0001, |
| "loss": 1.3204, |
| "loss/crossentropy": 2.3477442264556885, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.17159268260002136, |
| "loss/reg": 3.488002403173596e-05, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.033625, |
| "grad_norm": 1.8812212944030762, |
| "grad_norm_var": 0.05224458505779281, |
| "learning_rate": 0.0001, |
| "loss": 1.1176, |
| "loss/crossentropy": 2.3098113536834717, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.13675528764724731, |
| "loss/reg": 3.487144203973003e-05, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 2.1917343139648438, |
| "grad_norm_var": 0.05180158566938828, |
| "learning_rate": 0.0001, |
| "loss": 1.3389, |
| "loss/crossentropy": 2.6006360054016113, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.16665683686733246, |
| "loss/reg": 3.4867065551225096e-05, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.033875, |
| "grad_norm": 2.1218273639678955, |
| "grad_norm_var": 0.050608227478554056, |
| "learning_rate": 0.0001, |
| "loss": 1.2115, |
| "loss/crossentropy": 2.3429975509643555, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.15650612115859985, |
| "loss/reg": 3.486104469629936e-05, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 2.216865062713623, |
| "grad_norm_var": 0.05020309095007551, |
| "learning_rate": 0.0001, |
| "loss": 1.309, |
| "loss/crossentropy": 2.6172518730163574, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18367114663124084, |
| "loss/reg": 3.485478737275116e-05, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.034125, |
| "grad_norm": 2.660276174545288, |
| "grad_norm_var": 0.06848105136911228, |
| "learning_rate": 0.0001, |
| "loss": 1.2656, |
| "loss/crossentropy": 2.6182620525360107, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.16370661556720734, |
| "loss/reg": 3.4849643270717934e-05, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 1.9411466121673584, |
| "grad_norm_var": 0.06730120648781887, |
| "learning_rate": 0.0001, |
| "loss": 1.2427, |
| "loss/crossentropy": 2.360733985900879, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.14862678945064545, |
| "loss/reg": 3.4842636523535475e-05, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.034375, |
| "grad_norm": 7.836575031280518, |
| "grad_norm_var": 2.0552077515562965, |
| "learning_rate": 0.0001, |
| "loss": 2.1182, |
| "loss/crossentropy": 2.7246999740600586, |
| "loss/hidden": 1.8671875, |
| "loss/logits": 0.25068169832229614, |
| "loss/reg": 3.483570981188677e-05, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 15.36082649230957, |
| "grad_norm_var": 12.29442028509769, |
| "learning_rate": 0.0001, |
| "loss": 1.2297, |
| "loss/crossentropy": 2.7898216247558594, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16689400374889374, |
| "loss/reg": 3.482873580651358e-05, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.034625, |
| "grad_norm": 2.4146132469177246, |
| "grad_norm_var": 12.206846506541835, |
| "learning_rate": 0.0001, |
| "loss": 1.1546, |
| "loss/crossentropy": 2.6433522701263428, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15420594811439514, |
| "loss/reg": 3.482148167677224e-05, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 2.0731208324432373, |
| "grad_norm_var": 12.254080178741175, |
| "learning_rate": 0.0001, |
| "loss": 1.1779, |
| "loss/crossentropy": 2.2190020084381104, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.14627079665660858, |
| "loss/reg": 3.481503881630488e-05, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.034875, |
| "grad_norm": 2.2058444023132324, |
| "grad_norm_var": 12.19851901002264, |
| "learning_rate": 0.0001, |
| "loss": 1.1722, |
| "loss/crossentropy": 2.3826961517333984, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15618480741977692, |
| "loss/reg": 3.48087414749898e-05, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 1.8618927001953125, |
| "grad_norm_var": 12.292415025402823, |
| "learning_rate": 0.0001, |
| "loss": 1.2047, |
| "loss/crossentropy": 2.424729824066162, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16533657908439636, |
| "loss/reg": 3.479952283669263e-05, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.035125, |
| "grad_norm": 1.9102482795715332, |
| "grad_norm_var": 12.356945094423846, |
| "learning_rate": 0.0001, |
| "loss": 1.2226, |
| "loss/crossentropy": 2.4433910846710205, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1675223708152771, |
| "loss/reg": 3.4793913073372096e-05, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 2.706441640853882, |
| "grad_norm_var": 12.309734168758586, |
| "learning_rate": 0.0001, |
| "loss": 1.0941, |
| "loss/crossentropy": 3.0568315982818604, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.13671937584877014, |
| "loss/reg": 3.478690632618964e-05, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.035375, |
| "grad_norm": 2.22886061668396, |
| "grad_norm_var": 12.325085121884593, |
| "learning_rate": 0.0001, |
| "loss": 1.2228, |
| "loss/crossentropy": 2.31925892829895, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16773122549057007, |
| "loss/reg": 3.4781944123096764e-05, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 1.9104151725769043, |
| "grad_norm_var": 12.332409456506062, |
| "learning_rate": 0.0001, |
| "loss": 1.2103, |
| "loss/crossentropy": 2.433910369873047, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.14740660786628723, |
| "loss/reg": 3.47744207829237e-05, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.035625, |
| "grad_norm": 1.8624837398529053, |
| "grad_norm_var": 12.336088715902626, |
| "learning_rate": 0.0001, |
| "loss": 1.1048, |
| "loss/crossentropy": 2.308101177215576, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.14743317663669586, |
| "loss/reg": 3.476895290077664e-05, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 1.7985292673110962, |
| "grad_norm_var": 12.406159364169602, |
| "learning_rate": 0.0001, |
| "loss": 1.391, |
| "loss/crossentropy": 2.338219165802002, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20311352610588074, |
| "loss/reg": 3.476293932180852e-05, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.035875, |
| "grad_norm": 2.0390207767486572, |
| "grad_norm_var": 12.419809877029808, |
| "learning_rate": 0.0001, |
| "loss": 1.2142, |
| "loss/crossentropy": 2.798340320587158, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.1748366355895996, |
| "loss/reg": 3.4757238609017804e-05, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 2.6159002780914307, |
| "grad_norm_var": 12.371378457752575, |
| "learning_rate": 0.0001, |
| "loss": 1.3686, |
| "loss/crossentropy": 2.0943591594696045, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.1729019582271576, |
| "loss/reg": 3.475058838375844e-05, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.036125, |
| "grad_norm": 2.64959716796875, |
| "grad_norm_var": 12.372352193512818, |
| "learning_rate": 0.0001, |
| "loss": 1.3369, |
| "loss/crossentropy": 2.875190019607544, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.17247627675533295, |
| "loss/reg": 3.474393815849908e-05, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 2.2474873065948486, |
| "grad_norm_var": 12.321143222954635, |
| "learning_rate": 0.0001, |
| "loss": 1.252, |
| "loss/crossentropy": 2.6608498096466064, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.17357373237609863, |
| "loss/reg": 3.473682954791002e-05, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.036375, |
| "grad_norm": 2.4677608013153076, |
| "grad_norm_var": 10.916427124267383, |
| "learning_rate": 0.0001, |
| "loss": 1.0142, |
| "loss/crossentropy": 2.8202872276306152, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.11931537836790085, |
| "loss/reg": 3.472894968581386e-05, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 2.4355132579803467, |
| "grad_norm_var": 0.09359576037076062, |
| "learning_rate": 0.0001, |
| "loss": 1.344, |
| "loss/crossentropy": 2.588075876235962, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.17175744473934174, |
| "loss/reg": 3.4721750125754625e-05, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.036625, |
| "grad_norm": 1.8848857879638672, |
| "grad_norm_var": 0.09698104319835413, |
| "learning_rate": 0.0001, |
| "loss": 1.0617, |
| "loss/crossentropy": 2.2032036781311035, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.12777957320213318, |
| "loss/reg": 3.471899981377646e-05, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 2.0544004440307617, |
| "grad_norm_var": 0.09727253081927305, |
| "learning_rate": 0.0001, |
| "loss": 1.1195, |
| "loss/crossentropy": 2.68400502204895, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.14646826684474945, |
| "loss/reg": 3.471321178949438e-05, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.036875, |
| "grad_norm": 2.784888744354248, |
| "grad_norm_var": 0.12022710970729276, |
| "learning_rate": 0.0001, |
| "loss": 1.4568, |
| "loss/crossentropy": 2.2672512531280518, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.19084087014198303, |
| "loss/reg": 3.4705888538155705e-05, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 2.107628345489502, |
| "grad_norm_var": 0.11239423391909416, |
| "learning_rate": 0.0001, |
| "loss": 1.1654, |
| "loss/crossentropy": 2.3829853534698486, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.14940626919269562, |
| "loss/reg": 3.470026422291994e-05, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.037125, |
| "grad_norm": 2.666799783706665, |
| "grad_norm_var": 0.11576118522773501, |
| "learning_rate": 0.0001, |
| "loss": 1.2658, |
| "loss/crossentropy": 2.3234565258026123, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.17954039573669434, |
| "loss/reg": 3.469527655397542e-05, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 1.8957864046096802, |
| "grad_norm_var": 0.11060988429572352, |
| "learning_rate": 0.0001, |
| "loss": 1.305, |
| "loss/crossentropy": 2.4239273071289062, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.17967185378074646, |
| "loss/reg": 3.469140938250348e-05, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.037375, |
| "grad_norm": 2.1476354598999023, |
| "grad_norm_var": 0.1110142344328826, |
| "learning_rate": 0.0001, |
| "loss": 1.18, |
| "loss/crossentropy": 2.3623642921447754, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1561676412820816, |
| "loss/reg": 3.468482827884145e-05, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 1.9932529926300049, |
| "grad_norm_var": 0.10799009738127907, |
| "learning_rate": 0.0001, |
| "loss": 1.1559, |
| "loss/crossentropy": 2.69016170501709, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.14770537614822388, |
| "loss/reg": 3.468065187917091e-05, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.037625, |
| "grad_norm": 2.2224810123443604, |
| "grad_norm_var": 0.0985346154888075, |
| "learning_rate": 0.0001, |
| "loss": 1.1653, |
| "loss/crossentropy": 2.553476572036743, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.14147460460662842, |
| "loss/reg": 3.467235364951193e-05, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 2.135714530944824, |
| "grad_norm_var": 0.08531074310993644, |
| "learning_rate": 0.0001, |
| "loss": 1.2046, |
| "loss/crossentropy": 2.243950128555298, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1418030858039856, |
| "loss/reg": 3.466893394943327e-05, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.037875, |
| "grad_norm": 2.3680195808410645, |
| "grad_norm_var": 0.08186467355099622, |
| "learning_rate": 0.0001, |
| "loss": 1.2752, |
| "loss/crossentropy": 2.2647523880004883, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.1420516073703766, |
| "loss/reg": 3.4659868106245995e-05, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 2.08476185798645, |
| "grad_norm_var": 0.07658376607289051, |
| "learning_rate": 0.0001, |
| "loss": 1.1305, |
| "loss/crossentropy": 2.627105712890625, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.14580708742141724, |
| "loss/reg": 3.465437112026848e-05, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.038125, |
| "grad_norm": 2.110761880874634, |
| "grad_norm_var": 0.06667962973879416, |
| "learning_rate": 0.0001, |
| "loss": 1.2576, |
| "loss/crossentropy": 2.3144562244415283, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.16346824169158936, |
| "loss/reg": 3.4645545383682474e-05, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 1.900791049003601, |
| "grad_norm_var": 0.07317499342195574, |
| "learning_rate": 0.0001, |
| "loss": 1.0492, |
| "loss/crossentropy": 2.716005325317383, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.13480325043201447, |
| "loss/reg": 3.463495886535384e-05, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.038375, |
| "grad_norm": 6.467423439025879, |
| "grad_norm_var": 1.2137641430294792, |
| "learning_rate": 0.0001, |
| "loss": 1.4583, |
| "loss/crossentropy": 2.1065866947174072, |
| "loss/hidden": 1.2890625, |
| "loss/logits": 0.1689138412475586, |
| "loss/reg": 3.462713721091859e-05, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 2.0642800331115723, |
| "grad_norm_var": 1.2232825060870915, |
| "learning_rate": 0.0001, |
| "loss": 1.185, |
| "loss/crossentropy": 2.624023675918579, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1534431129693985, |
| "loss/reg": 3.4622189559740946e-05, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.038625, |
| "grad_norm": 1.680816650390625, |
| "grad_norm_var": 1.2407335757806945, |
| "learning_rate": 0.0001, |
| "loss": 1.153, |
| "loss/crossentropy": 2.327143430709839, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.14481112360954285, |
| "loss/reg": 3.461851520114578e-05, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 1.7195234298706055, |
| "grad_norm_var": 1.2639701691366267, |
| "learning_rate": 0.0001, |
| "loss": 1.1146, |
| "loss/crossentropy": 2.234018087387085, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.141597181558609, |
| "loss/reg": 3.461261803749949e-05, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.038875, |
| "grad_norm": 1.9253605604171753, |
| "grad_norm_var": 1.265680677961886, |
| "learning_rate": 0.0001, |
| "loss": 1.1557, |
| "loss/crossentropy": 2.310030460357666, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.13973672688007355, |
| "loss/reg": 3.4606102417455986e-05, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 1.8293761014938354, |
| "grad_norm_var": 1.2792590983492156, |
| "learning_rate": 0.0001, |
| "loss": 1.1528, |
| "loss/crossentropy": 2.735854148864746, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1367899477481842, |
| "loss/reg": 3.460055449977517e-05, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.039125, |
| "grad_norm": 2.47273325920105, |
| "grad_norm_var": 1.2727893848260379, |
| "learning_rate": 0.0001, |
| "loss": 1.1983, |
| "loss/crossentropy": 2.4735970497131348, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.16670575737953186, |
| "loss/reg": 3.4597334888530895e-05, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 2.4255082607269287, |
| "grad_norm_var": 1.2608122772144856, |
| "learning_rate": 0.0001, |
| "loss": 1.1323, |
| "loss/crossentropy": 2.6550486087799072, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1554185003042221, |
| "loss/reg": 3.4591066651046276e-05, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.039375, |
| "grad_norm": 2.7170803546905518, |
| "grad_norm_var": 1.2659589390154238, |
| "learning_rate": 0.0001, |
| "loss": 1.439, |
| "loss/crossentropy": 2.187866687774658, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.21992294490337372, |
| "loss/reg": 3.458749415585771e-05, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 2.1601176261901855, |
| "grad_norm_var": 1.259041909984486, |
| "learning_rate": 0.0001, |
| "loss": 1.3232, |
| "loss/crossentropy": 2.364377737045288, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.1744486689567566, |
| "loss/reg": 3.458080755081028e-05, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.039625, |
| "grad_norm": 1.5947940349578857, |
| "grad_norm_var": 1.2979203484203092, |
| "learning_rate": 0.0001, |
| "loss": 1.028, |
| "loss/crossentropy": 2.4233779907226562, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.11754067242145538, |
| "loss/reg": 3.4571639844216406e-05, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 1.9653059244155884, |
| "grad_norm_var": 1.3046851365567058, |
| "learning_rate": 0.0001, |
| "loss": 1.126, |
| "loss/crossentropy": 2.31756854057312, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.12566252052783966, |
| "loss/reg": 3.456034028204158e-05, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.039875, |
| "grad_norm": 2.036482572555542, |
| "grad_norm_var": 1.310445228246628, |
| "learning_rate": 0.0001, |
| "loss": 1.1971, |
| "loss/crossentropy": 2.3461310863494873, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.17331476509571075, |
| "loss/reg": 3.455525802564807e-05, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 2.075100898742676, |
| "grad_norm_var": 1.3107569056456743, |
| "learning_rate": 0.0001, |
| "loss": 1.327, |
| "loss/crossentropy": 2.506412982940674, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1859946846961975, |
| "loss/reg": 3.45489097526297e-05, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.040125, |
| "grad_norm": 2.719728946685791, |
| "grad_norm_var": 1.3168160620395004, |
| "learning_rate": 0.0001, |
| "loss": 1.3389, |
| "loss/crossentropy": 2.4216549396514893, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1822904348373413, |
| "loss/reg": 3.45378830388654e-05, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 2.6060094833374023, |
| "grad_norm_var": 1.3047531355820505, |
| "learning_rate": 0.0001, |
| "loss": 1.5674, |
| "loss/crossentropy": 2.658222198486328, |
| "loss/hidden": 1.34375, |
| "loss/logits": 0.22325485944747925, |
| "loss/reg": 3.453323370194994e-05, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.040375, |
| "grad_norm": 2.5935699939727783, |
| "grad_norm_var": 0.14371946682283335, |
| "learning_rate": 0.0001, |
| "loss": 1.1519, |
| "loss/crossentropy": 2.6234920024871826, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.13588842749595642, |
| "loss/reg": 3.452138480497524e-05, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 2.4168694019317627, |
| "grad_norm_var": 0.1469136698932582, |
| "learning_rate": 0.0001, |
| "loss": 1.2903, |
| "loss/crossentropy": 2.3443264961242676, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18053767085075378, |
| "loss/reg": 3.4516375308157876e-05, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.040625, |
| "grad_norm": 2.598829746246338, |
| "grad_norm_var": 0.13803791478749894, |
| "learning_rate": 0.0001, |
| "loss": 1.0512, |
| "loss/crossentropy": 2.6923201084136963, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.11729900538921356, |
| "loss/reg": 3.450660369708203e-05, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 2.2705416679382324, |
| "grad_norm_var": 0.11870002646295455, |
| "learning_rate": 0.0001, |
| "loss": 1.3007, |
| "loss/crossentropy": 2.8326780796051025, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.16750125586986542, |
| "loss/reg": 3.449714495218359e-05, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.040875, |
| "grad_norm": 2.108572006225586, |
| "grad_norm_var": 0.11224555742265707, |
| "learning_rate": 0.0001, |
| "loss": 1.2583, |
| "loss/crossentropy": 2.4080569744110107, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.15636944770812988, |
| "loss/reg": 3.449182622716762e-05, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 1.9010335206985474, |
| "grad_norm_var": 0.10819501908635042, |
| "learning_rate": 0.0001, |
| "loss": 1.1679, |
| "loss/crossentropy": 2.7714359760284424, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15192916989326477, |
| "loss/reg": 3.448529605520889e-05, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.041125, |
| "grad_norm": 2.413954973220825, |
| "grad_norm_var": 0.1069897618565714, |
| "learning_rate": 0.0001, |
| "loss": 1.4022, |
| "loss/crossentropy": 2.225947618484497, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.19096189737319946, |
| "loss/reg": 3.4474134736228734e-05, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 2.385904312133789, |
| "grad_norm_var": 0.1063601900492659, |
| "learning_rate": 0.0001, |
| "loss": 1.2387, |
| "loss/crossentropy": 2.4583194255828857, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.16027729213237762, |
| "loss/reg": 3.44689360645134e-05, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.041375, |
| "grad_norm": 2.4138543605804443, |
| "grad_norm_var": 0.09464759263946097, |
| "learning_rate": 0.0001, |
| "loss": 1.2305, |
| "loss/crossentropy": 2.5432093143463135, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.15986011922359467, |
| "loss/reg": 3.446204573265277e-05, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 3.4917404651641846, |
| "grad_norm_var": 0.18662260281899326, |
| "learning_rate": 0.0001, |
| "loss": 1.4531, |
| "loss/crossentropy": 2.667515516281128, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.19496265053749084, |
| "loss/reg": 3.445533729973249e-05, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.041625, |
| "grad_norm": 2.6574389934539795, |
| "grad_norm_var": 0.15026464336703782, |
| "learning_rate": 0.0001, |
| "loss": 1.2525, |
| "loss/crossentropy": 2.8141376972198486, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16617870330810547, |
| "loss/reg": 3.444873073021881e-05, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 2.011918783187866, |
| "grad_norm_var": 0.14759976834883393, |
| "learning_rate": 0.0001, |
| "loss": 1.2704, |
| "loss/crossentropy": 2.5743820667266846, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.16070207953453064, |
| "loss/reg": 3.444178946665488e-05, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.041875, |
| "grad_norm": 2.0087714195251465, |
| "grad_norm_var": 0.14906053005454342, |
| "learning_rate": 0.0001, |
| "loss": 1.1449, |
| "loss/crossentropy": 2.510054111480713, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.13675576448440552, |
| "loss/reg": 3.4435932320775464e-05, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 2.2362313270568848, |
| "grad_norm_var": 0.1433353693831899, |
| "learning_rate": 0.0001, |
| "loss": 1.1899, |
| "loss/crossentropy": 2.4818272590637207, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15829679369926453, |
| "loss/reg": 3.443356399657205e-05, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.042125, |
| "grad_norm": 2.767322301864624, |
| "grad_norm_var": 0.14533335584858278, |
| "learning_rate": 0.0001, |
| "loss": 1.4789, |
| "loss/crossentropy": 1.9489027261734009, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.22079172730445862, |
| "loss/reg": 3.442970410105772e-05, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 1.9009366035461426, |
| "grad_norm_var": 0.15987229719162357, |
| "learning_rate": 0.0001, |
| "loss": 1.0737, |
| "loss/crossentropy": 2.4470298290252686, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.13584166765213013, |
| "loss/reg": 3.44260515703354e-05, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.042375, |
| "grad_norm": 2.4537954330444336, |
| "grad_norm_var": 0.1572266899389352, |
| "learning_rate": 0.0001, |
| "loss": 1.4332, |
| "loss/crossentropy": 2.3663222789764404, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.1906408667564392, |
| "loss/reg": 3.442170054768212e-05, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 2.032825469970703, |
| "grad_norm_var": 0.1644215429789795, |
| "learning_rate": 0.0001, |
| "loss": 1.1493, |
| "loss/crossentropy": 2.4772043228149414, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.1411634385585785, |
| "loss/reg": 3.4417531423969194e-05, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.042625, |
| "grad_norm": 2.3325209617614746, |
| "grad_norm_var": 0.1601377693951102, |
| "learning_rate": 0.0001, |
| "loss": 1.2114, |
| "loss/crossentropy": 2.45560622215271, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.15634939074516296, |
| "loss/reg": 3.44164072885178e-05, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 1.9114069938659668, |
| "grad_norm_var": 0.1713673299562344, |
| "learning_rate": 0.0001, |
| "loss": 1.0862, |
| "loss/crossentropy": 2.6129817962646484, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.14053717255592346, |
| "loss/reg": 3.441024091443978e-05, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.042875, |
| "grad_norm": 3.5132644176483154, |
| "grad_norm_var": 0.256165301144145, |
| "learning_rate": 0.0001, |
| "loss": 1.4695, |
| "loss/crossentropy": 3.366391658782959, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.203495591878891, |
| "loss/reg": 3.440545333432965e-05, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 1.859586238861084, |
| "grad_norm_var": 0.2590414795273373, |
| "learning_rate": 0.0001, |
| "loss": 1.1472, |
| "loss/crossentropy": 2.428865909576416, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1624484360218048, |
| "loss/reg": 3.440186628722586e-05, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.043125, |
| "grad_norm": 1.98198401927948, |
| "grad_norm_var": 0.2698694637418498, |
| "learning_rate": 0.0001, |
| "loss": 1.3233, |
| "loss/crossentropy": 2.302736282348633, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18230174481868744, |
| "loss/reg": 3.439932697801851e-05, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 2.3659989833831787, |
| "grad_norm_var": 0.2698585694015619, |
| "learning_rate": 0.0001, |
| "loss": 1.14, |
| "loss/crossentropy": 2.5037076473236084, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.13970160484313965, |
| "loss/reg": 3.439432111917995e-05, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.043375, |
| "grad_norm": 2.281691074371338, |
| "grad_norm_var": 0.2701990568843252, |
| "learning_rate": 0.0001, |
| "loss": 1.2372, |
| "loss/crossentropy": 2.4770443439483643, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.15086981654167175, |
| "loss/reg": 3.439074498601258e-05, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 2.166990041732788, |
| "grad_norm_var": 0.18050477852144595, |
| "learning_rate": 0.0001, |
| "loss": 1.32, |
| "loss/crossentropy": 2.5852904319763184, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.19465678930282593, |
| "loss/reg": 3.438742714934051e-05, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.043625, |
| "grad_norm": 1.9888639450073242, |
| "grad_norm_var": 0.17481059186203815, |
| "learning_rate": 0.0001, |
| "loss": 1.4304, |
| "loss/crossentropy": 2.2656476497650146, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.21130971610546112, |
| "loss/reg": 3.437901978031732e-05, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 1.7951990365982056, |
| "grad_norm_var": 0.1842899236598953, |
| "learning_rate": 0.0001, |
| "loss": 1.1657, |
| "loss/crossentropy": 2.3542745113372803, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.141954243183136, |
| "loss/reg": 3.437545819906518e-05, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.043875, |
| "grad_norm": 2.0519394874572754, |
| "grad_norm_var": 0.18316277481239354, |
| "learning_rate": 0.0001, |
| "loss": 1.0528, |
| "loss/crossentropy": 2.401294231414795, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.13452798128128052, |
| "loss/reg": 3.4367119951639324e-05, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 2.12829327583313, |
| "grad_norm_var": 0.18376578016818687, |
| "learning_rate": 0.0001, |
| "loss": 1.1034, |
| "loss/crossentropy": 2.637117624282837, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.14211076498031616, |
| "loss/reg": 3.436086262809113e-05, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.044125, |
| "grad_norm": 1.8479722738265991, |
| "grad_norm_var": 0.16959696182082518, |
| "learning_rate": 0.0001, |
| "loss": 1.1462, |
| "loss/crossentropy": 2.534714698791504, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.14980709552764893, |
| "loss/reg": 3.435524195083417e-05, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 1.7758541107177734, |
| "grad_norm_var": 0.17495091080606068, |
| "learning_rate": 0.0001, |
| "loss": 1.2699, |
| "loss/crossentropy": 2.484570026397705, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.16018345952033997, |
| "loss/reg": 3.434780228417367e-05, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.044375, |
| "grad_norm": 2.274486541748047, |
| "grad_norm_var": 0.1698290651703018, |
| "learning_rate": 0.0001, |
| "loss": 1.4533, |
| "loss/crossentropy": 2.1426889896392822, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.21076492965221405, |
| "loss/reg": 3.433872916502878e-05, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 2.4361371994018555, |
| "grad_norm_var": 0.17400054735299186, |
| "learning_rate": 0.0001, |
| "loss": 1.4138, |
| "loss/crossentropy": 2.34684419631958, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.17912375926971436, |
| "loss/reg": 3.432907396927476e-05, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.044625, |
| "grad_norm": 2.6452977657318115, |
| "grad_norm_var": 0.1869129455570799, |
| "learning_rate": 0.0001, |
| "loss": 1.131, |
| "loss/crossentropy": 2.5198328495025635, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.11500594019889832, |
| "loss/reg": 3.431808727327734e-05, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 1.905686855316162, |
| "grad_norm_var": 0.18712675263565845, |
| "learning_rate": 0.0001, |
| "loss": 1.1227, |
| "loss/crossentropy": 2.7459700107574463, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1379736065864563, |
| "loss/reg": 3.431065852055326e-05, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.044875, |
| "grad_norm": 2.1955368518829346, |
| "grad_norm_var": 0.06293061471083418, |
| "learning_rate": 0.0001, |
| "loss": 1.2854, |
| "loss/crossentropy": 2.514496326446533, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.19134163856506348, |
| "loss/reg": 3.430093784118071e-05, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 2.074857711791992, |
| "grad_norm_var": 0.0587442988467273, |
| "learning_rate": 0.0001, |
| "loss": 1.2615, |
| "loss/crossentropy": 2.3883326053619385, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.18299797177314758, |
| "loss/reg": 3.428904165048152e-05, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.045125, |
| "grad_norm": 3.084925651550293, |
| "grad_norm_var": 0.11450734924812096, |
| "learning_rate": 0.0001, |
| "loss": 1.4694, |
| "loss/crossentropy": 2.5634469985961914, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.20340915024280548, |
| "loss/reg": 3.4281070838915184e-05, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 2.3508076667785645, |
| "grad_norm_var": 0.11416271928607671, |
| "learning_rate": 0.0001, |
| "loss": 1.2864, |
| "loss/crossentropy": 2.713895559310913, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1689026653766632, |
| "loss/reg": 3.4273252822458744e-05, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.045375, |
| "grad_norm": 7.681649208068848, |
| "grad_norm_var": 2.004247231943063, |
| "learning_rate": 0.0001, |
| "loss": 1.3636, |
| "loss/crossentropy": 2.4041330814361572, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.1601409614086151, |
| "loss/reg": 3.426634430070408e-05, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 1.973664402961731, |
| "grad_norm_var": 2.0158187368377836, |
| "learning_rate": 0.0001, |
| "loss": 1.1228, |
| "loss/crossentropy": 2.5436434745788574, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.14196962118148804, |
| "loss/reg": 3.425794784561731e-05, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.045625, |
| "grad_norm": 2.2556684017181396, |
| "grad_norm_var": 2.001615144920621, |
| "learning_rate": 0.0001, |
| "loss": 1.1705, |
| "loss/crossentropy": 2.5252158641815186, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.162298783659935, |
| "loss/reg": 3.425147588131949e-05, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 2.2237462997436523, |
| "grad_norm_var": 1.9711144098953564, |
| "learning_rate": 0.0001, |
| "loss": 1.1598, |
| "loss/crossentropy": 2.4940881729125977, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.14381377398967743, |
| "loss/reg": 3.4247070288984105e-05, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.045875, |
| "grad_norm": 6.294942378997803, |
| "grad_norm_var": 2.8107703767931165, |
| "learning_rate": 0.0001, |
| "loss": 1.9233, |
| "loss/crossentropy": 2.4311485290527344, |
| "loss/hidden": 1.515625, |
| "loss/logits": 0.40729784965515137, |
| "loss/reg": 3.424140595598146e-05, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 2.1130430698394775, |
| "grad_norm_var": 2.8121951540684127, |
| "learning_rate": 0.0001, |
| "loss": 1.243, |
| "loss/crossentropy": 2.475876808166504, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.17232248187065125, |
| "loss/reg": 3.424075111979619e-05, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.046125, |
| "grad_norm": 1.9393197298049927, |
| "grad_norm_var": 2.80086684083605, |
| "learning_rate": 0.0001, |
| "loss": 1.1504, |
| "loss/crossentropy": 2.431164503097534, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1539495587348938, |
| "loss/reg": 3.4231870813528076e-05, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 2.072326183319092, |
| "grad_norm_var": 2.764824687660132, |
| "learning_rate": 0.0001, |
| "loss": 1.209, |
| "loss/crossentropy": 2.686079502105713, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.15395890176296234, |
| "loss/reg": 3.422388545004651e-05, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.046375, |
| "grad_norm": 1.8920451402664185, |
| "grad_norm_var": 2.8030644353470513, |
| "learning_rate": 0.0001, |
| "loss": 1.1611, |
| "loss/crossentropy": 2.4512131214141846, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.16072264313697815, |
| "loss/reg": 3.421401197556406e-05, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 1.8197071552276611, |
| "grad_norm_var": 2.858464465681775, |
| "learning_rate": 0.0001, |
| "loss": 1.1394, |
| "loss/crossentropy": 2.6290316581726074, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1429995894432068, |
| "loss/reg": 3.4208966098958626e-05, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.046625, |
| "grad_norm": 1.6114336252212524, |
| "grad_norm_var": 2.9442100668891373, |
| "learning_rate": 0.0001, |
| "loss": 1.1743, |
| "loss/crossentropy": 2.412721872329712, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15048110485076904, |
| "loss/reg": 3.419875429244712e-05, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 2.3423984050750732, |
| "grad_norm_var": 2.908825389746772, |
| "learning_rate": 0.0001, |
| "loss": 1.1547, |
| "loss/crossentropy": 2.479085683822632, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.14653661847114563, |
| "loss/reg": 3.419152199057862e-05, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.046875, |
| "grad_norm": 2.2863998413085938, |
| "grad_norm_var": 2.9026800154510086, |
| "learning_rate": 0.0001, |
| "loss": 1.1562, |
| "loss/crossentropy": 2.6520957946777344, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1402827501296997, |
| "loss/reg": 3.418369669816457e-05, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 3.5551681518554688, |
| "grad_norm_var": 2.906172521956549, |
| "learning_rate": 0.0001, |
| "loss": 1.3527, |
| "loss/crossentropy": 2.5694146156311035, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.21178269386291504, |
| "loss/reg": 3.417985135456547e-05, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.047125, |
| "grad_norm": 2.4192416667938232, |
| "grad_norm_var": 2.912446952830265, |
| "learning_rate": 0.0001, |
| "loss": 1.1754, |
| "loss/crossentropy": 2.4164340496063232, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.14376118779182434, |
| "loss/reg": 3.4170752769568935e-05, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 2.242144823074341, |
| "grad_norm_var": 2.91972157704962, |
| "learning_rate": 0.0001, |
| "loss": 1.1153, |
| "loss/crossentropy": 2.544785261154175, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.13062497973442078, |
| "loss/reg": 3.416725303395651e-05, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.047375, |
| "grad_norm": 2.2893593311309814, |
| "grad_norm_var": 1.2237873306323004, |
| "learning_rate": 0.0001, |
| "loss": 1.3118, |
| "loss/crossentropy": 2.364673614501953, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.17860937118530273, |
| "loss/reg": 3.416024992475286e-05, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 2.2065534591674805, |
| "grad_norm_var": 1.2121325720205287, |
| "learning_rate": 0.0001, |
| "loss": 1.3261, |
| "loss/crossentropy": 2.0714166164398193, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.17733046412467957, |
| "loss/reg": 3.415203173062764e-05, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.047625, |
| "grad_norm": 12.950845718383789, |
| "grad_norm_var": 8.05178996682972, |
| "learning_rate": 0.0001, |
| "loss": 1.4789, |
| "loss/crossentropy": 2.1035971641540527, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.15040796995162964, |
| "loss/reg": 3.414938328205608e-05, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 1.9148967266082764, |
| "grad_norm_var": 8.095531060395368, |
| "learning_rate": 0.0001, |
| "loss": 1.136, |
| "loss/crossentropy": 2.589541435241699, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.14346718788146973, |
| "loss/reg": 3.41419035976287e-05, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.047875, |
| "grad_norm": 2.309288263320923, |
| "grad_norm_var": 7.402131974293955, |
| "learning_rate": 0.0001, |
| "loss": 1.1656, |
| "loss/crossentropy": 2.601487398147583, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1496451497077942, |
| "loss/reg": 3.413420563447289e-05, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 2.129903554916382, |
| "grad_norm_var": 7.400441847159771, |
| "learning_rate": 0.0001, |
| "loss": 1.2433, |
| "loss/crossentropy": 2.0785951614379883, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.14143729209899902, |
| "loss/reg": 3.413164085941389e-05, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.048125, |
| "grad_norm": 2.513585329055786, |
| "grad_norm_var": 7.349500066162391, |
| "learning_rate": 0.0001, |
| "loss": 1.46, |
| "loss/crossentropy": 2.378612756729126, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.20967382192611694, |
| "loss/reg": 3.413192825973965e-05, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 1.8059685230255127, |
| "grad_norm_var": 7.3836732232467055, |
| "learning_rate": 0.0001, |
| "loss": 1.091, |
| "loss/crossentropy": 2.4599719047546387, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.12968455255031586, |
| "loss/reg": 3.412771911825985e-05, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.048375, |
| "grad_norm": 1.9648966789245605, |
| "grad_norm_var": 7.374281548362966, |
| "learning_rate": 0.0001, |
| "loss": 1.0183, |
| "loss/crossentropy": 2.405507802963257, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.12728646397590637, |
| "loss/reg": 3.412525984458625e-05, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 2.7891759872436523, |
| "grad_norm_var": 7.29369073112806, |
| "learning_rate": 0.0001, |
| "loss": 1.2625, |
| "loss/crossentropy": 2.564371347427368, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.18406376242637634, |
| "loss/reg": 3.4118053008569404e-05, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.048625, |
| "grad_norm": 3.2247207164764404, |
| "grad_norm_var": 7.166662268117063, |
| "learning_rate": 0.0001, |
| "loss": 1.5145, |
| "loss/crossentropy": 2.9137074947357178, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.20163431763648987, |
| "loss/reg": 3.411182842683047e-05, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 2.1028342247009277, |
| "grad_norm_var": 7.193139907597329, |
| "learning_rate": 0.0001, |
| "loss": 1.0431, |
| "loss/crossentropy": 2.8769431114196777, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.1287393867969513, |
| "loss/reg": 3.410813951632008e-05, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.048875, |
| "grad_norm": 2.3499820232391357, |
| "grad_norm_var": 7.186969405638869, |
| "learning_rate": 0.0001, |
| "loss": 1.2858, |
| "loss/crossentropy": 2.426356315612793, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.17607171833515167, |
| "loss/reg": 3.410330828046426e-05, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 1.912327527999878, |
| "grad_norm_var": 7.244567116261923, |
| "learning_rate": 0.0001, |
| "loss": 1.362, |
| "loss/crossentropy": 2.551398277282715, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20536215603351593, |
| "loss/reg": 3.409969940548763e-05, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.049125, |
| "grad_norm": 1.8643162250518799, |
| "grad_norm_var": 7.302740869176493, |
| "learning_rate": 0.0001, |
| "loss": 1.3451, |
| "loss/crossentropy": 2.4072647094726562, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.17286883294582367, |
| "loss/reg": 3.40941951435525e-05, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 2.126138925552368, |
| "grad_norm_var": 7.313922412927239, |
| "learning_rate": 0.0001, |
| "loss": 1.1946, |
| "loss/crossentropy": 2.6316819190979004, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.15518739819526672, |
| "loss/reg": 3.409163764445111e-05, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.049375, |
| "grad_norm": 5.341352939605713, |
| "grad_norm_var": 7.646205880923245, |
| "learning_rate": 0.0001, |
| "loss": 1.3627, |
| "loss/crossentropy": 2.2951955795288086, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.19052964448928833, |
| "loss/reg": 3.408612610655837e-05, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 1.841691017150879, |
| "grad_norm_var": 7.697707430188741, |
| "learning_rate": 0.0001, |
| "loss": 1.126, |
| "loss/crossentropy": 2.412738561630249, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.14905960857868195, |
| "loss/reg": 3.408074189792387e-05, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.049625, |
| "grad_norm": 2.309384346008301, |
| "grad_norm_var": 0.7576436792480905, |
| "learning_rate": 0.0001, |
| "loss": 1.403, |
| "loss/crossentropy": 2.3188250064849854, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.16045960783958435, |
| "loss/reg": 3.407212716410868e-05, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 2.7863593101501465, |
| "grad_norm_var": 0.7480129573726607, |
| "learning_rate": 0.0001, |
| "loss": 1.2891, |
| "loss/crossentropy": 2.3670787811279297, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.17938411235809326, |
| "loss/reg": 3.406599716981873e-05, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.049875, |
| "grad_norm": 1.8358136415481567, |
| "grad_norm_var": 0.7715855741782158, |
| "learning_rate": 0.0001, |
| "loss": 1.1237, |
| "loss/crossentropy": 2.514230966567993, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.13119591772556305, |
| "loss/reg": 3.4062337363138795e-05, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 2.350162982940674, |
| "grad_norm_var": 0.7657706364738013, |
| "learning_rate": 0.0001, |
| "loss": 1.2326, |
| "loss/crossentropy": 2.713247299194336, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1697346568107605, |
| "loss/reg": 3.4054195566568524e-05, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.050125, |
| "grad_norm": 2.63322377204895, |
| "grad_norm_var": 0.7677605659354261, |
| "learning_rate": 0.0001, |
| "loss": 1.1099, |
| "loss/crossentropy": 2.440999746322632, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.14475254714488983, |
| "loss/reg": 3.404737435630523e-05, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 1.7843867540359497, |
| "grad_norm_var": 0.769649818838896, |
| "learning_rate": 0.0001, |
| "loss": 1.1619, |
| "loss/crossentropy": 2.3848907947540283, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.13809266686439514, |
| "loss/reg": 3.403963637538254e-05, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.050375, |
| "grad_norm": 2.4891116619110107, |
| "grad_norm_var": 0.7528451996298959, |
| "learning_rate": 0.0001, |
| "loss": 1.4016, |
| "loss/crossentropy": 2.2915351390838623, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.17465338110923767, |
| "loss/reg": 3.4031403629342094e-05, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 2.6278252601623535, |
| "grad_norm_var": 0.7479028879806237, |
| "learning_rate": 0.0001, |
| "loss": 1.3323, |
| "loss/crossentropy": 2.657773494720459, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19131596386432648, |
| "loss/reg": 3.402295624255203e-05, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.050625, |
| "grad_norm": 2.6351633071899414, |
| "grad_norm_var": 0.7105926512095813, |
| "learning_rate": 0.0001, |
| "loss": 1.09, |
| "loss/crossentropy": 2.341869592666626, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.1717243790626526, |
| "loss/reg": 3.401270805625245e-05, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 1.9917317628860474, |
| "grad_norm_var": 0.7163125714595162, |
| "learning_rate": 0.0001, |
| "loss": 1.2231, |
| "loss/crossentropy": 2.5361015796661377, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16022570431232452, |
| "loss/reg": 3.400079003768042e-05, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.050875, |
| "grad_norm": 2.01811146736145, |
| "grad_norm_var": 0.7267341041079096, |
| "learning_rate": 0.0001, |
| "loss": 1.0135, |
| "loss/crossentropy": 2.3607399463653564, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.12249953299760818, |
| "loss/reg": 3.399254273972474e-05, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 2.469839096069336, |
| "grad_norm_var": 0.7092258078292549, |
| "learning_rate": 0.0001, |
| "loss": 1.3583, |
| "loss/crossentropy": 2.1284053325653076, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.1626756489276886, |
| "loss/reg": 3.398398621357046e-05, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.051125, |
| "grad_norm": 2.0157532691955566, |
| "grad_norm_var": 0.6989536122316408, |
| "learning_rate": 0.0001, |
| "loss": 1.2599, |
| "loss/crossentropy": 2.5868444442749023, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.16585640609264374, |
| "loss/reg": 3.3978514693444595e-05, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 2.468764543533325, |
| "grad_norm_var": 0.6913355184321047, |
| "learning_rate": 0.0001, |
| "loss": 1.2353, |
| "loss/crossentropy": 2.4670286178588867, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.16462820768356323, |
| "loss/reg": 3.397303225938231e-05, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.051375, |
| "grad_norm": 1.657066822052002, |
| "grad_norm_var": 0.13160569161618738, |
| "learning_rate": 0.0001, |
| "loss": 1.1004, |
| "loss/crossentropy": 2.463649034500122, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.13913306593894958, |
| "loss/reg": 3.397110413061455e-05, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 2.3177919387817383, |
| "grad_norm_var": 0.12019285492734794, |
| "learning_rate": 0.0001, |
| "loss": 1.2397, |
| "loss/crossentropy": 2.44539737701416, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.17688173055648804, |
| "loss/reg": 3.396526153665036e-05, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.051625, |
| "grad_norm": 2.2815206050872803, |
| "grad_norm_var": 0.12011142743010049, |
| "learning_rate": 0.0001, |
| "loss": 1.3311, |
| "loss/crossentropy": 2.4922685623168945, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.15103884041309357, |
| "loss/reg": 3.396152169443667e-05, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 2.062899351119995, |
| "grad_norm_var": 0.1032718534450779, |
| "learning_rate": 0.0001, |
| "loss": 1.1936, |
| "loss/crossentropy": 2.671872138977051, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.15416079759597778, |
| "loss/reg": 3.3957923733396456e-05, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.051875, |
| "grad_norm": 1.9539638757705688, |
| "grad_norm_var": 0.09797476372330317, |
| "learning_rate": 0.0001, |
| "loss": 1.1046, |
| "loss/crossentropy": 2.6380488872528076, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.1315636783838272, |
| "loss/reg": 3.395261592231691e-05, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 2.2393951416015625, |
| "grad_norm_var": 0.09703828398074225, |
| "learning_rate": 0.0001, |
| "loss": 1.1526, |
| "loss/crossentropy": 2.6465938091278076, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1561371386051178, |
| "loss/reg": 3.394690065761097e-05, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.052125, |
| "grad_norm": 2.3431386947631836, |
| "grad_norm_var": 0.08662086074431645, |
| "learning_rate": 0.0001, |
| "loss": 1.1849, |
| "loss/crossentropy": 2.598094940185547, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.14550316333770752, |
| "loss/reg": 3.393869337742217e-05, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 1.8472511768341064, |
| "grad_norm_var": 0.08330225189024129, |
| "learning_rate": 0.0001, |
| "loss": 1.1985, |
| "loss/crossentropy": 2.5920615196228027, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.14346075057983398, |
| "loss/reg": 3.392928192624822e-05, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.052375, |
| "grad_norm": 1.7260291576385498, |
| "grad_norm_var": 0.09167492136177748, |
| "learning_rate": 0.0001, |
| "loss": 1.0539, |
| "loss/crossentropy": 2.678154468536377, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.13166582584381104, |
| "loss/reg": 3.3915493986569345e-05, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 2.1379270553588867, |
| "grad_norm_var": 0.0765096237299017, |
| "learning_rate": 0.0001, |
| "loss": 1.3037, |
| "loss/crossentropy": 2.554790735244751, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.16276058554649353, |
| "loss/reg": 3.390039273654111e-05, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.052625, |
| "grad_norm": 2.773489475250244, |
| "grad_norm_var": 0.08692294666244584, |
| "learning_rate": 0.0001, |
| "loss": 1.286, |
| "loss/crossentropy": 2.5890371799468994, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.16067233681678772, |
| "loss/reg": 3.3892716601258144e-05, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 1.9473000764846802, |
| "grad_norm_var": 0.0879486532075814, |
| "learning_rate": 0.0001, |
| "loss": 1.1138, |
| "loss/crossentropy": 2.544093370437622, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.13691496849060059, |
| "loss/reg": 3.38886420649942e-05, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.052875, |
| "grad_norm": 2.065152406692505, |
| "grad_norm_var": 0.08731452126513635, |
| "learning_rate": 0.0001, |
| "loss": 1.0308, |
| "loss/crossentropy": 2.4369661808013916, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.12809085845947266, |
| "loss/reg": 3.38791505782865e-05, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 1.76610267162323, |
| "grad_norm_var": 0.08771260345232476, |
| "learning_rate": 0.0001, |
| "loss": 1.1559, |
| "loss/crossentropy": 2.437058210372925, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.13998199999332428, |
| "loss/reg": 3.387559627299197e-05, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.053125, |
| "grad_norm": 2.1202235221862793, |
| "grad_norm_var": 0.08721813960099964, |
| "learning_rate": 0.0001, |
| "loss": 1.1406, |
| "loss/crossentropy": 2.5524604320526123, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.15982511639595032, |
| "loss/reg": 3.3869648177642375e-05, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 2.3238697052001953, |
| "grad_norm_var": 0.08153644484325746, |
| "learning_rate": 0.0001, |
| "loss": 1.2742, |
| "loss/crossentropy": 2.6345760822296143, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.15670067071914673, |
| "loss/reg": 3.386356183909811e-05, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.053375, |
| "grad_norm": 1.8629034757614136, |
| "grad_norm_var": 0.07209149684443308, |
| "learning_rate": 0.0001, |
| "loss": 1.1243, |
| "loss/crossentropy": 2.4134674072265625, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.1552024483680725, |
| "loss/reg": 3.385494346730411e-05, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 2.185732841491699, |
| "grad_norm_var": 0.06953255529498938, |
| "learning_rate": 0.0001, |
| "loss": 1.3593, |
| "loss/crossentropy": 2.088414430618286, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.15583863854408264, |
| "loss/reg": 3.384939191164449e-05, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.053625, |
| "grad_norm": 2.0822196006774902, |
| "grad_norm_var": 0.06725276287184746, |
| "learning_rate": 0.0001, |
| "loss": 1.1775, |
| "loss/crossentropy": 2.2571120262145996, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1458958387374878, |
| "loss/reg": 3.3843141864053905e-05, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 1.9582529067993164, |
| "grad_norm_var": 0.06831322983159846, |
| "learning_rate": 0.0001, |
| "loss": 1.2637, |
| "loss/crossentropy": 2.273467779159546, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.16964468359947205, |
| "loss/reg": 3.38399586325977e-05, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.053875, |
| "grad_norm": 2.5428881645202637, |
| "grad_norm_var": 0.07983358220816962, |
| "learning_rate": 0.0001, |
| "loss": 1.08, |
| "loss/crossentropy": 2.6513051986694336, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.13046178221702576, |
| "loss/reg": 3.3836400689324364e-05, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 5.832214832305908, |
| "grad_norm_var": 0.9437448574938159, |
| "learning_rate": 0.0001, |
| "loss": 1.9424, |
| "loss/crossentropy": 2.6202034950256348, |
| "loss/hidden": 1.640625, |
| "loss/logits": 0.30144432187080383, |
| "loss/reg": 3.383049988769926e-05, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.054125, |
| "grad_norm": 17.29625129699707, |
| "grad_norm_var": 14.915418371233736, |
| "learning_rate": 0.0001, |
| "loss": 1.369, |
| "loss/crossentropy": 2.5406692028045654, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.17335116863250732, |
| "loss/reg": 3.382577415322885e-05, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 2.1661648750305176, |
| "grad_norm_var": 14.860884296803356, |
| "learning_rate": 0.0001, |
| "loss": 1.3231, |
| "loss/crossentropy": 2.486368417739868, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.18998079001903534, |
| "loss/reg": 3.381881833774969e-05, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.054375, |
| "grad_norm": 3.2340309619903564, |
| "grad_norm_var": 14.686707047148603, |
| "learning_rate": 0.0001, |
| "loss": 1.1883, |
| "loss/crossentropy": 2.6120805740356445, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15667462348937988, |
| "loss/reg": 3.381211718078703e-05, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 2.4656379222869873, |
| "grad_norm_var": 14.638560696511908, |
| "learning_rate": 0.0001, |
| "loss": 1.2838, |
| "loss/crossentropy": 2.5453333854675293, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1663120836019516, |
| "loss/reg": 3.380520502105355e-05, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.054625, |
| "grad_norm": 2.4808294773101807, |
| "grad_norm_var": 14.668903570755694, |
| "learning_rate": 0.0001, |
| "loss": 1.3034, |
| "loss/crossentropy": 2.254472255706787, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.20927491784095764, |
| "loss/reg": 3.3801999961724505e-05, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 2.0809855461120605, |
| "grad_norm_var": 14.644204809831463, |
| "learning_rate": 0.0001, |
| "loss": 1.2904, |
| "loss/crossentropy": 2.387235403060913, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.16507935523986816, |
| "loss/reg": 3.380004272912629e-05, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.054875, |
| "grad_norm": 2.5827701091766357, |
| "grad_norm_var": 14.568551148225374, |
| "learning_rate": 0.0001, |
| "loss": 1.1913, |
| "loss/crossentropy": 2.5564305782318115, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.15191948413848877, |
| "loss/reg": 3.379736881470308e-05, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 2.4748318195343018, |
| "grad_norm_var": 14.44211406577169, |
| "learning_rate": 0.0001, |
| "loss": 1.3339, |
| "loss/crossentropy": 2.169847249984741, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1929139941930771, |
| "loss/reg": 3.379437475814484e-05, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.055125, |
| "grad_norm": 2.17909836769104, |
| "grad_norm_var": 14.43165167732106, |
| "learning_rate": 0.0001, |
| "loss": 1.1514, |
| "loss/crossentropy": 2.6119625568389893, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.1510833203792572, |
| "loss/reg": 3.3788579457905143e-05, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 2.328896999359131, |
| "grad_norm_var": 14.430875418614992, |
| "learning_rate": 0.0001, |
| "loss": 1.4589, |
| "loss/crossentropy": 2.1644153594970703, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.20078104734420776, |
| "loss/reg": 3.37848650815431e-05, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.055375, |
| "grad_norm": 2.473328113555908, |
| "grad_norm_var": 14.322173701255808, |
| "learning_rate": 0.0001, |
| "loss": 1.1936, |
| "loss/crossentropy": 2.5964508056640625, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1464037448167801, |
| "loss/reg": 3.3778171200538054e-05, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 4.068549633026123, |
| "grad_norm_var": 14.208086262392497, |
| "learning_rate": 0.0001, |
| "loss": 1.3952, |
| "loss/crossentropy": 2.8060688972473145, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20741160213947296, |
| "loss/reg": 3.37726560246665e-05, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.055625, |
| "grad_norm": 2.1623542308807373, |
| "grad_norm_var": 14.191838680780066, |
| "learning_rate": 0.0001, |
| "loss": 1.0824, |
| "loss/crossentropy": 2.724346160888672, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.13284045457839966, |
| "loss/reg": 3.3767199056455866e-05, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 2.2896392345428467, |
| "grad_norm_var": 14.12415401393583, |
| "learning_rate": 0.0001, |
| "loss": 1.369, |
| "loss/crossentropy": 2.3175323009490967, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.18897001445293427, |
| "loss/reg": 3.375912274350412e-05, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.055875, |
| "grad_norm": 2.046797037124634, |
| "grad_norm_var": 14.213834657666071, |
| "learning_rate": 0.0001, |
| "loss": 1.1878, |
| "loss/crossentropy": 2.6415698528289795, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.14838215708732605, |
| "loss/reg": 3.3750762668205425e-05, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 3.2464237213134766, |
| "grad_norm_var": 13.874242204082186, |
| "learning_rate": 0.0001, |
| "loss": 1.418, |
| "loss/crossentropy": 2.4582679271698, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.2301977574825287, |
| "loss/reg": 3.3743133826646954e-05, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.056125, |
| "grad_norm": 2.189342737197876, |
| "grad_norm_var": 0.29544563519738554, |
| "learning_rate": 0.0001, |
| "loss": 1.2354, |
| "loss/crossentropy": 2.224080801010132, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.12568463385105133, |
| "loss/reg": 3.3736727345967665e-05, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 1.8410553932189941, |
| "grad_norm_var": 0.3177951887186661, |
| "learning_rate": 0.0001, |
| "loss": 1.1724, |
| "loss/crossentropy": 2.3079254627227783, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15645131468772888, |
| "loss/reg": 3.372762876097113e-05, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.056375, |
| "grad_norm": 2.075913667678833, |
| "grad_norm_var": 0.28967181210960763, |
| "learning_rate": 0.0001, |
| "loss": 1.3254, |
| "loss/crossentropy": 2.3725907802581787, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18442153930664062, |
| "loss/reg": 3.3720290957717225e-05, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 2.2170984745025635, |
| "grad_norm_var": 0.29257204608246923, |
| "learning_rate": 0.0001, |
| "loss": 1.3107, |
| "loss/crossentropy": 2.410374164581299, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.17750215530395508, |
| "loss/reg": 3.371315688127652e-05, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.056625, |
| "grad_norm": 2.1257264614105225, |
| "grad_norm_var": 0.2976260957554473, |
| "learning_rate": 0.0001, |
| "loss": 1.1145, |
| "loss/crossentropy": 2.1573164463043213, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.1532134860754013, |
| "loss/reg": 3.3702854125294834e-05, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 2.020737409591675, |
| "grad_norm_var": 0.3004070010410295, |
| "learning_rate": 0.0001, |
| "loss": 1.2252, |
| "loss/crossentropy": 2.5048506259918213, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.15453127026557922, |
| "loss/reg": 3.3693660952849314e-05, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.056875, |
| "grad_norm": 2.0882959365844727, |
| "grad_norm_var": 0.30331944550090667, |
| "learning_rate": 0.0001, |
| "loss": 1.148, |
| "loss/crossentropy": 2.6023924350738525, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.1476426124572754, |
| "loss/reg": 3.368509715073742e-05, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 1.9702045917510986, |
| "grad_norm_var": 0.31179501443108676, |
| "learning_rate": 0.0001, |
| "loss": 1.0904, |
| "loss/crossentropy": 2.414108991622925, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.13695700466632843, |
| "loss/reg": 3.367620593053289e-05, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.057125, |
| "grad_norm": 2.7802071571350098, |
| "grad_norm_var": 0.32206609917582013, |
| "learning_rate": 0.0001, |
| "loss": 1.1101, |
| "loss/crossentropy": 2.802133560180664, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.14492589235305786, |
| "loss/reg": 3.3667642128420994e-05, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 2.838331699371338, |
| "grad_norm_var": 0.3354750209379326, |
| "learning_rate": 0.0001, |
| "loss": 1.3563, |
| "loss/crossentropy": 2.409419536590576, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.16849547624588013, |
| "loss/reg": 3.365922748344019e-05, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.057375, |
| "grad_norm": 2.1942574977874756, |
| "grad_norm_var": 0.33769313303004084, |
| "learning_rate": 0.0001, |
| "loss": 1.1997, |
| "loss/crossentropy": 2.5466361045837402, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16028670966625214, |
| "loss/reg": 3.3653053833404556e-05, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 1.9543126821517944, |
| "grad_norm_var": 0.14238904796044755, |
| "learning_rate": 0.0001, |
| "loss": 1.082, |
| "loss/crossentropy": 2.3072826862335205, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.13639651238918304, |
| "loss/reg": 3.36485099978745e-05, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.057625, |
| "grad_norm": 2.3599038124084473, |
| "grad_norm_var": 0.14245257928573613, |
| "learning_rate": 0.0001, |
| "loss": 1.3462, |
| "loss/crossentropy": 2.5248610973358154, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18964824080467224, |
| "loss/reg": 3.3640484616626054e-05, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 2.160914421081543, |
| "grad_norm_var": 0.14306343844920466, |
| "learning_rate": 0.0001, |
| "loss": 1.1187, |
| "loss/crossentropy": 2.485192060470581, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.16129833459854126, |
| "loss/reg": 3.363731593708508e-05, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.057875, |
| "grad_norm": 1.96443510055542, |
| "grad_norm_var": 0.14579406927241975, |
| "learning_rate": 0.0001, |
| "loss": 1.172, |
| "loss/crossentropy": 2.3434600830078125, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.14036868512630463, |
| "loss/reg": 3.362847928656265e-05, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 2.240238666534424, |
| "grad_norm_var": 0.07561911079075631, |
| "learning_rate": 0.0001, |
| "loss": 1.2581, |
| "loss/crossentropy": 2.66363263130188, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.1561916470527649, |
| "loss/reg": 3.361971539561637e-05, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.058125, |
| "grad_norm": 2.9686269760131836, |
| "grad_norm_var": 0.11362960790722566, |
| "learning_rate": 0.0001, |
| "loss": 1.3006, |
| "loss/crossentropy": 2.733015775680542, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.22209219634532928, |
| "loss/reg": 3.361287963343784e-05, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 2.5101568698883057, |
| "grad_norm_var": 0.10624098469997983, |
| "learning_rate": 0.0001, |
| "loss": 1.1411, |
| "loss/crossentropy": 2.588346242904663, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.1446218490600586, |
| "loss/reg": 3.360513073857874e-05, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.058375, |
| "grad_norm": 2.770623207092285, |
| "grad_norm_var": 0.11756231178518603, |
| "learning_rate": 0.0001, |
| "loss": 1.3929, |
| "loss/crossentropy": 1.7947008609771729, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.18167641758918762, |
| "loss/reg": 3.359945912961848e-05, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 1.9178507328033447, |
| "grad_norm_var": 0.1273747784869385, |
| "learning_rate": 0.0001, |
| "loss": 1.0863, |
| "loss/crossentropy": 2.3694558143615723, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.14460425078868866, |
| "loss/reg": 3.359114271006547e-05, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.058625, |
| "grad_norm": 2.9377102851867676, |
| "grad_norm_var": 0.14927586898533457, |
| "learning_rate": 0.0001, |
| "loss": 1.3075, |
| "loss/crossentropy": 2.737233877182007, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.17436236143112183, |
| "loss/reg": 3.35832592099905e-05, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 2.4766845703125, |
| "grad_norm_var": 0.14196017860283514, |
| "learning_rate": 0.0001, |
| "loss": 1.2198, |
| "loss/crossentropy": 2.725649118423462, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.17263534665107727, |
| "loss/reg": 3.3575062843738124e-05, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.058875, |
| "grad_norm": 2.97731876373291, |
| "grad_norm_var": 0.15638940419960357, |
| "learning_rate": 0.0001, |
| "loss": 1.1337, |
| "loss/crossentropy": 2.4248714447021484, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.1451077163219452, |
| "loss/reg": 3.3564418117748573e-05, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 2.4938745498657227, |
| "grad_norm_var": 0.14080595119560016, |
| "learning_rate": 0.0001, |
| "loss": 1.2354, |
| "loss/crossentropy": 2.5639638900756836, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.17252308130264282, |
| "loss/reg": 3.355655644554645e-05, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.059125, |
| "grad_norm": 2.453796148300171, |
| "grad_norm_var": 0.13403350770174421, |
| "learning_rate": 0.0001, |
| "loss": 1.3133, |
| "loss/crossentropy": 2.535618543624878, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1723370999097824, |
| "loss/reg": 3.354718137416057e-05, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 2.3145835399627686, |
| "grad_norm_var": 0.12414269824475065, |
| "learning_rate": 0.0001, |
| "loss": 1.1404, |
| "loss/crossentropy": 2.5441014766693115, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.14785614609718323, |
| "loss/reg": 3.353881766088307e-05, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.059375, |
| "grad_norm": 2.3766791820526123, |
| "grad_norm_var": 0.12076940932042811, |
| "learning_rate": 0.0001, |
| "loss": 1.546, |
| "loss/crossentropy": 2.321577310562134, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.22531206905841827, |
| "loss/reg": 3.352982457727194e-05, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 2.9967336654663086, |
| "grad_norm_var": 0.1225888750658117, |
| "learning_rate": 0.0001, |
| "loss": 1.5924, |
| "loss/crossentropy": 2.00260066986084, |
| "loss/hidden": 1.375, |
| "loss/logits": 0.21702903509140015, |
| "loss/reg": 3.3521097066113725e-05, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.059625, |
| "grad_norm": 2.6383934020996094, |
| "grad_norm_var": 0.12241946620474262, |
| "learning_rate": 0.0001, |
| "loss": 1.4419, |
| "loss/crossentropy": 2.5109200477600098, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.21502982079982758, |
| "loss/reg": 3.3512478694319725e-05, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 2.120065450668335, |
| "grad_norm_var": 0.12443820755625362, |
| "learning_rate": 0.0001, |
| "loss": 1.2943, |
| "loss/crossentropy": 2.4915192127227783, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.17678330838680267, |
| "loss/reg": 3.3503984013805166e-05, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.059875, |
| "grad_norm": 2.858367681503296, |
| "grad_norm_var": 0.10937309591752348, |
| "learning_rate": 0.0001, |
| "loss": 1.4656, |
| "loss/crossentropy": 2.3548312187194824, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.19961079955101013, |
| "loss/reg": 3.349495091242716e-05, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 2.285705089569092, |
| "grad_norm_var": 0.1075290964460765, |
| "learning_rate": 0.0001, |
| "loss": 1.2262, |
| "loss/crossentropy": 2.489813804626465, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.18675509095191956, |
| "loss/reg": 3.3486459869891405e-05, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.060125, |
| "grad_norm": 2.094900131225586, |
| "grad_norm_var": 0.10863647120417165, |
| "learning_rate": 0.0001, |
| "loss": 1.3702, |
| "loss/crossentropy": 2.3183302879333496, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1980261653661728, |
| "loss/reg": 3.3478718250989914e-05, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 2.406928777694702, |
| "grad_norm_var": 0.10935489058969262, |
| "learning_rate": 0.0001, |
| "loss": 1.3264, |
| "loss/crossentropy": 2.5121381282806396, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1697903275489807, |
| "loss/reg": 3.3472137147327885e-05, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.060375, |
| "grad_norm": 1.7551078796386719, |
| "grad_norm_var": 0.1381837528506061, |
| "learning_rate": 0.0001, |
| "loss": 1.0999, |
| "loss/crossentropy": 2.433582067489624, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.13861994445323944, |
| "loss/reg": 3.34642463712953e-05, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 2.15712308883667, |
| "grad_norm_var": 0.12497483119508387, |
| "learning_rate": 0.0001, |
| "loss": 1.264, |
| "loss/crossentropy": 2.77188777923584, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.1621045470237732, |
| "loss/reg": 3.345516961417161e-05, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.060625, |
| "grad_norm": 2.0167171955108643, |
| "grad_norm_var": 0.11920370288203964, |
| "learning_rate": 0.0001, |
| "loss": 1.3431, |
| "loss/crossentropy": 2.546163558959961, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.19433115422725677, |
| "loss/reg": 3.344708966324106e-05, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 2.6408584117889404, |
| "grad_norm_var": 0.12253544383796963, |
| "learning_rate": 0.0001, |
| "loss": 1.379, |
| "loss/crossentropy": 2.7440547943115234, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.1834029257297516, |
| "loss/reg": 3.343883508932777e-05, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.060875, |
| "grad_norm": 2.1073925495147705, |
| "grad_norm_var": 0.10422711697162654, |
| "learning_rate": 0.0001, |
| "loss": 1.095, |
| "loss/crossentropy": 2.180604934692383, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.14150169491767883, |
| "loss/reg": 3.343077696627006e-05, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 2.980268716812134, |
| "grad_norm_var": 0.1278688011981179, |
| "learning_rate": 0.0001, |
| "loss": 1.3713, |
| "loss/crossentropy": 2.511422634124756, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.19132453203201294, |
| "loss/reg": 3.3423166314605623e-05, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.061125, |
| "grad_norm": 2.237560272216797, |
| "grad_norm_var": 0.1288862839917743, |
| "learning_rate": 0.0001, |
| "loss": 1.0922, |
| "loss/crossentropy": 2.361391305923462, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.14263351261615753, |
| "loss/reg": 3.341743286000565e-05, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 2.2243480682373047, |
| "grad_norm_var": 0.1301125949056683, |
| "learning_rate": 0.0001, |
| "loss": 1.2353, |
| "loss/crossentropy": 2.6845271587371826, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.16469591856002808, |
| "loss/reg": 3.340901093906723e-05, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.061375, |
| "grad_norm": 1.9749258756637573, |
| "grad_norm_var": 0.13976616590314225, |
| "learning_rate": 0.0001, |
| "loss": 1.0788, |
| "loss/crossentropy": 2.8269782066345215, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.13316524028778076, |
| "loss/reg": 3.340181865496561e-05, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 3.8082363605499268, |
| "grad_norm_var": 0.2516089050798465, |
| "learning_rate": 0.0001, |
| "loss": 1.1953, |
| "loss/crossentropy": 3.110802412033081, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.16368849575519562, |
| "loss/reg": 3.33938623953145e-05, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.061625, |
| "grad_norm": 7.89940881729126, |
| "grad_norm_var": 2.152808837213317, |
| "learning_rate": 0.0001, |
| "loss": 1.5515, |
| "loss/crossentropy": 2.6210036277770996, |
| "loss/hidden": 1.40625, |
| "loss/logits": 0.1449393779039383, |
| "loss/reg": 3.338697206345387e-05, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 2.5107972621917725, |
| "grad_norm_var": 2.1309396475018327, |
| "learning_rate": 0.0001, |
| "loss": 1.2771, |
| "loss/crossentropy": 2.296051263809204, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.18302392959594727, |
| "loss/reg": 3.337907401146367e-05, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.061875, |
| "grad_norm": 2.3817129135131836, |
| "grad_norm_var": 2.138088174245088, |
| "learning_rate": 0.0001, |
| "loss": 1.2106, |
| "loss/crossentropy": 2.6225788593292236, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1555565595626831, |
| "loss/reg": 3.3369677112204954e-05, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 2.297321081161499, |
| "grad_norm_var": 2.137427651207279, |
| "learning_rate": 0.0001, |
| "loss": 1.3821, |
| "loss/crossentropy": 2.326659679412842, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.20210911333560944, |
| "loss/reg": 3.3361084206262603e-05, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.062125, |
| "grad_norm": 2.1879894733428955, |
| "grad_norm_var": 2.1302310419826815, |
| "learning_rate": 0.0001, |
| "loss": 1.3433, |
| "loss/crossentropy": 2.6444246768951416, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18667887151241302, |
| "loss/reg": 3.3350897865602747e-05, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 2.7556395530700684, |
| "grad_norm_var": 2.1230810021853803, |
| "learning_rate": 0.0001, |
| "loss": 1.1761, |
| "loss/crossentropy": 2.8364853858947754, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16018438339233398, |
| "loss/reg": 3.334263601573184e-05, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.062375, |
| "grad_norm": 2.0885095596313477, |
| "grad_norm_var": 2.085981261133649, |
| "learning_rate": 0.0001, |
| "loss": 1.4855, |
| "loss/crossentropy": 2.0875301361083984, |
| "loss/hidden": 1.2890625, |
| "loss/logits": 0.19610214233398438, |
| "loss/reg": 3.3336276828777045e-05, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 2.195967674255371, |
| "grad_norm_var": 2.0829178782721693, |
| "learning_rate": 0.0001, |
| "loss": 1.357, |
| "loss/crossentropy": 2.2123799324035645, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.16132420301437378, |
| "loss/reg": 3.3326996344840154e-05, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 8000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.2202930782208e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|