{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.125,
  "eval_steps": 250,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000125,
      "grad_norm": 2.537714958190918,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.5468,
      "loss/crossentropy": 2.2066214084625244,
      "loss/hidden": 0.248046875,
      "loss/logits": 0.03443578630685806,
      "loss/reg": 0.026429571211338043,
      "step": 1
    },
    {
      "epoch": 0.00025,
      "grad_norm": 2.4728448390960693,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.6642,
      "loss/crossentropy": 2.132329225540161,
      "loss/hidden": 0.345703125,
      "loss/logits": 0.05424630641937256,
      "loss/reg": 0.026429571211338043,
      "step": 2
    },
    {
      "epoch": 0.000375,
      "grad_norm": 2.773984670639038,
      "learning_rate": 3e-06,
      "loss": 0.5822,
      "loss/crossentropy": 2.3457791805267334,
      "loss/hidden": 0.2734375,
      "loss/logits": 0.044443465769290924,
      "loss/reg": 0.02642953023314476,
      "step": 3
    },
    {
      "epoch": 0.0005,
      "grad_norm": 4.14040470123291,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.7192,
      "loss/crossentropy": 2.7209200859069824,
      "loss/hidden": 0.35546875,
      "loss/logits": 0.09940779209136963,
      "loss/reg": 0.026429466903209686,
      "step": 4
    },
    {
      "epoch": 0.000625,
      "grad_norm": 1.9164764881134033,
      "learning_rate": 5e-06,
      "loss": 0.5467,
      "loss/crossentropy": 2.4304752349853516,
      "loss/hidden": 0.244140625,
      "loss/logits": 0.03826362267136574,
      "loss/reg": 0.02642936445772648,
      "step": 5
    },
    {
      "epoch": 0.00075,
      "grad_norm": 1.9878246784210205,
      "learning_rate": 6e-06,
      "loss": 0.517,
      "loss/crossentropy": 2.472181797027588,
      "loss/hidden": 0.2255859375,
      "loss/logits": 0.027161670848727226,
      "loss/reg": 0.02642924338579178,
      "step": 6
    },
    {
      "epoch": 0.000875,
      "grad_norm": 2.1939733028411865,
      "learning_rate": 7.000000000000001e-06,
      "loss": 0.6043,
      "loss/crossentropy": 2.241501808166504,
      "loss/hidden": 0.298828125,
      "loss/logits": 0.04118040204048157,
      "loss/reg": 0.02642909064888954,
      "step": 7
    },
    {
      "epoch": 0.001,
      "grad_norm": 3.516223907470703,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.5199,
      "loss/crossentropy": 2.409766912460327,
      "loss/hidden": 0.2236328125,
      "loss/logits": 0.032000549137592316,
      "loss/reg": 0.02642889879643917,
      "step": 8
    },
    {
      "epoch": 0.001125,
      "grad_norm": 1.9335486888885498,
      "learning_rate": 9e-06,
      "loss": 0.5575,
      "loss/crossentropy": 2.6256861686706543,
      "loss/hidden": 0.255859375,
      "loss/logits": 0.037392452359199524,
      "loss/reg": 0.02642873302102089,
      "step": 9
    },
    {
      "epoch": 0.00125,
      "grad_norm": 1.6782876253128052,
      "learning_rate": 1e-05,
      "loss": 0.5162,
      "loss/crossentropy": 2.1947107315063477,
      "loss/hidden": 0.2255859375,
      "loss/logits": 0.026354767382144928,
      "loss/reg": 0.026428483426570892,
      "step": 10
    },
    {
      "epoch": 0.001375,
      "grad_norm": 10.848552703857422,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.0046,
      "loss/crossentropy": 3.0539069175720215,
      "loss/hidden": 0.640625,
      "loss/logits": 0.09970991313457489,
      "loss/reg": 0.026428230106830597,
      "step": 11
    },
    {
      "epoch": 0.0015,
      "grad_norm": 2.237061023712158,
      "learning_rate": 1.2e-05,
      "loss": 0.563,
      "loss/crossentropy": 2.5601325035095215,
      "loss/hidden": 0.248046875,
      "loss/logits": 0.050660137087106705,
      "loss/reg": 0.02642793208360672,
      "step": 12
    },
    {
      "epoch": 0.001625,
      "grad_norm": 1.4406346082687378,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 0.507,
      "loss/crossentropy": 1.965380311012268,
      "loss/hidden": 0.2197265625,
      "loss/logits": 0.02295786701142788,
      "loss/reg": 0.02642756886780262,
      "step": 13
    },
    {
      "epoch": 0.00175,
      "grad_norm": 3.0757036209106445,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 0.7761,
      "loss/crossentropy": 2.15138840675354,
      "loss/hidden": 0.4375,
      "loss/logits": 0.07431840896606445,
      "loss/reg": 0.026427194476127625,
      "step": 14
    },
    {
      "epoch": 0.001875,
      "grad_norm": 2.8731143474578857,
      "learning_rate": 1.5e-05,
      "loss": 0.4684,
      "loss/crossentropy": 2.5530812740325928,
      "loss/hidden": 0.1845703125,
      "loss/logits": 0.019558344036340714,
      "loss/reg": 0.026426764205098152,
      "step": 15
    },
    {
      "epoch": 0.002,
      "grad_norm": 2.5288755893707275,
      "grad_norm_var": 4.846526347105633,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.5781,
      "loss/crossentropy": 2.5096747875213623,
      "loss/hidden": 0.275390625,
      "loss/logits": 0.0384209081530571,
      "loss/reg": 0.026426298543810844,
      "step": 16
    },
    {
      "epoch": 0.002125,
      "grad_norm": 2.0281474590301514,
      "grad_norm_var": 4.89482291121508,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 0.5318,
      "loss/crossentropy": 2.396097421646118,
      "loss/hidden": 0.232421875,
      "loss/logits": 0.03516196087002754,
      "loss/reg": 0.026425909250974655,
      "step": 17
    },
    {
      "epoch": 0.00225,
      "grad_norm": 2.4487411975860596,
      "grad_norm_var": 4.896482229626747,
      "learning_rate": 1.8e-05,
      "loss": 0.5984,
      "loss/crossentropy": 2.3916616439819336,
      "loss/hidden": 0.28125,
      "loss/logits": 0.052923329174518585,
      "loss/reg": 0.026425503194332123,
      "step": 18
    },
    {
      "epoch": 0.002375,
      "grad_norm": 1.986022710800171,
      "grad_norm_var": 4.956548008938709,
      "learning_rate": 1.9e-05,
      "loss": 0.5504,
      "loss/crossentropy": 2.4791200160980225,
      "loss/hidden": 0.2421875,
      "loss/logits": 0.04391499236226082,
      "loss/reg": 0.026425078511238098,
      "step": 19
    },
    {
      "epoch": 0.0025,
      "grad_norm": 2.0934784412384033,
      "grad_norm_var": 4.887277710992484,
      "learning_rate": 2e-05,
      "loss": 0.5369,
      "loss/crossentropy": 2.1741297245025635,
      "loss/hidden": 0.2392578125,
      "loss/logits": 0.03335873782634735,
      "loss/reg": 0.026424556970596313,
      "step": 20
    },
    {
      "epoch": 0.002625,
      "grad_norm": 1.9445254802703857,
      "grad_norm_var": 4.884025740026245,
      "learning_rate": 2.1e-05,
      "loss": 0.4865,
      "loss/crossentropy": 2.45112943649292,
      "loss/hidden": 0.1962890625,
      "loss/logits": 0.025960583239793777,
      "loss/reg": 0.02642405778169632,
      "step": 21
    },
    {
      "epoch": 0.00275,
      "grad_norm": 3.070704221725464,
      "grad_norm_var": 4.8399171328504895,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 0.5887,
      "loss/crossentropy": 2.1550512313842773,
      "loss/hidden": 0.275390625,
      "loss/logits": 0.049097511917352676,
      "loss/reg": 0.02642347477376461,
      "step": 22
    },
    {
      "epoch": 0.002875,
      "grad_norm": 2.2452821731567383,
      "grad_norm_var": 4.835466428034186,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 0.5482,
      "loss/crossentropy": 2.1640255451202393,
      "loss/hidden": 0.2470703125,
      "loss/logits": 0.036868080496788025,
      "loss/reg": 0.026422815397381783,
      "step": 23
    },
    {
      "epoch": 0.003,
      "grad_norm": 2.032148838043213,
      "grad_norm_var": 4.84560617678243,
      "learning_rate": 2.4e-05,
      "loss": 0.5756,
      "loss/crossentropy": 2.323482036590576,
      "loss/hidden": 0.271484375,
      "loss/logits": 0.03993295133113861,
      "loss/reg": 0.02642211876809597,
      "step": 24
    },
    {
      "epoch": 0.003125,
      "grad_norm": 1.763465404510498,
      "grad_norm_var": 4.8665883230545335,
      "learning_rate": 2.5e-05,
      "loss": 0.5139,
      "loss/crossentropy": 2.33661150932312,
      "loss/hidden": 0.22265625,
      "loss/logits": 0.02701444923877716,
      "loss/reg": 0.026421383023262024,
      "step": 25
    },
    {
      "epoch": 0.00325,
      "grad_norm": 1.7001625299453735,
      "grad_norm_var": 4.863438686484135,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 0.5696,
      "loss/crossentropy": 2.2305383682250977,
      "loss/hidden": 0.275390625,
      "loss/logits": 0.02997000887989998,
      "loss/reg": 0.026420695707201958,
      "step": 26
    },
    {
      "epoch": 0.003375,
      "grad_norm": 3.474130392074585,
      "grad_norm_var": 0.318824614624526,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 0.5456,
      "loss/crossentropy": 2.1680750846862793,
      "loss/hidden": 0.2451171875,
      "loss/logits": 0.036261945962905884,
      "loss/reg": 0.0264199897646904,
      "step": 27
    },
    {
      "epoch": 0.0035,
      "grad_norm": 3.8201987743377686,
      "grad_norm_var": 0.46030846745106163,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.5181,
      "loss/crossentropy": 2.418672800064087,
      "loss/hidden": 0.224609375,
      "loss/logits": 0.029263213276863098,
      "loss/reg": 0.02641921117901802,
      "step": 28
    },
    {
      "epoch": 0.003625,
      "grad_norm": 1.9781090021133423,
      "grad_norm_var": 0.40905077024170067,
      "learning_rate": 2.9e-05,
      "loss": 0.5379,
      "loss/crossentropy": 2.390868663787842,
      "loss/hidden": 0.2392578125,
      "loss/logits": 0.034465983510017395,
      "loss/reg": 0.026418352499604225,
      "step": 29
    },
    {
      "epoch": 0.00375,
      "grad_norm": 1.6551319360733032,
      "grad_norm_var": 0.41503895204729413,
      "learning_rate": 3e-05,
      "loss": 0.496,
      "loss/crossentropy": 2.5960400104522705,
      "loss/hidden": 0.205078125,
      "loss/logits": 0.02678578905761242,
      "loss/reg": 0.026417305693030357,
      "step": 30
    },
    {
      "epoch": 0.003875,
      "grad_norm": 1.7136921882629395,
      "grad_norm_var": 0.4185952392532807,
      "learning_rate": 3.1e-05,
      "loss": 0.5235,
      "loss/crossentropy": 2.349839687347412,
      "loss/hidden": 0.2275390625,
      "loss/logits": 0.031792763620615005,
      "loss/reg": 0.026416433975100517,
      "step": 31
    },
    {
      "epoch": 0.004,
      "grad_norm": 1.9992157220840454,
      "grad_norm_var": 0.41856547198587274,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 0.4928,
      "loss/crossentropy": 2.3164803981781006,
      "loss/hidden": 0.2041015625,
      "loss/logits": 0.024579893797636032,
      "loss/reg": 0.02641524001955986,
      "step": 32
    },
    {
      "epoch": 0.004125,
      "grad_norm": 2.705052614212036,
      "grad_norm_var": 0.42744416353313663,
      "learning_rate": 3.3e-05,
      "loss": 0.5732,
      "loss/crossentropy": 2.42107892036438,
      "loss/hidden": 0.275390625,
      "loss/logits": 0.03370767831802368,
      "loss/reg": 0.02641397900879383,
      "step": 33
    },
    {
      "epoch": 0.00425,
      "grad_norm": 1.8898464441299438,
      "grad_norm_var": 0.4350913020843951,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.5531,
      "loss/crossentropy": 2.4147770404815674,
      "loss/hidden": 0.25390625,
      "loss/logits": 0.03504405915737152,
      "loss/reg": 0.026412710547447205,
      "step": 34
    },
    {
      "epoch": 0.004375,
      "grad_norm": 4.9570159912109375,
      "grad_norm_var": 0.8804344329355491,
      "learning_rate": 3.5e-05,
      "loss": 0.6763,
      "loss/crossentropy": 1.6753497123718262,
      "loss/hidden": 0.376953125,
      "loss/logits": 0.03519564867019653,
      "loss/reg": 0.0264116358011961,
      "step": 35
    },
    {
      "epoch": 0.0045,
      "grad_norm": 4.928956508636475,
      "grad_norm_var": 1.2518721453244992,
      "learning_rate": 3.6e-05,
      "loss": 0.7329,
      "loss/crossentropy": 2.6104867458343506,
      "loss/hidden": 0.400390625,
      "loss/logits": 0.06845290958881378,
      "loss/reg": 0.02641039527952671,
      "step": 36
    },
    {
      "epoch": 0.004625,
      "grad_norm": 7.503647327423096,
      "grad_norm_var": 2.684651641752033,
      "learning_rate": 3.7e-05,
      "loss": 0.6258,
      "loss/crossentropy": 2.2158656120300293,
      "loss/hidden": 0.318359375,
      "loss/logits": 0.043342188000679016,
      "loss/reg": 0.026409219950437546,
      "step": 37
    },
    {
      "epoch": 0.00475,
      "grad_norm": 2.6838622093200684,
      "grad_norm_var": 2.6885420074665602,
      "learning_rate": 3.8e-05,
      "loss": 0.5939,
      "loss/crossentropy": 2.344879627227783,
      "loss/hidden": 0.28515625,
      "loss/logits": 0.04461552947759628,
      "loss/reg": 0.026408080011606216,
      "step": 38
    },
    {
      "epoch": 0.004875,
      "grad_norm": 3.357893705368042,
      "grad_norm_var": 2.662758933855309,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.5729,
      "loss/crossentropy": 2.6759543418884277,
      "loss/hidden": 0.275390625,
      "loss/logits": 0.033413954079151154,
      "loss/reg": 0.026406895369291306,
      "step": 39
    },
    {
      "epoch": 0.005,
      "grad_norm": 3.0177316665649414,
      "grad_norm_var": 2.5949485604856193,
      "learning_rate": 4e-05,
      "loss": 0.7498,
      "loss/crossentropy": 2.2261273860931396,
      "loss/hidden": 0.408203125,
      "loss/logits": 0.07758316397666931,
      "loss/reg": 0.026405224576592445,
      "step": 40
    },
    {
      "epoch": 0.005125,
      "grad_norm": 2.1196699142456055,
      "grad_norm_var": 2.54074274703229,
      "learning_rate": 4.1e-05,
      "loss": 0.6396,
      "loss/crossentropy": 2.193378448486328,
      "loss/hidden": 0.30859375,
      "loss/logits": 0.06692355871200562,
      "loss/reg": 0.026403924450278282,
      "step": 41
    },
    {
      "epoch": 0.00525,
      "grad_norm": 2.456051826477051,
      "grad_norm_var": 2.435973046683167,
      "learning_rate": 4.2e-05,
      "loss": 0.5571,
      "loss/crossentropy": 1.9526888132095337,
      "loss/hidden": 0.26171875,
      "loss/logits": 0.03133418411016464,
      "loss/reg": 0.026402529329061508,
      "step": 42
    },
    {
      "epoch": 0.005375,
      "grad_norm": 2.257375717163086,
      "grad_norm_var": 2.474501380785125,
      "learning_rate": 4.3e-05,
      "loss": 0.5544,
      "loss/crossentropy": 2.3284847736358643,
      "loss/hidden": 0.25390625,
      "loss/logits": 0.03650724142789841,
      "loss/reg": 0.026400938630104065,
      "step": 43
    },
    {
      "epoch": 0.0055,
      "grad_norm": 2.9145264625549316,
      "grad_norm_var": 2.4345975605903694,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.5175,
      "loss/crossentropy": 2.295241594314575,
      "loss/hidden": 0.220703125,
      "loss/logits": 0.03284794092178345,
      "loss/reg": 0.026399515569210052,
      "step": 44
    },
    {
      "epoch": 0.005625,
      "grad_norm": 3.1294264793395996,
      "grad_norm_var": 2.3592519473156615,
      "learning_rate": 4.5e-05,
      "loss": 0.5567,
      "loss/crossentropy": 2.660597085952759,
      "loss/hidden": 0.255859375,
      "loss/logits": 0.03686758130788803,
      "loss/reg": 0.026398126035928726,
      "step": 45
    },
    {
      "epoch": 0.00575,
      "grad_norm": 2.197265863418579,
      "grad_norm_var": 2.2745842657817748,
      "learning_rate": 4.600000000000001e-05,
      "loss": 0.5512,
      "loss/crossentropy": 2.3832643032073975,
      "loss/hidden": 0.2490234375,
      "loss/logits": 0.038256023079156876,
      "loss/reg": 0.02639671601355076,
      "step": 46
    },
    {
      "epoch": 0.005875,
      "grad_norm": 2.883378744125366,
      "grad_norm_var": 2.141634704665381,
      "learning_rate": 4.7e-05,
      "loss": 0.5298,
      "loss/crossentropy": 2.6035244464874268,
      "loss/hidden": 0.2333984375,
      "loss/logits": 0.03240815922617912,
      "loss/reg": 0.026395246386528015,
      "step": 47
    },
    {
      "epoch": 0.006,
      "grad_norm": 3.1519744396209717,
      "grad_norm_var": 2.0420385103816727,
      "learning_rate": 4.8e-05,
      "loss": 0.5385,
      "loss/crossentropy": 2.250037908554077,
      "loss/hidden": 0.244140625,
      "loss/logits": 0.03043752908706665,
      "loss/reg": 0.026393571868538857,
      "step": 48
    },
    {
      "epoch": 0.006125,
      "grad_norm": 3.187680244445801,
      "grad_norm_var": 2.0209109756516943,
      "learning_rate": 4.9e-05,
      "loss": 0.5614,
      "loss/crossentropy": 2.366483688354492,
      "loss/hidden": 0.263671875,
      "loss/logits": 0.033859170973300934,
      "loss/reg": 0.02639181725680828,
      "step": 49
    },
    {
      "epoch": 0.00625,
      "grad_norm": 2.3717658519744873,
      "grad_norm_var": 1.9454730589909708,
      "learning_rate": 5e-05,
      "loss": 0.5865,
      "loss/crossentropy": 2.007732391357422,
      "loss/hidden": 0.2890625,
      "loss/logits": 0.0335388109087944,
      "loss/reg": 0.02638987824320793,
      "step": 50
    },
    {
      "epoch": 0.006375,
      "grad_norm": 3.658735990524292,
      "grad_norm_var": 1.767425501826429,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 0.5028,
      "loss/crossentropy": 2.511072874069214,
      "loss/hidden": 0.2099609375,
      "loss/logits": 0.02893088385462761,
      "loss/reg": 0.026387827470898628,
      "step": 51
    },
    {
      "epoch": 0.0065,
      "grad_norm": 2.5912654399871826,
      "grad_norm_var": 1.582150273328572,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 0.5619,
      "loss/crossentropy": 2.3280093669891357,
      "loss/hidden": 0.263671875,
      "loss/logits": 0.03436018154025078,
      "loss/reg": 0.026385735720396042,
      "step": 52
    },
    {
      "epoch": 0.006625,
      "grad_norm": 2.0419421195983887,
      "grad_norm_var": 0.23432357463535497,
      "learning_rate": 5.300000000000001e-05,
      "loss": 0.5674,
      "loss/crossentropy": 2.3851194381713867,
      "loss/hidden": 0.263671875,
      "loss/logits": 0.039869021624326706,
      "loss/reg": 0.026383817195892334,
      "step": 53
    },
    {
      "epoch": 0.00675,
      "grad_norm": 2.4164810180664062,
      "grad_norm_var": 0.24119551692934707,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 0.6087,
      "loss/crossentropy": 2.6006996631622314,
      "loss/hidden": 0.296875,
      "loss/logits": 0.04797635227441788,
      "loss/reg": 0.026381801813840866,
      "step": 54
    },
    {
      "epoch": 0.006875,
      "grad_norm": 2.697831153869629,
      "grad_norm_var": 0.2135682431387058,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.523,
      "loss/crossentropy": 2.472208261489868,
      "loss/hidden": 0.2275390625,
      "loss/logits": 0.031705208122730255,
      "loss/reg": 0.026379752904176712,
      "step": 55
    },
    {
      "epoch": 0.007,
      "grad_norm": 4.182509422302246,
      "grad_norm_var": 0.34874494246430365,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 0.6766,
      "loss/crossentropy": 2.693652868270874,
      "loss/hidden": 0.3671875,
      "loss/logits": 0.04566050320863724,
      "loss/reg": 0.02637762948870659,
      "step": 56
    },
    {
      "epoch": 0.007125,
      "grad_norm": 2.231238842010498,
      "grad_norm_var": 0.33990645656106155,
      "learning_rate": 5.6999999999999996e-05,
      "loss": 0.5811,
      "loss/crossentropy": 2.4935543537139893,
      "loss/hidden": 0.28125,
      "loss/logits": 0.03613065183162689,
      "loss/reg": 0.026375366374850273,
      "step": 57
    },
    {
      "epoch": 0.00725,
      "grad_norm": 2.0192184448242188,
      "grad_norm_var": 0.37029866859904437,
      "learning_rate": 5.8e-05,
      "loss": 0.5285,
      "loss/crossentropy": 2.192227840423584,
      "loss/hidden": 0.2294921875,
      "loss/logits": 0.03531934320926666,
      "loss/reg": 0.02637314423918724,
      "step": 58
    },
    {
      "epoch": 0.007375,
      "grad_norm": 2.3108532428741455,
      "grad_norm_var": 0.36699486123436575,
      "learning_rate": 5.9e-05,
      "loss": 0.507,
      "loss/crossentropy": 2.39101243019104,
      "loss/hidden": 0.2177734375,
      "loss/logits": 0.025539016351103783,
      "loss/reg": 0.02637065388262272,
      "step": 59
    },
    {
      "epoch": 0.0075,
      "grad_norm": 2.049551486968994,
      "grad_norm_var": 0.3946811437011318,
      "learning_rate": 6e-05,
      "loss": 0.5351,
      "loss/crossentropy": 2.7062017917633057,
      "loss/hidden": 0.2353515625,
      "loss/logits": 0.03605186939239502,
      "loss/reg": 0.026368385180830956,
      "step": 60
    },
    {
      "epoch": 0.007625,
      "grad_norm": 2.6327223777770996,
      "grad_norm_var": 0.38133460463904284,
      "learning_rate": 6.1e-05,
      "loss": 0.5344,
      "loss/crossentropy": 2.0810956954956055,
      "loss/hidden": 0.23828125,
      "loss/logits": 0.03246723860502243,
      "loss/reg": 0.02636607363820076,
      "step": 61
    },
    {
      "epoch": 0.00775,
      "grad_norm": 2.06585955619812,
      "grad_norm_var": 0.39059185941555535,
      "learning_rate": 6.2e-05,
      "loss": 0.5202,
      "loss/crossentropy": 2.6240835189819336,
      "loss/hidden": 0.224609375,
      "loss/logits": 0.03200242295861244,
      "loss/reg": 0.026363445445895195,
      "step": 62
    },
    {
      "epoch": 0.007875,
      "grad_norm": 2.109790563583374,
      "grad_norm_var": 0.40452198957435875,
      "learning_rate": 6.3e-05,
      "loss": 0.5249,
      "loss/crossentropy": 2.587536573410034,
      "loss/hidden": 0.2294921875,
      "loss/logits": 0.03178905323147774,
      "loss/reg": 0.02636083774268627,
      "step": 63
    },
    {
      "epoch": 0.008,
      "grad_norm": 3.818783760070801,
      "grad_norm_var": 0.48072296241430573,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.6632,
      "loss/crossentropy": 2.011171817779541,
      "loss/hidden": 0.353515625,
      "loss/logits": 0.04613731801509857,
      "loss/reg": 0.02635800838470459,
      "step": 64
    },
    {
      "epoch": 0.008125,
      "grad_norm": 2.4136369228363037,
      "grad_norm_var": 0.46258887231494605,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.522,
      "loss/crossentropy": 2.600787401199341,
      "loss/hidden": 0.224609375,
      "loss/logits": 0.03384025767445564,
      "loss/reg": 0.026355121284723282,
      "step": 65
    },
    {
      "epoch": 0.00825,
      "grad_norm": 2.3908252716064453,
      "grad_norm_var": 0.46202963925557394,
      "learning_rate": 6.6e-05,
      "loss": 0.5859,
      "loss/crossentropy": 2.1056201457977295,
      "loss/hidden": 0.279296875,
      "loss/logits": 0.04309317469596863,
      "loss/reg": 0.02635251171886921,
      "step": 66
    },
    {
      "epoch": 0.008375,
      "grad_norm": 2.5190653800964355,
      "grad_norm_var": 0.38262308323354144,
      "learning_rate": 6.7e-05,
      "loss": 0.5391,
      "loss/crossentropy": 2.5527184009552,
      "loss/hidden": 0.2421875,
      "loss/logits": 0.0334152951836586,
      "loss/reg": 0.02634957991540432,
      "step": 67
    },
    {
      "epoch": 0.0085,
      "grad_norm": 2.500368595123291,
      "grad_norm_var": 0.3824057294099087,
      "learning_rate": 6.800000000000001e-05,
      "loss": 0.5958,
      "loss/crossentropy": 2.30499267578125,
      "loss/hidden": 0.28125,
      "loss/logits": 0.05108712613582611,
      "loss/reg": 0.026346800848841667,
      "step": 68
    },
    {
      "epoch": 0.008625,
      "grad_norm": 2.7905988693237305,
      "grad_norm_var": 0.3692126592154902,
      "learning_rate": 6.9e-05,
      "loss": 0.6049,
      "loss/crossentropy": 2.3807146549224854,
      "loss/hidden": 0.294921875,
      "loss/logits": 0.04648623988032341,
      "loss/reg": 0.026344334706664085,
      "step": 69
    },
    {
      "epoch": 0.00875,
      "grad_norm": 2.147470235824585,
      "grad_norm_var": 0.3793077808516782,
      "learning_rate": 7e-05,
      "loss": 0.5345,
      "loss/crossentropy": 2.627505302429199,
      "loss/hidden": 0.236328125,
      "loss/logits": 0.034769318997859955,
      "loss/reg": 0.026341637596488,
      "step": 70
    },
    {
      "epoch": 0.008875,
      "grad_norm": 2.6987268924713135,
      "grad_norm_var": 0.3793248871627156,
      "learning_rate": 7.1e-05,
      "loss": 0.5722,
      "loss/crossentropy": 2.382685899734497,
      "loss/hidden": 0.271484375,
      "loss/logits": 0.03735022246837616,
      "loss/reg": 0.02633870206773281,
      "step": 71
    },
    {
      "epoch": 0.009,
      "grad_norm": 3.085085153579712,
      "grad_norm_var": 0.2164648496489896,
      "learning_rate": 7.2e-05,
      "loss": 0.5882,
      "loss/crossentropy": 2.371429681777954,
      "loss/hidden": 0.27734375,
      "loss/logits": 0.04748620092868805,
      "loss/reg": 0.02633603662252426,
      "step": 72
    },
    {
      "epoch": 0.009125,
      "grad_norm": 4.158353328704834,
      "grad_norm_var": 0.3829897758196862,
      "learning_rate": 7.3e-05,
      "loss": 0.8663,
      "loss/crossentropy": 2.29622745513916,
      "loss/hidden": 0.5234375,
      "loss/logits": 0.07955377548933029,
      "loss/reg": 0.026333071291446686,
      "step": 73
    },
    {
      "epoch": 0.00925,
      "grad_norm": 2.111111879348755,
      "grad_norm_var": 0.37631661688178514,
      "learning_rate": 7.4e-05,
      "loss": 0.5468,
      "loss/crossentropy": 2.29744815826416,
      "loss/hidden": 0.24609375,
      "loss/logits": 0.037446070462465286,
      "loss/reg": 0.026330096647143364,
      "step": 74
    },
    {
      "epoch": 0.009375,
      "grad_norm": 2.545919179916382,
      "grad_norm_var": 0.37031037444480336,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.625,
      "loss/crossentropy": 2.4376375675201416,
      "loss/hidden": 0.306640625,
      "loss/logits": 0.05505819618701935,
      "loss/reg": 0.026326792314648628,
      "step": 75
    },
    {
      "epoch": 0.0095,
      "grad_norm": 2.362215042114258,
      "grad_norm_var": 0.35233204024674003,
      "learning_rate": 7.6e-05,
      "loss": 0.5747,
      "loss/crossentropy": 2.677924156188965,
      "loss/hidden": 0.267578125,
      "loss/logits": 0.04392882436513901,
      "loss/reg": 0.026323769241571426,
      "step": 76
    },
    {
      "epoch": 0.009625,
      "grad_norm": 3.135709762573242,
      "grad_norm_var": 0.3671929300455114,
      "learning_rate": 7.7e-05,
      "loss": 0.7113,
      "loss/crossentropy": 1.972798466682434,
      "loss/hidden": 0.384765625,
      "loss/logits": 0.06329117715358734,
      "loss/reg": 0.026320943608880043,
      "step": 77
    },
    {
      "epoch": 0.00975,
      "grad_norm": 4.418634414672852,
      "grad_norm_var": 0.5210260544686395,
      "learning_rate": 7.800000000000001e-05,
      "loss": 0.6851,
      "loss/crossentropy": 2.558809518814087,
      "loss/hidden": 0.357421875,
      "loss/logits": 0.06447892636060715,
      "loss/reg": 0.02631756290793419,
      "step": 78
    },
    {
      "epoch": 0.009875,
      "grad_norm": 3.9261293411254883,
      "grad_norm_var": 0.55391532710314,
      "learning_rate": 7.900000000000001e-05,
      "loss": 0.5968,
      "loss/crossentropy": 2.6102137565612793,
      "loss/hidden": 0.28515625,
      "loss/logits": 0.04846350848674774,
      "loss/reg": 0.026314500719308853,
      "step": 79
    },
    {
      "epoch": 0.01,
      "grad_norm": 3.1532020568847656,
      "grad_norm_var": 0.5035194586586452,
      "learning_rate": 8e-05,
      "loss": 0.7066,
      "loss/crossentropy": 2.36220121383667,
      "loss/hidden": 0.38671875,
      "loss/logits": 0.05672474205493927,
      "loss/reg": 0.026311254128813744,
      "step": 80
    },
    {
      "epoch": 0.010125,
      "grad_norm": 3.557161808013916,
      "grad_norm_var": 0.5115010248662256,
      "learning_rate": 8.1e-05,
      "loss": 0.6647,
      "loss/crossentropy": 2.412325859069824,
      "loss/hidden": 0.345703125,
      "loss/logits": 0.05596970394253731,
      "loss/reg": 0.026307715103030205,
      "step": 81
    },
    {
      "epoch": 0.01025,
      "grad_norm": 2.2158408164978027,
      "grad_norm_var": 0.5268993015208875,
      "learning_rate": 8.2e-05,
      "loss": 0.5575,
      "loss/crossentropy": 2.3789050579071045,
      "loss/hidden": 0.255859375,
      "loss/logits": 0.03855578228831291,
      "loss/reg": 0.0263040903955698,
      "step": 82
    },
    {
      "epoch": 0.010375,
      "grad_norm": 3.2140979766845703,
      "grad_norm_var": 0.5164286227091848,
      "learning_rate": 8.3e-05,
      "loss": 0.5548,
      "loss/crossentropy": 2.401925802230835,
      "loss/hidden": 0.2578125,
      "loss/logits": 0.03401009738445282,
      "loss/reg": 0.026300577446818352,
      "step": 83
    },
    {
      "epoch": 0.0105,
      "grad_norm": 2.4155867099761963,
      "grad_norm_var": 0.5225404018326155,
      "learning_rate": 8.4e-05,
      "loss": 0.5432,
      "loss/crossentropy": 2.6546974182128906,
      "loss/hidden": 0.244140625,
      "loss/logits": 0.03612750768661499,
      "loss/reg": 0.02629682794213295,
      "step": 84
    },
    {
      "epoch": 0.010625,
      "grad_norm": 4.232295036315918,
      "grad_norm_var": 0.6129643025970267,
      "learning_rate": 8.5e-05,
      "loss": 0.7199,
      "loss/crossentropy": 2.2300524711608887,
      "loss/hidden": 0.412109375,
      "loss/logits": 0.044823646545410156,
      "loss/reg": 0.0262930728495121,
      "step": 85
    },
    {
      "epoch": 0.01075,
      "grad_norm": 2.7160282135009766,
      "grad_norm_var": 0.5620128324129702,
      "learning_rate": 8.6e-05,
      "loss": 0.6973,
      "loss/crossentropy": 2.2953288555145264,
      "loss/hidden": 0.37109375,
      "loss/logits": 0.06333646178245544,
      "loss/reg": 0.02628917805850506,
      "step": 86
    },
    {
      "epoch": 0.010875,
      "grad_norm": 3.0872819423675537,
      "grad_norm_var": 0.5495392294868544,
      "learning_rate": 8.7e-05,
      "loss": 0.636,
      "loss/crossentropy": 2.449223756790161,
      "loss/hidden": 0.328125,
      "loss/logits": 0.045011188834905624,
      "loss/reg": 0.02628495544195175,
      "step": 87
    },
    {
      "epoch": 0.011,
      "grad_norm": 2.6966607570648193,
      "grad_norm_var": 0.5621192378124493,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.6272,
      "loss/crossentropy": 2.5449047088623047,
      "loss/hidden": 0.31640625,
      "loss/logits": 0.04802623763680458,
      "loss/reg": 0.026280568912625313,
      "step": 88
    },
    {
      "epoch": 0.011125,
      "grad_norm": 2.9160921573638916,
      "grad_norm_var": 0.48685408890983506,
      "learning_rate": 8.900000000000001e-05,
      "loss": 0.6278,
      "loss/crossentropy": 2.1442108154296875,
      "loss/hidden": 0.310546875,
      "loss/logits": 0.054462507367134094,
      "loss/reg": 0.02627684734761715,
      "step": 89
    },
    {
      "epoch": 0.01125,
      "grad_norm": 3.3378944396972656,
      "grad_norm_var": 0.4283231906687052,
      "learning_rate": 9e-05,
      "loss": 0.6536,
      "loss/crossentropy": 2.4361062049865723,
      "loss/hidden": 0.349609375,
      "loss/logits": 0.04130454361438751,
      "loss/reg": 0.026273205876350403,
      "step": 90
    },
    {
      "epoch": 0.011375,
      "grad_norm": 2.597607374191284,
      "grad_norm_var": 0.42452911296148627,
      "learning_rate": 9.1e-05,
      "loss": 0.6391,
      "loss/crossentropy": 2.0079762935638428,
      "loss/hidden": 0.3203125,
      "loss/logits": 0.056107863783836365,
      "loss/reg": 0.02626909501850605,
      "step": 91
    },
    {
      "epoch": 0.0115,
      "grad_norm": 4.960971355438232,
      "grad_norm_var": 0.5826997127177641,
      "learning_rate": 9.200000000000001e-05,
      "loss": 0.6735,
      "loss/crossentropy": 2.708275079727173,
      "loss/hidden": 0.349609375,
      "loss/logits": 0.06119866296648979,
      "loss/reg": 0.026265164837241173,
      "step": 92
    },
    {
      "epoch": 0.011625,
      "grad_norm": 3.8193323612213135,
      "grad_norm_var": 0.5981799563928756,
      "learning_rate": 9.300000000000001e-05,
      "loss": 0.8429,
      "loss/crossentropy": 2.4117016792297363,
      "loss/hidden": 0.498046875,
      "loss/logits": 0.08221981674432755,
      "loss/reg": 0.026260720565915108,
      "step": 93
    },
    {
      "epoch": 0.01175,
      "grad_norm": 3.434213638305664,
      "grad_norm_var": 0.5157332557296352,
      "learning_rate": 9.4e-05,
      "loss": 0.6738,
      "loss/crossentropy": 2.62178111076355,
      "loss/hidden": 0.36328125,
      "loss/logits": 0.04794853553175926,
      "loss/reg": 0.02625615894794464,
      "step": 94
    },
    {
      "epoch": 0.011875,
      "grad_norm": 3.0944480895996094,
      "grad_norm_var": 0.4859309000509171,
      "learning_rate": 9.5e-05,
      "loss": 0.6829,
      "loss/crossentropy": 2.582462787628174,
      "loss/hidden": 0.353515625,
      "loss/logits": 0.0669020414352417,
      "loss/reg": 0.026251958683133125,
      "step": 95
    },
    {
      "epoch": 0.012,
      "grad_norm": 3.9548256397247314,
      "grad_norm_var": 0.5194300484800329,
      "learning_rate": 9.6e-05,
      "loss": 0.8508,
      "loss/crossentropy": 2.3243165016174316,
      "loss/hidden": 0.49609375,
      "loss/logits": 0.09221720695495605,
      "loss/reg": 0.026247689500451088,
      "step": 96
    },
    {
      "epoch": 0.012125,
      "grad_norm": 8.949115753173828,
      "grad_norm_var": 2.5460815450669125,
      "learning_rate": 9.7e-05,
      "loss": 0.9668,
      "loss/crossentropy": 2.3593697547912598,
      "loss/hidden": 0.62109375,
      "loss/logits": 0.08330727368593216,
      "loss/reg": 0.026243869215250015,
      "step": 97
    },
    {
      "epoch": 0.01225,
      "grad_norm": 3.874511957168579,
      "grad_norm_var": 2.411331023611861,
      "learning_rate": 9.8e-05,
      "loss": 0.7272,
      "loss/crossentropy": 1.9933784008026123,
      "loss/hidden": 0.41015625,
      "loss/logits": 0.05464401841163635,
      "loss/reg": 0.026239972561597824,
      "step": 98
    },
    {
      "epoch": 0.012375,
      "grad_norm": 5.088143825531006,
      "grad_norm_var": 2.507843574169987,
      "learning_rate": 9.900000000000001e-05,
      "loss": 0.6761,
      "loss/crossentropy": 2.578767776489258,
      "loss/hidden": 0.3515625,
      "loss/logits": 0.0621890164911747,
      "loss/reg": 0.026235179975628853,
      "step": 99
    },
    {
      "epoch": 0.0125,
      "grad_norm": 3.9010627269744873,
      "grad_norm_var": 2.366914585761602,
      "learning_rate": 0.0001,
      "loss": 0.7051,
      "loss/crossentropy": 2.4717133045196533,
      "loss/hidden": 0.375,
      "loss/logits": 0.0677795261144638,
      "loss/reg": 0.026230769231915474,
      "step": 100
    },
    {
      "epoch": 0.012625,
      "grad_norm": 5.50706148147583,
      "grad_norm_var": 2.522191588171327,
      "learning_rate": 0.0001,
      "loss": 0.7765,
      "loss/crossentropy": 2.3764612674713135,
      "loss/hidden": 0.44921875,
      "loss/logits": 0.06503438949584961,
      "loss/reg": 0.02622627653181553,
      "step": 101
    },
    {
      "epoch": 0.01275,
      "grad_norm": 5.103200435638428,
      "grad_norm_var": 2.470966679212188,
      "learning_rate": 0.0001,
      "loss": 0.7008,
      "loss/crossentropy": 2.5796542167663574,
      "loss/hidden": 0.38671875,
      "loss/logits": 0.05191829800605774,
      "loss/reg": 0.026221245527267456,
      "step": 102
    },
    {
      "epoch": 0.012875,
      "grad_norm": 18.05303192138672,
      "grad_norm_var": 14.358413039824521,
      "learning_rate": 0.0001,
      "loss": 1.0008,
      "loss/crossentropy": 1.927337646484375,
      "loss/hidden": 0.6796875,
      "loss/logits": 0.05896752327680588,
      "loss/reg": 0.02621658518910408,
      "step": 103
    },
    {
      "epoch": 0.013,
      "grad_norm": 3.410438299179077,
      "grad_norm_var": 14.16338361533652,
      "learning_rate": 0.0001,
      "loss": 0.735,
      "loss/crossentropy": 2.290928363800049,
      "loss/hidden": 0.40625,
      "loss/logits": 0.06666909158229828,
      "loss/reg": 0.026211561635136604,
      "step": 104
    },
    {
      "epoch": 0.013125,
      "grad_norm": 3.117622137069702,
      "grad_norm_var": 14.10656391346422,
      "learning_rate": 0.0001,
      "loss": 0.6665,
      "loss/crossentropy": 2.6549246311187744,
      "loss/hidden": 0.353515625,
      "loss/logits": 0.05095440149307251,
      "loss/reg": 0.026206739246845245,
      "step": 105
    },
    {
      "epoch": 0.01325,
      "grad_norm": 3.9999241828918457,
      "grad_norm_var": 13.97508509706009,
      "learning_rate": 0.0001,
      "loss": 0.8082,
      "loss/crossentropy": 2.460174798965454,
      "loss/hidden": 0.46484375,
      "loss/logits": 0.08136071264743805,
      "loss/reg": 0.02620157040655613,
      "step": 106
    },
    {
      "epoch": 0.013375,
      "grad_norm": 3.405712842941284,
      "grad_norm_var": 13.73775124044489,
      "learning_rate": 0.0001,
      "loss": 0.6518,
      "loss/crossentropy": 2.521803855895996,
      "loss/hidden": 0.3359375,
      "loss/logits": 0.053909383714199066,
      "loss/reg": 0.02619684301316738,
      "step": 107
    },
    {
      "epoch": 0.0135,
      "grad_norm": 3.615098237991333,
      "grad_norm_var": 13.899167673014775,
      "learning_rate": 0.0001,
      "loss": 0.7068,
      "loss/crossentropy": 2.510159969329834,
      "loss/hidden": 0.37890625,
      "loss/logits": 0.06601101160049438,
      "loss/reg": 0.02619197592139244,
      "step": 108
    },
    {
      "epoch": 0.013625,
      "grad_norm": 4.2520599365234375,
      "grad_norm_var": 13.834356012442765,
      "learning_rate": 0.0001,
      "loss": 0.735,
      "loss/crossentropy": 2.508683681488037,
      "loss/hidden": 0.41015625,
      "loss/logits": 0.0629870742559433,
      "loss/reg": 0.0261868704110384,
      "step": 109
    },
    {
      "epoch": 0.01375,
      "grad_norm": 3.215749979019165,
      "grad_norm_var": 13.887973421518442,
      "learning_rate": 0.0001,
      "loss": 0.8268,
      "loss/crossentropy": 2.3564252853393555,
      "loss/hidden": 0.48046875,
      "loss/logits": 0.08449074625968933,
      "loss/reg": 0.026181429624557495,
      "step": 110
    },
    {
      "epoch": 0.013875,
      "grad_norm": 4.598328590393066,
      "grad_norm_var": 13.615373346458785,
      "learning_rate": 0.0001,
      "loss": 0.7398,
      "loss/crossentropy": 2.366943359375,
      "loss/hidden": 0.41796875,
      "loss/logits": 0.060083672404289246,
      "loss/reg": 0.026176555082201958,
      "step": 111
    },
    {
      "epoch": 0.014,
      "grad_norm": 2.758070707321167,
      "grad_norm_var": 13.912012390229316,
      "learning_rate": 0.0001,
      "loss": 0.6839,
      "loss/crossentropy": 2.3351521492004395,
      "loss/hidden": 0.365234375,
      "loss/logits": 0.056987129151821136,
      "loss/reg": 0.0261719711124897,
      "step": 112
    },
    {
      "epoch": 0.014125,
      "grad_norm": 2.9389584064483643,
      "grad_norm_var": 13.147693721903025,
      "learning_rate": 0.0001,
      "loss": 0.8964,
      "loss/crossentropy": 2.188626766204834,
      "loss/hidden": 0.54296875,
      "loss/logits": 0.09171397984027863,
      "loss/reg": 0.026167072355747223,
      "step": 113
    },
    {
      "epoch": 0.01425,
      "grad_norm": 2.8545026779174805,
      "grad_norm_var": 13.338918720074266,
      "learning_rate": 0.0001,
      "loss": 0.6652,
      "loss/crossentropy": 2.488462448120117,
      "loss/hidden": 0.341796875,
      "loss/logits": 0.06177069991827011,
      "loss/reg": 0.026162149384617805,
      "step": 114
    },
    {
      "epoch": 0.014375,
      "grad_norm": 3.343590497970581,
      "grad_norm_var": 13.447848849906688,
      "learning_rate": 0.0001,
      "loss": 0.7317,
      "loss/crossentropy": 2.4826672077178955,
      "loss/hidden": 0.396484375,
      "loss/logits": 0.07369040697813034,
      "loss/reg": 0.026156950742006302,
      "step": 115
    },
    {
      "epoch": 0.0145,
      "grad_norm": 5.309541702270508,
      "grad_norm_var": 13.435010363164546,
      "learning_rate": 0.0001,
      "loss": 0.6918,
      "loss/crossentropy": 2.715517282485962,
      "loss/hidden": 0.376953125,
      "loss/logits": 0.05337735265493393,
      "loss/reg": 0.026151426136493683,
      "step": 116
    },
    {
      "epoch": 0.014625,
      "grad_norm": 3.413027763366699,
      "grad_norm_var": 13.488672790501717,
      "learning_rate": 0.0001,
      "loss": 0.7942,
      "loss/crossentropy": 2.3932089805603027,
      "loss/hidden": 0.451171875,
      "loss/logits": 0.08154396712779999,
      "loss/reg": 0.026146216318011284,
      "step": 117
    },
    {
      "epoch": 0.01475,
      "grad_norm": 2.735275983810425,
      "grad_norm_var": 13.676075950246783,
      "learning_rate": 0.0001,
      "loss": 0.7606,
      "loss/crossentropy": 2.2933082580566406,
      "loss/hidden": 0.4296875,
      "loss/logits": 0.06949938833713531,
      "loss/reg": 0.02614082768559456,
      "step": 118
    },
    {
      "epoch": 0.014875,
      "grad_norm": 3.1346964836120605,
      "grad_norm_var": 0.5056645529599865,
      "learning_rate": 0.0001,
      "loss": 0.7759,
      "loss/crossentropy": 2.331713914871216,
      "loss/hidden": 0.4375,
      "loss/logits": 0.07704727351665497,
      "loss/reg": 0.026135168969631195,
      "step": 119
    },
    {
      "epoch": 0.015,
      "grad_norm": 3.8077635765075684,
      "grad_norm_var": 0.5104468723684629,
      "learning_rate": 0.0001,
      "loss": 0.6882,
      "loss/crossentropy": 2.331220865249634,
      "loss/hidden": 0.373046875,
      "loss/logits": 0.05386776477098465,
      "loss/reg": 0.02612963318824768,
      "step": 120
    },
    {
      "epoch": 0.015125,
      "grad_norm": 3.209914445877075,
      "grad_norm_var": 0.5058893418769751,
      "learning_rate": 0.0001,
      "loss": 0.7536,
      "loss/crossentropy": 2.352771759033203,
      "loss/hidden": 0.4140625,
      "loss/logits": 0.07825946807861328,
      "loss/reg": 0.026124266907572746,
      "step": 121
    },
    {
      "epoch": 0.01525,
      "grad_norm": 3.3548500537872314,
      "grad_norm_var": 0.49208198737674874,
      "learning_rate": 0.0001,
      "loss": 0.7088,
      "loss/crossentropy": 2.483644723892212,
      "loss/hidden": 0.384765625,
      "loss/logits": 0.06287863850593567,
      "loss/reg": 0.0261182002723217,
      "step": 122
    },
    {
      "epoch": 0.015375,
      "grad_norm": 3.9953765869140625,
      "grad_norm_var": 0.5066601541023895,
      "learning_rate": 0.0001,
      "loss": 0.7792,
      "loss/crossentropy": 2.6117637157440186,
      "loss/hidden": 0.435546875,
      "loss/logits": 0.08250629901885986,
      "loss/reg": 0.026112213730812073,
      "step": 123
    },
    {
      "epoch": 0.0155,
      "grad_norm": 3.1783852577209473,
      "grad_norm_var": 0.5138316405800327,
      "learning_rate": 0.0001,
      "loss": 0.7559,
      "loss/crossentropy": 2.401679754257202,
      "loss/hidden": 0.423828125,
      "loss/logits": 0.07104581594467163,
      "loss/reg": 0.02610679157078266,
      "step": 124
    },
    {
      "epoch": 0.015625,
      "grad_norm": 3.2759885787963867,
      "grad_norm_var": 0.47631527116517774,
      "learning_rate": 0.0001,
      "loss": 0.8262,
      "loss/crossentropy": 2.3979361057281494,
      "loss/hidden": 0.486328125,
      "loss/logits": 0.0788530558347702,
      "loss/reg": 0.026101654395461082,
      "step": 125
    },
    {
      "epoch": 0.01575,
      "grad_norm": 4.0768632888793945,
      "grad_norm_var": 0.4963098069624029,
      "learning_rate": 0.0001,
      "loss": 0.6983,
      "loss/crossentropy": 2.5287113189697266,
      "loss/hidden": 0.375,
      "loss/logits": 0.062308911234140396,
      "loss/reg": 0.026096193119883537,
      "step": 126
    },
    {
      "epoch": 0.015875,
      "grad_norm": 4.300101280212402,
      "grad_norm_var": 0.45815803943682926,
      "learning_rate": 0.0001,
      "loss": 0.858,
      "loss/crossentropy": 2.255234956741333,
      "loss/hidden": 0.50390625,
      "loss/logits": 0.0932290330529213,
      "loss/reg": 0.026090849190950394,
      "step": 127
    },
    {
      "epoch": 0.016,
      "grad_norm": 3.303663492202759,
      "grad_norm_var": 0.42421384752866137,
      "learning_rate": 0.0001,
      "loss": 0.7284,
      "loss/crossentropy": 2.528862476348877,
      "loss/hidden": 0.40234375,
      "loss/logits": 0.06523742526769638,
      "loss/reg": 0.026085302233695984,
      "step": 128
    },
    {
      "epoch": 0.016125,
      "grad_norm": 6.868241310119629,
      "grad_norm_var": 1.0876227157335427,
      "learning_rate": 0.0001,
      "loss": 0.8999,
      "loss/crossentropy": 2.6554996967315674,
      "loss/hidden": 0.5390625,
      "loss/logits": 0.1000661626458168,
      "loss/reg": 0.02607985958456993,
      "step": 129
    },
    {
      "epoch": 0.01625,
      "grad_norm": 3.3035075664520264,
      "grad_norm_var": 1.046006684658701,
      "learning_rate": 0.0001,
      "loss": 0.7367,
      "loss/crossentropy": 2.293642282485962,
      "loss/hidden": 0.40625,
      "loss/logits": 0.06973426043987274,
      "loss/reg": 0.026074659079313278,
      "step": 130
    },
    {
      "epoch": 0.016375,
      "grad_norm": 4.2563276290893555,
      "grad_norm_var": 1.0439696727840135,
      "learning_rate": 0.0001,
      "loss": 0.7754,
      "loss/crossentropy": 2.2192564010620117,
      "loss/hidden": 0.447265625,
      "loss/logits": 0.06743350625038147,
      "loss/reg": 0.026069074869155884,
      "step": 131
    },
    {
      "epoch": 0.0165,
      "grad_norm": 4.646778583526611,
      "grad_norm_var": 0.9420233457711596,
      "learning_rate": 0.0001,
      "loss": 0.7496,
      "loss/crossentropy": 2.430368423461914,
      "loss/hidden": 0.4140625,
      "loss/logits": 0.07494455575942993,
      "loss/reg": 0.026063458994030952,
      "step": 132
    },
    {
      "epoch": 0.016625,
      "grad_norm": 7.465832233428955,
      "grad_norm_var": 1.7574380087301391,
      "learning_rate": 0.0001,
      "loss": 0.9607,
      "loss/crossentropy": 2.6182897090911865,
      "loss/hidden": 0.57421875,
      "loss/logits": 0.1258610337972641,
      "loss/reg": 0.026057813316583633,
      "step": 133
    },
    {
      "epoch": 0.01675,
      "grad_norm": 4.5479936599731445,
      "grad_norm_var": 1.6433309350178509,
      "learning_rate": 0.0001,
      "loss": 0.9529,
      "loss/crossentropy": 2.238551378250122,
      "loss/hidden": 0.58984375,
      "loss/logits": 0.10252824425697327,
      "loss/reg": 0.02605200558900833,
      "step": 134
    },
    {
      "epoch": 0.016875,
      "grad_norm": 3.4055774211883545,
      "grad_norm_var": 1.6105102483452751,
      "learning_rate": 0.0001,
      "loss": 0.8407,
      "loss/crossentropy": 2.4216599464416504,
      "loss/hidden": 0.498046875,
      "loss/logits": 0.08216647803783417,
      "loss/reg": 0.026046328246593475,
      "step": 135
    },
    {
      "epoch": 0.017,
      "grad_norm": 8.839641571044922,
      "grad_norm_var": 2.938344740358995,
      "learning_rate": 0.0001,
      "loss": 1.1023,
      "loss/crossentropy": 2.534442901611328,
      "loss/hidden": 0.7109375,
      "loss/logits": 0.1310025304555893,
      "loss/reg": 0.026040658354759216,
      "step": 136
    },
    {
      "epoch": 0.017125,
      "grad_norm": 4.6421589851379395,
      "grad_norm_var": 2.819843479450094,
      "learning_rate": 0.0001,
      "loss": 0.7418,
      "loss/crossentropy": 2.605559825897217,
      "loss/hidden": 0.416015625,
      "loss/logits": 0.06548085808753967,
      "loss/reg": 0.02603481523692608,
      "step": 137
    },
    {
      "epoch": 0.01725,
      "grad_norm": 3.1547701358795166,
      "grad_norm_var": 2.8553314644504555,
      "learning_rate": 0.0001,
      "loss": 0.731,
      "loss/crossentropy": 2.5905325412750244,
      "loss/hidden": 0.40234375,
      "loss/logits": 0.06835847347974777,
      "loss/reg": 0.02602926455438137,
      "step": 138
    },
    {
      "epoch": 0.017375,
      "grad_norm": 4.074351787567139,
      "grad_norm_var": 2.849577549707145,
      "learning_rate": 0.0001,
      "loss": 0.9582,
      "loss/crossentropy": 2.1483733654022217,
      "loss/hidden": 0.60546875,
      "loss/logits": 0.0924658477306366,
      "loss/reg": 0.026023706421256065,
      "step": 139
    },
    {
      "epoch": 0.0175,
      "grad_norm": 3.758636713027954,
      "grad_norm_var": 2.7618912420839155,
      "learning_rate": 0.0001,
      "loss": 0.9996,
      "loss/crossentropy": 2.335742473602295,
      "loss/hidden": 0.6328125,
      "loss/logits": 0.10663188993930817,
      "loss/reg": 0.026017924770712852,
      "step": 140
    },
    {
      "epoch": 0.017625,
      "grad_norm": 4.186927795410156,
      "grad_norm_var": 2.6505093919275544,
      "learning_rate": 0.0001,
      "loss": 0.8414,
      "loss/crossentropy": 2.281843423843384,
      "loss/hidden": 0.498046875,
      "loss/logits": 0.08321215212345123,
      "loss/reg": 0.026011699810624123,
      "step": 141
    },
    {
      "epoch": 0.01775,
      "grad_norm": 3.1666276454925537,
      "grad_norm_var": 2.775123140671519,
      "learning_rate": 0.0001,
      "loss": 0.7768,
      "loss/crossentropy": 2.4267935752868652,
      "loss/hidden": 0.44140625,
      "loss/logits": 0.07538889348506927,
      "loss/reg": 0.02600528486073017,
      "step": 142
    },
    {
      "epoch": 0.017875,
      "grad_norm": 6.386529445648193,
      "grad_norm_var": 2.9581845034072245,
      "learning_rate": 0.0001,
      "loss": 0.8401,
      "loss/crossentropy": 2.7848174571990967,
      "loss/hidden": 0.50390625,
      "loss/logits": 0.07621172070503235,
      "loss/reg": 0.025999369099736214,
      "step": 143
    },
    {
      "epoch": 0.018,
      "grad_norm": 3.8083512783050537,
      "grad_norm_var": 2.876745593692829,
      "learning_rate": 0.0001,
      "loss": 0.9046,
      "loss/crossentropy": 2.341048002243042,
      "loss/hidden": 0.56640625,
      "loss/logits": 0.0782276839017868,
      "loss/reg": 0.0259928647428751,
      "step": 144
    },
    {
      "epoch": 0.018125,
      "grad_norm": 4.083465576171875,
      "grad_norm_var": 2.5868089188751657,
      "learning_rate": 0.0001,
      "loss": 0.9449,
      "loss/crossentropy": 2.3942995071411133,
      "loss/hidden": 0.59375,
      "loss/logits": 0.09130540490150452,
      "loss/reg": 0.025986921042203903,
      "step": 145
    },
    {
      "epoch": 0.01825,
      "grad_norm": 3.654815673828125,
      "grad_norm_var": 2.533420197907486,
      "learning_rate": 0.0001,
      "loss": 0.9459,
      "loss/crossentropy": 2.2414467334747314,
      "loss/hidden": 0.5859375,
      "loss/logits": 0.10016702860593796,
      "loss/reg": 0.02598092146217823,
      "step": 146
    },
    {
      "epoch": 0.018375,
      "grad_norm": 5.4976935386657715,
      "grad_norm_var": 2.567896035243579,
      "learning_rate": 0.0001,
      "loss": 0.9079,
      "loss/crossentropy": 2.1185100078582764,
      "loss/hidden": 0.58203125,
      "loss/logits": 0.06617112457752228,
      "loss/reg": 0.025974513962864876,
      "step": 147
    },
    {
      "epoch": 0.0185,
      "grad_norm": 3.4107933044433594,
      "grad_norm_var": 2.673383097164577,
      "learning_rate": 0.0001,
      "loss": 0.7982,
      "loss/crossentropy": 2.417313575744629,
      "loss/hidden": 0.462890625,
      "loss/logits": 0.07567355036735535,
      "loss/reg": 0.025968506932258606,
      "step": 148
    },
    {
      "epoch": 0.018625,
      "grad_norm": 3.589749574661255,
      "grad_norm_var": 2.1469293827286418,
      "learning_rate": 0.0001,
      "loss": 0.8088,
      "loss/crossentropy": 2.2554194927215576,
      "loss/hidden": 0.478515625,
      "loss/logits": 0.07071521133184433,
      "loss/reg": 0.025961775332689285,
      "step": 149
    },
    {
      "epoch": 0.01875,
      "grad_norm": 4.003805160522461,
      "grad_norm_var": 2.1538296896943887,
      "learning_rate": 0.0001,
      "loss": 0.7856,
      "loss/crossentropy": 2.7128918170928955,
      "loss/hidden": 0.453125,
      "loss/logits": 0.07290500402450562,
      "loss/reg": 0.025955306366086006,
      "step": 150
    },
    {
      "epoch": 0.018875,
      "grad_norm": 3.536449432373047,
      "grad_norm_var": 2.1383506752067736,
      "learning_rate": 0.0001,
      "loss": 0.7436,
      "loss/crossentropy": 2.509995460510254,
      "loss/hidden": 0.421875,
      "loss/logits": 0.062191903591156006,
      "loss/reg": 0.02594931609928608,
      "step": 151
    },
    {
      "epoch": 0.019,
      "grad_norm": 3.434654951095581,
      "grad_norm_var": 0.7374638182579057,
      "learning_rate": 0.0001,
      "loss": 0.9118,
      "loss/crossentropy": 2.3447437286376953,
      "loss/hidden": 0.55859375,
      "loss/logits": 0.0937500149011612,
      "loss/reg": 0.025943227112293243,
      "step": 152
    },
    {
      "epoch": 0.019125,
      "grad_norm": 8.066261291503906,
      "grad_norm_var": 1.7522972641868202,
      "learning_rate": 0.0001,
      "loss": 0.8648,
      "loss/crossentropy": 2.4481232166290283,
      "loss/hidden": 0.52734375,
      "loss/logits": 0.07811163365840912,
      "loss/reg": 0.025936946272850037,
      "step": 153
    },
    {
      "epoch": 0.01925,
      "grad_norm": 3.2214348316192627,
      "grad_norm_var": 1.7429433318934864,
      "learning_rate": 0.0001,
      "loss": 0.7673,
      "loss/crossentropy": 2.3142549991607666,
      "loss/hidden": 0.43359375,
      "loss/logits": 0.07435894012451172,
      "loss/reg": 0.02593095973134041,
      "step": 154
    },
    {
      "epoch": 0.019375,
      "grad_norm": 2.854038715362549,
      "grad_norm_var": 1.8633807825236384,
      "learning_rate": 0.0001,
      "loss": 0.7601,
      "loss/crossentropy": 2.7022647857666016,
      "loss/hidden": 0.43359375,
      "loss/logits": 0.06724615395069122,
      "loss/reg": 0.025924943387508392,
      "step": 155
    },
    {
      "epoch": 0.0195,
      "grad_norm": 3.1763484477996826,
      "grad_norm_var": 1.9162196068123232,
      "learning_rate": 0.0001,
      "loss": 0.9,
      "loss/crossentropy": 2.574676036834717,
      "loss/hidden": 0.54296875,
      "loss/logits": 0.09785018861293793,
      "loss/reg": 0.025919148698449135,
      "step": 156
    },
    {
      "epoch": 0.019625,
      "grad_norm": 3.999523162841797,
      "grad_norm_var": 1.9169889601133074,
      "learning_rate": 0.0001,
      "loss": 0.858,
      "loss/crossentropy": 2.480532646179199,
      "loss/hidden": 0.515625,
      "loss/logits": 0.08323468267917633,
      "loss/reg": 0.025913061574101448,
      "step": 157
    },
    {
      "epoch": 0.01975,
      "grad_norm": 2.9662230014801025,
      "grad_norm_var": 1.944924590139983,
      "learning_rate": 0.0001,
      "loss": 0.818,
      "loss/crossentropy": 2.459967613220215,
      "loss/hidden": 0.48046875,
      "loss/logits": 0.07847169041633606,
      "loss/reg": 0.025906959548592567,
      "step": 158
    },
    {
      "epoch": 0.019875,
      "grad_norm": 7.074191093444824,
      "grad_norm_var": 2.1836107796529065,
      "learning_rate": 0.0001,
      "loss": 0.9801,
      "loss/crossentropy": 2.2858352661132812,
      "loss/hidden": 0.6484375,
      "loss/logits": 0.07268328964710236,
      "loss/reg": 0.025901462882757187,
      "step": 159
    },
    {
      "epoch": 0.02,
      "grad_norm": 3.6333370208740234,
      "grad_norm_var": 2.193465227977892,
      "learning_rate": 0.0001,
      "loss": 0.9048,
      "loss/crossentropy": 2.4626388549804688,
      "loss/hidden": 0.5546875,
      "loss/logits": 0.09115847945213318,
      "loss/reg": 0.02589540183544159,
      "step": 160
    },
    {
      "epoch": 0.020125,
      "grad_norm": 4.283749103546143,
      "grad_norm_var": 2.1945247126451437,
      "learning_rate": 0.0001,
      "loss": 0.9251,
      "loss/crossentropy": 2.5746121406555176,
      "loss/hidden": 0.58203125,
      "loss/logits": 0.0841851755976677,
      "loss/reg": 0.025888830423355103,
      "step": 161
    },
    {
      "epoch": 0.02025,
      "grad_norm": 3.628138542175293,
      "grad_norm_var": 2.196331220420876,
      "learning_rate": 0.0001,
      "loss": 0.9408,
      "loss/crossentropy": 2.398010730743408,
      "loss/hidden": 0.58203125,
      "loss/logits": 0.09991887211799622,
      "loss/reg": 0.025882074609398842,
      "step": 162
    },
    {
      "epoch": 0.020375,
      "grad_norm": 3.5412559509277344,
      "grad_norm_var": 2.0836172065033507,
      "learning_rate": 0.0001,
      "loss": 0.857,
      "loss/crossentropy": 2.5064220428466797,
      "loss/hidden": 0.50390625,
      "loss/logits": 0.0943569540977478,
      "loss/reg": 0.025876009836792946,
      "step": 163
    },
    {
      "epoch": 0.0205,
      "grad_norm": 3.498668670654297,
      "grad_norm_var": 2.0768887394910185,
      "learning_rate": 0.0001,
      "loss": 0.8917,
      "loss/crossentropy": 2.535529375076294,
      "loss/hidden": 0.52734375,
      "loss/logits": 0.10565976053476334,
      "loss/reg": 0.02587023191154003,
      "step": 164
    },
    {
      "epoch": 0.020625,
      "grad_norm": 3.14025616645813,
      "grad_norm_var": 2.116006039378421,
      "learning_rate": 0.0001,
      "loss": 0.7816,
      "loss/crossentropy": 2.7250640392303467,
      "loss/hidden": 0.453125,
      "loss/logits": 0.06980661302804947,
      "loss/reg": 0.025863803923130035,
      "step": 165
    },
    {
      "epoch": 0.02075,
      "grad_norm": 2.6821706295013428,
      "grad_norm_var": 2.2251478520018773,
      "learning_rate": 0.0001,
      "loss": 0.7558,
      "loss/crossentropy": 2.230567693710327,
      "loss/hidden": 0.43359375,
      "loss/logits": 0.06358660757541656,
      "loss/reg": 0.02585742622613907,
      "step": 166
    },
    {
      "epoch": 0.020875,
      "grad_norm": 3.8048856258392334,
      "grad_norm_var": 2.2158862694911607,
      "learning_rate": 0.0001,
      "loss": 0.7965,
      "loss/crossentropy": 2.69014310836792,
      "loss/hidden": 0.453125,
      "loss/logits": 0.08488957583904266,
      "loss/reg": 0.025850988924503326,
      "step": 167
    },
    {
      "epoch": 0.021,
      "grad_norm": 4.305826187133789,
      "grad_norm_var": 2.2048741298976515,
      "learning_rate": 0.0001,
      "loss": 1.1009,
      "loss/crossentropy": 2.2925851345062256,
      "loss/hidden": 0.73828125,
      "loss/logits": 0.10412566363811493,
      "loss/reg": 0.025845136493444443,
      "step": 168
    },
    {
      "epoch": 0.021125,
      "grad_norm": 4.051706790924072,
      "grad_norm_var": 1.0314628897999674,
      "learning_rate": 0.0001,
      "loss": 0.7756,
      "loss/crossentropy": 2.4515578746795654,
      "loss/hidden": 0.4453125,
      "loss/logits": 0.07186460494995117,
      "loss/reg": 0.02583896555006504,
      "step": 169
    },
    {
      "epoch": 0.02125,
      "grad_norm": 3.3800575733184814,
      "grad_norm_var": 1.022039210437893,
      "learning_rate": 0.0001,
      "loss": 0.798,
      "loss/crossentropy": 2.3387436866760254,
      "loss/hidden": 0.455078125,
      "loss/logits": 0.08458675444126129,
      "loss/reg": 0.025832952931523323,
      "step": 170
    },
    {
      "epoch": 0.021375,
      "grad_norm": 3.069735527038574,
      "grad_norm_var": 0.9991429378891439,
      "learning_rate": 0.0001,
      "loss": 0.7416,
      "loss/crossentropy": 2.660727024078369,
      "loss/hidden": 0.412109375,
      "loss/logits": 0.07125158607959747,
      "loss/reg": 0.025827286764979362,
      "step": 171
    },
    {
      "epoch": 0.0215,
      "grad_norm": 3.5827748775482178,
      "grad_norm_var": 0.9775809993657352,
      "learning_rate": 0.0001,
      "loss": 0.9301,
      "loss/crossentropy": 2.160230875015259,
      "loss/hidden": 0.5703125,
      "loss/logits": 0.10158500075340271,
      "loss/reg": 0.025821613147854805,
      "step": 172
    },
    {
      "epoch": 0.021625,
      "grad_norm": 3.0348143577575684,
      "grad_norm_var": 1.008817027257092,
      "learning_rate": 0.0001,
      "loss": 0.9059,
      "loss/crossentropy": 2.5519795417785645,
      "loss/hidden": 0.5390625,
      "loss/logits": 0.10867513716220856,
      "loss/reg": 0.025815250352025032,
      "step": 173
    },
    {
      "epoch": 0.02175,
      "grad_norm": 3.325514316558838,
      "grad_norm_var": 0.980302655794393,
      "learning_rate": 0.0001,
      "loss": 0.7651,
      "loss/crossentropy": 2.3060855865478516,
      "loss/hidden": 0.43359375,
      "loss/logits": 0.0733788013458252,
      "loss/reg": 0.02580902725458145,
      "step": 174
    },
    {
      "epoch": 0.021875,
      "grad_norm": 2.9402523040771484,
      "grad_norm_var": 0.21740374576502078,
      "learning_rate": 0.0001,
      "loss": 0.859,
      "loss/crossentropy": 2.4109151363372803,
      "loss/hidden": 0.515625,
      "loss/logits": 0.0853327289223671,
      "loss/reg": 0.025802936404943466,
      "step": 175
    },
    {
      "epoch": 0.022,
      "grad_norm": 4.970669746398926,
      "grad_norm_var": 0.354037293260262,
      "learning_rate": 0.0001,
      "loss": 0.9835,
      "loss/crossentropy": 2.9366648197174072,
      "loss/hidden": 0.6171875,
      "loss/logits": 0.10840301960706711,
      "loss/reg": 0.02579565905034542,
      "step": 176
    },
    {
      "epoch": 0.022125,
      "grad_norm": 4.438536643981934,
      "grad_norm_var": 0.37010993593279384,
      "learning_rate": 0.0001,
      "loss": 0.8574,
      "loss/crossentropy": 2.4177615642547607,
      "loss/hidden": 0.51171875,
      "loss/logits": 0.08775197714567184,
      "loss/reg": 0.025788920000195503,
      "step": 177
    },
    {
      "epoch": 0.02225,
      "grad_norm": 4.3905863761901855,
      "grad_norm_var": 0.41060424896311526,
      "learning_rate": 0.0001,
      "loss": 0.8796,
      "loss/crossentropy": 2.320418119430542,
      "loss/hidden": 0.53125,
      "loss/logits": 0.09051363915205002,
      "loss/reg": 0.02578234300017357,
      "step": 178
    },
    {
      "epoch": 0.022375,
      "grad_norm": 3.2044453620910645,
      "grad_norm_var": 0.42189777730298467,
      "learning_rate": 0.0001,
      "loss": 0.785,
      "loss/crossentropy": 2.327108383178711,
      "loss/hidden": 0.447265625,
      "loss/logits": 0.08003242313861847,
      "loss/reg": 0.025775177404284477,
      "step": 179
    },
    {
      "epoch": 0.0225,
      "grad_norm": 3.2016260623931885,
      "grad_norm_var": 0.4319725268587102,
      "learning_rate": 0.0001,
      "loss": 0.8076,
      "loss/crossentropy": 2.6913022994995117,
      "loss/hidden": 0.466796875,
      "loss/logits": 0.08316424489021301,
      "loss/reg": 0.02576799876987934,
      "step": 180
    },
    {
      "epoch": 0.022625,
      "grad_norm": 3.216141939163208,
      "grad_norm_var": 0.42772885748243633,
      "learning_rate": 0.0001,
      "loss": 0.7782,
      "loss/crossentropy": 2.4444668292999268,
      "loss/hidden": 0.447265625,
      "loss/logits": 0.07332297414541245,
      "loss/reg": 0.025760415941476822,
      "step": 181
    },
    {
      "epoch": 0.02275,
      "grad_norm": 3.6005637645721436,
      "grad_norm_var": 0.3680557604441513,
      "learning_rate": 0.0001,
      "loss": 0.8578,
      "loss/crossentropy": 2.2084226608276367,
      "loss/hidden": 0.51171875,
      "loss/logits": 0.0885147675871849,
      "loss/reg": 0.025753989815711975,
      "step": 182
    },
    {
      "epoch": 0.022875,
      "grad_norm": 4.19577693939209,
      "grad_norm_var": 0.3852931468556484,
      "learning_rate": 0.0001,
| "loss": 0.9394, |
| "loss/crossentropy": 2.557783842086792, |
| "loss/hidden": 0.56640625, |
| "loss/logits": 0.11552520841360092, |
| "loss/reg": 0.025747526437044144, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 3.024552822113037, |
| "grad_norm_var": 0.38129301153876227, |
| "learning_rate": 0.0001, |
| "loss": 0.8858, |
| "loss/crossentropy": 2.994615316390991, |
| "loss/hidden": 0.52734375, |
| "loss/logits": 0.10108112543821335, |
| "loss/reg": 0.02574075385928154, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.023125, |
| "grad_norm": 3.3255319595336914, |
| "grad_norm_var": 0.3706833429951111, |
| "learning_rate": 0.0001, |
| "loss": 0.8407, |
| "loss/crossentropy": 2.677245855331421, |
| "loss/hidden": 0.4765625, |
| "loss/logits": 0.10678394883871078, |
| "loss/reg": 0.02573317475616932, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 3.341599464416504, |
| "grad_norm_var": 0.3716797590150757, |
| "learning_rate": 0.0001, |
| "loss": 0.9127, |
| "loss/crossentropy": 2.156444549560547, |
| "loss/hidden": 0.56640625, |
| "loss/logits": 0.0890708938241005, |
| "loss/reg": 0.02572541870176792, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.023375, |
| "grad_norm": 2.925915241241455, |
| "grad_norm_var": 0.3822577484351124, |
| "learning_rate": 0.0001, |
| "loss": 0.8815, |
| "loss/crossentropy": 2.2716755867004395, |
| "loss/hidden": 0.53125, |
| "loss/logits": 0.09304732084274292, |
| "loss/reg": 0.02571748197078705, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 4.192226886749268, |
| "grad_norm_var": 0.40854537365233884, |
| "learning_rate": 0.0001, |
| "loss": 0.8547, |
| "loss/crossentropy": 2.378901720046997, |
| "loss/hidden": 0.5078125, |
| "loss/logits": 0.08977752178907394, |
| "loss/reg": 0.025709524750709534, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.023625, |
| "grad_norm": 5.648179054260254, |
| "grad_norm_var": 0.6443691048121629, |
| "learning_rate": 0.0001, |
| "loss": 1.0032, |
| "loss/crossentropy": 2.4307687282562256, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.10948194563388824, |
| "loss/reg": 0.025701580569148064, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 6.345841884613037, |
| "grad_norm_var": 1.045029826307897, |
| "learning_rate": 0.0001, |
| "loss": 1.0995, |
| "loss/crossentropy": 2.279203414916992, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.09642117470502853, |
| "loss/reg": 0.025694590061903, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.023875, |
| "grad_norm": 4.242865085601807, |
| "grad_norm_var": 0.9782837984014707, |
| "learning_rate": 0.0001, |
| "loss": 1.155, |
| "loss/crossentropy": 2.235325574874878, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.10906486213207245, |
| "loss/reg": 0.025686509907245636, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 6.347895622253418, |
| "grad_norm_var": 1.272032888242258, |
| "learning_rate": 0.0001, |
| "loss": 1.0789, |
| "loss/crossentropy": 2.5386898517608643, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.1463102549314499, |
| "loss/reg": 0.025679145008325577, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.024125, |
| "grad_norm": 3.077846050262451, |
| "grad_norm_var": 1.3268106432816444, |
| "learning_rate": 0.0001, |
| "loss": 0.855, |
| "loss/crossentropy": 2.5889694690704346, |
| "loss/hidden": 0.515625, |
| "loss/logits": 0.08266487717628479, |
| "loss/reg": 0.025671878829598427, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 3.672849416732788, |
| "grad_norm_var": 1.323313109234428, |
| "learning_rate": 0.0001, |
| "loss": 0.8213, |
| "loss/crossentropy": 2.269009590148926, |
| "loss/hidden": 0.4921875, |
| "loss/logits": 0.07244250178337097, |
| "loss/reg": 0.02566472254693508, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.024375, |
| "grad_norm": 3.13712215423584, |
| "grad_norm_var": 1.330492936258482, |
| "learning_rate": 0.0001, |
| "loss": 0.9044, |
| "loss/crossentropy": 2.3420536518096924, |
| "loss/hidden": 0.5390625, |
| "loss/logits": 0.10874692350625992, |
| "loss/reg": 0.02565707452595234, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 5.941372871398926, |
| "grad_norm_var": 1.5194802994125645, |
| "learning_rate": 0.0001, |
| "loss": 1.0268, |
| "loss/crossentropy": 2.245668649673462, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.13362175226211548, |
| "loss/reg": 0.02564912661910057, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.024625, |
| "grad_norm": 2.8778631687164307, |
| "grad_norm_var": 1.5682913914576866, |
| "learning_rate": 0.0001, |
| "loss": 0.933, |
| "loss/crossentropy": 2.4744086265563965, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.10236240178346634, |
| "loss/reg": 0.025642510503530502, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 5.235295295715332, |
| "grad_norm_var": 1.6223942527523605, |
| "learning_rate": 0.0001, |
| "loss": 0.993, |
| "loss/crossentropy": 2.3120462894439697, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.12334179133176804, |
| "loss/reg": 0.025634463876485825, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.024875, |
| "grad_norm": 3.2772397994995117, |
| "grad_norm_var": 1.6781902664948347, |
| "learning_rate": 0.0001, |
| "loss": 0.9134, |
| "loss/crossentropy": 2.2896904945373535, |
| "loss/hidden": 0.5625, |
| "loss/logits": 0.0946369469165802, |
| "loss/reg": 0.025627706199884415, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 4.021130084991455, |
| "grad_norm_var": 1.5889382838258257, |
| "learning_rate": 0.0001, |
| "loss": 0.9467, |
| "loss/crossentropy": 2.3281702995300293, |
| "loss/hidden": 0.5859375, |
| "loss/logits": 0.10459813475608826, |
| "loss/reg": 0.025620225816965103, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.025125, |
| "grad_norm": 3.3705508708953857, |
| "grad_norm_var": 1.5836618344967157, |
| "learning_rate": 0.0001, |
| "loss": 1.0247, |
| "loss/crossentropy": 2.3704278469085693, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.12795141339302063, |
| "loss/reg": 0.025613589212298393, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 5.586423397064209, |
| "grad_norm_var": 1.6331597901730046, |
| "learning_rate": 0.0001, |
| "loss": 1.2703, |
| "loss/crossentropy": 2.3346736431121826, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.17438159883022308, |
| "loss/reg": 0.025606893002986908, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.025375, |
| "grad_norm": 6.558523654937744, |
| "grad_norm_var": 1.7590475344041898, |
| "learning_rate": 0.0001, |
| "loss": 1.0711, |
| "loss/crossentropy": 2.264883518218994, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.10028564184904099, |
| "loss/reg": 0.025599893182516098, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 3.024080991744995, |
| "grad_norm_var": 1.9071946132324447, |
| "learning_rate": 0.0001, |
| "loss": 0.9349, |
| "loss/crossentropy": 2.505457878112793, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.09690214693546295, |
| "loss/reg": 0.025592036545276642, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.025625, |
| "grad_norm": 3.268216133117676, |
| "grad_norm_var": 1.9040994009139534, |
| "learning_rate": 0.0001, |
| "loss": 0.8433, |
| "loss/crossentropy": 2.629786968231201, |
| "loss/hidden": 0.5, |
| "loss/logits": 0.08743932843208313, |
| "loss/reg": 0.025583887472748756, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 5.203437328338623, |
| "grad_norm_var": 1.6853258867355625, |
| "learning_rate": 0.0001, |
| "loss": 0.9278, |
| "loss/crossentropy": 2.368603229522705, |
| "loss/hidden": 0.5625, |
| "loss/logits": 0.10955986380577087, |
| "loss/reg": 0.025574835017323494, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.025875, |
| "grad_norm": 5.106112480163574, |
| "grad_norm_var": 1.725017173963382, |
| "learning_rate": 0.0001, |
| "loss": 0.9881, |
| "loss/crossentropy": 2.554746389389038, |
| "loss/hidden": 0.6171875, |
| "loss/logits": 0.11524944007396698, |
| "loss/reg": 0.02556804195046425, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 8.258187294006348, |
| "grad_norm_var": 2.4602814049521387, |
| "learning_rate": 0.0001, |
| "loss": 1.3675, |
| "loss/crossentropy": 2.4391205310821533, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.17047560214996338, |
| "loss/reg": 0.025560656562447548, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.026125, |
| "grad_norm": 3.8223764896392822, |
| "grad_norm_var": 2.3561294395388566, |
| "learning_rate": 0.0001, |
| "loss": 1.02, |
| "loss/crossentropy": 2.5300426483154297, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.13556598126888275, |
| "loss/reg": 0.025551345199346542, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 3.6237666606903076, |
| "grad_norm_var": 2.36184075461094, |
| "learning_rate": 0.0001, |
| "loss": 1.0218, |
| "loss/crossentropy": 2.3368213176727295, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.10230866074562073, |
| "loss/reg": 0.025544527918100357, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.026375, |
| "grad_norm": 3.1394050121307373, |
| "grad_norm_var": 2.3614203164344407, |
| "learning_rate": 0.0001, |
| "loss": 0.964, |
| "loss/crossentropy": 2.338050603866577, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.09931059181690216, |
| "loss/reg": 0.025535617023706436, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 3.1498186588287354, |
| "grad_norm_var": 2.319283484830568, |
| "learning_rate": 0.0001, |
| "loss": 0.8039, |
| "loss/crossentropy": 2.5108940601348877, |
| "loss/hidden": 0.46875, |
| "loss/logits": 0.07982419431209564, |
| "loss/reg": 0.025528721511363983, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.026625, |
| "grad_norm": 3.334510326385498, |
| "grad_norm_var": 2.2429786468962374, |
| "learning_rate": 0.0001, |
| "loss": 1.1105, |
| "loss/crossentropy": 2.458519697189331, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.1365831047296524, |
| "loss/reg": 0.02552017569541931, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 3.3243789672851562, |
| "grad_norm_var": 2.2516768547287977, |
| "learning_rate": 0.0001, |
| "loss": 1.0016, |
| "loss/crossentropy": 2.2530109882354736, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.11759582161903381, |
| "loss/reg": 0.025511734187602997, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.026875, |
| "grad_norm": 3.3768937587738037, |
| "grad_norm_var": 2.2393156807375245, |
| "learning_rate": 0.0001, |
| "loss": 1.0452, |
| "loss/crossentropy": 2.3267643451690674, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.13390058279037476, |
| "loss/reg": 0.0255054272711277, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 7.391561031341553, |
| "grad_norm_var": 2.841738119885866, |
| "learning_rate": 0.0001, |
| "loss": 1.2263, |
| "loss/crossentropy": 2.285508394241333, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.1510239541530609, |
| "loss/reg": 0.025499247014522552, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.027125, |
| "grad_norm": 3.143969774246216, |
| "grad_norm_var": 2.8781965049841705, |
| "learning_rate": 0.0001, |
| "loss": 0.9421, |
| "loss/crossentropy": 2.6111316680908203, |
| "loss/hidden": 0.5703125, |
| "loss/logits": 0.11691074818372726, |
| "loss/reg": 0.025491848587989807, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 4.989267826080322, |
| "grad_norm_var": 2.8105564664802234, |
| "learning_rate": 0.0001, |
| "loss": 0.9104, |
| "loss/crossentropy": 2.7856109142303467, |
| "loss/hidden": 0.546875, |
| "loss/logits": 0.10870229452848434, |
| "loss/reg": 0.025485411286354065, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.027375, |
| "grad_norm": 8.867380142211914, |
| "grad_norm_var": 3.802177537112387, |
| "learning_rate": 0.0001, |
| "loss": 1.2976, |
| "loss/crossentropy": 2.414778470993042, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.199102982878685, |
| "loss/reg": 0.025478005409240723, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 3.949193239212036, |
| "grad_norm_var": 3.665725599495321, |
| "learning_rate": 0.0001, |
| "loss": 1.0781, |
| "loss/crossentropy": 2.3898277282714844, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.14368270337581635, |
| "loss/reg": 0.02547168917953968, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.027625, |
| "grad_norm": 7.980153560638428, |
| "grad_norm_var": 4.202985170085262, |
| "learning_rate": 0.0001, |
| "loss": 1.4005, |
| "loss/crossentropy": 2.0598959922790527, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.15361803770065308, |
| "loss/reg": 0.02546495571732521, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 4.118736743927002, |
| "grad_norm_var": 4.234989890674561, |
| "learning_rate": 0.0001, |
| "loss": 0.9084, |
| "loss/crossentropy": 2.2568750381469727, |
| "loss/hidden": 0.55859375, |
| "loss/logits": 0.09525588899850845, |
| "loss/reg": 0.02545757219195366, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.027875, |
| "grad_norm": 3.730299711227417, |
| "grad_norm_var": 4.306033514824207, |
| "learning_rate": 0.0001, |
| "loss": 0.9394, |
| "loss/crossentropy": 2.7180914878845215, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.11064038425683975, |
| "loss/reg": 0.025450890883803368, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 3.138925552368164, |
| "grad_norm_var": 3.557911666554559, |
| "learning_rate": 0.0001, |
| "loss": 0.8406, |
| "loss/crossentropy": 2.3719072341918945, |
| "loss/hidden": 0.5078125, |
| "loss/logits": 0.07834647595882416, |
| "loss/reg": 0.02544352412223816, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.028125, |
| "grad_norm": 3.478994846343994, |
| "grad_norm_var": 3.593674795871404, |
| "learning_rate": 0.0001, |
| "loss": 0.9072, |
| "loss/crossentropy": 2.4101219177246094, |
| "loss/hidden": 0.56640625, |
| "loss/logits": 0.0863800048828125, |
| "loss/reg": 0.025437019765377045, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 5.091615676879883, |
| "grad_norm_var": 3.572291640880083, |
| "learning_rate": 0.0001, |
| "loss": 0.9864, |
| "loss/crossentropy": 2.2258856296539307, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.09143185615539551, |
| "loss/reg": 0.025430168956518173, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.028375, |
| "grad_norm": 3.616190195083618, |
| "grad_norm_var": 3.4991896025782796, |
| "learning_rate": 0.0001, |
| "loss": 0.9246, |
| "loss/crossentropy": 2.765105724334717, |
| "loss/hidden": 0.5703125, |
| "loss/logits": 0.10003923624753952, |
| "loss/reg": 0.025423482060432434, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 4.406581878662109, |
| "grad_norm_var": 3.364516245493509, |
| "learning_rate": 0.0001, |
| "loss": 0.98, |
| "loss/crossentropy": 2.41001033782959, |
| "loss/hidden": 0.6171875, |
| "loss/logits": 0.10860306769609451, |
| "loss/reg": 0.025416266173124313, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.028625, |
| "grad_norm": 3.003995418548584, |
| "grad_norm_var": 3.4280449285692023, |
| "learning_rate": 0.0001, |
| "loss": 0.9759, |
| "loss/crossentropy": 2.518749952316284, |
| "loss/hidden": 0.62109375, |
| "loss/logits": 0.10075733810663223, |
| "loss/reg": 0.025409165769815445, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 4.073727130889893, |
| "grad_norm_var": 3.3356380380428576, |
| "learning_rate": 0.0001, |
| "loss": 0.8881, |
| "loss/crossentropy": 2.6824846267700195, |
| "loss/hidden": 0.5390625, |
| "loss/logits": 0.09498724341392517, |
| "loss/reg": 0.025402268394827843, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.028875, |
| "grad_norm": 3.8958635330200195, |
| "grad_norm_var": 3.2645611787952435, |
| "learning_rate": 0.0001, |
| "loss": 0.8794, |
| "loss/crossentropy": 2.684971332550049, |
| "loss/hidden": 0.53515625, |
| "loss/logits": 0.09024453163146973, |
| "loss/reg": 0.025395380333065987, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 3.5619406700134277, |
| "grad_norm_var": 2.796506014438906, |
| "learning_rate": 0.0001, |
| "loss": 0.957, |
| "loss/crossentropy": 2.7883284091949463, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.10159540176391602, |
| "loss/reg": 0.02538810484111309, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.029125, |
| "grad_norm": 12.658771514892578, |
| "grad_norm_var": 6.809983669727001, |
| "learning_rate": 0.0001, |
| "loss": 1.1843, |
| "loss/crossentropy": 2.537827253341675, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.14144758880138397, |
| "loss/reg": 0.02538110502064228, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 8.465475082397461, |
| "grad_norm_var": 7.543990683504188, |
| "learning_rate": 0.0001, |
| "loss": 1.2958, |
| "loss/crossentropy": 2.6715972423553467, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.16701380908489227, |
| "loss/reg": 0.02537420578300953, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.029375, |
| "grad_norm": 6.49767541885376, |
| "grad_norm_var": 6.75275709893707, |
| "learning_rate": 0.0001, |
| "loss": 1.0826, |
| "loss/crossentropy": 2.316210985183716, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.11794352531433105, |
| "loss/reg": 0.025367144495248795, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 4.668674945831299, |
| "grad_norm_var": 6.674304000957216, |
| "learning_rate": 0.0001, |
| "loss": 1.0642, |
| "loss/crossentropy": 1.956210732460022, |
| "loss/hidden": 0.69921875, |
| "loss/logits": 0.11135473847389221, |
| "loss/reg": 0.02535996399819851, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.029625, |
| "grad_norm": 6.132915019989014, |
| "grad_norm_var": 6.19031909782113, |
| "learning_rate": 0.0001, |
| "loss": 1.0462, |
| "loss/crossentropy": 2.6552460193634033, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.12862679362297058, |
| "loss/reg": 0.025352442637085915, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 4.94125509262085, |
| "grad_norm_var": 6.132251305092321, |
| "learning_rate": 0.0001, |
| "loss": 1.2795, |
| "loss/crossentropy": 2.204523801803589, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.17835499346256256, |
| "loss/reg": 0.025344664230942726, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.029875, |
| "grad_norm": 3.7683677673339844, |
| "grad_norm_var": 6.125464850588167, |
| "learning_rate": 0.0001, |
| "loss": 0.985, |
| "loss/crossentropy": 2.652397632598877, |
| "loss/hidden": 0.625, |
| "loss/logits": 0.10665792226791382, |
| "loss/reg": 0.0253366157412529, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 4.732015609741211, |
| "grad_norm_var": 5.870172361717241, |
| "learning_rate": 0.0001, |
| "loss": 1.1018, |
| "loss/crossentropy": 2.544055938720703, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.13754525780677795, |
| "loss/reg": 0.02532930299639702, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.030125, |
| "grad_norm": 4.046056270599365, |
| "grad_norm_var": 5.761120866273571, |
| "learning_rate": 0.0001, |
| "loss": 0.9396, |
| "loss/crossentropy": 2.6015427112579346, |
| "loss/hidden": 0.5859375, |
| "loss/logits": 0.10041546076536179, |
| "loss/reg": 0.025322062894701958, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 3.066027879714966, |
| "grad_norm_var": 6.052926687728684, |
| "learning_rate": 0.0001, |
| "loss": 1.0668, |
| "loss/crossentropy": 2.4402151107788086, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.12614867091178894, |
| "loss/reg": 0.0253145694732666, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.030375, |
| "grad_norm": 3.168888807296753, |
| "grad_norm_var": 6.1536859873832555, |
| "learning_rate": 0.0001, |
| "loss": 0.9413, |
| "loss/crossentropy": 2.5689940452575684, |
| "loss/hidden": 0.58984375, |
| "loss/logits": 0.09841729700565338, |
| "loss/reg": 0.02530776336789131, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 3.806450366973877, |
| "grad_norm_var": 6.229122059899357, |
| "learning_rate": 0.0001, |
| "loss": 1.021, |
| "loss/crossentropy": 2.2438809871673584, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.13515594601631165, |
| "loss/reg": 0.02530042454600334, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.030625, |
| "grad_norm": 4.313736915588379, |
| "grad_norm_var": 5.982441934425083, |
| "learning_rate": 0.0001, |
| "loss": 1.0307, |
| "loss/crossentropy": 2.5067784786224365, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.1253766119480133, |
| "loss/reg": 0.025293124839663506, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 4.419394016265869, |
| "grad_norm_var": 5.942040082684442, |
| "learning_rate": 0.0001, |
| "loss": 0.8957, |
| "loss/crossentropy": 2.4725239276885986, |
| "loss/hidden": 0.55078125, |
| "loss/logits": 0.09201550483703613, |
| "loss/reg": 0.02528616413474083, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.030875, |
| "grad_norm": 3.709151268005371, |
| "grad_norm_var": 5.975041529003943, |
| "learning_rate": 0.0001, |
| "loss": 0.9152, |
| "loss/crossentropy": 2.413250207901001, |
| "loss/hidden": 0.56640625, |
| "loss/logits": 0.09596529603004456, |
| "loss/reg": 0.025278838351368904, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 3.4139935970306396, |
| "grad_norm_var": 6.007189625317282, |
| "learning_rate": 0.0001, |
| "loss": 1.0268, |
| "loss/crossentropy": 2.1580560207366943, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.11786328256130219, |
| "loss/reg": 0.025271562859416008, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.031125, |
| "grad_norm": 6.089378833770752, |
| "grad_norm_var": 2.0950588257899443, |
| "learning_rate": 0.0001, |
| "loss": 1.0999, |
| "loss/crossentropy": 2.5466785430908203, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.1441519856452942, |
| "loss/reg": 0.025264522060751915, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 4.001245498657227, |
| "grad_norm_var": 1.1007847740596406, |
| "learning_rate": 0.0001, |
| "loss": 0.9263, |
| "loss/crossentropy": 2.433518171310425, |
| "loss/hidden": 0.578125, |
| "loss/logits": 0.09559185057878494, |
| "loss/reg": 0.025257611647248268, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.031375, |
| "grad_norm": 4.982790946960449, |
| "grad_norm_var": 0.8252532202355399, |
| "learning_rate": 0.0001, |
| "loss": 1.1439, |
| "loss/crossentropy": 2.3542721271514893, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.16091328859329224, |
| "loss/reg": 0.025250321254134178, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 3.6227378845214844, |
| "grad_norm_var": 0.8462248829388517, |
| "learning_rate": 0.0001, |
| "loss": 1.1789, |
| "loss/crossentropy": 2.1333043575286865, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.1374010145664215, |
| "loss/reg": 0.025242896750569344, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.031625, |
| "grad_norm": 3.415194511413574, |
| "grad_norm_var": 0.6304077366129337, |
| "learning_rate": 0.0001, |
| "loss": 1.0916, |
| "loss/crossentropy": 2.23525071144104, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.1243971735239029, |
| "loss/reg": 0.025235962122678757, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 3.7671289443969727, |
| "grad_norm_var": 0.5838590152321442, |
| "learning_rate": 0.0001, |
| "loss": 0.908, |
| "loss/crossentropy": 2.254054546356201, |
| "loss/hidden": 0.56640625, |
| "loss/logits": 0.0893106684088707, |
| "loss/reg": 0.025228681042790413, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.031875, |
| "grad_norm": 4.164376735687256, |
| "grad_norm_var": 0.5803655311074387, |
| "learning_rate": 0.0001, |
| "loss": 0.9548, |
| "loss/crossentropy": 2.4128520488739014, |
| "loss/hidden": 0.5859375, |
| "loss/logits": 0.11668873578310013, |
| "loss/reg": 0.025221774354577065, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 3.6412672996520996, |
| "grad_norm_var": 0.5547959425019464, |
| "learning_rate": 0.0001, |
| "loss": 1.0098, |
| "loss/crossentropy": 2.7745320796966553, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.12875014543533325, |
| "loss/reg": 0.025214577093720436, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.032125, |
| "grad_norm": 2.900871515274048, |
| "grad_norm_var": 0.6261772657264061, |
| "learning_rate": 0.0001, |
| "loss": 0.9348, |
| "loss/crossentropy": 2.286968469619751, |
| "loss/hidden": 0.578125, |
| "loss/logits": 0.10455667227506638, |
| "loss/reg": 0.02520710788667202, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 3.6853647232055664, |
| "grad_norm_var": 0.5808564529014478, |
| "learning_rate": 0.0001, |
| "loss": 0.9013, |
| "loss/crossentropy": 2.4515562057495117, |
| "loss/hidden": 0.55078125, |
| "loss/logits": 0.09847953915596008, |
| "loss/reg": 0.025199349969625473, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.032375, |
| "grad_norm": 2.9091956615448, |
| "grad_norm_var": 0.6119059054418109, |
| "learning_rate": 0.0001, |
| "loss": 0.8784, |
| "loss/crossentropy": 2.163628339767456, |
| "loss/hidden": 0.5390625, |
| "loss/logits": 0.08743810653686523, |
| "loss/reg": 0.025191258639097214, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 2.9488673210144043, |
| "grad_norm_var": 0.6717290813098125, |
| "learning_rate": 0.0001, |
| "loss": 0.9195, |
| "loss/crossentropy": 2.3084585666656494, |
| "loss/hidden": 0.5546875, |
| "loss/logits": 0.11298206448554993, |
| "loss/reg": 0.025183765217661858, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.032625, |
| "grad_norm": 3.4345953464508057, |
| "grad_norm_var": 0.6684943296663647, |
| "learning_rate": 0.0001, |
| "loss": 0.8851, |
| "loss/crossentropy": 2.19558048248291, |
| "loss/hidden": 0.546875, |
| "loss/logits": 0.08649411797523499, |
| "loss/reg": 0.02517561800777912, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 28.159074783325195, |
| "grad_norm_var": 37.79188620028727, |
| "learning_rate": 0.0001, |
| "loss": 1.1781, |
| "loss/crossentropy": 2.839057207107544, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.13732999563217163, |
| "loss/reg": 0.025168145075440407, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.032875, |
| "grad_norm": 2.734104871749878, |
| "grad_norm_var": 38.05849364469687, |
| "learning_rate": 0.0001, |
| "loss": 0.8892, |
| "loss/crossentropy": 2.4754340648651123, |
| "loss/hidden": 0.54296875, |
| "loss/logits": 0.09462890028953552, |
| "loss/reg": 0.025160137563943863, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 4.089654922485352, |
| "grad_norm_var": 37.922354469790704, |
| "learning_rate": 0.0001, |
| "loss": 0.9326, |
| "loss/crossentropy": 2.476450204849243, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.1068512499332428, |
| "loss/reg": 0.025151856243610382, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.033125, |
| "grad_norm": 3.838066577911377, |
| "grad_norm_var": 37.99741003814723, |
| "learning_rate": 0.0001, |
| "loss": 1.0909, |
| "loss/crossentropy": 2.114243268966675, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.10510236769914627, |
| "loss/reg": 0.025143325328826904, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 3.0674874782562256, |
| "grad_norm_var": 38.19410456778619, |
| "learning_rate": 0.0001, |
| "loss": 0.8898, |
| "loss/crossentropy": 2.6426563262939453, |
| "loss/hidden": 0.54296875, |
| "loss/logits": 0.09550425410270691, |
| "loss/reg": 0.025135278701782227, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.033375, |
| "grad_norm": 7.777696132659912, |
| "grad_norm_var": 38.64421623432597, |
| "learning_rate": 0.0001, |
| "loss": 1.0482, |
| "loss/crossentropy": 2.640554666519165, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.11728060245513916, |
| "loss/reg": 0.025126684457063675, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 3.6375653743743896, |
| "grad_norm_var": 38.64099364344996, |
| "learning_rate": 0.0001, |
| "loss": 0.9888, |
| "loss/crossentropy": 2.227538824081421, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.1008879542350769, |
| "loss/reg": 0.0251180287450552, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.033625, |
| "grad_norm": 4.305724620819092, |
| "grad_norm_var": 38.4714335626231, |
| "learning_rate": 0.0001, |
| "loss": 1.1268, |
| "loss/crossentropy": 2.556248903274536, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.1374487727880478, |
| "loss/reg": 0.02511041797697544, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 3.658052921295166, |
| "grad_norm_var": 38.4947077039297, |
| "learning_rate": 0.0001, |
| "loss": 0.9616, |
| "loss/crossentropy": 2.437307596206665, |
| "loss/hidden": 0.60546875, |
| "loss/logits": 0.1050904244184494, |
| "loss/reg": 0.025102900341153145, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.033875, |
| "grad_norm": 6.534849643707275, |
| "grad_norm_var": 38.483973576312195, |
| "learning_rate": 0.0001, |
| "loss": 1.2119, |
| "loss/crossentropy": 2.36796236038208, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.1445278525352478, |
| "loss/reg": 0.025095317512750626, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 5.486751079559326, |
| "grad_norm_var": 38.24988881420663, |
| "learning_rate": 0.0001, |
| "loss": 1.0337, |
| "loss/crossentropy": 2.517333984375, |
| "loss/hidden": 0.66015625, |
| "loss/logits": 0.12263435125350952, |
| "loss/reg": 0.025087665766477585, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.034125, |
| "grad_norm": 3.231940746307373, |
| "grad_norm_var": 38.13878485092763, |
| "learning_rate": 0.0001, |
| "loss": 1.0586, |
| "loss/crossentropy": 2.6812305450439453, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.13204006850719452, |
| "loss/reg": 0.025080092251300812, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 4.71685791015625, |
| "grad_norm_var": 37.942827296069424, |
| "learning_rate": 0.0001, |
| "loss": 1.0409, |
| "loss/crossentropy": 2.093151807785034, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.11049959808588028, |
| "loss/reg": 0.02507280558347702, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.034375, |
| "grad_norm": 3.7086901664733887, |
| "grad_norm_var": 37.68973967522894, |
| "learning_rate": 0.0001, |
| "loss": 0.9144, |
| "loss/crossentropy": 2.2392518520355225, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.08954055607318878, |
| "loss/reg": 0.02506582997739315, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 5.537543773651123, |
| "grad_norm_var": 37.1561912525544, |
| "learning_rate": 0.0001, |
| "loss": 1.3586, |
| "loss/crossentropy": 1.8622019290924072, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.14318415522575378, |
| "loss/reg": 0.025058824568986893, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.034625, |
| "grad_norm": 2.898383617401123, |
| "grad_norm_var": 37.34827444255352, |
| "learning_rate": 0.0001, |
| "loss": 0.9413, |
| "loss/crossentropy": 2.4155869483947754, |
| "loss/hidden": 0.578125, |
| "loss/logits": 0.11263684928417206, |
| "loss/reg": 0.025052132084965706, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 2.803711175918579, |
| "grad_norm_var": 2.0625830733920933, |
| "learning_rate": 0.0001, |
| "loss": 0.9404, |
| "loss/crossentropy": 2.2058048248291016, |
| "loss/hidden": 0.59765625, |
| "loss/logits": 0.09232598543167114, |
| "loss/reg": 0.02504453808069229, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.034875, |
| "grad_norm": 3.180114507675171, |
| "grad_norm_var": 1.9847680294286107, |
| "learning_rate": 0.0001, |
| "loss": 0.9311, |
| "loss/crossentropy": 2.6069722175598145, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.10655493289232254, |
| "loss/reg": 0.02503693662583828, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 5.609209060668945, |
| "grad_norm_var": 2.0906055341906256, |
| "learning_rate": 0.0001, |
| "loss": 1.0675, |
| "loss/crossentropy": 2.5023670196533203, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.13361594080924988, |
| "loss/reg": 0.025029515847563744, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.035125, |
| "grad_norm": 5.165370464324951, |
| "grad_norm_var": 2.105772188928517, |
| "learning_rate": 0.0001, |
| "loss": 1.1103, |
| "loss/crossentropy": 2.5107007026672363, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.12183347344398499, |
| "loss/reg": 0.025022249668836594, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 4.322115421295166, |
| "grad_norm_var": 1.9716269568170388, |
| "learning_rate": 0.0001, |
| "loss": 0.95, |
| "loss/crossentropy": 2.483142137527466, |
| "loss/hidden": 0.5859375, |
| "loss/logits": 0.11390361189842224, |
| "loss/reg": 0.025014575570821762, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.035375, |
| "grad_norm": 4.432065486907959, |
| "grad_norm_var": 1.2250959918754403, |
| "learning_rate": 0.0001, |
| "loss": 0.9886, |
| "loss/crossentropy": 2.1206424236297607, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.10568062961101532, |
| "loss/reg": 0.025007015094161034, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 4.0429558753967285, |
| "grad_norm_var": 1.198112283867575, |
| "learning_rate": 0.0001, |
| "loss": 1.0004, |
| "loss/crossentropy": 2.6465060710906982, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.11765305697917938, |
| "loss/reg": 0.024997249245643616, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.035625, |
| "grad_norm": 3.551006555557251, |
| "grad_norm_var": 1.23838358717468, |
| "learning_rate": 0.0001, |
| "loss": 0.9764, |
| "loss/crossentropy": 2.4274182319641113, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.11320526152849197, |
| "loss/reg": 0.02498740889132023, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 2.874697685241699, |
| "grad_norm_var": 1.344305852802292, |
| "learning_rate": 0.0001, |
| "loss": 1.0133, |
| "loss/crossentropy": 2.512948751449585, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.11510799080133438, |
| "loss/reg": 0.02497740648686886, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.035875, |
| "grad_norm": 3.4613194465637207, |
| "grad_norm_var": 1.0008425760722102, |
| "learning_rate": 0.0001, |
| "loss": 0.9274, |
| "loss/crossentropy": 2.4450602531433105, |
| "loss/hidden": 0.5703125, |
| "loss/logits": 0.10743874311447144, |
| "loss/reg": 0.024968616664409637, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 3.452150344848633, |
| "grad_norm_var": 0.8735820507410946, |
| "learning_rate": 0.0001, |
| "loss": 0.952, |
| "loss/crossentropy": 2.304729700088501, |
| "loss/hidden": 0.578125, |
| "loss/logits": 0.1242954432964325, |
| "loss/reg": 0.024959923699498177, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.036125, |
| "grad_norm": 3.3603436946868896, |
| "grad_norm_var": 0.8625457550688983, |
| "learning_rate": 0.0001, |
| "loss": 0.9447, |
| "loss/crossentropy": 2.425712823867798, |
| "loss/hidden": 0.58984375, |
| "loss/logits": 0.1053181067109108, |
| "loss/reg": 0.024950530380010605, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 3.3667662143707275, |
| "grad_norm_var": 0.8374846368179986, |
| "learning_rate": 0.0001, |
| "loss": 0.9067, |
| "loss/crossentropy": 2.2623162269592285, |
| "loss/hidden": 0.55859375, |
| "loss/logits": 0.098650723695755, |
| "loss/reg": 0.02494119666516781, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.036375, |
| "grad_norm": 2.7996857166290283, |
| "grad_norm_var": 0.9075153562133816, |
| "learning_rate": 0.0001, |
| "loss": 0.9594, |
| "loss/crossentropy": 2.370417356491089, |
| "loss/hidden": 0.59765625, |
| "loss/logits": 0.11243726313114166, |
| "loss/reg": 0.024933209642767906, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 2.9999380111694336, |
| "grad_norm_var": 0.7233017120786666, |
| "learning_rate": 0.0001, |
| "loss": 1.0396, |
| "loss/crossentropy": 2.286776542663574, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.12626898288726807, |
| "loss/reg": 0.02492516115307808, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.036625, |
| "grad_norm": 3.880777597427368, |
| "grad_norm_var": 0.685825505757979, |
| "learning_rate": 0.0001, |
| "loss": 0.9764, |
| "loss/crossentropy": 2.613823652267456, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.11395551264286041, |
| "loss/reg": 0.02491726726293564, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 4.2950568199157715, |
| "grad_norm_var": 0.6453385025094112, |
| "learning_rate": 0.0001, |
| "loss": 1.0535, |
| "loss/crossentropy": 2.6882171630859375, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.13644324243068695, |
| "loss/reg": 0.02490835078060627, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.036875, |
| "grad_norm": 3.61423659324646, |
| "grad_norm_var": 0.6212598300915251, |
| "learning_rate": 0.0001, |
| "loss": 1.155, |
| "loss/crossentropy": 2.0867068767547607, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.13649392127990723, |
| "loss/reg": 0.024899456650018692, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 3.9245386123657227, |
| "grad_norm_var": 0.3982568915415859, |
| "learning_rate": 0.0001, |
| "loss": 0.951, |
| "loss/crossentropy": 2.703791618347168, |
| "loss/hidden": 0.5859375, |
| "loss/logits": 0.11614765971899033, |
| "loss/reg": 0.02489159069955349, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.037125, |
| "grad_norm": 4.020047187805176, |
| "grad_norm_var": 0.2597397925732442, |
| "learning_rate": 0.0001, |
| "loss": 0.9707, |
| "loss/crossentropy": 2.429636001586914, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.1086028665304184, |
| "loss/reg": 0.024883201345801353, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 4.086516857147217, |
| "grad_norm_var": 0.24209119003572066, |
| "learning_rate": 0.0001, |
| "loss": 1.0769, |
| "loss/crossentropy": 2.8581056594848633, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.14844708144664764, |
| "loss/reg": 0.024874594062566757, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.037375, |
| "grad_norm": 2.956085443496704, |
| "grad_norm_var": 0.22141400399237127, |
| "learning_rate": 0.0001, |
| "loss": 0.9903, |
| "loss/crossentropy": 2.4802041053771973, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.13230839371681213, |
| "loss/reg": 0.024866636842489243, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 3.165804386138916, |
| "grad_norm_var": 0.2110158468875736, |
| "learning_rate": 0.0001, |
| "loss": 1.0347, |
| "loss/crossentropy": 2.4370639324188232, |
| "loss/hidden": 0.66015625, |
| "loss/logits": 0.1259341686964035, |
| "loss/reg": 0.024857714772224426, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.037625, |
| "grad_norm": 3.0942695140838623, |
| "grad_norm_var": 0.22022059823099174, |
| "learning_rate": 0.0001, |
| "loss": 0.9747, |
| "loss/crossentropy": 2.674190044403076, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.12467385828495026, |
| "loss/reg": 0.024848785251379013, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 2.949205160140991, |
| "grad_norm_var": 0.21475779393049, |
| "learning_rate": 0.0001, |
| "loss": 0.8898, |
| "loss/crossentropy": 2.491746664047241, |
| "loss/hidden": 0.55078125, |
| "loss/logits": 0.09065801650285721, |
| "loss/reg": 0.02483983524143696, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.037875, |
| "grad_norm": 3.549431324005127, |
| "grad_norm_var": 0.21520952048913009, |
| "learning_rate": 0.0001, |
| "loss": 0.9197, |
| "loss/crossentropy": 2.7578208446502686, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.09712585806846619, |
| "loss/reg": 0.024831857532262802, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 3.0621931552886963, |
| "grad_norm_var": 0.22562503941355938, |
| "learning_rate": 0.0001, |
| "loss": 0.894, |
| "loss/crossentropy": 2.440351724624634, |
| "loss/hidden": 0.546875, |
| "loss/logits": 0.09888219833374023, |
| "loss/reg": 0.02482294850051403, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.038125, |
| "grad_norm": 3.334942579269409, |
| "grad_norm_var": 0.22595311715915212, |
| "learning_rate": 0.0001, |
| "loss": 1.1509, |
| "loss/crossentropy": 2.378168821334839, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.15670377016067505, |
| "loss/reg": 0.024814244359731674, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 3.3253893852233887, |
| "grad_norm_var": 0.22648465837488155, |
| "learning_rate": 0.0001, |
| "loss": 0.9744, |
| "loss/crossentropy": 2.4409470558166504, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.12483348697423935, |
| "loss/reg": 0.024805361405014992, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.038375, |
| "grad_norm": 3.1687541007995605, |
| "grad_norm_var": 0.20343285009947346, |
| "learning_rate": 0.0001, |
| "loss": 0.8998, |
| "loss/crossentropy": 2.3339829444885254, |
| "loss/hidden": 0.55078125, |
| "loss/logits": 0.10110392421483994, |
| "loss/reg": 0.02479635737836361, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 4.096661567687988, |
| "grad_norm_var": 0.21071919009248533, |
| "learning_rate": 0.0001, |
| "loss": 1.0154, |
| "loss/crossentropy": 2.468813180923462, |
| "loss/hidden": 0.64453125, |
| "loss/logits": 0.12295258045196533, |
| "loss/reg": 0.024787236005067825, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.038625, |
| "grad_norm": 3.097642660140991, |
| "grad_norm_var": 0.2127095324618587, |
| "learning_rate": 0.0001, |
| "loss": 0.8772, |
| "loss/crossentropy": 2.5380513668060303, |
| "loss/hidden": 0.53125, |
| "loss/logits": 0.09821398556232452, |
| "loss/reg": 0.024777989834547043, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 4.647727012634277, |
| "grad_norm_var": 0.25863060133758775, |
| "learning_rate": 0.0001, |
| "loss": 0.9162, |
| "loss/crossentropy": 2.0996336936950684, |
| "loss/hidden": 0.5625, |
| "loss/logits": 0.10599061101675034, |
| "loss/reg": 0.024768849834799767, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.038875, |
| "grad_norm": 3.627246141433716, |
| "grad_norm_var": 0.25882920418562944, |
| "learning_rate": 0.0001, |
| "loss": 1.0161, |
| "loss/crossentropy": 2.2958993911743164, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.11611323803663254, |
| "loss/reg": 0.024759870022535324, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 3.7236764430999756, |
| "grad_norm_var": 0.2501591619921593, |
| "learning_rate": 0.0001, |
| "loss": 1.1308, |
| "loss/crossentropy": 2.4322752952575684, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.1410805881023407, |
| "loss/reg": 0.024751078337430954, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.039125, |
| "grad_norm": 3.998807430267334, |
| "grad_norm_var": 0.24869789076210413, |
| "learning_rate": 0.0001, |
| "loss": 1.0749, |
| "loss/crossentropy": 2.374758720397949, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.1243831142783165, |
| "loss/reg": 0.024742012843489647, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 4.315618515014648, |
| "grad_norm_var": 0.2701154191311919, |
| "learning_rate": 0.0001, |
| "loss": 0.9542, |
| "loss/crossentropy": 2.290766477584839, |
| "loss/hidden": 0.60546875, |
| "loss/logits": 0.10138154029846191, |
| "loss/reg": 0.02473386563360691, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.039375, |
| "grad_norm": 11.789870262145996, |
| "grad_norm_var": 4.498354875640615, |
| "learning_rate": 0.0001, |
| "loss": 1.6276, |
| "loss/crossentropy": 2.391585350036621, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.13034726679325104, |
| "loss/reg": 0.02472575195133686, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 3.499288320541382, |
| "grad_norm_var": 4.465581075155145, |
| "learning_rate": 0.0001, |
| "loss": 0.9478, |
| "loss/crossentropy": 2.4527926445007324, |
| "loss/hidden": 0.5703125, |
| "loss/logits": 0.13031822443008423, |
| "loss/reg": 0.024717645719647408, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.039625, |
| "grad_norm": 3.0139851570129395, |
| "grad_norm_var": 4.476536239649594, |
| "learning_rate": 0.0001, |
| "loss": 0.9555, |
| "loss/crossentropy": 2.506457805633545, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.12640972435474396, |
| "loss/reg": 0.02470862865447998, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 3.671523094177246, |
| "grad_norm_var": 4.4007183053584145, |
| "learning_rate": 0.0001, |
| "loss": 1.0902, |
| "loss/crossentropy": 2.1730313301086426, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.13228294253349304, |
| "loss/reg": 0.024700626730918884, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.039875, |
| "grad_norm": 3.453159809112549, |
| "grad_norm_var": 4.408623714873802, |
| "learning_rate": 0.0001, |
| "loss": 0.9518, |
| "loss/crossentropy": 2.7452075481414795, |
| "loss/hidden": 0.58984375, |
| "loss/logits": 0.11500866711139679, |
| "loss/reg": 0.02469259686768055, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 8.18558406829834, |
| "grad_norm_var": 5.330579476502794, |
| "learning_rate": 0.0001, |
| "loss": 1.2593, |
| "loss/crossentropy": 2.295231819152832, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.2312176525592804, |
| "loss/reg": 0.024684064090251923, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.040125, |
| "grad_norm": 4.58513879776001, |
| "grad_norm_var": 5.245000173569268, |
| "learning_rate": 0.0001, |
| "loss": 1.0428, |
| "loss/crossentropy": 2.3456668853759766, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.12027865648269653, |
| "loss/reg": 0.024676108732819557, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 3.6148953437805176, |
| "grad_norm_var": 5.20441494141252, |
| "learning_rate": 0.0001, |
| "loss": 0.885, |
| "loss/crossentropy": 2.3615198135375977, |
| "loss/hidden": 0.53515625, |
| "loss/logits": 0.10311760008335114, |
| "loss/reg": 0.02466769702732563, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.040375, |
| "grad_norm": 4.596187114715576, |
| "grad_norm_var": 5.072570501388933, |
| "learning_rate": 0.0001, |
| "loss": 1.3642, |
| "loss/crossentropy": 1.9407044649124146, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.20351722836494446, |
| "loss/reg": 0.02465960383415222, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 4.305622577667236, |
| "grad_norm_var": 5.060723771971758, |
| "learning_rate": 0.0001, |
| "loss": 1.2854, |
| "loss/crossentropy": 2.1887283325195312, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.20682212710380554, |
| "loss/reg": 0.024651024490594864, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.040625, |
| "grad_norm": 3.1052956581115723, |
| "grad_norm_var": 5.059160883569213, |
| "learning_rate": 0.0001, |
| "loss": 0.9277, |
| "loss/crossentropy": 2.7552340030670166, |
| "loss/hidden": 0.56640625, |
| "loss/logits": 0.11491444706916809, |
| "loss/reg": 0.024642454460263252, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 3.902174234390259, |
| "grad_norm_var": 5.092472426369553, |
| "learning_rate": 0.0001, |
| "loss": 1.0922, |
| "loss/crossentropy": 2.328878402709961, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.13887017965316772, |
| "loss/reg": 0.024634363129734993, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.040875, |
| "grad_norm": 3.144326686859131, |
| "grad_norm_var": 5.168830163995767, |
| "learning_rate": 0.0001, |
| "loss": 0.9028, |
| "loss/crossentropy": 2.765711545944214, |
| "loss/hidden": 0.55078125, |
| "loss/logits": 0.10574661940336227, |
| "loss/reg": 0.02462565153837204, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 4.085259914398193, |
| "grad_norm_var": 5.136846736797656, |
| "learning_rate": 0.0001, |
| "loss": 1.1366, |
| "loss/crossentropy": 2.2278430461883545, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.13655412197113037, |
| "loss/reg": 0.0246175117790699, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.041125, |
| "grad_norm": 3.578554391860962, |
| "grad_norm_var": 5.180404969237465, |
| "learning_rate": 0.0001, |
| "loss": 1.0227, |
| "loss/crossentropy": 2.7795305252075195, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.13600832223892212, |
| "loss/reg": 0.024609515443444252, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 3.7418434619903564, |
| "grad_norm_var": 5.219134310055354, |
| "learning_rate": 0.0001, |
| "loss": 1.0244, |
| "loss/crossentropy": 2.636258840560913, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.1417178213596344, |
| "loss/reg": 0.024600951001048088, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.041375, |
| "grad_norm": 3.369840383529663, |
| "grad_norm_var": 1.4852025101017772, |
| "learning_rate": 0.0001, |
| "loss": 1.0208, |
| "loss/crossentropy": 2.448961019515991, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.1107659712433815, |
| "loss/reg": 0.024592852219939232, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 2.90629506111145, |
| "grad_norm_var": 1.5460412234751968, |
| "learning_rate": 0.0001, |
| "loss": 0.9411, |
| "loss/crossentropy": 2.595834732055664, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.10147438943386078, |
| "loss/reg": 0.024584442377090454, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.041625, |
| "grad_norm": 3.402386426925659, |
| "grad_norm_var": 1.5068032644485272, |
| "learning_rate": 0.0001, |
| "loss": 1.0144, |
| "loss/crossentropy": 2.363499402999878, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.1397327035665512, |
| "loss/reg": 0.02457563206553459, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 3.597465753555298, |
| "grad_norm_var": 1.5101723473758888, |
| "learning_rate": 0.0001, |
| "loss": 1.1917, |
| "loss/crossentropy": 2.3256943225860596, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.15691371262073517, |
| "loss/reg": 0.02456764318048954, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.041875, |
| "grad_norm": 3.512730121612549, |
| "grad_norm_var": 1.506262203991567, |
| "learning_rate": 0.0001, |
| "loss": 1.0631, |
| "loss/crossentropy": 2.383385181427002, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.12608817219734192, |
| "loss/reg": 0.024559510871767998, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 5.968827247619629, |
| "grad_norm_var": 0.5694964439723395, |
| "learning_rate": 0.0001, |
| "loss": 1.258, |
| "loss/crossentropy": 2.212991952896118, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.18438610434532166, |
| "loss/reg": 0.02455153875052929, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.042125, |
| "grad_norm": 3.644115924835205, |
| "grad_norm_var": 0.5311677507970897, |
| "learning_rate": 0.0001, |
| "loss": 1.1203, |
| "loss/crossentropy": 2.3620948791503906, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.15219135582447052, |
| "loss/reg": 0.02454366721212864, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 2.716367244720459, |
| "grad_norm_var": 0.6013761572733584, |
| "learning_rate": 0.0001, |
| "loss": 0.919, |
| "loss/crossentropy": 2.3863117694854736, |
| "loss/hidden": 0.578125, |
| "loss/logits": 0.0955524817109108, |
| "loss/reg": 0.024536145851016045, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.042375, |
| "grad_norm": 3.8117287158966064, |
| "grad_norm_var": 0.5485673092684531, |
| "learning_rate": 0.0001, |
| "loss": 1.0529, |
| "loss/crossentropy": 2.4951469898223877, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.1318708062171936, |
| "loss/reg": 0.024528514593839645, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 2.5218894481658936, |
| "grad_norm_var": 0.5973356289049185, |
| "learning_rate": 0.0001, |
| "loss": 0.9673, |
| "loss/crossentropy": 2.370839834213257, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.12049120664596558, |
| "loss/reg": 0.02452007494866848, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.042625, |
| "grad_norm": 2.7343108654022217, |
| "grad_norm_var": 0.6285810690168129, |
| "learning_rate": 0.0001, |
| "loss": 1.1201, |
| "loss/crossentropy": 2.2990617752075195, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.16403642296791077, |
| "loss/reg": 0.024511409923434258, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 2.5267186164855957, |
| "grad_norm_var": 0.6803812464423664, |
| "learning_rate": 0.0001, |
| "loss": 0.9056, |
| "loss/crossentropy": 2.5629754066467285, |
| "loss/hidden": 0.5546875, |
| "loss/logits": 0.10588675737380981, |
| "loss/reg": 0.024503152817487717, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.042875, |
| "grad_norm": 2.2731988430023193, |
| "grad_norm_var": 0.7637691760365584, |
| "learning_rate": 0.0001, |
| "loss": 0.8154, |
| "loss/crossentropy": 2.4123644828796387, |
| "loss/hidden": 0.490234375, |
| "loss/logits": 0.08022630214691162, |
| "loss/reg": 0.024494923651218414, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 3.2791945934295654, |
| "grad_norm_var": 0.7306725618305314, |
| "learning_rate": 0.0001, |
| "loss": 0.9879, |
| "loss/crossentropy": 2.7643816471099854, |
| "loss/hidden": 0.6171875, |
| "loss/logits": 0.12586694955825806, |
| "loss/reg": 0.024486759677529335, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.043125, |
| "grad_norm": 3.6740994453430176, |
| "grad_norm_var": 0.7341663188433093, |
| "learning_rate": 0.0001, |
| "loss": 0.8947, |
| "loss/crossentropy": 2.6802780628204346, |
| "loss/hidden": 0.5546875, |
| "loss/logits": 0.0951782613992691, |
| "loss/reg": 0.024478696286678314, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 3.234722137451172, |
| "grad_norm_var": 0.7240869727338347, |
| "learning_rate": 0.0001, |
| "loss": 0.8517, |
| "loss/crossentropy": 2.577921152114868, |
| "loss/hidden": 0.515625, |
| "loss/logits": 0.09137749671936035, |
| "loss/reg": 0.024470962584018707, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.043375, |
| "grad_norm": 4.024074554443359, |
| "grad_norm_var": 0.7548921970504276, |
| "learning_rate": 0.0001, |
| "loss": 1.0017, |
| "loss/crossentropy": 2.8129634857177734, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.120377317070961, |
| "loss/reg": 0.02446298860013485, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 3.4327027797698975, |
| "grad_norm_var": 0.7400679146500114, |
| "learning_rate": 0.0001, |
| "loss": 0.9864, |
| "loss/crossentropy": 2.4310224056243896, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.10121208429336548, |
| "loss/reg": 0.0244552381336689, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.043625, |
| "grad_norm": 3.2115890979766846, |
| "grad_norm_var": 0.7422101391295163, |
| "learning_rate": 0.0001, |
| "loss": 1.0594, |
| "loss/crossentropy": 2.3658318519592285, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.13528358936309814, |
| "loss/reg": 0.02444704994559288, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 3.4005136489868164, |
| "grad_norm_var": 0.739061242813568, |
| "learning_rate": 0.0001, |
| "loss": 1.0154, |
| "loss/crossentropy": 2.9171154499053955, |
| "loss/hidden": 0.6171875, |
| "loss/logits": 0.15386469662189484, |
| "loss/reg": 0.024438532069325447, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.043875, |
| "grad_norm": 4.1175127029418945, |
| "grad_norm_var": 0.7731950105324129, |
| "learning_rate": 0.0001, |
| "loss": 0.949, |
| "loss/crossentropy": 2.3405494689941406, |
| "loss/hidden": 0.59765625, |
| "loss/logits": 0.10704682767391205, |
| "loss/reg": 0.024429937824606895, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 3.9496090412139893, |
| "grad_norm_var": 0.33930652052577653, |
| "learning_rate": 0.0001, |
| "loss": 0.889, |
| "loss/crossentropy": 2.9135613441467285, |
| "loss/hidden": 0.54296875, |
| "loss/logits": 0.1018136739730835, |
| "loss/reg": 0.024421829730272293, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.044125, |
| "grad_norm": 3.794520139694214, |
| "grad_norm_var": 0.347931624130162, |
| "learning_rate": 0.0001, |
| "loss": 1.1771, |
| "loss/crossentropy": 2.5529844760894775, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.17909468710422516, |
| "loss/reg": 0.02441396936774254, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 3.164349317550659, |
| "grad_norm_var": 0.3259767305032634, |
| "learning_rate": 0.0001, |
| "loss": 1.0804, |
| "loss/crossentropy": 2.3843231201171875, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.12148329615592957, |
| "loss/reg": 0.02440580353140831, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.044375, |
| "grad_norm": 2.7847914695739746, |
| "grad_norm_var": 0.32482231475126633, |
| "learning_rate": 0.0001, |
| "loss": 1.0463, |
| "loss/crossentropy": 2.521649122238159, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.1187494546175003, |
| "loss/reg": 0.024397339671850204, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 3.2391860485076904, |
| "grad_norm_var": 0.28660331114573767, |
| "learning_rate": 0.0001, |
| "loss": 1.0225, |
| "loss/crossentropy": 2.4073832035064697, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.13795481622219086, |
| "loss/reg": 0.024389205500483513, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.044625, |
| "grad_norm": 5.839352130889893, |
| "grad_norm_var": 0.6539216724231817, |
| "learning_rate": 0.0001, |
| "loss": 1.1058, |
| "loss/crossentropy": 2.6860220432281494, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.13543812930583954, |
| "loss/reg": 0.02438061311841011, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 4.340758800506592, |
| "grad_norm_var": 0.6249977794062299, |
| "learning_rate": 0.0001, |
| "loss": 1.1, |
| "loss/crossentropy": 2.770012378692627, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.1531440019607544, |
| "loss/reg": 0.024372335523366928, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.044875, |
| "grad_norm": 8.795727729797363, |
| "grad_norm_var": 2.1213731683569863, |
| "learning_rate": 0.0001, |
| "loss": 1.3149, |
| "loss/crossentropy": 2.2893428802490234, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.15326841175556183, |
| "loss/reg": 0.02436378225684166, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 4.104447841644287, |
| "grad_norm_var": 2.0826812332104394, |
| "learning_rate": 0.0001, |
| "loss": 1.15, |
| "loss/crossentropy": 2.625378370285034, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.14858195185661316, |
| "loss/reg": 0.024355949833989143, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.045125, |
| "grad_norm": 3.354828357696533, |
| "grad_norm_var": 2.105873348197963, |
| "learning_rate": 0.0001, |
| "loss": 0.9494, |
| "loss/crossentropy": 2.4916443824768066, |
| "loss/hidden": 0.5859375, |
| "loss/logits": 0.11993111670017242, |
| "loss/reg": 0.024348480626940727, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 3.3247056007385254, |
| "grad_norm_var": 2.096606359520402, |
| "learning_rate": 0.0001, |
| "loss": 0.9604, |
| "loss/crossentropy": 2.282543420791626, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.10765975713729858, |
| "loss/reg": 0.02434113249182701, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.045375, |
| "grad_norm": 4.314214706420898, |
| "grad_norm_var": 2.1006745469652883, |
| "learning_rate": 0.0001, |
| "loss": 1.0684, |
| "loss/crossentropy": 2.447218179702759, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.12974585592746735, |
| "loss/reg": 0.024332784116268158, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 3.6783666610717773, |
| "grad_norm_var": 2.083471757970481, |
| "learning_rate": 0.0001, |
| "loss": 1.1509, |
| "loss/crossentropy": 2.6020665168762207, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.1576002687215805, |
| "loss/reg": 0.024325383827090263, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.045625, |
| "grad_norm": 3.535550832748413, |
| "grad_norm_var": 2.0521572529950523, |
| "learning_rate": 0.0001, |
| "loss": 0.9672, |
| "loss/crossentropy": 2.2514841556549072, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.110772505402565, |
| "loss/reg": 0.024317855015397072, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 6.190865993499756, |
| "grad_norm_var": 2.275325586048537, |
| "learning_rate": 0.0001, |
| "loss": 1.2696, |
| "loss/crossentropy": 2.5250887870788574, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.1475657969713211, |
| "loss/reg": 0.02430957928299904, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.045875, |
| "grad_norm": 6.9109907150268555, |
| "grad_norm_var": 2.701389202772653, |
| "learning_rate": 0.0001, |
| "loss": 1.42, |
| "loss/crossentropy": 2.0714285373687744, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.18868008255958557, |
| "loss/reg": 0.024301210418343544, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 3.939924955368042, |
| "grad_norm_var": 2.702051041555256, |
| "learning_rate": 0.0001, |
| "loss": 1.0163, |
| "loss/crossentropy": 2.677281379699707, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.1327013224363327, |
| "loss/reg": 0.024292904883623123, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.046125, |
| "grad_norm": 2.942032814025879, |
| "grad_norm_var": 2.822776844100568, |
| "learning_rate": 0.0001, |
| "loss": 0.9582, |
| "loss/crossentropy": 2.3630495071411133, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.11380600929260254, |
| "loss/reg": 0.02428455464541912, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 3.125349521636963, |
| "grad_norm_var": 2.8293167859701627, |
| "learning_rate": 0.0001, |
| "loss": 1.0463, |
| "loss/crossentropy": 2.5113635063171387, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.12774598598480225, |
| "loss/reg": 0.02427608147263527, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.046375, |
| "grad_norm": 3.15159273147583, |
| "grad_norm_var": 2.758666518773039, |
| "learning_rate": 0.0001, |
| "loss": 0.9198, |
| "loss/crossentropy": 2.31132435798645, |
| "loss/hidden": 0.5703125, |
| "loss/logits": 0.10678299516439438, |
| "loss/reg": 0.024267377331852913, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 4.394677639007568, |
| "grad_norm_var": 2.6595375525429406, |
| "learning_rate": 0.0001, |
| "loss": 0.9954, |
| "loss/crossentropy": 2.4987642765045166, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.11996030062437057, |
| "loss/reg": 0.0242580845952034, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.046625, |
| "grad_norm": 3.7915477752685547, |
| "grad_norm_var": 2.5549678839666425, |
| "learning_rate": 0.0001, |
| "loss": 0.9193, |
| "loss/crossentropy": 2.3996403217315674, |
| "loss/hidden": 0.578125, |
| "loss/logits": 0.09870465099811554, |
| "loss/reg": 0.02424856275320053, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 2.833364725112915, |
| "grad_norm_var": 2.7025530371611066, |
| "learning_rate": 0.0001, |
| "loss": 0.9417, |
| "loss/crossentropy": 2.336583137512207, |
| "loss/hidden": 0.59765625, |
| "loss/logits": 0.10166990756988525, |
| "loss/reg": 0.024240419268608093, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.046875, |
| "grad_norm": 5.302695274353027, |
| "grad_norm_var": 1.359315799583595, |
| "learning_rate": 0.0001, |
| "loss": 0.9495, |
| "loss/crossentropy": 2.5733697414398193, |
| "loss/hidden": 0.578125, |
| "loss/logits": 0.1290503740310669, |
| "loss/reg": 0.0242319293320179, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 5.087683200836182, |
| "grad_norm_var": 1.426096117003745, |
| "learning_rate": 0.0001, |
| "loss": 0.9067, |
| "loss/crossentropy": 2.548323631286621, |
| "loss/hidden": 0.56640625, |
| "loss/logits": 0.09806410223245621, |
| "loss/reg": 0.02422359585762024, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.047125, |
| "grad_norm": 2.840883255004883, |
| "grad_norm_var": 1.4948607984557458, |
| "learning_rate": 0.0001, |
| "loss": 1.0234, |
| "loss/crossentropy": 2.484494209289551, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.12894591689109802, |
| "loss/reg": 0.024214565753936768, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 6.647733688354492, |
| "grad_norm_var": 1.848030946106532, |
| "learning_rate": 0.0001, |
| "loss": 1.131, |
| "loss/crossentropy": 2.832048177719116, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.14676988124847412, |
| "loss/reg": 0.024205682799220085, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.047375, |
| "grad_norm": 3.465564727783203, |
| "grad_norm_var": 1.8906396391038638, |
| "learning_rate": 0.0001, |
| "loss": 1.0051, |
| "loss/crossentropy": 2.5129964351654053, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.134174644947052, |
| "loss/reg": 0.024197354912757874, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 3.6985855102539062, |
| "grad_norm_var": 1.8891513099755568, |
| "learning_rate": 0.0001, |
| "loss": 1.4422, |
| "loss/crossentropy": 2.2384328842163086, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.20027095079421997, |
| "loss/reg": 0.02418883889913559, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.047625, |
| "grad_norm": 3.4343388080596924, |
| "grad_norm_var": 1.8993141107729303, |
| "learning_rate": 0.0001, |
| "loss": 0.8648, |
| "loss/crossentropy": 2.753706932067871, |
| "loss/hidden": 0.5234375, |
| "loss/logits": 0.09959565848112106, |
| "loss/reg": 0.024180689826607704, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 3.418168067932129, |
| "grad_norm_var": 1.6566847859375398, |
| "learning_rate": 0.0001, |
| "loss": 1.0532, |
| "loss/crossentropy": 2.1205835342407227, |
| "loss/hidden": 0.69921875, |
| "loss/logits": 0.11221310496330261, |
| "loss/reg": 0.02417258359491825, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.047875, |
| "grad_norm": 3.3881125450134277, |
| "grad_norm_var": 1.093930487598201, |
| "learning_rate": 0.0001, |
| "loss": 0.9968, |
| "loss/crossentropy": 2.540379285812378, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.1262589991092682, |
| "loss/reg": 0.024164721369743347, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 3.142486095428467, |
| "grad_norm_var": 1.1231981378319982, |
| "learning_rate": 0.0001, |
| "loss": 0.9186, |
| "loss/crossentropy": 2.5265657901763916, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.0949624702334404, |
| "loss/reg": 0.02415630966424942, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.048125, |
| "grad_norm": 7.363763332366943, |
| "grad_norm_var": 1.8443340238906671, |
| "learning_rate": 0.0001, |
| "loss": 1.1703, |
| "loss/crossentropy": 2.442409038543701, |
| "loss/hidden": 0.8046875, |
| "loss/logits": 0.12408800423145294, |
| "loss/reg": 0.024148130789399147, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 3.801536798477173, |
| "grad_norm_var": 1.7879312710551798, |
| "learning_rate": 0.0001, |
| "loss": 1.0129, |
| "loss/crossentropy": 2.461942434310913, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.13092291355133057, |
| "loss/reg": 0.024139659479260445, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.048375, |
| "grad_norm": 4.24082612991333, |
| "grad_norm_var": 1.7228677295443293, |
| "learning_rate": 0.0001, |
| "loss": 1.1161, |
| "loss/crossentropy": 2.2889018058776855, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.12870003283023834, |
| "loss/reg": 0.024130841717123985, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 4.190494060516357, |
| "grad_norm_var": 1.7195812284180172, |
| "learning_rate": 0.0001, |
| "loss": 1.1649, |
| "loss/crossentropy": 2.4475483894348145, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.16198021173477173, |
| "loss/reg": 0.024122456088662148, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.048625, |
| "grad_norm": 15.206843376159668, |
| "grad_norm_var": 9.294742605150057, |
| "learning_rate": 0.0001, |
| "loss": 1.0606, |
| "loss/crossentropy": 2.3164589405059814, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.116313636302948, |
| "loss/reg": 0.024113710969686508, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 10.421854972839355, |
| "grad_norm_var": 10.824103712955843, |
| "learning_rate": 0.0001, |
| "loss": 1.0638, |
| "loss/crossentropy": 2.1764075756073, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.11567908525466919, |
| "loss/reg": 0.024104835465550423, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.048875, |
| "grad_norm": 4.923640727996826, |
| "grad_norm_var": 10.83563756748113, |
| "learning_rate": 0.0001, |
| "loss": 0.9465, |
| "loss/crossentropy": 2.6762068271636963, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.10399520397186279, |
| "loss/reg": 0.02409605123102665, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 4.840577125549316, |
| "grad_norm_var": 10.847422220224528, |
| "learning_rate": 0.0001, |
| "loss": 1.2037, |
| "loss/crossentropy": 2.482144832611084, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.15031081438064575, |
| "loss/reg": 0.02408732660114765, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.049125, |
| "grad_norm": 2.9535470008850098, |
| "grad_norm_var": 10.811063470934846, |
| "learning_rate": 0.0001, |
| "loss": 1.0811, |
| "loss/crossentropy": 2.5998470783233643, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.12545132637023926, |
| "loss/reg": 0.024078134447336197, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 4.08555793762207, |
| "grad_norm_var": 10.768160950066, |
| "learning_rate": 0.0001, |
| "loss": 1.3211, |
| "loss/crossentropy": 2.611746072769165, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.17412351071834564, |
| "loss/reg": 0.02406897209584713, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.049375, |
| "grad_norm": 4.730119705200195, |
| "grad_norm_var": 10.582242923890295, |
| "learning_rate": 0.0001, |
| "loss": 0.9463, |
| "loss/crossentropy": 2.5379066467285156, |
| "loss/hidden": 0.59765625, |
| "loss/logits": 0.10806328058242798, |
| "loss/reg": 0.024059420451521873, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 4.012064456939697, |
| "grad_norm_var": 10.523956759484621, |
| "learning_rate": 0.0001, |
| "loss": 0.9181, |
| "loss/crossentropy": 2.567375898361206, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.10337453335523605, |
| "loss/reg": 0.024049852043390274, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.049625, |
| "grad_norm": 4.57706880569458, |
| "grad_norm_var": 10.32746400090784, |
| "learning_rate": 0.0001, |
| "loss": 1.051, |
| "loss/crossentropy": 2.6955151557922363, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.12309969961643219, |
| "loss/reg": 0.02403969317674637, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 2.7931346893310547, |
| "grad_norm_var": 10.511295288820635, |
| "learning_rate": 0.0001, |
| "loss": 1.0373, |
| "loss/crossentropy": 2.34134578704834, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.12901151180267334, |
| "loss/reg": 0.02403116784989834, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.049875, |
| "grad_norm": 2.4789700508117676, |
| "grad_norm_var": 10.793738555266915, |
| "learning_rate": 0.0001, |
| "loss": 1.1463, |
| "loss/crossentropy": 2.1852893829345703, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.15216518938541412, |
| "loss/reg": 0.024022690951824188, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 5.340605735778809, |
| "grad_norm_var": 10.482396698239997, |
| "learning_rate": 0.0001, |
| "loss": 1.2874, |
| "loss/crossentropy": 2.4223577976226807, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.13714072108268738, |
| "loss/reg": 0.024014031514525414, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.050125, |
| "grad_norm": 5.600348949432373, |
| "grad_norm_var": 10.208567826877847, |
| "learning_rate": 0.0001, |
| "loss": 0.9905, |
| "loss/crossentropy": 2.494180917739868, |
| "loss/hidden": 0.64453125, |
| "loss/logits": 0.10594967007637024, |
| "loss/reg": 0.024006787687540054, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 5.094600677490234, |
| "grad_norm_var": 10.061216488426146, |
| "learning_rate": 0.0001, |
| "loss": 1.2828, |
| "loss/crossentropy": 2.1899051666259766, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.1482730507850647, |
| "loss/reg": 0.02399739809334278, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.050375, |
| "grad_norm": 3.915607452392578, |
| "grad_norm_var": 10.115626051260246, |
| "learning_rate": 0.0001, |
| "loss": 1.0375, |
| "loss/crossentropy": 2.790240526199341, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.14136558771133423, |
| "loss/reg": 0.02398892305791378, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 70.49039459228516, |
| "grad_norm_var": 274.8357269833382, |
| "learning_rate": 0.0001, |
| "loss": 1.1534, |
| "loss/crossentropy": 2.2490358352661133, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.12061962485313416, |
| "loss/reg": 0.02398114837706089, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.050625, |
| "grad_norm": 6.5985307693481445, |
| "grad_norm_var": 272.87861181728414, |
| "learning_rate": 0.0001, |
| "loss": 1.1479, |
| "loss/crossentropy": 2.856632709503174, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.12297768890857697, |
| "loss/reg": 0.0239717997610569, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 4.041878700256348, |
| "grad_norm_var": 274.1523084795167, |
| "learning_rate": 0.0001, |
| "loss": 1.1408, |
| "loss/crossentropy": 2.5744457244873047, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.13939061760902405, |
| "loss/reg": 0.02396412193775177, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.050875, |
| "grad_norm": 3.5284969806671143, |
| "grad_norm_var": 274.94477307618513, |
| "learning_rate": 0.0001, |
| "loss": 1.0838, |
| "loss/crossentropy": 2.7705729007720947, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.12160193920135498, |
| "loss/reg": 0.023956267163157463, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 3.867558240890503, |
| "grad_norm_var": 275.4712566581114, |
| "learning_rate": 0.0001, |
| "loss": 1.1238, |
| "loss/crossentropy": 2.2807302474975586, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.1304117739200592, |
| "loss/reg": 0.023948216810822487, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.051125, |
| "grad_norm": 4.9265217781066895, |
| "grad_norm_var": 274.2865770164501, |
| "learning_rate": 0.0001, |
| "loss": 0.955, |
| "loss/crossentropy": 2.6176843643188477, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.08674542605876923, |
| "loss/reg": 0.02393944188952446, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 3.1925463676452637, |
| "grad_norm_var": 274.86264478448203, |
| "learning_rate": 0.0001, |
| "loss": 1.0329, |
| "loss/crossentropy": 2.790286064147949, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.11385629326105118, |
| "loss/reg": 0.02393159456551075, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.051375, |
| "grad_norm": 2.6993377208709717, |
| "grad_norm_var": 276.12743945534294, |
| "learning_rate": 0.0001, |
| "loss": 0.9268, |
| "loss/crossentropy": 2.5643763542175293, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.10557639598846436, |
| "loss/reg": 0.0239238403737545, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 4.7841315269470215, |
| "grad_norm_var": 275.72098389943244, |
| "learning_rate": 0.0001, |
| "loss": 1.3399, |
| "loss/crossentropy": 2.3737518787384033, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.2062419056892395, |
| "loss/reg": 0.023915138095617294, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.051625, |
| "grad_norm": 2.8743667602539062, |
| "grad_norm_var": 276.7634192046356, |
| "learning_rate": 0.0001, |
| "loss": 1.105, |
| "loss/crossentropy": 2.225154399871826, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.13158033788204193, |
| "loss/reg": 0.023907041177153587, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 4.873403072357178, |
| "grad_norm_var": 275.51638736026473, |
| "learning_rate": 0.0001, |
| "loss": 1.1328, |
| "loss/crossentropy": 2.444675922393799, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.15947584807872772, |
| "loss/reg": 0.023899447172880173, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.051875, |
| "grad_norm": 3.6676955223083496, |
| "grad_norm_var": 274.6671585398764, |
| "learning_rate": 0.0001, |
| "loss": 0.9504, |
| "loss/crossentropy": 2.6828341484069824, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.10996139049530029, |
| "loss/reg": 0.023892199620604515, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 2.703623056411743, |
| "grad_norm_var": 276.2015243387772, |
| "learning_rate": 0.0001, |
| "loss": 1.049, |
| "loss/crossentropy": 2.4529809951782227, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.1305149495601654, |
| "loss/reg": 0.023883724585175514, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.052125, |
| "grad_norm": 2.9288852214813232, |
| "grad_norm_var": 277.61048629826035, |
| "learning_rate": 0.0001, |
| "loss": 1.0151, |
| "loss/crossentropy": 2.6102705001831055, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.13567429780960083, |
| "loss/reg": 0.023875238373875618, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 2.5006144046783447, |
| "grad_norm_var": 279.0831974622093, |
| "learning_rate": 0.0001, |
| "loss": 1.0571, |
| "loss/crossentropy": 2.457202672958374, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.12700514495372772, |
| "loss/reg": 0.023867420852184296, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.052375, |
| "grad_norm": 5.180849552154541, |
| "grad_norm_var": 278.49850212580674, |
| "learning_rate": 0.0001, |
| "loss": 1.1214, |
| "loss/crossentropy": 2.7064497470855713, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.13676053285598755, |
| "loss/reg": 0.02385888434946537, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 3.2954249382019043, |
| "grad_norm_var": 1.3051375489769337, |
| "learning_rate": 0.0001, |
| "loss": 1.0717, |
| "loss/crossentropy": 2.31532621383667, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.13004590570926666, |
| "loss/reg": 0.023851698264479637, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.052625, |
| "grad_norm": 2.8389148712158203, |
| "grad_norm_var": 0.8127685868335741, |
| "learning_rate": 0.0001, |
| "loss": 1.0781, |
| "loss/crossentropy": 2.3562088012695312, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.13266143202781677, |
| "loss/reg": 0.023843195289373398, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 4.581103324890137, |
| "grad_norm_var": 0.8613437167520175, |
| "learning_rate": 0.0001, |
| "loss": 1.4299, |
| "loss/crossentropy": 2.1004514694213867, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.13683074712753296, |
| "loss/reg": 0.023835282772779465, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.052875, |
| "grad_norm": 4.119724750518799, |
| "grad_norm_var": 0.8733982923945914, |
| "learning_rate": 0.0001, |
| "loss": 1.2152, |
| "loss/crossentropy": 2.640662670135498, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.12537327408790588, |
| "loss/reg": 0.02382684126496315, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 4.544858455657959, |
| "grad_norm_var": 0.918133871994677, |
| "learning_rate": 0.0001, |
| "loss": 1.0957, |
| "loss/crossentropy": 2.8237485885620117, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.1309557855129242, |
| "loss/reg": 0.023818302899599075, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.053125, |
| "grad_norm": 2.757870674133301, |
| "grad_norm_var": 0.8666742418813403, |
| "learning_rate": 0.0001, |
| "loss": 0.962, |
| "loss/crossentropy": 2.5533342361450195, |
| "loss/hidden": 0.62109375, |
| "loss/logits": 0.10283903032541275, |
| "loss/reg": 0.023810207843780518, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 2.8035690784454346, |
| "grad_norm_var": 0.8970790990362765, |
| "learning_rate": 0.0001, |
| "loss": 1.0877, |
| "loss/crossentropy": 2.610870361328125, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.1544169783592224, |
| "loss/reg": 0.02380150742828846, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.053375, |
| "grad_norm": 3.5559544563293457, |
| "grad_norm_var": 0.8432525593756196, |
| "learning_rate": 0.0001, |
| "loss": 1.1169, |
| "loss/crossentropy": 2.537252187728882, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.1328693926334381, |
| "loss/reg": 0.02379263937473297, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 3.58505916595459, |
| "grad_norm_var": 0.7479056021171611, |
| "learning_rate": 0.0001, |
| "loss": 1.0427, |
| "loss/crossentropy": 2.240241289138794, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.1212427169084549, |
| "loss/reg": 0.02378367818892002, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.053625, |
| "grad_norm": 3.6646056175231934, |
| "grad_norm_var": 0.7156687449512967, |
| "learning_rate": 0.0001, |
| "loss": 1.0166, |
| "loss/crossentropy": 2.765550374984741, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.14600837230682373, |
| "loss/reg": 0.023774517700076103, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 3.2596421241760254, |
| "grad_norm_var": 0.6044660126437359, |
| "learning_rate": 0.0001, |
| "loss": 1.13, |
| "loss/crossentropy": 2.1970372200012207, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.13451042771339417, |
| "loss/reg": 0.023765094578266144, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.053875, |
| "grad_norm": 4.039770603179932, |
| "grad_norm_var": 0.6214738630237046, |
| "learning_rate": 0.0001, |
| "loss": 1.2708, |
| "loss/crossentropy": 2.5254645347595215, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.19336232542991638, |
| "loss/reg": 0.023756500333547592, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 2.9176530838012695, |
| "grad_norm_var": 0.6009675102137348, |
| "learning_rate": 0.0001, |
| "loss": 0.9399, |
| "loss/crossentropy": 2.611178398132324, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.10868553817272186, |
| "loss/reg": 0.023747922852635384, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.054125, |
| "grad_norm": 3.540189027786255, |
| "grad_norm_var": 0.5748467113480954, |
| "learning_rate": 0.0001, |
| "loss": 1.164, |
| "loss/crossentropy": 2.4766628742218018, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.1258353739976883, |
| "loss/reg": 0.023739352822303772, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 3.7053966522216797, |
| "grad_norm_var": 0.4931212433281331, |
| "learning_rate": 0.0001, |
| "loss": 1.0522, |
| "loss/crossentropy": 2.491478681564331, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.13909485936164856, |
| "loss/reg": 0.023731039837002754, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.054375, |
| "grad_norm": 3.4101648330688477, |
| "grad_norm_var": 0.32751985750053336, |
| "learning_rate": 0.0001, |
| "loss": 0.9527, |
| "loss/crossentropy": 2.5495922565460205, |
| "loss/hidden": 0.59765625, |
| "loss/logits": 0.1178436130285263, |
| "loss/reg": 0.023722674697637558, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 3.923790693283081, |
| "grad_norm_var": 0.33181180777142816, |
| "learning_rate": 0.0001, |
| "loss": 1.1217, |
| "loss/crossentropy": 2.350161552429199, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.13453412055969238, |
| "loss/reg": 0.02371453307569027, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.054625, |
| "grad_norm": 3.2052547931671143, |
| "grad_norm_var": 0.3040979482718351, |
| "learning_rate": 0.0001, |
| "loss": 1.0641, |
| "loss/crossentropy": 2.5055789947509766, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.11222882568836212, |
| "loss/reg": 0.023706616833806038, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 2.820063591003418, |
| "grad_norm_var": 0.26777286633351405, |
| "learning_rate": 0.0001, |
| "loss": 1.1811, |
| "loss/crossentropy": 2.535069704055786, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.1628357172012329, |
| "loss/reg": 0.023698095232248306, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.054875, |
| "grad_norm": 3.095841884613037, |
| "grad_norm_var": 0.24744105333314317, |
| "learning_rate": 0.0001, |
| "loss": 1.1936, |
| "loss/crossentropy": 2.313276767730713, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.17543524503707886, |
| "loss/reg": 0.02368931844830513, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 2.955540895462036, |
| "grad_norm_var": 0.1683967569747227, |
| "learning_rate": 0.0001, |
| "loss": 0.991, |
| "loss/crossentropy": 2.747128486633301, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.10580303519964218, |
| "loss/reg": 0.023680580779910088, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.055125, |
| "grad_norm": 2.866759777069092, |
| "grad_norm_var": 0.16086728592038804, |
| "learning_rate": 0.0001, |
| "loss": 0.9014, |
| "loss/crossentropy": 2.4465224742889404, |
| "loss/hidden": 0.5703125, |
| "loss/logits": 0.09433356672525406, |
| "loss/reg": 0.023672088980674744, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 3.1793012619018555, |
| "grad_norm_var": 0.14310091597801508, |
| "learning_rate": 0.0001, |
| "loss": 1.0742, |
| "loss/crossentropy": 2.364579677581787, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.15003816783428192, |
| "loss/reg": 0.023663459345698357, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.055375, |
| "grad_norm": 4.3040900230407715, |
| "grad_norm_var": 0.19784760386154687, |
| "learning_rate": 0.0001, |
| "loss": 1.1406, |
| "loss/crossentropy": 2.955233573913574, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.1579587459564209, |
| "loss/reg": 0.023654496297240257, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 3.6495676040649414, |
| "grad_norm_var": 0.19966009525053988, |
| "learning_rate": 0.0001, |
| "loss": 1.0678, |
| "loss/crossentropy": 2.184626340866089, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.11654888093471527, |
| "loss/reg": 0.023645464330911636, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.055625, |
| "grad_norm": 2.7992637157440186, |
| "grad_norm_var": 0.21692371557562992, |
| "learning_rate": 0.0001, |
| "loss": 1.0746, |
| "loss/crossentropy": 1.9573417901992798, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.1234317272901535, |
| "loss/reg": 0.023636594414711, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 3.4131267070770264, |
| "grad_norm_var": 0.21645445922388262, |
| "learning_rate": 0.0001, |
| "loss": 1.0938, |
| "loss/crossentropy": 2.3615517616271973, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.1387380063533783, |
| "loss/reg": 0.023627731949090958, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.055875, |
| "grad_norm": 2.890436887741089, |
| "grad_norm_var": 0.19547383544341201, |
| "learning_rate": 0.0001, |
| "loss": 0.8903, |
| "loss/crossentropy": 2.2940785884857178, |
| "loss/hidden": 0.5546875, |
| "loss/logits": 0.09945578873157501, |
| "loss/reg": 0.023619333282113075, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 3.9202187061309814, |
| "grad_norm_var": 0.20821686288428035, |
| "learning_rate": 0.0001, |
| "loss": 1.1124, |
| "loss/crossentropy": 2.561680316925049, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.13801740109920502, |
| "loss/reg": 0.023610329255461693, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.056125, |
| "grad_norm": 3.0585479736328125, |
| "grad_norm_var": 0.21081889060941586, |
| "learning_rate": 0.0001, |
| "loss": 0.9545, |
| "loss/crossentropy": 2.4095382690429688, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.10911431908607483, |
| "loss/reg": 0.02360081672668457, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 3.3532161712646484, |
| "grad_norm_var": 0.2007006666523369, |
| "learning_rate": 0.0001, |
| "loss": 1.1045, |
| "loss/crossentropy": 1.8727831840515137, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.1264159381389618, |
| "loss/reg": 0.023592744022607803, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.056375, |
| "grad_norm": 3.1456074714660645, |
| "grad_norm_var": 0.20128870800301848, |
| "learning_rate": 0.0001, |
| "loss": 0.9574, |
| "loss/crossentropy": 2.4792492389678955, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.11215440928936005, |
| "loss/reg": 0.02358343079686165, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 3.402637004852295, |
| "grad_norm_var": 0.1739656178124496, |
| "learning_rate": 0.0001, |
| "loss": 1.0743, |
| "loss/crossentropy": 2.3384110927581787, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.15106429159641266, |
| "loss/reg": 0.02357417158782482, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.056625, |
| "grad_norm": 3.259817361831665, |
| "grad_norm_var": 0.17379912081048493, |
| "learning_rate": 0.0001, |
| "loss": 1.1221, |
| "loss/crossentropy": 2.456613779067993, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.15212517976760864, |
| "loss/reg": 0.023564757779240608, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 2.531987190246582, |
| "grad_norm_var": 0.195773570863138, |
| "learning_rate": 0.0001, |
| "loss": 0.9605, |
| "loss/crossentropy": 2.394343376159668, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.11555634438991547, |
| "loss/reg": 0.023556271567940712, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.056875, |
| "grad_norm": 2.594336986541748, |
| "grad_norm_var": 0.22107356191806862, |
| "learning_rate": 0.0001, |
| "loss": 0.9207, |
| "loss/crossentropy": 2.781731367111206, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.10317155718803406, |
| "loss/reg": 0.023547139018774033, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 3.1260924339294434, |
| "grad_norm_var": 0.21715561662650557, |
| "learning_rate": 0.0001, |
| "loss": 1.1637, |
| "loss/crossentropy": 2.5018725395202637, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.14706720411777496, |
| "loss/reg": 0.02353852428495884, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.057125, |
| "grad_norm": 3.159911632537842, |
| "grad_norm_var": 0.20878072756432645, |
| "learning_rate": 0.0001, |
| "loss": 1.0676, |
| "loss/crossentropy": 2.668788433074951, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.1487644910812378, |
| "loss/reg": 0.02352879010140896, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 3.0937418937683105, |
| "grad_norm_var": 0.20989373673105333, |
| "learning_rate": 0.0001, |
| "loss": 1.0188, |
| "loss/crossentropy": 2.498286008834839, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.11953231692314148, |
| "loss/reg": 0.023519227281212807, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.057375, |
| "grad_norm": 2.4465949535369873, |
| "grad_norm_var": 0.15987096754079838, |
| "learning_rate": 0.0001, |
| "loss": 1.0333, |
| "loss/crossentropy": 2.401630401611328, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.12631601095199585, |
| "loss/reg": 0.023509083315730095, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 3.8185830116271973, |
| "grad_norm_var": 0.17369585396981316, |
| "learning_rate": 0.0001, |
| "loss": 1.2185, |
| "loss/crossentropy": 2.3343100547790527, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.13973468542099, |
| "loss/reg": 0.023500461131334305, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.057625, |
| "grad_norm": 2.769894599914551, |
| "grad_norm_var": 0.1750287637092998, |
| "learning_rate": 0.0001, |
| "loss": 1.0478, |
| "loss/crossentropy": 2.594421148300171, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.14495806396007538, |
| "loss/reg": 0.02349086105823517, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 3.924386501312256, |
| "grad_norm_var": 0.21107140664515026, |
| "learning_rate": 0.0001, |
| "loss": 1.1201, |
| "loss/crossentropy": 1.8543033599853516, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.16654378175735474, |
| "loss/reg": 0.023482073098421097, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.057875, |
| "grad_norm": 2.719325304031372, |
| "grad_norm_var": 0.21896016035892957, |
| "learning_rate": 0.0001, |
| "loss": 0.9973, |
| "loss/crossentropy": 2.7110376358032227, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.1297917366027832, |
| "loss/reg": 0.02347267046570778, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 3.110532522201538, |
| "grad_norm_var": 0.17627651595159174, |
| "learning_rate": 0.0001, |
| "loss": 0.9503, |
| "loss/crossentropy": 2.424137830734253, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.10241679847240448, |
| "loss/reg": 0.023463333025574684, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.058125, |
| "grad_norm": 3.2945988178253174, |
| "grad_norm_var": 0.17862116157392785, |
| "learning_rate": 0.0001, |
| "loss": 1.1634, |
| "loss/crossentropy": 2.8107378482818604, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.17888996005058289, |
| "loss/reg": 0.0234534852206707, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 4.864523887634277, |
| "grad_norm_var": 0.37049430510921844, |
| "learning_rate": 0.0001, |
| "loss": 1.0812, |
| "loss/crossentropy": 2.5031394958496094, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.13192051649093628, |
| "loss/reg": 0.023444540798664093, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.058375, |
| "grad_norm": 3.8722984790802, |
| "grad_norm_var": 0.3978501673810001, |
| "learning_rate": 0.0001, |
| "loss": 1.2316, |
| "loss/crossentropy": 1.9772007465362549, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14563970267772675, |
| "loss/reg": 0.023435747250914574, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 4.621346473693848, |
| "grad_norm_var": 0.5155902021721573, |
| "learning_rate": 0.0001, |
| "loss": 1.1725, |
| "loss/crossentropy": 2.4977753162384033, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.149122953414917, |
| "loss/reg": 0.023427119478583336, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.058625, |
| "grad_norm": 3.36370849609375, |
| "grad_norm_var": 0.5153549660191058, |
| "learning_rate": 0.0001, |
| "loss": 1.2411, |
| "loss/crossentropy": 2.4750683307647705, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.18663738667964935, |
| "loss/reg": 0.023417862132191658, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 3.391871690750122, |
| "grad_norm_var": 0.46984604899865395, |
| "learning_rate": 0.0001, |
| "loss": 1.1066, |
| "loss/crossentropy": 2.5189239978790283, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.1537722945213318, |
| "loss/reg": 0.023408619686961174, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.058875, |
| "grad_norm": 3.903122901916504, |
| "grad_norm_var": 0.43880097595690587, |
| "learning_rate": 0.0001, |
| "loss": 0.9258, |
| "loss/crossentropy": 2.548199415206909, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.09802491217851639, |
| "loss/reg": 0.02339930646121502, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 5.576291084289551, |
| "grad_norm_var": 0.7024716555345464, |
| "learning_rate": 0.0001, |
| "loss": 1.2843, |
| "loss/crossentropy": 2.682227849960327, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.19106051325798035, |
| "loss/reg": 0.023390140384435654, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.059125, |
| "grad_norm": 2.4201464653015137, |
| "grad_norm_var": 0.7821220779043936, |
| "learning_rate": 0.0001, |
| "loss": 0.9341, |
| "loss/crossentropy": 2.5251269340515137, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.10658347606658936, |
| "loss/reg": 0.023381320759654045, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 4.325901985168457, |
| "grad_norm_var": 0.7980385459591806, |
| "learning_rate": 0.0001, |
| "loss": 0.9798, |
| "loss/crossentropy": 2.3592865467071533, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.10548710823059082, |
| "loss/reg": 0.023372096940875053, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.059375, |
| "grad_norm": 3.1382455825805664, |
| "grad_norm_var": 0.7168259193102716, |
| "learning_rate": 0.0001, |
| "loss": 0.9839, |
| "loss/crossentropy": 2.6056907176971436, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.121395543217659, |
| "loss/reg": 0.023363398388028145, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 3.0779755115509033, |
| "grad_norm_var": 0.7388713721113239, |
| "learning_rate": 0.0001, |
| "loss": 1.0798, |
| "loss/crossentropy": 2.320854663848877, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.12356055527925491, |
| "loss/reg": 0.023354284465312958, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.059625, |
| "grad_norm": 3.293567657470703, |
| "grad_norm_var": 0.6946720185858983, |
| "learning_rate": 0.0001, |
| "loss": 0.9685, |
| "loss/crossentropy": 2.7064430713653564, |
| "loss/hidden": 0.625, |
| "loss/logits": 0.11008161306381226, |
| "loss/reg": 0.023345019668340683, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 2.664088249206543, |
| "grad_norm_var": 0.7530647477645431, |
| "learning_rate": 0.0001, |
| "loss": 0.9101, |
| "loss/crossentropy": 2.6147005558013916, |
| "loss/hidden": 0.5703125, |
| "loss/logits": 0.10643748193979263, |
| "loss/reg": 0.02333623729646206, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.059875, |
| "grad_norm": 2.9174795150756836, |
| "grad_norm_var": 0.7321888983535926, |
| "learning_rate": 0.0001, |
| "loss": 0.9342, |
| "loss/crossentropy": 2.566849946975708, |
| "loss/hidden": 0.58984375, |
| "loss/logits": 0.11108069121837616, |
| "loss/reg": 0.023327510803937912, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 3.3596463203430176, |
| "grad_norm_var": 0.71932045702877, |
| "learning_rate": 0.0001, |
| "loss": 1.0961, |
| "loss/crossentropy": 2.645395040512085, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.15979796648025513, |
| "loss/reg": 0.023318573832511902, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.060125, |
| "grad_norm": 4.0903096199035645, |
| "grad_norm_var": 0.7232764591548666, |
| "learning_rate": 0.0001, |
| "loss": 1.1686, |
| "loss/crossentropy": 2.077467918395996, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.14252969622612, |
| "loss/reg": 0.02330981194972992, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 3.277656078338623, |
| "grad_norm_var": 0.6300433507978689, |
| "learning_rate": 0.0001, |
| "loss": 1.1136, |
| "loss/crossentropy": 2.588385581970215, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.13843819499015808, |
| "loss/reg": 0.02330118976533413, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.060375, |
| "grad_norm": 4.117528915405273, |
| "grad_norm_var": 0.6433314640873874, |
| "learning_rate": 0.0001, |
| "loss": 1.0239, |
| "loss/crossentropy": 2.766176700592041, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.12686920166015625, |
| "loss/reg": 0.023292165249586105, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 11.29445743560791, |
| "grad_norm_var": 4.338621670503842, |
| "learning_rate": 0.0001, |
| "loss": 1.9614, |
| "loss/crossentropy": 2.5279059410095215, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.5176718235015869, |
| "loss/reg": 0.02328311838209629, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.060625, |
| "grad_norm": 3.5054728984832764, |
| "grad_norm_var": 4.327600163307723, |
| "learning_rate": 0.0001, |
| "loss": 1.0719, |
| "loss/crossentropy": 2.726095199584961, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.12821289896965027, |
| "loss/reg": 0.023273879662156105, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 3.157118558883667, |
| "grad_norm_var": 4.350771203860321, |
| "learning_rate": 0.0001, |
| "loss": 0.9326, |
| "loss/crossentropy": 2.207131862640381, |
| "loss/hidden": 0.60546875, |
| "loss/logits": 0.09451892971992493, |
| "loss/reg": 0.02326469123363495, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.060875, |
| "grad_norm": 5.006563186645508, |
| "grad_norm_var": 4.411522578027558, |
| "learning_rate": 0.0001, |
| "loss": 1.1884, |
| "loss/crossentropy": 3.005479335784912, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.17073442041873932, |
| "loss/reg": 0.02325539104640484, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 8.253157615661621, |
| "grad_norm_var": 5.3947068177792, |
| "learning_rate": 0.0001, |
| "loss": 1.4073, |
| "loss/crossentropy": 2.329857349395752, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.19045758247375488, |
| "loss/reg": 0.02324584126472473, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.061125, |
| "grad_norm": 4.795626640319824, |
| "grad_norm_var": 5.169810789054156, |
| "learning_rate": 0.0001, |
| "loss": 1.2274, |
| "loss/crossentropy": 2.354893207550049, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14352190494537354, |
| "loss/reg": 0.023236218839883804, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 3.1360483169555664, |
| "grad_norm_var": 5.268809256909969, |
| "learning_rate": 0.0001, |
| "loss": 1.0242, |
| "loss/crossentropy": 2.2810122966766357, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.12006018310785294, |
| "loss/reg": 0.02322734333574772, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.061375, |
| "grad_norm": 3.931467056274414, |
| "grad_norm_var": 5.1833802842946906, |
| "learning_rate": 0.0001, |
| "loss": 1.1688, |
| "loss/crossentropy": 2.3372247219085693, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.15147234499454498, |
| "loss/reg": 0.023217879235744476, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 3.8858096599578857, |
| "grad_norm_var": 5.085283642122107, |
| "learning_rate": 0.0001, |
| "loss": 1.0309, |
| "loss/crossentropy": 2.597487688064575, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.11914543062448502, |
| "loss/reg": 0.02320869080722332, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.061625, |
| "grad_norm": 6.116000175476074, |
| "grad_norm_var": 5.160062314221837, |
| "learning_rate": 0.0001, |
| "loss": 1.2295, |
| "loss/crossentropy": 2.4041733741760254, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14599129557609558, |
| "loss/reg": 0.023199014365673065, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 3.4635026454925537, |
| "grad_norm_var": 4.994267697000357, |
| "learning_rate": 0.0001, |
| "loss": 1.0957, |
| "loss/crossentropy": 2.692230224609375, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.13329669833183289, |
| "loss/reg": 0.023189352825284004, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.061875, |
| "grad_norm": 6.833379745483398, |
| "grad_norm_var": 5.051083471594066, |
| "learning_rate": 0.0001, |
| "loss": 1.1437, |
| "loss/crossentropy": 2.470660448074341, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.12278926372528076, |
| "loss/reg": 0.023179946467280388, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 3.2948107719421387, |
| "grad_norm_var": 5.064566926371495, |
| "learning_rate": 0.0001, |
| "loss": 1.109, |
| "loss/crossentropy": 2.3682029247283936, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.13507232069969177, |
| "loss/reg": 0.02317013218998909, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.062125, |
| "grad_norm": 4.057919025421143, |
| "grad_norm_var": 5.068064269732227, |
| "learning_rate": 0.0001, |
| "loss": 1.1648, |
| "loss/crossentropy": 2.4153223037719727, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.16754823923110962, |
| "loss/reg": 0.02316114492714405, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 3.3985679149627686, |
| "grad_norm_var": 5.043098814178749, |
| "learning_rate": 0.0001, |
| "loss": 1.1033, |
| "loss/crossentropy": 2.6683197021484375, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.13353273272514343, |
| "loss/reg": 0.023151271045207977, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.062375, |
| "grad_norm": 2.6574859619140625, |
| "grad_norm_var": 5.326800856327217, |
| "learning_rate": 0.0001, |
| "loss": 0.9413, |
| "loss/crossentropy": 2.490849733352661, |
| "loss/hidden": 0.60546875, |
| "loss/logits": 0.1043705940246582, |
| "loss/reg": 0.023141290992498398, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 3.2627739906311035, |
| "grad_norm_var": 2.402846049322009, |
| "learning_rate": 0.0001, |
| "loss": 1.0233, |
| "loss/crossentropy": 2.3429057598114014, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.1552838534116745, |
| "loss/reg": 0.02313125506043434, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.062625, |
| "grad_norm": 3.2453906536102295, |
| "grad_norm_var": 2.4345300369903553, |
| "learning_rate": 0.0001, |
| "loss": 1.1719, |
| "loss/crossentropy": 2.497878313064575, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.17116406559944153, |
| "loss/reg": 0.02312229759991169, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.06275, |
| "grad_norm": 4.79340934753418, |
| "grad_norm_var": 2.3566760840149366, |
| "learning_rate": 0.0001, |
| "loss": 1.0001, |
| "loss/crossentropy": 2.360431671142578, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.12051868438720703, |
| "loss/reg": 0.02311263047158718, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.062875, |
| "grad_norm": 3.1595826148986816, |
| "grad_norm_var": 2.4163836713766425, |
| "learning_rate": 0.0001, |
| "loss": 0.9158, |
| "loss/crossentropy": 2.266618490219116, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.1027822494506836, |
| "loss/reg": 0.023102767765522003, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 3.59019136428833, |
| "grad_norm_var": 1.2975304557542653, |
| "learning_rate": 0.0001, |
| "loss": 0.9685, |
| "loss/crossentropy": 2.6386334896087646, |
| "loss/hidden": 0.625, |
| "loss/logits": 0.11253425478935242, |
| "loss/reg": 0.023093828931450844, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.063125, |
| "grad_norm": 3.1218326091766357, |
| "grad_norm_var": 1.2897946661694502, |
| "learning_rate": 0.0001, |
| "loss": 0.9493, |
| "loss/crossentropy": 2.2931203842163086, |
| "loss/hidden": 0.6171875, |
| "loss/logits": 0.10128200799226761, |
| "loss/reg": 0.0230838842689991, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.06325, |
| "grad_norm": 7.243019104003906, |
| "grad_norm_var": 1.941121973828988, |
| "learning_rate": 0.0001, |
| "loss": 1.1363, |
| "loss/crossentropy": 2.510519504547119, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.11650878190994263, |
| "loss/reg": 0.023074399679899216, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.063375, |
| "grad_norm": 2.7458910942077637, |
| "grad_norm_var": 2.0601092371510408, |
| "learning_rate": 0.0001, |
| "loss": 0.8688, |
| "loss/crossentropy": 2.504798173904419, |
| "loss/hidden": 0.54296875, |
| "loss/logits": 0.09517204016447067, |
| "loss/reg": 0.023064618930220604, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.0635, |
| "grad_norm": 3.834894895553589, |
| "grad_norm_var": 2.061415401484546, |
| "learning_rate": 0.0001, |
| "loss": 1.0614, |
| "loss/crossentropy": 2.504178285598755, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.13941214978694916, |
| "loss/reg": 0.023055192083120346, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.063625, |
| "grad_norm": 3.0524418354034424, |
| "grad_norm_var": 1.8045701590720038, |
| "learning_rate": 0.0001, |
| "loss": 0.9395, |
| "loss/crossentropy": 2.6670830249786377, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.10751838982105255, |
| "loss/reg": 0.023046277463436127, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.06375, |
| "grad_norm": 2.638979196548462, |
| "grad_norm_var": 1.8906158947457992, |
| "learning_rate": 0.0001, |
| "loss": 0.9649, |
| "loss/crossentropy": 2.5606770515441895, |
| "loss/hidden": 0.62109375, |
| "loss/logits": 0.11341118812561035, |
| "loss/reg": 0.02303677424788475, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.063875, |
| "grad_norm": 4.029105186462402, |
| "grad_norm_var": 1.2509737999906814, |
| "learning_rate": 0.0001, |
| "loss": 1.0378, |
| "loss/crossentropy": 2.446560859680176, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.09661944955587387, |
| "loss/reg": 0.023027852177619934, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 3.203378438949585, |
| "grad_norm_var": 1.255617850639648, |
| "learning_rate": 0.0001, |
| "loss": 0.9705, |
| "loss/crossentropy": 2.08353328704834, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.10362571477890015, |
| "loss/reg": 0.02301831543445587, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.064125, |
| "grad_norm": 2.5737931728363037, |
| "grad_norm_var": 1.3080458668089554, |
| "learning_rate": 0.0001, |
| "loss": 1.0597, |
| "loss/crossentropy": 2.3520448207855225, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.11864635348320007, |
| "loss/reg": 0.023009376600384712, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.06425, |
| "grad_norm": 3.6731107234954834, |
| "grad_norm_var": 1.307783724921588, |
| "learning_rate": 0.0001, |
| "loss": 1.1013, |
| "loss/crossentropy": 2.77113938331604, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.1408485472202301, |
| "loss/reg": 0.022999830543994904, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.064375, |
| "grad_norm": 3.06605863571167, |
| "grad_norm_var": 1.269509965568249, |
| "learning_rate": 0.0001, |
| "loss": 0.9224, |
| "loss/crossentropy": 2.3147072792053223, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.0987289547920227, |
| "loss/reg": 0.022990131750702858, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.0645, |
| "grad_norm": 3.462446689605713, |
| "grad_norm_var": 1.2636330593023397, |
| "learning_rate": 0.0001, |
| "loss": 1.0201, |
| "loss/crossentropy": 2.4042327404022217, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.12231434136629105, |
| "loss/reg": 0.02298046089708805, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.064625, |
| "grad_norm": 4.2029500007629395, |
| "grad_norm_var": 1.2769943636458339, |
| "learning_rate": 0.0001, |
| "loss": 1.222, |
| "loss/crossentropy": 2.514112710952759, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.16808617115020752, |
| "loss/reg": 0.022970519959926605, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.06475, |
| "grad_norm": 3.433554172515869, |
| "grad_norm_var": 1.1851525686550586, |
| "learning_rate": 0.0001, |
| "loss": 1.2444, |
| "loss/crossentropy": 2.493224859237671, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.17107471823692322, |
| "loss/reg": 0.022961357608437538, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.064875, |
| "grad_norm": 4.310100078582764, |
| "grad_norm_var": 1.2057753361074104, |
| "learning_rate": 0.0001, |
| "loss": 1.0108, |
| "loss/crossentropy": 2.5606930255889893, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.10546360909938812, |
| "loss/reg": 0.022952331230044365, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 3.674527883529663, |
| "grad_norm_var": 1.2057007253632908, |
| "learning_rate": 0.0001, |
| "loss": 0.9803, |
| "loss/crossentropy": 2.4196624755859375, |
| "loss/hidden": 0.64453125, |
| "loss/logits": 0.10631287097930908, |
| "loss/reg": 0.02294265851378441, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.065125, |
| "grad_norm": 3.101484775543213, |
| "grad_norm_var": 1.20713683658368, |
| "learning_rate": 0.0001, |
| "loss": 1.2375, |
| "loss/crossentropy": 2.1448891162872314, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.12145140767097473, |
| "loss/reg": 0.02293260022997856, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.06525, |
| "grad_norm": 3.2564265727996826, |
| "grad_norm_var": 0.2854656858181736, |
| "learning_rate": 0.0001, |
| "loss": 0.9783, |
| "loss/crossentropy": 2.3986809253692627, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.1396813988685608, |
| "loss/reg": 0.022922798991203308, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.065375, |
| "grad_norm": 3.3007333278656006, |
| "grad_norm_var": 0.2569672821287893, |
| "learning_rate": 0.0001, |
| "loss": 0.9586, |
| "loss/crossentropy": 2.7955212593078613, |
| "loss/hidden": 0.62109375, |
| "loss/logits": 0.10833179950714111, |
| "loss/reg": 0.02291307970881462, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.0655, |
| "grad_norm": 2.9546499252319336, |
| "grad_norm_var": 0.25738909944094907, |
| "learning_rate": 0.0001, |
| "loss": 1.0937, |
| "loss/crossentropy": 2.409029006958008, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.14591118693351746, |
| "loss/reg": 0.022904111072421074, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.065625, |
| "grad_norm": 3.2193830013275146, |
| "grad_norm_var": 0.25204334767618, |
| "learning_rate": 0.0001, |
| "loss": 1.1221, |
| "loss/crossentropy": 2.491401195526123, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.13920898735523224, |
| "loss/reg": 0.02289445698261261, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.06575, |
| "grad_norm": 40.478694915771484, |
| "grad_norm_var": 85.9971082258447, |
| "learning_rate": 0.0001, |
| "loss": 1.1342, |
| "loss/crossentropy": 2.4311652183532715, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.10845671594142914, |
| "loss/reg": 0.022885650396347046, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.065875, |
| "grad_norm": 3.2905385494232178, |
| "grad_norm_var": 86.20029999738617, |
| "learning_rate": 0.0001, |
| "loss": 1.0101, |
| "loss/crossentropy": 2.1737422943115234, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.10944204032421112, |
| "loss/reg": 0.022876843810081482, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 4.666721343994141, |
| "grad_norm_var": 85.84699165642114, |
| "learning_rate": 0.0001, |
| "loss": 1.2142, |
| "loss/crossentropy": 2.462200880050659, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.12223749607801437, |
| "loss/reg": 0.02286742813885212, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.066125, |
| "grad_norm": 3.0347273349761963, |
| "grad_norm_var": 85.66251244998139, |
| "learning_rate": 0.0001, |
| "loss": 1.0285, |
| "loss/crossentropy": 2.38840651512146, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.13581448793411255, |
| "loss/reg": 0.022858494892716408, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.06625, |
| "grad_norm": 2.5890092849731445, |
| "grad_norm_var": 86.04634847608627, |
| "learning_rate": 0.0001, |
| "loss": 0.967, |
| "loss/crossentropy": 2.5042731761932373, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.10961504280567169, |
| "loss/reg": 0.02284966967999935, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.066375, |
| "grad_norm": 3.3963401317596436, |
| "grad_norm_var": 85.93485657047692, |
| "learning_rate": 0.0001, |
| "loss": 1.072, |
| "loss/crossentropy": 2.7259116172790527, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.13268503546714783, |
| "loss/reg": 0.02284088172018528, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.0665, |
| "grad_norm": 3.731293201446533, |
| "grad_norm_var": 85.85653980693046, |
| "learning_rate": 0.0001, |
| "loss": 1.0647, |
| "loss/crossentropy": 2.277968168258667, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.12150134146213531, |
| "loss/reg": 0.02283208817243576, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.066625, |
| "grad_norm": 4.581428050994873, |
| "grad_norm_var": 85.78540060231343, |
| "learning_rate": 0.0001, |
| "loss": 1.2486, |
| "loss/crossentropy": 2.067720890045166, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.17660680413246155, |
| "loss/reg": 0.02282322198152542, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.06675, |
| "grad_norm": 3.0526421070098877, |
| "grad_norm_var": 85.91535378874303, |
| "learning_rate": 0.0001, |
| "loss": 1.1318, |
| "loss/crossentropy": 2.4441521167755127, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.13023720681667328, |
| "loss/reg": 0.022814445197582245, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.066875, |
| "grad_norm": 3.4852664470672607, |
| "grad_norm_var": 86.12062292738887, |
| "learning_rate": 0.0001, |
| "loss": 0.9936, |
| "loss/crossentropy": 2.418733596801758, |
| "loss/hidden": 0.66015625, |
| "loss/logits": 0.10543158650398254, |
| "loss/reg": 0.022805610671639442, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 2.7321274280548096, |
| "grad_norm_var": 86.43545869041343, |
| "learning_rate": 0.0001, |
| "loss": 1.0096, |
| "loss/crossentropy": 2.3275394439697266, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.12536926567554474, |
| "loss/reg": 0.022797243669629097, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.067125, |
| "grad_norm": 3.029811382293701, |
| "grad_norm_var": 86.46041611877568, |
| "learning_rate": 0.0001, |
| "loss": 1.1049, |
| "loss/crossentropy": 2.2477617263793945, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.1309519112110138, |
| "loss/reg": 0.022788099944591522, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.06725, |
| "grad_norm": 2.7345895767211914, |
| "grad_norm_var": 86.64571497988939, |
| "learning_rate": 0.0001, |
| "loss": 1.1272, |
| "loss/crossentropy": 2.4289236068725586, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.141631618142128, |
| "loss/reg": 0.022778736427426338, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.067375, |
| "grad_norm": 3.2934482097625732, |
| "grad_norm_var": 86.6479928457627, |
| "learning_rate": 0.0001, |
| "loss": 1.0286, |
| "loss/crossentropy": 2.538973093032837, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.12509004771709442, |
| "loss/reg": 0.02277030609548092, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 3.833656072616577, |
| "grad_norm_var": 86.38133368804911, |
| "learning_rate": 0.0001, |
| "loss": 0.999, |
| "loss/crossentropy": 2.429593563079834, |
| "loss/hidden": 0.64453125, |
| "loss/logits": 0.12682604789733887, |
| "loss/reg": 0.022761952131986618, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.067625, |
| "grad_norm": 3.544104814529419, |
| "grad_norm_var": 86.28065873545309, |
| "learning_rate": 0.0001, |
| "loss": 1.0002, |
| "loss/crossentropy": 2.166292905807495, |
| "loss/hidden": 0.66015625, |
| "loss/logits": 0.11254848539829254, |
| "loss/reg": 0.02275264821946621, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.06775, |
| "grad_norm": 4.461411952972412, |
| "grad_norm_var": 0.4229304647166086, |
| "learning_rate": 0.0001, |
| "loss": 1.6203, |
| "loss/crossentropy": 2.6347737312316895, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.2600440979003906, |
| "loss/reg": 0.022744029760360718, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.067875, |
| "grad_norm": 2.6814959049224854, |
| "grad_norm_var": 0.46036790462302574, |
| "learning_rate": 0.0001, |
| "loss": 0.9923, |
| "loss/crossentropy": 2.580264091491699, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.11260214447975159, |
| "loss/reg": 0.022734828293323517, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 3.685408353805542, |
| "grad_norm_var": 0.35847800648438505, |
| "learning_rate": 0.0001, |
| "loss": 1.1452, |
| "loss/crossentropy": 2.7129862308502197, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.13279825448989868, |
| "loss/reg": 0.022726204246282578, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.068125, |
| "grad_norm": 6.349724292755127, |
| "grad_norm_var": 0.8985836730566762, |
| "learning_rate": 0.0001, |
| "loss": 1.1353, |
| "loss/crossentropy": 2.7293214797973633, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.2518823742866516, |
| "loss/reg": 0.022716930136084557, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.06825, |
| "grad_norm": 3.681774616241455, |
| "grad_norm_var": 0.829722440393675, |
| "learning_rate": 0.0001, |
| "loss": 1.1759, |
| "loss/crossentropy": 2.278223991394043, |
| "loss/hidden": 0.8046875, |
| "loss/logits": 0.1441642791032791, |
| "loss/reg": 0.022707859054207802, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.068375, |
| "grad_norm": 3.84778094291687, |
| "grad_norm_var": 0.8276635905853821, |
| "learning_rate": 0.0001, |
| "loss": 1.3398, |
| "loss/crossentropy": 2.1253304481506348, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.16354900598526, |
| "loss/reg": 0.022699227556586266, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.0685, |
| "grad_norm": 5.178676605224609, |
| "grad_norm_var": 0.9703527182714744, |
| "learning_rate": 0.0001, |
| "loss": 1.022, |
| "loss/crossentropy": 2.7832319736480713, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.11932960152626038, |
| "loss/reg": 0.02269013226032257, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.068625, |
| "grad_norm": 3.0284383296966553, |
| "grad_norm_var": 0.9511722709094016, |
| "learning_rate": 0.0001, |
| "loss": 1.0412, |
| "loss/crossentropy": 1.9599052667617798, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.11125050485134125, |
| "loss/reg": 0.022681355476379395, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.06875, |
| "grad_norm": 3.167809247970581, |
| "grad_norm_var": 0.942616955302132, |
| "learning_rate": 0.0001, |
| "loss": 1.0545, |
| "loss/crossentropy": 2.5844783782958984, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.13634443283081055, |
| "loss/reg": 0.02267223782837391, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.068875, |
| "grad_norm": 3.2949278354644775, |
| "grad_norm_var": 0.9495941353113788, |
| "learning_rate": 0.0001, |
| "loss": 1.0619, |
| "loss/crossentropy": 2.314173460006714, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.12046810984611511, |
| "loss/reg": 0.022662866860628128, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 2.8322856426239014, |
| "grad_norm_var": 0.9378422714313653, |
| "learning_rate": 0.0001, |
| "loss": 1.0272, |
| "loss/crossentropy": 2.768298387527466, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.1326732635498047, |
| "loss/reg": 0.022653890773653984, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.069125, |
| "grad_norm": 6.815005779266357, |
| "grad_norm_var": 1.5125797637252996, |
| "learning_rate": 0.0001, |
| "loss": 1.1457, |
| "loss/crossentropy": 2.8011491298675537, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.13409599661827087, |
| "loss/reg": 0.022644398733973503, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.06925, |
| "grad_norm": 4.219581127166748, |
| "grad_norm_var": 1.4192768991356985, |
| "learning_rate": 0.0001, |
| "loss": 1.2504, |
| "loss/crossentropy": 2.5874292850494385, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.17252284288406372, |
| "loss/reg": 0.02263464592397213, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.069375, |
| "grad_norm": 3.8279531002044678, |
| "grad_norm_var": 1.3871550629864857, |
| "learning_rate": 0.0001, |
| "loss": 1.1814, |
| "loss/crossentropy": 2.5384669303894043, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.1543978452682495, |
| "loss/reg": 0.022625621408224106, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.0695, |
| "grad_norm": 3.563680648803711, |
| "grad_norm_var": 1.3987108056073487, |
| "learning_rate": 0.0001, |
| "loss": 1.0727, |
| "loss/crossentropy": 2.111318349838257, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.10826431214809418, |
| "loss/reg": 0.022616824135184288, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.069625, |
| "grad_norm": 3.9599223136901855, |
| "grad_norm_var": 1.3836174934919199, |
| "learning_rate": 0.0001, |
| "loss": 1.0309, |
| "loss/crossentropy": 2.429032325744629, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.12908612191677094, |
| "loss/reg": 0.022607678547501564, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.06975, |
| "grad_norm": 2.8072519302368164, |
| "grad_norm_var": 1.4610802306207225, |
| "learning_rate": 0.0001, |
| "loss": 1.0694, |
| "loss/crossentropy": 2.5817906856536865, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.15589380264282227, |
| "loss/reg": 0.022598396986722946, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.069875, |
| "grad_norm": 2.764833927154541, |
| "grad_norm_var": 1.4475983977607596, |
| "learning_rate": 0.0001, |
| "loss": 1.0233, |
| "loss/crossentropy": 2.202237844467163, |
| "loss/hidden": 0.66015625, |
| "loss/logits": 0.13721255958080292, |
| "loss/reg": 0.02258932963013649, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 5.113494396209717, |
| "grad_norm_var": 1.5267634464669517, |
| "learning_rate": 0.0001, |
| "loss": 1.1672, |
| "loss/crossentropy": 2.006186008453369, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.14061376452445984, |
| "loss/reg": 0.022580046206712723, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.070125, |
| "grad_norm": 3.8133227825164795, |
| "grad_norm_var": 1.1437787263680603, |
| "learning_rate": 0.0001, |
| "loss": 1.0148, |
| "loss/crossentropy": 2.37170672416687, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.11329137533903122, |
| "loss/reg": 0.02257111482322216, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.07025, |
| "grad_norm": 2.7208938598632812, |
| "grad_norm_var": 1.2255733087022764, |
| "learning_rate": 0.0001, |
| "loss": 1.0621, |
| "loss/crossentropy": 2.271667242050171, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.1333208978176117, |
| "loss/reg": 0.02256210334599018, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.070375, |
| "grad_norm": 53.38179016113281, |
| "grad_norm_var": 154.8279377341773, |
| "learning_rate": 0.0001, |
| "loss": 0.9618, |
| "loss/crossentropy": 2.2667322158813477, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.09567096829414368, |
| "loss/reg": 0.02255306765437126, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.0705, |
| "grad_norm": 5.538954257965088, |
| "grad_norm_var": 154.75309317540348, |
| "learning_rate": 0.0001, |
| "loss": 1.782, |
| "loss/crossentropy": 2.5168824195861816, |
| "loss/hidden": 1.3671875, |
| "loss/logits": 0.1893981695175171, |
| "loss/reg": 0.02254408597946167, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.070625, |
| "grad_norm": 2.8311338424682617, |
| "grad_norm_var": 154.85811657117597, |
| "learning_rate": 0.0001, |
| "loss": 1.1104, |
| "loss/crossentropy": 2.1938281059265137, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.13508498668670654, |
| "loss/reg": 0.02253509685397148, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.07075, |
| "grad_norm": 4.604408264160156, |
| "grad_norm_var": 154.26918998432618, |
| "learning_rate": 0.0001, |
| "loss": 1.2995, |
| "loss/crossentropy": 2.103062391281128, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.16405051946640015, |
| "loss/reg": 0.02252543345093727, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.070875, |
| "grad_norm": 4.917901992797852, |
| "grad_norm_var": 153.63084329919155, |
| "learning_rate": 0.0001, |
| "loss": 1.2945, |
| "loss/crossentropy": 2.6932315826416016, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.1904207468032837, |
| "loss/reg": 0.022515632212162018, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 4.055351257324219, |
| "grad_norm_var": 153.02723135387427, |
| "learning_rate": 0.0001, |
| "loss": 1.2253, |
| "loss/crossentropy": 2.3849422931671143, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.1642536073923111, |
| "loss/reg": 0.02250652387738228, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.071125, |
| "grad_norm": 2.851072072982788, |
| "grad_norm_var": 154.20402053832456, |
| "learning_rate": 0.0001, |
| "loss": 1.0724, |
| "loss/crossentropy": 2.580428123474121, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.144349604845047, |
| "loss/reg": 0.022497190162539482, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.07125, |
| "grad_norm": 3.257493495941162, |
| "grad_norm_var": 154.6102933496205, |
| "learning_rate": 0.0001, |
| "loss": 0.9328, |
| "loss/crossentropy": 2.4263837337493896, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.11414404958486557, |
| "loss/reg": 0.02248740941286087, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.071375, |
| "grad_norm": 3.6194608211517334, |
| "grad_norm_var": 154.69773136421824, |
| "learning_rate": 0.0001, |
| "loss": 1.0417, |
| "loss/crossentropy": 2.7035937309265137, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.12549756467342377, |
| "loss/reg": 0.022477447986602783, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.0715, |
| "grad_norm": 4.193033695220947, |
| "grad_norm_var": 154.44566535859553, |
| "learning_rate": 0.0001, |
| "loss": 1.6505, |
| "loss/crossentropy": 2.274057149887085, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.26177138090133667, |
| "loss/reg": 0.022467276081442833, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.071625, |
| "grad_norm": 2.7127623558044434, |
| "grad_norm_var": 155.03209308401435, |
| "learning_rate": 0.0001, |
| "loss": 0.9118, |
| "loss/crossentropy": 2.4076664447784424, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.10521911084651947, |
| "loss/reg": 0.022456735372543335, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.07175, |
| "grad_norm": 3.5871214866638184, |
| "grad_norm_var": 154.65243889362196, |
| "learning_rate": 0.0001, |
| "loss": 0.9755, |
| "loss/crossentropy": 2.8088905811309814, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.12216061353683472, |
| "loss/reg": 0.022446416318416595, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.071875, |
| "grad_norm": 20.843276977539062, |
| "grad_norm_var": 165.17750310305152, |
| "learning_rate": 0.0001, |
| "loss": 1.1707, |
| "loss/crossentropy": 2.569218635559082, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.1221412867307663, |
| "loss/reg": 0.02243630215525627, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 3.375251293182373, |
| "grad_norm_var": 166.03594003131977, |
| "learning_rate": 0.0001, |
| "loss": 0.9923, |
| "loss/crossentropy": 2.7540183067321777, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.12738245725631714, |
| "loss/reg": 0.022425668314099312, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.072125, |
| "grad_norm": 3.457340717315674, |
| "grad_norm_var": 166.23754433202586, |
| "learning_rate": 0.0001, |
| "loss": 1.0371, |
| "loss/crossentropy": 2.2943716049194336, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.14497271180152893, |
| "loss/reg": 0.022415172308683395, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.07225, |
| "grad_norm": 2.966371774673462, |
| "grad_norm_var": 166.07272256293106, |
| "learning_rate": 0.0001, |
| "loss": 1.2044, |
| "loss/crossentropy": 2.760359048843384, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.16780292987823486, |
| "loss/reg": 0.022406071424484253, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.072375, |
| "grad_norm": 2.5532329082489014, |
| "grad_norm_var": 19.219812763276813, |
| "learning_rate": 0.0001, |
| "loss": 0.9996, |
| "loss/crossentropy": 2.427797794342041, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.10767525434494019, |
| "loss/reg": 0.02239692024886608, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 3.250906229019165, |
| "grad_norm_var": 19.294198335433887, |
| "learning_rate": 0.0001, |
| "loss": 1.1148, |
| "loss/crossentropy": 2.29978084564209, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.1408957540988922, |
| "loss/reg": 0.0223868228495121, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.072625, |
| "grad_norm": 3.8051562309265137, |
| "grad_norm_var": 19.12802354300362, |
| "learning_rate": 0.0001, |
| "loss": 1.2329, |
| "loss/crossentropy": 2.402350664138794, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.17713911831378937, |
| "loss/reg": 0.022376833483576775, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.07275, |
| "grad_norm": 4.466115951538086, |
| "grad_norm_var": 19.129656316190147, |
| "learning_rate": 0.0001, |
| "loss": 1.1846, |
| "loss/crossentropy": 2.5129504203796387, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.14841441810131073, |
| "loss/reg": 0.02236761339008808, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.072875, |
| "grad_norm": 3.1465752124786377, |
| "grad_norm_var": 19.255278342461487, |
| "learning_rate": 0.0001, |
| "loss": 1.1857, |
| "loss/crossentropy": 2.5782837867736816, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.14178822934627533, |
| "loss/reg": 0.022358402609825134, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 2.7474803924560547, |
| "grad_norm_var": 19.441256858474677, |
| "learning_rate": 0.0001, |
| "loss": 1.0532, |
| "loss/crossentropy": 2.4901747703552246, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.12661507725715637, |
| "loss/reg": 0.02234930731356144, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.073125, |
| "grad_norm": 2.7027359008789062, |
| "grad_norm_var": 19.47380183903334, |
| "learning_rate": 0.0001, |
| "loss": 0.8469, |
| "loss/crossentropy": 2.694582223892212, |
| "loss/hidden": 0.5390625, |
| "loss/logits": 0.08438676595687866, |
| "loss/reg": 0.022340187802910805, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.07325, |
| "grad_norm": 2.847770929336548, |
| "grad_norm_var": 19.547679388785195, |
| "learning_rate": 0.0001, |
| "loss": 0.9885, |
| "loss/crossentropy": 2.5558888912200928, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.11670757830142975, |
| "loss/reg": 0.022330984473228455, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.073375, |
| "grad_norm": 3.20444917678833, |
| "grad_norm_var": 19.601201389954156, |
| "learning_rate": 0.0001, |
| "loss": 1.2216, |
| "loss/crossentropy": 2.0031001567840576, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.1546502560377121, |
| "loss/reg": 0.022321749478578568, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.0735, |
| "grad_norm": 4.657837390899658, |
| "grad_norm_var": 19.6039707895662, |
| "learning_rate": 0.0001, |
| "loss": 1.1377, |
| "loss/crossentropy": 2.598100185394287, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.13329939544200897, |
| "loss/reg": 0.02231265790760517, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.073625, |
| "grad_norm": 3.1166787147521973, |
| "grad_norm_var": 19.52355503271277, |
| "learning_rate": 0.0001, |
| "loss": 1.304, |
| "loss/crossentropy": 2.2209084033966064, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.18648099899291992, |
| "loss/reg": 0.022303014993667603, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.07375, |
| "grad_norm": 2.988344669342041, |
| "grad_norm_var": 19.61249925539728, |
| "learning_rate": 0.0001, |
| "loss": 0.9412, |
| "loss/crossentropy": 2.4533708095550537, |
| "loss/hidden": 0.60546875, |
| "loss/logits": 0.11275988817214966, |
| "loss/reg": 0.022293319925665855, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.073875, |
| "grad_norm": 3.0553901195526123, |
| "grad_norm_var": 0.3491433903254536, |
| "learning_rate": 0.0001, |
| "loss": 1.0883, |
| "loss/crossentropy": 2.3986990451812744, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.13498544692993164, |
| "loss/reg": 0.022283662110567093, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 4.445681095123291, |
| "grad_norm_var": 0.43558600780207446, |
| "learning_rate": 0.0001, |
| "loss": 1.3195, |
| "loss/crossentropy": 2.335937976837158, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.19050292670726776, |
| "loss/reg": 0.022273709997534752, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.074125, |
| "grad_norm": 41.39726638793945, |
| "grad_norm_var": 91.00287624901027, |
| "learning_rate": 0.0001, |
| "loss": 1.2527, |
| "loss/crossentropy": 2.3441669940948486, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.1667976826429367, |
| "loss/reg": 0.02226419560611248, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.07425, |
| "grad_norm": 2.8557939529418945, |
| "grad_norm_var": 91.04408434440504, |
| "learning_rate": 0.0001, |
| "loss": 1.0612, |
| "loss/crossentropy": 2.380885362625122, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.1354852318763733, |
| "loss/reg": 0.022254258394241333, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.074375, |
| "grad_norm": 6.565737724304199, |
| "grad_norm_var": 90.36543928633743, |
| "learning_rate": 0.0001, |
| "loss": 1.2519, |
| "loss/crossentropy": 2.3877017498016357, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.1231798455119133, |
| "loss/reg": 0.022244345396757126, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.0745, |
| "grad_norm": 3.6746349334716797, |
| "grad_norm_var": 90.22397938232942, |
| "learning_rate": 0.0001, |
| "loss": 1.1718, |
| "loss/crossentropy": 2.397080898284912, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.1486300826072693, |
| "loss/reg": 0.022234413772821426, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.074625, |
| "grad_norm": 2.981614589691162, |
| "grad_norm_var": 90.50516196939815, |
| "learning_rate": 0.0001, |
| "loss": 1.043, |
| "loss/crossentropy": 2.37412428855896, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.12931303679943085, |
| "loss/reg": 0.022224588319659233, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.07475, |
| "grad_norm": 3.6531307697296143, |
| "grad_norm_var": 90.70497774366554, |
| "learning_rate": 0.0001, |
| "loss": 1.1272, |
| "loss/crossentropy": 2.6906161308288574, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.13550975918769836, |
| "loss/reg": 0.02221417799592018, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.074875, |
| "grad_norm": 3.751652240753174, |
| "grad_norm_var": 90.50753182721607, |
| "learning_rate": 0.0001, |
| "loss": 1.1618, |
| "loss/crossentropy": 2.6506831645965576, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.1468081772327423, |
| "loss/reg": 0.022204989567399025, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 3.228563070297241, |
| "grad_norm_var": 90.31879350061254, |
| "learning_rate": 0.0001, |
| "loss": 1.0791, |
| "loss/crossentropy": 2.381368398666382, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.13835087418556213, |
| "loss/reg": 0.022195899859070778, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.075125, |
| "grad_norm": 5.392886638641357, |
| "grad_norm_var": 89.60797997668053, |
| "learning_rate": 0.0001, |
| "loss": 1.2309, |
| "loss/crossentropy": 2.52596378326416, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.17307403683662415, |
| "loss/reg": 0.022185994312167168, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.07525, |
| "grad_norm": 3.2832775115966797, |
| "grad_norm_var": 89.43019603463321, |
| "learning_rate": 0.0001, |
| "loss": 1.1462, |
| "loss/crossentropy": 2.4442057609558105, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.15487736463546753, |
| "loss/reg": 0.02217610739171505, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.075375, |
| "grad_norm": 4.1014084815979, |
| "grad_norm_var": 89.1293068696746, |
| "learning_rate": 0.0001, |
| "loss": 1.0847, |
| "loss/crossentropy": 2.531064987182617, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.13643184304237366, |
| "loss/reg": 0.02216634899377823, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.0755, |
| "grad_norm": 3.656172037124634, |
| "grad_norm_var": 89.39756111673695, |
| "learning_rate": 0.0001, |
| "loss": 1.0161, |
| "loss/crossentropy": 2.470468044281006, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.11097002029418945, |
| "loss/reg": 0.02215682342648506, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.075625, |
| "grad_norm": 3.8061492443084717, |
| "grad_norm_var": 89.1498668494714, |
| "learning_rate": 0.0001, |
| "loss": 1.1195, |
| "loss/crossentropy": 2.469788074493408, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.12457874417304993, |
| "loss/reg": 0.022147687152028084, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.07575, |
| "grad_norm": 2.658135414123535, |
| "grad_norm_var": 89.29708722871554, |
| "learning_rate": 0.0001, |
| "loss": 1.0657, |
| "loss/crossentropy": 2.2552011013031006, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.12950366735458374, |
| "loss/reg": 0.022138802334666252, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.075875, |
| "grad_norm": 3.291750907897949, |
| "grad_norm_var": 89.20284122750789, |
| "learning_rate": 0.0001, |
| "loss": 1.133, |
| "loss/crossentropy": 2.225055456161499, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.1734064519405365, |
| "loss/reg": 0.022129878401756287, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 4.001339912414551, |
| "grad_norm_var": 89.31742762195415, |
| "learning_rate": 0.0001, |
| "loss": 0.9658, |
| "loss/crossentropy": 2.3826169967651367, |
| "loss/hidden": 0.64453125, |
| "loss/logits": 0.10007497668266296, |
| "loss/reg": 0.022121025249361992, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.076125, |
| "grad_norm": 2.9729602336883545, |
| "grad_norm_var": 0.9817241823775095, |
| "learning_rate": 0.0001, |
| "loss": 1.1962, |
| "loss/crossentropy": 2.4926881790161133, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.1508128046989441, |
| "loss/reg": 0.022111859172582626, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.07625, |
| "grad_norm": 2.5144121646881104, |
| "grad_norm_var": 1.0293551003726749, |
| "learning_rate": 0.0001, |
| "loss": 0.9883, |
| "loss/crossentropy": 2.478576898574829, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.11488830298185349, |
| "loss/reg": 0.02210419811308384, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.076375, |
| "grad_norm": 3.7777743339538574, |
| "grad_norm_var": 0.4576308797359, |
| "learning_rate": 0.0001, |
| "loss": 1.0059, |
| "loss/crossentropy": 2.4356257915496826, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.13651525974273682, |
| "loss/reg": 0.022095149382948875, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.0765, |
| "grad_norm": 3.232398748397827, |
| "grad_norm_var": 0.4623055923756754, |
| "learning_rate": 0.0001, |
| "loss": 1.0108, |
| "loss/crossentropy": 2.4058218002319336, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.11410736292600632, |
| "loss/reg": 0.022087210789322853, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.076625, |
| "grad_norm": 2.61934757232666, |
| "grad_norm_var": 0.49646373584008807, |
| "learning_rate": 0.0001, |
| "loss": 0.9466, |
| "loss/crossentropy": 2.3981077671051025, |
| "loss/hidden": 0.59765625, |
| "loss/logits": 0.1281721591949463, |
| "loss/reg": 0.022078126668930054, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.07675, |
| "grad_norm": 4.228268146514893, |
| "grad_norm_var": 0.5291615579452734, |
| "learning_rate": 0.0001, |
| "loss": 1.0729, |
| "loss/crossentropy": 2.664149761199951, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.12176868319511414, |
| "loss/reg": 0.02207016758620739, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.076875, |
| "grad_norm": 2.9137685298919678, |
| "grad_norm_var": 0.5485319535320492, |
| "learning_rate": 0.0001, |
| "loss": 0.8708, |
| "loss/crossentropy": 2.5457890033721924, |
| "loss/hidden": 0.55078125, |
| "loss/logits": 0.09941907972097397, |
| "loss/reg": 0.022060981020331383, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 3.4499685764312744, |
| "grad_norm_var": 0.5441756848342263, |
| "learning_rate": 0.0001, |
| "loss": 1.1439, |
| "loss/crossentropy": 2.3822786808013916, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.14211627840995789, |
| "loss/reg": 0.02205180749297142, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.077125, |
| "grad_norm": 2.271921157836914, |
| "grad_norm_var": 0.36266744154546565, |
| "learning_rate": 0.0001, |
| "loss": 0.9061, |
| "loss/crossentropy": 2.707848072052002, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.10360105335712433, |
| "loss/reg": 0.022042402997612953, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.07725, |
| "grad_norm": 3.16980242729187, |
| "grad_norm_var": 0.36370543210803513, |
| "learning_rate": 0.0001, |
| "loss": 1.2733, |
| "loss/crossentropy": 2.1980466842651367, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.18972548842430115, |
| "loss/reg": 0.02203306369483471, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.077375, |
| "grad_norm": 2.4408295154571533, |
| "grad_norm_var": 0.3567501583972517, |
| "learning_rate": 0.0001, |
| "loss": 1.1352, |
| "loss/crossentropy": 2.3986692428588867, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.1454274207353592, |
| "loss/reg": 0.022023871541023254, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 2.558687925338745, |
| "grad_norm_var": 0.36349398943808237, |
| "learning_rate": 0.0001, |
| "loss": 0.9776, |
| "loss/crossentropy": 2.4290616512298584, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.1012360006570816, |
| "loss/reg": 0.0220141913741827, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.077625, |
| "grad_norm": 3.7468619346618652, |
| "grad_norm_var": 0.3582835152003213, |
| "learning_rate": 0.0001, |
| "loss": 1.1635, |
| "loss/crossentropy": 2.4566454887390137, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.15830263495445251, |
| "loss/reg": 0.02200442925095558, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.07775, |
| "grad_norm": 3.8364651203155518, |
| "grad_norm_var": 0.3732032502257139, |
| "learning_rate": 0.0001, |
| "loss": 1.3213, |
| "loss/crossentropy": 2.5934860706329346, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.17167437076568604, |
| "loss/reg": 0.021995313465595245, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.077875, |
| "grad_norm": 2.9136173725128174, |
| "grad_norm_var": 0.37696739372618043, |
| "learning_rate": 0.0001, |
| "loss": 1.1191, |
| "loss/crossentropy": 2.1625287532806396, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.1492336541414261, |
| "loss/reg": 0.021986283361911774, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 3.3136067390441895, |
| "grad_norm_var": 0.3298862344756988, |
| "learning_rate": 0.0001, |
| "loss": 1.1487, |
| "loss/crossentropy": 2.3915464878082275, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.1398892104625702, |
| "loss/reg": 0.021976841613650322, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.078125, |
| "grad_norm": 3.1731748580932617, |
| "grad_norm_var": 0.3283984444798058, |
| "learning_rate": 0.0001, |
| "loss": 1.0945, |
| "loss/crossentropy": 2.53873610496521, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.16392138600349426, |
| "loss/reg": 0.021966535598039627, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.07825, |
| "grad_norm": 4.172556400299072, |
| "grad_norm_var": 0.3630228628347131, |
| "learning_rate": 0.0001, |
| "loss": 1.1888, |
| "loss/crossentropy": 2.4219956398010254, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.17627781629562378, |
| "loss/reg": 0.021956363692879677, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.078375, |
| "grad_norm": 3.99817156791687, |
| "grad_norm_var": 0.3819004722530487, |
| "learning_rate": 0.0001, |
| "loss": 1.3074, |
| "loss/crossentropy": 2.2407050132751465, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.20513707399368286, |
| "loss/reg": 0.02194611169397831, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.0785, |
| "grad_norm": 3.2869558334350586, |
| "grad_norm_var": 0.3819405314837089, |
| "learning_rate": 0.0001, |
| "loss": 0.923, |
| "loss/crossentropy": 2.5824477672576904, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.10985252261161804, |
| "loss/reg": 0.021935785189270973, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.078625, |
| "grad_norm": 3.0037360191345215, |
| "grad_norm_var": 0.35855200267849247, |
| "learning_rate": 0.0001, |
| "loss": 1.0636, |
| "loss/crossentropy": 2.4916722774505615, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.13343587517738342, |
| "loss/reg": 0.021925168111920357, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.07875, |
| "grad_norm": 3.5274088382720947, |
| "grad_norm_var": 0.3006291732182412, |
| "learning_rate": 0.0001, |
| "loss": 1.1441, |
| "loss/crossentropy": 1.7279773950576782, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.13975940644741058, |
| "loss/reg": 0.02191445231437683, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.078875, |
| "grad_norm": 3.2895989418029785, |
| "grad_norm_var": 0.29330515223301745, |
| "learning_rate": 0.0001, |
| "loss": 1.0394, |
| "loss/crossentropy": 2.370850086212158, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.13678821921348572, |
| "loss/reg": 0.02190525084733963, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 3.741135597229004, |
| "grad_norm_var": 0.30599490652712474, |
| "learning_rate": 0.0001, |
| "loss": 1.2655, |
| "loss/crossentropy": 2.365464210510254, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.16766133904457092, |
| "loss/reg": 0.02189476415514946, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.079125, |
| "grad_norm": 4.110158443450928, |
| "grad_norm_var": 0.2706546096444164, |
| "learning_rate": 0.0001, |
| "loss": 1.1804, |
| "loss/crossentropy": 2.658433437347412, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.1646716445684433, |
| "loss/reg": 0.021885616704821587, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.07925, |
| "grad_norm": 3.7503654956817627, |
| "grad_norm_var": 0.27446839769864156, |
| "learning_rate": 0.0001, |
| "loss": 1.1, |
| "loss/crossentropy": 2.4457831382751465, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.13510534167289734, |
| "loss/reg": 0.021875550970435143, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.079375, |
| "grad_norm": 2.917835235595703, |
| "grad_norm_var": 0.22584356567046956, |
| "learning_rate": 0.0001, |
| "loss": 1.1498, |
| "loss/crossentropy": 2.3328001499176025, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.1537725031375885, |
| "loss/reg": 0.021865583956241608, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.0795, |
| "grad_norm": 2.907240152359009, |
| "grad_norm_var": 0.19160647764443486, |
| "learning_rate": 0.0001, |
| "loss": 0.9073, |
| "loss/crossentropy": 2.556042432785034, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.09503352642059326, |
| "loss/reg": 0.02185530960559845, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.079625, |
| "grad_norm": 2.5937955379486084, |
| "grad_norm_var": 0.2337615816576618, |
| "learning_rate": 0.0001, |
| "loss": 0.9198, |
| "loss/crossentropy": 2.363370180130005, |
| "loss/hidden": 0.5859375, |
| "loss/logits": 0.11537887156009674, |
| "loss/reg": 0.021845519542694092, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.07975, |
| "grad_norm": 3.9080708026885986, |
| "grad_norm_var": 0.2381681132369368, |
| "learning_rate": 0.0001, |
| "loss": 1.1552, |
| "loss/crossentropy": 2.5054194927215576, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.12043754756450653, |
| "loss/reg": 0.021835271269083023, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.079875, |
| "grad_norm": 2.8192946910858154, |
| "grad_norm_var": 0.24500412598165416, |
| "learning_rate": 0.0001, |
| "loss": 0.9375, |
| "loss/crossentropy": 2.6611719131469727, |
| "loss/hidden": 0.60546875, |
| "loss/logits": 0.11382012814283371, |
| "loss/reg": 0.021824965253472328, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 4.111716270446777, |
| "grad_norm_var": 0.27486954530744445, |
| "learning_rate": 0.0001, |
| "loss": 1.2972, |
| "loss/crossentropy": 2.472568988800049, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.16890740394592285, |
| "loss/reg": 0.02181575633585453, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.080125, |
| "grad_norm": 5.715061187744141, |
| "grad_norm_var": 0.5825168124345791, |
| "learning_rate": 0.0001, |
| "loss": 1.0307, |
| "loss/crossentropy": 2.2123966217041016, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.12899596989154816, |
| "loss/reg": 0.021806620061397552, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.08025, |
| "grad_norm": 3.8680315017700195, |
| "grad_norm_var": 0.5657073815126407, |
| "learning_rate": 0.0001, |
| "loss": 1.053, |
| "loss/crossentropy": 2.4526453018188477, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.13190723955631256, |
| "loss/reg": 0.021797508001327515, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.080375, |
| "grad_norm": 4.992859363555908, |
| "grad_norm_var": 0.680778895488037, |
| "learning_rate": 0.0001, |
| "loss": 1.1683, |
| "loss/crossentropy": 2.3445968627929688, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.12617573142051697, |
| "loss/reg": 0.02178841643035412, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.0805, |
| "grad_norm": 4.339104175567627, |
| "grad_norm_var": 0.6977811040599325, |
| "learning_rate": 0.0001, |
| "loss": 1.3433, |
| "loss/crossentropy": 1.8577097654342651, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.18017446994781494, |
| "loss/reg": 0.02177964523434639, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.080625, |
| "grad_norm": 4.026562213897705, |
| "grad_norm_var": 0.6648423545945282, |
| "learning_rate": 0.0001, |
| "loss": 1.3266, |
| "loss/crossentropy": 2.415410041809082, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.24951061606407166, |
| "loss/reg": 0.021770501509308815, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.08075, |
| "grad_norm": 2.8517775535583496, |
| "grad_norm_var": 0.7169049906385166, |
| "learning_rate": 0.0001, |
| "loss": 0.9283, |
| "loss/crossentropy": 2.6426851749420166, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.10916159301996231, |
| "loss/reg": 0.021761184558272362, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.080875, |
| "grad_norm": 3.3171377182006836, |
| "grad_norm_var": 0.7152750431492562, |
| "learning_rate": 0.0001, |
| "loss": 1.0088, |
| "loss/crossentropy": 2.638533115386963, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.1623907834291458, |
| "loss/reg": 0.02175196446478367, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 3.653881311416626, |
| "grad_norm_var": 0.7158322952113887, |
| "learning_rate": 0.0001, |
| "loss": 1.1132, |
| "loss/crossentropy": 2.745229482650757, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.1301097273826599, |
| "loss/reg": 0.021742329001426697, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.081125, |
| "grad_norm": 4.31439733505249, |
| "grad_norm_var": 0.728446489341123, |
| "learning_rate": 0.0001, |
| "loss": 1.2706, |
| "loss/crossentropy": 2.7004082202911377, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.25250470638275146, |
| "loss/reg": 0.021732579916715622, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.08125, |
| "grad_norm": 4.621687889099121, |
| "grad_norm_var": 0.7753064642270201, |
| "learning_rate": 0.0001, |
| "loss": 1.4091, |
| "loss/crossentropy": 2.38405179977417, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.29737186431884766, |
| "loss/reg": 0.02172265760600567, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.081375, |
| "grad_norm": 2.87424898147583, |
| "grad_norm_var": 0.7806094534209419, |
| "learning_rate": 0.0001, |
| "loss": 1.2103, |
| "loss/crossentropy": 2.0905954837799072, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.18066659569740295, |
| "loss/reg": 0.021712414920330048, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.0815, |
| "grad_norm": 4.724517345428467, |
| "grad_norm_var": 0.7689569917943563, |
| "learning_rate": 0.0001, |
| "loss": 1.2928, |
| "loss/crossentropy": 2.500594139099121, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.1812204122543335, |
| "loss/reg": 0.021703310310840607, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.081625, |
| "grad_norm": 2.8673081398010254, |
| "grad_norm_var": 0.7252403996552116, |
| "learning_rate": 0.0001, |
| "loss": 1.0588, |
| "loss/crossentropy": 2.3599724769592285, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.13873916864395142, |
| "loss/reg": 0.021693557500839233, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.08175, |
| "grad_norm": 3.099315643310547, |
| "grad_norm_var": 0.7693322976491117, |
| "learning_rate": 0.0001, |
| "loss": 1.1961, |
| "loss/crossentropy": 2.3167355060577393, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.14727652072906494, |
| "loss/reg": 0.021683741360902786, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.081875, |
| "grad_norm": 3.287602186203003, |
| "grad_norm_var": 0.7163515778113151, |
| "learning_rate": 0.0001, |
| "loss": 1.0429, |
| "loss/crossentropy": 2.7967636585235596, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.13863959908485413, |
| "loss/reg": 0.021674364805221558, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 3.487874746322632, |
| "grad_norm_var": 0.7244436337536264, |
| "learning_rate": 0.0001, |
| "loss": 1.0883, |
| "loss/crossentropy": 2.5409998893737793, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.1528949737548828, |
| "loss/reg": 0.02166520059108734, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.082125, |
| "grad_norm": 3.6202752590179443, |
| "grad_norm_var": 0.4854858648423857, |
| "learning_rate": 0.0001, |
| "loss": 1.2516, |
| "loss/crossentropy": 2.3633673191070557, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.14828172326087952, |
| "loss/reg": 0.021655315533280373, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.08225, |
| "grad_norm": 3.7921783924102783, |
| "grad_norm_var": 0.484617963461113, |
| "learning_rate": 0.0001, |
| "loss": 1.07, |
| "loss/crossentropy": 2.497013807296753, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.13874131441116333, |
| "loss/reg": 0.021646033972501755, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.082375, |
| "grad_norm": 5.818857669830322, |
| "grad_norm_var": 0.6650298211735747, |
| "learning_rate": 0.0001, |
| "loss": 1.2961, |
| "loss/crossentropy": 2.6599833965301514, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.16955840587615967, |
| "loss/reg": 0.021636882796883583, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 3.465527057647705, |
| "grad_norm_var": 0.6491808619433999, |
| "learning_rate": 0.0001, |
| "loss": 1.1365, |
| "loss/crossentropy": 2.5972306728363037, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.14674408733844757, |
| "loss/reg": 0.02162766456604004, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.082625, |
| "grad_norm": 3.143159866333008, |
| "grad_norm_var": 0.664078497493661, |
| "learning_rate": 0.0001, |
| "loss": 1.2241, |
| "loss/crossentropy": 2.5945792198181152, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.17588043212890625, |
| "loss/reg": 0.021618474274873734, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.08275, |
| "grad_norm": 3.7091660499572754, |
| "grad_norm_var": 0.6149151800980365, |
| "learning_rate": 0.0001, |
| "loss": 1.1901, |
| "loss/crossentropy": 2.631565809249878, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.16149985790252686, |
| "loss/reg": 0.021609637886285782, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.082875, |
| "grad_norm": 3.1449224948883057, |
| "grad_norm_var": 0.6264170707357067, |
| "learning_rate": 0.0001, |
| "loss": 1.1491, |
| "loss/crossentropy": 2.2890501022338867, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.14797118306159973, |
| "loss/reg": 0.021600957959890366, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 3.096752405166626, |
| "grad_norm_var": 0.6512152784754629, |
| "learning_rate": 0.0001, |
| "loss": 1.0925, |
| "loss/crossentropy": 2.767069101333618, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.1343650072813034, |
| "loss/reg": 0.021592585369944572, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.083125, |
| "grad_norm": 3.8935883045196533, |
| "grad_norm_var": 0.6273466460071402, |
| "learning_rate": 0.0001, |
| "loss": 1.0996, |
| "loss/crossentropy": 2.4779109954833984, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.12599951028823853, |
| "loss/reg": 0.021583350375294685, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.08325, |
| "grad_norm": 3.6881141662597656, |
| "grad_norm_var": 0.5627883047301658, |
| "learning_rate": 0.0001, |
| "loss": 1.1184, |
| "loss/crossentropy": 2.1603968143463135, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.14095276594161987, |
| "loss/reg": 0.021574225276708603, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.083375, |
| "grad_norm": 2.9750800132751465, |
| "grad_norm_var": 0.5535713466115603, |
| "learning_rate": 0.0001, |
| "loss": 1.1215, |
| "loss/crossentropy": 2.4817895889282227, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.15194162726402283, |
| "loss/reg": 0.02156493254005909, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.0835, |
| "grad_norm": 2.649543046951294, |
| "grad_norm_var": 0.5152581471177806, |
| "learning_rate": 0.0001, |
| "loss": 0.918, |
| "loss/crossentropy": 2.334226608276367, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.10870292782783508, |
| "loss/reg": 0.02155502513051033, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.083625, |
| "grad_norm": 4.535495281219482, |
| "grad_norm_var": 0.5520843285134825, |
| "learning_rate": 0.0001, |
| "loss": 1.168, |
| "loss/crossentropy": 2.4633467197418213, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.15570059418678284, |
| "loss/reg": 0.021545063704252243, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.08375, |
| "grad_norm": 3.3291449546813965, |
| "grad_norm_var": 0.5404115229162234, |
| "learning_rate": 0.0001, |
| "loss": 1.1241, |
| "loss/crossentropy": 2.317607879638672, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.12363065779209137, |
| "loss/reg": 0.021535001695156097, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.083875, |
| "grad_norm": 4.071917533874512, |
| "grad_norm_var": 0.5459456401930327, |
| "learning_rate": 0.0001, |
| "loss": 1.1652, |
| "loss/crossentropy": 2.4951958656311035, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.11796893179416656, |
| "loss/reg": 0.021524924784898758, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 4.647782802581787, |
| "grad_norm_var": 0.6047501670354971, |
| "learning_rate": 0.0001, |
| "loss": 1.157, |
| "loss/crossentropy": 2.6994447708129883, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.16059228777885437, |
| "loss/reg": 0.021514689549803734, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.084125, |
| "grad_norm": 4.240062713623047, |
| "grad_norm_var": 0.6201999433703328, |
| "learning_rate": 0.0001, |
| "loss": 1.0984, |
| "loss/crossentropy": 2.592426300048828, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.10989370942115784, |
| "loss/reg": 0.021505359560251236, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.08425, |
| "grad_norm": 15.647865295410156, |
| "grad_norm_var": 9.451818582857973, |
| "learning_rate": 0.0001, |
| "loss": 1.6418, |
| "loss/crossentropy": 2.723198413848877, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.3486996293067932, |
| "loss/reg": 0.02149534970521927, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.084375, |
| "grad_norm": 3.6480963230133057, |
| "grad_norm_var": 9.365638761154676, |
| "learning_rate": 0.0001, |
| "loss": 1.1052, |
| "loss/crossentropy": 2.618795871734619, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.1559816300868988, |
| "loss/reg": 0.02148519456386566, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.0845, |
| "grad_norm": 3.5789527893066406, |
| "grad_norm_var": 9.352796045350159, |
| "learning_rate": 0.0001, |
| "loss": 1.1092, |
| "loss/crossentropy": 2.4817607402801514, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.14444658160209656, |
| "loss/reg": 0.021475963294506073, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.084625, |
| "grad_norm": 3.143718719482422, |
| "grad_norm_var": 9.352704277495931, |
| "learning_rate": 0.0001, |
| "loss": 1.0394, |
| "loss/crossentropy": 2.28035044670105, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.11767329275608063, |
| "loss/reg": 0.02146601676940918, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.08475, |
| "grad_norm": 4.2889580726623535, |
| "grad_norm_var": 9.322240526517621, |
| "learning_rate": 0.0001, |
| "loss": 1.1067, |
| "loss/crossentropy": 2.28658390045166, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.18121060729026794, |
| "loss/reg": 0.021456118673086166, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.084875, |
| "grad_norm": 2.800516128540039, |
| "grad_norm_var": 9.387804829955044, |
| "learning_rate": 0.0001, |
| "loss": 0.975, |
| "loss/crossentropy": 2.699044704437256, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.12379397451877594, |
| "loss/reg": 0.02144702896475792, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 3.064215898513794, |
| "grad_norm_var": 9.393480165725077, |
| "learning_rate": 0.0001, |
| "loss": 1.1186, |
| "loss/crossentropy": 2.547557830810547, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.14249341189861298, |
| "loss/reg": 0.02143782004714012, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.085125, |
| "grad_norm": 3.7314915657043457, |
| "grad_norm_var": 9.405801361337378, |
| "learning_rate": 0.0001, |
| "loss": 1.1376, |
| "loss/crossentropy": 2.705601930618286, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.14988452196121216, |
| "loss/reg": 0.02142806351184845, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.08525, |
| "grad_norm": 2.256387948989868, |
| "grad_norm_var": 9.665529326305519, |
| "learning_rate": 0.0001, |
| "loss": 0.945, |
| "loss/crossentropy": 2.464989423751831, |
| "loss/hidden": 0.62109375, |
| "loss/logits": 0.10975323617458344, |
| "loss/reg": 0.02141808532178402, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.085375, |
| "grad_norm": 3.150348424911499, |
| "grad_norm_var": 9.636765682886749, |
| "learning_rate": 0.0001, |
| "loss": 1.0573, |
| "loss/crossentropy": 2.2520275115966797, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.12059713900089264, |
| "loss/reg": 0.021408328786492348, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.0855, |
| "grad_norm": 2.973684549331665, |
| "grad_norm_var": 9.572043410499656, |
| "learning_rate": 0.0001, |
| "loss": 1.0663, |
| "loss/crossentropy": 2.4148662090301514, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.11397817730903625, |
| "loss/reg": 0.021399127319455147, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.085625, |
| "grad_norm": 4.39288330078125, |
| "grad_norm_var": 9.56920341692891, |
| "learning_rate": 0.0001, |
| "loss": 1.206, |
| "loss/crossentropy": 2.5025415420532227, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.1444375216960907, |
| "loss/reg": 0.021389208734035492, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.08575, |
| "grad_norm": 2.815019369125366, |
| "grad_norm_var": 9.652987248771876, |
| "learning_rate": 0.0001, |
| "loss": 1.0631, |
| "loss/crossentropy": 2.236358165740967, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.1266387552022934, |
| "loss/reg": 0.021379247307777405, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.085875, |
| "grad_norm": 3.6744375228881836, |
| "grad_norm_var": 9.673796390527396, |
| "learning_rate": 0.0001, |
| "loss": 1.2523, |
| "loss/crossentropy": 2.1673667430877686, |
| "loss/hidden": 0.8671875, |
| "loss/logits": 0.1714528650045395, |
| "loss/reg": 0.02136901021003723, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 3.169265031814575, |
| "grad_norm_var": 9.732675648460468, |
| "learning_rate": 0.0001, |
| "loss": 1.1389, |
| "loss/crossentropy": 2.45385479927063, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.13236522674560547, |
| "loss/reg": 0.021359853446483612, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.086125, |
| "grad_norm": 7.017651557922363, |
| "grad_norm_var": 10.2441458601344, |
| "learning_rate": 0.0001, |
| "loss": 1.4071, |
| "loss/crossentropy": 2.344740629196167, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.13107535243034363, |
| "loss/reg": 0.021349839866161346, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.08625, |
| "grad_norm": 3.5523998737335205, |
| "grad_norm_var": 1.142674868302701, |
| "learning_rate": 0.0001, |
| "loss": 1.1408, |
| "loss/crossentropy": 2.323866367340088, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.13442741334438324, |
| "loss/reg": 0.021340306848287582, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.086375, |
| "grad_norm": 3.2477400302886963, |
| "grad_norm_var": 1.1489843436981233, |
| "learning_rate": 0.0001, |
| "loss": 1.0067, |
| "loss/crossentropy": 2.672182083129883, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.11757320165634155, |
| "loss/reg": 0.021331045776605606, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.0865, |
| "grad_norm": 3.1238696575164795, |
| "grad_norm_var": 1.1603900529546722, |
| "learning_rate": 0.0001, |
| "loss": 1.1122, |
| "loss/crossentropy": 2.510279893875122, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.13339656591415405, |
| "loss/reg": 0.021321000531315804, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.086625, |
| "grad_norm": 2.552560329437256, |
| "grad_norm_var": 1.2122975827491755, |
| "learning_rate": 0.0001, |
| "loss": 1.0396, |
| "loss/crossentropy": 2.5303587913513184, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.13117440044879913, |
| "loss/reg": 0.021313220262527466, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.08675, |
| "grad_norm": 2.7075002193450928, |
| "grad_norm_var": 1.1997649773340213, |
| "learning_rate": 0.0001, |
| "loss": 1.0787, |
| "loss/crossentropy": 2.4859445095062256, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.12738223373889923, |
| "loss/reg": 0.021303845569491386, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.086875, |
| "grad_norm": 3.2640225887298584, |
| "grad_norm_var": 1.1768004922088529, |
| "learning_rate": 0.0001, |
| "loss": 1.3383, |
| "loss/crossentropy": 2.4017555713653564, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.15659019351005554, |
| "loss/reg": 0.021294469013810158, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 3.5289857387542725, |
| "grad_norm_var": 1.1683562063707291, |
| "learning_rate": 0.0001, |
| "loss": 1.2607, |
| "loss/crossentropy": 2.2716236114501953, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.17679363489151, |
| "loss/reg": 0.021284854039549828, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.087125, |
| "grad_norm": 8.924003601074219, |
| "grad_norm_var": 3.0501856400161764, |
| "learning_rate": 0.0001, |
| "loss": 1.474, |
| "loss/crossentropy": 2.3684239387512207, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.2065410166978836, |
| "loss/reg": 0.02127666585147381, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.08725, |
| "grad_norm": 4.2463788986206055, |
| "grad_norm_var": 2.8955696375948605, |
| "learning_rate": 0.0001, |
| "loss": 1.1835, |
| "loss/crossentropy": 2.234137535095215, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.1427059769630432, |
| "loss/reg": 0.021268585696816444, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.087375, |
| "grad_norm": 3.0776546001434326, |
| "grad_norm_var": 2.903130025314302, |
| "learning_rate": 0.0001, |
| "loss": 1.0559, |
| "loss/crossentropy": 2.4638020992279053, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.1167045459151268, |
| "loss/reg": 0.02125934511423111, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 3.9374022483825684, |
| "grad_norm_var": 2.843209099820052, |
| "learning_rate": 0.0001, |
| "loss": 0.9629, |
| "loss/crossentropy": 2.88035249710083, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.11369955539703369, |
| "loss/reg": 0.021249722689390182, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.087625, |
| "grad_norm": 4.235450744628906, |
| "grad_norm_var": 2.8355032825089417, |
| "learning_rate": 0.0001, |
| "loss": 1.278, |
| "loss/crossentropy": 2.2560718059539795, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.1828073114156723, |
| "loss/reg": 0.0212401133030653, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.08775, |
| "grad_norm": 2.3705031871795654, |
| "grad_norm_var": 2.9146564397348773, |
| "learning_rate": 0.0001, |
| "loss": 0.9899, |
| "loss/crossentropy": 2.531632661819458, |
| "loss/hidden": 0.64453125, |
| "loss/logits": 0.13305732607841492, |
| "loss/reg": 0.021231388673186302, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.087875, |
| "grad_norm": 3.5063636302948, |
| "grad_norm_var": 2.921798711310286, |
| "learning_rate": 0.0001, |
| "loss": 1.0524, |
| "loss/crossentropy": 2.548243999481201, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.11747653782367706, |
| "loss/reg": 0.02122276835143566, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 3.141618013381958, |
| "grad_norm_var": 2.924554396554724, |
| "learning_rate": 0.0001, |
| "loss": 0.9087, |
| "loss/crossentropy": 2.3723928928375244, |
| "loss/hidden": 0.59375, |
| "loss/logits": 0.10285645723342896, |
| "loss/reg": 0.021213354542851448, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.088125, |
| "grad_norm": 4.195517063140869, |
| "grad_norm_var": 2.2500098957229806, |
| "learning_rate": 0.0001, |
| "loss": 1.1263, |
| "loss/crossentropy": 2.383502244949341, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.14867368340492249, |
| "loss/reg": 0.021204529330134392, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.08825, |
| "grad_norm": 3.646482467651367, |
| "grad_norm_var": 2.2483885758775686, |
| "learning_rate": 0.0001, |
| "loss": 1.1459, |
| "loss/crossentropy": 2.239716053009033, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.14099720120429993, |
| "loss/reg": 0.021195242181420326, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.088375, |
| "grad_norm": 3.3426883220672607, |
| "grad_norm_var": 2.242826109053838, |
| "learning_rate": 0.0001, |
| "loss": 1.0323, |
| "loss/crossentropy": 2.270444869995117, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.10951918363571167, |
| "loss/reg": 0.021186839789152145, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.0885, |
| "grad_norm": 3.083806037902832, |
| "grad_norm_var": 2.2462046620558, |
| "learning_rate": 0.0001, |
| "loss": 1.086, |
| "loss/crossentropy": 2.5013253688812256, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.12026840448379517, |
| "loss/reg": 0.021178435534238815, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.088625, |
| "grad_norm": 3.71588134765625, |
| "grad_norm_var": 2.147370219186798, |
| "learning_rate": 0.0001, |
| "loss": 1.1125, |
| "loss/crossentropy": 2.342475175857544, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.13124068081378937, |
| "loss/reg": 0.021169869229197502, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.08875, |
| "grad_norm": 3.4402713775634766, |
| "grad_norm_var": 2.0734307300644255, |
| "learning_rate": 0.0001, |
| "loss": 1.1229, |
| "loss/crossentropy": 2.4113690853118896, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.14960095286369324, |
| "loss/reg": 0.021161576732993126, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.088875, |
| "grad_norm": 10.640914916992188, |
| "grad_norm_var": 4.894724677276531, |
| "learning_rate": 0.0001, |
| "loss": 1.7804, |
| "loss/crossentropy": 2.6555004119873047, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.240725576877594, |
| "loss/reg": 0.02115357480943203, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 3.23919415473938, |
| "grad_norm_var": 4.930329406483421, |
| "learning_rate": 0.0001, |
| "loss": 1.1368, |
| "loss/crossentropy": 2.4652106761932373, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.14798855781555176, |
| "loss/reg": 0.021144360303878784, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.089125, |
| "grad_norm": 2.8362622261047363, |
| "grad_norm_var": 3.4904838717428524, |
| "learning_rate": 0.0001, |
| "loss": 1.0668, |
| "loss/crossentropy": 2.216999053955078, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.1288391500711441, |
| "loss/reg": 0.021136239171028137, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.08925, |
| "grad_norm": 5.395811080932617, |
| "grad_norm_var": 3.623687874884585, |
| "learning_rate": 0.0001, |
| "loss": 1.2022, |
| "loss/crossentropy": 2.463663101196289, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.1667168289422989, |
| "loss/reg": 0.02112707309424877, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.089375, |
| "grad_norm": 3.475656032562256, |
| "grad_norm_var": 3.585286252049487, |
| "learning_rate": 0.0001, |
| "loss": 1.1524, |
| "loss/crossentropy": 2.4688560962677, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.14434993267059326, |
| "loss/reg": 0.02111782319843769, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.0895, |
| "grad_norm": 4.29054594039917, |
| "grad_norm_var": 3.5895333664829043, |
| "learning_rate": 0.0001, |
| "loss": 1.1214, |
| "loss/crossentropy": 2.4820876121520996, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.15247562527656555, |
| "loss/reg": 0.021108638495206833, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.089625, |
| "grad_norm": 2.927002429962158, |
| "grad_norm_var": 3.661532010616088, |
| "learning_rate": 0.0001, |
| "loss": 1.0904, |
| "loss/crossentropy": 2.499390125274658, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.14508014917373657, |
| "loss/reg": 0.021099381148815155, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.08975, |
| "grad_norm": 2.529557943344116, |
| "grad_norm_var": 3.629551988733732, |
| "learning_rate": 0.0001, |
| "loss": 1.1194, |
| "loss/crossentropy": 2.395061492919922, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.13898837566375732, |
| "loss/reg": 0.021089982241392136, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.089875, |
| "grad_norm": 3.2681446075439453, |
| "grad_norm_var": 3.6476018392648397, |
| "learning_rate": 0.0001, |
| "loss": 1.0036, |
| "loss/crossentropy": 2.65079927444458, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.12875254452228546, |
| "loss/reg": 0.021080130711197853, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 2.6700246334075928, |
| "grad_norm_var": 3.7122117675621022, |
| "learning_rate": 0.0001, |
| "loss": 0.9929, |
| "loss/crossentropy": 2.428039789199829, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.12986770272254944, |
| "loss/reg": 0.021070368587970734, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.090125, |
| "grad_norm": 3.296482563018799, |
| "grad_norm_var": 3.7295350110356558, |
| "learning_rate": 0.0001, |
| "loss": 1.1571, |
| "loss/crossentropy": 2.317190408706665, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.14570315182209015, |
| "loss/reg": 0.02106117643415928, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.09025, |
| "grad_norm": 4.451722145080566, |
| "grad_norm_var": 3.74687645800365, |
| "learning_rate": 0.0001, |
| "loss": 1.1385, |
| "loss/crossentropy": 2.2958080768585205, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.146757572889328, |
| "loss/reg": 0.021051928400993347, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.090375, |
| "grad_norm": 2.4288058280944824, |
| "grad_norm_var": 3.8685376080960414, |
| "learning_rate": 0.0001, |
| "loss": 0.9788, |
| "loss/crossentropy": 2.4692001342773438, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.11606692522764206, |
| "loss/reg": 0.021042969077825546, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.0905, |
| "grad_norm": 45.21147918701172, |
| "grad_norm_var": 110.45448625779909, |
| "learning_rate": 0.0001, |
| "loss": 1.0823, |
| "loss/crossentropy": 2.4428551197052, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.12983539700508118, |
| "loss/reg": 0.021031970158219337, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.090625, |
| "grad_norm": 3.7111551761627197, |
| "grad_norm_var": 110.45623490585022, |
| "learning_rate": 0.0001, |
| "loss": 1.4512, |
| "loss/crossentropy": 2.1149277687072754, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1941366195678711, |
| "loss/reg": 0.021020574495196342, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.09075, |
| "grad_norm": 3.3139920234680176, |
| "grad_norm_var": 110.50855221427315, |
| "learning_rate": 0.0001, |
| "loss": 1.0686, |
| "loss/crossentropy": 2.798910140991211, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.13971024751663208, |
| "loss/reg": 0.021009519696235657, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.090875, |
| "grad_norm": 2.5206596851348877, |
| "grad_norm_var": 110.12514261997971, |
| "learning_rate": 0.0001, |
| "loss": 1.0391, |
| "loss/crossentropy": 2.400813579559326, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.13384617865085602, |
| "loss/reg": 0.020998528227210045, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 2.698018789291382, |
| "grad_norm_var": 110.34070270952832, |
| "learning_rate": 0.0001, |
| "loss": 0.928, |
| "loss/crossentropy": 2.505309820175171, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.10482652485370636, |
| "loss/reg": 0.020989248529076576, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.091125, |
| "grad_norm": 6.086211204528809, |
| "grad_norm_var": 109.65630388036335, |
| "learning_rate": 0.0001, |
| "loss": 1.4543, |
| "loss/crossentropy": 2.065880537033081, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.12730881571769714, |
| "loss/reg": 0.020978538319468498, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.09125, |
| "grad_norm": 2.527377128601074, |
| "grad_norm_var": 110.45601242879228, |
| "learning_rate": 0.0001, |
| "loss": 0.9281, |
| "loss/crossentropy": 2.575125217437744, |
| "loss/hidden": 0.60546875, |
| "loss/logits": 0.11296658217906952, |
| "loss/reg": 0.020967954769730568, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.091375, |
| "grad_norm": 2.405383825302124, |
| "grad_norm_var": 110.88254605251709, |
| "learning_rate": 0.0001, |
| "loss": 1.0005, |
| "loss/crossentropy": 2.1985392570495605, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.1190965548157692, |
| "loss/reg": 0.02095715142786503, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.0915, |
| "grad_norm": 5.095945835113525, |
| "grad_norm_var": 110.75067974759948, |
| "learning_rate": 0.0001, |
| "loss": 1.2535, |
| "loss/crossentropy": 2.451446056365967, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.15732397139072418, |
| "loss/reg": 0.020946422591805458, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.091625, |
| "grad_norm": 2.497173547744751, |
| "grad_norm_var": 110.93526847423995, |
| "learning_rate": 0.0001, |
| "loss": 1.1301, |
| "loss/crossentropy": 2.4981319904327393, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.15116086602210999, |
| "loss/reg": 0.020936597138643265, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.09175, |
| "grad_norm": 4.977592468261719, |
| "grad_norm_var": 110.20332761050602, |
| "learning_rate": 0.0001, |
| "loss": 1.0306, |
| "loss/crossentropy": 2.37424898147583, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.11818103492259979, |
| "loss/reg": 0.020926134660840034, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.091875, |
| "grad_norm": 2.8039772510528564, |
| "grad_norm_var": 110.39035266849663, |
| "learning_rate": 0.0001, |
| "loss": 0.9389, |
| "loss/crossentropy": 2.5503880977630615, |
| "loss/hidden": 0.62109375, |
| "loss/logits": 0.1085958182811737, |
| "loss/reg": 0.02091672271490097, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 3.418717861175537, |
| "grad_norm_var": 110.08862675247053, |
| "learning_rate": 0.0001, |
| "loss": 0.9544, |
| "loss/crossentropy": 2.6695592403411865, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.11641087383031845, |
| "loss/reg": 0.02090657874941826, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.092125, |
| "grad_norm": 2.807041883468628, |
| "grad_norm_var": 110.28591938740921, |
| "learning_rate": 0.0001, |
| "loss": 1.1135, |
| "loss/crossentropy": 2.4328622817993164, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.13500146567821503, |
| "loss/reg": 0.02089635282754898, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.09225, |
| "grad_norm": 3.713057518005371, |
| "grad_norm_var": 110.4783888232826, |
| "learning_rate": 0.0001, |
| "loss": 1.1755, |
| "loss/crossentropy": 2.0374624729156494, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.12679257988929749, |
| "loss/reg": 0.02088700234889984, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.092375, |
| "grad_norm": 3.618948459625244, |
| "grad_norm_var": 109.99807079993953, |
| "learning_rate": 0.0001, |
| "loss": 0.9255, |
| "loss/crossentropy": 2.536527395248413, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.10737244784832001, |
| "loss/reg": 0.020877836272120476, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 2.815113067626953, |
| "grad_norm_var": 1.1792510177041378, |
| "learning_rate": 0.0001, |
| "loss": 0.9755, |
| "loss/crossentropy": 2.464996337890625, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.12616491317749023, |
| "loss/reg": 0.020868681371212006, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.092625, |
| "grad_norm": 3.194117546081543, |
| "grad_norm_var": 1.1771383378848062, |
| "learning_rate": 0.0001, |
| "loss": 1.1989, |
| "loss/crossentropy": 2.325310707092285, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.14658081531524658, |
| "loss/reg": 0.020859118551015854, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.09275, |
| "grad_norm": 2.970301628112793, |
| "grad_norm_var": 1.1887296793511715, |
| "learning_rate": 0.0001, |
| "loss": 0.9939, |
| "loss/crossentropy": 2.4747390747070312, |
| "loss/hidden": 0.66015625, |
| "loss/logits": 0.1252739280462265, |
| "loss/reg": 0.020850006490945816, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.092875, |
| "grad_norm": 3.7745604515075684, |
| "grad_norm_var": 1.1425983881417718, |
| "learning_rate": 0.0001, |
| "loss": 0.9835, |
| "loss/crossentropy": 2.4191653728485107, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.09930374473333359, |
| "loss/reg": 0.020840618759393692, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 2.8048055171966553, |
| "grad_norm_var": 1.132423092522494, |
| "learning_rate": 0.0001, |
| "loss": 1.083, |
| "loss/crossentropy": 2.764143228530884, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.12075912207365036, |
| "loss/reg": 0.020830942317843437, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.093125, |
| "grad_norm": 9.04628849029541, |
| "grad_norm_var": 2.712848654929009, |
| "learning_rate": 0.0001, |
| "loss": 1.4779, |
| "loss/crossentropy": 2.2979469299316406, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.1446615606546402, |
| "loss/reg": 0.02082117274403572, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.09325, |
| "grad_norm": 3.1999406814575195, |
| "grad_norm_var": 2.6400540651182967, |
| "learning_rate": 0.0001, |
| "loss": 1.0424, |
| "loss/crossentropy": 2.6157429218292236, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.11555971205234528, |
| "loss/reg": 0.020811092108488083, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.093375, |
| "grad_norm": 3.6833674907684326, |
| "grad_norm_var": 2.522139333113606, |
| "learning_rate": 0.0001, |
| "loss": 1.1046, |
| "loss/crossentropy": 2.6110498905181885, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.13881847262382507, |
| "loss/reg": 0.020801017060875893, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.0935, |
| "grad_norm": 3.073922872543335, |
| "grad_norm_var": 2.4218973518929827, |
| "learning_rate": 0.0001, |
| "loss": 0.9672, |
| "loss/crossentropy": 2.0898826122283936, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.10303568840026855, |
| "loss/reg": 0.020791731774806976, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.093625, |
| "grad_norm": 2.9481358528137207, |
| "grad_norm_var": 2.365294319547022, |
| "learning_rate": 0.0001, |
| "loss": 0.9589, |
| "loss/crossentropy": 2.477987766265869, |
| "loss/hidden": 0.640625, |
| "loss/logits": 0.11042475700378418, |
| "loss/reg": 0.02078239433467388, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 5.792114734649658, |
| "grad_norm_var": 2.5478865053407236, |
| "learning_rate": 0.0001, |
| "loss": 1.2395, |
| "loss/crossentropy": 2.0092225074768066, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.16063663363456726, |
| "loss/reg": 0.020772725343704224, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.093875, |
| "grad_norm": 3.148350954055786, |
| "grad_norm_var": 2.512823601683457, |
| "learning_rate": 0.0001, |
| "loss": 1.14, |
| "loss/crossentropy": 2.669532537460327, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.15109741687774658, |
| "loss/reg": 0.020762871950864792, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 3.2779595851898193, |
| "grad_norm_var": 2.5202896391695124, |
| "learning_rate": 0.0001, |
| "loss": 1.1492, |
| "loss/crossentropy": 2.477522373199463, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.1291784942150116, |
| "loss/reg": 0.020753389224410057, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.094125, |
| "grad_norm": 2.3718457221984863, |
| "grad_norm_var": 2.586364485192132, |
| "learning_rate": 0.0001, |
| "loss": 1.0399, |
| "loss/crossentropy": 2.497688055038452, |
| "loss/hidden": 0.69921875, |
| "loss/logits": 0.13327988982200623, |
| "loss/reg": 0.02074403502047062, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.09425, |
| "grad_norm": 4.972538948059082, |
| "grad_norm_var": 2.6852568725766104, |
| "learning_rate": 0.0001, |
| "loss": 1.2511, |
| "loss/crossentropy": 2.232241630554199, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.1414394974708557, |
| "loss/reg": 0.020734604448080063, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.094375, |
| "grad_norm": 2.6991426944732666, |
| "grad_norm_var": 2.7595134043336267, |
| "learning_rate": 0.0001, |
| "loss": 1.001, |
| "loss/crossentropy": 2.596348285675049, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.1297152191400528, |
| "loss/reg": 0.020725268870592117, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.0945, |
| "grad_norm": 2.8633017539978027, |
| "grad_norm_var": 2.753743097466793, |
| "learning_rate": 0.0001, |
| "loss": 1.065, |
| "loss/crossentropy": 2.3198070526123047, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.11169925332069397, |
| "loss/reg": 0.020716087892651558, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.094625, |
| "grad_norm": 3.628239154815674, |
| "grad_norm_var": 2.733994536045853, |
| "learning_rate": 0.0001, |
| "loss": 1.3141, |
| "loss/crossentropy": 2.1950411796569824, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.1812862753868103, |
| "loss/reg": 0.020707255229353905, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.09475, |
| "grad_norm": 2.805727958679199, |
| "grad_norm_var": 2.753145827217212, |
| "learning_rate": 0.0001, |
| "loss": 1.109, |
| "loss/crossentropy": 2.40027117729187, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.13251210749149323, |
| "loss/reg": 0.02069801278412342, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.094875, |
| "grad_norm": 3.1892051696777344, |
| "grad_norm_var": 2.7730842000576295, |
| "learning_rate": 0.0001, |
| "loss": 1.1224, |
| "loss/crossentropy": 2.2343788146972656, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.1382053792476654, |
| "loss/reg": 0.020689615979790688, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 2.925913095474243, |
| "grad_norm_var": 2.759237877311041, |
| "learning_rate": 0.0001, |
| "loss": 0.9857, |
| "loss/crossentropy": 2.5283889770507812, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.11487281322479248, |
| "loss/reg": 0.02068025805056095, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.095125, |
| "grad_norm": 4.760406970977783, |
| "grad_norm_var": 0.8673601536624308, |
| "learning_rate": 0.0001, |
| "loss": 1.3807, |
| "loss/crossentropy": 2.8629026412963867, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1973596215248108, |
| "loss/reg": 0.020673030987381935, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.09525, |
| "grad_norm": 2.6182596683502197, |
| "grad_norm_var": 0.9085803501248163, |
| "learning_rate": 0.0001, |
| "loss": 0.9554, |
| "loss/crossentropy": 2.6935925483703613, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.11598716676235199, |
| "loss/reg": 0.020663931965827942, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.095375, |
| "grad_norm": 3.404240131378174, |
| "grad_norm_var": 0.9037375089777697, |
| "learning_rate": 0.0001, |
| "loss": 1.0173, |
| "loss/crossentropy": 2.5958027839660645, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.11542315781116486, |
| "loss/reg": 0.020654823631048203, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.0955, |
| "grad_norm": 3.0021934509277344, |
| "grad_norm_var": 0.9072250591900151, |
| "learning_rate": 0.0001, |
| "loss": 0.9811, |
| "loss/crossentropy": 2.4611666202545166, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.11060373485088348, |
| "loss/reg": 0.020645687356591225, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.095625, |
| "grad_norm": 4.123987674713135, |
| "grad_norm_var": 0.9227216736856043, |
| "learning_rate": 0.0001, |
| "loss": 1.2112, |
| "loss/crossentropy": 2.327146530151367, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.13372407853603363, |
| "loss/reg": 0.020637821406126022, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.09575, |
| "grad_norm": 3.835116147994995, |
| "grad_norm_var": 0.5572045887439032, |
| "learning_rate": 0.0001, |
| "loss": 1.0062, |
| "loss/crossentropy": 2.5438876152038574, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.12415439635515213, |
| "loss/reg": 0.02063015103340149, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.095875, |
| "grad_norm": 3.2649309635162354, |
| "grad_norm_var": 0.5548939110280107, |
| "learning_rate": 0.0001, |
| "loss": 1.1664, |
| "loss/crossentropy": 2.6621615886688232, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.15938454866409302, |
| "loss/reg": 0.020622732117772102, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 3.398061752319336, |
| "grad_norm_var": 0.554498685348062, |
| "learning_rate": 0.0001, |
| "loss": 0.852, |
| "loss/crossentropy": 2.3231897354125977, |
| "loss/hidden": 0.5625, |
| "loss/logits": 0.08329755067825317, |
| "loss/reg": 0.0206154715269804, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.096125, |
| "grad_norm": 2.8744146823883057, |
| "grad_norm_var": 0.5036373977995244, |
| "learning_rate": 0.0001, |
| "loss": 0.9094, |
| "loss/crossentropy": 2.5140726566314697, |
| "loss/hidden": 0.6171875, |
| "loss/logits": 0.0860939770936966, |
| "loss/reg": 0.020607706159353256, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.09625, |
| "grad_norm": 8.404803276062012, |
| "grad_norm_var": 1.9605456650252857, |
| "learning_rate": 0.0001, |
| "loss": 1.614, |
| "loss/crossentropy": 2.4072842597961426, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.2751774787902832, |
| "loss/reg": 0.020600339397788048, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.096375, |
| "grad_norm": 2.581511974334717, |
| "grad_norm_var": 1.975733645477993, |
| "learning_rate": 0.0001, |
| "loss": 1.08, |
| "loss/crossentropy": 2.3724427223205566, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.1358003169298172, |
| "loss/reg": 0.02059323526918888, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.0965, |
| "grad_norm": 3.9817845821380615, |
| "grad_norm_var": 1.9433082266341482, |
| "learning_rate": 0.0001, |
| "loss": 1.0926, |
| "loss/crossentropy": 2.459740400314331, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.12109124660491943, |
| "loss/reg": 0.02058546058833599, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.096625, |
| "grad_norm": 7.001491069793701, |
| "grad_norm_var": 2.633487351928299, |
| "learning_rate": 0.0001, |
| "loss": 1.1253, |
| "loss/crossentropy": 2.6342828273773193, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.10705184936523438, |
| "loss/reg": 0.020576275885105133, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.09675, |
| "grad_norm": 2.4826319217681885, |
| "grad_norm_var": 2.6865387021083524, |
| "learning_rate": 0.0001, |
| "loss": 1.1239, |
| "loss/crossentropy": 2.537883996963501, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.1447601616382599, |
| "loss/reg": 0.02056770585477352, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.096875, |
| "grad_norm": 3.5470290184020996, |
| "grad_norm_var": 2.6622723084153237, |
| "learning_rate": 0.0001, |
| "loss": 1.1572, |
| "loss/crossentropy": 2.572312831878662, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.15083444118499756, |
| "loss/reg": 0.020558428019285202, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 2.8388142585754395, |
| "grad_norm_var": 2.673918444962514, |
| "learning_rate": 0.0001, |
| "loss": 1.1426, |
| "loss/crossentropy": 2.2697503566741943, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.14412574470043182, |
| "loss/reg": 0.020549749955534935, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.097125, |
| "grad_norm": 2.9166338443756104, |
| "grad_norm_var": 2.670560695292122, |
| "learning_rate": 0.0001, |
| "loss": 0.8926, |
| "loss/crossentropy": 2.4288480281829834, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.10516718029975891, |
| "loss/reg": 0.02054043672978878, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.09725, |
| "grad_norm": 3.565744161605835, |
| "grad_norm_var": 2.58151597609506, |
| "learning_rate": 0.0001, |
| "loss": 1.1299, |
| "loss/crossentropy": 2.444842576980591, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.1472683846950531, |
| "loss/reg": 0.020531047135591507, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.097375, |
| "grad_norm": 2.8829805850982666, |
| "grad_norm_var": 2.627842889624617, |
| "learning_rate": 0.0001, |
| "loss": 1.1382, |
| "loss/crossentropy": 2.548234462738037, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.1478501409292221, |
| "loss/reg": 0.02052178978919983, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 2.908299207687378, |
| "grad_norm_var": 2.6383052442278556, |
| "learning_rate": 0.0001, |
| "loss": 1.069, |
| "loss/crossentropy": 2.3388137817382812, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.12949424982070923, |
| "loss/reg": 0.020512979477643967, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.097625, |
| "grad_norm": 2.5852208137512207, |
| "grad_norm_var": 2.717361748364273, |
| "learning_rate": 0.0001, |
| "loss": 0.9782, |
| "loss/crossentropy": 2.4337894916534424, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.11689651757478714, |
| "loss/reg": 0.020503604784607887, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.09775, |
| "grad_norm": 5.328097820281982, |
| "grad_norm_var": 2.885194693951976, |
| "learning_rate": 0.0001, |
| "loss": 1.2053, |
| "loss/crossentropy": 2.5564181804656982, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.19961750507354736, |
| "loss/reg": 0.020494818687438965, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.097875, |
| "grad_norm": 3.647054672241211, |
| "grad_norm_var": 2.8678156226553613, |
| "learning_rate": 0.0001, |
| "loss": 1.0039, |
| "loss/crossentropy": 2.5830650329589844, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.13103139400482178, |
| "loss/reg": 0.020485466346144676, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 4.480771541595459, |
| "grad_norm_var": 2.8817531456144723, |
| "learning_rate": 0.0001, |
| "loss": 1.3811, |
| "loss/crossentropy": 2.4550743103027344, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1997724324464798, |
| "loss/reg": 0.020477164536714554, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.098125, |
| "grad_norm": 3.1495656967163086, |
| "grad_norm_var": 2.8497140664534366, |
| "learning_rate": 0.0001, |
| "loss": 1.1386, |
| "loss/crossentropy": 2.846536636352539, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.1722334325313568, |
| "loss/reg": 0.020469149574637413, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.09825, |
| "grad_norm": 3.611919641494751, |
| "grad_norm_var": 1.4027508562338329, |
| "learning_rate": 0.0001, |
| "loss": 1.0783, |
| "loss/crossentropy": 2.7328994274139404, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.14321856200695038, |
| "loss/reg": 0.020459884777665138, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.098375, |
| "grad_norm": 4.535567283630371, |
| "grad_norm_var": 1.3775118805215696, |
| "learning_rate": 0.0001, |
| "loss": 1.3035, |
| "loss/crossentropy": 2.616641044616699, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.24746158719062805, |
| "loss/reg": 0.020450593903660774, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.0985, |
| "grad_norm": 2.7010657787323, |
| "grad_norm_var": 1.4347220572574786, |
| "learning_rate": 0.0001, |
| "loss": 0.9439, |
| "loss/crossentropy": 2.7851712703704834, |
| "loss/hidden": 0.62890625, |
| "loss/logits": 0.11055716872215271, |
| "loss/reg": 0.020441319793462753, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.098625, |
| "grad_norm": 3.3690221309661865, |
| "grad_norm_var": 0.6296018822433316, |
| "learning_rate": 0.0001, |
| "loss": 0.9858, |
| "loss/crossentropy": 2.5199151039123535, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.11739970743656158, |
| "loss/reg": 0.020432572811841965, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.09875, |
| "grad_norm": 3.5332016944885254, |
| "grad_norm_var": 0.5687648370759459, |
| "learning_rate": 0.0001, |
| "loss": 1.2114, |
| "loss/crossentropy": 2.3861191272735596, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.19853942096233368, |
| "loss/reg": 0.020423252135515213, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.098875, |
| "grad_norm": 3.4778192043304443, |
| "grad_norm_var": 0.5684000998912779, |
| "learning_rate": 0.0001, |
| "loss": 1.3337, |
| "loss/crossentropy": 2.177149772644043, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.18037351965904236, |
| "loss/reg": 0.020413951948285103, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 3.1103479862213135, |
| "grad_norm_var": 0.5501298461305394, |
| "learning_rate": 0.0001, |
| "loss": 1.1811, |
| "loss/crossentropy": 2.381289005279541, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.16453775763511658, |
| "loss/reg": 0.020404649898409843, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.099125, |
| "grad_norm": 2.7654690742492676, |
| "grad_norm_var": 0.563068172749172, |
| "learning_rate": 0.0001, |
| "loss": 0.9788, |
| "loss/crossentropy": 2.6645448207855225, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.12248219549655914, |
| "loss/reg": 0.020395854488015175, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.09925, |
| "grad_norm": 2.9942195415496826, |
| "grad_norm_var": 0.5768165563916947, |
| "learning_rate": 0.0001, |
| "loss": 1.1054, |
| "loss/crossentropy": 2.388904571533203, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.16325941681861877, |
| "loss/reg": 0.020387381315231323, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.099375, |
| "grad_norm": 3.1070520877838135, |
| "grad_norm_var": 0.5632370819485725, |
| "learning_rate": 0.0001, |
| "loss": 1.0819, |
| "loss/crossentropy": 2.4645302295684814, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.13203924894332886, |
| "loss/reg": 0.020378144457936287, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.0995, |
| "grad_norm": 3.5182595252990723, |
| "grad_norm_var": 0.5419026805153273, |
| "learning_rate": 0.0001, |
| "loss": 1.3242, |
| "loss/crossentropy": 2.337388038635254, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.15180304646492004, |
| "loss/reg": 0.020368557423353195, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.099625, |
| "grad_norm": 2.4217121601104736, |
| "grad_norm_var": 0.5634005753459926, |
| "learning_rate": 0.0001, |
| "loss": 0.976, |
| "loss/crossentropy": 2.520176887512207, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.1083153486251831, |
| "loss/reg": 0.020359758287668228, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.09975, |
| "grad_norm": 3.818143129348755, |
| "grad_norm_var": 0.33472096860269646, |
| "learning_rate": 0.0001, |
| "loss": 1.2273, |
| "loss/crossentropy": 2.452592134475708, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.1604856252670288, |
| "loss/reg": 0.02035023830831051, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.099875, |
| "grad_norm": 2.8298075199127197, |
| "grad_norm_var": 0.3484620943588491, |
| "learning_rate": 0.0001, |
| "loss": 1.094, |
| "loss/crossentropy": 2.6919286251068115, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.15227138996124268, |
| "loss/reg": 0.020340625196695328, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 45.50175094604492, |
| "grad_norm_var": 111.76340644728633, |
| "learning_rate": 0.0001, |
| "loss": 1.6213, |
| "loss/crossentropy": 2.2076241970062256, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.2930048406124115, |
| "loss/reg": 0.020331410691142082, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.100125, |
| "grad_norm": 3.073981523513794, |
| "grad_norm_var": 111.7915103772579, |
| "learning_rate": 0.0001, |
| "loss": 1.1146, |
| "loss/crossentropy": 2.5723347663879395, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.13403840363025665, |
| "loss/reg": 0.020322071388363838, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.10025, |
| "grad_norm": 4.337286472320557, |
| "grad_norm_var": 111.60328751499571, |
| "learning_rate": 0.0001, |
| "loss": 1.2228, |
| "loss/crossentropy": 2.7784037590026855, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.17983263731002808, |
| "loss/reg": 0.020312372595071793, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.100375, |
| "grad_norm": 3.656367778778076, |
| "grad_norm_var": 111.81663718658596, |
| "learning_rate": 0.0001, |
| "loss": 1.1003, |
| "loss/crossentropy": 2.4638426303863525, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.12776704132556915, |
| "loss/reg": 0.020302986726164818, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.1005, |
| "grad_norm": 3.38935923576355, |
| "grad_norm_var": 111.55373057700994, |
| "learning_rate": 0.0001, |
| "loss": 0.9749, |
| "loss/crossentropy": 2.4947969913482666, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.10794391483068466, |
| "loss/reg": 0.020293867215514183, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.100625, |
| "grad_norm": 3.2775380611419678, |
| "grad_norm_var": 111.58551029522327, |
| "learning_rate": 0.0001, |
| "loss": 1.133, |
| "loss/crossentropy": 2.4812419414520264, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.15277597308158875, |
| "loss/reg": 0.020284701138734818, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.10075, |
| "grad_norm": 2.996157646179199, |
| "grad_norm_var": 111.77485823890761, |
| "learning_rate": 0.0001, |
| "loss": 1.0998, |
| "loss/crossentropy": 2.516984224319458, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.14709463715553284, |
| "loss/reg": 0.020275365561246872, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.100875, |
| "grad_norm": 5.575490951538086, |
| "grad_norm_var": 111.37459403701224, |
| "learning_rate": 0.0001, |
| "loss": 1.0629, |
| "loss/crossentropy": 2.6265506744384766, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.12198775261640549, |
| "loss/reg": 0.02026602067053318, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 2.653712749481201, |
| "grad_norm_var": 111.56498102164149, |
| "learning_rate": 0.0001, |
| "loss": 0.957, |
| "loss/crossentropy": 2.755121946334839, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.12166983634233475, |
| "loss/reg": 0.02025618776679039, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.101125, |
| "grad_norm": 2.5454790592193604, |
| "grad_norm_var": 111.66272758702647, |
| "learning_rate": 0.0001, |
| "loss": 1.0209, |
| "loss/crossentropy": 2.475360870361328, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.1231006383895874, |
| "loss/reg": 0.02024705521762371, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.10125, |
| "grad_norm": 12.002355575561523, |
| "grad_norm_var": 113.1469842386682, |
| "learning_rate": 0.0001, |
| "loss": 1.1289, |
| "loss/crossentropy": 2.485873222351074, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.1492297500371933, |
| "loss/reg": 0.020237451419234276, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.101375, |
| "grad_norm": 3.0118675231933594, |
| "grad_norm_var": 113.19117010752396, |
| "learning_rate": 0.0001, |
| "loss": 1.0673, |
| "loss/crossentropy": 2.610520124435425, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.13847574591636658, |
| "loss/reg": 0.020227529108524323, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.1015, |
| "grad_norm": 37.33988952636719, |
| "grad_norm_var": 171.06705552642327, |
| "learning_rate": 0.0001, |
| "loss": 4.5438, |
| "loss/crossentropy": 3.938170909881592, |
| "loss/hidden": 2.203125, |
| "loss/logits": 2.1385304927825928, |
| "loss/reg": 0.02021711878478527, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.101625, |
| "grad_norm": 2.306735038757324, |
| "grad_norm_var": 171.16339278078715, |
| "learning_rate": 0.0001, |
| "loss": 0.928, |
| "loss/crossentropy": 2.3164937496185303, |
| "loss/hidden": 0.625, |
| "loss/logits": 0.10087703168392181, |
| "loss/reg": 0.020207591354846954, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.10175, |
| "grad_norm": 3.00903058052063, |
| "grad_norm_var": 171.72501112959992, |
| "learning_rate": 0.0001, |
| "loss": 1.1753, |
| "loss/crossentropy": 2.4386990070343018, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.1452336609363556, |
| "loss/reg": 0.02019745111465454, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.101875, |
| "grad_norm": 2.680140972137451, |
| "grad_norm_var": 171.84144221114087, |
| "learning_rate": 0.0001, |
| "loss": 1.157, |
| "loss/crossentropy": 2.531343460083008, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.15824541449546814, |
| "loss/reg": 0.020187031477689743, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 2.582740068435669, |
| "grad_norm_var": 75.71062264039246, |
| "learning_rate": 0.0001, |
| "loss": 1.0065, |
| "loss/crossentropy": 2.5258586406707764, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.11330173909664154, |
| "loss/reg": 0.02017681486904621, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.102125, |
| "grad_norm": 3.5809519290924072, |
| "grad_norm_var": 75.53549752812218, |
| "learning_rate": 0.0001, |
| "loss": 1.2756, |
| "loss/crossentropy": 2.080437183380127, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.16763544082641602, |
| "loss/reg": 0.02016652189195156, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.10225, |
| "grad_norm": 3.093592405319214, |
| "grad_norm_var": 75.89695881356819, |
| "learning_rate": 0.0001, |
| "loss": 1.1484, |
| "loss/crossentropy": 2.630833625793457, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.14600682258605957, |
| "loss/reg": 0.020157409831881523, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.102375, |
| "grad_norm": 3.1120946407318115, |
| "grad_norm_var": 76.0751246894024, |
| "learning_rate": 0.0001, |
| "loss": 1.038, |
| "loss/crossentropy": 2.5411901473999023, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.14511412382125854, |
| "loss/reg": 0.020148303359746933, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 2.9903945922851562, |
| "grad_norm_var": 76.21449508483448, |
| "learning_rate": 0.0001, |
| "loss": 1.0769, |
| "loss/crossentropy": 2.556246042251587, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.13721047341823578, |
| "loss/reg": 0.020139139145612717, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.102625, |
| "grad_norm": 3.297760248184204, |
| "grad_norm_var": 76.2077263993312, |
| "learning_rate": 0.0001, |
| "loss": 1.025, |
| "loss/crossentropy": 2.5704898834228516, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.14399868249893188, |
| "loss/reg": 0.020129989832639694, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.10275, |
| "grad_norm": 3.810601234436035, |
| "grad_norm_var": 75.94485425030823, |
| "learning_rate": 0.0001, |
| "loss": 1.2142, |
| "loss/crossentropy": 2.4684388637542725, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.17318351566791534, |
| "loss/reg": 0.020120643079280853, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.102875, |
| "grad_norm": 2.798835515975952, |
| "grad_norm_var": 76.52818091118122, |
| "learning_rate": 0.0001, |
| "loss": 0.9558, |
| "loss/crossentropy": 2.476940393447876, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.10626688599586487, |
| "loss/reg": 0.020111503079533577, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 4.541812419891357, |
| "grad_norm_var": 75.99013496754353, |
| "learning_rate": 0.0001, |
| "loss": 1.1311, |
| "loss/crossentropy": 2.339744806289673, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.2034740447998047, |
| "loss/reg": 0.020102351903915405, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.103125, |
| "grad_norm": 2.6360268592834473, |
| "grad_norm_var": 75.95142766348106, |
| "learning_rate": 0.0001, |
| "loss": 1.0849, |
| "loss/crossentropy": 2.3795337677001953, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.1378898024559021, |
| "loss/reg": 0.02009383775293827, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.10325, |
| "grad_norm": 3.2206642627716064, |
| "grad_norm_var": 73.50864103963063, |
| "learning_rate": 0.0001, |
| "loss": 1.09, |
| "loss/crossentropy": 2.645775556564331, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.13137856125831604, |
| "loss/reg": 0.020084405317902565, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.103375, |
| "grad_norm": 3.2071568965911865, |
| "grad_norm_var": 73.45272548167614, |
| "learning_rate": 0.0001, |
| "loss": 0.9383, |
| "loss/crossentropy": 2.7224762439727783, |
| "loss/hidden": 0.625, |
| "loss/logits": 0.11251779645681381, |
| "loss/reg": 0.020074598491191864, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.1035, |
| "grad_norm": 3.2850613594055176, |
| "grad_norm_var": 0.2863261800221416, |
| "learning_rate": 0.0001, |
| "loss": 1.1194, |
| "loss/crossentropy": 2.522642135620117, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.17652815580368042, |
| "loss/reg": 0.02006435953080654, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.103625, |
| "grad_norm": 3.5068256855010986, |
| "grad_norm_var": 0.24387138774257647, |
| "learning_rate": 0.0001, |
| "loss": 0.9457, |
| "loss/crossentropy": 2.6463444232940674, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.11229754984378815, |
| "loss/reg": 0.020054515451192856, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.10375, |
| "grad_norm": 4.16710090637207, |
| "grad_norm_var": 0.29672115328222404, |
| "learning_rate": 0.0001, |
| "loss": 1.301, |
| "loss/crossentropy": 2.519270420074463, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.1786651611328125, |
| "loss/reg": 0.020044928416609764, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.103875, |
| "grad_norm": 3.5839273929595947, |
| "grad_norm_var": 0.27524789373512704, |
| "learning_rate": 0.0001, |
| "loss": 0.9949, |
| "loss/crossentropy": 2.0604701042175293, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.1109282597899437, |
| "loss/reg": 0.02003585919737816, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 2.8121650218963623, |
| "grad_norm_var": 0.25541980739101955, |
| "learning_rate": 0.0001, |
| "loss": 1.1057, |
| "loss/crossentropy": 2.670191764831543, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.1515616476535797, |
| "loss/reg": 0.020026110112667084, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.104125, |
| "grad_norm": 3.116018295288086, |
| "grad_norm_var": 0.2547872758708628, |
| "learning_rate": 0.0001, |
| "loss": 1.0301, |
| "loss/crossentropy": 3.016143321990967, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.1502160131931305, |
| "loss/reg": 0.020016156136989594, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.10425, |
| "grad_norm": 6.202447414398193, |
| "grad_norm_var": 0.7634439694535559, |
| "learning_rate": 0.0001, |
| "loss": 1.2045, |
| "loss/crossentropy": 2.8383448123931885, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.15678462386131287, |
| "loss/reg": 0.020007088780403137, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.104375, |
| "grad_norm": 3.4904069900512695, |
| "grad_norm_var": 0.7519116349075012, |
| "learning_rate": 0.0001, |
| "loss": 1.2658, |
| "loss/crossentropy": 2.597175359725952, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.22209098935127258, |
| "loss/reg": 0.019998185336589813, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.1045, |
| "grad_norm": 4.6242594718933105, |
| "grad_norm_var": 0.7986550791863064, |
| "learning_rate": 0.0001, |
| "loss": 1.1411, |
| "loss/crossentropy": 2.6017813682556152, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.14435435831546783, |
| "loss/reg": 0.019988389685750008, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.104625, |
| "grad_norm": 3.2676827907562256, |
| "grad_norm_var": 0.800099420481778, |
| "learning_rate": 0.0001, |
| "loss": 1.1908, |
| "loss/crossentropy": 2.454334259033203, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.16681547462940216, |
| "loss/reg": 0.019979091361165047, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.10475, |
| "grad_norm": 3.4931089878082275, |
| "grad_norm_var": 0.7992595598721048, |
| "learning_rate": 0.0001, |
| "loss": 1.1803, |
| "loss/crossentropy": 2.2890915870666504, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.15247204899787903, |
| "loss/reg": 0.019969170913100243, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.104875, |
| "grad_norm": 2.7106313705444336, |
| "grad_norm_var": 0.8094277801425143, |
| "learning_rate": 0.0001, |
| "loss": 1.097, |
| "loss/crossentropy": 2.56706166267395, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.12783397734165192, |
| "loss/reg": 0.019960079342126846, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 3.2825114727020264, |
| "grad_norm_var": 0.7531900707246374, |
| "learning_rate": 0.0001, |
| "loss": 1.2726, |
| "loss/crossentropy": 2.324145555496216, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.18636029958724976, |
| "loss/reg": 0.019951237365603447, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.105125, |
| "grad_norm": 2.752570152282715, |
| "grad_norm_var": 0.7400250579900473, |
| "learning_rate": 0.0001, |
| "loss": 1.0885, |
| "loss/crossentropy": 2.4273452758789062, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.1312946379184723, |
| "loss/reg": 0.01994233950972557, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.10525, |
| "grad_norm": 3.451720714569092, |
| "grad_norm_var": 0.733364881032247, |
| "learning_rate": 0.0001, |
| "loss": 1.1042, |
| "loss/crossentropy": 2.5271100997924805, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.1431449055671692, |
| "loss/reg": 0.019932815805077553, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.105375, |
| "grad_norm": 3.0586564540863037, |
| "grad_norm_var": 0.741721542830341, |
| "learning_rate": 0.0001, |
| "loss": 1.0412, |
| "loss/crossentropy": 2.9447247982025146, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.13100013136863708, |
| "loss/reg": 0.019923273473978043, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.1055, |
| "grad_norm": 3.4214084148406982, |
| "grad_norm_var": 0.7380611813534226, |
| "learning_rate": 0.0001, |
| "loss": 1.2281, |
| "loss/crossentropy": 2.3005340099334717, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.1657242327928543, |
| "loss/reg": 0.01991339959204197, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.105625, |
| "grad_norm": 2.6607367992401123, |
| "grad_norm_var": 0.7886706735221035, |
| "learning_rate": 0.0001, |
| "loss": 1.1345, |
| "loss/crossentropy": 2.5435853004455566, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.1347101330757141, |
| "loss/reg": 0.019903138279914856, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.10575, |
| "grad_norm": 3.0965163707733154, |
| "grad_norm_var": 0.7659307635756494, |
| "learning_rate": 0.0001, |
| "loss": 1.0956, |
| "loss/crossentropy": 2.6083555221557617, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.1505199819803238, |
| "loss/reg": 0.019893797114491463, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.105875, |
| "grad_norm": 4.309004306793213, |
| "grad_norm_var": 0.8127957898222188, |
| "learning_rate": 0.0001, |
| "loss": 1.078, |
| "loss/crossentropy": 2.515653371810913, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.11742238700389862, |
| "loss/reg": 0.019883660599589348, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 3.248624563217163, |
| "grad_norm_var": 0.7855834171862691, |
| "learning_rate": 0.0001, |
| "loss": 1.0838, |
| "loss/crossentropy": 2.593562126159668, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.13119123876094818, |
| "loss/reg": 0.019874349236488342, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.106125, |
| "grad_norm": 3.347926378250122, |
| "grad_norm_var": 0.7767115778534122, |
| "learning_rate": 0.0001, |
| "loss": 1.0816, |
| "loss/crossentropy": 2.3947036266326904, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.13295108079910278, |
| "loss/reg": 0.019864298403263092, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.10625, |
| "grad_norm": 4.369658946990967, |
| "grad_norm_var": 0.33264170947598637, |
| "learning_rate": 0.0001, |
| "loss": 1.0708, |
| "loss/crossentropy": 2.449446201324463, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.13007891178131104, |
| "loss/reg": 0.019854165613651276, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.106375, |
| "grad_norm": 3.361130475997925, |
| "grad_norm_var": 0.332327660409797, |
| "learning_rate": 0.0001, |
| "loss": 0.9761, |
| "loss/crossentropy": 2.566408157348633, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.10966208577156067, |
| "loss/reg": 0.019843947142362595, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.1065, |
| "grad_norm": 2.6885204315185547, |
| "grad_norm_var": 0.25144665871681204, |
| "learning_rate": 0.0001, |
| "loss": 0.9331, |
| "loss/crossentropy": 2.3620035648345947, |
| "loss/hidden": 0.625, |
| "loss/logits": 0.10973039269447327, |
| "loss/reg": 0.019834715873003006, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.106625, |
| "grad_norm": 2.615734338760376, |
| "grad_norm_var": 0.2793016853206145, |
| "learning_rate": 0.0001, |
| "loss": 1.0168, |
| "loss/crossentropy": 2.605545997619629, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.12715153396129608, |
| "loss/reg": 0.01982559822499752, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.10675, |
| "grad_norm": 3.073012113571167, |
| "grad_norm_var": 0.276254032788457, |
| "learning_rate": 0.0001, |
| "loss": 0.9855, |
| "loss/crossentropy": 2.634042739868164, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.11542729288339615, |
| "loss/reg": 0.019816165789961815, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.106875, |
| "grad_norm": 2.759152412414551, |
| "grad_norm_var": 0.27313479552050995, |
| "learning_rate": 0.0001, |
| "loss": 0.9145, |
| "loss/crossentropy": 2.4920616149902344, |
| "loss/hidden": 0.61328125, |
| "loss/logits": 0.10316716134548187, |
| "loss/reg": 0.019806833937764168, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 3.647084951400757, |
| "grad_norm_var": 0.28455080731760823, |
| "learning_rate": 0.0001, |
| "loss": 1.1336, |
| "loss/crossentropy": 2.34993314743042, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.150485098361969, |
| "loss/reg": 0.019797123968601227, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.107125, |
| "grad_norm": 5.309474468231201, |
| "grad_norm_var": 0.5265287098230971, |
| "learning_rate": 0.0001, |
| "loss": 1.3513, |
| "loss/crossentropy": 2.4511518478393555, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.21205759048461914, |
| "loss/reg": 0.01978708617389202, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.10725, |
| "grad_norm": 3.1982548236846924, |
| "grad_norm_var": 0.5288348795583182, |
| "learning_rate": 0.0001, |
| "loss": 1.1847, |
| "loss/crossentropy": 2.2767844200134277, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.15097308158874512, |
| "loss/reg": 0.01977648213505745, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.107375, |
| "grad_norm": 3.7261335849761963, |
| "grad_norm_var": 0.5276094221235986, |
| "learning_rate": 0.0001, |
| "loss": 1.1138, |
| "loss/crossentropy": 2.660844087600708, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.13101539015769958, |
| "loss/reg": 0.01976662687957287, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 3.3435637950897217, |
| "grad_norm_var": 0.5280464375318101, |
| "learning_rate": 0.0001, |
| "loss": 1.2877, |
| "loss/crossentropy": 2.4701263904571533, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.18388135731220245, |
| "loss/reg": 0.019757471978664398, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.107625, |
| "grad_norm": 2.630688190460205, |
| "grad_norm_var": 0.5311534898567278, |
| "learning_rate": 0.0001, |
| "loss": 1.063, |
| "loss/crossentropy": 2.5359132289886475, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.1311398446559906, |
| "loss/reg": 0.019748201593756676, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.10775, |
| "grad_norm": 2.5543456077575684, |
| "grad_norm_var": 0.5729300014134933, |
| "learning_rate": 0.0001, |
| "loss": 0.987, |
| "loss/crossentropy": 2.3763418197631836, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.11773102730512619, |
| "loss/reg": 0.01973855495452881, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.107875, |
| "grad_norm": 2.8768351078033447, |
| "grad_norm_var": 0.5249464789316676, |
| "learning_rate": 0.0001, |
| "loss": 1.11, |
| "loss/crossentropy": 2.467682361602783, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.13924749195575714, |
| "loss/reg": 0.019729435443878174, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 9.608988761901855, |
| "grad_norm_var": 3.012409881249372, |
| "learning_rate": 0.0001, |
| "loss": 1.8371, |
| "loss/crossentropy": 2.4410533905029297, |
| "loss/hidden": 1.359375, |
| "loss/logits": 0.280517578125, |
| "loss/reg": 0.019719891250133514, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.108125, |
| "grad_norm": 2.6161296367645264, |
| "grad_norm_var": 3.079687357926654, |
| "learning_rate": 0.0001, |
| "loss": 0.9254, |
| "loss/crossentropy": 2.656090497970581, |
| "loss/hidden": 0.6171875, |
| "loss/logits": 0.11112320423126221, |
| "loss/reg": 0.01971041038632393, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.10825, |
| "grad_norm": 6.282230377197266, |
| "grad_norm_var": 3.4921671952336317, |
| "learning_rate": 0.0001, |
| "loss": 1.1269, |
| "loss/crossentropy": 2.4799087047576904, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.12914146482944489, |
| "loss/reg": 0.01970127783715725, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.108375, |
| "grad_norm": 3.052783727645874, |
| "grad_norm_var": 3.514845564297898, |
| "learning_rate": 0.0001, |
| "loss": 1.1383, |
| "loss/crossentropy": 2.4700145721435547, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.15620394051074982, |
| "loss/reg": 0.019691679626703262, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.1085, |
| "grad_norm": 3.2105300426483154, |
| "grad_norm_var": 3.458070348929603, |
| "learning_rate": 0.0001, |
| "loss": 1.067, |
| "loss/crossentropy": 2.135777473449707, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.11233663558959961, |
| "loss/reg": 0.019682079553604126, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.108625, |
| "grad_norm": 2.7341737747192383, |
| "grad_norm_var": 3.4405364793380135, |
| "learning_rate": 0.0001, |
| "loss": 1.0495, |
| "loss/crossentropy": 2.4592809677124023, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.1223248764872551, |
| "loss/reg": 0.019672293215990067, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.10875, |
| "grad_norm": 4.007035732269287, |
| "grad_norm_var": 3.4058996890488658, |
| "learning_rate": 0.0001, |
| "loss": 1.1341, |
| "loss/crossentropy": 2.5008175373077393, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.13670845329761505, |
| "loss/reg": 0.01966211199760437, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.108875, |
| "grad_norm": 3.146348476409912, |
| "grad_norm_var": 3.359090924722083, |
| "learning_rate": 0.0001, |
| "loss": 1.2459, |
| "loss/crossentropy": 2.0440711975097656, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.15090960264205933, |
| "loss/reg": 0.019652366638183594, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 4.632335662841797, |
| "grad_norm_var": 3.3902752468766453, |
| "learning_rate": 0.0001, |
| "loss": 1.1446, |
| "loss/crossentropy": 2.4574947357177734, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.13958214223384857, |
| "loss/reg": 0.01964336633682251, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.109125, |
| "grad_norm": 5.440892219543457, |
| "grad_norm_var": 3.415471723579617, |
| "learning_rate": 0.0001, |
| "loss": 1.1098, |
| "loss/crossentropy": 2.565347671508789, |
| "loss/hidden": 0.8046875, |
| "loss/logits": 0.10876456648111343, |
| "loss/reg": 0.01963435485959053, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.10925, |
| "grad_norm": 2.873284101486206, |
| "grad_norm_var": 3.454269091905695, |
| "learning_rate": 0.0001, |
| "loss": 1.0607, |
| "loss/crossentropy": 2.561459541320801, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.14177533984184265, |
| "loss/reg": 0.01962495781481266, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.109375, |
| "grad_norm": 3.0819554328918457, |
| "grad_norm_var": 3.496943197417559, |
| "learning_rate": 0.0001, |
| "loss": 1.2422, |
| "loss/crossentropy": 2.2944936752319336, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.16319133341312408, |
| "loss/reg": 0.019616009667515755, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.1095, |
| "grad_norm": 4.361453533172607, |
| "grad_norm_var": 3.488792217244569, |
| "learning_rate": 0.0001, |
| "loss": 1.3828, |
| "loss/crossentropy": 2.4624311923980713, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.1789003610610962, |
| "loss/reg": 0.019607286900281906, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.109625, |
| "grad_norm": 3.34078049659729, |
| "grad_norm_var": 3.3959280790074216, |
| "learning_rate": 0.0001, |
| "loss": 0.9589, |
| "loss/crossentropy": 2.4549739360809326, |
| "loss/hidden": 0.64453125, |
| "loss/logits": 0.11839590966701508, |
| "loss/reg": 0.019598115235567093, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.10975, |
| "grad_norm": 3.434715986251831, |
| "grad_norm_var": 3.2759937907981884, |
| "learning_rate": 0.0001, |
| "loss": 1.1192, |
| "loss/crossentropy": 2.571798086166382, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.16550706326961517, |
| "loss/reg": 0.01958884485065937, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.109875, |
| "grad_norm": 3.593993663787842, |
| "grad_norm_var": 3.196554005024426, |
| "learning_rate": 0.0001, |
| "loss": 1.1792, |
| "loss/crossentropy": 2.447843551635742, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.15137381851673126, |
| "loss/reg": 0.019579457119107246, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 3.3654563426971436, |
| "grad_norm_var": 1.0373482238282006, |
| "learning_rate": 0.0001, |
| "loss": 1.0326, |
| "loss/crossentropy": 2.6210246086120605, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.12209475785493851, |
| "loss/reg": 0.019569827243685722, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.110125, |
| "grad_norm": 2.6370863914489746, |
| "grad_norm_var": 1.0343516088559113, |
| "learning_rate": 0.0001, |
| "loss": 1.1442, |
| "loss/crossentropy": 2.2681713104248047, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.132216215133667, |
| "loss/reg": 0.019560784101486206, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.11025, |
| "grad_norm": 2.152343273162842, |
| "grad_norm_var": 0.6782700998492743, |
| "learning_rate": 0.0001, |
| "loss": 1.0136, |
| "loss/crossentropy": 2.437594175338745, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.11108942329883575, |
| "loss/reg": 0.019551947712898254, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.110375, |
| "grad_norm": 3.2755074501037598, |
| "grad_norm_var": 0.6698247850929626, |
| "learning_rate": 0.0001, |
| "loss": 0.9972, |
| "loss/crossentropy": 2.481198310852051, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.12207823246717453, |
| "loss/reg": 0.01954270713031292, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.1105, |
| "grad_norm": 3.770535707473755, |
| "grad_norm_var": 0.6711344077538037, |
| "learning_rate": 0.0001, |
| "loss": 1.0475, |
| "loss/crossentropy": 2.434851884841919, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.13345816731452942, |
| "loss/reg": 0.019533507525920868, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.110625, |
| "grad_norm": 2.7666234970092773, |
| "grad_norm_var": 0.6679279033368438, |
| "learning_rate": 0.0001, |
| "loss": 1.1166, |
| "loss/crossentropy": 2.433852434158325, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.1401294767856598, |
| "loss/reg": 0.01952442154288292, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.11075, |
| "grad_norm": 2.7105534076690674, |
| "grad_norm_var": 0.6840409496040507, |
| "learning_rate": 0.0001, |
| "loss": 1.0924, |
| "loss/crossentropy": 2.219019889831543, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.13160449266433716, |
| "loss/reg": 0.019515201449394226, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.110875, |
| "grad_norm": 2.8931920528411865, |
| "grad_norm_var": 0.6969961519386968, |
| "learning_rate": 0.0001, |
| "loss": 1.1134, |
| "loss/crossentropy": 2.511371612548828, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.13708999752998352, |
| "loss/reg": 0.019505700096488, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 2.821718215942383, |
| "grad_norm_var": 0.6033415037749854, |
| "learning_rate": 0.0001, |
| "loss": 1.2446, |
| "loss/crossentropy": 2.389580011367798, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.16678079962730408, |
| "loss/reg": 0.019496839493513107, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.111125, |
| "grad_norm": 2.833461284637451, |
| "grad_norm_var": 0.2778808504856213, |
| "learning_rate": 0.0001, |
| "loss": 1.029, |
| "loss/crossentropy": 2.1618683338165283, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.12710769474506378, |
| "loss/reg": 0.019487854093313217, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.11125, |
| "grad_norm": 3.3806753158569336, |
| "grad_norm_var": 0.2773113837378702, |
| "learning_rate": 0.0001, |
| "loss": 1.1349, |
| "loss/crossentropy": 2.656121253967285, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.1471368372440338, |
| "loss/reg": 0.019478676840662956, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.111375, |
| "grad_norm": 2.932758092880249, |
| "grad_norm_var": 0.2800811641911004, |
| "learning_rate": 0.0001, |
| "loss": 1.1, |
| "loss/crossentropy": 2.5325851440429688, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.14360320568084717, |
| "loss/reg": 0.019469575956463814, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.1115, |
| "grad_norm": 2.3603265285491943, |
| "grad_norm_var": 0.20497304301798067, |
| "learning_rate": 0.0001, |
| "loss": 1.0739, |
| "loss/crossentropy": 2.3697421550750732, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.1292482614517212, |
| "loss/reg": 0.01946048066020012, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.111625, |
| "grad_norm": 4.056443214416504, |
| "grad_norm_var": 0.2678930990243863, |
| "learning_rate": 0.0001, |
| "loss": 1.0008, |
| "loss/crossentropy": 2.7553114891052246, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.12268239259719849, |
| "loss/reg": 0.019451187923550606, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.11175, |
| "grad_norm": 3.346064329147339, |
| "grad_norm_var": 0.2639738255705176, |
| "learning_rate": 0.0001, |
| "loss": 1.1791, |
| "loss/crossentropy": 2.4348855018615723, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.19565626978874207, |
| "loss/reg": 0.019442636519670486, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.111875, |
| "grad_norm": 6.878971576690674, |
| "grad_norm_var": 1.1740357353356365, |
| "learning_rate": 0.0001, |
| "loss": 1.7085, |
| "loss/crossentropy": 2.9299001693725586, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.3501082956790924, |
| "loss/reg": 0.019433531910181046, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 3.1834142208099365, |
| "grad_norm_var": 1.1735802221223497, |
| "learning_rate": 0.0001, |
| "loss": 1.2953, |
| "loss/crossentropy": 2.4165709018707275, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.17523059248924255, |
| "loss/reg": 0.019424354657530785, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.112125, |
| "grad_norm": 2.5551555156707764, |
| "grad_norm_var": 1.180695081530242, |
| "learning_rate": 0.0001, |
| "loss": 1.0171, |
| "loss/crossentropy": 2.4345083236694336, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.1276446282863617, |
| "loss/reg": 0.019415004178881645, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.11225, |
| "grad_norm": 3.5786855220794678, |
| "grad_norm_var": 1.1000748366509354, |
| "learning_rate": 0.0001, |
| "loss": 0.9925, |
| "loss/crossentropy": 2.464219808578491, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.10706112533807755, |
| "loss/reg": 0.01940576173365116, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.112375, |
| "grad_norm": 8.784457206726074, |
| "grad_norm_var": 2.9538895197120096, |
| "learning_rate": 0.0001, |
| "loss": 1.6799, |
| "loss/crossentropy": 2.3273468017578125, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.25934016704559326, |
| "loss/reg": 0.019396713003516197, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 4.44765567779541, |
| "grad_norm_var": 2.990871190956643, |
| "learning_rate": 0.0001, |
| "loss": 1.0685, |
| "loss/crossentropy": 2.5760438442230225, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.14414295554161072, |
| "loss/reg": 0.019387517124414444, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.112625, |
| "grad_norm": 4.9651265144348145, |
| "grad_norm_var": 3.013306784613239, |
| "learning_rate": 0.0001, |
| "loss": 1.1917, |
| "loss/crossentropy": 2.5148589611053467, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.1697455495595932, |
| "loss/reg": 0.01937839388847351, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.11275, |
| "grad_norm": 3.8322513103485107, |
| "grad_norm_var": 2.9203267227302745, |
| "learning_rate": 0.0001, |
| "loss": 1.2309, |
| "loss/crossentropy": 2.536973476409912, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.15436521172523499, |
| "loss/reg": 0.019369108602404594, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.112875, |
| "grad_norm": 3.5994656085968018, |
| "grad_norm_var": 2.8540415836766555, |
| "learning_rate": 0.0001, |
| "loss": 1.0761, |
| "loss/crossentropy": 2.3160693645477295, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.12077254056930542, |
| "loss/reg": 0.019359666854143143, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 4.031139373779297, |
| "grad_norm_var": 2.7599236229360753, |
| "learning_rate": 0.0001, |
| "loss": 1.0988, |
| "loss/crossentropy": 2.5636203289031982, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.12018904089927673, |
| "loss/reg": 0.019350115209817886, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.113125, |
| "grad_norm": 3.2178378105163574, |
| "grad_norm_var": 2.7069185907568794, |
| "learning_rate": 0.0001, |
| "loss": 1.0472, |
| "loss/crossentropy": 2.3457703590393066, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.13114379346370697, |
| "loss/reg": 0.019340479746460915, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.11325, |
| "grad_norm": 3.096679925918579, |
| "grad_norm_var": 2.7381334427643536, |
| "learning_rate": 0.0001, |
| "loss": 0.9662, |
| "loss/crossentropy": 2.3510820865631104, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.10106582939624786, |
| "loss/reg": 0.019330844283103943, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.113375, |
| "grad_norm": 7.590158462524414, |
| "grad_norm_var": 3.3974738441650887, |
| "learning_rate": 0.0001, |
| "loss": 1.6666, |
| "loss/crossentropy": 2.3520147800445557, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2468346357345581, |
| "loss/reg": 0.019321195781230927, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.1135, |
| "grad_norm": 3.3216323852539062, |
| "grad_norm_var": 3.20081618522182, |
| "learning_rate": 0.0001, |
| "loss": 1.2612, |
| "loss/crossentropy": 2.330040216445923, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.16571447253227234, |
| "loss/reg": 0.019311606884002686, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.113625, |
| "grad_norm": 3.4920501708984375, |
| "grad_norm_var": 3.246978809627046, |
| "learning_rate": 0.0001, |
| "loss": 1.1161, |
| "loss/crossentropy": 2.598465919494629, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.15353354811668396, |
| "loss/reg": 0.019302019849419594, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.11375, |
| "grad_norm": 3.0414321422576904, |
| "grad_norm_var": 3.294370585536838, |
| "learning_rate": 0.0001, |
| "loss": 1.0693, |
| "loss/crossentropy": 2.459712028503418, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.13418710231781006, |
| "loss/reg": 0.019292324781417847, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.113875, |
| "grad_norm": 3.131361722946167, |
| "grad_norm_var": 2.908980195007492, |
| "learning_rate": 0.0001, |
| "loss": 1.2301, |
| "loss/crossentropy": 2.268738269805908, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.16223490238189697, |
| "loss/reg": 0.019283456727862358, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 3.160707950592041, |
| "grad_norm_var": 2.9118381902992776, |
| "learning_rate": 0.0001, |
| "loss": 1.3173, |
| "loss/crossentropy": 2.2185730934143066, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.17927365005016327, |
| "loss/reg": 0.019274268299341202, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.114125, |
| "grad_norm": 14.604296684265137, |
| "grad_norm_var": 9.479147248477696, |
| "learning_rate": 0.0001, |
| "loss": 1.4278, |
| "loss/crossentropy": 2.2313976287841797, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.15699920058250427, |
| "loss/reg": 0.019266733899712563, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.11425, |
| "grad_norm": 3.744290590286255, |
| "grad_norm_var": 9.452382803070204, |
| "learning_rate": 0.0001, |
| "loss": 1.1776, |
| "loss/crossentropy": 2.689699172973633, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.15302729606628418, |
| "loss/reg": 0.019258547574281693, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.114375, |
| "grad_norm": 2.999354839324951, |
| "grad_norm_var": 8.531466626401157, |
| "learning_rate": 0.0001, |
| "loss": 1.3107, |
| "loss/crossentropy": 1.8664510250091553, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.1728517711162567, |
| "loss/reg": 0.01925109326839447, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.1145, |
| "grad_norm": 3.194913148880005, |
| "grad_norm_var": 8.64117053500835, |
| "learning_rate": 0.0001, |
| "loss": 1.3016, |
| "loss/crossentropy": 2.25707745552063, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.15214993059635162, |
| "loss/reg": 0.019243914633989334, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.114625, |
| "grad_norm": 2.335845708847046, |
| "grad_norm_var": 8.88876728908843, |
| "learning_rate": 0.0001, |
| "loss": 0.9484, |
| "loss/crossentropy": 2.6386678218841553, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.11931365728378296, |
| "loss/reg": 0.019234785810112953, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.11475, |
| "grad_norm": 3.6922056674957275, |
| "grad_norm_var": 8.898252742921356, |
| "learning_rate": 0.0001, |
| "loss": 1.0667, |
| "loss/crossentropy": 2.5539944171905518, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.14395025372505188, |
| "loss/reg": 0.019226964563131332, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.114875, |
| "grad_norm": 2.946389675140381, |
| "grad_norm_var": 8.982934878513694, |
| "learning_rate": 0.0001, |
| "loss": 1.0911, |
| "loss/crossentropy": 2.3651351928710938, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.1371677815914154, |
| "loss/reg": 0.019219111651182175, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 3.050600051879883, |
| "grad_norm_var": 9.068373446668678, |
| "learning_rate": 0.0001, |
| "loss": 1.0419, |
| "loss/crossentropy": 2.3772196769714355, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.11545050889253616, |
| "loss/reg": 0.019211286678910255, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.115125, |
| "grad_norm": 2.954526424407959, |
| "grad_norm_var": 9.105915478669973, |
| "learning_rate": 0.0001, |
| "loss": 1.0807, |
| "loss/crossentropy": 2.2472290992736816, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.13482055068016052, |
| "loss/reg": 0.019202249124646187, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.11525, |
| "grad_norm": 6.214420795440674, |
| "grad_norm_var": 9.27670245999236, |
| "learning_rate": 0.0001, |
| "loss": 1.8562, |
| "loss/crossentropy": 2.9924590587615967, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.3907894492149353, |
| "loss/reg": 0.019193273037672043, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.115375, |
| "grad_norm": 4.534095287322998, |
| "grad_norm_var": 8.536934613221737, |
| "learning_rate": 0.0001, |
| "loss": 1.4885, |
| "loss/crossentropy": 2.394090414047241, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.2185368537902832, |
| "loss/reg": 0.019184142351150513, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.1155, |
| "grad_norm": 3.6761627197265625, |
| "grad_norm_var": 8.505579278095963, |
| "learning_rate": 0.0001, |
| "loss": 1.1306, |
| "loss/crossentropy": 2.3566806316375732, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.14977289736270905, |
| "loss/reg": 0.019175738096237183, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.115625, |
| "grad_norm": 4.592898368835449, |
| "grad_norm_var": 8.481328607270026, |
| "learning_rate": 0.0001, |
| "loss": 1.2942, |
| "loss/crossentropy": 2.7458250522613525, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.14940449595451355, |
| "loss/reg": 0.01916695386171341, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.11575, |
| "grad_norm": 2.7483372688293457, |
| "grad_norm_var": 8.533618684340597, |
| "learning_rate": 0.0001, |
| "loss": 1.0727, |
| "loss/crossentropy": 2.4450652599334717, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.12724441289901733, |
| "loss/reg": 0.019158538430929184, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.115875, |
| "grad_norm": 3.075195074081421, |
| "grad_norm_var": 8.541996814909611, |
| "learning_rate": 0.0001, |
| "loss": 1.1251, |
| "loss/crossentropy": 2.3960814476013184, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.1718631386756897, |
| "loss/reg": 0.019150495529174805, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 3.313359022140503, |
| "grad_norm_var": 8.521887542243068, |
| "learning_rate": 0.0001, |
| "loss": 1.1316, |
| "loss/crossentropy": 2.3464736938476562, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.1276848018169403, |
| "loss/reg": 0.019141457974910736, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.116125, |
| "grad_norm": 4.252639293670654, |
| "grad_norm_var": 0.9000980544993648, |
| "learning_rate": 0.0001, |
| "loss": 1.1818, |
| "loss/crossentropy": 2.523444414138794, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.15845248103141785, |
| "loss/reg": 0.01913331262767315, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.11625, |
| "grad_norm": 3.097456455230713, |
| "grad_norm_var": 0.9123223599265882, |
| "learning_rate": 0.0001, |
| "loss": 1.1264, |
| "loss/crossentropy": 2.595120668411255, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.16165900230407715, |
| "loss/reg": 0.019125619903206825, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.116375, |
| "grad_norm": 3.292982816696167, |
| "grad_norm_var": 0.8964505136113113, |
| "learning_rate": 0.0001, |
| "loss": 1.1132, |
| "loss/crossentropy": 2.4093523025512695, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.14468303322792053, |
| "loss/reg": 0.019116582348942757, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.1165, |
| "grad_norm": 2.559980869293213, |
| "grad_norm_var": 0.952617731514821, |
| "learning_rate": 0.0001, |
| "loss": 0.9697, |
| "loss/crossentropy": 2.5491104125976562, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.13018551468849182, |
| "loss/reg": 0.019108334556221962, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.116625, |
| "grad_norm": 3.058579683303833, |
| "grad_norm_var": 0.871050822267735, |
| "learning_rate": 0.0001, |
| "loss": 1.2919, |
| "loss/crossentropy": 2.303837537765503, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.17123734951019287, |
| "loss/reg": 0.019099365919828415, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.11675, |
| "grad_norm": 4.866446495056152, |
| "grad_norm_var": 0.9769503909617764, |
| "learning_rate": 0.0001, |
| "loss": 1.1876, |
| "loss/crossentropy": 2.364741325378418, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.1607905626296997, |
| "loss/reg": 0.019090238958597183, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.116875, |
| "grad_norm": 5.912527561187744, |
| "grad_norm_var": 1.252657817578581, |
| "learning_rate": 0.0001, |
| "loss": 1.4831, |
| "loss/crossentropy": 2.641162633895874, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.19855040311813354, |
| "loss/reg": 0.019081177189946175, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 2.8301663398742676, |
| "grad_norm_var": 1.2784556528629765, |
| "learning_rate": 0.0001, |
| "loss": 1.0878, |
| "loss/crossentropy": 2.277157783508301, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.13142800331115723, |
| "loss/reg": 0.019072722643613815, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.117125, |
| "grad_norm": 2.9044833183288574, |
| "grad_norm_var": 1.2843284928455583, |
| "learning_rate": 0.0001, |
| "loss": 1.1381, |
| "loss/crossentropy": 2.3311665058135986, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.1506001055240631, |
| "loss/reg": 0.01906409114599228, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.11725, |
| "grad_norm": 3.78202223777771, |
| "grad_norm_var": 0.8736988295376342, |
| "learning_rate": 0.0001, |
| "loss": 1.2869, |
| "loss/crossentropy": 2.4287989139556885, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.197871595621109, |
| "loss/reg": 0.019055521115660667, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.117375, |
| "grad_norm": 3.1086387634277344, |
| "grad_norm_var": 0.8338185014655397, |
| "learning_rate": 0.0001, |
| "loss": 1.0576, |
| "loss/crossentropy": 2.335935354232788, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.1288556456565857, |
| "loss/reg": 0.019047552719712257, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 4.869723320007324, |
| "grad_norm_var": 0.9402287231159085, |
| "learning_rate": 0.0001, |
| "loss": 1.0378, |
| "loss/crossentropy": 2.9672138690948486, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.13644808530807495, |
| "loss/reg": 0.01903851516544819, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.117625, |
| "grad_norm": 2.8750882148742676, |
| "grad_norm_var": 0.9067692046415797, |
| "learning_rate": 0.0001, |
| "loss": 1.0037, |
| "loss/crossentropy": 2.3138959407806396, |
| "loss/hidden": 0.69140625, |
| "loss/logits": 0.12202918529510498, |
| "loss/reg": 0.019029438495635986, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.11775, |
| "grad_norm": 3.044121742248535, |
| "grad_norm_var": 0.8812433820018912, |
| "learning_rate": 0.0001, |
| "loss": 1.0173, |
| "loss/crossentropy": 2.4543912410736084, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.12005805224180222, |
| "loss/reg": 0.01902030035853386, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.117875, |
| "grad_norm": 2.945160150527954, |
| "grad_norm_var": 0.8905794039935603, |
| "learning_rate": 0.0001, |
| "loss": 1.1688, |
| "loss/crossentropy": 2.3482954502105713, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.15057498216629028, |
| "loss/reg": 0.019011201336979866, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 3.0109965801239014, |
| "grad_norm_var": 0.9056152589294107, |
| "learning_rate": 0.0001, |
| "loss": 1.3721, |
| "loss/crossentropy": 2.353516101837158, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1977054923772812, |
| "loss/reg": 0.0190016757696867, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.118125, |
| "grad_norm": 3.424039363861084, |
| "grad_norm_var": 0.8682128423744849, |
| "learning_rate": 0.0001, |
| "loss": 1.1647, |
| "loss/crossentropy": 2.513850450515747, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.15838034451007843, |
| "loss/reg": 0.018992552533745766, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.11825, |
| "grad_norm": 2.7656776905059814, |
| "grad_norm_var": 0.8917454992029661, |
| "learning_rate": 0.0001, |
| "loss": 1.0819, |
| "loss/crossentropy": 2.650881767272949, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.1420745849609375, |
| "loss/reg": 0.01898341253399849, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.118375, |
| "grad_norm": 4.2130560874938965, |
| "grad_norm_var": 0.9250033835133629, |
| "learning_rate": 0.0001, |
| "loss": 1.2469, |
| "loss/crossentropy": 2.683312177658081, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.15088841319084167, |
| "loss/reg": 0.018973875790834427, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.1185, |
| "grad_norm": 3.834226608276367, |
| "grad_norm_var": 0.8649633510209883, |
| "learning_rate": 0.0001, |
| "loss": 0.9767, |
| "loss/crossentropy": 2.5706145763397217, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.12295837700366974, |
| "loss/reg": 0.018963845446705818, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.118625, |
| "grad_norm": 3.3009235858917236, |
| "grad_norm_var": 0.8514524765901378, |
| "learning_rate": 0.0001, |
| "loss": 1.1857, |
| "loss/crossentropy": 2.6797893047332764, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.17191748321056366, |
| "loss/reg": 0.018954817205667496, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.11875, |
| "grad_norm": 6.452317237854004, |
| "grad_norm_var": 1.2752747995844325, |
| "learning_rate": 0.0001, |
| "loss": 1.4528, |
| "loss/crossentropy": 2.7505640983581543, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.20869939029216766, |
| "loss/reg": 0.018945740535855293, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.118875, |
| "grad_norm": 3.405714273452759, |
| "grad_norm_var": 0.9300412257064863, |
| "learning_rate": 0.0001, |
| "loss": 1.0922, |
| "loss/crossentropy": 2.802401065826416, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.14506830275058746, |
| "loss/reg": 0.018936749547719955, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 9.662891387939453, |
| "grad_norm_var": 3.194050081601077, |
| "learning_rate": 0.0001, |
| "loss": 1.5003, |
| "loss/crossentropy": 2.700052499771118, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.19381779432296753, |
| "loss/reg": 0.01892753876745701, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.119125, |
| "grad_norm": 2.8670711517333984, |
| "grad_norm_var": 3.199477320796336, |
| "learning_rate": 0.0001, |
| "loss": 1.0153, |
| "loss/crossentropy": 2.490739107131958, |
| "loss/hidden": 0.69921875, |
| "loss/logits": 0.12687504291534424, |
| "loss/reg": 0.01891852729022503, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.11925, |
| "grad_norm": 4.066836357116699, |
| "grad_norm_var": 3.1973098694543274, |
| "learning_rate": 0.0001, |
| "loss": 1.2704, |
| "loss/crossentropy": 2.4122891426086426, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.1672634333372116, |
| "loss/reg": 0.018909232690930367, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.119375, |
| "grad_norm": 2.8044497966766357, |
| "grad_norm_var": 3.2388562001879793, |
| "learning_rate": 0.0001, |
| "loss": 1.0385, |
| "loss/crossentropy": 2.473945379257202, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.13464638590812683, |
| "loss/reg": 0.018900100141763687, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.1195, |
| "grad_norm": 3.7214255332946777, |
| "grad_norm_var": 3.1837278954586044, |
| "learning_rate": 0.0001, |
| "loss": 1.0832, |
| "loss/crossentropy": 2.6148641109466553, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.1403505802154541, |
| "loss/reg": 0.01889113523066044, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.119625, |
| "grad_norm": 3.0579674243927, |
| "grad_norm_var": 3.160836005262268, |
| "learning_rate": 0.0001, |
| "loss": 1.1761, |
| "loss/crossentropy": 2.5016398429870605, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.1474056839942932, |
| "loss/reg": 0.0188821442425251, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.11975, |
| "grad_norm": 2.429112672805786, |
| "grad_norm_var": 3.255565314691306, |
| "learning_rate": 0.0001, |
| "loss": 0.9853, |
| "loss/crossentropy": 2.3262507915496826, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.1286056935787201, |
| "loss/reg": 0.018873048946261406, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.119875, |
| "grad_norm": 2.9561123847961426, |
| "grad_norm_var": 3.2542184489207098, |
| "learning_rate": 0.0001, |
| "loss": 0.9739, |
| "loss/crossentropy": 2.5669143199920654, |
| "loss/hidden": 0.65234375, |
| "loss/logits": 0.13291960954666138, |
| "loss/reg": 0.018863873556256294, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 3.1904752254486084, |
| "grad_norm_var": 3.2355963683487268, |
| "learning_rate": 0.0001, |
| "loss": 1.074, |
| "loss/crossentropy": 2.5522408485412598, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.1315881609916687, |
| "loss/reg": 0.018855126574635506, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.120125, |
| "grad_norm": 7.435352802276611, |
| "grad_norm_var": 3.9949775747956586, |
| "learning_rate": 0.0001, |
| "loss": 1.6007, |
| "loss/crossentropy": 2.7422168254852295, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.2246999442577362, |
| "loss/reg": 0.018846556544303894, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.12025, |
| "grad_norm": 4.614249229431152, |
| "grad_norm_var": 3.8709926395951384, |
| "learning_rate": 0.0001, |
| "loss": 1.3908, |
| "loss/crossentropy": 2.356748104095459, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.24928607046604156, |
| "loss/reg": 0.018837420269846916, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.120375, |
| "grad_norm": 4.978577136993408, |
| "grad_norm_var": 3.903770487124885, |
| "learning_rate": 0.0001, |
| "loss": 1.4333, |
| "loss/crossentropy": 2.2456307411193848, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.17467612028121948, |
| "loss/reg": 0.01882883533835411, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.1205, |
| "grad_norm": 5.51161003112793, |
| "grad_norm_var": 3.97576236618078, |
| "learning_rate": 0.0001, |
| "loss": 1.2552, |
| "loss/crossentropy": 2.7595887184143066, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.18415382504463196, |
| "loss/reg": 0.01882052607834339, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.120625, |
| "grad_norm": 3.2307283878326416, |
| "grad_norm_var": 3.9863892013288393, |
| "learning_rate": 0.0001, |
| "loss": 1.2728, |
| "loss/crossentropy": 2.48502779006958, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.19795754551887512, |
| "loss/reg": 0.01881156861782074, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.12075, |
| "grad_norm": 3.441767454147339, |
| "grad_norm_var": 3.7286595116638916, |
| "learning_rate": 0.0001, |
| "loss": 1.182, |
| "loss/crossentropy": 2.508680582046509, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.15802894532680511, |
| "loss/reg": 0.018803071230649948, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.120875, |
| "grad_norm": 4.063246726989746, |
| "grad_norm_var": 3.685090208705704, |
| "learning_rate": 0.0001, |
| "loss": 1.3354, |
| "loss/crossentropy": 2.47729229927063, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.16310644149780273, |
| "loss/reg": 0.018794314935803413, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 5.8381171226501465, |
| "grad_norm_var": 1.8400005684538645, |
| "learning_rate": 0.0001, |
| "loss": 1.1025, |
| "loss/crossentropy": 2.5666096210479736, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.1373094618320465, |
| "loss/reg": 0.018785255029797554, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.121125, |
| "grad_norm": 2.7132527828216553, |
| "grad_norm_var": 1.8649801572693356, |
| "learning_rate": 0.0001, |
| "loss": 1.0912, |
| "loss/crossentropy": 2.4335010051727295, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.13391214609146118, |
| "loss/reg": 0.018776265904307365, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.12125, |
| "grad_norm": 2.9154088497161865, |
| "grad_norm_var": 1.938092020210782, |
| "learning_rate": 0.0001, |
| "loss": 1.1314, |
| "loss/crossentropy": 2.523137331008911, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.1429774910211563, |
| "loss/reg": 0.01876768097281456, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.121375, |
| "grad_norm": 3.3754584789276123, |
| "grad_norm_var": 1.8726730225127388, |
| "learning_rate": 0.0001, |
| "loss": 1.0747, |
| "loss/crossentropy": 2.6033432483673096, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.14106187224388123, |
| "loss/reg": 0.018759164959192276, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.1215, |
| "grad_norm": 2.7394940853118896, |
| "grad_norm_var": 1.9650935524715956, |
| "learning_rate": 0.0001, |
| "loss": 1.0871, |
| "loss/crossentropy": 2.2976291179656982, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.1300249695777893, |
| "loss/reg": 0.01874978095293045, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.121625, |
| "grad_norm": 2.9742624759674072, |
| "grad_norm_var": 1.9749925269591906, |
| "learning_rate": 0.0001, |
| "loss": 1.1221, |
| "loss/crossentropy": 2.7049221992492676, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.1416921317577362, |
| "loss/reg": 0.018740687519311905, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.12175, |
| "grad_norm": 2.927666187286377, |
| "grad_norm_var": 1.892721758937742, |
| "learning_rate": 0.0001, |
| "loss": 1.0252, |
| "loss/crossentropy": 2.6492226123809814, |
| "loss/hidden": 0.71875, |
| "loss/logits": 0.11909263581037521, |
| "loss/reg": 0.018731672316789627, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.121875, |
| "grad_norm": 16.453693389892578, |
| "grad_norm_var": 11.523681815422924, |
| "learning_rate": 0.0001, |
| "loss": 1.5788, |
| "loss/crossentropy": 2.375779867172241, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.2197396457195282, |
| "loss/reg": 0.01872306317090988, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 2.928443193435669, |
| "grad_norm_var": 11.583339951760102, |
| "learning_rate": 0.0001, |
| "loss": 1.0995, |
| "loss/crossentropy": 2.8136162757873535, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.13505280017852783, |
| "loss/reg": 0.018714020028710365, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.122125, |
| "grad_norm": 4.540535926818848, |
| "grad_norm_var": 11.07401646408874, |
| "learning_rate": 0.0001, |
| "loss": 1.1806, |
| "loss/crossentropy": 2.5312135219573975, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.15369677543640137, |
| "loss/reg": 0.01870504766702652, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.12225, |
| "grad_norm": 2.4433298110961914, |
| "grad_norm_var": 11.358052675820652, |
| "learning_rate": 0.0001, |
| "loss": 1.0167, |
| "loss/crossentropy": 2.578800678253174, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.13442841172218323, |
| "loss/reg": 0.018696293234825134, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.122375, |
| "grad_norm": 3.341182231903076, |
| "grad_norm_var": 11.408522912728658, |
| "learning_rate": 0.0001, |
| "loss": 1.1346, |
| "loss/crossentropy": 2.4482579231262207, |
| "loss/hidden": 0.8046875, |
| "loss/logits": 0.14306378364562988, |
| "loss/reg": 0.018688105046749115, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 2.433337450027466, |
| "grad_norm_var": 11.51984045745032, |
| "learning_rate": 0.0001, |
| "loss": 0.9764, |
| "loss/crossentropy": 2.321781873703003, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.13331879675388336, |
| "loss/reg": 0.018679112195968628, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.122625, |
| "grad_norm": 3.6000678539276123, |
| "grad_norm_var": 11.483219758864427, |
| "learning_rate": 0.0001, |
| "loss": 1.1731, |
| "loss/crossentropy": 2.5233397483825684, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.16610421240329742, |
| "loss/reg": 0.018670594319701195, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.12275, |
| "grad_norm": 2.8748631477355957, |
| "grad_norm_var": 11.558394893606714, |
| "learning_rate": 0.0001, |
| "loss": 1.0362, |
| "loss/crossentropy": 2.4155187606811523, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.12304510176181793, |
| "loss/reg": 0.018661517649888992, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.122875, |
| "grad_norm": 2.2293035984039307, |
| "grad_norm_var": 11.786185692154314, |
| "learning_rate": 0.0001, |
| "loss": 1.0077, |
| "loss/crossentropy": 2.618363618850708, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.13366106152534485, |
| "loss/reg": 0.01865258812904358, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 3.615424633026123, |
| "grad_norm_var": 11.556298836968558, |
| "learning_rate": 0.0001, |
| "loss": 1.0544, |
| "loss/crossentropy": 2.224966526031494, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.1335442066192627, |
| "loss/reg": 0.01864360086619854, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.123125, |
| "grad_norm": 2.93835186958313, |
| "grad_norm_var": 11.524399601900045, |
| "learning_rate": 0.0001, |
| "loss": 1.232, |
| "loss/crossentropy": 2.348299026489258, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.1667439341545105, |
| "loss/reg": 0.018634630367159843, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.12325, |
| "grad_norm": 3.2750635147094727, |
| "grad_norm_var": 11.485476360611186, |
| "learning_rate": 0.0001, |
| "loss": 1.0639, |
| "loss/crossentropy": 2.5712990760803223, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.12762659788131714, |
| "loss/reg": 0.018625380471348763, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.123375, |
| "grad_norm": 3.81864595413208, |
| "grad_norm_var": 11.465683474564775, |
| "learning_rate": 0.0001, |
| "loss": 1.1284, |
| "loss/crossentropy": 2.4274511337280273, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.15321871638298035, |
| "loss/reg": 0.01861615665256977, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.1235, |
| "grad_norm": 3.2677698135375977, |
| "grad_norm_var": 11.398153583229371, |
| "learning_rate": 0.0001, |
| "loss": 1.1315, |
| "loss/crossentropy": 2.656604290008545, |
| "loss/hidden": 0.8046875, |
| "loss/logits": 0.14076048135757446, |
| "loss/reg": 0.018606893718242645, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.123625, |
| "grad_norm": 3.561713457107544, |
| "grad_norm_var": 11.341034456038914, |
| "learning_rate": 0.0001, |
| "loss": 1.0152, |
| "loss/crossentropy": 2.6205251216888428, |
| "loss/hidden": 0.70703125, |
| "loss/logits": 0.12218683958053589, |
| "loss/reg": 0.01859763078391552, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.12375, |
| "grad_norm": 2.146240472793579, |
| "grad_norm_var": 11.49254916357394, |
| "learning_rate": 0.0001, |
| "loss": 1.0449, |
| "loss/crossentropy": 2.4804821014404297, |
| "loss/hidden": 0.71484375, |
| "loss/logits": 0.14422178268432617, |
| "loss/reg": 0.018588390201330185, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.123875, |
| "grad_norm": 2.6142220497131348, |
| "grad_norm_var": 0.4215380256098586, |
| "learning_rate": 0.0001, |
| "loss": 1.0371, |
| "loss/crossentropy": 2.3303298950195312, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.11297546327114105, |
| "loss/reg": 0.018579507246613503, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 4.17028284072876, |
| "grad_norm_var": 0.4892223582938262, |
| "learning_rate": 0.0001, |
| "loss": 1.6898, |
| "loss/crossentropy": 2.250098943710327, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.30099910497665405, |
| "loss/reg": 0.018569782376289368, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.124125, |
| "grad_norm": 2.3814339637756348, |
| "grad_norm_var": 0.3887345955884323, |
| "learning_rate": 0.0001, |
| "loss": 1.0837, |
| "loss/crossentropy": 2.6447057723999023, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.15198630094528198, |
| "loss/reg": 0.01855996623635292, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.12425, |
| "grad_norm": 4.670334815979004, |
| "grad_norm_var": 0.5202129226035586, |
| "learning_rate": 0.0001, |
| "loss": 1.1908, |
| "loss/crossentropy": 2.3941245079040527, |
| "loss/hidden": 0.85546875, |
| "loss/logits": 0.1498216688632965, |
| "loss/reg": 0.018550006672739983, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.124375, |
| "grad_norm": 2.498332977294922, |
| "grad_norm_var": 0.5469080049785059, |
| "learning_rate": 0.0001, |
| "loss": 0.9226, |
| "loss/crossentropy": 2.489830255508423, |
| "loss/hidden": 0.63671875, |
| "loss/logits": 0.10051175206899643, |
| "loss/reg": 0.01854090392589569, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.1245, |
| "grad_norm": 2.987192392349243, |
| "grad_norm_var": 0.5145625202891589, |
| "learning_rate": 0.0001, |
| "loss": 1.1638, |
| "loss/crossentropy": 2.094637632369995, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.12690997123718262, |
| "loss/reg": 0.018531804904341698, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.124625, |
| "grad_norm": 2.738851547241211, |
| "grad_norm_var": 0.5110263660772335, |
| "learning_rate": 0.0001, |
| "loss": 1.1959, |
| "loss/crossentropy": 2.1406211853027344, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.16688010096549988, |
| "loss/reg": 0.018522722646594048, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.12475, |
| "grad_norm": 5.02874231338501, |
| "grad_norm_var": 0.732945509426732, |
| "learning_rate": 0.0001, |
| "loss": 1.089, |
| "loss/crossentropy": 2.442840337753296, |
| "loss/hidden": 0.76171875, |
| "loss/logits": 0.142162024974823, |
| "loss/reg": 0.018513953313231468, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.124875, |
| "grad_norm": 2.947786331176758, |
| "grad_norm_var": 0.6677765621166297, |
| "learning_rate": 0.0001, |
| "loss": 1.3459, |
| "loss/crossentropy": 2.483854055404663, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.19213837385177612, |
| "loss/reg": 0.018505612388253212, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 4.166240692138672, |
| "grad_norm_var": 0.7105452516630361, |
| "learning_rate": 0.0001, |
| "loss": 1.2402, |
| "loss/crossentropy": 2.454993963241577, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.17627671360969543, |
| "loss/reg": 0.018497284501791, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 8000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.4405861564416e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |